Example #1
0
def create():
    require(request.authz.logged_in)
    data = parse_request(CollectionSchema)
    role = Role.by_id(request.authz.id)
    collection = create_collection(data, role=role)
    refresh_index(collections_index())
    return view(collection.id)
Example #2
0
def delete(id):
    entity = get_db_entity(id, request.authz.WRITE)
    delete_entity(entity)
    db.session.commit()
    update_collection(entity.collection)
    refresh_index(entities_index())
    return ('', 204)
Example #3
0
def create():
    data = parse_request(EntityCreateSchema)
    collection = get_db_collection(data['collection_id'], request.authz.WRITE)
    entity = Entity.create(data, collection)
    db.session.commit()
    data = update_entity(entity)
    update_collection(collection)
    refresh_index(entities_index())
    return serialize_data(data, CombinedSchema)
Example #4
0
def ingest_upload(id):
    collection = get_db_collection(id, request.authz.WRITE)
    meta, foreign_id = _load_metadata(collection)
    parent_id = _load_parent(collection, meta)
    upload_dir = mkdtemp(prefix='aleph.upload.')
    try:
        documents = []
        for storage in request.files.values():
            path = safe_filename(storage.filename, default='upload')
            path = os.path.join(upload_dir, path)
            storage.save(path)
            content_hash = checksum(path)
            document = Document.by_keys(collection=collection,
                                        parent_id=parent_id,
                                        foreign_id=foreign_id,
                                        content_hash=content_hash)
            document.update(meta)
            document.uploader_id = request.authz.id
            ingest_document(document, path)
            documents.append(document)

        if not len(request.files):
            # If there is no files uploaded, try to create an empty
            # directory instead. Maybe this should be more explicit,
            # but it seemed like the most simple way of fitting it
            # into the API.
            document = Document.by_keys(collection=collection,
                                        parent_id=parent_id,
                                        foreign_id=foreign_id)
            document.schema = Document.SCHEMA_FOLDER
            document.update(meta)
            document.uploader_id = request.authz.id
            ingest_document(document, None)
            documents.append(document)
    finally:
        shutil.rmtree(upload_dir)

    if collection.casefile:
        for document in documents:
            params = {'document': document, 'collection': collection}
            publish(Events.INGEST_DOCUMENT,
                    actor_id=document.uploader_id,
                    params=params)

    # Update child counts in index.
    if parent_id is not None:
        index_document_id.apply_async([parent_id], priority=1)

    refresh_index(index=entities_index())
    return jsonify({
        'status':
        'ok',
        'documents': [CombinedSchema().dump(d).data for d in documents]
    })
Example #5
0
def index_records(document):
    if not document.supports_records:
        return

    clear_records(document.id)
    for attempt in count():
        try:
            bulk_op(generate_records(document))
            refresh_index(records_index())
            return
        except Exception as exc:
            log.warning('Failed to index records: %s', exc)
        backoff_cluster(failures=attempt)
Example #6
0
def index_records(document):
    if not document.supports_records:
        return

    clear_records(document.id, refresh=False)
    while True:
        try:
            bulk_op(generate_records(document))
            refresh_index(index=records_index())
            return
        except BulkIndexError as exc:
            log.exception(exc)
            time.sleep(RETRY_DELAY)
Example #7
0
def delete_documents(collection_id):
    """Delete documents from a collection."""
    records_query = {'term': {'collection_id': collection_id}}
    query_delete(records_index(), records_query)
    refresh_index(index=records_index())
    query = {
        'bool': {
            'must': [{
                'term': {
                    'schemata': 'Document'
                }
            }, {
                'term': {
                    'collection_id': collection_id
                }
            }]
        }
    }
    query_delete(entities_index(), query)
Example #8
0
 def flush_index(self):
     refresh_index()
Example #9
0
def delete(document_id):
    document = get_db_document(document_id, request.authz.WRITE)
    delete_document(document)
    update_collection(document.collection)
    refresh_index(entities_index())
    return ('', 204)
Example #10
0
def index_collection(collection):
    """Index a collection."""
    if collection.deleted_at is not None:
        return delete_collection(collection.id)

    data = {
        'foreign_id': collection.foreign_id,
        'created_at': collection.created_at,
        'updated_at': collection.updated_at,
        'label': collection.label,
        'kind': collection.kind,
        'summary': collection.summary,
        'category': Collection.DEFAULT,
        'publisher': collection.publisher,
        'publisher_url': collection.publisher_url,
        'info_url': collection.info_url,
        'data_url': collection.data_url,
        'casefile': collection.casefile,
        'roles': collection.roles,
        'schemata': {},
        'team': []
    }
    texts = [v for v in data.values() if isinstance(v, str)]

    if collection.category in Collection.CATEGORIES:
        data['category'] = collection.category

    if collection.creator is not None:
        data['creator'] = {
            'id': collection.creator.id,
            'type': collection.creator.type,
            'name': collection.creator.name
        }
        texts.append(collection.creator.name)

    for role in collection.team:
        data['team'].append({
            'id': role.id,
            'type': role.type,
            'name': role.name
        })
        texts.append(role.name)

    # Compute some statistics on the content of a collection.
    query = {
        'size': 0,
        'query': {
            'bool': {
                'filter': [{
                    'term': {
                        'collection_id': collection.id
                    }
                }, {
                    'term': {
                        'schemata': Entity.THING
                    }
                }]
            }
        },
        'aggs': {
            'schema': {
                'terms': {
                    'field': 'schema',
                    'size': 1000
                }
            },
            'countries': {
                'terms': {
                    'field': 'countries',
                    'size': 500
                }
            },
            'languages': {
                'terms': {
                    'field': 'languages',
                    'size': 100
                }
            },
        }
    }
    result = search_safe(index=entities_index(), body=query)
    aggregations = result.get('aggregations')
    data['count'] = result['hits']['total']

    # expose entities by schema count.
    for schema in aggregations['schema']['buckets']:
        data['schemata'][schema['key']] = schema['doc_count']

    # if no countries or langs are given, take the most common from the data.
    countries = collection.countries
    if countries is None or not len(countries):
        countries = aggregations['countries']['buckets']
        countries = [c['key'] for c in countries]
    data['countries'] = exactitude.countries.normalize_set(countries)

    languages = collection.languages
    if languages is None or not len(languages):
        languages = aggregations['languages']['buckets']
        languages = [c['key'] for c in languages]
    data['languages'] = exactitude.languages.normalize_set(languages)

    texts.extend([normalize(t, ascii=True) for t in texts])
    data['text'] = index_form(texts)
    data = index_safe(collections_index(), collection.id, data)
    refresh_index(index=collections_index())
    return data
Example #11
0
def delete_collection(collection_id):
    """Delete all documents from a particular collection."""
    q = {'ids': {'values': str(collection_id)}}
    query_delete(collections_index(), q)
    refresh_index(index=collections_index())
Example #12
0
def delete(id):
    collection = get_db_collection(id, request.authz.WRITE)
    delete_collection(collection)
    refresh_index(collections_index())
    return ('', 204)
Example #13
0
def delete_entity(entity_id):
    """Delete an entity from the index."""
    q = {'ids': {'values': str(entity_id)}}
    query_delete(entities_index(), q)
    refresh_index(index=entities_index())
Example #14
0
def delete_document(document_id):
    clear_records(document_id)
    delete_entity(document_id)
    refresh_index(index=records_index())
Example #15
0
def clear_records(document_id, refresh=True):
    """Delete all records associated with the given document."""
    q = {'term': {'document_id': document_id}}
    query_delete(records_index(), q)
    if refresh:
        refresh_index(index=records_index())
Example #16
0
 def flush_index(self):
     refresh_index(all_indexes())