Example #1
def index_document(document):
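    # Pending documents are not (re-)indexed; any previously indexed
    # entry for them is removed instead.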
    if document.status == Document.STATUS_PENDING:
        delete_entity(document.id)
        return

    name = document.name
    log.info("Index document [%s]: %s", document.id, name)
    data = {
        'status': document.status,
        'content_hash': document.content_hash,
        'foreign_id': document.foreign_id,
        'error_message': document.error_message,
        'uploader_id': document.uploader_id,
        'title': document.title,
        'name': name,
        'summary': document.summary,
        'author': document.author,
        'generator': document.generator,
        'file_size': document.file_size,
        'file_name': document.file_name,
        'source_url': document.source_url,
        'languages': document.languages,
        'countries': document.countries,
        'keywords': document.keywords,
        'date': document.date,
        'authored_at': document.authored_at,
        'modified_at': document.modified_at,
        'published_at': document.published_at,
        'retrieved_at': document.retrieved_at,
        'dates': document.dates,
        'extension': document.extension,
        'encoding': document.encoding,
        'mime_type': document.mime_type,
        'pdf_version': document.pdf_version,
        'columns': document.columns,
        'ancestors': document.ancestors,
        'children': document.children.count()
    }

    texts = list(document.texts)
    texts.extend(document.columns)

    if document.parent_id is not None:
        texts.append(document.parent.title)
        data['parent'] = {
            'id': document.parent_id,
            'schema': document.parent.schema,
            'title': document.parent.title,
        }

    for (field, values) in generate_tags(document):
        if field not in data:
            data[field] = list(values)
        else:
            data[field].extend(values)
        texts.extend(values)

    return index_single(document, data, texts)
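The tag-merging loop at the end of index_document is a small reusable pattern: each (field, values) pair either seeds a new list in the payload or extends an existing one, and every value is also folded into the full-text list. A minimal standalone sketch of that pattern, assuming tag_pairs is shaped like the output of generate_tags(document); merge_tags itself and the toy values are hypothetical:

def merge_tags(data, texts, tag_pairs):
    # tag_pairs is an iterable of (field, values) tuples, e.g. what
    # generate_tags(document) yields above.
    for field, values in tag_pairs:
        if field not in data:
            data[field] = list(values)
        else:
            data[field].extend(values)
        texts.extend(values)
    return data, texts

# Hypothetical usage with toy values:
data, texts = merge_tags({'title': 'Report'}, ['Report'],
                         [('names', ['Jane Doe']), ('emails', ['jane@example.org'])])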
Example #2
def ingest(document_id, file_path=None, refresh=False):
    """Process a given document by extracting its contents.
    This may include creating or updating child documents."""
    document = Document.by_id(document_id)
    if document is None:
        log.error("Could not find document: %s", document_id)
        return

    # Work path will be used by storagelayer to cache a local
    # copy of data from an S3-based archive, and by ingestors
    # to perform processing and generate intermediary files.
    work_path = mkdtemp(prefix="aleph.ingest.")
    if file_path is None:
        file_path = archive.load_file(document.content_hash,
                                      file_name=document.safe_file_name,
                                      temp_path=work_path)

    try:
        manager = get_manager()
        result = DocumentResult(manager, document, file_path=file_path)
        manager.ingest(file_path, result=result, work_path=work_path)

        document.status = Document.STATUS_SUCCESS
        log.debug('Ingested [%s:%s]: %s', document.id, document.schema,
                  document.name)

        if document.collection.casefile and not refresh:
            params = {'collection': document.collection, 'document': document}
            publish(Events.INGEST_DOCUMENT,
                    actor_id=document.uploader_id,
                    params=params)

        db.session.commit()
    except Exception:
        db.session.rollback()
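        # Reload the document after the rollback so the failure status
        # is recorded on a fresh instance.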
        document = Document.by_id(document_id)
        log.exception("Ingest failed [%s]: %s", document.id, document.name)
        document.status = Document.STATUS_FAIL
        db.session.commit()
    finally:
        # Removing the temp_path given to storagelayer makes it redundant
        # to also call cleanup on the archive.
        remove_directory(work_path)

    extract_document_tags(document)
    delete_entity(document.id, exclude=document.schema)
    index_document(document)
    refresh_entity(document)
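The work_path handling in ingest follows a common lifecycle: create a scratch directory, hand it to the archive and the ingestors, and remove it in a finally block whether processing succeeded or failed. A minimal sketch of that lifecycle in isolation, with process_file standing in as a hypothetical name for the manager's ingest call, and shutil.rmtree used in place of the remove_directory helper:

import shutil
from tempfile import mkdtemp

def with_work_path(process_file, file_path):
    # Scratch directory for cached downloads and intermediary files,
    # mirroring mkdtemp(prefix="aleph.ingest.") above.
    work_path = mkdtemp(prefix="aleph.ingest.")
    try:
        return process_file(file_path, work_path=work_path)
    finally:
        # Always remove the directory, even if processing raised.
        shutil.rmtree(work_path, ignore_errors=True)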
Example #3
def delete_entity(entity, deleted_at=None, sync=False):
    # This is recursive and will also delete any entities which
    # reference the given entity. Usually this is going to be child
    # documents, or directorships referencing a person. It's a pretty
    # dangerous operation, though.
    for adjacent in index.iter_adjacent(entity):
        log.warning("Recursive delete: %r", adjacent)
        delete_entity(adjacent, deleted_at=deleted_at, sync=sync)
    flush_notifications(entity.get('id'), clazz=Entity)
    obj = Entity.by_id(entity.get('id'))
    if obj is not None:
        obj.delete(deleted_at=deleted_at)
    doc = Document.by_id(entity.get('id'))
    if doc is not None:
        doc.delete(deleted_at=deleted_at)
    index.delete_entity(entity.get('id'), sync=sync)
    refresh_entity(entity)
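Because delete_entity walks index.iter_adjacent recursively, a reference cycle between entities would recurse without bound. A minimal sketch of the same traversal written iteratively, guarded by a visited set and deleting descendants before ancestors; delete_one and iter_adjacent are hypothetical stand-ins for the per-entity cleanup and index.iter_adjacent above:

def delete_entity_tree(entity, delete_one, iter_adjacent):
    # First collect the reachable entities, skipping anything already
    # seen so a cycle cannot loop forever.
    seen, ordered, stack = set(), [], [entity]
    while stack:
        current = stack.pop()
        entity_id = current.get('id')
        if entity_id in seen:
            continue
        seen.add(entity_id)
        ordered.append(current)
        stack.extend(iter_adjacent(current))
    # Delete in reverse discovery order so adjacent (child) entities
    # go before the entity that was asked for, as in the recursive form.
    for current in reversed(ordered):
        delete_one(current)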
Example #4
def delete_entity(collection, entity, deleted_at=None, sync=False):
    # This is recursive and will also delete any entities which
    # reference the given entity. Usually this is going to be child
    # documents, or directorships referencing a person. It's a pretty
    # dangerous operation, though.
    entity_id = collection.ns.sign(entity.get("id"))
    for adjacent in index.iter_adjacent(entity):
        log.warning("Recursive delete: %r", adjacent)
        delete_entity(collection, adjacent, deleted_at=deleted_at, sync=sync)
    flush_notifications(entity_id, clazz=Entity)
    obj = Entity.by_id(entity_id, collection=collection)
    if obj is not None:
        obj.delete()
    doc = Document.by_id(entity_id, collection=collection)
    if doc is not None:
        doc.delete()
    index.delete_entity(entity_id, sync=sync)
    EntitySetItem.delete_by_entity(entity_id)
    Mapping.delete_by_table(entity_id)
    xref_index.delete_xref(collection, entity_id=entity_id, sync=sync)
    delete_aggregator_entity(collection, entity_id)
    refresh_entity(collection, entity_id)
Example #5
def delete_document(document_id, sync=False):
    delete_records(document_id=document_id, sync=False)
    delete_entity(document_id, sync=sync)
Example #6
def delete_document(document_id):
    clear_records(document_id)
    delete_entity(document_id)
Example #7
def delete_entity(entity, deleted_at=None):
    flush_notifications(entity)
    entity.delete(deleted_at=deleted_at)
    index.delete_entity(entity.id)
Example #8
def delete_entity(collection, entity, sync=False, job_id=None):
    """Delete entity from index and redis, queue full prune."""
    entity_id = collection.ns.sign(entity.get("id"))
    index.delete_entity(entity_id, sync=sync)
    refresh_entity(collection, entity_id)
    queue_task(collection, OP_PRUNE_ENTITY, job_id=job_id, entity_id=entity_id)
Example #9
def delete_entity(entity, deleted_at=None, sync=False):
    flush_notifications(entity)
    entity.delete(deleted_at=deleted_at)
    refresh_entity(entity)
    index.delete_entity(entity.id, sync=sync)
Example #10
def delete_entity(entity, deleted_at=None, sync=False):
    flush_notifications(entity)
    collection = entity.collection
    entity.delete(deleted_at=deleted_at)
    index.delete_entity(entity.id, sync=sync)
    refresh_collection(collection, sync=sync)
Example #11
def delete_document(document_id):
    clear_records(document_id)
    delete_entity(document_id)
    refresh_index(index=records_index())
Example #12
def delete_entity(entity, deleted_at=None, sync=False):
    flush_notifications(entity)
    entity.delete(deleted_at=deleted_at)
    refresh_entity(entity)
    index.delete_entity(entity.id, sync=sync)
Example #13
def delete_entity(entity, deleted_at=None, sync=False):
    flush_notifications(entity)
    entity.delete(deleted_at=deleted_at)
    index.delete_entity(entity.id, sync=sync)
    # TODO: implement recursion?
    refresh_entity(entity)