Example 1
def ingest(document_id, file_path=None):
    """Process a given document by extracting its contents.
    This may include creating or updating child documents."""
    document = Document.by_id(document_id)
    if document is None:
        log.error("Could not find document: %s", document_id)
        return

    # Work path will be used by storagelayer to cache a local
    # copy of data from an S3-based archive, and by ingestors
    # to perform processing and generate intermediary files.
    work_path = mkdtemp(prefix="aleph.ingest.")
    if file_path is None:
        file_path = archive.load_file(document.content_hash,
                                      file_name=document.safe_file_name,
                                      temp_path=work_path)

    try:
        manager = get_manager()
        result = DocumentResult(manager, document, file_path=file_path)
    manager.ingest(file_path, result=result, work_path=work_path)

        log.debug('Ingested [%s:%s]: %s', document.id, document.schema,
                  document.name)
        db.session.commit()
        process_document(document)
    except Exception:
        db.session.rollback()
        document = Document.by_id(document_id)
        log.exception("Ingest failed [%s]: %s", document.id, document.name)
    finally:
        # Removing the temp_path given to storagelayer makes it redundant
        # to also call cleanup on the archive.
        remove_directory(work_path)
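
Every snippet in this collection opens with the same guard: fetch the record by primary key, and bail out with a log message if it no longer exists (documents can be deleted while a task sits in the queue). A minimal sketch of what such a lookup classmethod typically looks like on a Flask-SQLAlchemy model follows; the real Aleph model has more fields and soft-delete handling, so treat the names here as illustrative:

# Hypothetical sketch of the Document.by_id lookup pattern;
# not the actual Aleph model definition.
class Document(db.Model):
    id = db.Column(db.BigInteger, primary_key=True)
    collection_id = db.Column(db.Integer, index=True)

    @classmethod
    def by_id(cls, document_id, collection=None):
        if document_id is None:
            return None
        q = cls.query.filter_by(id=document_id)
        if collection is not None:
            # Scope the lookup so callers cannot reach across collections.
            q = q.filter_by(collection_id=collection.id)
        return q.first()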
Example 2
def ingest(document_id, file_path=None, refresh=False):
    """Process a given document by extracting its contents.
    This may include creating or updating child documents."""
    document = Document.by_id(document_id)
    if document is None:
        log.error("Could not find document: %s", document_id)
        return

    # Work path will be used by storagelayer to cache a local
    # copy of data from an S3-based archive, and by ingestors
    # to perform processing and generate intermediary files.
    work_path = mkdtemp(prefix="aleph.ingest.")
    if file_path is None:
        file_path = archive.load_file(document.content_hash,
                                      file_name=document.safe_file_name,
                                      temp_path=work_path)

    try:
        manager = get_manager()
        result = DocumentResult(manager, document, file_path=file_path)
        manager.ingest(file_path, result=result, work_path=work_path)

        document.status = Document.STATUS_SUCCESS
        log.debug('Ingested [%s:%s]: %s', document.id, document.schema,
                  document.name)

        if document.collection.casefile and not refresh:
            params = {'collection': document.collection, 'document': document}
            publish(Events.INGEST_DOCUMENT,
                    actor_id=document.uploader_id,
                    params=params)

        db.session.commit()
    except Exception:
        db.session.rollback()
        document = Document.by_id(document_id)
        log.exception("Ingest failed [%s]: %s", document.id, document.name)
        document.status = Document.STATUS_FAIL
        db.session.commit()
    finally:
        # Removing the temp_path given to storagelayer makes it redundant
        # to also call cleanup on the archive.
        remove_directory(work_path)

    extract_document_tags(document)
    # delete_entity(document.id, exclude=document.schema)
    index_document(document)
    refresh_entity(document)
Example 3
def analyze_document_id(document_id):
    """Analyze a document after looking it up by ID."""
    document = Document.by_id(document_id)
    if document is None:
        log.info("Could not find document: %r", document_id)
        return
    analyze_document(document)
Example 4
def index_document(document_id):
    document = Document.by_id(document_id)
    if document is None:
        log.info("Could not find document: %r", document_id)
        return
    try:
        log.info("Index document: %r", document)
        data = document.to_index_dict()
        data['entities'] = generate_entities(document)
        data['title_latin'] = latinize_text(data.get('title'))
        data['summary_latin'] = latinize_text(data.get('summary'))
        get_es().index(index=get_es_index(), doc_type=TYPE_DOCUMENT, body=data,
                       id=document.id)

        clear_children(document)
        if document.type == Document.TYPE_TEXT:
            bulk(get_es(), generate_pages(document), stats_only=True,
                 chunk_size=2000, request_timeout=60.0)

        if document.type == Document.TYPE_TABULAR:
            bulk(get_es(), generate_records(document), stats_only=True,
                 chunk_size=2000, request_timeout=60.0)
    except Exception as ex:
        log.exception(ex)
        process.exception(process.INDEX, component=__name__,
                          document_id=document.id, meta=document.meta,
                          source_id=document.source_id, exception=ex)
Example 5
def get_document(document_id):
    document = Document.by_id(document_id)
    if document is None:
        raise NotFound()
    readable = [c for c in document.collection_ids if authz.collection_read(c)]
    authz.require(len(readable))
    return document
Example 6
    def test_upload_csv_doc(self):
        _, headers = self.login(is_admin=True)
        meta = {
            "countries": ["de", "usa"],
            "languages": ["en"],
            "mime_type": "text/csv",
            "source_url": "http://pudo.org/experts.csv",
        }
        csv_path = self.get_fixture_path("experts.csv")
        data = {
            "meta": json.dumps(meta),
            "foo": open(csv_path, "rb"),
        }
        res = self.client.post(self.url, data=data, headers=headers)
        assert res.status_code == 201, (res, res.data)
        assert "id" in res.json, res.json

        db_id, _ = res.json.get("id").split(".", 1)
        doc = Document.by_id(db_id)
        assert doc.schema == Document.SCHEMA, doc.schema
        assert doc.meta["countries"] == ["de", "us"], doc.meta
        assert doc.meta["languages"] == ["eng"], doc.meta

        status = get_status(self.col)
        assert status.get("pending") == 1, status
        job = status.get("jobs")[0]
        assert job.get("pending") == 1, job
        stage = job.get("stages")[0]
        assert stage.get("stage") == OP_INGEST, stage
        assert stage.get("pending") == 1, stage
Example 7
def index_document(document_id):
    clear_session()
    document = Document.by_id(document_id)
    if document is None:
        log.info("Could not find document: %r", document_id)
        return
    log.info("Index document: %r", document)
    data = document.to_dict()
    data['entities'] = generate_entities(document)
    data['title_latin'] = latinize_text(data.get('title'))
    data['summary_latin'] = latinize_text(data.get('summary'))
    es.index(index=es_index, doc_type=TYPE_DOCUMENT, body=data,
             id=document.id)
    clear_children(document)

    try:
        if document.type == Document.TYPE_TEXT:
            bulk(es, generate_pages(document), stats_only=True,
                 chunk_size=2000, request_timeout=60.0)

        if document.type == Document.TYPE_TABULAR:
            bulk(es, generate_records(document), stats_only=True,
                 chunk_size=2000, request_timeout=60.0)
    except Exception as ex:
        log.exception(ex)
Example 8
def index_document_id(document_id):
    document = Document.by_id(document_id)
    if document is None:
        log.info("Could not find document: %r", document_id)
        return
    index_document(document)
    index_records(document)
Example 9
def prune_entity(collection, entity_id=None, job_id=None):
    """Prune handles the full deletion of an entity outside of the HTTP request
    cycle. This involves cleaning up adjacent entities like xref results, notifications
    and so on."""
    # This is recursive and will also delete any entities which
    # reference the given entity. Usually this is going to be child
    # documents, or directorships referencing a person. It's a pretty
    # dangerous operation, though.
    log.info("[%s] Prune entity: %s", collection, entity_id)
    for adjacent in index.iter_adjacent(collection.id, entity_id):
        log.warning("Recursive delete: %s", adjacent.get("id"))
        delete_entity(collection, adjacent, job_id=job_id)
    flush_notifications(entity_id, clazz=Entity)
    obj = Entity.by_id(entity_id, collection=collection)
    if obj is not None:
        obj.delete()
    doc = Document.by_id(entity_id, collection=collection)
    if doc is not None:
        doc.delete()
    EntitySetItem.delete_by_entity(entity_id)
    Mapping.delete_by_table(entity_id)
    xref_index.delete_xref(collection, entity_id=entity_id)
    aggregator = get_aggregator(collection)
    aggregator.delete(entity_id=entity_id)
    refresh_entity(collection, entity_id)
    collection.touch()
    db.session.commit()
Example 10
def get_document(document_id, action=Authz.READ):
    document = Document.by_id(document_id)
    if document is None:
        raise NotFound()
    collections = request.authz.collections.get(action)
    request.authz.require(document.collection_id in collections)
    return document
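
Both `get_document` helpers are meant to be called from inside a request: `NotFound` surfaces as an HTTP 404 and a failed authorization check as a 403. A usage sketch, assuming a Flask blueprint and a `to_dict` serializer (both stand-ins for whatever the surrounding app provides):

@blueprint.route('/api/2/documents/<int:document_id>')
def view(document_id):
    # 404s and 403s propagate out of get_document automatically.
    document = get_document(document_id)
    return jsonify(document.to_dict())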
Example 11
def get_document(document_id):
    document = Document.by_id(document_id)
    if document is None:
        raise NotFound()
    readable = [c for c in document.collection_ids if authz.collection_read(c)]
    authz.require(len(readable))
    return document
Example 12
    def test_upload_csv_doc(self):
        _, headers = self.login(is_admin=True)
        meta = {
            'countries': ['de', 'us'],
            'languages': ['en'],
            'mime_type': 'text/csv',
            'source_url': 'http://pudo.org/experts.csv'
        }
        csv_path = self.get_fixture_path('experts.csv')
        data = {
            'meta': json.dumps(meta),
            'foo': open(csv_path, 'rb'),
        }
        res = self.client.post(self.url, data=data, headers=headers)
        assert res.status_code == 201, (res, res.data)
        assert 'id' in res.json, res.json

        doc = Document.by_id(res.json.get('id'))
        assert doc.schema == Document.SCHEMA, doc.schema

        status = get_status(self.col)
        assert status.get('pending') == 1, status
        op = status.get('operations')[0]
        assert op.get('pending') == 1, op
        assert op.get('operation') == OP_INGEST, op
Example 13
    def test_upload_csv_doc(self):
        _, headers = self.login(is_admin=True)
        meta = {
            'countries': ['de', 'usa'],
            'languages': ['en'],
            'mime_type': 'text/csv',
            'source_url': 'http://pudo.org/experts.csv'
        }
        csv_path = self.get_fixture_path('experts.csv')
        data = {
            'meta': json.dumps(meta),
            'foo': open(csv_path, 'rb'),
        }
        res = self.client.post(self.url, data=data, headers=headers)
        assert res.status_code == 201, (res, res.data)
        assert 'id' in res.json, res.json

        db_id, _ = res.json.get('id').split('.', 1)
        doc = Document.by_id(db_id)
        assert doc.schema == Document.SCHEMA, doc.schema
        assert doc.meta['countries'] == ['de', 'us'], doc.meta
        assert doc.meta['languages'] == ['eng'], doc.meta

        status = get_status(self.col)
        assert status.get('pending') == 1, status
        job = status.get('jobs')[0]
        assert job.get('pending') == 1, job
        stage = job.get('stages')[0]
        assert stage.get('stage') == OP_INGEST, stage
        assert stage.get('pending') == 1, stage
Example 14
def ingest(document_id, user_queue=False):
    """Process a given document by extracting its contents.
    This may include creating or updating child documents."""
    document = Document.by_id(document_id)
    if document is None:
        log.error("Could not find document: %s", document_id)
        return

    get_manager().ingest_document(document, user_queue=user_queue)
Example 15
def analyze_document(document_id):
    clear_session()
    document = Document.by_id(document_id)
    if document is None:
        log.info("Could not find document: %r", document_id)
        return
    log.info("Analyze document: %r", document)
    for cls in get_analyzers():
        cls().analyze(document, document.meta)
    index_document(document_id)
Example 16
def analyze_document(document_id):
    document = Document.by_id(document_id)
    if document is None:
        log.info("Could not find document: %r", document_id)
        return
    log.info("Analyze document: %r", document)
    try:
        for cls in get_analyzers():
            cls().analyze(document, document.meta)
    except Exception as ex:
        log.exception(ex)
    index_document(document_id)
Example 17
def ingest(document_id, role_id=None):
    """Process a given document by extracting its contents.
    This may include creating or updating child documents."""
    document = Document.by_id(document_id)
    if document is None:
        log.error("Could not find document: %s", document_id)
        return

    get_manager().ingest_document(document, role_id=role_id)

    pending = Document.pending_count(collection_id=document.collection.id)
    if pending == 0:
        ingest_complete(document.collection, role_id=role_id)
Example 18
def _load_parent(collection_id, meta):
    """Determine the parent document for the document that is to be
    ingested."""
    parent_id = meta.get('parent_id')
    if parent_id is None:
        return
    parent = Document.by_id(parent_id, collection_id=collection_id)
    if parent is None:
        raise BadRequest(response=jsonify({
            'status': 'error',
            'message': 'Cannot load parent document'
        }, status=400))
    return parent_id
Example 19
def _load_parent(collection_id, meta):
    """Determine the parent document for the document that is to be
    ingested."""
    parent_id = meta.get('parent_id')
    if parent_id is None:
        return
    parent = Document.by_id(parent_id, collection_id=collection_id)
    if parent is None:
        raise BadRequest(response=jsonify({
            'status': 'error',
            'message': 'Cannot load parent document'
        }, status=400))
    return parent_id
Example 20
def analyze_document(document_id):
    document = Document.by_id(document_id)
    if document is None:
        log.info("Could not find document: %r", document_id)
        return
    log.info("Analyze document: %r", document)
    for cls in get_analyzers():
        try:
            cls().analyze(document, document.meta)
        except Exception as ex:
            log.exception(ex)
            process.exception(process.ANALYZE, component=cls.__name__,
                              document_id=document.id, meta=document.meta,
                              source_id=document.source_id, exception=ex)
    index_document(document_id)
Example 21
def _load_parent(collection, meta):
    # Determine the parent document for the document that is to be
    # ingested. This can either be specified using a document ID,
    # or using a foreign ID (because the document ID may not be as
    # easily accessible to the client).
    if 'parent' not in meta:
        return
    data = meta.get('parent')
    parent = None
    if not is_mapping(data):
        parent = Document.by_id(data, collection_id=collection.id)
    elif 'id' in data:
        parent = Document.by_id(data.get('id'), collection_id=collection.id)
    elif 'foreign_id' in data:
        parent = Document.by_keys(collection=collection,
                                  foreign_id=data.get('foreign_id'))
    if parent is None:
        raise BadRequest(response=jsonify(
            {
                'status': 'error',
                'message': 'Cannot load parent document'
            },
            status=400))
    return parent.id
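
This variant resolves the parent either from a bare value, an explicit `id`, or a client-supplied `foreign_id`, which is useful when the uploader never saw the database ID. Illustrative payloads the function above would accept (values made up):

# Each of these resolves a parent for the upload:
_load_parent(collection, {'parent': '1234'})                        # bare id
_load_parent(collection, {'parent': {'id': '1234'}})                # explicit id
_load_parent(collection, {'parent': {'foreign_id': 'inbox/mail'}})  # foreign id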
Example 22
def index_document(document_id):
    document = Document.by_id(document_id)
    if document is None:
        log.info("Could not find document: %r", document_id)
        return
    log.info("Index document: %r", document)
    data = document.to_index_dict()
    data['entities'] = generate_entities(document)
    data['title_latin'] = latinize_text(data.get('title'))
    data['summary_latin'] = latinize_text(data.get('summary'))
    get_es().index(index=get_es_index(), doc_type=TYPE_DOCUMENT, body=data,
                   id=document.id)

    clear_records(document)
    bulk_op(generate_records(document))
Example 23
def index_document(document_id):
    document = Document.by_id(document_id)
    if document is None:
        log.info("Could not find document: %r", document_id)
        return
    log.info("Index document: %r", document)
    data = document.to_index_dict()
    data['entities'] = generate_entities(document)
    data['title_latin'] = latinize_text(data.get('title'))
    data['summary_latin'] = latinize_text(data.get('summary'))
    get_es().index(index=get_es_index(), doc_type=TYPE_DOCUMENT, body=data,
                   id=document.id)

    clear_records(document)
    bulk(get_es(), generate_records(document), stats_only=True,
         chunk_size=2000, request_timeout=60.0)
Example 24
def _load_parent(collection, meta):
    """Determine the parent document for the document that is to be
    ingested."""
    parent = ensure_dict(meta.get("parent"))
    parent_id = meta.get("parent_id", parent.get("id"))
    if parent_id is None:
        return
    parent = Document.by_id(parent_id, collection=collection)
    if parent is None:
        raise BadRequest(
            response=jsonify(
                {"status": "error", "message": "Cannot load parent document"},
                status=400,
            )
        )
    return parent
Example 25
def delete_entity(entity, deleted_at=None, sync=False):
    # This is recursive and will also delete any entities which
    # reference the given entity. Usually this is going to be child
    # documents, or directorships referencing a person. It's a pretty
    # dangerous operation, though.
    for adjacent in index.iter_adjacent(entity):
        log.warning("Recursive delete: %r", adjacent)
        delete_entity(adjacent, deleted_at=deleted_at, sync=sync)
    flush_notifications(entity.get('id'), clazz=Entity)
    obj = Entity.by_id(entity.get('id'))
    if obj is not None:
        obj.delete(deleted_at=deleted_at)
    doc = Document.by_id(entity.get('id'))
    if doc is not None:
        doc.delete(deleted_at=deleted_at)
    index.delete_entity(entity.get('id'), sync=sync)
    refresh_entity(entity)
Example 26
def index_document(document_id):
    document = Document.by_id(document_id)
    if document is None:
        log.info("Could not find document: %r", document_id)
        return
    log.info("Index document: %r", document)
    data = document.to_index_dict()
    data['entities'] = generate_entities(document)
    data['title_latin'] = latinize_text(data.get('title'))
    data['summary_latin'] = latinize_text(data.get('summary'))
    get_es().index(index=get_es_index(),
                   doc_type=TYPE_DOCUMENT,
                   body=data,
                   id=document.id)

    clear_records(document)
    bulk_op(generate_records(document))
Example 27
def ingest(document_id, role_id=None):
    """Process a given document by extracting its contents.
    This may include creating or updating child documents."""
    document = Document.by_id(document_id)
    if document is None:
        log.error("Could not find document: %s", document_id)
        return

    get_manager().ingest_document(document, role_id=role_id)

    # is this too often?
    from aleph.logic.collections import update_collection
    update_collection(document.collection)

    from aleph.logic.notifications import publish
    params = {'document': document, 'collection': document.collection}
    publish(Events.INGEST_DOCUMENT, actor_id=role_id, params=params)
Example 28
def generate_entity_references(entity):
    if entity.state != Entity.STATE_ACTIVE:
        return
    # This is all a bit hacky: we're re-generating all the entity
    # references for the given entity by effectively re-implementing
    # the RegexEntityAnalyzer. The alternative was to conduct a
    # search for potential matching documents, re-analyze them and
    # re-index them. This proved to be too slow in reality.

    log.info("Updating document references: %r", entity)
    rex = '|'.join(entity.regex_terms)
    rex = re.compile('( |^)(%s)( |$)' % rex)

    documents = defaultdict(int)
    try:
        for document_id, text in scan_entity_mentions(entity):
            text = normalize_strong(text)
            if text is None or len(text) <= 2:
                continue
            for match in rex.finditer(text):
                documents[document_id] += 1
    except Exception:
        log.exception('Failed to fully scan documents for entity refresh.')

    q = db.session.query(Reference)
    q = q.filter(Reference.entity_id == entity.id)
    q = q.filter(Reference.origin == 'regex')
    q.delete(synchronize_session='fetch')

    log.info("Re-matching %r gave %r documents.", entity,
             len(documents))

    for document_id, weight in documents.items():
        doc = Document.by_id(document_id)
        if doc is None:
            continue
        ref = Reference()
        ref.document_id = document_id
        ref.entity_id = entity.id
        ref.origin = 'regex'
        ref.weight = weight
        db.session.add(ref)

    db.session.commit()
    delete_entity_references(entity.id)
    update_entity_references(entity.id)
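
One caveat in the pattern construction: `'|'.join(entity.regex_terms)` interpolates the terms verbatim, so a term containing a regex metacharacter (say, "A. B. Corp.") silently changes the match semantics. Unless the terms are sanitized upstream, escaping each one is the safer construction, as in this sketch:

import re

# Escape each term so literal dots, parentheses, etc. cannot
# alter the alternation (assumes plain-string terms).
rex = '|'.join(re.escape(t) for t in entity.regex_terms)
rex = re.compile('( |^)(%s)( |$)' % rex)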
Example 29
def ingest_url(self, document_id, url):
    """Load the given URL into the document specified by document_id."""
    document = Document.by_id(document_id)
    if document is None:
        log.error("Could not find document: %s", document_id)
        return

    tmp_path = make_tempfile(document.file_name, suffix=document.extension)
    try:
        log.info("Ingesting URL: %s", url)
        res = requests.get(url, stream=True)
        if res.status_code >= 500:
            countdown = 3600 ** self.request.retries
            self.retry(countdown=countdown)
            return
        if res.status_code >= 400:
            document.status = Document.STATUS_FAIL
            document.error_message = "HTTP %s: %s" % (res.status_code, url)
            db.session.commit()
            return
        with open(tmp_path, 'wb') as fh:
            for chunk in res.iter_content(chunk_size=1024):
                if chunk:
                    fh.write(chunk)
        if not document.has_meta('source_url'):
            document.source_url = res.url
        if not document.foreign_id:
            document.foreign_id = res.url
        document.headers = res.headers
        document.content_hash = archive.archive_file(tmp_path)
        db.session.commit()
        get_manager().ingest_document(document)
    except IOError as ioe:
        log.info("IO Failure: %r", ioe)
        countdown = 3600 ** self.request.retries
        self.retry(countdown=countdown)
    except Exception as ex:
        document.status = Document.STATUS_FAIL
        document.error_type = type(ex).__name__
        document.error_message = six.text_type(ex)
        db.session.commit()
        log.exception(ex)
    finally:
        db.session.remove()
        remove_tempfile(tmp_path)
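
Note the retry schedule: `3600 ** self.request.retries` grows explosively, yielding a 1-second delay on the first retry, an hour on the second, and roughly 150 days on the third. If a bounded backoff was the intent, a capped multiplicative schedule is the more conventional shape; a sketch with made-up base and cap values:

# Capped exponential backoff; 60 and 3600 are illustrative,
# not values taken from the Aleph codebase.
countdown = min(3600, 60 * (2 ** self.request.retries))
self.retry(countdown=countdown)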
Example 30
def generate_entity_references(entity):
    if entity.state != Entity.STATE_ACTIVE:
        return
    # This is all a bit hacky: we're re-generating all the entity
    # references for the given entity by effectively re-implementing
    # the RegexEntityAnalyzer. The alternative was to conduct a
    # search for potential matching documents, re-analyze them and
    # re-index them. This proved to be too slow in reality.

    log.info("Updating document references: %r", entity)
    rex = '|'.join(entity.regex_terms)
    rex = re.compile('( |^)(%s)( |$)' % rex)

    documents = defaultdict(int)
    try:
        for document_id, text in scan_entity_mentions(entity):
            text = normalize_strong(text)
            if text is None or len(text) <= 2:
                continue
            for match in rex.finditer(text):
                documents[document_id] += 1
    except Exception:
        log.exception('Failed to fully scan documents for entity refresh.')

    q = db.session.query(Reference)
    q = q.filter(Reference.entity_id == entity.id)
    q = q.filter(Reference.origin == 'regex')
    q.delete(synchronize_session='fetch')

    log.info("Re-matching %r gave %r documents.", entity, len(documents))

    for document_id, weight in documents.items():
        doc = Document.by_id(document_id)
        if doc is None:
            continue
        ref = Reference()
        ref.document_id = document_id
        ref.entity_id = entity.id
        ref.origin = 'regex'
        ref.weight = weight
        db.session.add(ref)

    db.session.commit()
    delete_entity_references(entity.id)
    update_entity_references(entity.id)
Example 31
def analyze_document(document_id):
    document = Document.by_id(document_id)
    if document is None:
        log.info("Could not find document: %r", document_id)
        return
    log.info("Analyze document: %r", document)
    for cls in get_analyzers():
        try:
            cls().analyze(document, document.meta)
        except Exception as ex:
            log.exception(ex)
            process.exception(process.ANALYZE,
                              component=cls.__name__,
                              document_id=document.id,
                              meta=document.meta,
                              source_id=document.source_id,
                              exception=ex)
    index_document(document_id)
Example 32
def ingest(document_id, role_id=None):
    """Process a given document by extracting its contents.
    This may include creating or updating child documents."""
    document = Document.by_id(document_id)
    if document is None:
        log.error("Could not find document: %s", document_id)
        return

    get_manager().ingest_document(document, role_id=role_id)

    if document.collection.casefile:
        index_collection(document.collection)
        params = {
            'document': document,
            'collection': document.collection
        }
        publish(Events.INGEST_DOCUMENT,
                actor_id=role_id,
                params=params)
Example 33
def generate_entity_references(entity):
    if entity.state != Entity.STATE_ACTIVE:
        return

    rex = '|'.join(entity.regex_terms)
    rex = re.compile('( |^)(%s)( |$)' % rex)

    documents = defaultdict(int)
    try:
        for document_id, text in scan_entity_mentions(entity):
            text = normalize_strong(text)
            if text is None or len(text) <= 2:
                continue
            for match in rex.finditer(text):
                documents[document_id] += 1
    except Exception:
        log.exception('Failed to fully scan documents for entity refresh.')

    q = db.session.query(Reference)
    q = q.filter(Reference.entity_id == entity.id)
    q = q.filter(Reference.origin == 'regex')
    q.delete(synchronize_session='fetch')

    log.info("Re-matching %r gave %r documents.", entity,
             len(documents))

    for document_id, weight in documents.items():
        doc = Document.by_id(document_id)
        if doc is None:
            continue
        ref = Reference()
        ref.document_id = document_id
        ref.entity_id = entity.id
        ref.origin = 'regex'
        ref.weight = weight
        db.session.add(ref)

    db.session.commit()
    delete_entity_references(entity.id)
    q = db.session.query(func.distinct(Reference.document_id))
    q = q.filter(Reference.entity_id == entity.id)
    for document_id, in q:
        index_document(document_id, index_records=False)
Example 34
def generate_entity_references(entity):
    if entity.state != Entity.STATE_ACTIVE:
        return

    rex = '|'.join(entity.regex_terms)
    rex = re.compile('( |^)(%s)( |$)' % rex)

    documents = defaultdict(int)
    try:
        for document_id, text in scan_entity_mentions(entity):
            text = normalize_strong(text)
            if text is None or len(text) <= 2:
                continue
            for match in rex.finditer(text):
                documents[document_id] += 1
    except Exception:
        log.exception('Failed to fully scan documents for entity refresh.')

    q = db.session.query(Reference)
    q = q.filter(Reference.entity_id == entity.id)
    q = q.filter(Reference.origin == 'regex')
    q.delete(synchronize_session='fetch')

    log.info("Re-matching %r gave %r documents.", entity, len(documents))

    for document_id, weight in documents.items():
        doc = Document.by_id(document_id)
        if doc is None:
            continue
        ref = Reference()
        ref.document_id = document_id
        ref.entity_id = entity.id
        ref.origin = 'regex'
        ref.weight = weight
        db.session.add(ref)

    db.session.commit()
    delete_entity_references(entity.id)
    q = db.session.query(func.distinct(Reference.document_id))
    q = q.filter(Reference.entity_id == entity.id)
    for document_id, in q:
        index_document(document_id, index_records=False)
Example 35
def generate_entity_references(entity):
    # This is all a bit hacky: we're re-generating all the entity
    # references for the given entity by effectively re-implementing
    # the RegexEntityAnalyzer. The alternative was to conduct a
    # search for potential matching documents, re-analyze them and
    # re-index them. This proved to be too slow in reality.
    if entity.state != Entity.STATE_ACTIVE:
        entity.delete_references(origin='regex')
        return

    log.info("Updating document references: %r", entity)
    rex = '|'.join(entity.regex_terms)
    rex = re.compile('(%s)' % rex)

    documents = defaultdict(int)
    try:
        for document_id, text in scan_entity_mentions(entity):
            text = match_form(text)
            if text is None or len(text) <= 2:
                continue
            for match in rex.finditer(text):
                documents[document_id] += 1
    except Exception as ex:
        log.exception(ex)

    log.info("Re-matching %r gave %r documents.", entity, len(documents))

    entity.delete_references(origin='regex')
    for document_id, weight in documents.items():
        doc = Document.by_id(document_id)
        if doc is None:
            continue
        ref = Reference()
        ref.document_id = doc.id
        ref.entity_id = entity.id
        ref.origin = 'regex'
        ref.weight = weight
        db.session.add(ref)

    db.session.commit()
    delete_entity_references(entity.id)
    update_entity_references(entity)
Example 36
def analyze_document(document_id):
    document = Document.by_id(document_id)
    if document is None:
        log.info("Could not find document: %r", document_id)
        return
    log.info("Analyze document: %r", document)
    analyzers = []
    meta = document.meta
    for cls in get_analyzers():
        try:
            analyzer = cls(document, meta)
            analyzer.prepare()
            analyzers.append(analyzer)
        except Exception as ex:
            log.exception(ex)

    if document.type == Document.TYPE_TEXT:
        for page in document.pages:
            for analyzer in analyzers:
                analyzer.on_page(page)
            for text in page.text_parts():
                for analyzer in analyzers:
                    analyzer.on_text(text)

    if document.type == Document.TYPE_TABULAR:
        for record in document.records:
            for analyzer in analyzers:
                analyzer.on_record(record)
            for text in record.text_parts():
                for analyzer in analyzers:
                    analyzer.on_text(text)

    for analyzer in analyzers:
        try:
            analyzer.finalize()
        except Exception as ex:
            log.exception(ex)
    document.meta = meta
    db.session.add(document)
    db.session.commit()
    index_document(document_id)
Example 37
def analyze_document(document_id):
    document = Document.by_id(document_id)
    if document is None:
        log.info("Could not find document: %r", document_id)
        return
    log.info("Analyze document: %r", document)
    analyzers = []
    meta = document.meta
    for cls in get_analyzers():
        try:
            analyzer = cls(document, meta)
            analyzer.prepare()
            analyzers.append(analyzer)
        except Exception as ex:
            log.exception(ex)

    if document.type == Document.TYPE_TEXT:
        for page in document.pages:
            for analyzer in analyzers:
                analyzer.on_page(page)
            for text in page.text_parts():
                for analyzer in analyzers:
                    analyzer.on_text(text)

    if document.type == Document.TYPE_TABULAR:
        for record in document.records:
            for analyzer in analyzers:
                analyzer.on_record(record)
            for text in record.text_parts():
                for analyzer in analyzers:
                    analyzer.on_text(text)

    for analyzer in analyzers:
        try:
            analyzer.finalize()
        except Exception as ex:
            log.exception(ex)
    document.meta = meta
    db.session.add(document)
    db.session.commit()
    index_document(document_id)
Example 38
def index_document(document_id):
    document = Document.by_id(document_id)
    if document is None:
        log.info("Could not find document: %r", document_id)
        return
    log.info("Index document: %r", document)
    data = document.to_index_dict()
    data['entities'] = generate_entities(document)
    data['title_latin'] = latinize_text(data.get('title'))
    data['summary_latin'] = latinize_text(data.get('summary'))
    get_es().index(index=get_es_index(),
                   doc_type=TYPE_DOCUMENT,
                   body=data,
                   id=document.id)

    clear_records(document)
    bulk(get_es(),
         generate_records(document),
         stats_only=True,
         chunk_size=2000,
         request_timeout=60.0)
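
The `bulk()` helpers above consume generators such as `generate_records`, where each yielded item is one Elasticsearch bulk action. A plausible sketch of such a generator, with the doc-type constant and field names invented for illustration:

def generate_records(document):
    # Sketch only: one index action per tabular record.
    for record in document.records:
        yield {
            '_index': get_es_index(),
            '_type': TYPE_RECORD,       # hypothetical doc type constant
            '_id': record.id,
            '_source': record.to_index_dict(),
        }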
Example 39
def delete_entity(collection, entity, deleted_at=None, sync=False):
    # This is recursive and will also delete any entities which
    # reference the given entity. Usually this is going to be child
    # documents, or directorships referencing a person. It's a pretty
    # dangerous operation, though.
    entity_id = collection.ns.sign(entity.get("id"))
    for adjacent in index.iter_adjacent(entity):
        log.warning("Recursive delete: %r", adjacent)
        delete_entity(collection, adjacent, deleted_at=deleted_at, sync=sync)
    flush_notifications(entity_id, clazz=Entity)
    obj = Entity.by_id(entity_id, collection=collection)
    if obj is not None:
        obj.delete()
    doc = Document.by_id(entity_id, collection=collection)
    if doc is not None:
        doc.delete()
    index.delete_entity(entity_id, sync=sync)
    EntitySetItem.delete_by_entity(entity_id)
    Mapping.delete_by_table(entity_id)
    xref_index.delete_xref(collection, entity_id=entity_id, sync=sync)
    delete_aggregator_entity(collection, entity_id)
    refresh_entity(collection, entity_id)
Example 40
def index_document_id(document_id, index_records=True):
    document = Document.by_id(document_id)
    if document is None:
        log.info("Could not find document: %r", document_id)
        return
    index_document(document)
Example 41
def analyze_document_id(document_id):
    document = Document.by_id(document_id)
    if document is None:
        log.info("Could not find document: %r", document_id)
        return
    analyze_document(document)
Example 42
def process_document_id(document_id):
    """Perform post-ingest tasks like analysis and indexing."""
    analyze_document(Document.by_id(document_id))
Example 43
File: util.py Project: 01-/aleph
def get_document(document_id):
    document = Document.by_id(document_id)
    if document is None:
        raise NotFound()
    authz.require(authz.source_read(document.source_id))
    return document
Example 44
File: util.py Project: afcarl/aleph
def get_db_document(document_id, action=Authz.READ):
    document = obj_or_404(Document.by_id(document_id))
    require(request.authz.can(document.collection_id, action))
    return document