Example #1
0
def reingest_collection(collection, job_id=None, index=False, flush=True):
    """Queue a fresh ingest for every document in the collection.

    :param collection: the collection whose documents are re-ingested.
    :param job_id: identifier for the ingest job; a random one is
        generated when not supplied.
    :param index: whether the ingestors should also index the results.
    :param flush: when true, discard previous ingest results first.
    """
    if not job_id:
        job_id = Job.random_id()
    if flush:
        # Drop the results of any earlier ingest run before re-queuing.
        ingest_flush(collection)
    documents = Document.by_collection(collection.id)
    for doc in documents:
        doc_proxy = doc.to_proxy(ns=collection.ns)
        ingest_entity(collection, doc_proxy, job_id=job_id, index=index)
Example #2
0
def reingest_collection(collection, job_id=None, index=False):
    """Queue a fresh ingest for every document in the collection.

    Previously-derived ingest and analysis fragments are removed from
    the collection's aggregator before the documents are re-queued.

    :param collection: the collection whose documents are re-ingested.
    :param job_id: identifier for the ingest job; a random one is
        generated when not supplied.
    :param index: whether the ingestors should also index the results.
    """
    if not job_id:
        job_id = Job.random_id()
    # Clear out stale fragments produced by earlier ingest/analysis runs.
    aggregator = get_aggregator(collection)
    aggregator.delete(origin=OP_ANALYZE)
    aggregator.delete(origin=OP_INGEST)
    aggregator.close()
    documents = Document.by_collection(collection.id)
    for doc in documents:
        doc_proxy = doc.to_proxy(ns=collection.ns)
        ingest_entity(collection, doc_proxy, job_id=job_id, index=index)
Example #3
0
def generate_collection_docs(collection):
    """Yield index entries for every document in the collection.

    Documents are streamed in ascending ID order; a document that fails
    to generate is logged and skipped rather than aborting the run.
    """
    query = Document.by_collection(collection.id)
    query = query.order_by(Document.id.asc())
    for position, doc in enumerate(query.yield_per(BULK_PAGE)):
        try:
            log.info("Index [%s]: %s", doc.id, doc.name)
            for entry in generate_document(doc):
                yield entry
        except Exception:
            # Best-effort: log the failure and move on to the next doc.
            log.exception("Cannot index [%s]: %s", doc.id, doc.name)

        if position % 1000 == 0:
            # Periodically detach loaded ORM objects to bound memory use.
            db.session.expunge_all()
Example #4
0
def generate_collection_docs(collection):
    """Generate index entries for each document of the given collection.

    The query is paged via ``yield_per`` and ordered by document ID.
    Failures on individual documents are logged, not raised.
    """
    docs = Document.by_collection(collection.id).order_by(Document.id.asc())
    counter = 0
    for doc in docs.yield_per(BULK_PAGE):
        try:
            log.info("Index [%s]: %s", doc.id, doc.name)
            yield from generate_document(doc)
        except Exception:
            # Skip broken documents; the loop must survive single failures.
            log.exception("Cannot index [%s]: %s", doc.id, doc.name)

        if counter % 1000 == 0:
            # Keep the session small while streaming a large result set.
            db.session.expunge_all()
        counter += 1
Example #5
0
def aggregate_model(collection, aggregator):
    """Sync up the aggregator from the Aleph domain model.

    Writes a ``db`` fragment for every document and entity of the
    collection, replacing any fragments previously written with the
    model origin.
    """
    log.debug("[%s] Aggregating model...", collection)
    # Remove everything previously emitted from the domain model.
    aggregator.delete(origin=MODEL_ORIGIN)
    bulk = aggregator.bulk()
    for doc in Document.by_collection(collection.id):
        bulk.put(doc.to_proxy(ns=collection.ns),
                 fragment="db", origin=MODEL_ORIGIN)
    for ent in Entity.by_collection(collection.id):
        ent_proxy = ent.to_proxy()
        # Drop all existing fragments for the entity before re-writing it.
        aggregator.delete(entity_id=ent_proxy.id)
        bulk.put(ent_proxy, fragment="db", origin=MODEL_ORIGIN)
    bulk.flush()
Example #6
0
def _export_balkhash_collection(collection, retries=0, backoff=30, offset=0):
    """Export a collection's documents (and their records) to a balkhash
    dataset, retrying with exponential backoff on database errors.

    :param collection: collection to export.
    :param retries: number of retries already attempted (internal).
    :param backoff: seconds to wait before the next retry (internal).
    :param offset: document offset at which to resume (internal).
    """
    MAX_RETRIES = 5
    RETRY_BACKOFF_FACTOR = 2
    # Sentinel so the error handler can tell whether the dataset was
    # ever opened; previously a failure inside get_dataset() would make
    # `dataset.close()` in the except block raise NameError and mask
    # the original error.
    dataset = None
    try:
        from followthemoney import model
        dataset = get_dataset(collection.foreign_id)
        writer = dataset.bulk()
        docs_q = Document.by_collection(collection.id)
        docs_q = docs_q.order_by(Document.id.asc()).offset(offset)
        for doc in docs_q.yield_per(5000):
            log.debug("Export [%s:%s]: %s", doc.id, doc.schema, doc.name)
            dproxy = doc.to_proxy()
            writer.put(dproxy)
            if doc.supports_records:
                # Fix: the record query used to be bound to `q`,
                # shadowing the outer document query variable.
                records_q = db.session.query(DocumentRecord)
                records_q = records_q.filter(DocumentRecord.document_id == doc.id)
                for record in records_q.yield_per(100):
                    rproxy = record.to_proxy()
                    writer.put(rproxy)
                    # Attach the record's text to the parent document as
                    # a per-record fragment.
                    dpart = model.make_entity(doc.schema)
                    dpart.id = dproxy.id
                    dpart.add('indexText', list(record.texts))
                    writer.put(dpart, fragment=str(record.id))
            # Track progress so a retry resumes after the last document.
            offset += 1
        dataset.close()
    except DBAPIError as exc:
        if retries < MAX_RETRIES:
            log.debug("Error occurred: %s", exc)
            log.debug("Retrying in %s seconds", backoff)
            db.session.close()
            if dataset is not None:
                dataset.close()
            time.sleep(backoff)
            return _export_balkhash_collection(collection,
                                               retries + 1,
                                               backoff * RETRY_BACKOFF_FACTOR,
                                               offset)
        else:
            log.exception(exc)
Example #7
0
 def _proxies(collection):
     """Stream entity proxies for the collection: all entities first,
     then all documents, each paged via ``yield_per``."""
     entity_query = Entity.by_collection(collection.id)
     for row in entity_query.yield_per(5000):
         yield row.to_proxy()
     document_query = Document.by_collection(collection.id)
     for row in document_query.yield_per(5000):
         yield row.to_proxy()