Esempio n. 1
0
def xref_collection(stage, collection):
    """Cross-reference all the entities and documents in a collection."""
    delete_xref(collection, sync=True)
    delete_entities(collection.id, origin=ORIGIN, sync=True)
    index_matches(collection, _query_entities(collection))
    index_matches(collection, _query_mentions(collection))
    reindex_collection(collection, sync=False)
Esempio n. 2
0
def xref_entity(collection, proxy):
    """Cross-reference a single proxy in the context of a collection."""
    if not proxy.schema.matchable:
        return
    log.info("[%s] Generating xref: %s...", collection, proxy.id)
    delete_xref(collection, entity_id=proxy.id, sync=True)
    index_matches(collection, _query_item(proxy))
Esempio n. 3
0
def xref_collection(stage, collection):
    """Cross-reference all the entities and documents in a collection."""
    log.info("[%s] Clearing previous xref state....", collection)
    delete_xref(collection, sync=True)
    delete_entities(collection.id, origin=ORIGIN, sync=True)
    index_matches(collection, _query_entities(collection))
    index_matches(collection, _query_mentions(collection))
    log.info("[%s] Xref done, re-indexing to reify mentions...", collection)
    reindex_collection(collection, sync=False)
Esempio n. 4
0
def xref_item(stage, collection, entity_id=None, batch=50):
    "Cross-reference an entity against others to generate potential matches."
    entity_ids = [entity_id]
    # This is running as a background job. In order to avoid running each
    # entity one by one, we do it 101 at a time. This avoids sending redudant
    # queries to the database and elasticsearch, making cross-ref much faster.
    for task in stage.get_tasks(limit=batch):
        entity_ids.append(task.payload.get("entity_id"))
    matches = _query_matches(collection, entity_ids)
    index.index_matches(collection, matches, sync=False)
    stage.mark_done(len(entity_ids) - 1)