def xref_collection(stage, collection):
    """Regenerate the cross-reference matches for a whole collection.

    The steps run strictly in this order:

    1. ``delete_xref`` is called for the collection with ``sync=True``.
    2. ``delete_entities`` is called for ``collection.id`` restricted to
       ``origin=ORIGIN``, also with ``sync=True``.
    3. Matches are indexed from the entity query, then from the mention
       query.
    4. ``reindex_collection`` is requested with ``sync=False``.

    ``stage`` is accepted as part of the signature but is not used here.
    """
    # Clear out whatever an earlier run left behind before indexing anew.
    delete_xref(collection, sync=True)
    delete_entities(collection.id, origin=ORIGIN, sync=True)

    # Entities first, mentions second; each query is only built right
    # before its results are handed to the indexer.
    for build_query in (_query_entities, _query_mentions):
        index_matches(collection, build_query(collection))

    reindex_collection(collection, sync=False)
def xref_entity(collection, proxy):
    """Generate cross-reference matches for one proxy within a collection.

    Nothing happens (and ``None`` is returned) when the proxy's schema is
    not flagged as ``matchable``. Otherwise the xref entries recorded for
    this entity id are deleted with ``sync=True`` and matches produced by
    ``_query_item(proxy)`` are indexed for the collection.
    """
    if proxy.schema.matchable:
        entity_id = proxy.id
        log.info("[%s] Generating xref: %s...", collection, entity_id)
        # Remove this entity's earlier results before indexing new ones.
        delete_xref(collection, entity_id=entity_id, sync=True)
        index_matches(collection, _query_item(proxy))
def xref_collection(stage, collection):
    """Regenerate the cross-reference matches for a whole collection.

    Logs progress, deletes the collection's existing xref results and the
    entities with ``origin=ORIGIN`` (both with ``sync=True``), indexes
    matches from the entity query and then from the mention query, and
    finally requests a re-index of the collection with ``sync=False``.

    ``stage`` is accepted as part of the signature but is not used here.
    """
    log.info("[%s] Clearing previous xref state....", collection)
    delete_xref(collection, sync=True)
    delete_entities(collection.id, origin=ORIGIN, sync=True)

    # Entities first, mentions second; each query is only built right
    # before its results are handed to the indexer.
    for build_query in (_query_entities, _query_mentions):
        index_matches(collection, build_query(collection))

    log.info("[%s] Xref done, re-indexing to reify mentions...", collection)
    reindex_collection(collection, sync=False)
def xref_item(stage, collection, entity_id=None, batch=50):
    """Cross-reference an entity against others to generate potential matches.

    Besides ``entity_id`` itself, further tasks are requested from
    ``stage`` (``limit=batch``) and the ``"entity_id"`` value of each task
    payload is processed in the same pass. All ids are given to
    ``_query_matches`` in a single call, the resulting matches are indexed
    with ``sync=False``, and the additionally fetched tasks are reported
    to the stage as done.
    """
    # This runs as a background job. Rather than handling entities one at
    # a time, the ids of further queued tasks are collected so the whole
    # batch is resolved together; this avoids redundant queries to the
    # database and elasticsearch and makes cross-referencing much faster.
    queued_ids = [
        task.payload.get("entity_id") for task in stage.get_tasks(limit=batch)
    ]
    # NOTE(review): with the default entity_id=None the list starts with
    # None, and a payload lacking "entity_id" also contributes None --
    # confirm that _query_matches tolerates such entries.
    entity_ids = [entity_id] + queued_ids

    matches = _query_matches(collection, entity_ids)
    index.index_matches(collection, matches, sync=False)

    # Only the tasks fetched above are marked done here; the task that
    # carried ``entity_id`` is not included in this count.
    stage.mark_done(len(queued_ids))