def xref_item(stage, collection, entity_id=None, against_collection_ids=None):
    """Cross-reference an entity against others to generate potential matches.

    Besides ``entity_id``, up to 100 further tasks are pulled from ``stage``
    and their ``entity_id`` payload values are processed in the same batch.
    For each entity returned by the lookup, the existing ``Match`` rows for
    that entity are deleted and one new row is added per result of
    ``xref_query_item``.

    :param stage: task stage; extra tasks are read with ``get_tasks`` and
        acknowledged with ``mark_done``.
    :param collection: its ``id`` is stored as ``collection_id`` on each match.
    :param entity_id: ID of the entity this job was started for (optional).
    :param against_collection_ids: passed to ``xref_query_item`` as
        ``collection_ids``.
    """
    # This is running as a background job. In order to avoid running each
    # entity one by one, we do it 101 at a time. This avoids sending redundant
    # queries to the database and elasticsearch, making cross-ref much faster.
    tasks = list(stage.get_tasks(limit=100))
    # Acknowledge every task that was pulled, whether or not its payload
    # carries a usable entity ID.
    stage.mark_done(len(tasks))

    candidate_ids = [entity_id]
    candidate_ids.extend(task.payload.get('entity_id') for task in tasks)
    # ``entity_id`` defaults to None and a payload may lack the key; such
    # placeholders must not be sent to the entity lookup.
    entity_ids = [eid for eid in candidate_ids if eid is not None]
    if not entity_ids:
        return

    for data in entities_by_ids(entity_ids, includes=['schema', 'properties']):
        proxy = model.get_proxy(data)
        # Drop the matches stored by a previous run for this entity.
        dq = db.session.query(Match)
        dq = dq.filter(Match.entity_id == proxy.id)
        dq.delete()
        matches = xref_query_item(proxy, collection_ids=against_collection_ids)
        for (score, other_id, other) in matches:
            log.info("Xref [%.3f]: %s <=> %s", score, proxy, other)
            obj = Match()
            obj.entity_id = proxy.id
            obj.collection_id = collection.id
            obj.match_id = other.id
            obj.match_collection_id = other_id
            obj.score = score
            db.session.add(obj)
    # NOTE(review): the whole batch is committed once; confirm this matches
    # the intended transaction granularity.
    db.session.commit()
def xref_collection(collection_id):
    """Cross-reference every matchable entity and document of a collection.

    For each item yielded by ``iter_proxies`` the previously stored ``Match``
    rows are deleted and replaced with the results of ``xref_item``. Items
    whose schema names include ``Document.SCHEMA`` are keyed by
    ``document_id``; all others by ``entity_id``.
    """
    schemata = [schema.name for schema in model if schema.matchable]
    for entity in iter_proxies(collection_id=collection_id, schemata=schemata):
        proxy = model.get_proxy(entity)
        is_document = Document.SCHEMA in proxy.schema.names
        document_id = proxy.id if is_document else None
        entity_id = None if is_document else proxy.id

        # Clear the matches recorded for this item by an earlier run.
        stale = (db.session.query(Match)
                 .filter(Match.entity_id == entity_id)
                 .filter(Match.document_id == document_id))
        stale.delete()

        for score, other_collection_id, other in xref_item(proxy):
            log.info("Xref [%.1f]: %s <=> %s", score, proxy, other)
            match = Match()
            match.score = score
            match.match_id = other.id
            match.match_collection_id = other_collection_id
            match.collection_id = collection_id
            match.entity_id = entity_id
            match.document_id = document_id
            db.session.add(match)
        # One commit per processed item.
        db.session.commit()
def xref_item(stage, collection, entity_id=None, against_collection_ids=None):
    """Cross-reference one entity and store its potential matches.

    The entity is loaded with ``get_entity``, its existing ``Match`` rows are
    deleted, and one new row is added per result of ``xref_query_item``
    before committing. ``stage`` is accepted but not used by this function.
    """
    proxy = model.get_proxy(get_entity(entity_id))

    # Remove whatever an earlier run stored for this entity.
    db.session.query(Match).filter(Match.entity_id == proxy.id).delete()

    candidates = xref_query_item(proxy, collection_ids=against_collection_ids)
    for score, other_collection_id, other in candidates:
        log.info("Xref [%.3f]: %s <=> %s", score, proxy, other)
        match = Match()
        match.score = score
        match.match_id = other.id
        match.match_collection_id = other_collection_id
        match.collection_id = collection.id
        match.entity_id = proxy.id
        db.session.add(match)
    db.session.commit()
def xref_collection(collection_id, against_collection_ids=None):
    """Cross-reference every matchable entity of a collection.

    For each entity yielded by ``iter_proxies`` the stored ``Match`` rows are
    deleted and replaced with the results of ``xref_item``, which receives
    ``against_collection_ids`` as ``collection_ids``.
    """
    schemata = [schema.name for schema in model if schema.matchable]
    for entity in iter_proxies(collection_id=collection_id, schemata=schemata):
        proxy = model.get_proxy(entity)

        # Clear the matches recorded for this entity by an earlier run.
        db.session.query(Match).filter(Match.entity_id == proxy.id).delete()

        found = xref_item(proxy, collection_ids=against_collection_ids)
        for score, other_collection_id, other in found:
            log.info("Xref [%.3f]: %s <=> %s", score, proxy, other)
            match = Match()
            match.score = score
            match.match_id = other.id
            match.match_collection_id = other_collection_id
            match.collection_id = collection_id
            match.entity_id = proxy.id
            db.session.add(match)
        # One commit per processed entity.
        db.session.commit()
def _xref_item(item, collection_id=None):
    """Cross-reference an entity or document, given as an indexed document.

    Runs the query built by ``entity_query`` against the entities index
    (at most 10 hits) and replaces the stored ``Match`` rows for the item
    with one row per hit. Returns early, touching nothing, when the built
    query contains ``'match_none'``. When ``collection_id`` is given, only
    rows whose ``match_collection_id`` equals it are deleted.
    """
    name = item.get('name') or item.get('title')
    query = entity_query(item, collection_id=collection_id)
    if 'match_none' in query:
        return

    body = {
        'query': query,
        'size': 10,
        '_source': ['collection_id', 'name'],
    }
    response = search_safe(index=entities_index(), body=body)
    hits = response.get('hits').get('hits')

    # Documents are keyed by document_id, everything else by entity_id.
    is_document = Document.SCHEMA in item.get('schemata')
    item_id = item.get('id')
    document_id = item_id if is_document else None
    entity_id = None if is_document else item_id

    stale = (db.session.query(Match)
             .filter(Match.entity_id == entity_id)
             .filter(Match.document_id == document_id))
    if collection_id is not None:
        stale = stale.filter(Match.match_collection_id == collection_id)
    stale.delete()

    for hit in hits:
        source = hit.get('_source', {})
        log.info("Xref [%.1f]: %s <=> %s",
                 hit.get('_score'), name, source.get('name'))
        match = Match()
        match.entity_id = entity_id
        match.document_id = document_id
        match.collection_id = item.get('collection_id')
        match.match_id = hit.get('_id')
        match.match_collection_id = source.get('collection_id')
        match.score = hit.get('_score')
        db.session.add(match)
    db.session.commit()
def xref_item(item, collection_id=None):
    """Cross-reference an entity or document, given as an indexed document.

    Searches ``es_index`` for entities matching the query built by
    ``entity_query`` (at most 10 hits) and replaces the stored ``Match``
    rows for the item with one row per hit. When ``collection_id`` is given,
    only rows whose ``match_collection_id`` equals it are deleted.
    """
    name = item.get('name') or item.get('title')
    body = {
        'query': entity_query(item, collection_id),
        'size': 10,
        '_source': ['collection_id', 'name'],
    }
    response = es.search(index=es_index, doc_type=TYPE_ENTITY, body=body)
    hits = response.get('hits').get('hits')

    # Documents are keyed by document_id, everything else by entity_id.
    is_document = item.get('$type') == TYPE_DOCUMENT
    item_id = item.get('id')
    document_id = item_id if is_document else None
    entity_id = None if is_document else item_id

    stale = (db.session.query(Match)
             .filter(Match.entity_id == entity_id)
             .filter(Match.document_id == document_id))
    if collection_id is not None:
        stale = stale.filter(Match.match_collection_id == collection_id)
    stale.delete()

    for hit in hits:
        source = hit.get('_source', {})
        log.info("Xref [%.1f]: %s <=> %s",
                 hit.get('_score'), name, source.get('name'))
        match = Match()
        match.entity_id = entity_id
        match.document_id = document_id
        match.collection_id = item.get('collection_id')
        match.match_id = hit.get('_id')
        match.match_collection_id = source.get('collection_id')
        match.score = hit.get('_score')
        db.session.add(match)
    db.session.commit()
def xref_collection(queue, collection, against_collection_ids=None):
    """Cross-reference every matchable entity of a collection.

    The number of matchable entities is registered as pending on
    ``queue.progress``. For each entity the stored ``Match`` rows are deleted
    and replaced with the results of ``xref_item``, which receives
    ``against_collection_ids`` as ``collection_ids``. The queue is removed
    once all entities have been handled.
    """
    schemata = [schema.name for schema in model if schema.matchable]
    total = count_entities(collection_id=collection.id, schemata=schemata)
    queue.progress.mark_pending(total)

    for entity in iter_proxies(collection_id=collection.id, schemata=schemata):
        proxy = model.get_proxy(entity)

        # Clear the matches recorded for this entity by an earlier run.
        db.session.query(Match).filter(Match.entity_id == proxy.id).delete()

        found = xref_item(proxy, collection_ids=against_collection_ids)
        for score, other_collection_id, other in found:
            log.info("Xref [%.3f]: %s <=> %s", score, proxy, other)
            match = Match()
            match.score = score
            match.match_id = other.id
            match.match_collection_id = other_collection_id
            match.collection_id = collection.id
            match.entity_id = proxy.id
            db.session.add(match)
        # NOTE(review): commit and progress tick are taken to be per entity,
        # balancing mark_pending(total) above - confirm against history.
        db.session.commit()
        queue.progress.mark_finished()
    queue.remove()