def _query_item(entity):
    """Yield scored cross-reference candidates for ``entity``.

    A match query is built for the entity and run against the index
    returned by ``entities_read_index`` for the entity's matchable
    schemata. Each hit that can be unpacked is turned into a proxy and
    compared against the entity; candidates scoring at least
    ``SCORE_CUTOFF`` are yielded as
    ``(score, entity, collection_id, match)`` tuples.

    Nothing is yielded when ``match_query`` produces the "none" query.
    """
    matcher = match_query(entity)
    if matcher == none_query():
        return

    body = {
        "query": matcher,
        "size": 100,
        "_source": {"includes": PROXY_INCLUDES},
    }
    schemata = list(entity.schema.matchable_schemata)
    index = entities_read_index(schema=schemata)
    response = es.search(index=index, body=body)

    for hit in response.get("hits").get("hits"):
        data = unpack_result(hit)
        if data is None:
            # Hits that cannot be unpacked are skipped.
            continue
        match = model.get_proxy(data)
        score = compare(model, entity, match)
        if score >= SCORE_CUTOFF:
            log.debug(
                "Match: %s <[%.2f]> %s",
                entity.caption,
                score,
                match.caption,
            )
            yield score, entity, data.get("collection_id"), match
def match_query(proxy, source_collection_id=None, collection_ids=None,
                query=None):
    """Build a bool query that finds entities similar to ``proxy``.

    :param proxy: entity proxy whose property values drive the query.
    :param source_collection_id: if given, entities from this collection
        are excluded from the results.
    :param collection_ids: optional collection id (or list of ids) the
        results are restricted to.
    :param query: optional pre-built bool query to extend; a fresh one
        from ``bool_query()`` is used when omitted.
    :return: the extended bool query, or ``none_query()`` when the proxy
        has no value of a ``REQUIRED`` type to match on.
    """
    if query is None:
        query = bool_query()

    # Never match the query entity itself, nor its source collection.
    if proxy.id is not None:
        query['bool']['must_not'].append({"ids": {"values": [proxy.id]}})
    if source_collection_id is not None:
        query['bool']['must_not'].append(
            {'term': {'collection_id': source_collection_id}}
        )

    collection_ids = ensure_list(collection_ids)
    if collection_ids:
        query['bool']['filter'].append(
            {'terms': {'collection_id': collection_ids}}
        )

    # Keep only values with a positive specificity, most specific first.
    weighted = (
        (prop, value, prop.specificity(value))
        for prop, value in proxy.itervalues()
    )
    ranked = sorted(
        (item for item in weighted if item[2] > 0),
        key=lambda item: item[2],
        reverse=True,
    )

    required = []
    for prop, value, specificity in ranked:
        if prop.type not in REQUIRED:
            continue
        if len(required) <= MAX_CLAUSES:
            required.extend(_make_queries(prop, value, specificity))

    scoring = []
    for prop, value, specificity in ranked:
        if prop.type in REQUIRED:
            continue
        # The clause budget is shared between required and scoring clauses.
        if len(required) + len(scoring) <= MAX_CLAUSES:
            scoring.extend(_make_queries(prop, value, specificity))

    if not required:
        # e.g. a document from which no features have been extracted.
        return none_query()

    # At least one of the required clauses has to match.
    at_least_one = {'bool': {'should': required, 'minimum_should_match': 1}}
    query['bool']['must'].append(at_least_one)
    query['bool']['should'].extend(scoring)
    return query
def match_query(proxy, collection_ids=None, query=None):
    """Build a bool query that finds entities similar to ``proxy``.

    :param proxy: entity proxy whose property values drive the query.
    :param collection_ids: optional collection id (or list of ids) the
        results are restricted to.
    :param query: optional pre-built bool query to extend; a fresh one
        from ``bool_query()`` is used when omitted.
    :return: the extended bool query, or ``none_query()`` when the proxy
        has no value of a ``REQUIRED`` type to match on.
    """
    if query is None:
        query = bool_query()

    # Don't match the query entity:
    if proxy.id is not None:
        query['bool']['must_not'].append({"ids": {"values": [proxy.id]}})

    collection_ids = ensure_list(collection_ids)
    if collection_ids:
        query['bool']['filter'].append({
            'terms': {'collection_id': collection_ids}
        })

    # Keep only values with a positive specificity, most specific first.
    filters = []
    for prop, value in proxy.itervalues():
        specificity = prop.specificity(value)
        if specificity > 0:
            filters.append((prop, value, specificity))
    filters.sort(key=lambda p: p[2], reverse=True)

    required = []
    for prop, value, specificity in filters:
        if prop.type in REQUIRED and len(required) <= MAX_CLAUSES:
            required.extend(_make_queries(prop, value, specificity))

    scoring = []
    for prop, value, specificity in filters:
        # The clause budget is shared between required and scoring clauses.
        clauses = len(required) + len(scoring)
        if prop.type not in REQUIRED and clauses <= MAX_CLAUSES:
            scoring.extend(_make_queries(prop, value, specificity))

    if not required:
        # e.g. a document from which no features have been extracted.
        return none_query()

    # Make it mandatory to have at least one match. ``required`` is already
    # a list of clauses, so it is passed as-is: wrapping it in another list
    # would send a nested array as the ``should`` value.
    query['bool']['must'].append({
        'bool': {
            'should': required,
            'minimum_should_match': 1
        }
    })
    query['bool']['should'].extend(scoring)
    return query
def _query_item(entity, entitysets=True):
    """Yield every scored cross-reference candidate for ``entity``.

    A match query is built for the entity and run against the index
    returned by ``entities_read_index`` for the entity's matchable
    schemata. Each hit that can be unpacked is compared against the
    entity and yielded as
    ``(score, entity, collection_id, match, entityset_ids)``; no score
    cutoff is applied here.

    :param entity: the entity proxy to cross-reference.
    :param entitysets: when true, the ids returned by
        ``EntitySet.entity_entitysets`` for the entity are attached to
        each result; otherwise an empty list is attached.
    """
    matcher = match_query(entity)
    if matcher == none_query():
        return

    log.debug("Candidate [%s]: %s", entity.schema.name, entity.caption)
    if entitysets:
        entityset_ids = EntitySet.entity_entitysets(entity.id)
    else:
        entityset_ids = []

    body = {"query": matcher, "size": 50, "_source": ENTITY_SOURCE}
    schemata = list(entity.schema.matchable_schemata)
    index = entities_read_index(schema=schemata)
    response = es.search(index=index, body=body)

    for hit in response.get("hits").get("hits"):
        data = unpack_result(hit)
        if data is None:
            # Hits that cannot be unpacked are skipped.
            continue
        match = model.get_proxy(data)
        score = compare(model, entity, match)
        log.debug(
            "Match: %s <[%.2f]> %s", entity.caption, score, match.caption
        )
        yield score, entity, data.get("collection_id"), match, entityset_ids
def xref_item(proxy):
    """Yield ``(score, collection_id, other)`` candidates for ``proxy``.

    A match query is built for the proxy and executed through
    ``search_safe`` against ``entities_index()``. Every hit that can be
    unpacked is turned into a proxy and compared against the input; no
    score cutoff is applied. Nothing is yielded when ``match_query``
    produces the "none" query.
    """
    matcher = match_query(proxy)
    if matcher == none_query():
        return

    body = {
        'query': matcher,
        'size': 100,
        '_source': {'includes': ['schema', 'properties', 'collection_id']},
    }
    response = search_safe(index=entities_index(), body=body)

    for hit in response.get('hits').get('hits'):
        data = unpack_result(hit)
        if data is None:
            # Hits that cannot be unpacked are skipped.
            continue
        other = model.get_proxy(data)
        score = compare(model, proxy, other)
        yield score, data.get('collection_id'), other
def _query_item(collection, entity):
    """Yield scored cross-reference candidates for ``entity``.

    A match query is built for the entity and run against the index
    returned by ``entities_read_index`` for the entity's matchable
    schemata. Hits that can be unpacked are compared against the entity
    and those scoring at least ``SCORE_CUTOFF`` are yielded as
    ``(score, entity, collection_id, match)`` tuples.

    :param collection: not used by this function; kept in the signature
        for its callers.
    :param entity: the entity proxy to cross-reference.
    """
    matcher = match_query(entity)
    if matcher == none_query():
        return

    body = {
        'query': matcher,
        'size': 100,
        '_source': {'includes': INCLUDES},
    }
    schemata = list(entity.schema.matchable_schemata)
    index = entities_read_index(schema=schemata)
    response = es.search(index=index, body=body)

    for hit in response.get('hits').get('hits'):
        data = unpack_result(hit)
        if data is None:
            # Hits that cannot be unpacked are skipped.
            continue
        match = model.get_proxy(data)
        score = compare(model, entity, match)
        if score >= SCORE_CUTOFF:
            yield score, entity, data.get('collection_id'), match
def xref_item(proxy, collection_ids=None):
    """Yield ``(score, collection_id, other)`` candidates for ``proxy``.

    A match query, optionally restricted to ``collection_ids``, is built
    for the proxy and run against the index returned by
    ``entities_read_index`` for the proxy's matchable schemata. Hits
    that can be unpacked are compared against the input and only those
    scoring at least ``SCORE_CUTOFF`` are yielded.

    :param proxy: the entity proxy to cross-reference.
    :param collection_ids: passed through to ``match_query`` to limit
        the collections searched.
    """
    matcher = match_query(proxy, collection_ids=collection_ids)
    if matcher == none_query():
        return

    body = {
        'query': matcher,
        'size': 100,
        '_source': {'includes': ['schema', 'properties', 'collection_id']},
    }
    schemata = list(proxy.schema.matchable_schemata)
    index = entities_read_index(schema=schemata)
    response = es.search(index=index, body=body)

    for hit in response.get('hits').get('hits'):
        data = unpack_result(hit)
        if data is None:
            # Hits that cannot be unpacked are skipped.
            continue
        other = model.get_proxy(data)
        score = compare(model, proxy, other)
        if score >= SCORE_CUTOFF:
            yield score, data.get('collection_id'), other
def match_query(proxy, collection_ids=None, query=None):
    """Build a bool query that finds entities similar to ``proxy``.

    :param proxy: entity proxy whose property values drive the query.
    :param collection_ids: optional collection id (or list of ids) the
        results are restricted to.
    :param query: optional pre-built bool query to extend; a fresh one
        from ``bool_query()`` is used when omitted.
    :return: the extended bool query, or ``none_query()`` when the proxy
        has no value of a ``REQUIRED`` type to match on.
    """
    if query is None:
        query = bool_query()

    # Don't match the query entity:
    if proxy.id is not None:
        query['bool']['must_not'].append({"ids": {"values": [proxy.id]}})

    collection_ids = ensure_list(collection_ids)
    if collection_ids:
        query['bool']['filter'].append({
            'terms': {'collection_id': collection_ids}
        })

    # Clauses for REQUIRED property types decide whether a candidate
    # matches at all; all other clauses only go into the outer ``should``.
    required = []
    scoring = []
    for prop, value in proxy.itervalues():
        target = required if prop.type in REQUIRED else scoring
        target.extend(_make_queries(prop, value))

    if not required:
        # e.g. a document from which no features have been extracted.
        return none_query()

    # Make it mandatory to have at least one match. ``required`` is already
    # a list of clauses, so it is passed as-is: wrapping it in another list
    # would send a nested array as the ``should`` value.
    query['bool']['must'].append({
        'bool': {
            'should': required,
            'minimum_should_match': 1
        }
    })
    query['bool']['should'].extend(scoring)
    return query
def match_query(proxy, collection_ids=None, query=None):
    """Build a bool query that finds entities similar to ``proxy``.

    Candidates must share at least one name (full-text or fingerprint
    match) or one value of a strong, grouped property type with the
    proxy.

    :param proxy: entity proxy to find matches for.
    :param collection_ids: optional collection id (or list of ids) the
        results are restricted to.
    :param query: optional pre-built bool query to extend; a fresh one
        from ``bool_query()`` is used when omitted.
    :return: the extended bool query, or ``none_query()`` when the proxy
        has no matchable schemata or no usable names/identifiers.
    """
    if query is None:
        query = bool_query()

    # Don't match the query entity:
    if proxy.id is not None:
        query['bool']['must_not'].append({"ids": {"values": [proxy.id]}})

    # Attempt to find only matches within the "matchable" set of
    # entity schemata. For example, a Company can be matched to
    # another company or a LegalEntity, but not a Person.
    # Real estate is "unmatchable", i.e. even if two plots of land
    # have almost the same name and criteria, it does not make
    # sense to suggest they are the same.
    if proxy.schema.name != Entity.THING:
        matchable = [s.name for s in proxy.schema.matchable_schemata]
        if not matchable:
            return none_query()
        query['bool']['must'].append({"terms": {"schema": matchable}})

    collection_ids = ensure_list(collection_ids)
    if collection_ids:
        query['bool']['must'].append(
            {'terms': {'collection_id': collection_ids}}
        )

    required = []
    for name in proxy.names:
        name_clause = {
            'query': name,
            'operator': 'and',
            'minimum_should_match': '60%',
        }
        required.append({'match': {'names.text': name_clause}})

        fp = fingerprints.generate(name)
        if fp is None:
            continue
        fingerprint_clause = {
            'query': fp,
            'fuzziness': 1,
            'operator': 'and',
            'boost': 3.0,
        }
        required.append({'match': {'fingerprints': fingerprint_clause}})

    # Exact, boosted term clauses for every strong type that has a group.
    for type_ in registry.types:
        if type_.strong and type_.group is not None:
            required.extend(
                {'term': {type_.group: {'value': value, 'boost': 3.0}}}
                for value in proxy.get_type_values(type_)
            )

    if not required:
        # e.g. a document from which no features have been extracted.
        return none_query()

    # At least one of the required clauses has to match.
    at_least_one = {"bool": {"should": required, "minimum_should_match": 1}}
    query['bool']['must'].append(at_least_one)
    return query