def test_compare_countries(self):
    """A self-match with a nationality must outscore a name-only match."""
    name_only = {"schema": "Person", "properties": {"name": ["Frank Banana"]}}
    with_country = {
        "schema": "Person",
        "properties": {"name": ["Frank Banana"], "nationality": ["ie"]},
    }
    # Baseline: identical name, but the right-hand side carries no nationality.
    baseline = compare(model, with_country, model.get_proxy(name_only))
    self.assertGreater(compare(model, with_country, with_country), baseline)
def test_compare_quality(self):
    """Check that degraded copies of ENTITY score below a self-comparison."""
    best_score = compare(model, ENTITY, ENTITY)

    # Variant 1: remove two properties from the copy.
    reduced = deepcopy(ENTITY)
    reduced['properties'].pop('birthDate')
    reduced['properties'].pop('idNumber')
    self.assertLess(compare(model, ENTITY, reduced), best_score)

    # Variant 2: replace the name. Property values are expressed as lists
    # everywhere else in these tests, so use a one-element list rather than
    # a bare string to keep the entity dict well-formed.
    reduced = deepcopy(ENTITY)
    reduced['properties']['name'] = ['Frank Banana']
    self.assertLess(compare(model, ENTITY, reduced), best_score)
def test_compare_basic(self):
    """Sanity-check scores for a self-match, an unrelated schema and a reduced copy."""
    self_score = compare(ENTITY, ENTITY)
    assert self_score > 0.5, self_score

    # An entity with no properties scores zero, even against itself.
    other = {'id': 'bla', 'schema': 'RealEstate'}
    assert compare(ENTITY, other) == 0
    assert compare(other, other) == 0

    # Removing a property from the copy must lower the score.
    without_birth_date = deepcopy(ENTITY)
    del without_birth_date['properties']['birthDate']
    assert compare(ENTITY, without_birth_date) < self_score
def test_compare_quality(self):
    """Degraded copies of ENTITY must score below the self-comparison."""
    top_score = compare(model, ENTITY, ENTITY)

    # Copy with two properties removed.
    stripped = deepcopy(ENTITY)
    for prop_name in ("birthDate", "idNumber"):
        stripped["properties"].pop(prop_name)
    self.assertLess(compare(model, ENTITY, stripped), top_score)

    # Copy with the name replaced.
    renamed = deepcopy(ENTITY)
    renamed["properties"]["name"] = ["Frank Banana"]
    self.assertLess(compare(model, ENTITY, renamed), top_score)
def test_compare_basic(self):
    """Check the self-match score and that property-less entities score zero."""
    best_score = compare(model, ENTITY, ENTITY)
    assert best_score > 0.5, best_score

    # The score is a float, so compare against zero with a tolerance
    # (assertAlmostEqual) instead of exact equality.
    comp = {'schema': 'RealEstate'}
    self.assertAlmostEqual(compare(model, ENTITY, comp), 0)
    self.assertAlmostEqual(compare(model, comp, comp), 0)

    comp = {'schema': 'Person'}
    self.assertAlmostEqual(compare(model, ENTITY, comp), 0)

    comp = {'schema': 'LegalEntity'}
    self.assertAlmostEqual(compare(model, ENTITY, comp), 0)
def test_compare_basic(self):
    """Self-match scores above 0.5; entities without properties score ~0."""
    self_score = compare(model, ENTITY, ENTITY)
    assert self_score > 0.5, self_score

    real_estate = {"schema": "RealEstate"}
    self.assertAlmostEqual(compare(model, ENTITY, real_estate), 0)
    self.assertAlmostEqual(compare(model, real_estate, real_estate), 0)

    # Remaining schemata are only compared against ENTITY.
    for schema_name in ("Person", "LegalEntity"):
        bare = {"schema": schema_name}
        self.assertAlmostEqual(compare(model, ENTITY, bare), 0)
def test_compare_countries(self):
    """Matching on name and nationality must beat matching on name alone."""
    person = {
        'schema': 'Person',
        'properties': {
            'name': ['Frank Banana'],
            'nationality': ['ie'],
        },
    }
    # Same person, but without the nationality property.
    stateless = model.get_proxy({
        'schema': 'Person',
        'properties': {'name': ['Frank Banana']},
    })
    name_only_score = compare(model, person, stateless)
    full_score = compare(model, person, person)
    self.assertGreater(full_score, name_only_score)
def test_compare_quality(self):
    """Proxies built from degraded copies of ENTITY must score below a self-match."""
    full = model.get_proxy(ENTITY)
    top_score = compare(model, full, full)

    # Copy with two properties removed.
    stripped = deepcopy(ENTITY)
    for prop_name in ("birthDate", "idNumber"):
        stripped["properties"].pop(prop_name)
    self.assertLess(compare(model, full, model.get_proxy(stripped)), top_score)

    # Copy with the name replaced.
    renamed = deepcopy(ENTITY)
    renamed["properties"]["name"] = ["Frank Banana"]
    self.assertLess(compare(model, full, model.get_proxy(renamed)), top_score)
def test_compare_basic(self):
    """Self-match scores above 0.5; property-less proxies score ~0."""
    subject = model.get_proxy(ENTITY)
    self_score = compare(model, subject, subject)
    assert self_score > 0.5, self_score

    real_estate = model.get_proxy({"schema": "RealEstate"})
    self.assertAlmostEqual(compare(model, subject, real_estate), 0)
    self.assertAlmostEqual(compare(model, real_estate, real_estate), 0)

    # Remaining schemata are only compared against the subject.
    for schema_name in ("Person", "LegalEntity"):
        bare = model.get_proxy({"schema": schema_name})
        self.assertAlmostEqual(compare(model, subject, bare), 0)
def _query_item(entity):
    """Search the index for candidates matching ``entity`` and score them.

    Yields ``(score, entity, collection_id, match)`` tuples for every hit
    whose score is at least ``SCORE_CUTOFF``. Yields nothing when no match
    query can be built for the entity.
    """
    matcher = match_query(entity)
    if matcher == none_query():
        return
    body = {
        "query": matcher,
        "size": 100,
        "_source": {"includes": PROXY_INCLUDES},
    }
    # Only search the indexes of schemata the entity can be matched against.
    schemata = list(entity.schema.matchable_schemata)
    index = entities_read_index(schema=schemata)
    response = es.search(index=index, body=body)
    for hit in response.get("hits").get("hits"):
        doc = unpack_result(hit)
        if doc is None:
            continue
        match = model.get_proxy(doc)
        score = compare(model, entity, match)
        if score >= SCORE_CUTOFF:
            log.debug("Match: %s <[%.2f]> %s", entity.caption, score, match.caption)
            yield score, entity, doc.get("collection_id"), match
def similar(profile_id):
    """
    ---
    get:
      summary: Get similar entities
      description: >
        Get a list of similar entities to the profile with id `profile_id`
      parameters:
      - in: path
        name: profile_id
        required: true
        schema:
          type: string
      - in: query
        name: 'filter:schema'
        schema:
          items:
            type: string
          type: array
      - in: query
        name: 'filter:schemata'
        schema:
          items:
            type: string
          type: array
      responses:
        '200':
          description: Returns a list of entities
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/EntitiesResponse'
      tags:
      - Profile
    """
    # NOTE: the docstring above looks like a machine-readable OpenAPI
    # operation description; keep it valid YAML when editing.
    # enable_cache()
    profile = obj_or_404(get_profile(profile_id, authz=request.authz))
    collection_id = profile.get("collection_id")
    require(request.authz.can(collection_id, request.authz.READ))
    tag_request(collection_id=collection_id)

    # Entities already part of the profile are excluded from the search.
    member_ids = [member["entity_id"] for member in profile["items"]]
    merged = profile["merged"]
    result = MatchQuery.handle(request, entity=merged, exclude=member_ids)

    # Wrap every hit with its score; no judgement is attached here.
    candidates = list(result.results)
    result.results = []
    for candidate in candidates:
        result.results.append(
            {
                "score": compare(model, merged, candidate),
                "judgement": Judgement.NO_JUDGEMENT,
                "collection_id": collection_id,
                "entity": candidate,
            }
        )
    return SimilarSerializer.jsonify_result(result)
def similar(entity_id):
    """
    ---
    get:
      summary: Get similar entities
      description: >
        Get a list of similar entities to the entity with id `entity_id`
      parameters:
      - in: path
        name: entity_id
        required: true
        schema:
          type: string
      - in: query
        name: 'filter:schema'
        schema:
          items:
            type: string
          type: array
      - in: query
        name: 'filter:schemata'
        schema:
          items:
            type: string
          type: array
      responses:
        '200':
          description: Returns a list of scored and judged entities
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/SimilarResponse'
      tags:
      - Entity
    """
    # NOTE: the docstring above looks like a machine-readable OpenAPI
    # operation description; keep it valid YAML when editing.
    # enable_cache()
    entity = get_index_entity(entity_id, request.authz.READ)
    collection_id = entity.get("collection_id")
    tag_request(collection_id=collection_id)

    proxy = model.get_proxy(entity)
    result = MatchQuery.handle(request, entity=proxy)
    candidates = list(result.results)

    # Look up existing judgements for every (entity, candidate) pair at once.
    pairs = [(entity_id, candidate.get("id")) for candidate in candidates]
    judgements = pairwise_judgements(pairs, collection_id)

    result.results = []
    for candidate in candidates:
        result.results.append(
            {
                "score": compare(model, proxy, candidate),
                "judgement": judgements.get((entity_id, candidate.get("id"))),
                "collection_id": collection_id,
                "entity": candidate,
            }
        )
    return SimilarSerializer.jsonify_result(result)
def xref_item(proxy):
    """Search the entity index for candidates matching ``proxy`` and score them.

    Yields ``(score, collection_id, other)`` tuples for every hit that can
    be unpacked. Yields nothing when no match query can be built.
    """
    matcher = match_query(proxy)
    if matcher == none_query():
        return
    body = {
        'query': matcher,
        'size': 100,
        '_source': {'includes': ['schema', 'properties', 'collection_id']},
    }
    response = search_safe(index=entities_index(), body=body)
    for hit in response.get('hits').get('hits'):
        doc = unpack_result(hit)
        if doc is None:
            continue
        other = model.get_proxy(doc)
        yield compare(model, proxy, other), doc.get('collection_id'), other
def _query_item(entity, entitysets=True):
    """Search the index for candidates matching ``entity`` and score them.

    Yields ``(score, entity, collection_id, match, entityset_ids)`` tuples
    for every hit that can be unpacked. ``entityset_ids`` is looked up once
    for ``entity.id`` when ``entitysets`` is true and is an empty list
    otherwise. Yields nothing when no match query can be built.
    """
    matcher = match_query(entity)
    if matcher == none_query():
        return
    log.debug("Candidate [%s]: %s", entity.schema.name, entity.caption)
    if entitysets:
        entityset_ids = EntitySet.entity_entitysets(entity.id)
    else:
        entityset_ids = []
    body = {"query": matcher, "size": 50, "_source": ENTITY_SOURCE}
    # Only search the indexes of schemata the entity can be matched against.
    schemata = list(entity.schema.matchable_schemata)
    index = entities_read_index(schema=schemata)
    response = es.search(index=index, body=body)
    for hit in response.get("hits").get("hits"):
        doc = unpack_result(hit)
        if doc is None:
            continue
        match = model.get_proxy(doc)
        score = compare(model, entity, match)
        log.debug("Match: %s <[%.2f]> %s", entity.caption, score, match.caption)
        yield score, entity, doc.get("collection_id"), match, entityset_ids
def _query_item(collection, entity):
    """Search the index for candidates matching ``entity`` and score them.

    Yields ``(score, entity, collection_id, match)`` tuples for every hit
    whose score is at least ``SCORE_CUTOFF``. ``collection`` is accepted
    but not used in the body.
    """
    matcher = match_query(entity)
    if matcher == none_query():
        return
    body = {
        'query': matcher,
        'size': 100,
        '_source': {'includes': INCLUDES},
    }
    # Only search the indexes of schemata the entity can be matched against.
    schemata = list(entity.schema.matchable_schemata)
    index = entities_read_index(schema=schemata)
    response = es.search(index=index, body=body)
    for hit in response.get('hits').get('hits'):
        doc = unpack_result(hit)
        if doc is None:
            continue
        match = model.get_proxy(doc)
        score = compare(model, entity, match)
        if score >= SCORE_CUTOFF:
            # log.debug('Match: %r <-[%.3f]-> %r',
            #           entity.caption, score, match.caption)
            yield score, entity, doc.get('collection_id'), match
def xref_item(proxy, collection_ids=None):
    """Search the index for candidates matching ``proxy`` and score them.

    ``collection_ids`` is passed through to ``match_query``. Yields
    ``(score, collection_id, other)`` tuples for every hit whose score is
    at least ``SCORE_CUTOFF``.
    """
    matcher = match_query(proxy, collection_ids=collection_ids)
    if matcher == none_query():
        return
    body = {
        'query': matcher,
        'size': 100,
        '_source': {'includes': ['schema', 'properties', 'collection_id']},
    }
    # Only search the indexes of schemata the proxy can be matched against.
    schemata = list(proxy.schema.matchable_schemata)
    index = entities_read_index(schema=schemata)
    response = es.search(index=index, body=body)
    for hit in response.get('hits').get('hits'):
        doc = unpack_result(hit)
        if doc is None:
            continue
        other = model.get_proxy(doc)
        score = compare(model, proxy, other)
        if score >= SCORE_CUTOFF:
            yield score, doc.get('collection_id'), other
def dedupe(cls, session, threshold=0.5):
    """Compare all matchable entities pairwise and store likely duplicates.

    Loads every entity returned by ``cls.all(session)``, keeps those whose
    schema is matchable, scores each pair with ``compare`` and saves a
    ``Match`` in both directions when the score exceeds ``threshold``.

    The session is committed every 10,000 comparisons. No commit is issued
    after the loop, so the caller presumably performs the final commit
    (TODO confirm).
    """
    entities = []
    for entity in cls.all(session):
        proxy = entity.proxy
        if not proxy.schema.matchable:
            continue
        entities.append(proxy)
    log.info("Loaded %s matchable entities", len(entities))

    compares = 0
    for (a, b) in combinations(entities, 2):
        # combinations() yields each unordered pair exactly once, in list
        # order. Skipping on `a.id >= b.id` would drop every pair whose
        # list order is not id-ascending, so only identical ids are skipped.
        if a.id == b.id:
            continue
        compares += 1
        if compares % 10000 == 0:
            log.info("Comparisons: %s", compares)
            session.commit()
        score = compare(model, a, b)
        if score > threshold:
            log.info("Potential match [%s]: %s ./. %s", score, a, b)
            # TODO: priority
            Match.save(session, a, b, score=score)
            Match.save(session, b, a, score=score)
def reconcile_op(query):
    """Run one reconciliation query and return its scored matches.

    Builds a proxy from the query's name, type and properties, searches for
    matching entities and returns ``{'result': [...], 'num': <count>}``.
    Each match carries an integer score: the ``compare`` score times 100,
    rounded up.
    """
    parser_args = {'limit': query.get('limit', '5'), 'strict': 'false'}
    parser = SearchQueryParser(parser_args, request.authz)
    name = query.get('query', '')

    # Assemble the entity to reconcile; unknown properties are ignored.
    proxy = model.make_entity(query.get('type') or Entity.THING)
    proxy.add('name', name)
    for prop in query.get('properties', []):
        proxy.add(prop.get('pid'), prop.get('v'), quiet=True)

    matcher = MatchQuery(parser, entity=proxy)
    matches = []
    for hit in matcher.search().get('hits').get('hits'):
        doc = unpack_result(hit)
        if doc is None:
            continue
        candidate = model.get_proxy(doc)
        match = {
            'id': candidate.id,
            'name': candidate.caption,
            'score': math.ceil(compare(model, proxy, candidate) * 100),
            'uri': entity_url(candidate.id),
            'match': False,
        }
        # Attach the type descriptor whose id equals the schema name.
        for type_ in get_freebase_types():
            if candidate.schema.name == type_['id']:
                match['type'] = [type_]
        matches.append(match)
    log.info("Reconciled: %r -> %d matches", name, len(matches))
    return {'result': matches, 'num': len(matches)}
def score(self):
    """Return the cached score, computing it on first use when possible.

    The score is computed with ``compare`` only when nothing is cached yet
    and both ``self.entity`` and ``self.canonical`` are truthy; otherwise
    the cached value (possibly ``None``) is returned unchanged.
    """
    cached = self._score
    if cached is None and self.entity and self.canonical:
        cached = compare(self.model, self.entity, self.canonical)
        self._score = cached
    return cached
def score(self):
    """Return the score between ``self.subject`` and ``self.candidate``.

    Returns ``0.0`` when either side is ``None`` and ``1.0`` when both have
    the same ``id``; otherwise the result of ``compare`` is returned.
    """
    subject = self.subject
    if subject is None:
        return 0.0
    candidate = self.candidate
    if candidate is None:
        return 0.0
    # Identical ids short-circuit to a full score without calling compare.
    if subject.id == candidate.id:
        return 1.0
    return compare(model, subject, candidate)
def benchmark():
    """Compare a freshly created proxy against itself 10,000 times."""
    subject = create_proxy()
    # The result is discarded; only the repeated calls matter here.
    for _ in range(10_000):
        compare.compare(model, subject, subject)