def dedup(dataset):
    """Print a candidate duplicate for a sample of values in a dataset.

    Looks up the dataset by name, runs the matcher over at most 20 of its
    values and, for each one, prints the first match that is not the value
    itself as ``[value, '=?', matched_id, score]``. The total elapsed time
    is printed at the end.

    :param dataset: name of the dataset, as accepted by ``Dataset.by_name``.
    """
    from time import time
    from nomenklatura.matching import match

    ds = Dataset.by_name(dataset)
    begin = time()
    for value in Value.all(ds).limit(20):
        # Each match is a (candidate, value_id, score) triple; drop the
        # entry that refers to the value we are currently looking at.
        candidates = [m for m in match(value.value, ds) if m[1] != value.id]
        if candidates:
            best = candidates[0]
            print([value.value, '=?', best[1], best[2]])
        else:
            # The value only matched itself (or nothing at all); report
            # that instead of failing with an IndexError.
            print([value.value, '=?', None, None])
    # Single-argument print(...) behaves identically on Python 2 and 3.
    print("Time: %.2fms" % ((time() - begin) * 1000))
def dedup(dataset):
    """Print a candidate duplicate for a sample of values in a dataset.

    Looks up the dataset by name, runs the matcher over at most 20 of its
    values and, for each one, prints the first match that is not the value
    itself as ``[value, '=?', matched_id, score]``. The total elapsed time
    is printed at the end.

    :param dataset: name of the dataset, as accepted by ``Dataset.by_name``.
    """
    from time import time
    from nomenklatura.matching import match

    ds = Dataset.by_name(dataset)
    begin = time()
    for value in Value.all(ds).limit(20):
        # Each match is a (candidate, value_id, score) triple; drop the
        # entry that refers to the value we are currently looking at.
        candidates = [m for m in match(value.value, ds) if m[1] != value.id]
        if candidates:
            best = candidates[0]
            print([value.value, '=?', best[1], best[2]])
        else:
            # The value only matched itself (or nothing at all); report
            # that instead of failing with an IndexError.
            print([value.value, '=?', None, None])
    # Single-argument print(...) behaves identically on Python 2 and 3.
    print("Time: %.2fms" % ((time() - begin) * 1000))
def reconcile_op(dataset, query):
    """Run a single reconciliation query and return the matching entities.

    :param dataset: dataset to search; when ``None`` it is resolved from
        the query's ``type`` key via ``type_to_dataset``.
    :param query: dict-like query. Keys read here: ``limit``, ``properties``
        (a list of dicts with ``p``/``v`` keys), ``type`` and ``query``.
    :returns: dict with ``result`` (the list of match dicts that passed all
        property filters) and ``num`` (the number of matcher results after
        applying the limit, before property filtering).
    """
    # Clamp the requested limit to 1..100; a missing (None) or non-numeric
    # value falls back to 5.
    try:
        limit = max(1, min(100, int(query.get('limit'))))
    except (ValueError, TypeError):
        limit = 5

    filters = [(p.get('p'), p.get('v')) for p in query.get('properties', [])]

    if dataset is None:
        dataset = type_to_dataset(query.get('type', ''))

    # Each result is a (candidate, entity_id, score) triple.
    results = match(query.get('query', ''), dataset)[:limit]
    entities = Entity.id_map(dataset, [r[1] for r in results])

    matches = []
    for (candidate, entity_id, score) in results:
        entity = entities[entity_id]
        # Evaluate the property filters separately for every entity. The
        # rejection flag must not carry over from one result to the next,
        # otherwise a single mismatch would discard all later results.
        if any(entity.data.get(key) != fv for key, fv in filters):
            continue

        entity_path = url_for('entity.view', dataset=dataset.name,
                              entity=entity.id)
        entity_uri = url_for('entity.view', dataset=dataset.name,
                             entity=entity.id, _external=True)
        matches.append({
            'name': entity.name,
            'score': score,
            'type': [{
                'id': '/' + dataset.name,
                'name': dataset.label
            }],
            'id': entity_path,
            'uri': entity_uri,
            'match': score == 100
        })

    return {'result': matches, 'num': len(results)}
def reconcile_op(dataset, query):
    """Run a single reconciliation query and return the matching entities.

    :param dataset: dataset to search; when ``None`` it is resolved from
        the query's ``type`` key via ``type_to_dataset``.
    :param query: dict-like query. Keys read here: ``limit``, ``properties``
        (a list of dicts with ``p``/``v`` keys), ``type`` and ``query``.
    :returns: dict with ``result`` (the list of match dicts that passed all
        property filters) and ``num`` (the number of matcher results after
        applying the limit, before property filtering).
    """
    # Clamp the requested limit to 1..100; a missing (None) or non-numeric
    # value falls back to 5.
    try:
        limit = max(1, min(100, int(query.get('limit'))))
    except (ValueError, TypeError):
        limit = 5

    filters = [(p.get('p'), p.get('v')) for p in query.get('properties', [])]

    if dataset is None:
        dataset = type_to_dataset(query.get('type', ''))

    # Each result is a (candidate, entity_id, score) triple.
    results = match(query.get('query', ''), dataset)[:limit]
    entities = Entity.id_map(dataset, [r[1] for r in results])

    matches = []
    for (candidate, entity_id, score) in results:
        entity = entities[entity_id]
        # Evaluate the property filters separately for every entity. The
        # rejection flag must not carry over from one result to the next,
        # otherwise a single mismatch would discard all later results.
        if any(entity.data.get(key) != fv for key, fv in filters):
            continue

        entity_path = url_for('entity.view', dataset=dataset.name,
                              entity=entity.id)
        entity_uri = url_for('entity.view', dataset=dataset.name,
                             entity=entity.id, _external=True)
        matches.append({
            'name': entity.name,
            'score': score,
            'type': [{
                'id': '/' + dataset.name,
                'name': dataset.label
            }],
            'id': entity_path,
            'uri': entity_uri,
            'match': score == 100
        })

    return {'result': matches, 'num': len(results)}