def search(raw_query, query_type='/geo/country'):
    # Fingerprint the query once; get_countries() (below) indexes its rows
    # by the same fingerprint, so the lookup and the comparison both use it.
    fp = fingerprints.generate(raw_query)
    countries = get_countries()
    rv = []
    matches = countries[fp]
    for m in matches:
        m['comparison_score'] = difflib.SequenceMatcher(
            None, fp, fingerprints.generate(m['name_to_match'])).quick_ratio()
    for m in sorted(matches, key=lambda i: i['comparison_score'], reverse=True):
        score = m['comparison_score']
        rv.append({
            'id': str(m['id']),
            'name': m['canonical_name'],
            'type': [QUERY_TYPES[0]['id']],
            'score': score * 100,
            'match': score == 1.0,
            'all_labels': {
                'score': score * 100,
                'weighted': score * 100
            }
        })
    return rv

def load_entity(tx, entity):
    log.info("Load node [%s]: %s", entity.id, entity.name)
    node = Node(Vocab.Entity,
                fingerprint=fingerprints.generate(entity.name),
                name=entity.name,
                alephState=entity.state,
                alephEntity=entity.id)
    if entity.jurisdiction_code is not None:
        node['countryCode'] = entity.jurisdiction_code.upper()
    tx.merge(node, Vocab.Entity, 'fingerprint')
    for collection in entity.collections:
        coll_node = load_collection(tx, collection)
        rel = Relationship(node, Vocab.PART_OF, coll_node,
                           alephEntity=entity.id)
        tx.merge(rel, Vocab.PART_OF)
    seen = set([node['fingerprint']])
    for other_name in entity.other_names:
        fingerprint = fingerprints.generate(other_name.display_name)
        if fingerprint is None or fingerprint in seen:
            continue
        seen.add(fingerprint)
        alias = Node(Vocab.Entity,
                     fingerprint=fingerprint,
                     name=other_name.display_name,
                     alephEntity=entity.id,
                     isAlias=True)
        tx.merge(alias, Vocab.Entity, 'fingerprint')
        rel = Relationship(node, Vocab.AKA, alias, alephId=other_name.id)
        tx.merge(rel, Vocab.AKA, 'alephId')
    # TODO: contact details, addresses
    return node

def fingerprint(self):
    # Pairwise O(n^2) comparison: group items whose fingerprints collide.
    self.cluster = []
    for i in range(0, len(self.group)):
        for j in range(i + 1, len(self.group)):
            if fingerprints.generate(self.group[i]) == \
                    fingerprints.generate(self.group[j]):
                self.cluster.append([self.group[i], self.group[j]])
    return self.cluster

def load_to_neo4j(project, neo4j_uri=None):
    neo4j_uri = neo4j_uri or env.NEO4J_URI
    if neo4j_uri is None:
        project.log.error("No $NEO4J_URI set, cannot load graph.")
        return
    project.log.info("Loading graph to Neo4J: %s", neo4j_uri)
    graph = Graph(neo4j_uri)
    tx = graph.begin()
    try:
        tx.run('MATCH (n) DETACH DELETE n')
        entities = {}
        for entity in project.iter_merged_entities():
            label = entity.pop('type', None) or 'Other'
            node = Node(label, **normalise(entity))
            tx.create(node)
            entities[entity['uid']] = node
            # create "Name" fake nodes
            fps = set()
            for name in entity.get('names', []):
                fp = fingerprints.generate(name)
                if fp is None:
                    continue
                fp = fp.replace(' ', '-')
                if fp in fps:
                    continue
                fps.add(fp)
                alias = Node('Name', name=name, fp=fp)
                tx.merge(alias, 'Name', 'fp')
                rel = Relationship(node, 'ALIAS', alias)
                tx.create(rel)
            address = entity.get('address')
            fp = fingerprints.generate(address)
            if fp is not None:
                fp = fp.replace(' ', '-')
                loc = Node('Address', name=address, fp=fp)
                tx.merge(loc, 'Address', 'fp')
                # link the entity to its address node (not to the last
                # alias node from the loop above)
                rel = Relationship(node, 'LOCATION', loc)
                tx.create(rel)
        for link in project.iter_merged_links():
            source = entities.get(link.pop('source'))
            target = entities.get(link.pop('target'))
            if source is None or target is None:
                continue
            rel = Relationship(source, 'LINK', target, **normalise(link))
            tx.create(rel)
        clear_leaf_nodes(tx, 'Name')
        clear_leaf_nodes(tx, 'Address')
        tx.commit()
    except Exception as ex:
        project.log.exception(ex)
        tx.rollback()

def fingerprint(self):
    key = self.key
    if not isinstance(key, str):
        key = unidecode(key)
    fp = fingerprints.generate(key)
    if TRACE_TEXT or TRACE_FP:
        logger_debug('Text.fingerprint:key: ', repr(self.key))
        logger_debug('Text.fingerprint:fp : ',
                     fingerprints.generate(unidecode(self.key)))
    self.key = fp

def entity_similarity(left, right):
    left_name = left.get('name')
    right_name = right.get('name')
    score = 0
    if left_name is not None and right_name is not None:
        name_sim = jaro_winkler(chomp(left_name), chomp(right_name))
        score += (name_sim * 0.6)
        left_fp = fingerprints.generate(left_name)
        right_fp = fingerprints.generate(right_name)
        if left_fp is not None and right_fp is not None:
            fp_sim = jaro_winkler(left_fp, right_fp)
            score += (fp_sim * 0.4)
    return min(1.0, score)

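# A minimal, self-contained sketch of the 60/40 weighting used in
# entity_similarity above. Here `jaro_winkler` is taken from the
# python-Levenshtein package and a plain strip/lower stands in for the
# original's `chomp` helper; both are assumptions, not the original
# imports.
import fingerprints
from Levenshtein import jaro_winkler

def name_score(a, b):
    score = 0.6 * jaro_winkler(a.strip().lower(), b.strip().lower())
    fp_a = fingerprints.generate(a)
    fp_b = fingerprints.generate(b)
    if fp_a is not None and fp_b is not None:
        # Fingerprints abbreviate legal-form suffixes and sort tokens, so
        # this term rewards matches that raw string similarity misses.
        score += 0.4 * jaro_winkler(fp_a, fp_b)
    return min(1.0, score)

print(name_score('Siemens AG', 'Siemens Aktiengesellschaft'))
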
def _make_queries(prop, value, specificity): if prop.type == registry.name: boost = (1 + specificity) * 2 yield { "match": { "fingerprints.text": { "query": value, "operator": "and", "minimum_should_match": "60%", "boost": boost, } } } fp = fingerprints.generate(value) if fp is not None and fp != value: yield { "match": { "fingerprints.text": { "query": value, "operator": "and", "minimum_should_match": "60%", "boost": boost, } } } elif prop.type.group is not None: yield {"term": {prop.type.group: {"value": value}}}
def finalize_index(data, schema, texts):
    """Apply final denormalisations to the index."""
    data['schema'] = schema.name
    # Get implied schemata (i.e. parents of the actual schema)
    data['schemata'] = schema.names
    properties = data.get('properties', {})
    for name, prop in schema.properties.items():
        if name not in properties:
            continue
        if prop.type_name in ['entity', 'date', 'url', 'uri', 'country']:
            continue
        for value in ensure_list(properties[name]):
            if name == 'name':
                data['name'] = value
            texts.append(value)
    data = schema.invert(data)
    data['text'] = index_form(texts)
    names = data.get('names', [])
    fps = [fingerprints.generate(name) for name in names]
    fps = [fp for fp in fps if fp is not None]
    data['fingerprints'] = list(set(fps))
    # Add latinised names
    for name in list(names):
        names.append(latinize_text(name))
    data['names'] = list(set(names))
    if 'created_at' not in data:
        data['created_at'] = data.get('updated_at')
    return data

def format_proxy(proxy, collection, job_id=None):
    """Apply final denormalisations to the index."""
    proxy.context = {}
    data = proxy.to_full_dict()
    data['collection_id'] = collection.id
    data['job_id'] = job_id
    names = ensure_list(data.get('names'))
    fps = set([fingerprints.generate(name) for name in names])
    fps.update(names)
    fps.discard(None)  # generate() returns None for empty or junk names
    data['fingerprints'] = list(fps)
    # Slight hack: a magic property in followthemoney that gets taken out
    # of the properties and added straight to the index text.
    properties = data.get('properties')
    text = properties.pop('indexText', [])
    text.extend(fps)
    text.append(collection.label)
    data['text'] = text
    data['updated_at'] = collection.updated_at
    for updated_at in properties.pop('indexUpdatedAt', []):
        data['updated_at'] = updated_at
    # pprint(data)
    entity_id = data.pop('id')
    return {
        '_id': entity_id,
        '_index': entities_write_index(data.get('schema')),
        '_source': data
    }

def parse_company(line):
    results = dict()
    # The same nomenclature as in parse_officer() applies.
    results['company_number'] = line[0:8]
    # record_type is always 1 since we're parsing companies.
    results['record_type'] = line[8]
    # company_status (dissolved, active, ...)
    results['company_status_code'] = line[9]
    results['is_company'] = line[24] == 'Y'
    # filler, can be thrown away.
    results['filler'] = line[10:32]
    results['number_of_officers'] = line[32:36]
    # Holds the length of the name field (incl. the "<" char); used for
    # validation, do not insert into the database.
    results['unwanted_company_name_length'] = line[36:40]
    # Company names are of varying length and always end with '< \n'.
    results['company_name'] = line[40:].strip('< \n')
    results['company_name_norm'] = generate(results.get('company_name', None))
    return results

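# A hypothetical fixed-width record for parse_company(), with the layout
# inferred from the slice offsets above rather than from any official
# spec. It assumes `from fingerprints import generate`, as the snippet
# itself implies.
sample = (
    "12345678"                    # company_number, cols 0-7
    + "1"                         # record_type (always 1 for companies)
    + "0"                         # company_status_code
    + " " * 14 + "Y" + " " * 7    # filler, col 24 = 'Y' marks a company
    + "0003"                      # number_of_officers
    + "0009"                      # name length incl. '<'
    + "ACME LTD<   \n"
)
parsed = parse_company(sample)
print(parsed['company_name'])       # 'ACME LTD'
print(parsed['company_name_norm'])  # the fingerprint, e.g. 'acme ltd'
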
def _make_queries(type_, value):
    if type_ == registry.name:
        yield {
            "match": {
                "fingerprints.text": {
                    "query": value,
                    "operator": "and",
                    "minimum_should_match": "60%",
                }
            }
        }
        fp = fingerprints.generate(value)
        if fp is None:
            return
        if fp.lower() != value.lower():
            # Query with the fingerprint as well; repeating the raw value
            # here would just duplicate the query above.
            yield {
                "match": {
                    "fingerprints.text": {
                        "query": fp,
                        "operator": "and",
                        "minimum_should_match": "60%",
                    }
                }
            }
    elif type_.group is not None:
        yield {"term": {type_.group: {"value": value}}}

def load_entities(graph):
    """Load composite entities into the graph."""
    tx = graph.begin()
    entities = {}
    try:
        for entity in Entity.iter_composite():
            label = entity.schema or 'Other'
            data = dict(entity.data)
            data.pop('aliases', None)
            node = Node(label, origin=entity.origin, **data)
            project.log.info("Node [%s]: %s", label, entity.name)
            tx.create(node)
            for uid in entity.uids:
                entities[uid] = node
            for name in entity.names:
                fp = fingerprints.generate(name)
                name_node = Node(NAME, name=name, fp=fp)
                tx.merge(name_node, NAME, 'fp')
                rel = Relationship(node, 'ALIAS', name_node)
                tx.create(rel)
        clear_leaf_nodes(tx, NAME)
        tx.commit()
        return entities
    except Exception:
        tx.rollback()
        raise

def _make_queries(prop, value, specificity):
    if prop.type == registry.name:
        boost = (1 + specificity) * 2
        yield {
            'match': {
                'names.text': {
                    'query': value,
                    'operator': 'and',
                    'minimum_should_match': '60%',
                    'boost': boost
                }
            }
        }
        fp = fingerprints.generate(value)
        if fp is not None:
            yield {
                'term': {
                    'fingerprints': {
                        'value': fp,
                        'boost': boost
                    }
                }
            }
    elif prop.type.group is not None:
        yield {
            'term': {
                prop.type.group: {
                    'value': value
                }
            }
        }

def fingerprint(self, values):
    # TODO: this should not be a property thing, so that fingerprints can
    # include dates etc.
    fps = []
    for value in values:
        fps.append(fingerprints.generate(value))
    return [fp for fp in fps if fp is not None]

def _make_queries(prop, value, specificity):
    if prop.type == registry.name:
        boost = (1 + specificity) * 2
        yield {
            'match': {
                'fingerprints.text': {
                    'query': value,
                    'operator': 'and',
                    'minimum_should_match': '60%',
                    'boost': boost
                }
            }
        }
        fp = fingerprints.generate(value)
        if fp is not None and fp != value:
            # Query with the fingerprint as well; repeating the raw value
            # here would just duplicate the query above.
            yield {
                'match': {
                    'fingerprints.text': {
                        'query': fp,
                        'operator': 'and',
                        'minimum_should_match': '60%',
                        'boost': boost
                    }
                }
            }
    elif prop.type.group is not None:
        yield {
            'term': {
                prop.type.group: {
                    'value': value
                }
            }
        }

def reconcile_op(query): """Reconcile operation for a single query.""" state = QueryState({ 'limit': query.get('limit', '5'), 'strict': 'false' }, request.authz) name = query.get('query', '') entity = { 'id': 'fake', 'names': [name], 'fingerprints': [fingerprints.generate(name)], 'schemata': ensure_list(query.get('type')) } for p in query.get('properties', []): entity[p.get('pid')] = ensure_list(p.get('v')) suggested = similar_entities(entity, state) matches = [] for ent in suggested.get('results'): types = [t for t in get_freebase_types() if ent['schema'] == t['id']] matches.append({ 'id': ent.get('id'), 'name': ent.get('name'), 'type': types, 'score': min(100, ent.get('score') * 10), 'uri': entity_link(ent.get('id')), 'match': ent.get('name') == name }) log.info("Reconciled: %r -> %d matches", name, len(matches)) return { 'result': matches, 'num': len(matches) }
def process_single(resource):
    for row in resource:
        fp = fingerprints.generate(row['beneficiary_name'])
        if fp is not None:
            row['beneficiary_id'] = fp.capitalize()
        else:
            row['beneficiary_id'] = row['beneficiary_name']
        yield row

def fingerprints(self):
    # Memoise: compute the fingerprint set only on first access.
    if not hasattr(self, '_fingerprints'):
        self._fingerprints = set()
        for name in self.names:
            fp = fingerprints.generate(name)
            if fp is not None:
                self._fingerprints.add(fp)
    return self._fingerprints

def get_declared_holders(codebase, holders_tallies):
    """
    Return a list of declared holders from a codebase using the holders
    detected from key files.

    A declared holder is a copyright holder present in the key files who
    has the highest number of references throughout the codebase.
    """
    entry_by_holders = {
        fingerprints.generate(entry['value']): entry
        for entry in holders_tallies if entry['value']
    }
    key_file_holders = get_field_values_from_codebase_resources(
        codebase, 'holders', key_files_only=True)
    entry_by_key_file_holders = {
        fingerprints.generate(entry['holder']): entry
        for entry in key_file_holders if entry['holder']
    }
    unique_key_file_holders = unique(entry_by_key_file_holders.keys())
    unique_key_file_holders_entries = [
        entry_by_holders[holder] for holder in unique_key_file_holders
    ]
    holder_by_counts = defaultdict(list)
    for holder_entry in unique_key_file_holders_entries:
        count = holder_entry.get('count')
        if count:
            holder = holder_entry.get('value')
            holder_by_counts[count].append(holder)
    declared_holders = []
    if holder_by_counts:
        highest_count = max(holder_by_counts)
        declared_holders = holder_by_counts[highest_count]
    # If we could not determine a holder, then we return a list of all the
    # unique key file holders.
    if not declared_holders:
        declared_holders = [
            entry['value'] for entry in unique_key_file_holders_entries
        ]
    return declared_holders

def index_names(data):
    """Handle entity names on documents and entities."""
    names = data.get('names', [])
    fps = [fingerprints.generate(name) for name in names]
    fps = [fp for fp in fps if fp is not None]
    data['fingerprints'] = list(set(fps))
    # Add latinised names
    for name in list(names):
        names.append(ascii_text(name))
    data['names'] = list(set(names))

def format_proxy(proxy, collection): """Apply final denormalisations to the index.""" # Abstract entities can appear when profile fragments for a missing entity # are present. if proxy.schema.abstract: return None data = proxy.to_full_dict() data["schemata"] = list(proxy.schema.names) data["caption"] = proxy.caption names = data.get("names", []) fps = set([fingerprints.generate(name) for name in names]) fps.update(names) data["fingerprints"] = [fp for fp in fps if fp is not None] # Slight hack: a magic property in followthemoney that gets taken out # of the properties and added straight to the index text. properties = data.get("properties") data["text"] = properties.pop("indexText", []) # integer casting numeric = {} for prop in proxy.iterprops(): if prop.type in NUMERIC_TYPES: values = proxy.get(prop) numeric[prop.name] = _numeric_values(prop.type, values) # also cast group field for dates numeric["dates"] = _numeric_values(registry.date, data.get("dates")) data["numeric"] = numeric # Context data - from aleph system, not followthemoney. data["collection_id"] = collection.id data["role_id"] = first(data.get("role_id")) data["profile_id"] = first(data.get("profile_id")) data["mutable"] = max(ensure_list(data.get("mutable")), default=False) data["origin"] = ensure_list(data.get("origin")) # Logical simplifications of dates: created_at = ensure_list(data.get("created_at")) if len(created_at) > 0: data["created_at"] = min(created_at) updated_at = ensure_list(data.get("updated_at")) or created_at if len(updated_at) > 0: data["updated_at"] = max(updated_at) # log.info("%s", pformat(data)) entity_id = data.pop("id") return { "_id": entity_id, "_index": entities_write_index(proxy.schema), "_source": data, }
def get_countries():
    if 'countries' in g:
        return g.countries
    rv = collections.defaultdict(list)
    cursor = mysql.connection.cursor(MySQLdb.cursors.DictCursor)
    cursor.execute(LOAD_COUNTRIES_SQL)
    for r in cursor.fetchall():
        rv[fingerprints.generate(r["name_to_match"])].append(r)
    g.countries = rv
    return g.countries

def _normalize_names(names):
    """Generate a sequence of comparable names for an entity.

    This also generates a `fingerprint`, i.e. a version of the name where
    all tokens are sorted alphabetically, and some parts, such as company
    suffixes, have been removed."""
    seen = set()
    for name in names:
        plain = normalize(name, ascii=True)
        if plain is not None and plain not in seen:
            seen.add(plain)
            yield plain
        fp = fingerprints.generate(name)
        if fp is not None and len(fp) > 6 and fp not in seen:
            seen.add(fp)
            yield fp

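# Hedged usage of the generator above; `normalize` is assumed to come
# from the `normality` package, which is what this style of code usually
# pairs with `fingerprints`:
from normality import normalize
import fingerprints

for form in _normalize_names(['Siemens Aktiengesellschaft']):
    print(form)
# Expected: the ASCII-normalised name, then its fingerprint (suffix
# abbreviated, tokens sorted), provided the fingerprint is longer than
# six characters and differs from the plain form.
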
def compute_key(self, record):
    digest = sha1(self.query.dataset.name.encode('utf-8'))
    has_key = False
    for key in self.keys:
        value = record.get(key)
        if self.key_fingerprint:
            value = fingerprints.generate(value)
        else:
            value = string_value(value)
        if value is None:
            continue
        digest.update(value.encode('utf-8'))
        has_key = True
    if has_key:
        return digest.hexdigest()

def format_proxy(proxy, collection, extra):
    """Apply final denormalisations to the index."""
    proxy.context = {}
    proxy = collection.ns.apply(proxy)
    # Pull `indexUpdatedAt` before constructing `data`, so that it doesn't
    # creep into `data['dates']` and mess up date sorting afterwards.
    updated_at = proxy.pop('indexUpdatedAt', quiet=True)
    data = proxy.to_full_dict()
    data['collection_id'] = collection.id
    data['schemata'] = list(proxy.schema.names)
    names = ensure_list(data.get('names'))
    fps = set([fingerprints.generate(name) for name in names])
    fps.update(names)
    fps.discard(None)  # generate() returns None for empty or junk names
    data['fingerprints'] = list(fps)
    # Slight hack: a magic property in followthemoney that gets taken out
    # of the properties and added straight to the index text.
    properties = data.get('properties')
    text = properties.pop('indexText', [])
    text.extend(fps)
    data['text'] = text
    data['updated_at'] = collection.updated_at
    for value in updated_at:
        data['updated_at'] = value
    # integer casting
    numeric = {}
    for prop, values in properties.items():
        prop = proxy.schema.get(prop)
        if prop.type in NUMERIC_TYPES:
            numeric[prop.name] = _numeric_values(prop.type, values)
    # also cast group field for dates
    numeric['dates'] = _numeric_values(registry.date, data.get('dates'))
    data['numeric'] = numeric
    # add possible overrides
    data.update(extra)
    # log.info("%s", pformat(data))
    entity_id = data.pop('id')
    return {
        '_id': entity_id,
        '_index': entities_write_index(data.get('schema')),
        '_source': data
    }

def compute_key(self, record):
    if not len(self.keys):
        return None
    digest = sha1(self.query.dataset.name.encode('utf-8'))
    # digest.update(self.schema.name.encode('utf-8'))
    has_key = False
    for key in self.keys:
        value = record.get(key)
        if self.key_fingerprint:
            value = fingerprints.generate(value)
        else:
            value = clean_text(value)
        if value is None:
            continue
        digest.update(value.encode('utf-8'))
        has_key = True
    if has_key:
        return digest.hexdigest()

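# A standalone sketch of the keying scheme used by both compute_key
# variants above: a SHA-1 seeded with the dataset name, updated with the
# fingerprint of each key column, so that formatting variants of the same
# name hash to the same record key. All names here are illustrative.
from hashlib import sha1
import fingerprints

def record_key(dataset, record, keys):
    digest = sha1(dataset.encode('utf-8'))
    has_key = False
    for key in keys:
        value = fingerprints.generate(record.get(key))
        if value is None:
            continue
        digest.update(value.encode('utf-8'))
        has_key = True
    if has_key:
        return digest.hexdigest()

a = record_key('companies', {'name': 'ACME Holding GmbH'}, ['name'])
b = record_key('companies', {'name': 'Acme Holding G.m.b.H.'}, ['name'])
print(a == b)  # expected True: case and punctuation collapse in the fingerprint
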
def format_proxy(proxy, collection): """Apply final denormalisations to the index.""" data = proxy.to_full_dict() data["schemata"] = list(proxy.schema.names) names = data.get("names", []) fps = set([fingerprints.generate(name) for name in names]) fps.update(names) data["fingerprints"] = [fp for fp in fps if fp is not None] # Slight hack: a magic property in followthemoney that gets taken out # of the properties and added straight to the index text. properties = data.get("properties") text = properties.pop("indexText", []) text.extend(fps) data["text"] = text # integer casting numeric = {} for prop in proxy.iterprops(): if prop.type in NUMERIC_TYPES: values = proxy.get(prop) numeric[prop.name] = _numeric_values(prop.type, values) # also cast group field for dates numeric["dates"] = _numeric_values(registry.date, data.get("dates")) data["numeric"] = numeric # Context data - from aleph system, not followthemoney. # FIXME: Can there ever really be multiple role_ids? data["role_id"] = first(data.get("role_id")) data["mutable"] = max(ensure_list(data.get("mutable")), default=False) data["origin"] = ensure_list(data.get("origin")) created_at = data.get("created_at") if created_at: data["updated_at"] = data.get("updated_at", created_at) data["collection_id"] = collection.id # log.info("%s", pformat(data)) entity_id = data.pop("id") return { "_id": entity_id, "_index": entities_write_index(data.get("schema")), "_source": data, }
def format_proxy(proxy, collection): """Apply final denormalisations to the index.""" data = proxy.to_full_dict() data['schemata'] = list(proxy.schema.names) names = ensure_list(data.get('names')) fps = set([fingerprints.generate(name) for name in names]) fps.update(names) data['fingerprints'] = [fp for fp in fps if fp is not None] # Slight hack: a magic property in followthemoney that gets taken out # of the properties and added straight to the index text. properties = data.get('properties') text = properties.pop('indexText', []) text.extend(fps) data['text'] = text # integer casting numeric = {} for prop in proxy.iterprops(): if prop.type in NUMERIC_TYPES: values = proxy.get(prop) numeric[prop.name] = _numeric_values(prop.type, values) # also cast group field for dates numeric['dates'] = _numeric_values(registry.date, data.get('dates')) data['numeric'] = numeric # Context data - from aleph system, not followthemoney. now = iso_text(datetime.utcnow()) data['created_at'] = min(ensure_list(data.get('created_at')), default=now) data['updated_at'] = min(ensure_list(data.get('updated_at')), default=now) # FIXME: Can there ever really be multiple role_ids? data['role_id'] = first(data.get('role_id')) data['mutable'] = max(ensure_list(data.get('mutable')), default=False) data['origin'] = ensure_list(data.get('origin')) data['collection_id'] = collection.id # log.info("%s", pformat(data)) entity_id = data.pop('id') return { '_id': entity_id, '_index': entities_write_index(data.get('schema')), '_source': data }
def normalize_value(text):
    if text is None:
        return
    try:
        # see if the cell value is clearly numeric:
        float(text)
        return
    except (TypeError, ValueError):
        pass
    text = fingerprints.generate(text, keep_order=True)
    if text is None:
        return
    if len(text) <= 3:
        return
    text = u' %s ' % text
    return text.encode('utf-8')

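# The keep_order flag used above skips the fingerprint's usual token
# sorting, which matters when word order carries meaning (addresses,
# table cells). A quick illustration; exact outputs depend on the
# installed fingerprints version:
import fingerprints

print(fingerprints.generate('New York, New York'))
print(fingerprints.generate('New York, New York', keep_order=True))
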
def reconcile_op(query): """Reconcile operation for a single query.""" parser = SearchQueryParser({ 'limit': query.get('limit', '5'), 'strict': 'false' }, request.authz) name = query.get('query', '') schema = query.get('type') or 'Thing' entity = { 'id': 'fake', 'names': [name], 'fingerprints': [fingerprints.generate(name)], 'schemata': ensure_list(schema), 'schema': schema } for p in query.get('properties', []): entity[p.get('pid')] = ensure_list(p.get('v')) query = SimilarEntitiesQuery(parser, entity=entity) matches = [] for doc in query.search().get('hits').get('hits'): source = doc.get('_source') match = { 'id': doc.get('_id'), 'name': source.get('name'), 'score': min(100, doc.get('_score') * 10), 'uri': entity_url(doc.get('_id')), 'match': source.get('name') == name } for type_ in get_freebase_types(): if source['schema'] == type_['id']: match['type'] = [type_] matches.append(match) log.info("Reconciled: %r -> %d matches", name, len(matches)) return { 'result': matches, 'num': len(matches) }
def format_proxy(proxy, collection, job_id=None): """Apply final denormalisations to the index.""" proxy.context = {} proxy = collection.ns.apply(proxy) data = proxy.to_full_dict() data['collection_id'] = collection.id data['job_id'] = job_id names = ensure_list(data.get('names')) fps = set([fingerprints.generate(name) for name in names]) fps.update(names) data['fingerprints'] = [fp for fp in fps if fp is not None] # Slight hack: a magic property in followthemoney that gets taken out # of the properties and added straight to the index text. properties = data.get('properties') text = properties.pop('indexText', []) text.extend(fps) text.append(collection.label) data['text'] = text data['updated_at'] = collection.updated_at for updated_at in properties.pop('indexUpdatedAt', []): data['updated_at'] = updated_at # integer casting numeric = {} for prop, values in properties.items(): prop = proxy.schema.get(prop) if prop.type in NUMERIC_TYPES: numeric[prop.name] = _numeric_values(prop.type, values) # also cast group field for dates numeric['dates'] = _numeric_values(registry.date, data.get('dates')) data['numeric'] = numeric # pprint(data) entity_id = data.pop('id') return { '_id': entity_id, '_index': entities_write_index(data.get('schema')), '_source': data }
def finalize_index(proxy, context, texts):
    """Apply final denormalisations to the index."""
    for prop, value in proxy.itervalues():
        if prop.type.name in ['entity', 'date', 'url', 'country', 'language']:
            continue
        texts.append(value)
    entity = proxy.to_full_dict()
    data = merge_data(context, entity)
    data['name'] = proxy.caption
    data['text'] = index_form(texts)
    names = data.get('names', [])
    fps = [fingerprints.generate(name) for name in names]
    fps = [fp for fp in fps if fp is not None]
    data['fingerprints'] = list(set(fps))
    if not data.get('created_at'):
        data['created_at'] = data.get('updated_at')
    data.pop('id', None)
    return clean_dict(data)

def index_operation(data):
    """Apply final denormalisations to the index."""
    data['bulk'] = data.get('bulk', False)
    names = ensure_list(data.get('names'))
    fps = set([fingerprints.generate(name) for name in names])
    fps.update(names)
    fps.discard(None)  # generate() returns None for empty or junk names
    data['fingerprints'] = list(fps)
    # Slight hack: a magic property in followthemoney that gets taken out
    # of the properties and added straight to the index text.
    texts = data.pop('text', [])
    texts.extend(data.get('properties', {}).pop('indexText', []))
    texts.extend(fps)
    data['text'] = texts
    if not data.get('created_at'):
        data['created_at'] = data.get('updated_at')
    entity_id = str(data.pop('id'))
    data.pop('_index', None)
    index = entities_write_index(data.get('schema'))
    return entity_id, index, data

def _make_queries(prop, value):
    specificity = prop.type.specificity(value)
    if specificity == 0:
        return
    if prop.type == registry.name:
        boost = (1 + specificity) * 2
        yield {
            'match': {
                'names.text': {
                    'query': value,
                    'operator': 'and',
                    'minimum_should_match': '60%',
                    'boost': boost
                }
            }
        }
        fp = fingerprints.generate(value)
        if fp is not None:
            yield {
                'term': {
                    'fingerprints': {
                        'value': fp,
                        'boost': boost
                    }
                }
            }
        return
    if prop.type.group is None:
        return
    yield {
        'term': {
            prop.type.group: {
                'value': value,
                'boost': specificity
            }
        }
    }

def generate_linktab(self, chunk_size=10000):
    with self.config.engine.begin() as connection:
        q = self.config.linktab.delete()
        q = q.where(self.config.linktab.c.view == self.name)
        connection.execute(q)
        chunk = []
        for i, value in enumerate(self.distinct_key()):
            fp = fingerprints.generate(value)
            if fp is None:
                continue
            # this is due to postgres' levenshtein
            fp = fp[:255]
            chunk.append({
                'view': self.name,
                'serial': self.serial,
                'key': value,
                'fingerprint': fp
            })
            if len(chunk) == chunk_size:
                log.info('Linktab %s (%s): %s', self.name, self.key_ref, i)
                connection.execute(self.config.linktab.insert(), chunk)
                chunk = []
        if len(chunk):
            connection.execute(self.config.linktab.insert(), chunk)

def make_fingerprint(text, **kwargs):
    """Generate a normalised entity name, used for the graph."""
    return fingerprints.generate(string_value(text))


def normalizeaddress(value):
    return fingerprints.generate(value)


def normalize_value(self, value, prop, record):
    return [fingerprints.generate(value)]


def fingerprint(value, **kwargs):
    return fingerprints.generate(string_value(value))


def addressfp(value, **kwargs):
    value = string_value(value)
    if value is None:
        return
    value = value.replace("<br/>", " ")
    return fingerprints.generate(value, keep_order=True)

# coding: utf-8
import fingerprints

tests = [
    u'Foo (Bar) Corp',
    u'ähnlIIch',
    'Open S.A.R.L.',
    'Mr. Boaty McBoatface',
    u'РАДИК ІВАН ЛЬВОВИЧ',
    u'КУШНАРЬОВ ДМИТРО ВІТАЛІЙОВИЧ',
    u'Foo (Bar) CORPORATION',
    'Mr. Sherlock Holmes',
    'Siemens Aktiengesellschaft',
    'New York, New York',
    u'Foo Gesellschaft mit beschränkter Haftung',
    'Software und- Systemgesellschaft mit beschr Haftung'
]

for test in tests:
    out = fingerprints.generate(test)
    print(out)

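# What the list above exercises (expected behaviour, hedged, not asserted
# library output): transliteration of Cyrillic and umlauts, removal of
# punctuation and honorifics like 'Mr.', and abbreviation of legal-form
# suffixes, so the two 'Foo (Bar)' spellings should collapse to a single
# fingerprint:
print(fingerprints.generate(u'Foo (Bar) Corp') ==
      fingerprints.generate(u'Foo (Bar) CORPORATION'))  # expected: True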