Python generateの例、fingerprints.generate Pythonの例

コード例 #1

0

ファイルを表示

def search(raw_query, query_type='/geo/country'):
    """Return reconciliation candidates for a free-text country query.

    The query is fingerprinted and looked up in the table returned by
    ``get_countries()``. Every row stored under the same fingerprint is
    scored against the query fingerprint with
    ``difflib.SequenceMatcher.quick_ratio`` and returned best-first.

    :param raw_query: the text to look up.
    :param query_type: accepted for interface compatibility; not used here.
    :returns: a list of dicts with ``id``, ``name``, ``type``, ``score``
        (0-100), ``match`` and ``all_labels`` keys. The list is empty when
        the query has no fingerprint or no row shares it.

    Side effect: each matched row gets a ``comparison_score`` key.
    """
    query_fp = fingerprints.generate(raw_query)
    if query_fp is None:
        # Nothing to look up and nothing SequenceMatcher could compare.
        return []

    # Use ``.get`` so that a miss never inserts a new (empty) bucket into the
    # table, and look up with the same single fingerprint pass the table's
    # keys were presumably built with.
    matches = get_countries().get(query_fp, [])
    for m in matches:
        m['comparison_score'] = difflib.SequenceMatcher(
            None, query_fp, fingerprints.generate(m['name_to_match'])) \
            .quick_ratio()

    rv = []
    for m in sorted(matches, key=lambda i: i['comparison_score'],
                    reverse=True):
        score = m['comparison_score']
        rv.append({
            'id': str(m['id']),
            'name': m['canonical_name'],
            'type': [QUERY_TYPES[0]['id']],
            'score': score * 100,
            'match': score == 1.0,
            'all_labels': {
                'score': score * 100,
                'weighted': score * 100
            }
        })

    return rv

コード例 #2

0

ファイルを表示

ファイル: entities.py プロジェクト: simonwoerpel/aleph

def load_entity(tx, entity):
    """Merge one entity, its collection memberships and its aliases.

    The main node is merged on its ``fingerprint`` property. Every entry of
    ``entity.collections`` is linked with a PART_OF relationship, and every
    other name whose fingerprint is non-empty and not yet seen becomes an
    alias node linked with an AKA relationship.

    :param tx: the graph transaction used for all merges.
    :param entity: the entity to load.
    :returns: the main entity node.
    """
    log.info("Load node [%s]: %s", entity.id, entity.name)
    node = Node(
        Vocab.Entity,
        fingerprint=fingerprints.generate(entity.name),
        name=entity.name,
        alephState=entity.state,
        alephEntity=entity.id
    )
    jurisdiction = entity.jurisdiction_code
    if jurisdiction is not None:
        node['countryCode'] = jurisdiction.upper()
    tx.merge(node, Vocab.Entity, 'fingerprint')

    for collection in entity.collections:
        membership = Relationship(node, Vocab.PART_OF,
                                  load_collection(tx, collection),
                                  alephEntity=entity.id)
        tx.merge(membership, Vocab.PART_OF)

    # Start with the main node's fingerprint so it is never duplicated
    # as an alias.
    known = {node['fingerprint']}
    for alt in entity.other_names:
        alt_fp = fingerprints.generate(alt.display_name)
        if alt_fp is None or alt_fp in known:
            continue
        known.add(alt_fp)

        alias = Node(
            Vocab.Entity,
            fingerprint=alt_fp,
            name=alt.display_name,
            alephEntity=entity.id,
            isAlias=True
        )
        tx.merge(alias, Vocab.Entity, 'fingerprint')
        tx.merge(Relationship(node, Vocab.AKA, alias, alephId=alt.id),
                 Vocab.AKA, 'alephId')
    # TODO contact details, addresses
    return node

コード例 #3

0

ファイルを表示

 def fingerprint(self):
     """Cluster members of ``self.group`` that share a fingerprint.

     Stores in ``self.cluster`` (and returns) a list of ``[a, b]`` pairs,
     one for every two members whose fingerprints compare equal. Pairs keep
     the order in which the members appear in ``self.group``.

     Note: two members that both have no fingerprint (``None``) compare
     equal and are therefore paired as well.
     """
     # Fingerprint each member once up front rather than once per compared
     # pair; the pairwise loop below then only compares cached values.
     keyed = [(fingerprints.generate(member), member)
              for member in self.group]
     self.cluster = []
     for pos, (left_fp, left) in enumerate(keyed):
         for right_fp, right in keyed[pos + 1:]:
             if left_fp == right_fp:
                 self.cluster.append([left, right])
     return self.cluster

コード例 #4

0

ファイルを表示

def load_to_neo4j(project, neo4j_uri=None):
    """Rebuild the Neo4J graph from the project's merged entities and links.

    All existing nodes are deleted first. Each merged entity becomes a node
    labelled with its ``type`` (``'Other'`` when missing), with ``Name`` and
    ``Address`` helper nodes merged on their fingerprint (``fp``). Links
    between known entities become ``LINK`` relationships. Everything runs in
    one transaction; any exception is logged and the transaction is rolled
    back.

    :param project: provides ``log``, ``iter_merged_entities()`` and
        ``iter_merged_links()``.
    :param neo4j_uri: graph URI; falls back to ``env.NEO4J_URI``.
    :returns: None. Returns early (after logging) when no URI is configured.
    """
    neo4j_uri = neo4j_uri or env.NEO4J_URI
    if neo4j_uri is None:
        project.log.error("No $NEO4J_URI set, cannot load graph.")
        return
    project.log.info("Loading graph to Neo4J: %s", neo4j_uri)
    graph = Graph(neo4j_uri)
    tx = graph.begin()
    try:
        # Start from an empty graph.
        tx.run('MATCH (n) DETACH DELETE n')
        entities = {}
        for entity in project.iter_merged_entities():
            label = entity.pop('type', None) or 'Other'
            node = Node(label, **normalise(entity))
            tx.create(node)
            entities[entity['uid']] = node

            # create "Name" fake nodes
            fps = set()
            for name in entity.get('names', []):
                fp = fingerprints.generate(name)
                if fp is None:
                    continue
                fp = fp.replace(' ', '-')
                if fp in fps:
                    continue
                fps.add(fp)
                alias = Node('Name', name=name, fp=fp)
                tx.merge(alias, 'Name', 'fp')
                rel = Relationship(node, 'ALIAS', alias)
                tx.create(rel)

            address = entity.get('address')
            fp = fingerprints.generate(address)
            if fp is not None:
                fp = fp.replace(' ', '-')
                loc = Node('Address', name=address, fp=fp)
                tx.merge(loc, 'Address', 'fp')
                # The LOCATION edge must point at the Address node; using
                # the last Name node here would link the wrong node (or
                # fail outright when the entity has no names).
                rel = Relationship(node, 'LOCATION', loc)
                tx.create(rel)

        for link in project.iter_merged_links():
            source = entities.get(link.pop('source'))
            target = entities.get(link.pop('target'))
            if source is None or target is None:
                # Skip links whose endpoints were not loaded above.
                continue
            rel = Relationship(source, 'LINK', target, **normalise(link))
            tx.create(rel)

        clear_leaf_nodes(tx, 'Name')
        clear_leaf_nodes(tx, 'Address')
        tx.commit()
    except Exception as ex:
        project.log.exception(ex)
        tx.rollback()

コード例 #5

0

ファイルを表示

ファイル: copyright_summary.py プロジェクト: akugarg/scancode-toolkit

    def fingerprint(self):
        """Replace ``self.key`` with its fingerprint.

        A key that is not a ``str`` is passed through ``unidecode`` before
        fingerprinting. When ``TRACE_TEXT`` or ``TRACE_FP`` is set, the key
        and a fingerprint of its unidecoded form are written to the debug
        log before the key is overwritten.
        """
        raw = self.key
        source = raw if isinstance(raw, str) else unidecode(raw)
        fp = fingerprints.generate(source)

        tracing = TRACE_TEXT or TRACE_FP
        if tracing:
            logger_debug('Text.fingerprint:key: ', repr(self.key))
            traced_fp = fingerprints.generate(unidecode(self.key))
            logger_debug('Text.fingerprint:fp :    ', traced_fp)

        self.key = fp

コード例 #6

0

ファイルを表示

ファイル: similarity.py プロジェクト: bigrayhicks/corpint

def entity_similarity(left, right):
    """Score how similar two entity dicts are by their ``name``.

    The score is 0.6 times the Jaro-Winkler similarity of the chomped names
    (when both names are present) plus 0.4 times the Jaro-Winkler similarity
    of their fingerprints (when both fingerprints exist), capped at 1.0.
    """
    left_name, right_name = left.get('name'), right.get('name')

    parts = []
    if not (left_name is None or right_name is None):
        parts.append(jaro_winkler(chomp(left_name), chomp(right_name)) * 0.6)

    left_fp = fingerprints.generate(left_name)
    right_fp = fingerprints.generate(right_name)
    if not (left_fp is None or right_fp is None):
        parts.append(jaro_winkler(left_fp, right_fp) * 0.4)

    return min(1.0, sum(parts))

コード例 #7

0

ファイルを表示

def _make_queries(prop, value, specificity):
    """Yield Elasticsearch query clauses for matching one property value.

    For name-typed properties, a ``match`` clause on ``fingerprints.text``
    is yielded for the raw value, followed by a second one for the value's
    fingerprint when that fingerprint exists and differs from the raw value.
    Both are boosted by ``(1 + specificity) * 2``. For any other type with a
    group, a single ``term`` clause on the group field is yielded.

    :param prop: the property; ``prop.type`` selects the query style.
    :param value: the property value to search for.
    :param specificity: numeric weight folded into the name boost.
    """
    if prop.type == registry.name:
        boost = (1 + specificity) * 2

        def text_match(text):
            # Both name clauses share everything except the query text.
            return {
                "match": {
                    "fingerprints.text": {
                        "query": text,
                        "operator": "and",
                        "minimum_should_match": "60%",
                        "boost": boost,
                    }
                }
            }

        yield text_match(value)
        fp = fingerprints.generate(value)
        if fp is not None and fp != value:
            # Search for the fingerprint itself; re-sending ``value`` would
            # merely duplicate the clause above.
            yield text_match(fp)
    elif prop.type.group is not None:
        yield {"term": {prop.type.group: {"value": value}}}

コード例 #8

0

ファイルを表示

def finalize_index(data, schema, texts):
    """Denormalise an entity document just before it is indexed.

    Sets the schema fields, collects indexable property values into
    ``texts`` (skipping entity, date, url, uri and country typed
    properties), inverts the data through the schema, and derives the
    ``text``, ``fingerprints`` and ``names`` fields. ``created_at`` falls
    back to ``updated_at`` when absent.

    :returns: the document produced by ``schema.invert`` with the derived
        fields added.
    """
    skipped_types = ('entity', 'date', 'url', 'uri', 'country')

    data['schema'] = schema.name
    # Get implied schemata (i.e. parents of the actual schema)
    data['schemata'] = schema.names

    properties = data.get('properties', {})
    for name, prop in schema.properties.items():
        if name not in properties or prop.type_name in skipped_types:
            continue
        for value in ensure_list(properties[name]):
            if name == 'name':
                data['name'] = value
            texts.append(value)

    data = schema.invert(data)
    data['text'] = index_form(texts)

    names = data.get('names', [])
    generated = [fingerprints.generate(name) for name in names]
    data['fingerprints'] = list(set(
        [fp for fp in generated if fp is not None]))

    # Add latinised names
    names.extend([latinize_text(name) for name in names])
    data['names'] = list(set(names))

    if 'created_at' not in data:
        data['created_at'] = data.get('updated_at')
    return data

コード例 #9

0

ファイルを表示

def format_proxy(proxy, collection, job_id=None):
    """Turn an entity proxy into an index operation for ``collection``.

    Builds the document from ``proxy.to_full_dict()``, adds collection and
    job context, derives ``fingerprints`` from the names, moves the
    ``indexText`` and ``indexUpdatedAt`` properties out of ``properties``
    into ``text`` and ``updated_at``, and wraps the result.

    :returns: a dict with ``_id``, ``_index`` and ``_source`` keys.
    """
    proxy.context = {}
    data = proxy.to_full_dict()
    data['collection_id'] = collection.id
    data['job_id'] = job_id
    names = ensure_list(data.get('names'))
    fps = set([fingerprints.generate(name) for name in names])
    fps.update(names)
    # Names without a fingerprint yield None; drop it once so it reaches
    # neither ``fingerprints`` nor the index text below.
    fps.discard(None)
    data['fingerprints'] = list(fps)

    # Slight hack: a magic property in followthemoney that gets taken out
    # of the properties and added straight to the index text.
    properties = data.get('properties')
    text = properties.pop('indexText', [])
    text.extend(fps)
    text.append(collection.label)
    data['text'] = text

    # The collection timestamp is the default; the last ``indexUpdatedAt``
    # value, if any, overrides it.
    data['updated_at'] = collection.updated_at
    for updated_at in properties.pop('indexUpdatedAt', []):
        data['updated_at'] = updated_at

    # pprint(data)
    entity_id = data.pop('id')
    return {
        '_id': entity_id,
        '_index': entities_write_index(data.get('schema')),
        '_source': data
    }

コード例 #10

0

ファイルを表示

ファイル: read_directors.py プロジェクト: Transparency-International-UK/companies_house_bulk

def parse_company(line):
    """Split one fixed-width company record into a dict of fields.

    Fields are taken from fixed character positions of ``line``; the
    company name is everything from position 40 onwards with trailing
    ``<``, space and newline characters stripped, and is also stored in
    normalised form under ``company_name_norm``.
    """
    results = {
        # same nomenclature for company_number in function parse_officer()
        # applies.
        'company_number': line[0:8],
        # record_type is always 1 since we're parsing companies.
        'record_type': line[8],
        # company_status (dissolved, active...)
        'company_status_code': line[9],
        'is_company': line[24] == 'Y',
        # filler, can throw away.
        'filler': line[10:32],
        'number_of_officers': line[32:36],
        # holds the length of the name variable (incl. "<" char), used for
        # validation, do not insert in database.
        'unwanted_company_name_length': line[36:40],
    }

    # company names will be of varying length and will always end with
    # '...< \n'.
    company_name = line[40:].strip('< \n')
    results['company_name'] = company_name
    results["company_name_norm"] = generate(company_name)
    return results

コード例 #11

0

ファイルを表示

ファイル: matching.py プロジェクト: sunu/aleph

def _make_queries(type_, value):
    """Yield Elasticsearch query clauses for matching one typed value.

    For name values, a ``match`` clause on ``fingerprints.text`` is yielded
    for the raw value, followed by a second one for the value's fingerprint
    when that fingerprint exists and differs from the raw value ignoring
    case. For any other type with a group, a single ``term`` clause on the
    group field is yielded.

    :param type_: the value's type; selects the query style.
    :param value: the value to search for.
    """
    if type_ == registry.name:

        def text_match(text):
            # Both name clauses share everything except the query text.
            return {
                "match": {
                    "fingerprints.text": {
                        "query": text,
                        "operator": "and",
                        "minimum_should_match": "60%",
                    }
                }
            }

        yield text_match(value)
        fp = fingerprints.generate(value)
        if fp is None:
            return
        if fp.lower() != value.lower():
            # Search for the fingerprint itself; re-sending ``value`` would
            # merely duplicate the clause above.
            yield text_match(fp)
    elif type_.group is not None:
        yield {"term": {type_.group: {"value": value}}}

コード例 #12

0

ファイルを表示

def load_entities(graph):
    """Load composite entities into the graph.

    Each composite entity becomes a node labelled with its schema
    (``'Other'`` when missing) and is registered under every one of its
    uids. Each name that has a fingerprint is merged as a ``NAME`` node on
    ``fp`` and linked to the entity with an ``ALIAS`` relationship.

    :param graph: the graph to open a transaction on.
    :returns: a dict mapping entity uid to its node.
    :raises Exception: anything raised while loading, after the
        transaction has been rolled back.
    """
    tx = graph.begin()
    entities = {}
    try:
        for entity in Entity.iter_composite():
            label = entity.schema or 'Other'
            data = dict(entity.data)
            data.pop('aliases', None)
            node = Node(label, origin=entity.origin, **data)
            project.log.info("Node [%s]: %s", label, entity.name)
            tx.create(node)
            for uid in entity.uids:
                entities[uid] = node

            for name in entity.names:
                fp = fingerprints.generate(name)
                if fp is None:
                    # A name without a fingerprint has no merge key, so it
                    # cannot be turned into a NAME node.
                    continue
                name_node = Node(NAME, name=name, fp=fp)
                tx.merge(name_node, NAME, 'fp')

                rel = Relationship(node, 'ALIAS', name_node)
                tx.create(rel)

        clear_leaf_nodes(tx, NAME)
        tx.commit()
        return entities
    except Exception:
        tx.rollback()
        raise

コード例 #13

0

ファイルを表示

ファイル: match.py プロジェクト: pudo/aleph

def _make_queries(prop, value, specificity):
    """Yield Elasticsearch query clauses for matching one property value.

    Name-typed properties produce a ``match`` clause on ``names.text`` and,
    when the value has a fingerprint, a ``term`` clause on ``fingerprints``;
    both are boosted by ``(1 + specificity) * 2``. Other types that have a
    group produce one ``term`` clause on the group field.
    """
    if prop.type == registry.name:
        boost = (1 + specificity) * 2
        name_clause = {
            'query': value,
            'operator': 'and',
            'minimum_should_match': '60%',
            'boost': boost
        }
        yield {'match': {'names.text': name_clause}}

        fp = fingerprints.generate(value)
        if fp is None:
            return
        yield {'term': {'fingerprints': {'value': fp, 'boost': boost}}}
        return

    if prop.type.group is not None:
        yield {'term': {prop.type.group: {'value': value}}}

コード例 #14

0

ファイルを表示

 def fingerprint(self, values):
     """Return the fingerprints of ``values``, leaving out empty ones.

     Results keep the order of ``values``; any value for which no
     fingerprint can be generated (``None``) is dropped.
     """
     # TODO: this should not be a property thing, so that fp's can include
     # dates etx.
     generated = (fingerprints.generate(value) for value in values)
     return [fp for fp in generated if fp is not None]

コード例 #15

0

ファイルを表示

def _make_queries(prop, value, specificity):
    """Yield Elasticsearch query clauses for matching one property value.

    For name-typed properties, a ``match`` clause on ``fingerprints.text``
    is yielded for the raw value, followed by a second one for the value's
    fingerprint when that fingerprint exists and differs from the raw value.
    Both are boosted by ``(1 + specificity) * 2``. For any other type with a
    group, a single ``term`` clause on the group field is yielded.

    :param prop: the property; ``prop.type`` selects the query style.
    :param value: the property value to search for.
    :param specificity: numeric weight folded into the name boost.
    """
    if prop.type == registry.name:
        boost = (1 + specificity) * 2

        def text_match(text):
            # Both name clauses share everything except the query text.
            return {
                'match': {
                    'fingerprints.text': {
                        'query': text,
                        'operator': 'and',
                        'minimum_should_match': '60%',
                        'boost': boost
                    }
                }
            }

        yield text_match(value)
        fp = fingerprints.generate(value)
        if fp is not None and fp != value:
            # Search for the fingerprint itself; re-sending ``value`` would
            # merely duplicate the clause above.
            yield text_match(fp)
    elif prop.type.group is not None:
        yield {
            'term': {
                prop.type.group: {
                    'value': value
                }
            }
        }

コード例 #16

0

ファイルを表示

def reconcile_op(query):
    """Reconcile operation for a single query.

    Builds a stand-in entity from the query's name, type and properties,
    asks ``similar_entities`` for candidates and converts each candidate
    into a reconciliation match.

    :param query: dict with ``query`` and optional ``limit``, ``type`` and
        ``properties`` keys.
    :returns: ``{'result': [...], 'num': <count>}``.
    """
    options = {
        'limit': query.get('limit', '5'),
        'strict': 'false'
    }
    state = QueryState(options, request.authz)

    name = query.get('query', '')
    entity = {
        'id': 'fake',
        'names': [name],
        'fingerprints': [fingerprints.generate(name)],
        'schemata': ensure_list(query.get('type'))
    }
    for prop in query.get('properties', []):
        entity[prop.get('pid')] = ensure_list(prop.get('v'))

    matches = []
    for candidate in similar_entities(entity, state).get('results'):
        candidate_types = [
            type_ for type_ in get_freebase_types()
            if candidate['schema'] == type_['id']
        ]
        candidate_id = candidate.get('id')
        candidate_name = candidate.get('name')
        matches.append({
            'id': candidate_id,
            'name': candidate_name,
            'type': candidate_types,
            # Scale the score by ten and cap it at 100.
            'score': min(100, candidate.get('score') * 10),
            'uri': entity_link(candidate_id),
            'match': candidate_name == name
        })
    log.info("Reconciled: %r -> %d matches", name, len(matches))
    return {'result': matches, 'num': len(matches)}

コード例 #17

0

ファイルを表示

def process_single(resource):
    """Yield every row of ``resource`` with a ``beneficiary_id`` added.

    The id is the capitalised fingerprint of ``beneficiary_name``, or the
    name itself when no fingerprint can be generated. Rows are modified in
    place and yielded in their original order.
    """
    for row in resource:
        name = row['beneficiary_name']
        fp = fingerprints.generate(name)
        row['beneficiary_id'] = name if fp is None else fp.capitalize()
        yield row

コード例 #18

0

ファイルを表示

ファイル: entity.py プロジェクト: nstallbaumer/corpint

 def fingerprints(self):
     """Return the set of fingerprints of ``self.names``.

     The set is computed on first use and cached on the instance as
     ``_fingerprints``; names without a fingerprint are left out.
     """
     if hasattr(self, '_fingerprints'):
         return self._fingerprints

     self._fingerprints = set()
     for fp in (fingerprints.generate(name) for name in self.names):
         if fp is None:
             continue
         self._fingerprints.add(fp)
     return self._fingerprints

コード例 #19

0

ファイルを表示

ファイル: fingerprint_beneficiaries.py プロジェクト: os-data/eu-structural-funds

def process_single(resource):
    """Add a ``beneficiary_id`` to each row of ``resource`` and yield it.

    Rows whose ``beneficiary_name`` has a fingerprint get that fingerprint,
    capitalised, as their id; all other rows reuse the name unchanged.
    """
    for row in resource:
        fp = fingerprints.generate(row['beneficiary_name'])
        if fp is None:
            beneficiary_id = row['beneficiary_name']
        else:
            beneficiary_id = fp.capitalize()
        row['beneficiary_id'] = beneficiary_id
        yield row

コード例 #20

0

ファイルを表示

def get_declared_holders(codebase, holders_tallies):
    """
    Return a list of declared holders from a codebase using the holders
    detected from key files.

    A declared holder is a copyright holder present in the key files who has the
    highest amount of references throughout the codebase.

    ``holders_tallies`` is an iterable of dicts with ``value`` and ``count``
    keys. Holders are matched between the tallies and the key files by the
    fingerprint of their name. Key-file holders that have no tally entry are
    ignored.
    """
    entry_by_holders = {
        fingerprints.generate(entry['value']): entry
        for entry in holders_tallies if entry['value']
    }
    key_file_holders = get_field_values_from_codebase_resources(
        codebase, 'holders', key_files_only=True)
    entry_by_key_file_holders = {
        fingerprints.generate(entry['holder']): entry
        for entry in key_file_holders if entry['holder']
    }
    unique_key_file_holders = unique(entry_by_key_file_holders.keys())
    # A holder seen in a key file may be missing from the tallies; skip it
    # rather than fail with a KeyError.
    unique_key_file_holders_entries = [
        entry_by_holders[holder]
        for holder in unique_key_file_holders
        if holder in entry_by_holders
    ]

    # Group holder names by how often they were counted.
    holder_by_counts = defaultdict(list)
    for holder_entry in unique_key_file_holders_entries:
        count = holder_entry.get('count')
        if count:
            holder = holder_entry.get('value')
            holder_by_counts[count].append(holder)

    declared_holders = []
    if holder_by_counts:
        highest_count = max(holder_by_counts)
        declared_holders = holder_by_counts[highest_count]

    # If we could not determine a holder, then we return a list of all the
    # unique key file holders
    if not declared_holders:
        declared_holders = [
            entry['value'] for entry in unique_key_file_holders_entries
        ]

    return declared_holders

コード例 #21

0

ファイルを表示

def index_names(data):
    """Derive the ``fingerprints`` and ``names`` fields of ``data`` in place.

    ``fingerprints`` becomes the de-duplicated list of non-empty
    fingerprints of the names; ``names`` becomes the de-duplicated list of
    the names together with their ASCII forms.
    """
    names = data.get('names', [])
    generated = [fingerprints.generate(name) for name in names]
    data['fingerprints'] = list(set(
        [fp for fp in generated if fp is not None]))

    # Add latinised names
    names.extend([ascii_text(name) for name in names])
    data['names'] = list(set(names))

コード例 #22

0

ファイルを表示

def format_proxy(proxy, collection):
    """Turn an entity proxy into an index operation for ``collection``.

    Returns ``None`` for proxies with an abstract schema. Otherwise builds
    the document from ``proxy.to_full_dict()``, adds schemata, caption,
    fingerprints, index text and numeric casts, normalises the context
    fields and returns a dict with ``_id``, ``_index`` and ``_source``.
    """
    # Abstract entities can appear when profile fragments for a missing entity
    # are present.
    if proxy.schema.abstract:
        return None

    data = proxy.to_full_dict()
    data["schemata"] = list(proxy.schema.names)
    data["caption"] = proxy.caption

    names = data.get("names", [])
    candidates = set(fingerprints.generate(name) for name in names)
    candidates.update(names)
    data["fingerprints"] = [c for c in candidates if c is not None]

    # Slight hack: a magic property in followthemoney that gets taken out
    # of the properties and added straight to the index text.
    data["text"] = data.get("properties").pop("indexText", [])

    # integer casting
    numeric = {}
    for prop in proxy.iterprops():
        if prop.type not in NUMERIC_TYPES:
            continue
        numeric[prop.name] = _numeric_values(prop.type, proxy.get(prop))
    # also cast group field for dates
    numeric["dates"] = _numeric_values(registry.date, data.get("dates"))
    data["numeric"] = numeric

    # Context data - from aleph system, not followthemoney.
    data["collection_id"] = collection.id
    data["role_id"] = first(data.get("role_id"))
    data["profile_id"] = first(data.get("profile_id"))
    data["mutable"] = max(ensure_list(data.get("mutable")), default=False)
    data["origin"] = ensure_list(data.get("origin"))

    # Logical simplifications of dates: earliest creation, latest update,
    # with the creation stamps standing in when no update stamps exist.
    created_stamps = ensure_list(data.get("created_at"))
    if len(created_stamps) > 0:
        data["created_at"] = min(created_stamps)
    updated_stamps = ensure_list(data.get("updated_at")) or created_stamps
    if len(updated_stamps) > 0:
        data["updated_at"] = max(updated_stamps)

    # log.info("%s", pformat(data))
    entity_id = data.pop("id")
    index = entities_write_index(proxy.schema)
    return {"_id": entity_id, "_index": index, "_source": data}

コード例 #23

0

ファイルを表示

def get_countries():
    """Return the country rows grouped by the fingerprint of their name.

    The table is cached on ``g`` under ``countries``; on a cache miss it is
    loaded with ``LOAD_COUNTRIES_SQL`` and every row is appended to the
    list stored under the fingerprint of its ``name_to_match`` column.

    :returns: a ``defaultdict(list)`` mapping fingerprint to rows.
    """
    if 'countries' in g:
        return g.countries

    rv = collections.defaultdict(list)

    cursor = mysql.connection.cursor(MySQLdb.cursors.DictCursor)
    try:
        cursor.execute(LOAD_COUNTRIES_SQL)
        rows = cursor.fetchall()
    finally:
        # Release the cursor even when the query fails.
        cursor.close()

    for r in rows:
        rv[fingerprints.generate(r["name_to_match"])].append(r)

    g.countries = rv
    return g.countries

コード例 #24

0

ファイルを表示

def _normalize_names(names):
    """Generate a sequence of comparable names for an entity. This also
    generates a `fingerprint`, i.e. a version of the name where all tokens
    are sorted alphabetically, and some parts, such as company suffixes,
    have been removed."""
    seen = set()
    for name in names:
        plain = normalize(name, ascii=True)
        if plain is not None and plain not in seen:
            seen.add(plain)
            yield plain
        fp = fingerprints.generate(name)
        if fp is not None and len(fp) > 6 and fp not in seen:
            seen.add(fp)
            yield fp

コード例 #25

0

ファイルを表示

ファイル: mapper.py プロジェクト: wcyn/aleph

 def compute_key(self, record):
     """Return a SHA-1 hex key for ``record``, or None without key values.

     The digest is seeded with the dataset name and updated with the value
     of every key field, fingerprinted when ``self.key_fingerprint`` is set
     and converted with ``string_value`` otherwise. Fields whose value ends
     up as None are skipped; when all are skipped, None is returned.
     """
     digest = sha1(self.query.dataset.name.encode('utf-8'))
     used = 0
     for key in self.keys:
         raw = record.get(key)
         value = (fingerprints.generate(raw) if self.key_fingerprint
                  else string_value(raw))
         if value is not None:
             digest.update(value.encode('utf-8'))
             used += 1
     return digest.hexdigest() if used else None

コード例 #26

0

ファイルを表示

ファイル: entities.py プロジェクト: x0rzkov/aleph

def format_proxy(proxy, collection, extra):
    """Turn an entity proxy into an index operation for ``collection``.

    Applies the collection namespace to the proxy, builds the document from
    ``proxy.to_full_dict()``, derives fingerprints, index text, the update
    timestamp and numeric casts, overlays ``extra`` and wraps the result.

    :param extra: dict of overrides merged into the document last.
    :returns: a dict with ``_id``, ``_index`` and ``_source`` keys.
    """
    proxy.context = {}
    proxy = collection.ns.apply(proxy)
    # Pull `indexUpdatedAt` before constructing `data`, so that it doesn't
    # creep into `data['dates']` and mess up date sorting afterwards
    updated_at = proxy.pop('indexUpdatedAt', quiet=True)
    data = proxy.to_full_dict()
    data['collection_id'] = collection.id
    data['schemata'] = list(proxy.schema.names)

    names = ensure_list(data.get('names'))
    fps = set([fingerprints.generate(name) for name in names])
    fps.update(names)
    # Names without a fingerprint yield None; drop it once so it reaches
    # neither ``fingerprints`` nor the index text below.
    fps.discard(None)
    data['fingerprints'] = list(fps)

    # Slight hack: a magic property in followthemoney that gets taken out
    # of the properties and added straight to the index text.
    properties = data.get('properties')
    text = properties.pop('indexText', [])
    text.extend(fps)
    data['text'] = text

    # The collection timestamp is the default; the last ``indexUpdatedAt``
    # value, if any, overrides it.
    data['updated_at'] = collection.updated_at
    for value in updated_at:
        data['updated_at'] = value

    # integer casting
    numeric = {}
    for prop, values in properties.items():
        prop = proxy.schema.get(prop)
        if prop.type in NUMERIC_TYPES:
            numeric[prop.name] = _numeric_values(prop.type, values)
    # also cast group field for dates
    numeric['dates'] = _numeric_values(registry.date, data.get('dates'))
    data['numeric'] = numeric

    # add possible overrides
    data.update(extra)

    # log.info("%s", pformat(data))
    entity_id = data.pop('id')
    return {
        '_id': entity_id,
        '_index': entities_write_index(data.get('schema')),
        '_source': data
    }

コード例 #27

0

ファイルを表示

ファイル: mapper.py プロジェクト: pudo-attic/leadgraph

 def compute_key(self, record):
     """Return a SHA-1 hex key for ``record``, or None without key values.

     Returns None straight away when no key fields are configured.
     Otherwise the digest is seeded with the dataset name and updated with
     the value of every key field, fingerprinted when
     ``self.key_fingerprint`` is set and cleaned with ``clean_text``
     otherwise. Fields whose value ends up as None are skipped; when all
     are skipped, None is returned.
     """
     if len(self.keys) == 0:
         return None

     digest = sha1(self.query.dataset.name.encode('utf-8'))
     # digest.update(self.schema.name.encode('utf-8'))
     used = 0
     for key in self.keys:
         raw = record.get(key)
         value = (fingerprints.generate(raw) if self.key_fingerprint
                  else clean_text(raw))
         if value is not None:
             digest.update(value.encode('utf-8'))
             used += 1
     return digest.hexdigest() if used else None

コード例 #28

0

ファイルを表示

def format_proxy(proxy, collection):
    """Turn an entity proxy into an index operation for ``collection``.

    Builds the document from ``proxy.to_full_dict()``, derives schemata,
    fingerprints, index text and numeric casts, normalises the context
    fields and wraps the result.

    :returns: a dict with ``_id``, ``_index`` and ``_source`` keys.
    """
    data = proxy.to_full_dict()
    data["schemata"] = list(proxy.schema.names)

    names = data.get("names", [])
    fps = set([fingerprints.generate(name) for name in names])
    fps.update(names)
    # Names without a fingerprint yield None; drop it once so it reaches
    # neither ``fingerprints`` nor the index text below.
    fps.discard(None)
    data["fingerprints"] = list(fps)

    # Slight hack: a magic property in followthemoney that gets taken out
    # of the properties and added straight to the index text.
    properties = data.get("properties")
    text = properties.pop("indexText", [])
    text.extend(fps)
    data["text"] = text

    # integer casting
    numeric = {}
    for prop in proxy.iterprops():
        if prop.type in NUMERIC_TYPES:
            values = proxy.get(prop)
            numeric[prop.name] = _numeric_values(prop.type, values)
    # also cast group field for dates
    numeric["dates"] = _numeric_values(registry.date, data.get("dates"))
    data["numeric"] = numeric

    # Context data - from aleph system, not followthemoney.
    # FIXME: Can there ever really be multiple role_ids?
    data["role_id"] = first(data.get("role_id"))
    data["mutable"] = max(ensure_list(data.get("mutable")), default=False)
    data["origin"] = ensure_list(data.get("origin"))
    created_at = data.get("created_at")
    if created_at:
        # Fall back to the creation stamp when there is no update stamp.
        data["updated_at"] = data.get("updated_at", created_at)
    data["collection_id"] = collection.id
    # log.info("%s", pformat(data))
    entity_id = data.pop("id")
    return {
        "_id": entity_id,
        "_index": entities_write_index(data.get("schema")),
        "_source": data,
    }

コード例 #29

0

ファイルを表示

ファイル: entities.py プロジェクト: djoffrey/aleph

def format_proxy(proxy, collection):
    """Turn an entity proxy into an index operation for ``collection``.

    Builds the document from ``proxy.to_full_dict()``, derives schemata,
    fingerprints, index text and numeric casts, normalises the context
    fields (timestamps default to the current UTC time) and wraps the
    result.

    :returns: a dict with ``_id``, ``_index`` and ``_source`` keys.
    """
    data = proxy.to_full_dict()
    data['schemata'] = list(proxy.schema.names)

    names = ensure_list(data.get('names'))
    fps = set([fingerprints.generate(name) for name in names])
    fps.update(names)
    # Names without a fingerprint yield None; drop it once so it reaches
    # neither ``fingerprints`` nor the index text below.
    fps.discard(None)
    data['fingerprints'] = list(fps)

    # Slight hack: a magic property in followthemoney that gets taken out
    # of the properties and added straight to the index text.
    properties = data.get('properties')
    text = properties.pop('indexText', [])
    text.extend(fps)
    data['text'] = text

    # integer casting
    numeric = {}
    for prop in proxy.iterprops():
        if prop.type in NUMERIC_TYPES:
            values = proxy.get(prop)
            numeric[prop.name] = _numeric_values(prop.type, values)
    # also cast group field for dates
    numeric['dates'] = _numeric_values(registry.date, data.get('dates'))
    data['numeric'] = numeric

    # Context data - from aleph system, not followthemoney.
    now = iso_text(datetime.utcnow())
    data['created_at'] = min(ensure_list(data.get('created_at')), default=now)
    # NOTE(review): this keeps the *earliest* updated_at; confirm that the
    # latest one (max) is not what is wanted here.
    data['updated_at'] = min(ensure_list(data.get('updated_at')), default=now)
    # FIXME: Can there ever really be multiple role_ids?
    data['role_id'] = first(data.get('role_id'))
    data['mutable'] = max(ensure_list(data.get('mutable')), default=False)
    data['origin'] = ensure_list(data.get('origin'))
    data['collection_id'] = collection.id
    # log.info("%s", pformat(data))
    entity_id = data.pop('id')
    return {
        '_id': entity_id,
        '_index': entities_write_index(data.get('schema')),
        '_source': data
    }

コード例 #30

0

ファイルを表示

ファイル: util.py プロジェクト: occrp/tabref

def normalize_value(text):
    """Return the UTF-8 encoded, space-padded fingerprint of a cell value.

    Returns None when the value is None, parses as a number, has no
    fingerprint, or has a fingerprint of three characters or fewer.

    :param text: the raw cell value.
    :returns: bytes of the form `` <fingerprint> `` or None.
    """
    if text is None:
        return

    try:
        # see if this the cell value clearly numeric:
        float(text)
    except (TypeError, ValueError, OverflowError):
        # Not a number. Only the errors float() raises for bad input are
        # caught, so KeyboardInterrupt and the like still propagate.
        pass
    else:
        return

    # keep_order=True is forwarded to fingerprints.generate unchanged.
    text = fingerprints.generate(text, keep_order=True)
    if text is None:
        return

    if len(text) <= 3:
        return

    text = u' %s ' % text
    return text.encode('utf-8')

コード例 #31

0

ファイルを表示

def reconcile_op(query):
    """Reconcile operation for a single query.

    Builds a stand-in entity from the query's name, type (``'Thing'`` when
    missing) and properties, runs a ``SimilarEntitiesQuery`` for it and
    converts every hit into a reconciliation match.

    :param query: dict with ``query`` and optional ``limit``, ``type`` and
        ``properties`` keys.
    :returns: ``{'result': [...], 'num': <count>}``.
    """
    options = {
        'limit': query.get('limit', '5'),
        'strict': 'false'
    }
    parser = SearchQueryParser(options, request.authz)

    name = query.get('query', '')
    schema = query.get('type') or 'Thing'
    entity = {
        'id': 'fake',
        'names': [name],
        'fingerprints': [fingerprints.generate(name)],
        'schemata': ensure_list(schema),
        'schema': schema
    }
    for prop in query.get('properties', []):
        entity[prop.get('pid')] = ensure_list(prop.get('v'))

    similar = SimilarEntitiesQuery(parser, entity=entity)
    matches = []
    for doc in similar.search().get('hits').get('hits'):
        source = doc.get('_source')
        doc_id = doc.get('_id')
        doc_name = source.get('name')
        match = {
            'id': doc_id,
            'name': doc_name,
            # Scale the score by ten and cap it at 100.
            'score': min(100, doc.get('_score') * 10),
            'uri': entity_url(doc_id),
            'match': doc_name == name
        }
        # ``type`` is only set when a known type has the hit's schema as
        # its id; the last such type wins.
        for type_ in get_freebase_types():
            if source['schema'] == type_['id']:
                match['type'] = [type_]
        matches.append(match)

    log.info("Reconciled: %r -> %d matches", name, len(matches))
    return {'result': matches, 'num': len(matches)}

コード例 #32

0

ファイルを表示

ファイル: entities.py プロジェクト: wdsn/aleph

def format_proxy(proxy, collection, job_id=None):
    """Turn an entity proxy into an index operation for ``collection``.

    Applies the collection namespace to the proxy, builds the document from
    ``proxy.to_full_dict()``, adds collection and job context, derives
    fingerprints, index text, the update timestamp and numeric casts, and
    wraps the result.

    :returns: a dict with ``_id``, ``_index`` and ``_source`` keys.
    """
    proxy.context = {}
    proxy = collection.ns.apply(proxy)
    data = proxy.to_full_dict()
    data['collection_id'] = collection.id
    data['job_id'] = job_id
    names = ensure_list(data.get('names'))
    fps = set([fingerprints.generate(name) for name in names])
    fps.update(names)
    # Names without a fingerprint yield None; drop it once so it reaches
    # neither ``fingerprints`` nor the index text below.
    fps.discard(None)
    data['fingerprints'] = list(fps)

    # Slight hack: a magic property in followthemoney that gets taken out
    # of the properties and added straight to the index text.
    properties = data.get('properties')
    text = properties.pop('indexText', [])
    text.extend(fps)
    text.append(collection.label)
    data['text'] = text

    # The collection timestamp is the default; the last ``indexUpdatedAt``
    # value, if any, overrides it.
    data['updated_at'] = collection.updated_at
    for updated_at in properties.pop('indexUpdatedAt', []):
        data['updated_at'] = updated_at

    # integer casting
    numeric = {}
    for prop, values in properties.items():
        prop = proxy.schema.get(prop)
        if prop.type in NUMERIC_TYPES:
            numeric[prop.name] = _numeric_values(prop.type, values)
    # also cast group field for dates
    numeric['dates'] = _numeric_values(registry.date, data.get('dates'))
    data['numeric'] = numeric

    # pprint(data)
    entity_id = data.pop('id')
    return {
        '_id': entity_id,
        '_index': entities_write_index(data.get('schema')),
        '_source': data
    }

コード例 #33

0

ファイルを表示

ファイル: entities.py プロジェクト: mustafaascha/aleph

def finalize_index(proxy, context, texts):
    """Build the cleaned index document for an entity proxy.

    Appends the proxy's values to ``texts`` (skipping entity, date, url,
    country and language typed properties), merges ``context`` with the
    proxy's full dict, and sets ``name``, ``text`` and ``fingerprints``.
    ``created_at`` falls back to ``updated_at`` when empty, and ``id`` is
    removed.

    :returns: the document passed through ``clean_dict``.
    """
    skipped_types = ('entity', 'date', 'url', 'country', 'language')
    texts.extend(
        value for prop, value in proxy.itervalues()
        if prop.type.name not in skipped_types
    )

    data = merge_data(context, proxy.to_full_dict())
    data['name'] = proxy.caption
    data['text'] = index_form(texts)

    generated = [fingerprints.generate(name)
                 for name in data.get('names', [])]
    data['fingerprints'] = list(set(
        [fp for fp in generated if fp is not None]))

    if not data.get('created_at'):
        data['created_at'] = data.get('updated_at')
    data.pop('id', None)
    return clean_dict(data)

コード例 #34

0

ファイルを表示

ファイル: entities.py プロジェクト: pudo/aleph

def index_operation(data):
    """Apply final denormalisations to an entity before indexing.

    ``data`` is modified in place:

    * ``bulk`` defaults to ``False`` when missing.
    * ``fingerprints`` becomes the union of the raw names and their
      generated fingerprints, with ``None`` entries removed.
    * ``text`` is extended with the ``indexText`` property (which is
      removed from ``properties``) and with the fingerprints.
    * ``created_at`` falls back to ``updated_at`` when empty.
    * ``id`` and ``_index`` are removed.

    Returns a tuple ``(entity_id, index, data)`` where ``entity_id`` is
    the stringified former ``id`` and ``index`` is the result of
    ``entities_write_index`` for the entity's schema.
    """
    data['bulk'] = data.get('bulk', False)
    names = ensure_list(data.get('names'))
    candidates = set(fingerprints.generate(name) for name in names)
    candidates.update(names)
    # generate() may yield None for a name; filter once so that neither the
    # fingerprints field nor the index text ends up holding a None entry.
    fps = [fp for fp in candidates if fp is not None]
    data['fingerprints'] = fps

    # Slight hack: a magic property in followthemoney that gets taken out
    # of the properties and added straight to the index text.
    texts = data.pop('text', [])
    texts.extend(data.get('properties', {}).pop('indexText', []))
    texts.extend(fps)
    data['text'] = texts

    if not data.get('created_at'):
        data['created_at'] = data.get('updated_at')

    entity_id = str(data.pop('id'))
    data.pop('_index', None)
    index = entities_write_index(data.get('schema'))
    return entity_id, index, data

コード例 #35

0

ファイルを表示

ファイル: match.py プロジェクト: modulexcite/aleph

def _make_queries(prop, value):
    """Yield query clauses (dicts) for matching one property value.

    Nothing is yielded when the property type reports a specificity of 0
    for ``value``. For name-typed properties a ``match`` clause on
    ``names.text`` is yielded, followed by a ``term`` clause on
    ``fingerprints`` when a fingerprint can be generated. For any other
    type that has a group, a single ``term`` clause on that group field is
    yielded, boosted by the specificity.
    """
    specificity = prop.type.specificity(value)
    if specificity == 0:
        return

    if prop.type == registry.name:
        # Both name clauses share the same boost.
        name_boost = (1 + specificity) * 2
        text_clause = {
            'query': value,
            'operator': 'and',
            'minimum_should_match': '60%',
            'boost': name_boost
        }
        yield {'match': {'names.text': text_clause}}

        fingerprint = fingerprints.generate(value)
        if fingerprint is not None:
            fingerprint_clause = {'value': fingerprint, 'boost': name_boost}
            yield {'term': {'fingerprints': fingerprint_clause}}
    elif prop.type.group is not None:
        group_clause = {'value': value, 'boost': specificity}
        yield {'term': {prop.type.group: group_clause}}

コード例 #36

0

ファイルを表示

ファイル: model.py プロジェクト: pudo/linkage

 def generate_linktab(self, chunk_size=10000):
     """Rebuild this view's rows in the link table.

     Inside one ``engine.begin()`` block, the rows whose ``view`` equals
     ``self.name`` are deleted and then one row is inserted per value
     from ``self.distinct_key()`` that yields a fingerprint. Rows are
     inserted in batches of ``chunk_size``.
     """
     with self.config.engine.begin() as conn:
         linktab = self.config.linktab
         stale_rows = linktab.delete().where(linktab.c.view == self.name)
         conn.execute(stale_rows)

         batch = []
         for index, key in enumerate(self.distinct_key()):
             fingerprint = fingerprints.generate(key)
             if fingerprint is None:
                 continue
             batch.append({
                 'view': self.name,
                 'serial': self.serial,
                 'key': key,
                 # Cut to 255 characters; per the original author this is
                 # due to postgres' levenshtein.
                 'fingerprint': fingerprint[:255]
             })
             if len(batch) == chunk_size:
                 log.info('Linktab %s (%s): %s', self.name, self.key_ref,
                          index)
                 conn.execute(linktab.insert(), batch)
                 batch = []

         # Write out whatever is left over from the last partial batch.
         if batch:
             conn.execute(linktab.insert(), batch)

コード例 #37

0

ファイルを表示

ファイル: common.py プロジェクト: CodeForAfrica/aleph

def make_fingerprint(text, **kwargs):
    """Return the fingerprint of ``text``, used as a normalised entity name.

    ``text`` is first passed through ``string_value``; any extra keyword
    arguments are accepted and ignored.
    """
    as_string = string_value(text)
    return fingerprints.generate(as_string)

コード例 #38

0

ファイルを表示

ファイル: util.py プロジェクト: pudo-attic/leadgraph

def normalizeaddress(value):
    """Return the fingerprint generated for the address ``value``."""
    normalized = fingerprints.generate(value)
    return normalized

コード例 #39

0

ファイルを表示

ファイル: types.py プロジェクト: pudo-attic/leadgraph

 def normalize_value(self, value, prop, record):
     """Return a one-element list holding the fingerprint of ``value``.

     ``prop`` and ``record`` are accepted but not used here.
     """
     fingerprint = fingerprints.generate(value)
     return [fingerprint]

コード例 #40

0

ファイルを表示

ファイル: converter.py プロジェクト: nivertech/aleph

def fingerprint(value, **kwargs):
    """Convert ``value`` with ``string_value`` and return its fingerprint.

    Extra keyword arguments are accepted and ignored.
    """
    as_string = string_value(value)
    return fingerprints.generate(as_string)

コード例 #41

0

ファイルを表示

ファイル: converter.py プロジェクト: CodeForAfrica/aleph

def addressfp(value, **kwargs):
    """Return an order-preserving fingerprint for an address value.

    ``value`` is converted with ``string_value``; when that yields ``None``
    the function returns ``None``. Otherwise every ``<br/>`` is replaced
    by a space before the fingerprint is generated with
    ``keep_order=True``. Extra keyword arguments are ignored.
    """
    text = string_value(value)
    if text is not None:
        single_line = text.replace("<br/>", " ")
        return fingerprints.generate(single_line, keep_order=True)
    return None

コード例 #42

0

ファイルを表示

ファイル: test.py プロジェクト: backgroundcheck/fingerprints

# coding: utf-8
import fingerprints

# Sample names used to eyeball the output of fingerprints.generate().
tests = [
    u'Foo (Bar) Corp',
    u'ähnlIIch',
    'Open S.A.R.L.',
    'Mr. Boaty McBoatface',
    u'РАДИК ІВАН ЛЬВОВИЧ',
    u'КУШНАРЬОВ ДМИТРО ВІТАЛІЙОВИЧ',
    u'Foo (Bar) CORPORATION',
    'Mr. Sherlock Holmes',
    'Siemens Aktiengesellschaft',
    'New York, New York',
    u'Foo Gesellschaft mit beschränkter Haftung',
    'Software und- Systemgesellschaft mit beschr Haftung'
]

for test in tests:
    out = fingerprints.generate(test)
    # Parenthesised single-argument form works as a Python 2 print
    # statement and as the Python 3 print() function.
    print(out)