def __init__(self, entity_info_file, name, get_field, norm=None):
    """Build an entity KB from a pickled (entity_ids, entity_names) pair.

    Maps several normalized variants of each entity name to its index and
    caches the resulting lookup table next to the input file.
    """
    # Note: logging here is absl.logging (log_first_n / log_every_n_seconds).
    logging.info('building entity kb...')
    with open(entity_info_file, 'rb') as f:
        [self.entity_ids, self.entity_names] = pickle.load(f)
    self.emap = dict()
    # Entities expected but absent from the pickle; they get fresh indices
    # appended after all real entities.
    self.missing_entities = ['army', 'navy']
    if not os.path.exists(entity_info_file + '.cache.pkl'):
        for idx in range(len(self.entity_ids)):
            logging.log_first_n(logging.INFO, 'entity kb: %s -> %s', 10, self.entity_names[idx], idx)
            logging.log_every_n_seconds(logging.INFO, 'entity kb: %s of %s', 10, idx, len(self.entity_ids))
            # Map the lowercased raw name and progressively normalized
            # variants to this index; variants are only added if no earlier
            # entity has already claimed them.
            self.emap[self.entity_names[idx].lower()] = idx
            normalized = normalize_name(self.entity_names[idx])
            splt = split(normalized)
            cleaned = clean(splt)
            nostop = remove_stopwords(cleaned)
            if normalized not in self.emap:
                self.emap[normalized] = idx
            if splt not in self.emap:
                self.emap[splt] = idx
            if cleaned not in self.emap:
                self.emap[cleaned] = idx
            if nostop not in self.emap:
                self.emap[nostop] = idx
        for me in self.missing_entities:
            self.emap[me] = len(self.emap)
        # Cache the lookup table so subsequent runs skip the build loop.
        with open(entity_info_file + '.cache.pkl', 'wb') as fout:
            pickle.dump(self.emap, fout)
    else:
        with open(entity_info_file + '.cache.pkl', 'rb') as fin:
            self.emap = pickle.load(fin)
    self.name = name
    self.get_field = get_field
    logging.info('building entity kb...done')
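The table built here is what `main()` in the third example probes. A minimal lookup sketch, assuming this `__init__` belongs to `EntityKBFeatures` (as the constructor call in `main()` suggests) and that `EntityKBFeatures` and `normalize_name` are importable from the surrounding module; the query string is invented for illustration:

kb = EntityKBFeatures('data/assignee/permid/permid_entity_info.pkl',
                      'permid', None)  # the name and get_field values here are hypothetical

query = 'International Business Machines Corp.'  # made-up example string
idx = kb.emap.get(normalize_name(query))         # None if no normalized variant matches
if idx is not None and idx < len(kb.entity_names):
    print('matched entity:', kb.entity_names[idx])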
Example #2
def __init__(self, uuid, name_hash, canopies, name_features, mention_ids, record_ids, location_strings,
             unique_exact_strings):
    self.uuid = uuid
    self.name_hash = name_hash
    self.name_features = name_features
    self.mention_ids = mention_ids
    self.record_ids = record_ids
    self.location_strings = location_strings
    self.canopies = canopies
    # Mapping from exact surface string -> number of times it appears.
    self.unique_exact_strings = unique_exact_strings
    # Canonical form: normalize the surface string with the highest count.
    self.normalized_most_frequent = normalize_name(max(self.unique_exact_strings.items(), key=lambda x: x[1])[0])
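The last line above selects the most frequent surface form by count before normalizing it; a small self-contained illustration of that `max(..., key=...)` expression with made-up counts:

# Hypothetical per-cluster counts of exact surface strings.
unique_exact_strings = {'I.B.M.': 3, 'IBM Corp': 12, 'ibm corporation': 5}

# max over (string, count) pairs keyed on the count returns the pair with the
# largest count; [0] then takes the string itself.
most_frequent = max(unique_exact_strings.items(), key=lambda x: x[1])[0]
assert most_frequent == 'IBM Corp'
# In the class above, this string is then passed through normalize_name(...).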
Example #3
# Stdlib / third-party imports used below. get_dataframe_from_pymysql_cursor,
# normalize_name, and EntityKBFeatures are project-level helpers assumed to be
# importable from the surrounding package.
import collections
import pickle

import pymysql
from tqdm import tqdm


def main():
    granted_db = pymysql.connect(read_default_file="~/.mylogin.cnf",
                                 database='patent_20200630')

    old_disambig_data_query = """
    select count(1) as cluster_size, assignee_id
    from rawassignee
    group by assignee_id;
    """

    old_disambig_data = get_dataframe_from_pymysql_cursor(
        granted_db, old_disambig_data_query)

    mention_query = """
    SELECT rawassignee.uuid, disambiguated_id, organization, name_first, name_last 
    from rawassignee INNER JOIN tmp_assignee_disambiguation_granted ON rawassignee.uuid=tmp_assignee_disambiguation_granted.uuid;
    """

    mention_data = get_dataframe_from_pymysql_cursor(granted_db,
                                                     mention_query).to_numpy()

    entity_id_query = """
        SELECT DISTINCT(disambiguated_id)
        from tmp_assignee_disambiguation_granted;
    """

    entity_data = get_dataframe_from_pymysql_cursor(
        granted_db, entity_id_query).to_numpy()

    # For each disambiguated entity, count how often each surface name occurs.
    # Prefer the organization string; fall back to "first last" for people.
    entity2namecount = collections.defaultdict(dict)
    for i in tqdm(range(mention_data.shape[0]), 'counting',
                  mention_data.shape[0]):
        name = mention_data[i][2] if mention_data[i][2] else '%s %s' % (
            mention_data[i][3], mention_data[i][4])
        if name not in entity2namecount[mention_data[i][1]]:
            entity2namecount[mention_data[i][1]][name] = 1
        else:
            entity2namecount[mention_data[i][1]][name] += 1

    # Algorithm:
    # - If any name in the cluster normalizes to a known PermID entity, use the
    #   most frequent such name.
    # - Otherwise pick the most frequent name in the cluster (frequency = the
    #   number of patents the name appears on).
    # - Normalize characters that do not render properly in HTML.
    # TODO: Normalize & tie-break!

    # PermID entity KB: its emap holds normalized entity-name variants.
    entity_kb = EntityKBFeatures('data/assignee/permid/permid_entity_info.pkl',
                                 None, None)
    canonical_names = dict()
    for entity, name2count in tqdm(entity2namecount.items(), 'canonicalizing'):
        # Names for this entity, most frequent first.
        sorted_pairs = sorted(name2count.items(), key=lambda x: x[1], reverse=True)
        # Prefer the most frequent name whose normalized form is a known PermID entity.
        for n, c in sorted_pairs:
            if normalize_name(n) in entity_kb.emap:
                canonical_names[entity] = n
                break
        # Otherwise fall back to the most frequent name overall.
        if entity not in canonical_names:
            canonical_names[entity] = sorted_pairs[0][0]

    with open('assignee_canonical.pkl', 'wb') as fout:
        pickle.dump(canonical_names, fout)
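A quick sanity check of the pickle written by `main()` above; a sketch that only assumes the output file exists in the working directory:

import pickle

# Load the entity-id -> canonical-name mapping and print a few entries.
with open('assignee_canonical.pkl', 'rb') as fin:
    canonical_names = pickle.load(fin)

for entity_id, name in list(canonical_names.items())[:5]:
    print(entity_id, '->', name)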