def __init__(self, entity_info_file, name, get_field, norm=None):
    """Build an entity knowledge base mapping name variants to entity indices.

    Loads ``(entity_ids, entity_names)`` from the pickled ``entity_info_file``
    and constructs ``self.emap``: a dict from several normalized forms of each
    entity name to that entity's index. The finished map is cached on disk
    next to the input file so later runs can skip the normalization pass.
    """
    logging.info('building entity kb...')
    with open(entity_info_file, 'rb') as f:
        [self.entity_ids, self.entity_names] = pickle.load(f)
    self.emap = dict()
    self.missing_entities = ['army', 'navy']
    cache_file = entity_info_file + '.cache.pkl'
    if os.path.exists(cache_file):
        # Fast path: reuse the previously built name->index map.
        with open(cache_file, 'rb') as fin:
            self.emap = pickle.load(fin)
    else:
        total = len(self.entity_ids)
        for idx in range(total):
            entity_name = self.entity_names[idx]
            logging.log_first_n(logging.INFO, 'entity kb: %s -> %s', 10, entity_name, idx)
            logging.log_every_n_seconds(logging.INFO, 'entity kb: %s of %s', 10, idx, total)
            # The raw lowercase form is written unconditionally (later
            # entities may overwrite earlier ones for the same string).
            self.emap[entity_name.lower()] = idx
            # Progressively normalized variants only claim unused slots.
            normalized = normalize_name(entity_name)
            splt = split(normalized)
            cleaned = clean(splt)
            nostop = remove_stopwords(cleaned)
            for variant in (normalized, splt, cleaned, nostop):
                self.emap.setdefault(variant, idx)
        # Hand-added entities get fresh ids past the loaded vocabulary.
        for me in self.missing_entities:
            self.emap[me] = len(self.emap)
        with open(cache_file, 'wb') as fout:
            pickle.dump(self.emap, fout)
    self.name = name
    self.get_field = get_field
    logging.info('building entity kb...done')
def __init__(self, uuid, name_hash, canopies, name_features, mention_ids, record_ids, location_strings, unique_exact_strings):
    """Container for one entity cluster's identifiers and name statistics.

    ``unique_exact_strings`` maps each exact surface string to its
    occurrence count; the most frequent string is normalized once here
    and kept as ``self.normalized_most_frequent``.
    """
    self.uuid = uuid
    self.name_hash = name_hash
    self.canopies = canopies
    self.name_features = name_features
    self.mention_ids = mention_ids
    self.record_ids = record_ids
    self.location_strings = location_strings
    self.unique_exact_strings = unique_exact_strings
    # Most common surface form, normalized. Ties resolve to the first
    # maximal key in dict order, exactly as max() over items would.
    most_frequent = max(self.unique_exact_strings, key=self.unique_exact_strings.get)
    self.normalized_most_frequent = normalize_name(most_frequent)
def main():
    """Compute a canonical display name for every disambiguated assignee.

    Algorithm:
      1. Join raw assignee mentions with their disambiguated cluster ids.
      2. Count how often each surface name appears within each cluster
         (frequency = number of patent mentions carrying that name).
      3. For each cluster, prefer the most frequent name whose normalized
         form is linked to a PermID entity; otherwise fall back to the most
         frequent name overall.

    Writes the resulting ``{entity_id: canonical_name}`` dict to
    ``assignee_canonical.pkl``.
    # TODO: Normalize & Tie-break! (normalize characters not displayed in
    # html; break frequency ties deterministically)
    """
    granted_db = pymysql.connect(read_default_file="~/.mylogin.cnf",
                                 database='patent_20200630')
    # NOTE(review): the original also ran a cluster-size query over
    # `rawassignee` and a DISTINCT(disambiguated_id) query, but neither
    # result was ever used — both dead DB round-trips are removed here.
    mention_query = """ SELECT rawassignee.uuid, disambiguated_id, organization, name_first, name_last from rawassignee INNER JOIN tmp_assignee_disambiguation_granted ON rawassignee.uuid=tmp_assignee_disambiguation_granted.uuid; """
    mention_data = get_dataframe_from_pymysql_cursor(granted_db,
                                                     mention_query).to_numpy()

    # entity_id -> Counter(surface name -> mention count)
    entity2namecount = collections.defaultdict(collections.Counter)
    for i in tqdm(range(mention_data.shape[0]), 'counting',
                  mention_data.shape[0]):
        # Use the organization name when present; otherwise fall back to
        # "first last" of the individual inventor/assignee.
        name = mention_data[i][2] if mention_data[i][2] else '%s %s' % (
            mention_data[i][3], mention_data[i][4])
        entity2namecount[mention_data[i][1]][name] += 1

    entity_kb = EntityKBFeatures('data/assignee/permid/permid_entity_info.pkl',
                                 None, None)
    canonical_names = dict()
    for entity, name2count in tqdm(entity2namecount.items(), 'canonicalizing'):
        # Most frequent first; ties keep insertion order (as before).
        ranked = sorted(name2count.items(), key=lambda x: x[1], reverse=True)
        for candidate, _count in ranked:
            # Prefer a name that links to a PermID entity.
            if normalize_name(candidate) in entity_kb.emap:
                canonical_names[entity] = candidate
                break
        else:
            # No linked name in the cluster: use the most frequent one.
            canonical_names[entity] = ranked[0][0]

    # `pickle` is imported at module level (used by EntityKBFeatures), so the
    # original's function-local `import pickle` was redundant.
    with open('assignee_canonical.pkl', 'wb') as fout:
        pickle.dump(canonical_names, fout)