import os
import pickle

import numpy as np
from absl import logging  # provides log_first_n / log_every_n_seconds

# normalize_name, split, clean, remove_stopwords, and the feature/encoder
# classes used below are project-local helpers; their imports are not shown
# in this excerpt.


def __init__(self, entity_info_file, name, get_field, norm=None):
    logging.info('building entity kb...')
    with open(entity_info_file, 'rb') as f:
        [self.entity_ids, self.entity_names] = pickle.load(f)
    self.emap = dict()
    self.missing_entities = ['army', 'navy']
    if not os.path.exists(entity_info_file + '.cache.pkl'):
        # Map several normalized variants of each entity name to its row index.
        for idx in range(len(self.entity_ids)):
            logging.log_first_n(logging.INFO, 'entity kb: %s -> %s', 10,
                                self.entity_names[idx], idx)
            logging.log_every_n_seconds(logging.INFO, 'entity kb: %s of %s', 10,
                                        idx, len(self.entity_ids))
            self.emap[self.entity_names[idx].lower()] = idx
            normalized = normalize_name(self.entity_names[idx])
            splt = split(normalized)
            cleaned = clean(splt)
            nostop = remove_stopwords(cleaned)
            # Earlier entities win ties: a variant is only added if unclaimed.
            if normalized not in self.emap:
                self.emap[normalized] = idx
            if splt not in self.emap:
                self.emap[splt] = idx
            if cleaned not in self.emap:
                self.emap[cleaned] = idx
            if nostop not in self.emap:
                self.emap[nostop] = idx
        # Assign fresh ids to entities known to be absent from the KB.
        for me in self.missing_entities:
            self.emap[me] = len(self.emap)
        with open(entity_info_file + '.cache.pkl', 'wb') as fout:
            pickle.dump(self.emap, fout)
    else:
        with open(entity_info_file + '.cache.pkl', 'rb') as fin:
            self.emap = pickle.load(fin)
    self.name = name
    self.get_field = get_field
    logging.info('building entity kb...done')
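

# Sketch (illustrative, not part of the original module): how the variant map
# built in __init__ behaves. demo_normalize is a hypothetical stand-in for the
# repo's normalize_name / split / clean / remove_stopwords cascade.
def _entity_kb_demo():
    import re

    def demo_normalize(s):  # lowercase + collapse whitespace (assumption)
        return re.sub(r'\s+', ' ', s.lower()).strip()

    names = ['Acme Corp.', 'Globex LLC']
    emap = {}
    for idx, raw in enumerate(names):
        emap[raw.lower()] = idx
        norm = demo_normalize(raw)
        if norm not in emap:  # first entity to claim a variant keeps it
            emap[norm] = idx
    # The raw lowercased name and its normalized form resolve to the same id:
    assert emap['acme corp.'] == emap[demo_normalize('Acme  Corp.')] == 0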


def from_config(config):
    logging.info('Building Assignee Model...')
    # Features:
    # name_features = HashingVectorizerFeatures('name_features', lambda x: x.name_features)
    locations = HashingVectorizerFeatures('locations', lambda x: x.location_strings)
    canopy_feat = HashingVectorizerFeatures('canopy', lambda x: x.canopies)
    entity_kb_feat = EntityKBFeatures('resources/permid_entity_info.pkl',
                                      'entitykb', lambda x: x)
    # PatentID Features
    # patent_id = HashingVectorizerFeatures('patentid', lambda x: x.record_id)
    name_tfidf = SKLearnVectorizerFeatures(
        config['assignee']['assignee_name_model'], 'name_tfidf',
        lambda x: clean(split(x.normalized_most_frequent)))
    # Each entry: (encoder, feature calc, centroid type, must-link, must-not-link).
    triples = [(locations, FeatCalc.DOT, CentroidType.NORMED, False, False),
               (entity_kb_feat, FeatCalc.NO_MATCH, CentroidType.BINARY, False, True),
               (name_tfidf, FeatCalc.DOT, CentroidType.NORMED, False, False)]
    encoders = [t[0] for t in triples]
    feature_types = [t[1] for t in triples]
    centroid_types = [t[2] for t in triples]
    must_links = set([t[0].name for t in triples if t[3]])
    must_not_links = set([t[0].name for t in triples if t[4]])
    assert len(encoders) == len(feature_types)
    assert len(feature_types) == len(centroid_types)
    return EncodingModel(encoders, 'AssigneeModelWithApps', {}, feature_types,
                         centroid_types, must_links, must_not_links)
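

# Sketch: the 5-tuple convention used in from_config, in isolation. Field
# order is (encoder, feature calc, centroid type, must_link?, must_not_link?);
# only encoders flagged True land in the corresponding constraint set. The
# namedtuple stands in for the real encoder objects, which expose .name.
def _constraint_sets_demo():
    from collections import namedtuple
    Enc = namedtuple('Enc', 'name')
    triples = [(Enc('locations'), 'DOT', 'NORMED', False, False),
               (Enc('entitykb'), 'NO_MATCH', 'BINARY', False, True),
               (Enc('name_tfidf'), 'DOT', 'NORMED', False, False)]
    must_links = set(t[0].name for t in triples if t[3])
    must_not_links = set(t[0].name for t in triples if t[4])
    # Only the entity-KB feature acts as a constraint (a must-not-link):
    assert must_links == set()
    assert must_not_links == {'entitykb'}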


def encode(self, things_to_encode):
    # Default to -1 ("no entity match") for every record.
    res = -1 * np.ones(len(things_to_encode), dtype=np.int32)
    for idx, x in enumerate(things_to_encode):
        if x.normalized_most_frequent in self.emap:
            logging.log_first_n(logging.INFO, 'in entity kb (normalized): %s %s', 10,
                                x.normalized_most_frequent,
                                self.emap[x.normalized_most_frequent])
            res[idx] = self.emap[x.normalized_most_frequent]
        else:
            # Fall back to progressively more aggressive normalizations.
            splt_x = split(x.normalized_most_frequent)
            cleaned = clean(splt_x)
            # nostop = remove_stopwords(cleaned)
            if splt_x in self.emap:
                logging.log_first_n(logging.INFO, 'in entity kb (split): %s %s', 10,
                                    splt_x, self.emap[splt_x])
                res[idx] = self.emap[splt_x]
            elif cleaned in self.emap:
                logging.log_first_n(logging.INFO, 'in entity kb (cleaned): %s %s', 10,
                                    cleaned, self.emap[cleaned])
                res[idx] = self.emap[cleaned]
            # elif nostop in self.emap:
            #     logging.log_first_n(logging.INFO, 'in entity kb (nostop): %s %s', 10,
            #                         nostop, self.emap[nostop])
            #     res[idx] = self.emap[nostop]
    # Shape (n, 1): one entity-id column per record.
    return np.expand_dims(res, axis=-1)
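

# Sketch: the lookup cascade in encode(), in isolation. demo_split and
# demo_clean are hypothetical stand-ins for the repo helpers of the same
# names; the first matching variant wins, unmatched records keep the -1
# sentinel, and the result is an (n, 1) int32 array.
def _encode_demo():
    import numpy as np
    emap = {'acme corp': 7}

    def demo_split(s):  # assumption: splits punctuation-joined tokens
        return s.replace('-', ' ')

    def demo_clean(s):  # assumption: strips trailing punctuation
        return s.strip('. ')

    queries = ['acme corp', 'no such entity']
    res = -1 * np.ones(len(queries), dtype=np.int32)
    for i, q in enumerate(queries):
        for variant in (q, demo_split(q), demo_clean(demo_split(q))):
            if variant in emap:
                res[i] = emap[variant]
                break
    out = np.expand_dims(res, axis=-1)  # shape (2, 1)
    assert out[0, 0] == 7 and out[1, 0] == -1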