def find_pair(
        self, left_df: pd.DataFrame, right_df: pd.DataFrame
) -> typing.Tuple[pd.DataFrame, typing.List[tuple]]:
    """Find joinable row pairs between two dataframes via hash blocking.

    Rows are blocked on ``self.join_target_column_names[0]`` (left) and
    ``[1]`` (right); every pair sharing a block key is a candidate pair.

    Returns:
        Tuple of (right_df extended with a ``joining_pairs`` column that
        holds, per right row, the list of matching left row indices;
        and a list of (left_id, right_id) string-id tuples).

    NOTE(review): mutates the callers' dataframes by adding an 'id'
    column — confirm callers tolerate this side effect.
    """
    # auto-generated record classes expose dataframe columns as attributes
    class left(rltk.AutoGeneratedRecord):
        pass

    class right(rltk.AutoGeneratedRecord):
        pass

    # rltk records need a string 'id'; derive it from the index
    left_df['id'] = left_df.index.astype(str)
    right_df['id'] = right_df.index.astype(str)
    if 'Unnamed: 0' in right_df.columns:
        # drop the artifact column produced by CSV round-trips
        right_df = right_df.drop(columns=['Unnamed: 0'])
    ds1 = rltk.Dataset(rltk.DataFrameReader(left_df), record_class=left)
    ds2 = rltk.Dataset(rltk.DataFrameReader(right_df), record_class=right)
    bg = rltk.HashBlockGenerator()
    # exact-match blocking on the configured join columns
    block = bg.generate(
        bg.block(ds1, property_=self.join_target_column_names[0]),
        bg.block(ds2, property_=self.join_target_column_names[1]))
    pairs = rltk.get_record_pairs(ds1, ds2, block=block)
    pairs_list = []
    # one (possibly empty) list of left indices per right row
    pairs_column = [[] for _ in range(right_df.shape[0])]
    for r1, r2 in pairs:
        pairs_column[int(r2.id)].append(int(r1.id))
        pairs_list.append((r1.id, r2.id))
    right_df["joining_pairs"] = pairs_column
    return right_df, pairs_list
def worker():
    """Link IMDB movies to AFI movies with a rule-based matcher.

    For every IMDB record, scans all AFI records and keeps the
    highest-confidence match whose confidence exceeds ``MY_TRESH``
    (or records ``None`` when no AFI record qualifies), then dumps all
    matches to ``result_file`` as pretty-printed JSON.
    """
    # load datasets
    # (removed an unused local CrfTokenizer the original created here)
    ds_imdb = rltk.Dataset(reader=rltk.JsonLinesReader(imdb_file),
                           record_class=IMDBRecord,
                           adapter=rltk.MemoryKeyValueAdapter())
    ds_afi = rltk.Dataset(reader=rltk.JsonLinesReader(afi_file),
                          record_class=AFIRecord,
                          adapter=rltk.MemoryKeyValueAdapter())

    valid_match = []
    for r_imdb in ds_imdb:
        # best candidate so far: (record, confidence); seeded with the
        # threshold so anything at or below MY_TRESH is rejected
        optimum = (None, MY_TRESH)
        for r_afi in ds_afi:
            result, confidence = rule_based_method(r_imdb, r_afi)
            if result and confidence > optimum[1]:
                optimum = (r_afi, confidence)
        if optimum[0] is not None:
            r_afi, confidence = optimum
            valid_match.append({
                'imdb_movie': r_imdb.raw_object['url'],
                'afi_movie': r_afi.raw_object['url']
            })
        else:
            valid_match.append({
                'imdb_movie': r_imdb.raw_object['url'],
                'afi_movie': None
            })

    # fix: context manager guarantees the file is closed even if
    # serialization raises (the original leaked the handle on error)
    with open(result_file, 'w') as fout:
        json.dump(valid_match, fout, indent=4)
def get_datasets_from_csvs(csv_path_1, record_1, csv_path_2, record_2):
    """Build two in-memory rltk datasets, one per CSV file.

    Each CSV is wrapped in an rltk.Dataset backed by a MemoryAdapter,
    using the supplied record class to interpret its rows.

    Returns:
        Tuple of (dataset for csv_path_1, dataset for csv_path_2).
    """
    def _make_dataset(csv_path, record_class):
        # one dataset per (file, record class) pair
        return rltk.Dataset(reader=rltk.CSVReader(csv_path),
                            record_class=record_class,
                            adapter=rltk.MemoryAdapter())

    return (_make_dataset(csv_path_1, record_1),
            _make_dataset(csv_path_2, record_2))
def featurize(mode, output_filename=None):
    """
    Catch all method to featurize either train or test dataset and save to CSV
    Params:
        mode: (str) TRAIN or TEST
        output_filename: (str) Optional- name of the csv to save the data
    """
    MODE = mode
    # make sure the train/test split and the block-file directory exist
    if not os.path.exists('train/') or not os.path.exists('test/'):
        train_test_split()
    if not os.path.exists('block_files/'):
        os.mkdir('block_files/')
    BLOCK_FILE = 'block_files/'+MODE+'.jl'
    CORPUS_FREQ_FILE = MODE+'/corpus_freq.json'
    # NOTE(review): these two file handles are never closed in this
    # function — consider context managers if rltk allows it
    ds_amzn = rltk.Dataset(reader=rltk.CSVReader(
        open(MODE + '/Amazon.csv', encoding='latin-1')),
                           record_class=AmazonRecord,
                           adapter=rltk.MemoryAdapter())
    ds_goog = rltk.Dataset(reader=rltk.CSVReader(
        open(MODE + '/GoogleProducts.csv', encoding='latin-1')),
                           record_class=GoogleRecord,
                           adapter=rltk.MemoryAdapter())
    # reuse an existing block file when present, otherwise generate one
    # (EAFP: try to open, fall back on FileNotFoundError)
    try:
        block_handler = open(BLOCK_FILE, 'r')
        print("Block file exists. Reading from disk...")
    except FileNotFoundError:
        block_handler = rltk.InvertedIndexBlockGenerator(
            ds_amzn, ds_goog,
            writer=rltk.BlockFileWriter(BLOCK_FILE),
            tokenizer=tokenizer).generate()
    # feature column order must match what featurize_all_records emits
    features = ['id1', 'id2', 'price_difference', 'desc_jaccard',
                'desc_tf_idf', 'desc_trigram', 'manufacturer_jaccard',
                'manufacturer_jaro_winkler', 'manufacturer_levenshtien',
                'name_jaccard', 'name_jaro_winkler', 'name_trigram', 'label']
    pairs = rltk.get_record_pairs(ds_amzn, ds_goog,
                                  rltk.BlockFileReader(block_handler))
    freq = get_document_frequency(CORPUS_FREQ_FILE, ds_amzn, ds_goog)
    if MODE == "train":
        print("Featurizing train")
        if not output_filename:
            output_filename = 'train/features_train.csv'
        featurize_all_records(pairs, features, output_filename, freq,
                              TRAIN_DOC_SIZE)
    elif MODE == "test":
        print("Featurizing test")
        if not output_filename:
            output_filename = 'test/features_test.csv'
        featurize_all_records(pairs, features, output_filename, freq,
                              TEST_DOC_SIZE)
def entity_links_stage_4():
    """Link issue locations to wikia locations and dump matches to JSON.

    Populates the module-level ``SIM_LOCATIONS__ISSUE_TO_WIKIA`` dict
    with ``item.id -> (matched wikia id, confidence)`` for each record
    that ``match_record_to_ds`` resolves, then writes the dict to
    ``SIM_LOCATIONS__ISSUE_TO_WIKIA.json``.
    """
    # load datasets
    ds_issue_location = rltk.Dataset(
        reader=rltk.JsonLinesReader('ISSUE_LOCATIONS_DICT.jl'),
        record_class=LocationRecord,
        adapter=rltk.MemoryAdapter())
    ds_wikia_location = rltk.Dataset(
        reader=rltk.JsonLinesReader('WIKIA_LOCATIONS_DICT.jl'),
        record_class=LocationRecord,
        adapter=rltk.MemoryAdapter())

    # print some entries
    print(ds_issue_location.generate_dataframe().head(5))
    print(ds_wikia_location.generate_dataframe().head(5))

    tot_counter = 0
    for item in ds_issue_location:
        tot_counter += 1
        res_id, res_conf = match_record_to_ds(item, ds_wikia_location, False)
        # fix: identity comparison with None instead of `!= None` (PEP 8)
        if res_id is not None:
            print('[%003d]: [%s] ---%03.02f%%--- [%s]' %
                  (tot_counter, item.id, res_conf * 100, res_id))
            SIM_LOCATIONS__ISSUE_TO_WIKIA[item.id] = (res_id, res_conf)

    with open('SIM_LOCATIONS__ISSUE_TO_WIKIA.json', 'w') as outfile:
        print('SIM_LOCATIONS__ISSUE_TO_WIKIA: ' +
              str(len(SIM_LOCATIONS__ISSUE_TO_WIKIA)))
        json.dump(SIM_LOCATIONS__ISSUE_TO_WIKIA, outfile, indent=2)
def entity_links_stage_1():
    """Link movie characters to wikia characters and dump matches to JSON.

    Populates the module-level ``SIM_CHARS__MOVIE_TO_WIKIA`` dict with
    ``item.id -> (matched wikia id, confidence)`` for each record that
    ``match_record_to_ds`` resolves, then writes the dict to
    ``SIM_CHARS__MOVIE_TO_WIKIA.json``.
    """
    # load datasets
    ds_movie_char = rltk.Dataset(
        reader=rltk.JsonLinesReader('MOVIE_CHARS_DICT.jl'),
        record_class=MovieCharRecord,
        adapter=rltk.MemoryAdapter())
    ds_wikia_char = rltk.Dataset(
        reader=rltk.JsonLinesReader('WIKIA_CHARS_DICT.jl'),
        record_class=WikiaCharRecord,
        adapter=rltk.MemoryAdapter())

    # print some entries
    print(ds_movie_char.generate_dataframe().head(5))
    print(ds_wikia_char.generate_dataframe().head(5))

    tot_counter = 0
    for item in ds_movie_char:
        tot_counter += 1
        res_id, res_conf = match_record_to_ds(item, ds_wikia_char)
        # fix: identity comparison with None instead of `!= None` (PEP 8)
        if res_id is not None:
            print('[%003d]: [%s] ---%03.02f%%--- [%s]' %
                  (tot_counter, item.id, res_conf * 100, res_id))
            SIM_CHARS__MOVIE_TO_WIKIA[item.id] = (res_id, res_conf)

    with open('SIM_CHARS__MOVIE_TO_WIKIA.json', 'w') as outfile:
        print('SIM_CHARS__MOVIE_TO_WIKIA: ' +
              str(len(SIM_CHARS__MOVIE_TO_WIKIA)))
        json.dump(SIM_CHARS__MOVIE_TO_WIKIA, outfile, indent=2)
def join(self, left_df: pd.DataFrame, right_df: pd.DataFrame,
         left_columns: typing.List[typing.List[int]],
         right_columns: typing.List[typing.List[int]],
         left_metadata: dict, right_metadata: dict) -> JoinResult:
    """Join two dataframes on the configured target columns.

    Records are blocked on ``self.join_target_column_names`` (exact
    hash blocking); every blocked pair becomes one joined row. The
    joined frame keeps all left columns first, followed by the right
    columns that do not already exist on the left.

    Returns:
        JoinResult(joined dataframe, index of columns shared by both
        sides).

    NOTE(review): mutates left_df/right_df by adding an 'id' column
    (as the original implementation did).
    """
    class left(rltk.AutoGeneratedRecord):
        pass

    class right(rltk.AutoGeneratedRecord):
        pass

    left_df['id'] = left_df.index.astype(str)
    right_df['id'] = right_df.index.astype(str)
    if 'Unnamed: 0' in right_df.columns:
        # drop the artifact column produced by CSV round-trips
        right_df = right_df.drop(columns=['Unnamed: 0'])
    ds1 = rltk.Dataset(rltk.DataFrameReader(left_df), record_class=left)
    ds2 = rltk.Dataset(rltk.DataFrameReader(right_df), record_class=right)
    bg = rltk.HashBlockGenerator()
    block = bg.generate(
        bg.block(ds1, property_=self.join_target_column_names[0]),
        bg.block(ds2, property_=self.join_target_column_names[1]))
    left_df = left_df.set_index('id')
    right_df = right_df.set_index('id')
    pairs = rltk.get_record_pairs(ds1, ds2, block=block)

    joined_rows = []
    column_names_to_join = None
    # fix: pre-initialize so an empty pair set no longer raises
    # NameError on `matched_rows` / `columns_new`
    matched_rows = pd.Index([])
    columns_new = left_df.columns.tolist()
    for r1, r2 in pairs:
        left_res = left_df.loc[r1.id]
        right_res = right_df.loc[r2.id]
        if column_names_to_join is None:
            # column sets are identical for every row; compute them once
            column_names_to_join = right_res.index.difference(left_res.index)
            matched_rows = right_res.index.intersection(left_res.index)
            columns_new = left_res.index.tolist()
            columns_new.extend(column_names_to_join.tolist())
        joined_rows.append(
            pd.concat([left_res, right_res[column_names_to_join]]))

    # fix: build the frame in one shot instead of the deprecated,
    # O(n^2) per-row DataFrame.append
    if joined_rows:
        df_joined = pd.DataFrame(joined_rows).reset_index(drop=True)
    else:
        df_joined = pd.DataFrame(columns=columns_new)
    # ensure that the original dataframe columns are at the first left part
    df_joined = df_joined[columns_new]
    return JoinResult(df_joined, matched_rows)
# NOTE(review): the two cached_property methods below belong to a record
# class (presumably `Product`) whose `class` header is outside this chunk;
# indentation restored at top level.
@rltk.cached_property
def brand(self):
    # brand strings as a set, for set-based similarity measures
    return set(self.raw_object['brand'])

@rltk.cached_property
def ingredients(self):
    # ingredient ids as a set, for set-based similarity measures
    return set(self.raw_object['ingredients_ids'])

product_file = './output/sephora_skincare_product_ingredient_list.jl'
# raw dicts, one JSON object per line
with open(product_file) as json_products:
    products = [json.loads(line) for line in json_products]
# same file wrapped as an rltk dataset of Product records
ds_products = rltk.Dataset(reader=rltk.JsonLinesReader(product_file),
                           record_class=Product,
                           adapter=rltk.MemoryKeyValueAdapter())
df_products = ds_products.generate_dataframe()

def name_token_similarity(prod1, prod2):
    '''Dice similarity between the two products' name token sets.'''
    set1 = prod1.name_tokens
    set2 = prod2.name_tokens
    return rltk.dice_similarity(set1, set2)

def name_string_similarity(prod1, prod2):
    '''Jaro-Winkler similarity between the two products' name strings.'''
    s1 = prod1.name_string
    s2 = prod2.name_string
    return rltk.jaro_winkler_similarity(s1, s2)
def create_dataset(input_file: str, rcrd_class: rltk.Record) -> rltk.Dataset:
    """Create an rltk dataset from a given .jl (JSON Lines) file.

    Args:
        input_file: path to a ``.jl`` file.
        rcrd_class: record class used to interpret each line.

    Returns:
        An in-memory rltk.Dataset over the file.

    Raises:
        ValueError: if ``input_file`` does not have a ``.jl`` extension.
    """
    # fix: `assert` is stripped under `python -O`; validate explicitly
    if Path(input_file).suffix != ".jl":
        raise ValueError(f"expected a .jl file, got: {input_file!r}")
    return rltk.Dataset(reader=rltk.JsonLinesReader(input_file),
                        record_class=rcrd_class,
                        adapter=rltk.MemoryKeyValueAdapter())
def process():
    """Cluster entities loaded from per-source HDF files and export them.

    Pipeline: load all ``*.entity.h5`` frames -> keep named
    GPE/LOC/ORG/PER entities -> block on KB target ids and Freebase ids
    -> build clusters per block -> split clusters by normalized type ->
    attach remaining singletons by similarity (threshold 0.4) -> wrap
    filtered-out entities in fake-record clusters -> write the cluster
    table (plus synthetic prototype rows) to HDF and CSV.
    """
    df_entity = pd.DataFrame()

    logger.info('loading entity dataframes')
    for infile in glob.glob(
            os.path.join(config['temp_dir'], config['run_name'],
                         '*/*.entity.h5')):
        # NOTE(review): `source` is assigned but unused in this function
        source = os.path.basename(infile).split('.')[0]
        df_entity = df_entity.append(pd.read_hdf(infile))
    df_entity = df_entity.reset_index(drop=True)
    logger.info('Total number of entities: %d', len(df_entity))
    df_entity['type'] = df_entity['type'].apply(lambda x: x[
        0])  # only pick the fist type (compatible with old pipeline)
    # keep the unfiltered copy; filtered-out rows are clustered separately
    df_entity_ori = df_entity.copy()

    ### filtering
    logger.info('filtering out some entity types')
    all_types = set(df_entity['type'])
    # all_types = set([t for tu in df_entity['type'] for t in tu]) # multi-type support
    selected_types = filter(
        lambda x: x.startswith(
            ('ldcOnt:GPE', 'ldcOnt:LOC', 'ldcOnt:ORG', 'ldcOnt:PER')),
        all_types)
    df_entity = df_entity.loc[df_entity['type'].isin(selected_types)]
    # df_entity = df_entity.loc[[any([t in selected_types for t in tu]) for tu in df_entity['type']]] # multi-type support
    df_entity = df_entity[df_entity['name'].notnull()]
    # normalize NaN to None for downstream truthiness checks
    df_entity = df_entity.where(pd.notnull(df_entity), None)
    df_entity_left = df_entity_ori[~df_entity_ori['e'].isin(df_entity['e'])]

    ### generate rltk components
    logger.info('generating rltk components')
    ds = rltk.Dataset(reader=rltk.DataFrameReader(df_entity),
                      record_class=GaiaRecord)
    # block on KB target ids; records without one fall into a 'None' block
    bg_kb = rltk.TokenBlocker()
    blocks_kb = bg_kb.block(ds,
                            function_=lambda r: list(r.selected_target)
                            if r.selected_target else ['None'])
    # block on Freebase ids the same way
    bg_fb = rltk.TokenBlocker()
    blocks_fb = bg_fb.block(ds,
                            function_=lambda r: r.selected_fbid
                            if r.selected_fbid else ['None'])

    ### clustering
    logger.info('clustering entity')
    # build cluster based on type
    all_clusters = []
    for bid, data in blocks_kb.key_set_adapter:
        if bid == 'None':
            continue
        c = Cluster(ds)
        for _, r_id in data:
            r = ds.get_record(r_id)
            for id_ in r.selected_target:
                c.kb_id.add(id_)
            if r.fbid:
                for id_ in r.selected_fbid:
                    c.fb_id.add(id_)
            if r.wikidata:
                for id_ in r.selected_wikidata:
                    c.wd_id.add(id_)
            c.add(r)
        all_clusters.append(c)

    # fb only clusters: records with a Freebase id but no KB target
    fb_only_clusters = {}
    for bid, data in blocks_fb.key_set_adapter:
        if bid == 'None':
            continue
        fb_only_clusters[bid] = set()
        for _, r_id in data:
            r = ds.get_record(r_id)
            if r.selected_target:
                continue
            fb_only_clusters[bid].add(r_id)
        if len(fb_only_clusters[bid]) == 0:
            del fb_only_clusters[bid]
    for bid, cluster in fb_only_clusters.items():
        c = Cluster(ds)
        for r_id in cluster:
            c.add(r_id)
            r = ds.get_record(r_id)
            if r.fbid:
                for id_ in r.selected_fbid:
                    c.fb_id.add(id_)
            if r.wikidata:
                for id_ in r.selected_wikidata:
                    c.wd_id.add(id_)
        all_clusters.append(c)

    # validation: every cluster should carry at most one kb id
    for idx, c in enumerate(all_clusters):
        if len(c.kb_id) > 1:
            logger.error('mulitple kb_ids in cluster: %s', c.kb_id)
            break
        kb_ids = set()
        for r_id in c.all_records:
            r = ds.get_record(r_id)
            if r.selected_target:
                for id_ in r.selected_target:
                    kb_ids.add(id_)
        if len(kb_ids) > 1:
            # NOTE(review): two args for a single %s placeholder — the
            # c.kb_id argument is never rendered; confirm intended format
            logger.error('mulitple kb_ids in cluster: %s', kb_ids, c.kb_id)
            break

    # split based on types
    all_clusters_splitted = []
    for c in all_clusters:
        types = {}
        for r_id in c.all_records:
            r = ds.get_record(r_id)
            type_ = normalize_type(r.type)
            if type_ not in types:
                cc = Cluster(ds)
                cc.type = type_
                types[type_] = cc
            cc = types[type_]
            cc.add(r_id)
            if r.selected_target:
                for id_ in r.selected_target:
                    cc.kb_id.add(id_)
            if r.selected_fbid:
                for id_ in r.selected_fbid:
                    cc.fb_id.add(id_)
            if r.selected_wikidata:
                for id_ in r.selected_wikidata:
                    cc.wd_id.add(id_)
        for cc in types.values():
            all_clusters_splitted.append(cc)

    # merge singleton: attach unclustered entities to the most similar
    # existing cluster of the same type, if similarity >= MIN_SIM
    final_clusters = deepcopy(all_clusters_splitted)
    MIN_SIM = 0.4
    clustered_entity_ids = set(
        [r for c in all_clusters for r in c.all_records])
    for _, e in df_entity['e'].items():
        if e not in clustered_entity_ids:
            r = ds.get_record(e)
            r_type = normalize_type(r.type)
            local_best = [None, 0]  # first item: cluster id, second item: score
            for c in final_clusters:
                sim = c.similarity(r)
                if r_type != c.type:
                    continue
                if sim >= MIN_SIM:
                    if sim > local_best[1]:
                        local_best = [c, sim]
            c = local_best[0]
            if c is not None:
                c.add(r, contribute=False)
            else:
                # still singleton, construct singleton cluster
                c = Cluster(ds)
                c.type = r_type
                c.add(r)
                final_clusters.append(c)

    # filtered-out entities
    # create cluster with fake record
    for _, e in df_entity_left.iterrows():
        c = Cluster(None)
        c.type = normalize_type(e['type'])
        c.add(e['e'], contribute=False)
        final_clusters.append(c)
    logger.info('Total number of clusters: %d', len(final_clusters))

    # create entity to cluster mapping
    entity_to_cluster = defaultdict(list)
    for c in final_clusters:
        for r in c.all_records:
            entity_to_cluster[r].append(c)
    for e, c in entity_to_cluster.items():
        if len(c) > 1:
            logger.error('Entity in multiple clusters detected, entity id: %s',
                         e)

    ### generate cluster properties
    logger.info('generating cluster properties')
    for c in final_clusters:
        c.generate()

    ### export
    logger.info('exporting clusters')
    df_entity_cluster = df_entity_ori.copy()
    df_entity_cluster['cluster'] = None
    df_entity_cluster['synthetic'] = False

    logger.info('updating cluster info for each entity')
    for idx, e in df_entity_cluster['e'].items():
        clusters = tuple(set([c.full_id for c in entity_to_cluster[e]]))
        df_entity_cluster.at[idx, 'cluster'] = clusters

    logger.info('creating prototypes')
    proto_to_cluster_mapping = {}
    for c in final_clusters:
        proto_to_cluster_mapping[c.feature_entity_id] = c
    proto_dict = []
    for idx, row in df_entity_cluster.iterrows():
        eid = row['e']
        if eid not in proto_to_cluster_mapping:  # not a prototype
            continue
        c = proto_to_cluster_mapping[eid]
        # p = df_entity_ori[df_entity_ori['e'] == c.feature_entity_id].iloc[0]
        # synthetic prototype row: same data, cluster-prototype id
        row = row.to_dict()
        row['synthetic'] = True
        row['cluster'] = tuple([c.full_id])
        row['e'] = c.prototype
        proto_dict.append(row)
    df_prototypes = pd.DataFrame.from_dict(proto_dict)

    logger.info('appending dataframes')
    df_complete_entity_clusters = df_entity_cluster.append(df_prototypes)
    # NOTE(review): reset_index is not in-place and its return value is
    # discarded here — the index keeps duplicates; confirm intent
    df_complete_entity_clusters.reset_index(drop=True)

    logger.info('writing to disk')
    output_file = os.path.join(config['temp_dir'], config['run_name'],
                               'entity_cluster.h5')
    with warnings.catch_warnings():
        # silence pandas/pytables serialization warnings
        warnings.simplefilter('ignore')
        df_complete_entity_clusters.to_hdf(output_file,
                                           'entity',
                                           mode='w',
                                           format='fixed')
        df_complete_entity_clusters.to_csv(output_file + '.csv')
# NOTE(review): the @property methods below belong to record classes whose
# `class` headers are outside this chunk; indentation restored at top level.
@property
def address(self):
    return self.raw_object['Address']

@property
def phone(self):
    return self.raw_object['Phone']

@property
def cuisine(self):
    return self.raw_object['Cuisine']

# wrap both dataframes as rltk datasets (ds1/ds2 are DataFrames here and
# get rebound to Datasets)
ds1 = rltk.Dataset(reader=rltk.DataFrameReader(ds1),
                   record_class=Record1,
                   adapter=rltk.MemoryKeyValueAdapter())
ds2 = rltk.Dataset(reader=rltk.DataFrameReader(ds2),
                   record_class=Record2,
                   adapter=rltk.MemoryKeyValueAdapter())
# disabled cuisine-based blocking experiment (kept as a string literal)
'''bg = rltk.HashBlockGenerator()
blocks = bg.generate(bg.block(ds1, property_='cuisine'),
bg.block(ds2, property_='cuisine'))
pairs = rltk.get_record_pairs(ds1, ds2, block=blocks)'''
# full cross product of record pairs (no blocking)
pairs = rltk.get_record_pairs(ds1, ds2)
# NOTE(review): this handle is never closed within this chunk — confirm a
# close()/with exists downstream
f = open('similarities.txt', 'w+')
for r1, r2 in pairs:
    a_d = rltk.levenshtein_similarity(r1.address, r2.address)
    # (loop body continues beyond this chunk)
# NOTE(review): the two methods below belong to a record class whose header
# (and genre_set's decorator, if any) is outside this chunk.
def genre_set(self):
    # comma-separated genre string -> set, for set-based similarity
    return set(self.genre_string.split(','))

@rltk.cached_property
def year(self):
    # first 4-digit run in the date string, or '' when absent
    # NOTE(review): the regex is evaluated twice; redundant but harmless
    if re.search("(\d{4})", self.date_string):
        return str(re.search("(\d{4})", self.date_string).group(0))
    else:
        return ''

imdb_file = 'imdb.jl'
afi_file = 'afi.jl'

ds_imdb = rltk.Dataset(reader=rltk.JsonLinesReader(imdb_file),
                       record_class=IMDBRecord,
                       adapter=rltk.MemoryKeyValueAdapter())
ds_afi = rltk.Dataset(reader=rltk.JsonLinesReader(afi_file),
                      record_class=AFIRecord,
                      adapter=rltk.MemoryKeyValueAdapter())

def name_similarity(r_imdb, r_afi):
    """Jaro-Winkler similarity of the two movies' name strings."""
    s1 = r_imdb.name_string
    s2 = r_afi.name_string
    return rltk.jaro_winkler_similarity(s1, s2)

def genre_similarity(r_imdb, r_afi):
    # (function is truncated in this chunk: the comparison of the two
    # genre sets continues beyond this view)
    s1 = r_imdb.genre_set
    s2 = r_afi.genre_set
def process():
    """Cluster entities across documents and export entity, event and
    relation tables.

    Pipeline: load per-source HDF frames -> deduplicate -> keep named
    GPE/LOC/ORG/PER entities -> block on KB target id and wikidata id ->
    build clusters per block (electing a wikidata id per cluster) ->
    split clusters by normalized type -> attach remaining singletons by
    similarity -> rewrite event/relation role tables against cluster
    prototypes -> write HDF/CSV/JSONL outputs under
    ``config['temp_dir']/config['run_name']``.
    """
    df_entity = pd.DataFrame()
    df_event = pd.DataFrame()
    df_event_role = pd.DataFrame()
    df_relation = pd.DataFrame()
    df_relation_role = pd.DataFrame()

    logger.info('loading entity dataframes')
    for infile in glob.glob(
            os.path.join(config['temp_dir'], config['run_name'],
                         '*/*.entity.h5')):
        # NOTE(review): `source` is assigned but unused in this function
        source = os.path.basename(infile).split('.')[0]
        # entity
        df_entity = df_entity.append(pd.read_hdf(infile))
        # event (sibling files share the entity file's path prefix)
        event_file = infile[:-len('entity.h5')] + 'event.h5'
        df_event = df_event.append(pd.read_hdf(event_file))
        event_role_file = infile[:-len('entity.h5')] + 'event_role.h5'
        df_event_role = df_event_role.append(pd.read_hdf(event_role_file))
        # relation
        relation_file = infile[:-len('entity.h5')] + 'relation.h5'
        df_relation = df_relation.append(pd.read_hdf(relation_file))
        relation_role_file = infile[:-len('entity.h5')] + 'relation_role.h5'
        df_relation_role = df_relation_role.append(
            pd.read_hdf(relation_role_file))
    logger.info('Read in {} entities, {} events, {} relations'.format(
        len(df_entity), len(df_event), len(df_relation)))

    df_entity = df_entity.drop_duplicates(
        subset=['e'],
        keep='last')  # cmu data has cross document entities, only keep one
    df_entity = df_entity.reset_index(drop=True)
    df_entity['type'] = df_entity['type'].apply(lambda x: x[
        0])  # only pick the fist type (compatible with old pipeline)
    # keep the unfiltered copy; filtered-out rows are clustered separately
    df_entity_ori = df_entity.copy()
    df_event = df_event.drop_duplicates(subset=['e'],
                                        keep='last').reset_index(drop=True)
    df_event_role = df_event_role.drop_duplicates().reset_index(drop=True)
    df_relation = df_relation.drop_duplicates().reset_index(drop=True)
    df_relation_role = df_relation_role.drop_duplicates().reset_index(
        drop=True)
    logger.info(
        'After deduplication: {} entities, {} events, {} relations'.format(
            len(df_entity), len(df_event), len(df_relation)))

    ### filtering
    logger.info('filtering out some entity types')
    all_types = set(df_entity['type'])
    # all_types = set([t for tu in df_entity['type'] for t in tu]) # multi-type support
    selected_types = filter(
        lambda x: x.startswith(
            ('ldcOnt:GPE', 'ldcOnt:LOC', 'ldcOnt:ORG', 'ldcOnt:PER')),
        all_types)
    df_entity = df_entity.loc[df_entity['type'].isin(selected_types)]
    # df_entity = df_entity.loc[[any([t in selected_types for t in tu]) for tu in df_entity['type']]] # multi-type support
    df_entity = df_entity[df_entity['name'].notnull()]
    # normalize NaN to None for downstream truthiness checks
    df_entity = df_entity.where(pd.notnull(df_entity), None)
    df_entity_left = df_entity_ori[~df_entity_ori['e'].isin(df_entity['e'])]

    ### generate rltk components
    logger.info('generating rltk components')
    ds = rltk.Dataset(reader=rltk.DataFrameReader(df_entity),
                      record_class=GaiaRecord)
    # for r in ds:
    #     print(r.concatenated_labels)
    #     print(r.name, r.target, r.wikidata, r.selected_target_index, r.selected_wikidata_index)
    # block on KB target id; records without one fall into a 'None' block
    bg_kb = rltk.TokenBlocker()
    blocks_kb = bg_kb.block(ds,
                            function_=lambda r: [r.selected_target]
                            if r.selected_target else ['None'])
    # block on wikidata id the same way
    bg_wd = rltk.TokenBlocker()
    blocks_wd = bg_wd.block(ds,
                            function_=lambda r: [r.selected_wikidata]
                            if r.selected_wikidata else ['None'])

    ### clustering
    logger.info('clustering entity')
    # build cluster based on type
    all_clusters = []
    for bid, data in blocks_kb.key_set_adapter:
        if bid == 'None':
            continue
        c = Cluster(ds)
        for _, r_id in data:
            r = ds.get_record(r_id)
            # first record with a KB target defines the cluster's kb id
            if r.target and not c.kb_id:
                c.kb_id = r.selected_target
                c.kb_labels = set(r.selected_target_labels)
            if r.wikidata:
                # collect candidate wikidata ids; the winner is elected below
                if r.selected_wikidata not in c.wd_candidate:
                    c.wd_candidate[r.selected_wikidata] = set(
                        r.selected_wikidata_labels)
            c.add(r)
        c.elect_wd_id()
        all_clusters.append(c)

    # find all wd only blocks (records with a wikidata id but no KB target)
    wd_only_clusters = {}
    for bid, data in blocks_wd.key_set_adapter:
        if bid == 'None':
            continue
        wd_only_clusters[bid] = set()
        for _, r_id in data:
            r = ds.get_record(r_id)
            if r.selected_target:
                continue
            wd_only_clusters[bid].add(r_id)
        if len(wd_only_clusters[bid]) == 0:
            del wd_only_clusters[bid]
    # if wd block overlaps with kb clusters, absorb it
    for c in all_clusters:
        if c.wd_id and c.wd_id in wd_only_clusters:
            for r in wd_only_clusters[c.wd_id]:
                c.add(r)
            del wd_only_clusters[c.wd_id]
    # construct clusters based on the remaining wd-only blocks
    for bid, cluster in wd_only_clusters.items():
        c = Cluster(ds)
        for r_id in cluster:
            c.add(r_id)
            r = ds.get_record(r_id)
            if not c.wd_id:
                c.wd_id = r.selected_wikidata
                c.wd_labels = set(r.selected_wikidata_labels)
        all_clusters.append(c)

    # validation (disabled)
    # for idx, c in enumerate(all_clusters):
    #     if len(c.kb_id) > 1:
    #         logger.error('mulitple kb_ids in cluster: %s', c.kb_id)
    #         break
    #
    #     kb_ids = set()
    #     for r_id in c.all_records:
    #         r = ds.get_record(r_id)
    #         if r.selected_target:
    #             for id_ in r.selected_target:
    #                 kb_ids.add(id_)
    #     if len(kb_ids) > 1:
    #         logger.error('mulitple kb_ids in cluster: %s', kb_ids, c.kb_id)
    #         break

    # split based on types
    all_clusters_splitted = []
    for c in all_clusters:
        types = {}
        for r_id in c.all_records:
            r = ds.get_record(r_id)
            type_ = normalize_type(r.type)
            if type_ not in types:
                cc = Cluster(ds)
                cc.type = type_
                types[type_] = cc
            cc = types[type_]
            cc.add(r_id)
            # per-type sub-cluster inherits the parent cluster's ids/labels
            cc.kb_id = c.kb_id
            cc.kb_labels = c.kb_labels
            cc.wd_id = c.wd_id
            cc.wd_labels = c.wd_labels
        for cc in types.values():
            all_clusters_splitted.append(cc)

    # merge singleton: attach unclustered entities to the most similar
    # cluster of the same type (no minimum threshold in this variant)
    final_clusters = deepcopy(all_clusters_splitted)
    # MIN_SIM = 0.4
    clustered_entity_ids = set(
        [r for c in all_clusters for r in c.all_records])
    for _, e in df_entity['e'].items():
        if e not in clustered_entity_ids:
            r = ds.get_record(e)
            r_type = normalize_type(r.type)
            local_best = [None, 0]  # first item: cluster id, second item: score
            for c in final_clusters:
                sim = c.similarity(r)
                if r_type != c.type:
                    continue
                if sim > local_best[1]:
                    local_best = [c, sim]
            c = local_best[0]
            if c is not None:
                c.add(r)
            else:
                # still singleton, construct singleton cluster
                c = Cluster(ds)
                c.type = r_type
                c.add(r)
                c.name_labels = set(r.name)
                final_clusters.append(c)

    # filtered-out entities
    # create cluster with fake record
    for _, e in df_entity_left.iterrows():
        c = Cluster(None)
        c.type = normalize_type(e['type'])
        c.add(e['e'])
        final_clusters.append(c)
    logger.info('Total number of clusters: %d', len(final_clusters))

    # create entity to cluster mapping
    entity_to_cluster = defaultdict(list)
    for c in final_clusters:
        for r in c.all_records:
            entity_to_cluster[r].append(c)
    for e, c in entity_to_cluster.items():
        if len(c) > 1:
            logger.error('Entity in multiple clusters detected, entity id: %s',
                         e)
    # keep only the first cluster per entity
    entity_to_cluster = {e: c[0] for e, c in entity_to_cluster.items()}

    ### generate cluster properties
    logger.info('generating cluster properties')
    for c in final_clusters:
        c.generate()

    ### event and relation cluster
    # these clusters URIs will be {event/relation uri}-cluster
    # prototype URIs hence will be just {event/relation uri}

    ### event role: rewrite entity references to their cluster prototypes
    event_role_se_dict = {
        'prototype1': [],
        'prototype2': [],
        'role': [],
        'just': []
    }
    for idx, v in df_event_role.iterrows():
        event_role_se_dict['prototype1'].append(v['event'])
        event_role_se_dict['prototype2'].append(
            entity_to_cluster[v['entity']].prototype)
        event_role_se_dict['role'].append(v['role'])
        event_role_se_dict['just'].append(v['just'])
    df_event_role_se = pd.DataFrame.from_dict(event_role_se_dict)

    ### relation role: entity endpoints map to prototypes, events pass through
    relation_role_se_dict = {
        'prototype1': [],
        'prototype2': [],
        'role': [],
        'just': []
    }
    for idx, v in df_relation_role.iterrows():
        relation_role_se_dict['prototype1'].append(v['relation'])
        if v['type'] == 'entity':
            relation_role_se_dict['prototype2'].append(
                entity_to_cluster[v['e']].prototype)
        elif v['type'] == 'event':
            relation_role_se_dict['prototype2'].append(v['e'])
        relation_role_se_dict['role'].append(v['role'])
        relation_role_se_dict['just'].append(v['just'])
    df_relation_role_se = pd.DataFrame.from_dict(relation_role_se_dict)

    ### export
    logger.info('exporting clusters')
    df_entity_cluster = df_entity_ori.copy()
    df_entity_cluster['cluster'] = None
    df_entity_cluster['synthetic'] = False
    df_entity_cluster['cluster_member_confidence'] = None

    logger.info('updating cluster info for each entity')
    for idx, e in df_entity_cluster['e'].items():
        clusters = [entity_to_cluster[e]]
        cluster_ids = tuple([c.full_id for c in clusters])
        confidences = tuple([c.member_confidence[e] for c in clusters])
        df_entity_cluster.at[idx, 'cluster'] = cluster_ids
        df_entity_cluster.at[idx, 'cluster_member_confidence'] = confidences

    logger.info('creating prototypes')
    proto_to_cluster_mapping = {}
    for c in final_clusters:
        proto_to_cluster_mapping[c.feature_entity_id] = c
    proto_dict = []
    for idx, row in df_entity_cluster.iterrows():
        eid = row['e']
        if eid not in proto_to_cluster_mapping:  # not a prototype
            continue
        c = proto_to_cluster_mapping[eid]
        # p = df_entity_ori[df_entity_ori['e'] == c.feature_entity_id].iloc[0]
        # synthetic prototype row: same data, cluster-prototype id
        row = row.to_dict()
        row['synthetic'] = True
        row['cluster'] = tuple([c.full_id])
        row['e'] = c.prototype
        proto_dict.append(row)
    df_prototypes = pd.DataFrame.from_dict(proto_dict)

    logger.info('appending dataframes')
    df_complete_entity_clusters = df_entity_cluster.append(df_prototypes)
    # NOTE(review): reset_index is not in-place and its return value is
    # discarded here — the index keeps duplicates; confirm intent
    df_complete_entity_clusters.reset_index(drop=True)

    logger.info('writing to disk')
    entity_cluster_output_file = os.path.join(config['temp_dir'],
                                              config['run_name'],
                                              'entity_cluster')
    with warnings.catch_warnings():
        # silence pandas/pytables serialization warnings
        warnings.simplefilter('ignore')
        df_complete_entity_clusters.to_hdf(entity_cluster_output_file + '.h5',
                                           'entity',
                                           mode='w',
                                           format='fixed')
        df_complete_entity_clusters.to_csv(entity_cluster_output_file +
                                           '.h5.csv')
    with open(entity_cluster_output_file + '.cluster.jl', 'w') as f:
        for c in final_clusters:
            f.write(json.dumps(c.debug()) + '\n')

    with warnings.catch_warnings():
        warnings.simplefilter('ignore')
        # event
        event_cluster_output_file = os.path.join(config['temp_dir'],
                                                 config['run_name'],
                                                 'event_cluster.h5')
        df_event.to_hdf(event_cluster_output_file, 'event')
        event_role_output_file = os.path.join(config['temp_dir'],
                                              config['run_name'],
                                              'event_role.h5')
        df_event_role_se.to_hdf(event_role_output_file, 'event_role')
        df_event_role_se.to_csv(event_role_output_file + '.csv')
        # relation
        relation_cluster_output_file = os.path.join(config['temp_dir'],
                                                    config['run_name'],
                                                    'relation_cluster.h5')
        df_relation.to_hdf(relation_cluster_output_file, 'relation')
        relation_role_output_file = os.path.join(config['temp_dir'],
                                                 config['run_name'],
                                                 'relation_role.h5')
        df_relation_role_se.to_hdf(relation_role_output_file,
                                   'relation_role',
                                   mode='w',
                                   format='fixed')
        df_relation_role_se.to_csv(relation_role_output_file + '.csv')
import pandas as pd
import rltk

# --- demo 1: build a Dataset directly from a DataFrame ---
print('from dataframe...')
df = pd.read_csv('ds1.csv', encoding='latin-1')
# rltk records need a string 'id' column
df['id'] = df['doc_id'].astype('str')

class DFRecord(rltk.AutoGeneratedRecord):
    """Auto-generated record: every dataframe column becomes an attribute."""
    pass

ds = rltk.Dataset(rltk.DataFrameReader(df), record_class=DFRecord)
for r in ds:
    print(r.id, r.doc_id, r.doc_value)

# --- demo 2: derive the id from an arbitrary column via @rltk.set_id ---
print('set id column...')

@rltk.set_id('col1', function_=lambda x: str(x), keep_original=True)
class DFRecord2(rltk.AutoGeneratedRecord):
    """Record whose id is str(col1); col1 itself is kept as an attribute."""
    pass

df = pd.DataFrame({'col1': [1, 2], 'col2': [3, 4]})
ds = rltk.Dataset(reader=rltk.DataFrameReader(df), record_class=DFRecord2)
for r in ds:
    print(r.id, r.col1, r.col2)
# NOTE(review): this @property belongs to a record class (Record1) whose
# `class` header is outside this chunk; indentation restored at top level.
@property
def id_and_value(self):
    # trace line shows when the property is (re)computed vs served cached
    print('--> compute id_and_value')
    return self.id + '-' + self.value

# three toy records
arr = [{
    'doc_id': '1',
    'doc_value': 'a'
}, {
    'doc_id': '2',
    'doc_value': 'b'
}, {
    'doc_id': '3',
    'doc_value': 'c'
}]

# alternative backing store, disabled:
# adapter = rltk.RedisKeyValueAdapter(host='127.0.0.1', key_prefix='cached_)
adapter = rltk.HBaseKeyValueAdapter(host='127.0.0.1',
                                    key_prefix='test_',
                                    table='rltk_test1')
ds1 = rltk.Dataset(reader=rltk.ArrayReader(arr),
                   record_class=Record1,
                   adapter=adapter)
for r1 in ds1:
    print('------------')
    print('id:', r1.id)
    print('value:', r1.value)
    print('id_and_value:', r1.id_and_value)
    # inspect which attributes were cached on the instance
    print('cache in dict:', r1.__dict__)
# NOTE(review): the two cached_property methods below belong to a record
# class (DBFod, presumably) whose `class` header is outside this chunk.
@rltk.cached_property
def phone(self):
    # normalize separators, then truncate to 15 characters
    phone = self.raw_object['Phone'].replace('/', '-').replace(
        ' ', '')  #.replace('and','or').split('or')
    # print(phone.strip()[:15])
    return phone.strip()[:15]

@rltk.cached_property
def cuisine(self):
    # empty string instead of a falsy/missing cuisine value
    cs = self.raw_object['Cuisine']
    return cs if cs else ''

ds_fod = rltk.Dataset(rltk.CSVReader(file_F),
                      record_class=DBFod,
                      adapter=rltk.MemoryKeyValueAdapter())
# dFod = [[k+1,dblp.id,dblp.cuisine,dblp.address] for k,dblp in enumerate(ds_fod)]
# print(dFod[506])
# for r_dblp in ds_fod:
#     print(r_dblp.name)

tokenizer = rltk.CrfTokenizer()
i = 0  # global counter mutated by tokenize_id

def tokenize_id(t):
    # NOTE(review): `tokens` is unused and the rebinding of `t` has no
    # effect outside this function — the body looks truncated in this
    # chunk; confirm its full form elsewhere
    tokens = tokenizer.tokenize(t)
    global i
    i += 1
    t = str(i)
def _init_rltk_dataset(df, record_class):
    """Wrap a pandas DataFrame as an rltk Dataset using record_class."""
    # DataFrameReader's second argument is passed positionally as True,
    # matching the original call site exactly
    return rltk.Dataset(reader=DataFrameReader(df, True),
                        record_class=record_class)
# NOTE(review): tail of a Record1 method (parent_id, judging by the loop
# below) whose `def` line is above this chunk.
return '4' if self.id == '1' else None

class Record2(rltk.Record):
    """Record over a JSON-lines object with `ident` and optional `values`."""

    @rltk.cached_property
    def id(self):
        return self.raw_object['ident']

    @rltk.cached_property
    def value(self):
        # first value when present, otherwise the literal 'empty'
        v = self.raw_object.get('values', list())
        return v[0] if len(v) > 0 else 'empty'

ds1 = rltk.Dataset(reader=rltk.CSVReader('ds1.csv'),
                   record_class=Record1,
                   adapter=rltk.MemoryAdapter())
# DBM-backed adapter persists records to the 'file_index' file
ds2 = rltk.Dataset(reader=rltk.JsonLinesReader('ds2.jl'),
                   record_class=Record2,
                   adapter=rltk.DBMAdapter('file_index'))

# full cross product (no blocking); print edit-distance metrics per pair
pairs = rltk.get_record_pairs(ds1, ds2)
for r1, r2 in pairs:
    print('-------------')
    print(r1.id, r1.value, '\t', r2.id, r2.value)
    if r1.parent_id:
        print('r1\'s parent', r1.parent_id,
              ds1.get_record(r1.parent_id).value)
    print('levenshtein_distance:',
          rltk.levenshtein_distance(r1.value, r2.value))
    print('levenshtein_similarity:',
          rltk.levenshtein_similarity(r1.value, r2.value))
# NOTE(review): tail of a museum-vs-ULAN similarity function whose `def`
# line is above this chunk.
return rltk.hybrid_jaccard_similarity(r_museum.name_tokens,
                                      r_ulan.name_tokens,
                                      threshold=0.67)

if __name__ == '__main__':
    # Redis-backed adapters so the ULAN dataset and its blocks persist
    # across runs
    ulan_ds_adapter = rltk.RedisKeyValueAdapter('127.0.0.1',
                                                key_prefix='ulan_ds_')
    bg = rltk.TokenBlockGenerator()
    ulan_block = rltk.Block(
        rltk.RedisKeySetAdapter('127.0.0.1', key_prefix='ulan_block_'))
    # pre computing for ulan data
    if rltk.cli.confirm('Regenerate ULAN data caches?', default=False):
        ds_ulan = rltk.Dataset(
            reader=rltk.JsonLinesReader('../../datasets/museum/ulan.json'),
            record_class=RecordULAN,
            adapter=ulan_ds_adapter)
        b_ulan = bg.block(ds_ulan,
                          function_=block_on_name_prefix,
                          block=ulan_block)
    # load ulan (from the persisted adapter, no reader needed)
    ds_ulan = rltk.Dataset(adapter=ulan_ds_adapter)
    b_ulan = ulan_block
    # compare against museums' data: every museum json except ulan itself
    museums = list(
        map(lambda x: os.path.splitext(os.path.basename(x))[0],
            glob.glob('../../datasets/museum/*.json')))
    museums.remove('ulan')
    for museum in museums:
        # (loop body continues beyond this chunk)
# NOTE(review): the two cached_property methods below belong to BuyRecord
# (they call BuyRecord._clean), whose `class` header is outside this chunk.
@rltk.cached_property
def brand_cleaned(self):
    # NOTE(review): touching name_tokens first presumably populates
    # manufacturer as a side effect — confirm in the full class
    _ = self.name_tokens
    manufacturer = self.manufacturer
    # prefer the manufacturer field; fall back to brand when it is empty
    return process_brand_alias(
        manufacturer if manufacturer != '' else self.brand)

@rltk.cached_property
def model_cleaned(self):
    m = self.model
    return BuyRecord._clean(m)

ds_abt = rltk.Dataset(reader=rltk.CSVReader(
    open('../../datasets/Abt-Buy/Abt.csv', encoding='latin-1')),
                      record_class=AbtRecord,
                      adapter=rltk.MemoryKeyValueAdapter())
ds_buy = rltk.Dataset(reader=rltk.CSVReader(
    open('../../datasets/Abt-Buy/Buy.csv', encoding='latin-1')),
                      record_class=BuyRecord,
                      adapter=rltk.MemoryKeyValueAdapter())

# statistics: count non-empty fields over the Abt records
print_details = False
name_count = model_count = description_count = price_count = brand_count = 0
for r in ds_abt:
    name_count += 1
    print('------\nname:', r.name) if print_details else ''
    if len(r.description) > 0:
        description_count += 1
        # (loop continues beyond this chunk)
import pandas as pd
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import classification_report, accuracy_score, precision_recall_fscore_support, precision_recall_curve
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
import matplotlib.pyplot as plt
import numpy as np

from featurize import featurize, get_document_frequency, featurize_record_pair, TRAIN_DOC_SIZE
from utils import impute_df, DATASET_DIR
from amazon_record import AmazonRecord
from google_record import GoogleRecord

# in-memory rltk datasets over the raw product CSVs
# NOTE(review): these file handles are never closed in this chunk
ds_amzn = rltk.Dataset(reader=rltk.CSVReader(
    open(DATASET_DIR + 'Amazon.csv', encoding='latin-1')),
                       record_class=AmazonRecord,
                       adapter=rltk.MemoryAdapter())
ds_goog = rltk.Dataset(reader=rltk.CSVReader(
    open(DATASET_DIR + 'GoogleProducts.csv', encoding='latin-1')),
                       record_class=GoogleRecord,
                       adapter=rltk.MemoryAdapter())

def generate_features(gt_train):
    """
    Generate features from stratifed ground truth DataFrames
    Params:
        gt_train: (DataFrame) Df containing statified training data ids and labels
    """
    # (function body continues beyond this chunk)
import rltk

# Toy in-memory input: five people records; ids 1..5, with two pairs
# (records 1/5 and 3/4) sharing identical name and age values.
raw_inputs = [
    {'name': name, 'age': age, 'id': ident}
    for ident, (name, age) in enumerate(
        [('a1', 10), ('a2', 20), ('a3', 30), ('a3', 30), ('a1', 10)],
        start=1)
]


class MyRecord(rltk.Record):
    """Expose id (stringified), name and age of a raw input dict."""

    @property
    def id(self):
        return str(self.raw_object['id'])

    @property
    def name(self):
        return self.raw_object['name']

    @property
    def age(self):
        return self.raw_object['age']


ds = rltk.Dataset(reader=rltk.ArrayReader(raw_inputs), record_class=MyRecord)

# Single-dataset (deduplication) mode: enumerate record pairs within ds and
# report whether each pair agrees on both name and age.
for rec_a, rec_b in rltk.get_record_pairs(ds):
    is_duplicate = rec_a.name == rec_b.name and rec_a.age == rec_b.age
    print('comparing', rec_a.id, rec_b.id, is_duplicate)
# (fragment) remaining properties of a record class (presumably
# EvaluationRecord, referenced below) whose header is above this chunk.
    def name(self):
        return self.raw_object['name']

    @rltk.cached_property
    def laptop(self):
        # Dataset 1 stores the brand under 'laptop_brand'.
        return self.raw_object['laptop_brand']


@rltk.remove_raw_object
class EvaluationRecord2(rltk.Record):
    """Record over dataset 2; raw objects are discarded once properties cache."""

    @rltk.cached_property
    def id(self):
        return self.raw_object['id']

    @rltk.cached_property
    def name(self):
        return self.raw_object['name']

    @rltk.cached_property
    def laptop(self):
        # Dataset 2 stores the brand under plain 'laptop'.
        return self.raw_object['laptop']


dataset_1_file_name = 'data_1.csv'
dataset_2_file_name = 'data_2.csv'
# CSV-backed datasets, one per record class (default adapter).
ds1 = rltk.Dataset(reader=rltk.CSVReader(dataset_1_file_name),
                   record_class=EvaluationRecord)
ds2 = rltk.Dataset(reader=rltk.CSVReader(dataset_2_file_name),
                   record_class=EvaluationRecord2)
# (fragment) tail of a record class whose header is above this chunk.
        return self.raw_object['ident']

    @rltk.cached_property
    def first_name(self):
        # First space-separated token of 'name'.
        return self.raw_object['name'].split(' ')[0]

    @rltk.cached_property
    def last_name(self):
        # Second token only; any tokens after the second are dropped.
        return self.raw_object['name'].split(' ')[1]

    @property
    def full_name(self):
        return self.first_name + ' ' + self.last_name


ds1 = rltk.Dataset(reader=rltk.CSVReader('ds1.csv', delimiter=','),
                   record_class=Record1)
ds2 = rltk.Dataset(reader=rltk.JsonLinesReader('ds2.jl'), record_class=Record2)

# Character 3-gram blocking on first names; block key-sets are persisted
# in LevelDB under 'block_store' (namespace 'b1', wiped via clean=True).
ngram = rltk.NGramTokenizer()
bg = rltk.TokenBlockGenerator()
block1 = bg.block(ds1,
                  function_=lambda r: ngram.basic(r.first_name, 3),
                  block=rltk.Block(
                      rltk.LevelDbKeySetAdapter('block_store', 'b1', clean=True)))
block2 = bg.block(ds2,
                  function_=lambda r: ngram.basic(r.first_name, 3),
                  block=rltk.Block(
                      rltk.LevelDbKeySetAdapter('block_store',
# (truncated: the block2 call continues beyond this chunk)
class Record2(rltk.Record):
    """Record over rows keyed by 'ident'; splits 'name' on single spaces."""

    @rltk.cached_property
    def id(self):
        return self.raw_object['ident']

    @rltk.cached_property
    def first_name(self):
        tokens = self.raw_object['name'].split(' ')
        return tokens[0]

    @rltk.cached_property
    def last_name(self):
        tokens = self.raw_object['name'].split(' ')
        return tokens[1]


# One CSV-backed and one JSON-lines-backed dataset, both held in memory.
ds1 = rltk.Dataset(reader=rltk.CSVReader('ds1.csv', delimiter=','),
                   record_class=Record1,
                   adapter=rltk.MemoryAdapter())
ds2 = rltk.Dataset(reader=rltk.JsonLinesReader('ds2.jl'),
                   record_class=Record2,
                   adapter=rltk.MemoryAdapter())

# Persist a small hand-built block index to blocks.jl:
# block '1' holds record 'a'; block '2' holds records 'b' and 'd'.
block_writer = rltk.BlockFileWriter('blocks.jl')
for block_id, record_id in (('1', 'a'), ('2', 'b'), ('2', 'd')):
    block_writer.write(block_id, record_id)
def main():
    """Link DBLP professors to near-duplicate co-author name spellings.

    Reads dblp_final_JSON.json, builds the professor set and a
    professor -> co-authors map, then uses RLTK (hash-blocked on first
    name, Jaro-Winkler on last name) to find similar name pairs, rewrites
    the co-author lists through that mapping, and dumps the result to
    co_authors.json.
    """
    with open("dblp_final_JSON.json", "r") as f:
        dblp_dict = json.load(f)

    # NOTE(review): despite the name, dblp_dict is iterated as a sequence of
    # objects carrying 'person'/'papers' keys, so it is presumably a JSON
    # array -- confirm against the data file.
    professors = set()
    for key in dblp_dict:
        professors.add(key['person'])
    #print(professors)
    #print(len(professors))

    # professor -> flat list of co-authors over all papers; the author's own
    # name is removed from each paper's list (mutates the loaded JSON).
    coauthor_dict = defaultdict(list)
    for key in dblp_dict:
        author = key['person']
        for items in key['papers']:
            co_authors = items['co_authors']
            if author in co_authors:
                co_authors.remove(author)
            if co_authors:
                coauthor_dict[author].extend(co_authors)

    # All co-author names, flattened (duplicates preserved).
    list_of_coauthors = []
    for key in coauthor_dict:
        list_of_coauthors.extend(coauthor_dict[key])
    #print(len(list_of_coauthors))

    ### String / Data Matching for Entity linking using RLTK
    ### Remove duplicates in the coauthor_dict using String Matching
    ### Compare with professors and do entity linking / remove duplicates
    df1 = pd.DataFrame(list(professors), columns=['name'])
    #print(df1)
    df2 = pd.DataFrame(list_of_coauthors, columns=['name'])
    #print(len(df2))
    # Split names into first token / remainder; ids are 1-based row numbers
    # rendered as strings (RLTK record ids are strings).
    df1['first_name'] = df1.apply(lambda x: x['name'].split()[0], axis=1)
    df1['last_name'] = df1.apply(lambda x: ' '.join(x['name'].split()[1:]),
                                 axis=1)
    df1['id'] = (df1.index + 1).astype(str)
    #print(df1)
    df2['first_name'] = df2.apply(lambda x: x['name'].split()[0], axis=1)
    df2['last_name'] = df2.apply(lambda x: ' '.join(x['name'].split()[1:]),
                                 axis=1)
    df2['id'] = (df2.index + 1).astype(str)

    # Record1/Record2 are defined elsewhere in this file; the blocking and
    # scoring below rely on their fname/lname properties.
    ds1 = rltk.Dataset(reader=rltk.DataFrameReader(df1),
                       record_class=Record1,
                       adapter=rltk.MemoryKeyValueAdapter())
    ds2 = rltk.Dataset(reader=rltk.DataFrameReader(df2),
                       record_class=Record2,
                       adapter=rltk.MemoryKeyValueAdapter())

    # Block on exact first name, then score last names with Jaro-Winkler.
    bg = rltk.HashBlockGenerator()
    block = bg.generate(bg.block(ds1, property_='fname'),
                        bg.block(ds2, property_='fname'))
    pairs = rltk.get_record_pairs(ds1, ds2, block=block)
    num_pairs = 0
    sim_pairs = []
    sim_dict = {}
    for r1, r2 in pairs:
        num_pairs += 1
        sim = rltk.jaro_winkler_similarity(r1.lname, r2.lname)
        # Strictly between 0.9 and 1: near matches only, exact matches are
        # deliberately excluded.
        if 0.9 < sim < 1:
            sim_pairs.append(
                (r1.fname + ' ' + r1.lname, r2.fname + ' ' + r2.lname))
            sim_dict[r1.fname + ' ' + r1.lname] = r2.fname + ' ' + r2.lname
        #print(r1.lname,r2.lname,sim)
    #print(sim_pairs)
    #print("Blocking using Cuisine - Number of pairs:",num_pairs)

    # Rewrite co-author entries through the similarity map.
    # NOTE(review): sim_dict maps professor-side names (r1) to co-author-side
    # names (r2); if the intent is to canonicalize co-author variants to the
    # professor spelling, the mapping direction may be inverted -- confirm.
    for key in coauthor_dict:
        lis = coauthor_dict[key]
        for ind in range(len(lis)):
            if lis[ind] in sim_dict:
                lis[ind] = sim_dict[lis[ind]]

    with open("co_authors.json", "w") as jf:
        json.dump(coauthor_dict, jf, indent=2)