def find_pair(
        self, left_df: pd.DataFrame, right_df: pd.DataFrame
) -> typing.Tuple[pd.DataFrame, typing.List[tuple]]:
    class left(rltk.AutoGeneratedRecord):
        pass

    class right(rltk.AutoGeneratedRecord):
        pass

    left_df['id'] = left_df.index.astype(str)
    right_df['id'] = right_df.index.astype(str)
    if 'Unnamed: 0' in right_df.columns:
        right_df = right_df.drop(columns=['Unnamed: 0'])

    ds1 = rltk.Dataset(rltk.DataFrameReader(left_df), record_class=left)
    ds2 = rltk.Dataset(rltk.DataFrameReader(right_df), record_class=right)

    bg = rltk.HashBlockGenerator()
    block = bg.generate(
        bg.block(ds1, property_=self.join_target_column_names[0]),
        bg.block(ds2, property_=self.join_target_column_names[1]))
    pairs = rltk.get_record_pairs(ds1, ds2, block=block)

    pairs_list = []
    pairs_column = [[] for _ in range(right_df.shape[0])]
    for r1, r2 in pairs:
        pairs_column[int(r2.id)].append(int(r1.id))
        pairs_list.append((r1.id, r2.id))
    right_df["joining_pairs"] = pairs_column
    return right_df, pairs_list
def join(self, left_df: pd.DataFrame, right_df: pd.DataFrame,
         left_columns: typing.List[typing.List[int]],
         right_columns: typing.List[typing.List[int]],
         left_metadata: dict, right_metadata: dict) -> JoinResult:
    class left(rltk.AutoGeneratedRecord):
        pass

    class right(rltk.AutoGeneratedRecord):
        pass

    left_df['id'] = left_df.index.astype(str)
    right_df['id'] = right_df.index.astype(str)
    if 'Unnamed: 0' in right_df.columns:
        right_df = right_df.drop(columns=['Unnamed: 0'])

    ds1 = rltk.Dataset(rltk.DataFrameReader(left_df), record_class=left)
    ds2 = rltk.Dataset(rltk.DataFrameReader(right_df), record_class=right)

    bg = rltk.HashBlockGenerator()
    block = bg.generate(
        bg.block(ds1, property_=self.join_target_column_names[0]),
        bg.block(ds2, property_=self.join_target_column_names[1]))

    left_df = left_df.set_index('id')
    right_df = right_df.set_index('id')
    pairs = rltk.get_record_pairs(ds1, ds2, block=block)

    df_joined = pd.DataFrame()
    column_names_to_join = None
    for r1, r2 in pairs:
        left_res = left_df.loc[r1.id]
        right_res = right_df.loc[r2.id]
        if column_names_to_join is None:
            column_names_to_join = right_res.index.difference(left_res.index)
            matched_rows = right_res.index.intersection(left_res.index)
            columns_new = left_res.index.tolist()
            columns_new.extend(column_names_to_join.tolist())
        new = pd.concat([left_res, right_res[column_names_to_join]])
        df_joined = df_joined.append(new, ignore_index=True)

    # ensure the original dataframe's columns come first (on the left)
    df_joined = df_joined[columns_new]
    return JoinResult(df_joined, matched_rows)
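# --- Minimal usage sketch (not part of the original component) ---
# The two methods above share one pattern: wrap each DataFrame in an rltk
# Dataset, hash-block both sides on the join column, then enumerate candidate
# pairs. The sketch below shows only that pattern on made-up data; the 'city'
# column and the record class names are hypothetical.
import pandas as pd
import rltk

left_df = pd.DataFrame({'city': ['LA', 'NY'], 'population': [4, 8]})
right_df = pd.DataFrame({'city': ['NY', 'SF'], 'area': [784, 121]})
left_df['id'] = left_df.index.astype(str)
right_df['id'] = right_df.index.astype(str)


class LeftRecord(rltk.AutoGeneratedRecord):
    pass


class RightRecord(rltk.AutoGeneratedRecord):
    pass


ds1 = rltk.Dataset(rltk.DataFrameReader(left_df), record_class=LeftRecord)
ds2 = rltk.Dataset(rltk.DataFrameReader(right_df), record_class=RightRecord)

bg = rltk.HashBlockGenerator()
block = bg.generate(bg.block(ds1, property_='city'),
                    bg.block(ds2, property_='city'))
for r1, r2 in rltk.get_record_pairs(ds1, ds2, block=block):
    print(r1.id, r2.id)  # only the rows sharing a 'city' value are paired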
import pandas as pd
import rltk

print('from dataframe...')
df = pd.read_csv('ds1.csv', encoding='latin-1')
df['id'] = df['doc_id'].astype('str')


class DFRecord(rltk.AutoGeneratedRecord):
    pass


ds = rltk.Dataset(rltk.DataFrameReader(df), record_class=DFRecord)
for r in ds:
    print(r.id, r.doc_id, r.doc_value)

print('set id column...')


@rltk.set_id('col1', function_=lambda x: str(x), keep_original=True)
class DFRecord2(rltk.AutoGeneratedRecord):
    pass


df = pd.DataFrame({'col1': [1, 2], 'col2': [3, 4]})
ds = rltk.Dataset(reader=rltk.DataFrameReader(df), record_class=DFRecord2)
for r in ds:
    print(r.id, r.col1, r.col2)
        return self.raw_object['id']

    @property
    def address(self):
        return self.raw_object['Address']

    @property
    def phone(self):
        return self.raw_object['Phone']

    @property
    def cuisine(self):
        return self.raw_object['Cuisine']


ds1 = rltk.Dataset(reader=rltk.DataFrameReader(ds1),
                   record_class=Record1,
                   adapter=rltk.MemoryKeyValueAdapter())
ds2 = rltk.Dataset(reader=rltk.DataFrameReader(ds2),
                   record_class=Record2,
                   adapter=rltk.MemoryKeyValueAdapter())

'''bg = rltk.HashBlockGenerator()
blocks = bg.generate(bg.block(ds1, property_='cuisine'),
                     bg.block(ds2, property_='cuisine'))
pairs = rltk.get_record_pairs(ds1, ds2, block=blocks)'''
pairs = rltk.get_record_pairs(ds1, ds2)

f = open('similarities.txt', 'w+')
for r1, r2 in pairs:
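    # --- hypothetical continuation (the original loop body is not shown) ---
    # A typical body scores each candidate pair with a string similarity and
    # writes the result to the 'similarities.txt' handle opened above, e.g.:
    #     sim = rltk.jaro_winkler_similarity(r1.cuisine, r2.cuisine)
    #     f.write('{}\t{}\t{}\n'.format(r1.id, r2.id, sim))
    # The choice of field and output format here is illustrative only.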
def process():
    df_entity = pd.DataFrame()

    logger.info('loading entity dataframes')
    for infile in glob.glob(
            os.path.join(config['temp_dir'], config['run_name'],
                         '*/*.entity.h5')):
        source = os.path.basename(infile).split('.')[0]
        df_entity = df_entity.append(pd.read_hdf(infile))
    df_entity = df_entity.reset_index(drop=True)
    logger.info('Total number of entities: %d', len(df_entity))
    df_entity['type'] = df_entity['type'].apply(
        lambda x: x[0])  # only pick the first type (compatible with old pipeline)
    df_entity_ori = df_entity.copy()

    ### filtering
    logger.info('filtering out some entity types')
    all_types = set(df_entity['type'])
    # all_types = set([t for tu in df_entity['type'] for t in tu])  # multi-type support
    selected_types = filter(
        lambda x: x.startswith(
            ('ldcOnt:GPE', 'ldcOnt:LOC', 'ldcOnt:ORG', 'ldcOnt:PER')),
        all_types)
    df_entity = df_entity.loc[df_entity['type'].isin(selected_types)]
    # df_entity = df_entity.loc[[any([t in selected_types for t in tu]) for tu in df_entity['type']]]  # multi-type support
    df_entity = df_entity[df_entity['name'].notnull()]
    df_entity = df_entity.where(pd.notnull(df_entity), None)
    df_entity_left = df_entity_ori[~df_entity_ori['e'].isin(df_entity['e'])]

    ### generate rltk components
    logger.info('generating rltk components')
    ds = rltk.Dataset(reader=rltk.DataFrameReader(df_entity),
                      record_class=GaiaRecord)
    bg_kb = rltk.TokenBlocker()
    blocks_kb = bg_kb.block(
        ds, function_=lambda r: list(r.selected_target)
        if r.selected_target else ['None'])
    bg_fb = rltk.TokenBlocker()
    blocks_fb = bg_fb.block(
        ds, function_=lambda r: r.selected_fbid
        if r.selected_fbid else ['None'])

    ### clustering
    logger.info('clustering entity')

    # build cluster based on type
    all_clusters = []
    for bid, data in blocks_kb.key_set_adapter:
        if bid == 'None':
            continue
        c = Cluster(ds)
        for _, r_id in data:
            r = ds.get_record(r_id)
            for id_ in r.selected_target:
                c.kb_id.add(id_)
            if r.fbid:
                for id_ in r.selected_fbid:
                    c.fb_id.add(id_)
            if r.wikidata:
                for id_ in r.selected_wikidata:
                    c.wd_id.add(id_)
            c.add(r)
        all_clusters.append(c)

    # fb only clusters
    fb_only_clusters = {}
    for bid, data in blocks_fb.key_set_adapter:
        if bid == 'None':
            continue
        fb_only_clusters[bid] = set()
        for _, r_id in data:
            r = ds.get_record(r_id)
            if r.selected_target:
                continue
            fb_only_clusters[bid].add(r_id)
        if len(fb_only_clusters[bid]) == 0:
            del fb_only_clusters[bid]
    for bid, cluster in fb_only_clusters.items():
        c = Cluster(ds)
        for r_id in cluster:
            c.add(r_id)
            r = ds.get_record(r_id)
            if r.fbid:
                for id_ in r.selected_fbid:
                    c.fb_id.add(id_)
            if r.wikidata:
                for id_ in r.selected_wikidata:
                    c.wd_id.add(id_)
        all_clusters.append(c)

    # validation
    for idx, c in enumerate(all_clusters):
        if len(c.kb_id) > 1:
            logger.error('multiple kb_ids in cluster: %s', c.kb_id)
            break

        kb_ids = set()
        for r_id in c.all_records:
            r = ds.get_record(r_id)
            if r.selected_target:
                for id_ in r.selected_target:
                    kb_ids.add(id_)
        if len(kb_ids) > 1:
            logger.error('multiple kb_ids in cluster: %s %s', kb_ids, c.kb_id)
            break

    # split based on types
    all_clusters_splitted = []
    for c in all_clusters:
        types = {}
        for r_id in c.all_records:
            r = ds.get_record(r_id)
            type_ = normalize_type(r.type)
            if type_ not in types:
                cc = Cluster(ds)
                cc.type = type_
                types[type_] = cc
            cc = types[type_]
            cc.add(r_id)
            if r.selected_target:
                for id_ in r.selected_target:
                    cc.kb_id.add(id_)
            if r.selected_fbid:
                for id_ in r.selected_fbid:
                    cc.fb_id.add(id_)
            if r.selected_wikidata:
                for id_ in r.selected_wikidata:
                    cc.wd_id.add(id_)
        for cc in types.values():
            all_clusters_splitted.append(cc)

    # merge singleton
    final_clusters = deepcopy(all_clusters_splitted)
    MIN_SIM = 0.4
    clustered_entity_ids = set(
        [r for c in all_clusters for r in c.all_records])
    for _, e in df_entity['e'].items():
        if e not in clustered_entity_ids:
            r = ds.get_record(e)
            r_type = normalize_type(r.type)
            local_best = [None, 0]  # first item: best cluster, second item: score
            for c in final_clusters:
                sim = c.similarity(r)
                if r_type != c.type:
                    continue
                if sim >= MIN_SIM:
                    if sim > local_best[1]:
                        local_best = [c, sim]
            c = local_best[0]
            if c is not None:
                c.add(r, contribute=False)
            else:
                # still singleton, construct singleton cluster
                c = Cluster(ds)
                c.type = r_type
                c.add(r)
                final_clusters.append(c)

    # filtered-out entities
    # create cluster with fake record
    for _, e in df_entity_left.iterrows():
        c = Cluster(None)
        c.type = normalize_type(e['type'])
        c.add(e['e'], contribute=False)
        final_clusters.append(c)

    logger.info('Total number of clusters: %d', len(final_clusters))

    # create entity to cluster mapping
    entity_to_cluster = defaultdict(list)
    for c in final_clusters:
        for r in c.all_records:
            entity_to_cluster[r].append(c)
    for e, c in entity_to_cluster.items():
        if len(c) > 1:
            logger.error('Entity in multiple clusters detected, entity id: %s', e)

    ### generate cluster properties
    logger.info('generating cluster properties')
    for c in final_clusters:
        c.generate()

    ### export
    logger.info('exporting clusters')
    df_entity_cluster = df_entity_ori.copy()
    df_entity_cluster['cluster'] = None
    df_entity_cluster['synthetic'] = False

    logger.info('updating cluster info for each entity')
    for idx, e in df_entity_cluster['e'].items():
        clusters = tuple(set([c.full_id for c in entity_to_cluster[e]]))
        df_entity_cluster.at[idx, 'cluster'] = clusters

    logger.info('creating prototypes')
    proto_to_cluster_mapping = {}
    for c in final_clusters:
        proto_to_cluster_mapping[c.feature_entity_id] = c
    proto_dict = []
    for idx, row in df_entity_cluster.iterrows():
        eid = row['e']
        if eid not in proto_to_cluster_mapping:  # not a prototype
            continue
        c = proto_to_cluster_mapping[eid]
        # p = df_entity_ori[df_entity_ori['e'] == c.feature_entity_id].iloc[0]
        row = row.to_dict()
        row['synthetic'] = True
        row['cluster'] = tuple([c.full_id])
        row['e'] = c.prototype
        proto_dict.append(row)
    df_prototypes = pd.DataFrame.from_dict(proto_dict)

    logger.info('appending dataframes')
    df_complete_entity_clusters = df_entity_cluster.append(df_prototypes)
    # reset_index returns a new frame, so assign it back
    df_complete_entity_clusters = df_complete_entity_clusters.reset_index(drop=True)

    logger.info('writing to disk')
    output_file = os.path.join(config['temp_dir'], config['run_name'],
                               'entity_cluster.h5')
    with warnings.catch_warnings():
        warnings.simplefilter('ignore')
        df_complete_entity_clusters.to_hdf(output_file, 'entity',
                                           mode='w', format='fixed')
        df_complete_entity_clusters.to_csv(output_file + '.csv')
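# --- Minimal sketch of the token-blocking step used by process() above ---
# (not part of the pipeline; toy data, made-up column names). Each record
# contributes a list of keys -- here its linked KB ids -- and records sharing
# a key land in the same block, which is what seeds a cluster. The
# rltk.TokenBlocker / key_set_adapter usage mirrors the calls above and
# assumes the same rltk version as the pipeline.
import pandas as pd
import rltk

df = pd.DataFrame({
    'id': ['m1', 'm2', 'm3'],
    'name': ['Paris', 'Paris, France', 'Berlin'],
    'kb': [['Q90'], ['Q90'], []],   # zero or more KB ids per mention
})


class ToyRecord(rltk.AutoGeneratedRecord):
    pass


ds = rltk.Dataset(reader=rltk.DataFrameReader(df), record_class=ToyRecord)

bg = rltk.TokenBlocker()
blocks = bg.block(ds, function_=lambda r: r.kb if r.kb else ['None'])

for bid, data in blocks.key_set_adapter:
    if bid == 'None':
        continue  # unlinked mentions are handled separately (singleton merge)
    print(bid, [r_id for _, r_id in data])  # e.g. Q90 ['m1', 'm2']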
def process():
    df_entity = pd.DataFrame()
    df_event = pd.DataFrame()
    df_event_role = pd.DataFrame()
    df_relation = pd.DataFrame()
    df_relation_role = pd.DataFrame()

    logger.info('loading entity dataframes')
    for infile in glob.glob(
            os.path.join(config['temp_dir'], config['run_name'],
                         '*/*.entity.h5')):
        source = os.path.basename(infile).split('.')[0]
        # entity
        df_entity = df_entity.append(pd.read_hdf(infile))
        # event
        event_file = infile[:-len('entity.h5')] + 'event.h5'
        df_event = df_event.append(pd.read_hdf(event_file))
        event_role_file = infile[:-len('entity.h5')] + 'event_role.h5'
        df_event_role = df_event_role.append(pd.read_hdf(event_role_file))
        # relation
        relation_file = infile[:-len('entity.h5')] + 'relation.h5'
        df_relation = df_relation.append(pd.read_hdf(relation_file))
        relation_role_file = infile[:-len('entity.h5')] + 'relation_role.h5'
        df_relation_role = df_relation_role.append(
            pd.read_hdf(relation_role_file))

    logger.info('Read in {} entities, {} events, {} relations'.format(
        len(df_entity), len(df_event), len(df_relation)))

    df_entity = df_entity.drop_duplicates(
        subset=['e'],
        keep='last')  # CMU data has cross-document entities; only keep one
    df_entity = df_entity.reset_index(drop=True)
    df_entity['type'] = df_entity['type'].apply(
        lambda x: x[0])  # only pick the first type (compatible with old pipeline)
    df_entity_ori = df_entity.copy()

    df_event = df_event.drop_duplicates(subset=['e'],
                                        keep='last').reset_index(drop=True)
    df_event_role = df_event_role.drop_duplicates().reset_index(drop=True)
    df_relation = df_relation.drop_duplicates().reset_index(drop=True)
    df_relation_role = df_relation_role.drop_duplicates().reset_index(
        drop=True)

    logger.info(
        'After deduplication: {} entities, {} events, {} relations'.format(
            len(df_entity), len(df_event), len(df_relation)))

    ### filtering
    logger.info('filtering out some entity types')
    all_types = set(df_entity['type'])
    # all_types = set([t for tu in df_entity['type'] for t in tu])  # multi-type support
    selected_types = filter(
        lambda x: x.startswith(
            ('ldcOnt:GPE', 'ldcOnt:LOC', 'ldcOnt:ORG', 'ldcOnt:PER')),
        all_types)
    df_entity = df_entity.loc[df_entity['type'].isin(selected_types)]
    # df_entity = df_entity.loc[[any([t in selected_types for t in tu]) for tu in df_entity['type']]]  # multi-type support
    df_entity = df_entity[df_entity['name'].notnull()]
    df_entity = df_entity.where(pd.notnull(df_entity), None)
    df_entity_left = df_entity_ori[~df_entity_ori['e'].isin(df_entity['e'])]

    ### generate rltk components
    logger.info('generating rltk components')
    ds = rltk.Dataset(reader=rltk.DataFrameReader(df_entity),
                      record_class=GaiaRecord)
    # for r in ds:
    #     print(r.concatenated_labels)
    #     print(r.name, r.target, r.wikidata, r.selected_target_index, r.selected_wikidata_index)
    bg_kb = rltk.TokenBlocker()
    blocks_kb = bg_kb.block(
        ds, function_=lambda r: [r.selected_target]
        if r.selected_target else ['None'])
    bg_wd = rltk.TokenBlocker()
    blocks_wd = bg_wd.block(
        ds, function_=lambda r: [r.selected_wikidata]
        if r.selected_wikidata else ['None'])

    ### clustering
    logger.info('clustering entity')

    # build cluster based on type
    all_clusters = []
    for bid, data in blocks_kb.key_set_adapter:
        if bid == 'None':
            continue
        c = Cluster(ds)
        for _, r_id in data:
            r = ds.get_record(r_id)
            if r.target and not c.kb_id:
                c.kb_id = r.selected_target
                c.kb_labels = set(r.selected_target_labels)
            if r.wikidata:
                if r.selected_wikidata not in c.wd_candidate:
                    c.wd_candidate[r.selected_wikidata] = set(
                        r.selected_wikidata_labels)
            c.add(r)
        c.elect_wd_id()
        all_clusters.append(c)

    # find all wd only blocks
    wd_only_clusters = {}
    for bid, data in blocks_wd.key_set_adapter:
        if bid == 'None':
            continue
        wd_only_clusters[bid] = set()
        for _, r_id in data:
            r = ds.get_record(r_id)
            if r.selected_target:
                continue
            wd_only_clusters[bid].add(r_id)
        if len(wd_only_clusters[bid]) == 0:
            del wd_only_clusters[bid]

    # if wd block overlaps with kb clusters
    for c in all_clusters:
        if c.wd_id and c.wd_id in wd_only_clusters:
            for r in wd_only_clusters[c.wd_id]:
                c.add(r)
            del wd_only_clusters[c.wd_id]

    # construct clusters based on blocks
    for bid, cluster in wd_only_clusters.items():
        c = Cluster(ds)
        for r_id in cluster:
            c.add(r_id)
            r = ds.get_record(r_id)
            if not c.wd_id:
                c.wd_id = r.selected_wikidata
                c.wd_labels = set(r.selected_wikidata_labels)
        all_clusters.append(c)

    # validation
    # for idx, c in enumerate(all_clusters):
    #     if len(c.kb_id) > 1:
    #         logger.error('multiple kb_ids in cluster: %s', c.kb_id)
    #         break
    #
    #     kb_ids = set()
    #     for r_id in c.all_records:
    #         r = ds.get_record(r_id)
    #         if r.selected_target:
    #             for id_ in r.selected_target:
    #                 kb_ids.add(id_)
    #     if len(kb_ids) > 1:
    #         logger.error('multiple kb_ids in cluster: %s %s', kb_ids, c.kb_id)
    #         break

    # split based on types
    all_clusters_splitted = []
    for c in all_clusters:
        types = {}
        for r_id in c.all_records:
            r = ds.get_record(r_id)
            type_ = normalize_type(r.type)
            if type_ not in types:
                cc = Cluster(ds)
                cc.type = type_
                types[type_] = cc
            cc = types[type_]
            cc.add(r_id)
            cc.kb_id = c.kb_id
            cc.kb_labels = c.kb_labels
            cc.wd_id = c.wd_id
            cc.wd_labels = c.wd_labels
        for cc in types.values():
            all_clusters_splitted.append(cc)

    # merge singleton
    final_clusters = deepcopy(all_clusters_splitted)
    # MIN_SIM = 0.4
    clustered_entity_ids = set(
        [r for c in all_clusters for r in c.all_records])
    for _, e in df_entity['e'].items():
        if e not in clustered_entity_ids:
            r = ds.get_record(e)
            r_type = normalize_type(r.type)
            local_best = [None, 0]  # first item: best cluster, second item: score
            for c in final_clusters:
                sim = c.similarity(r)
                if r_type != c.type:
                    continue
                if sim > local_best[1]:
                    local_best = [c, sim]
            c = local_best[0]
            if c is not None:
                c.add(r)
            else:
                # still singleton, construct singleton cluster
                c = Cluster(ds)
                c.type = r_type
                c.add(r)
                c.name_labels = set(r.name)
                final_clusters.append(c)

    # filtered-out entities
    # create cluster with fake record
    for _, e in df_entity_left.iterrows():
        c = Cluster(None)
        c.type = normalize_type(e['type'])
        c.add(e['e'])
        final_clusters.append(c)

    logger.info('Total number of clusters: %d', len(final_clusters))

    # create entity to cluster mapping
    entity_to_cluster = defaultdict(list)
    for c in final_clusters:
        for r in c.all_records:
            entity_to_cluster[r].append(c)
    for e, c in entity_to_cluster.items():
        if len(c) > 1:
            logger.error('Entity in multiple clusters detected, entity id: %s', e)
    entity_to_cluster = {e: c[0] for e, c in entity_to_cluster.items()}

    ### generate cluster properties
    logger.info('generating cluster properties')
    for c in final_clusters:
        c.generate()

    ### event and relation cluster
    # these cluster URIs will be {event/relation uri}-cluster
    # the prototype URIs will then just be {event/relation uri}

    ### event role
    event_role_se_dict = {
        'prototype1': [],
        'prototype2': [],
        'role': [],
        'just': []
    }
    for idx, v in df_event_role.iterrows():
        event_role_se_dict['prototype1'].append(v['event'])
        event_role_se_dict['prototype2'].append(
            entity_to_cluster[v['entity']].prototype)
        event_role_se_dict['role'].append(v['role'])
        event_role_se_dict['just'].append(v['just'])
    df_event_role_se = pd.DataFrame.from_dict(event_role_se_dict)

    ### relation role
    relation_role_se_dict = {
        'prototype1': [],
        'prototype2': [],
        'role': [],
        'just': []
    }
    for idx, v in df_relation_role.iterrows():
        relation_role_se_dict['prototype1'].append(v['relation'])
        if v['type'] == 'entity':
            relation_role_se_dict['prototype2'].append(
                entity_to_cluster[v['e']].prototype)
        elif v['type'] == 'event':
            relation_role_se_dict['prototype2'].append(v['e'])
        relation_role_se_dict['role'].append(v['role'])
        relation_role_se_dict['just'].append(v['just'])
    df_relation_role_se = pd.DataFrame.from_dict(relation_role_se_dict)

    ### export
    logger.info('exporting clusters')
    df_entity_cluster = df_entity_ori.copy()
    df_entity_cluster['cluster'] = None
    df_entity_cluster['synthetic'] = False
    df_entity_cluster['cluster_member_confidence'] = None

    logger.info('updating cluster info for each entity')
    for idx, e in df_entity_cluster['e'].items():
        clusters = [entity_to_cluster[e]]
        cluster_ids = tuple([c.full_id for c in clusters])
        confidences = tuple([c.member_confidence[e] for c in clusters])
        df_entity_cluster.at[idx, 'cluster'] = cluster_ids
        df_entity_cluster.at[idx, 'cluster_member_confidence'] = confidences

    logger.info('creating prototypes')
    proto_to_cluster_mapping = {}
    for c in final_clusters:
        proto_to_cluster_mapping[c.feature_entity_id] = c
    proto_dict = []
    for idx, row in df_entity_cluster.iterrows():
        eid = row['e']
        if eid not in proto_to_cluster_mapping:  # not a prototype
            continue
        c = proto_to_cluster_mapping[eid]
        # p = df_entity_ori[df_entity_ori['e'] == c.feature_entity_id].iloc[0]
        row = row.to_dict()
        row['synthetic'] = True
        row['cluster'] = tuple([c.full_id])
        row['e'] = c.prototype
        proto_dict.append(row)
    df_prototypes = pd.DataFrame.from_dict(proto_dict)

    logger.info('appending dataframes')
    df_complete_entity_clusters = df_entity_cluster.append(df_prototypes)
    # reset_index returns a new frame, so assign it back
    df_complete_entity_clusters = df_complete_entity_clusters.reset_index(drop=True)

    logger.info('writing to disk')
    entity_cluster_output_file = os.path.join(config['temp_dir'],
                                              config['run_name'],
                                              'entity_cluster')
    with warnings.catch_warnings():
        warnings.simplefilter('ignore')
        df_complete_entity_clusters.to_hdf(entity_cluster_output_file + '.h5',
                                           'entity', mode='w', format='fixed')
        df_complete_entity_clusters.to_csv(entity_cluster_output_file +
                                           '.h5.csv')
    with open(entity_cluster_output_file + '.cluster.jl', 'w') as f:
        for c in final_clusters:
            f.write(json.dumps(c.debug()) + '\n')

    with warnings.catch_warnings():
        warnings.simplefilter('ignore')
        # event
        event_cluster_output_file = os.path.join(config['temp_dir'],
                                                 config['run_name'],
                                                 'event_cluster.h5')
        df_event.to_hdf(event_cluster_output_file, 'event')
        event_role_output_file = os.path.join(config['temp_dir'],
                                              config['run_name'],
                                              'event_role.h5')
        df_event_role_se.to_hdf(event_role_output_file, 'event_role')
        df_event_role_se.to_csv(event_role_output_file + '.csv')
        # relation
        relation_cluster_output_file = os.path.join(config['temp_dir'],
                                                    config['run_name'],
                                                    'relation_cluster.h5')
        df_relation.to_hdf(relation_cluster_output_file, 'relation')
        relation_role_output_file = os.path.join(config['temp_dir'],
                                                 config['run_name'],
                                                 'relation_role.h5')
        df_relation_role_se.to_hdf(relation_role_output_file, 'relation_role',
                                   mode='w', format='fixed')
        df_relation_role_se.to_csv(relation_role_output_file + '.csv')
from datetime import datetime, timedelta
from random import randrange

import pandas as pd
import rltk


def random_date(start, end):
    """Return a random datetime between `start` and `end`."""
    delta = end - start
    int_delta = (delta.days * 24 * 60 * 60) + delta.seconds
    random_second = randrange(int_delta)
    return start + timedelta(seconds=random_second)


selected = dict()
df_gt = pd.read_csv('../datasets/DBLP-Scholar/DBLP-Scholar_perfectMapping.csv',
                    encoding='latin-1').sample(100)
df_gt = df_gt[df_gt['idDBLP'].map(len) < 50]
df_gt = df_gt[df_gt['idScholar'].map(len) < 50]
for d in rltk.DataFrameReader(df_gt):
    date = random_date(datetime(2018, 12, 20), datetime(2018, 12, 30)).date()
    selected[d['idDBLP']] = date
    selected[d['idScholar']] = date
df_gt.to_csv('dblp_scholar_gt.csv', index=False)

df_dblp = pd.read_csv('../datasets/DBLP-Scholar/DBLP1.csv', encoding='latin-1')
df_dblp_out = {'id': [], 'names': [], 'date': []}
for _, row in df_dblp.iterrows():
    # print(row['id'], row['authors'], row['year'])
    if row['id'] in selected:
        if not isinstance(row['authors'], str):
            continue
        df_dblp_out['id'].append(row['id'])
        df_dblp_out['names'].append(row['authors'])
        df_dblp_out['date'].append(selected[row['id']])
def main():
    with open("dblp_final_JSON.json", "r") as f:
        dblp_dict = json.load(f)

    professors = set()
    for key in dblp_dict:
        professors.add(key['person'])
    # print(professors)
    # print(len(professors))

    coauthor_dict = defaultdict(list)
    for key in dblp_dict:
        author = key['person']
        for items in key['papers']:
            co_authors = items['co_authors']
            if author in co_authors:
                co_authors.remove(author)
            if co_authors:
                coauthor_dict[author].extend(co_authors)

    list_of_coauthors = []
    for key in coauthor_dict:
        list_of_coauthors.extend(coauthor_dict[key])
    # print(len(list_of_coauthors))

    ### String / Data Matching for Entity linking using RLTK
    ### Remove duplicates in the coauthor_dict using String Matching
    ### Compare with professors and do entity linking / remove duplicates

    df1 = pd.DataFrame(list(professors), columns=['name'])
    # print(df1)
    df2 = pd.DataFrame(list_of_coauthors, columns=['name'])
    # print(len(df2))

    df1['first_name'] = df1.apply(lambda x: x['name'].split()[0], axis=1)
    df1['last_name'] = df1.apply(lambda x: ' '.join(x['name'].split()[1:]),
                                 axis=1)
    df1['id'] = (df1.index + 1).astype(str)
    # print(df1)

    df2['first_name'] = df2.apply(lambda x: x['name'].split()[0], axis=1)
    df2['last_name'] = df2.apply(lambda x: ' '.join(x['name'].split()[1:]),
                                 axis=1)
    df2['id'] = (df2.index + 1).astype(str)

    ds1 = rltk.Dataset(reader=rltk.DataFrameReader(df1),
                       record_class=Record1,
                       adapter=rltk.MemoryKeyValueAdapter())
    ds2 = rltk.Dataset(reader=rltk.DataFrameReader(df2),
                       record_class=Record2,
                       adapter=rltk.MemoryKeyValueAdapter())

    bg = rltk.HashBlockGenerator()
    block = bg.generate(bg.block(ds1, property_='fname'),
                        bg.block(ds2, property_='fname'))
    pairs = rltk.get_record_pairs(ds1, ds2, block=block)

    num_pairs = 0
    sim_pairs = []
    sim_dict = {}
    for r1, r2 in pairs:
        num_pairs += 1
        sim = rltk.jaro_winkler_similarity(r1.lname, r2.lname)
        if 0.9 < sim < 1:
            sim_pairs.append(
                (r1.fname + ' ' + r1.lname, r2.fname + ' ' + r2.lname))
            sim_dict[r1.fname + ' ' + r1.lname] = r2.fname + ' ' + r2.lname
            # print(r1.lname, r2.lname, sim)
    # print(sim_pairs)
    # print("Blocking using Cuisine - Number of pairs:", num_pairs)

    for key in coauthor_dict:
        lis = coauthor_dict[key]
        for ind in range(len(lis)):
            if lis[ind] in sim_dict:
                lis[ind] = sim_dict[lis[ind]]

    with open("co_authors.json", "w") as jf:
        json.dump(coauthor_dict, jf, indent=2)
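# --- Illustration of the 0.9 < sim < 1 band used in main() above ---
# (standalone calls on made-up names, not part of the original script).
# jaro_winkler_similarity returns 1.0 for identical strings, so the strict
# upper bound skips exact matches and keeps only near-duplicate spellings.
import rltk

print(rltk.jaro_winkler_similarity('Martinez', 'Martines'))  # near-duplicate, well above 0.9
print(rltk.jaro_winkler_similarity('Martinez', 'Gupta'))     # unrelated, well below 0.9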