def create_mtable(table, key=None, ltable=None, rtable=None,
                  foreign_key_ltable=None, foreign_key_rtable=None):
    """Wrap a dataframe in an MTable, optionally attaching vtable metadata.

    The four vtable properties (ltable, rtable and both foreign keys) are
    stored only when every one of them is supplied; a partial set triggers
    a warning and none of them are stored.
    """
    out_table = MTable(table, key=key)
    supplied = [ltable is not None,
                rtable is not None,
                foreign_key_ltable is not None,
                foreign_key_rtable is not None]
    if all(supplied):
        out_table.set_property('ltable', ltable)
        out_table.set_property('rtable', rtable)
        out_table.set_property('foreign_key_ltable', foreign_key_ltable)
        out_table.set_property('foreign_key_rtable', foreign_key_rtable)
    elif any(supplied):
        # Partial metadata is ambiguous, so warn and store none of it.
        logging.getLogger(__name__).warning(
            'Not all the properties for vtable are given; so not setting '
            'any of them')
    return out_table
def block_tables(self, ltable, rtable, l_block_attr, r_block_attr,
                 l_output_attrs=None, r_output_attrs=None):
    """Equi-join style blocking of two MTables on one attribute each.

    Keeps only the tuple pairs whose l_block_attr value (in ltable)
    equals the r_block_attr value (in rtable).

    Parameters
    ----------
    ltable, rtable : MTable
        Input MTables
    l_block_attr, r_block_attr : string
        Attribute names in ltable, rtable used for the equality check
    l_output_attrs, r_output_attrs : list (of strings), defaults to None
        Attribute names to be included in the output table

    Returns
    -------
    blocked_table : MTable
        Tuple pairs whose l_block_attr and r_block_attr values are same.
        Contains _id plus the id columns of both inputs, and carries the
        ltable/rtable references and foreign-key properties.
    """
    # validate attributes and normalize the output-attribute lists
    l_output_attrs, r_output_attrs = self.check_attrs(
        ltable, rtable, l_block_attr, r_block_attr,
        l_output_attrs, r_output_attrs)

    # rows with NaN in the blocking attribute can never match; drop them
    left_frame = self.rem_nan(ltable, l_block_attr)
    right_frame = self.rem_nan(rtable, r_block_attr)

    joined = pd.merge(left_frame, right_frame,
                      left_on=l_block_attr, right_on=r_block_attr,
                      suffixes=('_ltable', '_rtable'))

    # decide which merged columns to keep and what to rename them to
    retain_cols, final_cols = self.output_columns(
        ltable.get_key(), rtable.get_key(), list(joined.columns),
        l_output_attrs, r_output_attrs)
    projected = joined[retain_cols]
    projected.columns = final_cols

    candset = MTable(projected)
    # record provenance of the candidate set
    candset.set_property('ltable', ltable)
    candset.set_property('rtable', rtable)
    candset.set_property('foreign_key_ltable', 'ltable.' + ltable.get_key())
    candset.set_property('foreign_key_rtable', 'rtable.' + rtable.get_key())
    return candset
def block_union_combine(candset_list):
    """Union the (ltable id, rtable id) pairs of several candidate sets.

    Produces one MTable containing each distinct id pair once, projected
    onto the union of ltable/rtable columns seen across all inputs.
    """
    ltable, rtable = lr_tables(candset_list)
    key_l = 'ltable.' + ltable.get_key()
    key_r = 'rtable.' + rtable.get_key()

    # collect every distinct (ltable id, rtable id) pair across inputs
    id_set = set()
    for cand in candset_list:
        for _, row in cand.iterrows():
            id_set.add((row[key_l], row[key_r]))

    # union of the column names over all candidate sets
    col_set = set()
    for cand in candset_list:
        col_set.update(cand.columns)
    col_l, col_r = lr_cols(col_set)

    # fetch the projected source tuples for every surviving id pair
    dict_list = [get_dict(ltable.ix[lid], rtable.ix[rid], col_l, col_r)
                 for lid, rid in id_set]
    frame = pd.DataFrame(dict_list)

    col_f = fin_cols(col_l, col_r, ltable.get_key(), rtable.get_key())
    out = MTable(frame[col_f])
    out._add_key('_id')
    out.set_property('ltable', ltable)
    out.set_property('rtable', rtable)
    return out
def create_mtable(table, key=None, ltable=None, rtable=None,
                  foreign_key_ltable=None, foreign_key_rtable=None):
    """Build an MTable from a dataframe, optionally tagging vtable properties.

    All four vtable properties must be supplied together; when only some
    are given, a warning is logged and none are set.
    """
    out_table = MTable(table, key=key)
    prop_pairs = [('ltable', ltable),
                  ('rtable', rtable),
                  ('foreign_key_ltable', foreign_key_ltable),
                  ('foreign_key_rtable', foreign_key_rtable)]
    missing = [name for name, value in prop_pairs if value is None]
    if not missing:
        for name, value in prop_pairs:
            out_table.set_property(name, value)
    elif len(missing) < len(prop_pairs):
        # some (but not all) supplied — refuse to set a partial metadata set
        logging.getLogger(__name__).warning(
            'Not all the properties for vtable are given; so not setting '
            'any of them')
    return out_table
def block_tables(self, ltable, rtable, ltable_block_attribute,
                 rtable_block_attribute, ltable_output_colnames=None,
                 rtable_output_colnames=None):
    """Block two MTables via an equi-join on the given block attributes.

    Returns an MTable keyed by a fresh '_id' column, carrying the
    requested output columns and references to both input tables.
    """
    # integrity checks; also normalizes the output-column lists
    ltable_output_colnames, rtable_output_colnames = check_columns(
        ltable, rtable, ltable_block_attribute, rtable_block_attribute,
        ltable_output_colnames, rtable_output_colnames)

    # remove rows with nan values in block attribute column
    clean_l, clean_r = rem_nans(ltable, rtable,
                                ltable_block_attribute,
                                rtable_block_attribute)

    merged = pd.merge(clean_l, clean_r,
                      left_on=ltable_block_attribute,
                      right_on=rtable_block_attribute,
                      suffixes=('_ltable', '_rtable'),
                      copy=False)

    # choose the columns to keep and their final names
    ret_cols, fin_cols = out_cols(ltable.get_key(), rtable.get_key(),
                                  list(merged.columns),
                                  ltable_output_colnames,
                                  rtable_output_colnames)
    candset = MTable(merged[ret_cols])
    candset._add_key('_id')
    candset.columns = fin_cols
    candset.set_property('ltable', ltable)
    candset.set_property('rtable', rtable)
    return candset
def read_csv(file_path, **kwargs):
    """Read a CSV (comma-separated) file into an MTable.

    Parameters
    ----------
    file_path : string
        Path of the CSV file to read
    kwargs : dict
        Arguments forwarded to pandas read_csv, plus optional "key"
        (MTable) or "key"/"ltable"/"rtable"/"foreign_key_ltable"/
        "foreign_key_rtable" (VTable)

    Returns
    -------
    result : MTable

    Note
    ----
    Metadata can be embedded at the top of the file as comment lines,
    e.g. ``#key=id``; keyword arguments override or supplement it.
    """
    # parse the "#name=value" header lines and count them
    properties, num_lines = get_properties_from_file(file_path)
    properties, kwargs = update_properties(properties, **kwargs)
    check_properties(properties)
    # skip the metadata header when parsing the actual data
    kwargs['skiprows'] = num_lines
    frame = pd.read_csv(file_path, **kwargs)
    key = properties.pop('key', None)
    table = MTable(frame, key=key) if key is not None else MTable(frame)
    # remaining metadata entries become MTable properties
    for name, value in properties.iteritems():
        table.set_property(name, value)
    return table
def combine_block_outputs_via_union(blocker_output_list):
    """
    Combine blocker outputs by unioning ltable, rtable ids in candidate set

    Parameters
    ----------
    blocker_output_list : list
        List of blocker outputs

    Returns
    -------
    combined_blocker_output : MTable
        With combined blocker outputs

    Notes
    -----
    Combined_blocker_output contains the following attributes

    * _id
    * combined id pairs (ltable.id, rtabled.id) from list of blocker outputs
    * union of non-id attributes from each of blocker output
    """
    ltable, rtable = lr_tables(blocker_output_list)
    # get the attribute names in blocker output that represents ltable, rtable
    l_key = 'ltable.' + ltable.get_key()
    r_key = 'rtable.' + rtable.get_key()
    l_df = ltable.to_dataframe()
    r_df = rtable.to_dataframe()
    # get the union of attribute names from blocker output list
    col_set = set([x for c in blocker_output_list for x in c.columns])
    l_col, r_col = lr_cols(col_set)
    l_col = list(l_col)
    r_col = list(r_col)
    # project the source tables down to the attributes that appear in any
    # blocker output
    l_df = l_df[l_col]  # minimally the projection must contain id column
    r_df = r_df[r_col]
    # prefix projected columns so they match the candidate-set naming scheme
    col_names = ['ltable.'+c for c in l_df.columns]
    l_df.columns = col_names
    col_names = ['rtable.'+c for c in r_df.columns]
    r_df.columns = col_names
    # index by key (kept as a column too, via drop=False) for the .ix
    # lookups below
    l_df.set_index(l_key, inplace=True, drop=False)
    r_df.set_index(r_key, inplace=True, drop=False)
    # get id pairs: read each output's foreign-key columns positionally
    id_set = []
    for c in blocker_output_list:
        lfid_idx = c.get_attr_names().index(c.get_property('foreign_key_ltable'))
        rfid_idx = c.get_attr_names().index(c.get_property('foreign_key_rtable'))
        for r in c.itertuples(index=False):
            id_set.append((r[lfid_idx], r[rfid_idx]))
    # de-duplicate the id pairs — this is the actual "union"
    id_set = list(set(id_set))
    f_cols = fin_cols(l_col, r_col, ltable.get_key(), rtable.get_key())
    if len(id_set) > 0:
        id_df = pd.DataFrame(id_set)
        # fetch the l/r tuples for every surviving pair and stitch them
        # together side by side (row order matches id_df on both sides)
        l_consol_table = l_df.ix[id_df[0]]
        r_consol_table = r_df.ix[id_df[1]]
        l_consol_table.reset_index(inplace=True, drop=True)
        r_consol_table.reset_index(inplace=True, drop=True)
        table = pd.concat([l_consol_table, r_consol_table], axis=1)
        # deterministic output ordering by the two key columns
        table.sort([l_key, r_key], inplace=True)
        table.reset_index(inplace=True, drop=True)
        table = MTable(table[f_cols])
    else:
        # no pairs at all: empty table with the expected schema
        table = MTable([], columns=f_cols)
    # project df and convert to MTable
    table.set_property('ltable', ltable)
    table.set_property('rtable', rtable)
    table.set_property('foreign_key_ltable', 'ltable.'+ltable.get_key())
    table.set_property('foreign_key_rtable', 'rtable.'+rtable.get_key())
    return table
def block_candset(self, vtable, l_block_attr, r_block_attr): """ Block candidate set (virtual MTable) based on l_block_attr, r_block_attr equivalence (similar to equi-join) Parameters ---------- vtable : MTable Input candidate set l_block_attr, r_block_attr : string, attribute names in ltable, rtable Returns ------- blocked_table : MTable Containing tuple pairs whose l_block_attr and r_block_attr values are same Notes ----- Output MTable contains the following three attributes * _id * id column from ltable * id column from rtable Also, the properties of blocked table is updated with following key-value pairs * ltable - ref to ltable * rtable - ref to rtable * key * foreign_key_ltable - string, ltable's id attribute name * foreign_key_rtable - string, rtable's id attribute name """ # do integrity checks ltable = vtable.get_property('ltable') rtable = vtable.get_property('rtable') self.check_attrs(ltable, rtable, l_block_attr, r_block_attr, None, None) l_key = 'ltable.' + ltable.get_key() r_key = 'rtable.' + rtable.get_key() # convert to dataframes l_df = ltable.to_dataframe() r_df = rtable.to_dataframe() # set index for convenience l_df.set_index(ltable.get_key(), inplace=True) r_df.set_index(rtable.get_key(), inplace=True) if mg._verbose: count = 0 per_count = math.ceil(mg._percent / 100.0 * len(vtable)) print per_count elif mg._progbar: bar = pyprind.ProgBar(len(vtable)) # keep track of valid ids valid = [] # iterate candidate set and process each row for idx, row in vtable.iterrows(): if mg._verbose: count += 1 if count % per_count == 0: print str(mg._percent * count / per_count) + ' percentage done !!!' 
elif mg._progbar: bar.update() # get the value of block attribute from ltuple l_val = l_df.ix[row[l_key], l_block_attr] r_val = r_df.ix[row[r_key], r_block_attr] if l_val != np.NaN and r_val != np.NaN: if l_val == r_val: valid.append(True) else: valid.append(False) else: valid.append(False) # should be modified if len(vtable) > 0: out_table = MTable(vtable[valid], key=vtable.get_key()) else: out_table = MTable(columns=vtable.columns, key=vtable.get_key()) out_table.set_property('ltable', ltable) out_table.set_property('rtable', rtable) out_table.set_property('foreign_key_ltable', 'ltable.' + ltable.get_key()) out_table.set_property('foreign_key_rtable', 'rtable.' + rtable.get_key()) return out_table
def block_tables(self, ltable, rtable, l_block_attr, r_block_attr,
                 l_output_attrs=None, r_output_attrs=None):
    """Produce a candidate set by equi-joining ltable and rtable.

    Parameters
    ----------
    ltable, rtable : MTable
        Input MTables
    l_block_attr, r_block_attr : string
        Attribute names in ltable, rtable that must be equal for a pair
        to survive blocking
    l_output_attrs, r_output_attrs : list (of strings), defaults to None
        Attribute names to be included in the output table

    Returns
    -------
    blocked_table : MTable
        Contains _id, the id columns of both inputs and the requested
        output attributes; its properties record ltable, rtable and the
        two foreign-key attribute names.
    """
    # integrity checks; also fills in default output-attribute lists
    l_output_attrs, r_output_attrs = self.check_attrs(
        ltable, rtable, l_block_attr, r_block_attr, l_output_attrs,
        r_output_attrs)

    # NaNs in the blocking columns cannot participate in an equi-join
    lhs = self.rem_nan(ltable, l_block_attr)
    rhs = self.rem_nan(rtable, r_block_attr)

    merged = pd.merge(lhs, rhs,
                      left_on=l_block_attr,
                      right_on=r_block_attr,
                      suffixes=('_ltable', '_rtable'))

    keep, renamed = self.output_columns(ltable.get_key(),
                                        rtable.get_key(),
                                        list(merged.columns),
                                        l_output_attrs, r_output_attrs)
    merged = merged[keep]
    merged.columns = renamed

    result = MTable(merged)
    # attach provenance metadata in one pass
    for prop, value in (('ltable', ltable),
                        ('rtable', rtable),
                        ('foreign_key_ltable', 'ltable.' + ltable.get_key()),
                        ('foreign_key_rtable', 'rtable.' + rtable.get_key())):
        result.set_property(prop, value)
    return result
def block_candset(self, vtable, l_block_attr, r_block_attr): """ Block candidate set (virtual MTable) based on l_block_attr, r_block_attr equivalence (similar to equi-join) Parameters ---------- vtable : MTable Input candidate set l_block_attr, r_block_attr : string, attribute names in ltable, rtable Returns ------- blocked_table : MTable Containing tuple pairs whose l_block_attr and r_block_attr values are same Notes ----- Output MTable contains the following three attributes * _id * id column from ltable * id column from rtable Also, the properties of blocked table is updated with following key-value pairs * ltable - ref to ltable * rtable - ref to rtable * key * foreign_key_ltable - string, ltable's id attribute name * foreign_key_rtable - string, rtable's id attribute name """ # do integrity checks ltable = vtable.get_property('ltable') rtable = vtable.get_property('rtable') self.check_attrs(ltable, rtable, l_block_attr, r_block_attr, None, None) l_key = 'ltable.' + ltable.get_key() r_key = 'rtable.' + rtable.get_key() # convert to dataframes l_df = ltable.to_dataframe() r_df = rtable.to_dataframe() # set index for convenience l_df.set_index(ltable.get_key(), inplace=True) r_df.set_index(rtable.get_key(), inplace=True) if mg._verbose: count = 0 per_count = math.ceil(mg._percent/100.0*len(vtable)) print per_count elif mg._progbar: bar = pyprind.ProgBar(len(vtable)) # keep track of valid ids valid = [] # iterate candidate set and process each row for idx, row in vtable.iterrows(): if mg._verbose: count += 1 if count%per_count == 0: print str(mg._percent*count/per_count) + ' percentage done !!!' 
elif mg._progbar: bar.update() # get the value of block attribute from ltuple l_val = l_df.ix[row[l_key], l_block_attr] r_val = r_df.ix[row[r_key], r_block_attr] if l_val != np.NaN and r_val != np.NaN: if l_val == r_val: valid.append(True) else: valid.append(False) else: valid.append(False) # should be modified if len(vtable) > 0: out_table = MTable(vtable[valid], key=vtable.get_key()) else: out_table = MTable(columns=vtable.columns, key=vtable.get_key()) out_table.set_property('ltable', ltable) out_table.set_property('rtable', rtable) out_table.set_property('foreign_key_ltable', 'ltable.'+ltable.get_key()) out_table.set_property('foreign_key_rtable', 'rtable.'+rtable.get_key()) return out_table
def block_tables_skd(self, ltable, rtable, l_block_attr, r_block_attr,
                     l_output_attrs=None, r_output_attrs=None):
    """
    Block tables based on l_block_attr, r_block_attr equivalence
    (similar to equi-join) — parallel variant that splits both inputs
    and blocks the cross product of the splits in a process pool.

    Parameters
    ----------
    ltable, rtable : MTable
        Input MTables
    l_block_attr, r_block_attr : string
        Attribute names in ltable, rtable
    l_output_attrs, r_output_attrs : list (of strings), defaults to None
        Attribute names to be included in the output table

    Returns
    -------
    blocked_table : MTable
        Containing tuple pairs whose l_block_attr and r_block_attr
        values are same; carries ltable/rtable references and the
        foreign-key properties.
    """
    # do integrity checks
    l_output_attrs, r_output_attrs = self.check_attrs(ltable, rtable, l_block_attr, r_block_attr, l_output_attrs, r_output_attrs)
    # remove nans
    l_df = self.rem_nan(ltable, l_block_attr)
    r_df = self.rem_nan(rtable, r_block_attr)
    #print 'cpu_count() = %d\n' % multiprocessing.cpu_count()
    cpu_count = multiprocessing.cpu_count()
    # partition work as an m x n grid of dataframe pairs
    # NOTE(review): Python 2 integer division; when cpu_count is not a
    # perfect square, m*n can be smaller than cpu_count — confirm intended
    m = int(math.sqrt(cpu_count)) # no. of splits of l_df
    n = cpu_count/m # no. of splits of r_df
    print "m: ", m, ", n: ", n
    t0 = time.time()
    l_splits = np.array_split(l_df, m)
    t1 = time.time()
    r_splits = np.array_split(r_df, n)
    t2 = time.time()
    l_key = ltable.get_key()
    r_key = rtable.get_key()
    # one task per (l_split, r_split) pair, carrying all blocking args
    lr_splits = [(l, r, l_block_attr, r_block_attr, l_key, r_key, l_output_attrs, r_output_attrs) for l in l_splits for r in r_splits]
    t3 = time.time()
    #pool = Pool(4)
    pool = mp.ProcessingPool(processes=cpu_count, maxtasksperchild=1)
    t4 = time.time()
    # each worker runs the single-pair blocker on its chunk
    c_splits = pool.map(self.block_data_frames_skd, lr_splits)
    t5 = time.time()
    pool.close()
    t6 = time.time()
    pool.join()
    t7 = time.time()
    # concatenate the per-chunk candidate sets into one frame
    candset = pd.concat(c_splits, ignore_index=True)
    #candset = c_splits[0].append(c_splits[1], ignore_index=True)
    t8 = time.time()
    # timing instrumentation for each phase of the parallel run
    print "Time taken to split table A:", (t1 - t0)
    print "Time taken to split table B:", (t2 - t1)
    print "Time taken to get AB splits:", (t3 - t2)
    print "Time taken to start workers:", (t4 - t3)
    print "Time taken to get C splits:", (t5 - t4)
    print "Time taken to close pool:", (t6 - t5)
    print "Time taken to join pool:", (t7 - t6)
    print "Time taken to combine splits:", (t8 - t7)
    final_cols = self.get_final_cols(l_key, r_key, l_output_attrs, r_output_attrs)
    candset.columns = final_cols
    candset = MTable(candset)
    # set metadata
    candset.set_property('ltable', ltable)
    candset.set_property('rtable', rtable)
    candset.set_property('foreign_key_ltable', 'ltable.'+ltable.get_key())
    candset.set_property('foreign_key_rtable', 'rtable.'+rtable.get_key())
    return candset
def block_candset_opt_1(self, vtable, l_block_attr, r_block_attr): """ Block candidate set (virtual MTable) based on l_block_attr, r_block_attr equivalence (similar to equi-join) Parameters ---------- vtable : MTable Input candidate set l_block_attr, r_block_attr : string, attribute names in ltable, rtable Returns ------- blocked_table : MTable Containing tuple pairs whose l_block_attr and r_block_attr values are same Notes ----- Output MTable contains the following three attributes * _id * id column from ltable * id column from rtable Also, the properties of blocked table is updated with following key-value pairs * ltable - ref to ltable * rtable - ref to rtable * key * foreign_key_ltable - string, ltable's id attribute name * foreign_key_rtable - string, rtable's id attribute name """ start_time = time.time() # do integrity checks ltable = vtable.get_property('ltable') rtable = vtable.get_property('rtable') self.check_attrs(ltable, rtable, l_block_attr, r_block_attr, None, None) l_key = 'ltable.' + ltable.get_key() r_key = 'rtable.' + rtable.get_key() t000 = time.time() print "Time taken to do integrity checks:", (t000 - start_time) # convert to dataframes l_df = ltable.to_dataframe() r_df = rtable.to_dataframe() t001 = time.time() print "Time taken to convert tables A and B to data frames:", (t001 - t000) # set index for convenience l_df.set_index(ltable.get_key(), inplace=True) r_df.set_index(rtable.get_key(), inplace=True) t002 = time.time() print "Time taken to set indexes for tables A and B:", (t002 - t001) if mg._verbose: count = 0 per_count = math.ceil(mg._percent/100.0*len(vtable)) print per_count elif mg._progbar: bar = pyprind.ProgBar(len(vtable)) column_names = list(vtable.columns) #lid_idx = column_names.index(l_key) #rid_idx = column_names.index(r_key) l_block_attr_idx = column_names.index('ltable.' + l_block_attr) r_block_attr_idx = column_names.index('rtable.' 
+ r_block_attr) # create look up table for quick access of rows #l_dict = {} #r_dict = {} # keep track of valid ids valid = [] # iterate candidate set and process each row for row in vtable.itertuples(index=False): if mg._verbose: count += 1 if count%per_count == 0: print str(mg._percent*count/per_count) + ' percentage done !!!' elif mg._progbar: bar.update() # get the value of block attribute from ltuple #row_lid = row[lid_idx] #if row_lid not in l_dict: # l_dict[row_lid] = row[l_block_attr_idx] #l_val = l_dict[row_lid] l_val = row[l_block_attr_idx] # get the value of block attribute from rtuple #row_rid = row[rid_idx] #if row_rid not in r_dict: # r_dict[row_rid] = row[r_block_attr_idx] #r_val = r_dict[row_rid] r_val = row[r_block_attr_idx] if l_val != np.NaN and r_val != np.NaN: if l_val == r_val: valid.append(True) else: valid.append(False) else: valid.append(False) t6 = time.time() print "Time taken to get valid ids:", (t6 - t002) # should be modified if len(vtable) > 0: out_table = MTable(vtable[valid], key=vtable.get_key()) else: out_table = MTable(columns=vtable.columns, key=vtable.get_key()) t7 = time.time() print "Time taken to create mtable for candset:", (t7 - t6) out_table.set_property('ltable', ltable) out_table.set_property('rtable', rtable) out_table.set_property('foreign_key_ltable', 'ltable.'+ltable.get_key()) out_table.set_property('foreign_key_rtable', 'rtable.'+rtable.get_key()) end_time = time.time() print "Time taken to set properties of candset: ", (end_time - t7) print "Total time to block candset: ", (end_time - start_time) return out_table
def block_tables_opt(self, ltable, rtable, l_block_attr, r_block_attr,
                     l_output_attrs=None, r_output_attrs=None):
    """
    Block tables based on l_block_attr, r_block_attr equivalence
    (similar to equi-join) — instrumented variant that projects both
    inputs down to the needed columns before merging.

    Parameters
    ----------
    ltable, rtable : MTable
        Input MTables
    l_block_attr, r_block_attr : string
        Attribute names in ltable, rtable
    l_output_attrs, r_output_attrs : list (of strings), defaults to None
        Attribute names to be included in the output table

    Returns
    -------
    blocked_table : MTable
        Containing tuple pairs whose l_block_attr and r_block_attr
        values are same; carries ltable/rtable references and the
        foreign-key properties.
    """
    # do integrity checks
    l_output_attrs, r_output_attrs = self.check_attrs(ltable, rtable, l_block_attr, r_block_attr, l_output_attrs, r_output_attrs)
    # remove nans
    l_df = self.rem_nan(ltable, l_block_attr)
    r_df = self.rem_nan(rtable, r_block_attr)
    t00 = time.time()
    # ensure the key and block attribute survive the projection below
    # NOTE(review): `l_output_attrs_1 = l_output_attrs` aliases (does not
    # copy) the list, so the appends below mutate the caller-visible
    # l_output_attrs, which is later passed to output_columns — confirm
    # whether this mutation is intended
    lk = ltable.get_key()
    l_output_attrs_1 = l_output_attrs;
    if lk not in l_output_attrs_1:
        l_output_attrs_1.append(lk)
    if l_block_attr not in l_output_attrs_1:
        l_output_attrs_1.append(l_block_attr)
    # NOTE(review): same aliasing applies to r_output_attrs_1
    r_output_attrs_1 = r_output_attrs
    rk = rtable.get_key()
    if rk not in r_output_attrs_1:
        r_output_attrs_1.append(rk)
    if r_block_attr not in r_output_attrs_1:
        r_output_attrs_1.append(r_block_attr)
    print l_output_attrs_1
    print r_output_attrs_1
    # project each table to only the columns needed by the merge/output
    l_df_1 = l_df[l_output_attrs_1]
    r_df_1 = r_df[r_output_attrs_1]
    #l_df_1.set_index(l_block_attr)
    #r_df_1.set_index(r_block_attr)
    t0 = time.time()
    candset = pd.merge(l_df_1, r_df_1, left_on=l_block_attr, right_on=r_block_attr, suffixes=('_ltable', '_rtable'))
    print list(candset)
    t1 = time.time()
    # get output columns
    retain_cols, final_cols = self.output_columns(ltable.get_key(), rtable.get_key(), list(candset.columns), l_output_attrs, r_output_attrs)
    print "retain_cols: ", retain_cols
    print "final_cols: ", final_cols
    t2 = time.time()
    candset = candset[retain_cols]
    t3 = time.time()
    candset.columns = final_cols
    t4 = time.time()
    candset = MTable(candset)
    t5 = time.time()
    # set metadata
    candset.set_property('ltable', ltable)
    candset.set_property('rtable', rtable)
    candset.set_property('foreign_key_ltable', 'ltable.'+ltable.get_key())
    candset.set_property('foreign_key_rtable', 'rtable.'+rtable.get_key())
    t6 = time.time()
    # timing instrumentation for each phase
    print "Time taken to project A and B:", (t0 - t00)
    print "Time taken to merge A and B:", (t1 - t0)
    print "Time taken to get output cols:", (t2 - t1)
    print "Time taken to project C cols:", (t3 - t2)
    print "Time taken to set C final cols:", (t4 - t3)
    print "Time taken to create table C:", (t5 - t4)
    print "Time taken to set props for C:", (t6 - t5)
    return candset
def block_candset_skd(self, vtable, l_block_attr, r_block_attr):
    """
    Block candidate set (virtual MTable) based on l_block_attr,
    r_block_attr equivalence (similar to equi-join) — parallel variant
    that splits the candidate set across a process pool.

    Parameters
    ----------
    vtable : MTable
        Input candidate set
    l_block_attr, r_block_attr : string
        Attribute names in ltable, rtable

    Returns
    -------
    blocked_table : MTable
        Containing tuple pairs whose l_block_attr and r_block_attr
        values are same; carries ltable/rtable references and the
        foreign-key properties.
    """
    start_time = time.time()
    # do integrity checks
    ltable = vtable.get_property('ltable')
    rtable = vtable.get_property('rtable')
    self.check_attrs(ltable, rtable, l_block_attr, r_block_attr, None, None)
    l_key = 'ltable.' + ltable.get_key()
    r_key = 'rtable.' + rtable.get_key()
    t000 = time.time()
    print "Time taken to do integrity checks:", (t000 - start_time)
    # convert to dataframes
    l_df = ltable.to_dataframe()
    r_df = rtable.to_dataframe()
    t001 = time.time()
    print "Time taken to convert tables A and B to data frames:", (t001 - t000)
    # set index for convenience (workers look rows up by key)
    l_df.set_index(ltable.get_key(), inplace=True)
    r_df.set_index(rtable.get_key(), inplace=True)
    t002 = time.time()
    print "Time taken to set indexes for tables A and B:", (t002 - t001)
    cpu_count = multiprocessing.cpu_count()
    #pool = Pool(4)
    t00 = time.time()
    pool = mp.ProcessingPool(processes=cpu_count, maxtasksperchild=1)
    t01 = time.time()
    print "Time taken to initialize the pool of workers:", (t01 - t00)
    c_df = vtable.to_dataframe()
    t0 = time.time()
    print "Time taken to convert mtable to data frame:", (t0 - t01)
    # one candidate-set chunk per worker
    c_splits = np.array_split(c_df, cpu_count)
    t1 = time.time()
    print "Time taken to split table C:", (t1 - t0)
    args_splits = [(c, l_df, r_df, l_key, r_key, l_block_attr, r_block_attr) for c in c_splits]
    t2 = time.time()
    print "Time taken to get args splits:", (t2 - t1)
    # each worker returns a list of booleans for its chunk
    valid_splits = pool.map(self.get_valid_ids, args_splits)
    t3 = time.time()
    print "Time taken to get valid splits:", (t3 - t2)
    pool.close()
    t4 = time.time()
    print "Time taken to close pool:", (t4 - t3)
    pool.join()
    t5 = time.time()
    print "Time taken to join pool:", (t5 - t4)
    #valid = pd.concat(valid_splits, ignore_index=True)
    #valid = list(chain(valid_splits))
    # flatten the per-chunk boolean lists (order matches c_splits)
    valid = sum(valid_splits, [])
    t6 = time.time()
    print "Time taken to combine valid splits:", (t6 - t5)
    # should be modified
    if len(vtable) > 0:
        out_table = MTable(vtable[valid], key=vtable.get_key())
    else:
        out_table = MTable(columns=vtable.columns, key=vtable.get_key())
    t7 = time.time()
    print "Time taken to create mtable from data frame:", (t7 - t6)
    out_table.set_property('ltable', ltable)
    out_table.set_property('rtable', rtable)
    out_table.set_property('foreign_key_ltable', 'ltable.'+ltable.get_key())
    out_table.set_property('foreign_key_rtable', 'rtable.'+rtable.get_key())
    end_time = time.time()
    print "Time taken to set properties of mtable:", (end_time - t7)
    print "Time taken to block candset:", (end_time - start_time)
    return out_table
def _combine_block_outputs_via_union(blocker_output_list):
    """Union several blocker outputs into one candidate set.

    Collects the distinct (ltable id, rtable id) pairs across all
    outputs, re-projects the source tuples onto the union of the
    outputs' columns, and returns the combined MTable with ltable,
    rtable and foreign-key properties attached.
    """
    ltable, rtable = lr_tables(blocker_output_list)
    # candidate-set column names that hold the two ids
    l_key = 'ltable.' + ltable.get_key()
    r_key = 'rtable.' + rtable.get_key()

    # distinct id pairs over every blocker output ("union" semantics)
    id_set = set()
    for output in blocker_output_list:
        for _, row in output.iterrows():
            id_set.add((row[l_key], row[r_key]))

    # union of attribute names across the outputs
    col_set = set()
    for output in blocker_output_list:
        col_set.update(output.columns)
    l_col, r_col = lr_cols(col_set)

    # index the source tables by key (kept as a column) for .ix lookups
    l_df = ltable.to_dataframe()
    l_df.set_index(ltable.get_key(), inplace=True, drop=False)
    r_df = rtable.to_dataframe()
    r_df.set_index(rtable.get_key(), inplace=True, drop=False)

    # re-fetch the l_col/r_col attributes for every surviving id pair
    dict_list = [get_dict(l_df.ix[lid], r_df.ix[rid], l_col, r_col)
                 for lid, rid in id_set]
    table = pd.DataFrame(dict_list)

    f_cols = fin_cols(l_col, r_col, ltable.get_key(), rtable.get_key())
    if len(table) > 0:
        # deterministic ordering by the two key columns
        table.sort([l_key, r_key], inplace=True)
        table.reset_index(inplace=True, drop=True)
        table = MTable(table[f_cols])
    else:
        table = MTable(table, columns=f_cols)

    table.set_property('ltable', ltable)
    table.set_property('rtable', rtable)
    table.set_property('foreign_key_ltable', 'ltable.'+ltable.get_key())
    table.set_property('foreign_key_rtable', 'rtable.'+rtable.get_key())
    return table
def debug_blocker(ltable, rtable, candidate_set, pred_list_size=200, field_corres_list=None):
    """
    Debug the blocker. The basic idea is trying to suggest the user a list of record pairs
    out of the candidate set with high (document) jaccard similarity. The object of similarity
    measurement (document) is generated based on a string concatenation method with field
    selection. Given the suggestion list, the user should go through the pairs and determine
    if there are, and how many true matches in it. And based on this information, the user
    can determine if further improvement on the blocking step is necessary (Ex. if there are
    many true matches in the list, the user may conclude that the blocking step is flawed,
    and should revise it to produce a better candidate set).

    Parameters
    ----------
    ltable, rtable : MTable
        Input MTables
    candidate_set : MTable
        The candidate set table after performing blocking on ltable and rtable
    pred_list_size : int
        The size of the output suggestion list
    field_corres_list : list (of tuples), defaults to None
        The list of field pairs from ltable and rtable. Each pair indicates a field
        correspondence between two tables. Since ltable and rtable can have different
        schemas, it's necessary to have this parameter to build the field correspondence
        to make sure the string concatenation algorithm runs correctly. Note each pair
        in the list should be represented as a tuple in the following format:
        (some_ltable_field, some_rtable_field)

    Returns
    -------
    suggestion_table : MTable
        Contains a list of pair suggestions with high jaccard similarity.
        The output MTable contains the following fields:
            * _id
            * similarity (of the record pair)
            * ltable record key value
            * rtable record key value
            * field pairs from filtered corres_list (removing the numeric types)
              ltable_field_1
              rtable_field_1 (corresponding to ltable_field_1)
              ltable_field_2
              rtable_field_2 (corresponding to ltable_field_2)
              .
              .
              ltable_field_k
              rtable_field_k (corresponding to ltable_field_k)
    """
    # Basic checks: refuse empty inputs and a non-positive output size up front.
    if len(ltable) == 0:
        raise StandardError('Error: ltable is empty!')
    if len(rtable) == 0:
        raise StandardError('Error: rtable is empty!')
    if pred_list_size <= 0:
        raise StandardError('The input parameter: \'pred_list_size\' is less than or equal to 0. Nothing needs to be done!')
    logging.info('\nPreparing for debugging blocker')

    # Check the user input field correst list (if exists) and get the raw version of
    # our internal correst list.
    check_input_field_correspondence_list(ltable, rtable, field_corres_list)
    corres_list = get_field_correspondence_list(ltable, rtable, field_corres_list)

    # Build the (col_name: col_index) dict to speed up locating a field in the schema.
    ltable_col_dict = build_col_name_index_dict(ltable)
    rtable_col_dict = build_col_name_index_dict(rtable)

    # Filter correspondence list to remove numeric types. We only consider string types
    # for document concatenation. Mutates corres_list in place — TODO confirm.
    filter_corres_list(ltable, rtable, ltable_col_dict, rtable_col_dict, corres_list)
    # logging.info('\nFiltered field correspondence list:\n' + str(corres_list))

    # Get field filtered new table (projected down to the corresponding fields).
    ltable_filtered, rtable_filtered = get_filtered_table(ltable, rtable, corres_list)

    # Select features.
    """TODO(hanli): currently we don't select the key fields even if they have the largest score.
    # This is because the values of the key field could be simply domain-specific serial numbers,
    # which might be meaningless or even harmful (when two tables use different key formats).
    # Modify it if this ituition is not proper."""
    feature_list = select_features(ltable_filtered, rtable_filtered)
    if len(feature_list) == 0:
        raise StandardError('\nError: the selected field list is empty, nothing could be done! ' +
                            'Please check if all table fields are numeric types.')
    # logging.info('\nSelected fields for concatenation:\n' + str([(ltable_filtered.columns[i],
    #     rtable_filtered.columns[i]) for i in feature_list]))

    # Get each table kgram dict: key value -> set of 3-grams built from the
    # concatenated selected fields.
    ltable_kgram_dict = get_kgram_dict(ltable_filtered, ltable_filtered.get_key(), feature_list, 3)
    rtable_kgram_dict = get_kgram_dict(rtable_filtered, rtable_filtered.get_key(), feature_list, 3)

    # Build inverted index on ltable kgrams (kgram -> ltable keys) to speed up debugging.
    inverted_index = build_inverted_index(ltable_kgram_dict)

    # Index the candidate set by (rtable key, ltable key) so that, for a given
    # rtable record, the ltable ids already paired with it can be looked up fast.
    ltable_key = candidate_set.get_property('foreign_key_ltable')
    rtable_key = candidate_set.get_property('foreign_key_rtable')
    indexed_candidate_set = candidate_set.set_index([rtable_key, ltable_key], drop=False)
    candidate_index_key_set = set(indexed_candidate_set[rtable_key])
    # logging.info('\nCandidate set size: %d' %(len(indexed_candidate_set)))

    # progress_dict maps iteration counts at each 10% boundary to the fraction done,
    # used below to log progress roughly every 10%.
    rtable_len = len(rtable_filtered)
    progress_dict = {}
    for i in range(10):
        progress_dict[int((i + 1) * 1.0 / 10 * rtable_len)] = (i + 1) * 1.0 / 10
    logging.info('\nStart debugging blocker')

    # pred_index_list is a min-heap of (similarity, lkey, rkey) capped at
    # pred_list_size: heappushpop evicts the current minimum, so the heap keeps
    # the top-k most similar pairs NOT already in the candidate set.
    pred_index_list = []
    count = 0
    for rkey in rtable_kgram_dict:
        count += 1
        # if count == 500:
        #     print pred_index_list
        #     break
        if rtable_len <= 10:
            logging.info('\nDebugging %s' %('{percent:.2%}'.format(percent=count * 1.0 / rtable_len)))
        else:
            if count in progress_dict:
                logging.info('\nDebugging %s' %('{percent:.2%}'.format(percent=progress_dict[count])))
        rkgram_set = rtable_kgram_dict[rkey]
        if len(rkgram_set) == 0:
            # No string content for this record; nothing to compare.
            continue
        # cand_set: ltable keys already paired with rkey in the candidate set
        # (empty dict used as an "empty membership" placeholder when rkey is absent).
        cand_set = {}
        if rkey in candidate_index_key_set:
            cand_set = indexed_candidate_set.ix[rkey].index.values
        # Only ltable records sharing at least one kgram can have nonzero similarity.
        ltable_index_set = get_potential_match_set(rkgram_set, inverted_index)
        for lkey in ltable_index_set:
            if lkey in cand_set:
                # Pair already surfaced by the blocker; skip it.
                continue
            jac_sim = jaccard_kgram_sim(ltable_kgram_dict[lkey], rkgram_set)
            if len(pred_index_list) == pred_list_size:
                hq.heappushpop(pred_index_list, (jac_sim, lkey, rkey))
            else:
                hq.heappush(pred_index_list, (jac_sim, lkey, rkey))

    # Materialize the top-k heap into the output suggestion table.
    ret_data_frame = generate_prediction_table(ltable_filtered, rtable_filtered, pred_index_list)
    """This print is for debugging"""
    #print ret_data_frame
    ret_mtable = MTable(ret_data_frame)
    # Propagate candidate-set metadata so the output behaves like a vtable.
    ret_mtable.set_property('foreign_key_ltable', ltable_key)
    ret_mtable.set_property('foreign_key_rtable', rtable_key)
    ret_mtable.set_property('ltable', ltable)
    ret_mtable.set_property('rtable', rtable)
    logging.info('\nFinish debugging blocker')
    return ret_mtable
def debug_blocker(ltable, rtable, candidate_set, output_size=200, attr_corres=None):
    """
    Debug the blocker. The basic idea is trying to suggest the user a list of record pairs
    out of the candidate set with high (document) jaccard similarity. The object of similarity
    measurement (document) is generated based on a string concatenation method with field
    selection. Given the suggestion list, the user should go through the pairs and determine
    if there are, and how many true matches in it. And based on this information, the user
    can determine if further improvement on the blocking step is necessary (Ex. if there are
    many true matches in the list, the user may conclude that the blocking step is flawed,
    and should revise it to produce a better candidate set).

    NOTE(review): this is a second definition of ``debug_blocker`` in the same module;
    at import time it shadows the earlier version (which takes ``pred_list_size`` /
    ``field_corres_list``). Consider removing the older copy.

    Parameters
    ----------
    ltable, rtable : MTable
        Input MTables
    candidate_set : MTable
        The candidate set table after performing blocking on ltable and rtable
    output_size : int
        The size of the output suggestion list
    attr_corres : list (of tuples), defaults to None
        The list of field pairs from ltable and rtable. Each pair indicates a field
        correspondence between two tables. Since ltable and rtable can have different
        schemas, it's necessary to have this parameter to build the field correspondence
        to make sure the string concatenation algorithm runs correctly. Note each pair
        in the list should be represented as a tuple in the following format:
        (some_ltable_field, some_rtable_field)

    Returns
    -------
    suggestion_table : MTable
        Contains a list of pair suggestions with high jaccard similarity.
        The output MTable contains the following fields:
            * _id
            * similarity (of the record pair)
            * ltable record key value
            * rtable record key value
            * field pairs from filtered corres_list (removing the numeric types)
              ltable_field_1
              rtable_field_1 (corresponding to ltable_field_1)
              ltable_field_2
              rtable_field_2 (corresponding to ltable_field_2)
              .
              .
              ltable_field_k
              rtable_field_k (corresponding to ltable_field_k)
    """
    # Basic checks: refuse empty inputs and a non-positive output size up front.
    if len(ltable) == 0:
        raise StandardError('Error: ltable is empty!')
    if len(rtable) == 0:
        raise StandardError('Error: rtable is empty!')
    if output_size <= 0:
        # NOTE(review): message still names 'pred_list_size' although the
        # parameter was renamed to 'output_size' — runtime string left unchanged.
        raise StandardError(
            'The input parameter: \'pred_list_size\' is less than or equal to 0. Nothing needs to be done!'
        )
    # logging.info('\nPreparing for debugging blocker')

    # Check the user input field correst list (if exists) and get the raw version of
    # our internal correst list.
    check_input_field_correspondence_list(ltable, rtable, attr_corres)
    corres_list = get_field_correspondence_list(ltable, rtable, attr_corres)

    # Build the (col_name: col_index) dict to speed up locating a field in the schema.
    ltable_col_dict = build_col_name_index_dict(ltable)
    rtable_col_dict = build_col_name_index_dict(rtable)

    # Filter correspondence list to remove numeric types. We only consider string types
    # for document concatenation. Mutates corres_list in place — TODO confirm.
    filter_corres_list(ltable, rtable, ltable_col_dict, rtable_col_dict, corres_list)
    #print('\nFiltered field correspondence list:\n' + str(corres_list))

    # Get field filtered new table (projected down to the corresponding fields).
    ltable_filtered, rtable_filtered = get_filtered_table(
        ltable, rtable, corres_list)

    # Select features.
    """TODO: currently we don't select the key fields even if they have the largest score.
    # This is because the values of the key field could be simply domain-specific serial numbers,
    # which might be meaningless or even harmful (when two tables use different key formats).
    # Modify it if this ituition is not proper."""
    feature_list = select_features(ltable_filtered, rtable_filtered)
    if len(feature_list) == 0:
        raise StandardError(
            '\nError: the selected field list is empty, nothing could be done! ' +
            'Please check if all table fields are numeric types.')
    #print('\nSelected fields for concatenation:\n' + str([(ltable_filtered.columns[i], rtable_filtered.columns[i]) for i in feature_list]))

    # Get each table kgram dict: key value -> set of 3-grams built from the
    # concatenated selected fields.
    ltable_kgram_dict = get_kgram_dict(ltable_filtered,
                                       ltable_filtered.get_key(),
                                       feature_list, 3)
    rtable_kgram_dict = get_kgram_dict(rtable_filtered,
                                       rtable_filtered.get_key(),
                                       feature_list, 3)

    # Build inverted index on ltable kgrams (kgram -> ltable keys) to speed up debugging.
    inverted_index = build_inverted_index(ltable_kgram_dict)

    # Index the candidate set by (rtable key, ltable key) so that, for a given
    # rtable record, the ltable ids already paired with it can be looked up fast.
    ltable_key = candidate_set.get_property('foreign_key_ltable')
    rtable_key = candidate_set.get_property('foreign_key_rtable')
    indexed_candidate_set = candidate_set.set_index([rtable_key, ltable_key],
                                                    drop=False)
    candidate_index_key_set = set(indexed_candidate_set[rtable_key])
    #print('\nCandidate set size: %d' %(len(indexed_candidate_set)))

    # progress_dict maps iteration counts at each 10% boundary to the fraction done
    # (kept for the commented-out logging below).
    rtable_len = len(rtable_filtered)
    progress_dict = {}
    for i in range(10):
        progress_dict[int(
            (i + 1) * 1.0 / 10 * rtable_len)] = (i + 1) * 1.0 / 10
    #print('\nStart debugging blocker')
    # print('Start debugging blocker')

    # pred_index_list is a min-heap of (similarity, lkey, rkey) capped at
    # output_size: heappushpop evicts the current minimum, so the heap keeps
    # the top-k most similar pairs NOT already in the candidate set.
    pred_index_list = []
    count = 0
    # Progress reporting is driven by module-level flags on mg:
    # mg._verbose prints every mg._percent of rtable, mg._progbar shows a pyprind bar.
    if mg._verbose:
        count_ = 0
        per_count = math.ceil(mg._percent / 100.0 * len(rtable))
    elif mg._progbar:
        bar = pyprind.ProgBar(len(rtable))
    for rkey in rtable_kgram_dict:
        count += 1
        #if count == 500:
        #print pred_index_list
        # break
        # if rtable_len <= 10:
        #     print('Debugging %s' %('{percent:.2%}'.format(percent=count * 1.0 / rtable_len)))
        # else:
        #     if count in progress_dict:
        #         print('Debugging %s' %('{percent:.2%}'.format(percent=progress_dict[count])))
        if mg._verbose:
            count_ += 1
            if count_ % per_count == 0:
                print str(
                    mg._percent * count_ / per_count) + ' percentage done !!!'
        elif mg._progbar:
            bar.update()
        rkgram_set = rtable_kgram_dict[rkey]
        if len(rkgram_set) == 0:
            # No string content for this record; nothing to compare.
            continue
        # cand_set: ltable keys already paired with rkey in the candidate set
        # (empty dict used as an "empty membership" placeholder when rkey is absent).
        cand_set = {}
        if rkey in candidate_index_key_set:
            cand_set = indexed_candidate_set.ix[rkey].index.values
        # Only ltable records sharing at least one kgram can have nonzero similarity.
        ltable_index_set = get_potential_match_set(rkgram_set, inverted_index)
        for lkey in ltable_index_set:
            if lkey in cand_set:
                # Pair already surfaced by the blocker; skip it.
                continue
            jac_sim = jaccard_kgram_sim(ltable_kgram_dict[lkey], rkgram_set)
            if len(pred_index_list) == output_size:
                hq.heappushpop(pred_index_list, (jac_sim, lkey, rkey))
            else:
                hq.heappush(pred_index_list, (jac_sim, lkey, rkey))

    # Materialize the top-k heap into the output suggestion table.
    ret_data_frame = generate_prediction_table(ltable_filtered,
                                               rtable_filtered,
                                               pred_index_list)
    # """This print is for debugging"""
    #print ret_data_frame
    ret_mtable = MTable(ret_data_frame)
    # Propagate candidate-set metadata so the output behaves like a vtable.
    ret_mtable.set_property('foreign_key_ltable', ltable_key)
    ret_mtable.set_property('foreign_key_rtable', rtable_key)
    ret_mtable.set_property('ltable', ltable)
    ret_mtable.set_property('rtable', rtable)
    return ret_mtable