def block_candset(self, vtable):
    """
    Filter a candidate set, keeping the pairs accepted by apply_rules.

    Parameters
    ----------
    vtable : MTable
        Input candidate set. Its 'ltable', 'rtable', 'foreign_key_ltable'
        and 'foreign_key_rtable' properties are read to find the base
        tuples behind every candidate pair.

    Returns
    -------
    blocked_table : MTable
        The rows of vtable for which self.apply_rules(l_row, r_row)
        returned True, built with vtable's key.

    Notes
    -----
    The following properties are set on the returned table
    * ltable - ref to ltable
    * rtable - ref to rtable
    * foreign_key_ltable - same value as on vtable
    * foreign_key_rtable - same value as on vtable
    """
    ltable = vtable.get_property('ltable')
    rtable = vtable.get_property('rtable')
    self.check_attrs(ltable, rtable, None, None)
    l_key = vtable.get_property('foreign_key_ltable')
    r_key = vtable.get_property('foreign_key_rtable')

    # key value -> row lookup for both base tables
    l_dict = dict(ltable.set_index(ltable.get_key(), drop=False).iterrows())
    r_dict = dict(rtable.set_index(rtable.get_key(), drop=False).iterrows())

    # progress reporting: either a textual counter or a progress bar
    if mg._verbose:
        count = 0
        per_count = math.ceil(mg._percent / 100.0 * len(vtable))
    elif mg._progbar:
        bar = pyprind.ProgBar(len(vtable))

    # positions of the two foreign-key columns inside each row tuple
    column_names = list(vtable.columns)
    lid_idx = column_names.index(l_key)
    rid_idx = column_names.index(r_key)

    # one boolean per candidate row; True means the row is kept
    valid = []
    for row in vtable.itertuples(index=False):
        if mg._verbose:
            count += 1
            if count % per_count == 0:
                # single-argument print(...) behaves like the print statement
                print(str(mg._percent * count / per_count) + ' percentage done !!!')
        elif mg._progbar:
            bar.update()
        verdict = self.apply_rules(l_dict[row[lid_idx]], r_dict[row[rid_idx]])
        valid.append(verdict is True)

    # should be modified
    if len(vtable) > 0:
        out_table = MTable(vtable[valid], key=vtable.get_key())
    else:
        out_table = MTable(columns=vtable.columns, key=vtable.get_key())

    for prop_name, prop_value in (('ltable', ltable),
                                  ('rtable', rtable),
                                  ('foreign_key_ltable', l_key),
                                  ('foreign_key_rtable', r_key)):
        out_table.set_property(prop_name, prop_value)
    return out_table
def block_tables(self, ltable, rtable, l_overlap_attr, r_overlap_attr, rem_stop_words=False, qgram=None,
                 word_level=True, overlap_size=1, l_output_attrs=None, r_output_attrs=None):
    """
    Block tables with overlap blocker

    Parameters
    ----------
    ltable, rtable : MTables, input MTables to block
    l_overlap_attr, r_overlap_attr : String, overlap attribute from left and right table
    rem_stop_words : flag to indicate whether stop words should be removed
    qgram : int, value of q in qgram tokenizer. Default value is None
    word_level : boolean, flag to indicate to use word level tokenizer
    overlap_size : int, number of tokens to overlap
    l_output_attrs, r_output_attrs : list of attributes to be included in the output table

    Returns
    -------
    result : MTable
        Candidate pairs, with the properties 'ltable', 'rtable',
        'foreign_key_ltable' and 'foreign_key_rtable' set. The table is
        empty (but has the usual columns) when no pair qualifies.

    Raises
    ------
    AssertionError
        If an overlap attribute is not a column of its table.
    SyntaxError
        If word_level is True and qgram is given as well.
    """
    # do some integrity checks
    if l_overlap_attr not in ltable.columns:
        raise AssertionError('Left overlap attribute not in ltable columns')
    if r_overlap_attr not in rtable.columns:
        raise AssertionError('Right overlap attribute not in rtable columns')
    l_output_attrs, r_output_attrs = self.check_attrs(ltable, rtable, l_output_attrs, r_output_attrs)
    if word_level == True and qgram is not None:
        raise SyntaxError('Parameters word_level and qgram cannot be set together; Note that word_level is set '
                          'to true by default, so explicitly set word_level=False to use qgram')

    # remove nans
    l_df = self.rem_nan(ltable, l_overlap_attr)
    r_df = self.rem_nan(rtable, r_overlap_attr)
    l_df.reset_index(inplace=True, drop=True)
    r_df.reset_index(inplace=True, drop=True)

    # constant column used as the join key, so that merging on it pairs
    # every selected left row with the current right row
    l_df['_dummy_'] = 1
    r_df['_dummy_'] = 1

    if l_df.dtypes[l_overlap_attr] != object:
        logger.warning('Left overlap attribute is not of type string; converting to string temporarily')
        l_df[l_overlap_attr] = l_df[l_overlap_attr].astype(str)
    if r_df.dtypes[r_overlap_attr] != object:
        logger.warning('Right overlap attribute is not of type string; converting to string temporarily')
        r_df[r_overlap_attr] = r_df[r_overlap_attr].astype(str)

    # positional index -> right row
    r_dict = dict(r_df.iterrows())

    # tokenize the left column and pair each token list with its row position
    l_col_values_chopped = self.process_table(l_df, l_overlap_attr, qgram, rem_stop_words)
    appended_l_col_idx_values = [self.append_index_values(tokens, pos)
                                 for pos, tokens in enumerate(l_col_values_chopped)]

    # compute_inv_index fills inv_idx in place
    inv_idx = {}
    if mg._verbose:
        print('Creating inverted index ')
    for tagged_tokens in appended_l_col_idx_values:
        for tagged_token in tagged_tokens:
            self.compute_inv_index(tagged_token, inv_idx)
    if mg._verbose:
        print('Done')

    r_col_values_chopped = self.process_table(r_df, r_overlap_attr, qgram, rem_stop_words)

    if mg._verbose:
        count = 0
        per_count = math.ceil(mg._percent / 100.0 * len(rtable))
        print(per_count)
    elif mg._progbar:
        bar = pyprind.ProgBar(len(r_col_values_chopped))

    merge_suffixes = ('_ltable', '_rtable')
    df_list = []
    for r_idx, col_values in enumerate(r_col_values_chopped):
        if mg._verbose:
            count += 1
            if count % per_count == 0:
                print(str(mg._percent * count / per_count) + ' percentage done !!!')
        elif mg._progbar:
            bar.update()

        qualifying_ltable_indices = self.get_potential_match_indices(col_values, inv_idx, overlap_size)
        r_row_frame = r_dict[r_idx].to_frame().T
        l_rows = l_df.iloc[qualifying_ltable_indices]
        df = l_rows.merge(r_row_frame, on='_dummy_', suffixes=merge_suffixes)
        if len(df) > 0:
            df_list.append(df)

    if df_list:
        candset = pd.concat(df_list)
    else:
        # pd.concat raises ValueError on an empty list; build an empty
        # frame with the same merged column layout instead
        candset = l_df.iloc[0:0].merge(r_df.iloc[0:0], on='_dummy_', suffixes=merge_suffixes)

    # get output columns
    retain_cols, final_cols = self.output_columns(ltable.get_key(), rtable.get_key(), list(candset.columns),
                                                  l_output_attrs, r_output_attrs)
    candset = candset[retain_cols]
    candset.columns = final_cols
    if len(candset) > 0:
        candset.sort(['ltable.' + ltable.get_key(), 'rtable.' + rtable.get_key()], inplace=True)
        candset.reset_index(inplace=True, drop=True)
    candset = MTable(candset)

    # set metadata
    candset.set_property('ltable', ltable)
    candset.set_property('rtable', rtable)
    candset.set_property('foreign_key_ltable', 'ltable.' + ltable.get_key())
    candset.set_property('foreign_key_rtable', 'rtable.' + rtable.get_key())
    return candset
def block_tables(self, ltable, rtable, l_output_attrs=None, r_output_attrs=None):
    """
    Build a candidate set from all tuple pairs accepted by apply_rules.

    Parameters
    ----------
    ltable, rtable : MTable
        Input tables. Every tuple of ltable is compared with every tuple
        of rtable.
    l_output_attrs, r_output_attrs : list (of strings), defaults to None
        Attribute names to copy into the output, prefixed with 'ltable.'
        and 'rtable.' respectively. Both are first passed through
        self.check_attrs.

    Returns
    -------
    blocked_table : MTable
        One row per pair for which self.apply_rules(l, r) returned True.
        Columns are those given by self.get_attrs_to_retain.

    Notes
    -----
    The following properties are set on the returned table
    * ltable - ref to ltable
    * rtable - ref to rtable
    * foreign_key_ltable - string, 'ltable.' + ltable's key name
    * foreign_key_rtable - string, 'rtable.' + rtable's key name
    """
    # do integrity checks
    l_output_attrs, r_output_attrs = self.check_attrs(ltable, rtable, l_output_attrs, r_output_attrs)

    if mg._verbose:
        count = 0
        per_count = math.ceil(mg._percent / 100.0 * len(ltable) * len(rtable))
    elif mg._progbar:
        bar = pyprind.ProgBar(len(ltable) * len(rtable))

    l_key_name = ltable.get_key()
    r_key_name = rtable.get_key()
    ltable_id = 'ltable.' + l_key_name
    rtable_id = 'rtable.' + r_key_name

    # key value -> row lookup for both tables
    l_dict = dict(ltable.set_index(l_key_name, drop=False).iterrows())
    r_dict = dict(rtable.set_index(r_key_name, drop=False).iterrows())

    # positions of the key columns inside the row tuples
    lid_idx = ltable.get_attr_names().index(l_key_name)
    rid_idx = rtable.get_attr_names().index(r_key_name)

    block_list = []
    for l_t in ltable.itertuples(index=False):
        for r_t in rtable.itertuples(index=False):
            if mg._verbose:
                count += 1
                if count % per_count == 0:
                    # single-argument print(...) behaves like the print statement
                    print(str(mg._percent * count / per_count) + ' percentage done !!!')
            elif mg._progbar:
                bar.update()

            l_row = l_dict[l_t[lid_idx]]
            r_row = r_dict[r_t[rid_idx]]

            # skip the pair unless the rules accept it
            if self.apply_rules(l_row, r_row) is not True:
                continue

            # ids first (left, then right), then the requested attributes
            record = OrderedDict([(ltable_id, l_row[l_key_name]),
                                  (rtable_id, r_row[r_key_name])])
            if l_output_attrs:
                l_out = l_row[l_output_attrs]
                l_out.index = 'ltable.' + l_out.index
                record.update(l_out)
            if r_output_attrs:
                r_out = r_row[r_output_attrs]
                r_out.index = 'rtable.' + r_out.index
                record.update(r_out)
            block_list.append(record)

    candset = pd.DataFrame(block_list)
    ret_cols = self.get_attrs_to_retain(l_key_name, r_key_name, l_output_attrs, r_output_attrs)
    if len(candset) > 0:
        candset = MTable(candset[ret_cols])
    else:
        candset = MTable(candset, columns=ret_cols)

    # set metadata
    for prop_name, prop_value in (('ltable', ltable),
                                  ('rtable', rtable),
                                  ('foreign_key_ltable', ltable_id),
                                  ('foreign_key_rtable', rtable_id)):
        candset.set_property(prop_name, prop_value)
    return candset
def block_candset(self, vtable):
    """
    Filter a candidate set, keeping the pairs accepted by apply_rules.

    Parameters
    ----------
    vtable : MTable
        Input candidate set. Its 'ltable', 'rtable', 'foreign_key_ltable'
        and 'foreign_key_rtable' properties are read to find the base
        tuples behind every candidate pair.

    Returns
    -------
    blocked_table : MTable
        The rows of vtable for which self.apply_rules(l_row, r_row)
        returned True, built with vtable's key.

    Notes
    -----
    The following properties are set on the returned table
    * ltable - ref to ltable
    * rtable - ref to rtable
    * foreign_key_ltable - same value as on vtable
    * foreign_key_rtable - same value as on vtable
    """
    ltable = vtable.get_property('ltable')
    rtable = vtable.get_property('rtable')
    self.check_attrs(ltable, rtable, None, None)
    l_key = vtable.get_property('foreign_key_ltable')
    r_key = vtable.get_property('foreign_key_rtable')

    # set the index, then build key value -> row look up tables
    l_tbl = ltable.set_index(ltable.get_key(), drop=False)
    r_tbl = rtable.set_index(rtable.get_key(), drop=False)
    l_dict = dict(l_tbl.iterrows())
    r_dict = dict(r_tbl.iterrows())

    # progress reporting: either a textual counter or a progress bar
    if mg._verbose:
        count = 0
        per_count = math.ceil(mg._percent / 100.0 * len(vtable))
    elif mg._progbar:
        bar = pyprind.ProgBar(len(vtable))

    # positions of the two foreign-key columns inside each row tuple
    column_names = list(vtable.columns)
    lid_idx = column_names.index(l_key)
    rid_idx = column_names.index(r_key)

    # keep track of valid rows: one boolean per candidate pair
    valid = []
    for row in vtable.itertuples(index=False):
        if mg._verbose:
            count += 1
            if count % per_count == 0:
                # single-argument print(...) behaves like the print statement
                print(str(mg._percent * count / per_count) + ' percentage done !!!')
        elif mg._progbar:
            bar.update()
        l_row = l_dict[row[lid_idx]]
        r_row = r_dict[row[rid_idx]]
        valid.append(self.apply_rules(l_row, r_row) is True)

    # should be modified
    if len(vtable) > 0:
        out_table = MTable(vtable[valid], key=vtable.get_key())
    else:
        out_table = MTable(columns=vtable.columns, key=vtable.get_key())

    for prop_name, prop_value in (('ltable', ltable),
                                  ('rtable', rtable),
                                  ('foreign_key_ltable', l_key),
                                  ('foreign_key_rtable', r_key)):
        out_table.set_property(prop_name, prop_value)
    return out_table
def block_candset(self, vtable, l_overlap_attr, r_overlap_attr, rem_stop_words=False, qgram=None,
                  word_level=True, overlap_size=1):
    """
    Block candidate set with overlap blocker

    Parameters
    ----------
    vtable : MTable, candidate set to block
    l_overlap_attr, r_overlap_attr : String, overlap attribute from left and right table
    rem_stop_words : flag to indicate whether stop words should be removed
    qgram : int, value of q in qgram tokenizer. Default value is None
    word_level : boolean, accepted but not read by this method
    overlap_size : int, minimum token overlap for a pair to be kept

    Returns
    -------
    result : MTable
        The rows of vtable whose token overlap, as computed by
        self.get_token_overlap_bt_two_tuples, is at least overlap_size.
        The 'ltable', 'rtable', 'foreign_key_ltable' and
        'foreign_key_rtable' properties of vtable are set on it.

    Raises
    ------
    AssertionError
        If an overlap attribute is not a column of its base table.
    """
    ltable = vtable.get_property('ltable')
    rtable = vtable.get_property('rtable')
    self.check_attrs(ltable, rtable, None, None)

    # do some integrity checks
    if l_overlap_attr not in ltable.columns:
        raise AssertionError('Left overlap attribute not in ltable columns')
    if r_overlap_attr not in rtable.columns:
        raise AssertionError('Right overlap attribute not in rtable columns')

    l_key = vtable.get_property('foreign_key_ltable')
    r_key = vtable.get_property('foreign_key_rtable')

    # key value -> row lookup for both base tables
    l_dict = dict(ltable.set_index(ltable.get_key(), drop=False).iterrows())
    r_dict = dict(rtable.set_index(rtable.get_key(), drop=False).iterrows())

    # positions of the two foreign-key columns inside each row tuple
    column_names = list(vtable.columns)
    lid_idx = column_names.index(l_key)
    rid_idx = column_names.index(r_key)

    if mg._verbose:
        count = 0
        per_count = math.ceil(mg._percent / 100.0 * len(vtable))
        # single-argument print(...) behaves like the print statement
        print(per_count)
    elif mg._progbar:
        bar = pyprind.ProgBar(len(vtable))

    # one boolean per candidate row; True means the row is kept
    valid = []
    for row in vtable.itertuples(index=False):
        if mg._verbose:
            count += 1
            if count % per_count == 0:
                print(str(mg._percent * count / per_count) + ' percentage done !!!')
        elif mg._progbar:
            bar.update()

        num_overlap = self.get_token_overlap_bt_two_tuples(l_dict[row[lid_idx]], r_dict[row[rid_idx]],
                                                           l_overlap_attr, r_overlap_attr,
                                                           qgram, rem_stop_words)
        valid.append(True if num_overlap >= overlap_size else False)

    if len(vtable) > 0:
        out_table = MTable(vtable[valid], key=vtable.get_key())
    else:
        out_table = MTable(columns=vtable.columns, key=vtable.get_key())

    for prop_name, prop_value in (('ltable', ltable),
                                  ('rtable', rtable),
                                  ('foreign_key_ltable', l_key),
                                  ('foreign_key_rtable', r_key)):
        out_table.set_property(prop_name, prop_value)
    return out_table
def block_tables(self, ltable, rtable, l_output_attrs=None, r_output_attrs=None):
    """
    Build a candidate set from all tuple pairs accepted by apply_rules.

    Parameters
    ----------
    ltable, rtable : MTable
        Input tables. Every tuple of ltable is compared with every tuple
        of rtable.
    l_output_attrs, r_output_attrs : list (of strings), defaults to None
        Attribute names to copy into the output, prefixed with 'ltable.'
        and 'rtable.' respectively. Both are first passed through
        self.check_attrs.

    Returns
    -------
    blocked_table : MTable
        One row per pair for which self.apply_rules(l, r) returned True.
        Columns are those given by self.get_attrs_to_retain.

    Notes
    -----
    The following properties are set on the returned table
    * ltable - ref to ltable
    * rtable - ref to rtable
    * foreign_key_ltable - string, 'ltable.' + ltable's key name
    * foreign_key_rtable - string, 'rtable.' + rtable's key name
    """
    # do integrity checks
    l_output_attrs, r_output_attrs = self.check_attrs(ltable, rtable, l_output_attrs, r_output_attrs)

    if mg._verbose:
        count = 0
        per_count = math.ceil(mg._percent / 100.0 * len(ltable) * len(rtable))
    elif mg._progbar:
        bar = pyprind.ProgBar(len(ltable) * len(rtable))

    l_key_name = ltable.get_key()
    r_key_name = rtable.get_key()
    ltable_id = 'ltable.' + l_key_name
    rtable_id = 'rtable.' + r_key_name

    # index both tables by key, then build key value -> row lookups
    l_df = ltable.set_index(l_key_name, drop=False)
    r_df = rtable.set_index(r_key_name, drop=False)
    l_dict = dict(l_df.iterrows())
    r_dict = dict(r_df.iterrows())

    # positions of the key columns inside the row tuples
    lid_idx = ltable.get_attr_names().index(l_key_name)
    rid_idx = rtable.get_attr_names().index(r_key_name)

    block_list = []
    for l_t in ltable.itertuples(index=False):
        for r_t in rtable.itertuples(index=False):
            if mg._verbose:
                count += 1
                if count % per_count == 0:
                    # single-argument print(...) behaves like the print statement
                    print(str(mg._percent * count / per_count) + ' percentage done !!!')
            elif mg._progbar:
                bar.update()

            l_row = l_dict[l_t[lid_idx]]
            r_row = r_dict[r_t[rid_idx]]

            # check whether it passes; rejected pairs are skipped
            if self.apply_rules(l_row, r_row) is not True:
                continue

            record = OrderedDict()
            # add left id first, then right id
            record[ltable_id] = l_row[l_key_name]
            record[rtable_id] = r_row[r_key_name]
            # add left attributes
            if l_output_attrs:
                l_out = l_row[l_output_attrs]
                l_out.index = 'ltable.' + l_out.index
                record.update(l_out)
            # add right attributes
            if r_output_attrs:
                r_out = r_row[r_output_attrs]
                r_out.index = 'rtable.' + r_out.index
                record.update(r_out)
            block_list.append(record)

    candset = pd.DataFrame(block_list)
    ret_cols = self.get_attrs_to_retain(l_key_name, r_key_name, l_output_attrs, r_output_attrs)
    if len(candset) > 0:
        candset = MTable(candset[ret_cols])
    else:
        candset = MTable(candset, columns=ret_cols)

    # set metadata
    candset.set_property('ltable', ltable)
    candset.set_property('rtable', rtable)
    candset.set_property('foreign_key_ltable', ltable_id)
    candset.set_property('foreign_key_rtable', rtable_id)
    return candset
def train_test_split(labeled_data, train_proportion=0.5, random_state=None):
    """
    Split MTable into Train and Test

    Parameters
    ----------
    labeled_data : MTable
        Table to split.
    train_proportion : float, in the range 0-1
        Proportion of train tuples, by default set to 0.5. The train size
        is floor(len(labeled_data) * train_proportion); the remaining
        tuples form the test set.
    random_state : int
        Passed through to cv.train_test_split as its random_state.

    Returns
    -------
    result : OrderedDict with two keys, 'train' and 'test' (in that order).
        The value for each key is an MTable holding the tuples of that
        split, with the 'key', 'ltable', 'rtable', 'foreign_key_ltable'
        and 'foreign_key_rtable' properties copied from labeled_data.
    """
    num_rows = len(labeled_data)
    assert 0 <= train_proportion <= 1, " Train proportion is expected to be between 0 and 1"

    train_size = int(math.floor(num_rows * train_proportion))
    test_size = int(num_rows - train_size)

    # split the index labels, then select the rows by label
    idx_values = np.array(labeled_data.index.values)
    idx_train, idx_test = cv.train_test_split(idx_values, test_size=test_size, train_size=train_size,
                                              random_state=random_state)

    # create a MTable for train and test data
    key = labeled_data.get_key()
    lbl_train, lbl_test = [MTable(labeled_data.ix[idx], key=key) for idx in (idx_train, idx_test)]

    # propagate properties
    for part in (lbl_train, lbl_test):
        part.set_property('key', key)
        for prop_name in ('ltable', 'rtable', 'foreign_key_ltable', 'foreign_key_rtable'):
            part.set_property(prop_name, labeled_data.get_property(prop_name))

    return OrderedDict([('train', lbl_train), ('test', lbl_test)])
def train_test_split(labeled_data, train_proportion=0.5, random_state=None):
    """
    Split MTable into Train and Test

    Parameters
    ----------
    labeled_data : MTable
        Table to split.
    train_proportion : float, in the range 0-1
        Proportion of train tuples, by default set to 0.5. The train size
        is floor(len(labeled_data) * train_proportion); the remaining
        tuples form the test set.
    random_state : int
        Passed through to cv.train_test_split as its random_state.

    Returns
    -------
    result : OrderedDict with two keys, 'train' and 'test' (in that order).
        The value for each key is an MTable holding the tuples of that
        split, with the 'key', 'ltable', 'rtable', 'foreign_key_ltable'
        and 'foreign_key_rtable' properties copied from labeled_data.
    """
    num_rows = len(labeled_data)
    assert 0 <= train_proportion <= 1, " Train proportion is expected to be between 0 and 1"

    train_size = int(math.floor(num_rows * train_proportion))
    test_size = int(num_rows - train_size)

    # split the index labels, then select the rows by label
    idx_values = np.array(labeled_data.index.values)
    idx_train, idx_test = cv.train_test_split(idx_values, test_size=test_size, train_size=train_size,
                                              random_state=random_state)

    # create a MTable for train and test data
    key = labeled_data.get_key()
    lbl_train = MTable(labeled_data.ix[idx_train], key=key)
    lbl_test = MTable(labeled_data.ix[idx_test], key=key)

    # propagate properties, train table first
    copied_props = ('ltable', 'rtable', 'foreign_key_ltable', 'foreign_key_rtable')
    for part in (lbl_train, lbl_test):
        part.set_property('key', key)
        for prop_name in copied_props:
            part.set_property(prop_name, labeled_data.get_property(prop_name))

    result = OrderedDict()
    result['train'] = lbl_train
    result['test'] = lbl_test
    return result
def block_tables(self, ltable, rtable, l_overlap_attr, r_overlap_attr, rem_stop_words=False, qgram=None,
                 word_level=True, overlap_size=1, l_output_attrs=None, r_output_attrs=None):
    """
    Block tables with overlap blocker

    Parameters
    ----------
    ltable, rtable : MTables, input MTables to block
    l_overlap_attr, r_overlap_attr : String, overlap attribute from left and right table
    rem_stop_words : flag to indicate whether stop words should be removed
    qgram : int, value of q in qgram tokenizer. Default value is None
    word_level : boolean, flag to indicate to use word level tokenizer
    overlap_size : int, number of tokens to overlap
    l_output_attrs, r_output_attrs : list of attributes to be included in the output table

    Returns
    -------
    result : MTable
        Candidate pairs, with the properties 'ltable', 'rtable',
        'foreign_key_ltable' and 'foreign_key_rtable' set. The table is
        empty (but has the usual columns) when no pair qualifies.

    Raises
    ------
    AssertionError
        If an overlap attribute is not a column of its table.
    SyntaxError
        If word_level is True and qgram is given as well.
    """
    # do some integrity checks
    if l_overlap_attr not in ltable.columns:
        raise AssertionError('Left overlap attribute not in ltable columns')
    if r_overlap_attr not in rtable.columns:
        raise AssertionError('Right overlap attribute not in rtable columns')
    l_output_attrs, r_output_attrs = self.check_attrs(ltable, rtable, l_output_attrs, r_output_attrs)
    if word_level == True and qgram is not None:
        raise SyntaxError('Parameters word_level and qgram cannot be set together; Note that word_level is set '
                          'to true by default, so explicitly set word_level=False to use qgram')

    # remove nans
    l_df = self.rem_nan(ltable, l_overlap_attr)
    r_df = self.rem_nan(rtable, r_overlap_attr)
    l_df.reset_index(inplace=True, drop=True)
    r_df.reset_index(inplace=True, drop=True)

    # constant column used as the join key, so that merging on it pairs
    # every selected left row with the current right row
    l_df['_dummy_'] = 1
    r_df['_dummy_'] = 1

    if l_df.dtypes[l_overlap_attr] != object:
        logger.warning('Left overlap attribute is not of type string; converting to string temporarily')
        l_df[l_overlap_attr] = l_df[l_overlap_attr].astype(str)
    if r_df.dtypes[r_overlap_attr] != object:
        logger.warning('Right overlap attribute is not of type string; converting to string temporarily')
        r_df[r_overlap_attr] = r_df[r_overlap_attr].astype(str)

    # positional index -> right row
    r_dict = {}
    for position, r_row in r_df.iterrows():
        r_dict[position] = r_row

    # tokenize the left column and pair each token list with its row position
    l_col_values_chopped = self.process_table(l_df, l_overlap_attr, qgram, rem_stop_words)
    appended_l_col_idx_values = [self.append_index_values(tokens, pos)
                                 for pos, tokens in enumerate(l_col_values_chopped)]

    # compute_inv_index fills inv_idx in place
    inv_idx = {}
    if mg._verbose:
        print('Creating inverted index ')
    for tagged_tokens in appended_l_col_idx_values:
        for tagged_token in tagged_tokens:
            self.compute_inv_index(tagged_token, inv_idx)
    if mg._verbose:
        print('Done')

    r_col_values_chopped = self.process_table(r_df, r_overlap_attr, qgram, rem_stop_words)

    if mg._verbose:
        count = 0
        per_count = math.ceil(mg._percent / 100.0 * len(rtable))
        print(per_count)
    elif mg._progbar:
        bar = pyprind.ProgBar(len(r_col_values_chopped))

    merge_suffixes = ('_ltable', '_rtable')
    df_list = []
    for r_idx, col_values in enumerate(r_col_values_chopped):
        if mg._verbose:
            count += 1
            if count % per_count == 0:
                print(str(mg._percent * count / per_count) + ' percentage done !!!')
        elif mg._progbar:
            bar.update()

        qualifying_ltable_indices = self.get_potential_match_indices(col_values, inv_idx, overlap_size)
        r_row_frame = r_dict[r_idx].to_frame().T
        l_rows = l_df.iloc[qualifying_ltable_indices]
        df = l_rows.merge(r_row_frame, on='_dummy_', suffixes=merge_suffixes)
        if len(df) > 0:
            df_list.append(df)

    if df_list:
        candset = pd.concat(df_list)
    else:
        # pd.concat raises ValueError on an empty list; build an empty
        # frame with the same merged column layout instead
        candset = l_df.iloc[0:0].merge(r_df.iloc[0:0], on='_dummy_', suffixes=merge_suffixes)

    # get output columns
    retain_cols, final_cols = self.output_columns(ltable.get_key(), rtable.get_key(), list(candset.columns),
                                                  l_output_attrs, r_output_attrs)
    candset = candset[retain_cols]
    candset.columns = final_cols
    if len(candset) > 0:
        candset.sort(['ltable.' + ltable.get_key(), 'rtable.' + rtable.get_key()], inplace=True)
        candset.reset_index(inplace=True, drop=True)
    candset = MTable(candset)

    # set metadata
    candset.set_property('ltable', ltable)
    candset.set_property('rtable', rtable)
    candset.set_property('foreign_key_ltable', 'ltable.' + ltable.get_key())
    candset.set_property('foreign_key_rtable', 'rtable.' + rtable.get_key())
    return candset
def block_candset(self, vtable, l_overlap_attr, r_overlap_attr, rem_stop_words=False, qgram=None,
                  word_level=True, overlap_size=1):
    """
    Block candidate set with overlap blocker

    Parameters
    ----------
    vtable : MTable, candidate set to block
    l_overlap_attr, r_overlap_attr : String, overlap attribute from left and right table
    rem_stop_words : flag to indicate whether stop words should be removed
    qgram : int, value of q in qgram tokenizer. Default value is None
    word_level : boolean, accepted but not read by this method
    overlap_size : int, minimum token overlap for a pair to be kept

    Returns
    -------
    result : MTable
        The rows of vtable whose token overlap, as computed by
        self.get_token_overlap_bt_two_tuples, is at least overlap_size.
        The 'ltable', 'rtable', 'foreign_key_ltable' and
        'foreign_key_rtable' properties of vtable are set on it.

    Raises
    ------
    AssertionError
        If an overlap attribute is not a column of its base table.
    """
    ltable = vtable.get_property('ltable')
    rtable = vtable.get_property('rtable')
    self.check_attrs(ltable, rtable, None, None)

    # do some integrity checks
    if l_overlap_attr not in ltable.columns:
        raise AssertionError('Left overlap attribute not in ltable columns')
    if r_overlap_attr not in rtable.columns:
        raise AssertionError('Right overlap attribute not in rtable columns')

    l_key = vtable.get_property('foreign_key_ltable')
    r_key = vtable.get_property('foreign_key_rtable')

    # set the index, then build key value -> row look up tables
    l_tbl = ltable.set_index(ltable.get_key(), drop=False)
    r_tbl = rtable.set_index(rtable.get_key(), drop=False)
    l_dict = dict(l_tbl.iterrows())
    r_dict = dict(r_tbl.iterrows())

    # positions of the two foreign-key columns inside each row tuple
    column_names = list(vtable.columns)
    lid_idx = column_names.index(l_key)
    rid_idx = column_names.index(r_key)

    if mg._verbose:
        count = 0
        per_count = math.ceil(mg._percent / 100.0 * len(vtable))
        # single-argument print(...) behaves like the print statement
        print(per_count)
    elif mg._progbar:
        bar = pyprind.ProgBar(len(vtable))

    # one boolean per candidate row; True means the row is kept
    valid = []
    for row in vtable.itertuples(index=False):
        if mg._verbose:
            count += 1
            if count % per_count == 0:
                print(str(mg._percent * count / per_count) + ' percentage done !!!')
        elif mg._progbar:
            bar.update()

        l_row = l_dict[row[lid_idx]]
        r_row = r_dict[row[rid_idx]]
        num_overlap = self.get_token_overlap_bt_two_tuples(l_row, r_row, l_overlap_attr, r_overlap_attr,
                                                           qgram, rem_stop_words)
        keep_pair = True if num_overlap >= overlap_size else False
        valid.append(keep_pair)

    if len(vtable) > 0:
        out_table = MTable(vtable[valid], key=vtable.get_key())
    else:
        out_table = MTable(columns=vtable.columns, key=vtable.get_key())

    for prop_name, prop_value in (('ltable', ltable),
                                  ('rtable', rtable),
                                  ('foreign_key_ltable', l_key),
                                  ('foreign_key_rtable', r_key)):
        out_table.set_property(prop_name, prop_value)
    return out_table