def block_union_combine(candset_list):
    """Merge several candidate sets into one table holding their union.

    Every distinct (ltable id, rtable id) pair that occurs in any of the
    candidate sets yields one output row. The output columns are derived
    from the union of the columns seen across all candidate sets.

    Parameters
    ----------
    candset_list : iterable of candidate-set tables
        Each table must expose ``iterrows()`` and ``columns`` and contain
        the ``'ltable.<key>'`` / ``'rtable.<key>'`` id columns.

    Returns
    -------
    MTable
        Combined table with an ``'_id'`` key added and its ``'ltable'`` /
        ``'rtable'`` properties pointing at the base tables.
    """
    ltable, rtable = lr_tables(candset_list)
    left_id_col = 'ltable.' + ltable.get_key()
    right_id_col = 'rtable.' + rtable.get_key()

    # One pass over the candidate sets collects both the distinct id pairs
    # and the union of column names.
    pair_ids = set()
    all_columns = set()
    for candset in candset_list:
        for _, row in candset.iterrows():
            pair_ids.add((row[left_id_col], row[right_id_col]))
        all_columns.update(candset.columns)

    left_cols, right_cols = lr_cols(all_columns)

    # NOTE(review): `.ix` is deprecated/removed in newer pandas releases;
    # kept as-is here because the intended label-vs-position semantics on
    # these tables should be confirmed before switching indexers.
    records = []
    for left_id, right_id in pair_ids:
        record = get_dict(ltable.ix[left_id], rtable.ix[right_id],
                          left_cols, right_cols)
        records.append(record)

    combined = pd.DataFrame(records)
    ordered_cols = fin_cols(left_cols, right_cols,
                            ltable.get_key(), rtable.get_key())
    combined = MTable(combined[ordered_cols])
    combined._add_key('_id')
    combined.set_property('ltable', ltable)
    combined.set_property('rtable', rtable)
    return combined
def block_tables(self, ltable, rtable, ltable_block_attribute,
                 rtable_block_attribute, ltable_output_colnames=None,
                 rtable_output_colnames=None):
    """Block two tables by joining them on their blocking attributes.

    The two tables are merged with ``pd.merge`` (default join type) using
    ``ltable_block_attribute`` as the left join column and
    ``rtable_block_attribute`` as the right join column.

    Parameters
    ----------
    ltable, rtable : tables to block
    ltable_block_attribute, rtable_block_attribute : column names used as
        the join columns of the left and right table respectively
    ltable_output_colnames, rtable_output_colnames : optional
        Requested output columns; passed through ``check_columns`` and
        then ``out_cols`` to decide what the result contains.

    Returns
    -------
    MTable
        Candidate set with an ``'_id'`` key added and its ``'ltable'`` /
        ``'rtable'`` properties pointing at the input tables.
    """
    # integrity checks
    l_out_names, r_out_names = check_columns(
        ltable, rtable,
        ltable_block_attribute, rtable_block_attribute,
        ltable_output_colnames, rtable_output_colnames)

    # remove rows with nan values in block attribute column
    left_clean, right_clean = rem_nans(
        ltable, rtable, ltable_block_attribute, rtable_block_attribute)

    merge_options = {
        'left_on': ltable_block_attribute,
        'right_on': rtable_block_attribute,
        'suffixes': ('_ltable', '_rtable'),
        'copy': False,
    }
    joined = pd.merge(left_clean, right_clean, **merge_options)

    # `out_cols` yields the columns to select from the merged frame and the
    # names to assign to them in the final candidate set.
    selected_cols, final_names = out_cols(
        ltable.get_key(), rtable.get_key(), list(joined.columns),
        l_out_names, r_out_names)

    candset = MTable(joined[selected_cols])
    candset._add_key('_id')
    candset.columns = final_names
    candset.set_property('ltable', ltable)
    candset.set_property('rtable', rtable)
    return candset