Ejemplo n.º 1
0
def block_union_combine(candset_list):
    """Combine several candidate sets into one table of their distinct id pairs.

    Each candidate set contributes its (``ltable.<key>``, ``rtable.<key>``)
    pairs. Every distinct pair produces one output row, built by ``get_dict``
    from the corresponding ltable and rtable rows, over the union of the
    columns found in the input candidate sets.

    Parameters
    ----------
    candset_list : list
        Candidate-set tables. Each must contain the columns
        ``'ltable.' + <ltable key>`` and ``'rtable.' + <rtable key>``.
        The list is traversed more than once, so it must not be a one-shot
        iterator.

    Returns
    -------
    MTable
        Table restricted to the columns returned by ``fin_cols``, with an
        ``'_id'`` key added and its ``'ltable'`` / ``'rtable'`` properties set
        to the tables returned by ``lr_tables``. Rows appear in the order in
        which their id pair is first encountered in ``candset_list``.
    """
    ltable, rtable = lr_tables(candset_list)
    l_key = ltable.get_key()
    r_key = rtable.get_key()
    key_l = 'ltable.' + l_key
    key_r = 'rtable.' + r_key

    # Collect the distinct id pairs in first-seen order. Reading the two key
    # columns directly (instead of iterrows) keeps the key dtype intact:
    # iterrows upcasts mixed numeric rows to float. Tracking insertion order
    # makes the output row order reproducible, unlike iterating a bare set.
    seen = set()
    id_pairs = []
    for candset in candset_list:
        for pair in zip(candset[key_l], candset[key_r]):
            if pair not in seen:
                seen.add(pair)
                id_pairs.append(pair)

    # get the union set of columns
    col_set = {col for candset in candset_list for col in candset.columns}
    col_l, col_r = lr_cols(col_set)

    # Label-based row lookup; `.ix` was removed from pandas, `.loc` is its
    # label-based replacement.
    # NOTE(review): assumes ltable/rtable are indexed by their key values, as
    # the original `.ix` lookup did -- confirm against MTable.
    dict_list = [get_dict(ltable.loc[l_id], rtable.loc[r_id], col_l, col_r)
                 for l_id, r_id in id_pairs]

    table = pd.DataFrame(dict_list)
    col_f = fin_cols(col_l, col_r, l_key, r_key)
    table = MTable(table[col_f])
    table._add_key('_id')
    table.set_property('ltable', ltable)
    table.set_property('rtable', rtable)
    return table
    def block_tables(self, ltable, rtable, ltable_block_attribute, rtable_block_attribute,
                     ltable_output_colnames=None, rtable_output_colnames=None):
        """Block two tables by joining them on equal blocking-attribute values.

        The tables are joined with ``pd.merge`` (its default join type) on
        ``ltable_block_attribute`` == ``rtable_block_attribute``, after
        ``rem_nans`` has processed them. The joined result is projected onto
        the columns chosen by ``out_cols`` and wrapped in an ``MTable``.

        Parameters
        ----------
        ltable, rtable
            Input tables; each must provide ``get_key()``.
        ltable_block_attribute, rtable_block_attribute
            Column names used as the join condition.
        ltable_output_colnames, rtable_output_colnames : optional
            Extra columns to carry into the result; validated (and possibly
            normalised) by ``check_columns``.

        Returns
        -------
        MTable
            Candidate set with an ``'_id'`` key and ``'ltable'`` / ``'rtable'``
            properties referring to the original input tables.
        """
        # integrity checks
        l_out_names, r_out_names = check_columns(
            ltable, rtable,
            ltable_block_attribute, rtable_block_attribute,
            ltable_output_colnames, rtable_output_colnames)

        # remove rows with nan values in block attribute column
        left_rows, right_rows = rem_nans(
            ltable, rtable, ltable_block_attribute, rtable_block_attribute)

        # Overlapping column names are disambiguated with table suffixes.
        joined = pd.merge(
            left_rows, right_rows,
            left_on=ltable_block_attribute,
            right_on=rtable_block_attribute,
            suffixes=('_ltable', '_rtable'),
            copy=False)

        selected_cols, output_names = out_cols(
            ltable.get_key(), rtable.get_key(), list(joined.columns),
            l_out_names, r_out_names)

        candset = MTable(joined[selected_cols])
        # The key is added before the columns are renamed, so `output_names`
        # must account for the '_id' column as well.
        candset._add_key('_id')
        candset.columns = output_names
        for prop_name, source_table in (('ltable', ltable), ('rtable', rtable)):
            candset.set_property(prop_name, source_table)
        return candset