Beispiel #1
0
def sample_table(table, size, replace=False):
    """
    Randomly sample rows from an MTable.

    Parameters
    ----------
    table : MTable
        Input table to be sampled.
    size : int
        Number of rows to draw.
    replace : bool, defaults to False
        Whether sampling should be done with replacement.

    Returns
    -------
    sampled_table : MTable
        New MTable holding the sampled rows; the key and properties are
        copied over from the input table.
    """
    num_rows = len(table)
    if num_rows == 0:
        raise AttributeError('size of table is 0')
    if num_rows < size:
        raise AttributeError('sample size is larger than input table size')

    # draw row positions, then sort them so the output has a stable order
    chosen = sorted(np.random.choice(num_rows, size, replace=replace))
    sampled_df = table.iloc[chosen]
    sampled_table = MTable(sampled_df, key=table.get_key())
    sampled_table.properties = table.properties
    return sampled_table
Beispiel #2
0
def read_csv_(*args, **kwargs):
    """
    Read a CSV (comma-separated) file into an MTable.

    Parameters
    ----------
    args : positional arguments forwarded to pandas read_csv
    kwargs : keyword arguments forwarded to pandas read_csv, plus an
        optional "key" parameter. When "key" is given, it is set as the
        table key; otherwise the MTable is created without an explicit key.

    Returns
    -------
    result : MTable
    """
    # pull out the MTable-specific option before delegating to pandas
    key = kwargs.pop('key', None)
    frame = pd.read_csv(*args, **kwargs)
    if key is None:
        return MTable(frame)
    return MTable(frame, key=key)
Beispiel #3
0
def sample_table(table, size, replace=False):
    """
    Sample rows of an MTable.

    Parameters
    ----------
    table : MTable
        Input table to be sampled.
    size : int
        Number of samples.
    replace : bool, defaults to False
        Whether sampling should be done with replacement.

    Returns
    -------
    sampled_table : MTable
        Sampled table with the input table's key and properties.
    """
    if not len(table):
        raise AttributeError('size of table is 0')
    if len(table) < size:
        raise AttributeError('sample size is larger than input table size')

    # pick row positions and sort them just to have a deterministic order
    positions = np.random.choice(len(table), size, replace=replace)
    positions = sorted(positions)
    result = MTable(table.iloc[positions], key=table.get_key())
    result.properties = table.properties
    return result
Beispiel #4
0
    def block_tables(self, ltable, rtable, l_block_attr, r_block_attr,
                     l_output_attrs=None, r_output_attrs=None):
        """
        Block two tables on l_block_attr / r_block_attr equality
        (similar to an equi-join).

        Parameters
        ----------
        ltable, rtable : MTable
            Input MTables.
        l_block_attr, r_block_attr : string
            Attribute names in ltable, rtable to block on.
        l_output_attrs, r_output_attrs : list of strings, defaults to None
            Attribute names to be included in the output table.

        Returns
        -------
        blocked_table : MTable
            Tuple pairs whose l_block_attr and r_block_attr values are the
            same.  Its properties are updated with references to ltable and
            rtable and the foreign-key attribute names
            ('ltable.<key>' / 'rtable.<key>').
        """
        # validate inputs and normalize the output-attribute lists
        l_output_attrs, r_output_attrs = self.check_attrs(
            ltable, rtable, l_block_attr, r_block_attr,
            l_output_attrs, r_output_attrs)

        # drop rows whose blocking attribute is NaN
        left_df = self.rem_nan(ltable, l_block_attr)
        right_df = self.rem_nan(rtable, r_block_attr)

        # equi-join on the blocking attributes
        joined = pd.merge(left_df, right_df,
                          left_on=l_block_attr, right_on=r_block_attr,
                          suffixes=('_ltable', '_rtable'))

        # project down to the requested columns and rename them
        keep_cols, out_col_names = self.output_columns(
            ltable.get_key(), rtable.get_key(), list(joined.columns),
            l_output_attrs, r_output_attrs)
        joined = joined[keep_cols]
        joined.columns = out_col_names

        result = MTable(joined)
        # attach blocking metadata
        result.set_property('ltable', ltable)
        result.set_property('rtable', rtable)
        result.set_property('foreign_key_ltable', 'ltable.' + ltable.get_key())
        result.set_property('foreign_key_rtable', 'rtable.' + rtable.get_key())
        return result
Beispiel #5
0
def down_sample(ltable, rtable, size, y):
    """Down-sample a pair of tables, probing an inverted index built on the
    smaller table with a sample of the bigger one; returns two MTables with
    the originals' keys and properties."""
    # put the smaller table first; remember whether the order was swapped
    small, big, swapped = _order_tables(ltable, rtable)
    inv_index = _inv_index(small)
    # number of rows to draw from the bigger table
    big_sample_size = min(math.floor(size / y), len(big))
    big_ids = np.random.choice(len(big), big_sample_size, replace=False)
    small_ids = _probe_index(big.ix[big_ids], y, len(small), inv_index)
    if swapped:
        small_ids, big_ids = big_ids, small_ids
    left = MTable(ltable.iloc[list(small_ids)], ltable.get_key())
    left.properties = ltable.properties
    right = MTable(rtable.iloc[list(big_ids)], rtable.get_key())
    right.properties = rtable.properties
    return left, right
def block_union_combine(candset_list):
    """Union a list of candidate sets into one MTable keyed by '_id',
    keeping the union of their (ltable id, rtable id) pairs and columns."""
    # common ltable/rtable shared by all candidate sets
    ltable, rtable = lr_tables(candset_list)
    key_l = 'ltable.' + ltable.get_key()
    key_r = 'rtable.' + rtable.get_key()
    # union of (ltable id, rtable id) pairs across all candidate sets
    id_pairs = {(row[key_l], row[key_r])
                for cand in candset_list
                for _, row in cand.iterrows()}
    # union of all columns seen in any candidate set
    all_cols = {col for cand in candset_list for col in cand.columns}
    col_l, col_r = lr_cols(all_cols)
    # materialize one output row per unique id pair
    rows = [get_dict(ltable.ix[lid], rtable.ix[rid], col_l, col_r)
            for lid, rid in id_pairs]
    frame = pd.DataFrame(rows)
    final_cols = fin_cols(col_l, col_r, ltable.get_key(), rtable.get_key())
    out = MTable(frame[final_cols])
    out._add_key('_id')
    out.set_property('ltable', ltable)
    out.set_property('rtable', rtable)
    return out
Beispiel #7
0
def create_mtable(table, key=None, ltable=None, rtable=None, foreign_key_ltable=None, foreign_key_rtable=None):
    """
    Create an MTable from a dataframe.

    Parameters
    ----------
    table : pandas DataFrame
        Data to wrap.
    key : string, defaults to None
        Key attribute for the MTable.
    ltable, rtable : MTable, default to None
        Referenced left/right tables (for virtual tables).
    foreign_key_ltable, foreign_key_rtable : string, default to None
        Foreign-key attribute names into ltable/rtable.

    Returns
    -------
    out_table : MTable
        The vtable properties are set only when all four of ltable, rtable,
        foreign_key_ltable and foreign_key_rtable are given; a partial set
        is ignored with a warning.
    """
    out_table = MTable(table, key=key)
    truth_vals = [ltable is not None, rtable is not None,
                  foreign_key_ltable is not None,
                  foreign_key_rtable is not None]
    # Fixed non-idiomatic `all(...) == True` / `any(...) == True` comparisons.
    if all(truth_vals):
        out_table.set_property('ltable', ltable)
        out_table.set_property('rtable', rtable)
        out_table.set_property('foreign_key_ltable', foreign_key_ltable)
        out_table.set_property('foreign_key_rtable', foreign_key_rtable)
    elif any(truth_vals):
        logging.getLogger(__name__).warning('Not all the properties for vtable are given; so not setting '
                                            'any of them')

    return out_table
Beispiel #8
0
def label_table(tbl, col_name, replace=True):
    """
    Label training data interactively.

    Parameters
    ----------
    tbl : MTable
        Table to be labeled.
    col_name : string
        Name of the label column.
    replace : bool, defaults to True
        Whether an existing column with the given 'col_name' is overwritten
        (reset to 0) before editing. [Currently experimental.]

    Returns
    -------
    result : MTable
        Copy of the input with the label column; key and properties are
        carried over from the input.

    Notes
    -----
    The label values entered by the user must be only 0 or 1.
    """
    # NOTE(review): `edit` is imported but mg.edit is called below; the
    # import may be needed for GUI side effects — confirm before removing.
    from magellan.gui.mtable_gui import edit
    table = tbl.copy()

    if col_name in table.columns:
        # Fixed non-idiomatic `replace == True` comparison.
        if replace:
            logging.getLogger(__name__).warning(
                'Input table already contains column %s. '
                '' % col_name)
            table[col_name] = 0
    else:
        table[col_name] = 0

    # open the GUI so the user can fill in the labels
    mg.edit(table)
    table[col_name] = table[col_name].astype(int)
    # validate: the label column may contain only 0s and 1s
    c1 = table[col_name] == 1
    c2 = table[col_name] == 0
    c = sum(c1 | c2)
    assert c == len(
        table), 'The label column contains values other than 0 and 1'

    table = MTable(table, key=tbl.get_key())
    table.properties = tbl.properties
    return table
Beispiel #9
0
def label_table(tbl, col_name, replace=True):
    """
    Label training data.

    Parameters
    ----------
    tbl : MTable
        Table to be labeled.
    col_name : string
        Name of the label column.
    replace : bool, defaults to True
        Whether an existing column named 'col_name' is overwritten (reset
        to 0) before editing. [Currently experimental.]

    Returns
    -------
    result : MTable
        Table with labels; key and properties come from the input table.

    Notes
    -----
    The label value is expected to be only 0 or 1.
    """
    from magellan.gui.mtable_gui import edit
    working = tbl.copy()

    if col_name not in working.columns:
        working[col_name] = 0
    elif replace == True:
        logging.getLogger(__name__).warning('Input table already contains column %s. '
                                            '' %col_name)
        working[col_name] = 0
    # let the user fill in labels through the GUI
    mg.edit(working)
    working[col_name] = working[col_name].astype(int)
    # every label must be 0 or 1
    is_valid = (working[col_name] == 1) | (working[col_name] == 0)
    assert sum(is_valid) == len(working), 'The label column contains values other than 0 and 1'

    labeled = MTable(working, key=tbl.get_key())
    labeled.properties = tbl.properties
    return labeled
Beispiel #10
0
def create_mtable(table,
                  key=None,
                  ltable=None,
                  rtable=None,
                  foreign_key_ltable=None,
                  foreign_key_rtable=None):
    """
    Create an MTable from a dataframe.

    Parameters
    ----------
    table : pandas DataFrame
        Data to wrap.
    key : string, defaults to None
        Key attribute for the MTable.
    ltable, rtable : MTable, default to None
        Referenced left/right tables (for virtual tables).
    foreign_key_ltable, foreign_key_rtable : string, default to None
        Foreign-key attribute names into ltable/rtable.

    Returns
    -------
    out_table : MTable
        The vtable properties are set only when all four optional table
        arguments are given; a partial set is ignored with a warning.
    """
    out_table = MTable(table, key=key)
    truth_vals = [
        ltable is not None, rtable is not None, foreign_key_ltable is not None,
        foreign_key_rtable is not None
    ]
    # Fixed non-idiomatic `all(...) == True` / `any(...) == True` comparisons.
    if all(truth_vals):
        out_table.set_property('ltable', ltable)
        out_table.set_property('rtable', rtable)
        out_table.set_property('foreign_key_ltable', foreign_key_ltable)
        out_table.set_property('foreign_key_rtable', foreign_key_rtable)
    elif any(truth_vals):
        logging.getLogger(__name__).warning(
            'Not all the properties for vtable are given; so not setting '
            'any of them')

    return out_table
Beispiel #11
0
def down_sample(s_table, b_table, size, y):
    if len(b_table) < size:
        print 'Warning!! size of table B is less than b_size parameter - using entire table B'
        size = len(b_table)

    t1 = time.time()
    s_inv_index = _inv_index(s_table)
    print 'Inverted Index Time: '
    print int(time.time() - t1)
    b_sample_size = min(math.floor(size/y), len(b_table))
    b_tbl_indices = list(np.random.choice(len(b_table), b_sample_size, replace=False))
    t1 = time.time()
    s_tbl_indices = _probe_index(b_table.ix[b_tbl_indices], y,
                                 len(s_table), s_inv_index)
    print 'Probe Index Time: '
    print int(time.time() - t1)

    s_tbl_indices = list(s_tbl_indices)
    l_sampled = MTable(s_table.iloc[list(s_tbl_indices)], key=s_table.get_key())
    l_sampled.properties = s_table.properties
    r_sampled = MTable(b_table.iloc[list(b_tbl_indices)], key=b_table.get_key())
    r_sampled.properties = b_table.properties
    return l_sampled, r_sampled
Beispiel #12
0
def down_sample(ltable, rtable, size, y):
    """Down-sample ltable/rtable: probe an inverted index built on the
    smaller table with a random sample of the bigger one, and return two
    MTables carrying the originals' keys and properties."""
    smaller, bigger, was_swapped = _order_tables(ltable, rtable)
    index = _inv_index(smaller)
    sample_size = min(math.floor(size / y), len(bigger))
    bigger_ids = list(np.random.choice(len(bigger), sample_size,
                                       replace=False))
    smaller_ids = list(_probe_index(bigger.ix[bigger_ids], y, len(smaller),
                                    index))
    # undo the small/big reordering before indexing the original tables
    if was_swapped:
        smaller_ids, bigger_ids = bigger_ids, smaller_ids
    l_out = MTable(ltable.iloc[smaller_ids], key=ltable.get_key())
    l_out.properties = ltable.properties
    r_out = MTable(rtable.iloc[bigger_ids], key=rtable.get_key())
    r_out.properties = rtable.properties
    return l_out, r_out
    def block_tables(self, ltable, rtable, ltable_block_attribute, rtable_block_attribute,
                     ltable_output_colnames=None, rtable_output_colnames=None):
        """Block two tables on equality of the given blocking attributes
        (equi-join style) and return an MTable keyed by '_id' with the
        ltable/rtable references attached as properties."""
        # validate inputs and normalize the output-column lists
        ltable_output_colnames, rtable_output_colnames = check_columns(
            ltable, rtable, ltable_block_attribute, rtable_block_attribute,
            ltable_output_colnames, rtable_output_colnames)

        # drop rows with NaN in the blocking attribute columns
        clean_l, clean_r = rem_nans(ltable, rtable, ltable_block_attribute,
                                    rtable_block_attribute)
        joined = pd.merge(clean_l, clean_r,
                          left_on=ltable_block_attribute,
                          right_on=rtable_block_attribute,
                          suffixes=('_ltable', '_rtable'), copy=False)

        keep, final = out_cols(ltable.get_key(), rtable.get_key(),
                               list(joined.columns),
                               ltable_output_colnames, rtable_output_colnames)

        result = MTable(joined[keep])
        result._add_key('_id')
        result.columns = final
        result.set_property('ltable', ltable)
        result.set_property('rtable', rtable)
        return result
Beispiel #14
0
def read_csv(file_path, **kwargs):
    """
    Read a CSV (comma-separated) file into an MTable, honoring metadata
    comment lines at the top of the file.

    Parameters
    ----------
    file_path : string
        Path of the CSV file.
    kwargs : keyword arguments to pandas read_csv, plus optional "key" for
        an MTable, or "key", "ltable", "rtable", "foreign_key_ltable",
        "foreign_key_rtable" for a VTable.

    Returns
    -------
    result : MTable

    Note
    ----
    read_csv can read metadata placed at the beginning of the file, e.g.:

    #key=id

    Keyword arguments override / supplement the file-level metadata.
    """
    # metadata lines (e.g. "#key=id") at the top of the file
    properties, num_lines = get_properties_from_file(file_path)
    # kwargs may override or extend the file-level metadata
    properties, kwargs = update_properties(properties, **kwargs)
    check_properties(properties)
    # skip the metadata lines when parsing the actual CSV body
    kwargs['skiprows'] = num_lines
    frame = pd.read_csv(file_path, **kwargs)

    key = properties.pop('key', None)
    table = MTable(frame, key=key) if key is not None else MTable(frame)
    # attach the remaining metadata as MTable properties
    for prop_name, prop_val in properties.iteritems():
        table.set_property(prop_name, prop_val)
    return table
Beispiel #15
0
    def block_candset(self, vtable, l_block_attr, r_block_attr):
        """
        Block candidate set (virtual MTable) based on l_block_attr, r_block_attr equivalence (similar to equi-join)

        Parameters
        ----------
        vtable : MTable
            Input candidate set
        l_block_attr, r_block_attr : string,
            attribute names in ltable, rtable

        Returns
        -------
        blocked_table : MTable
            Containing tuple pairs whose l_block_attr and r_block_attr values are same

        Notes
        -----
        Output MTable contains the following three attributes
            * _id
            * id column from ltable
            * id column from rtable

        Also, the properties of blocked table is updated with following key-value pairs
            * ltable - ref to ltable
            * rtable - ref to rtable
            * key
            * foreign_key_ltable - string, ltable's  id attribute name
            * foreign_key_rtable - string, rtable's id attribute name
        """
        # do integrity checks
        ltable = vtable.get_property('ltable')
        rtable = vtable.get_property('rtable')

        self.check_attrs(ltable, rtable, l_block_attr, r_block_attr, None,
                         None)
        l_key = 'ltable.' + ltable.get_key()
        r_key = 'rtable.' + rtable.get_key()

        # convert to dataframes
        l_df = ltable.to_dataframe()
        r_df = rtable.to_dataframe()

        # set index for convenience
        l_df.set_index(ltable.get_key(), inplace=True)
        r_df.set_index(rtable.get_key(), inplace=True)

        if mg._verbose:
            count = 0
            per_count = math.ceil(mg._percent / 100.0 * len(vtable))
            print per_count

        elif mg._progbar:
            bar = pyprind.ProgBar(len(vtable))

        # keep track of valid ids
        valid = []
        # iterate candidate set and process each row
        for idx, row in vtable.iterrows():

            if mg._verbose:
                count += 1
                if count % per_count == 0:
                    print str(mg._percent * count /
                              per_count) + ' percentage done !!!'
            elif mg._progbar:
                bar.update()

            # get the value of block attribute from ltuple
            l_val = l_df.ix[row[l_key], l_block_attr]
            r_val = r_df.ix[row[r_key], r_block_attr]
            if l_val != np.NaN and r_val != np.NaN:
                if l_val == r_val:
                    valid.append(True)
                else:
                    valid.append(False)
            else:
                valid.append(False)

        # should be modified
        if len(vtable) > 0:
            out_table = MTable(vtable[valid], key=vtable.get_key())
        else:
            out_table = MTable(columns=vtable.columns, key=vtable.get_key())
        out_table.set_property('ltable', ltable)
        out_table.set_property('rtable', rtable)
        out_table.set_property('foreign_key_ltable',
                               'ltable.' + ltable.get_key())
        out_table.set_property('foreign_key_rtable',
                               'rtable.' + rtable.get_key())
        return out_table
Beispiel #16
0
    def block_tables(self,
                     ltable,
                     rtable,
                     l_block_attr,
                     r_block_attr,
                     l_output_attrs=None,
                     r_output_attrs=None):
        """
        Block two tables on l_block_attr / r_block_attr equality
        (similar to an equi-join).

        Parameters
        ----------
        ltable, rtable : MTable
            Input MTables.
        l_block_attr, r_block_attr : string
            Attribute names in ltable, rtable to block on.
        l_output_attrs, r_output_attrs : list of strings, defaults to None
            Attribute names to be included in the output table.

        Returns
        -------
        blocked_table : MTable
            Tuple pairs whose l_block_attr and r_block_attr values are the
            same; its properties carry references to ltable and rtable and
            the foreign-key names 'ltable.<key>' / 'rtable.<key>'.
        """
        # validate inputs and normalize the output-attribute lists
        l_output_attrs, r_output_attrs = self.check_attrs(
            ltable, rtable, l_block_attr, r_block_attr, l_output_attrs,
            r_output_attrs)

        # drop rows whose blocking attribute is NaN, then equi-join
        merged = pd.merge(self.rem_nan(ltable, l_block_attr),
                          self.rem_nan(rtable, r_block_attr),
                          left_on=l_block_attr,
                          right_on=r_block_attr,
                          suffixes=('_ltable', '_rtable'))

        # decide which columns to keep and what to call them
        keep, rename_to = self.output_columns(ltable.get_key(),
                                              rtable.get_key(),
                                              list(merged.columns),
                                              l_output_attrs,
                                              r_output_attrs)
        merged = merged[keep]
        merged.columns = rename_to

        blocked = MTable(merged)
        # attach blocking metadata
        blocked.set_property('ltable', ltable)
        blocked.set_property('rtable', rtable)
        blocked.set_property('foreign_key_ltable',
                             'ltable.' + ltable.get_key())
        blocked.set_property('foreign_key_rtable',
                             'rtable.' + rtable.get_key())
        return blocked
Beispiel #17
0
def extract_feature_vecs(s, attrs_before=None, feature_table=None, attrs_after=None):
    """
    Extract feature vectors for each tuple pair in a candidate set.

    Parameters
    ----------
    s : MTable
        Labeled virtual MTable or combined blocker output; must carry
        'ltable'/'rtable' and foreign-key properties.
    attrs_before : list (or single attribute name), defaults to None
        Attribute names from "s" to be included in the output table before
        the feature vector.
    feature_table : pandas DataFrame, defaults to None
        Features to apply; when None, mg.get_features_for_blocking is used
        (also see: mg.get_features_for_blocking).
    attrs_after : list (or single attribute name), defaults to None
        Attribute names from "s" to be included in the output table after
        the feature vector.

    Returns
    -------
    feature_vectors : MTable
        Feature values (obtained by applying the feature functions in
        feature_table) plus the requested attributes; key, metadata and
        properties are carried over from "s".
    """
    # basic checks
    assert isJVMStarted(), 'JVM should be started using init_jvm to compute features'
    ltable = s.get_property('ltable')
    rtable = s.get_property('rtable')
    assert ltable is not None, 'Left table is not set'
    assert rtable is not None, 'Right table is not set'

    if feature_table is None:
        feature_table = mg.get_features_for_blocking(ltable, rtable)

    l_key, r_key = s.get_property('foreign_key_ltable'), s.get_property('foreign_key_rtable')
    start = time.time()
    id_list = [(r[l_key], r[r_key]) for i, r in s.iterrows()]
    end = time.time()
    logging.getLogger(__name__).info('Iterating rows (%d) took %f secs' %(len(s), end - start))

    # compute feature values
    l_df = ltable.to_dataframe()
    r_df = rtable.to_dataframe()
    l_df.set_index(ltable.get_key(), inplace=True, drop=False)
    r_df.set_index(rtable.get_key(), inplace=True, drop=False)

    start = time.time()
    feat_vals = [apply_feat_fns(l_df.ix[x[0]], r_df.ix[x[1]], feature_table) for x in id_list]
    end = time.time()
    logging.getLogger(__name__).info('Applying feature functions took : %f secs' % (end - start))
    table = pd.DataFrame(feat_vals, index=s.index.values)
    # get the feature names and re-arrange columns in that order
    feat_names = list(feature_table['feature_name'])
    table = table[feat_names]

    # insert attrs_before, iterating in reverse so the final column order
    # matches the input order.
    # BUGFIX: use reversed(...) instead of list.reverse() so the
    # caller-supplied list is not mutated as a side effect.
    if attrs_before:
        if not isinstance(attrs_before, list):
            attrs_before = [attrs_before]
        for a in reversed(attrs_before):
            table.insert(0, a, s[a])
    table.insert(0, r_key, s[r_key])
    table.insert(0, l_key, s[l_key])

    # insert attrs_after (same reversed-iteration fix as above)
    # NOTE(review): appending in reversed order makes attrs_after appear in
    # reverse in the output — this matches the original behavior, but looks
    # unintended; confirm before changing.
    if attrs_after:
        if not isinstance(attrs_after, list):
            attrs_after = [attrs_after]
        for a in reversed(attrs_after):
            table.insert(len(table.columns), a, s[a])
    # reset the table index
    table.reset_index(inplace=True, drop=True)

    feature_vectors = MTable(table)
    if s.get_key() not in feature_vectors.columns:
        feature_vectors.add_key(s.get_key())
    # carry over metadata / properties from the input
    feature_vectors._metadata = s._metadata
    feature_vectors.properties = s.properties
    return feature_vectors
Beispiel #18
0
    def block_tables_skd(self, ltable, rtable, l_block_attr, r_block_attr,
                     l_output_attrs=None, r_output_attrs=None):
        """
        Block tables based on l_block_attr, r_block_attr equivalence (similar to equi-join)

        Parameters
        ----------
        ltable, rtable : MTable
            Input MTables
        l_block_attr, r_block_attr : string,
            attribute names in ltable, rtable
        l_output_attrs, r_output_attrs : list (of strings), defaults to None
            attribute names to be included in the output table

        Returns
        -------
        blocked_table : MTable
            Containing tuple pairs whose l_block_attr and r_block_attr values are same

        Notes
        -----
        Output MTable contains the following three attributes
            * _id
            * id column from ltable
            * id column from rtable

        Also, the properties of blocked table is updated with following key-value pairs
            * ltable - ref to ltable
            * rtable - ref to rtable
            * key
            * foreign_key_ltable - string, ltable's  id attribute name
            * foreign_key_rtable - string, rtable's id attribute name
        """

        # do integrity checks
        l_output_attrs, r_output_attrs = self.check_attrs(ltable, rtable, l_block_attr, r_block_attr,
                                                     l_output_attrs, r_output_attrs)
        # remove nans
        l_df = self.rem_nan(ltable, l_block_attr)
        r_df = self.rem_nan(rtable, r_block_attr)
        
	#print 'cpu_count() = %d\n' % multiprocessing.cpu_count() 
	cpu_count = multiprocessing.cpu_count()
	m = int(math.sqrt(cpu_count)) # no. of splits of l_df
	n = cpu_count/m # no. of splits of r_df
        print "m: ", m, ", n: ", n
	t0 = time.time() 
        l_splits = np.array_split(l_df, m)
	t1 = time.time()
        r_splits = np.array_split(r_df, n)
	t2 = time.time()
        l_key = ltable.get_key()
        r_key = rtable.get_key()
        lr_splits = [(l, r, l_block_attr, r_block_attr, l_key, r_key, l_output_attrs, r_output_attrs) for l in l_splits for r in r_splits]
	t3 = time.time()
        #pool = Pool(4)
        pool = mp.ProcessingPool(processes=cpu_count, maxtasksperchild=1)
	t4 = time.time()
        c_splits = pool.map(self.block_data_frames_skd, lr_splits)
	t5 = time.time()
        pool.close()
	t6 = time.time()
        pool.join() 
	t7 = time.time()
        candset = pd.concat(c_splits, ignore_index=True)
	#candset = c_splits[0].append(c_splits[1], ignore_index=True)
	t8 = time.time()
	print "Time taken to split table A:", (t1 - t0)
	print "Time taken to split table B:", (t2 - t1)
	print "Time taken to get AB splits:", (t3 - t2)
	print "Time taken to start workers:", (t4 - t3)
	print "Time taken to get  C splits:", (t5 - t4)
	print "Time taken to close    pool:", (t6 - t5)
	print "Time taken to join     pool:", (t7 - t6)
	print "Time taken to combine splits:", (t8 - t7)
        final_cols = self.get_final_cols(l_key, r_key,
                                                   l_output_attrs, r_output_attrs)
        candset.columns = final_cols
        candset = MTable(candset)

        # set metadata
        candset.set_property('ltable', ltable)
        candset.set_property('rtable', rtable)
        candset.set_property('foreign_key_ltable', 'ltable.'+ltable.get_key())
        candset.set_property('foreign_key_rtable', 'rtable.'+rtable.get_key())
        return candset
Beispiel #19
0
    def block_candset_opt_1(self, vtable, l_block_attr, r_block_attr):
        """
        Block candidate set (virtual MTable) based on l_block_attr, r_block_attr equivalence (similar to equi-join)

        Parameters
        ----------
        vtable : MTable
            Input candidate set
        l_block_attr, r_block_attr : string,
            attribute names in ltable, rtable

        Returns
        -------
        blocked_table : MTable
            Containing tuple pairs whose l_block_attr and r_block_attr values are same

        Notes
        -----
        Output MTable contains the following three attributes
            * _id
            * id column from ltable
            * id column from rtable

        Also, the properties of blocked table is updated with following key-value pairs
            * ltable - ref to ltable
            * rtable - ref to rtable
            * key
            * foreign_key_ltable - string, ltable's  id attribute name
            * foreign_key_rtable - string, rtable's id attribute name
        """
	start_time = time.time()
        # do integrity checks
        ltable = vtable.get_property('ltable')
        rtable = vtable.get_property('rtable')

        self.check_attrs(ltable, rtable, l_block_attr, r_block_attr, None, None)
        l_key = 'ltable.' + ltable.get_key()
        r_key = 'rtable.' + rtable.get_key()

	t000 = time.time()
	print "Time taken to do integrity checks:", (t000 - start_time)
        # convert to dataframes
        l_df = ltable.to_dataframe()
        r_df = rtable.to_dataframe()

	t001 = time.time()
	print "Time taken to convert tables A and B to data frames:", (t001 - t000)
        # set index for convenience
        l_df.set_index(ltable.get_key(), inplace=True)
        r_df.set_index(rtable.get_key(), inplace=True)

	t002 = time.time()
	print "Time taken to set indexes for tables A and B:", (t002 - t001)
        if mg._verbose:
            count = 0
            per_count = math.ceil(mg._percent/100.0*len(vtable))
            print per_count

        elif mg._progbar:
            bar = pyprind.ProgBar(len(vtable))

        column_names = list(vtable.columns)
        #lid_idx = column_names.index(l_key)
        #rid_idx = column_names.index(r_key)
	l_block_attr_idx = column_names.index('ltable.' + l_block_attr)
	r_block_attr_idx = column_names.index('rtable.' + r_block_attr)

        # create look up table for quick access of rows
        #l_dict = {}
        #r_dict = {}

        # keep track of valid ids
        valid = []
        # iterate candidate set and process each row
        for row in vtable.itertuples(index=False):
            if mg._verbose:
                count += 1
                if count%per_count == 0:
                    print str(mg._percent*count/per_count) + ' percentage done !!!'
            elif mg._progbar:
                bar.update()

            # get the value of block attribute from ltuple
	    #row_lid = row[lid_idx]
	    #if row_lid not in l_dict:
	    #	l_dict[row_lid] = row[l_block_attr_idx]
            #l_val = l_dict[row_lid]
	    l_val = row[l_block_attr_idx]

            # get the value of block attribute from rtuple
	    #row_rid = row[rid_idx]
	    #if row_rid not in r_dict:
	    #	r_dict[row_rid] = row[r_block_attr_idx]
            #r_val = r_dict[row_rid]
            r_val = row[r_block_attr_idx]

            if l_val != np.NaN and r_val != np.NaN:
                if l_val == r_val:
                    valid.append(True)
                else:
                    valid.append(False)
            else:
                valid.append(False)
        
	t6 = time.time()
	print "Time taken to get valid ids:", (t6 - t002)
        # should be modified
        if len(vtable) > 0:
            out_table = MTable(vtable[valid], key=vtable.get_key())
        else:
            out_table = MTable(columns=vtable.columns, key=vtable.get_key())
	t7 = time.time()
	print "Time taken to create mtable for candset:", (t7 - t6)
        out_table.set_property('ltable', ltable)
        out_table.set_property('rtable', rtable)
        out_table.set_property('foreign_key_ltable', 'ltable.'+ltable.get_key())
        out_table.set_property('foreign_key_rtable', 'rtable.'+rtable.get_key())
	end_time = time.time()
	print "Time taken to set properties of candset: ", (end_time - t7)
	print "Total time to block candset: ", (end_time - start_time)
        return out_table
Beispiel #20
0
def combine_block_outputs_via_union(blocker_output_list):
    """
    Combine blocker outputs by unioning ltable, rtable ids in candidate set

    Parameters
    ----------
    blocker_output_list : list
        List of blocker outputs (MTables); each must carry 'ltable'/'rtable'
        and foreign-key properties referring to the same pair of base tables

    Returns
    -------
    combined_blocker_output : MTable
        With combined blocker outputs

    Notes
    -----
    Combined_blocker_output contains the following attributes
    * _id
    * combined id pairs (ltable.id, rtable.id) from list of blocker outputs
    * union of non-id attributes from each of blocker output
    """
    # base ltable/rtable shared by all blocker outputs -- presumably
    # lr_tables also validates they agree; confirm against its definition
    ltable, rtable = lr_tables(blocker_output_list)
    # get the attribute names in blocker output that represents ltable, rtable
    l_key = 'ltable.' + ltable.get_key()
    r_key = 'rtable.' + rtable.get_key()

    l_df = ltable.to_dataframe()
    r_df = rtable.to_dataframe()

    # get the union of attribute names from blocker output list
    col_set = set([x for c in blocker_output_list for x in c.columns])
    # split the union into ltable-side and rtable-side column names
    l_col, r_col = lr_cols(col_set)
    l_col = list(l_col)
    r_col = list(r_col)

    l_df = l_df[l_col] # minimally the projection must contain id column
    r_df = r_df[r_col]

    # prefix projected columns so they match the candidate-set naming scheme
    col_names = ['ltable.'+c for c in l_df.columns]
    l_df.columns = col_names
    col_names = ['rtable.'+c for c in r_df.columns]
    r_df.columns = col_names



    # index each side by its (prefixed) key for the row lookups below;
    # drop=False keeps the key available as a regular column as well
    l_df.set_index(l_key, inplace=True, drop=False)
    r_df.set_index(r_key, inplace=True, drop=False)


    # get id pairs (collected from every blocker output, then de-duplicated)
    id_set = []

    for c in blocker_output_list:
        # positions of the two foreign-key columns in this blocker output
        lfid_idx = c.get_attr_names().index(c.get_property('foreign_key_ltable'))
        rfid_idx = c.get_attr_names().index(c.get_property('foreign_key_rtable'))

        for r in c.itertuples(index=False):
            id_set.append((r[lfid_idx], r[rfid_idx]))
    id_set = list(set(id_set))

    # final column order for the output table
    f_cols = fin_cols(l_col, r_col, ltable.get_key(), rtable.get_key())

    if len(id_set) > 0:
        id_df = pd.DataFrame(id_set)
        # fetch the ltable/rtable rows for every surviving id pair
        l_consol_table = l_df.ix[id_df[0]]
        r_consol_table = r_df.ix[id_df[1]]
        l_consol_table.reset_index(inplace=True, drop=True)
        r_consol_table.reset_index(inplace=True, drop=True)

        # stitch left and right halves side by side, one row per id pair,
        # then sort for a deterministic row order
        table = pd.concat([l_consol_table, r_consol_table], axis=1)
        table.sort([l_key, r_key], inplace=True)
        table.reset_index(inplace=True, drop=True)
        table = MTable(table[f_cols])
    else:
        # no pairs survived: still produce a table with the expected schema
        table = MTable([], columns=f_cols)

    # project df and convert to MTable
    table.set_property('ltable', ltable)
    table.set_property('rtable', rtable)
    table.set_property('foreign_key_ltable', 'ltable.'+ltable.get_key())
    table.set_property('foreign_key_rtable', 'rtable.'+rtable.get_key())


    return table
Beispiel #21
0
def extract_feature_vecs(s,
                         attrs_before=None,
                         feature_table=None,
                         attrs_after=None):
    """
    Extract feature vectors

    Parameters
    ----------
    s : MTable,
        labeled virtual MTable or combined blocker output
    attrs_before : list (or a single attribute name), defaults to None
        List of attribute names from "s" to be included in output table before the feature vector
    feature_table : pandas DataFrame, defaults to None
        List of features to be applied; if None, features are obtained via
        mg.get_features_for_blocking (also see: mg.get_features_for_blocking)
    attrs_after : list (or a single attribute name), defaults to None
        List of attribute names from "s" to be included in output table after the feature vector

    Returns
    -------
    feature_vectors : MTable,
        Containing features values (obtained by applying feature fns in feature_table) and attributes as
        mentioned in the input
    """
    logger = logging.getLogger(__name__)
    # basic checks
    assert isJVMStarted(
    ), 'JVM should be started using init_jvm to compute features'
    ltable = s.get_property('ltable')
    rtable = s.get_property('rtable')
    assert ltable is not None, 'Left table is not set'
    assert rtable is not None, 'Right table is not set'

    if feature_table is None:
        feature_table = mg.get_features_for_blocking(ltable, rtable)

    l_key, r_key = s.get_property('foreign_key_ltable'), s.get_property(
        'foreign_key_rtable')
    start = time.time()
    id_list = [(r[l_key], r[r_key]) for i, r in s.iterrows()]
    end = time.time()
    logger.info('Iterating rows (%d) took %f secs' % (len(s), end - start))

    # compute feature values; index ltable/rtable by key for O(1) row lookup
    l_df = ltable.to_dataframe()
    r_df = rtable.to_dataframe()
    l_df.set_index(ltable.get_key(), inplace=True, drop=False)
    r_df.set_index(rtable.get_key(), inplace=True, drop=False)

    start = time.time()
    feat_vals = [
        apply_feat_fns(l_df.ix[x[0]], r_df.ix[x[1]], feature_table)
        for x in id_list
    ]
    end = time.time()
    logger.info('Applying feature functions took : %f secs' % (end - start))
    table = pd.DataFrame(feat_vals, index=s.index.values)
    # get the feature names and re-arrange columns in that order
    feat_names = list(feature_table['feature_name'])
    table = table[feat_names]
    # insert attrs_before.  BUG FIX: the original called attrs_before.reverse()
    # which mutated the caller's list in place; iterating a reversed() view
    # gives the same insertion order without the side effect.
    if attrs_before:
        if not isinstance(attrs_before, list):
            attrs_before = [attrs_before]
        for a in reversed(attrs_before):
            table.insert(0, a, s[a])
    # key columns always come first
    table.insert(0, r_key, s[r_key])
    table.insert(0, l_key, s[l_key])

    # insert attrs after (same no-mutation fix as attrs_before)
    # NOTE(review): appending in reversed() order means attrs_after columns
    # end up in reverse of the given order -- this mirrors the original
    # behavior exactly; confirm whether the reversal was intended.
    if attrs_after:
        if not isinstance(attrs_after, list):
            attrs_after = [attrs_after]
        for a in reversed(attrs_after):
            table.insert(len(table.columns), a, s[a])
    # reset the table index
    table.reset_index(inplace=True, drop=True)

    feature_vectors = MTable(table)
    if s.get_key() not in feature_vectors.columns:
        feature_vectors.add_key(s.get_key())
    # carry over metadata and properties (incl. ltable/rtable refs) from input
    feature_vectors._metadata = s._metadata
    feature_vectors.properties = s.properties
    return feature_vectors
Beispiel #22
0
    def block_candset_skd(self, vtable, l_block_attr, r_block_attr):
        """
        Block candidate set (virtual MTable) based on l_block_attr, r_block_attr equivalence (similar to equi-join)

        Parameters
        ----------
        vtable : MTable
            Input candidate set
        l_block_attr, r_block_attr : string,
            attribute names in ltable, rtable

        Returns
        -------
        blocked_table : MTable
            Containing tuple pairs whose l_block_attr and r_block_attr values are same

        Notes
        -----
        Output MTable contains the following three attributes
            * _id
            * id column from ltable
            * id column from rtable

        Also, the properties of blocked table is updated with following key-value pairs
            * ltable - ref to ltable
            * rtable - ref to rtable
            * key
            * foreign_key_ltable - string, ltable's  id attribute name
            * foreign_key_rtable - string, rtable's id attribute name
        """
	start_time = time.time()
        # do integrity checks
        ltable = vtable.get_property('ltable')
        rtable = vtable.get_property('rtable')

        self.check_attrs(ltable, rtable, l_block_attr, r_block_attr, None, None)
        l_key = 'ltable.' + ltable.get_key()
        r_key = 'rtable.' + rtable.get_key()
	
	t000 = time.time()
	print "Time taken to do integrity checks:", (t000 - start_time)

        # convert to dataframes
        l_df = ltable.to_dataframe()
        r_df = rtable.to_dataframe()

	t001 = time.time()
	print "Time taken to convert tables A and B to data frames:", (t001 - t000)

        # set index for convenience
        l_df.set_index(ltable.get_key(), inplace=True)
        r_df.set_index(rtable.get_key(), inplace=True)

	t002 = time.time()
	print "Time taken to set indexes for tables A and B:", (t002 - t001)

	cpu_count = multiprocessing.cpu_count() 
        #pool = Pool(4)
	t00 = time.time()
        pool = mp.ProcessingPool(processes=cpu_count, maxtasksperchild=1)
	t01 = time.time()
	print "Time taken to initialize the pool of workers:", (t01 - t00)
	c_df = vtable.to_dataframe()
	t0 = time.time()
	print "Time taken to convert mtable to data frame:", (t0 - t01)
        c_splits = np.array_split(c_df, cpu_count)
	t1 = time.time()
	print "Time taken to split table C:", (t1 - t0)
        args_splits = [(c, l_df, r_df, l_key, r_key, l_block_attr, r_block_attr) for c in c_splits]
	t2 = time.time()
	print "Time taken to get args splits:", (t2 - t1)
        valid_splits = pool.map(self.get_valid_ids, args_splits)
	t3 = time.time()
	print "Time taken to get valid splits:", (t3 - t2)
        pool.close()
	t4 = time.time()
	print "Time taken to close    pool:", (t4 - t3)
        pool.join() 
	t5 = time.time()
	print "Time taken to join     pool:", (t5 - t4)
        #valid = pd.concat(valid_splits, ignore_index=True)
	#valid = list(chain(valid_splits))
	valid = sum(valid_splits, [])
	t6 = time.time()
	print "Time taken to combine valid splits:", (t6 - t5)
 
        # should be modified
        if len(vtable) > 0:
            out_table = MTable(vtable[valid], key=vtable.get_key())
        else:
            out_table = MTable(columns=vtable.columns, key=vtable.get_key())
	t7 = time.time()
	print "Time taken to create mtable from data frame:", (t7 - t6)
        out_table.set_property('ltable', ltable)
        out_table.set_property('rtable', rtable)
        out_table.set_property('foreign_key_ltable', 'ltable.'+ltable.get_key())
        out_table.set_property('foreign_key_rtable', 'rtable.'+rtable.get_key())
	end_time = time.time()
	print "Time taken to set properties of mtable:", (end_time - t7)
	print "Time taken to block candset:", (end_time - start_time)
        return out_table
Beispiel #23
0
def _combine_block_outputs_via_union(blocker_output_list):
    """
    Combine blocker outputs by unioning ltable, rtable ids in candidate set

    Parameters
    ----------
    blocker_output_list : list
        List of blocker outputs (MTables); each must carry 'ltable'/'rtable'
        and foreign-key properties referring to the same pair of base tables

    Returns
    -------
    combined_blocker_output : MTable
        With combined blocker outputs

    Notes
    -----
    Combined_blocker_output contains the following attributes
    * _id
    * combined id pairs (ltable.id, rtable.id) from list of blocker outputs
    * union of non-id attributes from each of blocker output
    """
    # base ltable/rtable shared by every blocker output
    ltable, rtable = lr_tables(blocker_output_list)
    # get the attribute names in blocker output that represents ltable, rtable
    l_key = 'ltable.' + ltable.get_key()
    r_key = 'rtable.' + rtable.get_key()

    # get the set of id pairs from all blocker output list (the set removes
    # pairs produced by more than one blocker)
    id_set = set([(r[l_key], r[r_key]) for c in blocker_output_list for i, r in c.iterrows()])

    # get the union of attribute names from blocker output list
    col_set = set([x for c in blocker_output_list for x in c.columns])
    l_col, r_col = lr_cols(col_set)

    # convert ltable, rtable to dfs and set index (drop=False keeps the key
    # as a regular column while also making it the lookup index)
    l_df = ltable.to_dataframe()
    l_df.set_index(ltable.get_key(), inplace=True, drop=False)
    r_df = rtable.to_dataframe()
    r_df.set_index(rtable.get_key(), inplace=True, drop=False)

    # get the l_col, r_col from ltable and rtable respectively:
    # one combined record (dict) per surviving id pair
    dict_list = [get_dict(l_df.ix[x[0]], r_df.ix[x[1]], l_col, r_col) for x in id_set]

    # convert list of dicts to dataframe
    table = pd.DataFrame(dict_list)

    # get the final column names for output table
    f_cols = fin_cols(l_col, r_col, ltable.get_key(), rtable.get_key())


    if len(table) > 0:
        # sort for a deterministic row order (id_set iteration is unordered)
        table.sort([l_key, r_key], inplace=True)
        table.reset_index(inplace=True, drop=True)
        table = MTable(table[f_cols])
    else:
        # empty result: still produce a table with the expected schema
        table = MTable(table, columns=f_cols)

    # project df and convert to MTable
    table.set_property('ltable', ltable)
    table.set_property('rtable', rtable)
    table.set_property('foreign_key_ltable', 'ltable.'+ltable.get_key())
    table.set_property('foreign_key_rtable', 'rtable.'+rtable.get_key())


    return table
Beispiel #24
0
def get_filtered_table(ltable, rtable, corres_list):
    """
    Project ltable and rtable down to the attributes named in corres_list.

    Each element of corres_list is an (ltable_col, rtable_col) pair: the
    first components select columns from ltable, the second components
    select columns from rtable.  Returns the two projected MTables (with
    their original keys preserved) as a (left, right) tuple.
    """
    left_cols = []
    right_cols = []
    for l_col, r_col in corres_list:
        left_cols.append(l_col)
        right_cols.append(r_col)
    filtered_left = MTable(ltable[left_cols], key=ltable.get_key())
    filtered_right = MTable(rtable[right_cols], key=rtable.get_key())
    return filtered_left, filtered_right
Beispiel #25
0
    def block_candset(self, vtable, l_block_attr, r_block_attr):
        """
        Block candidate set (virtual MTable) based on l_block_attr, r_block_attr equivalence (similar to equi-join)

        Parameters
        ----------
        vtable : MTable
            Input candidate set
        l_block_attr, r_block_attr : string,
            attribute names in ltable, rtable

        Returns
        -------
        blocked_table : MTable
            Containing tuple pairs whose l_block_attr and r_block_attr values are same

        Notes
        -----
        Output MTable contains the following three attributes
            * _id
            * id column from ltable
            * id column from rtable

        Also, the properties of blocked table is updated with following key-value pairs
            * ltable - ref to ltable
            * rtable - ref to rtable
            * key
            * foreign_key_ltable - string, ltable's  id attribute name
            * foreign_key_rtable - string, rtable's id attribute name
        """
        # do integrity checks
        ltable = vtable.get_property('ltable')
        rtable = vtable.get_property('rtable')

        self.check_attrs(ltable, rtable, l_block_attr, r_block_attr, None, None)
        l_key = 'ltable.' + ltable.get_key()
        r_key = 'rtable.' + rtable.get_key()

        # convert to dataframes
        l_df = ltable.to_dataframe()
        r_df = rtable.to_dataframe()

        # set index for convenience
        l_df.set_index(ltable.get_key(), inplace=True)
        r_df.set_index(rtable.get_key(), inplace=True)

        if mg._verbose:
            count = 0
            per_count = math.ceil(mg._percent/100.0*len(vtable))
            print per_count

        elif mg._progbar:
            bar = pyprind.ProgBar(len(vtable))


        # keep track of valid ids
        valid = []
        # iterate candidate set and process each row
        for idx, row in vtable.iterrows():

            if mg._verbose:
                count += 1
                if count%per_count == 0:
                    print str(mg._percent*count/per_count) + ' percentage done !!!'
            elif mg._progbar:
                bar.update()

            # get the value of block attribute from ltuple
            l_val = l_df.ix[row[l_key], l_block_attr]
            r_val = r_df.ix[row[r_key], r_block_attr]
            if l_val != np.NaN and r_val != np.NaN:
                if l_val == r_val:
                    valid.append(True)
                else:
                    valid.append(False)
            else:
                valid.append(False)
        
        # should be modified
        if len(vtable) > 0:
            out_table = MTable(vtable[valid], key=vtable.get_key())
        else:
            out_table = MTable(columns=vtable.columns, key=vtable.get_key())
        out_table.set_property('ltable', ltable)
        out_table.set_property('rtable', rtable)
        out_table.set_property('foreign_key_ltable', 'ltable.'+ltable.get_key())
        out_table.set_property('foreign_key_rtable', 'rtable.'+rtable.get_key())
        return out_table
Beispiel #26
0
    def block_tables_opt(self, ltable, rtable, l_block_attr, r_block_attr,
                     l_output_attrs=None, r_output_attrs=None):
        """
        Block tables based on l_block_attr, r_block_attr equivalence (similar to equi-join)

        Parameters
        ----------
        ltable, rtable : MTable
            Input MTables
        l_block_attr, r_block_attr : string,
            attribute names in ltable, rtable
        l_output_attrs, r_output_attrs : list (of strings), defaults to None
            attribute names to be included in the output table

        Returns
        -------
        blocked_table : MTable
            Containing tuple pairs whose l_block_attr and r_block_attr values are same

        Notes
        -----
        Output MTable contains the following three attributes
            * _id
            * id column from ltable
            * id column from rtable

        Also, the properties of blocked table is updated with following key-value pairs
            * ltable - ref to ltable
            * rtable - ref to rtable
            * key
            * foreign_key_ltable - string, ltable's  id attribute name
            * foreign_key_rtable - string, rtable's id attribute name
        """

        # do integrity checks
        l_output_attrs, r_output_attrs = self.check_attrs(ltable, rtable, l_block_attr, r_block_attr,
                                                     l_output_attrs, r_output_attrs)
        # remove nans
        l_df = self.rem_nan(ltable, l_block_attr)
        r_df = self.rem_nan(rtable, r_block_attr)
	
	t00 = time.time()
	lk = ltable.get_key()
	l_output_attrs_1 = l_output_attrs;
	if lk not in l_output_attrs_1:
 	    l_output_attrs_1.append(lk)
	if l_block_attr not in l_output_attrs_1:
 	    l_output_attrs_1.append(l_block_attr)

	r_output_attrs_1 = r_output_attrs
	rk = rtable.get_key()
	if rk not in r_output_attrs_1:
 	    r_output_attrs_1.append(rk)
	if r_block_attr not in r_output_attrs_1:
 	    r_output_attrs_1.append(r_block_attr)

	print l_output_attrs_1
	print r_output_attrs_1

	l_df_1 = l_df[l_output_attrs_1]
	r_df_1 = r_df[r_output_attrs_1]
	
	#l_df_1.set_index(l_block_attr)
	#r_df_1.set_index(r_block_attr)
	t0 = time.time()
        candset = pd.merge(l_df_1, r_df_1, left_on=l_block_attr, right_on=r_block_attr,
                           suffixes=('_ltable', '_rtable'))
	print list(candset)
	t1 = time.time()
        # get output columns
        retain_cols, final_cols = self.output_columns(ltable.get_key(), rtable.get_key(), list(candset.columns),
                                                   l_output_attrs, r_output_attrs)
	print "retain_cols: ", retain_cols
	print "final_cols: ", final_cols

	t2 = time.time()
        candset = candset[retain_cols]
	t3 = time.time()
        candset.columns = final_cols
	t4 = time.time()
        candset = MTable(candset)
	t5 = time.time()

        # set metadata
        candset.set_property('ltable', ltable)
        candset.set_property('rtable', rtable)
        candset.set_property('foreign_key_ltable', 'ltable.'+ltable.get_key())
        candset.set_property('foreign_key_rtable', 'rtable.'+rtable.get_key())	
	t6 = time.time()
	print "Time taken to project A and B:", (t0 - t00)
	print "Time taken to merge A and B:", (t1 - t0)
	print "Time taken to get output cols:", (t2 - t1)
	print "Time taken to project C cols:", (t3 - t2)
	print "Time taken to set C final cols:", (t4 - t3)
	print "Time taken to create table C:", (t5 - t4)
	print "Time taken to set props for C:", (t6 - t5)

        return candset
Beispiel #27
0
def debug_blocker(ltable, rtable, candidate_set, pred_list_size=200, field_corres_list=None):
    """
    Debug the blocker. The basic idea is trying to suggest the user a list of record pairs
    out of the candidate set with high (document) jaccard similarity. The object of similarity
    measurement (document) is generated based on a string concatenation method with field
    selection. Given the suggestion list, the user should go through the pairs and determine if
    there are, and how many true matches in it. And based on this information, the user can
    determine if further improvement on the blocking step is necessary (Ex. if there are many
    true matches in the list, the user may conclude that the blocking step is flawed, and should
    revise it to produce a better candidate set).

    Parameters
    ----------
    ltable, rtable : MTable
        Input MTables
    candidate_set : MTable
        The candidate set table after performing blocking on ltable and rtable
    pred_list_size : int
        The size of the output suggestion list
    field_corres_list : list (of tuples), defaults to None
        The list of field pairs from ltable and rtable. Each pair indicates a field correspondence
        between two tables. Since ltable and rtable can have different schemas, it's necessary to
        have this parameter to build the field correspondence to make sure the string concatenation
        algorithm runs correctly.
        Note each pair in the list should be represented as a tuple in the following format:
                            (some_ltable_field, some_rtable_field)

    Returns
    -------
    suggestion_table : MTable
        Contains a list of pair suggestions with high jaccard similarity. The output MTable contains
        the following fields:
            * _id
            * similarity (of the record pair)
            * ltable record key value
            * rtable record key value
            * field pairs from filtered corres_list (removing the numeric types)
                ltable_field_1
                rtable_field_1 (corresponding to ltable_field_1)
                ltable_field_2
                rtable_field_2 (corresponding to ltable_field_2)
                      .
                      .
                ltable_field_k
                rtable_field_k (corresponding to ltable_field_k)
    """

    # Basic checks.
    # NOTE(review): StandardError is Python 2-only; use Exception under Python 3.
    if len(ltable) == 0:
        raise StandardError('Error: ltable is empty!')
    if len(rtable) == 0:
        raise StandardError('Error: rtable is empty!')
    if pred_list_size <= 0:
        raise StandardError('The input parameter: \'pred_list_size\' is less than or equal to 0. Nothing needs to be done!')

    logging.info('\nPreparing for debugging blocker')

    # Check the user input field corres list (if exists) and get the raw version of
    # our internal corres list.
    check_input_field_correspondence_list(ltable, rtable, field_corres_list)
    corres_list = get_field_correspondence_list(ltable, rtable, field_corres_list)

    # Build the (col_name: col_index) dict to speed up locating a field in the schema.
    ltable_col_dict = build_col_name_index_dict(ltable)
    rtable_col_dict = build_col_name_index_dict(rtable)

    # Filter correspondence list to remove numeric types. We only consider string types
    # for document concatenation.
    filter_corres_list(ltable, rtable, ltable_col_dict, rtable_col_dict, corres_list)
    # logging.info('\nFiltered field correspondence list:\n' + str(corres_list))

    # Get field filtered new table (both tables projected to the corres fields).
    ltable_filtered, rtable_filtered = get_filtered_table(ltable, rtable, corres_list)

    # Select features.
    """TODO(hanli): currently we don't select the key fields even if they have the largest score.
    # This is because the values of the key field could be simply domain-specific serial numbers,
    # which might be meaningless or even harmful (when two tables use different key formats).
    # Modify it if this ituition is not proper."""
    feature_list = select_features(ltable_filtered, rtable_filtered)
    if len(feature_list) == 0:
        raise StandardError('\nError: the selected field list is empty, nothing could be done! ' +
                            'Please check if all table fields are numeric types.')
    # logging.info('\nSelected fields for concatenation:\n' + str([(ltable_filtered.columns[i],
    #  rtable_filtered.columns[i]) for i in feature_list]))

    # Get each table kgram dict (record key -> set of 3-grams of the
    # concatenated selected fields).
    ltable_kgram_dict = get_kgram_dict(ltable_filtered, ltable_filtered.get_key(), feature_list, 3)
    rtable_kgram_dict = get_kgram_dict(rtable_filtered, rtable_filtered.get_key(), feature_list, 3)

    # Build inverted index on ltable kgrams to speed up debugging.
    inverted_index = build_inverted_index(ltable_kgram_dict)

    ltable_key = candidate_set.get_property('foreign_key_ltable')
    rtable_key = candidate_set.get_property('foreign_key_rtable')

    # Index the candidate set by (rtable key, ltable key) so that, for a given
    # rtable record, its already-blocked ltable partners can be looked up fast.
    indexed_candidate_set = candidate_set.set_index([rtable_key, ltable_key], drop=False)
    candidate_index_key_set = set(indexed_candidate_set[rtable_key])
    # logging.info('\nCandidate set size: %d' %(len(indexed_candidate_set)))

    # Precompute the record counts at which each 10% progress mark is logged.
    rtable_len = len(rtable_filtered)
    progress_dict = {}
    for i in range(10):
        progress_dict[int((i + 1) * 1.0 / 10 * rtable_len)] = (i + 1) * 1.0 / 10

    logging.info('\nStart debugging blocker')
    # Min-heap of (similarity, lkey, rkey); heappushpop keeps only the
    # pred_list_size pairs with the highest jaccard similarity.
    pred_index_list = []
    count = 0
    for rkey in rtable_kgram_dict:
        count += 1
        # if count == 500:
            # print pred_index_list
        #     break
        if rtable_len <= 10:
            logging.info('\nDebugging %s' %('{percent:.2%}'.format(percent=count * 1.0 / rtable_len)))
        else:
            if count in progress_dict:
                logging.info('\nDebugging %s' %('{percent:.2%}'.format(percent=progress_dict[count])))

        rkgram_set = rtable_kgram_dict[rkey]
        if len(rkgram_set) == 0:
            continue
        # ltable keys already paired with this rtable record by the blocker;
        # those are skipped below since the goal is to surface MISSED pairs.
        cand_set = {}
        if rkey in candidate_index_key_set:
            cand_set = indexed_candidate_set.ix[rkey].index.values
        # Candidate ltable records sharing at least one kgram with this record.
        ltable_index_set = get_potential_match_set(rkgram_set, inverted_index)
        for lkey in ltable_index_set:
            if lkey in cand_set:
                continue
            jac_sim = jaccard_kgram_sim(ltable_kgram_dict[lkey], rkgram_set)
            if len(pred_index_list) == pred_list_size:
                hq.heappushpop(pred_index_list, (jac_sim, lkey, rkey))
            else:
                hq.heappush(pred_index_list, (jac_sim, lkey, rkey))

    ret_data_frame = generate_prediction_table(ltable_filtered, rtable_filtered, pred_index_list)

    """This print is for debugging"""
    #print ret_data_frame

    ret_mtable = MTable(ret_data_frame)
    ret_mtable.set_property('foreign_key_ltable', ltable_key)
    ret_mtable.set_property('foreign_key_rtable', rtable_key)
    ret_mtable.set_property('ltable', ltable)
    ret_mtable.set_property('rtable', rtable)

    logging.info('\nFinish debugging blocker')

    return ret_mtable
Beispiel #28
0
def debug_blocker(ltable,
                  rtable,
                  candidate_set,
                  output_size=200,
                  attr_corres=None):
    """
    Debug the blocker by suggesting record pairs that were dropped by blocking
    but have high (document) jaccard similarity.

    Each record is turned into a "document" by concatenating a selected subset
    of its string fields; pairs NOT present in the candidate set whose
    documents have the highest jaccard similarity (over 3-grams) are reported.
    The user should go through the suggested pairs and determine how many true
    matches appear in them. If there are many true matches in the list, the
    blocking step is likely flawed and should be revised to produce a better
    candidate set.

    Parameters
    ----------
    ltable, rtable : MTable
        Input MTables
    candidate_set : MTable
        The candidate set table produced by performing blocking on ltable
        and rtable
    output_size : int, defaults to 200
        The size of the output suggestion list
    attr_corres : list (of tuples), defaults to None
        The list of field pairs from ltable and rtable. Each pair indicates a
        field correspondence between the two tables. Since ltable and rtable
        can have different schemas, this parameter is needed to build the
        field correspondence so the string concatenation pairs up the right
        fields. Each pair is a tuple in the following format:
                            (some_ltable_field, some_rtable_field)

    Returns
    -------
    suggestion_table : MTable
        Pair suggestions with high jaccard similarity. The output MTable
        contains the following fields:
            * _id
            * similarity (of the record pair)
            * ltable record key value
            * rtable record key value
            * the field pairs from the filtered correspondence list
              (numeric-type fields removed):
                ltable_field_1, rtable_field_1, ...,
                ltable_field_k, rtable_field_k

    Raises
    ------
    StandardError
        If either input table is empty, if output_size is not positive, or
        if no string fields survive the correspondence filtering.
    """
    # Basic sanity checks.
    if len(ltable) == 0:
        raise StandardError('Error: ltable is empty!')
    if len(rtable) == 0:
        raise StandardError('Error: rtable is empty!')
    if output_size <= 0:
        # The message previously referred to 'pred_list_size', which is not
        # the name of any parameter; the actual parameter is 'output_size'.
        raise StandardError(
            'The input parameter: \'output_size\' is less than or equal to 0. Nothing needs to be done!'
        )

    # Validate the user-supplied correspondence list (if any) and build the
    # raw internal correspondence list.
    check_input_field_correspondence_list(ltable, rtable, attr_corres)
    corres_list = get_field_correspondence_list(ltable, rtable, attr_corres)

    # (col_name -> col_index) dicts to speed up locating a field in a schema.
    ltable_col_dict = build_col_name_index_dict(ltable)
    rtable_col_dict = build_col_name_index_dict(rtable)

    # Drop numeric-type correspondences: only string fields take part in the
    # document concatenation.
    filter_corres_list(ltable, rtable, ltable_col_dict, rtable_col_dict,
                       corres_list)

    # Project both tables down to the corresponding fields only.
    ltable_filtered, rtable_filtered = get_filtered_table(
        ltable, rtable, corres_list)

    # Select the fields used for concatenation.
    # TODO: key fields are never selected, even if they have the largest
    # score, because key values may be domain-specific serial numbers that
    # are meaningless or even harmful (e.g. when the two tables use different
    # key formats). Revisit if this intuition turns out to be wrong.
    feature_list = select_features(ltable_filtered, rtable_filtered)
    if len(feature_list) == 0:
        raise StandardError(
            '\nError: the selected field list is empty, nothing could be done! '
            + 'Please check if all table fields are numeric types.')

    # 3-gram sets of each record's concatenated document, keyed by record key.
    ltable_kgram_dict = get_kgram_dict(ltable_filtered,
                                       ltable_filtered.get_key(), feature_list,
                                       3)
    rtable_kgram_dict = get_kgram_dict(rtable_filtered,
                                       rtable_filtered.get_key(), feature_list,
                                       3)

    # Inverted index (kgram -> ltable keys) to prune the pair space.
    inverted_index = build_inverted_index(ltable_kgram_dict)

    ltable_key = candidate_set.get_property('foreign_key_ltable')
    rtable_key = candidate_set.get_property('foreign_key_rtable')

    # Index the candidate set by (rtable key, ltable key) so that, for a
    # given rtable record, its blocked ltable partners can be looked up fast.
    indexed_candidate_set = candidate_set.set_index([rtable_key, ltable_key],
                                                    drop=False)
    candidate_index_key_set = set(indexed_candidate_set[rtable_key])

    # Min-heap of (similarity, lkey, rkey) tuples; keeps the output_size
    # most similar pairs seen so far.
    pred_index_list = []

    # Optional progress reporting, driven by the package-level mg flags.
    if mg._verbose:
        count = 0
        per_count = math.ceil(mg._percent / 100.0 * len(rtable))
    elif mg._progbar:
        bar = pyprind.ProgBar(len(rtable))

    for rkey in rtable_kgram_dict:
        if mg._verbose:
            count += 1
            if count % per_count == 0:
                print(str(
                    mg._percent * count / per_count) + ' percentage done !!!')
        elif mg._progbar:
            bar.update()

        rkgram_set = rtable_kgram_dict[rkey]
        if len(rkgram_set) == 0:
            # Empty document: it cannot share a kgram with anything.
            continue

        # ltable keys already paired with this rtable record by the blocker;
        # such pairs are skipped since they are already in the candidate set.
        cand_set = set()
        if rkey in candidate_index_key_set:
            cand_set = indexed_candidate_set.ix[rkey].index.values

        # Only ltable records sharing at least one kgram can have sim > 0.
        ltable_index_set = get_potential_match_set(rkgram_set, inverted_index)
        for lkey in ltable_index_set:
            if lkey in cand_set:
                continue
            jac_sim = jaccard_kgram_sim(ltable_kgram_dict[lkey], rkgram_set)
            if len(pred_index_list) == output_size:
                # Heap full: replace the current minimum if this pair beats it.
                hq.heappushpop(pred_index_list, (jac_sim, lkey, rkey))
            else:
                hq.heappush(pred_index_list, (jac_sim, lkey, rkey))

    ret_data_frame = generate_prediction_table(ltable_filtered,
                                               rtable_filtered,
                                               pred_index_list)

    # Carry the table metadata over to the result so downstream commands can
    # resolve the pair key values back to the original tables.
    ret_mtable = MTable(ret_data_frame)
    ret_mtable.set_property('foreign_key_ltable', ltable_key)
    ret_mtable.set_property('foreign_key_rtable', rtable_key)
    ret_mtable.set_property('ltable', ltable)
    ret_mtable.set_property('rtable', rtable)

    return ret_mtable