Beispiel #1
0
def read_csv_(*args, **kwargs):
    """
    Load a CSV (comma-separated) file with pandas and wrap it in an MTable.

    Parameters
    ----------
    args : positional arguments, forwarded unchanged to pandas read_csv
    kwargs : keyword arguments forwarded to pandas read_csv, except for the
        optional "key" entry, which is removed before the call and handed to
        the MTable constructor instead

    Returns
    -------
    result : MTable
        Built with the supplied key when "key" was given (and is not None);
        otherwise built from the dataframe alone, leaving the choice of key
        to MTable.
    """
    # "key" is not a pandas option, so it must not reach pd.read_csv
    key_attr = kwargs.pop('key', None)
    frame = pd.read_csv(*args, **kwargs)
    if key_attr is None:
        return MTable(frame)
    return MTable(frame, key=key_attr)
Beispiel #2
0
def create_mtable(table,
                  key=None,
                  ltable=None,
                  rtable=None,
                  foreign_key_ltable=None,
                  foreign_key_rtable=None):
    """
    Build an MTable from a dataframe, optionally attaching vtable metadata.

    Parameters
    ----------
    table : data handed to the MTable constructor
    key : passed to the MTable constructor as its key, defaults to None
    ltable, rtable, foreign_key_ltable, foreign_key_rtable : defaults to None
        Stored as properties of the same names on the result, but only when
        all four are given (i.e. none of them is None).

    Returns
    -------
    out_table : MTable

    Notes
    -----
    If only some of the four metadata values are given, none of them is set
    and a warning is logged. If none is given, nothing is logged.
    """
    out_table = MTable(table, key=key)

    # property name -> supplied value, in the order they are stored
    vtable_props = (
        ('ltable', ltable),
        ('rtable', rtable),
        ('foreign_key_ltable', foreign_key_ltable),
        ('foreign_key_rtable', foreign_key_rtable),
    )
    n_given = sum(1 for _, value in vtable_props if value is not None)

    if n_given == len(vtable_props):
        for prop_name, value in vtable_props:
            out_table.set_property(prop_name, value)
    elif n_given > 0:
        logging.getLogger(__name__).warning(
            'Not all the properties for vtable are given; so not setting '
            'any of them')

    return out_table
Beispiel #3
0
def sample_table(table, size, replace=False):
    """
    Draw a random sample of rows from an MTable.

    Parameters
    ----------
    table : MTable, input table to be sampled
    size : int, number of rows to draw
    replace : boolean, whether rows are drawn with replacement.
            Defaults to False.

    Returns
    -------
    sampled_table : MTable
        The drawn rows, ordered by their position in the input table, with
        the input table's key. Its `properties` attribute is the same object
        as the input table's (assigned, not copied).

    Raises
    ------
    AttributeError
        If the input table is empty, or if `size` exceeds the number of rows
        (this is checked even when `replace` is True).
    """
    n_rows = len(table)
    if n_rows == 0:
        raise AttributeError('size of table is 0')
    if size > n_rows:
        raise AttributeError('sample size is larger than input table size')

    # pick row positions, then order them so the sample keeps table order
    picked = list(np.random.choice(n_rows, size, replace=replace))
    picked.sort()

    result = MTable(table.iloc[picked], key=table.get_key())
    result.properties = table.properties
    return result
Beispiel #4
0
def down_sample(ltable, rtable, size, y):
    """
    Down-sample a pair of tables.

    Parameters
    ----------
    ltable, rtable : MTable
        Input tables.
    size : number
        Together with `y` it fixes how many rows are drawn at random from
        the table that `_order_tables` returns second: floor(size / y),
        capped at that table's length.
    y : number
        Divisor for `size`; also forwarded to `_probe_index`.
        NOTE(review): presumably the number of rows probed from the other
        table per sampled row - confirm against `_probe_index`.

    Returns
    -------
    (l_sampled, r_sampled) : tuple of MTable
        Sampled rows of ltable and rtable. Each keeps the key of its source
        table, and its `properties` attribute is the source table's
        `properties` object (assigned, not copied).
    """
    # NOTE(review): _order_tables presumably returns (smaller, bigger,
    # swapped-flag) - confirm; the code only relies on the flag telling
    # whether s_table corresponds to rtable instead of ltable.
    s_table, b_table, is_swapped = _order_tables(ltable, rtable)
    s_inv_index = _inv_index(s_table)

    # math.floor returns a float on Python 2; np.random.choice needs an
    # integral sample size
    b_sample_size = min(int(math.floor(size / y)), len(b_table))
    b_tbl_indices = list(
        np.random.choice(len(b_table), b_sample_size, replace=False))

    # the drawn indices are row positions, so select with iloc (as the final
    # projections below do) rather than the label-first .ix
    s_tbl_indices = _probe_index(b_table.iloc[b_tbl_indices], y, len(s_table),
                                 s_inv_index)
    s_tbl_indices = list(s_tbl_indices)

    # after this, s_tbl_indices refers to ltable and b_tbl_indices to rtable
    if is_swapped:
        s_tbl_indices, b_tbl_indices = b_tbl_indices, s_tbl_indices

    l_sampled = MTable(ltable.iloc[list(s_tbl_indices)], key=ltable.get_key())
    l_sampled.properties = ltable.properties
    r_sampled = MTable(rtable.iloc[list(b_tbl_indices)], key=rtable.get_key())
    r_sampled.properties = rtable.properties
    return l_sampled, r_sampled
Beispiel #5
0
def read_csv(file_path, **kwargs):
    """
    Read CSV (comma-separated) file into MTable

    Parameters
    ----------
    file_path : path of the CSV file; handed both to the metadata reader and
        to pandas read_csv
    kwargs : arguments to pandas read_csv command along with optional "key"
        if it is an MTable, or "key", "ltable", "rtable",
        "foreign_key_ltable", "foreign_key_rtable" if it is a VTable.
        A caller-supplied "skiprows" is overwritten with the number of
        metadata lines found at the top of the file.

    Returns
    -------
    result : MTable
        Keyed on the "key" property when one is available; every other
        property is stored on the table with set_property.

    Note
    ----
    read_csv can read in the meta data mentioned at the beginning of the file
    like this:

    #key=id

    A user can override or supply metadata as key-value args to the function

    """
    # metadata embedded in the file, and how many leading lines it occupies
    properties, num_lines = get_properties_from_file(file_path)
    # merge in caller-supplied metadata; returns the kwargs to use from here on
    properties, kwargs = update_properties(properties, **kwargs)
    check_properties(properties)

    # skip the metadata lines so pandas starts at the CSV header
    kwargs['skiprows'] = num_lines
    df = pd.read_csv(file_path, **kwargs)

    # the key goes to the constructor; it is not stored as a plain property
    key = properties.pop('key', None)
    if key is not None:
        df = MTable(df, key=key)
    else:
        df = MTable(df)

    # items() works on both Python 2 and 3 (iteritems() is Python 2 only)
    for k, v in properties.items():
        df.set_property(k, v)
    return df
Beispiel #6
0
def label_table(tbl, col_name, replace=True):
    """
    Let the user label a copy of the given table.

    Parameters
    ----------
    tbl : MTable, table to be labeled (it is copied, not modified)
    col_name : String, name of the label column
    replace : Boolean, whether an already existing column named 'col_name'
        is reset to 0 before editing.
        [This option is currently experimental].

    Returns
    -------
    result : MTable
        Labeled copy with the key of `tbl`; its `properties` attribute is the
        `properties` object of `tbl` (assigned, not copied).

    Notes
    -----
    The label column is cast to int after editing and must then contain only
    0 or 1; otherwise an AssertionError is raised.
    """
    from magellan.gui.mtable_gui import edit
    labeled = tbl.copy()

    if col_name not in labeled.columns:
        # fresh label column, initialised to 0
        labeled[col_name] = 0
    elif replace == True:
        logging.getLogger(__name__).warning(
            'Input table already contains column %s. ' % col_name)
        labeled[col_name] = 0

    # hand the table to the editor so the labels can be filled in
    mg.edit(labeled)
    labeled[col_name] = labeled[col_name].astype(int)

    # every row must carry a label of exactly 0 or 1
    is_one = labeled[col_name] == 1
    is_zero = labeled[col_name] == 0
    n_binary = sum(is_one | is_zero)
    assert n_binary == len(
        labeled), 'The label column contains values other than 0 and 1'

    labeled = MTable(labeled, key=tbl.get_key())
    labeled.properties = tbl.properties
    return labeled
Beispiel #7
0
def _combine_block_outputs_via_union(blocker_output_list):
    """
    Union the (ltable id, rtable id) pairs of several blocker outputs.

    Parameters
    ----------
    blocker_output_list : list
        List of blocker outputs

    Returns
    -------
    combined_blocker_output : MTable
        One row per distinct id pair found in any of the blocker outputs,
        ordered by the ltable and then the rtable foreign-key column.

    Notes
    -----
    * The output columns are those returned by `fin_cols`; their values are
      looked up again in ltable / rtable, one pair at a time, via `get_dict`.
    * The properties 'ltable', 'rtable', 'foreign_key_ltable' and
      'foreign_key_rtable' are set on the result.
    """
    ltable, rtable = lr_tables(blocker_output_list)
    l_id_attr = ltable.get_key()
    r_id_attr = rtable.get_key()
    # names of the candidate-set columns that point into ltable / rtable
    fk_ltable = 'ltable.' + l_id_attr
    fk_rtable = 'rtable.' + r_id_attr

    # distinct id pairs over all blocker outputs
    pair_ids = set()
    for cand in blocker_output_list:
        for _, row in cand.iterrows():
            pair_ids.add((row[fk_ltable], row[fk_rtable]))

    # every attribute name that occurs in any blocker output
    seen_cols = set()
    for cand in blocker_output_list:
        seen_cols.update(cand.columns)
    l_col, r_col = lr_cols(seen_cols)

    # base tables as dataframes indexed by key (key column kept as data too)
    left_frame = ltable.to_dataframe()
    left_frame.set_index(l_id_attr, inplace=True, drop=False)
    right_frame = rtable.to_dataframe()
    right_frame.set_index(r_id_attr, inplace=True, drop=False)

    # one dict per id pair, built from the matching ltable / rtable rows
    records = []
    for l_id, r_id in pair_ids:
        records.append(
            get_dict(left_frame.ix[l_id], right_frame.ix[r_id], l_col, r_col))
    table = pd.DataFrame(records)

    # final column names for the output table
    f_cols = fin_cols(l_col, r_col, l_id_attr, r_id_attr)

    if len(table) > 0:
        table.sort([fk_ltable, fk_rtable], inplace=True)
        table.reset_index(inplace=True, drop=True)
        table = MTable(table[f_cols])
    else:
        # no pairs at all: empty table that still carries the output columns
        table = MTable(table, columns=f_cols)

    # attach the candidate-set metadata
    table.set_property('ltable', ltable)
    table.set_property('rtable', rtable)
    table.set_property('foreign_key_ltable', fk_ltable)
    table.set_property('foreign_key_rtable', fk_rtable)

    return table
Beispiel #8
0
def combine_block_outputs_via_union(blocker_output_list):
    """
    Union the (ltable id, rtable id) pairs of several blocker outputs.

    Parameters
    ----------
    blocker_output_list : list
        List of blocker outputs

    Returns
    -------
    combined_blocker_output : MTable
        One row per distinct id pair found in any of the blocker outputs,
        ordered by the ltable and then the rtable foreign-key column.

    Notes
    -----
    * The id pairs are read from the columns named by each blocker output's
      'foreign_key_ltable' / 'foreign_key_rtable' properties.
    * The output columns are those returned by `fin_cols`; their values are
      looked up again in ltable / rtable in bulk.
    * The properties 'ltable', 'rtable', 'foreign_key_ltable' and
      'foreign_key_rtable' are set on the result.
    """
    ltable, rtable = lr_tables(blocker_output_list)
    # names of the candidate-set columns that point into ltable / rtable
    fk_ltable = 'ltable.' + ltable.get_key()
    fk_rtable = 'rtable.' + rtable.get_key()

    left_frame = ltable.to_dataframe()
    right_frame = rtable.to_dataframe()

    # every attribute name that occurs in any blocker output
    seen_cols = {name for cand in blocker_output_list for name in cand.columns}
    l_col, r_col = lr_cols(seen_cols)
    l_col, r_col = list(l_col), list(r_col)

    # project the base tables (the projection must at least contain the id
    # column) and prefix the headers the way candidate sets name them
    left_frame = left_frame[l_col]
    right_frame = right_frame[r_col]
    left_frame.columns = ['ltable.' + c for c in left_frame.columns]
    right_frame.columns = ['rtable.' + c for c in right_frame.columns]

    # index by the prefixed key, keeping the key as a regular column as well
    left_frame.set_index(fk_ltable, inplace=True, drop=False)
    right_frame.set_index(fk_rtable, inplace=True, drop=False)

    # distinct id pairs over all blocker outputs
    pair_ids = set()
    for cand in blocker_output_list:
        attr_names = cand.get_attr_names()
        l_pos = attr_names.index(cand.get_property('foreign_key_ltable'))
        r_pos = attr_names.index(cand.get_property('foreign_key_rtable'))
        for tup in cand.itertuples(index=False):
            pair_ids.add((tup[l_pos], tup[r_pos]))
    pair_ids = list(pair_ids)

    f_cols = fin_cols(l_col, r_col, ltable.get_key(), rtable.get_key())

    if pair_ids:
        id_df = pd.DataFrame(pair_ids)
        # pull the ltable / rtable rows for each pair, then glue them side
        # by side (both halves are re-indexed 0..n-1 so the rows line up)
        l_part = left_frame.ix[id_df[0]]
        r_part = right_frame.ix[id_df[1]]
        l_part.reset_index(inplace=True, drop=True)
        r_part.reset_index(inplace=True, drop=True)

        table = pd.concat([l_part, r_part], axis=1)
        table.sort([fk_ltable, fk_rtable], inplace=True)
        table.reset_index(inplace=True, drop=True)
        table = MTable(table[f_cols])
    else:
        # no pairs at all: empty table that still carries the output columns
        table = MTable([], columns=f_cols)

    # attach the candidate-set metadata
    table.set_property('ltable', ltable)
    table.set_property('rtable', rtable)
    table.set_property('foreign_key_ltable', fk_ltable)
    table.set_property('foreign_key_rtable', fk_rtable)

    return table
Beispiel #9
0
    def block_candset(self, vtable, l_block_attr, r_block_attr):
        """
        Block candidate set (virtual MTable) based on l_block_attr, r_block_attr equivalence (similar to equi-join)

        Parameters
        ----------
        vtable : MTable
            Input candidate set; its 'ltable' and 'rtable' properties must be set
        l_block_attr, r_block_attr : string,
            attribute names in ltable, rtable

        Returns
        -------
        blocked_table : MTable
            The rows of vtable whose ltable tuple and rtable tuple have the
            same, non-missing value for l_block_attr / r_block_attr

        Notes
        -----
        The output keeps the key of vtable, and the following properties are set on it
            * ltable - ref to ltable
            * rtable - ref to rtable
            * foreign_key_ltable - string, 'ltable.' + ltable's key attribute name
            * foreign_key_rtable - string, 'rtable.' + rtable's key attribute name

        A pair is dropped when either block-attribute value is missing
        (NaN / None), matching block_tables, which removes such rows before
        joining.
        """
        # do integrity checks
        ltable = vtable.get_property('ltable')
        rtable = vtable.get_property('rtable')

        self.check_attrs(ltable, rtable, l_block_attr, r_block_attr, None,
                         None)
        l_key = 'ltable.' + ltable.get_key()
        r_key = 'rtable.' + rtable.get_key()

        # convert to dataframes
        l_df = ltable.to_dataframe()
        r_df = rtable.to_dataframe()

        # index by key so a tuple can be fetched by the id stored in vtable
        l_df.set_index(ltable.get_key(), inplace=True)
        r_df.set_index(rtable.get_key(), inplace=True)

        # progress reporting: either periodic prints or a progress bar
        if mg._verbose:
            count = 0
            per_count = math.ceil(mg._percent / 100.0 * len(vtable))
            print(per_count)

        elif mg._progbar:
            bar = pyprind.ProgBar(len(vtable))

        # one boolean per candidate row: True if the pair survives blocking
        valid = []
        # iterate candidate set and process each row
        for idx, row in vtable.iterrows():

            if mg._verbose:
                count += 1
                if count % per_count == 0:
                    print(str(mg._percent * count / per_count) +
                          ' percentage done !!!')
            elif mg._progbar:
                bar.update()

            # get the values of the block attributes for this pair
            l_val = l_df.ix[row[l_key], l_block_attr]
            r_val = r_df.ix[row[r_key], r_block_attr]

            # NaN never compares equal to anything, so `x != np.NaN` is
            # always True and cannot detect missing values; use pd.isnull
            if pd.isnull(l_val) or pd.isnull(r_val):
                valid.append(False)
            else:
                valid.append(bool(l_val == r_val))

        # should be modified
        if len(vtable) > 0:
            out_table = MTable(vtable[valid], key=vtable.get_key())
        else:
            out_table = MTable(columns=vtable.columns, key=vtable.get_key())
        out_table.set_property('ltable', ltable)
        out_table.set_property('rtable', rtable)
        out_table.set_property('foreign_key_ltable',
                               'ltable.' + ltable.get_key())
        out_table.set_property('foreign_key_rtable',
                               'rtable.' + rtable.get_key())
        return out_table
Beispiel #10
0
    def block_tables(self, ltable, rtable, l_block_attr, r_block_attr,
                     l_output_attrs=None, r_output_attrs=None):
        """
        Equi-join two tables on their block attributes to form a candidate set.

        Parameters
        ----------
        ltable, rtable : MTable
            Input MTables
        l_block_attr, r_block_attr : string,
            attribute names in ltable, rtable
        l_output_attrs, r_output_attrs : list (of strings), defaults to None
            attribute names to be included in the output table

        Returns
        -------
        blocked_table : MTable
            Containing tuple pairs whose l_block_attr and r_block_attr values are same

        Notes
        -----
        The columns kept from the join and their final names are decided by
        `self.output_columns`.

        The following properties are set on the blocked table
            * ltable - ref to ltable
            * rtable - ref to rtable
            * foreign_key_ltable - string, 'ltable.' + ltable's key attribute name
            * foreign_key_rtable - string, 'rtable.' + rtable's key attribute name
        """
        # integrity checks; check_attrs hands back the output attribute lists
        checked = self.check_attrs(ltable, rtable, l_block_attr, r_block_attr,
                                   l_output_attrs, r_output_attrs)
        l_output_attrs, r_output_attrs = checked

        # rows with a missing block attribute are removed before joining
        left_rows = self.rem_nan(ltable, l_block_attr)
        right_rows = self.rem_nan(rtable, r_block_attr)

        joined = pd.merge(left_rows, right_rows,
                          left_on=l_block_attr, right_on=r_block_attr,
                          suffixes=('_ltable', '_rtable'))

        # which joined columns to keep, and what to call them in the output
        l_id_attr = ltable.get_key()
        r_id_attr = rtable.get_key()
        retain_cols, final_cols = self.output_columns(
            l_id_attr, r_id_attr, list(joined.columns),
            l_output_attrs, r_output_attrs)

        projected = joined[retain_cols]
        projected.columns = final_cols
        candset = MTable(projected)

        # set metadata
        metadata = (
            ('ltable', ltable),
            ('rtable', rtable),
            ('foreign_key_ltable', 'ltable.' + ltable.get_key()),
            ('foreign_key_rtable', 'rtable.' + rtable.get_key()),
        )
        for prop_name, value in metadata:
            candset.set_property(prop_name, value)
        return candset
Beispiel #11
0
def extract_feature_vecs(s,
                         attrs_before=None,
                         feature_table=None,
                         attrs_after=None):
    """
    Extract feature vectors

    Parameters
    ----------
    s : MTable,
        labeled virtual MTable or combined blocker output; its 'ltable',
        'rtable', 'foreign_key_ltable' and 'foreign_key_rtable' properties
        are read
    attrs_before : list, defaults to None
        List of attribute names from "s" to be included in output table before the feature vector.
        A single non-list value is treated as a one-element list.
        The caller's list is not modified.
    feature_table : pandas DataFrame, defaults to None
        List of features to be applied; must have a 'feature_name' column.
        When None, mg.get_features_for_blocking(ltable, rtable) is used.
    attrs_after : list, defaults to None
        List of attribute names from "s" to be included in output table after the feature vector.
        A single non-list value is treated as a one-element list.
        The caller's list is not modified.

    Returns
    -------
    feature_vectors : MTable,
        Columns, left to right: the ltable foreign key, the rtable foreign
        key, attrs_before (in the given order), the features (in
        feature_table order), attrs_after (in REVERSE of the given order -
        historical behaviour, kept for backward compatibility).
        `_metadata` and `properties` are taken over from "s" (assigned, not
        copied).

    Raises
    ------
    AssertionError
        If the JVM has not been started, or the left / right table property
        is not set on "s".
    """
    # basic checks
    assert isJVMStarted(
    ), 'JVM should be started using init_jvm to compute features'
    ltable = s.get_property('ltable')
    rtable = s.get_property('rtable')
    assert ltable is not None, 'Left table is not set'
    assert rtable is not None, 'Right table is not set'

    if feature_table is None:
        feature_table = mg.get_features_for_blocking(ltable, rtable)

    l_key, r_key = s.get_property('foreign_key_ltable'), s.get_property(
        'foreign_key_rtable')
    start = time.time()
    id_list = [(r[l_key], r[r_key]) for i, r in s.iterrows()]
    end = time.time()
    logging.getLogger(__name__).info('Iterating rows (%d) took %f secs' %
                                     (len(s), end - start))

    # index the base tables by key so tuples can be fetched by id
    l_df = ltable.to_dataframe()
    r_df = rtable.to_dataframe()
    l_df.set_index(ltable.get_key(), inplace=True, drop=False)
    r_df.set_index(rtable.get_key(), inplace=True, drop=False)

    # compute feature values, one result per candidate pair
    start = time.time()
    feat_vals = [
        apply_feat_fns(l_df.ix[x[0]], r_df.ix[x[1]], feature_table)
        for x in id_list
    ]
    end = time.time()
    logging.getLogger(__name__).info(
        'Applying feature functions took : %f secs' % (end - start))
    # reuse the index of "s" so columns copied from "s" below align row-wise
    table = pd.DataFrame(feat_vals, index=s.index.values)
    # get the feature names and re-arrange columns in that order
    feat_names = list(feature_table['feature_name'])
    table = table[feat_names]

    # insert attrs_before: each goes to position 0, so walking the list
    # backwards leaves the attributes in their given order. reversed() is
    # used instead of list.reverse() so the caller's list is not mutated.
    if attrs_before:
        if not isinstance(attrs_before, list):
            attrs_before = [attrs_before]
        for a in reversed(attrs_before):
            table.insert(0, a, s[a])
    table.insert(0, r_key, s[r_key])
    table.insert(0, l_key, s[l_key])

    # insert attrs_after: appended while walking the list backwards, which
    # is the column order the original implementation produced
    if attrs_after:
        if not isinstance(attrs_after, list):
            attrs_after = [attrs_after]
        for a in reversed(attrs_after):
            table.insert(len(table.columns), a, s[a])
    # reset the table index
    table.reset_index(inplace=True, drop=True)

    feature_vectors = MTable(table)
    if s.get_key() not in feature_vectors.columns:
        feature_vectors.add_key(s.get_key())
    # metadata
    feature_vectors._metadata = s._metadata
    feature_vectors.properties = s.properties
    return feature_vectors
Beispiel #12
0
def get_filtered_table(ltable, rtable, corres_list):
    """
    Project both tables onto the attributes named in a correspondence list.

    Parameters
    ----------
    ltable, rtable : MTable
        Input tables
    corres_list : list (of pairs)
        Each entry holds an ltable attribute name at position 0 and the
        corresponding rtable attribute name at position 1.

    Returns
    -------
    (l_mtable, r_mtable) : tuple of MTable
        ltable restricted to the position-0 attributes and rtable restricted
        to the position-1 attributes, each keeping its original key.
    """
    l_attrs = []
    r_attrs = []
    for pair in corres_list:
        l_attrs.append(pair[0])
        r_attrs.append(pair[1])

    l_projected = MTable(ltable[l_attrs], key=ltable.get_key())
    r_projected = MTable(rtable[r_attrs], key=rtable.get_key())
    return l_projected, r_projected
Beispiel #13
0
def debug_blocker(ltable,
                  rtable,
                  candidate_set,
                  output_size=200,
                  attr_corres=None):
    """
    Debug the blocker. The basic idea is trying to suggest the user a list of record pairs
    out of the candidate set with high (document) jaccard similarity. The object of similarity
    measurement (document) is generated based on a string concatenation method with field
    selection. Given the suggestion list, the user should go through the pairs and determine if
    there are, and how many true matches in it. And based on this information, the user can
    determine if further improvement on the blocking step is necessary (Ex. if there are many
    true matches in the list, the user may conclude that the blocking step is flawed, and should
    revise it to produce a better candidate set).

    Parameters
    ----------
    ltable, rtable : MTable
        Input MTables
    candidate_set : MTable
        The candidate set table after performing blocking on ltable and rtable
    output_size : int, defaults to 200
        The maximum size of the output suggestion list
    attr_corres : list (of tuples), defaults to None
        The list of field pairs from ltable and rtable. Each pair indicates a field correspondence
        between two tables. Since ltable and rtable can have different schemas, it's necessary to
        have this parameter to build the field correspondence to make sure the string concatenation
        algorithm runs correctly.
        Note each pair in the list should be represented as a tuple in the following format:
                            (some_ltable_field, some_rtable_field)

    Returns
    -------
    suggestion_table : MTable
        Built from the dataframe returned by generate_prediction_table for
        the (at most output_size) most similar pairs that are not in the
        candidate set. As documented for that table, it contains
        the following fields:
            * _id
            * similarity (of the record pair)
            * ltable record key value
            * rtable record key value
            * field pairs from filtered corres_list (removing the numeric types)
                ltable_field_1
                rtable_field_1 (corresponding to ltable_field_1)
                ltable_field_2
                rtable_field_2 (corresponding to ltable_field_2)
                      .
                      .
                ltable_field_k
                rtable_field_k (corresponding to ltable_field_k)
        The properties 'ltable', 'rtable', 'foreign_key_ltable' and
        'foreign_key_rtable' are set on it.

    Raises
    ------
    StandardError
        If ltable or rtable is empty, if output_size is not positive, or if
        no field is selected for concatenation.
    """

    # Basic checks.
    if len(ltable) == 0:
        raise StandardError('Error: ltable is empty!')
    if len(rtable) == 0:
        raise StandardError('Error: rtable is empty!')
    if output_size <= 0:
        raise StandardError(
            'The input parameter: \'output_size\' is less than or equal to 0. Nothing needs to be done!'
        )

    # Check the user input field correspondence list (if exists) and get the raw version of
    # our internal correspondence list.
    check_input_field_correspondence_list(ltable, rtable, attr_corres)
    corres_list = get_field_correspondence_list(ltable, rtable, attr_corres)

    # Build the (col_name: col_index) dict to speed up locating a field in the schema.
    ltable_col_dict = build_col_name_index_dict(ltable)
    rtable_col_dict = build_col_name_index_dict(rtable)

    # Filter correspondence list to remove numeric types. We only consider string types
    # for document concatenation. (corres_list is updated in place: the
    # helper's return value is not used.)
    filter_corres_list(ltable, rtable, ltable_col_dict, rtable_col_dict,
                       corres_list)

    # Get field filtered new table.
    ltable_filtered, rtable_filtered = get_filtered_table(
        ltable, rtable, corres_list)

    # Select features.
    # TODO: currently we don't select the key fields even if they have the largest score.
    # This is because the values of the key field could be simply domain-specific serial numbers,
    # which might be meaningless or even harmful (when two tables use different key formats).
    # Modify it if this intuition is not proper.
    feature_list = select_features(ltable_filtered, rtable_filtered)
    if len(feature_list) == 0:
        raise StandardError(
            '\nError: the selected field list is empty, nothing could be done! '
            + 'Please check if all table fields are numeric types.')

    # Get each table kgram dict.
    ltable_kgram_dict = get_kgram_dict(ltable_filtered,
                                       ltable_filtered.get_key(), feature_list,
                                       3)
    rtable_kgram_dict = get_kgram_dict(rtable_filtered,
                                       rtable_filtered.get_key(), feature_list,
                                       3)

    # Build inverted index on ltable kgrams to speed up debugging.
    inverted_index = build_inverted_index(ltable_kgram_dict)

    ltable_key = candidate_set.get_property('foreign_key_ltable')
    rtable_key = candidate_set.get_property('foreign_key_rtable')

    # Index the candidate set by (rtable id, ltable id) so that all ltable ids
    # already paired with a given rtable id can be fetched in one lookup.
    indexed_candidate_set = candidate_set.set_index([rtable_key, ltable_key],
                                                    drop=False)
    candidate_index_key_set = set(indexed_candidate_set[rtable_key])

    # Min-heap of (similarity, ltable id, rtable id), capped at output_size.
    pred_index_list = []

    # Progress reporting: either periodic prints or a progress bar.
    if mg._verbose:
        count_ = 0
        per_count = math.ceil(mg._percent / 100.0 * len(rtable))
    elif mg._progbar:
        bar = pyprind.ProgBar(len(rtable))

    for rkey in rtable_kgram_dict:
        if mg._verbose:
            count_ += 1
            if count_ % per_count == 0:
                print(str(mg._percent * count_ / per_count) +
                      ' percentage done !!!')
        elif mg._progbar:
            bar.update()

        rkgram_set = rtable_kgram_dict[rkey]
        if len(rkgram_set) == 0:
            continue

        # ltable ids already paired with this rtable id in the candidate set
        cand_set = {}
        if rkey in candidate_index_key_set:
            cand_set = indexed_candidate_set.ix[rkey].index.values

        ltable_index_set = get_potential_match_set(rkgram_set, inverted_index)
        for lkey in ltable_index_set:
            # only pairs the blocker dropped are of interest
            if lkey in cand_set:
                continue
            jac_sim = jaccard_kgram_sim(ltable_kgram_dict[lkey], rkgram_set)
            if len(pred_index_list) == output_size:
                # heap is full: push the new pair and drop the least similar
                hq.heappushpop(pred_index_list, (jac_sim, lkey, rkey))
            else:
                hq.heappush(pred_index_list, (jac_sim, lkey, rkey))

    ret_data_frame = generate_prediction_table(ltable_filtered,
                                               rtable_filtered,
                                               pred_index_list)

    ret_mtable = MTable(ret_data_frame)
    ret_mtable.set_property('foreign_key_ltable', ltable_key)
    ret_mtable.set_property('foreign_key_rtable', rtable_key)
    ret_mtable.set_property('ltable', ltable)
    ret_mtable.set_property('rtable', rtable)

    return ret_mtable