Beispiel #1
0
    def filter_candset(self,
                       candset,
                       candset_l_key_attr,
                       candset_r_key_attr,
                       ltable,
                       rtable,
                       l_key_attr,
                       r_key_attr,
                       l_filter_attr,
                       r_filter_attr,
                       n_jobs=1):
        """Apply this filter to an existing candidate set.

        Args:
        candset : Pandas data frame, candidate pairs to be filtered
        candset_l_key_attr, candset_r_key_attr : String, key attributes in candset (that refer to ltable and rtable)
        ltable, rtable : Pandas data frame, base tables from which candset was obtained
        l_key_attr, r_key_attr : String, key attribute from ltable and rtable
        l_filter_attr, r_filter_attr : String, filter attribute from ltable and rtable
        n_jobs : int, 1 filters the whole candset in a single call; any other
                 value splits the candset with split_table and hands the
                 splits to Parallel

        Returns:
        result : Pandas data frame. An empty candset is returned as-is;
                 in the parallel case the per-split results are concatenated.
        """
        # Nothing to filter: hand the empty candset straight back.
        if candset.empty:
            return candset

        # Everything the worker needs besides the candset (or candset split)
        # itself, in the positional order the worker expects.
        shared_args = (candset_l_key_attr, candset_r_key_attr,
                       ltable, rtable,
                       l_key_attr, r_key_attr,
                       l_filter_attr, r_filter_attr,
                       self)

        # Single job: run the worker directly on the full candset.
        if n_jobs == 1:
            return _filter_candset_split(candset, *shared_args)

        # Multiple jobs: one worker call per candset split, then stitch the
        # partial results back together.
        pieces = split_table(candset, n_jobs)
        partial_results = Parallel(n_jobs=n_jobs)(
            delayed(_filter_candset_split)(piece, *shared_args)
            for piece in pieces)
        return pd.concat(partial_results)
Beispiel #2
0
def edit_dist_join(ltable, rtable,
                   l_key_attr, r_key_attr,
                   l_join_attr, r_join_attr,
                   threshold,
                   l_out_attrs=None, r_out_attrs=None,
                   l_out_prefix='l_', r_out_prefix='r_',
                   out_sim_score=True, n_jobs=1,
                   tokenizer=create_qgram_tokenizer(2)):
    """Join two tables using edit distance similarity measure.

    Finds tuple pairs from ltable and rtable such that
    EditDistance(ltable.l_join_attr, rtable.r_join_attr) <= threshold

    Args:
    ltable, rtable : Pandas data frame
    l_key_attr, r_key_attr : String, key attribute from ltable and rtable
    l_join_attr, r_join_attr : String, join attribute from ltable and rtable
    threshold : int, edit distance threshold to be satisfied (a float is
                floored to an integer after validation)
    l_out_attrs, r_out_attrs : list of attributes to be included in the output table from ltable and rtable
    l_out_prefix, r_out_prefix : String, prefix to be used in the attribute names of the output table
    out_sim_score : boolean, indicates if edit distance needs to be included in the output table
    n_jobs : int, 1 performs the join in a single call; any other value
             splits rtable with split_table and hands the splits to Parallel
    tokenizer : Tokenizer object, tokenizer to be used to tokenize join attributes.
                The default is built once, when this function is defined,
                and is therefore shared by all calls that rely on it.

    Returns:
    result : Pandas data frame whose first column '_id' numbers the output
             rows 0..len-1
    """
    # check if the input tables are dataframes
    validate_input_table(ltable, 'left table')
    validate_input_table(rtable, 'right table')

    # check if the key attributes and join attributes exist
    validate_attr(l_key_attr, ltable.columns,
                  'key attribute', 'left table')
    validate_attr(r_key_attr, rtable.columns,
                  'key attribute', 'right table')
    validate_attr(l_join_attr, ltable.columns,
                  'join attribute', 'left table')
    validate_attr(r_join_attr, rtable.columns,
                  'join attribute', 'right table')

    # check if the input tokenizer is valid
    validate_tokenizer(tokenizer)

    # check if the input threshold is valid
    validate_threshold(threshold, 'EDIT_DISTANCE')

    # check if the output attributes exist
    validate_output_attrs(l_out_attrs, ltable.columns,
                          r_out_attrs, rtable.columns)

    # check if the key attributes are unique and do not contain missing values
    validate_key_attr(l_key_attr, ltable, 'left table')
    validate_key_attr(r_key_attr, rtable, 'right table')

    # convert threshold to integer (in case it is a float)
    threshold = int(floor(threshold))

    if n_jobs == 1:
        output_table = _edit_dist_join_split(ltable, rtable,
                                             l_key_attr, r_key_attr,
                                             l_join_attr, r_join_attr,
                                             tokenizer,
                                             threshold,
                                             l_out_attrs, r_out_attrs,
                                             l_out_prefix, r_out_prefix,
                                             out_sim_score)
    else:
        # join ltable against each split of rtable in parallel
        r_splits = split_table(rtable, n_jobs)
        results = Parallel(n_jobs=n_jobs)(delayed(_edit_dist_join_split)(
                                             ltable, r_split,
                                             l_key_attr, r_key_attr,
                                             l_join_attr, r_join_attr,
                                             tokenizer,
                                             threshold,
                                             l_out_attrs, r_out_attrs,
                                             l_out_prefix, r_out_prefix,
                                             out_sim_score)
                                          for r_split in r_splits)
        # Each partial result carries its own row labels; a plain concat
        # would keep them and could repeat labels across splits. Renumber
        # the rows so the index is unique and lines up with '_id'.
        output_table = pd.concat(results, ignore_index=True)

    # number the output pairs in a leading '_id' column
    output_table.insert(0, '_id', range(0, len(output_table)))
    return output_table
    def filter_tables(self,
                      ltable,
                      rtable,
                      l_key_attr,
                      r_key_attr,
                      l_filter_attr,
                      r_filter_attr,
                      l_out_attrs=None,
                      r_out_attrs=None,
                      l_out_prefix='l_',
                      r_out_prefix='r_',
                      n_jobs=1):
        """Filter tables with size filter.

        Args:
        ltable, rtable : Pandas data frame
        l_key_attr, r_key_attr : String, key attribute from ltable and rtable
        l_filter_attr, r_filter_attr : String, filter attribute from ltable and rtable
        l_out_attrs, r_out_attrs : list of attributes to be included in the output table from ltable and rtable
        l_out_prefix, r_out_prefix : String, prefix to be used in the attribute names of the output table
        n_jobs : int, 1 filters in a single call; any other value splits
                 rtable with split_table and hands the splits to Parallel

        Returns:
        result : Pandas data frame whose first column '_id' numbers the
                 output rows 0..len-1
        """
        # check if the input tables are dataframes
        validate_input_table(ltable, 'left table')
        validate_input_table(rtable, 'right table')

        # check if the key attributes and filter attributes exist
        validate_attr(l_key_attr, ltable.columns, 'key attribute',
                      'left table')
        validate_attr(r_key_attr, rtable.columns, 'key attribute',
                      'right table')
        validate_attr(l_filter_attr, ltable.columns, 'filter attribute',
                      'left table')
        validate_attr(r_filter_attr, rtable.columns, 'filter attribute',
                      'right table')

        # check if the output attributes exist
        validate_output_attrs(l_out_attrs, ltable.columns, r_out_attrs,
                              rtable.columns)

        # check if the key attributes are unique and do not contain missing values
        validate_key_attr(l_key_attr, ltable, 'left table')
        validate_key_attr(r_key_attr, rtable, 'right table')

        if n_jobs == 1:
            output_table = _filter_tables_split(ltable, rtable, l_key_attr,
                                                r_key_attr, l_filter_attr,
                                                r_filter_attr, self,
                                                l_out_attrs, r_out_attrs,
                                                l_out_prefix, r_out_prefix)
        else:
            # filter ltable against each split of rtable in parallel
            rtable_splits = split_table(rtable, n_jobs)
            results = Parallel(n_jobs=n_jobs)(delayed(_filter_tables_split)(
                ltable, rtable_split, l_key_attr, r_key_attr, l_filter_attr,
                r_filter_attr, self, l_out_attrs, r_out_attrs, l_out_prefix,
                r_out_prefix) for rtable_split in rtable_splits)
            # Each partial result carries its own row labels; a plain concat
            # would keep them and could repeat labels across splits. Renumber
            # the rows so the index is unique and lines up with '_id'.
            output_table = pd.concat(results, ignore_index=True)

        # number the output pairs in a leading '_id' column
        output_table.insert(0, '_id', range(0, len(output_table)))
        return output_table
Beispiel #4
0
def jaccard_join(ltable, rtable,
                 l_key_attr, r_key_attr,
                 l_join_attr, r_join_attr,
                 tokenizer,
                 threshold,
                 l_out_attrs=None, r_out_attrs=None,
                 l_out_prefix='l_', r_out_prefix='r_',
                 out_sim_score=True,
                 n_jobs=1):
    """Join two tables using jaccard similarity measure.

    Finds tuple pairs from ltable and rtable such that
    Jaccard(ltable.l_join_attr, rtable.r_join_attr) >= threshold

    Args:
    ltable, rtable : Pandas data frame
    l_key_attr, r_key_attr : String, key attribute from ltable and rtable
    l_join_attr, r_join_attr : String, join attribute from ltable and rtable
    tokenizer : Tokenizer object, tokenizer to be used to tokenize join attributes
    threshold : float, jaccard threshold to be satisfied
    l_out_attrs, r_out_attrs : list of attributes to be included in the output table from ltable and rtable
    l_out_prefix, r_out_prefix : String, prefix to be used in the attribute names of the output table
    out_sim_score : boolean, indicates if similarity score needs to be included in the output table
    n_jobs : int, 1 performs the join in a single call; any other value
             splits rtable with split_table and hands the splits to Parallel

    Returns:
    result : Pandas data frame whose first column '_id' numbers the output
             rows 0..len-1
    """
    # check if the input tables are dataframes
    validate_input_table(ltable, 'left table')
    validate_input_table(rtable, 'right table')

    # check if the key attributes and join attributes exist
    validate_attr(l_key_attr, ltable.columns,
                  'key attribute', 'left table')
    validate_attr(r_key_attr, rtable.columns,
                  'key attribute', 'right table')
    validate_attr(l_join_attr, ltable.columns,
                  'join attribute', 'left table')
    validate_attr(r_join_attr, rtable.columns,
                  'join attribute', 'right table')

    # check if the input tokenizer is valid
    validate_tokenizer(tokenizer)

    # check if the input threshold is valid
    validate_threshold(threshold, 'JACCARD')

    # check if the output attributes exist
    validate_output_attrs(l_out_attrs, ltable.columns,
                          r_out_attrs, rtable.columns)

    # check if the key attributes are unique and do not contain missing values
    validate_key_attr(l_key_attr, ltable, 'left table')
    validate_key_attr(r_key_attr, rtable, 'right table')

    if n_jobs == 1:
        output_table = _set_sim_join_split(ltable, rtable,
                                           l_key_attr, r_key_attr,
                                           l_join_attr, r_join_attr,
                                           tokenizer,
                                           'JACCARD',
                                           threshold,
                                           l_out_attrs, r_out_attrs,
                                           l_out_prefix, r_out_prefix,
                                           out_sim_score)
    else:
        # join ltable against each split of rtable in parallel
        r_splits = split_table(rtable, n_jobs)
        results = Parallel(n_jobs=n_jobs)(delayed(_set_sim_join_split)(
                                              ltable, r_split,
                                              l_key_attr, r_key_attr,
                                              l_join_attr, r_join_attr,
                                              tokenizer,
                                              'JACCARD',
                                              threshold,
                                              l_out_attrs, r_out_attrs,
                                              l_out_prefix, r_out_prefix,
                                              out_sim_score)
                                          for r_split in r_splits)
        # Each partial result carries its own row labels; a plain concat
        # would keep them and could repeat labels across splits. Renumber
        # the rows so the index is unique and lines up with '_id'.
        output_table = pd.concat(results, ignore_index=True)

    # number the output pairs in a leading '_id' column
    output_table.insert(0, '_id', range(0, len(output_table)))
    return output_table