Beispiel #1
0
def dice_join(ltable,
              rtable,
              l_key_attr,
              r_key_attr,
              l_join_attr,
              r_join_attr,
              tokenizer,
              threshold,
              comp_op='>=',
              allow_empty=True,
              allow_missing=False,
              l_out_attrs=None,
              r_out_attrs=None,
              l_out_prefix='l_',
              r_out_prefix='r_',
              out_sim_score=True,
              n_jobs=1,
              show_progress=True):
    """Join two tables using Dice similarity measure.

    For two sets X and Y, the Dice similarity score between them is given by:

        :math:`dice(X, Y) = \\frac{2 * |X \\cap Y|}{|X| + |Y|}`

    In the case where both X and Y are empty sets, we define their Dice
    score to be 1.

    Finds tuple pairs from left table and right table such that the Dice
    similarity between the join attributes satisfies the condition on input
    threshold. For example, if the comparison operator is '>=', finds tuple
    pairs whose Dice similarity between the strings that are the values of
    the join attributes is greater than or equal to the input threshold, as
    specified in "threshold".

    Note:
        If the tokenizer's return_set flag is False, it is switched to True
        for the duration of the join and switched back to False before this
        function exits, whether it returns normally or raises.

    Args:
        ltable (DataFrame): left input table.

        rtable (DataFrame): right input table.

        l_key_attr (string): key attribute in left table.

        r_key_attr (string): key attribute in right table.

        l_join_attr (string): join attribute in left table.

        r_join_attr (string): join attribute in right table.

        tokenizer (Tokenizer): tokenizer to be used to tokenize join
            attributes.

        threshold (float): Dice similarity threshold to be satisfied.

        comp_op (string): comparison operator. Supported values are '>=', '>'
            and '=' (defaults to '>=').

        allow_empty (boolean): flag to indicate whether tuple pairs with empty
            set of tokens in both the join attributes should be included in the
            output (defaults to True).

        allow_missing (boolean): flag to indicate whether tuple pairs with
            missing value in at least one of the join attributes should be
            included in the output (defaults to False). If this flag is set to
            True, a tuple in ltable with missing value in the join attribute
            will be matched with every tuple in rtable and vice versa.

        l_out_attrs (list): list of attribute names from the left table to be
            included in the output table (defaults to None).

        r_out_attrs (list): list of attribute names from the right table to be
            included in the output table (defaults to None).

        l_out_prefix (string): prefix to be used for the attribute names coming
            from the left table, in the output table (defaults to 'l\\_').

        r_out_prefix (string): prefix to be used for the attribute names coming
            from the right table, in the output table (defaults to 'r\\_').

        out_sim_score (boolean): flag to indicate whether similarity score
            should be included in the output table (defaults to True). Setting
            this flag to True will add a column named '_sim_score' in the
            output table. This column will contain the similarity scores for the
            tuple pairs in the output.

        n_jobs (int): number of parallel jobs to use for the computation
            (defaults to 1). If -1 is given, all CPUs are used. If 1 is given,
            no parallel computing code is used at all, which is useful for
            debugging. For n_jobs below -1, (n_cpus + 1 + n_jobs) are used
            (where n_cpus is the total number of CPUs in the machine). Thus for
            n_jobs = -2, all CPUs but one are used. If (n_cpus + 1 + n_jobs)
            becomes less than 1, then no parallel computing code will be used
            (i.e., equivalent to the default). The number of jobs actually
            launched never exceeds the number of right-table rows that take
            part in the join.

        show_progress (boolean): flag to indicate whether task progress should
            be displayed to the user (defaults to True). When running in
            parallel, only the last job is asked to show progress.

    Returns:
        An output table containing tuple pairs that satisfy the join
        condition (DataFrame). An '_id' column holding 0..len-1 is inserted
        as the first column.

    Raises:
        Whatever the validation helpers raise for invalid inputs; their
        exception types are defined outside this function.
    """

    # check if the input tables are dataframes
    validate_input_table(ltable, 'left table')
    validate_input_table(rtable, 'right table')

    # check if the key attributes and join attributes exist
    validate_attr(l_key_attr, ltable.columns, 'key attribute', 'left table')
    validate_attr(r_key_attr, rtable.columns, 'key attribute', 'right table')
    validate_attr(l_join_attr, ltable.columns, 'join attribute', 'left table')
    validate_attr(r_join_attr, rtable.columns, 'join attribute', 'right table')

    # check if the join attributes are not of numeric type
    validate_attr_type(l_join_attr, ltable[l_join_attr].dtype,
                       'join attribute', 'left table')
    validate_attr_type(r_join_attr, rtable[r_join_attr].dtype,
                       'join attribute', 'right table')

    # check if the input tokenizer is valid
    validate_tokenizer(tokenizer)

    # check if the input threshold is valid
    validate_threshold(threshold, 'DICE')

    # check if the comparison operator is valid
    validate_comp_op_for_sim_measure(comp_op, 'DICE')

    # check if the output attributes exist
    validate_output_attrs(l_out_attrs, ltable.columns, r_out_attrs,
                          rtable.columns)

    # check if the key attributes are unique and do not contain missing values
    validate_key_attr(l_key_attr, ltable, 'left table')
    validate_key_attr(r_key_attr, rtable, 'right table')

    # set return_set flag of tokenizer to be True, in case it is set to False
    revert_tokenizer_return_set_flag = False
    if not tokenizer.get_return_set():
        tokenizer.set_return_set(True)
        revert_tokenizer_return_set_flag = True

    # Everything after the tokenizer has been mutated runs under try/finally
    # so that the caller's tokenizer is restored even if the join fails.
    try:
        # remove redundant attrs from output attrs.
        l_out_attrs = remove_redundant_attrs(l_out_attrs, l_key_attr)
        r_out_attrs = remove_redundant_attrs(r_out_attrs, r_key_attr)

        # get attributes to project.
        l_proj_attrs = get_attrs_to_project(l_out_attrs, l_key_attr,
                                            l_join_attr)
        r_proj_attrs = get_attrs_to_project(r_out_attrs, r_key_attr,
                                            r_join_attr)

        # Do a projection on the input dataframes to keep only the required
        # attributes. Then, remove rows with missing value in join attribute
        # from the input dataframes. Then, convert the resulting dataframes
        # into ndarray.
        ltable_array = convert_dataframe_to_array(ltable, l_proj_attrs,
                                                  l_join_attr)
        rtable_array = convert_dataframe_to_array(rtable, r_proj_attrs,
                                                  r_join_attr)

        # computes the actual number of jobs to launch; never more jobs than
        # there are right-table rows to split.
        n_jobs = min(get_num_processes_to_launch(n_jobs), len(rtable_array))

        if n_jobs <= 1:
            # if n_jobs is 1 (or less), do not use any parallel code.
            output_table = set_sim_join(
                ltable_array, rtable_array, l_proj_attrs, r_proj_attrs,
                l_key_attr, r_key_attr, l_join_attr, r_join_attr, tokenizer,
                'DICE', threshold, comp_op, allow_empty, l_out_attrs,
                r_out_attrs, l_out_prefix, r_out_prefix, out_sim_score,
                show_progress)
        else:
            # if n_jobs is above 1, split the right table into n_jobs splits
            # and join each right table split with the whole of left table in
            # a separate process. Only the last job reports progress.
            r_splits = split_table(rtable_array, n_jobs)
            results = Parallel(n_jobs=n_jobs)(delayed(set_sim_join)(
                ltable_array, r_splits[job_index], l_proj_attrs, r_proj_attrs,
                l_key_attr, r_key_attr, l_join_attr, r_join_attr, tokenizer,
                'DICE', threshold, comp_op, allow_empty, l_out_attrs,
                r_out_attrs, l_out_prefix, r_out_prefix, out_sim_score, (
                    show_progress and (job_index == n_jobs - 1)))
                                              for job_index in range(n_jobs))
            output_table = pd.concat(results)

        # If allow_missing flag is set, then compute all pairs with missing
        # value in at least one of the join attributes and then add it to the
        # output obtained from the join.
        if allow_missing:
            missing_pairs = get_pairs_with_missing_value(
                ltable, rtable, l_key_attr, r_key_attr, l_join_attr,
                r_join_attr, l_out_attrs, r_out_attrs, l_out_prefix,
                r_out_prefix, out_sim_score, show_progress)
            output_table = pd.concat([output_table, missing_pairs])

        # add an id column named '_id' to the output table.
        output_table.insert(0, '_id', range(0, len(output_table)))
    finally:
        # revert the return_set flag of tokenizer, in case it was modified.
        if revert_tokenizer_return_set_flag:
            tokenizer.set_return_set(False)

    return output_table
    def filter_tables(self, ltable, rtable,
                      l_key_attr, r_key_attr,
                      l_filter_attr, r_filter_attr,
                      l_out_attrs=None, r_out_attrs=None,
                      l_out_prefix='l_', r_out_prefix='r_',
                      out_sim_score=False, n_jobs=1, show_progress=True):
        """Find candidate matching pairs of strings from the input tables
        using the overlap filtering technique.

        The right table is optionally split across several jobs; each split
        is filtered against the whole left table and the partial results are
        concatenated. If this filter's ``allow_missing`` attribute is set,
        pairs with a missing value in at least one of the filter attributes
        are appended to the result.

        Args:
            ltable (DataFrame): left input table.

            rtable (DataFrame): right input table.

            l_key_attr (string): key attribute in left table.

            r_key_attr (string): key attribute in right table.

            l_filter_attr (string): attribute in left table on which the
                filter should be applied.

            r_filter_attr (string): attribute in right table on which the
                filter should be applied.

            l_out_attrs (list): list of attribute names from the left table
                to be included in the output table (defaults to None).

            r_out_attrs (list): list of attribute names from the right table
                to be included in the output table (defaults to None).

            l_out_prefix (string): prefix to be used for the attribute names
                coming from the left table, in the output table
                (defaults to 'l\\_').

            r_out_prefix (string): prefix to be used for the attribute names
                coming from the right table, in the output table
                (defaults to 'r\\_').

            out_sim_score (boolean): flag to indicate whether the overlap
                score should be included in the output table (defaults to
                False). Setting this flag to True will add a column named
                '_sim_score' in the output table. This column will contain
                the overlap scores for the tuple pairs in the output.

            n_jobs (int): number of parallel jobs to use for the computation
                (defaults to 1). If -1 is given, all CPUs are used. If 1 is
                given, no parallel computing code is used at all, which is
                useful for debugging. For n_jobs below -1,
                (n_cpus + 1 + n_jobs) are used (where n_cpus is the total
                number of CPUs in the machine). Thus for n_jobs = -2, all
                CPUs but one are used. If (n_cpus + 1 + n_jobs) becomes less
                than 1, then no parallel computing code will be used (i.e.,
                equivalent to the default).

            show_progress (boolean): flag to indicate whether task progress
                should be displayed to the user (defaults to True). In the
                parallel case only the last job is asked to show progress.

        Returns:
            An output table containing tuple pairs that survive the filter
            (DataFrame), with an '_id' column inserted as the first column.
        """
        sides = ((ltable, 'left table'), (rtable, 'right table'))

        # Both inputs have to be dataframes.
        for table, table_label in sides:
            validate_input_table(table, table_label)

        # Key attributes first, then filter attributes, must exist in their
        # respective tables.
        for attr, table, attr_label, table_label in (
                (l_key_attr, ltable, 'key attribute', 'left table'),
                (r_key_attr, rtable, 'key attribute', 'right table'),
                (l_filter_attr, ltable, 'attribute', 'left table'),
                (r_filter_attr, rtable, 'attribute', 'right table')):
            validate_attr(attr, table.columns, attr_label, table_label)

        # Filter attributes must not be of numeric type.
        for attr, table, table_label in (
                (l_filter_attr, ltable, 'left table'),
                (r_filter_attr, rtable, 'right table')):
            validate_attr_type(attr, table[attr].dtype,
                               'attribute', table_label)

        # Requested output attributes must exist.
        validate_output_attrs(l_out_attrs, ltable.columns,
                              r_out_attrs, rtable.columns)

        # Key attributes must be unique and free of missing values.
        for key_attr, table, table_label in (
                (l_key_attr, ltable, 'left table'),
                (r_key_attr, rtable, 'right table')):
            validate_key_attr(key_attr, table, table_label)

        # Drop redundant entries from the requested output attributes.
        left_out = remove_redundant_attrs(l_out_attrs, l_key_attr)
        right_out = remove_redundant_attrs(r_out_attrs, r_key_attr)

        # Work out which columns each side needs to carry along.
        left_cols = get_attrs_to_project(left_out, l_key_attr, l_filter_attr)
        right_cols = get_attrs_to_project(right_out, r_key_attr,
                                          r_filter_attr)

        # Project each dataframe onto the required columns, drop rows whose
        # filter attribute is missing, and convert to ndarray.
        left_rows = convert_dataframe_to_array(ltable, left_cols,
                                               l_filter_attr)
        right_rows = convert_dataframe_to_array(rtable, right_cols,
                                                r_filter_attr)

        # Actual number of jobs: capped by the number of right-table rows.
        num_workers = min(get_num_processes_to_launch(n_jobs),
                          len(right_rows))

        if num_workers > 1:
            # Split the right table into num_workers pieces and filter each
            # piece against the whole left table in a separate process.
            right_chunks = split_table(right_rows, num_workers)
            final_job = num_workers - 1
            pending = (
                delayed(_filter_tables_split)(
                    left_rows, right_chunks[idx],
                    left_cols, right_cols,
                    l_key_attr, r_key_attr,
                    l_filter_attr, r_filter_attr,
                    self,
                    left_out, right_out,
                    l_out_prefix, r_out_prefix,
                    out_sim_score,
                    (show_progress and (idx == final_job)))
                for idx in range(num_workers))
            partial_tables = Parallel(n_jobs=num_workers)(pending)
            candidates = pd.concat(partial_tables)
        else:
            # Single job: run the filter in-process, no parallel machinery.
            candidates = _filter_tables_split(
                left_rows, right_rows,
                left_cols, right_cols,
                l_key_attr, r_key_attr,
                l_filter_attr, r_filter_attr,
                self,
                left_out, right_out,
                l_out_prefix, r_out_prefix,
                out_sim_score, show_progress)

        # When the filter allows missing values, append every pair that has
        # a missing value in at least one of the filter attributes.
        if self.allow_missing:
            missing_pairs = get_pairs_with_missing_value(
                ltable, rtable,
                l_key_attr, r_key_attr,
                l_filter_attr, r_filter_attr,
                left_out, right_out,
                l_out_prefix, r_out_prefix,
                out_sim_score, show_progress)
            candidates = pd.concat([candidates, missing_pairs])

        # Prepend a running '_id' column.
        candidates.insert(0, '_id', range(len(candidates)))

        return candidates
def apply_matcher(candset,
                  candset_l_key_attr,
                  candset_r_key_attr,
                  ltable,
                  rtable,
                  l_key_attr,
                  r_key_attr,
                  l_match_attr,
                  r_match_attr,
                  tokenizer,
                  sim_function,
                  threshold,
                  comp_op='>=',
                  allow_missing=False,
                  l_out_attrs=None,
                  r_out_attrs=None,
                  l_out_prefix='l_',
                  r_out_prefix='r_',
                  out_sim_score=True,
                  n_jobs=1,
                  show_progress=True):
    """Find matching string pairs from the candidate set (typically produced
    by applying a filter to two tables) by applying a matcher of form
    (sim_function comp_op threshold).

    Specifically, this method computes the input similarity function on string
    pairs in the candidate set and checks if the resulting score satisfies the
    input threshold (depending on the comparison operator).

    Args:
        candset (DataFrame): input candidate set.

        candset_l_key_attr (string): attribute in candidate set which is a key
            in left table.

        candset_r_key_attr (string): attribute in candidate set which is a key
            in right table.

        ltable (DataFrame): left input table.

        rtable (DataFrame): right input table.

        l_key_attr (string): key attribute in left table.

        r_key_attr (string): key attribute in right table.

        l_match_attr (string): attribute in left table on which the matcher
            should be applied.

        r_match_attr (string): attribute in right table on which the matcher
            should be applied.

        tokenizer (Tokenizer): tokenizer to be used to tokenize the
            match attributes. If set to None, the matcher is applied directly
            on the match attributes.

        sim_function (function): matcher function to be applied.

        threshold (float): threshold to be satisfied.

        comp_op (string): comparison operator. Supported values are '>=',
            '>', '<=', '<', '=' and '!=' (defaults to '>=').

        allow_missing (boolean): flag to indicate whether tuple pairs with
            missing value in at least one of the match attributes should be
            included in the output (defaults to False).

        l_out_attrs (list): list of attribute names from the left table to be
            included in the output table (defaults to None).

        r_out_attrs (list): list of attribute names from the right table to be
            included in the output table (defaults to None).

        l_out_prefix (string): prefix to be used for the attribute names coming
            from the left table, in the output table (defaults to 'l\\_').

        r_out_prefix (string): prefix to be used for the attribute names coming
            from the right table, in the output table (defaults to 'r\\_').

        out_sim_score (boolean): flag to indicate whether similarity score
            should be included in the output table (defaults to True). Setting
            this flag to True will add a column named '_sim_score' in the
            output table. This column will contain the similarity scores for the
            tuple pairs in the output.

        n_jobs (int): number of parallel jobs to use for the computation
            (defaults to 1). If -1 is given, all CPUs are used. If 1 is given,
            no parallel computing code is used at all, which is useful for
            debugging. For n_jobs below -1, (n_cpus + 1 + n_jobs) are used
            (where n_cpus is the total number of CPUs in the machine). Thus for
            n_jobs = -2, all CPUs but one are used. If (n_cpus + 1 + n_jobs)
            becomes less than 1, then no parallel computing code will be used
            (i.e., equivalent to the default).

        show_progress (boolean): flag to indicate whether task progress should
            be displayed to the user (defaults to True). In the parallel case
            only the last job is asked to show progress.

    Returns:
        An output table containing tuple pairs from the candidate set that
        survive the matcher (DataFrame). An empty candidate set is returned
        unchanged.
    """
    # The candidate set must be a dataframe carrying both key columns.
    validate_input_table(candset, 'candset')
    for cand_key, cand_label in (
            (candset_l_key_attr, 'left key attribute'),
            (candset_r_key_attr, 'right key attribute')):
        validate_attr(cand_key, candset.columns, cand_label, 'candset')

    # The base tables must be dataframes as well.
    for table, table_label in ((ltable, 'left table'),
                               (rtable, 'right table')):
        validate_input_table(table, table_label)

    # Key attributes first, then match attributes, must exist in their
    # respective tables.
    for attr, table, attr_label, table_label in (
            (l_key_attr, ltable, 'key attribute', 'left table'),
            (r_key_attr, rtable, 'key attribute', 'right table'),
            (l_match_attr, ltable, 'match attribute', 'left table'),
            (r_match_attr, rtable, 'match attribute', 'right table')):
        validate_attr(attr, table.columns, attr_label, table_label)

    # Requested output attributes must exist.
    validate_output_attrs(l_out_attrs, ltable.columns, r_out_attrs,
                          rtable.columns)

    # A tokenizer is optional; validate it only when one was supplied.
    if tokenizer is not None:
        validate_tokenizer(tokenizer)

    # The comparison operator must be a supported one.
    validate_comp_op(comp_op)

    # Key attributes must be unique and free of missing values.
    for key_attr, table, table_label in (
            (l_key_attr, ltable, 'left table'),
            (r_key_attr, rtable, 'right table')):
        validate_key_attr(key_attr, table, table_label)

    # Nothing to match: hand the empty candidate set straight back.
    if candset.empty:
        return candset

    # Drop redundant entries from the requested output attributes.
    left_out = remove_redundant_attrs(l_out_attrs, l_key_attr)
    right_out = remove_redundant_attrs(r_out_attrs, r_key_attr)

    # Work out which columns each side needs to carry along.
    left_cols = get_attrs_to_project(left_out, l_key_attr, l_match_attr)
    right_cols = get_attrs_to_project(right_out, r_key_attr, r_match_attr)

    # Restrict both tables to the columns that are actually needed.
    left_view = ltable[left_cols]
    right_view = rtable[right_cols]

    # Actual number of jobs: capped by the number of candidate pairs.
    num_workers = min(get_num_processes_to_launch(n_jobs), len(candset))

    # With a tokenizer, each value can be tokenized once up front and the
    # tokens reused. That only pays off when the candidate set is large
    # relative to the tables, so the cache is built only if the combined row
    # count of ltable and rtable is below twice the candidate-set size.
    l_tokens = r_tokens = None
    if tokenizer is not None:
        candset_is_large = len(ltable) + len(rtable) < 2 * len(candset)
        if candset_is_large:
            l_tokens = generate_tokens(left_view, l_key_attr, l_match_attr,
                                       tokenizer)
            r_tokens = generate_tokens(right_view, r_key_attr, r_match_attr,
                                       tokenizer)

    if num_workers > 1:
        # Split the candidate set into num_workers pieces and apply the
        # matcher to each piece in a separate process.
        cand_chunks = split_table(candset, num_workers)
        final_job = num_workers - 1
        pending = (
            delayed(_apply_matcher_split)(
                cand_chunks[idx], candset_l_key_attr, candset_r_key_attr,
                left_view, right_view, l_key_attr, r_key_attr,
                l_match_attr, r_match_attr, tokenizer, sim_function,
                threshold, comp_op, allow_missing, left_out, right_out,
                l_out_prefix, r_out_prefix, out_sim_score,
                (show_progress and (idx == final_job)),
                l_tokens, r_tokens)
            for idx in range(num_workers))
        partial_tables = Parallel(n_jobs=num_workers)(pending)
        return pd.concat(partial_tables)

    # Single job: run the matcher in-process, no parallel machinery.
    return _apply_matcher_split(
        candset, candset_l_key_attr, candset_r_key_attr, left_view,
        right_view, l_key_attr, r_key_attr, l_match_attr, r_match_attr,
        tokenizer, sim_function, threshold, comp_op, allow_missing,
        left_out, right_out, l_out_prefix, r_out_prefix, out_sim_score,
        show_progress, l_tokens, r_tokens)
Beispiel #4
0
def edit_distance_join(ltable,
                       rtable,
                       l_key_attr,
                       r_key_attr,
                       l_join_attr,
                       r_join_attr,
                       threshold,
                       comp_op='<=',
                       allow_missing=False,
                       l_out_attrs=None,
                       r_out_attrs=None,
                       l_out_prefix='l_',
                       r_out_prefix='r_',
                       out_sim_score=True,
                       n_jobs=1,
                       show_progress=True,
                       tokenizer=QgramTokenizer(qval=2)):
    """Join two tables using edit distance measure.

    Finds tuple pairs from left table and right table such that the edit
    distance between the join attributes satisfies the condition on input
    threshold. For example, if the comparison operator is '<=', finds tuple
    pairs whose edit distance between the strings that are the values of
    the join attributes is less than or equal to the input threshold, as
    specified in "threshold".

    Note:
        Currently, this method only computes an approximate join result. This is
        because, to perform the join we transform an edit distance measure
        between strings into an overlap measure between qgrams of the strings.
        Hence, we need at least one qgram to be in common between two input
        strings, to appear in the join output. For smaller strings, where all
        qgrams of the strings differ, we cannot process them.

        This method implements a simplified version of the algorithm proposed in
        `Ed-Join: An Efficient Algorithm for Similarity Joins With Edit Distance
        Constraints (Chuan Xiao, Wei Wang and Xuemin Lin), VLDB 08
        <http://www.vldb.org/pvldb/1/1453957.pdf>`_.

        While the join runs, the ``return_set`` flag of the given tokenizer is
        switched off if it was on. The flag is restored before this function
        exits, including when the join raises an exception.

    Args:
        ltable (DataFrame): left input table.

        rtable (DataFrame): right input table.

        l_key_attr (string): key attribute in left table.

        r_key_attr (string): key attribute in right table.

        l_join_attr (string): join attribute in left table.

        r_join_attr (string): join attribute in right table.

        threshold (float): edit distance threshold to be satisfied. The value
            is floored to an integer before the join is performed.

        comp_op (string): comparison operator. Supported values are '<=', '<'
            and '=' (defaults to '<=').

        allow_missing (boolean): flag to indicate whether tuple pairs with
            missing value in at least one of the join attributes should be
            included in the output (defaults to False). If this flag is set to
            True, a tuple in ltable with missing value in the join attribute
            will be matched with every tuple in rtable and vice versa.

        l_out_attrs (list): list of attribute names from the left table to be
            included in the output table (defaults to None).

        r_out_attrs (list): list of attribute names from the right table to be
            included in the output table (defaults to None).

        l_out_prefix (string): prefix to be used for the attribute names coming
            from the left table, in the output table (defaults to 'l\\_').

        r_out_prefix (string): prefix to be used for the attribute names coming
            from the right table, in the output table (defaults to 'r\\_').

        out_sim_score (boolean): flag to indicate whether the edit distance
            score should be included in the output table (defaults to True).
            Setting this flag to True will add a column named '_sim_score' in
            the output table. This column will contain the edit distance scores
            for the tuple pairs in the output.

        n_jobs (int): number of parallel jobs to use for the computation
            (defaults to 1). If -1 is given, all CPUs are used. If 1 is given,
            no parallel computing code is used at all, which is useful for
            debugging. For n_jobs below -1, (n_cpus + 1 + n_jobs) are used
            (where n_cpus is the total number of CPUs in the machine). Thus for
            n_jobs = -2, all CPUs but one are used. If (n_cpus + 1 + n_jobs)
            becomes less than 1, then no parallel computing code will be used
            (i.e., equivalent to the default).

        show_progress (boolean): flag to indicate whether task progress should
            be displayed to the user (defaults to True).

        tokenizer (Tokenizer): tokenizer to be used to tokenize the join
            attributes during filtering, when edit distance measure is
            transformed into an overlap measure. This must be a q-gram tokenizer
            (defaults to 2-gram tokenizer).

    Returns:
        An output table containing tuple pairs that satisfy the join
        condition (DataFrame).
    """

    # check if the input tables are dataframes
    validate_input_table(ltable, 'left table')
    validate_input_table(rtable, 'right table')

    # check if the key attributes and join attributes exist
    validate_attr(l_key_attr, ltable.columns, 'key attribute', 'left table')
    validate_attr(r_key_attr, rtable.columns, 'key attribute', 'right table')
    validate_attr(l_join_attr, ltable.columns, 'join attribute', 'left table')
    validate_attr(r_join_attr, rtable.columns, 'join attribute', 'right table')

    # check if the join attributes are not of numeric type
    validate_attr_type(l_join_attr, ltable[l_join_attr].dtype,
                       'join attribute', 'left table')
    validate_attr_type(r_join_attr, rtable[r_join_attr].dtype,
                       'join attribute', 'right table')

    # check if the input tokenizer is valid for edit distance measure. Only
    # qgram tokenizer can be used for edit distance.
    validate_tokenizer_for_sim_measure(tokenizer, 'EDIT_DISTANCE')

    # check if the input threshold is valid
    validate_threshold(threshold, 'EDIT_DISTANCE')

    # check if the comparison operator is valid
    validate_comp_op_for_sim_measure(comp_op, 'EDIT_DISTANCE')

    # check if the output attributes exist
    validate_output_attrs(l_out_attrs, ltable.columns, r_out_attrs,
                          rtable.columns)

    # check if the key attributes are unique and do not contain missing values
    validate_key_attr(l_key_attr, ltable, 'left table')
    validate_key_attr(r_key_attr, rtable, 'right table')

    # convert threshold to integer (in case it is a float)
    threshold = int(floor(threshold))

    # set return_set flag of tokenizer to be False, in case it is set to True.
    # Remember whether we changed it so that it can be put back afterwards.
    revert_tokenizer_return_set_flag = False
    if tokenizer.get_return_set():
        tokenizer.set_return_set(False)
        revert_tokenizer_return_set_flag = True

    # Everything that runs while the tokenizer is in its modified state lives
    # inside the try block, so the finally clause restores the caller's
    # tokenizer even if the join fails part-way through.
    try:
        # remove redundant attrs from output attrs.
        l_out_attrs = remove_redundant_attrs(l_out_attrs, l_key_attr)
        r_out_attrs = remove_redundant_attrs(r_out_attrs, r_key_attr)

        # get attributes to project.
        l_proj_attrs = get_attrs_to_project(l_out_attrs, l_key_attr,
                                            l_join_attr)
        r_proj_attrs = get_attrs_to_project(r_out_attrs, r_key_attr,
                                            r_join_attr)

        # Do a projection on the input dataframes to keep only the required
        # attributes. Then, remove rows with missing value in join attribute
        # from the input dataframes. Then, convert the resulting dataframes
        # into ndarray.
        ltable_array = convert_dataframe_to_array(ltable, l_proj_attrs,
                                                  l_join_attr)
        rtable_array = convert_dataframe_to_array(rtable, r_proj_attrs,
                                                  r_join_attr)

        # computes the actual number of jobs to launch. Never launch more
        # jobs than there are rows in the right table to split.
        n_jobs = min(get_num_processes_to_launch(n_jobs), len(rtable_array))

        if n_jobs <= 1:
            # if n_jobs is 1, do not use any parallel code.
            output_table = _edit_distance_join_split(
                ltable_array, rtable_array, l_proj_attrs, r_proj_attrs,
                l_key_attr, r_key_attr, l_join_attr, r_join_attr, tokenizer,
                threshold, comp_op, l_out_attrs, r_out_attrs, l_out_prefix,
                r_out_prefix, out_sim_score, show_progress)
        else:
            # if n_jobs is above 1, split the right table into n_jobs splits
            # and join each right table split with the whole of left table in
            # a separate process. Progress is only reported by the last job.
            r_splits = split_table(rtable_array, n_jobs)
            results = Parallel(n_jobs=n_jobs)(
                delayed(_edit_distance_join_split)(
                    ltable_array, r_splits[job_index], l_proj_attrs,
                    r_proj_attrs, l_key_attr, r_key_attr, l_join_attr,
                    r_join_attr, tokenizer, threshold, comp_op, l_out_attrs,
                    r_out_attrs, l_out_prefix, r_out_prefix, out_sim_score,
                    (show_progress and (job_index == n_jobs - 1)))
                for job_index in range(n_jobs))
            output_table = pd.concat(results)

        # If allow_missing flag is set, then compute all pairs with missing
        # value in at least one of the join attributes and then add it to the
        # output obtained from the join.
        if allow_missing:
            missing_pairs = get_pairs_with_missing_value(
                ltable, rtable, l_key_attr, r_key_attr, l_join_attr,
                r_join_attr, l_out_attrs, r_out_attrs, l_out_prefix,
                r_out_prefix, out_sim_score, show_progress)
            output_table = pd.concat([output_table, missing_pairs])

        # add an id column named '_id' to the output table.
        output_table.insert(0, '_id', range(0, len(output_table)))
    finally:
        # revert the return_set flag of tokenizer, in case it was modified.
        if revert_tokenizer_return_set_flag:
            tokenizer.set_return_set(True)

    return output_table
def edit_distance_join(ltable, rtable,
                       l_key_attr, r_key_attr,
                       l_join_attr, r_join_attr,
                       threshold, comp_op='<=',
                       allow_missing=False,
                       l_out_attrs=None, r_out_attrs=None,
                       l_out_prefix='l_', r_out_prefix='r_',
                       out_sim_score=True, n_jobs=1, show_progress=True,
                       tokenizer=QgramTokenizer(qval=2)):
    """Join two tables using edit distance measure.

    Finds tuple pairs from left table and right table such that the edit
    distance between the join attributes satisfies the condition on input
    threshold. For example, if the comparison operator is '<=', finds tuple
    pairs whose edit distance between the strings that are the values of
    the join attributes is less than or equal to the input threshold, as
    specified in "threshold".

    Note:
        Currently, this method only computes an approximate join result. This is
        because, to perform the join we transform an edit distance measure
        between strings into an overlap measure between qgrams of the strings.
        Hence, we need at least one qgram to be in common between two input
        strings, to appear in the join output. For smaller strings, where all
        qgrams of the strings differ, we cannot process them.

        This method implements a simplified version of the algorithm proposed in
        `Ed-Join: An Efficient Algorithm for Similarity Joins With Edit Distance
        Constraints (Chuan Xiao, Wei Wang and Xuemin Lin), VLDB 08
        <http://www.vldb.org/pvldb/1/1453957.pdf>`_.

        If the supplied tokenizer has its ``return_set`` flag on, the flag is
        turned off for the duration of the join and turned back on before
        returning, whether the join succeeds or raises.

    Args:
        ltable (DataFrame): left input table.

        rtable (DataFrame): right input table.

        l_key_attr (string): key attribute in left table.

        r_key_attr (string): key attribute in right table.

        l_join_attr (string): join attribute in left table.

        r_join_attr (string): join attribute in right table.

        threshold (float): edit distance threshold to be satisfied. It is
            floored to an integer before being used.

        comp_op (string): comparison operator. Supported values are '<=', '<'
            and '=' (defaults to '<=').

        allow_missing (boolean): flag to indicate whether tuple pairs with
            missing value in at least one of the join attributes should be
            included in the output (defaults to False). If this flag is set to
            True, a tuple in ltable with missing value in the join attribute
            will be matched with every tuple in rtable and vice versa.

        l_out_attrs (list): list of attribute names from the left table to be
            included in the output table (defaults to None).

        r_out_attrs (list): list of attribute names from the right table to be
            included in the output table (defaults to None).

        l_out_prefix (string): prefix to be used for the attribute names coming
            from the left table, in the output table (defaults to 'l\\_').

        r_out_prefix (string): prefix to be used for the attribute names coming
            from the right table, in the output table (defaults to 'r\\_').

        out_sim_score (boolean): flag to indicate whether the edit distance
            score should be included in the output table (defaults to True).
            Setting this flag to True will add a column named '_sim_score' in
            the output table. This column will contain the edit distance scores
            for the tuple pairs in the output.

        n_jobs (int): number of parallel jobs to use for the computation
            (defaults to 1). If -1 is given, all CPUs are used. If 1 is given,
            no parallel computing code is used at all, which is useful for
            debugging. For n_jobs below -1, (n_cpus + 1 + n_jobs) are used
            (where n_cpus is the total number of CPUs in the machine). Thus for
            n_jobs = -2, all CPUs but one are used. If (n_cpus + 1 + n_jobs)
            becomes less than 1, then no parallel computing code will be used
            (i.e., equivalent to the default).

        show_progress (boolean): flag to indicate whether task progress should
            be displayed to the user (defaults to True).

        tokenizer (Tokenizer): tokenizer to be used to tokenize the join
            attributes during filtering, when edit distance measure is
            transformed into an overlap measure. This must be a q-gram tokenizer
            (defaults to 2-gram tokenizer).

    Returns:
        An output table containing tuple pairs that satisfy the join
        condition (DataFrame).
    """

    # check if the input tables are dataframes
    validate_input_table(ltable, 'left table')
    validate_input_table(rtable, 'right table')

    # check if the key attributes and join attributes exist
    validate_attr(l_key_attr, ltable.columns,
                  'key attribute', 'left table')
    validate_attr(r_key_attr, rtable.columns,
                  'key attribute', 'right table')
    validate_attr(l_join_attr, ltable.columns,
                  'join attribute', 'left table')
    validate_attr(r_join_attr, rtable.columns,
                  'join attribute', 'right table')

    # check if the join attributes are not of numeric type
    validate_attr_type(l_join_attr, ltable[l_join_attr].dtype,
                       'join attribute', 'left table')
    validate_attr_type(r_join_attr, rtable[r_join_attr].dtype,
                       'join attribute', 'right table')

    # check if the input tokenizer is valid for edit distance measure. Only
    # qgram tokenizer can be used for edit distance.
    validate_tokenizer_for_sim_measure(tokenizer, 'EDIT_DISTANCE')

    # check if the input threshold is valid
    validate_threshold(threshold, 'EDIT_DISTANCE')

    # check if the comparison operator is valid
    validate_comp_op_for_sim_measure(comp_op, 'EDIT_DISTANCE')

    # check if the output attributes exist
    validate_output_attrs(l_out_attrs, ltable.columns,
                          r_out_attrs, rtable.columns)

    # check if the key attributes are unique and do not contain missing values
    validate_key_attr(l_key_attr, ltable, 'left table')
    validate_key_attr(r_key_attr, rtable, 'right table')

    # convert threshold to integer (in case it is a float)
    threshold = int(floor(threshold))

    # set return_set flag of tokenizer to be False, in case it is set to True.
    # Track the change so the flag can be restored on every exit path.
    revert_tokenizer_return_set_flag = False
    if tokenizer.get_return_set():
        tokenizer.set_return_set(False)
        revert_tokenizer_return_set_flag = True

    # The tokenizer belongs to the caller (or is the shared default instance),
    # so the work below is guarded: the finally clause undoes the flag change
    # even when one of the join steps raises.
    try:
        # remove redundant attrs from output attrs.
        l_out_attrs = remove_redundant_attrs(l_out_attrs, l_key_attr)
        r_out_attrs = remove_redundant_attrs(r_out_attrs, r_key_attr)

        # get attributes to project.
        l_proj_attrs = get_attrs_to_project(l_out_attrs,
                                            l_key_attr, l_join_attr)
        r_proj_attrs = get_attrs_to_project(r_out_attrs,
                                            r_key_attr, r_join_attr)

        # Do a projection on the input dataframes to keep only the required
        # attributes. Then, remove rows with missing value in join attribute
        # from the input dataframes. Then, convert the resulting dataframes
        # into ndarray.
        ltable_array = convert_dataframe_to_array(ltable, l_proj_attrs,
                                                  l_join_attr)
        rtable_array = convert_dataframe_to_array(rtable, r_proj_attrs,
                                                  r_join_attr)

        # computes the actual number of jobs to launch, capped by the number
        # of rows available in the right table.
        n_jobs = min(get_num_processes_to_launch(n_jobs), len(rtable_array))

        if n_jobs <= 1:
            # if n_jobs is 1, do not use any parallel code.
            output_table = _edit_distance_join_split(
                ltable_array, rtable_array,
                l_proj_attrs, r_proj_attrs,
                l_key_attr, r_key_attr,
                l_join_attr, r_join_attr,
                tokenizer, threshold, comp_op,
                l_out_attrs, r_out_attrs,
                l_out_prefix, r_out_prefix,
                out_sim_score, show_progress)
        else:
            # if n_jobs is above 1, split the right table into n_jobs splits
            # and join each right table split with the whole of left table in
            # a separate process. Only the last job reports progress.
            r_splits = split_table(rtable_array, n_jobs)
            results = Parallel(n_jobs=n_jobs)(
                delayed(_edit_distance_join_split)(
                    ltable_array, r_splits[job_index],
                    l_proj_attrs, r_proj_attrs,
                    l_key_attr, r_key_attr,
                    l_join_attr, r_join_attr,
                    tokenizer, threshold, comp_op,
                    l_out_attrs, r_out_attrs,
                    l_out_prefix, r_out_prefix,
                    out_sim_score,
                    (show_progress and (job_index == n_jobs - 1)))
                for job_index in range(n_jobs))
            output_table = pd.concat(results)

        # If allow_missing flag is set, then compute all pairs with missing
        # value in at least one of the join attributes and then add it to the
        # output obtained from the join.
        if allow_missing:
            missing_pairs = get_pairs_with_missing_value(
                ltable, rtable,
                l_key_attr, r_key_attr,
                l_join_attr, r_join_attr,
                l_out_attrs, r_out_attrs,
                l_out_prefix, r_out_prefix,
                out_sim_score, show_progress)
            output_table = pd.concat([output_table, missing_pairs])

        # add an id column named '_id' to the output table.
        output_table.insert(0, '_id', range(0, len(output_table)))
    finally:
        # revert the return_set flag of tokenizer, in case it was modified.
        if revert_tokenizer_return_set_flag:
            tokenizer.set_return_set(True)

    return output_table
    def filter_tables(self,
                      ltable,
                      rtable,
                      l_key_attr,
                      r_key_attr,
                      l_filter_attr,
                      r_filter_attr,
                      l_out_attrs=None,
                      r_out_attrs=None,
                      l_out_prefix='l_',
                      r_out_prefix='r_',
                      n_jobs=1):
        """Filter tables with size filter.

        Args:
            ltable (DataFrame): left input table.

            rtable (DataFrame): right input table.

            l_key_attr (string): key attribute in left table.

            r_key_attr (string): key attribute in right table.

            l_filter_attr (string): filter attribute in left table.

            r_filter_attr (string): filter attribute in right table.

            l_out_attrs (list): attribute names from the left table to be
                included in the output table (defaults to None).

            r_out_attrs (list): attribute names from the right table to be
                included in the output table (defaults to None).

            l_out_prefix (string): prefix to be used for the attribute names
                coming from the left table, in the output table.

            r_out_prefix (string): prefix to be used for the attribute names
                coming from the right table, in the output table.

            n_jobs (int): when exactly 1, the filter runs in the calling
                process (defaults to 1). Any other value is passed to both
                split_table and Parallel, and each split of the right table
                is filtered against the whole left table.

        Returns:
            The filtered output table (DataFrame), with an '_id' column
            holding 0..len-1 inserted as the first column.
        """
        # Both inputs have to be dataframes.
        validate_input_table(ltable, 'left table')
        validate_input_table(rtable, 'right table')

        # Key and filter attributes have to be present in their tables.
        validate_attr(l_key_attr, ltable.columns,
                      'key attribute', 'left table')
        validate_attr(r_key_attr, rtable.columns,
                      'key attribute', 'right table')
        validate_attr(l_filter_attr, ltable.columns,
                      'filter attribute', 'left table')
        validate_attr(r_filter_attr, rtable.columns,
                      'filter attribute', 'right table')

        # Requested output attributes have to be present as well.
        validate_output_attrs(l_out_attrs, ltable.columns,
                              r_out_attrs, rtable.columns)

        # Key attributes have to be unique and free of missing values.
        validate_key_attr(l_key_attr, ltable, 'left table')
        validate_key_attr(r_key_attr, rtable, 'right table')

        # Arguments that every worker call shares, after the two tables.
        shared_args = (l_key_attr, r_key_attr, l_filter_attr, r_filter_attr,
                       self, l_out_attrs, r_out_attrs, l_out_prefix,
                       r_out_prefix)

        if n_jobs == 1:
            # Single job: run the filter directly in this process.
            filtered = _filter_tables_split(ltable, rtable, *shared_args)
        else:
            # Several jobs: one worker call per split of the right table,
            # then stitch the partial outputs back together.
            right_chunks = split_table(rtable, n_jobs)
            partial_outputs = Parallel(n_jobs=n_jobs)(
                delayed(_filter_tables_split)(ltable, chunk, *shared_args)
                for chunk in right_chunks)
            filtered = pd.concat(partial_outputs)

        # Number the output rows in a leading '_id' column.
        filtered.insert(0, '_id', range(0, len(filtered)))
        return filtered
def dice_join_py(ltable, rtable,
                 l_key_attr, r_key_attr,
                 l_join_attr, r_join_attr,
                 tokenizer, threshold, comp_op='>=',
                 allow_empty=True, allow_missing=False,
                 l_out_attrs=None, r_out_attrs=None,
                 l_out_prefix='l_', r_out_prefix='r_',
                 out_sim_score=True, n_jobs=1, show_progress=True):
    """Join two tables using Dice similarity measure.

    For two sets X and Y, the Dice similarity score between them is given by:                      
                                                                                
        :math:`dice(X, Y) = \\frac{2 * |X \\cap Y|}{|X| + |Y|}`        
                                                                                
    In the case where both X and Y are empty sets, we define their Dice 
    score to be 1.

    Finds tuple pairs from left table and right table such that the Dice 
    similarity between the join attributes satisfies the condition on input 
    threshold. For example, if the comparison operator is '>=', finds tuple     
    pairs whose Dice similarity between the strings that are the values of    
    the join attributes is greater than or equal to the input threshold, as     
    specified in "threshold". 

    Args:
        ltable (DataFrame): left input table.

        rtable (DataFrame): right input table.

        l_key_attr (string): key attribute in left table.

        r_key_attr (string): key attribute in right table.

        l_join_attr (string): join attribute in left table.

        r_join_attr (string): join attribute in right table.

        tokenizer (Tokenizer): tokenizer to be used to tokenize join     
            attributes.                                                         
                                                                                
        threshold (float): Dice similarity threshold to be satisfied.        
                                                                                
        comp_op (string): comparison operator. Supported values are '>=', '>'   
            and '=' (defaults to '>=').                                         
                                                                                
        allow_empty (boolean): flag to indicate whether tuple pairs with empty  
            set of tokens in both the join attributes should be included in the 
            output (defaults to True).                                          
                                                                                
        allow_missing (boolean): flag to indicate whether tuple pairs with      
            missing value in at least one of the join attributes should be      
            included in the output (defaults to False). If this flag is set to  
            True, a tuple in ltable with missing value in the join attribute    
            will be matched with every tuple in rtable and vice versa.          
                                                                                
        l_out_attrs (list): list of attribute names from the left table to be   
            included in the output table (defaults to None).                    
                                                                                
        r_out_attrs (list): list of attribute names from the right table to be  
            included in the output table (defaults to None).                    
                                                                                
        l_out_prefix (string): prefix to be used for the attribute names coming 
            from the left table, in the output table (defaults to 'l\_').       
                                                                                
        r_out_prefix (string): prefix to be used for the attribute names coming 
            from the right table, in the output table (defaults to 'r\_').      
                                                                                
        out_sim_score (boolean): flag to indicate whether similarity score      
            should be included in the output table (defaults to True). Setting  
            this flag to True will add a column named '_sim_score' in the       
            output table. This column will contain the similarity scores for the
            tuple pairs in the output.                                          
                                                                                
        n_jobs (int): number of parallel jobs to use for the computation        
            (defaults to 1). If -1 is given, all CPUs are used. If 1 is given,  
            no parallel computing code is used at all, which is useful for      
            debugging. For n_jobs below -1, (n_cpus + 1 + n_jobs) are used      
            (where n_cpus is the total number of CPUs in the machine). Thus for 
            n_jobs = -2, all CPUs but one are used. If (n_cpus + 1 + n_jobs)    
            becomes less than 1, then no parallel computing code will be used   
            (i.e., equivalent to the default). 
                                                                                
        show_progress (boolean): flag to indicate whether task progress should  
            be displayed to the user (defaults to True).                        
                                                                                
    Returns:                                                                    
        An output table containing tuple pairs that satisfy the join            
        condition (DataFrame).  
    """

    # check if the input tables are dataframes
    validate_input_table(ltable, 'left table')
    validate_input_table(rtable, 'right table')

    # check if the key attributes and join attributes exist
    validate_attr(l_key_attr, ltable.columns,
                  'key attribute', 'left table')
    validate_attr(r_key_attr, rtable.columns,
                  'key attribute', 'right table')
    validate_attr(l_join_attr, ltable.columns,
                  'join attribute', 'left table')
    validate_attr(r_join_attr, rtable.columns,
                  'join attribute', 'right table')

    # check if the join attributes are not of numeric type                      
    validate_attr_type(l_join_attr, ltable[l_join_attr].dtype,                  
                       'join attribute', 'left table')                          
    validate_attr_type(r_join_attr, rtable[r_join_attr].dtype,                  
                       'join attribute', 'right table') 

    # check if the input tokenizer is valid
    validate_tokenizer(tokenizer)

    # check if the input threshold is valid
    validate_threshold(threshold, 'DICE')

    # check if the comparison operator is valid
    validate_comp_op_for_sim_measure(comp_op, 'DICE')

    # check if the output attributes exist
    validate_output_attrs(l_out_attrs, ltable.columns,
                          r_out_attrs, rtable.columns)

    # check if the key attributes are unique and do not contain missing values
    validate_key_attr(l_key_attr, ltable, 'left table')
    validate_key_attr(r_key_attr, rtable, 'right table')

    # set return_set flag of tokenizer to be True, in case it is set to False
    revert_tokenizer_return_set_flag = False
    if not tokenizer.get_return_set():
        tokenizer.set_return_set(True)
        revert_tokenizer_return_set_flag = True

    # remove redundant attrs from output attrs.
    l_out_attrs = remove_redundant_attrs(l_out_attrs, l_key_attr)
    r_out_attrs = remove_redundant_attrs(r_out_attrs, r_key_attr)

    # get attributes to project.  
    l_proj_attrs = get_attrs_to_project(l_out_attrs, l_key_attr, l_join_attr)
    r_proj_attrs = get_attrs_to_project(r_out_attrs, r_key_attr, r_join_attr)

    # Do a projection on the input dataframes to keep only the required         
    # attributes. Then, remove rows with missing value in join attribute from   
    # the input dataframes. Then, convert the resulting dataframes into ndarray.
    ltable_array = convert_dataframe_to_array(ltable, l_proj_attrs, l_join_attr)
    rtable_array = convert_dataframe_to_array(rtable, r_proj_attrs, r_join_attr)

    # computes the actual number of jobs to launch.
    n_jobs = min(get_num_processes_to_launch(n_jobs), len(rtable_array))

    if n_jobs <= 1:
        # if n_jobs is 1, do not use any parallel code.
        output_table = set_sim_join(ltable_array, rtable_array,
                                    l_proj_attrs, r_proj_attrs,
                                    l_key_attr, r_key_attr,
                                    l_join_attr, r_join_attr,
                                    tokenizer, 'DICE',
                                    threshold, comp_op, allow_empty,
                                    l_out_attrs, r_out_attrs,
                                    l_out_prefix, r_out_prefix,
                                    out_sim_score, show_progress)
    else:
        # if n_jobs is above 1, split the right table into n_jobs splits and    
        # join each right table split with the whole of left table in a separate
        # process.
        r_splits = split_table(rtable_array, n_jobs)
        results = Parallel(n_jobs=n_jobs)(delayed(set_sim_join)(
                                          ltable_array, r_splits[job_index],
                                          l_proj_attrs, r_proj_attrs,
                                          l_key_attr, r_key_attr,
                                          l_join_attr, r_join_attr,
                                          tokenizer, 'DICE',
                                          threshold, comp_op, allow_empty,
                                          l_out_attrs, r_out_attrs,
                                          l_out_prefix, r_out_prefix,
                                          out_sim_score,
                                      (show_progress and (job_index==n_jobs-1)))
                                          for job_index in range(n_jobs))
        output_table = pd.concat(results)

    # If allow_missing flag is set, then compute all pairs with missing value in
    # at least one of the join attributes and then add it to the output         
    # obtained from the join. 
    if allow_missing:
        missing_pairs = get_pairs_with_missing_value(
                                            ltable, rtable,
                                            l_key_attr, r_key_attr,
                                            l_join_attr, r_join_attr,
                                            l_out_attrs, r_out_attrs,
                                            l_out_prefix, r_out_prefix,
                                            out_sim_score, show_progress)
        output_table = pd.concat([output_table, missing_pairs])

    # add an id column named '_id' to the output table.
    output_table.insert(0, '_id', range(0, len(output_table)))

    # revert the return_set flag of tokenizer, in case it was modified.
    if revert_tokenizer_return_set_flag:
        tokenizer.set_return_set(False)

    return output_table
def profile_table_for_join(input_table, profile_attrs=None):
    """Profiles the attributes in the table to suggest implications for join.

    Args:
        input_table (DataFrame): input table to profile.
        profile_attrs (list): list of attribute names from the input table to be
            profiled (defaults to None). If not provided, all attributes in the
            input table will be profiled.

    Returns:
        A dataframe consisting of profile output. Specifically, the dataframe
        contains three columns,

        1) 'Unique values' column, which shows the number of unique values in
           each attribute,
        2) 'Missing values' column, which shows the number of missing values in
           each attribute, and
        3) 'Comments' column, which contains comments about each attribute.

        The output dataframe is indexed by attribute name, so that the
        statistics for each attribute can be easily accessed using the
        attribute name. For an input table without any rows, all percentages
        are reported as 0.0 and no comments are generated.
    """

    # check if the input table is a dataframe
    validate_input_table(input_table, 'input table')

    if profile_attrs is None:
        profile_attrs = list(input_table.columns.values)
    else:
        # check if the profile attributes exist
        for attr in profile_attrs:
            validate_attr(attr, input_table.columns,
                          'profile attribute', 'input table')

    num_rows = len(input_table)

    def _percent_of_rows(count):
        # An empty table has no rows to take a share of; report 0% rather
        # than failing with a ZeroDivisionError.
        if num_rows == 0:
            return 0.0
        return round((float(count) / float(num_rows)) * 100, 2)

    profile_output = []
    for attr in profile_attrs:
        column = input_table[attr]

        # compute number of unique values in the column
        unique_values = len(column.unique())

        # compute number of missing values in the column (vectorized count)
        missing_values = int(pd.isnull(column).sum())

        # compute percentages of unique and missing values in the column
        unique_percent = _percent_of_rows(unique_values)
        missing_percent = _percent_of_rows(missing_values)

        # format stats for better display
        formatted_unique_stat = _format_statistic(unique_values, unique_percent)
        formatted_missing_stat = _format_statistic(missing_values,
                                                   missing_percent)

        comments = ''
        # if there are missing values in the column, add a comment.
        if missing_percent > 0:
            comments = ''.join(['Joining on this attribute will ignore ',
                                formatted_missing_stat, ' rows.'])
        # if the column consists of unique values, add a comment.
        if unique_percent == 100.0 and missing_values == 0:
            comments = 'This attribute can be used as a key attribute.'

        profile_output.append((attr, formatted_unique_stat,
                               formatted_missing_stat, comments))

    # compose output dataframe containing the profiling results.
    output_header = ['Attribute', 'Unique values', 'Missing values', 'Comments']
    output_df = pd.DataFrame(profile_output, columns=output_header)
    return output_df.set_index('Attribute')
def extract_feature_vecs(candset,
                         candset_l_key_attr,
                         candset_r_key_attr,
                         ltable,
                         rtable,
                         l_key_attr,
                         r_key_attr,
                         l_join_attr,
                         r_join_attr,
                         feature_table,
                         n_jobs=1,
                         show_progress=True):
    """Run feature-vector extraction over a candidate set, optionally in
    parallel.

    The inputs are validated, both tables are reduced to their key and join
    attributes, and the actual work is delegated to
    ``_extract_feature_vecs_split`` -- once for the whole candidate set, or
    once per candidate-set split when more than one job is launched.

    Args:
        candset (DataFrame): input candidate set.

        candset_l_key_attr (string): attribute in candidate set that must
            exist as the left key attribute.

        candset_r_key_attr (string): attribute in candidate set that must
            exist as the right key attribute.

        ltable (DataFrame): left input table.

        rtable (DataFrame): right input table.

        l_key_attr (string): key attribute in left table.

        r_key_attr (string): key attribute in right table.

        l_join_attr (string): join attribute in left table.

        r_join_attr (string): join attribute in right table.

        feature_table: forwarded unchanged to ``_extract_feature_vecs_split``.

        n_jobs (int): requested number of parallel jobs (defaults to 1). The
            effective job count is ``get_num_processes_to_launch(n_jobs)``
            capped at the number of rows in the candidate set.

        show_progress (boolean): progress flag forwarded to the worker
            (defaults to True). In parallel mode only the last job receives
            it.

    Returns:
        The output of ``_extract_feature_vecs_split`` in the serial case, or
        the ``pd.concat`` of the per-split outputs in the parallel case.
    """
    # the candidate set must be a dataframe exposing both key columns
    validate_input_table(candset, 'candset')
    for cand_attr, attr_label in ((candset_l_key_attr, 'left key attribute'),
                                  (candset_r_key_attr, 'right key attribute')):
        validate_attr(cand_attr, candset.columns, attr_label, 'candset')

    # (table, label, key attribute, join attribute) for each side of the join
    sides = ((ltable, 'left table', l_key_attr, l_join_attr),
             (rtable, 'right table', r_key_attr, r_join_attr))

    # both inputs must be dataframes
    for table, table_label, _, _ in sides:
        validate_input_table(table, table_label)

    # key attributes, then join attributes, must exist in their tables
    for table, table_label, key_attr, _ in sides:
        validate_attr(key_attr, table.columns, 'key attribute', table_label)
    for table, table_label, _, join_attr in sides:
        validate_attr(join_attr, table.columns, 'join attribute', table_label)

    # join attributes must not be of numeric type
    for table, table_label, _, join_attr in sides:
        validate_attr_type(join_attr, table[join_attr].dtype,
                           'join attribute', table_label)

    # key attributes must be unique and free of missing values
    for table, table_label, key_attr, _ in sides:
        validate_key_attr(key_attr, table, table_label)

    # keep only the key and join attributes of each table
    ltable_projected, rtable_projected = (
        table[[key_attr, join_attr]]
        for table, _, key_attr, join_attr in sides)

    # never launch more jobs than there are candidate pairs
    n_jobs = min(get_num_processes_to_launch(n_jobs), len(candset))

    if n_jobs <= 1:
        # serial path: no parallel machinery involved
        return _extract_feature_vecs_split(
            candset, candset_l_key_attr, candset_r_key_attr,
            ltable_projected, rtable_projected,
            l_key_attr, r_key_attr, l_join_attr, r_join_attr,
            feature_table, show_progress)

    # parallel path: one candset split per job; progress is reported by the
    # last job only
    candset_splits = split_table(candset, n_jobs)
    last_job = n_jobs - 1
    jobs = (delayed(_extract_feature_vecs_split)(
                candset_splits[job_index],
                candset_l_key_attr, candset_r_key_attr,
                ltable_projected, rtable_projected,
                l_key_attr, r_key_attr, l_join_attr, r_join_attr,
                feature_table,
                show_progress and job_index == last_job)
            for job_index in range(n_jobs))
    return pd.concat(Parallel(n_jobs=n_jobs)(jobs))
Beispiel #10
0
def edit_dist_join(ltable, rtable,
                   l_key_attr, r_key_attr,
                   l_join_attr, r_join_attr,
                   threshold,
                   l_out_attrs=None, r_out_attrs=None,
                   l_out_prefix='l_', r_out_prefix='r_',
                   out_sim_score=True, n_jobs=1,
                   tokenizer=create_qgram_tokenizer(2)):
    """Join two tables using edit distance similarity measure.

    Finds tuple pairs from ltable and rtable such that
    EditDistance(ltable.l_join_attr, rtable.r_join_attr) <= threshold

    Args:
        ltable, rtable (DataFrame): left and right input tables.

        l_key_attr, r_key_attr (string): key attribute from ltable and rtable.

        l_join_attr, r_join_attr (string): join attribute from ltable and
            rtable.

        threshold (int): edit distance threshold to be satisfied. A float
            value is floored to an integer.

        l_out_attrs, r_out_attrs (list): attributes to be included in the
            output table from ltable and rtable (default to None).

        l_out_prefix, r_out_prefix (string): prefix to be used in the
            attribute names of the output table (default to 'l_' and 'r_').

        out_sim_score (boolean): indicates if edit distance needs to be
            included in the output table (defaults to True).

        n_jobs (int): number of parallel jobs to use (defaults to 1). For a
            negative value, (n_cpus + 1 + n_jobs) jobs are used, so -1 means
            all CPUs. If the resulting value is less than 1, or the right
            table has at most one row, no parallel code is used. The job
            count never exceeds the number of rows in the right table.

        tokenizer (Tokenizer): tokenizer to be used to tokenize join
            attributes. Note that the default tokenizer is created once, when
            the function is defined, and is shared by every call that does
            not pass its own.

    Returns:
        An output table (DataFrame) whose first column, '_id', numbers the
        rows starting from 0.
    """
    # check if the input tables are dataframes
    validate_input_table(ltable, 'left table')
    validate_input_table(rtable, 'right table')

    # check if the key attributes and join attributes exist
    validate_attr(l_key_attr, ltable.columns,
                  'key attribute', 'left table')
    validate_attr(r_key_attr, rtable.columns,
                  'key attribute', 'right table')
    validate_attr(l_join_attr, ltable.columns,
                  'join attribute', 'left table')
    validate_attr(r_join_attr, rtable.columns,
                  'join attribute', 'right table')

    # check if the input tokenizer is valid
    validate_tokenizer(tokenizer)

    # check if the input threshold is valid
    validate_threshold(threshold, 'EDIT_DISTANCE')

    # check if the output attributes exist
    validate_output_attrs(l_out_attrs, ltable.columns,
                          r_out_attrs, rtable.columns)

    # check if the key attributes are unique and do not contain missing values
    validate_key_attr(l_key_attr, ltable, 'left table')
    validate_key_attr(r_key_attr, rtable, 'right table')

    # convert threshold to integer (in case it is a float)
    threshold = int(floor(threshold))

    # Compute the actual number of jobs to launch. The raw value is used both
    # as the number of right-table splits and as the joblib worker count, so
    # zero or negative values have to be resolved to a positive count first.
    if n_jobs < 0:
        import multiprocessing
        n_jobs = multiprocessing.cpu_count() + 1 + n_jobs
    n_jobs = min(max(n_jobs, 1), len(rtable))

    if n_jobs <= 1:
        # do not use any parallel code.
        output_table = _edit_dist_join_split(ltable, rtable,
                                             l_key_attr, r_key_attr,
                                             l_join_attr, r_join_attr,
                                             tokenizer,
                                             threshold,
                                             l_out_attrs, r_out_attrs,
                                             l_out_prefix, r_out_prefix,
                                             out_sim_score)
    else:
        # split the right table into n_jobs splits and join each split with
        # the whole of the left table in a separate job.
        r_splits = split_table(rtable, n_jobs)
        results = Parallel(n_jobs=n_jobs)(delayed(_edit_dist_join_split)(
                                             ltable, r_split,
                                             l_key_attr, r_key_attr,
                                             l_join_attr, r_join_attr,
                                             tokenizer,
                                             threshold,
                                             l_out_attrs, r_out_attrs,
                                             l_out_prefix, r_out_prefix,
                                             out_sim_score)
                                          for r_split in r_splits)
        output_table = pd.concat(results)

    # add an id column named '_id' to the output table.
    output_table.insert(0, '_id', range(0, len(output_table)))
    return output_table
def extract_feature_vecs(candset, candset_l_key_attr, candset_r_key_attr,
                         ltable, rtable,
                         l_key_attr, r_key_attr, l_join_attr, r_join_attr,
                         feature_table, n_jobs=1, show_progress=True):
    """Validate the inputs and hand the candidate set to
    ``_extract_feature_vecs_split``, serially or split across parallel jobs.

    Args:
        candset (DataFrame): input candidate set.
        candset_l_key_attr (string): left key attribute of the candidate set.
        candset_r_key_attr (string): right key attribute of the candidate set.
        ltable (DataFrame): left input table.
        rtable (DataFrame): right input table.
        l_key_attr (string): key attribute in left table.
        r_key_attr (string): key attribute in right table.
        l_join_attr (string): join attribute in left table.
        r_join_attr (string): join attribute in right table.
        feature_table: passed through as-is to the worker function.
        n_jobs (int): requested number of parallel jobs (defaults to 1); the
            number actually launched is derived from
            ``get_num_processes_to_launch`` and bounded by ``len(candset)``.
        show_progress (boolean): progress flag (defaults to True); with
            several jobs, it is forwarded to the last job only.

    Returns:
        What ``_extract_feature_vecs_split`` returns for the whole candidate
        set, or the concatenation (``pd.concat``) of its per-split results.
    """
    # --- candidate set checks ---------------------------------------------
    validate_input_table(candset, 'candset')
    validate_attr(candset_l_key_attr, candset.columns,
                  'left key attribute', 'candset')
    validate_attr(candset_r_key_attr, candset.columns,
                  'right key attribute', 'candset')

    # --- input table checks -----------------------------------------------
    validate_input_table(ltable, 'left table')
    validate_input_table(rtable, 'right table')

    # key and join attributes have to be present
    validate_attr(l_key_attr, ltable.columns, 'key attribute', 'left table')
    validate_attr(r_key_attr, rtable.columns, 'key attribute', 'right table')
    validate_attr(l_join_attr, ltable.columns, 'join attribute', 'left table')
    validate_attr(r_join_attr, rtable.columns, 'join attribute', 'right table')

    # numeric join attributes are rejected
    validate_attr_type(l_join_attr, ltable[l_join_attr].dtype,
                       'join attribute', 'left table')
    validate_attr_type(r_join_attr, rtable[r_join_attr].dtype,
                       'join attribute', 'right table')

    # keys have to be unique and without missing values
    validate_key_attr(l_key_attr, ltable, 'left table')
    validate_key_attr(r_key_attr, rtable, 'right table')

    # restrict both tables to the attributes the worker needs
    ltable_projected = ltable[[l_key_attr, l_join_attr]]
    rtable_projected = rtable[[r_key_attr, r_join_attr]]

    # arguments shared by every invocation of the worker; the candset (or
    # candset split) goes in front and the progress flag at the end
    worker_args = (candset_l_key_attr, candset_r_key_attr,
                   ltable_projected, rtable_projected,
                   l_key_attr, r_key_attr,
                   l_join_attr, r_join_attr,
                   feature_table)

    # effective number of jobs
    n_jobs = min(get_num_processes_to_launch(n_jobs), len(candset))

    if n_jobs > 1:
        # one candset split per job, each processed in its own worker
        candset_splits = split_table(candset, n_jobs)
        results = Parallel(n_jobs=n_jobs)(
            delayed(_extract_feature_vecs_split)(
                *((candset_splits[job_index],) + worker_args +
                  ((show_progress and (job_index == n_jobs - 1)),)))
            for job_index in range(n_jobs))
        return pd.concat(results)

    # single job: plain function call, no parallel code
    return _extract_feature_vecs_split(
        *((candset,) + worker_args + (show_progress,)))
Beispiel #12
0
def jaccard_join(ltable, rtable,
                 l_key_attr, r_key_attr,
                 l_join_attr, r_join_attr,
                 tokenizer,
                 threshold,
                 l_out_attrs=None, r_out_attrs=None,
                 l_out_prefix='l_', r_out_prefix='r_',
                 out_sim_score=True,
                 n_jobs=1):
    """Join two tables using jaccard similarity measure.

    Finds tuple pairs from ltable and rtable such that
    Jaccard(ltable.l_join_attr, rtable.r_join_attr) >= threshold

    Args:
        ltable, rtable (DataFrame): left and right input tables.

        l_key_attr, r_key_attr (string): key attribute from ltable and rtable.

        l_join_attr, r_join_attr (string): join attribute from ltable and
            rtable.

        tokenizer (Tokenizer): tokenizer to be used to tokenize join
            attributes.

        threshold (float): jaccard threshold to be satisfied.

        l_out_attrs, r_out_attrs (list): attributes to be included in the
            output table from ltable and rtable (default to None).

        l_out_prefix, r_out_prefix (string): prefix to be used in the
            attribute names of the output table (default to 'l_' and 'r_').

        out_sim_score (boolean): indicates if similarity score needs to be
            included in the output table (defaults to True).

        n_jobs (int): number of parallel jobs to use (defaults to 1). For a
            negative value, (n_cpus + 1 + n_jobs) jobs are used, so -1 means
            all CPUs. If the resulting value is less than 1, or the right
            table has at most one row, no parallel code is used. The job
            count never exceeds the number of rows in the right table.

    Returns:
        An output table (DataFrame) whose first column, '_id', numbers the
        rows starting from 0.
    """
    # check if the input tables are dataframes
    validate_input_table(ltable, 'left table')
    validate_input_table(rtable, 'right table')

    # check if the key attributes and join attributes exist
    validate_attr(l_key_attr, ltable.columns,
                  'key attribute', 'left table')
    validate_attr(r_key_attr, rtable.columns,
                  'key attribute', 'right table')
    validate_attr(l_join_attr, ltable.columns,
                  'join attribute', 'left table')
    validate_attr(r_join_attr, rtable.columns,
                  'join attribute', 'right table')

    # check if the input tokenizer is valid
    validate_tokenizer(tokenizer)

    # check if the input threshold is valid
    validate_threshold(threshold, 'JACCARD')

    # check if the output attributes exist
    validate_output_attrs(l_out_attrs, ltable.columns,
                          r_out_attrs, rtable.columns)

    # check if the key attributes are unique and do not contain missing values
    validate_key_attr(l_key_attr, ltable, 'left table')
    validate_key_attr(r_key_attr, rtable, 'right table')

    # Compute the actual number of jobs to launch. The raw value is used both
    # as the number of right-table splits and as the joblib worker count, so
    # zero or negative values have to be resolved to a positive count first.
    if n_jobs < 0:
        import multiprocessing
        n_jobs = multiprocessing.cpu_count() + 1 + n_jobs
    n_jobs = min(max(n_jobs, 1), len(rtable))

    if n_jobs <= 1:
        # do not use any parallel code.
        output_table = _set_sim_join_split(ltable, rtable,
                                           l_key_attr, r_key_attr,
                                           l_join_attr, r_join_attr,
                                           tokenizer,
                                           'JACCARD',
                                           threshold,
                                           l_out_attrs, r_out_attrs,
                                           l_out_prefix, r_out_prefix,
                                           out_sim_score)
    else:
        # split the right table into n_jobs splits and join each split with
        # the whole of the left table in a separate job.
        r_splits = split_table(rtable, n_jobs)
        results = Parallel(n_jobs=n_jobs)(delayed(_set_sim_join_split)(
                                              ltable, r_split,
                                              l_key_attr, r_key_attr,
                                              l_join_attr, r_join_attr,
                                              tokenizer,
                                              'JACCARD',
                                              threshold,
                                              l_out_attrs, r_out_attrs,
                                              l_out_prefix, r_out_prefix,
                                              out_sim_score)
                                          for r_split in r_splits)
        output_table = pd.concat(results)

    # add an id column named '_id' to the output table.
    output_table.insert(0, '_id', range(0, len(output_table)))
    return output_table
Beispiel #13
0
    def filter_candset(self, candset,
                       candset_l_key_attr, candset_r_key_attr,
                       ltable, rtable,
                       l_key_attr, r_key_attr,
                       l_filter_attr, r_filter_attr,
                       n_jobs=1, show_progress=True):
        """Applies this filter to the pairs of an input candidate set and
        keeps the surviving pairs.

        Args:
            candset (DataFrame): input candidate set.

            candset_l_key_attr (string): attribute in candidate set which is a
                key in left table.

            candset_r_key_attr (string): attribute in candidate set which is a
                key in right table.

            ltable (DataFrame): left input table.

            rtable (DataFrame): right input table.

            l_key_attr (string): key attribute in left table.

            r_key_attr (string): key attribute in right table.

            l_filter_attr (string): attribute in left table on which the
                filter should be applied.

            r_filter_attr (string): attribute in right table on which the
                filter should be applied.

            n_jobs (int): number of parallel jobs to use for the computation
                (defaults to 1). If -1 is given, all CPUs are used. If 1 is
                given, no parallel computing code is used at all, which is
                useful for debugging. For n_jobs below -1,
                (n_cpus + 1 + n_jobs) are used (where n_cpus is the total
                number of CPUs in the machine). Thus for n_jobs = -2, all
                CPUs but one are used. If (n_cpus + 1 + n_jobs) becomes less
                than 1, then no parallel computing code will be used (i.e.,
                equivalent to the default).

            show_progress (boolean): flag to indicate whether task progress
                should be displayed to the user (defaults to True).

        Returns:
            An output table containing tuple pairs from the candidate set that
            survive the filter (DataFrame). An empty candidate set is returned
            as is.
        """
        # the candidate set must be a dataframe exposing both key columns
        validate_input_table(candset, 'candset')
        for cand_attr, attr_label in (
                (candset_l_key_attr, 'left key attribute'),
                (candset_r_key_attr, 'right key attribute')):
            validate_attr(cand_attr, candset.columns, attr_label, 'candset')

        # (table, label, key attribute, filter attribute) per side
        sides = ((ltable, 'left table', l_key_attr, l_filter_attr),
                 (rtable, 'right table', r_key_attr, r_filter_attr))

        # both inputs must be dataframes
        for table, table_label, _, _ in sides:
            validate_input_table(table, table_label)

        # key attributes, then filter attributes, must exist in their tables
        for table, table_label, key_attr, _ in sides:
            validate_attr(key_attr, table.columns,
                          'key attribute', table_label)
        for table, table_label, _, filter_attr in sides:
            validate_attr(filter_attr, table.columns,
                          'filter attribute', table_label)

        # filter attributes must not be of numeric type
        for table, table_label, _, filter_attr in sides:
            validate_attr_type(filter_attr, table[filter_attr].dtype,
                               'filter attribute', table_label)

        # key attributes must be unique and free of missing values
        for table, table_label, key_attr, _ in sides:
            validate_key_attr(key_attr, table, table_label)

        # nothing to filter in an empty candidate set
        if candset.empty:
            return candset

        # keep only the key and filter attributes of each table
        ltable_projected, rtable_projected = (
            table[[key_attr, filter_attr]]
            for table, _, key_attr, filter_attr in sides)

        # never launch more jobs than there are candidate pairs
        n_jobs = min(get_num_processes_to_launch(n_jobs), len(candset))

        if n_jobs <= 1:
            # serial path: no parallel machinery involved
            return _filter_candset_split(
                candset,
                candset_l_key_attr, candset_r_key_attr,
                ltable_projected, rtable_projected,
                l_key_attr, r_key_attr,
                l_filter_attr, r_filter_attr,
                self, show_progress)

        # parallel path: one candset split per job; progress is reported by
        # the last job only
        candset_splits = split_table(candset, n_jobs)
        last_job = n_jobs - 1
        jobs = (delayed(_filter_candset_split)(
                    candset_splits[job_index],
                    candset_l_key_attr, candset_r_key_attr,
                    ltable_projected, rtable_projected,
                    l_key_attr, r_key_attr,
                    l_filter_attr, r_filter_attr,
                    self,
                    show_progress and job_index == last_job)
                for job_index in range(n_jobs))
        return pd.concat(Parallel(n_jobs=n_jobs)(jobs))
def apply_matcher(candset,
                  candset_l_key_attr, candset_r_key_attr,
                  ltable, rtable,
                  l_key_attr, r_key_attr,
                  l_match_attr, r_match_attr,
                  tokenizer, sim_function,
                  threshold, comp_op='>=',
                  allow_missing=False,
                  l_out_attrs=None, r_out_attrs=None,
                  l_out_prefix='l_', r_out_prefix='r_',
                  out_sim_score=True, n_jobs=1, show_progress=True):
    """Find matching string pairs from the candidate set (typically produced by
    applying a filter to two tables) by applying a matcher of form
    (sim_function comp_op threshold).

    Specifically, this method computes the input similarity function on string
    pairs in the candidate set and checks if the resulting score satisfies the
    input threshold (depending on the comparison operator).

    Args:
        candset (DataFrame): input candidate set.

        candset_l_key_attr (string): attribute in candidate set which is a key
            in left table.

        candset_r_key_attr (string): attribute in candidate set which is a key
            in right table.

        ltable (DataFrame): left input table.

        rtable (DataFrame): right input table.

        l_key_attr (string): key attribute in left table.

        r_key_attr (string): key attribute in right table.

        l_match_attr (string): attribute in left table on which the matcher
            should be applied.

        r_match_attr (string): attribute in right table on which the matcher
            should be applied.

        tokenizer (Tokenizer): tokenizer to be used to tokenize the
            match attributes. If set to None, the matcher is applied directly
            on the match attributes.

        sim_function (function): matcher function to be applied.

        threshold (float): threshold to be satisfied.

        comp_op (string): comparison operator. Supported values are '>=', '>',
            '<=', '<', '=' and '!=' (defaults to '>=').

        allow_missing (boolean): flag to indicate whether tuple pairs with
            missing value in at least one of the match attributes should be
            included in the output (defaults to False).

        l_out_attrs (list): list of attribute names from the left table to be
            included in the output table (defaults to None).

        r_out_attrs (list): list of attribute names from the right table to be
            included in the output table (defaults to None).

        l_out_prefix (string): prefix to be used for the attribute names coming
            from the left table, in the output table (defaults to 'l\\_').

        r_out_prefix (string): prefix to be used for the attribute names coming
            from the right table, in the output table (defaults to 'r\\_').

        out_sim_score (boolean): flag to indicate whether similarity score
            should be included in the output table (defaults to True). Setting
            this flag to True will add a column named '_sim_score' in the
            output table. This column will contain the similarity scores for the
            tuple pairs in the output.

        n_jobs (int): number of parallel jobs to use for the computation
            (defaults to 1). If -1 is given, all CPUs are used. If 1 is given,
            no parallel computing code is used at all, which is useful for
            debugging. For n_jobs below -1, (n_cpus + 1 + n_jobs) are used
            (where n_cpus is the total number of CPUs in the machine). Thus for
            n_jobs = -2, all CPUs but one are used. If (n_cpus + 1 + n_jobs)
            becomes less than 1, then no parallel computing code will be used
            (i.e., equivalent to the default).

        show_progress (boolean): flag to indicate whether task progress should
            be displayed to the user (defaults to True).

    Returns:
        An output table containing tuple pairs from the candidate set that
        survive the matcher (DataFrame). If the input candidate set is empty,
        it is returned unchanged.
    """

    # check if the input candset is a dataframe
    validate_input_table(candset, 'candset')

    # check if the candset key attributes exist
    validate_attr(candset_l_key_attr, candset.columns,
                  'left key attribute', 'candset')
    validate_attr(candset_r_key_attr, candset.columns,
                  'right key attribute', 'candset')

    # check if the input tables are dataframes
    validate_input_table(ltable, 'left table')
    validate_input_table(rtable, 'right table')

    # check if the key attributes and match attributes exist
    validate_attr(l_key_attr, ltable.columns,
                  'key attribute', 'left table')
    validate_attr(r_key_attr, rtable.columns,
                  'key attribute', 'right table')
    validate_attr(l_match_attr, ltable.columns,
                  'match attribute', 'left table')
    validate_attr(r_match_attr, rtable.columns,
                  'match attribute', 'right table')

    # check if the output attributes exist
    validate_output_attrs(l_out_attrs, ltable.columns,
                          r_out_attrs, rtable.columns)

    # check if the input tokenizer is valid, if it is not None
    if tokenizer is not None:
        validate_tokenizer(tokenizer)

    # check if the comparison operator is valid
    validate_comp_op(comp_op)

    # check if the key attributes are unique and do not contain missing values
    validate_key_attr(l_key_attr, ltable, 'left table')
    validate_key_attr(r_key_attr, rtable, 'right table')

    # nothing to match: hand the (empty) candset straight back
    if candset.empty:
        return candset

    # remove redundant attrs from output attrs.
    l_out_attrs = remove_redundant_attrs(l_out_attrs, l_key_attr)
    r_out_attrs = remove_redundant_attrs(r_out_attrs, r_key_attr)

    # get attributes to project.
    l_proj_attrs = get_attrs_to_project(l_out_attrs, l_key_attr, l_match_attr)
    r_proj_attrs = get_attrs_to_project(r_out_attrs, r_key_attr, r_match_attr)

    # project the input dataframes down to the attributes that are needed.
    ltable_projected = ltable[l_proj_attrs]
    rtable_projected = rtable[r_proj_attrs]

    # computes the actual number of jobs to launch (never more jobs than
    # there are candidate pairs).
    n_jobs = min(get_num_processes_to_launch(n_jobs), len(candset))

    # If a tokenizer is provided, we can optimize by tokenizing each value
    # only once by caching the tokens of l_match_attr and r_match_attr. But,
    # this can be a bad strategy in case the candset has very few records
    # compared to the original tables. Hence, we check if the sum of tuples in
    # ltable and rtable is less than twice the number of tuples in the candset.
    # If yes, we decide to cache the token values. Else, we do not cache the
    # tokens as the candset is small.
    l_tokens = None
    r_tokens = None
    if tokenizer is not None and (len(ltable) + len(rtable) < len(candset) * 2):
        l_tokens = generate_tokens(ltable_projected, l_key_attr, l_match_attr,
                                   tokenizer)
        r_tokens = generate_tokens(rtable_projected, r_key_attr, r_match_attr,
                                   tokenizer)

    # Arguments that are identical for every call of _apply_matcher_split,
    # listed in the positional order that helper receives them (they follow
    # the candset and precede the progress flag and the cached tokens).
    common_args = (candset_l_key_attr, candset_r_key_attr,
                   ltable_projected, rtable_projected,
                   l_key_attr, r_key_attr,
                   l_match_attr, r_match_attr,
                   tokenizer, sim_function,
                   threshold, comp_op, allow_missing,
                   l_out_attrs, r_out_attrs,
                   l_out_prefix, r_out_prefix,
                   out_sim_score)

    if n_jobs <= 1:
        # if n_jobs is 1, do not use any parallel code.
        return _apply_matcher_split(
            candset, *(common_args + (show_progress, l_tokens, r_tokens)))

    # if n_jobs is above 1, split the candset into n_jobs splits and apply
    # the matcher on each candset split in a separate process. Only the last
    # job is allowed to display progress.
    candset_splits = split_table(candset, n_jobs)
    results = Parallel(n_jobs=n_jobs)(
        delayed(_apply_matcher_split)(
            candset_splits[job_index],
            *(common_args + ((show_progress and (job_index == n_jobs - 1)),
                             l_tokens, r_tokens)))
        for job_index in range(n_jobs))
    return pd.concat(results)
Beispiel #15
0
    def filter_candset(self, candset, candset_l_key_attr, candset_r_key_attr,
                       ltable, rtable, l_key_attr, r_key_attr,
                       l_filter_attr, r_filter_attr,
                       n_jobs=1, show_progress=True):
        """Apply this filter to an existing candidate set of tuple pairs.

        Args:
            candset (DataFrame): input candidate set.

            candset_l_key_attr (string): column of the candidate set holding
                keys of the left table.

            candset_r_key_attr (string): column of the candidate set holding
                keys of the right table.

            ltable (DataFrame): left input table.

            rtable (DataFrame): right input table.

            l_key_attr (string): key attribute in left table.

            r_key_attr (string): key attribute in right table.

            l_filter_attr (string): attribute in left table on which the
                filter should be applied.

            r_filter_attr (string): attribute in right table on which the
                filter should be applied.

            n_jobs (int): number of parallel jobs to use for the computation
                (defaults to 1). If -1 is given, all CPUs are used. If 1 is
                given, no parallel computing code is used at all, which is
                useful for debugging. For n_jobs below -1,
                (n_cpus + 1 + n_jobs) are used (where n_cpus is the total
                number of CPUs in the machine). Thus for n_jobs = -2, all CPUs
                but one are used. If (n_cpus + 1 + n_jobs) becomes less than 1,
                then no parallel computing code will be used (i.e., equivalent
                to the default).

            show_progress (boolean): flag to indicate whether task progress
                should be displayed to the user (defaults to True).

        Returns:
            An output table containing tuple pairs from the candidate set that
            survive the filter (DataFrame). An empty candidate set is returned
            unchanged.
        """

        # the candidate set must be a dataframe that has both key columns
        validate_input_table(candset, 'candset')
        for candset_key, key_label in (
                (candset_l_key_attr, 'left key attribute'),
                (candset_r_key_attr, 'right key attribute')):
            validate_attr(candset_key, candset.columns, key_label, 'candset')

        # both input tables must be dataframes
        for table, table_label in ((ltable, 'left table'),
                                   (rtable, 'right table')):
            validate_input_table(table, table_label)

        # key attributes and filter attributes must exist in their tables
        for attr, table, attr_label, table_label in (
                (l_key_attr, ltable, 'key attribute', 'left table'),
                (r_key_attr, rtable, 'key attribute', 'right table'),
                (l_filter_attr, ltable, 'filter attribute', 'left table'),
                (r_filter_attr, rtable, 'filter attribute', 'right table')):
            validate_attr(attr, table.columns, attr_label, table_label)

        # validate the dtype of each filter attribute
        for attr, table, table_label in (
                (l_filter_attr, ltable, 'left table'),
                (r_filter_attr, rtable, 'right table')):
            validate_attr_type(attr, table[attr].dtype,
                               'filter attribute', table_label)

        # validate the key attribute of each table
        for attr, table, table_label in (
                (l_key_attr, ltable, 'left table'),
                (r_key_attr, rtable, 'right table')):
            validate_key_attr(attr, table, table_label)

        # an empty candidate set has nothing to filter
        if candset.empty:
            return candset

        # keep only the key and filter columns of each input table
        ltable_projected = ltable[[l_key_attr, l_filter_attr]]
        rtable_projected = rtable[[r_key_attr, r_filter_attr]]

        # never launch more jobs than there are candidate pairs
        n_jobs = min(get_num_processes_to_launch(n_jobs), len(candset))

        # positional arguments common to every _filter_candset_split call;
        # they follow the candset and precede the progress flag
        shared_args = (candset_l_key_attr, candset_r_key_attr,
                       ltable_projected, rtable_projected,
                       l_key_attr, r_key_attr,
                       l_filter_attr, r_filter_attr, self)

        if n_jobs <= 1:
            # single job: run in-process without any parallel machinery
            return _filter_candset_split(
                candset, *(shared_args + (show_progress,)))

        # several jobs: filter one candset split per job; only the last job
        # may report progress
        candset_splits = split_table(candset, n_jobs)
        results = Parallel(n_jobs=n_jobs)(
            delayed(_filter_candset_split)(
                candset_splits[job_index],
                *(shared_args +
                  ((show_progress and (job_index == n_jobs - 1)),)))
            for job_index in range(n_jobs))
        return pd.concat(results)
    def filter_tables(self,
                      ltable,
                      rtable,
                      l_key_attr,
                      r_key_attr,
                      l_filter_attr,
                      r_filter_attr,
                      l_out_attrs=None,
                      r_out_attrs=None,
                      l_out_prefix='l_',
                      r_out_prefix='r_'):
        """Filter two tables with the suffix filter.

        Every pair made of one left row and one right row is tested with
        ``self._filter_suffix``; pairs for which that test returns a false
        value are kept in the output. Rows whose filter attribute converts
        to an empty string are ignored.

        Args:
            ltable (DataFrame): left input table.

            rtable (DataFrame): right input table.

            l_key_attr (string): key attribute in left table.

            r_key_attr (string): key attribute in right table.

            l_filter_attr (string): attribute in left table on which the
                filter should be applied.

            r_filter_attr (string): attribute in right table on which the
                filter should be applied.

            l_out_attrs (list): list of attribute names from the left table
                to be included in the output table (defaults to None).

            r_out_attrs (list): list of attribute names from the right table
                to be included in the output table (defaults to None).

            l_out_prefix (string): prefix to be used for the attribute names
                coming from the left table, in the output table (defaults
                to 'l\\_').

            r_out_prefix (string): prefix to be used for the attribute names
                coming from the right table, in the output table (defaults
                to 'r\\_').

        Returns:
            An output table containing the surviving tuple pairs (DataFrame).
            Its first column, '_id', numbers the output rows from 0.
        """
        # check if the input tables are dataframes
        validate_input_table(ltable, 'left table')
        validate_input_table(rtable, 'right table')

        # check if the key attributes and filter attributes exist
        validate_attr(l_key_attr, ltable.columns, 'key attribute',
                      'left table')
        validate_attr(r_key_attr, rtable.columns, 'key attribute',
                      'right table')
        validate_attr(l_filter_attr, ltable.columns, 'filter attribute',
                      'left table')
        validate_attr(r_filter_attr, rtable.columns, 'filter attribute',
                      'right table')

        # check if the output attributes exist
        validate_output_attrs(l_out_attrs, ltable.columns, r_out_attrs,
                              rtable.columns)

        # check if the key attributes are unique and do not contain missing
        # values
        validate_key_attr(l_key_attr, ltable, 'left table')
        validate_key_attr(r_key_attr, rtable, 'right table')

        # find column indices of key attr, filter attr and
        # output attrs in ltable
        l_columns = list(ltable.columns.values)
        l_key_attr_index = l_columns.index(l_key_attr)
        l_filter_attr_index = l_columns.index(l_filter_attr)
        l_out_attrs_indices = find_output_attribute_indices(
            l_columns, l_out_attrs)

        # find column indices of key attr, filter attr and
        # output attrs in rtable
        r_columns = list(rtable.columns.values)
        r_key_attr_index = r_columns.index(r_key_attr)
        r_filter_attr_index = r_columns.index(r_filter_attr)
        r_out_attrs_indices = find_output_attribute_indices(
            r_columns, r_out_attrs)

        # build a dictionary on ltable
        ltable_dict = build_dict_from_table(ltable, l_key_attr_index,
                                            l_filter_attr_index)

        # build a dictionary on rtable
        rtable_dict = build_dict_from_table(rtable, r_key_attr_index,
                                            r_filter_attr_index)

        # generate token ordering using tokens in l_filter_attr
        # and r_filter_attr
        token_ordering = gen_token_ordering_for_tables(
            [ltable_dict.values(), rtable_dict.values()],
            [l_filter_attr_index, r_filter_attr_index], self.tokenizer,
            self.sim_measure_type)

        # Tokenize, order and measure every right-table string exactly once.
        # None of this depends on the left row, so doing it inside the pair
        # loop would repeat the same work once per left row.
        # NOTE(review): str() turns a missing value into non-empty text such
        # as 'nan'; whether such rows can reach this point depends on
        # build_dict_from_table -- confirm.
        r_entries = []
        for r_row in rtable_dict.values():
            r_string = str(r_row[r_filter_attr_index])
            # check for empty string
            if not r_string:
                continue
            rtokens = tokenize(r_string, self.tokenizer,
                               self.sim_measure_type)
            ordered_rtokens = order_using_token_ordering(
                rtokens, token_ordering)
            r_num_tokens = len(ordered_rtokens)
            r_prefix_length = get_prefix_length(r_num_tokens,
                                                self.sim_measure_type,
                                                self.threshold,
                                                self.tokenizer)
            r_entries.append((r_row, r_row[r_key_attr_index],
                              ordered_rtokens, r_prefix_length,
                              r_num_tokens))

        output_rows = []
        has_output_attributes = (l_out_attrs is not None
                                 or r_out_attrs is not None)
        prog_bar = pyprind.ProgBar(len(ltable))

        for l_row in ltable_dict.values():
            l_id = l_row[l_key_attr_index]
            l_string = str(l_row[l_filter_attr_index])
            # check for empty string
            if not l_string:
                # a skipped row still counts as processed; without this
                # update the progress bar could never reach completion
                prog_bar.update()
                continue
            ltokens = tokenize(l_string, self.tokenizer, self.sim_measure_type)
            ordered_ltokens = order_using_token_ordering(
                ltokens, token_ordering)
            l_num_tokens = len(ordered_ltokens)
            l_prefix_length = get_prefix_length(l_num_tokens,
                                                self.sim_measure_type,
                                                self.threshold, self.tokenizer)
            l_suffix = ordered_ltokens[l_prefix_length:]
            for (r_row, r_id, ordered_rtokens,
                 r_prefix_length, r_num_tokens) in r_entries:
                # the right suffix is sliced afresh for every pair so that
                # _filter_suffix always receives its own sequence object
                if not self._filter_suffix(
                        l_suffix, ordered_rtokens[r_prefix_length:],
                        l_prefix_length, r_prefix_length, l_num_tokens,
                        r_num_tokens):
                    if has_output_attributes:
                        output_row = get_output_row_from_tables(
                            ltable_dict[l_id], r_row, l_id, r_id,
                            l_out_attrs_indices, r_out_attrs_indices)
                        output_rows.append(output_row)
                    else:
                        output_rows.append([l_id, r_id])

            prog_bar.update()

        output_header = get_output_header_from_tables(l_key_attr, r_key_attr,
                                                      l_out_attrs, r_out_attrs,
                                                      l_out_prefix,
                                                      r_out_prefix)

        # generate a dataframe from the list of output rows
        output_table = pd.DataFrame(output_rows, columns=output_header)
        output_table.insert(0, '_id', range(0, len(output_table)))
        return output_table