Python OverlapFilter.filter_tables Exemples, py_stringsimjoin.filter.overlap_filter.OverlapFilter.filter_tables Python Exemples

Exemple #1

0

Afficher le fichier

Fichier : join.py Projet : paulgc/py_stringsimjoin-1

def overlap_join(ltable, rtable,
                 l_key_attr, r_key_attr,
                 l_join_attr, r_join_attr,
                 tokenizer,
                 threshold,
                 l_out_attrs=None, r_out_attrs=None,
                 l_out_prefix='l_', r_out_prefix='r_',
                 out_sim_score=True, n_jobs=1):
    """Join two tables using the overlap similarity measure.

    Returns the tuple pairs from ltable and rtable for which
    Overlap(ltable.l_join_attr, rtable.r_join_attr) >= threshold.
    The work is delegated to an OverlapFilter built from ``tokenizer``
    and ``threshold``.

    Args:
    ltable, rtable : Pandas data frames to be joined
    l_key_attr, r_key_attr : String, key attribute of ltable / rtable
    l_join_attr, r_join_attr : String, join attribute of ltable / rtable
    tokenizer : Tokenizer object used to tokenize the join attributes
    threshold : float, overlap threshold to be satisfied
    l_out_attrs, r_out_attrs : lists of attributes from ltable / rtable to
        be included in the output table
    l_out_prefix, r_out_prefix : String, prefixes for the attribute names
        in the output table
    out_sim_score : boolean, whether the similarity score should be
        included in the output table
    n_jobs : forwarded unchanged to OverlapFilter.filter_tables
        (presumably the number of parallel jobs)

    Returns:
    result : Pandas data frame
    """
    join_filter = OverlapFilter(tokenizer, threshold)
    # All arguments are forwarded positionally, in the order expected by
    # filter_tables.
    result = join_filter.filter_tables(
        ltable, rtable,
        l_key_attr, r_key_attr,
        l_join_attr, r_join_attr,
        l_out_attrs, r_out_attrs,
        l_out_prefix, r_out_prefix,
        out_sim_score, n_jobs)
    return result

Exemple #2

0

Afficher le fichier

Fichier : test_apply_matcher.py Projet : raghu100692/py_stringsimjoin

    def test_apply_matcher_with_join_attr_of_type_int(self):
        # Run apply_matcher on the 'zipcode' join attributes and compare
        # its output against a brute-force evaluation of the cartesian
        # product.
        l_join_attr = 'A.zipcode'
        r_join_attr = 'B.zipcode'
        comp_op = '>='
        threshold = 0.3
        tok = QgramTokenizer(qval=2, return_set=True)
        sim_func = get_sim_function('JACCARD')

        def score_row(row):
            # values are stringified before tokenizing
            return sim_func(tok.tokenize(str(row[l_join_attr])),
                            tok.tokenize(str(row[r_join_attr])))

        # score every pair of the cartesian product; the pairs that
        # satisfy the threshold form the expected output.
        cartprod = self.cartprod
        cartprod['sim_score'] = cartprod.apply(score_row, axis=1)

        satisfies = COMP_OP_MAP[comp_op]
        expected_pairs = {
            ','.join((str(row[self.l_key_attr]), str(row[self.r_key_attr])))
            for _, row in cartprod.iterrows()
            if satisfies(float(row['sim_score']), threshold)
        }

        # build the candidate set with an overlap filter.
        candset = OverlapFilter(tok, 1, comp_op).filter_tables(
            self.ltable, self.rtable,
            self.l_key_attr, self.r_key_attr,
            l_join_attr, r_join_attr)

        # names of the key columns in the candidate/output tables
        l_out_key = DEFAULT_L_OUT_PREFIX + self.l_key_attr
        r_out_key = DEFAULT_R_OUT_PREFIX + self.r_key_attr

        # run the jaccard matcher over the candidate set.
        output_candset = apply_matcher(
            candset, l_out_key, r_out_key,
            self.ltable, self.rtable,
            self.l_key_attr, self.r_key_attr,
            l_join_attr, r_join_attr,
            tok, sim_func, threshold)

        # the output table must expose exactly these columns, in order.
        assert_list_equal(list(output_candset.columns.values),
                          ['_id', l_out_key, r_out_key, '_sim_score'])

        actual_pairs = {
            ','.join((str(row[l_out_key]), str(row[r_out_key])))
            for _, row in output_candset.iterrows()
        }

        # same size and full intersection: the two sets are identical.
        assert_equal(len(expected_pairs), len(actual_pairs))
        common_pairs = actual_pairs.intersection(expected_pairs)
        assert_equal(len(common_pairs), len(expected_pairs))

Exemple #3

0

Afficher le fichier

Fichier : test_apply_matcher.py Projet : anhaidgroup/py_stringsimjoin

    def test_apply_matcher(self):
        # Compare apply_matcher (with join attributes requested as output
        # columns) against a brute-force evaluation of the cartesian
        # product.
        comp_op = '>='
        threshold = 0.3
        tok = QgramTokenizer(qval=2, return_set=True)
        sim_func = get_sim_function('JACCARD')

        def score_row(row):
            left_tokens = tok.tokenize(str(row[self.l_join_attr]))
            right_tokens = tok.tokenize(str(row[self.r_join_attr]))
            return sim_func(left_tokens, right_tokens)

        # score the whole cartesian product to derive the expected pairs.
        cartprod = self.cartprod
        cartprod['sim_score'] = cartprod.apply(score_row, axis=1)

        satisfies = COMP_OP_MAP[comp_op]
        expected_pairs = {
            ','.join((str(row[self.l_key_attr]), str(row[self.r_key_attr])))
            for _, row in cartprod.iterrows()
            if satisfies(float(row['sim_score']), threshold)
        }

        # obtain a candidate set from an overlap filter.
        candset = OverlapFilter(tok, 1, comp_op).filter_tables(
            self.ltable, self.rtable,
            self.l_key_attr, self.r_key_attr,
            self.l_join_attr, self.r_join_attr)

        # column names used by the candidate and output tables
        l_out_key = DEFAULT_L_OUT_PREFIX + self.l_key_attr
        r_out_key = DEFAULT_R_OUT_PREFIX + self.r_key_attr

        # apply a jaccard matcher, asking for the join attributes as
        # additional output columns.
        output_candset = apply_matcher(
            candset, l_out_key, r_out_key,
            self.ltable, self.rtable,
            self.l_key_attr, self.r_key_attr,
            self.l_join_attr, self.r_join_attr,
            tok, sim_func, threshold,
            comp_op, False,
            [self.l_join_attr], [self.r_join_attr],
            out_sim_score=True)

        expected_output_attrs = [
            '_id',
            l_out_key,
            r_out_key,
            DEFAULT_L_OUT_PREFIX + self.l_join_attr,
            DEFAULT_R_OUT_PREFIX + self.r_join_attr,
            '_sim_score',
        ]

        # the output table must expose exactly these columns, in order.
        assert_list_equal(list(output_candset.columns.values),
                          expected_output_attrs)

        actual_pairs = {
            ','.join((str(row[l_out_key]), str(row[r_out_key])))
            for _, row in output_candset.iterrows()
        }

        # same size and full intersection: the two sets are identical.
        assert_equal(len(expected_pairs), len(actual_pairs))
        common_pairs = actual_pairs.intersection(expected_pairs)
        assert_equal(len(common_pairs), len(expected_pairs))

Exemple #4

0

Afficher le fichier

    def test_filter_tables(self, tokenizer, overlap_size, comp_op,
                           allow_missing, args, expected_pairs):
        # Run filter_tables with the positional ``args`` and check both
        # the column layout and the key pairs of the resulting candset.
        actual_candset = OverlapFilter(
            tokenizer, overlap_size, comp_op, allow_missing
        ).filter_tables(*args)

        # args[8] / args[9], when supplied, override the default output
        # prefixes.
        l_out_prefix = args[8] if len(args) > 8 else self.default_l_out_prefix
        r_out_prefix = args[9] if len(args) > 9 else self.default_r_out_prefix

        # prefixed key columns (args[2] and args[3] are the key attributes)
        l_key_col = l_out_prefix + args[2]
        r_key_col = r_out_prefix + args[3]
        expected_output_attrs = ['_id', l_key_col, r_key_col]

        # args[6]: l_out_attrs, if given and non-empty.
        if len(args) > 6 and args[6]:
            expected_output_attrs.extend(
                l_out_prefix + attr
                for attr in remove_redundant_attrs(args[6], args[2]))

        # args[7]: r_out_attrs, if given and non-empty.
        if len(args) > 7 and args[7]:
            expected_output_attrs.extend(
                r_out_prefix + attr
                for attr in remove_redundant_attrs(args[7], args[3]))

        # the output table must expose exactly these columns, in order.
        assert_list_equal(list(actual_candset.columns.values),
                          expected_output_attrs)

        actual_pairs = {
            ','.join((str(row[l_key_col]), str(row[r_key_col])))
            for _, row in actual_candset.iterrows()
        }

        # same size and full intersection: the two sets are identical.
        assert_equal(len(expected_pairs), len(actual_pairs))
        common_pairs = actual_pairs.intersection(expected_pairs)
        assert_equal(len(common_pairs), len(expected_pairs))

Exemple #5

0

Afficher le fichier

Fichier : test_overlap_filter.py Projet : anhaidgroup/py_stringsimjoin

    def test_filter_tables(self, tokenizer, overlap_size, comp_op,
                           allow_missing, args, expected_pairs):
        # Build an OverlapFilter from the given parameters, run
        # filter_tables(*args), then validate the output columns and the
        # set of key pairs.
        candset_filter = OverlapFilter(tokenizer, overlap_size,
                                       comp_op, allow_missing)
        actual_candset = candset_filter.filter_tables(*args)

        arg_count = len(args)

        # output prefixes: taken from args[8] / args[9] when present,
        # otherwise the defaults stored on the test case.
        l_out_prefix = (args[8] if arg_count > 8
                        else self.default_l_out_prefix)
        r_out_prefix = (args[9] if arg_count > 9
                        else self.default_r_out_prefix)

        # key columns come right after '_id'.
        l_key_col = l_out_prefix + args[2]
        r_key_col = r_out_prefix + args[3]
        expected_output_attrs = ['_id', l_key_col, r_key_col]

        # optional left output attributes (args[6]).
        if arg_count > 6 and args[6]:
            for attr in remove_redundant_attrs(args[6], args[2]):
                expected_output_attrs.append(l_out_prefix + attr)

        # optional right output attributes (args[7]).
        if arg_count > 7 and args[7]:
            for attr in remove_redundant_attrs(args[7], args[3]):
                expected_output_attrs.append(r_out_prefix + attr)

        # verify that the output table has exactly the expected columns.
        assert_list_equal(list(actual_candset.columns.values),
                          expected_output_attrs)

        actual_pairs = {
            ','.join((str(row[l_key_col]), str(row[r_key_col])))
            for _, row in actual_candset.iterrows()
        }

        # verify that the actual and expected pairs coincide.
        assert_equal(len(expected_pairs), len(actual_pairs))
        common_pairs = actual_pairs.intersection(expected_pairs)
        assert_equal(len(common_pairs), len(expected_pairs))

Exemple #6

0

Afficher le fichier

 def test_invalid_r_out_attr(self):
     # Pass 'B.invalid_attr' as the only right-table output attribute.
     # NOTE(review): presumably expected to raise; the expectation (e.g. a
     # decorator) is not visible in this excerpt - confirm.
     OverlapFilter(self.tokenizer, self.threshold).filter_tables(
         self.A, self.B, 'A.id', 'B.id', 'A.attr', 'B.attr',
         ['A.attr'], ['B.invalid_attr'])

Exemple #7

0

Afficher le fichier

 def test_numeric_r_filter_attr(self):
     # Use 'B.int_attr' as the right-table join attribute.
     # NOTE(review): presumably expected to raise for a numeric attribute;
     # the expectation is not visible in this excerpt - confirm.
     OverlapFilter(self.tokenizer, self.threshold).filter_tables(
         self.A, self.B, 'A.id', 'B.id', 'A.attr', 'B.int_attr')

Exemple #8

0

Afficher le fichier

 def test_invalid_rtable(self):
     # Pass an empty list where the right table is expected.
     # NOTE(review): presumably expected to raise; the expectation is not
     # visible in this excerpt - confirm.
     OverlapFilter(self.tokenizer, self.threshold).filter_tables(
         self.A, [], 'A.id', 'B.id', 'A.attr', 'B.attr')

Exemple #9

0

Afficher le fichier

Fichier : test_apply_matcher.py Projet : raghu100692/py_stringsimjoin

    def test_apply_matcher_with_allow_missing(self):
        # Check apply_matcher with allow_missing enabled: besides the
        # pairs that satisfy the threshold, every pair with a missing
        # join value on either side must be present in the output.
        comp_op = '>='
        threshold = 0.3
        tok = QgramTokenizer(qval=2, return_set=True)
        sim_func = get_sim_function('JACCARD')

        def score_row(row):
            return sim_func(tok.tokenize(str(row[self.l_join_attr])),
                            tok.tokenize(str(row[self.r_join_attr])))

        # score the whole cartesian product to find the pairs that
        # satisfy the threshold.
        cartprod = self.cartprod
        cartprod['sim_score'] = cartprod.apply(score_row, axis=1)

        satisfies = COMP_OP_MAP[comp_op]
        expected_pairs = {
            ','.join((str(row[self.l_key_attr]), str(row[self.r_key_attr])))
            for _, row in cartprod.iterrows()
            if satisfies(float(row['sim_score']), threshold)
        }

        # collect the pairs in which at least one join value is missing.
        missing_pairs = set()
        for _, l_row in self.orig_ltable.iterrows():
            l_missing = pd.isnull(l_row[self.l_join_attr])
            for _, r_row in self.orig_rtable.iterrows():
                if l_missing or pd.isnull(r_row[self.r_join_attr]):
                    missing_pairs.add(','.join((str(l_row[self.l_key_attr]),
                                                str(r_row[self.r_key_attr]))))

        # pairs with missing values are part of the expected output.
        expected_pairs = expected_pairs | missing_pairs

        # candidate set from an overlap filter with allow_missing=True.
        candset = OverlapFilter(
            tok, 1, comp_op, allow_missing=True
        ).filter_tables(
            self.orig_ltable, self.orig_rtable,
            self.l_key_attr, self.r_key_attr,
            self.l_join_attr, self.r_join_attr)

        # key column names in the candidate and output tables
        l_out_key = DEFAULT_L_OUT_PREFIX + self.l_key_attr
        r_out_key = DEFAULT_R_OUT_PREFIX + self.r_key_attr

        # jaccard matcher over the candidate set, allow_missing=True
        # (the positional True after comp_op).
        output_candset = apply_matcher(
            candset, l_out_key, r_out_key,
            self.orig_ltable, self.orig_rtable,
            self.l_key_attr, self.r_key_attr,
            self.l_join_attr, self.r_join_attr,
            tok, sim_func, threshold,
            comp_op, True,
            out_sim_score=True)

        # the output table must expose exactly these columns, in order.
        assert_list_equal(list(output_candset.columns.values),
                          ['_id', l_out_key, r_out_key, '_sim_score'])

        actual_pairs = {
            ','.join((str(row[l_out_key]), str(row[r_out_key])))
            for _, row in output_candset.iterrows()
        }

        # same size and full intersection: the two sets are identical.
        assert_equal(len(expected_pairs), len(actual_pairs))
        common_pairs = actual_pairs.intersection(expected_pairs)
        assert_equal(len(common_pairs), len(expected_pairs))

Exemple #10

0

Afficher le fichier

Fichier : overlap_join.py Projet : paulgc/py_stringsimjoin-2

def overlap_join(ltable, rtable,
                 l_key_attr, r_key_attr,
                 l_join_attr, r_join_attr,
                 tokenizer, threshold, comp_op='>=',
                 allow_missing=False,
                 l_out_attrs=None, r_out_attrs=None,
                 l_out_prefix='l_', r_out_prefix='r_',
                 out_sim_score=True, n_jobs=1, show_progress=True):
    """Join two tables using overlap measure.

    For two sets X and Y, the overlap between them is given by:

        :math:`overlap(X, Y) = |X \\cap Y|`

    Finds tuple pairs from left table and right table such that the overlap
    between the join attributes satisfies the condition on input threshold. For
    example, if the comparison operator is '>=', finds tuple pairs whose
    overlap between the strings that are the values of the join attributes is
    greater than or equal to the input threshold, as specified in "threshold".

    Args:
        ltable (DataFrame): left input table.

        rtable (DataFrame): right input table.

        l_key_attr (string): key attribute in left table.

        r_key_attr (string): key attribute in right table.

        l_join_attr (string): join attribute in left table.

        r_join_attr (string): join attribute in right table.

        tokenizer (Tokenizer): tokenizer to be used to tokenize join
            attributes. If its return_set flag is False, the flag is set to
            True for the duration of the join and restored afterwards, even
            when the join raises.

        threshold (float): overlap threshold to be satisfied.

        comp_op (string): comparison operator. Supported values are '>=', '>'
            and '=' (defaults to '>=').

        allow_missing (boolean): flag to indicate whether tuple pairs with
            missing value in at least one of the join attributes should be
            included in the output (defaults to False). If this flag is set to
            True, a tuple in ltable with missing value in the join attribute
            will be matched with every tuple in rtable and vice versa.

        l_out_attrs (list): list of attribute names from the left table to be
            included in the output table (defaults to None).

        r_out_attrs (list): list of attribute names from the right table to be
            included in the output table (defaults to None).

        l_out_prefix (string): prefix to be used for the attribute names coming
            from the left table, in the output table (defaults to 'l\\_').

        r_out_prefix (string): prefix to be used for the attribute names coming
            from the right table, in the output table (defaults to 'r\\_').

        out_sim_score (boolean): flag to indicate whether similarity score
            should be included in the output table (defaults to True). Setting
            this flag to True will add a column named '_sim_score' in the
            output table. This column will contain the similarity scores for the
            tuple pairs in the output.

        n_jobs (int): number of parallel jobs to use for the computation
            (defaults to 1). If -1 is given, all CPUs are used. If 1 is given,
            no parallel computing code is used at all, which is useful for
            debugging. For n_jobs below -1, (n_cpus + 1 + n_jobs) are used
            (where n_cpus is the total number of CPUs in the machine). Thus for
            n_jobs = -2, all CPUs but one are used. If (n_cpus + 1 + n_jobs)
            becomes less than 1, then no parallel computing code will be used
            (i.e., equivalent to the default).

        show_progress (boolean): flag to indicate whether task progress should
            be displayed to the user (defaults to True).

    Returns:
        An output table containing tuple pairs that satisfy the join
        condition (DataFrame).
    """

    # check if the input tokenizer is valid
    validate_tokenizer(tokenizer)

    # set return_set flag of tokenizer to be True, in case it is set to False
    revert_tokenizer_return_set_flag = False
    if not tokenizer.get_return_set():
        tokenizer.set_return_set(True)
        revert_tokenizer_return_set_flag = True

    try:
        # use overlap filter to perform the join.
        overlap_filter = OverlapFilter(tokenizer, threshold, comp_op,
                                       allow_missing)
        output_table = overlap_filter.filter_tables(ltable, rtable,
                                                    l_key_attr, r_key_attr,
                                                    l_join_attr, r_join_attr,
                                                    l_out_attrs, r_out_attrs,
                                                    l_out_prefix, r_out_prefix,
                                                    out_sim_score, n_jobs,
                                                    show_progress)
    finally:
        # revert the return_set flag of tokenizer, in case it was modified.
        # Done in a finally clause so the caller's tokenizer is not left
        # mutated when the filter construction or the join raises.
        if revert_tokenizer_return_set_flag:
            tokenizer.set_return_set(False)

    return output_table

Exemple #11

0

Afficher le fichier

Fichier : overlap_join_py.py Projet : guptaarth87/py_stringsimjoin

def overlap_join_py(ltable,
                    rtable,
                    l_key_attr,
                    r_key_attr,
                    l_join_attr,
                    r_join_attr,
                    tokenizer,
                    threshold,
                    comp_op='>=',
                    allow_missing=False,
                    l_out_attrs=None,
                    r_out_attrs=None,
                    l_out_prefix='l_',
                    r_out_prefix='r_',
                    out_sim_score=True,
                    n_jobs=1,
                    show_progress=True):
    """Join two tables using overlap measure.

    For two sets X and Y, the overlap between them is given by:

        :math:`overlap(X, Y) = |X \\cap Y|`

    Finds tuple pairs from left table and right table such that the overlap
    between the join attributes satisfies the condition on input threshold. For
    example, if the comparison operator is '>=', finds tuple pairs whose
    overlap between the strings that are the values of the join attributes is
    greater than or equal to the input threshold, as specified in "threshold".

    Args:
        ltable (DataFrame): left input table.

        rtable (DataFrame): right input table.

        l_key_attr (string): key attribute in left table.

        r_key_attr (string): key attribute in right table.

        l_join_attr (string): join attribute in left table.

        r_join_attr (string): join attribute in right table.

        tokenizer (Tokenizer): tokenizer to be used to tokenize join
            attributes. If its return_set flag is False, the flag is set to
            True for the duration of the join and restored afterwards, even
            when the join raises.

        threshold (float): overlap threshold to be satisfied.

        comp_op (string): comparison operator. Supported values are '>=', '>'
            and '=' (defaults to '>=').

        allow_missing (boolean): flag to indicate whether tuple pairs with
            missing value in at least one of the join attributes should be
            included in the output (defaults to False). If this flag is set to
            True, a tuple in ltable with missing value in the join attribute
            will be matched with every tuple in rtable and vice versa.

        l_out_attrs (list): list of attribute names from the left table to be
            included in the output table (defaults to None).

        r_out_attrs (list): list of attribute names from the right table to be
            included in the output table (defaults to None).

        l_out_prefix (string): prefix to be used for the attribute names coming
            from the left table, in the output table (defaults to 'l\\_').

        r_out_prefix (string): prefix to be used for the attribute names coming
            from the right table, in the output table (defaults to 'r\\_').

        out_sim_score (boolean): flag to indicate whether similarity score
            should be included in the output table (defaults to True). Setting
            this flag to True will add a column named '_sim_score' in the
            output table. This column will contain the similarity scores for the
            tuple pairs in the output.

        n_jobs (int): number of parallel jobs to use for the computation
            (defaults to 1). If -1 is given, all CPUs are used. If 1 is given,
            no parallel computing code is used at all, which is useful for
            debugging. For n_jobs below -1, (n_cpus + 1 + n_jobs) are used
            (where n_cpus is the total number of CPUs in the machine). Thus for
            n_jobs = -2, all CPUs but one are used. If (n_cpus + 1 + n_jobs)
            becomes less than 1, then no parallel computing code will be used
            (i.e., equivalent to the default).

        show_progress (boolean): flag to indicate whether task progress should
            be displayed to the user (defaults to True).

    Returns:
        An output table containing tuple pairs that satisfy the join
        condition (DataFrame).
    """

    # check if the input tokenizer is valid
    validate_tokenizer(tokenizer)

    # set return_set flag of tokenizer to be True, in case it is set to False
    revert_tokenizer_return_set_flag = False
    if not tokenizer.get_return_set():
        tokenizer.set_return_set(True)
        revert_tokenizer_return_set_flag = True

    try:
        # use overlap filter to perform the join.
        overlap_filter = OverlapFilter(tokenizer, threshold, comp_op,
                                       allow_missing)
        output_table = overlap_filter.filter_tables(ltable, rtable, l_key_attr,
                                                    r_key_attr, l_join_attr,
                                                    r_join_attr, l_out_attrs,
                                                    r_out_attrs, l_out_prefix,
                                                    r_out_prefix, out_sim_score,
                                                    n_jobs, show_progress)
    finally:
        # revert the return_set flag of tokenizer, in case it was modified.
        # Done in a finally clause so the caller's tokenizer is not left
        # mutated when the filter construction or the join raises.
        if revert_tokenizer_return_set_flag:
            tokenizer.set_return_set(False)

    return output_table

Exemple #12

0

Afficher le fichier

Fichier : test_overlap_filter.py Projet : anhaidgroup/py_stringsimjoin

 def test_invalid_r_out_attr(self):
     # Request 'B.invalid_attr' as a right-table output attribute.
     # NOTE(review): presumably expected to raise; the expectation (e.g. a
     # decorator) is not visible in this excerpt - confirm.
     l_out_attrs = ['A.attr']
     r_out_attrs = ['B.invalid_attr']
     OverlapFilter(self.tokenizer, self.threshold).filter_tables(
         self.A, self.B, 'A.id', 'B.id', 'A.attr', 'B.attr',
         l_out_attrs, r_out_attrs)

Exemple #13

0

Afficher le fichier

Fichier : test_overlap_filter.py Projet : anhaidgroup/py_stringsimjoin

 def test_numeric_r_filter_attr(self):
     # Filter on 'B.int_attr' as the right-table join attribute.
     # NOTE(review): presumably expected to raise for a numeric attribute;
     # the expectation is not visible in this excerpt - confirm.
     r_filter_attr = 'B.int_attr'
     OverlapFilter(self.tokenizer, self.threshold).filter_tables(
         self.A, self.B, 'A.id', 'B.id', 'A.attr', r_filter_attr)

Exemple #14

0

Afficher le fichier

Fichier : test_overlap_filter.py Projet : anhaidgroup/py_stringsimjoin

 def test_invalid_rtable(self):
     # Supply an empty list in place of the right table.
     # NOTE(review): presumably expected to raise; the expectation is not
     # visible in this excerpt - confirm.
     invalid_rtable = []
     OverlapFilter(self.tokenizer, self.threshold).filter_tables(
         self.A, invalid_rtable, 'A.id', 'B.id', 'A.attr', 'B.attr')

Exemple #15

0

Afficher le fichier

Fichier : test_apply_matcher.py Projet : anhaidgroup/py_stringsimjoin

    def test_apply_matcher_with_allow_missing(self):
        # With allow_missing enabled, the matcher output must contain the
        # pairs that satisfy the threshold plus every pair in which one of
        # the join values is missing.
        tok = QgramTokenizer(qval=2, return_set=True)
        sim_func = get_sim_function('JACCARD')
        threshold = 0.3
        comp_op = '>='

        def jaccard_of_row(row):
            l_tokens = tok.tokenize(str(row[self.l_join_attr]))
            r_tokens = tok.tokenize(str(row[self.r_join_attr]))
            return sim_func(l_tokens, r_tokens)

        # brute force: score each pair of the cartesian product.
        cartprod = self.cartprod
        cartprod['sim_score'] = cartprod.apply(jaccard_of_row, axis=1)

        # pairs whose score satisfies the threshold
        comp_fn = COMP_OP_MAP[comp_op]
        threshold_pairs = {
            ','.join((str(row[self.l_key_attr]), str(row[self.r_key_attr])))
            for _, row in cartprod.iterrows()
            if comp_fn(float(row['sim_score']), threshold)
        }

        # pairs that must be included because one of the join attribute
        # values is missing.
        missing_pairs = {
            ','.join((str(l_row[self.l_key_attr]),
                      str(r_row[self.r_key_attr])))
            for _, l_row in self.orig_ltable.iterrows()
            for _, r_row in self.orig_rtable.iterrows()
            if (pd.isnull(l_row[self.l_join_attr]) or
                pd.isnull(r_row[self.r_join_attr]))
        }

        expected_pairs = threshold_pairs.union(missing_pairs)

        # candidate set from an overlap filter with allow_missing=True.
        overlap_filter = OverlapFilter(tok, 1, comp_op, allow_missing=True)
        candset = overlap_filter.filter_tables(
            self.orig_ltable, self.orig_rtable,
            self.l_key_attr, self.r_key_attr,
            self.l_join_attr, self.r_join_attr)

        # key column names in the candidate and output tables
        l_key_col = DEFAULT_L_OUT_PREFIX + self.l_key_attr
        r_key_col = DEFAULT_R_OUT_PREFIX + self.r_key_attr

        # jaccard matcher over the candset, allow_missing=True (the
        # positional True after comp_op).
        output_candset = apply_matcher(
            candset, l_key_col, r_key_col,
            self.orig_ltable, self.orig_rtable,
            self.l_key_attr, self.r_key_attr,
            self.l_join_attr, self.r_join_attr,
            tok, sim_func, threshold,
            comp_op, True, out_sim_score=True)

        expected_output_attrs = ['_id', l_key_col, r_key_col, '_sim_score']

        # verify whether the output table has the necessary attributes.
        assert_list_equal(list(output_candset.columns.values),
                          expected_output_attrs)

        actual_pairs = {
            ','.join((str(row[l_key_col]), str(row[r_key_col])))
            for _, row in output_candset.iterrows()
        }

        # verify whether the actual pairs and the expected pairs match.
        assert_equal(len(expected_pairs), len(actual_pairs))
        common_pairs = actual_pairs.intersection(expected_pairs)
        assert_equal(len(common_pairs), len(expected_pairs))