Exemple #1
0
    def test_candset_with_numeric_r_filter_attr(self):
        A = pd.DataFrame([{'l_id': 1, 'l_attr': '1990'}])
        B = pd.DataFrame([{'r_id': 1, 'r_attr': 2001}])

        A['tmp_join_key'] = 1
        B['tmp_join_key'] = 1
        C = pd.merge(A[['l_id', 'tmp_join_key']],
                     B[['r_id', 'tmp_join_key']],
                     on='tmp_join_key').drop('tmp_join_key', 1)

        qg2_tok = QgramTokenizer(2, return_set=True)
        overlap_filter = OverlapFilter(qg2_tok)
        overlap_filter.filter_candset(C, 'l_id', 'r_id', A, B, 'l_id', 'r_id',
                                      'l_attr', 'r_attr')
 def test_candset_with_numeric_r_filter_attr(self):                          
     A = pd.DataFrame([{'l_id': 1, 'l_attr':'1990'}])                          
     B = pd.DataFrame([{'r_id': 1, 'r_attr':2001}])                          
                                                                             
     A['tmp_join_key'] = 1                                                   
     B['tmp_join_key'] = 1                                                   
     C = pd.merge(A[['l_id', 'tmp_join_key']],                               
                  B[['r_id', 'tmp_join_key']],                               
              on='tmp_join_key').drop('tmp_join_key', 1)                     
                                                                             
     qg2_tok = QgramTokenizer(2, return_set=True)                            
     overlap_filter = OverlapFilter(qg2_tok)                                 
     overlap_filter.filter_candset(C, 'l_id', 'r_id',                                        
                                   A, B, 'l_id', 'r_id',                     
                                   'l_attr', 'r_attr')  
Exemple #3
0
    def test_filter_candset(self, tokenizer, overlap_size, args,
                            expected_pairs):
        overlap_filter = OverlapFilter(tokenizer, overlap_size)
        actual_output_candset = overlap_filter.filter_candset(*args)

        # verify whether the output table has the necessary attributes.
        assert_list_equal(list(actual_output_candset.columns.values),
                          list(args[0].columns.values))

        actual_pairs = set()
        for idx, row in actual_output_candset.iterrows():
            actual_pairs.add(','.join((str(row[args[1]]), str(row[args[2]]))))

        # verify whether the actual pairs and the expected pairs match.
        assert_equal(len(expected_pairs), len(actual_pairs))
        common_pairs = actual_pairs.intersection(expected_pairs)
        assert_equal(len(common_pairs), len(expected_pairs))
    def test_filter_candset(self, tokenizer, overlap_size, comp_op,
                            allow_missing, args, expected_pairs):
        overlap_filter = OverlapFilter(tokenizer, overlap_size,
                                       comp_op, allow_missing)
        actual_output_candset = overlap_filter.filter_candset(*args)

        # verify whether the output table has the necessary attributes.
        assert_list_equal(list(actual_output_candset.columns.values),
                          list(args[0].columns.values))

        actual_pairs = set()
        for idx, row in actual_output_candset.iterrows():
            actual_pairs.add(','.join((str(row[args[1]]), str(row[args[2]]))))

        # verify whether the actual pairs and the expected pairs match.
        assert_equal(len(expected_pairs), len(actual_pairs))
        common_pairs = actual_pairs.intersection(expected_pairs)
        assert_equal(len(common_pairs), len(expected_pairs))
Exemple #5
0
    def block_candset(self,
                      candset,
                      l_overlap_attr,
                      r_overlap_attr,
                      rem_stop_words=False,
                      q_val=None,
                      word_level=True,
                      overlap_size=1,
                      allow_missing=False,
                      verbose=False,
                      show_progress=True,
                      n_jobs=1):
        """Blocks an input candidate set of tuple pairs based on the overlap
           of token sets of attribute values.

        Finds tuple pairs from an input candidate set of tuple pairs such that
        the overlap between (a) the set of tokens obtained by tokenizing the
        value of attribute l_overlap_attr of the left tuple in a tuple pair,
        and (b) the set of tokens obtained by tokenizing the value of
        attribute r_overlap_attr of the right tuple in the tuple pair,
        is above a certain threshold.

        Args:
            candset (DataFrame): The input candidate set of tuple pairs.

            l_overlap_attr (string): The overlap attribute in left table.

            r_overlap_attr (string): The overlap attribute in right table.

            rem_stop_words (boolean): A flag to indicate whether stop words
                                      (e.g., a, an, the) should be removed
                                      from the token sets of the overlap
                                      attribute values (defaults to False).

            q_val (int): The value of q to use if the overlap attributes values
                         are to be tokenized as qgrams (defaults to None).
 
            word_level (boolean): A flag to indicate whether the overlap
                                  attributes should be tokenized as words
                                  (i.e, using whitespace as delimiter)
                                  (defaults to True).

            overlap_size (int): The minimum number of tokens that must overlap
                                (defaults to 1).

            allow_missing (boolean): A flag to indicate whether tuple pairs
                                     with missing value in at least one of the
                                     blocking attributes should be included in
                                     the output candidate set (defaults to
                                     False). If this flag is set to True, a
                                     tuple pair with missing value in either
                                     blocking attribute will be retained in the
                                     output candidate set.

            verbose (boolean): A flag to indicate whether the debug information


                should be logged (defaults to False).

            show_progress (boolean): A flag to indicate whether progress should
                                     be displayed to the user (defaults to True).

            n_jobs (int): The number of parallel jobs to be used for computation
                (defaults to 1). If -1 all CPUs are used. If 0 or 1,
                no parallel computation is used at all, which is useful for
                debugging. For n_jobs below -1, (n_cpus + 1 + n_jobs) are
                used (where n_cpus are the total number of CPUs in the
                machine).Thus, for n_jobs = -2, all CPUs but one are used.
                If (n_cpus + 1 + n_jobs) is less than 1, then no parallel
                computation is used (i.e., equivalent to the default).

        Returns:
            A candidate set of tuple pairs that survived blocking (DataFrame).

        Raises:
            AssertionError: If `candset` is not of type pandas
                DataFrame.
            AssertionError: If `l_overlap_attr` is not of type string.
            AssertionError: If `r_overlap_attr` is not of type string.
            AssertionError: If `q_val` is not of type int.
            AssertionError: If `word_level` is not of type boolean.
            AssertionError: If `overlap_size` is not of type int.
            AssertionError: If `verbose` is not of type
                boolean.
            AssertionError: If `allow_missing` is not of type boolean.
            AssertionError: If `show_progress` is not of type
                boolean.
            AssertionError: If `n_jobs` is not of type
                int.
            AssertionError: If `l_overlap_attr` is not in the ltable
                columns.
            AssertionError: If `r_block_attr` is not in the rtable columns.
            SyntaxError: If `q_val` is set to a valid value and
                `word_level` is set to True.
            SyntaxError: If `q_val` is set to None and
                `word_level` is set to False.
        Examples:
            >>> import py_entitymatching as em
            >>> A = em.read_csv_metadata('path_to_csv_dir/table_A.csv', key='ID')
            >>> B = em.read_csv_metadata('path_to_csv_dir/table_B.csv', key='ID')
            >>> ob = em.OverlapBlocker()
            >>> C = ob.block_tables(A, B, 'address', 'address', l_output_attrs=['name'], r_output_attrs=['name'])

            >>> D1 = ob.block_candset(C, 'name', 'name', allow_missing=True)
            # Include all possible tuple pairs with missing values
            >>> D2 = ob.block_candset(C, 'name', 'name', allow_missing=True)
            # Execute blocking using multiple cores
            >>> D3 = ob.block_candset(C, 'name', 'name', n_jobs=-1)
            # Use q-gram tokenizer
            >>> D2 = ob.block_candset(C, 'name', 'name', word_level=False, q_val=2)


        """

        # validate data types of standard input parameters
        self.validate_types_params_candset(candset, verbose, show_progress,
                                           n_jobs)

        # validate data types of input parameters specific to overlap blocker
        self.validate_types_other_params(l_overlap_attr, r_overlap_attr,
                                         rem_stop_words, q_val, word_level,
                                         overlap_size)

        # get and validate metadata
        log_info(
            logger, 'Required metadata: cand.set key, fk ltable, fk rtable, '
            'ltable, rtable, ltable key, rtable key', verbose)

        # # get metadata
        key, fk_ltable, fk_rtable, ltable, rtable, l_key, r_key = cm.get_metadata_for_candset(
            candset, logger, verbose)

        # # validate metadata
        cm._validate_metadata_for_candset(candset, key, fk_ltable, fk_rtable,
                                          ltable, rtable, l_key, r_key, logger,
                                          verbose)

        # validate overlap attrs
        self.validate_overlap_attrs(ltable, rtable, l_overlap_attr,
                                    r_overlap_attr)

        # validate word_level and q_val
        self.validate_word_level_qval(word_level, q_val)

        # do blocking

        # # do projection before merge
        l_df = ltable[[l_key, l_overlap_attr]]
        r_df = rtable[[r_key, r_overlap_attr]]

        # # case the overlap attribute to string if required.
        l_df.is_copy, r_df.is_copy = False, False  # to avoid setwithcopy warning
        ssj.dataframe_column_to_str(l_df, l_overlap_attr, inplace=True)
        ssj.dataframe_column_to_str(r_df, r_overlap_attr, inplace=True)

        # # cleanup the tables from non-ascii characters, punctuations, and stop words
        self.cleanup_table(l_df, l_overlap_attr, rem_stop_words)
        self.cleanup_table(r_df, r_overlap_attr, rem_stop_words)

        # # determine which tokenizer to use
        if word_level == True:
            # # # create a whitespace tokenizer
            tokenizer = WhitespaceTokenizer(return_set=True)
        else:
            # # # create a qgram tokenizer
            tokenizer = QgramTokenizer(qval=q_val, return_set=True)

        # # create a filter for overlap similarity join
        overlap_filter = OverlapFilter(tokenizer,
                                       overlap_size,
                                       allow_missing=allow_missing)

        # # perform overlap similarity filtering of the candset
        out_table = overlap_filter.filter_candset(candset,
                                                  fk_ltable,
                                                  fk_rtable,
                                                  l_df,
                                                  r_df,
                                                  l_key,
                                                  r_key,
                                                  l_overlap_attr,
                                                  r_overlap_attr,
                                                  n_jobs,
                                                  show_progress=show_progress)
        # update catalog
        cm.set_candset_properties(out_table, key, fk_ltable, fk_rtable, ltable,
                                  rtable)

        # return candidate set
        return out_table
Exemple #6
0
 def test_rtable_r_key_attr_with_missing_value(self):
     overlap_filter = OverlapFilter(self.dlm)
     overlap_filter.filter_candset(self.C, 'l_id', 'r_id', self.A, self.B,
                                   'l_id', 'r_attr', 'l_attr', 'r_attr')
Exemple #7
0
 def test_invalid_rtable_r_filter_attr(self):
     overlap_filter = OverlapFilter(self.dlm)
     overlap_filter.filter_candset(self.C, 'l_id', 'r_id', self.A, self.B,
                                   'l_id', 'r_id', 'l_attr', 'invalid_attr')
Exemple #8
0
 def test_invalid_ltable(self):
     overlap_filter = OverlapFilter(self.dlm)
     overlap_filter.filter_candset(self.C, 'l_id', 'r_id', [], self.B,
                                   'l_id', 'r_id', 'l_attr', 'r_attr')
    def block_candset(self, candset, l_overlap_attr, r_overlap_attr,
                      rem_stop_words=False, q_val=None, word_level=True,
                      overlap_size=1, allow_missing=False,
                      verbose=False, show_progress=True, n_jobs=1):
        """Blocks an input candidate set of tuple pairs based on the overlap
           of token sets of attribute values.

        Finds tuple pairs from an input candidate set of tuple pairs such that
        the overlap between (a) the set of tokens obtained by tokenizing the
        value of attribute l_overlap_attr of the left tuple in a tuple pair,
        and (b) the set of tokens obtained by tokenizing the value of
        attribute r_overlap_attr of the right tuple in the tuple pair,
        is above a certain threshold.

        Args:
            candset (DataFrame): The input candidate set of tuple pairs.

            l_overlap_attr (string): The overlap attribute in left table.

            r_overlap_attr (string): The overlap attribute in right table.

            rem_stop_words (boolean): A flag to indicate whether stop words
                                      (e.g., a, an, the) should be removed
                                      from the token sets of the overlap
                                      attribute values (defaults to False).

            q_val (int): The value of q to use if the overlap attributes values
                         are to be tokenized as qgrams (defaults to None).
 
            word_level (boolean): A flag to indicate whether the overlap
                                  attributes should be tokenized as words
                                  (i.e, using whitespace as delimiter)
                                  (defaults to True).

            overlap_size (int): The minimum number of tokens that must overlap
                                (defaults to 1).

            allow_missing (boolean): A flag to indicate whether tuple pairs
                                     with missing value in at least one of the
                                     blocking attributes should be included in
                                     the output candidate set (defaults to
                                     False). If this flag is set to True, a
                                     tuple pair with missing value in either
                                     blocking attribute will be retained in the
                                     output candidate set.

            verbose (boolean): A flag to indicate whether the debug information

                should be logged (defaults to False).

            show_progress (boolean): A flag to indicate whether progress should
                                     be displayed to the user (defaults to True).

            n_jobs (int): The number of parallel jobs to be used for computation
                (defaults to 1). If -1 all CPUs are used. If 0 or 1,
                no parallel computation is used at all, which is useful for
                debugging. For n_jobs below -1, (n_cpus + 1 + n_jobs) are
                used (where n_cpus are the total number of CPUs in the
                machine).Thus, for n_jobs = -2, all CPUs but one are used.
                If (n_cpus + 1 + n_jobs) is less than 1, then no parallel
                computation is used (i.e., equivalent to the default).

        Returns:
            A candidate set of tuple pairs that survived blocking (DataFrame).

        Raises:
            AssertionError: If `candset` is not of type pandas
                DataFrame.
            AssertionError: If `l_overlap_attr` is not of type string.
            AssertionError: If `r_overlap_attr` is not of type string.
            AssertionError: If `q_val` is not of type int.
            AssertionError: If `word_level` is not of type boolean.
            AssertionError: If `overlap_size` is not of type int.
            AssertionError: If `verbose` is not of type
                boolean.
            AssertionError: If `allow_missing` is not of type boolean.
            AssertionError: If `show_progress` is not of type
                boolean.
            AssertionError: If `n_jobs` is not of type
                int.
            AssertionError: If `l_overlap_attr` is not in the ltable
                columns.
            AssertionError: If `r_block_attr` is not in the rtable columns.
            SyntaxError: If `q_val` is set to a valid value and
                `word_level` is set to True.
            SyntaxError: If `q_val` is set to None and
                `word_level` is set to False.
        Examples:
            >>> import py_entitymatching as em
            >>> A = em.read_csv_metadata('path_to_csv_dir/table_A.csv', key='ID')
            >>> B = em.read_csv_metadata('path_to_csv_dir/table_B.csv', key='ID')
            >>> ob = em.OverlapBlocker()
            >>> C = ob.block_tables(A, B, 'address', 'address', l_output_attrs=['name'], r_output_attrs=['name'])

            >>> D1 = ob.block_candset(C, 'name', 'name', allow_missing=True)
            # Include all possible tuple pairs with missing values
            >>> D2 = ob.block_candset(C, 'name', 'name', allow_missing=True)
            # Execute blocking using multiple cores
            >>> D3 = ob.block_candset(C, 'name', 'name', n_jobs=-1)
            # Use q-gram tokenizer
            >>> D2 = ob.block_candset(C, 'name', 'name', word_level=False, q_val=2)


        """

        # validate data types of standard input parameters
        self.validate_types_params_candset(candset, verbose, show_progress,
                                           n_jobs)

        # validate data types of input parameters specific to overlap blocker
        self.validate_types_other_params(l_overlap_attr, r_overlap_attr,
                                         rem_stop_words, q_val,
                                         word_level, overlap_size)

        # get and validate metadata
        log_info(logger,
                 'Required metadata: cand.set key, fk ltable, fk rtable, '
                 'ltable, rtable, ltable key, rtable key', verbose)

        # # get metadata
        key, fk_ltable, fk_rtable, ltable, rtable, l_key, r_key = cm.get_metadata_for_candset(
            candset, logger, verbose)

        # # validate metadata
        cm._validate_metadata_for_candset(candset, key, fk_ltable, fk_rtable,
                                          ltable, rtable, l_key, r_key,
                                          logger, verbose)

        # validate overlap attrs
        self.validate_overlap_attrs(ltable, rtable, l_overlap_attr,
                                    r_overlap_attr)

        # validate word_level and q_val
        self.validate_word_level_qval(word_level, q_val)

        # do blocking

        # # do projection before merge
        l_df = ltable[[l_key, l_overlap_attr]]
        r_df = rtable[[r_key, r_overlap_attr]]

        # # case the overlap attribute to string if required.
        l_df.is_copy, r_df.is_copy = False, False  # to avoid setwithcopy warning
        ssj.dataframe_column_to_str(l_df, l_overlap_attr, inplace=True)
        ssj.dataframe_column_to_str(r_df, r_overlap_attr, inplace=True)

        # # cleanup the tables from non-ascii characters, punctuations, and stop words
        self.cleanup_table(l_df, l_overlap_attr, rem_stop_words)
        self.cleanup_table(r_df, r_overlap_attr, rem_stop_words)

        # # determine which tokenizer to use
        if word_level == True:
            # # # create a whitespace tokenizer
            tokenizer = WhitespaceTokenizer(return_set=True)
        else:
            # # # create a qgram tokenizer
            tokenizer = QgramTokenizer(qval=q_val, return_set=True)

        # # create a filter for overlap similarity join
        overlap_filter = OverlapFilter(tokenizer, overlap_size,
                                       allow_missing=allow_missing)

        # # perform overlap similarity filtering of the candset
        out_table = overlap_filter.filter_candset(candset, fk_ltable, fk_rtable,
                                                  l_df, r_df, l_key, r_key,
                                                  l_overlap_attr,
                                                  r_overlap_attr,
                                                  n_jobs,
                                                  show_progress=show_progress)
        # update catalog
        cm.set_candset_properties(out_table, key, fk_ltable, fk_rtable, ltable,
                                  rtable)

        # return candidate set
        return out_table
 def test_rtable_r_key_attr_with_missing_value(self):
     overlap_filter = OverlapFilter(self.dlm)
     overlap_filter.filter_candset(self.C, 'l_id', 'r_id',
                                   self.A, self.B,
                                  'l_id', 'r_attr',
                                  'l_attr', 'r_attr')
 def test_invalid_rtable_r_filter_attr(self):
     overlap_filter = OverlapFilter(self.dlm)
     overlap_filter.filter_candset(self.C, 'l_id', 'r_id',
                                   self.A, self.B,
                                  'l_id', 'r_id',
                                  'l_attr', 'invalid_attr')
 def test_invalid_ltable(self):
     overlap_filter = OverlapFilter(self.dlm)
     overlap_filter.filter_candset(self.C, 'l_id', 'r_id',
                                   [], self.B,
                                  'l_id', 'r_id',
                                  'l_attr', 'r_attr')