def test_overlap_join_using_tokenizer_with_return_set_false(self):
     A = pd.DataFrame([{'id':1, 'attr':'hello'}])
     B = pd.DataFrame([{'id':1, 'attr':'he ll'}])
     qg2_tok = QgramTokenizer(2)
     assert_equal(qg2_tok.get_return_set(), False)
     c = overlap_join(A, B, 'id', 'id', 'attr', 'attr', qg2_tok, 1)
     assert_equal(len(c), 1)
     assert_equal(qg2_tok.get_return_set(), False)        
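For context, the snippet above comes from a test suite and relies on fixtures defined elsewhere. A self-contained sketch of the same call pattern, assuming the top-level imports from py_stringsimjoin and py_stringmatching:

import pandas as pd
from py_stringmatching import QgramTokenizer
from py_stringsimjoin import overlap_join

A = pd.DataFrame([{'id': 1, 'attr': 'hello'}])
B = pd.DataFrame([{'id': 1, 'attr': 'he ll'}])
qg2_tok = QgramTokenizer(2)  # return_set defaults to False
# join pairs whose 2-gram overlap on 'attr' is at least 1
c = overlap_join(A, B, 'id', 'id', 'attr', 'attr', qg2_tok, 1)
print(len(c))  # 1, matching the assertion above
# the join should not flip the tokenizer's return_set setting
print(qg2_tok.get_return_set())  # False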
 def test_overlap_join_invalid_tokenizer(self):
     overlap_join(self.A, self.B, 'A.id', 'B.id', 'A.attr', 'B.attr',
                  [], self.threshold)
 def test_overlap_join_numeric_r_join_attr(self):                            
     overlap_join(self.A, self.B, 'A.id', 'B.id', 'A.attr', 'B.int_attr',
                  self.tokenizer, self.threshold)
 def test_overlap_join_invalid_r_join_attr(self):
     overlap_join(self.A, self.B, 'A.id', 'B.id', 'A.attr', 'B.invalid_attr',
                  self.tokenizer, self.threshold)
 def test_overlap_join_invalid_rtable(self):
     overlap_join(self.A, [], 'A.id', 'B.id', 'A.attr', 'B.attr',
                  self.tokenizer, self.threshold)
 def time_overlap_delim_1(self):
     overlap_join(self.ltable, self.rtable, self.l_id_attr, self.r_id_attr,
                  self.l_join_attr, self.r_join_attr, self.delim_tok, 1)
 def test_overlap_join_invalid_l_out_attr(self):
     overlap_join(self.A, self.B, 'A.id', 'B.id', 'A.attr', 'B.attr',
                  self.tokenizer, self.threshold, '>=', True, False,
                  ['A.invalid_attr'], ['B.attr'])
 def test_overlap_join_invalid_comp_op_le(self):
     overlap_join(self.A, self.B, 'A.id', 'B.id', 'A.attr', 'B.attr',
                  self.tokenizer, self.threshold, '<=')
 def test_overlap_join_invalid_threshold_zero(self):
     overlap_join(self.A, self.B, 'A.id', 'B.id', 'A.attr', 'B.attr',
                  self.tokenizer, 0)
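As scraped, the invalid-input tests above show no assertion; in the original suite they are presumably wrapped by an exception-expecting decorator that was lost here. A pytest-flavoured sketch of the zero-threshold check (the expected AssertionError is an assumption about how py_stringsimjoin validates its inputs):

import pandas as pd
import pytest
from py_stringmatching import QgramTokenizer
from py_stringsimjoin import overlap_join

def test_overlap_join_invalid_threshold_zero():
    A = pd.DataFrame([{'id': 1, 'attr': 'hello'}])
    B = pd.DataFrame([{'id': 1, 'attr': 'he ll'}])
    # assumption: a non-positive overlap threshold is rejected during validation
    with pytest.raises(AssertionError):
        overlap_join(A, B, 'id', 'id', 'attr', 'attr', QgramTokenizer(2), 0)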
 def time_overlap_delim_1(self):
     overlap_join(self.ltable, self.rtable, 'id', 'id', 'attr', 'attr',
                  self.delim_tok, 1)
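The two timing methods above belong to a benchmark class whose setup is not shown. A hypothetical, minimal harness for them might look like this (self.ltable, self.rtable, and self.delim_tok are assumptions inferred from the attribute names used above; a WhitespaceTokenizer stands in for the delimiter tokenizer):

import pandas as pd
from py_stringmatching import WhitespaceTokenizer
from py_stringsimjoin import overlap_join

class TimeOverlapJoin:
    def setup(self):
        # tiny synthetic tables; a real benchmark would load larger datasets
        self.ltable = pd.DataFrame([{'id': 1, 'attr': 'hello world'}])
        self.rtable = pd.DataFrame([{'id': 1, 'attr': 'hello there'}])
        self.delim_tok = WhitespaceTokenizer(return_set=True)

    def time_overlap_delim_1(self):
        overlap_join(self.ltable, self.rtable, 'id', 'id', 'attr', 'attr',
                     self.delim_tok, 1)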
    def block_tables(self, ltable, rtable, l_overlap_attr, r_overlap_attr,
                     rem_stop_words=False, q_val=None, word_level=True,
                     overlap_size=1,
                     l_output_attrs=None, r_output_attrs=None,
                     l_output_prefix='ltable_', r_output_prefix='rtable_',
                     allow_missing=False, verbose=False, show_progress=True,
                     n_jobs=1):
        """
        Blocks two tables based on the overlap of token sets of attribute
        values.

        Finds tuple pairs from left and right tables such that the overlap
        between (a) the set of tokens obtained by tokenizing the value of
        attribute l_overlap_attr of a tuple from the left table, and (b) the
        set of tokens obtained by tokenizing the value of attribute
        r_overlap_attr of a tuple from the right table, is above a certain
        threshold.

        Args:
            ltable (DataFrame): The left input table.

            rtable (DataFrame): The right input table.

            l_overlap_attr (string): The overlap attribute in the left table.

            r_overlap_attr (string): The overlap attribute in the right table.

            rem_stop_words (boolean): A flag to indicate whether stop words
             (e.g., a, an, the) should be removed from the token sets of the
             overlap attribute values (defaults to False).

            q_val (int): The value of q to use if the overlap attribute
             values are to be tokenized as qgrams (defaults to None).

            word_level (boolean): A flag to indicate whether the overlap
             attributes should be tokenized as words (i.e., using whitespace
             as the delimiter) (defaults to True).

            overlap_size (int): The minimum number of tokens that must
             overlap (defaults to 1).

            l_output_attrs (list): A list of attribute names from the left
                table to be included in the output candidate set (defaults
                to None).

            r_output_attrs (list): A list of attribute names from the right
                table to be included in the output candidate set (defaults
                to None).

            l_output_prefix (string): The prefix to be used for the attribute names
                                   coming from the left table in the output
                                   candidate set (defaults to 'ltable\_').
            r_output_prefix (string): The prefix to be used for the attribute names
                                   coming from the right table in the output
                                   candidate set (defaults to 'rtable\_').
            allow_missing (boolean): A flag to indicate whether tuple pairs
                                     with missing value in at least one of the
                                     blocking attributes should be included in
                                     the output candidate set (defaults to
                                     False). If this flag is set to True, a
                                     tuple in ltable with missing value in the
                                     blocking attribute will be matched with
                                     every tuple in rtable and vice versa.

            verbose (boolean): A flag to indicate whether the debug
                information should be logged (defaults to False).

            show_progress (boolean): A flag to indicate whether progress should
                be displayed to the user (defaults to True).

            n_jobs (int): The number of parallel jobs to be used for computation
                (defaults to 1). If -1 all CPUs are used. If 0 or 1,
                no parallel computation is used at all, which is useful for
                debugging. For n_jobs below -1, (n_cpus + 1 + n_jobs) are
                used (where n_cpus is the total number of CPUs in the
                machine). Thus, for n_jobs = -2, all CPUs but one are used.
                If (n_cpus + 1 + n_jobs) is less than 1, then no parallel
                computation is used (i.e., equivalent to the default).


        Returns:
            A candidate set of tuple pairs that survived blocking (DataFrame).

        Raises:
            AssertionError: If `ltable` is not of type pandas
                DataFrame.

            AssertionError: If `rtable` is not of type pandas
                DataFrame.

            AssertionError: If `l_overlap_attr` is not of type string.

            AssertionError: If `r_overlap_attr` is not of type string.

            AssertionError: If `l_output_attrs` is not of type list.

            AssertionError: If `r_output_attrs` is not of type list.

            AssertionError: If the values in `l_output_attrs` are not of type
             string.

            AssertionError: If the values in `r_output_attrs` are not of type
             string.

            AssertionError: If `l_output_prefix` is not of type
             string.

            AssertionError: If `r_output_prefix` is not of type
             string.

            AssertionError: If `q_val` is not of type int.

            AssertionError: If `word_level` is not of type boolean.

            AssertionError: If `overlap_size` is not of type int.

            AssertionError: If `verbose` is not of type
             boolean.

            AssertionError: If `allow_missing` is not of type boolean.

            AssertionError: If `show_progress` is not of type
             boolean.

            AssertionError: If `n_jobs` is not of type
             int.

            AssertionError: If `l_overlap_attr` is not in the ltable
             columns.

            AssertionError: If `r_overlap_attr` is not in the rtable columns.

            AssertionError: If `l_output_attrs` are not in the ltable columns.

            AssertionError: If `r_output_attrs` are not in the rtable columns.

            SyntaxError: If `q_val` is set to a valid value and
                `word_level` is set to True.

            SyntaxError: If `q_val` is set to None and
                `word_level` is set to False.

        Examples:
            >>> import py_entitymatching as em
            >>> A = em.read_csv_metadata('path_to_csv_dir/table_A.csv', key='ID')
            >>> B = em.read_csv_metadata('path_to_csv_dir/table_B.csv', key='ID')
            >>> ob = em.OverlapBlocker()
            # Use word-level tokenizer
            >>> C1 = ob.block_tables(A, B, 'address', 'address', l_output_attrs=['name'], r_output_attrs=['name'], word_level=True, overlap_size=1)
            # Use q-gram tokenizer
            >>> C2 = ob.block_tables(A, B, 'address', 'address', l_output_attrs=['name'], r_output_attrs=['name'], word_level=False, q_val=2)
            # Include all possible missing values
            >>> C3 = ob.block_tables(A, B, 'address', 'address', l_output_attrs=['name'], r_output_attrs=['name'], allow_missing=True)
            # Use all the cores in the machine
            >>> C4 = ob.block_tables(A, B, 'address', 'address', l_output_attrs=['name'], r_output_attrs=['name'], n_jobs=-1)


        """

        # validate data types of standard input parameters
        self.validate_types_params_tables(ltable, rtable,
                                          l_output_attrs, r_output_attrs,
                                          l_output_prefix,
                                          r_output_prefix, verbose, n_jobs)

        # validate data types of input parameters specific to overlap blocker
        self.validate_types_other_params(l_overlap_attr, r_overlap_attr,
                                         rem_stop_words, q_val,
                                         word_level, overlap_size)

        # validate data type of allow_missing
        self.validate_allow_missing(allow_missing)

        # validate data type of show_progress
        self.validate_show_progress(show_progress)

        # validate overlap attributes
        self.validate_overlap_attrs(ltable, rtable, l_overlap_attr,
                                    r_overlap_attr)

        # validate output attributes
        self.validate_output_attrs(ltable, rtable, l_output_attrs,
                                   r_output_attrs)

        # get and validate required metadata
        log_info(logger, 'Required metadata: ltable key, rtable key', verbose)

        # # get metadata
        l_key, r_key = cm.get_keys_for_ltable_rtable(ltable, rtable, logger,
                                                     verbose)

        # # validate metadata
        cm._validate_metadata_for_table(ltable, l_key, 'ltable', logger,
                                        verbose)
        cm._validate_metadata_for_table(rtable, r_key, 'rtable', logger,
                                        verbose)

        # validate word_level and q_val
        self.validate_word_level_qval(word_level, q_val)

        # do blocking

        # # do projection before merge
        l_proj_attrs = self.get_attrs_to_project(l_key, l_overlap_attr,
                                                 l_output_attrs)
        l_df = ltable[l_proj_attrs]
        r_proj_attrs = self.get_attrs_to_project(r_key, r_overlap_attr,
                                                 r_output_attrs)
        r_df = rtable[r_proj_attrs]

        # # cast the columns to string if required.
        l_df.is_copy, r_df.is_copy = False, False  # to avoid SettingWithCopyWarning
        ssj.dataframe_column_to_str(l_df, l_overlap_attr, inplace=True)
        ssj.dataframe_column_to_str(r_df, r_overlap_attr, inplace=True)

        # # clean up the tables: remove non-ASCII characters, punctuation, and stop words
        l_dummy_overlap_attr = '@#__xx__overlap_ltable__#@'
        r_dummy_overlap_attr = '@#__xx__overlap_rtable__#@'
        l_df[l_dummy_overlap_attr] = l_df[l_overlap_attr]
        r_df[r_dummy_overlap_attr] = r_df[r_overlap_attr]

        if not l_df.empty:
            self.cleanup_table(l_df, l_dummy_overlap_attr, rem_stop_words)
        if not r_df.empty:
            self.cleanup_table(r_df, r_dummy_overlap_attr, rem_stop_words)

        # # determine which tokenizer to use
        if word_level:
            # # # create a whitespace tokenizer
            tokenizer = WhitespaceTokenizer(return_set=True)
        else:
            # # # create a qgram tokenizer
            tokenizer = QgramTokenizer(qval=q_val, return_set=True)

        # # perform overlap similarity join
        candset = overlap_join(l_df, r_df, l_key, r_key, l_dummy_overlap_attr,
                               r_dummy_overlap_attr, tokenizer, overlap_size,
                               '>=',
                               allow_missing, l_output_attrs, r_output_attrs,
                               l_output_prefix, r_output_prefix, False, n_jobs,
                               show_progress)

        # # retain only the required attributes in the output candidate set 
        retain_cols = self.get_attrs_to_retain(l_key, r_key, l_output_attrs,
                                               r_output_attrs,
                                               l_output_prefix, r_output_prefix)
        candset = candset[retain_cols]

        # update metadata in the catalog
        key = get_name_for_key(candset.columns)
        candset = add_key_column(candset, key)
        cm.set_candset_properties(candset, key, l_output_prefix + l_key,
                                  r_output_prefix + r_key, ltable, rtable)

        # return the candidate set
        return candset
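Stripped of validation and catalog bookkeeping, the word-level path of block_tables reduces to a single overlap_join call over whitespace tokens, with overlap_size as the threshold. A minimal sketch under that reading (table and column names are illustrative, mirroring the docstring examples; imports assume the top-level APIs of py_stringsimjoin and py_stringmatching):

import pandas as pd
import py_stringsimjoin as ssj
from py_stringmatching import WhitespaceTokenizer

A = pd.DataFrame([{'ID': 1, 'name': 'Alice', 'address': '12 main st'}])
B = pd.DataFrame([{'ID': 7, 'name': 'Alyce', 'address': 'main st 12'}])

tok = WhitespaceTokenizer(return_set=True)  # word_level=True -> whitespace tokens
# pairs whose address token sets share at least one token survive blocking
C = ssj.overlap_join(A, B, 'ID', 'ID', 'address', 'address', tok, 1)

The output attributes, prefixes, allow_missing flag, and n_jobs are then passed straight through to overlap_join, exactly as in the positional call inside the method body above.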
 def test_overlap_join_invalid_r_out_attr(self):
     overlap_join(self.A, self.B, 'A.id', 'B.id', 'A.attr', 'B.attr',
                  self.tokenizer, self.threshold, '>=', True, False,
                  ['A.attr'], ['B.invalid_attr'])
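For contrast with the invalid output-attribute tests, here is a call with valid output attributes, following the positional order used in the block_tables body above (tokenizer, threshold, comp_op, allow_missing, left and right output-attribute lists; column names are illustrative):

import pandas as pd
from py_stringmatching import WhitespaceTokenizer
from py_stringsimjoin import overlap_join

A = pd.DataFrame([{'id': 1, 'name': 'Alice', 'attr': 'hello world'}])
B = pd.DataFrame([{'id': 9, 'name': 'Alyce', 'attr': 'hello there'}])
tok = WhitespaceTokenizer(return_set=True)
# ..., tokenizer, threshold, comp_op, allow_missing, l_out_attrs, r_out_attrs
c = overlap_join(A, B, 'id', 'id', 'attr', 'attr', tok, 1,
                 '>=', False, ['name'], ['name'])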