def block_candset_excluding_rule(self, c_df, l_df, r_df, l_key, r_key,
                                     fk_ltable, fk_rtable, rule_to_exclude,
                                     show_progress, n_chunks):


        # # list to keep track of valid ids
        valid = []

        apply_rules_excluding_rule_pkl = cp.dumps(self.apply_rules_excluding_rule)

        if n_chunks == 1:
            # single process
            valid = _block_candset_excluding_rule_split(c_df, l_df, r_df,
                                                        l_key, r_key,
                                                        fk_ltable, fk_rtable,
                                                        rule_to_exclude,
                                                        apply_rules_excluding_rule_pkl,
                                                        show_progress)
        else:
            # multiprocessing
            c_splits = pd.np.array_split(c_df, n_chunks)

            valid_splits = []

            for i in range(len(c_splits)):
                partial_result = delayed(_block_candset_excluding_rule_split)(c_splits[i],
                                                             l_df, r_df,
                                                             l_key, r_key,
                                                             fk_ltable,
                                                             fk_rtable,
                                                             rule_to_exclude,
                                                             apply_rules_excluding_rule_pkl, False)
                                                            # use Progressbar from
                                                            # Dask.diagnostics so set the
                                                            #show_progress to False

                valid_splits.append(partial_result)

            valid_splits = delayed(wrap)(valid_splits)
            if show_progress:
                with ProgressBar():
                    valid_splits = valid_splits.compute(scheduler="processes",
                                                num_workers=get_num_cores())
            else:
                valid_splits = valid_splits.compute(scheduler="processes",
                                                    num_workers=get_num_cores())

            valid = sum(valid_splits, [])

        # construct output candset
        if len(c_df) > 0:
            candset = c_df[valid]
        else:
            candset = pd.DataFrame(columns=c_df.columns)

        # return candidate set
        return candset
Beispiel #2
0
    def block_candset_excluding_rule(self, c_df, l_df, r_df, l_key, r_key,
                                     fk_ltable, fk_rtable, rule_to_exclude,
                                     show_progress, n_chunks):

        # # list to keep track of valid ids
        valid = []

        apply_rules_excluding_rule_pkl = cp.dumps(
            self.apply_rules_excluding_rule)

        if n_chunks == 1:
            # single process
            valid = _block_candset_excluding_rule_split(
                c_df, l_df, r_df, l_key, r_key, fk_ltable, fk_rtable,
                rule_to_exclude, apply_rules_excluding_rule_pkl, show_progress)
        else:
            # multiprocessing
            c_splits = np.array_split(c_df, n_chunks)

            valid_splits = []

            for i in range(len(c_splits)):
                partial_result = delayed(_block_candset_excluding_rule_split)(
                    c_splits[i], l_df, r_df, l_key, r_key, fk_ltable,
                    fk_rtable, rule_to_exclude, apply_rules_excluding_rule_pkl,
                    False)
                # use Progressbar from
                # Dask.diagnostics so set the
                #show_progress to False

                valid_splits.append(partial_result)

            valid_splits = delayed(wrap)(valid_splits)
            if show_progress:
                with ProgressBar():
                    valid_splits = valid_splits.compute(
                        scheduler="processes", num_workers=get_num_cores())
            else:
                valid_splits = valid_splits.compute(
                    scheduler="processes", num_workers=get_num_cores())

            valid = sum(valid_splits, [])

        # construct output candset
        if len(c_df) > 0:
            candset = c_df[valid]
        else:
            candset = pd.DataFrame(columns=c_df.columns)

        # return candidate set
        return candset
    def block_tables_without_filters(self, l_df, r_df, l_key, r_key,
                                     l_output_attrs, r_output_attrs,
                                     l_output_prefix, r_output_prefix,
                                     verbose, show_progress, n_ltable_chunks,
                                     n_rtable_chunks):

        # do blocking

        # # determine the number of processes to launch parallely


        candset = None

        apply_rules_pkl = cp.dumps(self.apply_rules)

        if n_ltable_chunks == 1 and n_rtable_chunks == 1:
            # single process
            candset = _block_tables_split(l_df, r_df, l_key, r_key,
                                          l_output_attrs, r_output_attrs,
                                          l_output_prefix, r_output_prefix,
                                          apply_rules_pkl, show_progress)
        else:
            # multiprocessing
            # m, n = self.get_split_params(n_procs, len(l_df), len(r_df))
            l_splits = pd.np.array_split(l_df, n_ltable_chunks)
            r_splits = pd.np.array_split(r_df, n_rtable_chunks)

            c_splits = []
            for i in range(len(l_splits)):
                for j in range(len(r_splits)):
                    partial_result = delayed(_block_tables_split)(l_splits[i], r_splits[j],
                                             l_key, r_key,
                                             l_output_attrs, r_output_attrs,
                                             l_output_prefix, r_output_prefix,
                                             apply_rules_pkl, False) # we will use
                    # Dask.diagnostics to display the progress bar so set
                    # show_progress to False
                    c_splits.append(partial_result)

            c_splits = delayed(wrap)(c_splits)
            if show_progress:
                with ProgressBar():
                    c_splits = c_splits.compute(scheduler="processes", num_workers = get_num_cores())
            else:
                c_splits = c_splits.compute(scheduler="processes", num_workers=get_num_cores())
            candset = pd.concat(c_splits, ignore_index=True)

        # return candidate set
        return candset
Beispiel #4
0
    def block_tables_without_filters(self, l_df, r_df, l_key, r_key,
                                     l_output_attrs, r_output_attrs,
                                     l_output_prefix, r_output_prefix, verbose,
                                     show_progress, n_ltable_chunks,
                                     n_rtable_chunks):

        # do blocking

        # # determine the number of processes to launch parallely

        candset = None

        apply_rules_pkl = cp.dumps(self.apply_rules)

        if n_ltable_chunks == 1 and n_rtable_chunks == 1:
            # single process
            candset = _block_tables_split(l_df, r_df, l_key, r_key,
                                          l_output_attrs, r_output_attrs,
                                          l_output_prefix, r_output_prefix,
                                          apply_rules_pkl, show_progress)
        else:
            # multiprocessing
            # m, n = self.get_split_params(n_procs, len(l_df), len(r_df))
            l_splits = np.array_split(l_df, n_ltable_chunks)
            r_splits = np.array_split(r_df, n_rtable_chunks)

            c_splits = []
            for i in range(len(l_splits)):
                for j in range(len(r_splits)):
                    partial_result = delayed(_block_tables_split)(
                        l_splits[i], r_splits[j], l_key, r_key, l_output_attrs,
                        r_output_attrs, l_output_prefix, r_output_prefix,
                        apply_rules_pkl, False)  # we will use
                    # Dask.diagnostics to display the progress bar so set
                    # show_progress to False
                    c_splits.append(partial_result)

            c_splits = delayed(wrap)(c_splits)
            if show_progress:
                with ProgressBar():
                    c_splits = c_splits.compute(scheduler="processes",
                                                num_workers=get_num_cores())
            else:
                c_splits = c_splits.compute(scheduler="processes",
                                            num_workers=get_num_cores())
            candset = pd.concat(c_splits, ignore_index=True)

        # return candidate set
        return candset
Beispiel #5
0
    def block_tables(self,
                     ltable,
                     rtable,
                     l_output_attrs=None,
                     r_output_attrs=None,
                     l_output_prefix='ltable_',
                     r_output_prefix='rtable_',
                     verbose=False,
                     show_progress=True,
                     n_ltable_chunks=1,
                     n_rtable_chunks=1):
        """
        WARNING THIS COMMAND IS EXPERIMENTAL AND NOT TESTED. USE AT YOUR OWN RISK

        Blocks two tables based on the sequence of rules supplied by the user.
        Finds tuple pairs from left and right tables that survive the sequence
        of blocking rules. A tuple pair survives the sequence of blocking rules
        if none of the rules in the sequence returns True for that pair. If any
        of the rules returns True, then the pair is blocked.
        
        Args:
            
            ltable (DataFrame): The left input table.
            
            rtable (DataFrame): The right input table.
            
            l_output_attrs (list): A list of attribute names from the left
                                   table to be included in the
                                   output candidate set (defaults to None).
            
            r_output_attrs (list): A list of attribute names from the right
                                   table to be included in the
                                   output candidate set (defaults to None).
            
            l_output_prefix (string): The prefix to be used for the attribute names
                                   coming from the left table in the output
                                   candidate set (defaults to 'ltable\_').
            
            r_output_prefix (string): The prefix to be used for the attribute names
                                   coming from the right table in the output
                                   candidate set (defaults to 'rtable\_').
            
            verbose (boolean): A flag to indicate whether the debug
                information  should be logged (defaults to False).
                
            show_progress (boolean): A flag to indicate whether progress should
                                     be displayed to the user (defaults to True).
                                     
            n_ltable_chunks (int): The number of partitions to split the left table (
                                    defaults to 1). If it is set to -1, then the number of 
                                    partitions is set to the number of cores in the 
                                    machine.                                      
            n_rtable_chunks (int): The number of partitions to split the right table (
                                    defaults to 1). If it is set to -1, then the number of 
                                    partitions is set to the number of cores in the 
                                    machine.            
                                     

        Returns:
            
            A candidate set of tuple pairs that survived the sequence of
            blocking rules (DataFrame).

        Raises:
            
            AssertionError: If `ltable` is not of type pandas
                DataFrame.
            
            AssertionError: If `rtable` is not of type pandas
                DataFrame.
            AssertionError: If `l_output_attrs` is not of type of
                list.
            AssertionError: If `r_output_attrs` is not of type of
                list.
            AssertionError: If the values in `l_output_attrs` is not of type
                string.
            AssertionError: If the values in `r_output_attrs` is not of type
                string.
            AssertionError: If the input `l_output_prefix` is not of type
                string.
            AssertionError: If the input `r_output_prefix` is not of type
                string.
            AssertionError: If `verbose` is not of type
                boolean.
            AssertionError: If `show_progress` is not of type
                boolean.
            AssertionError: If `n_ltable_chunks` is not of type
                int.
            AssertionError: If `n_rtable_chunks` is not of type
                int.
            AssertionError: If `l_out_attrs` are not in the ltable.
            AssertionError: If `r_out_attrs` are not in the rtable.
            AssertionError: If there are no rules to apply.
        Examples:
                >>> import py_entitymatching as em
                >>> from py_entitymatching.dask.dask_rule_based_blocker import DaskRuleBasedBlocker
                >>> rb = DaskRuleBasedBlocker()
                >>> A = em.read_csv_metadata('path_to_csv_dir/table_A.csv', key='id')
                >>> B = em.read_csv_metadata('path_to_csv_dir/table_B.csv', key='id')
                >>> block_f = em.get_features_for_blocking(A, B)
                >>> rule = ['name_name_lev(ltuple, rtuple) > 3']
                >>> rb.add_rule(rule, feature_table=block_f)
                >>> C = rb.block_tables(A, B)
        """

        logger.warning(
            "WARNING THIS COMMAND IS EXPERIMENTAL AND NOT TESTED. USE AT YOUR OWN RISK."
        )

        # validate data types of input parameters
        self.validate_types_params_tables(ltable, rtable, l_output_attrs,
                                          r_output_attrs, l_output_prefix,
                                          r_output_prefix, verbose, 1)

        # validate data type of show_progress
        self.validate_show_progress(show_progress)

        # validate input parameters
        self.validate_output_attrs(ltable, rtable, l_output_attrs,
                                   r_output_attrs)

        # get and validate metadata
        log_info(logger, 'Required metadata: ltable key, rtable key', verbose)

        # # get metadata
        l_key, r_key = cm.get_keys_for_ltable_rtable(ltable, rtable, logger,
                                                     verbose)

        # # validate metadata
        cm._validate_metadata_for_table(ltable, l_key, 'ltable', logger,
                                        verbose)
        cm._validate_metadata_for_table(rtable, r_key, 'rtable', logger,
                                        verbose)

        # validate rules
        assert len(self.rules.keys()) > 0, 'There are no rules to apply'

        # validate number of ltable and rtable chunks
        validate_object_type(n_ltable_chunks, int, 'Parameter n_ltable_chunks')
        validate_object_type(n_rtable_chunks, int, 'Parameter n_rtable_chunks')

        validate_chunks(n_ltable_chunks)
        validate_chunks(n_rtable_chunks)

        # # determine the number of chunks
        n_ltable_chunks = get_num_partitions(n_ltable_chunks, len(ltable))
        n_rtable_chunks = get_num_partitions(n_rtable_chunks, len(rtable))

        # # set index for convenience
        l_df = ltable.set_index(l_key, drop=False)
        r_df = rtable.set_index(r_key, drop=False)

        # # remove l_key from l_output_attrs and r_key from r_output_attrs
        l_output_attrs_1 = []
        if l_output_attrs:
            l_output_attrs_1 = [x for x in l_output_attrs if x != l_key]
        r_output_attrs_1 = []
        if r_output_attrs:
            r_output_attrs_1 = [x for x in r_output_attrs if x != r_key]

        # # get attributes to project
        l_proj_attrs, r_proj_attrs = self.get_attrs_to_project(
            l_key, r_key, l_output_attrs_1, r_output_attrs_1)
        l_df, r_df = l_df[l_proj_attrs], r_df[r_proj_attrs]

        candset, rule_applied = self.block_tables_with_filters(
            l_df, r_df, l_key, r_key, l_output_attrs_1, r_output_attrs_1,
            l_output_prefix, r_output_prefix, verbose, show_progress,
            get_num_cores())
        # pass number of splits as
        #  the number of cores in the machine

        if candset is None:
            # no filterable rule was applied
            candset = self.block_tables_without_filters(
                l_df, r_df, l_key, r_key, l_output_attrs_1, r_output_attrs_1,
                l_output_prefix, r_output_prefix, verbose, show_progress,
                n_ltable_chunks, n_rtable_chunks)
        elif len(self.rules) > 1:
            # one filterable rule was applied but other rules are left
            # block candset by applying other rules and excluding the applied rule
            candset = self.block_candset_excluding_rule(
                candset, l_df, r_df, l_key, r_key, l_output_prefix + l_key,
                r_output_prefix + r_key, rule_applied, show_progress,
                get_num_cores())

        retain_cols = self.get_attrs_to_retain(l_key, r_key, l_output_attrs_1,
                                               r_output_attrs_1,
                                               l_output_prefix,
                                               r_output_prefix)
        if len(candset) > 0:
            candset = candset[retain_cols]
        else:
            candset = pd.DataFrame(columns=retain_cols)

        # update catalog
        key = get_name_for_key(candset.columns)
        candset = add_key_column(candset, key)
        cm.set_candset_properties(candset, key, l_output_prefix + l_key,
                                  r_output_prefix + r_key, ltable, rtable)

        # return candidate set
        return candset
    def block_tables(self,
                     ltable,
                     rtable,
                     l_output_attrs=None,
                     r_output_attrs=None,
                     l_output_prefix='ltable_',
                     r_output_prefix='rtable_',
                     verbose=False,
                     show_progress=True,
                     n_ltable_chunks=1,
                     n_rtable_chunks=1):
        """
        WARNING THIS COMMAND IS EXPERIMENTAL AND NOT TESTED. USE AT YOUR OWN RISK.        

        Blocks two tables based on a black box blocking function specified
        by the user.
        Finds tuple pairs from left and right tables that survive the black
        box function. A tuple pair survives the black box blocking function if
        the function returns False for that pair, otherwise the tuple pair is
        dropped.
        
        Args:
            ltable (DataFrame): The left input table.
            rtable (DataFrame): The right input table.
            l_output_attrs (list): A list of attribute names from the left
                                   table to be included in the
                                   output candidate set (defaults to None).
            r_output_attrs (list): A list of attribute names from the right
                                   table to be included in the
                                   output candidate set (defaults to None).
            l_output_prefix (string): The prefix to be used for the attribute names
                                   coming from the left table in the output
                                   candidate set (defaults to 'ltable\_').
            r_output_prefix (string): The prefix to be used for the attribute names
                                   coming from the right table in the output
                                   candidate set (defaults to 'rtable\_').
            verbose (boolean): A flag to indicate whether the debug
             information should be logged (defaults to False).
            show_progress (boolean): A flag to indicate whether progress should
                                     be displayed to the user (defaults to True).
                                     
            n_ltable_chunks (int): The number of partitions to split the left table (
                                    defaults to 1). If it is set to -1, then the number of 
                                    partitions is set to the number of cores in the 
                                    machine.                                      
            n_rtable_chunks (int): The number of partitions to split the right table (
                                    defaults to 1). If it is set to -1, then the number of 
                                    partitions is set to the number of cores in the 
                                    machine.            
                                     
        Returns:

            A candidate set of tuple pairs that survived blocking (DataFrame).

        Raises:
            AssertionError: If `ltable` is not of type pandas
                DataFrame.
            AssertionError: If `rtable` is not of type pandas
                DataFrame.
            AssertionError: If `l_output_attrs` is not of type of
                list.
            AssertionError: If `r_output_attrs` is not of type of
                list.
            AssertionError: If values in `l_output_attrs` is not of type
                string.
            AssertionError: If values in `r_output_attrs` is not of type
                string.
            AssertionError: If `l_output_prefix` is not of type
                string.
            AssertionError: If `r_output_prefix` is not of type
                string.
            AssertionError: If `verbose` is not of type
                boolean.
            AssertionError: If `show_progress` is not of type boolean.
            AssertionError: If `n_ltable_chunks` is not of type
                int.
            AssertionError: If `n_rtable_chunks` is not of type
                int.
            AssertionError: If `l_out_attrs` are not in the ltable.
            AssertionError: If `r_out_attrs` are not in the rtable.
        Examples:
            >>> def match_last_name(ltuple, rtuple):
                # assume that there is a 'name' attribute in the input tables
                # and each value in it has two words
                l_last_name = ltuple['name'].split()[1]
                r_last_name = rtuple['name'].split()[1]
                if l_last_name != r_last_name:
                    return True
                else:
                    return False
            >>> import py_entitymatching as em
            >>> from py_entitymatching.dask.dask_black_box_blocker DaskBlackBoxBlocker
            >>> bb = DaskBlackBoxBlocker()
            >>> bb.set_black_box_function(match_last_name)
            >>> C = bb.block_tables(A, B, l_output_attrs=['name'], r_output_attrs=['name'] )
        """

        logger.warning(
            "WARNING THIS COMMAND IS EXPERIMENTAL AND NOT TESTED. USE AT YOUR OWN RISK."
        )

        # validate data types of standard input parameters
        self.validate_types_params_tables(ltable, rtable, l_output_attrs,
                                          r_output_attrs, l_output_prefix,
                                          r_output_prefix, verbose, 1)

        # validate data type of show_progress
        self.validate_show_progress(show_progress)

        # validate black box function
        assert self.black_box_function != None, 'Black box function is not set'

        # validate output attributes
        self.validate_output_attrs(ltable, rtable, l_output_attrs,
                                   r_output_attrs)

        # get and validate metadata
        log_info(logger, 'Required metadata: ltable key, rtable key', verbose)

        # # get metadata
        l_key, r_key = cm.get_keys_for_ltable_rtable(ltable, rtable, logger,
                                                     verbose)

        # # validate metadata
        cm._validate_metadata_for_table(ltable, l_key, 'ltable', logger,
                                        verbose)
        cm._validate_metadata_for_table(rtable, r_key, 'rtable', logger,
                                        verbose)

        # validate number of ltable and rtable chunks
        validate_object_type(n_ltable_chunks, int, 'Parameter n_ltable_chunks')
        validate_object_type(n_rtable_chunks, int, 'Parameter n_rtable_chunks')

        validate_chunks(n_ltable_chunks)
        validate_chunks(n_rtable_chunks)

        # # determine the number of chunks
        n_ltable_chunks = get_num_partitions(n_ltable_chunks, len(ltable))
        n_rtable_chunks = get_num_partitions(n_rtable_chunks, len(rtable))

        # do blocking

        # # set index for convenience
        l_df = ltable.set_index(l_key, drop=False)
        r_df = rtable.set_index(r_key, drop=False)

        # # remove l_key from l_output_attrs and r_key from r_output_attrs
        l_output_attrs_1 = []
        if l_output_attrs:
            l_output_attrs_1 = [x for x in l_output_attrs if x != l_key]
        r_output_attrs_1 = []
        if r_output_attrs:
            r_output_attrs_1 = [x for x in r_output_attrs if x != r_key]

        # # pickle the black-box function before passing it as an arg to
        # # _block_tables_split to be executed by each child process
        black_box_function_pkl = cp.dumps(self.black_box_function)

        if n_ltable_chunks == 1 and n_rtable_chunks == 1:
            # single process
            candset = _block_tables_split(l_df, r_df, l_key, r_key,
                                          l_output_attrs_1, r_output_attrs_1,
                                          l_output_prefix, r_output_prefix,
                                          black_box_function_pkl,
                                          show_progress)
        else:
            # multiprocessing
            l_splits = pd.np.array_split(l_df, n_ltable_chunks)
            r_splits = pd.np.array_split(r_df, n_rtable_chunks)

            c_splits = []
            for i in range(len(l_splits)):
                for j in range(len(r_splits)):
                    partial_result = delayed(_block_tables_split)(
                        l_splits[i], r_splits[j], l_key, r_key,
                        l_output_attrs_1, r_output_attrs_1, l_output_prefix,
                        r_output_prefix, black_box_function_pkl, False)
                    c_splits.append(partial_result)
            c_splits = delayed(wrap)(c_splits)
            if show_progress:
                with ProgressBar():
                    c_splits = c_splits.compute(scheduler="processes",
                                                num_workers=get_num_cores())
            else:
                c_splits = c_splits.compute(scheduler="processes",
                                            num_workers=get_num_cores())

            candset = pd.concat(c_splits, ignore_index=True)

        # # determine the attributes to retain in the output candidate set
        retain_cols = self.get_attrs_to_retain(l_key, r_key, l_output_attrs,
                                               r_output_attrs, l_output_prefix,
                                               r_output_prefix)
        if len(candset) > 0:
            candset = candset[retain_cols]
        else:
            candset = pd.DataFrame(columns=retain_cols)

        # update catalog
        key = get_name_for_key(candset.columns)
        candset = add_key_column(candset, key)
        cm.set_candset_properties(candset, key, l_output_prefix + l_key,
                                  r_output_prefix + r_key, ltable, rtable)

        # return candidate set
        return candset
    def block_candset(self,
                      candset,
                      verbose=True,
                      show_progress=True,
                      n_chunks=1):
        """
        WARNING THIS COMMAND IS EXPERIMENTAL AND NOT TESTED. USE AT YOUR OWN RISK.


        Blocks an input candidate set of tuple pairs based on a black box
        blocking function specified by the user.

        Finds tuple pairs from an input candidate set of tuple pairs that
        survive the black box function. A tuple pair survives the black box
        blocking function if the function returns False for that pair,
        otherwise the tuple pair is dropped.

        Args:
            candset (DataFrame): The input candidate set of tuple pairs.

            verbose (boolean): A flag to indicate whether logging should be done
                               (defaults to False).

            show_progress (boolean): A flag to indicate whether progress should
                                     be displayed to the user (defaults to True).

            n_chunks (int): The number of partitions to split the candidate set. If it 
                            is set to -1, the number of partitions will be set to the 
                            number of cores in the machine.  

        Returns:
            A candidate set of tuple pairs that survived blocking (DataFrame).

        Raises:
            AssertionError: If `candset` is not of type pandas
                DataFrame.
            AssertionError: If `verbose` is not of type
                boolean.
            AssertionError: If `n_chunks` is not of type
                int.
            AssertionError: If `show_progress` is not of type boolean.
            AssertionError: If `l_block_attr` is not in the ltable columns.
            AssertionError: If `r_block_attr` is not in the rtable columns.

        Examples:
            >>> def match_last_name(ltuple, rtuple):
                # assume that there is a 'name' attribute in the input tables
                # and each value in it has two words
                l_last_name = ltuple['name'].split()[1]
                r_last_name = rtuple['name'].split()[1]
                if l_last_name != r_last_name:
                    return True
                else:
                    return False
            >>> import py_entitymatching as em
            >>> from py_entitymatching.dask.dask_black_box_blocker import DaskBlackBoxBlocker
            >>> bb = DaskBlackBoxBlocker()
            >>> bb.set_black_box_function(match_last_name)
            >>> D = bb.block_candset(C) # C is an output from block_tables
        """

        logger.warning(
            "WARNING THIS COMMAND IS EXPERIMENTAL AND NOT TESTED. USE AT YOUR OWN RISK."
        )

        # validate data types of standard input parameters
        self.validate_types_params_candset(candset, verbose, show_progress,
                                           n_chunks)

        # validate black box functionn
        assert self.black_box_function != None, 'Black box function is not set'

        # get and validate metadata
        log_info(
            logger, 'Required metadata: cand.set key, fk ltable, fk rtable, '
            'ltable, rtable, ltable key, rtable key', verbose)

        # # get metadata
        key, fk_ltable, fk_rtable, ltable, rtable, l_key, r_key = cm.get_metadata_for_candset(
            candset, logger, verbose)

        # # validate metadata
        cm._validate_metadata_for_candset(candset, key, fk_ltable, fk_rtable,
                                          ltable, rtable, l_key, r_key, logger,
                                          verbose)

        validate_object_type(n_chunks, int, 'Parameter n_chunks')
        validate_chunks(n_chunks)
        # do blocking

        # # set index for convenience
        l_df = ltable.set_index(l_key, drop=False)
        r_df = rtable.set_index(r_key, drop=False)

        # # project candset to keep only the ID attributes
        c_df = candset[[key, fk_ltable, fk_rtable]]

        # # determine the number of processes to launch parallely
        n_chunks = get_num_partitions(n_chunks, len(candset))

        # # pickle the black-box function before passing it as an arg to
        # # _block_candset_split to be executed by each child process
        black_box_function_pkl = cp.dumps(self.black_box_function)

        valid = []
        if n_chunks == 1:
            # single process
            valid = _block_candset_split(c_df, l_df, r_df, l_key, r_key,
                                         fk_ltable, fk_rtable,
                                         black_box_function_pkl, show_progress)
        else:
            # multiprocessing
            c_splits = pd.np.array_split(c_df, n_chunks)

            valid_splits = []
            for i in range(len(c_splits)):
                partial_result = delayed(_block_candset_split)(
                    c_splits[i], l_df, r_df, l_key, r_key, fk_ltable,
                    fk_rtable, black_box_function_pkl, False)
                valid_splits.append(partial_result)

            valid_splits = delayed(wrap)(valid_splits)
            if show_progress:
                with ProgressBar():
                    valid_splits = valid_splits.compute(
                        scheduler="processes", num_workers=get_num_cores())
            else:
                valid_splits = valid_splits.compute(
                    scheduler="processes", num_workers=get_num_cores())

            valid = sum(valid_splits, [])

        # construct output table
        if len(c_df) > 0:
            c_df = candset[valid]
        else:
            c_df = pd.DataFrame(columns=candset.columns)

        # update catalog
        cm.set_candset_properties(c_df, key, fk_ltable, fk_rtable, ltable,
                                  rtable)

        # return candidate set
        return c_df
    def block_candset(self, candset, l_overlap_attr, r_overlap_attr,
                      rem_stop_words=False, q_val=None, word_level=True,
                      overlap_size=1, allow_missing=False,
                      verbose=False, show_progress=True, n_chunks=-1):

        """
        WARNING THIS COMMAND IS EXPERIMENTAL AND NOT TESTED. USE AT YOUR OWN RISK.
        
        Blocks an input candidate set of tuple pairs based on the overlap
        of token sets of attribute values. Finds tuple pairs from an input 
        candidate set of tuple pairs such that
        the overlap between (a) the set of tokens obtained by tokenizing the
        value of attribute l_overlap_attr of the left tuple in a tuple pair,
        and (b) the set of tokens obtained by tokenizing the value of
        attribute r_overlap_attr of the right tuple in the tuple pair,
        is above a certain threshold.

        Args:
            candset (DataFrame): The input candidate set of tuple pairs.

            l_overlap_attr (string): The overlap attribute in left table.

            r_overlap_attr (string): The overlap attribute in right table.

            rem_stop_words (boolean): A flag to indicate whether stop words
                                      (e.g., a, an, the) should be removed
                                      from the token sets of the overlap
                                      attribute values (defaults to False).

            q_val (int): The value of q to use if the overlap attributes values
                         are to be tokenized as qgrams (defaults to None).

            word_level (boolean): A flag to indicate whether the overlap
                                  attributes should be tokenized as words
                                  (i.e, using whitespace as delimiter)
                                  (defaults to True).

            overlap_size (int): The minimum number of tokens that must overlap
                                (defaults to 1).

            allow_missing (boolean): A flag to indicate whether tuple pairs
                                     with missing value in at least one of the
                                     blocking attributes should be included in
                                     the output candidate set (defaults to
                                     False). If this flag is set to True, a
                                     tuple pair with missing value in either
                                     blocking attribute will be retained in the
                                     output candidate set.

            verbose (boolean): A flag to indicate whether the debug information

                should be logged (defaults to False).

            show_progress (boolean): A flag to indicate whether progress should
                                     be displayed to the user (defaults to True).

            n_chunks (int): The number of partitions to split the candidate set. If it 
                            is set to -1, the number of partitions will be set to the 
                            number of cores in the machine.  

        Returns:
            A candidate set of tuple pairs that survived blocking (DataFrame).

        Raises:
            AssertionError: If `candset` is not of type pandas
                DataFrame.
            AssertionError: If `l_overlap_attr` is not of type string.
            AssertionError: If `r_overlap_attr` is not of type string.
            AssertionError: If `q_val` is not of type int.
            AssertionError: If `word_level` is not of type boolean.
            AssertionError: If `overlap_size` is not of type int.
            AssertionError: If `verbose` is not of type
                boolean.
            AssertionError: If `allow_missing` is not of type boolean.
            AssertionError: If `show_progress` is not of type
                boolean.
            AssertionError: If `n_chunks` is not of type
                int.
            AssertionError: If `l_overlap_attr` is not in the ltable
                columns.
            AssertionError: If `r_block_attr` is not in the rtable columns.
            SyntaxError: If `q_val` is set to a valid value and
                `word_level` is set to True.
            SyntaxError: If `q_val` is set to None and
                `word_level` is set to False.
        Examples:
            >>> import py_entitymatching as em
            >>> from py_entitymatching.dask.dask_overlap_blocker import DaskOverlapBlocker
            >>> A = em.read_csv_metadata('path_to_csv_dir/table_A.csv', key='ID')
            >>> B = em.read_csv_metadata('path_to_csv_dir/table_B.csv', key='ID')
            >>> ob = DaskOverlapBlocker()
            >>> C = ob.block_tables(A, B, 'address', 'address', l_output_attrs=['name'], r_output_attrs=['name'])

            >>> D1 = ob.block_candset(C, 'name', 'name', allow_missing=True)
            # Include all possible tuple pairs with missing values
            >>> D2 = ob.block_candset(C, 'name', 'name', allow_missing=True)
            # Execute blocking using multiple cores
            >>> D3 = ob.block_candset(C, 'name', 'name', n_chunks=-1)
            # Use q-gram tokenizer
            >>> D2 = ob.block_candset(C, 'name', 'name', word_level=False, q_val=2)


        """
        logger.warning(
            "WARNING THIS BLOCKER IS EXPERIMENTAL AND NOT TESTED. USE AT YOUR OWN "
            "RISK.")

        # Validate input parameters
        self.validate_types_params_candset(candset, verbose, show_progress, n_chunks)
        self.validate_types_other_params(l_overlap_attr, r_overlap_attr,
                                         rem_stop_words, q_val, word_level, overlap_size)

        # get and validate metadata
        log_info(logger,
                 'Required metadata: cand.set key, fk ltable, fk rtable, '
                 'ltable, rtable, ltable key, rtable key', verbose)

        # # get metadata
        key, fk_ltable, fk_rtable, ltable, rtable, l_key, r_key = cm.get_metadata_for_candset(
            candset, logger, verbose)

        # # validate metadata
        cm._validate_metadata_for_candset(candset, key, fk_ltable, fk_rtable,
                                          ltable, rtable, l_key, r_key,
                                          logger, verbose)

        # validate overlap attrs
        self.validate_overlap_attrs(ltable, rtable, l_overlap_attr,
                                    r_overlap_attr)

        # validate word_level and q_val
        self.validate_word_level_qval(word_level, q_val)

        # validate number of chunks
        validate_object_type(n_chunks, int, 'Parameter n_chunks')
        validate_chunks(n_chunks)


        # # do projection before merge
        l_df = ltable[[l_key, l_overlap_attr]]
        r_df = rtable[[r_key, r_overlap_attr]]

        # # set index for convenience
        l_df = l_df.set_index(l_key, drop=False)
        r_df = r_df.set_index(r_key, drop=False)

        # # case the overlap attribute to string if required.
        l_df.is_copy, r_df.is_copy = False, False  # to avoid setwithcopy warning
        ssj.dataframe_column_to_str(l_df, l_overlap_attr, inplace=True)
        ssj.dataframe_column_to_str(r_df, r_overlap_attr, inplace=True)

        if word_level == True:
            tokenizer = WhitespaceTokenizer(return_set=True)
        else:
            tokenizer = QgramTokenizer(return_set=True)


        n_chunks = get_num_partitions(n_chunks, len(candset))
        c_splits = pd.np.array_split(candset, n_chunks)
        valid_splits = []

        # Create DAG
        for i in range(n_chunks):
            result = delayed(self._block_candset_split)(c_splits[i], l_df, r_df, l_key,
                                                       r_key, l_overlap_attr,
                                                       r_overlap_attr, fk_ltable,
                                                       fk_rtable, allow_missing,
                                                       rem_stop_words, tokenizer, overlap_size)
            valid_splits.append(result)
        valid_splits = delayed(wrap)(valid_splits)

        # Execute the DAG
        if show_progress:
            with ProgressBar():
                valid_splits = valid_splits.compute(scheduler="processes",
                                                    num_workers=get_num_cores())
        else:
            valid_splits = valid_splits.compute(scheduler="processes",
                                                num_workers=get_num_cores())

        valid = sum(valid_splits, [])

        # construct output table
        if len(candset) > 0:
            out_table = candset[valid]
        else:
            out_table = pd.DataFrame(columns=candset.columns)

        # update the catalog
        cm.set_candset_properties(out_table, key, fk_ltable, fk_rtable,
                                  ltable, rtable)

        # return the output table
        return out_table
    def block_candset(self, candset, l_block_attr, r_block_attr,
                      allow_missing=False, verbose=False, show_progress=True,
                      n_chunks=1):
        """

        WARNING THIS COMMAND IS EXPERIMENTAL AND NOT TESTED. USE AT YOUR OWN RISK.

        Blocks an input candidate set of tuple pairs based on attribute equivalence.
        Finds tuple pairs from an input candidate set of tuple pairs
        such that the value of attribute l_block_attr of the left tuple in a
        tuple pair exactly matches the value of attribute r_block_attr of the 
        right tuple in the tuple pair.
        
        Args:
            candset (DataFrame): The input candidate set of tuple pairs.
            l_block_attr (string): The blocking attribute in left table.
            r_block_attr (string): The blocking attribute in right table.
            allow_missing (boolean): A flag to indicate whether tuple pairs
                                     with missing value in at least one of the
                                     blocking attributes should be included in
                                     the output candidate set (defaults to
                                     False). If this flag is set to True, a
                                     tuple pair with missing value in either
                                     blocking attribute will be retained in the
                                     output candidate set.
            verbose (boolean): A flag to indicate whether the debug
                              information should be logged (defaults to False).
            show_progress (boolean): A flag to indicate whether progress should
                                     be displayed to the user (defaults to True).
            n_chunks (int): The number of partitions to split the candidate set. If it 
                            is set to -1, the number of partitions will be set to the 
                            number of cores in the machine.  
       
        Returns:
            A candidate set of tuple pairs that survived blocking (DataFrame).
       
        Raises:
            AssertionError: If `candset` is not of type pandas
                DataFrame.
            AssertionError: If `l_block_attr` is not of type string.
            AssertionError: If `r_block_attr` is not of type string.
            AssertionError: If `verbose` is not of type
                boolean.
            AssertionError: If `n_chunks` is not of type
                int.
            AssertionError: If `l_block_attr` is not in the ltable columns.
            AssertionError: If `r_block_attr` is not in the rtable columns.
        Examples:
            >>> import py_entitymatching as em
            >>> from py_entitymatching.dask.dask_attr_equiv_blocker import DaskAttrEquivalenceBlocker            
            >>> ab = DaskAttrEquivalenceBlocker()            
            >>> A = em.read_csv_metadata('path_to_csv_dir/table_A.csv', key='ID')
            >>> B = em.read_csv_metadata('path_to_csv_dir/table_B.csv', key='ID')
            >>> C = ab.block_tables(A, B, 'zipcode', 'zipcode', l_output_attrs=['name'], r_output_attrs=['name'])
            >>> D1 = ab.block_candset(C, 'age', 'age', allow_missing=True)
            # Include all possible tuple pairs with missing values
            >>> D2 = ab.block_candset(C, 'age', 'age', allow_missing=True)
            # Execute blocking using multiple cores
            >>> D3 = ab.block_candset(C, 'age', 'age', n_chunks=-1)
        """
        logger.warning("WARNING THIS COMMAND IS EXPERIMENTAL AND NOT TESTED. USE AT YOUR OWN "
              "RISK.")

        # validate data types of input parameters
        self.validate_types_params_candset(candset, verbose, show_progress,
                                           n_chunks)

        # validate data types of input blocking attributes
        self.validate_types_block_attrs(l_block_attr, r_block_attr)

        # get and validate metadata
        log_info(logger, 'Required metadata: cand.set key, fk ltable, '
                         'fk rtable, ltable, rtable, ltable key, rtable key',
                 verbose)

        # # get metadata
        key, fk_ltable, fk_rtable, ltable, rtable, l_key, r_key = cm.get_metadata_for_candset(
            candset, logger, verbose)

        # # validate metadata
        cm._validate_metadata_for_candset(candset, key, fk_ltable, fk_rtable,
                                          ltable, rtable, l_key, r_key,
                                          logger, verbose)

        # validate input parameters
        self.validate_block_attrs(ltable, rtable, l_block_attr, r_block_attr)

        # validate n_chunks parameter
        validate_object_type(n_chunks, int, 'Parameter n_chunks')
        validate_chunks(n_chunks)

        # do blocking

        # # do projection before merge
        l_df = ltable[[l_key, l_block_attr]]
        r_df = rtable[[r_key, r_block_attr]]

        # # set index for convenience
        l_df = l_df.set_index(l_key, drop=False)
        r_df = r_df.set_index(r_key, drop=False)

        # # determine number of processes to launch parallely
        n_chunks = get_num_partitions(n_chunks, len(candset))

        valid = []
        if n_chunks == 1:
            # single process
            valid = _block_candset_split(candset, l_df, r_df, l_key, r_key,
                                         l_block_attr, r_block_attr, fk_ltable,
                                         fk_rtable, allow_missing, show_progress)
        else:
            c_splits = np.array_split(candset, n_chunks)

            valid_splits = []
            for i in range(len(c_splits)):
                partial_result = delayed(_block_candset_split)(c_splits[i],
                                                               l_df, r_df,
                                                               l_key, r_key,
                                                               l_block_attr, r_block_attr,
                                                               fk_ltable, fk_rtable,
                                                               allow_missing,
                                                               False)  # setting show
                # progress to False as we will use Dask diagnostics to display progress
                #  bar
                valid_splits.append(partial_result)

            valid_splits = delayed(wrap)(valid_splits)
            if show_progress:
                with ProgressBar():
                    valid_splits = valid_splits.compute(scheduler="processes",
                                                        num_workers=get_num_cores())
            else:
                valid_splits = valid_splits.compute(scheduler="processes",
                                                    num_workers=get_num_cores())

            valid = sum(valid_splits, [])

        # construct output table
        if len(candset) > 0:
            out_table = candset[valid]
        else:
            out_table = pd.DataFrame(columns=candset.columns)

        # update the catalog
        cm.set_candset_properties(out_table, key, fk_ltable, fk_rtable,
                                  ltable, rtable)

        # return the output table
        return out_table
def dask_down_sample(ltable, rtable, size, y_param, show_progress=True, verbose=False,
                seed=None, rem_stop_words=True, rem_puncs=True, n_ltable_chunks=1,
                n_sample_rtable_chunks=1):


    """
        WARNING THIS COMMAND IS EXPERIMENTAL AND NOT TESTED. USE AT YOUR OWN RISK.
         
        This command down samples two tables A and B into smaller tables A' and
        B' respectively.    
        Specifically, first it randomly selects `size` tuples
        from the table B to be table B'. Next, it builds an inverted index I
        (token, tuple_id) on table A. For each tuple x ∈ B', the algorithm
        finds a set P of k/2 tuples from I that match x,
        and a set Q of k/2 tuples randomly selected from A - P.
        The idea is for A' and B' to share some matches yet be
        as representative of A and B as possible.
    
        Args:
            ltable (DataFrame): The left input table, i.e., table A.
            rtable (DataFrame): The right input table, i.e., table B. 
            size (int): The size that table B should be down sampled to.
            y_param (int): The parameter to control the down sample size of table A.
                Specifically, the down sampled size of table A should be close to
                size * y_param.
            show_progress (boolean): A flag to indicate whether a progress bar
                should be displayed (defaults to True).
            verbose (boolean): A flag to indicate whether the debug information
             should be displayed (defaults to False).
            seed (int): The seed for the pseudo random number generator to select
                the tuples from A and B (defaults to None).
            rem_stop_words (boolean): A flag to indicate whether a default set of stop words 
             must be removed.
            rem_puncs (boolean): A flag to indicate whether the punctuations must be 
             removed from the strings.             
            n_ltable_chunks (int): The number of partitions for ltable (defaults to 1). If it 
              is set to -1, the number of partitions will be set to the 
              number of cores in the machine.  
            n_sample_rtable_chunks (int): The number of partitions for the 
              sampled rtable (defaults to 1)
                
    
        Returns:
            Down sampled tables A and B as pandas DataFrames.
    
        Raises:
            AssertionError: If any of the input tables (`table_a`, `table_b`) are
                empty or not a DataFrame.
            AssertionError: If `size` or `y_param` is empty or 0 or not a
                valid integer value.
            AssertionError: If `seed` is not a valid integer
                value.
            AssertionError: If `verbose` is not of type bool.
            AssertionError: If `show_progress` is not of type bool.
            AssertionError: If `n_ltable_chunks` is not of type int.
            AssertionError: If `n_sample_rtable_chunks` is not of type int.            
    
        Examples:
            >>> from py_entitymatching.dask.dask_down_sample import dask_down_sample
            >>> A = em.read_csv_metadata('path_to_csv_dir/table_A.csv', key='ID')
            >>> B = em.read_csv_metadata('path_to_csv_dir/table_B.csv', key='ID')
            >>> sample_A, sample_B = dask_down_sample(A, B, 500, 1, n_ltable_chunks=-1, n_sample_rtable_chunks=-1)
            # Example with seed = 0. This means the same sample data set will be returned
            # each time this function is run.
            >>> A = em.read_csv_metadata('path_to_csv_dir/table_A.csv', key='ID')
            >>> B = em.read_csv_metadata('path_to_csv_dir/table_B.csv', key='ID')
            >>> sample_A, sample_B = dask_down_sample(A, B, 500, 1, seed=0, n_ltable_chunks=-1, n_sample_rtable_chunks=-1)
            
        """

    logger.warning(
        "WARNING THIS COMMAND IS EXPERIMENTAL AND NOT TESTED. USE AT YOUR OWN "
        "RISK.")

    # validation checks
    if not isinstance(ltable, pd.DataFrame):
        logger.error('Input table A (ltable) is not of type pandas DataFrame')
        raise AssertionError(
            'Input table A (ltable) is not of type pandas DataFrame')

    if not isinstance(rtable, pd.DataFrame):
        logger.error('Input table B (rtable) is not of type pandas DataFrame')

        raise AssertionError(
            'Input table B (rtable) is not of type pandas DataFrame')

    if len(ltable) == 0 or len(rtable) == 0:
        logger.error('Size of the input table is 0')
        raise AssertionError('Size of the input table is 0')

    if size == 0 or y_param == 0:
        logger.error(
            'size or y cannot be zero (3rd and 4th parameter of downsample)')
        raise AssertionError(
            'size or y_param cannot be zero (3rd and 4th parameter of downsample)')

    if seed is not None and not isinstance(seed, int):
        logger.error('Seed is not of type integer')
        raise AssertionError('Seed is not of type integer')

    if len(rtable) < size:
        logger.warning(
            'Size of table B is less than b_size parameter - using entire table B')

    validate_object_type(verbose, bool, 'Parameter verbose')
    validate_object_type(show_progress, bool, 'Parameter show_progress')
    validate_object_type(rem_stop_words, bool, 'Parameter rem_stop_words')
    validate_object_type(rem_puncs, bool, 'Parameter rem_puncs')
    validate_object_type(n_ltable_chunks, int, 'Parameter n_ltable_chunks')
    validate_object_type(n_sample_rtable_chunks, int, 'Parameter n_sample_rtable_chunks')


    rtable_sampled = sample_right_table(rtable, size, seed)

    ltbl_str_cols = _get_str_cols_list(ltable)
    proj_ltable = ltable[ltable.columns[ltbl_str_cols]]


    if n_ltable_chunks == -1:
        n_ltable_chunks = get_num_cores()

    ltable_chunks = pd.np.array_split(proj_ltable, n_ltable_chunks)
    preprocessed_tokenized_tbl = []

    # Use Dask to preprocess and tokenize strings.
    start_row_id = 0
    for i in range(len(ltable_chunks)):
        # start_row_id is internally used by process_tokenize_concat strings to map
        # each to its row id in the ltable.
        result = delayed(process_tokenize_concat_strings)(ltable_chunks[i],
                                                             start_row_id,
                                                             rem_puncs, rem_stop_words)
        preprocessed_tokenized_tbl.append(result)

        # update start_row_id
        start_row_id += len(ltable_chunks[i])

    preprocessed_tokenized_tbl = delayed(wrap)(preprocessed_tokenized_tbl)

    # Now execute the DAG
    if show_progress:
        with ProgressBar():
            logger.info('Preprocessing/tokenizing ltable')
            preprocessed_tokenized_tbl_vals = preprocessed_tokenized_tbl.compute(
                scheduler="processes", num_workers=get_num_cores())
    else:
        preprocessed_tokenized_tbl_vals = preprocessed_tokenized_tbl.compute(
            scheduler="processes", num_workers=get_num_cores())

    ltable_processed_dict = {}
    for i in range(len(preprocessed_tokenized_tbl_vals)):
        ltable_processed_dict.update(preprocessed_tokenized_tbl_vals[i])

    # Build an inverted index
    inverted_index = build_inverted_index(ltable_processed_dict)


    # Preprocess/tokenize sampled rtable and probe
    rtbl_str_cols = _get_str_cols_list(rtable_sampled)
    proj_rtable_sampled = rtable_sampled[rtable_sampled.columns[rtbl_str_cols]]


    if n_sample_rtable_chunks == -1:
        n_sample_rtable_chunks = get_num_cores()

    rtable_chunks = pd.np.array_split(proj_rtable_sampled, n_sample_rtable_chunks)
    probe_result = []

    # Create the DAG
    for i in range(len(rtable_chunks)):
        result = delayed(probe)(rtable_chunks[i], y_param, len(proj_ltable),
                                              inverted_index, rem_puncs,
                                rem_stop_words, seed)
        probe_result.append(result)

    probe_result = delayed(wrap)(probe_result)

    # Execute the DAG
    if show_progress:
        with ProgressBar():
            logger.info('Probing using rtable')
            probe_result = probe_result.compute(scheduler="processes",
                                                num_workers=multiprocessing.cpu_count())
    else:
        probe_result = probe_result.compute(scheduler="processes",
                                            num_workers=multiprocessing.cpu_count())

    probe_result = map(list, probe_result)
    l_tbl_indices = set(sum(probe_result, []))

    l_tbl_indices = list(l_tbl_indices)
    ltable_sampled = ltable.iloc[l_tbl_indices]



    # update catalog
    if cm.is_dfinfo_present(ltable):
        cm.copy_properties(ltable, ltable_sampled)

    if cm.is_dfinfo_present(rtable):
        cm.copy_properties(rtable, rtable_sampled)

    return ltable_sampled, rtable_sampled
    def block_tables(self, ltable, rtable, l_block_attr, r_block_attr,
                     l_output_attrs=None, r_output_attrs=None,
                     l_output_prefix='ltable_', r_output_prefix='rtable_',
                     allow_missing=False, verbose=False, n_ltable_chunks=1,
                     n_rtable_chunks=1):
        """
        WARNING THIS COMMAND IS EXPERIMENTAL AND NOT TESTED. USE AT YOUR OWN RISK

        Blocks two tables based on attribute equivalence.
        Conceptually, this will check `l_block_attr=r_block_attr` for each tuple
        pair from the Cartesian product of tables `ltable` and `rtable`. It outputs a
        Pandas dataframe object with tuple pairs that satisfy the equality condition.
        The dataframe will include attributes '_id', key attribute from
        ltable, key attributes from rtable, followed by lists `l_output_attrs` and
        `r_output_attrs` if they are specified. Each of these output and key attributes will be
        prefixed with given `l_output_prefix` and `r_output_prefix`. If `allow_missing` is set
        to `True` then all tuple pairs with missing value in at least one of the tuples will be
        included in the output dataframe.
        Further, this will update the following metadata in the catalog for the output table:
        (1) key, (2) ltable, (3) rtable, (4) fk_ltable, and (5) fk_rtable.
      
        Args:
            ltable (DataFrame): The left input table.
            rtable (DataFrame): The right input table.
            l_block_attr (string): The blocking attribute in left table.
            r_block_attr (string): The blocking attribute in right table.
            l_output_attrs (list): A list of attribute names from the left
                                   table to be included in the
                                   output candidate set (defaults to None).
            r_output_attrs (list): A list of attribute names from the right
                                   table to be included in the
                                   output candidate set (defaults to None).
            l_output_prefix (string): The prefix to be used for the attribute names
                                   coming from the left table in the output
                                   candidate set (defaults to 'ltable\_').
            r_output_prefix (string): The prefix to be used for the attribute names
                                   coming from the right table in the output
                                   candidate set (defaults to 'rtable\_').
            allow_missing (boolean): A flag to indicate whether tuple pairs
                                     with missing value in at least one of the
                                     blocking attributes should be included in
                                     the output candidate set (defaults to
                                     False). If this flag is set to True, a
                                     tuple in ltable with missing value in the
                                     blocking attribute will be matched with
                                     every tuple in rtable and vice versa.
            verbose (boolean): A flag to indicate whether the debug information
                              should be logged (defaults to False).
            
            n_ltable_chunks (int): The number of partitions to split the left table (
                                    defaults to 1). If it is set to -1, then the number of 
                                    partitions is set to the number of cores in the 
                                    machine.                                      
            n_rtable_chunks (int): The number of partitions to split the right table (
                                    defaults to 1). If it is set to -1, then the number of 
                                    partitions is set to the number of cores in the 
                                    machine.            
        Returns:
            A candidate set of tuple pairs that survived blocking (DataFrame).
            
        Raises:
            AssertionError: If `ltable` is not of type pandas
                DataFrame.
            AssertionError: If `rtable` is not of type pandas
                DataFrame.
            AssertionError: If `l_block_attr` is not of type string.
            AssertionError: If `r_block_attr` is not of type string.
            AssertionError: If `l_output_attrs` is not of type of
                list.
            AssertionError: If `r_output_attrs` is not of type of
                list.
            AssertionError: If the values in `l_output_attrs` is not of type
                string.
            AssertionError: If the values in `r_output_attrs` is not of type
                string.
            AssertionError: If `l_output_prefix` is not of type
                string.
            AssertionError: If `r_output_prefix` is not of type
                string.
            AssertionError: If `verbose` is not of type
                boolean.
            AssertionError: If `allow_missing` is not of type boolean.
            AssertionError: If `n_ltable_chunks` is not of type
                int.
            AssertionError: If `n_rtable_chunks` is not of type
                int.
            AssertionError: If `l_block_attr` is not in the ltable columns.
            AssertionError: If `r_block_attr` is not in the rtable columns.
            AssertionError: If `l_out_attrs` are not in the ltable.
            AssertionError: If `r_out_attrs` are not in the rtable.
       
        Examples:
            >>> import py_entitymatching as em
            >>> from py_entitymatching.dask.dask_attr_equiv_blocker import DaskAttrEquivalenceBlocker            
            >>> ab = DaskAttrEquivalenceBlocker()
            >>> A = em.read_csv_metadata('path_to_csv_dir/table_A.csv', key='ID')
            >>> B = em.read_csv_metadata('path_to_csv_dir/table_B.csv', key='ID')
            >>> C1 = ab.block_tables(A, B, 'zipcode', 'zipcode', l_output_attrs=['name'], r_output_attrs=['name'])
            # Include all possible tuple pairs with missing values
            >>> C2 = ab.block_tables(A, B, 'zipcode', 'zipcode', l_output_attrs=['name'], r_output_attrs=['name'], allow_missing=True)
        """

        logger.warning("WARNING THIS BLOCKER IS EXPERIMENTAL AND NOT TESTED. USE AT YOUR "
                    "OWN RISK.")


        # validate data types of input parameters
        self.validate_types_params_tables(ltable, rtable,
                                          l_output_attrs, r_output_attrs,
                                          l_output_prefix,
                                          r_output_prefix, verbose, 1) # last arg is
                                         # set to 1 just to reuse the function from the
                                         # old blocker.

        # validate data types of input blocking attributes
        self.validate_types_block_attrs(l_block_attr, r_block_attr)

        # validate data type of allow_missing
        self.validate_allow_missing(allow_missing)

        # validate input parameters
        self.validate_block_attrs(ltable, rtable, l_block_attr, r_block_attr)
        self.validate_output_attrs(ltable, rtable, l_output_attrs,
                                   r_output_attrs)

        # validate number of ltable and rtable chunks
        validate_object_type(n_ltable_chunks, int, 'Parameter n_ltable_chunks')
        validate_object_type(n_rtable_chunks, int, 'Parameter n_rtable_chunks')

        validate_chunks(n_ltable_chunks)
        validate_chunks(n_rtable_chunks)

        # get and validate required metadata
        log_info(logger, 'Required metadata: ltable key, rtable key', verbose)

        # # get metadata
        l_key, r_key = cm.get_keys_for_ltable_rtable(ltable, rtable, logger,
                                                     verbose)

        # # validate metadata
        cm._validate_metadata_for_table(ltable, l_key, 'ltable', logger,
                                        verbose)
        cm._validate_metadata_for_table(rtable, r_key, 'rtable', logger,
                                        verbose)

        # do blocking

        # # do projection of required attributes from the tables
        l_proj_attrs = self.get_attrs_to_project(l_key, l_block_attr,
                                                 l_output_attrs)
        ltable_proj = ltable[l_proj_attrs]
        r_proj_attrs = self.get_attrs_to_project(r_key, r_block_attr,
                                                 r_output_attrs)
        rtable_proj = rtable[r_proj_attrs]

        # # remove records with nans in the blocking attribute
        l_df = rem_nan(ltable_proj, l_block_attr)
        r_df = rem_nan(rtable_proj, r_block_attr)

        # # determine the number of chunks
        n_ltable_chunks = get_num_partitions(n_ltable_chunks, len(ltable))
        n_rtable_chunks = get_num_partitions(n_rtable_chunks, len(rtable))

        if n_ltable_chunks == 1 and n_rtable_chunks == 1:
            # single process
            candset = _block_tables_split(l_df, r_df, l_key, r_key,
                                          l_block_attr, r_block_attr,
                                          l_output_attrs, r_output_attrs,
                                          l_output_prefix, r_output_prefix,
                                          allow_missing)
        else:
            l_splits = pd.np.array_split(l_df, n_ltable_chunks)
            r_splits = pd.np.array_split(r_df, n_rtable_chunks)
            c_splits = []

            for l in l_splits:
                for r in r_splits:
                    partial_result = delayed(_block_tables_split)(l, r, l_key, r_key,
                                             l_block_attr, r_block_attr,
                                             l_output_attrs, r_output_attrs,
                                             l_output_prefix, r_output_prefix,
                                             allow_missing)
                    c_splits.append(partial_result)
            c_splits = delayed(wrap)(c_splits)
            c_splits = c_splits.compute(scheduler="processes", n_jobs=get_num_cores())
            candset = pd.concat(c_splits, ignore_index=True)

        # if allow_missing flag is True, then compute
        # all pairs with missing value in left table, and
        # all pairs with missing value in right table
        if allow_missing:
            missing_pairs = self.get_pairs_with_missing_value(ltable_proj,
                                                              rtable_proj,
                                                              l_key, r_key,
                                                              l_block_attr,
                                                              r_block_attr,
                                                              l_output_attrs,
                                                              r_output_attrs,
                                                              l_output_prefix,
                                                              r_output_prefix)
            candset = pd.concat([candset, missing_pairs], ignore_index=True)

        # update catalog
        key = get_name_for_key(candset.columns)
        candset = add_key_column(candset, key)
        cm.set_candset_properties(candset, key, l_output_prefix + l_key,
                                  r_output_prefix + r_key, ltable, rtable)

        # return candidate set
        return candset
    def block_candset(self, candset, l_block_attr, r_block_attr,
                      allow_missing=False, verbose=False, show_progress=True,
                      n_chunks=1):
        """

        WARNING THIS COMMAND IS EXPERIMENTAL AND NOT TESTED. USE AT YOUR OWN RISK.

        Blocks an input candidate set of tuple pairs based on attribute equivalence.
        Finds tuple pairs from an input candidate set of tuple pairs
        such that the value of attribute l_block_attr of the left tuple in a
        tuple pair exactly matches the value of attribute r_block_attr of the 
        right tuple in the tuple pair.
        
        Args:
            candset (DataFrame): The input candidate set of tuple pairs.
            l_block_attr (string): The blocking attribute in left table.
            r_block_attr (string): The blocking attribute in right table.
            allow_missing (boolean): A flag to indicate whether tuple pairs
                                     with missing value in at least one of the
                                     blocking attributes should be included in
                                     the output candidate set (defaults to
                                     False). If this flag is set to True, a
                                     tuple pair with missing value in either
                                     blocking attribute will be retained in the
                                     output candidate set.
            verbose (boolean): A flag to indicate whether the debug
                              information should be logged (defaults to False).
            show_progress (boolean): A flag to indicate whether progress should
                                     be displayed to the user (defaults to True).
            n_chunks (int): The number of partitions to split the candidate set. If it 
                            is set to -1, the number of partitions will be set to the 
                            number of cores in the machine.  
       
        Returns:
            A candidate set of tuple pairs that survived blocking (DataFrame).
       
        Raises:
            AssertionError: If `candset` is not of type pandas
                DataFrame.
            AssertionError: If `l_block_attr` is not of type string.
            AssertionError: If `r_block_attr` is not of type string.
            AssertionError: If `verbose` is not of type
                boolean.
            AssertionError: If `n_chunks` is not of type
                int.
            AssertionError: If `l_block_attr` is not in the ltable columns.
            AssertionError: If `r_block_attr` is not in the rtable columns.
        Examples:
            >>> import py_entitymatching as em
            >>> from py_entitymatching.dask.dask_attr_equiv_blocker import DaskAttrEquivalenceBlocker            
            >>> ab = DaskAttrEquivalenceBlocker()            
            >>> A = em.read_csv_metadata('path_to_csv_dir/table_A.csv', key='ID')
            >>> B = em.read_csv_metadata('path_to_csv_dir/table_B.csv', key='ID')
            >>> C = ab.block_tables(A, B, 'zipcode', 'zipcode', l_output_attrs=['name'], r_output_attrs=['name'])
            >>> D1 = ab.block_candset(C, 'age', 'age', allow_missing=True)
            # Include all possible tuple pairs with missing values
            >>> D2 = ab.block_candset(C, 'age', 'age', allow_missing=True)
            # Execute blocking using multiple cores
            >>> D3 = ab.block_candset(C, 'age', 'age', n_chunks=-1)
        """
        logger.warning("WARNING THIS COMMAND IS EXPERIMENTAL AND NOT TESTED. USE AT YOUR OWN "
              "RISK.")

        # validate data types of input parameters
        self.validate_types_params_candset(candset, verbose, show_progress,
                                           n_chunks)

        # validate data types of input blocking attributes
        self.validate_types_block_attrs(l_block_attr, r_block_attr)

        # get and validate metadata
        log_info(logger, 'Required metadata: cand.set key, fk ltable, '
                         'fk rtable, ltable, rtable, ltable key, rtable key',
                 verbose)

        # # get metadata
        key, fk_ltable, fk_rtable, ltable, rtable, l_key, r_key = cm.get_metadata_for_candset(
            candset, logger, verbose)

        # # validate metadata
        cm._validate_metadata_for_candset(candset, key, fk_ltable, fk_rtable,
                                          ltable, rtable, l_key, r_key,
                                          logger, verbose)

        # validate input parameters
        self.validate_block_attrs(ltable, rtable, l_block_attr, r_block_attr)

        # validate n_chunks parameter
        validate_object_type(n_chunks, int, 'Parameter n_chunks')
        validate_chunks(n_chunks)

        # do blocking

        # # do projection before merge
        l_df = ltable[[l_key, l_block_attr]]
        r_df = rtable[[r_key, r_block_attr]]

        # # set index for convenience
        l_df = l_df.set_index(l_key, drop=False)
        r_df = r_df.set_index(r_key, drop=False)

        # # determine number of processes to launch parallely
        n_chunks = get_num_partitions(n_chunks, len(candset))

        valid = []
        if n_chunks == 1:
            # single process
            valid = _block_candset_split(candset, l_df, r_df, l_key, r_key,
                                         l_block_attr, r_block_attr, fk_ltable,
                                         fk_rtable, allow_missing, show_progress)
        else:
            c_splits = pd.np.array_split(candset, n_chunks)

            valid_splits = []
            for i in range(len(c_splits)):
                partial_result = delayed(_block_candset_split)(c_splits[i],
                                                               l_df, r_df,
                                                               l_key, r_key,
                                                               l_block_attr, r_block_attr,
                                                               fk_ltable, fk_rtable,
                                                               allow_missing,
                                                               False)  # setting show
                # progress to False as we will use Dask diagnostics to display progress
                #  bar
                valid_splits.append(partial_result)

            valid_splits = delayed(wrap)(valid_splits)
            if show_progress:
                with ProgressBar():
                    valid_splits = valid_splits.compute(scheduler="processes",
                                                        num_workers=get_num_cores())
            else:
                valid_splits = valid_splits.compute(scheduler="processes",
                                                    num_workers=get_num_cores())

            valid = sum(valid_splits, [])

        # construct output table
        if len(candset) > 0:
            out_table = candset[valid]
        else:
            out_table = pd.DataFrame(columns=candset.columns)

        # update the catalog
        cm.set_candset_properties(out_table, key, fk_ltable, fk_rtable,
                                  ltable, rtable)

        # return the output table
        return out_table
    def block_candset(self, candset, l_overlap_attr, r_overlap_attr,
                      rem_stop_words=False, q_val=None, word_level=True,
                      overlap_size=1, allow_missing=False,
                      verbose=False, show_progress=True, n_chunks=-1):

        """
        WARNING THIS COMMAND IS EXPERIMENTAL AND NOT TESTED. USE AT YOUR OWN RISK.
        
        Blocks an input candidate set of tuple pairs based on the overlap
        of token sets of attribute values. Finds tuple pairs from an input 
        candidate set of tuple pairs such that
        the overlap between (a) the set of tokens obtained by tokenizing the
        value of attribute l_overlap_attr of the left tuple in a tuple pair,
        and (b) the set of tokens obtained by tokenizing the value of
        attribute r_overlap_attr of the right tuple in the tuple pair,
        is above a certain threshold.

        Args:
            candset (DataFrame): The input candidate set of tuple pairs.

            l_overlap_attr (string): The overlap attribute in left table.

            r_overlap_attr (string): The overlap attribute in right table.

            rem_stop_words (boolean): A flag to indicate whether stop words
                                      (e.g., a, an, the) should be removed
                                      from the token sets of the overlap
                                      attribute values (defaults to False).

            q_val (int): The value of q to use if the overlap attributes values
                         are to be tokenized as qgrams (defaults to None).

            word_level (boolean): A flag to indicate whether the overlap
                                  attributes should be tokenized as words
                                  (i.e, using whitespace as delimiter)
                                  (defaults to True).

            overlap_size (int): The minimum number of tokens that must overlap
                                (defaults to 1).

            allow_missing (boolean): A flag to indicate whether tuple pairs
                                     with missing value in at least one of the
                                     blocking attributes should be included in
                                     the output candidate set (defaults to
                                     False). If this flag is set to True, a
                                     tuple pair with missing value in either
                                     blocking attribute will be retained in the
                                     output candidate set.

            verbose (boolean): A flag to indicate whether the debug information

                should be logged (defaults to False).

            show_progress (boolean): A flag to indicate whether progress should
                                     be displayed to the user (defaults to True).

            n_chunks (int): The number of partitions to split the candidate set. If it 
                            is set to -1, the number of partitions will be set to the 
                            number of cores in the machine.  

        Returns:
            A candidate set of tuple pairs that survived blocking (DataFrame).

        Raises:
            AssertionError: If `candset` is not of type pandas
                DataFrame.
            AssertionError: If `l_overlap_attr` is not of type string.
            AssertionError: If `r_overlap_attr` is not of type string.
            AssertionError: If `q_val` is not of type int.
            AssertionError: If `word_level` is not of type boolean.
            AssertionError: If `overlap_size` is not of type int.
            AssertionError: If `verbose` is not of type
                boolean.
            AssertionError: If `allow_missing` is not of type boolean.
            AssertionError: If `show_progress` is not of type
                boolean.
            AssertionError: If `n_chunks` is not of type
                int.
            AssertionError: If `l_overlap_attr` is not in the ltable
                columns.
            AssertionError: If `r_block_attr` is not in the rtable columns.
            SyntaxError: If `q_val` is set to a valid value and
                `word_level` is set to True.
            SyntaxError: If `q_val` is set to None and
                `word_level` is set to False.
        Examples:
            >>> import py_entitymatching as em
            >>> from py_entitymatching.dask.dask_overlap_blocker import DaskOverlapBlocker
            >>> A = em.read_csv_metadata('path_to_csv_dir/table_A.csv', key='ID')
            >>> B = em.read_csv_metadata('path_to_csv_dir/table_B.csv', key='ID')
            >>> ob = DaskOverlapBlocker()
            >>> C = ob.block_tables(A, B, 'address', 'address', l_output_attrs=['name'], r_output_attrs=['name'])

            >>> D1 = ob.block_candset(C, 'name', 'name', allow_missing=True)
            # Include all possible tuple pairs with missing values
            >>> D2 = ob.block_candset(C, 'name', 'name', allow_missing=True)
            # Execute blocking using multiple cores
            >>> D3 = ob.block_candset(C, 'name', 'name', n_chunks=-1)
            # Use q-gram tokenizer
            >>> D2 = ob.block_candset(C, 'name', 'name', word_level=False, q_val=2)


        """
        logger.warning(
            "WARNING THIS BLOCKER IS EXPERIMENTAL AND NOT TESTED. USE AT YOUR OWN "
            "RISK.")

        # Validate input parameters
        self.validate_types_params_candset(candset, verbose, show_progress, n_chunks)
        self.validate_types_other_params(l_overlap_attr, r_overlap_attr,
                                         rem_stop_words, q_val, word_level, overlap_size)

        # get and validate metadata
        log_info(logger,
                 'Required metadata: cand.set key, fk ltable, fk rtable, '
                 'ltable, rtable, ltable key, rtable key', verbose)

        # # get metadata
        key, fk_ltable, fk_rtable, ltable, rtable, l_key, r_key = cm.get_metadata_for_candset(
            candset, logger, verbose)

        # # validate metadata
        cm._validate_metadata_for_candset(candset, key, fk_ltable, fk_rtable,
                                          ltable, rtable, l_key, r_key,
                                          logger, verbose)

        # validate overlap attrs
        self.validate_overlap_attrs(ltable, rtable, l_overlap_attr,
                                    r_overlap_attr)

        # validate word_level and q_val
        self.validate_word_level_qval(word_level, q_val)

        # validate number of chunks
        validate_object_type(n_chunks, int, 'Parameter n_chunks')
        validate_chunks(n_chunks)


        # # do projection before merge
        l_df = ltable[[l_key, l_overlap_attr]]
        r_df = rtable[[r_key, r_overlap_attr]]

        # # set index for convenience
        l_df = l_df.set_index(l_key, drop=False)
        r_df = r_df.set_index(r_key, drop=False)

        # # case the overlap attribute to string if required.
        l_df.is_copy, r_df.is_copy = False, False  # to avoid setwithcopy warning
        ssj.dataframe_column_to_str(l_df, l_overlap_attr, inplace=True)
        ssj.dataframe_column_to_str(r_df, r_overlap_attr, inplace=True)

        if word_level == True:
            tokenizer = WhitespaceTokenizer(return_set=True)
        else:
            tokenizer = QgramTokenizer(return_set=True)


        n_chunks = get_num_partitions(n_chunks, len(candset))
        c_splits = pd.np.array_split(candset, n_chunks)
        valid_splits = []

        # Create DAG
        for i in range(n_chunks):
            result = delayed(self._block_candset_split)(c_splits[i], l_df, r_df, l_key,
                                                       r_key, l_overlap_attr,
                                                       r_overlap_attr, fk_ltable,
                                                       fk_rtable, allow_missing,
                                                       rem_stop_words, tokenizer, overlap_size)
            valid_splits.append(result)
        valid_splits = delayed(wrap)(valid_splits)

        # Execute the DAG
        if show_progress:
            with ProgressBar():
                valid_splits = valid_splits.compute(scheduler="processes",
                                                    num_workers=get_num_cores())
        else:
            valid_splits = valid_splits.compute(scheduler="processes",
                                                num_workers=get_num_cores())

        valid = sum(valid_splits, [])

        # construct output table
        if len(candset) > 0:
            out_table = candset[valid]
        else:
            out_table = pd.DataFrame(columns=candset.columns)

        # update the catalog
        cm.set_candset_properties(out_table, key, fk_ltable, fk_rtable,
                                  ltable, rtable)

        # return the output table
        return out_table
    def block_tables(self, ltable, rtable,
                     l_output_attrs=None, r_output_attrs=None,
                     l_output_prefix='ltable_', r_output_prefix='rtable_',
                     verbose=False, show_progress=True, n_ltable_chunks=1,
                     n_rtable_chunks=1):
        """
        WARNING THIS COMMAND IS EXPERIMENTAL AND NOT TESTED. USE AT YOUR OWN RISK.        

        Blocks two tables based on a black box blocking function specified
        by the user.
        Finds tuple pairs from left and right tables that survive the black
        box function. A tuple pair survives the black box blocking function if
        the function returns False for that pair, otherwise the tuple pair is
        dropped.
        
        Args:
            ltable (DataFrame): The left input table.
            rtable (DataFrame): The right input table.
            l_output_attrs (list): A list of attribute names from the left
                                   table to be included in the
                                   output candidate set (defaults to None).
            r_output_attrs (list): A list of attribute names from the right
                                   table to be included in the
                                   output candidate set (defaults to None).
            l_output_prefix (string): The prefix to be used for the attribute names
                                   coming from the left table in the output
                                   candidate set (defaults to 'ltable\_').
            r_output_prefix (string): The prefix to be used for the attribute names
                                   coming from the right table in the output
                                   candidate set (defaults to 'rtable\_').
            verbose (boolean): A flag to indicate whether the debug
             information should be logged (defaults to False).
            show_progress (boolean): A flag to indicate whether progress should
                                     be displayed to the user (defaults to True).
                                     
            n_ltable_chunks (int): The number of partitions to split the left table (
                                    defaults to 1). If it is set to -1, then the number of 
                                    partitions is set to the number of cores in the 
                                    machine.                                      
            n_rtable_chunks (int): The number of partitions to split the right table (
                                    defaults to 1). If it is set to -1, then the number of 
                                    partitions is set to the number of cores in the 
                                    machine.            
                                     
        Returns:

            A candidate set of tuple pairs that survived blocking (DataFrame).

        Raises:
            AssertionError: If `ltable` is not of type pandas
                DataFrame.
            AssertionError: If `rtable` is not of type pandas
                DataFrame.
            AssertionError: If `l_output_attrs` is not of type of
                list.
            AssertionError: If `r_output_attrs` is not of type of
                list.
            AssertionError: If values in `l_output_attrs` is not of type
                string.
            AssertionError: If values in `r_output_attrs` is not of type
                string.
            AssertionError: If `l_output_prefix` is not of type
                string.
            AssertionError: If `r_output_prefix` is not of type
                string.
            AssertionError: If `verbose` is not of type
                boolean.
            AssertionError: If `show_progress` is not of type boolean.
            AssertionError: If `n_ltable_chunks` is not of type
                int.
            AssertionError: If `n_rtable_chunks` is not of type
                int.
            AssertionError: If `l_out_attrs` are not in the ltable.
            AssertionError: If `r_out_attrs` are not in the rtable.
        Examples:
            >>> def match_last_name(ltuple, rtuple):
                # assume that there is a 'name' attribute in the input tables
                # and each value in it has two words
                l_last_name = ltuple['name'].split()[1]
                r_last_name = rtuple['name'].split()[1]
                if l_last_name != r_last_name:
                    return True
                else:
                    return False
            >>> import py_entitymatching as em
            >>> from py_entitymatching.dask.dask_black_box_blocker DaskBlackBoxBlocker
            >>> bb = DaskBlackBoxBlocker()
            >>> bb.set_black_box_function(match_last_name)
            >>> C = bb.block_tables(A, B, l_output_attrs=['name'], r_output_attrs=['name'] )
        """

        logger.warning(
            "WARNING THIS COMMAND IS EXPERIMENTAL AND NOT TESTED. USE AT YOUR OWN RISK.")


        # validate data types of standard input parameters
        self.validate_types_params_tables(ltable, rtable,
                                          l_output_attrs, r_output_attrs,
                                          l_output_prefix, r_output_prefix,
                                          verbose, 1)

        # validate data type of show_progress
        self.validate_show_progress(show_progress)

        # validate black box function
        assert self.black_box_function != None, 'Black box function is not set'

        # validate output attributes
        self.validate_output_attrs(ltable, rtable, l_output_attrs, r_output_attrs)

        # get and validate metadata
        log_info(logger, 'Required metadata: ltable key, rtable key', verbose)

        # # get metadata
        l_key, r_key = cm.get_keys_for_ltable_rtable(ltable, rtable, logger, verbose)

        # # validate metadata
        cm._validate_metadata_for_table(ltable, l_key, 'ltable', logger, verbose)
        cm._validate_metadata_for_table(rtable, r_key, 'rtable', logger, verbose)

        # validate number of ltable and rtable chunks
        validate_object_type(n_ltable_chunks, int, 'Parameter n_ltable_chunks')
        validate_object_type(n_rtable_chunks, int, 'Parameter n_rtable_chunks')

        validate_chunks(n_ltable_chunks)
        validate_chunks(n_rtable_chunks)

        # # determine the number of chunks
        n_ltable_chunks = get_num_partitions(n_ltable_chunks, len(ltable))
        n_rtable_chunks = get_num_partitions(n_rtable_chunks, len(rtable))

        # do blocking

        # # set index for convenience
        l_df = ltable.set_index(l_key, drop=False)
        r_df = rtable.set_index(r_key, drop=False)

        # # remove l_key from l_output_attrs and r_key from r_output_attrs
        l_output_attrs_1 = []
        if l_output_attrs:
            l_output_attrs_1 = [x for x in l_output_attrs if x != l_key]
        r_output_attrs_1 = []
        if r_output_attrs:
            r_output_attrs_1 = [x for x in r_output_attrs if x != r_key]



        # # pickle the black-box function before passing it as an arg to
        # # _block_tables_split to be executed by each child process
        black_box_function_pkl = cp.dumps(self.black_box_function)

        if n_ltable_chunks == 1 and n_rtable_chunks == 1:
            # single process
            candset = _block_tables_split(l_df, r_df, l_key, r_key,
                                          l_output_attrs_1, r_output_attrs_1,
                                          l_output_prefix, r_output_prefix,
                                          black_box_function_pkl, show_progress)
        else:
            # multiprocessing
            l_splits = pd.np.array_split(l_df, n_ltable_chunks)
            r_splits = pd.np.array_split(r_df, n_rtable_chunks)

            c_splits = []
            for i in range(len(l_splits)):
                for j in range(len(r_splits)):
                    partial_result = delayed(_block_tables_split)(l_splits[i], r_splits[j],
                                             l_key, r_key,
                                             l_output_attrs_1, r_output_attrs_1,
                                             l_output_prefix, r_output_prefix,
                                             black_box_function_pkl, False)
                    c_splits.append(partial_result)
            c_splits = delayed(wrap)(c_splits)
            if show_progress:
                with ProgressBar():
                    c_splits = c_splits.compute(scheduler="processes", num_workers=get_num_cores())
            else:
                c_splits = c_splits.compute(scheduler="processes", num_workers=get_num_cores())

            candset = pd.concat(c_splits, ignore_index=True)

        # # determine the attributes to retain in the output candidate set
        retain_cols = self.get_attrs_to_retain(l_key, r_key,
                                               l_output_attrs, r_output_attrs,
                                               l_output_prefix, r_output_prefix)
        if len(candset) > 0:
            candset = candset[retain_cols]
        else:
            candset = pd.DataFrame(columns=retain_cols)

        # update catalog
        key = get_name_for_key(candset.columns)
        candset = add_key_column(candset, key)
        cm.set_candset_properties(candset, key, l_output_prefix + l_key,
                                  r_output_prefix + r_key, ltable, rtable)

        # return candidate set
        return candset
    def block_candset(self, candset, verbose=True, show_progress=True, n_chunks=1):

        """
        WARNING THIS COMMAND IS EXPERIMENTAL AND NOT TESTED. USE AT YOUR OWN RISK.


        Blocks an input candidate set of tuple pairs based on a black box
        blocking function specified by the user.

        Finds tuple pairs from an input candidate set of tuple pairs that
        survive the black box function. A tuple pair survives the black box
        blocking function if the function returns False for that pair,
        otherwise the tuple pair is dropped.

        Args:
            candset (DataFrame): The input candidate set of tuple pairs.

            verbose (boolean): A flag to indicate whether logging should be done
                               (defaults to False).

            show_progress (boolean): A flag to indicate whether progress should
                                     be displayed to the user (defaults to True).

            n_chunks (int): The number of partitions to split the candidate set. If it 
                            is set to -1, the number of partitions will be set to the 
                            number of cores in the machine.  

        Returns:
            A candidate set of tuple pairs that survived blocking (DataFrame).

        Raises:
            AssertionError: If `candset` is not of type pandas
                DataFrame.
            AssertionError: If `verbose` is not of type
                boolean.
            AssertionError: If `n_chunks` is not of type
                int.
            AssertionError: If `show_progress` is not of type boolean.
            AssertionError: If `l_block_attr` is not in the ltable columns.
            AssertionError: If `r_block_attr` is not in the rtable columns.

        Examples:
            >>> def match_last_name(ltuple, rtuple):
                # assume that there is a 'name' attribute in the input tables
                # and each value in it has two words
                l_last_name = ltuple['name'].split()[1]
                r_last_name = rtuple['name'].split()[1]
                if l_last_name != r_last_name:
                    return True
                else:
                    return False
            >>> import py_entitymatching as em
            >>> from py_entitymatching.dask.dask_black_box_blocker import DaskBlackBoxBlocker
            >>> bb = DaskBlackBoxBlocker()
            >>> bb.set_black_box_function(match_last_name)
            >>> D = bb.block_candset(C) # C is an output from block_tables
        """

        logger.warning(
            "WARNING THIS COMMAND IS EXPERIMENTAL AND NOT TESTED. USE AT YOUR OWN RISK.")


        # validate data types of standard input parameters
        self.validate_types_params_candset(candset, verbose, show_progress, n_chunks)

        # validate black box functionn
        assert self.black_box_function != None, 'Black box function is not set'

        # get and validate metadata
        log_info(logger, 'Required metadata: cand.set key, fk ltable, fk rtable, '
                         'ltable, rtable, ltable key, rtable key', verbose)

        # # get metadata
        key, fk_ltable, fk_rtable, ltable, rtable, l_key, r_key = cm.get_metadata_for_candset(
            candset, logger, verbose)

        # # validate metadata
        cm._validate_metadata_for_candset(candset, key, fk_ltable, fk_rtable, ltable,
                                          rtable, l_key, r_key,
                                          logger, verbose)

        validate_object_type(n_chunks, int, 'Parameter n_chunks')
        validate_chunks(n_chunks)
        # do blocking

        # # set index for convenience
        l_df = ltable.set_index(l_key, drop=False)
        r_df = rtable.set_index(r_key, drop=False)

        # # project candset to keep only the ID attributes
        c_df = candset[[key, fk_ltable, fk_rtable]]

        # # determine the number of processes to launch parallely
        n_chunks = get_num_partitions(n_chunks, len(candset))

        # # pickle the black-box function before passing it as an arg to
        # # _block_candset_split to be executed by each child process
        black_box_function_pkl = cp.dumps(self.black_box_function)

        valid = []
        if n_chunks == 1:
            # single process
            valid = _block_candset_split(c_df, l_df, r_df, l_key, r_key,
                                         fk_ltable, fk_rtable,
                                         black_box_function_pkl, show_progress)
        else:
            # multiprocessing
            c_splits = pd.np.array_split(c_df, n_chunks)

            valid_splits = []
            for i in range(len(c_splits)):
                partial_result =  delayed(_block_candset_split)(c_splits[i],
                                              l_df, r_df,
                                              l_key, r_key,
                                              fk_ltable, fk_rtable,
                                              black_box_function_pkl, False)
                valid_splits.append(partial_result)

            valid_splits = delayed(wrap)(valid_splits)
            if show_progress:
                with ProgressBar():
                    valid_splits = valid_splits.compute(scheduler="processes",
                                                        num_workers=get_num_cores())
            else:
                valid_splits = valid_splits.compute(scheduler="processes",
                                                    num_workers=get_num_cores())

            valid = sum(valid_splits, [])

        # construct output table
        if len(c_df) > 0:
            c_df = candset[valid]
        else:
            c_df = pd.DataFrame(columns=candset.columns)

        # update catalog
        cm.set_candset_properties(c_df, key, fk_ltable, fk_rtable, ltable, rtable)

        # return candidate set
        return c_df
    def predict(self, x=None, table=None, exclude_attrs=None, target_attr=None,
                append=False, return_probs=False, probs_attr=None, inplace=True,
                show_progress=False, n_chunks=1):
        """
        WARNING THIS COMMAND IS EXPERIMENTAL AND NOT TESTED. USE AT YOUR OWN RISK.

        Predict interface for the matcher.

        Specifically, there are two ways the user can call the predict method.
        First, interface similar to scikit-learn where the feature vectors
        given as projected DataFrame.
        Second, give the DataFrame and explicitly specify the feature vectors
        (by specifying the attributes to be excluded) .

        A point to note is all the input parameters have a default value of
        None. This is done to support both the interfaces in a single function.

        Currently, the Dask implementation supports only the cases when the table is not 
        None and the flags inplace, append are False. 


        Args:
            x (DataFrame): The input pandas DataFrame containing only feature
                vectors (defaults to None).
            table (DataFrame): The input pandas DataFrame containing feature
                vectors, and may be other attributes (defaults to None).
            exclude_attrs (list): A list of attributes to be excluded from the
                input table to get the feature vectors (defaults to None).
            target_attr (string): The attribute name where the predictions
                need to be stored in the input table (defaults to None).
            probs_attr (string): The attribute name where the prediction probabilities 
                need to be stored in the input table (defaults to None).
            append (boolean): A flag to indicate whether the predictions need
                to be appended in the input DataFrame (defaults to False).
            return_probs (boolean): A flag to indicate where the prediction probabilities
                need to be returned (defaults to False). If set to True, returns the 
                probability if the pair was a match.
            inplace (boolean): A flag to indicate whether the append needs to be
                done inplace (defaults to True).
            show_progress (boolean): A flag to indicate whether the progress of
                extracting feature vectors must be displayed (defaults to True).
            n_chunks (int): The number of partitions to split the candidate set. If it 
                is set to -1, the number of partitions will be set to the 
                number of cores in the machine.  


        Returns:
            An array of predictions or a DataFrame with predictions updated.

        """

        logger.warning(
            "WARNING THIS COMMAND IS EXPERIMENTAL AND NOT TESTED. USE AT YOUR OWN RISK.")

        if x is not None:
            return self._predict(x, table, exclude_attrs, target_attr, append,
                                 return_probs, probs_attr, inplace)
        else:
            n_chunks = get_num_partitions(n_chunks, len(table))
            if n_chunks == 1 or inplace == True or append == False:
                # When the inplace flag is True, the predictions (and probs) are added
                # in place. If he have to use Dask then we have to modify _predict (
                # specifically _predict_sk_learn) function.
                # So, to keep things simple, we support Dask only when
                # inplace=False

                # Similarly, when append=False, the return value from _predict will be
                # different for different cases (for example, when return_probs is True
                #  or False). If we have to use Dask then we have to careful in
                # recording the return values for each chunk.
                # So, to keep things simple, we support Dask only when
                # append=True


                result = self._predict(table=table, exclude_attrs=exclude_attrs,
                                     target_attr=target_attr, append=append,
                                     return_probs=return_probs, probs_attr=probs_attr,
                                     inplace=inplace, copy_props=True)

            else:
                predicted_results = []
                splitted_tables = pd.np.array_split(table, n_chunks)
                for i in range(len(splitted_tables)):
                    partial_result = delayed(self._predict)(table=splitted_tables[i],
                                                            exclude_attrs=exclude_attrs, target_attr=target_attr,
                                                            append=append,
                                                            return_probs=return_probs,
                                                            probs_attr=probs_attr,
                                                            inplace=inplace,
                                                            copy_props=False)
                    predicted_results.append(partial_result)
                predicted_results = delayed(wrap)(predicted_results)
                if show_progress:
                    with ProgressBar():
                        predicted_results = predicted_results.compute(
                            scheduler="processes", num_workers=get_num_cores())
                else:
                     predicted_results = predicted_results.compute(
                        scheduler="processes", num_workers=get_num_cores())


                result = pd.concat(predicted_results)
                cm.copy_properties(table, result)
                return result
def dask_down_sample(ltable,
                     rtable,
                     size,
                     y_param,
                     show_progress=True,
                     verbose=False,
                     seed=None,
                     rem_stop_words=True,
                     rem_puncs=True,
                     n_ltable_chunks=1,
                     n_sample_rtable_chunks=1):
    """
        WARNING THIS COMMAND IS EXPERIMENTAL AND NOT TESTED. USE AT YOUR OWN RISK.
         
        This command down samples two tables A and B into smaller tables A' and
        B' respectively.    
        Specifically, first it randomly selects `size` tuples
        from the table B to be table B'. Next, it builds an inverted index I
        (token, tuple_id) on table A. For each tuple x ∈ B', the algorithm
        finds a set P of k/2 tuples from I that match x,
        and a set Q of k/2 tuples randomly selected from A - P.
        The idea is for A' and B' to share some matches yet be
        as representative of A and B as possible.
    
        Args:
            ltable (DataFrame): The left input table, i.e., table A.
            rtable (DataFrame): The right input table, i.e., table B. 
            size (int): The size that table B should be down sampled to.
            y_param (int): The parameter to control the down sample size of table A.
                Specifically, the down sampled size of table A should be close to
                size * y_param.
            show_progress (boolean): A flag to indicate whether a progress bar
                should be displayed (defaults to True).
            verbose (boolean): A flag to indicate whether the debug information
             should be displayed (defaults to False).
            seed (int): The seed for the pseudo random number generator to select
                the tuples from A and B (defaults to None).
            rem_stop_words (boolean): A flag to indicate whether a default set of stop words 
             must be removed.
            rem_puncs (boolean): A flag to indicate whether the punctuations must be 
             removed from the strings.             
            n_ltable_chunks (int): The number of partitions for ltable (defaults to 1). If it 
              is set to -1, the number of partitions will be set to the 
              number of cores in the machine.  
            n_sample_rtable_chunks (int): The number of partitions for the 
              sampled rtable (defaults to 1)
                
    
        Returns:
            Down sampled tables A and B as pandas DataFrames.
    
        Raises:
            AssertionError: If any of the input tables (`table_a`, `table_b`) are
                empty or not a DataFrame.
            AssertionError: If `size` or `y_param` is empty or 0 or not a
                valid integer value.
            AssertionError: If `seed` is not a valid integer
                value.
            AssertionError: If `verbose` is not of type bool.
            AssertionError: If `show_progress` is not of type bool.
            AssertionError: If `n_ltable_chunks` is not of type int.
            AssertionError: If `n_sample_rtable_chunks` is not of type int.            
    
        Examples:
            >>> from py_entitymatching.dask.dask_down_sample import dask_down_sample
            >>> A = em.read_csv_metadata('path_to_csv_dir/table_A.csv', key='ID')
            >>> B = em.read_csv_metadata('path_to_csv_dir/table_B.csv', key='ID')
            >>> sample_A, sample_B = dask_down_sample(A, B, 500, 1, n_ltable_chunks=-1, n_sample_rtable_chunks=-1)
            # Example with seed = 0. This means the same sample data set will be returned
            # each time this function is run.
            >>> A = em.read_csv_metadata('path_to_csv_dir/table_A.csv', key='ID')
            >>> B = em.read_csv_metadata('path_to_csv_dir/table_B.csv', key='ID')
            >>> sample_A, sample_B = dask_down_sample(A, B, 500, 1, seed=0, n_ltable_chunks=-1, n_sample_rtable_chunks=-1)
            
        """

    logger.warning(
        "WARNING THIS COMMAND IS EXPERIMENTAL AND NOT TESTED. USE AT YOUR OWN "
        "RISK.")

    # validation checks
    if not isinstance(ltable, pd.DataFrame):
        logger.error('Input table A (ltable) is not of type pandas DataFrame')
        raise AssertionError(
            'Input table A (ltable) is not of type pandas DataFrame')

    if not isinstance(rtable, pd.DataFrame):
        logger.error('Input table B (rtable) is not of type pandas DataFrame')

        raise AssertionError(
            'Input table B (rtable) is not of type pandas DataFrame')

    if len(ltable) == 0 or len(rtable) == 0:
        logger.error('Size of the input table is 0')
        raise AssertionError('Size of the input table is 0')

    if size == 0 or y_param == 0:
        logger.error(
            'size or y cannot be zero (3rd and 4th parameter of downsample)')
        raise AssertionError(
            'size or y_param cannot be zero (3rd and 4th parameter of downsample)'
        )

    if seed is not None and not isinstance(seed, int):
        logger.error('Seed is not of type integer')
        raise AssertionError('Seed is not of type integer')

    if len(rtable) < size:
        logger.warning(
            'Size of table B is less than b_size parameter - using entire table B'
        )

    validate_object_type(verbose, bool, 'Parameter verbose')
    validate_object_type(show_progress, bool, 'Parameter show_progress')
    validate_object_type(rem_stop_words, bool, 'Parameter rem_stop_words')
    validate_object_type(rem_puncs, bool, 'Parameter rem_puncs')
    validate_object_type(n_ltable_chunks, int, 'Parameter n_ltable_chunks')
    validate_object_type(n_sample_rtable_chunks, int,
                         'Parameter n_sample_rtable_chunks')

    rtable_sampled = sample_right_table(rtable, size, seed)

    ltbl_str_cols = _get_str_cols_list(ltable)
    proj_ltable = ltable[ltable.columns[ltbl_str_cols]]

    if n_ltable_chunks == -1:
        n_ltable_chunks = get_num_cores()

    ltable_chunks = np.array_split(proj_ltable, n_ltable_chunks)
    preprocessed_tokenized_tbl = []

    # Use Dask to preprocess and tokenize strings.
    start_row_id = 0
    for i in range(len(ltable_chunks)):
        # start_row_id is internally used by process_tokenize_concat strings to map
        # each to its row id in the ltable.
        result = delayed(process_tokenize_concat_strings)(ltable_chunks[i],
                                                          start_row_id,
                                                          rem_puncs,
                                                          rem_stop_words)
        preprocessed_tokenized_tbl.append(result)

        # update start_row_id
        start_row_id += len(ltable_chunks[i])

    preprocessed_tokenized_tbl = delayed(wrap)(preprocessed_tokenized_tbl)

    # Now execute the DAG
    if show_progress:
        with ProgressBar():
            logger.info('Preprocessing/tokenizing ltable')
            preprocessed_tokenized_tbl_vals = preprocessed_tokenized_tbl.compute(
                scheduler="processes", num_workers=get_num_cores())
    else:
        preprocessed_tokenized_tbl_vals = preprocessed_tokenized_tbl.compute(
            scheduler="processes", num_workers=get_num_cores())

    ltable_processed_dict = {}
    for i in range(len(preprocessed_tokenized_tbl_vals)):
        ltable_processed_dict.update(preprocessed_tokenized_tbl_vals[i])

    # Build an inverted index
    inverted_index = build_inverted_index(ltable_processed_dict)

    # Preprocess/tokenize sampled rtable and probe
    rtbl_str_cols = _get_str_cols_list(rtable_sampled)
    proj_rtable_sampled = rtable_sampled[rtable_sampled.columns[rtbl_str_cols]]

    if n_sample_rtable_chunks == -1:
        n_sample_rtable_chunks = get_num_cores()

    rtable_chunks = np.array_split(proj_rtable_sampled, n_sample_rtable_chunks)
    probe_result = []

    # Create the DAG
    for i in range(len(rtable_chunks)):
        result = delayed(probe)(rtable_chunks[i], y_param, len(proj_ltable),
                                inverted_index, rem_puncs, rem_stop_words,
                                seed)
        probe_result.append(result)

    probe_result = delayed(wrap)(probe_result)

    # Execute the DAG
    if show_progress:
        with ProgressBar():
            logger.info('Probing using rtable')
            probe_result = probe_result.compute(
                scheduler="processes", num_workers=multiprocessing.cpu_count())
    else:
        probe_result = probe_result.compute(
            scheduler="processes", num_workers=multiprocessing.cpu_count())

    probe_result = map(list, probe_result)
    l_tbl_indices = set(sum(probe_result, []))

    l_tbl_indices = list(l_tbl_indices)
    ltable_sampled = ltable.iloc[l_tbl_indices]

    # update catalog
    if cm.is_dfinfo_present(ltable):
        cm.copy_properties(ltable, ltable_sampled)

    if cm.is_dfinfo_present(rtable):
        cm.copy_properties(rtable, rtable_sampled)

    return ltable_sampled, rtable_sampled
Beispiel #18
0
def dask_extract_feature_vecs(candset,
                              attrs_before=None,
                              feature_table=None,
                              attrs_after=None,
                              verbose=False,
                              show_progress=True,
                              n_chunks=1):
    """
    WARNING THIS COMMAND IS EXPERIMENTAL AND NOT TESTED. USE AT YOUR OWN RISK

    This function extracts feature vectors from a DataFrame (typically a
    labeled candidate set).

    Specifically, this function uses feature
    table, ltable and rtable (that is present in the `candset`'s
    metadata) to extract feature vectors.

    Args:
        candset (DataFrame): The input candidate set for which the features
            vectors should be extracted.
            
        attrs_before (list): The list of attributes from the input candset,
            that should be added before the feature vectors (defaults to None).
            
        feature_table (DataFrame): A DataFrame containing a list of
            features that should be used to compute the feature vectors (
            defaults to None).
            
        attrs_after (list): The list of attributes from the input candset
            that should be added after the feature vectors (defaults to None).
            
        verbose (boolean): A flag to indicate whether the debug information
            should be displayed (defaults to False).
            
        show_progress (boolean): A flag to indicate whether the progress of
            extracting feature vectors must be displayed (defaults to True).
            
        n_chunks (int): The number of partitions to split the candidate set. If it 
            is set to -1, the number of partitions will be set to the 
            number of cores in the machine.  


    Returns:
        A pandas DataFrame containing feature vectors.

        The DataFrame will have metadata ltable and rtable, pointing
        to the same ltable and rtable as the input candset.

        Also, the output
        DataFrame will have three columns: key, foreign key ltable, foreign
        key rtable copied from input candset to the output DataFrame. These
        three columns precede the columns mentioned in `attrs_before`.



    Raises:
        AssertionError: If `candset` is not of type pandas
            DataFrame.
        AssertionError: If `attrs_before` has attributes that
            are not present in the input candset.
        AssertionError: If `attrs_after` has attribtues that
            are not present in the input candset.
        AssertionError: If `feature_table` is set to None.
        AssertionError: If `n_chunks` is not of type
                int.

    Examples:
        >>> import py_entitymatching as em
        >>> from py_entitymatching.dask.dask_extract_features import dask_extract_feature_vecs
        >>> A = em.read_csv_metadata('path_to_csv_dir/table_A.csv', key='ID')
        >>> B = em.read_csv_metadata('path_to_csv_dir/table_B.csv', key='ID')
        >>> match_f = em.get_features_for_matching(A, B)
        >>> # G is the labeled dataframe which should be converted into feature vectors
        >>> H = dask_extract_feature_vecs(G, features=match_f, attrs_before=['title'], attrs_after=['gold_labels'])


    """
    logger.warning(
        "WARNING THIS COMMAND IS EXPERIMENTAL AND NOT TESTED. USE AT YOUR OWN RISK."
    )

    # Validate input parameters

    # # We expect the input candset to be of type pandas DataFrame.
    validate_object_type(candset, pd.DataFrame, error_prefix='Input cand.set')

    # # If the attrs_before is given, Check if the attrs_before are present in
    # the input candset
    if attrs_before != None:
        if not ch.check_attrs_present(candset, attrs_before):
            logger.error(
                'The attributes mentioned in attrs_before is not present '
                'in the input table')
            raise AssertionError(
                'The attributes mentioned in attrs_before is not present '
                'in the input table')

    # # If the attrs_after is given, Check if the attrs_after are present in
    # the input candset
    if attrs_after != None:
        if not ch.check_attrs_present(candset, attrs_after):
            logger.error(
                'The attributes mentioned in attrs_after is not present '
                'in the input table')
            raise AssertionError(
                'The attributes mentioned in attrs_after is not present '
                'in the input table')

    # We expect the feature table to be a valid object
    if feature_table is None:
        logger.error('Feature table cannot be null')
        raise AssertionError('The feature table cannot be null')

    # Do metadata checking
    # # Mention what metadata is required to the user
    ch.log_info(
        logger, 'Required metadata: cand.set key, fk ltable, '
        'fk rtable, '
        'ltable, rtable, ltable key, rtable key', verbose)

    # # Get metadata
    ch.log_info(logger, 'Getting metadata from catalog', verbose)

    key, fk_ltable, fk_rtable, ltable, rtable, l_key, r_key = \
        cm.get_metadata_for_candset(
            candset, logger, verbose)

    # # Validate metadata
    ch.log_info(logger, 'Validating metadata', verbose)
    cm._validate_metadata_for_candset(candset, key, fk_ltable, fk_rtable,
                                      ltable, rtable, l_key, r_key, logger,
                                      verbose)

    # Extract features

    # id_list = [(row[fk_ltable], row[fk_rtable]) for i, row in
    #            candset.iterrows()]
    # id_list = [tuple(tup) for tup in candset[[fk_ltable, fk_rtable]].values]

    # # Set index for convenience
    l_df = ltable.set_index(l_key, drop=False)
    r_df = rtable.set_index(r_key, drop=False)

    # # Apply feature functions
    ch.log_info(logger, 'Applying feature functions', verbose)
    col_names = list(candset.columns)
    fk_ltable_idx = col_names.index(fk_ltable)
    fk_rtable_idx = col_names.index(fk_rtable)

    validate_object_type(n_chunks, int, 'Parameter n_chunks')
    validate_chunks(n_chunks)

    n_chunks = get_num_partitions(n_chunks, len(candset))

    c_splits = np.array_split(candset, n_chunks)

    pickled_obj = cloudpickle.dumps(feature_table)

    feat_vals_by_splits = []

    for i in range(len(c_splits)):
        partial_result = delayed(get_feature_vals_by_cand_split)(
            pickled_obj, fk_ltable_idx, fk_rtable_idx, l_df, r_df, c_splits[i],
            False)
        feat_vals_by_splits.append(partial_result)

    feat_vals_by_splits = delayed(wrap)(feat_vals_by_splits)
    if show_progress:
        with ProgressBar():
            feat_vals_by_splits = feat_vals_by_splits.compute(
                scheduler="processes", num_workers=get_num_cores())
    else:
        feat_vals_by_splits = feat_vals_by_splits.compute(
            scheduler="processes", num_workers=get_num_cores())

    feat_vals = sum(feat_vals_by_splits, [])

    # Construct output table
    feature_vectors = pd.DataFrame(feat_vals, index=candset.index.values)
    # # Rearrange the feature names in the input feature table order
    feature_names = list(feature_table['feature_name'])
    feature_vectors = feature_vectors[feature_names]

    ch.log_info(logger, 'Constructing output table', verbose)
    # print(feature_vectors)
    # # Insert attrs_before
    if attrs_before:
        if not isinstance(attrs_before, list):
            attrs_before = [attrs_before]
        attrs_before = gh.list_diff(attrs_before, [key, fk_ltable, fk_rtable])
        attrs_before.reverse()
        for a in attrs_before:
            feature_vectors.insert(0, a, candset[a])

    # # Insert keys
    feature_vectors.insert(0, fk_rtable, candset[fk_rtable])
    feature_vectors.insert(0, fk_ltable, candset[fk_ltable])
    feature_vectors.insert(0, key, candset[key])

    # # insert attrs after
    if attrs_after:
        if not isinstance(attrs_after, list):
            attrs_after = [attrs_after]
        attrs_after = gh.list_diff(attrs_after, [key, fk_ltable, fk_rtable])
        attrs_after.reverse()
        col_pos = len(feature_vectors.columns)
        for a in attrs_after:
            feature_vectors.insert(col_pos, a, candset[a])
            col_pos += 1

    # Reset the index
    # feature_vectors.reset_index(inplace=True, drop=True)

    # # Update the catalog
    cm.init_properties(feature_vectors)
    cm.copy_properties(candset, feature_vectors)

    # Finally, return the feature vectors
    return feature_vectors
Beispiel #19
0
    def predict(self, x=None, table=None, exclude_attrs=None, target_attr=None,
                append=False, return_probs=False, probs_attr=None, inplace=True,
                show_progress=False, n_chunks=1):
        """
        WARNING THIS COMMAND IS EXPERIMENTAL AND NOT TESTED. USE AT YOUR OWN RISK.

        Predict interface for the matcher.

        Specifically, there are two ways the user can call the predict method.
        First, interface similar to scikit-learn where the feature vectors
        given as projected DataFrame.
        Second, give the DataFrame and explicitly specify the feature vectors
        (by specifying the attributes to be excluded) .

        A point to note is all the input parameters have a default value of
        None. This is done to support both the interfaces in a single function.

        Currently, the Dask implementation supports only the cases when the table is not 
        None and the flags inplace, append are False. 


        Args:
            x (DataFrame): The input pandas DataFrame containing only feature
                vectors (defaults to None).
            table (DataFrame): The input pandas DataFrame containing feature
                vectors, and may be other attributes (defaults to None).
            exclude_attrs (list): A list of attributes to be excluded from the
                input table to get the feature vectors (defaults to None).
            target_attr (string): The attribute name where the predictions
                need to be stored in the input table (defaults to None).
            probs_attr (string): The attribute name where the prediction probabilities 
                need to be stored in the input table (defaults to None).
            append (boolean): A flag to indicate whether the predictions need
                to be appended in the input DataFrame (defaults to False).
            return_probs (boolean): A flag to indicate where the prediction probabilities
                need to be returned (defaults to False). If set to True, returns the 
                probability if the pair was a match.
            inplace (boolean): A flag to indicate whether the append needs to be
                done inplace (defaults to True).
            show_progress (boolean): A flag to indicate whether the progress of
                extracting feature vectors must be displayed (defaults to True).
            n_chunks (int): The number of partitions to split the candidate set. If it 
                is set to -1, the number of partitions will be set to the 
                number of cores in the machine.  


        Returns:
            An array of predictions or a DataFrame with predictions updated.

        """

        logger.warning(
            "WARNING THIS COMMAND IS EXPERIMENTAL AND NOT TESTED. USE AT YOUR OWN RISK.")

        if x is not None:
            return self._predict(x, table, exclude_attrs, target_attr, append,
                                 return_probs, probs_attr, inplace)
        else:
            n_chunks = get_num_partitions(n_chunks, len(table))
            if n_chunks == 1 or inplace == True or append == False:
                # When the inplace flag is True, the predictions (and probs) are added
                # in place. If he have to use Dask then we have to modify _predict (
                # specifically _predict_sk_learn) function.
                # So, to keep things simple, we support Dask only when
                # inplace=False

                # Similarly, when append=False, the return value from _predict will be
                # different for different cases (for example, when return_probs is True
                #  or False). If we have to use Dask then we have to careful in
                # recording the return values for each chunk.
                # So, to keep things simple, we support Dask only when
                # append=True


                result = self._predict(table=table, exclude_attrs=exclude_attrs,
                                     target_attr=target_attr, append=append,
                                     return_probs=return_probs, probs_attr=probs_attr,
                                     inplace=inplace, copy_props=True)

            else:
                predicted_results = []
                splitted_tables = np.array_split(table, n_chunks)
                for i in range(len(splitted_tables)):
                    partial_result = delayed(self._predict)(table=splitted_tables[i],
                                                            exclude_attrs=exclude_attrs, target_attr=target_attr,
                                                            append=append,
                                                            return_probs=return_probs,
                                                            probs_attr=probs_attr,
                                                            inplace=inplace,
                                                            copy_props=False)
                    predicted_results.append(partial_result)
                predicted_results = delayed(wrap)(predicted_results)
                if show_progress:
                    with ProgressBar():
                        predicted_results = predicted_results.compute(
                            scheduler="processes", num_workers=get_num_cores())
                else:
                     predicted_results = predicted_results.compute(
                        scheduler="processes", num_workers=get_num_cores())


                result = pd.concat(predicted_results)
                cm.copy_properties(table, result)
                return result
    def block_tables(self, ltable, rtable, l_block_attr, r_block_attr,
                     l_output_attrs=None, r_output_attrs=None,
                     l_output_prefix='ltable_', r_output_prefix='rtable_',
                     allow_missing=False, verbose=False, n_ltable_chunks=1,
                     n_rtable_chunks=1):
        """
        WARNING THIS COMMAND IS EXPERIMENTAL AND NOT TESTED. USE AT YOUR OWN RISK

        Blocks two tables based on attribute equivalence.
        Conceptually, this will check `l_block_attr=r_block_attr` for each tuple
        pair from the Cartesian product of tables `ltable` and `rtable`. It outputs a
        Pandas dataframe object with tuple pairs that satisfy the equality condition.
        The dataframe will include attributes '_id', key attribute from
        ltable, key attributes from rtable, followed by lists `l_output_attrs` and
        `r_output_attrs` if they are specified. Each of these output and key attributes will be
        prefixed with given `l_output_prefix` and `r_output_prefix`. If `allow_missing` is set
        to `True` then all tuple pairs with missing value in at least one of the tuples will be
        included in the output dataframe.
        Further, this will update the following metadata in the catalog for the output table:
        (1) key, (2) ltable, (3) rtable, (4) fk_ltable, and (5) fk_rtable.
      
        Args:
            ltable (DataFrame): The left input table.
            rtable (DataFrame): The right input table.
            l_block_attr (string): The blocking attribute in left table.
            r_block_attr (string): The blocking attribute in right table.
            l_output_attrs (list): A list of attribute names from the left
                                   table to be included in the
                                   output candidate set (defaults to None).
            r_output_attrs (list): A list of attribute names from the right
                                   table to be included in the
                                   output candidate set (defaults to None).
            l_output_prefix (string): The prefix to be used for the attribute names
                                   coming from the left table in the output
                                   candidate set (defaults to 'ltable\_').
            r_output_prefix (string): The prefix to be used for the attribute names
                                   coming from the right table in the output
                                   candidate set (defaults to 'rtable\_').
            allow_missing (boolean): A flag to indicate whether tuple pairs
                                     with missing value in at least one of the
                                     blocking attributes should be included in
                                     the output candidate set (defaults to
                                     False). If this flag is set to True, a
                                     tuple in ltable with missing value in the
                                     blocking attribute will be matched with
                                     every tuple in rtable and vice versa.
            verbose (boolean): A flag to indicate whether the debug information
                              should be logged (defaults to False).
            
            n_ltable_chunks (int): The number of partitions to split the left table (
                                    defaults to 1). If it is set to -1, then the number of 
                                    partitions is set to the number of cores in the 
                                    machine.                                      
            n_rtable_chunks (int): The number of partitions to split the right table (
                                    defaults to 1). If it is set to -1, then the number of 
                                    partitions is set to the number of cores in the 
                                    machine.            
        Returns:
            A candidate set of tuple pairs that survived blocking (DataFrame).
            
        Raises:
            AssertionError: If `ltable` is not of type pandas
                DataFrame.
            AssertionError: If `rtable` is not of type pandas
                DataFrame.
            AssertionError: If `l_block_attr` is not of type string.
            AssertionError: If `r_block_attr` is not of type string.
            AssertionError: If `l_output_attrs` is not of type of
                list.
            AssertionError: If `r_output_attrs` is not of type of
                list.
            AssertionError: If the values in `l_output_attrs` is not of type
                string.
            AssertionError: If the values in `r_output_attrs` is not of type
                string.
            AssertionError: If `l_output_prefix` is not of type
                string.
            AssertionError: If `r_output_prefix` is not of type
                string.
            AssertionError: If `verbose` is not of type
                boolean.
            AssertionError: If `allow_missing` is not of type boolean.
            AssertionError: If `n_ltable_chunks` is not of type
                int.
            AssertionError: If `n_rtable_chunks` is not of type
                int.
            AssertionError: If `l_block_attr` is not in the ltable columns.
            AssertionError: If `r_block_attr` is not in the rtable columns.
            AssertionError: If `l_out_attrs` are not in the ltable.
            AssertionError: If `r_out_attrs` are not in the rtable.
       
        Examples:
            >>> import py_entitymatching as em
            >>> from py_entitymatching.dask.dask_attr_equiv_blocker import DaskAttrEquivalenceBlocker            
            >>> ab = DaskAttrEquivalenceBlocker()
            >>> A = em.read_csv_metadata('path_to_csv_dir/table_A.csv', key='ID')
            >>> B = em.read_csv_metadata('path_to_csv_dir/table_B.csv', key='ID')
            >>> C1 = ab.block_tables(A, B, 'zipcode', 'zipcode', l_output_attrs=['name'], r_output_attrs=['name'])
            # Include all possible tuple pairs with missing values
            >>> C2 = ab.block_tables(A, B, 'zipcode', 'zipcode', l_output_attrs=['name'], r_output_attrs=['name'], allow_missing=True)
        """

        logger.warning("WARNING THIS BLOCKER IS EXPERIMENTAL AND NOT TESTED. USE AT YOUR "
                    "OWN RISK.")


        # validate data types of input parameters
        self.validate_types_params_tables(ltable, rtable,
                                          l_output_attrs, r_output_attrs,
                                          l_output_prefix,
                                          r_output_prefix, verbose, 1) # last arg is
                                         # set to 1 just to reuse the function from the
                                         # old blocker.

        # validate data types of input blocking attributes
        self.validate_types_block_attrs(l_block_attr, r_block_attr)

        # validate data type of allow_missing
        self.validate_allow_missing(allow_missing)

        # validate input parameters
        self.validate_block_attrs(ltable, rtable, l_block_attr, r_block_attr)
        self.validate_output_attrs(ltable, rtable, l_output_attrs,
                                   r_output_attrs)

        # validate number of ltable and rtable chunks
        validate_object_type(n_ltable_chunks, int, 'Parameter n_ltable_chunks')
        validate_object_type(n_rtable_chunks, int, 'Parameter n_rtable_chunks')

        validate_chunks(n_ltable_chunks)
        validate_chunks(n_rtable_chunks)

        # get and validate required metadata
        log_info(logger, 'Required metadata: ltable key, rtable key', verbose)

        # # get metadata
        l_key, r_key = cm.get_keys_for_ltable_rtable(ltable, rtable, logger,
                                                     verbose)

        # # validate metadata
        cm._validate_metadata_for_table(ltable, l_key, 'ltable', logger,
                                        verbose)
        cm._validate_metadata_for_table(rtable, r_key, 'rtable', logger,
                                        verbose)

        # do blocking

        # # do projection of required attributes from the tables
        l_proj_attrs = self.get_attrs_to_project(l_key, l_block_attr,
                                                 l_output_attrs)
        ltable_proj = ltable[l_proj_attrs]
        r_proj_attrs = self.get_attrs_to_project(r_key, r_block_attr,
                                                 r_output_attrs)
        rtable_proj = rtable[r_proj_attrs]

        # # remove records with nans in the blocking attribute
        l_df = rem_nan(ltable_proj, l_block_attr)
        r_df = rem_nan(rtable_proj, r_block_attr)

        # # determine the number of chunks
        n_ltable_chunks = get_num_partitions(n_ltable_chunks, len(ltable))
        n_rtable_chunks = get_num_partitions(n_rtable_chunks, len(rtable))

        if n_ltable_chunks == 1 and n_rtable_chunks == 1:
            # single process
            candset = _block_tables_split(l_df, r_df, l_key, r_key,
                                          l_block_attr, r_block_attr,
                                          l_output_attrs, r_output_attrs,
                                          l_output_prefix, r_output_prefix,
                                          allow_missing)
        else:
            l_splits = np.array_split(l_df, n_ltable_chunks)
            r_splits = np.array_split(r_df, n_rtable_chunks)
            c_splits = []

            for l in l_splits:
                for r in r_splits:
                    partial_result = delayed(_block_tables_split)(l, r, l_key, r_key,
                                             l_block_attr, r_block_attr,
                                             l_output_attrs, r_output_attrs,
                                             l_output_prefix, r_output_prefix,
                                             allow_missing)
                    c_splits.append(partial_result)
            c_splits = delayed(wrap)(c_splits)
            c_splits = c_splits.compute(scheduler="processes", n_jobs=get_num_cores())
            candset = pd.concat(c_splits, ignore_index=True)

        # if allow_missing flag is True, then compute
        # all pairs with missing value in left table, and
        # all pairs with missing value in right table
        if allow_missing:
            missing_pairs = self.get_pairs_with_missing_value(ltable_proj,
                                                              rtable_proj,
                                                              l_key, r_key,
                                                              l_block_attr,
                                                              r_block_attr,
                                                              l_output_attrs,
                                                              r_output_attrs,
                                                              l_output_prefix,
                                                              r_output_prefix)
            candset = pd.concat([candset, missing_pairs], ignore_index=True)

        # update catalog
        key = get_name_for_key(candset.columns)
        candset = add_key_column(candset, key)
        cm.set_candset_properties(candset, key, l_output_prefix + l_key,
                                  r_output_prefix + r_key, ltable, rtable)

        # return candidate set
        return candset
    def block_tables(self, ltable, rtable, l_output_attrs=None,
                     r_output_attrs=None,
                     l_output_prefix='ltable_', r_output_prefix='rtable_',
                     verbose=False, show_progress=True, n_ltable_chunks=1,
                     n_rtable_chunks=1):
        """
        WARNING THIS COMMAND IS EXPERIMENTAL AND NOT TESTED. USE AT YOUR OWN RISK

        Blocks two tables based on the sequence of rules supplied by the user.
        Finds tuple pairs from left and right tables that survive the sequence
        of blocking rules. A tuple pair survives the sequence of blocking rules
        if none of the rules in the sequence returns True for that pair. If any
        of the rules returns True, then the pair is blocked.
        
        Args:
            
            ltable (DataFrame): The left input table.
            
            rtable (DataFrame): The right input table.
            
            l_output_attrs (list): A list of attribute names from the left
                                   table to be included in the
                                   output candidate set (defaults to None).
            
            r_output_attrs (list): A list of attribute names from the right
                                   table to be included in the
                                   output candidate set (defaults to None).
            
            l_output_prefix (string): The prefix to be used for the attribute names
                                   coming from the left table in the output
                                   candidate set (defaults to 'ltable\_').
            
            r_output_prefix (string): The prefix to be used for the attribute names
                                   coming from the right table in the output
                                   candidate set (defaults to 'rtable\_').
            
            verbose (boolean): A flag to indicate whether the debug
                information  should be logged (defaults to False).
                
            show_progress (boolean): A flag to indicate whether progress should
                                     be displayed to the user (defaults to True).
                                     
            n_ltable_chunks (int): The number of partitions to split the left table (
                                    defaults to 1). If it is set to -1, then the number of 
                                    partitions is set to the number of cores in the 
                                    machine.                                      
            n_rtable_chunks (int): The number of partitions to split the right table (
                                    defaults to 1). If it is set to -1, then the number of 
                                    partitions is set to the number of cores in the 
                                    machine.            
                                     

        Returns:
            
            A candidate set of tuple pairs that survived the sequence of
            blocking rules (DataFrame).

        Raises:
            
            AssertionError: If `ltable` is not of type pandas
                DataFrame.
            
            AssertionError: If `rtable` is not of type pandas
                DataFrame.
            AssertionError: If `l_output_attrs` is not of type of
                list.
            AssertionError: If `r_output_attrs` is not of type of
                list.
            AssertionError: If the values in `l_output_attrs` is not of type
                string.
            AssertionError: If the values in `r_output_attrs` is not of type
                string.
            AssertionError: If the input `l_output_prefix` is not of type
                string.
            AssertionError: If the input `r_output_prefix` is not of type
                string.
            AssertionError: If `verbose` is not of type
                boolean.
            AssertionError: If `show_progress` is not of type
                boolean.
            AssertionError: If `n_ltable_chunks` is not of type
                int.
            AssertionError: If `n_rtable_chunks` is not of type
                int.
            AssertionError: If `l_out_attrs` are not in the ltable.
            AssertionError: If `r_out_attrs` are not in the rtable.
            AssertionError: If there are no rules to apply.
        Examples:
                >>> import py_entitymatching as em
                >>> from py_entitymatching.dask.dask_rule_based_blocker import DaskRuleBasedBlocker
                >>> rb = DaskRuleBasedBlocker()
                >>> A = em.read_csv_metadata('path_to_csv_dir/table_A.csv', key='id')
                >>> B = em.read_csv_metadata('path_to_csv_dir/table_B.csv', key='id')
                >>> block_f = em.get_features_for_blocking(A, B)
                >>> rule = ['name_name_lev(ltuple, rtuple) > 3']
                >>> rb.add_rule(rule, feature_table=block_f)
                >>> C = rb.block_tables(A, B)
        """

        logger.warning(
            "WARNING THIS COMMAND IS EXPERIMENTAL AND NOT TESTED. USE AT YOUR OWN RISK.")

        # validate data types of input parameters
        self.validate_types_params_tables(ltable, rtable,
                                          l_output_attrs, r_output_attrs,
                                          l_output_prefix,
                                          r_output_prefix, verbose, 1)

        # validate data type of show_progress
        self.validate_show_progress(show_progress)

        # validate input parameters
        self.validate_output_attrs(ltable, rtable, l_output_attrs,
                                   r_output_attrs)

        # get and validate metadata
        log_info(logger, 'Required metadata: ltable key, rtable key', verbose)

        # # get metadata
        l_key, r_key = cm.get_keys_for_ltable_rtable(ltable, rtable, logger,
                                                     verbose)

        # # validate metadata
        cm._validate_metadata_for_table(ltable, l_key, 'ltable', logger,
                                        verbose)
        cm._validate_metadata_for_table(rtable, r_key, 'rtable', logger,
                                        verbose)

        # validate rules
        assert len(self.rules.keys()) > 0, 'There are no rules to apply'

        # validate number of ltable and rtable chunks
        validate_object_type(n_ltable_chunks, int, 'Parameter n_ltable_chunks')
        validate_object_type(n_rtable_chunks, int, 'Parameter n_rtable_chunks')

        validate_chunks(n_ltable_chunks)
        validate_chunks(n_rtable_chunks)

        # # determine the number of chunks
        n_ltable_chunks = get_num_partitions(n_ltable_chunks, len(ltable))
        n_rtable_chunks = get_num_partitions(n_rtable_chunks, len(rtable))

        # # set index for convenience
        l_df = ltable.set_index(l_key, drop=False)
        r_df = rtable.set_index(r_key, drop=False)

        # # remove l_key from l_output_attrs and r_key from r_output_attrs
        l_output_attrs_1 = []
        if l_output_attrs:
            l_output_attrs_1 = [x for x in l_output_attrs if x != l_key]
        r_output_attrs_1 = []
        if r_output_attrs:
            r_output_attrs_1 = [x for x in r_output_attrs if x != r_key]

        # # get attributes to project
        l_proj_attrs, r_proj_attrs = self.get_attrs_to_project(l_key, r_key,
                                                               l_output_attrs_1,
                                                               r_output_attrs_1)
        l_df, r_df = l_df[l_proj_attrs], r_df[r_proj_attrs]

        candset, rule_applied = self.block_tables_with_filters(l_df, r_df,
                                                               l_key, r_key,
                                                               l_output_attrs_1,
                                                               r_output_attrs_1,
                                                               l_output_prefix,
                                                               r_output_prefix,
                                                               verbose,
                                                               show_progress,
                                                               get_num_cores())
                                                               # pass number of splits as
        #  the number of cores in the machine

        if candset is None:
            # no filterable rule was applied
            candset = self.block_tables_without_filters(l_df, r_df, l_key,
                                                        r_key, l_output_attrs_1,
                                                        r_output_attrs_1,
                                                        l_output_prefix,
                                                        r_output_prefix,
                                                        verbose, show_progress,
                                                        n_ltable_chunks, n_rtable_chunks)
        elif len(self.rules) > 1:
            # one filterable rule was applied but other rules are left
            # block candset by applying other rules and excluding the applied rule
            candset = self.block_candset_excluding_rule(candset, l_df, r_df,
                                                        l_key, r_key,
                                                        l_output_prefix + l_key,
                                                        r_output_prefix + r_key,
                                                        rule_applied,
                                                        show_progress, get_num_cores())

        retain_cols = self.get_attrs_to_retain(l_key, r_key, l_output_attrs_1,
                                               r_output_attrs_1,
                                               l_output_prefix, r_output_prefix)
        if len(candset) > 0:
            candset = candset[retain_cols]
        else:
            candset = pd.DataFrame(columns=retain_cols)

        # update catalog
        key = get_name_for_key(candset.columns)
        candset = add_key_column(candset, key)
        cm.set_candset_properties(candset, key, l_output_prefix + l_key,
                                  r_output_prefix + r_key, ltable, rtable)

        # return candidate set
        return candset