def block_tables(self, ltable, rtable, l_block_attr, r_block_attr,
                     l_output_attrs=None, r_output_attrs=None,
                     l_output_prefix='ltable_', r_output_prefix='rtable_',
                     verbose=True):
        """Build a candidate set by joining two tables on their blocking attributes.

        The tables are joined with ``pd.merge`` so that only pairs whose
        ``l_block_attr`` / ``r_block_attr`` values are equal are kept.

        Args:
            ltable: Left input table.
            rtable: Right input table.
            l_block_attr: Blocking column in ``ltable``.
            r_block_attr: Blocking column in ``rtable``.
            l_output_attrs: Optional left columns to carry into the output.
            r_output_attrs: Optional right columns to carry into the output.
            l_output_prefix: Prefix for left-table columns in the output.
            r_output_prefix: Prefix for right-table columns in the output.
            verbose: Forwarded to the logging / catalog helpers.

        Returns:
            The candidate set, with a key column added and its properties
            registered through ``cg.set_candset_properties``.
        """
        # Argument checks come first, before any metadata is touched.
        self.validate_block_attrs(ltable, rtable, l_block_attr, r_block_attr)
        self.validate_output_attrs(ltable, rtable, l_output_attrs, r_output_attrs)

        # The only metadata needed is the key of each input table.
        helper.log_info(logger, 'Required metadata: ltable key, rtable key', verbose)
        table_keys = cg.get_keys_for_ltable_rtable(ltable, rtable, logger, verbose)
        l_key, r_key = table_keys

        # Validate the left table first, then the right one.
        for table, table_key, side in ((ltable, l_key, 'left'), (rtable, r_key, 'right')):
            cg.validate_metadata_for_table(table, table_key, side, logger, verbose)

        # Filter each table through rem_nan on its blocking attribute
        # (placeholder until a missing-value policy is decided).
        left_rows = self.rem_nan(ltable, l_block_attr)
        right_rows = self.rem_nan(rtable, r_block_attr)

        # Blocking step: equi-join on the blocking attributes. Clashing column
        # names are disambiguated with the '_ltable' / '_rtable' suffixes.
        joined = pd.merge(left_rows, right_rows,
                          left_on=l_block_attr, right_on=r_block_attr,
                          suffixes=('_ltable', '_rtable'))

        # Work out which joined columns to keep and what to call them.
        joined_col_names = list(joined.columns)
        retain_cols, final_cols = self.output_columns(l_key, r_key, joined_col_names,
                                                      l_output_attrs, r_output_attrs,
                                                      l_output_prefix, r_output_prefix)

        candset = joined[retain_cols]
        candset.columns = final_cols

        # Give the candidate set its own key and register it in the catalog.
        key = helper.get_name_for_key(candset.columns)
        candset = helper.add_key_column(candset, key)
        fk_ltable = l_output_prefix + l_key
        fk_rtable = r_output_prefix + r_key
        cg.set_candset_properties(candset, key, fk_ltable, fk_rtable, ltable, rtable)

        return candset
    # Example #2
    # 0
    def block_tables(self, ltable, rtable, l_overlap_attr, r_overlap_attr,
                     rem_stop_words=False, q_val=None, word_level=True, overlap_size=1,
                     l_output_attrs=None, r_output_attrs=None,
                     l_output_prefix='ltable_', r_output_prefix='rtable_',
                     verbose=True, show_progress=True):
        """Build a candidate set from tuple pairs whose overlap attributes share tokens.

        An inverted index is built over the processed values of
        ``l_overlap_attr``; each right-table row is then probed against it and
        paired with the left rows returned by
        ``self.get_potential_match_indices``.

        Args:
            ltable: Left input table.
            rtable: Right input table.
            l_overlap_attr: Overlap column in ``ltable``.
            r_overlap_attr: Overlap column in ``rtable``.
            rem_stop_words: Forwarded to ``self.process_table``.
            q_val: Forwarded to ``self.process_table``; must be left as
                ``None`` while ``word_level`` is set.
            word_level: Cannot be combined with ``q_val`` (see Raises).
            overlap_size: Forwarded to ``self.get_potential_match_indices``.
            l_output_attrs: Optional left columns to carry into the output.
            r_output_attrs: Optional right columns to carry into the output.
            l_output_prefix: Prefix for left-table columns in the output.
            r_output_prefix: Prefix for right-table columns in the output.
            verbose: Forwarded to the logging / catalog helpers.
            show_progress: When true, display a ``pyprind`` progress bar.

        Returns:
            The candidate set, with a key column added and its properties
            registered through ``cg.set_candset_properties``. If no pair
            survives, an empty frame with the expected columns is returned.

        Raises:
            SyntaxError: If both ``word_level`` and ``q_val`` are set.
        """
        # validations
        self.validate_overlap_attrs(ltable, rtable, l_overlap_attr, r_overlap_attr)
        self.validate_output_attrs(ltable, rtable, l_output_attrs, r_output_attrs)

        # required metadata; keys from ltable and rtable
        helper.log_info(logger, 'Required metadata: ltable key, rtable key', verbose)

        # get metadata
        l_key, r_key = cg.get_keys_for_ltable_rtable(ltable, rtable, logger, verbose)

        # do blocking
        if word_level and q_val is not None:
            raise SyntaxError('Parameters word_level and q_val cannot be set together; Note that word_level is '
                              'set to True by default, so explicity set word_level=false to use qgram with the '
                              'specified q_val')

        # filter each table through rem_nan on its overlap attribute
        l_df = self.rem_nan(ltable, l_overlap_attr)
        r_df = self.rem_nan(rtable, r_overlap_attr)

        # reset the indexes so that labels coincide with row positions
        l_df.reset_index(inplace=True, drop=True)
        r_df.reset_index(inplace=True, drop=True)

        # constant column used as the join key when pairing rows below
        # TODO: should be a name that cannot occur among the column names
        l_df['_dummy_'] = 1
        r_df['_dummy_'] = 1

        # cast the overlap columns to string if required
        if l_df.dtypes[l_overlap_attr] != object:
            logger.warning('Left overlap attribute is not of type string; coverting to string temporarily')
            l_df[l_overlap_attr] = l_df[l_overlap_attr].astype(str)

        if r_df.dtypes[r_overlap_attr] != object:
            logger.warning('Right overlap attribute is not of type string; coverting to string temporarily')
            r_df[r_overlap_attr] = r_df[r_overlap_attr].astype(str)

        # lookup table for quick access to right-table rows by position
        r_dict = {idx: row for idx, row in r_df.iterrows()}

        # build the inverted index over the processed left-table values
        l_colvalues_chopped = self.process_table(l_df, l_overlap_attr, q_val, rem_stop_words)
        appended_l_colidx_values = [self.append_index_values(values, idx)
                                    for idx, values in enumerate(l_colvalues_chopped)]

        inv_idx = {}
        for indexed_values in appended_l_colidx_values:
            for token in indexed_values:
                self.compute_inv_index(token, inv_idx)

        r_colvalues_chopped = self.process_table(r_df, r_overlap_attr, q_val, rem_stop_words)

        if show_progress:
            bar = pyprind.ProgBar(len(r_colvalues_chopped))

        df_list = []
        # enumerate() supplies the position of the current right-table row;
        # previously the position was never advanced, so every probe was
        # paired with row 0 of the right table.
        for r_idx, col_values in enumerate(r_colvalues_chopped):
            if show_progress:
                bar.update()

            qualifying_ltable_indices = self.get_potential_match_indices(col_values, inv_idx, overlap_size)
            r_row_frame = r_dict[r_idx].to_frame().T

            # pair the single right row with every qualifying left row
            l_rows = l_df.iloc[qualifying_ltable_indices]
            pairs = l_rows.merge(r_row_frame, on='_dummy_', suffixes=('_ltable', '_rtable'))

            if len(pairs) > 0:
                df_list.append(pairs)

        # Construct the output table. pd.concat raises ValueError on an empty
        # list, so fall back to an empty frame when nothing survived.
        candset = pd.concat(df_list) if df_list else pd.DataFrame()
        l_output_attrs = self.process_output_attrs(ltable, l_key, l_output_attrs, 'left')
        r_output_attrs = self.process_output_attrs(rtable, r_key, r_output_attrs, 'right')

        retain_cols = self.get_attrs_to_retain(l_key, r_key, l_output_attrs, r_output_attrs,
                                               l_output_prefix, r_output_prefix)

        if len(candset) > 0:
            candset = candset[retain_cols]
        else:
            candset = pd.DataFrame(columns=retain_cols)

        # Update metadata in the catalog
        key = helper.get_name_for_key(candset.columns)
        candset = helper.add_key_column(candset, key)
        cg.set_candset_properties(candset, key, l_output_prefix+l_key, r_output_prefix+r_key, ltable, rtable)

        # return the candidate set
        return candset
    def block_tables(self, ltable, rtable, l_output_attrs=None, r_output_attrs=None,
                     l_output_prefix='ltable_', r_output_prefix='rtable_',
                     verbose=True, show_progress=True):
        """Build a candidate set by applying the black box function to every tuple pair.

        ``self.black_box_function(l_tuple, r_tuple)`` is called for each pair
        in the cross product of the two tables. A truthy result means the pair
        is blocked (dropped); every other pair is kept.

        Args:
            ltable: Left input table.
            rtable: Right input table.
            l_output_attrs: Optional left columns to carry into the output.
            r_output_attrs: Optional right columns to carry into the output.
            l_output_prefix: Prefix for left-table columns in the output.
            r_output_prefix: Prefix for right-table columns in the output.
            verbose: Forwarded to the logging / catalog helpers.
            show_progress: When true, display a ``pyprind`` progress bar.

        Returns:
            The candidate set, with a key column added and its properties
            registered through ``cg.set_candset_properties``. If no pair
            survives, an empty frame with the expected columns is returned.

        Raises:
            AssertionError: If ``self.black_box_function`` is ``None``.
        """
        # validate the presence of black box function
        if self.black_box_function is None:
            raise AssertionError('Black box function is not set')

        # validate output attributes
        self.validate_output_attrs(ltable, rtable, l_output_attrs, r_output_attrs)

        # required metadata: keys for the input tables
        helper.log_info(logger, 'Required metadata: ltable key, rtable key', verbose)

        # get metadata
        l_key, r_key = cg.get_keys_for_ltable_rtable(ltable, rtable, logger, verbose)

        # validate metadata.
        cg.validate_metadata_for_table(ltable, l_key, 'left', logger, verbose)
        cg.validate_metadata_for_table(rtable, r_key, 'right', logger, verbose)

        # do blocking
        if show_progress:
            bar = pyprind.ProgBar(len(ltable)*len(rtable))

        # keep track of the pairs that survive blocking
        block_list = []

        # index the tables by key (the key column itself is kept)
        l_df = ltable.set_index(l_key, drop=False)
        r_df = rtable.set_index(r_key, drop=False)

        # key -> row lookup tables for faster processing
        l_dict = {k: row for k, row in l_df.iterrows()}
        r_dict = {k: row for k, row in r_df.iterrows()}

        # position of the key attribute in each table
        l_id_pos = list(ltable.columns).index(l_key)
        r_id_pos = list(rtable.columns).index(r_key)

        # The output id column names and the right-table rows do not change
        # between iterations, so compute them once.
        ltable_id = l_output_prefix + l_key
        rtable_id = r_output_prefix + r_key
        r_rows = [r_dict[r_t[r_id_pos]] for r_t in rtable.itertuples(index=False)]

        # iterate through the tuple pairs and apply the black box function
        for l_t in ltable.itertuples(index=False):
            l_tuple = l_dict[l_t[l_id_pos]]

            for r_tuple in r_rows:
                if show_progress:
                    bar.update()

                # A truthy result blocks the pair. Truthiness rather than
                # `is True` is used so that results such as numpy.bool_ are
                # honoured as well.
                if self.black_box_function(l_tuple, r_tuple):
                    continue

                d = OrderedDict()

                # add ltable and rtable ids
                d[ltable_id] = l_tuple[l_key]
                d[rtable_id] = r_tuple[r_key]

                # add left table attributes
                if l_output_attrs:
                    l_out = l_tuple[l_output_attrs]
                    l_out.index = l_output_prefix + l_out.index
                    d.update(l_out)

                # add right table attributes (index the row; calling it
                # raised a TypeError)
                if r_output_attrs:
                    r_out = r_tuple[r_output_attrs]
                    r_out.index = r_output_prefix + r_out.index
                    d.update(r_out)

                block_list.append(d)

        # Construct the output table
        candset = pd.DataFrame(block_list)
        l_output_attrs = self.process_output_attrs(ltable, l_key, l_output_attrs, 'left')
        r_output_attrs = self.process_output_attrs(rtable, r_key, r_output_attrs, 'right')

        retain_cols = self.get_attrs_to_retain(l_key, r_key, l_output_attrs, r_output_attrs,
                                               l_output_prefix, r_output_prefix)
        if len(candset) > 0:
            candset = candset[retain_cols]
        else:
            candset = pd.DataFrame(columns=retain_cols)

        # update metadata in the catalog.
        key = helper.get_name_for_key(candset.columns)
        candset = helper.add_key_column(candset, key)
        cg.set_candset_properties(candset, key, l_output_prefix+l_key, r_output_prefix+r_key, ltable, rtable)

        # return the candidate set
        return candset