Ejemplo n.º 1
0
def index_candidate_set(candidate_set, lrecord_id_to_index_map,
                        rrecord_id_to_index_map, verbose):
    """
    Re-express a candidate set as a mapping between record indexes.

    Args:
        candidate_set (DataFrame): Candidate set whose foreign-key column
            names are obtained through `cm.get_metadata_for_candset`.
        lrecord_id_to_index_map (dict): Maps a left record id to its index.
        rrecord_id_to_index_map (dict): Maps a right record id to its index.
        verbose (boolean): Forwarded to `cm.get_metadata_for_candset`.

    Returns:
        A dict mapping each left index to the set of right indexes it is
        paired with. Pairs whose left or right id is missing from the
        corresponding map are skipped. An empty candidate set yields an
        empty dict without the metadata being looked up.
    """
    if len(candidate_set) == 0:
        return {}

    # # get metadata (only the two foreign-key column names are used below)
    _, fk_ltable, fk_rtable, _, _, _, _ = \
        cm.get_metadata_for_candset(candidate_set, logger, verbose)

    left_ids = list(candidate_set[fk_ltable])
    right_ids = list(candidate_set[fk_rtable])

    indexed_pairs = {}
    for left_id, right_id in zip(left_ids, right_ids):
        if left_id not in lrecord_id_to_index_map:
            continue
        if right_id not in rrecord_id_to_index_map:
            continue
        left_pos = lrecord_id_to_index_map[left_id]
        right_pos = rrecord_id_to_index_map[right_id]
        indexed_pairs.setdefault(left_pos, set()).add(right_pos)

    return indexed_pairs
Ejemplo n.º 2
0
    def _predict_candset(self, candset, verbose=False):
        """
        Apply this object's rules to every pair of a candidate set.

        Args:
            candset (DataFrame): Candidate set whose metadata is obtained
                through `cm.get_metadata_for_candset`.
            verbose (boolean): Forwarded to the metadata helpers.

        Returns:
            A list with one entry per candset row, in row order: 1 when
            `self.apply_rules` returned exactly True for the pair, else 0.
        """
        # # get metadata
        key, fk_ltable, fk_rtable, ltable, rtable, l_key, r_key = cm.get_metadata_for_candset(
            candset, logger, verbose)

        # # validate metadata
        cm.validate_metadata_for_candset(candset, key, fk_ltable, fk_rtable,
                                         ltable, rtable, l_key, r_key, logger,
                                         verbose)

        # # index the base tables by their keys so that a foreign-key value
        # # from the cand. set can be used directly as a row label
        l_df = ltable.set_index(l_key, drop=False)
        r_df = rtable.set_index(r_key, drop=False)

        # # get the positions of fk_ltable and fk_rtable in the cand. set
        col_names = list(candset.columns)
        lid_idx = col_names.index(fk_ltable)
        rid_idx = col_names.index(fk_rtable)

        # # iterate through the cand. set and keep track of predictions
        predictions = []
        for row in candset.itertuples(index=False):
            # label-based lookup; the old `.ix` indexer no longer exists in
            # current pandas releases
            l_row = l_df.loc[row[lid_idx]]
            r_row = r_df.loc[row[rid_idx]]
            res = self.apply_rules(l_row, r_row)
            # identity test kept as-is: only the literal True counts as 1
            predictions.append(1 if res is True else 0)

        return predictions
Ejemplo n.º 3
0
def add_output_attributes(candset, l_output_attrs=None, r_output_attrs=None, l_output_prefix='ltable_',
                          r_output_prefix='rtable_', validate=True, copy_props=True,
                          delete_from_catalog=True, verbose=False):
    """
    Return a version of the candidate set extended with output attributes.

    The work is delegated to `_add_output_attributes`; this wrapper fetches
    the metadata, restores the original index on the result and takes care
    of the catalog properties.

    Args:
        candset (DataFrame): Input candidate set.
        l_output_attrs: Passed through to `_add_output_attributes`.
        r_output_attrs: Passed through to `_add_output_attributes`.
        l_output_prefix (string): Passed through to `_add_output_attributes`.
        r_output_prefix (string): Passed through to `_add_output_attributes`.
        validate (boolean): Whether the candset metadata is validated first.
        copy_props (boolean): Whether the candset's properties are copied to
            the result.
        delete_from_catalog (boolean): Whether the candset's properties are
            deleted afterwards; only honoured when `copy_props` is True.
        verbose (boolean): Forwarded to the metadata helpers.

    Returns:
        The DataFrame produced by `_add_output_attributes`, indexed like the
        input candset.

    Raises:
        AssertionError: If `candset` is not a pandas DataFrame.
    """
    if not isinstance(candset, pd.DataFrame):
        logger.error('Input object is not of type pandas data frame')
        raise AssertionError('Input object is not of type pandas data frame')

    # # get metadata
    metadata = cm.get_metadata_for_candset(candset, logger, verbose)
    key, fk_ltable, fk_rtable, ltable, rtable, l_key, r_key = metadata
    if validate:
        cm.validate_metadata_for_candset(candset, key, fk_ltable, fk_rtable, ltable, rtable, l_key, r_key,
                                         logger, verbose)

    # remember the index so that it can be put back on the result
    original_index = candset.index

    result = _add_output_attributes(candset, fk_ltable, fk_rtable, ltable, rtable, l_key, r_key,
                                    l_output_attrs, r_output_attrs, l_output_prefix, r_output_prefix,
                                    validate=False)
    result.set_index(original_index, inplace=True)

    if not copy_props:
        return result

    cm.init_properties(result)
    cm.copy_properties(candset, result)
    if delete_from_catalog:
        cm.del_all_properties(candset)
    return result
Ejemplo n.º 4
0
    def _predict_candset(self, candset, verbose=False):
        """
        Apply this object's rules to every pair of a candidate set.

        Args:
            candset (DataFrame): Candidate set whose metadata is obtained
                through `cm.get_metadata_for_candset`.
            verbose (boolean): Forwarded to the metadata helpers.

        Returns:
            A list with one entry per candset row, in row order: 1 when
            `self.apply_rules` returned exactly True for the pair, else 0.
        """
        # # get metadata
        key, fk_ltable, fk_rtable, ltable, rtable, l_key, r_key = cm.get_metadata_for_candset(candset, logger, verbose)

        # # validate metadata
        cm.validate_metadata_for_candset(candset, key, fk_ltable, fk_rtable, ltable, rtable, l_key, r_key,
                                         logger, verbose)

        # # index the base tables by their keys so that a foreign-key value
        # # from the cand. set can be used directly as a row label
        l_df = ltable.set_index(l_key, drop=False)
        r_df = rtable.set_index(r_key, drop=False)

        # # get the positions of fk_ltable and fk_rtable in the cand. set
        col_names = list(candset.columns)
        lid_idx = col_names.index(fk_ltable)
        rid_idx = col_names.index(fk_rtable)

        # # iterate through the cand. set and keep track of predictions
        predictions = []
        for row in candset.itertuples(index=False):
            # label-based lookup; the old `.ix` indexer no longer exists in
            # current pandas releases
            l_row = l_df.loc[row[lid_idx]]
            r_row = r_df.loc[row[rid_idx]]
            res = self.apply_rules(l_row, r_row)
            # identity test kept as-is: only the literal True counts as 1
            predictions.append(1 if res is True else 0)

        return predictions
Ejemplo n.º 5
0
 def test_get_metadata_for_candset_valid(self):
     """Metadata returned for a candset read with ltable/rtable is as expected."""
     # read the two base tables and the candidate set that refers to them
     left = read_csv_metadata(path_a)
     right = read_csv_metadata(path_b, key='ID')
     candset = read_csv_metadata(path_c, ltable=left, rtable=right)

     metadata = cm.get_metadata_for_candset(candset, None, False)
     key, fk_ltable, fk_rtable, ltable, rtable, l_key, r_key = metadata

     # the column / key names are checked first ...
     name_checks = [
         (key, '_id'),
         (fk_ltable, 'ltable_ID'),
         (fk_rtable, 'rtable_ID'),
         (l_key, 'ID'),
         (r_key, 'ID'),
     ]
     for actual, expected in name_checks:
         self.assertEqual(actual, expected)

     # ... then the returned base tables are compared with the ones read
     self.assertEqual(ltable.equals(left), True)
     self.assertEqual(rtable.equals(right), True)
Ejemplo n.º 6
0
def train_test_split(labeled_data,
                     train_proportion=0.5,
                     random_state=None,
                     verbose=True):
    """
    Split a labeled table into a train table and a test table.

    Args:
        labeled_data (DataFrame): Table to split; its candidate-set metadata
            is fetched and validated before splitting.
        train_proportion (float): Fraction of the rows that goes to the
            train table; must lie in [0, 1] (default value is 0.5).
        random_state: Passed through to `cv.train_test_split`.
        verbose (boolean): Forwarded to the logging and metadata helpers.

    Returns:
        An OrderedDict with the keys 'train' and 'test' (in that order)
        holding the two tables. The properties of `labeled_data` are copied
        to both of them.

    Raises:
        AssertionError: If `labeled_data` is not a pandas DataFrame.
        AssertionError: If `train_proportion` is outside [0, 1].
        AssertionError: If `labeled_data` has no rows.
    """
    if not isinstance(labeled_data, pd.DataFrame):
        logger.error('Input table is not of type dataframe')
        raise AssertionError('Input table is not of type dataframe')

    log_info(
        logger, 'Required metadata: cand.set key, fk ltable, fk rtable, '
        'ltable, rtable, ltable key, rtable key', verbose)

    # # get metadata
    key, fk_ltable, fk_rtable, ltable, rtable, l_key, r_key = cm.get_metadata_for_candset(
        labeled_data, logger, verbose)

    # # validate metadata
    cm.validate_metadata_for_candset(labeled_data, key, fk_ltable, fk_rtable,
                                     ltable, rtable, l_key, r_key, logger,
                                     verbose)

    num_rows = len(labeled_data)

    # Explicit raises instead of `assert`, so that the checks are still
    # performed when Python runs with -O.
    if not (train_proportion >= 0 and train_proportion <= 1):
        logger.error(" Train proportion is expected to be between 0 and 1")
        raise AssertionError(
            " Train proportion is expected to be between 0 and 1")
    if not num_rows > 0:
        logger.error('The input table is empty')
        raise AssertionError('The input table is empty')

    train_size = int(math.floor(num_rows * train_proportion))
    test_size = int(num_rows - train_size)

    # use sk learn to split the data
    # (a copy of the index values is split; `pd.np` is no longer available
    # in current pandas releases, so the array is copied directly)
    idx_values = labeled_data.index.values.copy()
    idx_train, idx_test = cv.train_test_split(idx_values,
                                              test_size=test_size,
                                              train_size=train_size,
                                              random_state=random_state)

    # construct output tables; the split values are index labels, hence
    # the label-based `.loc` (the old `.ix` indexer has been removed)
    lbl_train = labeled_data.loc[idx_train]
    lbl_test = labeled_data.loc[idx_test]

    # update catalog
    cm.init_properties(lbl_train)
    cm.copy_properties(labeled_data, lbl_train)

    cm.init_properties(lbl_test)
    cm.copy_properties(labeled_data, lbl_test)

    # return output tables
    result = OrderedDict()
    result['train'] = lbl_train
    result['test'] = lbl_test

    return result
Ejemplo n.º 7
0
    def execute(self, input_table, label_column, inplace=True, verbose=False):
        """
        Overwrite the label of every pair on which the rules fire.

        For each row whose label differs from `self.value_to_set`, the
        ltable and rtable tuples referenced by the row are passed to
        `self.apply_rules`; when the result equals `self.cond_status` the
        row's label is set to `self.value_to_set`.

        Args:
            input_table (DataFrame): Labeled candidate set; its metadata is
                fetched and validated through the catalog helpers.
            label_column (string): Name of the column holding the labels.
            inplace (boolean): When False a copy of `input_table` is
                modified and returned, otherwise `input_table` itself
                (default value is True).
            verbose (boolean): Forwarded to the metadata helpers.

        Returns:
            The table with the updated labels.

        Raises:
            AssertionError: If ltable or rtable is None, or if
                `label_column` is not a column of `input_table`.
        """
        # # get metadata
        key, fk_ltable, fk_rtable, ltable, rtable, l_key, r_key = cm.get_metadata_for_candset(input_table, logger,
                                                                                              verbose)

        # # validate metadata
        cm.validate_metadata_for_candset(input_table, key, fk_ltable, fk_rtable, ltable, rtable, l_key, r_key,
                                         logger, verbose)

        # Explicit raises instead of `assert`, so that the checks are still
        # performed when Python runs with -O.
        if ltable is None:
            raise AssertionError('Left table is not set')
        if rtable is None:
            raise AssertionError('Right table is not set')
        if label_column not in input_table.columns:
            raise AssertionError('Label column not in the input table')

        # equality test kept as in the original so that exactly the same
        # `inplace` values request a copy
        if inplace == False:
            table = input_table.copy()
        else:
            table = input_table

        # set the index and store it in l_tbl/r_tbl
        l_tbl = ltable.set_index(l_key, drop=False)
        r_tbl = rtable.set_index(r_key, drop=False)

        # The ids of a pair live in the foreign-key columns of the input
        # table (fk_ltable / fk_rtable); l_key / r_key name columns of the
        # base tables and must not be used to locate them here.
        column_names = list(input_table.columns)
        lid_idx = column_names.index(fk_ltable)
        rid_idx = column_names.index(fk_rtable)
        label_idx = column_names.index(label_column)

        for idx, row in enumerate(input_table.itertuples(index=False)):
            if row[label_idx] != self.value_to_set:
                # label-based lookup (the old `.ix` indexer has been removed
                # from pandas)
                l_row = l_tbl.loc[row[lid_idx]]
                r_row = r_tbl.loc[row[rid_idx]]
                res = self.apply_rules(l_row, r_row)
                if res == self.cond_status:
                    # idx is the row position, matching itertuples order
                    table.iat[idx, label_idx] = self.value_to_set
        return table
Ejemplo n.º 8
0
    def execute(self, input_table, label_column, inplace=True, verbose=False):
        """
        Overwrite the label of every pair on which the rules fire.

        For each row whose label differs from `self.value_to_set`, the
        ltable and rtable tuples referenced by the row are passed to
        `self.apply_rules`; when the result equals `self.cond_status` the
        row's label is set to `self.value_to_set`.

        Args:
            input_table (DataFrame): Labeled candidate set; its metadata is
                fetched and validated through the catalog helpers.
            label_column (string): Name of the column holding the labels.
            inplace (boolean): When False a copy of `input_table` is
                modified and returned, otherwise `input_table` itself
                (default value is True).
            verbose (boolean): Forwarded to the metadata helpers.

        Returns:
            The table with the updated labels.

        Raises:
            AssertionError: If ltable or rtable is None, or if
                `label_column` is not a column of `input_table`.
        """
        # # get metadata
        key, fk_ltable, fk_rtable, ltable, rtable, l_key, r_key = cm.get_metadata_for_candset(
            input_table, logger, verbose)

        # # validate metadata
        cm.validate_metadata_for_candset(input_table, key, fk_ltable,
                                         fk_rtable, ltable, rtable, l_key,
                                         r_key, logger, verbose)

        # Explicit raises instead of `assert`, so that the checks are still
        # performed when Python runs with -O.
        if ltable is None:
            raise AssertionError('Left table is not set')
        if rtable is None:
            raise AssertionError('Right table is not set')
        if label_column not in input_table.columns:
            raise AssertionError('Label column not in the input table')

        # equality test kept as in the original so that exactly the same
        # `inplace` values request a copy
        if inplace == False:
            table = input_table.copy()
        else:
            table = input_table

        # set the index and store it in l_tbl/r_tbl
        l_tbl = ltable.set_index(l_key, drop=False)
        r_tbl = rtable.set_index(r_key, drop=False)

        # The ids of a pair live in the foreign-key columns of the input
        # table (fk_ltable / fk_rtable); l_key / r_key name columns of the
        # base tables and must not be used to locate them here.
        column_names = list(input_table.columns)
        lid_idx = column_names.index(fk_ltable)
        rid_idx = column_names.index(fk_rtable)
        label_idx = column_names.index(label_column)

        for idx, row in enumerate(input_table.itertuples(index=False)):
            if row[label_idx] != self.value_to_set:
                # label-based lookup (the old `.ix` indexer has been removed
                # from pandas)
                l_row = l_tbl.loc[row[lid_idx]]
                r_row = r_tbl.loc[row[rid_idx]]
                res = self.apply_rules(l_row, r_row)
                if res == self.cond_status:
                    # idx is the row position, matching itertuples order
                    table.iat[idx, label_idx] = self.value_to_set
        return table
Ejemplo n.º 9
0
def add_output_attributes(candset,
                          l_output_attrs=None,
                          r_output_attrs=None,
                          l_output_prefix='ltable_',
                          r_output_prefix='rtable_',
                          validate=True,
                          copy_props=True,
                          delete_from_catalog=True,
                          verbose=False):
    """
    Return a version of the candidate set extended with output attributes.

    The work is delegated to `_add_output_attributes`; this wrapper fetches
    the metadata, restores the original index on the result and takes care
    of the catalog properties.

    Args:
        candset (DataFrame): Input candidate set.
        l_output_attrs: Passed through to `_add_output_attributes`.
        r_output_attrs: Passed through to `_add_output_attributes`.
        l_output_prefix (string): Passed through to `_add_output_attributes`.
        r_output_prefix (string): Passed through to `_add_output_attributes`.
        validate (boolean): Whether the candset metadata is validated first.
        copy_props (boolean): Whether the candset's properties are copied to
            the result.
        delete_from_catalog (boolean): Whether the candset's properties are
            deleted afterwards; only honoured when `copy_props` is True.
        verbose (boolean): Forwarded to the metadata helpers.

    Returns:
        The DataFrame produced by `_add_output_attributes`, indexed like the
        input candset.

    Raises:
        AssertionError: If `candset` is not a pandas DataFrame.
    """
    if not isinstance(candset, pd.DataFrame):
        logger.error('Input object is not of type pandas data frame')
        raise AssertionError('Input object is not of type pandas data frame')

    # # get metadata
    metadata = cm.get_metadata_for_candset(candset, logger, verbose)
    key, fk_ltable, fk_rtable, ltable, rtable, l_key, r_key = metadata
    if validate:
        cm.validate_metadata_for_candset(candset, key, fk_ltable, fk_rtable,
                                         ltable, rtable, l_key, r_key, logger,
                                         verbose)

    # remember the index so that it can be put back on the result
    original_index = candset.index

    result = _add_output_attributes(
        candset, fk_ltable, fk_rtable, ltable, rtable, l_key, r_key,
        l_output_attrs, r_output_attrs, l_output_prefix, r_output_prefix,
        validate=False)
    result.set_index(original_index, inplace=True)

    if not copy_props:
        return result

    cm.init_properties(result)
    cm.copy_properties(candset, result)
    if delete_from_catalog:
        cm.del_all_properties(candset)
    return result
Ejemplo n.º 10
0
def _validate_inputs(table, label_column_name, verbose):
    """
    This function validates the inputs for the label_table function

    Args:
        table (DataFrame): Table to be labeled; its candidate-set metadata
            is fetched and validated as well.
        label_column_name (string): Name of the label column to be added;
            it must not already be a column of `table`.
        verbose (boolean): Forwarded to the logging and metadata helpers.

    Returns:
        True if every check passed.

    Raises:
        AssertionError: If `table` is not a pandas DataFrame.
        AssertionError: If `label_column_name` is not a string.
        AssertionError: If `label_column_name` is already present in
            `table`.
    """
    # Validate the input parameters

    # # The input table table is expected to be of type pandas DataFrame
    if not isinstance(table, pd.DataFrame):
        logger.error('Input object is not of type data frame')
        raise AssertionError('Input object is not of type data frame')

    # # The label column name is expected to be of type string
    if not isinstance(label_column_name, six.string_types):
        logger.error('Input attr. is not of type string')
        raise AssertionError('Input attr. is not of type string')

    # # Check if the label column name is already present in the input table
    if ch.check_attrs_present(table, label_column_name):
        logger.error(
            'The label column name (%s) is already present in the '
            'input table', label_column_name)
        # Unlike the logger, an exception does not interpolate its
        # arguments, so the message has to be formatted here.
        raise AssertionError(
            'The label column name (%s) is already present '
            'in the input table' % label_column_name)

    # Now, validate the metadata for the input DataFrame as we have to copy
    # these properties to the output DataFrame

    # # First, display what metadata is required for this function
    ch.log_info(
        logger, 'Required metadata: cand.set key, fk ltable, '
        'fk rtable, ltable, rtable, ltable key, rtable key', verbose)

    # # Second, get the metadata
    key, fk_ltable, fk_rtable, ltable, rtable, l_key, r_key = \
        cm.get_metadata_for_candset(table, logger, verbose)

    # # Third, validate the metadata
    cm.validate_metadata_for_candset(table, key, fk_ltable, fk_rtable, ltable,
                                     rtable, l_key, r_key, logger, verbose)

    # Return True if everything was successful
    return True
Ejemplo n.º 11
0
def _validate_inputs(table, label_column_name, verbose):
    """
    This function validates the inputs for the label_table function

    Args:
        table (DataFrame): Table to be labeled; its candidate-set metadata
            is fetched and validated as well.
        label_column_name (string): Name of the label column to be added;
            it must not already be a column of `table`.
        verbose (boolean): Forwarded to the logging and metadata helpers.

    Returns:
        True if every check passed.

    Raises:
        AssertionError: If `table` is not a pandas DataFrame.
        AssertionError: If `label_column_name` is not a string.
        AssertionError: If `label_column_name` is already present in
            `table`.
    """
    # Validate the input parameters

    # # The input table table is expected to be of type pandas DataFrame
    if not isinstance(table, pd.DataFrame):
        logger.error('Input object is not of type data frame')
        raise AssertionError('Input object is not of type data frame')

    # # The label column name is expected to be of type string
    if not isinstance(label_column_name, six.string_types):
        logger.error('Input attr. is not of type string')
        raise AssertionError('Input attr. is not of type string')

    # # Check if the label column name is already present in the input table
    if ch.check_attrs_present(table, label_column_name):
        logger.error('The label column name (%s) is already present in the '
                     'input table', label_column_name)
        # Unlike the logger, an exception does not interpolate its
        # arguments, so the message has to be formatted here.
        raise AssertionError('The label column name (%s) is already present '
                             'in the input table' % label_column_name)

    # Now, validate the metadata for the input DataFrame as we have to copy
    # these properties to the output DataFrame

    # # First, display what metadata is required for this function
    ch.log_info(logger, 'Required metadata: cand.set key, fk ltable, '
                        'fk rtable, ltable, rtable, ltable key, rtable key',
                verbose)

    # # Second, get the metadata
    key, fk_ltable, fk_rtable, ltable, rtable, l_key, r_key = \
        cm.get_metadata_for_candset(table, logger, verbose)

    # # Third, validate the metadata
    cm.validate_metadata_for_candset(table, key, fk_ltable, fk_rtable,
                                     ltable, rtable, l_key, r_key,
                                     logger, verbose)

    # Return True if everything was successful
    return True
Ejemplo n.º 12
0
    def block_candset(self, candset, l_overlap_attr, r_overlap_attr,
                      rem_stop_words=False, q_val=None, word_level=True, overlap_size=1,
                      verbose=True, show_progress=True):
        """
        Keep the candidate pairs whose token overlap is large enough.

        A pair survives when the value returned by
        `self.get_token_overlap_bt_two_tuples` for its two tuples is at
        least `overlap_size`.

        Args:
            candset (DataFrame): Input candidate set.
            l_overlap_attr (string): Overlap attribute in the left table.
            r_overlap_attr (string): Overlap attribute in the right table.
            rem_stop_words (boolean): Passed to the overlap computation.
            q_val: Passed to the overlap computation.
            word_level (boolean): Accepted, but not referenced in this
                method.
            overlap_size (int): Minimum overlap for a pair to be kept.
            verbose (boolean): Forwarded to the logging and metadata helpers.
            show_progress (boolean): Whether a progress bar is displayed.

        Returns:
            The filtered candidate set, with its candset properties set in
            the catalog.
        """
        # announce, fetch and validate the metadata
        log_info(logger, 'Required metadata: cand.set key, fk ltable, fk rtable, '
                         'ltable, rtable, ltable key, rtable key', verbose)
        key, fk_ltable, fk_rtable, ltable, rtable, l_key, r_key = cm.get_metadata_for_candset(candset, logger, verbose)
        cm.validate_metadata_for_candset(candset, key, fk_ltable, fk_rtable, ltable, rtable, l_key, r_key,
                                         logger, verbose)

        # validate overlap attrs
        self.validate_overlap_attrs(ltable, rtable, l_overlap_attr, r_overlap_attr)

        # the progress bar only exists when it was asked for
        if show_progress:
            progress = pyprind.ProgBar(len(candset))

        # key -> row lookup tables, so that rows are not searched per pair
        left_rows = dict(ltable.set_index(l_key, drop=False).iterrows())
        right_rows = dict(rtable.set_index(r_key, drop=False).iterrows())

        # positions of the foreign keys inside a candset tuple
        columns = list(candset.columns)
        l_pos = columns.index(fk_ltable)
        r_pos = columns.index(fk_rtable)

        # one boolean per candset row: True means the pair is kept
        keep = []
        for pair in candset.itertuples(index=False):
            if show_progress:
                progress.update()

            left_tuple = left_rows[pair[l_pos]]
            right_tuple = right_rows[pair[r_pos]]
            overlap = self.get_token_overlap_bt_two_tuples(left_tuple, right_tuple,
                                                           l_overlap_attr, r_overlap_attr,
                                                           q_val, rem_stop_words)
            keep.append(True if overlap >= overlap_size else False)

        # an empty input yields an empty table with the same columns
        candset = candset[keep] if len(candset) > 0 else pd.DataFrame(columns=candset.columns)

        # update catalog
        cm.set_candset_properties(candset, key, fk_ltable, fk_rtable, ltable, rtable)

        return candset
Ejemplo n.º 13
0
def extract_feature_vecs(candset, attrs_before=None, feature_table=None, attrs_after=None, verbose=True):
    """
    Build a table of feature vectors for the pairs of a candidate set.

    For every pair, the referenced ltable and rtable tuples are passed,
    together with `feature_table`, to `apply_feat_fns`; the results form the
    feature columns of the output.

    Args:
        candset (DataFrame): Input candidate set.
        attrs_before (list or string): Candset attributes placed before the
            feature columns (default value is None).
        feature_table (DataFrame): Table with a 'feature_name' column that
            fixes the order of the feature columns; must not be None.
        attrs_after (list or string): Candset attributes placed after the
            feature columns (default value is None).
        verbose (boolean): Forwarded to the logging and metadata helpers.

    Returns:
        A DataFrame whose columns are, in order: the candset key, fk_ltable,
        fk_rtable, the `attrs_before` attributes, the feature columns and
        the `attrs_after` attributes. Its index is reset to 0..n-1 and the
        candset's properties are copied to it.

    Raises:
        AssertionError: If `candset` is not a pandas DataFrame.
        AssertionError: If `attrs_before` or `attrs_after` is given but not
            present in `candset`.
        AssertionError: If `feature_table` is None.
    """
    if not isinstance(candset, pd.DataFrame):
        logger.error('Input cand.set is not of type dataframe')
        raise AssertionError('Input cand.set is not of type dataframe')

    # validate input parameters
    if attrs_before is not None:
        if not check_attrs_present(candset, attrs_before):
            logger.error('The attributes mentioned in attrs_before is not present '
                         'in the input table')
            raise AssertionError('The attributes mentioned in attrs_before is not present '
                                 'in the input table')

    if attrs_after is not None:
        if not check_attrs_present(candset, attrs_after):
            logger.error('The attributes mentioned in attrs_after is not present '
                         'in the input table')
            raise AssertionError('The attributes mentioned in attrs_after is not present '
                                 'in the input table')

    if feature_table is None:
        logger.error('Feature table cannot be null')
        raise AssertionError('The feature table cannot be null')

    log_info(logger, 'Required metadata: cand.set key, fk ltable, fk rtable, '
                     'ltable, rtable, ltable key, rtable key', verbose)

    # # get metadata
    key, fk_ltable, fk_rtable, ltable, rtable, l_key, r_key = cm.get_metadata_for_candset(candset, logger, verbose)

    # # validate metadata
    cm.validate_metadata_for_candset(candset, key, fk_ltable, fk_rtable, ltable, rtable, l_key, r_key,
                                     logger, verbose)

    # extract features
    # Read the ids column-wise: iterating whole rows may coerce the id
    # values to a common row dtype.
    id_list = list(zip(candset[fk_ltable], candset[fk_rtable]))

    # # set index for convenience
    l_df = ltable.set_index(l_key, drop=False)
    r_df = rtable.set_index(r_key, drop=False)

    # # apply feature functions (label-based lookup; the old `.ix` indexer
    # # has been removed from pandas)
    feat_vals = [apply_feat_fns(l_df.loc[l_id], r_df.loc[r_id], feature_table) for l_id, r_id in id_list]

    # construct output table
    # Give it the candset's index: the candset columns inserted below are
    # aligned on the index, so a fresh 0..n-1 index would misplace (or
    # blank out) their values whenever the candset is indexed differently.
    table = pd.DataFrame(feat_vals, index=candset.index.values)

    # # rearrange the feature names in the given order
    feat_names = list(feature_table['feature_name'])
    table = table[feat_names]

    # # insert attrs_before
    if attrs_before:
        if not isinstance(attrs_before, list):
            attrs_before = [attrs_before]
        attrs_before = list_diff(attrs_before, [key, fk_ltable, fk_rtable])
        # reversed because each attribute is pushed in at position 0
        attrs_before.reverse()
        for a in attrs_before:
            table.insert(0, a, candset[a])

    # # insert keys
    table.insert(0, fk_rtable, candset[fk_rtable])
    table.insert(0, fk_ltable, candset[fk_ltable])
    table.insert(0, key, candset[key])

    # # insert attrs after
    if attrs_after:
        if not isinstance(attrs_after, list):
            attrs_after = [attrs_after]
        attrs_after = list_diff(attrs_after, [key, fk_ltable, fk_rtable])
        # NOTE(review): reversing and then appending leaves these columns in
        # reverse order of the request; kept because callers may rely on
        # the resulting column order - confirm whether this is intended.
        attrs_after.reverse()
        col_pos = len(table.columns)
        for a in attrs_after:
            table.insert(col_pos, a, candset[a])
            col_pos += 1

    # reset the index
    table.reset_index(inplace=True, drop=True)

    # # update the catalog
    cm.init_properties(table)
    cm.copy_properties(candset, table)

    return table
Ejemplo n.º 14
0
def extract_feature_vecs(candset,
                         attrs_before=None,
                         feature_table=None,
                         attrs_after=None,
                         verbose=True):
    """
    Build a table of feature vectors for the pairs of a candidate set.

    For every pair, the referenced ltable and rtable tuples are passed,
    together with `feature_table`, to `apply_feat_fns`; the results form the
    feature columns of the output.

    Args:
        candset (DataFrame): Input candidate set.
        attrs_before (list or string): Candset attributes placed before the
            feature columns (default value is None).
        feature_table (DataFrame): Table with a 'feature_name' column that
            fixes the order of the feature columns; must not be None.
        attrs_after (list or string): Candset attributes placed after the
            feature columns (default value is None).
        verbose (boolean): Forwarded to the logging and metadata helpers.

    Returns:
        A DataFrame whose columns are, in order: the candset key, fk_ltable,
        fk_rtable, the `attrs_before` attributes, the feature columns and
        the `attrs_after` attributes. Its index is reset to 0..n-1 and the
        candset's properties are copied to it.

    Raises:
        AssertionError: If `candset` is not a pandas DataFrame.
        AssertionError: If `attrs_before` or `attrs_after` is given but not
            present in `candset`.
        AssertionError: If `feature_table` is None.
    """
    if not isinstance(candset, pd.DataFrame):
        logger.error('Input cand.set is not of type dataframe')
        raise AssertionError('Input cand.set is not of type dataframe')

    # validate input parameters
    if attrs_before is not None:
        if not check_attrs_present(candset, attrs_before):
            logger.error('The attributes mentioned in attrs_before is not present '
                         'in the input table')
            raise AssertionError('The attributes mentioned in attrs_before is not present '
                                 'in the input table')

    if attrs_after is not None:
        if not check_attrs_present(candset, attrs_after):
            logger.error('The attributes mentioned in attrs_after is not present '
                         'in the input table')
            raise AssertionError('The attributes mentioned in attrs_after is not present '
                                 'in the input table')

    if feature_table is None:
        logger.error('Feature table cannot be null')
        raise AssertionError('The feature table cannot be null')

    log_info(
        logger, 'Required metadata: cand.set key, fk ltable, fk rtable, '
        'ltable, rtable, ltable key, rtable key', verbose)

    # # get metadata
    key, fk_ltable, fk_rtable, ltable, rtable, l_key, r_key = cm.get_metadata_for_candset(
        candset, logger, verbose)

    # # validate metadata
    cm.validate_metadata_for_candset(candset, key, fk_ltable, fk_rtable,
                                     ltable, rtable, l_key, r_key, logger,
                                     verbose)

    # extract features
    # Read the ids column-wise: iterating whole rows may coerce the id
    # values to a common row dtype.
    id_list = list(zip(candset[fk_ltable], candset[fk_rtable]))

    # # set index for convenience
    l_df = ltable.set_index(l_key, drop=False)
    r_df = rtable.set_index(r_key, drop=False)

    # # apply feature functions (label-based lookup; the old `.ix` indexer
    # # has been removed from pandas)
    feat_vals = [
        apply_feat_fns(l_df.loc[l_id], r_df.loc[r_id], feature_table)
        for l_id, r_id in id_list
    ]

    # construct output table
    # Give it the candset's index: the candset columns inserted below are
    # aligned on the index, so a fresh 0..n-1 index would misplace (or
    # blank out) their values whenever the candset is indexed differently.
    table = pd.DataFrame(feat_vals, index=candset.index.values)

    # # rearrange the feature names in the given order
    feat_names = list(feature_table['feature_name'])
    table = table[feat_names]

    # # insert attrs_before
    if attrs_before:
        if not isinstance(attrs_before, list):
            attrs_before = [attrs_before]
        attrs_before = list_diff(attrs_before, [key, fk_ltable, fk_rtable])
        # reversed because each attribute is pushed in at position 0
        attrs_before.reverse()
        for a in attrs_before:
            table.insert(0, a, candset[a])

    # # insert keys
    table.insert(0, fk_rtable, candset[fk_rtable])
    table.insert(0, fk_ltable, candset[fk_ltable])
    table.insert(0, key, candset[key])

    # # insert attrs after
    if attrs_after:
        if not isinstance(attrs_after, list):
            attrs_after = [attrs_after]
        attrs_after = list_diff(attrs_after, [key, fk_ltable, fk_rtable])
        # NOTE(review): reversing and then appending leaves these columns in
        # reverse order of the request; kept because callers may rely on
        # the resulting column order - confirm whether this is intended.
        attrs_after.reverse()
        col_pos = len(table.columns)
        for a in attrs_after:
            table.insert(col_pos, a, candset[a])
            col_pos += 1

    # reset the index
    table.reset_index(inplace=True, drop=True)

    # # update the catalog
    cm.init_properties(table)
    cm.copy_properties(candset, table)

    return table
Ejemplo n.º 15
0
    def block_candset(self, candset, verbose=True, show_progress=True):
        """
        Drop the candidate pairs for which the black box function fires.

        A pair is removed when `self.black_box_function` returns a value
        equal to True for its two tuples; every other pair is kept.

        Args:
            candset (DataFrame): Input candidate set.
            verbose (boolean): Forwarded to the logging and metadata helpers.
            show_progress (boolean): Whether a progress bar is displayed.

        Returns:
            The filtered candidate set, with its candset properties set in
            the catalog.

        Raises:
            AssertionError: If `self.black_box_function` is None.
        """
        # validate black box function
        # (explicit raise instead of `assert`, so that the check is still
        # performed when Python runs with -O)
        if self.black_box_function is None:
            logger.error('Black box function is not set')
            raise AssertionError('Black box function is not set')

        # get and validate metadata
        log_info(logger, 'Required metadata: cand.set key, fk ltable, fk rtable, '
                         'ltable, rtable, ltable key, rtable key', verbose)

        # # get metadata
        key, fk_ltable, fk_rtable, ltable, rtable, l_key, r_key = cm.get_metadata_for_candset(candset, logger, verbose)

        # # validate metadata
        cm.validate_metadata_for_candset(candset, key, fk_ltable, fk_rtable, ltable, rtable, l_key, r_key,
                                         logger, verbose)

        # do blocking

        # # initialize the progress bar
        if show_progress:
            bar = pyprind.ProgBar(len(candset))

        # # set index for convenience
        l_df = ltable.set_index(l_key, drop=False)
        r_df = rtable.set_index(r_key, drop=False)

        # # create lookup table for faster processing
        l_dict = dict(l_df.iterrows())
        r_dict = dict(r_df.iterrows())

        # # list to keep track of valid ids
        valid = []
        col_names = list(candset.columns)
        l_id_pos = col_names.index(fk_ltable)
        r_id_pos = col_names.index(fk_rtable)

        # # iterate candidate set
        for row in candset.itertuples(index=False):
            # # update progress bar
            if show_progress:
                bar.update()

            ltuple = l_dict[row[l_id_pos]]
            rtuple = r_dict[row[r_id_pos]]

            res = self.black_box_function(ltuple, rtuple)
            # anything that does not compare equal to True keeps the pair
            if res != True:
                valid.append(True)
            else:
                valid.append(False)

        # construct output table
        if len(candset) > 0:
            candset = candset[valid]
        else:
            candset = pd.DataFrame(columns=candset.columns)

        # update catalog
        cm.set_candset_properties(candset, key, fk_ltable, fk_rtable, ltable, rtable)

        # return candidate set
        return candset
Ejemplo n.º 16
0
    def block_candset(self, candset, l_block_attr, r_block_attr, verbose=True,
                      show_progress=True):
        """
        Keep the candidate pairs whose blocking attributes are equal.

        A pair survives when the `l_block_attr` value of its left tuple
        compares equal (`==`) to the `r_block_attr` value of its right
        tuple.

        Args:
            candset (DataFrame): Input candidate set.
            l_block_attr (string): Blocking attribute in the left table.
            r_block_attr (string): Blocking attribute in the right table.
            verbose (boolean): Forwarded to the logging and metadata helpers.
            show_progress (boolean): Whether a progress bar is displayed.

        Returns:
            A new table with the surviving pairs, with its candset
            properties set in the catalog.
        """
        self.validate_types_candset(candset, l_block_attr, r_block_attr,
                                    verbose, show_progress)
        # get and validate metadata
        log_info(logger, 'Required metadata: cand.set key, fk ltable, ' +
                 'fk rtable, ltable, rtable, ltable key, rtable key',
                 verbose)

        # # get metadata
        key, fk_ltable, fk_rtable, ltable, rtable, l_key, r_key = cm.get_metadata_for_candset(candset, logger, verbose)

        # # validate metadata
        cm.validate_metadata_for_candset(candset, key, fk_ltable, fk_rtable,
                                         ltable, rtable, l_key, r_key,
                                         logger, verbose)

        # validate input parameters
        self.validate_block_attrs(ltable, rtable, l_block_attr, r_block_attr)

        # do blocking

        # # initialize progress bar
        if show_progress:
            prog_bar = pyprind.ProgBar(len(candset))

        # # initialize list to keep track of valid ids
        valid = []

        # # set index for convenience
        l_df = ltable.set_index(l_key, drop=False)
        r_df = rtable.set_index(r_key, drop=False)

        # # get the indexes for the key attributes in the candset
        col_names = list(candset.columns)
        lkey_idx = col_names.index(fk_ltable)
        rkey_idx = col_names.index(fk_rtable)

        # # create a look up table for the blocking attribute values, so
        # # that each key is looked up in its table only once
        l_dict = {}
        r_dict = {}

        # # iterate the rows in candset
        for row in candset.itertuples(index=False):

            # # update the progress bar
            if show_progress:
                prog_bar.update()

            # # get the value of block attributes (label-based lookup; the
            # # old `.ix` indexer has been removed from pandas)
            row_lkey = row[lkey_idx]
            if row_lkey not in l_dict:
                l_dict[row_lkey] = l_df.loc[row_lkey, l_block_attr]
            l_val = l_dict[row_lkey]

            row_rkey = row[rkey_idx]
            if row_rkey not in r_dict:
                r_dict[row_rkey] = r_df.loc[row_rkey, r_block_attr]
            r_val = r_dict[row_rkey]

            if l_val == r_val:
                valid.append(True)
            else:
                valid.append(False)

        # construct output table
        if len(candset) > 0:
            out_table = candset[valid]
        else:
            out_table = pd.DataFrame(columns=candset.columns)

        # update the catalog
        cm.set_candset_properties(out_table, key, fk_ltable, fk_rtable,
                                  ltable, rtable)

        # return the output table
        return out_table
Ejemplo n.º 17
0
    def block_candset(self, candset, verbose=True, show_progress=True):
        """
        Drop the candidate pairs for which the rules fire.

        A pair is removed when `self.apply_rules` returns a value equal to
        True for its two tuples; every other pair is kept.

        Args:
            candset (DataFrame): Input candidate set.
            verbose (boolean): Forwarded to the logging and metadata helpers.
            show_progress (boolean): Whether a progress bar is displayed.

        Returns:
            The filtered candidate set, with its candset properties set in
            the catalog.

        Raises:
            AssertionError: If `self.rules` is empty.
        """
        # validate rules
        # (explicit raise instead of `assert`, so that the check is still
        # performed when Python runs with -O)
        if not len(self.rules.keys()) > 0:
            logger.error('There are no rules to apply')
            raise AssertionError('There are no rules to apply')

        # get and validate metadata
        log_info(
            logger, 'Required metadata: cand.set key, fk ltable, fk rtable, '
            'ltable, rtable, ltable key, rtable key', verbose)

        # # get metadata
        key, fk_ltable, fk_rtable, ltable, rtable, l_key, r_key = cm.get_metadata_for_candset(
            candset, logger, verbose)

        # # validate metadata
        cm.validate_metadata_for_candset(candset, key, fk_ltable, fk_rtable,
                                         ltable, rtable, l_key, r_key, logger,
                                         verbose)

        # do blocking

        # # initialize the progress bar
        if show_progress:
            bar = pyprind.ProgBar(len(candset))

        # # set index for convenience
        l_df = ltable.set_index(l_key, drop=False)
        r_df = rtable.set_index(r_key, drop=False)

        # # create lookup table for faster processing
        l_dict = dict(l_df.iterrows())
        r_dict = dict(r_df.iterrows())

        # # list to keep track of valid ids
        valid = []
        col_names = list(candset.columns)
        l_id_pos = col_names.index(fk_ltable)
        r_id_pos = col_names.index(fk_rtable)

        # # iterate candidate set
        for row in candset.itertuples(index=False):
            # # update progress bar
            if show_progress:
                bar.update()

            ltuple = l_dict[row[l_id_pos]]
            rtuple = r_dict[row[r_id_pos]]

            res = self.apply_rules(ltuple, rtuple)
            # anything that does not compare equal to True keeps the pair
            if res != True:
                valid.append(True)
            else:
                valid.append(False)

        # construct output table
        if len(candset) > 0:
            candset = candset[valid]
        else:
            candset = pd.DataFrame(columns=candset.columns)

        # update catalog
        cm.set_candset_properties(candset, key, fk_ltable, fk_rtable, ltable,
                                  rtable)

        # return candidate set
        return candset
Ejemplo n.º 18
0
def sample_table(table, sample_size, replace=False, verbose=False):
    """
    Sample a pandas DataFrame (for labeling purposes).

    This function draws 'sample_size' row positions uniformly at random
    from the input DataFrame (using numpy's random.choice), selects those
    rows in ascending positional order and returns them as a new DataFrame.
    The input DataFrame is expected to carry the metadata of a candidate
    set (key, fk_ltable, fk_rtable, ltable, rtable) in the catalog; that
    metadata is fetched and validated before sampling and then copied to
    the output DataFrame.

    Args:
        table (DataFrame): Input DataFrame to be sampled. Specifically,
            a DataFrame containing the metadata of a candidate set (such as
            key, fk_ltable, fk_rtable, ltable, rtable) in the catalog.
        sample_size (int): Number of samples to be picked up from the input
            DataFrame.
        replace (boolean): Flag to indicate whether sampling should be done
            with replacement or not (default value is False).
        verbose (boolean): Flag to indicate whether more detailed information
            about the execution steps should be printed out (default value is
            False).

    Returns:
        A new DataFrame with 'sample_size' number of rows. Further,
        this function sets the output DataFrame's properties same as input
        DataFrame.

    Raises:
        AssertionError: If the input table is not of type pandas DataFrame.
        AssertionError: If the input DataFrame size is 0.
        AssertionError: If the sample_size is greater than the input
            DataFrame size.

    Notes:
        When the replace flag is set to True the output DataFrame can
        contain duplicate keys. In that case the key is handed to
        cm.set_key (instead of being copied blindly) so that it can be
        checked first; if it is not set, it is up to the user to fix it
        after the function returns.
    """
    # numpy is imported explicitly because the `pd.np` alias was deprecated
    # in pandas 1.0 and removed in pandas 2.0.
    import numpy as np

    # Validate input parameters.

    # # The input DataFrame is expected to be of type pandas DataFrame.
    if not isinstance(table, pd.DataFrame):
        logger.error('Input table is not of type pandas dataframe')
        raise AssertionError('Input table is not of type pandas dataframe')

    # # There should be at least one row to sample from
    if len(table) == 0:
        logger.error('Size of the input table is 0')
        raise AssertionError('Size of the input table is 0')

    # # The sample size should be less than or equal to the number of rows in
    #  the input DataFrame
    if len(table) < sample_size:
        logger.error('Sample size is larger than the input table size')
        raise AssertionError('Sample size is larger than the input table size')

    # Now, validate the metadata for the input DataFrame as we have to copy
    # these properties to the output DataFrame

    # # First, display what metadata is required for this function
    ch.log_info(
        logger, 'Required metadata: cand.set key, fk ltable, '
        'fk rtable, ltable, rtable, ltable key, rtable key', verbose)

    # # Second, get the metadata
    key, fk_ltable, fk_rtable, ltable, rtable, l_key, r_key = \
        cm.get_metadata_for_candset(table, logger, verbose)

    # # Third, validate the metadata
    cm.validate_metadata_for_candset(table, key, fk_ltable, fk_rtable, ltable,
                                     rtable, l_key, r_key, logger, verbose)

    # Draw the row positions for the output table
    sample_indices = np.random.choice(len(table),
                                      sample_size,
                                      replace=replace)
    # Keep the sampled rows in their original (positional) order;
    # sorted() already returns a list, which iloc accepts directly.
    sample_indices = sorted(sample_indices)
    sampled_table = table.iloc[sample_indices]

    # Copy the properties
    cm.init_properties(sampled_table)

    # # If the replace is set to True, then we should check for the validity
    # of key before setting it
    if replace:
        properties = cm.get_all_properties(table)
        for property_name, property_value in six.iteritems(properties):
            if property_name == 'key':
                # Sampling with replacement may have duplicated key values,
                # so go through set_key rather than copying the property.
                cm.set_key(sampled_table, property_value)
            else:
                # Copy the other properties as is
                cm.set_property(sampled_table, property_name, property_value)
    else:
        cm.copy_properties(table, sampled_table)

    # Return the sampled table
    return sampled_table
Ejemplo n.º 19
0
    def block_candset(self,
                      candset,
                      l_block_attr,
                      r_block_attr,
                      verbose=True,
                      show_progress=True):
        """
        Filter a candidate set, keeping pairs with equal blocking attributes.

        For every row of the candidate set, the referenced tuple of the
        left table and of the right table are looked up (through the
        foreign-key columns of the candidate set) and the row is kept only
        if the left tuple's `l_block_attr` value equals (`==`) the right
        tuple's `r_block_attr` value.

        Args:
            candset (DataFrame): Candidate set to be filtered; its metadata
                (key, fk_ltable, fk_rtable, ltable, rtable) is read from
                the catalog.
            l_block_attr (string): Blocking attribute in the left table.
            r_block_attr (string): Blocking attribute in the right table.
            verbose (boolean): Passed to the logging/metadata helpers
                (default value is True).
            show_progress (boolean): Whether a pyprind progress bar is
                displayed while iterating the candidate set (default value
                is True).

        Returns:
            A DataFrame containing the surviving rows of `candset` (or an
            empty DataFrame with the same columns when `candset` is empty).
            The candidate-set properties are registered for it in the
            catalog.

        Notes:
            The comparison is a plain `==`, so a pair is dropped whenever
            the two values do not compare equal (NaN never compares equal
            to NaN).
        """
        self.validate_types_candset(candset, l_block_attr, r_block_attr,
                                    verbose, show_progress)
        # get and validate metadata
        log_info(
            logger, 'Required metadata: cand.set key, fk ltable, ' +
            'fk rtable, ltable, rtable, ltable key, rtable key', verbose)

        # # get metadata
        key, fk_ltable, fk_rtable, ltable, rtable, l_key, r_key = \
            cm.get_metadata_for_candset(candset, logger, verbose)

        # # validate metadata
        cm.validate_metadata_for_candset(candset, key, fk_ltable, fk_rtable,
                                         ltable, rtable, l_key, r_key, logger,
                                         verbose)

        # validate input parameters
        self.validate_block_attrs(ltable, rtable, l_block_attr, r_block_attr)

        # do blocking

        # # initialize progress bar
        if show_progress:
            prog_bar = pyprind.ProgBar(len(candset))

        # # initialize list to keep track of valid ids
        valid = []

        # # index both tables by their key so that tuples can be fetched
        # # by label
        l_df = ltable.set_index(l_key, drop=False)
        r_df = rtable.set_index(r_key, drop=False)

        # # get the positions of the foreign-key attributes in the candset
        col_names = list(candset.columns)
        lkey_idx = col_names.index(fk_ltable)
        rkey_idx = col_names.index(fk_rtable)

        # # caches of blocking attribute values, keyed by tuple id, so each
        # # tuple is looked up in its table at most once
        l_dict = {}
        r_dict = {}

        # # iterate the rows in candset
        for row in candset.itertuples(index=False):

            # # update the progress bar
            if show_progress:
                prog_bar.update()

            # # get the value of block attributes
            # `.loc` (label-based) replaces `.ix`, which was removed in
            # pandas 1.0; the lookups here are always by key label.
            row_lkey = row[lkey_idx]
            if row_lkey not in l_dict:
                l_dict[row_lkey] = l_df.loc[row_lkey, l_block_attr]
            l_val = l_dict[row_lkey]

            row_rkey = row[rkey_idx]
            if row_rkey not in r_dict:
                r_dict[row_rkey] = r_df.loc[row_rkey, r_block_attr]
            r_val = r_dict[row_rkey]

            # keep the pair only when the two values compare equal
            valid.append(bool(l_val == r_val))

        # construct output table
        if len(candset) > 0:
            out_table = candset[valid]
        else:
            out_table = pd.DataFrame(columns=candset.columns)

        # update the catalog
        cm.set_candset_properties(out_table, key, fk_ltable, fk_rtable, ltable,
                                  rtable)

        # return the output table
        return out_table
Ejemplo n.º 20
0
def sample_table(table, sample_size, replace=False, verbose=False):
    """
    Sample a pandas DataFrame (for labeling purposes).

    Row positions are drawn uniformly at random with numpy's
    random.choice, the corresponding rows are selected in ascending
    positional order, and the result is returned as a new DataFrame. The
    input DataFrame is expected to have candidate-set metadata (key,
    fk_ltable, fk_rtable, ltable, rtable) in the catalog; this metadata is
    retrieved and validated first and afterwards copied onto the output
    DataFrame.

    Args:
        table (DataFrame): Input DataFrame to be sampled. Specifically,
            a DataFrame containing the metadata of a candidate set (such as
            key, fk_ltable, fk_rtable, ltable, rtable) in the catalog.
        sample_size (int): Number of samples to be picked up from the input
            DataFrame.
        replace (boolean): Flag to indicate whether sampling should be done
            with replacement or not (default value is False).
        verbose (boolean): Flag to indicate whether more detailed information
            about the execution steps should be printed out (default value is
            False).

    Returns:
        A new DataFrame with 'sample_size' number of rows. Further,
        this function sets the output DataFrame's properties same as input
        DataFrame.

    Raises:
        AssertionError: If the input table is not of type pandas DataFrame.
        AssertionError: If the input DataFrame size is 0.
        AssertionError: If the sample_size is greater than the input
            DataFrame size.

    Notes:
        With replace set to True the output DataFrame can contain
        duplicate keys. For that reason the key property is passed to
        cm.set_key (rather than copied as is) so it can be checked before
        being set; if it ends up unset, the user has to fix it after the
        function returns.
    """
    # Import numpy directly: the `pd.np` alias used previously was
    # deprecated in pandas 1.0 and no longer exists in pandas 2.0.
    import numpy as np

    # Validate input parameters.

    # # The input DataFrame is expected to be of type pandas DataFrame.
    if not isinstance(table, pd.DataFrame):
        logger.error('Input table is not of type pandas dataframe')
        raise AssertionError('Input table is not of type pandas dataframe')

    # # There should be at least one row to sample from
    if len(table) == 0:
        logger.error('Size of the input table is 0')
        raise AssertionError('Size of the input table is 0')

    # # The sample size should be less than or equal to the number of rows in
    #  the input DataFrame
    if len(table) < sample_size:
        logger.error('Sample size is larger than the input table size')
        raise AssertionError('Sample size is larger than the input table size')

    # Now, validate the metadata for the input DataFrame as we have to copy
    # these properties to the output DataFrame

    # # First, display what metadata is required for this function
    ch.log_info(logger, 'Required metadata: cand.set key, fk ltable, '
                        'fk rtable, ltable, rtable, ltable key, rtable key',
                verbose)

    # # Second, get the metadata
    key, fk_ltable, fk_rtable, ltable, rtable, l_key, r_key = \
        cm.get_metadata_for_candset(table, logger, verbose)

    # # Third, validate the metadata
    cm.validate_metadata_for_candset(table, key, fk_ltable, fk_rtable,
                                     ltable, rtable, l_key, r_key,
                                     logger, verbose)

    # Draw the row positions for the output table
    sample_indices = np.random.choice(len(table), sample_size,
                                      replace=replace)
    # Sort the positions so the sample preserves the input row order;
    # the sorted list can be passed to iloc as is.
    sample_indices = sorted(sample_indices)
    sampled_table = table.iloc[sample_indices]

    # Copy the properties
    cm.init_properties(sampled_table)

    # # If the replace is set to True, then we should check for the validity
    # of key before setting it
    if replace:
        properties = cm.get_all_properties(table)
        for property_name, property_value in six.iteritems(properties):
            if property_name == 'key':
                # Duplicated rows may make the key invalid, so let set_key
                # decide instead of copying the property verbatim.
                cm.set_key(sampled_table, property_value)
            else:
                # Copy the other properties as is
                cm.set_property(sampled_table, property_name, property_value)
    else:
        cm.copy_properties(table, sampled_table)

    # Return the sampled table
    return sampled_table
Ejemplo n.º 21
0
 def test_get_metadata_for_candset_invalid_df(self):
     # Calls get_metadata_for_candset with None in place of the candidate
     # set DataFrame, None as the logger and verbose=False.
     # NOTE(review): presumably this call is expected to raise for the
     # invalid input, but no expectation (decorator/assert) is visible
     # here -- confirm against the full test file.
     cm.get_metadata_for_candset(None, None, False)