Example 1
    def block_tables(self, ltable, rtable, l_block_attr, r_block_attr,
                     l_output_attrs=None, r_output_attrs=None,
                     l_output_prefix='ltable_', r_output_prefix='rtable_',
                     verbose=True):

        # validate data types of input parameters
        self.validate_types_tables(ltable, rtable, l_block_attr, r_block_attr,
                                   l_output_attrs, r_output_attrs,
                                   l_output_prefix, r_output_prefix, verbose)

        # validate input parameters
        self.validate_block_attrs(ltable, rtable, l_block_attr, r_block_attr)
        self.validate_output_attrs(ltable, rtable, l_output_attrs,
                                   r_output_attrs)

        # get and validate required metadata
        log_info(logger, 'Required metadata: ltable key, rtable key', verbose)

        # # get metadata
        l_key, r_key = cm.get_keys_for_ltable_rtable(ltable, rtable, logger,
                                                     verbose)

        # # validate metadata
        cm.validate_metadata_for_table(ltable, l_key, 'ltable', logger, verbose)
        cm.validate_metadata_for_table(rtable, r_key, 'rtable', logger, verbose)

        # do blocking

        # # remove nans: should be modified based on missing data policy
        l_df, r_df = rem_nan(ltable, l_block_attr), rem_nan(rtable, r_block_attr)

        # # do projection before merge
        l_proj_attrs = self.get_proj_attrs(l_key, l_block_attr, l_output_attrs)
        l_df = l_df[l_proj_attrs]
        r_proj_attrs = self.get_proj_attrs(r_key, r_block_attr, r_output_attrs)
        r_df = r_df[r_proj_attrs]

        # # use pandas merge to do equi join
        candset = pd.merge(l_df, r_df, left_on=l_block_attr,
                           right_on=r_block_attr,
                           suffixes=('_ltable', '_rtable'))

        # construct output table
        retain_cols, final_cols = self.output_columns(l_key, r_key,
                                                      list(candset.columns),
                                                      l_output_attrs,
                                                      r_output_attrs,
                                                      l_output_prefix,
                                                      r_output_prefix)
        candset = candset[retain_cols]
        candset.columns = final_cols

        # update catalog
        key = get_name_for_key(candset.columns)
        candset = add_key_column(candset, key)
        cm.set_candset_properties(candset, key, l_output_prefix + l_key,
                                  r_output_prefix + r_key, ltable, rtable)

        # return candidate set
        return candset
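
A minimal usage sketch for the attribute-equivalence blocker above. This is hedged: the em alias, read_csv_metadata, AttrEquivalenceBlocker, and the CSV paths and attribute names are assumptions based on the Magellan/py_entitymatching-style API, not shown in the example itself.

import py_entitymatching as em

# Read both tables and register their keys in the catalog (paths are hypothetical).
A = em.read_csv_metadata('tableA.csv', key='ID')
B = em.read_csv_metadata('tableB.csv', key='ID')

# Keep only the tuple pairs whose 'zipcode' values are exactly equal.
ab = em.AttrEquivalenceBlocker()
C = ab.block_tables(A, B, 'zipcode', 'zipcode',
                    l_output_attrs=['name'], r_output_attrs=['name'])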
Example 2
def train_test_split(labeled_data,
                     train_proportion=0.5,
                     random_state=None,
                     verbose=True):
    if not isinstance(labeled_data, pd.DataFrame):
        logger.error('Input table is not of type dataframe')
        raise AssertionError('Input table is not of type dataframe')

    log_info(
        logger, 'Required metadata: cand.set key, fk ltable, fk rtable, '
        'ltable, rtable, ltable key, rtable key', verbose)

    # # get metadata
    key, fk_ltable, fk_rtable, ltable, rtable, l_key, r_key = cm.get_metadata_for_candset(
        labeled_data, logger, verbose)

    # # validate metadata
    cm.validate_metadata_for_candset(labeled_data, key, fk_ltable, fk_rtable,
                                     ltable, rtable, l_key, r_key, logger,
                                     verbose)

    num_rows = len(labeled_data)
    assert 0 <= train_proportion <= 1, 'Train proportion is expected to be between 0 and 1'
    assert num_rows > 0, 'The input table is empty'

    train_size = int(math.floor(num_rows * train_proportion))
    test_size = int(num_rows - train_size)

    # use scikit-learn to split the data (cv: sklearn's train_test_split,
    # found in sklearn.model_selection in current versions; 'pd.np' is
    # removed in modern pandas, and index.values is already a numpy array)
    idx_values = labeled_data.index.values
    idx_train, idx_test = cv.train_test_split(idx_values,
                                              test_size=test_size,
                                              train_size=train_size,
                                              random_state=random_state)

    # construct output tables ('.ix' is removed in modern pandas; use '.loc')
    lbl_train = labeled_data.loc[idx_train]
    lbl_test = labeled_data.loc[idx_test]

    # update catalog
    cm.init_properties(lbl_train)
    cm.copy_properties(labeled_data, lbl_train)

    cm.init_properties(lbl_test)
    cm.copy_properties(labeled_data, lbl_test)

    # return output tables
    result = OrderedDict()
    result['train'] = lbl_train
    result['test'] = lbl_test

    return result
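
A short, hedged usage sketch: splitting a labeled candidate set into development and evaluation sets (labeled_data is assumed to carry candidate-set metadata in the catalog).

# 70/30 split with a fixed seed for reproducibility.
splits = train_test_split(labeled_data, train_proportion=0.7, random_state=0)
dev_set = splits['train']
eval_set = splits['test']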
Example 3
def get_metadata_for_candset(candset, lgr, verbose):
    if not isinstance(candset, pd.DataFrame):
        logger.error('Input candset is not of type pandas data frame')
        raise AssertionError('Input candset is not of type pandas data frame')

    ch.log_info(lgr, 'Getting metadata from the catalog', verbose)
    key = get_key(candset)
    fk_ltable = get_fk_ltable(candset)
    fk_rtable = get_fk_rtable(candset)
    ltable = get_ltable(candset)
    rtable = get_rtable(candset)
    l_key = get_key(ltable)
    r_key = get_key(rtable)
    ch.log_info(lgr, '..... Done', verbose)
    return key, fk_ltable, fk_rtable, ltable, rtable, l_key, r_key
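
For reference, a hedged sketch of how this helper is typically consumed (C stands for a candidate set with metadata registered in the catalog; logger is the module-level logger these examples assume):

# Unpack all seven pieces of candidate-set metadata in one call.
key, fk_ltable, fk_rtable, ltable, rtable, l_key, r_key = \
    get_metadata_for_candset(C, logger, False)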
Example 4
def validate_metadata_for_table(table, key, out_str, lgr, verbose):
    if not isinstance(table, pd.DataFrame):
        logger.error('Input object is not of type pandas data frame')
        raise AssertionError('Input object is not of type pandas data frame')

    if key not in table.columns:
        logger.error('Input key ( %s ) not in the dataframe' % key)
        raise KeyError('Input key ( %s ) not in the dataframe' % key)

    ch.log_info(lgr, 'Validating ' + out_str + ' key: ' + str(key), verbose)
    assert isinstance(key, six.string_types), 'Key attribute must be a string.'
    assert ch.check_attrs_present(table, key), \
        'Key attribute is not present in the ' + out_str + ' table'
    assert ch.is_key_attribute(table, key, verbose), \
        'Attribute ' + str(key) + ' in the ' + out_str + \
        ' table does not qualify to be the key'
    ch.log_info(lgr, '..... Done', verbose)
    return True
Example 5
def _validate_inputs(table, label_column_name, verbose):
    """
    This function validates the inputs for the label_table function
    """
    # Validate the input parameters

    # # The input table is expected to be of type pandas DataFrame
    if not isinstance(table, pd.DataFrame):
        logger.error('Input object is not of type data frame')
        raise AssertionError('Input object is not of type data frame')

    # # The label column name is expected to be of type string
    if not isinstance(label_column_name, six.string_types):
        logger.error('Input attr. is not of type string')
        raise AssertionError('Input attr. is not of type string')

    # # Check if the label column name is already present in the input table
    if ch.check_attrs_present(table, label_column_name):
        logger.error(
            'The label column name (%s) is already present in the '
            'input table', label_column_name)
        raise AssertionError(
            'The label column name (%s) is already present '
            'in the input table' % label_column_name)

    # Now, validate the metadata for the input DataFrame as we have to copy
    # these properties to the output DataFrame

    # # First, display what metadata is required for this function
    ch.log_info(
        logger, 'Required metadata: cand.set key, fk ltable, '
        'fk rtable, ltable, rtable, ltable key, rtable key', verbose)

    # # Second, get the metadata
    key, fk_ltable, fk_rtable, ltable, rtable, l_key, r_key = \
        cm.get_metadata_for_candset(table, logger, verbose)

    # # Third, validate the metadata
    cm.validate_metadata_for_candset(table, key, fk_ltable, fk_rtable, ltable,
                                     rtable, l_key, r_key, logger, verbose)

    # Return True if everything was successful
    return True
Example 6
def validate_metadata_for_candset(candset, key, fk_ltable, fk_rtable, ltable,
                                  rtable, l_key, r_key, lgr, verbose):
    if not isinstance(candset, pd.DataFrame):
        logger.error('Input cand.set is not of type pandas data frame')
        raise AssertionError('Input cand.set is not of type pandas data frame')

    if key not in candset.columns:
        logger.error('Input key ( %s ) not in the dataframe' % key)
        raise KeyError('Input key ( %s ) not in the dataframe' % key)

    if fk_ltable not in candset.columns:
        logger.error('Input fk_ltable ( %s ) not in the dataframe' % fk_ltable)
        raise KeyError('Input fk_ltable ( %s ) not in the dataframe' %
                       fk_ltable)

    if fk_rtable not in candset.columns:
        logger.error('Input fk_rtable ( %s ) not in the dataframe' % fk_rtable)
        raise KeyError('Input fk_rtable ( %s ) not in the dataframe' %
                       fk_rtable)

    if not isinstance(ltable, pd.DataFrame):
        logger.error('Input ltable is not of type pandas data frame')
        raise AssertionError('Input ltable is not of type pandas data frame')

    if not isinstance(rtable, pd.DataFrame):
        logger.error('Input rtable is not of type pandas data frame')
        raise AssertionError('Input rtable is not of type pandas data frame')

    if l_key not in ltable:
        logger.error('ltable key ( %s ) not in ltable' % l_key)
        raise KeyError('ltable key ( %s ) not in ltable' % l_key)

    if r_key not in rtable:
        logger.error('rtable key ( %s ) not in rtable' % r_key)
        raise KeyError('rtable key ( %s ) not in rtable' % r_key)

    validate_metadata_for_table(candset, key, 'cand.set', lgr, verbose)

    ch.log_info(lgr, 'Validating foreign key constraint for left table',
                verbose)
    assert ch.check_fk_constraint(candset, fk_ltable, ltable, l_key), \
        'Cand.set does not satisfy foreign key constraint with the left table'
    ch.log_info(lgr, '..... Done', verbose)
    ch.log_info(lgr, 'Validating foreign key constraint for right table',
                verbose)
    assert ch.check_fk_constraint(candset, fk_rtable, rtable, r_key), \
        'Cand.set does not satisfy foreign key constraint with the right table'
    ch.log_info(lgr, '..... Done', verbose)

    return True
Example 7
def _validate_inputs(table, label_column_name, verbose):
    """
    This function validates the inputs for the label_table function
    """
    # Validate the input parameters

    # # The input table is expected to be of type pandas DataFrame
    if not isinstance(table, pd.DataFrame):
        logger.error('Input object is not of type data frame')
        raise AssertionError('Input object is not of type data frame')

    # # The label column name is expected to be of type string
    if not isinstance(label_column_name, six.string_types):
        logger.error('Input attr. is not of type string')
        raise AssertionError('Input attr. is not of type string')

    # # Check if the label column name is already present in the input table
    if ch.check_attrs_present(table, label_column_name):
        logger.error('The label column name (%s) is already present in the '
                     'input table', label_column_name)
        raise AssertionError('The label column name (%s) is already present '
                             'in the input table' % label_column_name)

    # Now, validate the metadata for the input DataFrame as we have to copy
    # these properties to the output DataFrame

    # # First, display what metadata is required for this function
    ch.log_info(logger, 'Required metadata: cand.set key, fk ltable, '
                        'fk rtable, ltable, rtable, ltable key, rtable key',
                verbose)

    # # Second, get the metadata
    key, fk_ltable, fk_rtable, ltable, rtable, l_key, r_key = \
        cm.get_metadata_for_candset(table, logger, verbose)

    # # Third, validate the metadata
    cm.validate_metadata_for_candset(table, key, fk_ltable, fk_rtable,
                                     ltable, rtable, l_key, r_key,
                                     logger, verbose)

    # Return True if everything was successful
    return True
Example 8
def validate_metadata_for_table(table, key, out_str, lgr, verbose):
    if not isinstance(table, pd.DataFrame):
        logger.error('Input object is not of type pandas data frame')
        raise AssertionError('Input object is not of type pandas data frame')

    if key not in table.columns:
        logger.error('Input key ( %s ) not in the dataframe' % key)
        raise KeyError('Input key ( %s ) not in the dataframe' % key)

    ch.log_info(lgr, 'Validating ' + out_str + ' key: ' + str(key), verbose)
    assert isinstance(key, six.string_types), 'Key attribute must be a string.'
    assert ch.check_attrs_present(table, key), \
        'Key attribute is not present in the ' + out_str + ' table'
    assert ch.is_key_attribute(table, key, verbose), \
        'Attribute ' + str(key) + ' in the ' + out_str + \
        ' table does not qualify to be the key'
    ch.log_info(lgr, '..... Done', verbose)
    return True
Example 9
def validate_metadata_for_candset(candset, key, fk_ltable, fk_rtable, ltable, rtable,
                                  l_key, r_key,
                                  lgr, verbose):
    if not isinstance(candset, pd.DataFrame):
        logger.error('Input cand.set is not of type pandas data frame')
        raise AssertionError('Input cand.set is not of type pandas data frame')

    if key not in candset.columns:
        logger.error('Input key ( %s ) not in the dataframe' % key)
        raise KeyError('Input key ( %s ) not in the dataframe' % key)

    if fk_ltable not in candset.columns:
        logger.error('Input fk_ltable ( %s ) not in the dataframe' % fk_ltable)
        raise KeyError('Input fk_ltable ( %s ) not in the dataframe' % fk_ltable)

    if fk_rtable not in candset.columns:
        logger.error('Input fk_rtable ( %s ) not in the dataframe' % fk_rtable)
        raise KeyError('Input fk_rtable ( %s ) not in the dataframe' % fk_rtable)

    if not isinstance(ltable, pd.DataFrame):
        logger.error('Input ltable is not of type pandas data frame')
        raise AssertionError('Input ltable is not of type pandas data frame')

    if not isinstance(rtable, pd.DataFrame):
        logger.error('Input rtable is not of type pandas data frame')
        raise AssertionError('Input rtable is not of type pandas data frame')

    if l_key not in ltable:
        logger.error('ltable key ( %s ) not in ltable' % l_key)
        raise KeyError('ltable key ( %s ) not in ltable' % l_key)

    if r_key not in rtable:
        logger.error('rtable key ( %s ) not in rtable' % r_key)
        raise KeyError('rtable key ( %s ) not in rtable' % r_key)

    validate_metadata_for_table(candset, key, 'cand.set', lgr, verbose)

    ch.log_info(lgr, 'Validating foreign key constraint for left table', verbose)
    assert ch.check_fk_constraint(candset, fk_ltable, ltable, l_key), \
        'Cand.set does not satisfy foreign key constraint with the left table'
    ch.log_info(lgr, '..... Done', verbose)
    ch.log_info(lgr, 'Validating foreign key constraint for right table', verbose)
    assert ch.check_fk_constraint(candset, fk_rtable, rtable, r_key), \
        'Cand.set does not satisfy foreign key constraint with the right table'
    ch.log_info(lgr, '..... Done', verbose)

    return True
Example 10
def get_keys_for_ltable_rtable(ltable, rtable, lgr, verbose):
    if not isinstance(ltable, pd.DataFrame):
        logger.error('Input ltable is not of type pandas data frame')
        raise AssertionError('Input ltable is not of type pandas data frame')

    if not isinstance(rtable, pd.DataFrame):
        logger.error('Input rtable is not of type pandas data frame')
        raise AssertionError('Input rtable is not of type pandas data frame')

    ch.log_info(lgr, 'Required metadata: ltable key, rtable key', verbose)
    ch.log_info(lgr, 'Getting metadata from the catalog', verbose)
    l_key = get_key(ltable)
    r_key = get_key(rtable)
    ch.log_info(lgr, '..... Done', verbose)
    return l_key, r_key
Example 11
def combine_blocker_outputs_via_union(blocker_output_list, l_prefix='ltable_',
                                      r_prefix='rtable_', verbose=False):
    """
    Combine multiple blocker outputs by taking a union of their tuple pair
    ids (foreign key ltable, foreign key rtable).

    This function combines multiple blocker outputs via union. Specifically,
    this function takes in a list of DataFrames (candidate sets, typically the
    output from blockers) and returns a consolidated DataFrame. The output
    DataFrame contains the union of tuple pair ids (foreign key ltable,
    foreign key rtable) and other attributes from the input list of DataFrames.

    This function makes some assumptions about the input DataFrames. First,
    each DataFrame is expected to contain the following metadata in the
    catalog: (1) key, fk_ltable, fk_rtable, ltable, and rtable. Second,
    all the DataFrames must be a result of blocking from the same underlying
    tables. Concretely, the ltable and rtable properties must refer to the
    same DataFrame across all the input tables. Third, all the input
    DataFrames must have the same fk_ltable and fk_rtable properties.
    Finally, in each input DataFrame, for the attributes included from the
    ltable or rtable, the attribute names must be prefixed with the given
    l_prefix and r_prefix in the function. As an example, the schema of an
    input DataFrame may look like this: _id, ltable_ID, rtable_ID,
    ltable_name, rtable_name

    The input DataFrames may contain different attribute lists, which raises
    the question of how to combine them. Currently, Magellan takes a union
    of the attribute names that have the prefix l_prefix or r_prefix across
    the input tables. After taking the union, for each tuple id pair included
    in the output, the attribute values (for the union-ed attribute names) are
    probed from the ltable/rtable and included in the output.

    A subtle point to note here is that if an input DataFrame has a column
    added by the user (say, a label), then that column will not
    be present in the output. The reason is that the same column may not be
    present in the other candidate sets, so it is not clear how to
    combine them. One possibility is to include the label in the output for all
    tuple id pairs, but set it to NaN for the values not present. Currently,
    Magellan does not include such columns; addressing this will be part
    of future work.

    Args:
        blocker_output_list (list of DataFrames): List of DataFrames that
            should be combined. Refer to the notes section for a detailed
            description of the assumptions made by the function about the
            input list of blocker outputs.
        l_prefix (str): Prefix given to the attributes from the ltable.
        r_prefix (str): Prefix given to the attributes from the rtable.
        verbose (boolean): Flag to indicate whether more detailed information
            about the execution steps should be printed out (default value is
            False).

    Returns:
        A new DataFrame with the combined tuple pairs and other attributes from
        all the blocker lists.

    Raises:
        AssertionError: If the l_prefix is not of type string.
        AssertionError: If the r_prefix is not of type string.
        AssertionError: If the length of the input DataFrame list is 0.
        AssertionError: If the input blocker output list is not a list of
            DataFrames.
        AssertionError: If the ltables are different across the input list of
            DataFrames.
        AssertionError: If the rtables are different across the input list of
            DataFrames.
        AssertionError: If the fk_ltable values are different across the
            input list of DataFrames.
        AssertionError: If the fk_rtable values are different across the
            input list of DataFrames.
    """

    # validate input parameters

    # The l_prefix is expected to be of type string
    if not isinstance(l_prefix, six.string_types):
        logger.error('l_prefix is not of type string')
        raise AssertionError('l_prefix is not of type string')

    # The r_prefix is expected to be of type string
    if not isinstance(r_prefix, six.string_types):
        logger.error('r_prefix is not of type string')
        raise AssertionError('r_prefix is not of type string')

    # We cannot combine empty DataFrame list
    if not len(blocker_output_list) > 0:
        logger.error('There are no DataFrames to combine')
        raise AssertionError('There are no DataFrames to combine')

    # Validate the assumptions about the input tables.
    # # 1) All the input objects must be DataFrames
    # # 2) All the input DataFrames must have the metadata as that of a
    # candidate set
    # # 3) All the input DataFrames must have the same fk_ltable and fk_rtable
    _validate_lr_tables(blocker_output_list)

    # # Get the ltable and rtable. We take it from the first DataFrame as all
    #  the DataFrames contain the same ltables and rtables
    ltable = cm.get_ltable(blocker_output_list[0])
    rtable = cm.get_rtable(blocker_output_list[0])

    # # Get the fk_ltable and fk_rtable. We take it from the first DataFrame as
    #  all the DataFrames contain the same ltables and rtables
    fk_ltable = cm.get_fk_ltable(blocker_output_list[0])
    fk_rtable = cm.get_fk_rtable(blocker_output_list[0])

    # Retrieve the keys for the ltable and rtables.
    l_key = cm.get_key(ltable)
    r_key = cm.get_key(rtable)

    # Check if the fk_ltable starts with the given prefix; if not, it is
    # not an error. Just raise a warning.
    if not fk_ltable.startswith(l_prefix):
        logger.warning(
            'Foreign key for ltable does not start with the given prefix ('
            '%s)', l_prefix)

    # Check if the fk_rtable starts with the given prefix; if not, it is
    # not an error. Just raise a warning.
    if not fk_rtable.startswith(r_prefix):
        logger.warning(
            'Foreign key for rtable does not start with the given prefix ('
            '%s)', r_prefix)

    # Initialize lists
    # # keep track of projected tuple pair ids
    tuple_pair_ids = []
    # # keep track of output attributes from the left table
    l_output_attrs = []
    # # keep track of output attributes from the right table
    r_output_attrs = []

    # for each DataFrame in the given list, project out tuple pair ids, get the
    #  attributes from the ltable and rtable
    for data_frame in blocker_output_list:
        # Project out the tuple pair ids. A tuple pair id is a fk_ltable,
        # fk_rtable pair
        projected_tuple_pair_ids = data_frame[[fk_ltable, fk_rtable]]
        # Update the list that tracks tuple pair ids
        tuple_pair_ids.append(projected_tuple_pair_ids)

        # Get the columns, which should be segregated into the attributes
        # from the ltable and rtable
        col_set = (
            gh.list_diff(list(data_frame.columns),
                         [fk_ltable, fk_rtable, cm.get_key(data_frame)]))

        # Segregate the columns as attributes from the ltable and rtable
        l_attrs, r_attrs = _lr_cols(col_set, l_prefix, r_prefix)

        # Update the l_output_attrs, r_output_attrs
        l_output_attrs.extend(l_attrs)
        # we use extend (not append) because l_attrs is a list
        r_output_attrs.extend(r_attrs)

    ch.log_info(logger, 'Concatenating the tuple pair ids across given '
                        'blockers ...', verbose)

    # concatenate the tuple pair ids from the list of input DataFrames
    concatenated_tuple_pair_ids = pd.concat(tuple_pair_ids)

    ch.log_info(logger, 'Concatenating the tuple pair ids ... DONE', verbose)
    ch.log_info(logger, 'Deduplicating the tuple pair ids ...', verbose)

    # Deduplicate the DataFrame. Now the returned DataFrame will contain
    # unique tuple pair ids.

    # noinspection PyUnresolvedReferences
    deduplicated_tuple_pair_ids = concatenated_tuple_pair_ids.drop_duplicates()

    ch.log_info(logger, 'Deduplicating the tuple pair ids ... DONE', verbose)

    # Construct output table
    # # Get unique list of attributes across different tables
    l_output_attrs = gh.list_drop_duplicates(l_output_attrs)
    r_output_attrs = gh.list_drop_duplicates(r_output_attrs)

    # Reset the index that might have lingered from concatenation.
    deduplicated_tuple_pair_ids.reset_index(inplace=True, drop=True)

    # Add the output attributes from the ltable and rtable.
    # NOTE: This approach may be inefficient as it probes the ltable and rtable
    # to get the attribute values. A better way would be to fill the
    # attribute values from the input list of DataFrames. These attribute
    # values could be harvested (at the expense of some space) while we iterate
    # over the input blocker output list for the first time.

    # noinspection PyProtectedMember
    consolidated_data_frame = gh._add_output_attributes(
        deduplicated_tuple_pair_ids, fk_ltable,
        fk_rtable,
        ltable, rtable, l_key, r_key,
        l_output_attrs, r_output_attrs,
        l_prefix,
        r_prefix,
        validate=False)
    # Sort the DataFrame ordered by fk_ltable and fk_rtable.
    # The function "sort" is deprecated in newer versions of pandas and
    # replaced by the 'sort_values' function. So we first try to use
    # sort_values and, if that fails, fall back to sort.
    try:
        consolidated_data_frame.sort_values([fk_ltable, fk_rtable],
                                            inplace=True)
    except AttributeError:
        consolidated_data_frame.sort([fk_ltable, fk_rtable], inplace=True)

    # update the catalog for the consolidated DataFrame
    # First get a column name for the key
    key = ch.get_name_for_key(consolidated_data_frame.columns)
    # Second, add the column name as the key
    consolidated_data_frame = ch.add_key_column(consolidated_data_frame, key)
    # Third, reset the index to remove any out-of-order index values from
    # the sort.
    consolidated_data_frame.reset_index(inplace=True, drop=True)
    # Finally, set the properties for the consolidated DataFrame in the catalog
    cm.set_candset_properties(consolidated_data_frame, key, fk_ltable,
                              fk_rtable, ltable,
                              rtable)

    # Return the consolidated DataFrame
    return consolidated_data_frame
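
A hedged usage sketch of combining two blocker outputs (it reuses the em alias and the tables A and B from the first sketch; the 'birth_year' attribute is hypothetical):

# Union the tuple pair ids of two candidate sets into one consolidated set.
ab = em.AttrEquivalenceBlocker()
C1 = ab.block_tables(A, B, 'zipcode', 'zipcode',
                     l_output_attrs=['name'], r_output_attrs=['name'])
C2 = ab.block_tables(A, B, 'birth_year', 'birth_year',
                     l_output_attrs=['name'], r_output_attrs=['name'])
C = combine_blocker_outputs_via_union([C1, C2])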
Example 12
    def block_candset(self, candset, l_block_attr, r_block_attr, verbose=True,
                      show_progress=True):

        self.validate_types_candset(candset, l_block_attr, r_block_attr,
                                    verbose, show_progress)
        # get and validate metadata
        log_info(logger, 'Required metadata: cand.set key, fk ltable, ' +
                 'fk rtable, ltable, rtable, ltable key, rtable key',
                 verbose)

        # # get metadata
        key, fk_ltable, fk_rtable, ltable, rtable, l_key, r_key = cm.get_metadata_for_candset(candset, logger, verbose)

        # # validate metadata
        cm.validate_metadata_for_candset(candset, key, fk_ltable, fk_rtable,
                                         ltable, rtable, l_key, r_key,
                                         logger, verbose)

        # validate input parameters
        self.validate_block_attrs(ltable, rtable, l_block_attr, r_block_attr)

        # do blocking

        # # initialize progress bar
        if show_progress:
            prog_bar = pyprind.ProgBar(len(candset))

        # # initialize list to keep track of valid ids
        valid = []

        # # set index for convenience
        l_df = ltable.set_index(l_key, drop=False)
        r_df = rtable.set_index(r_key, drop=False)

        # # get the indexes for the key attributes in the candset
        col_names = list(candset.columns)
        lkey_idx = col_names.index(fk_ltable)
        rkey_idx = col_names.index(fk_rtable)

        # # create a look up table for the blocking attribute values
        l_dict = {}
        r_dict = {}

        # # iterate the rows in candset
        for row in candset.itertuples(index=False):

            # # update the progress bar
            if show_progress:
                prog_bar.update()

            # # get the value of the block attributes ('.ix' is removed in
            # # modern pandas; use the label-based '.loc' instead)
            row_lkey = row[lkey_idx]
            if row_lkey not in l_dict:
                l_dict[row_lkey] = l_df.loc[row_lkey, l_block_attr]
            l_val = l_dict[row_lkey]

            row_rkey = row[rkey_idx]
            if row_rkey not in r_dict:
                r_dict[row_rkey] = r_df.loc[row_rkey, r_block_attr]
            r_val = r_dict[row_rkey]

            valid.append(l_val == r_val)

        # construct output table
        if len(candset) > 0:
            out_table = candset[valid]
        else:
            out_table = pd.DataFrame(columns=candset.columns)

        # update the catalog
        cm.set_candset_properties(out_table, key, fk_ltable, fk_rtable,
                                  ltable, rtable)

        # return the output table
        return out_table
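
A hedged sketch of layering this blocker over an existing candidate set (C is a blocker output from an earlier step; the 'city' attribute is hypothetical):

# Further restrict an existing candidate set: keep pairs that also agree on 'city'.
ab2 = em.AttrEquivalenceBlocker()
C2 = ab2.block_candset(C, 'city', 'city', show_progress=False)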
Example 13
    def block_tables(self,
                     ltable,
                     rtable,
                     l_block_attr,
                     r_block_attr,
                     l_output_attrs=None,
                     r_output_attrs=None,
                     l_output_prefix='ltable_',
                     r_output_prefix='rtable_',
                     verbose=True):

        # validate data types of input parameters
        self.validate_types_tables(ltable, rtable, l_block_attr, r_block_attr,
                                   l_output_attrs, r_output_attrs,
                                   l_output_prefix, r_output_prefix, verbose)

        # validate input parameters
        self.validate_block_attrs(ltable, rtable, l_block_attr, r_block_attr)
        self.validate_output_attrs(ltable, rtable, l_output_attrs,
                                   r_output_attrs)

        # get and validate required metadata
        log_info(logger, 'Required metadata: ltable key, rtable key', verbose)

        # # get metadata
        l_key, r_key = cm.get_keys_for_ltable_rtable(ltable, rtable, logger,
                                                     verbose)

        # # validate metadata
        cm.validate_metadata_for_table(ltable, l_key, 'ltable', logger,
                                       verbose)
        cm.validate_metadata_for_table(rtable, r_key, 'rtable', logger,
                                       verbose)

        # do blocking

        # # remove nans: should be modified based on missing data policy
        l_df, r_df = rem_nan(ltable,
                             l_block_attr), rem_nan(rtable, r_block_attr)

        # # do projection before merge
        l_proj_attrs = self.get_proj_attrs(l_key, l_block_attr, l_output_attrs)
        l_df = l_df[l_proj_attrs]
        r_proj_attrs = self.get_proj_attrs(r_key, r_block_attr, r_output_attrs)
        r_df = r_df[r_proj_attrs]

        # # use pandas merge to do equi join
        candset = pd.merge(l_df,
                           r_df,
                           left_on=l_block_attr,
                           right_on=r_block_attr,
                           suffixes=('_ltable', '_rtable'))

        # construct output table
        retain_cols, final_cols = self.output_columns(l_key, r_key,
                                                      list(candset.columns),
                                                      l_output_attrs,
                                                      r_output_attrs,
                                                      l_output_prefix,
                                                      r_output_prefix)
        candset = candset[retain_cols]
        candset.columns = final_cols

        # update catalog
        key = get_name_for_key(candset.columns)
        candset = add_key_column(candset, key)
        cm.set_candset_properties(candset, key, l_output_prefix + l_key,
                                  r_output_prefix + r_key, ltable, rtable)

        # return candidate set
        return candset
Example 14
def combine_blocker_outputs_via_union(blocker_output_list,
                                      l_prefix='ltable_',
                                      r_prefix='rtable_',
                                      verbose=False):
    """
    Combine multiple blocker outputs by taking a union of their tuple pair
    ids (foreign key ltable, foreign key rtable).

    This function combines multiple blocker outputs via union. Specifically,
    this function takes in a list of DataFrames (candidate sets, typically the
    output from blockers) and returns a consolidated DataFrame. The output
    DataFrame contains the union of tuple pair ids (foreign key ltable,
    foreign key rtable) and other attributes from the input list of DataFrames.

    This function makes some assumptions about the input DataFrames. First,
    each DataFrame is expected to contain the following metadata in the
    catalog: (1) key, fk_ltable, fk_rtable, ltable, and rtable. Second,
    all the DataFrames must be a result of blocking from the same underlying
    tables. Concretely, the ltable and rtable properties must refer to the
    same DataFrame across all the input tables. Third, all the input
    DataFrames must have the same fk_ltable and fk_rtable properties.
    Finally, in each input DataFrame, for the attributes included from the
    ltable or rtable, the attribute names must be prefixed with the given
    l_prefix and r_prefix in the function. As an example, the schema of an
    input DataFrame may look like this: _id, ltable_ID, rtable_ID,
    ltable_name, rtable_name

    The input DataFrames may contain different attribute lists, which raises
    the question of how to combine them. Currently, Magellan takes a union
    of the attribute names that have the prefix l_prefix or r_prefix across
    the input tables. After taking the union, for each tuple id pair included
    in the output, the attribute values (for the union-ed attribute names) are
    probed from the ltable/rtable and included in the output.

    A subtle point to note here is that if an input DataFrame has a column
    added by the user (say, a label), then that column will not
    be present in the output. The reason is that the same column may not be
    present in the other candidate sets, so it is not clear how to
    combine them. One possibility is to include the label in the output for all
    tuple id pairs, but set it to NaN for the values not present. Currently,
    Magellan does not include such columns; addressing this will be part
    of future work.

    Args:
        blocker_output_list (list of DataFrames): List of DataFrames that
            should be combined. Refer to the notes section for a detailed
            description of the assumptions made by the function about the
            input list of blocker outputs.
        l_prefix (str): Prefix given to the attributes from the ltable.
        r_prefix (str): Prefix given to the attributes from the rtable.
        verbose (boolean): Flag to indicate whether more detailed information
            about the execution steps should be printed out (default value is
            False).

    Returns:
        A new DataFrame with the combined tuple pairs and other attributes from
        all the blocker lists.

    Raises:
        AssertionError: If the l_prefix is not of type string.
        AssertionError: If the r_prefix is not of type string.
        AssertionError: If the length of the input DataFrame list is 0.
        AssertionError: If the input blocker output list is not a list of
            DataFrames.
        AssertionError: If the ltables are different across the input list of
            DataFrames.
        AssertionError: If the rtables are different across the input list of
            DataFrames.
        AssertionError: If the fk_ltable values are different across the
            input list of DataFrames.
        AssertionError: If the fk_rtable values are different across the
            input list of DataFrames.
    """

    # validate input parameters

    # The l_prefix is expected to be of type string
    if not isinstance(l_prefix, six.string_types):
        logger.error('l_prefix is not of type string')
        raise AssertionError('l_prefix is not of type string')

    # The r_prefix is expected to be of type string
    if not isinstance(r_prefix, six.string_types):
        logger.error('r_prefix is not of type string')
        raise AssertionError('r_prefix is not of type string')

    # We cannot combine empty DataFrame list
    if not len(blocker_output_list) > 0:
        logger.error('There are no DataFrames to combine')
        raise AssertionError('There are no DataFrames to combine')

    # Validate the assumptions about the input tables.
    # # 1) All the input objects must be DataFrames
    # # 2) All the input DataFrames must have the metadata as that of a
    # candidate set
    # # 3) All the input DataFrames must have the same fk_ltable and fk_rtable
    _validate_lr_tables(blocker_output_list)

    # # Get the ltable and rtable. We take it from the first DataFrame as all
    #  the DataFrames contain the same ltables and rtables
    ltable = cm.get_ltable(blocker_output_list[0])
    rtable = cm.get_rtable(blocker_output_list[0])

    # # Get the fk_ltable and fk_rtable. We take it from the first DataFrame as
    #  all the DataFrames contain the same ltables and rtables
    fk_ltable = cm.get_fk_ltable(blocker_output_list[0])
    fk_rtable = cm.get_fk_rtable(blocker_output_list[0])

    # Retrieve the keys for the ltable and rtables.
    l_key = cm.get_key(ltable)
    r_key = cm.get_key(rtable)

    # Check if the fk_ltable starts with the given prefix; if not, it is
    # not an error. Just raise a warning.
    if not fk_ltable.startswith(l_prefix):
        logger.warning(
            'Foreign key for ltable does not start with the given prefix ('
            '%s)', l_prefix)

    # Check if the fk_rtable starts with the given prefix; if not, it is
    # not an error. Just raise a warning.
    if not fk_rtable.startswith(r_prefix):
        logger.warning(
            'Foreign key for rtable does not start with the given prefix ('
            '%s)', r_prefix)

    # Initialize lists
    # # keep track of projected tuple pair ids
    tuple_pair_ids = []
    # # keep track of output attributes from the left table
    l_output_attrs = []
    # # keep track of output attributes from the right table
    r_output_attrs = []

    # for each DataFrame in the given list, project out tuple pair ids, get the
    #  attributes from the ltable and rtable
    for data_frame in blocker_output_list:
        # Project out the tuple pair ids. A tuple pair id is a fk_ltable,
        # fk_rtable pair
        projected_tuple_pair_ids = data_frame[[fk_ltable, fk_rtable]]
        # Update the list that tracks tuple pair ids
        tuple_pair_ids.append(projected_tuple_pair_ids)

        # Get the columns, which should be segregated into the attributes
        # from the ltable and rtable
        col_set = (gh.list_diff(list(data_frame.columns),
                                [fk_ltable, fk_rtable,
                                 cm.get_key(data_frame)]))

        # Segregate the columns as attributes from the ltable and rtable
        l_attrs, r_attrs = _lr_cols(col_set, l_prefix, r_prefix)

        # Update the l_output_attrs, r_output_attrs
        l_output_attrs.extend(l_attrs)
        # we use extend (not append) because l_attrs is a list
        r_output_attrs.extend(r_attrs)

    ch.log_info(
        logger, 'Concatenating the tuple pair ids across given '
        'blockers ...', verbose)

    # concatenate the tuple pair ids from the list of input DataFrames
    concatenated_tuple_pair_ids = pd.concat(tuple_pair_ids)

    ch.log_info(logger, 'Concatenating the tuple pair ids ... DONE', verbose)
    ch.log_info(logger, 'Deduplicating the tuple pair ids ...', verbose)

    # Deduplicate the DataFrame. Now the returned DataFrame will contain
    # unique tuple pair ids.

    # noinspection PyUnresolvedReferences
    deduplicated_tuple_pair_ids = concatenated_tuple_pair_ids.drop_duplicates()

    ch.log_info(logger, 'Deduplicating the tuple pair ids ... DONE', verbose)

    # Construct output table
    # # Get unique list of attributes across different tables
    l_output_attrs = gh.list_drop_duplicates(l_output_attrs)
    r_output_attrs = gh.list_drop_duplicates(r_output_attrs)

    # Reset the index that might have lingered from concatenation.
    deduplicated_tuple_pair_ids.reset_index(inplace=True, drop=True)

    # Add the output attributes from the ltable and rtable.
    # NOTE: This approach may be inefficient as it probes the ltable and rtable
    # to get the attribute values. A better way would be to fill the
    # attribute values from the input list of DataFrames. These attribute
    # values could be harvested (at the expense of some space) while we iterate
    # over the input blocker output list for the first time.

    # noinspection PyProtectedMember
    consolidated_data_frame = gh._add_output_attributes(
        deduplicated_tuple_pair_ids,
        fk_ltable,
        fk_rtable,
        ltable,
        rtable,
        l_key,
        r_key,
        l_output_attrs,
        r_output_attrs,
        l_prefix,
        r_prefix,
        validate=False)
    # Sort the DataFrame ordered by fk_ltable and fk_rtable.
    # The function "sort" is deprecated in newer versions of pandas and
    # replaced by the 'sort_values' function. So we first try to use
    # sort_values and, if that fails, fall back to sort.
    try:
        consolidated_data_frame.sort_values([fk_ltable, fk_rtable],
                                            inplace=True)
    except AttributeError:
        consolidated_data_frame.sort([fk_ltable, fk_rtable], inplace=True)

    # update the catalog for the consolidated DataFrame
    # First get a column name for the key
    key = ch.get_name_for_key(consolidated_data_frame.columns)
    # Second, add the column name as the key
    consolidated_data_frame = ch.add_key_column(consolidated_data_frame, key)
    # Third, reset the index to remove any out-of-order index values from
    # the sort.
    consolidated_data_frame.reset_index(inplace=True, drop=True)
    # Finally, set the properties for the consolidated DataFrame in the catalog
    cm.set_candset_properties(consolidated_data_frame, key, fk_ltable,
                              fk_rtable, ltable, rtable)

    # Return the consolidated DataFrame
    return consolidated_data_frame
Example 15
    def block_tables(self, ltable, rtable, l_output_attrs=None, r_output_attrs=None,
                     l_output_prefix='ltable_', r_output_prefix='rtable_',
                     verbose=True, show_progress=True):

        # validate black box function
        assert self.black_box_function is not None, 'Black box function is not set'

        # validate input parameters
        self.validate_output_attrs(ltable, rtable, l_output_attrs,
                                   r_output_attrs)

        # get and validate metadata
        log_info(logger, 'Required metadata: ltable key, rtable key', verbose)

        # # get metadata
        l_key, r_key = cm.get_keys_for_ltable_rtable(ltable, rtable, logger, verbose)

        # # validate metadata
        cm.validate_metadata_for_table(ltable, l_key, 'ltable', logger, verbose)
        cm.validate_metadata_for_table(rtable, r_key, 'rtable', logger, verbose)

        # do blocking

        # # initialize progress bar
        if show_progress:
            bar = pyprind.ProgBar(len(ltable)*len(rtable))

        # # list to keep track the tuple pairs that survive blocking
        valid = []

        # # set index for convenience
        l_df = ltable.set_index(l_key, drop=False)
        r_df = rtable.set_index(r_key, drop=False)

        # # create look up table for faster processing
        l_dict = {}
        for k, r in l_df.iterrows():
            l_dict[k] = r

        r_dict = {}
        for k, r in r_df.iterrows():
            r_dict[k] = r

        # # get the position of the id attribute in the tables
        l_id_pos = list(ltable.columns).index(l_key)
        r_id_pos = list(rtable.columns).index(r_key)

        # # iterate through the tuples and apply the black box function
        for l_t in ltable.itertuples(index=False):
            ltuple = l_dict[l_t[l_id_pos]]
            for r_t in rtable.itertuples(index=False):
                # # update the progress bar
                if show_progress:
                    bar.update()

                rtuple = r_dict[r_t[r_id_pos]]

                res = self.black_box_function(ltuple, rtuple)

                if res is not True:
                    d = OrderedDict()

                    # # add ltable and rtable ids
                    ltable_id = l_output_prefix + l_key
                    rtable_id = r_output_prefix + r_key

                    d[ltable_id] = ltuple[l_key]
                    d[rtable_id] = rtuple[r_key]

                    # # add l/r output attributes
                    if l_output_attrs:
                        l_out = ltuple[l_output_attrs]
                        l_out.index = l_output_prefix + l_out.index
                        d.update(l_out)

                    if r_output_attrs:
                        r_out = rtuple[r_output_attrs]
                        r_out.index = r_output_prefix + r_out.index
                        d.update(r_out)

                    # # add the ordered dict to the list
                    valid.append(d)

        # construct output table
        candset = pd.DataFrame(valid)
        l_output_attrs = self.process_output_attrs(ltable, l_key, l_output_attrs, l_output_prefix)
        r_output_attrs = self.process_output_attrs(rtable, r_key, r_output_attrs, r_output_prefix)

        retain_cols = self.get_attrs_to_retain(l_key, r_key, l_output_attrs, r_output_attrs,
                                               l_output_prefix, r_output_prefix)

        if len(candset) > 0:
            candset = candset[retain_cols]
        else:
            candset = pd.DataFrame(columns=retain_cols)

        # update catalog
        key = get_name_for_key(candset.columns)
        candset = add_key_column(candset, key)
        cm.set_candset_properties(candset, key, l_output_prefix+l_key, r_output_prefix+r_key, ltable, rtable)

        # return candidate set
        return candset
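
A hedged usage sketch for the black-box blocker (BlackBoxBlocker and set_black_box_function are assumed from the Magellan/py_entitymatching-style API; the 'name' attribute is hypothetical). Note the convention in the code above: the black-box function returns True when a pair should be blocked, i.e. dropped.

def last_token_mismatch(ltuple, rtuple):
    # Drop the pair when the last tokens of the two names differ.
    return ltuple['name'].split()[-1] != rtuple['name'].split()[-1]

bb = em.BlackBoxBlocker()
bb.set_black_box_function(last_token_mismatch)
C = bb.block_tables(A, B, l_output_attrs=['name'], r_output_attrs=['name'])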
Example 16
    def block_tables(self,
                     ltable,
                     rtable,
                     l_output_attrs=None,
                     r_output_attrs=None,
                     l_output_prefix='ltable_',
                     r_output_prefix='rtable_',
                     verbose=True,
                     show_progress=True):
        # validate rules
        assert len(self.rules.keys()) > 0, 'There are no rules to apply'

        # validate input parameters
        self.validate_output_attrs(ltable, rtable, l_output_attrs,
                                   r_output_attrs)

        # get and validate metadata
        log_info(logger, 'Required metadata: ltable key, rtable key', verbose)

        # # get metadata
        l_key, r_key = cm.get_keys_for_ltable_rtable(ltable, rtable, logger,
                                                     verbose)

        # # validate metadata
        cm.validate_metadata_for_table(ltable, l_key, 'ltable', logger,
                                       verbose)
        cm.validate_metadata_for_table(rtable, r_key, 'rtable', logger,
                                       verbose)

        # do blocking

        # # initialize progress bar
        if show_progress:
            bar = pyprind.ProgBar(len(ltable) * len(rtable))

        # # list to keep track of the tuple pairs that survive blocking
        valid = []

        #  # set index for convenience
        l_df = ltable.set_index(l_key, drop=False)
        r_df = rtable.set_index(r_key, drop=False)

        # # create look up table for faster processing
        l_dict = {}
        for k, r in l_df.iterrows():
            l_dict[k] = r

        r_dict = {}
        for k, r in r_df.iterrows():
            r_dict[k] = r

        # # get the position of the id attributes in the tables
        l_id_pos = list(ltable.columns).index(l_key)
        r_id_pos = list(rtable.columns).index(r_key)

        # # iterate through the tuples and apply the rules
        for l_t in ltable.itertuples(index=False):
            ltuple = l_dict[l_t[l_id_pos]]
            for r_t in rtable.itertuples(index=False):
                # # update the progress bar
                if show_progress:
                    bar.update()

                rtuple = r_dict[r_t[r_id_pos]]
                res = self.apply_rules(ltuple, rtuple)

                if res is not True:
                    d = OrderedDict()
                    # # add ltable and rtable ids
                    ltable_id = l_output_prefix + l_key
                    rtable_id = r_output_prefix + r_key

                    d[ltable_id] = ltuple[l_key]
                    d[rtable_id] = rtuple[r_key]

                    # # add l/r output attributes
                    if l_output_attrs:
                        l_out = ltuple[l_output_attrs]
                        l_out.index = l_output_prefix + l_out.index
                        d.update(l_out)

                    if r_output_attrs:
                        r_out = rtuple[r_output_attrs]
                        r_out.index = r_output_prefix + r_out.index
                        d.update(r_out)

                    # # add the ordered dict to the list
                    valid.append(d)

        # construct output table
        candset = pd.DataFrame(valid)
        l_output_attrs = self.process_output_attrs(ltable, l_key,
                                                   l_output_attrs,
                                                   l_output_prefix)
        r_output_attrs = self.process_output_attrs(rtable, r_key,
                                                   r_output_attrs,
                                                   r_output_prefix)

        retain_cols = self.get_attrs_to_retain(l_key, r_key, l_output_attrs,
                                               r_output_attrs, l_output_prefix,
                                               r_output_prefix)

        if len(candset) > 0:
            candset = candset[retain_cols]
        else:
            candset = pd.DataFrame(columns=retain_cols)

        # update catalog
        key = get_name_for_key(candset.columns)
        candset = add_key_column(candset, key)
        cm.set_candset_properties(candset, key, l_output_prefix + l_key,
                                  r_output_prefix + r_key, ltable, rtable)

        # return candidate set
        return candset
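
A hedged usage sketch for the rule-based blocker (add_rule and get_features_for_blocking are assumed from the Magellan/py_entitymatching-style API; the feature name is hypothetical). As in the code above, a pair is dropped when a rule evaluates to True:

# Generate a feature table, add one rule, and block: drop pairs with low
# name similarity.
block_f = em.get_features_for_blocking(A, B)
rb = em.RuleBasedBlocker()
rb.add_rule(['name_name_lev_sim(ltuple, rtuple) < 0.4'], block_f)
C = rb.block_tables(A, B, l_output_attrs=['name'], r_output_attrs=['name'])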
Example 17
    def block_candset(self, candset, verbose=True, show_progress=True):

        # validate rules
        assert len(self.rules.keys()) > 0, 'There are no rules to apply'

        # get and validate metadata
        log_info(
            logger, 'Required metadata: cand.set key, fk ltable, fk rtable, '
            'ltable, rtable, ltable key, rtable key', verbose)

        # # get metadata
        key, fk_ltable, fk_rtable, ltable, rtable, l_key, r_key = cm.get_metadata_for_candset(
            candset, logger, verbose)

        # # validate metadata
        cm.validate_metadata_for_candset(candset, key, fk_ltable, fk_rtable,
                                         ltable, rtable, l_key, r_key, logger,
                                         verbose)

        # do blocking

        # # initialize the progress bar
        if show_progress:
            bar = pyprind.ProgBar(len(candset))

        # # set index for convenience
        l_df = ltable.set_index(l_key, drop=False)
        r_df = rtable.set_index(r_key, drop=False)

        # # create lookup table for faster processing
        l_dict = {}
        for k, r in l_df.iterrows():
            l_dict[k] = r

        r_dict = {}
        for k, r in r_df.iterrows():
            r_dict[k] = r

        # # list to keep track of valid ids
        valid = []
        l_id_pos = list(candset.columns).index(fk_ltable)
        r_id_pos = list(candset.columns).index(fk_rtable)

        # # iterate candidate set
        for row in candset.itertuples(index=False):
            # # update progress bar
            if show_progress:
                bar.update()

            ltuple = l_dict[row[l_id_pos]]
            rtuple = r_dict[row[r_id_pos]]

            res = self.apply_rules(ltuple, rtuple)
            # a pair survives when no rule fired (apply_rules did not return True)
            valid.append(res is not True)

        # construct output table
        if len(candset) > 0:
            candset = candset[valid]
        else:
            candset = pd.DataFrame(columns=candset.columns)

        # update catalog
        cm.set_candset_properties(candset, key, fk_ltable, fk_rtable, ltable,
                                  rtable)

        # return candidate set
        return candset
Example 18
def sample_table(table, sample_size, replace=False, verbose=False):
    """
    Sample a pandas DataFrame (for labeling purposes).

    This function samples a DataFrame, typically used for labeling
    purposes. This function expects the input DataFrame to contain the
    metadata of a candidate set (such as key, fk_ltable, fk_rtable, ltable,
    rtable). Specifically, this function creates a copy of the input
    DataFrame, samples the data using uniform random sampling (using numpy's
    random choice to sample) and returns the sampled DataFrame. Further, it
    also copies the properties from the input DataFrame to the output
    DataFrame.

    Args:
        table (DataFrame): Input DataFrame to be sampled. Specifically,
            a DataFrame containing the metadata of a candidate set (such as
            key, fk_ltable, fk_rtable, ltable, rtable) in the catalog.
        sample_size (int): Number of samples to be picked up from the input
            DataFrame.
        replace (boolean): Flag to indicate whether sampling should be done
            with replacement or not (default value is False).
        verbose (boolean): Flag to indicate whether more detailed information
            about the execution steps should be printed out (default value is
            False).

    Returns:
        A new DataFrame with 'sample_size' number of rows. Further,
        this function sets the output DataFrame's properties same as input
        DataFrame.

    Raises:
        AssertionError: If the input table is not of type pandas DataFrame.
        AssertionError: If the input DataFrame size is 0.
        AssertionError: If the sample_size is greater than the input
            DataFrame size.

    Notes:
        As mentioned in the above description, the output DataFrame is
        updated (in the catalog) with the properties from the input
        DataFrame. A subtle point to note here is that when the replace flag
        is set to True, the output DataFrame can contain duplicate keys.
        In that case, this function will not set the key, and it is up to
        the user to fix it after the function returns.
    """
    # Validate input parameters.

    # # The input DataFrame is expected to be of type pandas DataFrame.
    if not isinstance(table, pd.DataFrame):
        logger.error('Input table is not of type pandas dataframe')
        raise AssertionError('Input table is not of type pandas dataframe')

    # # There should be at least one row to sample from
    if len(table) == 0:
        logger.error('Size of the input table is 0')
        raise AssertionError('Size of the input table is 0')

    # # The sample size should be less than or equal to the number of rows in
    #  the input DataFrame
    if len(table) < sample_size:
        logger.error('Sample size is larger than the input table size')
        raise AssertionError('Sample size is larger than the input table size')

    # Now, validate the metadata for the input DataFrame as we have to copy
    # these properties to the output DataFrame

    # # First, display what metadata is required for this function
    ch.log_info(
        logger, 'Required metadata: cand.set key, fk ltable, '
        'fk rtable, ltable, rtable, ltable key, rtable key', verbose)

    # # Second, get the metadata
    key, fk_ltable, fk_rtable, ltable, rtable, l_key, r_key = \
        cm.get_metadata_for_candset(table, logger, verbose)

    # # Third, validate the metadata
    cm.validate_metadata_for_candset(table, key, fk_ltable, fk_rtable, ltable,
                                     rtable, l_key, r_key, logger, verbose)

    # Get the sample set for the output table. Note: `pd.np` is removed in
    # recent pandas; use numpy directly (assumes `import numpy as np`).
    sample_indices = np.random.choice(len(table), sample_size,
                                      replace=replace)
    # Sort the sampled indices in ascending order
    sample_indices = sorted(sample_indices)
    sampled_table = table.iloc[sample_indices]

    # Copy the properties
    cm.init_properties(sampled_table)

    # # If replace is set to True, then we should check the validity of the
    # # key before setting it
    if replace:
        properties = cm.get_all_properties(table)
        for property_name, property_value in six.iteritems(properties):
            if property_name == 'key':
                # Check for the validity of key before setting it
                cm.set_key(sampled_table, property_value)
            else:
                # Copy the other properties as is
                cm.set_property(sampled_table, property_name, property_value)
    else:
        cm.copy_properties(table, sampled_table)

    # Return the sampled table
    return sampled_table
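A brief usage sketch (the candidate set `C`, with its metadata registered in the catalog, is an assumption):

# Draw 100 rows uniformly at random from candidate set C; the catalog
# properties are copied onto the sample automatically.
S = sample_table(C, sample_size=100)

# With replacement the sample may contain duplicate keys, in which case
# the key may not be set on the output (see the Notes section above).
S_dup = sample_table(C, sample_size=100, replace=True)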
Example no. 21
    def block_tables(self, ltable, rtable, l_overlap_attr, r_overlap_attr,
                     rem_stop_words=False, q_val=None, word_level=True, overlap_size=1,
                     l_output_attrs=None, r_output_attrs=None,
                     l_output_prefix='ltable_', r_output_prefix='rtable_',
                     verbose=True, show_progress=True):
        # validations
        self.validate_overlap_attrs(ltable, rtable, l_overlap_attr, r_overlap_attr)
        self.validate_output_attrs(ltable, rtable, l_output_attrs, r_output_attrs)

        # required metadata; keys from ltable and rtable
        log_info(logger, 'Required metadata: ltable key, rtable key', verbose)

        # get metadata
        l_key, r_key = cm.get_keys_for_ltable_rtable(ltable, rtable, logger, verbose)

        # do blocking

        if word_level and q_val is not None:
            raise SyntaxError('Parameters word_level and q_val cannot be set together; note that word_level is '
                              'set to True by default, so explicitly set word_level=False to use q-grams with the '
                              'specified q_val')

        # # remove nans
        l_df = rem_nan(ltable, l_overlap_attr)
        r_df = rem_nan(rtable, r_overlap_attr)

        # #reset indexes in the dataframe
        l_df.reset_index(inplace=True, drop=True)
        r_df.reset_index(inplace=True, drop=True)

        # # create a dummy join column (with all values set to 1) whose name
        # # should not occur among the existing column names
        l_dummy_col_name = self.get_dummy_col_name(l_df.columns)
        r_dummy_col_name = self.get_dummy_col_name(r_df.columns)
        l_df[l_dummy_col_name] = 1
        r_df[r_dummy_col_name] = 1

        # # cast the column to string if required.
        if l_df.dtypes[l_overlap_attr] != object:
            logger.warning('Left overlap attribute is not of type string; converting to string temporarily')
            l_df[l_overlap_attr] = l_df[l_overlap_attr].astype(str)

        if r_df.dtypes[r_overlap_attr] != object:
            logger.warning('Right overlap attribute is not of type string; converting to string temporarily')
            r_df[r_overlap_attr] = r_df[r_overlap_attr].astype(str)

        l_dict = {}
        r_dict = {}

        # #create a lookup table for quick access
        for k, r in l_df.iterrows():
            l_dict[k] = r

        for k, r in r_df.iterrows():
            r_dict[k] = r

        l_colvalues_chopped = self.process_table(l_df, l_overlap_attr, q_val, rem_stop_words)
        appended_l_colidx_values = [self.append_index_values(vals, idx)
                                    for idx, vals in enumerate(l_colvalues_chopped)]

        inv_idx = {}
        for c in appended_l_colidx_values:
            for t in c:
                self.compute_inv_index(t, inv_idx)

        r_colvalues_chopped = self.process_table(r_df, r_overlap_attr, q_val, rem_stop_words)
        r_idx = 0

        white_list = []
        if show_progress:
            bar = pyprind.ProgBar(len(r_colvalues_chopped))

        df_list = []
        for col_values in r_colvalues_chopped:
            if show_progress:
                bar.update()

            qualifying_ltable_indices = self.get_potential_match_indices(col_values, inv_idx, overlap_size)
            r_row = r_dict[r_idx]
            r_row_dict = r_row.to_frame().T

            l_rows_dict = l_df.iloc[qualifying_ltable_indices]
            df = l_rows_dict.merge(r_row_dict, left_on=l_dummy_col_name, right_on=r_dummy_col_name,
                                   suffixes=('_ltable', '_rtable'))

            if len(df) > 0:
                df_list.append(df)
            r_idx += 1

        # Construct the output table; pd.concat raises on an empty list, so
        # guard against the case where no pairs survived blocking
        if df_list:
            candset = pd.concat(df_list)
        else:
            candset = pd.DataFrame()
        l_output_attrs = self.process_output_attrs(ltable, l_key, l_output_attrs, 'left')
        r_output_attrs = self.process_output_attrs(rtable, r_key, r_output_attrs, 'right')

        retain_cols, final_cols = self.output_columns(l_key, r_key, list(candset.columns),
                                                      l_output_attrs, r_output_attrs,
                                                      l_output_prefix, r_output_prefix)

        if len(candset) > 0:
            candset = candset[retain_cols]
            candset.columns = final_cols
        else:
            candset = pd.DataFrame(columns=final_cols)

        # Update metadata in the catalog
        key = get_name_for_key(candset.columns)
        candset = add_key_column(candset, key)
        cm.set_candset_properties(candset, key, l_output_prefix + l_key, r_output_prefix + r_key, ltable, rtable)

        # return the candidate set
        return candset
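The core of this blocker is a token-level inverted index over the left table: each token maps to the left row indices that contain it, and each right tuple probes the index, keeping left rows that share at least `overlap_size` tokens. A simplified standalone sketch of that idea (plain dict/Counter version, not the actual `compute_inv_index`/`get_potential_match_indices` internals):

from collections import Counter

def build_inverted_index(token_sets):
    # token_sets[i] holds the tokens of left row i
    inv_idx = {}
    for row_idx, tokens in enumerate(token_sets):
        for tok in set(tokens):
            inv_idx.setdefault(tok, []).append(row_idx)
    return inv_idx

def probe(tokens, inv_idx, overlap_size):
    # count, per left row, how many probe tokens it shares, and keep
    # the rows that reach the overlap threshold
    counts = Counter()
    for tok in set(tokens):
        counts.update(inv_idx.get(tok, []))
    return [row_idx for row_idx, c in counts.items() if c >= overlap_size]

left_tokens = [{'apple', 'inc'}, {'apple', 'pie'}, {'orange', 'inc'}]
idx = build_inverted_index(left_tokens)
print(probe({'apple', 'inc'}, idx, overlap_size=2))  # -> [0]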
Example no. 22
    def block_candset(self,
                      candset,
                      l_block_attr,
                      r_block_attr,
                      verbose=True,
                      show_progress=True):

        self.validate_types_candset(candset, l_block_attr, r_block_attr,
                                    verbose, show_progress)
        # get and validate metadata
        log_info(
            logger, 'Required metadata: cand.set key, fk ltable, ' +
            'fk rtable, ltable, rtable, ltable key, rtable key', verbose)

        # # get metadata
        key, fk_ltable, fk_rtable, ltable, rtable, l_key, r_key = cm.get_metadata_for_candset(
            candset, logger, verbose)

        # # validate metadata
        cm.validate_metadata_for_candset(candset, key, fk_ltable, fk_rtable,
                                         ltable, rtable, l_key, r_key, logger,
                                         verbose)

        # validate input parameters
        self.validate_block_attrs(ltable, rtable, l_block_attr, r_block_attr)

        # do blocking

        # # initialize progress bar
        if show_progress:
            prog_bar = pyprind.ProgBar(len(candset))

        # # initialize list to keep track of valid ids
        valid = []

        # # set index for convenience
        l_df = ltable.set_index(l_key, drop=False)
        r_df = rtable.set_index(r_key, drop=False)

        # # get the indexes for the key attributes in the candset
        col_names = list(candset.columns)
        lkey_idx = col_names.index(fk_ltable)
        rkey_idx = col_names.index(fk_rtable)

        # # create lookup tables to cache the blocking attribute values
        l_dict = {}
        r_dict = {}

        # # iterate the rows in candset
        for row in candset.itertuples(index=False):

            # # update the progress bar
            if show_progress:
                prog_bar.update()

            # # get the values of the block attributes, fetching each
            # # distinct key at most once (.loc replaces the removed .ix)
            row_lkey = row[lkey_idx]
            if row_lkey not in l_dict:
                l_dict[row_lkey] = l_df.loc[row_lkey, l_block_attr]
            l_val = l_dict[row_lkey]

            row_rkey = row[rkey_idx]
            if row_rkey not in r_dict:
                r_dict[row_rkey] = r_df.loc[row_rkey, r_block_attr]
            r_val = r_dict[row_rkey]

            valid.append(l_val == r_val)

        # construct output table
        if len(candset) > 0:
            out_table = candset[valid]
        else:
            out_table = pd.DataFrame(columns=candset.columns)

        # update the catalog
        cm.set_candset_properties(out_table, key, fk_ltable, fk_rtable, ltable,
                                  rtable)

        # return the output table
        return out_table
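Worth noting above is the memoization pattern: each block-attribute value is fetched from the indexed DataFrame at most once per distinct key, since a candidate set typically references the same base tuples many times. A brief usage sketch (the blocker class name and the candidate-set variable `C` are assumptions):

# Hypothetical usage: refine an existing candidate set C so that only
# pairs agreeing on 'zipcode' survive.
ab = AttrEquivalenceBlocker()  # assumed blocker class exposing this method
C2 = ab.block_candset(C, l_block_attr='zipcode', r_block_attr='zipcode')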
Example no. 24
def extract_feature_vecs(candset,
                         attrs_before=None,
                         feature_table=None,
                         attrs_after=None,
                         verbose=True):

    if not isinstance(candset, pd.DataFrame):
        logger.error('Input cand.set is not of type dataframe')
        raise AssertionError('Input cand.set is not of type dataframe')

    # validate input parameters
    if attrs_before is not None:
        if not check_attrs_present(candset, attrs_before):
            logger.error('The attributes mentioned in attrs_before are not '
                         'present in the input table')
            raise AssertionError('The attributes mentioned in attrs_before '
                                 'are not present in the input table')

    if attrs_after is not None:
        if not check_attrs_present(candset, attrs_after):
            logger.error('The attributes mentioned in attrs_after are not '
                         'present in the input table')
            raise AssertionError('The attributes mentioned in attrs_after '
                                 'are not present in the input table')

    if feature_table is None:
        logger.error('The feature table cannot be null')
        raise AssertionError('The feature table cannot be null')

    log_info(
        logger, 'Required metadata: cand.set key, fk ltable, fk rtable, '
        'ltable, rtable, ltable key, rtable key', verbose)

    # # get metadata
    key, fk_ltable, fk_rtable, ltable, rtable, l_key, r_key = cm.get_metadata_for_candset(
        candset, logger, verbose)

    # # validate metadata
    cm.validate_metadata_for_candset(candset, key, fk_ltable, fk_rtable,
                                     ltable, rtable, l_key, r_key, logger,
                                     verbose)

    # extract features
    id_list = [(r[fk_ltable], r[fk_rtable]) for i, r in candset.iterrows()]

    # # set index for convenience
    l_df = ltable.set_index(l_key, drop=False)
    r_df = rtable.set_index(r_key, drop=False)

    # # apply feature functions (.loc replaces the removed .ix indexer)
    feat_vals = [
        apply_feat_fns(l_df.loc[x[0]], r_df.loc[x[1]], feature_table)
        for x in id_list
    ]

    # construct output table
    table = pd.DataFrame(feat_vals)

    # # rearrange the feature names in the given order
    feat_names = list(feature_table['feature_name'])
    table = table[feat_names]

    # # insert attrs_before
    if attrs_before:
        if not isinstance(attrs_before, list):
            attrs_before = [attrs_before]
        attrs_before = list_diff(attrs_before, [key, fk_ltable, fk_rtable])
        attrs_before.reverse()
        for a in attrs_before:
            table.insert(0, a, candset[a])

    # # insert keys
    table.insert(0, fk_rtable, candset[fk_rtable])
    table.insert(0, fk_ltable, candset[fk_ltable])
    table.insert(0, key, candset[key])

    # # insert attrs after
    if attrs_after:
        if not isinstance(attrs_after, list):
            attrs_after = [attrs_after]
        attrs_after = list_diff(attrs_after, [key, fk_ltable, fk_rtable])
        attrs_after.reverse()
        col_pos = len(table.columns)
        for a in attrs_after:
            table.insert(col_pos, a, candset[a])
            col_pos += 1

    # reset the index
    table.reset_index(inplace=True, drop=True)

    # # update the catalog
    cm.init_properties(table)
    cm.copy_properties(candset, table)

    return table
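A hedged sketch of what a feature table might look like. The code above only requires a 'feature_name' column; that `apply_feat_fns` evaluates a callable stored in a 'function' column on each (ltuple, rtuple) pair is an assumption, as is the Jaccard feature itself:

import pandas as pd

def name_name_jac(ltuple, rtuple):
    # Hypothetical feature: Jaccard similarity over word tokens of 'name'.
    l_toks = set(str(ltuple['name']).lower().split())
    r_toks = set(str(rtuple['name']).lower().split())
    if not l_toks or not r_toks:
        return 0.0
    return len(l_toks & r_toks) / len(l_toks | r_toks)

feature_table = pd.DataFrame({'feature_name': ['name_name_jac'],
                              'function': [name_name_jac]})

# F = extract_feature_vecs(C, feature_table=feature_table,
#                          attrs_after=['label'])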
Example no. 26
    def block_candset(self, candset, l_overlap_attr, r_overlap_attr,
                      rem_stop_words=False, q_val=None, word_level=True, overlap_size=1,
                      verbose=True, show_progress=True):

        # get and validate metadata
        log_info(logger, 'Required metadata: cand.set key, fk ltable, fk rtable, '
                         'ltable, rtable, ltable key, rtable key', verbose)

        # # get metadata
        key, fk_ltable, fk_rtable, ltable, rtable, l_key, r_key = cm.get_metadata_for_candset(candset, logger, verbose)

        # # validate metadata
        cm.validate_metadata_for_candset(candset, key, fk_ltable, fk_rtable, ltable, rtable, l_key, r_key,
                                         logger, verbose)

        # validate overlap attrs
        self.validate_overlap_attrs(ltable, rtable, l_overlap_attr, r_overlap_attr)

        # do blocking
        # # initialize the progress bar

        if show_progress:
            bar = pyprind.ProgBar(len(candset))

        # # set the index for convenience
        l_df = ltable.set_index(l_key, drop=False)
        r_df = rtable.set_index(r_key, drop=False)

        # # create lookup table for faster processing
        l_dict = {}
        for k, r in l_df.iterrows():
            l_dict[k] = r

        r_dict = {}
        for k, r in r_df.iterrows():
            r_dict[k] = r

        # # list to keep track of valid ids
        valid = []
        l_id_pos = list(candset.columns).index(fk_ltable)
        r_id_pos = list(candset.columns).index(fk_rtable)

        # # iterate candset
        for row in candset.itertuples(index=False):
            # # update progress bar
            if show_progress:
                bar.update()

            ltuple = l_dict[row[l_id_pos]]
            rtuple = r_dict[row[r_id_pos]]

            num_overlap = self.get_token_overlap_bt_two_tuples(ltuple, rtuple,
                                                               l_overlap_attr, r_overlap_attr,
                                                               q_val, rem_stop_words)

            if num_overlap >= overlap_size:
                valid.append(True)
            else:
                valid.append(False)

        if len(candset) > 0:
            candset = candset[valid]
        else:
            candset = pd.DataFrame(columns=candset.columns)

        # update catalog
        cm.set_candset_properties(candset, key, fk_ltable, fk_rtable, ltable, rtable)

        # return candidate set
        return candset
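The overlap test above reduces to tokenizing both attribute values, either word-level or into q-grams, and comparing the size of the token-set intersection against `overlap_size`. A simplified standalone version of that computation (not the blocker's actual `get_token_overlap_bt_two_tuples`; the stop word list is illustrative):

def tokenize(value, q_val=None, rem_stop_words=False):
    # word-level tokens by default; q-grams when q_val is given
    s = str(value).lower()
    if q_val is None:
        tokens = s.split()
        if rem_stop_words:
            stop_words = {'a', 'an', 'the'}  # illustrative stop word list
            tokens = [t for t in tokens if t not in stop_words]
        return set(tokens)
    return {s[i:i + q_val] for i in range(len(s) - q_val + 1)}

def token_overlap(l_val, r_val, q_val=None, rem_stop_words=False):
    return len(tokenize(l_val, q_val, rem_stop_words) &
               tokenize(r_val, q_val, rem_stop_words))

print(token_overlap('Apple Inc', 'Apple Incorporated'))  # 1 ('apple')
print(token_overlap('smith', 'smyth', q_val=2))          # 2 ('sm', 'th')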
Example no. 27
    def block_candset(self, candset, verbose=True, show_progress=True):

        # validate black box function
        assert self.black_box_function is not None, 'Black box function is not set'

        # get and validate metadata
        log_info(logger, 'Required metadata: cand.set key, fk ltable, fk rtable, '
                                'ltable, rtable, ltable key, rtable key', verbose)

        # # get metadata
        key, fk_ltable, fk_rtable, ltable, rtable, l_key, r_key = cm.get_metadata_for_candset(candset, logger, verbose)

        # # validate metadata
        cm.validate_metadata_for_candset(candset, key, fk_ltable, fk_rtable, ltable, rtable, l_key, r_key,
                                         logger, verbose)

        # do blocking

        # # initialize the progress bar
        if show_progress:
            bar = pyprind.ProgBar(len(candset))

        # # set index for convenience
        l_df = ltable.set_index(l_key, drop=False)
        r_df = rtable.set_index(r_key, drop=False)

        # # create lookup table for faster processing
        l_dict = {}
        for k, r in l_df.iterrows():
            l_dict[k] = r

        r_dict = {}
        for k, r in r_df.iterrows():
            r_dict[k] = r

        # # list to keep track of valid ids
        valid = []
        l_id_pos = list(candset.columns).index(fk_ltable)
        r_id_pos = list(candset.columns).index(fk_rtable)

        # # iterate candidate set
        for row in candset.itertuples(index=False):
            # # update progress bar
            if show_progress:
                bar.update()

            ltuple = l_dict[row[l_id_pos]]
            rtuple = r_dict[row[r_id_pos]]

            res = self.black_box_function(ltuple, rtuple)
            # keep the pair only if the black box function did not return
            # True (returning True means drop the pair)
            valid.append(res is not True)

        # construct output table
        if len(candset) > 0:
            candset = candset[valid]
        else:
            candset = pd.DataFrame(columns=candset.columns)

        # update catalog
        cm.set_candset_properties(candset, key, fk_ltable, fk_rtable, ltable, rtable)

        # return candidate set
        return candset
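A black-box blocking function, as consumed above, takes a left tuple and a right tuple (pandas Series) and returns True when the pair should be dropped. A minimal example (the function body and the registration shown in comments are assumptions; only the call signature follows from the code above):

def zipcode_mismatch(ltuple, rtuple):
    # Return True to drop the pair, anything else to keep it.
    return ltuple['zipcode'] != rtuple['zipcode']

# bb = BlackBoxBlocker()                       # assumed blocker class
# bb.set_black_box_function(zipcode_mismatch)  # assumed setter
# C2 = bb.block_candset(C)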