def add_key_column(table, key):
    """Insert a new key column named `key` at position 0 of `table`.

    The column is filled with consecutive integers 0..len(table)-1.
    Note that the input dataframe is modified in place and also returned.
    """
    # Both inputs must be of the expected types before touching the table.
    validate_object_type(table, pd.DataFrame)
    validate_object_type(key, six.string_types, error_prefix='Input key')
    row_count = len(table)
    table.insert(0, key, range(0, row_count))
    return table
def validate_types_other_params(self, l_overlap_attr, r_overlap_attr,
                                rem_stop_words, q_val,
                                word_level, overlap_size):
    """Validate the types of the non-table parameters of overlap blocking.

    Raises:
        AssertionError: If any parameter is not of the expected type.
    """
    validate_object_type(l_overlap_attr, six.string_types,
                         error_prefix='Overlap attribute name of left table')
    validate_object_type(r_overlap_attr, six.string_types,
                         error_prefix='Overlap attribute name of right table')
    validate_object_type(rem_stop_words, bool,
                         error_prefix='Parameter rem_stop_words')
    # Fix: compare against None with identity (`is not None`), not `!=`,
    # per standard Python practice (PEP 8). q_val may legitimately be None.
    if q_val is not None and not isinstance(q_val, int):
        logger.error('Parameter q_val is not of type int')
        raise AssertionError('Parameter q_val is not of type int')
    validate_object_type(word_level, bool, error_prefix='Parameter word_level')
    validate_object_type(overlap_size, int,
                         error_prefix='Parameter overlap_size')
def does_contain_missing_vals(df, attr):
    """
    Check if the attribute contains missing values in the dataframe

    Args:
        df (pandas dataframe): Input dataframe
        attr (str): Attribute in the pandas dataframe

    Returns:
        result (bool). Returns True, if the attribute contains missing values,
        else returns False

    Notes:
        This is an internal helper function
    """
    validate_object_type(df, pd.DataFrame)
    validate_object_type(attr, six.string_types, error_prefix='Input attr.')
    # Idiom fix: any() already yields a bool, so return it directly instead
    # of the `if not flag: return False else: return True` detour.
    return any(pd.isnull(df[attr]))
def does_contain_missing_vals(df, attr):
    """
    Check whether column `attr` of `df` has at least one missing (null) value.

    Args:
        df (pandas dataframe): Input dataframe
        attr (str): Attribute in the pandas dataframe

    Returns:
        result (bool). True when the attribute contains missing values,
        False otherwise.

    Notes:
        This is an internal helper function
    """
    validate_object_type(df, pd.DataFrame)
    validate_object_type(attr, six.string_types, error_prefix='Input attr.')
    has_missing = any(pd.isnull(df[attr]))
    if has_missing:
        return True
    return False
def add_key_column(table, key):
    """
    Add a key column to the input table.

    Args:
        table (pandas dataframe): Input dataframe (modified in place)
        key (str): Name of the new key column

    Returns:
        The same dataframe, with the key column inserted at position 0.

    Notes:
        This is an internal helper function. The key values are the
        consecutive integers 0..len(table)-1.
    """
    validate_object_type(table, pd.DataFrame)
    validate_object_type(key, six.string_types, error_prefix='Input key')
    # Insert as the first column so the key is always leftmost.
    table.insert(0, key, range(0, len(table)))
    return table
def test_validate_object_type_with_invalid_type(self):
    # validate_object_type must raise AssertionError whenever the value's
    # type does not match the expected type. Use assertRaises as a context
    # manager rather than wrapping each call in a lambda.
    with self.assertRaises(AssertionError):
        vh.validate_object_type('ABC', int)
    with self.assertRaises(AssertionError):
        vh.validate_object_type(123, str)
    with self.assertRaises(AssertionError):
        vh.validate_object_type(list(), dict)
    with self.assertRaises(AssertionError):
        vh.validate_object_type(dict(), list)
def validate_types_block_attrs(self, l_block_attr, r_block_attr):
    """Check that both blocking attribute names are strings."""
    # Validate left then right, matching the original check order.
    checks = ((l_block_attr, 'Blocking attribute name of left table'),
              (r_block_attr, 'Blocking attribute name of right table'))
    for attr_name, prefix in checks:
        validate_object_type(attr_name, six.string_types, error_prefix=prefix)
def check_attrs_present(table, attrs):
    """
    Check whether the given attribute(s) are present in the dataframe.

    Args:
        table (pandas dataframe): Input dataframe
        attrs (str or list): Attribute name, or list of attribute names

    Returns:
        result (bool). True if all attributes are present, else False.
    """
    validate_object_type(table, pd.DataFrame)
    if attrs is None:
        logger.warning('Input attr. list is null')
        return False
    # Idiom fix: `isinstance(...) is False` replaced with `not isinstance(...)`.
    if not isinstance(attrs, list):
        attrs = [attrs]
    status = are_all_attrs_in_df(table, attrs, verbose=True)
    return status
def check_attrs_present(table, attrs):
    """
    Return True when every attribute in `attrs` exists in `table`.

    `attrs` may be a single attribute name or a list of names; a None
    value logs a warning and yields False.
    """
    validate_object_type(table, pd.DataFrame)
    # Guard clause: nothing to check.
    if attrs is None:
        logger.warning('Input attr. list is null')
        return False
    # Normalize a scalar attribute name into a one-element list.
    attr_list = attrs if isinstance(attrs, list) else [attrs]
    return are_all_attrs_in_df(table, attr_list, verbose=True)
def debug_decisiontree_matcher(decision_tree, tuple_1, tuple_2, feature_table,
                               table_columns, exclude_attrs=None):
    """
    This function is used to debug a decision tree matcher using two input
    tuples.

    Specifically, this function takes in two tuples, gets the feature vector
    using the feature table and finally passes it to the decision tree and
    displays the path that the feature vector takes in the decision tree.

    Args:
        decision_tree (DTMatcher): The input decision tree object that should
            be debugged.
        tuple_1,tuple_2 (Series): Input tuples that should be debugged.
        feature_table (DataFrame): Feature table containing the functions
            for the features.
        table_columns (list): List of all columns that will be outputted
            after generation of feature vectors.
        exclude_attrs (list): List of attributes that should be removed from
            the table columns.

    Raises:
        AssertionError: If the input feature table is not of type pandas
            DataFrame.

    Examples:
        >>> import py_entitymatching as em
        >>> # devel is the labeled data used for development purposes, match_f is the feature table
        >>> H = em.extract_feat_vecs(devel, feat_table=match_f, attrs_after='gold_labels')
        >>> dt = em.DTMatcher()
        >>> dt.fit(table=H, exclude_attrs=['_id', 'ltable_id', 'rtable_id', 'gold_labels'], target_attr='gold_labels')
        >>> # F is the feature vector got from evaluation set of the labeled data.
        >>> out = dt.predict(table=F, exclude_attrs=['_id', 'ltable_id', 'rtable_id', 'gold_labels'], target_attr='gold_labels')
        >>> # A and B are input tables
        >>> em.debug_decisiontree_matcher(dt, A.loc[1], B.loc[2], match_f, H.columns, exclude_attrs=['_id', 'ltable_id', 'rtable_id', 'gold_labels'], target_attr='gold_labels')
    """
    # Only the feature table's type is validated here; the remaining
    # arguments are checked downstream by the shared debugger helper.
    validate_object_type(feature_table, pd.DataFrame, 'The input feature table')
    # Delegate to the shared helper; ensemble_flag=False selects the
    # single-tree (non-ensemble) debugging code path.
    _debug_decisiontree_matcher(decision_tree, tuple_1, tuple_2, feature_table,
                                table_columns, exclude_attrs,
                                ensemble_flag=False)
def _validate_types(ltable, rtable, candidate_set, output_size,
                    attr_corres, verbose):
    """
    Validate the types of the inputs to the debug-blocker routine.

    Raises:
        AssertionError: If any input is not of the expected type.
    """
    validate_object_type(ltable, pd.DataFrame, 'Input left table')
    validate_object_type(rtable, pd.DataFrame, 'Input right table')
    validate_object_type(candidate_set, pd.DataFrame, 'Input candidate set')
    validate_object_type(output_size, int, 'Output size')
    if attr_corres is not None:
        if not isinstance(attr_corres, list):
            # Consistency fix: log via the module-level `logger` (the rest of
            # this function and its siblings do), not the root `logging`.
            logger.error('Input attribute correspondence is not of'
                         ' type list')
            raise AssertionError('Input attribute correspondence is'
                                 ' not of type list')
        for pair in attr_corres:
            if not isinstance(pair, tuple):
                logger.error('Pair in attribute correspondence list is not'
                             ' of type tuple')
                raise AssertionError('Pair in attribute correspondence list'
                                     ' is not of type tuple')
    if not isinstance(verbose, bool):
        logger.error('Parameter verbose is not of type bool')
        raise AssertionError('Parameter verbose is not of type bool')
def validate_types_other_params(self, l_overlap_attr, r_overlap_attr,
                                rem_stop_words, q_val,
                                word_level, overlap_size):
    """Validate the types of the non-table overlap-blocker parameters."""
    # Type checks before the q_val special case, in the original order.
    for value, expected_type, prefix in (
            (l_overlap_attr, six.string_types,
             'Overlap attribute name of left table'),
            (r_overlap_attr, six.string_types,
             'Overlap attribute name of right table'),
            (rem_stop_words, bool, 'Parameter rem_stop_words')):
        validate_object_type(value, expected_type, error_prefix=prefix)
    # q_val is optional: only validate its type when it was supplied.
    if q_val != None and not isinstance(q_val, int):
        logger.error('Parameter q_val is not of type int')
        raise AssertionError('Parameter q_val is not of type int')
    for value, expected_type, prefix in (
            (word_level, bool, 'Parameter word_level'),
            (overlap_size, int, 'Parameter overlap_size')):
        validate_object_type(value, expected_type, error_prefix=prefix)
def _validate_types(ltable, rtable, candidate_set, output_size,
                    attr_corres, verbose):
    """Validate the types of the inputs to the debug-blocker routine."""
    # All three tables must be pandas DataFrames.
    for frame, prefix in ((ltable, 'Input left table'),
                          (rtable, 'Input right table'),
                          (candidate_set, 'Input candidate set')):
        validate_object_type(frame, pd.DataFrame, prefix)
    validate_object_type(output_size, int, 'Output size')
    # attr_corres is optional; when given it must be a list of tuples.
    if attr_corres is not None:
        if not isinstance(attr_corres, list):
            logging.error('Input attribute correspondence is not of'
                          ' type list')
            raise AssertionError('Input attribute correspondence is'
                                 ' not of type list')
        for pair in attr_corres:
            if not isinstance(pair, tuple):
                logging.error('Pair in attribute correspondence list is not'
                              ' of type tuple')
                raise AssertionError('Pair in attribute correspondence list'
                                     ' is not of type tuple')
    if not isinstance(verbose, bool):
        logger.error('Parameter verbose is not of type bool')
        raise AssertionError('Parameter verbose is not of type bool')
def are_all_attrs_in_df(df, col_names, verbose=False):
    """
    Check whether every name in `col_names` is a column of `df`.

    Returns False (optionally logging the first missing name when
    `verbose` is True) as soon as a name is not found; True otherwise.
    """
    validate_object_type(df, pd.DataFrame)
    if col_names is None:
        logger.warning('Input col_names is null')
        return False
    existing_columns = list(df.columns)
    for name in col_names:
        if name in existing_columns:
            continue
        if verbose:
            logger.warning('Column name (' + name + ') is not present in dataframe')
        return False
    return True
def are_all_attrs_in_df(df, col_names, verbose=False):
    """
    Return True when all of `col_names` are present as columns in `df`.

    A None `col_names` logs a warning and yields False. When `verbose`
    is set, the first missing column name is logged before returning.
    """
    validate_object_type(df, pd.DataFrame)
    # Guard clause: nothing to look up.
    if col_names is None:
        logger.warning('Input col_names is null')
        return False
    known = list(df.columns)
    for candidate in col_names:
        if candidate not in known:
            if verbose:
                logger.warning('Column name (' + candidate + ') is not present in dataframe')
            return False
    return True
def _check_table_order(ltable, rtable, l_attr_types, r_attr_types, attr_corres): """ Check whether the order of tables matches with what is mentioned in l_attr_types, r_attr_type and attr_corres. """ # Validate the input parameters # We expect the input object ltable to be of type pandas DataFrame validate_object_type(ltable, pd.DataFrame, 'Input left table') # # We expect the rtable to be of type pandas DataFrame validate_object_type(rtable, pd.DataFrame, 'Input right table') # Get the ids of the input tables. This is used to validate the order # of tables present in the given data structures. # Note: This kind of checking is bit too aggressive, the reason is this # checking needs the ltable and rtable to point to exact memory location # across the given dictionaries and the input. Ideally, we just need to # check whether the contents of those DataFrames are same. ltable_id = id(ltable) rtable_id = id(rtable) # Check whether ltable id matches with id of table mentioned in l_attr_types if ltable_id != id(l_attr_types['_table']): logger.error( 'ltable is not the same as table mentioned in left attr types') return False # Check whether rtable id matches with id of table mentioned in r_attr_types if rtable_id != id(r_attr_types['_table']): logger.error( 'rtable is not the same as table mentioned in right attr types') return False # Check whether ltable matches with ltable mentioned in attr_corres if ltable_id != id(attr_corres['ltable']): logger.error( 'ltable is not the same as table mentioned in attr correspondence') return False # Check whether rtable matches with rtable mentioned in attr_corres if rtable_id != id(attr_corres['rtable']): logger.error( 'rtable is not the same as table mentioned in attr correspondence') return False # Finally, return True. return True
def _check_table_order(ltable, rtable, l_attr_types, r_attr_types, attr_corres):
    """
    Check whether the order of tables matches with what is mentioned in
    l_attr_types, r_attr_type and attr_corres.

    The comparison is by object identity (id), i.e. the metadata
    structures must reference the exact same DataFrame objects.
    """
    # The two input tables must be pandas DataFrames.
    validate_object_type(ltable, pd.DataFrame, 'Input left table')
    validate_object_type(rtable, pd.DataFrame, 'Input right table')

    ltable_id = id(ltable)
    rtable_id = id(rtable)

    # Each entry pairs the expected id with (container, key) to look up and
    # the error message to log on mismatch. Lookups happen lazily inside
    # the loop so a failed earlier check short-circuits the later ones.
    expected = (
        (ltable_id, l_attr_types, '_table',
         'ltable is not the same as table mentioned in left attr types'),
        (rtable_id, r_attr_types, '_table',
         'rtable is not the same as table mentioned in right attr types'),
        (ltable_id, attr_corres, 'ltable',
         'ltable is not the same as table mentioned in attr correspondence'),
        (rtable_id, attr_corres, 'rtable',
         'rtable is not the same as table mentioned in attr correspondence'),
    )
    for table_id, container, key_name, message in expected:
        if table_id != id(container[key_name]):
            logger.error(message)
            return False
    return True
def debug_decisiontree_matcher(decision_tree, tuple_1, tuple_2, feature_table,
                               table_columns, exclude_attrs=None):
    """
    This function is used to debug a decision tree matcher using two input
    tuples.

    Specifically, this function takes in two tuples, gets the feature vector
    using the feature table and finally passes it to the decision tree and
    displays the path that the feature vector takes in the decision tree.

    Args:
        decision_tree (DTMatcher): The input decision tree object that should
            be debugged.
        tuple_1,tuple_2 (Series): Input tuples that should be debugged.
        feature_table (DataFrame): Feature table containing the functions
            for the features.
        table_columns (list): List of all columns that will be outputted
            after generation of feature vectors.
        exclude_attrs (list): List of attributes that should be removed from
            the table columns.

    Raises:
        AssertionError: If the input feature table is not of type pandas
            DataFrame.

    Examples:
        >>> import py_entitymatching as em
        >>> # devel is the labeled data used for development purposes, match_f is the feature table
        >>> H = em.extract_feat_vecs(devel, feat_table=match_f, attrs_after='gold_labels')
        >>> dt = em.DTMatcher()
        >>> dt.fit(table=H, exclude_attrs=['_id', 'ltable_id', 'rtable_id', 'gold_labels'], target_attr='gold_labels')
        >>> # F is the feature vector got from evaluation set of the labeled data.
        >>> out = dt.predict(table=F, exclude_attrs=['_id', 'ltable_id', 'rtable_id', 'gold_labels'], target_attr='gold_labels')
        >>> # A and B are input tables
        >>> em.debug_decisiontree_matcher(dt, A.loc[1], B.loc[2], match_f, H.columns, exclude_attrs=['_id', 'ltable_id', 'rtable_id', 'gold_labels'], target_attr='gold_labels')
    """
    # Doc fix: the example previously used `A.ix[1]` / `B.ix[2]`;
    # `DataFrame.ix` was deprecated and removed from pandas, and the twin
    # copy of this docstring elsewhere in this file already uses `.loc`.
    validate_object_type(feature_table, pd.DataFrame, 'The input feature table')
    # Delegate to the shared helper; ensemble_flag=False selects the
    # single-tree (non-ensemble) debugging code path.
    _debug_decisiontree_matcher(decision_tree, tuple_1, tuple_2, feature_table,
                                table_columns, exclude_attrs,
                                ensemble_flag=False)
def load_object(file_path):
    """
    Loads a Python object from disk.

    This function loads py_entitymatching objects from disk such as
    blockers, matchers, feature table, etc.

    Args:
        file_path (string): The file path to load the object from.

    Returns:
        A Python object read from the file path.

    Raises:
        AssertionError: If `file_path` is not of type string.
        AssertionError: If a file does not exist at the given `file_path`.

    Examples:
        >>> rb = em.load_object('./rule_blocker.pkl')

    See Also:
        :meth:`~save_object`
    """
    # Validate input parameters
    validate_object_type(file_path, six.string_types,
                         error_prefix='Input file path')

    # Check if a file exists at the given file path.
    if not os.path.exists(file_path):
        logger.error('File does not exist at path %s', file_path)
        # Fix: format the path into the message instead of passing it as a
        # second positional argument (which made str(exc) render as a tuple).
        raise AssertionError('File does not exist at path %s' % file_path)

    # Read the object from the file, opening it in binary mode.
    # NOTE(review): pickle.load on an untrusted file can execute arbitrary
    # code — callers should only load files they produced themselves.
    with open(file_path, 'rb') as file_handler:
        object_to_return = pickle.load(file_handler)

    return object_to_return
def _get_xy_data_ex(table, exclude_attrs, target_attr):
    """
    Split `table` into feature matrix x and label vector y for scikit-learn,
    dropping `exclude_attrs` and using `target_attr` as the label column.
    """
    # Validate the input parameters
    # # We expect the input table to be of type pandas DataFrame
    validate_object_type(table, pd.DataFrame)
    # We expect exclude attributes to be of type list. If not convert it into
    # a list.
    if not isinstance(exclude_attrs, list):
        exclude_attrs = [exclude_attrs]
    # Check if the exclude attributes are present in the input table
    if not check_attrs_present(table, exclude_attrs):
        logger.error('The attributes mentioned in exclude_attrs '
                     'is not present '
                     'in the input table')
        raise AssertionError(
            'The attributes mentioned in exclude_attrs '
            'is not present '
            'in the input table')
    # Check if the target attribute is present in the input table
    if not check_attrs_present(table, target_attr):
        logger.error('The target_attr is not present in the input table')
        raise AssertionError(
            'The target_attr is not present in the input table')
    # Drop the duplicates from the exclude attributes
    exclude_attrs = list_drop_duplicates(exclude_attrs)
    # Explicitly add the target attribute to exclude attribute (if it is not
    # already present)
    if target_attr not in exclude_attrs:
        exclude_attrs.append(target_attr)
    # Project the list of attributes that should be used for scikit-learn's
    # functions.
    attrs_to_project = list_diff(list(table.columns), exclude_attrs)
    # Get the values for x
    x = table[attrs_to_project].values
    # Get the values for y (comment fix: this extracts the target labels,
    # the previous comment wrongly said "x" again)
    y = table[target_attr].values
    y = y.ravel()  # to mute warnings from svm and cross validation
    # Return x and y
    return x, y
def _get_xy_data_ex(table, exclude_attrs, target_attr):
    """
    Build the (x, y) arrays consumed by scikit-learn from `table`,
    excluding `exclude_attrs` and taking labels from `target_attr`.
    """
    # The input table must be a pandas DataFrame.
    validate_object_type(table, pd.DataFrame)

    # Normalize a scalar exclude attribute into a list.
    if not isinstance(exclude_attrs, list):
        exclude_attrs = [exclude_attrs]

    # Both the exclude attributes and the target must exist in the table.
    if not check_attrs_present(table, exclude_attrs):
        logger.error('The attributes mentioned in exclude_attrs '
                     'is not present '
                     'in the input table')
        raise AssertionError('The attributes mentioned in exclude_attrs '
                             'is not present '
                             'in the input table')
    if not check_attrs_present(table, target_attr):
        logger.error('The target_attr is not present in the input table')
        raise AssertionError(
            'The target_attr is not present in the input table')

    # De-duplicate, then make sure the target itself is excluded from x.
    exclude_attrs = list_drop_duplicates(exclude_attrs)
    if target_attr not in exclude_attrs:
        exclude_attrs.append(target_attr)

    # Everything that is not excluded becomes a feature column.
    feature_columns = list_diff(list(table.columns), exclude_attrs)
    x = table[feature_columns].values
    # Flatten the labels to mute warnings from svm and cross validation.
    y = table[target_attr].values.ravel()
    return x, y
def check_fk_constraint(df_foreign, attr_foreign, df_base, attr_base):
    """
    Check if the foreign key constraint is satisfied.

    Args:
        df_foreign (pandas dataframe): Foreign dataframe
        attr_foreign (str): Attribute in the foreign dataframe
        df_base (pandas dataframe): Base dataframe
        attr_base (str): Attribute in the base dataframe

    Returns:
        result (bool). Returns True if the foreign key constraint is
        satisfied, else returns False

    Notes:
        This is an internal helper function
    """
    validate_object_type(df_foreign, pd.DataFrame,
                         error_prefix='Input object (df_foreign)')
    validate_object_type(attr_foreign, six.string_types,
                         error_prefix='Input attr (attr_foreign)')
    validate_object_type(df_base, pd.DataFrame,
                         error_prefix='Input object (df_base)')
    validate_object_type(attr_base, six.string_types,
                         error_prefix='Input attr (attr_base)')

    # Both attributes must exist in their respective tables.
    if not check_attrs_present(df_base, attr_base):
        logger.warning('The attribute %s is not in df_base' % attr_base)
        return False
    if not check_attrs_present(df_foreign, attr_foreign):
        logger.error('Input attr (attr_foreign) is not in df_foreign')
        return False

    # A foreign key may not contain nulls.
    if any(pd.isnull(df_foreign[attr_foreign])):
        logger.warning('The attribute %s in foreign table contains null values' % attr_foreign)
        return False

    # Every foreign-key value must appear in the base attribute.
    uniq_fk_vals = set(pd.unique(df_foreign[attr_foreign]))
    base_attr_vals = df_base[attr_base].values
    missing = uniq_fk_vals.difference(base_attr_vals)
    if len(missing) > 0:
        logger.warning('For some attr. values in (%s) in the foreign table there are no values in '
                       '(%s) in the base table' % (attr_foreign, attr_base))
        return False

    # Check whether the referenced values are unique in the base table.
    t = df_base[df_base[attr_base].isin(pd.unique(df_foreign[attr_foreign]))]
    status = is_key_attribute(t, attr_base)
    # Idiom fix: `status == False` → `not status`. Message fix: the two
    # concatenated literals produced "…%s)in %s is not satisifed" (missing
    # space and typo).
    if not status:
        logger.warning('Key attr. constraint for the subset of values (derived from. %s) '
                       'in %s is not satisfied' % (attr_foreign, attr_base))
        return False
    return True
def _validate_inputs(table, label_column_name, verbose):
    """
    Validate the inputs for the label_table function.

    Returns True when the table, the new label column name and the
    candidate-set metadata are all valid; raises otherwise.
    """
    # The input table must be a pandas DataFrame and the label column
    # name must be a string.
    validate_object_type(table, pd.DataFrame)
    validate_object_type(label_column_name, six.string_types,
                         error_prefix='Input attr.')

    # The label column must not clash with an existing column.
    if ch.check_attrs_present(table, label_column_name):
        logger.error(
            'The label column name (%s) is already present in the '
            'input table', label_column_name)
        raise AssertionError(
            'The label column name (%s) is already present '
            'in the input table', label_column_name)

    # Validate the candidate-set metadata, which will later be copied to
    # the output DataFrame. First log what metadata this function needs.
    ch.log_info(
        logger, 'Required metadata: cand.set key, fk ltable, '
                'fk rtable, ltable, rtable, ltable key, rtable key',
        verbose)
    # Fetch the metadata and then validate it.
    key, fk_ltable, fk_rtable, ltable, rtable, l_key, r_key = \
        cm.get_metadata_for_candset(table, logger, verbose)
    cm._validate_metadata_for_candset(table, key, fk_ltable, fk_rtable,
                                      ltable, rtable, l_key, r_key,
                                      logger, verbose)

    # Everything checked out.
    return True
def data_explore_pandastable(df):
    """
    Wrapper function for pandastable. Gives user a GUI to examine and edit
    the dataframe passed in using pandastable.

    Args:
        df (Dataframe): The pandas dataframe to be explored with pandastable.

    Raises:
        AssertionError: If `df` is not of type pandas DataFrame.

    Examples:
        >>> import py_entitymatching as em
        >>> A = em.read_csv_metadata('path_to_csv_dir/table.csv', key='ID')
        >>> em.data_explore_pandastable(A)
    """
    # Reject anything that is not a pandas DataFrame up front, then hand
    # the frame to the pandastable-backed GUI explorer.
    validate_object_type(df, pd.DataFrame, 'Input df')
    DataExplorePandastable(df)
def is_key_attribute(df, attr, verbose=False):
    """
    Check if an attribute is a key attribute.

    Args:
        df (pandas dataframe): Input dataframe
        attr (str): Attribute in the pandas dataframe
        verbose (bool): Flag to indicate whether warnings should be printed out

    Returns:
        result (bool). True if the attribute is a key attribute (unique and
        without missing values), else False.
    """
    validate_object_type(df, pd.DataFrame)
    validate_object_type(attr, six.string_types, error_prefix='Input attr.')
    # An empty dataframe trivially satisfies the key property.
    if len(df) == 0:
        return True
    column = df[attr]
    # Key attributes must be unique ...
    if len(pd.unique(column)) != len(df):
        if verbose:
            logger.warning('Attribute ' + attr + ' does not contain unique values')
        return False
    # ... and must not contain missing values.
    if any(pd.isnull(column)):
        if verbose:
            logger.warning('Attribute ' + attr + ' contains missing values')
        return False
    return True
def _validate_inputs(table, label_column_name, verbose): """ This function validates the inputs for the label_table function """ # Validate the input parameters # # The input table table is expected to be of type pandas DataFrame validate_object_type(table, pd.DataFrame) # # The label column name is expected to be of type string validate_object_type(label_column_name, six.string_types, error_prefix='Input attr.') # # Check if the label column name is already present in the input table if ch.check_attrs_present(table, label_column_name): logger.error('The label column name (%s) is already present in the ' 'input table', label_column_name) raise AssertionError('The label column name (%s) is already present ' 'in the input table', label_column_name) # Now, validate the metadata for the input DataFrame as we have to copy # these properties to the output DataFrame # # First, display what metadata is required for this function ch.log_info(logger, 'Required metadata: cand.set key, fk ltable, ' 'fk rtable, ltable, rtable, ltable key, rtable key', verbose) # # Second, get the metadata key, fk_ltable, fk_rtable, ltable, rtable, l_key, r_key = \ cm.get_metadata_for_candset(table, logger, verbose) # # Third, validate the metadata cm._validate_metadata_for_candset(table, key, fk_ltable, fk_rtable, ltable, rtable, l_key, r_key, logger, verbose) # Return True if everything was successful return True
def is_key_attribute(df, attr, verbose=False):
    """
    Check if an attribute is a key attribute

    Args:
        df (pandas dataframe): Input dataframe
        attr (str): Attribute in the pandas dataframe
        verbose (bool): Flag to indicate whether warnings should be printed out

    Returns:
        result (bool). Returns True, if the attribute is a key attribute
        (unique and without missing values), else returns False
    """
    validate_object_type(df, pd.DataFrame)
    validate_object_type(attr, six.string_types, error_prefix='Input attr.')
    # check if the length is > 0; an empty table trivially has a key
    if len(df) > 0:
        # check for uniqueness
        uniq_flag = len(pd.unique(df[attr])) == len(df)
        if not uniq_flag:
            if verbose:
                logger.warning('Attribute ' + attr + ' does not contain unique values')
            return False
        # check if there are missing or null values
        nan_flag = not any(pd.isnull(df[attr]))
        if not nan_flag:
            if verbose:
                logger.warning('Attribute ' + attr + ' contains missing values')
            return False
        # both flags are True here, so this always returns True
        return uniq_flag and nan_flag
    else:
        return True
def data_explore_openrefine(df, server='http://127.0.0.1:3333', name=None):
    """
    Wrapper function for using OpenRefine. Gives user a GUI to examine and
    edit the dataframe passed in using OpenRefine.

    Args:
        df (Dataframe): The pandas dataframe to be explored with pandastable.
        server (String): The address of the OpenRefine server
            (defaults to http://127.0.0.1:3333).
        name (String): The name given to the file and project in OpenRefine.

    Raises:
        AssertionError: If `df` is not of type pandas DataFrame.

    Examples:
        >>> import py_entitymatching as em
        >>> A = em.read_csv_metadata('path_to_csv_dir/table.csv', key='ID')
        >>> em.data_explore_openrefine(A, name='Table')
    """
    # Only the dataframe's type is validated here; server reachability is
    # handled by the OpenRefine explorer itself.
    validate_object_type(df, pd.DataFrame, 'Input df')
    explorer = DataExploreOpenRefine(df, server, name)
    return explorer
def is_attr_unique(df, attr):
    """
    Check if the attribute is unique in a dataframe

    Args:
        df (pandas dataframe): Input dataframe
        attr (str): Attribute in the pandas dataframe

    Returns:
        result (bool). Returns True, if the attribute contains unique values,
        else returns False

    Notes:
        This is an internal helper function
    """
    validate_object_type(df, pd.DataFrame)
    validate_object_type(attr, six.string_types, error_prefix='Input attr.')
    # Idiom fix: the comparison is already the boolean answer; drop the
    # `if not flag: return False else: return True` detour.
    return len(pd.unique(df[attr])) == len(df)
def is_attr_unique(df, attr):
    """
    Return True when every value of column `attr` in `df` is distinct.

    Args:
        df (pandas dataframe): Input dataframe
        attr (str): Attribute in the pandas dataframe

    Returns:
        result (bool). True when the attribute contains unique values,
        False otherwise.

    Notes:
        This is an internal helper function
    """
    validate_object_type(df, pd.DataFrame)
    validate_object_type(attr, six.string_types, error_prefix='Input attr.')
    # Unique iff the number of distinct values equals the row count.
    distinct_count = len(pd.unique(df[attr]))
    if distinct_count == len(df):
        return True
    return False
def sample_table(table, sample_size, replace=False, verbose=False):
    """
    Samples a candidate set of tuple pairs (for labeling purposes).

    This function samples a DataFrame, typically used for labeling
    purposes. This function expects the input DataFrame containing the
    metadata of a candidate set (such as key, fk_ltable, fk_rtable, ltable,
    rtable). Specifically, this function creates a copy of the input
    DataFrame, samples the data using uniform random sampling (uses numpy's
    `random.choice` to sample) and returns the sampled DataFrame.
    Further, also copies the properties from the input DataFrame to the
    output DataFrame.

    Args:
        table (DataFrame): The input DataFrame to be sampled.
            Specifically, a DataFrame containing the metadata of a candidate
            set (such as key, fk_ltable, fk_rtable, ltable, rtable) in the
            catalog.
        sample_size (int): The number of samples to be picked from the input
            DataFrame.
        replace (boolean): A flag to indicate whether sampling should be
            done with replacement or not (defaults to False).
        verbose (boolean): A flag to indicate whether more detailed
            information about the execution steps should be printed out
            (defaults to False).

    Returns:
        A new DataFrame with 'sample_size' number of rows. Further, this
        function sets the output DataFrame's properties same as input
        DataFrame.

    Raises:
        AssertionError: If `table` is not of type pandas DataFrame.
        AssertionError: If the size of `table` is 0.
        AssertionError: If the `sample_size` is greater than the input
            DataFrame size.

    Examples:
        >>> import py_entitymatching as em
        >>> S = em.sample_table(C, sample_size=450) # C is the candidate set to be sampled from.

    Note:
        As mentioned in the above description, the output DataFrame is
        updated (in the catalog) with the properties from the input
        DataFrame. A subtle point to note here is, when the replace flag is
        set to True, then the output DataFrame can contain duplicate keys.
        In that case, this function will not set the key and it is up to
        the user to fix it after the function returns.
    """
    # Fix: `pd.np` was deprecated and removed in recent pandas releases;
    # use numpy directly instead of `pd.np.random.choice`.
    import numpy as np

    # Validate input parameters.
    # # The input DataFrame is expected to be of type pandas DataFrame.
    validate_object_type(table, pd.DataFrame)
    # # There should be at least one row to sample from.
    if len(table) == 0:
        logger.error('Size of the input table is 0')
        raise AssertionError('Size of the input table is 0')
    # # The sample size should be less than or equal to the number of rows
    # # in the input DataFrame.
    if len(table) < sample_size:
        logger.error('Sample size is larger than the input table size')
        raise AssertionError('Sample size is larger than the input table size')

    # Validate the metadata for the input DataFrame as we have to copy
    # these properties to the output DataFrame.
    # # First, display what metadata is required for this function.
    ch.log_info(logger, 'Required metadata: cand.set key, fk ltable, '
                        'fk rtable, ltable, rtable, ltable key, rtable key',
                verbose)
    # # Second, get the metadata.
    key, fk_ltable, fk_rtable, ltable, rtable, l_key, r_key = \
        cm.get_metadata_for_candset(table, logger, verbose)
    # # Third, validate the metadata.
    cm._validate_metadata_for_candset(table, key, fk_ltable, fk_rtable,
                                      ltable, rtable, l_key, r_key,
                                      logger, verbose)

    # Draw the sample indices (uniform random), then sort them so the
    # sampled rows keep the original relative order.
    sample_indices = np.random.choice(len(table), sample_size,
                                      replace=replace)
    sample_indices = sorted(sample_indices)  # sorted() already returns a list
    sampled_table = table.iloc[sample_indices]

    # Copy the properties to the sampled table.
    cm.init_properties(sampled_table)
    # # If replace is True, duplicate keys are possible, so the key must be
    # # re-validated before being set.
    if replace:
        properties = cm.get_all_properties(table)
        for property_name, property_value in six.iteritems(properties):
            if property_name == 'key':
                # Check for the validity of key before setting it
                cm.set_key(sampled_table, property_value)
            else:
                # Copy the other properties as is
                cm.set_property(sampled_table, property_name, property_value)
    else:
        cm.copy_properties(table, sampled_table)

    # Return the sampled table.
    return sampled_table
def _parse_feat_str(feature_string, tokenizers, similarity_functions):
    """
    Parse a feature expression string and extract its components.

    Given a feature string such as ``jaccard(qgm_3(ltuple['name']),
    qgm_3(rtuple['name']))``, this pulls out the left attribute, the right
    attribute, the tokenizer applied on each side, and the similarity
    function. Components that cannot be parsed are reported as the sentinel
    string 'PARSE_EXP'.

    Args:
        feature_string (string): The feature expression to parse.
        tokenizers (dictionary): Maps tokenizer names to tokenizer functions;
            used to recognize tokenizer tokens in the parsed string.
        similarity_functions (dictionary): Maps similarity function names to
            functions; used to recognize the similarity function token.

    Returns:
        A dictionary with keys 'left_attribute', 'right_attribute',
        'left_attr_tokenizer', 'right_attr_tokenizer', 'simfunction' and
        'is_auto_generated' (always False for hand-written features).

    Raises:
        AssertionError: If `feature_string` is not of type string.
        AssertionError: If `tokenizers` is not of type dictionary.
        AssertionError: If `similarity_functions` is not of type dictionary.
    """
    # Validate the input parameters
    # # We expect the input feature string to be of type string
    validate_object_type(feature_string, six.string_types,
                         error_prefix='Input feature')
    # # We expect the input object tokenizers to be of type python dictionary
    validate_object_type(tokenizers, dict, error_prefix='Input object (tokenizers)')
    # # We expect the input object similarity functions to be of type python
    # dictionary
    validate_object_type(similarity_functions, dict,
                         error_prefix='Input object (similarity_functions)')

    # We will have to parse the feature string. Specifically we use pyparsing
    # module for the parsing purposes
    from pyparsing import Word, alphanums, ParseException

    # Initialize attributes, tokenizers and similarity function parsing
    # results to a sentinel indicating "could not be parsed".
    left_attribute = 'PARSE_EXP'
    right_attribute = 'PARSE_EXP'
    left_attr_tokenizer = 'PARSE_EXP'
    right_attr_tokenizer = 'PARSE_EXP'
    sim_function = 'PARSE_EXP'
    exception_flag = False

    # Grammar: an attribute reference such as ltuple["name"], a tokenizer
    # call wrapping an attribute, and a similarity call either with or
    # without tokenizers on its two arguments.
    attr_name = Word(alphanums + "_" + "." + "[" + "]" + '"' + "'")
    tok_fn = Word(alphanums + "_") + "(" + attr_name + ")"
    wo_tok = Word(alphanums + "_") + "(" + attr_name + "," + attr_name + ")"
    wi_tok = Word(alphanums + "_") + "(" + tok_fn + "," + tok_fn + ")"
    feat = wi_tok | wo_tok

    # Try to parse the string; on failure we fall through and return the
    # sentinel values so callers can detect the parse error.
    try:
        parsed_string = feat.parseString(feature_string)
    except ParseException:
        exception_flag = True

    if not exception_flag:
        # Parse the tokenizers (tokens that name a known tokenizer)
        parsed_tokenizers = [value for value in parsed_string
                             if value in tokenizers.keys()]
        # BUG FIX: the original used `is 2`, an identity comparison that only
        # works because CPython caches small integers; use equality instead.
        if len(parsed_tokenizers) == 2:
            left_attr_tokenizer = parsed_tokenizers[0]
            right_attr_tokenizer = parsed_tokenizers[1]

        # Parse the similarity function (token naming a known sim. function)
        parsed_similarity_function = [value for value in parsed_string
                                      if value in similarity_functions.keys()]
        if len(parsed_similarity_function) == 1:
            sim_function = parsed_similarity_function[0]

        # Parse the left attribute: strip the leading "ltuple[" (7 chars),
        # the trailing "]" and any quote characters around the name.
        attribute = [value for value in parsed_string
                     if value.startswith('ltuple[')]
        if len(attribute) == 1:
            attribute = attribute[0]
            left_attribute = attribute[7:len(attribute) - 1].strip('"').strip(
                "'")

        # Parse the right attribute (same stripping as the left attribute)
        attribute = [val for val in parsed_string if val.startswith('rtuple[')]
        if len(attribute) == 1:
            attribute = attribute[0]
            right_attribute = attribute[7:len(attribute) - 1].strip('"').strip(
                "'")

    # Return the parsed information in a dictionary format.
    parsed_dict = {'left_attribute': left_attribute,
                   'right_attribute': right_attribute,
                   'left_attr_tokenizer': left_attr_tokenizer,
                   'right_attr_tokenizer': right_attr_tokenizer,
                   'simfunction': sim_function,
                   'is_auto_generated': False}

    return parsed_dict
def add_feature(feature_table, feature_name, feature_dict):
    """ Adds a feature to the feature table.

        Specifically, this function is used in combination with
        :meth:`~py_entitymatching.get_feature_fn`: the user first builds a
        feature dictionary with :meth:`~py_entitymatching.get_feature_fn`
        and then appends it to the feature table with this function.

        Args:
            feature_table (DataFrame): A DataFrame containing features.
            feature_name (string): The name that should be given to the feature.
            feature_dict (dictionary): A Python dictionary, that is typically
                returned by executing :meth:`~py_entitymatching.get_feature_fn`.

        Returns:
            A Boolean value of True is returned if the addition was successful.

        Raises:
            AssertionError: If the input `feature_table` is not of type pandas
                DataFrame.
            AssertionError: If `feature_name` is not of type string.
            AssertionError: If `feature_dict` is not of type Python dictionary.
            AssertionError: If the `feature_table` does not have necessary
                columns such as 'feature_name', 'left_attribute',
                'right_attribute', 'left_attr_tokenizer',
                'right_attr_tokenizer', 'simfunction', 'function', and
                'function_source' in the DataFrame.
            AssertionError: If the `feature_name` is already present in the
                feature table.

        Examples:
            >>> import py_entitymatching as em
            >>> A = em.read_csv_metadata('path_to_csv_dir/table_A.csv', key='ID')
            >>> B = em.read_csv_metadata('path_to_csv_dir/table_B.csv', key='ID')
            >>> block_t = em.get_tokenizers_for_blocking()
            >>> block_s = em.get_sim_funs_for_blocking()
            >>> block_f = em.get_features_for_blocking(A, B)
            >>> r = get_feature_fn('jaccard(qgm_3(ltuple.name), qgm_3(rtuple.name)', block_t, block_s)
            >>> em.add_feature(block_f, 'name_name_jac_qgm3_qgm3', r)
            >>> match_t = em.get_tokenizers_for_matching()
            >>> match_s = em.get_sim_funs_for_matching()
            >>> match_f = em.get_features_for_matching(A, B)
            >>> r = get_feature_fn('jaccard(qgm_3(ltuple.name), qgm_3(rtuple.name)', match_t, match_s)
            >>> em.add_feature(match_f, 'name_name_jac_qgm3_qgm3', r)
    """
    # Type-check each of the three inputs before touching the table.
    validate_object_type(feature_table, pd.DataFrame, 'Input feature table')
    validate_object_type(feature_name, six.string_types, 'Input feature name')
    validate_object_type(feature_dict, dict, 'Input feature dictionary')

    # The feature table must carry the full set of required columns.
    absent_cols = get_missing_column_values(feature_table.columns)
    if absent_cols:
        message = "Feature table does not have all required columns\n The following columns are missing: {0}".format(", ".join(absent_cols))
        raise AssertionError(message)

    # Reject duplicate feature names.
    existing_names = list(feature_table['feature_name'])
    if feature_name in existing_names:
        logger.error('Input feature name is already present in feature table')
        raise AssertionError(
            'Input feature name is already present in feature table')

    # Record the name inside the dictionary and append it as the last row.
    feature_dict['feature_name'] = feature_name
    if len(feature_table) == 0:
        # An empty table may lack proper column labels; (re)establish them
        # before the first row is written.
        feature_table.columns = ['feature_name', 'left_attribute',
                                 'right_attribute', 'left_attr_tokenizer',
                                 'right_attr_tokenizer', 'simfunction',
                                 'function', 'function_source',
                                 'is_auto_generated']
    feature_table.loc[len(feature_table)] = feature_dict

    # Finally, return True if everything was fine
    return True
def block_tables(self, ltable, rtable, l_block_attr, r_block_attr,
                 l_output_attrs=None, r_output_attrs=None,
                 l_output_prefix='ltable_', r_output_prefix='rtable_',
                 allow_missing=False, verbose=False, n_ltable_chunks=1,
                 n_rtable_chunks=1):
    """
    WARNING THIS COMMAND IS EXPERIMENTAL AND NOT TESTED. USE AT YOUR OWN RISK

    Blocks two tables based on attribute equivalence.
    Conceptually, this will check `l_block_attr=r_block_attr` for each tuple
    pair from the Cartesian product of tables `ltable` and `rtable`. It outputs a
    Pandas dataframe object with tuple pairs that satisfy the equality condition.
    The dataframe will include attributes '_id', key attribute from
    ltable, key attributes from rtable, followed by lists `l_output_attrs` and
    `r_output_attrs` if they are specified. Each of these output and key attributes will be
    prefixed with given `l_output_prefix` and `r_output_prefix`. If `allow_missing` is set
    to `True` then all tuple pairs with missing value in at least one of the tuples will be
    included in the output dataframe.
    Further, this will update the following metadata in the catalog for the output table:
    (1) key, (2) ltable, (3) rtable, (4) fk_ltable, and (5) fk_rtable.

    Args:
        ltable (DataFrame): The left input table.
        rtable (DataFrame): The right input table.
        l_block_attr (string): The blocking attribute in left table.
        r_block_attr (string): The blocking attribute in right table.
        l_output_attrs (list): A list of attribute names from the left
                               table to be included in the
                               output candidate set (defaults to None).
        r_output_attrs (list): A list of attribute names from the right
                               table to be included in the
                               output candidate set (defaults to None).
        l_output_prefix (string): The prefix to be used for the attribute names
                                  coming from the left table in the output
                                  candidate set (defaults to 'ltable\\_').
        r_output_prefix (string): The prefix to be used for the attribute names
                                  coming from the right table in the output
                                  candidate set (defaults to 'rtable\\_').
        allow_missing (boolean): A flag to indicate whether tuple pairs
                                 with missing value in at least one of the
                                 blocking attributes should be included in
                                 the output candidate set (defaults to
                                 False). If this flag is set to True, a
                                 tuple in ltable with missing value in the
                                 blocking attribute will be matched with
                                 every tuple in rtable and vice versa.
        verbose (boolean): A flag to indicate whether the debug information
                           should be logged (defaults to False).
        n_ltable_chunks (int): The number of partitions to split the left table (
                                defaults to 1). If it is set to -1, then the number of
                                partitions is set to the number of cores in the
                                machine.
        n_rtable_chunks (int): The number of partitions to split the right table (
                                defaults to 1). If it is set to -1, then the number of
                                partitions is set to the number of cores in the
                                machine.

    Returns:
        A candidate set of tuple pairs that survived blocking (DataFrame).

    Raises:
        AssertionError: If `ltable` is not of type pandas
            DataFrame.
        AssertionError: If `rtable` is not of type pandas
            DataFrame.
        AssertionError: If `l_block_attr` is not of type string.
        AssertionError: If `r_block_attr` is not of type string.
        AssertionError: If `l_output_attrs` is not of type of
            list.
        AssertionError: If `r_output_attrs` is not of type of
            list.
        AssertionError: If the values in `l_output_attrs` is not of type
            string.
        AssertionError: If the values in `r_output_attrs` is not of type
            string.
        AssertionError: If `l_output_prefix` is not of type
            string.
        AssertionError: If `r_output_prefix` is not of type
            string.
        AssertionError: If `verbose` is not of type
            boolean.
        AssertionError: If `allow_missing` is not of type boolean.
        AssertionError: If `n_ltable_chunks` is not of type
            int.
        AssertionError: If `n_rtable_chunks` is not of type
            int.
        AssertionError: If `l_block_attr` is not in the ltable columns.
        AssertionError: If `r_block_attr` is not in the rtable columns.
        AssertionError: If `l_out_attrs` are not in the ltable.
        AssertionError: If `r_out_attrs` are not in the rtable.

    Examples:
        >>> import py_entitymatching as em
        >>> from py_entitymatching.dask.dask_attr_equiv_blocker import DaskAttrEquivalenceBlocker
        >>> ab = DaskAttrEquivalenceBlocker()
        >>> A = em.read_csv_metadata('path_to_csv_dir/table_A.csv', key='ID')
        >>> B = em.read_csv_metadata('path_to_csv_dir/table_B.csv', key='ID')
        >>> C1 = ab.block_tables(A, B, 'zipcode', 'zipcode', l_output_attrs=['name'], r_output_attrs=['name'])
        # Include all possible tuple pairs with missing values
        >>> C2 = ab.block_tables(A, B, 'zipcode', 'zipcode', l_output_attrs=['name'], r_output_attrs=['name'], allow_missing=True)
    """
    logger.warning("WARNING THIS BLOCKER IS EXPERIMENTAL AND NOT TESTED. "
                   "USE AT YOUR OWN RISK.")

    # validate data types of input parameters
    self.validate_types_params_tables(ltable, rtable, l_output_attrs,
                                      r_output_attrs, l_output_prefix,
                                      r_output_prefix, verbose, 1)
    # last arg is set to 1 just to reuse the function from the old blocker.

    # validate data types of input blocking attributes
    self.validate_types_block_attrs(l_block_attr, r_block_attr)

    # validate data type of allow_missing
    self.validate_allow_missing(allow_missing)

    # validate input parameters
    self.validate_block_attrs(ltable, rtable, l_block_attr, r_block_attr)
    self.validate_output_attrs(ltable, rtable, l_output_attrs, r_output_attrs)

    # validate number of ltable and rtable chunks
    validate_object_type(n_ltable_chunks, int, 'Parameter n_ltable_chunks')
    validate_object_type(n_rtable_chunks, int, 'Parameter n_rtable_chunks')
    validate_chunks(n_ltable_chunks)
    validate_chunks(n_rtable_chunks)

    # get and validate required metadata
    log_info(logger, 'Required metadata: ltable key, rtable key', verbose)

    # # get metadata
    l_key, r_key = cm.get_keys_for_ltable_rtable(ltable, rtable, logger,
                                                 verbose)

    # # validate metadata
    cm._validate_metadata_for_table(ltable, l_key, 'ltable', logger, verbose)
    cm._validate_metadata_for_table(rtable, r_key, 'rtable', logger, verbose)

    # do blocking

    # # do projection of required attributes from the tables so only the
    # columns needed for blocking/output travel to the workers
    l_proj_attrs = self.get_attrs_to_project(l_key, l_block_attr,
                                             l_output_attrs)
    ltable_proj = ltable[l_proj_attrs]
    r_proj_attrs = self.get_attrs_to_project(r_key, r_block_attr,
                                             r_output_attrs)
    rtable_proj = rtable[r_proj_attrs]

    # # remove records with nans in the blocking attribute
    l_df = rem_nan(ltable_proj, l_block_attr)
    r_df = rem_nan(rtable_proj, r_block_attr)

    # # determine the number of chunks
    n_ltable_chunks = get_num_partitions(n_ltable_chunks, len(ltable))
    n_rtable_chunks = get_num_partitions(n_rtable_chunks, len(rtable))

    if n_ltable_chunks == 1 and n_rtable_chunks == 1:
        # single process
        candset = _block_tables_split(l_df, r_df, l_key, r_key,
                                      l_block_attr, r_block_attr,
                                      l_output_attrs, r_output_attrs,
                                      l_output_prefix, r_output_prefix,
                                      allow_missing)
    else:
        l_splits = np.array_split(l_df, n_ltable_chunks)
        r_splits = np.array_split(r_df, n_rtable_chunks)
        c_splits = []

        # Build the full cross product of chunk pairs as delayed tasks.
        for l in l_splits:
            for r in r_splits:
                partial_result = delayed(_block_tables_split)(
                    l, r, l_key, r_key,
                    l_block_attr, r_block_attr,
                    l_output_attrs, r_output_attrs,
                    l_output_prefix, r_output_prefix,
                    allow_missing)
                c_splits.append(partial_result)
        c_splits = delayed(wrap)(c_splits)
        # BUG FIX: dask's multiprocessing scheduler takes `num_workers`,
        # not `n_jobs` (matches the usage in DaskBlackBoxBlocker).
        c_splits = c_splits.compute(scheduler="processes",
                                    num_workers=get_num_cores())
        candset = pd.concat(c_splits, ignore_index=True)

    # if allow_missing flag is True, then compute
    # all pairs with missing value in left table, and
    # all pairs with missing value in right table
    if allow_missing:
        missing_pairs = self.get_pairs_with_missing_value(
            ltable_proj, rtable_proj, l_key, r_key,
            l_block_attr, r_block_attr,
            l_output_attrs, r_output_attrs,
            l_output_prefix, r_output_prefix)
        candset = pd.concat([candset, missing_pairs], ignore_index=True)

    # update catalog
    key = get_name_for_key(candset.columns)
    candset = add_key_column(candset, key)
    cm.set_candset_properties(candset, key, l_output_prefix + l_key,
                              r_output_prefix + r_key, ltable, rtable)

    # return candidate set
    return candset
def _vis_debug_dt(matcher, train, test, exclude_attrs, target_attr,
                  show_window=True):
    """
    Wrapper function for debugging the Decision Tree matcher visually.

    Fits the given Decision Tree matcher on `train`, predicts on `test`,
    computes the evaluation summary, and (optionally) opens a PyQt5 GUI
    window showing the metric along with the false positives and false
    negatives.

    Args:
        matcher (DTMatcher): The Decision Tree matcher to debug.
        train (DataFrame): Labeled data used to fit the matcher.
        test (DataFrame): Labeled data used to evaluate the matcher.
        exclude_attrs (list or string): Attribute(s) to exclude from
            fitting/prediction (e.g. keys and foreign keys).
        target_attr (string): The attribute holding the gold labels.
        show_window (boolean): Whether to display the GUI window
            (defaults to True).

    Raises:
        ImportError: If PyQt5 is not installed.
        AssertionError: If `matcher` is not of type DTMatcher.
        AssertionError: If `target_attr` is not of type string.
        AssertionError: If `exclude_attrs` or `target_attr` are not present
            in the train/test DataFrames.
    """
    try:
        from PyQt5 import QtWidgets
        from py_entitymatching.gui.debug_gui_base import MainWindowManager
    except ImportError:
        raise ImportError('PyQt5 is not installed. Please install PyQt5 to use '
                          'GUI related functions in py_entitymatching.')

    # Validate the input parameters
    # # We expect the matcher to be of type DTMatcher
    if not isinstance(matcher, DTMatcher):
        logger.error('Input matcher is not of type Decision Tree matcher')
        raise AssertionError('Input matcher is not of type '
                             'Decision Tree matcher')

    # # We expect the target attribute to be of type string.
    validate_object_type(target_attr, six.string_types,
                         error_prefix='Target attribute')

    # # Check whether the exclude attributes are indeed present in the train
    # DataFrame.
    if not ch.check_attrs_present(train, exclude_attrs):
        logger.error('The exclude attrs are not in train table columns')
        raise AssertionError('The exclude attrs are not in the '
                             'train table columns')

    # # Check whether the target attribute is indeed present in the train
    # DataFrame.
    if not ch.check_attrs_present(train, target_attr):
        logger.error('The target attr is not in train table columns')
        raise AssertionError('The target attr is not in the '
                             'train table columns')

    # # Check whether the exclude attributes are indeed present in the test
    # DataFrame.
    if not ch.check_attrs_present(test, exclude_attrs):
        logger.error('The exclude attrs are not in test table columns')
        raise AssertionError('The exclude attrs are not in the '
                             'test table columns')

    # The exclude attributes is expected to be of type list, if not
    # explicitly convert this into a list.
    if not isinstance(exclude_attrs, list):
        exclude_attrs = [exclude_attrs]

    # Drop the duplicates from the exclude attributes
    exclude_attrs = gh.list_drop_duplicates(exclude_attrs)

    # If the target attribute is not present in the exclude attributes,
    # then explicitly add it to the exclude attributes.
    if target_attr not in exclude_attrs:
        exclude_attrs.append(target_attr)

    # Now, fit using training data
    matcher.fit(table=train, exclude_attrs=exclude_attrs,
                target_attr=target_attr)

    # Get a column name to store the predictions.
    predict_attr_name = get_name_for_predict_column(test.columns)

    # Predict using the test data
    predicted = matcher.predict(table=test, exclude_attrs=exclude_attrs,
                                target_attr=predict_attr_name, append=True,
                                inplace=False)

    # Get the evaluation summary.
    eval_summary = eval_matches(predicted, target_attr, predict_attr_name)

    # Get metric in a form that can be displayed from the evaluation summary
    metric = _get_metric(eval_summary)

    # Get false negatives and false positives as a DataFrame
    fp_dataframe = _get_dataframe(predicted, eval_summary['false_pos_ls'])
    fn_dataframe = _get_dataframe(predicted, eval_summary['false_neg_ls'])

    # Reuse the running Qt application instance if one exists; otherwise
    # create one (Qt permits only a single QApplication per process).
    em._viewapp = QtWidgets.QApplication.instance()
    if em._viewapp is None:
        em._viewapp = QtWidgets.QApplication([])
    # Get the main window application
    # (BUG FIX: the original assigned `app = em._viewapp` twice.)
    app = em._viewapp

    m = MainWindowManager(matcher, "dt", exclude_attrs, metric, predicted,
                          fp_dataframe, fn_dataframe)

    # If the show window is true, then display the window.
    if show_window:
        m.show()
        app.exec_()
def get_attr_corres(ltable, rtable):
    """
    This function gets the attribute correspondences between the attributes
    of ltable and rtable.

    The user may need to get the correspondences so
    that he/she can generate features based on those correspondences.

    Args:
        ltable,rtable (DataFrame): Input DataFrames for which the attribute
            correspondences must be obtained.

    Returns:
        A Python dictionary is returned containing the attribute
        correspondences. Specifically, this returns a dictionary with the
        following key-value pairs:

        corres: points to the list correspondences as tuples. Each
        correspondence is a tuple with two attributes: one from ltable
        and the other from rtable.

        ltable: points to ltable.

        rtable: points to rtable.

        Currently, 'corres' contains only pairs of attributes with exact
        names in ltable and rtable.

    Raises:
        AssertionError: If `ltable` is not of type
            pandas DataFrame.
        AssertionError: If `rtable` is not of type
            pandas DataFrame.

    Examples:
        >>> import py_entitymatching as em
        >>> A = em.read_csv_metadata('path_to_csv_dir/table_A.csv', key='ID')
        >>> B = em.read_csv_metadata('path_to_csv_dir/table_B.csv', key='ID')
        >>> match_c = em.get_attr_corres(A, B)
    """
    # Both inputs must be pandas DataFrames.
    validate_object_type(ltable, pd.DataFrame, error_prefix='Input ltable')
    validate_object_type(rtable, pd.DataFrame, error_prefix='Input rtable')

    # Pair up every ltable column that also appears (by exact name) in
    # rtable. Note: this may not be the fastest way to implement this; we
    # could refactor it later.
    shared_pairs = [(column, column)
                    for column in ltable.columns
                    if column in rtable.columns]

    # Package the correspondences together with both input tables.
    return {'corres': shared_pairs,
            'ltable': ltable,
            'rtable': rtable}
def get_false_negatives_as_df(table, eval_summary, verbose=False): """ Select only the false negatives from the input table and return as a DataFrame based on the evaluation results. Args: table (DataFrame): The input table (pandas DataFrame) that was used for evaluation. eval_summary (dictionary): A Python dictionary containing evaluation results, typically from 'eval_matches' command. Returns: A pandas DataFrame containing only the false negatives from the input table. Further, this function sets the output DataFrame's properties same as input DataFrame. Examples: >>> import py_entitymatching as em >>> # G is the labeled data used for development purposes, match_f is the feature table >>> H = em.extract_feat_vecs(G, feat_table=match_f, attrs_after='gold_labels') >>> dt = em.DTMatcher() >>> dt.fit(table=H, exclude_attrs=['_id', 'ltable_id', 'rtable_id', 'gold_labels'], target_attr='gold_labels') >>> pred_table = dt.predict(table=H, exclude_attrs=['_id', 'ltable_id', 'rtable_id', 'gold_labels'], append=True, target_attr='predicted_labels') >>> eval_summary = em.eval_matches(pred_table, 'gold_labels', 'predicted_labels') >>> false_neg_df = em.get_false_negatives_as_df(H, eval_summary) """ # Validate input parameters # # We expect the input candset to be of type pandas DataFrame. 
validate_object_type(table, pd.DataFrame, error_prefix='Input cand.set') # Do metadata checking # # Mention what metadata is required to the user ch.log_info( logger, 'Required metadata: cand.set key, fk ltable, ' 'fk rtable, ' 'ltable, rtable, ltable key, rtable key', verbose) # # Get metadata ch.log_info(logger, 'Getting metadata from the catalog', verbose) key, fk_ltable, fk_rtable, ltable, rtable, l_key, r_key = \ cm.get_metadata_for_candset( table, logger, verbose) # # Validate metadata ch.log_info(logger, 'Validating metadata', verbose) cm._validate_metadata_for_candset(table, key, fk_ltable, fk_rtable, ltable, rtable, l_key, r_key, logger, verbose) data_frame = _get_dataframe(table, eval_summary['false_neg_ls']) # # Update catalog ch.log_info(logger, 'Updating catalog', verbose) cm.init_properties(data_frame) cm.copy_properties(table, data_frame) # # Update catalog ch.log_info(logger, 'Returning the dataframe', verbose) return data_frame
def eval_matches(data_frame, gold_label_attr, predicted_label_attr):
    """
    Evaluates the matches from the matcher.

    Specifically, given a DataFrame containing golden labels and predicted
    labels, this function would evaluate the matches and return the accuracy
    results such as precision, recall and F1.

    Args:
        data_frame (DataFrame): The input pandas DataFrame containing "gold"
            labels and "predicted" labels.
        gold_label_attr (string): An attribute in the input DataFrame
            containing "gold" labels.
        predicted_label_attr (string): An attribute in the input DataFrame
            containing "predicted" labels.

    Returns:
        A Python dictionary containing the accuracy measures such as
        precision, recall, F1.

    Raises:
        AssertionError: If `data_frame` is not of type
            pandas DataFrame.
        AssertionError: If `gold_label_attr` is not of
            type string.
        AssertionError: If `predicted_label_attr` is not of
            type string.
        AssertionError: If the `gold_label_attr` is not in
            the input dataFrame.
        AssertionError: If the `predicted_label_attr` is not in
            the input dataFrame.

    Examples:
        >>> import py_entitymatching as em
        >>> # G is the labeled data used for development purposes, match_f is the feature table
        >>> H = em.extract_feat_vecs(G, feat_table=match_f, attrs_after='gold_labels')
        >>> dt = em.DTMatcher()
        >>> dt.fit(table=H, exclude_attrs=['_id', 'ltable_id', 'rtable_id', 'gold_labels'], target_attr='gold_labels')
        >>> pred_table = dt.predict(table=H,  exclude_attrs=['_id', 'ltable_id', 'rtable_id', 'gold_labels'],  append=True, target_attr='predicted_labels')
        >>> eval_summary = em.eval_matches(pred_table, 'gold_labels', 'predicted_labels')
    """
    # Validate input parameters
    # # We expect the input object to be of type pandas DataFrame
    validate_object_type(data_frame, pd.DataFrame, 'The input table')

    # # We expect the input attribute (gold_label_attr) to be of type string
    validate_object_type(gold_label_attr, six.string_types,
                         'The input gold_label_attr')

    # # We expect the input attribute (predicted_label_attr) to be of type
    # string
    validate_object_type(predicted_label_attr, six.string_types,
                         'The input predicted_label_attr')

    # Check whether the gold label attribute is present in the input table
    if not ch.check_attrs_present(data_frame, gold_label_attr):
        logger.error(
            'The gold_label_attr is not present in the input DataFrame')
        raise AssertionError(
            'The gold_label_attr is not present in the input DataFrame')

    # Check whether the predicted label attribute is present in the input table
    if not ch.check_attrs_present(data_frame, predicted_label_attr):
        logger.error(
            'The predicted_label_attr is not present in the input DataFrame')
        raise AssertionError(
            'The predicted_label_attr is not present in the input DataFrame')

    # Reset the index to get the indices set as 0..len(table). All the
    # positional index sets computed below are relative to this 0-based
    # index, which is what makes the later `.iloc` lookups valid.
    new_data_frame = data_frame.reset_index(drop=False, inplace=False)

    # Project out the gold and label attributes.
    gold = new_data_frame[gold_label_attr]
    predicted = new_data_frame[predicted_label_attr]

    # Get gold negatives, positives (as arrays of 0-based row positions)
    gold_negative = gold[gold == 0].index.values
    gold_positive = gold[gold == 1].index.values

    # Get predicted negatives, positives
    predicted_negative = predicted[predicted == 0].index.values
    predicted_positive = predicted[predicted == 1].index.values

    # get false positive indices (gold says non-match, prediction says match)
    false_positive_indices = \
        list(set(gold_negative).intersection(predicted_positive))

    # get true positive indices
    true_positive_indices = \
        list(set(gold_positive).intersection(predicted_positive))

    # get false negative indices (gold says match, prediction says non-match)
    false_negative_indices = \
        list(set(gold_positive).intersection(predicted_negative))

    # get true negative indices
    true_negative_indices = \
        list(set(gold_negative).intersection(predicted_negative))

    # Get the number of TP, FP, FN, TN (as floats so the divisions below
    # are true division under Python 2 as well)
    num_true_positives = float(len(true_positive_indices))
    num_false_positives = float(len(false_positive_indices))
    num_false_negatives = float(len(false_negative_indices))
    num_true_negatives = float(len(true_negative_indices))

    # Precision = num_tp/ (num_tp + num_fp)
    # Get precision numerator, denominator
    precision_numerator = num_true_positives
    precision_denominiator = num_true_positives + num_false_positives

    # Recall = num_tp/ (num_tp + num_fn)
    # Get recall numerator, denominator
    recall_numerator = num_true_positives
    recall_denominator = num_true_positives + num_false_negatives

    # Compute precision (guard against division by zero)
    if precision_denominiator == 0.0:
        precision = 0.0
    else:
        precision = precision_numerator / precision_denominiator

    # Compute recall (guard against division by zero)
    if recall_denominator == 0.0:
        recall = 0.0
    else:
        recall = recall_numerator / recall_denominator

    # Compute F1 (harmonic mean; defined as 0 when both prec. and recall are 0)
    if precision == 0.0 and recall == 0.0:
        F1 = 0.0
    else:
        F1 = (2.0 * precision * recall) / (precision + recall)

    # Get the fk_ltable and fk_rtable from the catalog
    fk_ltable = cm.get_property(data_frame, 'fk_ltable')
    fk_rtable = cm.get_property(data_frame, 'fk_rtable')

    # Check if the fk_ltable contains any missing values; the (fk_ltable,
    # fk_rtable) pair is used as the row identifier below, so missing
    # values would make the identifiers ambiguous.
    if ch.does_contain_missing_vals(data_frame, fk_ltable):
        logger.error('The fk_ltable (%s) contains missing values' %
                     fk_ltable)
        raise AssertionError('The fk_ltable (%s) contains missing values' %
                             fk_ltable)

    # Check if the fk_rtable contains any missing values
    if ch.does_contain_missing_vals(data_frame, fk_rtable):
        logger.error('The fk_rtable (%s) contains missing values' %
                     fk_rtable)
        raise AssertionError('The fk_rtable (%s) contains missing values' %
                             fk_rtable)

    # Set the index values to fk_ltable and fk_rtable. This only changes
    # the labels, not row positions, so the positional FP/FN index lists
    # computed above can still be resolved with `.iloc`.
    new_data_frame.set_index([fk_ltable, fk_rtable], drop=False, inplace=True)

    # Get the list of false positives and false negatives as
    # (fk_ltable, fk_rtable) id pairs.
    false_pos_ls = list(
        new_data_frame.iloc[false_positive_indices].index.values)
    false_neg_ls = list(
        new_data_frame.iloc[false_negative_indices].index.values)

    # Store and return the accuracy results in a fixed key order.
    accuracy_results = collections.OrderedDict()
    accuracy_results['prec_numerator'] = precision_numerator
    accuracy_results['prec_denominator'] = precision_denominiator
    accuracy_results['precision'] = precision
    accuracy_results['recall_numerator'] = recall_numerator
    accuracy_results['recall_denominator'] = recall_denominator
    accuracy_results['recall'] = recall
    accuracy_results['f1'] = F1
    accuracy_results['pred_pos_num'] = num_true_positives + \
                                       num_false_positives
    accuracy_results['false_pos_num'] = num_false_positives
    accuracy_results['false_pos_ls'] = false_pos_ls
    accuracy_results['pred_neg_num'] = num_false_negatives + \
                                       num_true_negatives
    accuracy_results['false_neg_num'] = num_false_negatives
    accuracy_results['false_neg_ls'] = false_neg_ls
    return accuracy_results
def block_tables(self, ltable, rtable, l_output_attrs=None,
                 r_output_attrs=None, l_output_prefix='ltable_',
                 r_output_prefix='rtable_', verbose=False, show_progress=True,
                 n_ltable_chunks=1, n_rtable_chunks=1):
    """
    WARNING THIS COMMAND IS EXPERIMENTAL AND NOT TESTED. USE AT YOUR OWN RISK.

    Blocks two tables based on a black box blocking function specified
    by the user.
    Finds tuple pairs from left and right tables that survive the black
    box function. A tuple pair survives the black box blocking function if
    the function returns False for that pair, otherwise the tuple pair is
    dropped.

    Args:
        ltable (DataFrame): The left input table.
        rtable (DataFrame): The right input table.
        l_output_attrs (list): A list of attribute names from the left
                               table to be included in the
                               output candidate set (defaults to None).
        r_output_attrs (list): A list of attribute names from the right
                               table to be included in the
                               output candidate set (defaults to None).
        l_output_prefix (string): The prefix to be used for the attribute names
                                  coming from the left table in the output
                                  candidate set (defaults to 'ltable\\_').
        r_output_prefix (string): The prefix to be used for the attribute names
                                  coming from the right table in the output
                                  candidate set (defaults to 'rtable\\_').
        verbose (boolean): A flag to indicate whether the debug
            information  should be logged (defaults to False).
        show_progress (boolean): A flag to indicate whether progress should
                                 be displayed to the user (defaults to True).
        n_ltable_chunks (int): The number of partitions to split the left table (
                                defaults to 1). If it is set to -1, then the number of
                                partitions is set to the number of cores in the
                                machine.
        n_rtable_chunks (int): The number of partitions to split the right table (
                                defaults to 1). If it is set to -1, then the number of
                                partitions is set to the number of cores in the
                                machine.

    Returns:
        A candidate set of tuple pairs that survived blocking (DataFrame).

    Raises:
        AssertionError: If `ltable` is not of type pandas
            DataFrame.
        AssertionError: If `rtable` is not of type pandas
            DataFrame.
        AssertionError: If `l_output_attrs` is not of type of
            list.
        AssertionError: If `r_output_attrs` is not of type of
            list.
        AssertionError: If values in `l_output_attrs` is not of type
            string.
        AssertionError: If values in `r_output_attrs` is not of type
            string.
        AssertionError: If `l_output_prefix` is not of type
            string.
        AssertionError: If `r_output_prefix` is not of type
            string.
        AssertionError: If `verbose` is not of type
            boolean.
        AssertionError: If `show_progress` is not of type boolean.
        AssertionError: If `n_ltable_chunks` is not of type
            int.
        AssertionError: If `n_rtable_chunks` is not of type
            int.
        AssertionError: If `l_out_attrs` are not in the ltable.
        AssertionError: If `r_out_attrs` are not in the rtable.

    Examples:
        >>> def match_last_name(ltuple, rtuple):
            # assume that there is a 'name' attribute in the input tables
            # and each value in it has two words
            l_last_name = ltuple['name'].split()[1]
            r_last_name = rtuple['name'].split()[1]
            if l_last_name != r_last_name:
                return True
            else:
                return False
        >>> import py_entitymatching as em
        >>> from py_entitymatching.dask.dask_black_box_blocker DaskBlackBoxBlocker
        >>> bb = DaskBlackBoxBlocker()
        >>> bb.set_black_box_function(match_last_name)
        >>> C = bb.block_tables(A, B, l_output_attrs=['name'], r_output_attrs=['name'] )
    """
    logger.warning(
        "WARNING THIS COMMAND IS EXPERIMENTAL AND NOT TESTED. USE AT YOUR OWN RISK.")

    # validate data types of standard input parameters
    self.validate_types_params_tables(ltable, rtable, l_output_attrs,
                                      r_output_attrs, l_output_prefix,
                                      r_output_prefix, verbose, 1)

    # validate data type of show_progress
    self.validate_show_progress(show_progress)

    # validate black box function (BUG FIX: identity check, not `!= None`)
    assert self.black_box_function is not None, 'Black box function is not set'

    # validate output attributes
    self.validate_output_attrs(ltable, rtable, l_output_attrs, r_output_attrs)

    # get and validate metadata
    log_info(logger, 'Required metadata: ltable key, rtable key', verbose)

    # # get metadata
    l_key, r_key = cm.get_keys_for_ltable_rtable(ltable, rtable, logger,
                                                 verbose)

    # # validate metadata
    cm._validate_metadata_for_table(ltable, l_key, 'ltable', logger, verbose)
    cm._validate_metadata_for_table(rtable, r_key, 'rtable', logger, verbose)

    # validate number of ltable and rtable chunks
    validate_object_type(n_ltable_chunks, int, 'Parameter n_ltable_chunks')
    validate_object_type(n_rtable_chunks, int, 'Parameter n_rtable_chunks')
    validate_chunks(n_ltable_chunks)
    validate_chunks(n_rtable_chunks)

    # # determine the number of chunks
    n_ltable_chunks = get_num_partitions(n_ltable_chunks, len(ltable))
    n_rtable_chunks = get_num_partitions(n_rtable_chunks, len(rtable))

    # do blocking

    # # set index for convenience
    l_df = ltable.set_index(l_key, drop=False)
    r_df = rtable.set_index(r_key, drop=False)

    # # remove l_key from l_output_attrs and r_key from r_output_attrs,
    # since the keys are always included in the output
    l_output_attrs_1 = []
    if l_output_attrs:
        l_output_attrs_1 = [x for x in l_output_attrs if x != l_key]
    r_output_attrs_1 = []
    if r_output_attrs:
        r_output_attrs_1 = [x for x in r_output_attrs if x != r_key]

    # # pickle the black-box function before passing it as an arg to
    # # _block_tables_split to be executed by each child process
    black_box_function_pkl = cp.dumps(self.black_box_function)

    if n_ltable_chunks == 1 and n_rtable_chunks == 1:
        # single process
        candset = _block_tables_split(l_df, r_df, l_key, r_key,
                                      l_output_attrs_1, r_output_attrs_1,
                                      l_output_prefix, r_output_prefix,
                                      black_box_function_pkl, show_progress)
    else:
        # multiprocessing
        # BUG FIX: `pd.np` was removed in pandas 2.0; use numpy directly.
        l_splits = np.array_split(l_df, n_ltable_chunks)
        r_splits = np.array_split(r_df, n_rtable_chunks)

        c_splits = []
        for i in range(len(l_splits)):
            for j in range(len(r_splits)):
                partial_result = delayed(_block_tables_split)(
                    l_splits[i], r_splits[j],
                    l_key, r_key,
                    l_output_attrs_1, r_output_attrs_1,
                    l_output_prefix, r_output_prefix,
                    black_box_function_pkl, False)
                c_splits.append(partial_result)

        c_splits = delayed(wrap)(c_splits)
        if show_progress:
            with ProgressBar():
                c_splits = c_splits.compute(scheduler="processes",
                                            num_workers=get_num_cores())
        else:
            c_splits = c_splits.compute(scheduler="processes",
                                        num_workers=get_num_cores())

        candset = pd.concat(c_splits, ignore_index=True)

    # # determine the attributes to retain in the output candidate set
    retain_cols = self.get_attrs_to_retain(l_key, r_key,
                                           l_output_attrs, r_output_attrs,
                                           l_output_prefix, r_output_prefix)
    if len(candset) > 0:
        candset = candset[retain_cols]
    else:
        candset = pd.DataFrame(columns=retain_cols)

    # update catalog
    key = get_name_for_key(candset.columns)
    candset = add_key_column(candset, key)
    cm.set_candset_properties(candset, key, l_output_prefix + l_key,
                              r_output_prefix + r_key, ltable, rtable)

    # return candidate set
    return candset
def check_fk_constraint(df_foreign, attr_foreign, df_base, attr_base):
    """
    Check if the foreign key constraint is satisfied between two tables.

    Specifically, verifies that (1) the foreign key column contains no
    missing values, (2) every value of `attr_foreign` in `df_foreign`
    occurs in `attr_base` of `df_base`, and (3) the referenced values are
    unique (form a key) in the base table.

    Args:
        df_foreign (pandas dataframe): Foreign dataframe
        attr_foreign (str): Attribute in the foreign dataframe
        df_base (pandas dataframe): Base dataframe
        attr_base (str): Attribute in the base dataframe

    Returns:
        result (bool). Returns True if the foreign key constraint is
        satisfied, else returns False

    Notes:
        This is an internal helper function
    """
    # Validate input types; these raise AssertionError on mismatch.
    validate_object_type(df_foreign, pd.DataFrame,
                         error_prefix='Input object (df_foreign)')
    validate_object_type(attr_foreign, six.string_types,
                         error_prefix='Input attr (attr_foreign)')
    validate_object_type(df_base, pd.DataFrame,
                         error_prefix='Input object (df_base)')
    validate_object_type(attr_base, six.string_types,
                         error_prefix='Input attr (attr_base)')

    # Both attributes must actually exist in their respective tables.
    if not check_attrs_present(df_base, attr_base):
        logger.warning('The attribute %s is not in df_base' % attr_base)
        return False

    if not check_attrs_present(df_foreign, attr_foreign):
        logger.error('Input attr (attr_foreign) is not in df_foreign')
        return False

    # A foreign key must not contain missing values.
    if df_foreign[attr_foreign].isnull().any():
        logger.warning(
            'The attribute %s in foreign table contains null values' %
            attr_foreign)
        return False

    # Every distinct foreign-key value must appear in the base attribute.
    uniq_fk_vals = set(pd.unique(df_foreign[attr_foreign]))
    base_attr_vals = df_base[attr_base].values
    missing_vals = uniq_fk_vals.difference(base_attr_vals)
    if missing_vals:
        logger.warning(
            'For some attr. values in (%s) in the foreign table there are no values in '
            '(%s) in the base table' % (attr_foreign, attr_base))
        return False

    # Check whether the referenced values are unique in the base table.
    referenced = df_base[df_base[attr_base].isin(
        pd.unique(df_foreign[attr_foreign]))]
    if not is_key_attribute(referenced, attr_base):
        logger.warning(
            'Key attr. constraint for the subset of values (derived from. %s)'
            'in %s is not satisfied' % (attr_foreign, attr_base))
        return False
    return True
def block_candset(self, candset, verbose=False, show_progress=True, n_chunks=1):
    """
    WARNING THIS COMMAND IS EXPERIMENTAL AND NOT TESTED. USE AT YOUR OWN RISK

    Blocks an input candidate set of tuple pairs based on a sequence of
    blocking rules supplied by the user.

    Finds tuple pairs from an input candidate set of tuple pairs that
    survive the sequence of blocking rules. A tuple pair survives the
    sequence of blocking rules if none of the rules in the sequence returns
    True for that pair. If any of the rules returns True, then the pair is
    blocked (dropped).

    Args:
        candset (DataFrame): The input candidate set of tuple pairs.

        verbose (boolean): A flag to indicate whether the debug
            information  should be logged (defaults to False).

        show_progress (boolean): A flag to indicate whether progress should
            be displayed to the user (defaults to True).

        n_chunks (int): The number of partitions to split the candidate set.
            If it is set to -1, the number of partitions will be set to the
            number of cores in the machine.

    Returns:
        A candidate set of tuple pairs that survived blocking (DataFrame).

    Raises:
        AssertionError: If `candset` is not of type pandas
            DataFrame.
        AssertionError: If `verbose` is not of type
            boolean.
        AssertionError: If `n_chunks` is not of type
            int.
        AssertionError: If `show_progress` is not of type boolean.
        AssertionError: If there are no rules to apply.

    Examples:
        >>> import py_entitymatching as em
        >>> from py_entitymatching.dask.dask_rule_based_blocker import DaskRuleBasedBlocker
        >>> rb = DaskRuleBasedBlocker()
        >>> A = em.read_csv_metadata('path_to_csv_dir/table_A.csv', key='id')
        >>> B = em.read_csv_metadata('path_to_csv_dir/table_B.csv', key='id')
        >>> block_f = em.get_features_for_blocking(A, B)
        >>> rule = ['name_name_lev(ltuple, rtuple) > 3']
        >>> rb.add_rule(rule, feature_table=block_f)
        >>> D = rb.block_candset(C) # C is the candidate set.
    """
    logger.warning(
        "WARNING THIS COMMAND IS EXPERIMENTAL AND NOT TESTED. USE AT YOUR OWN RISK."
    )

    # validate data types of input parameters
    self.validate_types_params_candset(candset, verbose, show_progress,
                                       n_chunks)

    # get and validate metadata
    log_info(
        logger, 'Required metadata: cand.set key, fk ltable, ' +
        'fk rtable, ltable, rtable, ltable key, rtable key', verbose)

    # # get metadata: the candset's key, its foreign keys into ltable/rtable,
    # # and the two base tables with their keys, all from the catalog.
    key, fk_ltable, fk_rtable, ltable, rtable, l_key, r_key = cm.get_metadata_for_candset(
        candset, logger, verbose)

    # # validate metadata
    cm._validate_metadata_for_candset(candset, key, fk_ltable, fk_rtable,
                                      ltable, rtable, l_key, r_key,
                                      logger, verbose)

    # validate rules: there must be at least one rule registered.
    assert len(self.rules.keys()) > 0, 'There are no rules to apply'

    # validate n_chunks parameter
    validate_object_type(n_chunks, int, 'Parameter n_chunks')
    validate_chunks(n_chunks)
    # Resolve -1 (or oversized values) against the candset length.
    n_chunks = get_num_partitions(n_chunks, len(candset))

    # do blocking

    # # initialize the progress bar
    # if show_progress:
    #     bar = pyprind.ProgBar(len(candset))

    # # set index for convenience: lets rule evaluation look up tuples by key.
    l_df = ltable.set_index(l_key, drop=False)
    r_df = rtable.set_index(r_key, drop=False)

    # # get attributes to project — only the columns the rules need.
    l_proj_attrs, r_proj_attrs = self.get_attrs_to_project(l_key, r_key,
                                                           [], [])
    l_df, r_df = l_df[l_proj_attrs], r_df[r_proj_attrs]

    # Apply every rule (rule_to_exclude=None means exclude nothing).
    c_df = self.block_candset_excluding_rule(candset, l_df, r_df, l_key,
                                             r_key, fk_ltable, fk_rtable,
                                             None, show_progress, n_chunks)

    # update catalog so the output candset keeps its metadata.
    cm.set_candset_properties(c_df, key, fk_ltable, fk_rtable, ltable,
                              rtable)

    # return candidate set
    return c_df
def execute(self, input_table, label_column, inplace=True, verbose=False):
    """ Executes the rules of the match trigger for a table of matcher
        results.

        For every tuple pair whose current label differs from the trigger's
        target value, the rules are evaluated against the original left and
        right tuples; when the rule outcome equals the configured condition
        status, the label is overwritten with the trigger's value.

        Args:
            input_table (DataFrame): The input table of type pandas DataFrame
                containing tuple pairs and labels from matching (defaults to None).
            label_column (string): The attribute name where the predictions
                are stored in the input table (defaults to None).
            inplace (boolean): A flag to indicate whether the append needs to be
                done inplace (defaults to True).
            verbose (boolean): A flag to indicate whether the debug information
                should be logged (defaults to False).

        Returns:
            A DataFrame with predictions updated.

        Examples:
            >>> import py_entitymatching as em
            >>> mt = em.MatchTrigger()
            >>> A = em.read_csv_metadata('path_to_csv_dir/table_A.csv', key='id')
            >>> B = em.read_csv_metadata('path_to_csv_dir/table_B.csv', key='id')
            >>> match_f = em.get_features_for_matching(A, B)
            >>> rule = ['title_title_lev_sim(ltuple, rtuple) > 0.7']
            >>> mt.add_cond_rule(rule, match_f)
            >>> mt.add_cond_status(True)
            >>> mt.add_action(1)
            >>> # The table H is a table with prediction labels generated from matching
            >>> mt.execute(input_table=H, label_column='predicted_labels', inplace=False)
    """
    # Validate input parameters
    # # We expect the table to be of type pandas DataFrame
    validate_object_type(input_table, pd.DataFrame, 'Input table')

    # # We expect the label_column to be of type string if not None
    if label_column is not None and not isinstance(label_column, str):
        logger.error('Input target_attr must be a string.')
        raise AssertionError('Input target_attr must be a string.')

    # # We expect the inplace to be of type boolean
    validate_object_type(inplace, bool, 'Input inplace')

    # # We expect the verbose to be of type boolean
    # # BUG FIX: the error prefix previously said 'Input append' (copied
    # # from a sibling method) although the parameter validated is verbose.
    validate_object_type(verbose, bool, 'Input verbose')

    # Validate that there are some rules
    assert len(self.rules.keys()) > 0, 'There are no rules to apply'

    # # get metadata (candset key, foreign keys, base tables and their keys)
    key, fk_ltable, fk_rtable, ltable, rtable, l_key, r_key = \
        cm.get_metadata_for_candset(input_table, logger, verbose)

    # # validate metadata
    cm._validate_metadata_for_candset(input_table, key, fk_ltable, fk_rtable,
                                      ltable, rtable, l_key, r_key,
                                      logger, verbose)

    assert ltable is not None, 'Left table is not set'
    assert rtable is not None, 'Right table is not set'
    assert label_column in input_table.columns, \
        'Label column not in the input table'

    # Parse conjuncts to validate that the features are in the feature table
    for rule in self.rule_conjunct_list:
        for conjunct in self.rule_conjunct_list[rule]:
            parse_conjunct(conjunct, self.rule_ft[rule])

    # Work on a copy unless the caller asked for in-place modification.
    table = input_table if inplace else input_table.copy()

    # set the index and store it in l_tbl/r_tbl so tuples can be fetched
    # by their key values.
    l_tbl = ltable.set_index(l_key, drop=False)
    r_tbl = rtable.set_index(r_key, drop=False)

    # Resolve positional indices once, outside the loop.
    column_names = list(input_table.columns)
    lid_idx = column_names.index(fk_ltable)
    rid_idx = column_names.index(fk_rtable)
    label_idx = column_names.index(label_column)

    idx = 0
    for row in input_table.itertuples(index=False):
        # Only pairs whose label differs from the trigger value need to be
        # re-examined; matching pairs are already in the target state.
        if row[label_idx] != self.value_to_set:
            l_row = l_tbl.loc[row[lid_idx]]
            r_row = r_tbl.loc[row[rid_idx]]
            res = self.apply_rules(l_row, r_row)
            if res == self.cond_status:
                table.iat[idx, label_idx] = self.value_to_set
        idx += 1

    return table
def block_tables(self, ltable, rtable, l_output_attrs=None,
                 r_output_attrs=None,
                 l_output_prefix='ltable_', r_output_prefix='rtable_',
                 verbose=False, show_progress=True, n_ltable_chunks=1,
                 n_rtable_chunks=1):
    """
    WARNING THIS COMMAND IS EXPERIMENTAL AND NOT TESTED. USE AT YOUR OWN RISK

    Blocks two tables based on the sequence of rules supplied by the user.

    Finds tuple pairs from left and right tables that survive the sequence
    of blocking rules. A tuple pair survives the sequence of blocking rules
    if none of the rules in the sequence returns True for that pair. If any
    of the rules returns True, then the pair is blocked.

    Args:
        ltable (DataFrame): The left input table.

        rtable (DataFrame): The right input table.

        l_output_attrs (list): A list of attribute names from the left
            table to be included in the output candidate set (defaults
            to None).

        r_output_attrs (list): A list of attribute names from the right
            table to be included in the output candidate set (defaults
            to None).

        l_output_prefix (string): The prefix to be used for the attribute names
            coming from the left table in the output candidate set
            (defaults to 'ltable\_').

        r_output_prefix (string): The prefix to be used for the attribute names
            coming from the right table in the output candidate set
            (defaults to 'rtable\_').

        verbose (boolean): A flag to indicate whether the debug
            information  should be logged (defaults to False).

        show_progress (boolean): A flag to indicate whether progress should
            be displayed to the user (defaults to True).

        n_ltable_chunks (int): The number of partitions to split the left table (
            defaults to 1). If it is set to -1, then the number of
            partitions is set to the number of cores in the machine.

        n_rtable_chunks (int): The number of partitions to split the right table (
            defaults to 1). If it is set to -1, then the number of
            partitions is set to the number of cores in the machine.

    Returns:
        A candidate set of tuple pairs that survived the sequence of
        blocking rules (DataFrame).

    Raises:
        AssertionError: If `ltable` is not of type pandas
            DataFrame.
        AssertionError: If `rtable` is not of type pandas
            DataFrame.
        AssertionError: If `l_output_attrs` is not of type of
            list.
        AssertionError: If `r_output_attrs` is not of type of
            list.
        AssertionError: If the values in `l_output_attrs` is not of type
            string.
        AssertionError: If the values in `r_output_attrs` is not of type
            string.
        AssertionError: If the input `l_output_prefix` is not of type
            string.
        AssertionError: If the input `r_output_prefix` is not of type
            string.
        AssertionError: If `verbose` is not of type
            boolean.
        AssertionError: If `show_progress` is not of type boolean.
        AssertionError: If `n_ltable_chunks` is not of type int.
        AssertionError: If `n_rtable_chunks` is not of type int.
        AssertionError: If `l_out_attrs` are not in the ltable.
        AssertionError: If `r_out_attrs` are not in the rtable.
        AssertionError: If there are no rules to apply.

    Examples:
        >>> import py_entitymatching as em
        >>> from py_entitymatching.dask.dask_rule_based_blocker import DaskRuleBasedBlocker
        >>> rb = DaskRuleBasedBlocker()
        >>> A = em.read_csv_metadata('path_to_csv_dir/table_A.csv', key='id')
        >>> B = em.read_csv_metadata('path_to_csv_dir/table_B.csv', key='id')
        >>> block_f = em.get_features_for_blocking(A, B)
        >>> rule = ['name_name_lev(ltuple, rtuple) > 3']
        >>> rb.add_rule(rule, feature_table=block_f)
        >>> C = rb.block_tables(A, B)
    """
    logger.warning(
        "WARNING THIS COMMAND IS EXPERIMENTAL AND NOT TESTED. USE AT YOUR OWN RISK."
    )

    # validate data types of input parameters
    self.validate_types_params_tables(ltable, rtable, l_output_attrs,
                                      r_output_attrs, l_output_prefix,
                                      r_output_prefix, verbose, 1)

    # validate data type of show_progress
    self.validate_show_progress(show_progress)

    # validate input parameters: output attrs must exist in their tables.
    self.validate_output_attrs(ltable, rtable, l_output_attrs,
                               r_output_attrs)

    # get and validate metadata
    log_info(logger, 'Required metadata: ltable key, rtable key', verbose)

    # # get metadata (each table's key attribute) from the catalog
    l_key, r_key = cm.get_keys_for_ltable_rtable(ltable, rtable, logger,
                                                 verbose)

    # # validate metadata
    cm._validate_metadata_for_table(ltable, l_key, 'ltable', logger,
                                    verbose)
    cm._validate_metadata_for_table(rtable, r_key, 'rtable', logger,
                                    verbose)

    # validate rules: there must be at least one rule registered.
    assert len(self.rules.keys()) > 0, 'There are no rules to apply'

    # validate number of ltable and rtable chunks
    validate_object_type(n_ltable_chunks, int, 'Parameter n_ltable_chunks')
    validate_object_type(n_rtable_chunks, int, 'Parameter n_rtable_chunks')
    validate_chunks(n_ltable_chunks)
    validate_chunks(n_rtable_chunks)

    # # determine the number of chunks (resolves -1 / oversized values)
    n_ltable_chunks = get_num_partitions(n_ltable_chunks, len(ltable))
    n_rtable_chunks = get_num_partitions(n_rtable_chunks, len(rtable))

    # # set index for convenience so tuples can be fetched by key.
    l_df = ltable.set_index(l_key, drop=False)
    r_df = rtable.set_index(r_key, drop=False)

    # # remove l_key from l_output_attrs and r_key from r_output_attrs
    # # (keys are always carried separately and re-added on output).
    l_output_attrs_1 = []
    if l_output_attrs:
        l_output_attrs_1 = [x for x in l_output_attrs if x != l_key]
    r_output_attrs_1 = []
    if r_output_attrs:
        r_output_attrs_1 = [x for x in r_output_attrs if x != r_key]

    # # get attributes to project — keys, output attrs, and any attrs the
    # # rules reference.
    l_proj_attrs, r_proj_attrs = self.get_attrs_to_project(l_key, r_key,
                                                           l_output_attrs_1,
                                                           r_output_attrs_1)
    l_df, r_df = l_df[l_proj_attrs], r_df[r_proj_attrs]

    # Fast path: try to apply one filterable rule using join filters.
    # The number of splits passed here is the number of cores in the machine.
    candset, rule_applied = self.block_tables_with_filters(
        l_df, r_df, l_key, r_key, l_output_attrs_1, r_output_attrs_1,
        l_output_prefix, r_output_prefix, verbose, show_progress,
        get_num_cores())

    if candset is None:
        # no filterable rule was applied; fall back to the cartesian-style
        # blocking over table chunk pairs.
        candset = self.block_tables_without_filters(
            l_df, r_df, l_key, r_key, l_output_attrs_1, r_output_attrs_1,
            l_output_prefix, r_output_prefix, verbose, show_progress,
            n_ltable_chunks, n_rtable_chunks)
    elif len(self.rules) > 1:
        # one filterable rule was applied but other rules are left
        # block candset by applying other rules and excluding the applied rule
        candset = self.block_candset_excluding_rule(
            candset, l_df, r_df, l_key, r_key, l_output_prefix + l_key,
            r_output_prefix + r_key, rule_applied, show_progress,
            get_num_cores())

    # Keep only the key columns and requested output attributes.
    retain_cols = self.get_attrs_to_retain(l_key, r_key, l_output_attrs_1,
                                           r_output_attrs_1,
                                           l_output_prefix, r_output_prefix)
    if len(candset) > 0:
        candset = candset[retain_cols]
    else:
        candset = pd.DataFrame(columns=retain_cols)

    # update catalog: give the candset a fresh key column and register its
    # metadata (key, foreign keys, base tables).
    key = get_name_for_key(candset.columns)
    candset = add_key_column(candset, key)
    cm.set_candset_properties(candset, key, l_output_prefix + l_key,
                              r_output_prefix + r_key, ltable, rtable)

    # return candidate set
    return candset
def debug_randomforest_matcher(random_forest, tuple_1, tuple_2, feature_table,
                               table_columns, exclude_attrs=None):
    """
    This function is used to debug a random forest matcher using two input
    tuples.

    Specifically, this function takes in two tuples, gets the feature vector
    using the feature table and finally passes it to the random forest and
    displays the path that the feature vector takes in each of the decision
    trees that make up the random forest matcher.

    Args:
        random_forest (RFMatcher): The input random forest object that should
            be debugged.
        tuple_1,tuple_2 (Series): Input tuples that should be debugged.
        feature_table (DataFrame): Feature table containing the functions
            for the features.
        table_columns (list): List of all columns that will be outputted
            after generation of feature vectors.
        exclude_attrs (list): List of attributes that should be removed from
            the table columns.

    Raises:
        AssertionError: If the input feature table is not of type pandas
            DataFrame.

    Examples:
        >>> import py_entitymatching as em
        >>> # devel is the labeled data used for development purposes, match_f is the feature table
        >>> H = em.extract_feat_vecs(devel, feat_table=match_f, attrs_after='gold_labels')
        >>> rf = em.RFMatcher()
        >>> rf.fit(table=H, exclude_attrs=['_id', 'ltable_id', 'rtable_id', 'gold_labels'], target_attr='gold_labels')
        >>> # F is the feature vector got from evaluation set of the labeled data.
        >>> out = rf.predict(table=F, exclude_attrs=['_id', 'ltable_id', 'rtable_id', 'gold_labels'], target_attr='gold_labels')
        >>> # A and B are input tables
        >>> em.debug_randomforest_matcher(rf, A.ix[1], B.ix[2], match_f, H.columns, exclude_attrs=['_id', 'ltable_id', 'rtable_id', 'gold_labels'], target_attr='gold_labels')
    """
    # Validate input parameters.
    # # We expect the feature table to be of type pandas DataFrame
    validate_object_type(feature_table, pd.DataFrame,
                         error_prefix='The input feature')

    # Get the underlying scikit-learn classifier: unwrap an RFMatcher,
    # otherwise assume the object is already a classifier.
    if isinstance(random_forest, RFMatcher):
        clf = random_forest.clf
    else:
        clf = random_forest

    # Get the feature names based on the table columns and the exclude
    # attributes given.
    if exclude_attrs is None:
        feature_names = table_columns
    else:
        # NOTE(review): boolean-mask indexing assumes table_columns is a
        # pandas Index (e.g. H.columns, as in the example above); a plain
        # Python list would not support table_columns[cols] — confirm callers.
        cols = [c not in exclude_attrs for c in table_columns]
        feature_names = table_columns[cols]

    # Get the probability pair (non-match, match) for the tuple pair.
    prob = _get_prob(clf, tuple_1, tuple_2, feature_table, feature_names)

    # Decide prediction based on the probability (i.e num. of trees that said
    # match over total number of trees).
    prediction = False
    if prob[1] > prob[0]:
        prediction = True

    # Print the result summary.
    print("Summary: Num trees = {0}; Mean Prob. for non-match = {1}; "
          "Mean Prob for match = {2}; "
          "Match status = {3}".format(str(len(clf.estimators_)),
                                      str(prob[0]),
                                      str(prob[1]), str(prediction)))
    print("")

    # Now, for each estimator (i.e the decision tree) call the decision tree
    # matcher's debugger. enumerate replaces the previous manual counter.
    for tree_number, estimator in enumerate(clf.estimators_, start=1):
        print("Tree " + str(tree_number))
        _debug_decisiontree_matcher(estimator, tuple_1, tuple_2,
                                    feature_table, table_columns,
                                    exclude_attrs, ensemble_flag=True)
        print("")
def block_candset(self, candset, l_block_attr, r_block_attr,
                  allow_missing=False, verbose=False, show_progress=True,
                  n_chunks=1):
    """
    WARNING THIS COMMAND IS EXPERIMENTAL AND NOT TESTED. USE AT YOUR OWN RISK.

    Blocks an input candidate set of tuple pairs based on attribute equivalence.

    Finds tuple pairs from an input candidate set of tuple pairs
    such that the value of attribute l_block_attr of the left tuple in a
    tuple pair exactly matches the value of attribute r_block_attr of the
    right tuple in the tuple pair.

    Args:
        candset (DataFrame): The input candidate set of tuple pairs.

        l_block_attr (string): The blocking attribute in left table.

        r_block_attr (string): The blocking attribute in right table.

        allow_missing (boolean): A flag to indicate whether tuple pairs
             with missing value in at least one of the blocking attributes
             should be included in the output candidate set (defaults to
             False). If this flag is set to True, a tuple pair with missing
             value in either blocking attribute will be retained in the
             output candidate set.

        verbose (boolean): A flag to indicate whether the debug
            information should be logged (defaults to False).

        show_progress (boolean): A flag to indicate whether progress should
            be displayed to the user (defaults to True).

        n_chunks (int): The number of partitions to split the candidate set.
            If it is set to -1, the number of partitions will be set to the
            number of cores in the machine.

    Returns:
        A candidate set of tuple pairs that survived blocking (DataFrame).

    Raises:
        AssertionError: If `candset` is not of type pandas
            DataFrame.
        AssertionError: If `l_block_attr` is not of type string.
        AssertionError: If `r_block_attr` is not of type string.
        AssertionError: If `verbose` is not of type
            boolean.
        AssertionError: If `n_chunks` is not of type
            int.
        AssertionError: If `l_block_attr` is not in the ltable columns.
        AssertionError: If `r_block_attr` is not in the rtable columns.

    Examples:
        >>> import py_entitymatching as em
        >>> from py_entitymatching.dask.dask_attr_equiv_blocker import DaskAttrEquivalenceBlocker
        >>> ab = DaskAttrEquivalenceBlocker()
        >>> A = em.read_csv_metadata('path_to_csv_dir/table_A.csv', key='ID')
        >>> B = em.read_csv_metadata('path_to_csv_dir/table_B.csv', key='ID')
        >>> C = ab.block_tables(A, B, 'zipcode', 'zipcode', l_output_attrs=['name'], r_output_attrs=['name'])
        >>> D1 = ab.block_candset(C, 'age', 'age', allow_missing=True)
        # Include all possible tuple pairs with missing values
        >>> D2 = ab.block_candset(C, 'age', 'age', allow_missing=True)
        # Execute blocking using multiple cores
        >>> D3 = ab.block_candset(C, 'age', 'age', n_chunks=-1)
    """
    logger.warning("WARNING THIS COMMAND IS EXPERIMENTAL AND NOT TESTED. USE AT YOUR OWN "
                   "RISK.")

    # validate data types of input parameters
    self.validate_types_params_candset(candset, verbose, show_progress,
                                       n_chunks)

    # validate data types of input blocking attributes
    self.validate_types_block_attrs(l_block_attr, r_block_attr)

    # get and validate metadata
    log_info(logger, 'Required metadata: cand.set key, fk ltable, '
                     'fk rtable, ltable, rtable, ltable key, rtable key',
             verbose)

    # # get metadata: the candset's key, its foreign keys into the base
    # # tables, and the base tables with their keys.
    key, fk_ltable, fk_rtable, ltable, rtable, l_key, r_key = cm.get_metadata_for_candset(
        candset, logger, verbose)

    # # validate metadata
    cm._validate_metadata_for_candset(candset, key, fk_ltable, fk_rtable,
                                      ltable, rtable, l_key, r_key,
                                      logger, verbose)

    # validate input parameters: blocking attrs must exist in the tables.
    self.validate_block_attrs(ltable, rtable, l_block_attr, r_block_attr)

    # validate n_chunks parameter
    validate_object_type(n_chunks, int, 'Parameter n_chunks')
    validate_chunks(n_chunks)

    # do blocking

    # # do projection before merge — only key + blocking attr are needed.
    l_df = ltable[[l_key, l_block_attr]]
    r_df = rtable[[r_key, r_block_attr]]

    # # set index for convenience so tuples can be looked up by key.
    l_df = l_df.set_index(l_key, drop=False)
    r_df = r_df.set_index(r_key, drop=False)

    # # determine number of processes to launch parallely
    n_chunks = get_num_partitions(n_chunks, len(candset))

    # valid is a boolean mask over candset rows: True = pair survives.
    valid = []
    if n_chunks == 1:
        # single process
        valid = _block_candset_split(candset, l_df, r_df, l_key, r_key,
                                     l_block_attr, r_block_attr,
                                     fk_ltable, fk_rtable, allow_missing,
                                     show_progress)
    else:
        # Build a dask task graph: one delayed call per candset chunk.
        c_splits = np.array_split(candset, n_chunks)
        valid_splits = []
        for i in range(len(c_splits)):
            partial_result = delayed(_block_candset_split)(c_splits[i],
                                                           l_df, r_df,
                                                           l_key, r_key,
                                                           l_block_attr,
                                                           r_block_attr,
                                                           fk_ltable,
                                                           fk_rtable,
                                                           allow_missing,
                                                           False)
            # setting show progress to False as we will use Dask
            # diagnostics to display progress bar
            valid_splits.append(partial_result)

        valid_splits = delayed(wrap)(valid_splits)
        if show_progress:
            with ProgressBar():
                valid_splits = valid_splits.compute(scheduler="processes",
                                                    num_workers=get_num_cores())
        else:
            valid_splits = valid_splits.compute(scheduler="processes",
                                                num_workers=get_num_cores())

        # Flatten the per-chunk boolean lists into one mask.
        valid = sum(valid_splits, [])

    # construct output table from the surviving rows.
    if len(candset) > 0:
        out_table = candset[valid]
    else:
        out_table = pd.DataFrame(columns=candset.columns)

    # update the catalog so the output keeps the candset metadata.
    cm.set_candset_properties(out_table, key, fk_ltable, fk_rtable,
                              ltable, rtable)

    # return the output table
    return out_table
def get_attr_corres(ltable, rtable):
    """
    This function gets the attribute correspondences between the attributes
    of ltable and rtable.

    The user may need to get the correspondences so
    that he/she can generate features based those correspondences.

    Args:
        ltable,rtable (DataFrame): Input DataFrames for which the attribute
            correspondences must be obtained.

    Returns:
        A Python dictionary is returned containing the attribute
        correspondences.

        Specifically, this returns a dictionary with the following key-value
        pairs:

        corres: points to the list correspondences as tuples. Each
        correspondence is a tuple with two attributes: one from ltable
        and the other from rtable.

        ltable: points to ltable.

        rtable: points to rtable.

        Currently, 'corres' contains only pairs of attributes with exact
        names in ltable and rtable.

    Raises:
        AssertionError: If `ltable` is not of type
            pandas DataFrame.
        AssertionError: If `rtable` is not of type
            pandas DataFrame.

    Examples:
        >>> import py_entitymatching as em
        >>> A = em.read_csv_metadata('path_to_csv_dir/table_A.csv', key='ID')
        >>> B = em.read_csv_metadata('path_to_csv_dir/table_B.csv', key='ID')
        >>> match_c = em.get_attr_corres(A, B)
    """
    # Validate input parameters
    # # We expect the input object (ltable) to be of type pandas DataFrame
    validate_object_type(ltable, pd.DataFrame, error_prefix='Input ltable')
    # # We expect the input object (rtable) to be of type pandas DataFrame
    validate_object_type(rtable, pd.DataFrame, error_prefix='Input rtable')

    # Hoist rtable's columns into a set for O(1) membership tests, then
    # pair up every ltable column that also occurs in rtable (exact name
    # match), preserving ltable's column order.
    rtable_columns = set(rtable.columns)
    correspondence_list = [(column, column) for column in ltable.columns
                           if column in rtable_columns]

    # Package the correspondences together with the two input tables.
    correspondence_dict = {
        'corres': correspondence_list,
        'ltable': ltable,
        'rtable': rtable,
    }

    # Finally, return the correspondence dictionary
    return correspondence_dict
def _vis_debug_rf(matcher, train, test, exclude_attrs, target_attr,
                  show_window=True):
    """
    Wrapper function for debugging the Random Forest matcher visually.

    Fits the matcher on `train`, predicts on `test`, computes an evaluation
    summary, and opens a Qt window showing the metric plus the false
    positives/negatives so individual trees can be inspected.
    """
    # Import GUI dependencies lazily so the library works without PyQt5.
    try:
        from PyQt5 import QtWidgets
        from py_entitymatching.gui.debug_gui_base import MainWindowManager
    except ImportError:
        raise ImportError('PyQt5 is not installed. Please install PyQt5 to use '
                          'GUI related functions in py_entitymatching.')

    # Validate the input parameters
    # # We expect the matcher to be of type RfMatcher
    if not isinstance(matcher, RFMatcher):
        logger.error('Input matcher is not of type '
                     'Random Forest matcher')
        raise AssertionError('Input matcher is not of type '
                             'Random Forest matcher')

    # # We expect the target attribute to be of type string.
    validate_object_type(target_attr, six.string_types,
                         error_prefix='Target attribute')

    # # Check whether the exclude attributes are indeed present in the train
    # DataFrame.
    if not check_attrs_present(train, exclude_attrs):
        logger.error('The exclude attrs are not in train table columns')
        raise AssertionError('The exclude attrs are not in the '
                             'train table columns')

    # # Check whether the target attribute is indeed present in the train
    # DataFrame.
    if not check_attrs_present(train, target_attr):
        logger.error('The target attr is not in train table columns')
        raise AssertionError('The target attr is not in the '
                             'train table columns')

    # # Check whether the exclude attributes are indeed present in the test
    # DataFrame.
    if not check_attrs_present(test, exclude_attrs):
        logger.error('The exclude attrs are not in test table columns')
        raise AssertionError('The exclude attrs are not in the '
                             'test table columns')

    # The exclude attributes is expected to be of type list, if not
    # explicitly convert this into a list.
    if not isinstance(exclude_attrs, list):
        exclude_attrs = [exclude_attrs]

    # Drop the duplicates from the exclude attributes
    exclude_attrs = list_drop_duplicates(exclude_attrs)

    # If the target attribute is not present in the exclude attributes,
    # then explicitly add it to the exclude attributes.
    if target_attr not in exclude_attrs:
        exclude_attrs.append(target_attr)

    # Now, fit using training data
    matcher.fit(table=train, exclude_attrs=exclude_attrs,
                target_attr=target_attr)

    # Get a fresh column name (not clashing with test's columns) to store
    # the predictions.
    predict_attr_name = get_name_for_predict_column(test.columns)

    # Predict using the test data
    predicted = matcher.predict(table=test, exclude_attrs=exclude_attrs,
                                target_attr=predict_attr_name, append=True,
                                inplace=False)

    # Get the evaluation summary.
    eval_summary = em.eval_matches(predicted, target_attr,
                                   predict_attr_name)

    # Reuse the process-wide QApplication if one exists; Qt allows only a
    # single QApplication instance per process.
    em._viewapp = QtWidgets.QApplication.instance()
    if em._viewapp is None:
        em._viewapp = QtWidgets.QApplication([])

    # Get metric in a form that can be displayed from the evaluation summary
    metric = _get_metric(eval_summary)

    # Get false negatives and false positives as a DataFrame
    fp_dataframe = _get_dataframe(predicted, eval_summary['false_pos_ls'])
    fn_dataframe = _get_dataframe(predicted, eval_summary['false_neg_ls'])

    # Get the main window application
    app = em._viewapp
    m = MainWindowManager(matcher, "rf", exclude_attrs, metric, predicted,
                          fp_dataframe, fn_dataframe)

    # If the show window is true, then display the window.
    if show_window:
        m.show()
        app.exec_()
def predict(self, table=None, target_attr=None, append=False, inplace=True):
    """Predict interface for the matcher.

    A point to note is all the input parameters have a default value of
    None.

    Args:
        table (DataFrame): The input candidate set of type pandas DataFrame
            containing tuple pairs (defaults to None).
        target_attr (string): The attribute name where the predictions
            need to be stored in the input table (defaults to None).
        append (boolean): A flag to indicate whether the predictions need
            to be appended in the input DataFrame (defaults to False).
        inplace (boolean): A flag to indicate whether the append needs to be
            done inplace (defaults to True).

    Returns:
        An array of predictions or a DataFrame with predictions updated.

    Examples:
        >>> import py_entitymatching as em
        >>> brm = em.BooleanRuleMatcher()
        >>> A = em.read_csv_metadata('path_to_csv_dir/table_A.csv', key='id')
        >>> B = em.read_csv_metadata('path_to_csv_dir/table_B.csv', key='id')
        >>> match_f = em.get_features_for_matching(A, B)
        >>> rule = ['address_address_lev(ltuple, rtuple) > 6']
        >>> brm.add_rule(rule, match_f)
        >>> # The table S is a cand set generated by the blocking and then labeling phases
        >>> brm.predict(S, target_attr='pred_label', append=True)
    """
    # Validate input parameters
    # # We expect the table to be of type pandas DataFrame
    validate_object_type(table, pd.DataFrame, 'Input table')

    # # We expect the target_attr to be of type string if not None
    if target_attr is not None and not isinstance(target_attr, str):
        logger.error('Input target_attr must be a string.')
        raise AssertionError('Input target_attr must be a string.')

    # # We expect the append to be of type boolean
    validate_object_type(append, bool, 'Input append')

    # # We expect the inplace to be of type boolean
    validate_object_type(inplace, bool, 'Input inplace')

    # # get metadata (candset key, foreign keys, base tables and their keys)
    key, fk_ltable, fk_rtable, ltable, rtable, l_key, r_key = \
        cm.get_metadata_for_candset(table, logger, False)

    # # validate metadata
    cm._validate_metadata_for_candset(table, key, fk_ltable, fk_rtable,
                                      ltable, rtable, l_key, r_key,
                                      logger, False)

    # Validate that there are some rules
    assert len(self.rules.keys()) > 0, 'There are no rules to apply'

    # Parse conjuncts to validate that the features are in the feature table
    for rule in self.rule_conjunct_list:
        for conjunct in self.rule_conjunct_list[rule]:
            parse_conjunct(conjunct, self.rule_ft[rule])

    # NOTE(review): validate_object_type above already rejects a None table,
    # so the else-branch below is effectively unreachable; kept as a
    # defensive guard mirroring the other matcher interfaces.
    if table is not None:
        y = self._predict_candset(table)
        if target_attr is not None and append is True:
            if inplace:
                # Write predictions directly into the caller's table.
                table[target_attr] = y
                return table
            else:
                # Leave the caller's table untouched; return a copy.
                tbl = table.copy()
                tbl[target_attr] = y
                return tbl
        else:
            return y
    else:
        raise SyntaxError('The arguments supplied does not match the signatures supported !!!')
def extract_feature_vecs(candset, attrs_before=None, feature_table=None,
                         attrs_after=None, verbose=False,
                         show_progress=True, n_jobs=1):
    """
    This function extracts feature vectors from a DataFrame (typically a
    labeled candidate set).

    Specifically, this function uses feature table, ltable and rtable (that
    is present in the `candset`'s metadata) to extract feature vectors.

    Args:
        candset (DataFrame): The input candidate set for which the features
            vectors should be extracted.
        attrs_before (list): The list of attributes from the input candset,
            that should be added before the feature vectors (defaults to
            None).
        feature_table (DataFrame): A DataFrame containing a list of
            features that should be used to compute the feature vectors (
            defaults to None).
        attrs_after (list): The list of attributes from the input candset
            that should be added after the feature vectors (defaults to
            None).
        verbose (boolean): A flag to indicate whether the debug information
            should be displayed (defaults to False).
        show_progress (boolean): A flag to indicate whether the progress of
            extracting feature vectors must be displayed (defaults to True).
        n_jobs (int): The number of parallel jobs to be used for feature
            extraction (defaults to 1).

    Returns:
        A pandas DataFrame containing feature vectors.

        The DataFrame will have metadata ltable and rtable, pointing
        to the same ltable and rtable as the input candset.

        Also, the output DataFrame will have three columns: key, foreign key
        ltable, foreign key rtable copied from input candset to the output
        DataFrame. These three columns precede the columns mentioned in
        `attrs_before`.

    Raises:
        AssertionError: If `candset` is not of type pandas DataFrame.
        AssertionError: If `attrs_before` has attributes that are not
            present in the input candset.
        AssertionError: If `attrs_after` has attribtues that are not present
            in the input candset.
        AssertionError: If `feature_table` is set to None.

    Examples:
        >>> import py_entitymatching as em
        >>> A = em.read_csv_metadata('path_to_csv_dir/table_A.csv', key='ID')
        >>> B = em.read_csv_metadata('path_to_csv_dir/table_B.csv', key='ID')
        >>> match_f = em.get_features_for_matching(A, B)
        >>> # G is the labeled dataframe which should be converted into feature vectors
        >>> H = em.extract_feature_vecs(G, features=match_f, attrs_before=['title'], attrs_after=['gold_labels'])
    """
    # Imported locally: the `pd.np` alias used previously was removed in
    # pandas >= 2.0; numpy provides the same array_split.
    import numpy as np

    # Validate input parameters
    # # We expect the input candset to be of type pandas DataFrame.
    validate_object_type(candset, pd.DataFrame, error_prefix='Input cand.set')

    # # If attrs_before is given, check that those attributes are present
    # in the input candset.
    if attrs_before is not None:
        if not ch.check_attrs_present(candset, attrs_before):
            logger.error(
                'The attributes mentioned in attrs_before is not present '
                'in the input table')
            raise AssertionError(
                'The attributes mentioned in attrs_before is not present '
                'in the input table')

    # # If attrs_after is given, check that those attributes are present
    # in the input candset.
    if attrs_after is not None:
        if not ch.check_attrs_present(candset, attrs_after):
            raise AssertionError(
                'The attributes mentioned in attrs_after is not present '
                'in the input table')

    # We expect the feature table to be a valid object
    if feature_table is None:
        logger.error('Feature table cannot be null')
        raise AssertionError('The feature table cannot be null')

    # Do metadata checking
    # # Mention what metadata is required to the user
    ch.log_info(logger, 'Required metadata: cand.set key, fk ltable, '
                        'fk rtable, '
                        'ltable, rtable, ltable key, rtable key', verbose)

    # # Get metadata
    ch.log_info(logger, 'Getting metadata from catalog', verbose)
    key, fk_ltable, fk_rtable, ltable, rtable, l_key, r_key = \
        cm.get_metadata_for_candset(candset, logger, verbose)

    # # Validate metadata
    ch.log_info(logger, 'Validating metadata', verbose)
    cm._validate_metadata_for_candset(candset, key, fk_ltable, fk_rtable,
                                      ltable, rtable, l_key, r_key,
                                      logger, verbose)

    # Extract features
    # # Set index on the key columns for O(1) tuple lookup during
    # feature computation.
    l_df = ltable.set_index(l_key, drop=False)
    r_df = rtable.set_index(r_key, drop=False)

    # # Apply feature functions
    ch.log_info(logger, 'Applying feature functions', verbose)
    col_names = list(candset.columns)
    fk_ltable_idx = col_names.index(fk_ltable)
    fk_rtable_idx = col_names.index(fk_rtable)

    n_procs = get_num_procs(n_jobs, len(candset))

    # Replaces the removed `pd.np.array_split`.
    c_splits = np.array_split(candset, n_procs)

    pickled_obj = cloudpickle.dumps(feature_table)

    feat_vals_by_splits = Parallel(n_jobs=n_procs)(
        delayed(get_feature_vals_by_cand_split)(
            pickled_obj, fk_ltable_idx, fk_rtable_idx, l_df, r_df,
            c_splits[i],
            # Show the progress bar only for the last chunk.
            show_progress and i == len(c_splits) - 1)
        for i in range(len(c_splits)))

    feat_vals = sum(feat_vals_by_splits, [])

    # Construct output table
    feature_vectors = pd.DataFrame(feat_vals, index=candset.index.values)
    # # Rearrange the feature names in the input feature table order
    feature_names = list(feature_table['feature_name'])
    feature_vectors = feature_vectors[feature_names]

    ch.log_info(logger, 'Constructing output table', verbose)

    # # Insert attrs_before (reversed, so inserting at position 0 keeps
    # the caller's original order).
    if attrs_before:
        if not isinstance(attrs_before, list):
            attrs_before = [attrs_before]
        attrs_before = gh.list_diff(attrs_before, [key, fk_ltable, fk_rtable])
        attrs_before.reverse()
        for a in attrs_before:
            feature_vectors.insert(0, a, candset[a])

    # # Insert the key and foreign-key columns at the front.
    feature_vectors.insert(0, fk_rtable, candset[fk_rtable])
    feature_vectors.insert(0, fk_ltable, candset[fk_ltable])
    feature_vectors.insert(0, key, candset[key])

    # # Insert attrs_after at the end.
    if attrs_after:
        if not isinstance(attrs_after, list):
            attrs_after = [attrs_after]
        attrs_after = gh.list_diff(attrs_after, [key, fk_ltable, fk_rtable])
        attrs_after.reverse()
        col_pos = len(feature_vectors.columns)
        for a in attrs_after:
            feature_vectors.insert(col_pos, a, candset[a])
            col_pos += 1

    # # Update the catalog so the output carries the candset's metadata.
    cm.init_properties(feature_vectors)
    cm.copy_properties(candset, feature_vectors)

    # Finally, return the feature vectors
    return feature_vectors
def dask_down_sample(ltable, rtable, size, y_param, show_progress=True,
                     verbose=False, seed=None, rem_stop_words=True,
                     rem_puncs=True, n_ltable_chunks=1,
                     n_sample_rtable_chunks=1):
    """
    WARNING THIS COMMAND IS EXPERIMENTAL AND NOT TESTED. USE AT YOUR OWN RISK.

    This command down samples two tables A and B into smaller tables A' and
    B' respectively. Specifically, first it randomly selects `size` tuples
    from the table B to be table B'. Next, it builds an inverted index I
    (token, tuple_id) on table A. For each tuple x ∈ B', the algorithm
    finds a set P of k/2 tuples from I that match x, and a set Q of k/2
    tuples randomly selected from A - P. The idea is for A' and B' to share
    some matches yet be as representative of A and B as possible.

    Args:
        ltable (DataFrame): The left input table, i.e., table A.
        rtable (DataFrame): The right input table, i.e., table B.
        size (int): The size that table B should be down sampled to.
        y_param (int): The parameter to control the down sample size of
            table A. Specifically, the down sampled size of table A should
            be close to size * y_param.
        show_progress (boolean): A flag to indicate whether a progress bar
            should be displayed (defaults to True).
        verbose (boolean): A flag to indicate whether the debug information
            should be displayed (defaults to False).
        seed (int): The seed for the pseudo random number generator to
            select the tuples from A and B (defaults to None).
        rem_stop_words (boolean): A flag to indicate whether a default set
            of stop words must be removed.
        rem_puncs (boolean): A flag to indicate whether the punctuations
            must be removed from the strings.
        n_ltable_chunks (int): The number of partitions for ltable
            (defaults to 1). If it is set to -1, the number of partitions
            will be set to the number of cores in the machine.
        n_sample_rtable_chunks (int): The number of partitions for the
            sampled rtable (defaults to 1).

    Returns:
        Down sampled tables A and B as pandas DataFrames.

    Raises:
        AssertionError: If any of the input tables are empty or not a
            DataFrame.
        AssertionError: If `size` or `y_param` is empty or 0 or not a
            valid integer value.
        AssertionError: If `seed` is not a valid integer value.
        AssertionError: If `verbose` is not of type bool.
        AssertionError: If `show_progress` is not of type bool.
        AssertionError: If `n_ltable_chunks` is not of type int.
        AssertionError: If `n_sample_rtable_chunks` is not of type int.

    Examples:
        >>> from py_entitymatching.dask.dask_down_sample import dask_down_sample
        >>> A = em.read_csv_metadata('path_to_csv_dir/table_A.csv', key='ID')
        >>> B = em.read_csv_metadata('path_to_csv_dir/table_B.csv', key='ID')
        >>> sample_A, sample_B = dask_down_sample(A, B, 500, 1, n_ltable_chunks=-1, n_sample_rtable_chunks=-1)
        >>> # Example with seed = 0. The same sample data set will be
        >>> # returned each time this function is run.
        >>> sample_A, sample_B = dask_down_sample(A, B, 500, 1, seed=0, n_ltable_chunks=-1, n_sample_rtable_chunks=-1)
    """
    # Imported locally: the `pd.np` alias used previously was removed in
    # pandas >= 2.0; numpy provides the same array_split.
    import numpy as np

    logger.warning(
        "WARNING THIS COMMAND IS EXPERIMENTAL AND NOT TESTED. USE AT YOUR OWN "
        "RISK.")

    # validation checks
    if not isinstance(ltable, pd.DataFrame):
        logger.error('Input table A (ltable) is not of type pandas DataFrame')
        raise AssertionError(
            'Input table A (ltable) is not of type pandas DataFrame')

    if not isinstance(rtable, pd.DataFrame):
        logger.error('Input table B (rtable) is not of type pandas DataFrame')
        raise AssertionError(
            'Input table B (rtable) is not of type pandas DataFrame')

    if len(ltable) == 0 or len(rtable) == 0:
        logger.error('Size of the input table is 0')
        raise AssertionError('Size of the input table is 0')

    if size == 0 or y_param == 0:
        logger.error(
            'size or y cannot be zero (3rd and 4th parameter of downsample)')
        raise AssertionError(
            'size or y_param cannot be zero (3rd and 4th parameter of downsample)')

    if seed is not None and not isinstance(seed, int):
        logger.error('Seed is not of type integer')
        raise AssertionError('Seed is not of type integer')

    if len(rtable) < size:
        logger.warning(
            'Size of table B is less than b_size parameter - using entire table B')

    validate_object_type(verbose, bool, 'Parameter verbose')
    validate_object_type(show_progress, bool, 'Parameter show_progress')
    validate_object_type(rem_stop_words, bool, 'Parameter rem_stop_words')
    validate_object_type(rem_puncs, bool, 'Parameter rem_puncs')
    validate_object_type(n_ltable_chunks, int, 'Parameter n_ltable_chunks')
    validate_object_type(n_sample_rtable_chunks, int,
                         'Parameter n_sample_rtable_chunks')

    rtable_sampled = sample_right_table(rtable, size, seed)

    # Project ltable onto its string columns for tokenization.
    ltbl_str_cols = _get_str_cols_list(ltable)
    proj_ltable = ltable[ltable.columns[ltbl_str_cols]]

    if n_ltable_chunks == -1:
        n_ltable_chunks = get_num_cores()

    # Replaces the removed `pd.np.array_split`.
    ltable_chunks = np.array_split(proj_ltable, n_ltable_chunks)
    preprocessed_tokenized_tbl = []

    # Use Dask to preprocess and tokenize strings.
    start_row_id = 0
    for i in range(len(ltable_chunks)):
        # start_row_id is internally used by process_tokenize_concat_strings
        # to map each string to its row id in the ltable.
        result = delayed(process_tokenize_concat_strings)(ltable_chunks[i],
                                                          start_row_id,
                                                          rem_puncs,
                                                          rem_stop_words)
        preprocessed_tokenized_tbl.append(result)
        # update start_row_id
        start_row_id += len(ltable_chunks[i])
    preprocessed_tokenized_tbl = delayed(wrap)(preprocessed_tokenized_tbl)

    # Now execute the DAG
    if show_progress:
        with ProgressBar():
            logger.info('Preprocessing/tokenizing ltable')
            preprocessed_tokenized_tbl_vals = preprocessed_tokenized_tbl.compute(
                scheduler="processes", num_workers=get_num_cores())
    else:
        preprocessed_tokenized_tbl_vals = preprocessed_tokenized_tbl.compute(
            scheduler="processes", num_workers=get_num_cores())

    ltable_processed_dict = {}
    for i in range(len(preprocessed_tokenized_tbl_vals)):
        ltable_processed_dict.update(preprocessed_tokenized_tbl_vals[i])

    # Build an inverted index over the processed ltable tokens.
    inverted_index = build_inverted_index(ltable_processed_dict)

    # Preprocess/tokenize sampled rtable and probe
    rtbl_str_cols = _get_str_cols_list(rtable_sampled)
    proj_rtable_sampled = rtable_sampled[rtable_sampled.columns[rtbl_str_cols]]

    if n_sample_rtable_chunks == -1:
        n_sample_rtable_chunks = get_num_cores()

    rtable_chunks = np.array_split(proj_rtable_sampled, n_sample_rtable_chunks)
    probe_result = []

    # Create the DAG
    for i in range(len(rtable_chunks)):
        result = delayed(probe)(rtable_chunks[i], y_param, len(proj_ltable),
                                inverted_index, rem_puncs, rem_stop_words,
                                seed)
        probe_result.append(result)
    probe_result = delayed(wrap)(probe_result)

    # Execute the DAG
    if show_progress:
        with ProgressBar():
            logger.info('Probing using rtable')
            probe_result = probe_result.compute(
                scheduler="processes", num_workers=multiprocessing.cpu_count())
    else:
        probe_result = probe_result.compute(
            scheduler="processes", num_workers=multiprocessing.cpu_count())

    # Union the per-chunk ltable row ids and select those rows.
    probe_result = map(list, probe_result)
    l_tbl_indices = set(sum(probe_result, []))
    l_tbl_indices = list(l_tbl_indices)
    ltable_sampled = ltable.iloc[l_tbl_indices]

    # update catalog
    if cm.is_dfinfo_present(ltable):
        cm.copy_properties(ltable, ltable_sampled)

    if cm.is_dfinfo_present(rtable):
        cm.copy_properties(rtable, rtable_sampled)

    return ltable_sampled, rtable_sampled
def get_feature_fn(feature_string, tokenizers, similarity_functions):
    """
    This function creates a feature in a declarative manner.

    Specifically, this function uses the feature string, parses it and
    compiles it into a function using the given tokenizers and similarity
    functions. This compiled function will take in two  tuples and return
    a feature value (typically a number).

    Args:
        feature_string (string): A feature expression
            to be converted into a function.
        tokenizers (dictionary): A Python dictionary containing tokenizers.
            Specifically, the dictionary contains tokenizer names as keys
            and tokenizer functions as values. The tokenizer function
            typically takes in a string and returns a list of tokens.
        similarity_functions (dictionary): A Python dictionary containing
            similarity functions. Specifically, the dictionary contains
            similarity function names as keys and similarity functions as
            values. The similarity function typically takes in a string or
            two lists of tokens and returns a number.

    Returns:
        This function returns a Python dictionary which contains sufficient
        information (such as attributes, tokenizers, function code) to be
        added to the feature table.

        Specifically the Python dictionary contains the following keys:
        'left_attribute', 'right_attribute', 'left_attr_tokenizer',
        'right_attr_tokenizer', 'simfunction', 'function', and
        'function_source'.

        For all the keys except the 'function' and 'function_source' the
        value will be either a valid string (if the input feature string
        is parsed correctly) or PARSE_EXP (if the parsing was not
        successful). The 'function' will have a valid Python function as
        value, and 'function_source' will have the Python function's
        source in string format.

        The created function is a self-contained function
        which means that the tokenizers and sim functions that it calls are
        bundled along with the returned function code.

    Raises:
        AssertionError: If `feature_string` is not of type string.
        AssertionError: If the input `tokenizers` is not of type
            dictionary.
        AssertionError: If the input `similarity_functions` is not of
            type dictionary.

    Examples:
        >>> import py_entitymatching as em
        >>> A = em.read_csv_metadata('path_to_csv_dir/table_A.csv', key='ID')
        >>> B = em.read_csv_metadata('path_to_csv_dir/table_B.csv', key='ID')
        >>> block_t = em.get_tokenizers_for_blocking()
        >>> block_s = em.get_sim_funs_for_blocking()
        >>> block_f = em.get_features_for_blocking(A, B)
        >>> r = get_feature_fn('jaccard(qgm_3(ltuple.name), qgm_3(rtuple.name)', block_t, block_s)
        >>> em.add_feature(block_f, 'name_name_jac_qgm3_qgm3', r)

    See Also:
        :meth:`py_entitymatching.get_sim_funs_for_blocking`,
        :meth:`py_entitymatching.get_tokenizers_for_blocking`,
        :meth:`py_entitymatching.get_sim_funs_for_matching`,
        :meth:`py_entitymatching.get_tokenizers_for_matching`
    """
    # Validate input parameters: the feature expression must be a string,
    # and both lookup tables must be dictionaries.
    validate_object_type(feature_string, six.string_types,
                         error_prefix='Input feature')
    validate_object_type(tokenizers, dict,
                         error_prefix='Input object (tokenizers)')
    validate_object_type(similarity_functions, dict,
                         error_prefix='Input object (similarity_functions)')

    # Build the namespace in which the generated function is compiled.
    # Similarity functions go in first, then tokenizers, so the compiled
    # function is self-contained (it carries every callable it references).
    compile_namespace = {}
    if len(similarity_functions) > 0:
        compile_namespace.update(similarity_functions)
    if len(tokenizers) > 0:
        compile_namespace.update(tokenizers)

    # Create a python function string based on the input feature string.
    # NOTE: this exact text is also returned as 'function_source'.
    function_string = 'def fn(ltuple, rtuple):\n'
    function_string += ' '
    function_string += 'return ' + feature_string

    # Parse the feature string to recover the tokenizer, sim. function,
    # and the attributes it operates on.
    parsed_dict = _parse_feat_str(feature_string, tokenizers,
                                  similarity_functions)

    # Compile the function string inside the constructed namespace.
    # NOTE(review): exec on a caller-supplied expression — callers must not
    # pass untrusted feature strings here.
    six.exec_(function_string, compile_namespace)

    # Attach the compiled function and its source to the parsed metadata.
    parsed_dict['function'] = compile_namespace['fn']
    parsed_dict['function_source'] = function_string

    # Finally, return the parsed dictionary
    return parsed_dict
def save_table(data_frame, file_path, metadata_ext='.pklmetadata'):
    """
    Saves a DataFrame to disk along with its metadata in a pickle format.

    This function saves a  DataFrame to disk along with its metadata from
    the catalog.

    Specifically, this function saves the DataFrame in the given
    file path, and saves the metadata in the same directory (as the
    file path) but with a different extension. This extension can be
    optionally given by the user (defaults to '.pklmetadata').

    Args:
        data_frame (DataFrame): The DataFrame that should be saved.
        file_path (string): The file path where the DataFrame must be
            stored.
        metadata_ext (string): The metadata extension that should be used
            while storing the metadata information. The default value is
            '.pklmetadata'.

    Returns:
        A Boolean value of True is returned if the DataFrame is
        successfully saved.

    Raises:
        AssertionError: If `data_frame` is not of type pandas DataFrame.
        AssertionError: If `file_path` is not of type string.
        AssertionError: If `metadata_ext` is not of type string.
        AssertionError: If a file cannot written in the given `file_path`.

    Examples:
        >>> A = pd.DataFrame({'id' : [1, 2], 'colA':['a', 'b'], 'colB' : [10, 20]})
        >>> em.save_table(A, './A.pkl') # will store two files ./A.pkl and ./A.pklmetadata
        >>> em.save_table(A, './A.pkl', metadata_ext='.pklmeta') # will store two files ./A.pkl and ./A.pklmeta

    See Also:
        :meth:`~py_entitymatching.load_table`

    Note:
        This function is a bit different from to_csv_metadata, where the
        DataFrame is stored in a CSV file format. A DataFrame stored using
        'save_table' is pickled and cannot be viewed with a text editor,
        but for larger DataFrames it is more efficient.
    """
    # Validate the input parameters
    validate_object_type(data_frame, pd.DataFrame)
    validate_object_type(file_path, six.string_types,
                         error_prefix='Input file path')
    validate_object_type(metadata_ext, six.string_types,
                         error_prefix='Input Metadata ext')

    # Get the file_name (without extension) and the extension from the
    # given file path. For example if the file_path was /Users/foo/file.csv
    # then the file_name will be /Users/foo/file and the extension '.csv'.
    file_name, _ = os.path.splitext(file_path)

    # The metadata file name is the same file name but with the extension
    # given by the user.
    metadata_filename = file_name + metadata_ext

    # Check if the file exists in the file_path and whether we have
    # sufficient access privileges to write in that path.
    can_write, file_exists = ps._check_file_path(file_path)

    if can_write:
        # If the file already exists then issue a warning and overwrite it.
        if file_exists:
            logger.warning(
                'File already exists at %s; Overwriting it', file_path)
        # Open in binary mode, as we are writing in binary format.
        # (Previously the exists/doesn't-exist branches duplicated this
        # identical write; merged into one.)
        with open(file_path, 'wb') as file_handler:
            cloudpickle.dump(data_frame, file_handler)
    else:
        # We cannot write in the given path; raise an error.
        logger.error('Cannot write in the file path %s; Exiting', file_path)
        # BUG FIX: format the path into the message instead of passing it
        # as a second AssertionError argument (which yields an unformatted
        # tuple message).
        raise AssertionError('Cannot write in the file path %s' % file_path)

    # Once the DataFrame is written, write the metadata.
    # Initialize a metadata dictionary to hold the metadata of DataFrame
    # from the catalog.
    metadata_dict = collections.OrderedDict()

    # Get all the properties for the input data frame, if the DataFrame
    # information is present in the catalog.
    properties = {}
    if cm.is_dfinfo_present(data_frame):
        properties = cm.get_all_properties(data_frame)

    # If the properties are present in the catalog, then write properties
    # to disk. Only string-valued properties are persisted (non-string
    # metadata such as DataFrame references cannot be restored from disk).
    if len(properties) > 0:
        for property_name, property_value in six.iteritems(properties):
            if isinstance(property_value, six.string_types):
                metadata_dict[property_name] = property_value

    # Try to save metadata
    can_write, file_exists = ps._check_file_path(metadata_filename)

    if can_write:
        # If the file already exists, issue a warning and overwrite it.
        if file_exists:
            logger.warning(
                'Metadata file already exists at %s. Overwriting it',
                metadata_filename)
        # write metadata contents
        with open(metadata_filename, 'wb') as file_handler:
            cloudpickle.dump(metadata_dict, file_handler)
    else:
        # Metadata is best-effort: warn and skip instead of failing.
        logger.warning(
            'Cannot write metadata at the file path %s. Skip writing metadata '
            'file', metadata_filename)

    return True
def validate_types_params_candset(self, candset, verbose, show_progress,
                                  n_jobs):
    """Type-check the common candidate-set parameters.

    Raises an AssertionError (via validate_object_type) when any argument
    has an unexpected type.
    """
    # (value, expected type, error prefix) — checked in declaration order.
    checks = (
        (candset, pd.DataFrame, 'Input candset'),
        (verbose, bool, 'Parameter verbose'),
        (show_progress, bool, 'Parameter show_progress'),
        (n_jobs, int, 'Parameter n_jobs'),
    )
    for value, expected_type, prefix in checks:
        validate_object_type(value, expected_type, prefix)
def save_object(object_to_save, file_path):
    """
    Saves a Python object to disk.

    This function is intended to be used to save py_entitymatching objects
    such as rule-based blocker, feature vectors, etc. A user would like to
    store py_entitymatching objects to disk, when he/she wants to save the
    workflow and resume it later. This function provides a way to save the
    required objects to disk.

    This function takes in the object to save the file path. It pickles
    the object and stores it in the file path specified.

    Args:
        object_to_save (Python object): The Python object to save. This can
            be a rule-based blocker, feature vectors, etc.
        file_path (string): The file path where the object must be saved.

    Returns:
        A Boolean value of True is returned, if the saving was successful.

    Raises:
        AssertionError: If `file_path` is not of type string.
        AssertionError: If a file cannot be written in the given
            `file_path`.

    Examples:
        >>> import pandas as pd
        >>> A = pd.DataFrame({'id' : [1, 2], 'colA':['a', 'b'], 'colB' : [10, 20]})
        >>> B = pd.DataFrame({'id' : [1, 2], 'colA':['c', 'd'], 'colB' : [30, 40]})
        >>> rb = em.RuleBasebBlocker()
        >>> block_f = em.get_features_for_blocking(A, B)
        >>> rule1 = ['colA_colA_lev_dist(ltuple, rtuple) > 3']
        >>> rb.add_rule(rule1)
        >>> em.save_object(rb, './rule_blocker.pkl')

    See Also:
        :meth:`~load_object`
    """
    # Validate input parameters
    validate_object_type(file_path, six.string_types, 'Input file path')

    # Check whether the file path is valid and if a file is already present
    # at that path.
    # noinspection PyProtectedMember
    can_write, file_exists = ps._check_file_path(file_path)

    if can_write:
        # If a file already exists in that location, issue a warning and
        # overwrite the file.
        if file_exists:
            logger.warning(
                'File already exists at %s; Overwriting it', file_path)
        # Open the file in 'wb' mode as we are writing a binary file.
        # (Previously the exists/doesn't-exist branches duplicated this
        # identical write; merged into one.)
        with open(file_path, 'wb') as file_handler:
            cloudpickle.dump(object_to_save, file_handler)
    else:
        # If we cannot write, then raise an error.
        logger.error('Cannot write in the file path %s; Exiting', file_path)
        # BUG FIX: format the path into the message instead of passing it
        # as a second AssertionError argument (which yields an unformatted
        # tuple message).
        raise AssertionError('Cannot write in the file path %s' % file_path)

    # Return True if everything was successful.
    return True
def validate_allow_missing(self, allow_missing):
    """Ensure the `allow_missing` flag is a boolean.

    Raises an AssertionError (via validate_object_type) otherwise.
    """
    validate_object_type(allow_missing, bool, 'Parameter allow_missing')
def load_table(file_path, metadata_ext='.pklmetadata'):
    """
    Loads a pickled DataFrame from a file along with its metadata.

    This function loads a DataFrame from a file stored in pickle format.

    Further, this function looks for a metadata file with the same file
    name but with an extension given by the user (defaults to
    '.pklmetadata'). If the metadata file is present, the function will
    update the metadata for that DataFrame in the catalog.

    Args:
        file_path (string): The file path to load the file from.
        metadata_ext (string): The metadata file extension (defaults to
            '.pklmetadata') that should be used to generate metadata file
            name.

    Returns:
        If the loading is successful, the function will return a pandas
        DataFrame read from the file. The catalog will be updated with the
        metadata read from the metadata file (if the file was present).

    Raises:
        AssertionError: If `file_path` is not of type string.
        AssertionError: If `metadata_ext` is not of type string.

    Examples:
        >>> A = em.load_table('./A.pkl')
        >>> A = em.load_table('./A.pkl', metadata_ext='.pklmeta')

    See Also:
        :meth:`~py_entitymatching.save_table`

    Note:
        This function is different from read_csv_metadata in two aspects.
        First, this function currently does not support reading in
        candidate set tables, where there are more metadata such as ltable,
        rtable than just 'key', and conceptually the user is expected to
        provide ltable and rtable information while calling this function.
        (this support will be added shortly). Second, this function loads
        the table stored in a pickle format.
    """
    # Validate input parameters
    validate_object_type(file_path, six.string_types,
                         error_prefix='Input file path')
    # Consistency fix: use the same error prefix as the paired save_table
    # so validation failures read the same in both directions.
    validate_object_type(metadata_ext, six.string_types,
                         error_prefix='Input Metadata ext')

    # Load the object from the file path. Note that we use a generic load
    # object to load in the DataFrame too.
    data_frame = load_object(file_path)

    # Load metadata from file path
    # # Check if the metadata file is present
    if ps._is_metadata_file_present(file_path, extension=metadata_ext):
        # Construct the metadata file name, and read it from the disk.
        # # Get the file name used to load the DataFrame
        file_name, _ = os.path.splitext(file_path)
        # # Construct the metadata file name
        metadata_filename = file_name + metadata_ext
        # # Load the metadata from the disk
        metadata_dict = load_object(metadata_filename)

        # Update the catalog with the properties read from the disk
        for property_name, property_value in six.iteritems(metadata_dict):
            if property_name == 'key':
                # For 'key', call set_key as the function will check the
                # integrity of the key before setting it in the catalog.
                cm.set_key(data_frame, property_value)
            else:
                cm.set_property(data_frame, property_name, property_value)
    else:
        # If the metadata file is not present then issue a warning
        logger.warning('There is no metadata file')

    # Return the DataFrame
    return data_frame
def validate_show_progress(self, show_progress):
    """Ensure the `show_progress` flag is a boolean.

    Raises an AssertionError (via validate_object_type) otherwise.
    """
    validate_object_type(show_progress, bool, 'Parameter show_progress')
def add_blackbox_feature(feature_table, feature_name, feature_function):
    """
    Adds a black box feature to the feature table.

    Args:
        feature_table (DataFrame): The input DataFrame (typically a feature
            table) to which the feature must be added.
        feature_name (string): The name that should be given to the feature.
        feature_function (Python function): A Python function for the black
            box feature.

    Returns:
        A Boolean value of True is returned if the addition was successful.

    Raises:
        AssertionError: If the input `feature_table` is not of type
            DataFrame.
        AssertionError: If the input `feature_name` is not of type string.
        AssertionError: If the `feature_table` does not have necessary
            columns such as 'feature_name', 'left_attribute',
            'right_attribute', 'left_attr_tokenizer',
            'right_attr_tokenizer', 'simfunction', 'function', and
            'function_source' in the DataFrame.
        AssertionError: If the `feature_name` is already present in the
            feature table.

    Examples:
        >>> import py_entitymatching as em
        >>> A = em.read_csv_metadata('path_to_csv_dir/table_A.csv', key='ID')
        >>> B = em.read_csv_metadata('path_to_csv_dir/table_B.csv', key='ID')
        >>> block_f = em.get_features_for_blocking(A, B)
        >>> def age_diff(ltuple, rtuple):
        >>>     # assume that the tuples have age attribute and values are valid numbers.
        >>>     return ltuple['age'] - rtuple['age']
        >>> status = em.add_blackbox_feature(block_f, 'age_difference', age_diff)
    """
    # Validate input parameters: the feature table must be a DataFrame and
    # the feature name a string.
    validate_object_type(feature_table, pd.DataFrame, 'Input feature table')
    validate_object_type(feature_name, six.string_types, 'Input feature name')

    # The input table must carry exactly the columns of a canonical
    # feature table.
    dummy_feature_table = create_feature_table()
    if sorted(dummy_feature_table.columns) != sorted(feature_table.columns):
        logger.error('Input feature table does not have the necessary columns')
        raise AssertionError(
            'Input feature table does not have the necessary columns')

    # Reject duplicate feature names.
    existing_names = list(feature_table['feature_name'])
    if feature_name in existing_names:
        logger.error('Input feature name is already present in feature table')
        raise AssertionError(
            'Input feature name is already present in feature table')

    # Assemble the catalog entry for the black box feature. All of the
    # declarative fields are None because the function is opaque.
    feature_dict = {
        'feature_name': feature_name,
        'function': feature_function,
        'left_attribute': None,
        'right_attribute': None,
        'left_attr_tokenizer': None,
        'right_attr_tokenizer': None,
        'simfunction': None,
        'function_source': None,
        'is_auto_generated': False,
    }

    # Append the feature as the last row. For an empty table, pin the
    # column order first so the row aligns with the canonical layout.
    if len(feature_table) > 0:
        feature_table.loc[len(feature_table)] = feature_dict
    else:
        feature_table.columns = ['feature_name', 'left_attribute',
                                 'right_attribute', 'left_attr_tokenizer',
                                 'right_attr_tokenizer', 'simfunction',
                                 'function', 'function_source',
                                 'is_auto_generated']
        feature_table.loc[len(feature_table)] = feature_dict

    # Finally return True if the addition was successful
    return True
def down_sample(table_a, table_b, size, y_param, show_progress=True,
                verbose=False, seed=None, rem_stop_words=True,
                rem_puncs=True, n_jobs=1):
    """
    This function down samples two tables A and B into smaller tables A' and
    B' respectively.

    Specifically, first it randomly selects `size` tuples
    from the table B to be table B'. Next, it builds an inverted index I
    (token, tuple_id) on table A. For each tuple x ∈ B', the algorithm
    finds a set P of k/2 tuples from I that match x,
    and a set Q of k/2 tuples randomly selected from A - P.
    The idea is for A' and B' to share some matches yet be
    as representative of A and B as possible.

    Args:
        table_a,table_b (DataFrame): The input tables A and B.
        size (int): The size that table B should be down sampled to.
        y_param (int): The parameter to control the down sample size of table A.
            Specifically, the down sampled size of table A should be close to
            size * y_param.
        show_progress (boolean): A flag to indicate whether a progress bar
            should be displayed (defaults to True).
        verbose (boolean): A flag to indicate whether the debug information
            should be displayed (defaults to False).
        seed (int): The seed for the pseudo random number generator to select
            the tuples from A and B (defaults to None).
        rem_stop_words (boolean): A flag to indicate whether a default set of
            stop words must be removed.
        rem_puncs (boolean): A flag to indicate whether the punctuations must
            be removed from the strings.
        n_jobs (int): The number of parallel jobs to be used for computation
            (defaults to 1). If -1 all CPUs are used. If 0 or 1, no parallel
            computation is used at all, which is useful for debugging. For
            n_jobs below -1, (n_cpus + 1 + n_jobs) are used (where n_cpus is
            the total number of CPUs in the machine). Thus, for n_jobs = -2,
            all CPUs but one are used. If (n_cpus + 1 + n_jobs) is less than 1,
            then no parallel computation is used (i.e., equivalent to the
            default).

    Returns:
        Down sampled tables A and B as pandas DataFrames.

    Raises:
        AssertionError: If any of the input tables (`table_a`, `table_b`) are
            empty or not a DataFrame.
        AssertionError: If `size` or `y_param` is empty or 0 or not a
            valid integer value.
        AssertionError: If `seed` is not a valid integer value.
        AssertionError: If `verbose` is not of type bool.
        AssertionError: If `show_progress` is not of type bool.
        AssertionError: If `n_jobs` is not of type int.

    Examples:
        >>> A = em.read_csv_metadata('path_to_csv_dir/table_A.csv', key='ID')
        >>> B = em.read_csv_metadata('path_to_csv_dir/table_B.csv', key='ID')
        >>> sample_A, sample_B = em.down_sample(A, B, 500, 1, n_jobs=-1)

        # Example with seed = 0. This means the same sample data set will be
        # returned each time this function is run.
        >>> A = em.read_csv_metadata('path_to_csv_dir/table_A.csv', key='ID')
        >>> B = em.read_csv_metadata('path_to_csv_dir/table_B.csv', key='ID')
        >>> sample_A, sample_B = em.down_sample(A, B, 500, 1, seed=0, n_jobs=-1)
    """
    # Validate the input tables.
    if not isinstance(table_a, pd.DataFrame):
        logger.error('Input table A is not of type pandas DataFrame')
        raise AssertionError(
            'Input table A is not of type pandas DataFrame')

    if not isinstance(table_b, pd.DataFrame):
        logger.error('Input table B is not of type pandas DataFrame')
        raise AssertionError(
            'Input table B is not of type pandas DataFrame')

    if len(table_a) == 0 or len(table_b) == 0:
        logger.error('Size of the input table is 0')
        raise AssertionError('Size of the input table is 0')

    if size == 0 or y_param == 0:
        # BUGFIX: the logged message previously said 'size or y' while the
        # raised message said 'size or y_param'; keep the two consistent.
        logger.error(
            'size or y_param cannot be zero (3rd and 4th parameter of downsample)')
        raise AssertionError(
            'size or y_param cannot be zero (3rd and 4th parameter of downsample)')

    if seed is not None and not isinstance(seed, int):
        logger.error('Seed is not of type integer')
        raise AssertionError('Seed is not of type integer')

    if len(table_b) < size:
        logger.warning(
            'Size of table B is less than b_size parameter - using entire table B')

    validate_object_type(verbose, bool, 'Parameter verbose')
    validate_object_type(show_progress, bool, 'Parameter show_progress')
    validate_object_type(rem_stop_words, bool, 'Parameter rem_stop_words')
    validate_object_type(rem_puncs, bool, 'Parameter rem_puncs')
    validate_object_type(n_jobs, int, 'Parameter n_jobs')

    # Log what metadata this command relies on.
    log_info(logger, 'Required metadata: ltable key, rtable key', verbose)

    # Build an inverted index (token -> tuple positions) over table A.
    # The tuples in all P's and Q's drawn from this index make up A' -
    # the central idea is to get good coverage in the down sampled A' and B'.
    s_inv_index = _inv_index(table_a, rem_stop_words, rem_puncs)

    # Randomly select `size` tuples from table B to be B'.
    # If a seed value has been given, use a RandomState with the given seed
    # so the sampling is reproducible.
    b_sample_size = min(math.floor(size), len(table_b))
    if seed is not None:
        rand = RandomState(seed)
    else:
        rand = RandomState()
    b_tbl_indices = list(rand.choice(len(table_b), int(b_sample_size),
                                     replace=False))

    n_jobs = get_num_procs(n_jobs, len(table_b))

    # BUGFIX: b_tbl_indices holds *positions* in [0, len(table_b)), so rows
    # must be picked with positional .iloc, not label-based .loc. The old
    # .loc lookup only happened to work for a default RangeIndex and broke
    # (wrong rows or KeyError) for any other index; note that r_sampled
    # below already (correctly) uses .iloc with the very same indices.
    sample_table_b = table_b.iloc[b_tbl_indices]

    if n_jobs <= 1:
        # Sequential path: probe the inverted index with all of B' at once
        # to find the tuples in A that share tokens with tuples in B'.
        s_tbl_indices = _probe_index_split(sample_table_b, y_param,
                                           len(table_a), s_inv_index,
                                           show_progress, seed,
                                           rem_stop_words, rem_puncs)
    else:
        # Parallel path: split B' into n_jobs chunks and probe the index
        # per chunk; only the last job displays the progress bar.
        sample_table_splits = np.array_split(sample_table_b, n_jobs)
        results = Parallel(n_jobs=n_jobs)(
            delayed(_probe_index_split)(sample_table_splits[job_index],
                                        y_param,
                                        len(table_a), s_inv_index,
                                        (show_progress and (
                                            job_index == n_jobs - 1)),
                                        seed, rem_stop_words, rem_puncs)
            for job_index in range(n_jobs)
        )
        results = map(list, results)
        # De-duplicate indices found by different jobs.
        s_tbl_indices = set(sum(results, []))

    s_tbl_indices = list(s_tbl_indices)
    l_sampled = table_a.iloc[list(s_tbl_indices)]
    r_sampled = table_b.iloc[list(b_tbl_indices)]

    # Propagate the catalog metadata of the input tables to the samples.
    if cm.is_dfinfo_present(table_a):
        cm.copy_properties(table_a, l_sampled)
    if cm.is_dfinfo_present(table_b):
        cm.copy_properties(table_b, r_sampled)

    return l_sampled, r_sampled
def dask_extract_feature_vecs(candset, attrs_before=None, feature_table=None,
                              attrs_after=None, verbose=False,
                              show_progress=True, n_chunks=1):
    """
    WARNING THIS COMMAND IS EXPERIMENTAL AND NOT TESTED. USE AT YOUR OWN RISK

    This function extracts feature vectors from a DataFrame (typically a
    labeled candidate set).

    Specifically, this function uses feature
    table, ltable and rtable (that is present in the `candset`'s
    metadata) to extract feature vectors.

    Args:
        candset (DataFrame): The input candidate set for which the features
            vectors should be extracted.
        attrs_before (list): The list of attributes from the input candset,
            that should be added before the feature vectors (defaults to None).
        feature_table (DataFrame): A DataFrame containing a list of
            features that should be used to compute the feature vectors (
            defaults to None).
        attrs_after (list): The list of attributes from the input candset
            that should be added after the feature vectors (defaults to None).
        verbose (boolean): A flag to indicate whether the debug information
            should be displayed (defaults to False).
        show_progress (boolean): A flag to indicate whether the progress of
            extracting feature vectors must be displayed (defaults to True).
        n_chunks (int): The number of partitions to split the candidate set. If
            it is set to -1, the number of partitions will be set to the
            number of cores in the machine.

    Returns:
        A pandas DataFrame containing feature vectors.

        The DataFrame will have metadata ltable and rtable, pointing
        to the same ltable and rtable as the input candset.

        Also, the output
        DataFrame will have three columns: key, foreign key ltable, foreign
        key rtable copied from input candset to the output DataFrame. These
        three columns precede the columns mentioned in `attrs_before`.

    Raises:
        AssertionError: If `candset` is not of type pandas
            DataFrame.
        AssertionError: If `attrs_before` has attributes that
            are not present in the input candset.
        AssertionError: If `attrs_after` has attribtues that
            are not present in the input candset.
        AssertionError: If `feature_table` is set to None.
        AssertionError: If `n_chunks` is not of type
                int.

    Examples:
        >>> import py_entitymatching as em
        >>> from py_entitymatching.dask.dask_extract_features import dask_extract_feature_vecs
        >>> A = em.read_csv_metadata('path_to_csv_dir/table_A.csv', key='ID')
        >>> B = em.read_csv_metadata('path_to_csv_dir/table_B.csv', key='ID')
        >>> match_f = em.get_features_for_matching(A, B)
        >>> # G is the labeled dataframe which should be converted into feature vectors
        >>> H = dask_extract_feature_vecs(G, features=match_f, attrs_before=['title'], attrs_after=['gold_labels'])
    """
    logger.warning(
        "WARNING THIS COMMAND IS EXPERIMENTAL AND NOT TESTED. USE AT YOUR OWN RISK."
    )

    # Validate input parameters

    # # We expect the input candset to be of type pandas DataFrame.
    validate_object_type(candset, pd.DataFrame, error_prefix='Input cand.set')

    # # If the attrs_before is given, check if the attrs_before are present in
    # the input candset. (IDIOM FIX: compare against None with `is not None`,
    # not `!= None`.)
    if attrs_before is not None:
        if not ch.check_attrs_present(candset, attrs_before):
            logger.error(
                'The attributes mentioned in attrs_before is not present '
                'in the input table')
            raise AssertionError(
                'The attributes mentioned in attrs_before is not present '
                'in the input table')

    # # If the attrs_after is given, check if the attrs_after are present in
    # the input candset.
    if attrs_after is not None:
        if not ch.check_attrs_present(candset, attrs_after):
            logger.error(
                'The attributes mentioned in attrs_after is not present '
                'in the input table')
            raise AssertionError(
                'The attributes mentioned in attrs_after is not present '
                'in the input table')

    # We expect the feature table to be a valid object
    if feature_table is None:
        logger.error('Feature table cannot be null')
        raise AssertionError('The feature table cannot be null')

    # Do metadata checking
    # # Mention what metadata is required to the user
    ch.log_info(logger, 'Required metadata: cand.set key, fk ltable, '
                        'fk rtable, '
                        'ltable, rtable, ltable key, rtable key', verbose)

    # # Get metadata
    ch.log_info(logger, 'Getting metadata from catalog', verbose)

    key, fk_ltable, fk_rtable, ltable, rtable, l_key, r_key = \
        cm.get_metadata_for_candset(
            candset, logger, verbose)

    # # Validate metadata
    ch.log_info(logger, 'Validating metadata', verbose)
    cm._validate_metadata_for_candset(candset, key, fk_ltable, fk_rtable,
                                      ltable, rtable, l_key, r_key,
                                      logger, verbose)

    # Extract features

    # # Set index on the ltable/rtable keys for fast row lookup by id.
    l_df = ltable.set_index(l_key, drop=False)
    r_df = rtable.set_index(r_key, drop=False)

    # # Apply feature functions
    ch.log_info(logger, 'Applying feature functions', verbose)
    col_names = list(candset.columns)
    fk_ltable_idx = col_names.index(fk_ltable)
    fk_rtable_idx = col_names.index(fk_rtable)

    validate_object_type(n_chunks, int, 'Parameter n_chunks')
    validate_chunks(n_chunks)

    n_chunks = get_num_partitions(n_chunks, len(candset))

    c_splits = np.array_split(candset, n_chunks)

    # The feature table holds Python function objects; pickle it once with
    # cloudpickle so it can be shipped to the dask worker processes.
    pickled_obj = cloudpickle.dumps(feature_table)

    feat_vals_by_splits = []

    for i in range(len(c_splits)):
        partial_result = delayed(get_feature_vals_by_cand_split)(
            pickled_obj,
            fk_ltable_idx,
            fk_rtable_idx, l_df,
            r_df,
            c_splits[i], False)
        feat_vals_by_splits.append(partial_result)

    feat_vals_by_splits = delayed(wrap)(feat_vals_by_splits)
    if show_progress:
        with ProgressBar():
            feat_vals_by_splits = feat_vals_by_splits.compute(
                scheduler="processes", num_workers=get_num_cores())
    else:
        feat_vals_by_splits = feat_vals_by_splits.compute(
            scheduler="processes", num_workers=get_num_cores())

    # Flatten the per-split lists of feature-value dicts into one list.
    feat_vals = sum(feat_vals_by_splits, [])

    # Construct output table
    feature_vectors = pd.DataFrame(feat_vals, index=candset.index.values)

    # # Rearrange the feature names in the input feature table order
    feature_names = list(feature_table['feature_name'])
    feature_vectors = feature_vectors[feature_names]

    ch.log_info(logger, 'Constructing output table', verbose)

    # # Insert attrs_before (excluding the key columns, which are inserted
    # separately below); reversed so the final order matches attrs_before.
    if attrs_before:
        if not isinstance(attrs_before, list):
            attrs_before = [attrs_before]
        attrs_before = gh.list_diff(attrs_before, [key, fk_ltable, fk_rtable])
        attrs_before.reverse()
        for a in attrs_before:
            feature_vectors.insert(0, a, candset[a])

    # # Insert keys so that key, fk_ltable, fk_rtable come first.
    feature_vectors.insert(0, fk_rtable, candset[fk_rtable])
    feature_vectors.insert(0, fk_ltable, candset[fk_ltable])
    feature_vectors.insert(0, key, candset[key])

    # # Insert attrs_after at the end, preserving the requested order.
    if attrs_after:
        if not isinstance(attrs_after, list):
            attrs_after = [attrs_after]
        attrs_after = gh.list_diff(attrs_after, [key, fk_ltable, fk_rtable])
        attrs_after.reverse()
        col_pos = len(feature_vectors.columns)
        for a in attrs_after:
            feature_vectors.insert(col_pos, a, candset[a])
            col_pos += 1

    # # Update the catalog
    cm.init_properties(feature_vectors)
    cm.copy_properties(candset, feature_vectors)

    # Finally, return the feature vectors
    return feature_vectors