Example #1
    def _fit_ex_attrs(self, table, exclude_attrs, target_attr):
        """
        Helper for the fit method that takes a DataFrame as input along with
        the attributes to exclude and the target attribute.
        """
        # Validate the input parameters.
        # # We expect the input table to be of type pandas DataFrame.
        if not isinstance(table, pd.DataFrame):
            logger.error('Input table is not of type DataFrame')
            raise AssertionError('Input table is not of type DataFrame')

        # Convert the exclude attributes into a list (if the input is not already a list)
        if not isinstance(exclude_attrs, list):
            exclude_attrs = [exclude_attrs]

        # Check if the exclude attributes are present in the input table. If
        # not, raise an error.
        if not ch.check_attrs_present(table, exclude_attrs):
            logger.error(
                'The attributes mentioned in exclude_attrs are not present '
                'in the input table')
            raise AssertionError(
                'The attributes mentioned in exclude_attrs are not present '
                'in the input table')

        # Check if the target attribute is present in the input table. If
        # not, raise an error.
        if not ch.check_attrs_present(table, target_attr):
            logger.error('The target_attr is not present in the input table')
            raise AssertionError(
                'The target_attr is not present in the input table')

        # We now remove duplicate attributes from the exclude_attrs
        exclude_attrs = gh.list_drop_duplicates(exclude_attrs)

        # We explicitly append target attribute to exclude attributes
        if target_attr not in exclude_attrs:
            exclude_attrs.append(target_attr)

        # Now, we get the attributes to project
        attributes_to_project = gh.list_diff(list(table.columns),
                                             exclude_attrs)

        # Get the predictors and the target attribute from the input table
        # based on the exclude attrs and the target attribute.
        x = table[attributes_to_project]
        y = table[target_attr]

        self._fit_sklearn(x, y, check_rem=False)
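
For context, a minimal usage sketch of the public fit API that this helper supports (per the docstring above); the table H, its column names, and the label values are all hypothetical:

import pandas as pd
import py_entitymatching as em

# Hypothetical feature-vector table: an id, two foreign keys, one feature, a label
H = pd.DataFrame({'_id': [0, 1, 2, 3],
                  'ltable_id': ['a1', 'a2', 'a3', 'a4'],
                  'rtable_id': ['b1', 'b2', 'b3', 'b4'],
                  'name_sim': [0.9, 0.1, 0.8, 0.2],
                  'gold': [1, 0, 1, 0]})
dt = em.DTMatcher()
# fit() with exclude_attrs and target_attr exercises the code path shown above
dt.fit(table=H,
       exclude_attrs=['_id', 'ltable_id', 'rtable_id', 'gold'],
       target_attr='gold')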
Example #2
    def _fit_ex_attrs(self, table, exclude_attrs, target_attr):
        """
        Helper for the fit method that takes a DataFrame as input along with
        the attributes to exclude and the target attribute.
        """
        # Validate the input parameters.
        # # We expect the input table to be of type pandas DataFrame.
        if not isinstance(table, pd.DataFrame):
            logger.error('Input table is not of type DataFrame')
            raise AssertionError('Input table is not of type DataFrame')

        # Convert the exclude attributes into a list (if the input is not already a list)
        if not isinstance(exclude_attrs, list):
            exclude_attrs = [exclude_attrs]

        # Check if the exclude attributes are present in the input table. If
        # not, raise an error.
        if not ch.check_attrs_present(table, exclude_attrs):
            logger.error(
                'The attributes mentioned in exclude_attrs are not present '
                'in the input table')
            raise AssertionError(
                'The attributes mentioned in exclude_attrs are not present '
                'in the input table')

        # Check if the target attribute is present in the input table. If
        # not, raise an error.
        if not ch.check_attrs_present(table, target_attr):
            logger.error('The target_attr is not present in the input table')
            raise AssertionError(
                'The target_attr is not present in the input table')

        # We now remove duplicate attributes from the exclude_attrs
        exclude_attrs = gh.list_drop_duplicates(exclude_attrs)

        # We explicitly append target attribute to exclude attributes
        if target_attr not in exclude_attrs:
            exclude_attrs.append(target_attr)

        # Now, we get the attributes to project
        attributes_to_project = gh.list_diff(list(table.columns), exclude_attrs)

        # Get the predictors and the target attribute from the input table
        # based on the exclude attrs and the target attribute.
        x = table[attributes_to_project]
        y = table[target_attr]

        self._fit_sklearn(x, y, check_rem=False)
Example #3
    def _predict_ex_attrs(self, table, exclude_attrs, return_prob=False):
        """
        Variant of predict method, where data is derived based on exclude
        attributes.
        """
        # Validate input parameters
        # # We expect input table to be a pandas DataFrame.
        if not isinstance(table, pd.DataFrame):
            logger.error('Input table is not of type DataFrame')
            raise AssertionError('Input table is not of type DataFrame')

        # # We expect the exclude attributes to be a list; if not, convert
        # them into a list.
        if not isinstance(exclude_attrs, list):
            exclude_attrs = [exclude_attrs]

        # Check if the input table contains the attributes to be excluded. If
        #  not raise an error.
        if not ch.check_attrs_present(table, exclude_attrs):
            logger.error(
                'The attributes mentioned in exclude_attrs are not present '
                'in the input table')
            raise AssertionError(
                'The attributes mentioned in exclude_attrs are not present '
                'in the input table')

        # Get the attributes to project.
        attributes_to_project = gh.list_diff(list(table.columns), exclude_attrs)
        # Get the feature vectors to predict on
        x = table[attributes_to_project]

        # Do the predictions and return the probabilities (if required)
        res = self._predict_sklearn(x, check_rem=False, return_prob=return_prob)
        return res
Example #4
    def _predict_ex_attrs(self, table, exclude_attrs, return_prob=False):
        """
        Variant of predict method, where data is derived based on exclude
        attributes.
        """
        # Validate input parameters
        # # We expect input table to be a pandas DataFrame.
        if not isinstance(table, pd.DataFrame):
            logger.error('Input table is not of type DataFrame')
            raise AssertionError('Input table is not of type DataFrame')

        # # We expect the exclude attributes to be a list; if not, convert
        # them into a list.
        if not isinstance(exclude_attrs, list):
            exclude_attrs = [exclude_attrs]

        # Check if the input table contains the attributes to be excluded. If
        #  not raise an error.
        if not ch.check_attrs_present(table, exclude_attrs):
            logger.error(
                'The attributes mentioned in exclude_attrs are not present '
                'in the input table')
            raise AssertionError(
                'The attributes mentioned in exclude_attrs are not present '
                'in the input table')

        # Get the attributes to project.
        attributes_to_project = gh.list_diff(list(table.columns), exclude_attrs)
        # Get the feature vectors to predict on
        x = table[attributes_to_project]

        # Do the predictions and return the probabilities (if required)
        res = self._predict_sklearn(x, check_rem=False, return_prob=return_prob)
        return res
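
A matching sketch for the predict side, continuing the hypothetical matcher and table from the fit example above; append=True asks the public API to write the predictions into a new column:

predictions = dt.predict(table=H,
                         exclude_attrs=['_id', 'ltable_id', 'rtable_id', 'gold'],
                         target_attr='predicted',
                         append=True)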
def _validate_metadata_for_table(table, key, output_string, lgr, verbose):
    """
    Validates metadata for table (DataFrame)

    """
    # Validate input parameters
    # # We expect the input table to be of type pandas DataFrame
    if not isinstance(table, pd.DataFrame):
        logger.error('Input object is not of type pandas DataFrame')
        raise AssertionError('Input object is not of type pandas DataFrame')

    # Check the key column is present in the table
    if not ch.check_attrs_present(table, key):
        logger.error('Input key ( %s ) not in the DataFrame' % key)
        raise KeyError('Input key ( %s ) not in the DataFrame' % key)

    # Validate the key
    ch.log_info(lgr, 'Validating ' + output_string + ' key: ' + str(key),
                verbose)
    # We expect the key to be of type string
    if not isinstance(key, six.string_types):
        logger.error('Key attribute must be of type string')
        raise AssertionError('Key attribute must be of type string')
    if not ch.is_key_attribute(table, key, verbose):
        logger.error('Attribute %s in the %s table does not '
                     'qualify to be the key' % (str(key), output_string))
        raise AssertionError('Attribute %s in the %s table does not '
                             'qualify to be the key' %
                             (str(key), output_string))
    ch.log_info(lgr, '..... Done', verbose)
    return True
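
A self-contained pandas sketch of the two conditions the key validation above enforces (uniqueness, no missing values); the frame and column names are hypothetical:

import pandas as pd

df = pd.DataFrame({'ID': ['a1', 'a2', 'a3'], 'name': ['u', 'v', 'w']})
# A column qualifies as a key when it is unique and contains no NaN values
is_valid_key = df['ID'].is_unique and not df['ID'].isnull().any()
assert is_valid_key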
def _get_xy_data_ex(table, exclude_attrs, target_attr):
    # Validate the input parameters
    # # We expect the input table to be of type pandas DataFrame
    if not isinstance(table, pd.DataFrame):
        logger.error('Input table is not of type DataFrame')
        raise AssertionError('Input table is not of type DataFrame')

    # We expect exclude attributes to be of type list. If not, convert them
    # into a list.
    if not isinstance(exclude_attrs, list):
        exclude_attrs = [exclude_attrs]

    # Check if the exclude attributes are present in the input table
    if not check_attrs_present(table, exclude_attrs):
        logger.error('The attributes mentioned in exclude_attrs '
                     'are not present in the input table')
        raise AssertionError('The attributes mentioned in exclude_attrs '
                             'are not present in the input table')
    # Check if the target attribute is present in the input table
    if not check_attrs_present(table, target_attr):
        logger.error('The target_attr is not present in the input table')
        raise AssertionError(
            'The target_attr is not present in the input table')

    # Drop the duplicates from the exclude attributes
    exclude_attrs = list_drop_duplicates(exclude_attrs)

    # Explicitly add the target attribute to exclude attribute (if it is not
    # already present)
    if target_attr not in exclude_attrs:
        exclude_attrs.append(target_attr)

    # Project the list of attributes that should be used for scikit-learn's
    # functions.
    attrs_to_project = list_diff(list(table.columns), exclude_attrs)

    # Get the values for x
    x = table[attrs_to_project].values
    # Get the values for y
    y = table[target_attr].values
    y = y.ravel()  # to mute warnings from svm and cross validation
    # Return x and y
    return x, y
def _get_xy_data_ex(table, exclude_attrs, target_attr):
    # Validate the input parameters
    # # We expect the input table to be of type pandas DataFrame
    validate_object_type(table, pd.DataFrame)
    # We expect exclude attributes to be of type list. If not, convert them
    # into a list.
    if not isinstance(exclude_attrs, list):
        exclude_attrs = [exclude_attrs]

    # Check if the exclude attributes are present in the input table
    if not check_attrs_present(table, exclude_attrs):
        logger.error('The attributes mentioned in exclude_attrs '
                     'are not present in the input table')
        raise AssertionError(
            'The attributes mentioned in exclude_attrs '
            'are not present in the input table')
    # Check if the target attribute is present in the input table
    if not check_attrs_present(table, target_attr):
        logger.error('The target_attr is not present in the input table')
        raise AssertionError(
            'The target_attr is not present in the input table')

    # Drop the duplicates from the exclude attributes
    exclude_attrs = list_drop_duplicates(exclude_attrs)

    # Explicitly add the target attribute to exclude attribute (if it is not
    # already present)
    if target_attr not in exclude_attrs:
        exclude_attrs.append(target_attr)

    # Project the list of attributes that should be used for scikit-learn's
    # functions.
    attrs_to_project = list_diff(list(table.columns), exclude_attrs)

    # Get the values for x
    x = table[attrs_to_project].values
    # Get the values for y
    y = table[target_attr].values
    y = y.ravel()  # to mute warnings from svm and cross validation
    # Return x and y
    return x, y
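
A plain-pandas sketch of the projection performed above, with a hypothetical table; the target attribute is appended to the exclude list internally, so it is dropped from x and read separately into y:

import pandas as pd

table = pd.DataFrame({'_id': [0, 1, 2],
                      'f1': [0.1, 0.9, 0.4],
                      'f2': [0.3, 0.7, 0.8],
                      'gold': [0, 1, 0]})
exclude_attrs = ['_id', 'gold']
attrs_to_project = [c for c in table.columns if c not in exclude_attrs]
x = table[attrs_to_project].values   # feature matrix for scikit-learn
y = table['gold'].values.ravel()     # flattened target vector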
def set_fk_rtable(data_frame, foreign_key_rtable):
    """
    Sets the foreign key to rtable for a DataFrame in the catalog.

    Specifically, this function is a sugar function that sets the foreign
    key to the right table using the :meth:`py_entitymatching.set_property`
    function. This function is typically called on a DataFrame which contains
    metadata such as fk_ltable, fk_rtable, ltable, rtable.

    Args:
        data_frame (DataFrame): The input DataFrame for which the foreign key
            rtable property must be set.
        foreign_key_rtable (string): The attribute that must be set as
            foreign key to rtable in the catalog.

    Returns:
        A Boolean value of True is returned if the foreign key to rtable was
            set successfully.

    Raises:
        AssertionError: If `data_frame` is not of type
          pandas DataFrame.
        AssertionError: If `foreign_key_rtable` is not of
            type string.
        AssertionError: If `fk_rtable` is not in the input
            DataFrame.

    See Also:
        :meth:`~py_entitymatching.set_property`

    """
    # Validate the input parameters
    # # The input object is expected to be of type pandas DataFrame
    if not isinstance(data_frame, pd.DataFrame):
        logger.error('Input object is not of type pandas data frame')
        raise AssertionError('Input object is not of type pandas data frame')

    # # We expect the input foreign_key_rtable to be of type string
    if not isinstance(foreign_key_rtable, six.string_types):
        logger.error('Input (foreign_key_rtable) is not of type string')
        raise AssertionError(
            'Input (foreign_key_rtable) is not of type string')

    # Check if the given attribute is present in the DataFrame
    if not ch.check_attrs_present(data_frame, foreign_key_rtable):
        logger.error('Input attr. ( %s ) not in the DataFrame' %
                     foreign_key_rtable)
        raise KeyError('Input attr. ( %s ) not in the DataFrame' %
                       foreign_key_rtable)

    # Finally set the property and relay the result
    return set_property(data_frame, 'fk_rtable', foreign_key_rtable)
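
A short usage sketch; C is a hypothetical candidate set whose 'rtable_id' column references the right table's key:

import pandas as pd
import py_entitymatching as em

C = pd.DataFrame({'_id': [0, 1],
                  'ltable_id': ['a1', 'a2'],
                  'rtable_id': ['b1', 'b2']})
em.set_fk_rtable(C, 'rtable_id')   # returns True and records the property in the catalog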
def preserve_metadata(df, new_df):
    if cm.is_dfinfo_present(df):
        if _is_table_or_candset(df):
            if not _is_table(df):
                key, fk_ltable, fk_rtable, ltable, rtable, l_key, r_key\
                    = cm.get_metadata_for_candset(df, logger, False)
                if not ch.check_attrs_present(new_df,
                                              [key, fk_ltable, fk_rtable]):
                    logger.warning('Not setting the metadata as some attrs '
                                   'are not present')
                    return new_df
            else:
                key = cm.get_key(df)
                if not ch.check_attrs_present(new_df, [key]):
                    logger.warning('Not setting the metadata as some attrs '
                                   'are not present')
                    return new_df

        cm.init_properties(new_df)
        cm.copy_properties(df, new_df)
    return new_df
Example #10
def preserve_metadata(df, new_df):
    if cm.is_dfinfo_present(df):
        if _is_table_or_candset(df):
            if not _is_table(df):
                key, fk_ltable, fk_rtable, ltable, rtable, l_key, r_key\
                    = cm.get_metadata_for_candset(df, logger, False)
                if not ch.check_attrs_present(new_df,
                                              [key, fk_ltable, fk_rtable]):
                    logger.warning('Not setting the metadata as some attrs '
                                   'are not present')
                    return new_df
            else:
                key = cm.get_key(df)
                if not ch.check_attrs_present(new_df, [key]):
                    logger.warning('Not setting the metadata as some attrs '
                                   'are not present')
                    return new_df

        cm.init_properties(new_df)
        cm.copy_properties(df, new_df)
    return new_df
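
A minimal sketch of the pattern this helper supports, assuming preserve_metadata is importable from the module that defines it; df is a hypothetical catalog-registered frame and the filter is arbitrary:

derived = df[df['score'] > 0.5].copy()    # any transform that keeps the key columns
derived = preserve_metadata(df, derived)  # carry the catalog metadata to the result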
def set_fk_ltable(data_frame, fk_ltable):
    """
    Sets the foreign key to ltable for a DataFrame in the catalog.

    Specifically this function is a sugar function that will set the foreign
    key to the left table using :meth:`py_entitymatching.set_property` function. This
    function is typically called on a DataFrame which contains metadata such as
    fk_ltable, fk_rtable, ltable, rtable.

    Args:
        data_frame (DataFrame): The input DataFrame for which the foreign key
            ltable property must be set.
        fk_ltable (string): The attribute that must be set as the foreign key
            to the ltable in the catalog.

    Returns:
        A Boolean value of True is returned if the foreign key to ltable was
        set successfully.

    Raises:
        AssertionError: If `data_frame` is not of type
            pandas DataFrame.
        AssertionError: If `fk_ltable` is not of type
            string.
        AssertionError: If `fk_ltable` is not in the input
            DataFrame.

    See Also:
        :meth:`~py_entitymatching.set_property`

    """
    # Validate the input parameters
    # # We expect the input object to be of type pandas DataFrame
    if not isinstance(data_frame, pd.DataFrame):
        logger.error('Input object is not of type pandas data frame')
        raise AssertionError('Input object is not of type pandas data frame')

    # # We expect the input fk_ltable to be of type string
    if not isinstance(fk_ltable, six.string_types):
        logger.error('The input (fk_ltable) is not of type string')
        raise AssertionError('The input (fk_ltable) is not of type string')

    # # The fk_ltable attribute should be one of the columns in the input
    # DataFrame
    if not ch.check_attrs_present(data_frame, fk_ltable):
        logger.error('Input attr. ( %s ) not in the DataFrame' % fk_ltable)
        raise KeyError('Input attr. ( %s ) not in the DataFrame' % fk_ltable)

    # Call the set_property function and relay the result.
    return set_property(data_frame, 'fk_ltable', fk_ltable)
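
Putting the sugar functions together, a hedged sketch of wiring up candidate-set metadata by hand; A, B, and C are hypothetical tables:

import pandas as pd
import py_entitymatching as em

A = pd.DataFrame({'ID': ['a1', 'a2'], 'name': ['x', 'y']})
B = pd.DataFrame({'ID': ['b1', 'b2'], 'name': ['x', 'z']})
C = pd.DataFrame({'_id': [0, 1],
                  'ltable_id': ['a1', 'a2'],
                  'rtable_id': ['b1', 'b2']})
em.set_key(A, 'ID')
em.set_key(B, 'ID')
em.set_key(C, '_id')
em.set_fk_ltable(C, 'ltable_id')
em.set_fk_rtable(C, 'rtable_id')
em.set_property(C, 'ltable', A)
em.set_property(C, 'rtable', B)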
Example #12
def _validate_inputs(table, label_column_name, verbose):
    """
    This function validates the inputs for the label_table function
    """
    # Validate the input parameters

    # # The input table is expected to be of type pandas DataFrame
    if not isinstance(table, pd.DataFrame):
        logger.error('Input object is not of type data frame')
        raise AssertionError('Input object is not of type data frame')

    # # The label column name is expected to be of type string
    if not isinstance(label_column_name, six.string_types):
        logger.error('Input attr. is not of type string')
        raise AssertionError('Input attr. is not of type string')

    # # Check if the label column name is already present in the input table
    if ch.check_attrs_present(table, label_column_name):
        logger.error(
            'The label column name (%s) is already present in the '
            'input table', label_column_name)
        raise AssertionError(
            'The label column name (%s) is already present '
            'in the input table' % label_column_name)

    # Now, validate the metadata for the input DataFrame as we have to copy
    # these properties to the output DataFrame

    # # First, display what metadata is required for this function
    ch.log_info(
        logger, 'Required metadata: cand.set key, fk ltable, '
        'fk rtable, ltable, rtable, ltable key, rtable key', verbose)

    # # Second, get the metadata
    key, fk_ltable, fk_rtable, ltable, rtable, l_key, r_key = \
        cm.get_metadata_for_candset(table, logger, verbose)

    # # Third, validate the metadata
    cm._validate_metadata_for_candset(table, key, fk_ltable, fk_rtable, ltable,
                                      rtable, l_key, r_key, logger, verbose)

    # Return True if everything was successful
    return True
Example #13
def _validate_inputs(table, label_column_name, verbose):
    """
    This function validates the inputs for the label_table function
    """
    # Validate the input parameters

    # # The input table is expected to be of type pandas DataFrame
    validate_object_type(table, pd.DataFrame)

    # # The label column name is expected to be of type string
    validate_object_type(label_column_name, six.string_types, error_prefix='Input attr.')

    # # Check if the label column name is already present in the input table
    if ch.check_attrs_present(table, label_column_name):
        logger.error('The label column name (%s) is already present in the '
                     'input table', label_column_name)
        raise AssertionError('The label column name (%s) is already present '
                             'in the input table' % label_column_name)

    # Now, validate the metadata for the input DataFrame as we have to copy
    # these properties to the output DataFrame

    # # First, display what metadata is required for this function
    ch.log_info(logger, 'Required metadata: cand.set key, fk ltable, '
                        'fk rtable, ltable, rtable, ltable key, rtable key',
                verbose)

    # # Second, get the metadata
    key, fk_ltable, fk_rtable, ltable, rtable, l_key, r_key = \
        cm.get_metadata_for_candset(table, logger, verbose)

    # # Third, validate the metadata
    cm._validate_metadata_for_candset(table, key, fk_ltable, fk_rtable,
                                      ltable, rtable, l_key, r_key,
                                      logger, verbose)

    # Return True if everything was successful
    return True
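
These checks guard the public labeling entry point; a hedged usage sketch, where S is a hypothetical sampled candidate set with its candset metadata already set:

import py_entitymatching as em

# Opens a GUI spreadsheet for manual labeling; 'gold' must not already be a column of S
G = em.label_table(S, 'gold')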
def _vis_debug_dt(matcher,
                  train,
                  test,
                  exclude_attrs,
                  target_attr,
                  show_window=True):
    """
    Wrapper function for debugging the Decision Tree matcher visually.
    """

    try:
        from PyQt5 import QtWidgets
        from py_entitymatching.gui.debug_gui_base import MainWindowManager
    except ImportError:
        raise ImportError(
            'PyQt5 is not installed. Please install PyQt5 to use '
            'GUI related functions in py_entitymatching.')

    # Validate the input parameters
    # # We expect the matcher to be of type DTMatcher
    if not isinstance(matcher, DTMatcher):
        logger.error('Input matcher is not of type Decision Tree matcher')
        raise AssertionError('Input matcher is not of type '
                             'Decision Tree matcher')

    # # We expect the target attribute to be of type string.
    validate_object_type(target_attr,
                         six.string_types,
                         error_prefix='Target attribute')

    # # Check whether the exclude attributes are indeed present in the train
    #  DataFrame.
    if not ch.check_attrs_present(train, exclude_attrs):
        logger.error('The exclude attrs are not in train table columns')
        raise AssertionError('The exclude attrs are not in the '
                             'train table columns')

    # # Check whether the target attribute is indeed present in the train
    #  DataFrame.
    if not ch.check_attrs_present(train, target_attr):
        logger.error('The target attr is not in train table columns')
        raise AssertionError('The target attr is not in the '
                             'train table columns')

    # # Check whether the exclude attributes are indeed present in the test
    #  DataFrame.
    if not ch.check_attrs_present(test, exclude_attrs):
        logger.error('The exclude attrs are not in test table columns')
        raise AssertionError('The exclude attrs are not in the '
                             'test table columns')

    # The exclude attributes is expected to be of type list, if not
    # explicitly convert this into a list.
    if not isinstance(exclude_attrs, list):
        exclude_attrs = [exclude_attrs]

    # Drop the duplicates from the exclude attributes
    exclude_attrs = gh.list_drop_duplicates(exclude_attrs)

    # If the target attribute is not present in the exclude attributes,
    # then explicitly add it to the exclude attributes.
    if target_attr not in exclude_attrs:
        exclude_attrs.append(target_attr)

    # Now, fit using training data
    matcher.fit(table=train,
                exclude_attrs=exclude_attrs,
                target_attr=target_attr)

    # Get a column name to store the predictions.
    predict_attr_name = get_name_for_predict_column(test.columns)

    # Predict using the test data
    predicted = matcher.predict(table=test,
                                exclude_attrs=exclude_attrs,
                                target_attr=predict_attr_name,
                                append=True,
                                inplace=False)

    # Get the evaluation summary.
    eval_summary = eval_matches(predicted, target_attr, predict_attr_name)

    # Get metric in a form that can be displayed from the evaluation summary
    metric = _get_metric(eval_summary)

    # Get false negatives and false positives as a DataFrame
    fp_dataframe = _get_dataframe(predicted, eval_summary['false_pos_ls'])
    fn_dataframe = _get_dataframe(predicted, eval_summary['false_neg_ls'])

    em._viewapp = QtWidgets.QApplication.instance()

    if em._viewapp is None:
        em._viewapp = QtWidgets.QApplication([])

    # Get the main window application
    app = em._viewapp
    m = MainWindowManager(matcher, "dt", exclude_attrs, metric, predicted,
                          fp_dataframe, fn_dataframe)
    # If the show window is true, then display the window.
    if show_window:
        m.show()
        app.exec_()
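
A hedged sketch of the public entry point that wraps this function; train and test are hypothetical labeled feature-vector tables:

import py_entitymatching as em

dt = em.DTMatcher()
# Fits on train, predicts on test, and opens the visual debugger window
em.vis_debug_dt(dt, train, test,
                exclude_attrs=['_id', 'ltable_id', 'rtable_id', 'gold'],
                target_attr='gold')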
def set_key(data_frame, key_attribute):
    """
    Sets the value of 'key' property for a DataFrame in the catalog with the
    given attribute (i.e column name).

    Specifically, this function sets the key attribute for the DataFrame
    if the given attribute satisfies the following two properties:

        The key attribute should have unique values.

        The key attribute should not have missing values. A missing value
        is represented as np.NaN.

    Args:
        data_frame (DataFrame): The DataFrame for which the key must be set in
            the catalog.
        key_attribute (string): The key attribute (column name) in the
            DataFrame.

    Returns:
        A Boolean value of True is returned, if the given attribute
        satisfies the conditions for a key and the update was successful.

    Raises:
        AssertionError: If `data_frame` is not of type
            pandas DataFrame.
        AssertionError: If `key_attribute` is not of type string.
        KeyError: If given `key_attribute` is not in the DataFrame columns.

    See Also:
        :meth:`~py_entitymatching.set_property`


    """
    # Validate input parameters

    # # We expect the input object (data_frame) to be of type pandas DataFrame
    if not isinstance(data_frame, pd.DataFrame):
        logger.error('Input object is not of type pandas DataFrame')
        raise AssertionError('Input object is not of type pandas DataFrame')

    # # We expect input key attribute to be of type string
    if not isinstance(key_attribute, six.string_types):
        logger.error('Input key attribute is not of type string')
        raise AssertionError('Input key attribute is not of type string')

    # Check if the key attribute is present as one of the columns in the
    # DataFrame
    if not ch.check_attrs_present(data_frame, key_attribute):
        logger.error('Input key ( %s ) not in the DataFrame' % key_attribute)
        raise KeyError('Input key ( %s ) not in the DataFrame' % key_attribute)

    # Check if the key attribute satisfies the conditions to be a key. If
    # not, just return False.
    # Note: Currently it is not clear, whether we should return False from
    # here or raise an exception. As of now resorting to just returning
    # False, because this function is used by other computation
    # intensive commands in py_entitymatching and raising an exception might make all
    # the work done in those commands go in vain (or those commands should
    # catch the exception correctly, which may be complicated and require
    # changes to the current code). We need to revisit this
    # later.
    if not ch.is_key_attribute(data_frame, key_attribute):
        logger.warning('Attribute ( %s ) does not qualify to be a key; not '
                       'setting/replacing the key' % key_attribute)
        return False
    else:
        # Set the key property for the input DataFrame
        return set_property(data_frame, 'key', key_attribute)
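
A usage sketch illustrating both outcomes; A is a hypothetical table:

import pandas as pd
import py_entitymatching as em

A = pd.DataFrame({'ID': ['a1', 'a2', 'a3'],
                  'zipcode': ['53703', '53703', '53706']})
em.set_key(A, 'ID')        # True: unique and no missing values
em.set_key(A, 'zipcode')   # False (with a warning): duplicates disqualify it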
Example #16
def eval_matches(data_frame, gold_label_attr, predicted_label_attr):
    """
    Evaluates the matches from the matcher.

    Specifically, given a DataFrame containing golden labels and predicted
    labels, this function would evaluate the matches and return the accuracy
    results such as precision, recall and F1.

    Args:
        data_frame (DataFrame): The input pandas DataFrame containing "gold"
            labels and "predicted" labels.
        gold_label_attr (string): An attribute in the input DataFrame containing
            "gold" labels.
        predicted_label_attr (string): An attribute in the input DataFrame
            containing "predicted" labels.

    Returns:
        A Python dictionary containing the accuracy measures such as
        precision, recall, F1.

    Raises:
        AssertionError: If `data_frame` is not of type
            pandas DataFrame.
        AssertionError: If `gold_label_attr` is not of
            type string.
        AssertionError: If `predicted_label_attr` is not of
            type string.
        AssertionError: If the `gold_label_attr` is not in
            the input DataFrame.
        AssertionError: If the `predicted_label_attr` is not in
            the input DataFrame.

    Examples:
        >>> import py_entitymatching as em
        >>> # G is the labeled data used for development purposes, match_f is the feature table
        >>> H = em.extract_feature_vecs(G, feature_table=match_f, attrs_after='gold_labels')
        >>> dt = em.DTMatcher()
        >>> dt.fit(table=H, exclude_attrs=['_id', 'ltable_id', 'rtable_id', 'gold_labels'], target_attr='gold_labels')
        >>> pred_table = dt.predict(table=H,  exclude_attrs=['_id', 'ltable_id', 'rtable_id', 'gold_labels'],  append=True, target_attr='predicted_labels')
        >>> eval_summary = em.eval_matches(pred_table, 'gold_labels', 'predicted_labels')
    """
    # Validate input parameters

    # # We expect the input object to be of type pandas DataFrame
    validate_object_type(data_frame, pd.DataFrame, 'The input table')

    # # We expect the input attribute (gold_label_attr) to be of type string
    validate_object_type(gold_label_attr, six.string_types,
                         'The input gold_label_attr')

    # # We expect the input attribute (predicted_label_attr) to be of type
    # string
    validate_object_type(predicted_label_attr, six.string_types,
                         'The input predicted_label_attr')

    # Check whether the gold label attribute is present in the input table
    if not ch.check_attrs_present(data_frame, gold_label_attr):
        logger.error(
            'The gold_label_attr is not present in the input DataFrame')
        raise AssertionError(
            'The gold_label_attr is not present in the input DataFrame')

    # Check whether the predicted label attribute is present in the input table
    if not ch.check_attrs_present(data_frame, predicted_label_attr):
        logger.error(
            'The predicted_label_attr is not present in the input DataFrame')
        raise AssertionError(
            'The predicted_label_attr is not present in the input DataFrame')

    # Reset the index to get the indices set as 0..len(table)
    new_data_frame = data_frame.reset_index(drop=False, inplace=False)

    # Project out the gold and label attributes.
    gold = new_data_frame[gold_label_attr]
    predicted = new_data_frame[predicted_label_attr]

    # Get gold negatives, positives
    gold_negative = gold[gold == 0].index.values
    gold_positive = gold[gold == 1].index.values

    # Get predicted negatives, positives
    predicted_negative = predicted[predicted == 0].index.values
    predicted_positive = predicted[predicted == 1].index.values

    # get false positive indices
    false_positive_indices = \
        list(set(gold_negative).intersection(predicted_positive))

    # get true positive indices
    true_positive_indices = \
        list(set(gold_positive).intersection(predicted_positive))

    # get false negative indices
    false_negative_indices = \
        list(set(gold_positive).intersection(predicted_negative))

    # get true negative indices
    true_negative_indices = \
        list(set(gold_negative).intersection(predicted_negative))

    # Get the number of TP, FP, FN, TN
    num_true_positives = float(len(true_positive_indices))
    num_false_positives = float(len(false_positive_indices))
    num_false_negatives = float(len(false_negative_indices))
    num_true_negatives = float(len(true_negative_indices))

    # Precision = num_tp / (num_tp + num_fp)
    # Get precision numerator, denominator
    precision_numerator = num_true_positives
    precision_denominator = num_true_positives + num_false_positives

    # Recall = num_tp / (num_tp + num_fn)
    # Get recall numerator, denominator
    recall_numerator = num_true_positives
    recall_denominator = num_true_positives + num_false_negatives

    # Compute precision
    if precision_denominator == 0.0:
        precision = 0.0
    else:
        precision = precision_numerator / precision_denominator

    # Compute recall
    if recall_denominator == 0.0:
        recall = 0.0
    else:
        recall = recall_numerator / recall_denominator

    # Compute F1
    if precision == 0.0 and recall == 0.0:
        F1 = 0.0
    else:
        F1 = (2.0 * precision * recall) / (precision + recall)

    # Get the fk_ltable and fk_rtable
    fk_ltable = cm.get_property(data_frame, 'fk_ltable')
    fk_rtable = cm.get_property(data_frame, 'fk_rtable')

    # Check if the fk_ltable contains any missing values
    if ch.does_contain_missing_vals(data_frame, fk_ltable):
        logger.error('The fk_ltable (%s) contains missing values' % fk_ltable)
        raise AssertionError('The fk_ltable (%s) contains missing values' %
                             fk_ltable)

    # Check if the fk_rtable contains any missing values
    if ch.does_contain_missing_vals(data_frame, fk_rtable):
        logger.error('The fk_rtable (%s) contains missing values' % fk_rtable)
        raise AssertionError('The fk_rtable (%s) contains missing values' %
                             fk_rtable)

    # Set the index values to fk_ltable and fk_rtable
    new_data_frame.set_index([fk_ltable, fk_rtable], drop=False, inplace=True)

    # Get the list of false positives and false negatives.
    false_pos_ls = list(
        new_data_frame.iloc[false_positive_indices].index.values)
    false_neg_ls = list(
        new_data_frame.iloc[false_negative_indices].index.values)

    # Store and return the accuracy results.
    accuracy_results = collections.OrderedDict()
    accuracy_results['prec_numerator'] = precision_numerator
    accuracy_results['prec_denominator'] = precision_denominator
    accuracy_results['precision'] = precision
    accuracy_results['recall_numerator'] = recall_numerator
    accuracy_results['recall_denominator'] = recall_denominator
    accuracy_results['recall'] = recall
    accuracy_results['f1'] = F1
    accuracy_results['pred_pos_num'] = num_true_positives + num_false_positives
    accuracy_results['false_pos_num'] = num_false_positives
    accuracy_results['false_pos_ls'] = false_pos_ls
    accuracy_results['pred_neg_num'] = num_false_negatives + num_true_negatives
    accuracy_results['false_neg_num'] = num_false_negatives
    accuracy_results['false_neg_ls'] = false_neg_ls
    return accuracy_results
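
As a follow-up to the docstring example above, the returned dictionary can be rendered with the library's summary printer:

import py_entitymatching as em

eval_summary = em.eval_matches(pred_table, 'gold_labels', 'predicted_labels')
em.print_eval_summary(eval_summary)   # prints precision, recall, and F1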
Example #17
def extract_feature_vecs(candset,
                         attrs_before=None,
                         feature_table=None,
                         attrs_after=None,
                         verbose=False,
                         show_progress=True):
    """
    This function extracts feature vectors from a DataFrame (typically a
    labeled candidate set).

    Specifically, this function uses feature
    table, ltable and rtable (that is present in the `candset`'s
    metadata) to extract feature vectors.

    Args:
        candset (DataFrame): The input candidate set for which the features
            vectors should be extracted.
        attrs_before (list): The list of attributes from the input candset,
            that should be added before the feature vectors (defaults to None).
        feature_table (DataFrame): A DataFrame containing a list of
            features that should be used to compute the feature vectors (
            defaults to None).
        attrs_after (list): The list of attributes from the input candset
            that should be added after the feature vectors (defaults to None).
        verbose (boolean): A flag to indicate whether the debug information
            should be displayed (defaults to False).
        show_progress (boolean): A flag to indicate whether the progress of
            extracting feature vectors must be displayed (defaults to True).


    Returns:
        A pandas DataFrame containing feature vectors.

        The DataFrame will have metadata ltable and rtable, pointing
        to the same ltable and rtable as the input candset.

        Also, the output
        DataFrame will have three columns: key, foreign key ltable, foreign
        key rtable copied from input candset to the output DataFrame. These
        three columns precede the columns mentioned in `attrs_before`.



    Raises:
        AssertionError: If `candset` is not of type pandas
            DataFrame.
        AssertionError: If `attrs_before` has attributes that
            are not present in the input candset.
        AssertionError: If `attrs_after` has attributes that
            are not present in the input candset.
        AssertionError: If `feature_table` is set to None.

    """
    # Validate input parameters

    # # We expect the input candset to be of type pandas DataFrame.
    if not isinstance(candset, pd.DataFrame):
        logger.error('Input cand.set is not of type dataframe')
        raise AssertionError('Input cand.set is not of type dataframe')

    # # If attrs_before is given, check if the attrs_before are present in
    # the input candset
    if attrs_before is not None:
        if not ch.check_attrs_present(candset, attrs_before):
            logger.error(
                'The attributes mentioned in attrs_before are not present '
                'in the input table')
            raise AssertionError(
                'The attributes mentioned in attrs_before are not present '
                'in the input table')

    # # If attrs_after is given, check if the attrs_after are present in
    # the input candset
    if attrs_after is not None:
        if not ch.check_attrs_present(candset, attrs_after):
            logger.error(
                'The attributes mentioned in attrs_after are not present '
                'in the input table')
            raise AssertionError(
                'The attributes mentioned in attrs_after are not present '
                'in the input table')

    # We expect the feature table to be a valid object
    if feature_table is None:
        logger.error('Feature table cannot be null')
        raise AssertionError('The feature table cannot be null')

    # Do metadata checking
    # # Mention what metadata is required to the user
    ch.log_info(
        logger, 'Required metadata: cand.set key, fk ltable, '
        'fk rtable, '
        'ltable, rtable, ltable key, rtable key', verbose)

    # # Get metadata
    ch.log_info(logger, 'Getting metadata from catalog', verbose)

    key, fk_ltable, fk_rtable, ltable, rtable, l_key, r_key = \
        cm.get_metadata_for_candset(candset, logger, verbose)

    # # Validate metadata
    ch.log_info(logger, 'Validating metadata', verbose)
    cm._validate_metadata_for_candset(candset, key, fk_ltable, fk_rtable,
                                      ltable, rtable, l_key, r_key, logger,
                                      verbose)

    # Extract features

    # id_list = [(row[fk_ltable], row[fk_rtable]) for i, row in
    #            candset.iterrows()]
    # id_list = [tuple(tup) for tup in candset[[fk_ltable, fk_rtable]].values]

    # # Set index for convenience
    l_df = ltable.set_index(l_key, drop=False)
    r_df = rtable.set_index(r_key, drop=False)

    if show_progress:
        prog_bar = pyprind.ProgBar(len(candset))
    # # Apply feature functions
    feat_vals = []
    ch.log_info(logger, 'Applying feature functions', verbose)
    col_names = list(candset.columns)
    fk_ltable_idx = col_names.index(fk_ltable)
    fk_rtable_idx = col_names.index(fk_rtable)
    l_dict = {}
    r_dict = {}

    for row in candset.itertuples(index=False):

        if show_progress:
            prog_bar.update()
        fk_ltable_val = row[fk_ltable_idx]
        fk_rtable_val = row[fk_rtable_idx]

        if fk_ltable_val not in l_dict:
            # Use .loc; the .ix indexer is removed in modern pandas
            l_dict[fk_ltable_val] = l_df.loc[fk_ltable_val]
        l_tuple = l_dict[fk_ltable_val]

        if fk_rtable_val not in r_dict:
            r_dict[fk_rtable_val] = r_df.loc[fk_rtable_val]
        r_tuple = r_dict[fk_rtable_val]

        f = apply_feat_fns(l_tuple, r_tuple, feature_table)
        feat_vals.append(f)

    # Construct output table
    feature_vectors = pd.DataFrame(feat_vals, index=candset.index.values)
    # # Rearrange the feature names in the input feature table order
    feature_names = list(feature_table['feature_name'])
    feature_vectors = feature_vectors[feature_names]

    ch.log_info(logger, 'Constructing output table', verbose)
    # print(feature_vectors)
    # # Insert attrs_before
    if attrs_before:
        if not isinstance(attrs_before, list):
            attrs_before = [attrs_before]
        attrs_before = gh.list_diff(attrs_before, [key, fk_ltable, fk_rtable])
        attrs_before.reverse()
        for a in attrs_before:
            feature_vectors.insert(0, a, candset[a])

    # # Insert keys
    feature_vectors.insert(0, fk_rtable, candset[fk_rtable])
    feature_vectors.insert(0, fk_ltable, candset[fk_ltable])
    feature_vectors.insert(0, key, candset[key])

    # # insert attrs after
    if attrs_after:
        if not isinstance(attrs_after, list):
            attrs_after = [attrs_after]
        attrs_after = gh.list_diff(attrs_after, [key, fk_ltable, fk_rtable])
        attrs_after.reverse()
        col_pos = len(feature_vectors.columns)
        for a in attrs_after:
            feature_vectors.insert(col_pos, a, candset[a])
            col_pos += 1

    # Reset the index
    # feature_vectors.reset_index(inplace=True, drop=True)

    # # Update the catalog
    cm.init_properties(feature_vectors)
    cm.copy_properties(candset, feature_vectors)

    # Finally, return the feature vectors
    return feature_vectors
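
A hedged end-to-end sketch of calling this function; the file paths are placeholders and G is a hypothetical labeled candidate set whose catalog metadata points to A and B:

import py_entitymatching as em

A = em.read_csv_metadata('path_to_csv_dir/table_A.csv', key='ID')
B = em.read_csv_metadata('path_to_csv_dir/table_B.csv', key='ID')
match_f = em.get_features_for_matching(A, B)
H = em.extract_feature_vecs(G, feature_table=match_f,
                            attrs_after='gold_labels', show_progress=False)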
def _validate_metadata_for_candset(candset, key, foreign_key_ltable,
                                   foreign_key_rtable, ltable, rtable,
                                   ltable_key, rtable_key, lgr, verbose):
    """
    Validates metadata for a candidate set.

    """
    # Validate input parameters
    # # We expect candset to be of type pandas DataFrame
    if not isinstance(candset, pd.DataFrame):
        logger.error('Input candset is not of type pandas DataFrame')
        raise AssertionError('Input candset is not of type pandas DataFrame')

    # Check if the key column is present in the candset
    if not ch.check_attrs_present(candset, key):
        logger.error('Input key ( %s ) not in the DataFrame' % key)
        raise KeyError('Input key ( %s ) not in the DataFrame' % key)

    # Check if the foreign key ltable column is present in the candset
    if not ch.check_attrs_present(candset, foreign_key_ltable):
        logger.error('Input foreign_key_ltable ( %s ) not in the DataFrame' %
                     foreign_key_ltable)
        raise KeyError('Input foreign_key_ltable ( %s ) not in the DataFrame' %
                       foreign_key_ltable)

    # Check if the foreign key rtable column is present in the candset
    if not ch.check_attrs_present(candset, foreign_key_rtable):
        logger.error('Input fk_rtable ( %s ) not in the DataFrame' %
                     foreign_key_rtable)
        raise KeyError('Input fk_rtable ( %s ) not in the DataFrame' %
                       foreign_key_rtable)

    # We expect the ltable to be of type pandas DataFrame
    if not isinstance(ltable, pd.DataFrame):
        logger.error('Input ltable is not of type pandas data frame')
        raise AssertionError('Input ltable is not of type pandas data frame')

    # We expect the rtable to be of type pandas DataFrame
    if not isinstance(rtable, pd.DataFrame):
        logger.error('Input rtable is not of type pandas data frame')
        raise AssertionError('Input rtable is not of type pandas data frame')

    # We expect the ltable key to be present in the ltable
    if not ch.check_attrs_present(ltable, ltable_key):
        logger.error('ltable key ( %s ) not in ltable' % ltable_key)
        raise KeyError('ltable key ( %s ) not in ltable' % ltable_key)

    # We expect the rtable key to be present in the rtable
    if not ch.check_attrs_present(rtable, rtable_key):
        logger.error('rtable key ( %s ) not in rtable' % rtable_key)
        raise KeyError('rtable key ( %s ) not in rtable' % rtable_key)

    # First validate metadata for the candidate set (as a table)
    _validate_metadata_for_table(candset, key, 'candset', lgr, verbose)

    ch.log_info(lgr, 'Validating foreign key constraint for left table',
                verbose)
    # Second check foreign key constraints
    if not ch.check_fk_constraint(candset, foreign_key_ltable, ltable,
                                  ltable_key):
        logger.error('Candset does not satisfy foreign key constraint with '
                     'the left table')
        raise AssertionError(
            'Candset does not satisfy foreign key constraint with '
            'the left table')

    ch.log_info(lgr, '..... Done', verbose)
    ch.log_info(lgr, 'Validating foreign key constraint for right table',
                verbose)
    if not ch.check_fk_constraint(candset, foreign_key_rtable, rtable,
                                  rtable_key):
        logger.error('Candset does not satisfy foreign key constraint with '
                     'the right table')
        raise AssertionError(
            'Candset does not satisfy foreign key constraint with '
            'the right table')
    ch.log_info(lgr, '..... Done', verbose)

    return True
Example #19
def impute_table(table,
                 exclude_attrs=None,
                 missing_val='NaN',
                 strategy='mean',
                 axis=0,
                 val_all_nans=0,
                 verbose=True):
    """
    Impute table containing missing values.

    Args:
        table (DataFrame): DataFrame whose values should be imputed.
        exclude_attrs (List) : list of attribute names to be excluded from
            imputing (defaults to None).
        missing_val (string or int):  The placeholder for the missing values.
            All occurrences of `missing_val` will be imputed.
            For missing values encoded as np.nan, use the string value 'NaN'
            (defaults to 'NaN').
        strategy (string): String that specifies on how to impute values. Valid
            strings: 'mean', 'median', 'most_frequent' (defaults to 'mean').
        axis (int):  axis=1 along rows, and axis=0 along columns  (defaults
            to 0).
        val_all_nans (float): Value to fill in if all the values in the column
            are NaN.

    Returns:
        Imputed DataFrame.


    Raises:
        AssertionError: If `table` is not of type pandas DataFrame.

    Examples:
        >>> import py_entitymatching as em
        >>> # H is the feature vector which should be imputed. Specifically, impute the missing values
        >>> # in each column, with the mean of that column
        >>> H = em.impute_table(H, exclude_attrs=['_id', 'ltable_id', 'rtable_id'], strategy='mean')


    """
    # Validate input parameters
    # # We expect the input table to be of type pandas DataFrame
    if not isinstance(table, pd.DataFrame):
        logger.error('Input table is not of type DataFrame')
        raise AssertionError('Input table is not of type DataFrame')

    ch.log_info(
        logger, 'Required metadata: cand.set key, fk ltable, '
        'fk rtable, '
        'ltable, rtable, ltable key, rtable key', verbose)

    # # Get metadata
    key, fk_ltable, fk_rtable, ltable, rtable, l_key, r_key = \
        cm.get_metadata_for_candset(
            table,
            logger, verbose)

    # # Validate metadata
    cm._validate_metadata_for_candset(table, key, fk_ltable, fk_rtable, ltable,
                                      rtable, l_key, r_key, logger, verbose)

    fv_columns = table.columns

    if exclude_attrs is None:
        feature_names = fv_columns

    else:

        # Check if the exclude attributes are present in the input table
        if not ch.check_attrs_present(table, exclude_attrs):
            logger.error('The attributes mentioned in exclude_attrs '
                         'are not present in the input table')
            raise AssertionError('The attributes mentioned in exclude_attrs '
                                 'are not present in the input table')
        # We expect exclude attributes to be of type list. If not, convert
        # them into a list.
        if not isinstance(exclude_attrs, list):
            exclude_attrs = [exclude_attrs]

        # Drop the duplicates from the exclude attributes
        exclude_attrs = gh.list_drop_duplicates(exclude_attrs)

        cols = [c not in exclude_attrs for c in fv_columns]
        feature_names = fv_columns[cols]
    # print feature_names
    table_copy = table.copy()
    projected_table = table_copy[feature_names]

    projected_table_values = projected_table.values

    imp = Imputer(missing_values=missing_val, strategy=strategy, axis=axis)
    imp.fit(projected_table_values)
    # pd.np was removed in modern pandas; this assumes numpy is imported as np
    imp.statistics_[np.isnan(imp.statistics_)] = val_all_nans
    projected_table_values = imp.transform(projected_table_values)
    table_copy[feature_names] = projected_table_values
    # Update catalog
    cm.init_properties(table_copy)
    cm.copy_properties(table, table_copy)

    return table_copy
Example #20
def dask_extract_feature_vecs(candset,
                              attrs_before=None,
                              feature_table=None,
                              attrs_after=None,
                              verbose=False,
                              show_progress=True,
                              n_chunks=1):
    """
    WARNING THIS COMMAND IS EXPERIMENTAL AND NOT TESTED. USE AT YOUR OWN RISK

    This function extracts feature vectors from a DataFrame (typically a
    labeled candidate set).

    Specifically, this function uses feature
    table, ltable and rtable (that is present in the `candset`'s
    metadata) to extract feature vectors.

    Args:
        candset (DataFrame): The input candidate set for which the features
            vectors should be extracted.
            
        attrs_before (list): The list of attributes from the input candset,
            that should be added before the feature vectors (defaults to None).
            
        feature_table (DataFrame): A DataFrame containing a list of
            features that should be used to compute the feature vectors (
            defaults to None).
            
        attrs_after (list): The list of attributes from the input candset
            that should be added after the feature vectors (defaults to None).
            
        verbose (boolean): A flag to indicate whether the debug information
            should be displayed (defaults to False).
            
        show_progress (boolean): A flag to indicate whether the progress of
            extracting feature vectors must be displayed (defaults to True).
            
        n_chunks (int): The number of partitions to split the candidate set. If it 
            is set to -1, the number of partitions will be set to the 
            number of cores in the machine.  


    Returns:
        A pandas DataFrame containing feature vectors.

        The DataFrame will have metadata ltable and rtable, pointing
        to the same ltable and rtable as the input candset.

        Also, the output
        DataFrame will have three columns: key, foreign key ltable, foreign
        key rtable copied from input candset to the output DataFrame. These
        three columns precede the columns mentioned in `attrs_before`.



    Raises:
        AssertionError: If `candset` is not of type pandas
            DataFrame.
        AssertionError: If `attrs_before` has attributes that
            are not present in the input candset.
        AssertionError: If `attrs_after` has attributes that
            are not present in the input candset.
        AssertionError: If `feature_table` is set to None.
        AssertionError: If `n_chunks` is not of type
                int.

    Examples:
        >>> import py_entitymatching as em
        >>> from py_entitymatching.dask.dask_extract_features import dask_extract_feature_vecs
        >>> A = em.read_csv_metadata('path_to_csv_dir/table_A.csv', key='ID')
        >>> B = em.read_csv_metadata('path_to_csv_dir/table_B.csv', key='ID')
        >>> match_f = em.get_features_for_matching(A, B)
        >>> # G is the labeled dataframe which should be converted into feature vectors
        >>> H = dask_extract_feature_vecs(G, feature_table=match_f, attrs_before=['title'], attrs_after=['gold_labels'])


    """
    logger.warning(
        "WARNING THIS COMMAND IS EXPERIMENTAL AND NOT TESTED. USE AT YOUR OWN RISK."
    )

    # Validate input parameters

    # # We expect the input candset to be of type pandas DataFrame.
    validate_object_type(candset, pd.DataFrame, error_prefix='Input cand.set')

    # # If attrs_before is given, check if the attrs_before are present in
    # the input candset
    if attrs_before is not None:
        if not ch.check_attrs_present(candset, attrs_before):
            logger.error(
                'The attributes mentioned in attrs_before are not present '
                'in the input table')
            raise AssertionError(
                'The attributes mentioned in attrs_before are not present '
                'in the input table')

    # # If attrs_after is given, check if the attrs_after are present in
    # the input candset
    if attrs_after is not None:
        if not ch.check_attrs_present(candset, attrs_after):
            logger.error(
                'The attributes mentioned in attrs_after are not present '
                'in the input table')
            raise AssertionError(
                'The attributes mentioned in attrs_after are not present '
                'in the input table')

    # We expect the feature table to be a valid object
    if feature_table is None:
        logger.error('Feature table cannot be null')
        raise AssertionError('The feature table cannot be null')

    # Do metadata checking
    # # Mention what metadata is required to the user
    ch.log_info(
        logger, 'Required metadata: cand.set key, fk ltable, '
        'fk rtable, '
        'ltable, rtable, ltable key, rtable key', verbose)

    # # Get metadata
    ch.log_info(logger, 'Getting metadata from catalog', verbose)

    key, fk_ltable, fk_rtable, ltable, rtable, l_key, r_key = \
        cm.get_metadata_for_candset(
            candset, logger, verbose)

    # # Validate metadata
    ch.log_info(logger, 'Validating metadata', verbose)
    cm._validate_metadata_for_candset(candset, key, fk_ltable, fk_rtable,
                                      ltable, rtable, l_key, r_key, logger,
                                      verbose)

    # Extract features

    # id_list = [(row[fk_ltable], row[fk_rtable]) for i, row in
    #            candset.iterrows()]
    # id_list = [tuple(tup) for tup in candset[[fk_ltable, fk_rtable]].values]

    # # Set index for convenience
    l_df = ltable.set_index(l_key, drop=False)
    r_df = rtable.set_index(r_key, drop=False)

    # # Apply feature functions
    ch.log_info(logger, 'Applying feature functions', verbose)
    col_names = list(candset.columns)
    fk_ltable_idx = col_names.index(fk_ltable)
    fk_rtable_idx = col_names.index(fk_rtable)

    validate_object_type(n_chunks, int, 'Parameter n_chunks')
    validate_chunks(n_chunks)

    n_chunks = get_num_partitions(n_chunks, len(candset))

    c_splits = np.array_split(candset, n_chunks)

    pickled_obj = cloudpickle.dumps(feature_table)

    feat_vals_by_splits = []

    for i in range(len(c_splits)):
        partial_result = delayed(get_feature_vals_by_cand_split)(
            pickled_obj, fk_ltable_idx, fk_rtable_idx, l_df, r_df, c_splits[i],
            False)
        feat_vals_by_splits.append(partial_result)

    feat_vals_by_splits = delayed(wrap)(feat_vals_by_splits)
    if show_progress:
        with ProgressBar():
            feat_vals_by_splits = feat_vals_by_splits.compute(
                scheduler="processes", num_workers=get_num_cores())
    else:
        feat_vals_by_splits = feat_vals_by_splits.compute(
            scheduler="processes", num_workers=get_num_cores())

    feat_vals = sum(feat_vals_by_splits, [])

    # Construct output table
    feature_vectors = pd.DataFrame(feat_vals, index=candset.index.values)
    # # Rearrange the feature names in the input feature table order
    feature_names = list(feature_table['feature_name'])
    feature_vectors = feature_vectors[feature_names]

    ch.log_info(logger, 'Constructing output table', verbose)
    # print(feature_vectors)
    # # Insert attrs_before
    if attrs_before:
        if not isinstance(attrs_before, list):
            attrs_before = [attrs_before]
        attrs_before = gh.list_diff(attrs_before, [key, fk_ltable, fk_rtable])
        attrs_before.reverse()
        for a in attrs_before:
            feature_vectors.insert(0, a, candset[a])

    # # Insert keys
    feature_vectors.insert(0, fk_rtable, candset[fk_rtable])
    feature_vectors.insert(0, fk_ltable, candset[fk_ltable])
    feature_vectors.insert(0, key, candset[key])

    # # insert attrs after
    if attrs_after:
        if not isinstance(attrs_after, list):
            attrs_after = [attrs_after]
        attrs_after = gh.list_diff(attrs_after, [key, fk_ltable, fk_rtable])
        attrs_after.reverse()
        col_pos = len(feature_vectors.columns)
        for a in attrs_after:
            feature_vectors.insert(col_pos, a, candset[a])
            col_pos += 1

    # Reset the index
    # feature_vectors.reset_index(inplace=True, drop=True)

    # # Update the catalog
    cm.init_properties(feature_vectors)
    cm.copy_properties(candset, feature_vectors)

    # Finally, return the feature vectors
    return feature_vectors
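The chunk loop above is the standard dask fan-out/gather idiom: build one
delayed task per partition, gather the partials into a single delayed node,
and compute everything on the multiprocessing scheduler. Below is a minimal,
self-contained sketch of that idiom; process_chunk and the toy DataFrame are
hypothetical stand-ins for get_feature_vals_by_cand_split and the candidate
set, not part of py_entitymatching.

# Sketch of the delayed fan-out/gather pattern used above (assumes dask,
# numpy and pandas are installed; process_chunk is hypothetical).
import numpy as np
import pandas as pd
from dask import delayed
from dask.diagnostics import ProgressBar

def process_chunk(chunk):
    # Stand-in for get_feature_vals_by_cand_split: one result per row.
    return (chunk['a'] + chunk['b']).tolist()

df = pd.DataFrame({'a': range(10), 'b': range(10)})
chunks = np.array_split(df, 4)

# One delayed task per chunk, then one delayed node that gathers them all.
tasks = [delayed(process_chunk)(c) for c in chunks]
gathered = delayed(list)(tasks)

with ProgressBar():
    parts = gathered.compute(scheduler="processes", num_workers=2)

feat_vals = sum(parts, [])  # concatenate the per-chunk lists, as done above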
Example No. 21
    def test_check_attrs_invalid_None(self):
        A = pd.read_csv(path_a)
        status = ch.check_attrs_present(A, None)
        self.assertEqual(status, False)
Example No. 22
    def test_check_attrs_present_invalid_df(self):
        ch.check_attrs_present(None, 'ID')
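The two tests above pin down part of the check_attrs_present contract: a None
attribute list returns False rather than raising. A plausible implementation
sketch consistent with that behavior follows; it is illustrative only, not
the library's actual code.

def check_attrs_present(table, attrs):
    # Sketch only: True iff every name in attrs is a column of table.
    # Matching the tests above, a None attrs argument yields False.
    if attrs is None:
        return False
    if not isinstance(attrs, list):
        attrs = [attrs]
    return all(a in table.columns for a in attrs)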
def extract_feature_vecs(candset, attrs_before=None, feature_table=None,
                         attrs_after=None, verbose=False,
                         show_progress=True, n_jobs=1,
                         FeatureExtractor=ParallelFeatureExtractor):
    """
    This function extracts feature vectors from a DataFrame (typically a
    labeled candidate set).

    Specifically, this function uses the feature table, together with the
    ltable and rtable present in the `candset`'s metadata, to extract
    feature vectors.

    Args:
        candset (DataFrame): The input candidate set for which the feature
            vectors should be extracted.
        attrs_before (list): The list of attributes from the input candset
            that should be added before the feature vectors (defaults to None).
        feature_table (DataFrame): A DataFrame containing a list of
            features that should be used to compute the feature vectors
            (defaults to None).
        attrs_after (list): The list of attributes from the input candset
            that should be added after the feature vectors (defaults to None).
        verbose (boolean): A flag to indicate whether the debug information
            should be displayed (defaults to False).
        show_progress (boolean): A flag to indicate whether the progress of
            extracting feature vectors must be displayed (defaults to True).


    Returns:
        A pandas DataFrame containing feature vectors.

        The DataFrame will have metadata ltable and rtable, pointing
        to the same ltable and rtable as the input candset.

        Also, the output DataFrame will have three columns copied from the
        input candset: the key, the foreign key to ltable, and the foreign
        key to rtable. These three columns precede the columns mentioned in
        `attrs_before`.



    Raises:
        AssertionError: If `candset` is not of type pandas
            DataFrame.
        AssertionError: If `attrs_before` has attributes that
            are not present in the input candset.
        AssertionError: If `attrs_after` has attributes that
            are not present in the input candset.
        AssertionError: If `feature_table` is set to None.


    Examples:
        >>> import py_entitymatching as em
        >>> A = em.read_csv_metadata('path_to_csv_dir/table_A.csv', key='ID')
        >>> B = em.read_csv_metadata('path_to_csv_dir/table_B.csv', key='ID')
        >>> match_f = em.get_features_for_matching(A, B)
        >>> # G is the labeled dataframe which should be converted into feature vectors
        >>> H = em.extract_feature_vecs(G, feature_table=match_f, attrs_before=['title'], attrs_after=['gold_labels'])


    """
    # (Matt) Stage 1: Input validation
    # Validate input parameters

    # # We expect the input candset to be of type pandas DataFrame.
    validate_object_type(candset, pd.DataFrame, error_prefix='Input cand.set')
    
    # # We expect the FeatureExtractor class to be of type BaseFeatureExtractor
    validate_subclass(FeatureExtractor, BaseFeatureExtractor, error_prefix='Input FeatureExtractor')

    # (Matt) The two blocks below are making sure that attributes that are to be appended
    # to this function's output do in fact exist in the input DataFrame
    
    # # If the attrs_before is given, Check if the attrs_before are present in
    # the input candset
    if attrs_before is not None:
        if not ch.check_attrs_present(candset, attrs_before):
            logger.error(
                'The attributes mentioned in attrs_before are not present '
                'in the input table')
            raise AssertionError(
                'The attributes mentioned in attrs_before are not present '
                'in the input table')

    # # If the attrs_after is given, Check if the attrs_after are present in
    # the input candset
    if attrs_after is not None:
        if not ch.check_attrs_present(candset, attrs_after):
            logger.error(
                'The attributes mentioned in attrs_after are not present '
                'in the input table')
            raise AssertionError(
                'The attributes mentioned in attrs_after are not present '
                'in the input table')

    # (Matt) Why not make sure that this is a DataFrame instead of just nonempty?
    # We expect the feature table to be a valid object
    if feature_table is None:
        logger.error('Feature table cannot be null')
        raise AssertionError('The feature table cannot be null')

    # Do metadata checking
    # # Mention what metadata is required to the user
    ch.log_info(logger, 'Required metadata: cand.set key, fk ltable, '
                        'fk rtable, '
                        'ltable, rtable, ltable key, rtable key', verbose)

    # (Matt) ch ~ catalog helper
    # # Get metadata
    ch.log_info(logger, 'Getting metadata from catalog', verbose)

    # (Matt) cm ~ catalog manager
    key, fk_ltable, fk_rtable, ltable, rtable, l_key, r_key = \
        cm.get_metadata_for_candset(
            candset, logger, verbose)

    # # Validate metadata
    ch.log_info(logger, 'Validating metadata', verbose)
    cm._validate_metadata_for_candset(candset, key, fk_ltable, fk_rtable,
                                      ltable, rtable, l_key, r_key,
                                      logger, verbose)

    # Extract features



    # id_list = [(row[fk_ltable], row[fk_rtable]) for i, row in
    #            candset.iterrows()]
    # id_list = [tuple(tup) for tup in candset[[fk_ltable, fk_rtable]].values]

    # (Matt) ParallelFeatureExtractor implementation starts here
    
    # # Apply feature functions
    feature_extractor = FeatureExtractor(
        feature_table,
        n_jobs=n_jobs,
        verbose=verbose,
        show_progress=show_progress
    )
    feat_vals = feature_extractor.extract_from(candset)
    
    # (Matt) ParallelFeatureExtractor implementation ends here; the rest is formatting

    # Construct output table
    feature_vectors = pd.DataFrame(feat_vals, index=candset.index.values)
    # # Rearrange the feature names in the input feature table order
    feature_names = list(feature_table['feature_name'])
    feature_vectors = feature_vectors[feature_names]

    ch.log_info(logger, 'Constructing output table', verbose)
    # print(feature_vectors)
    # # Insert attrs_before
    if attrs_before:
        if not isinstance(attrs_before, list):
            attrs_before = [attrs_before]
        attrs_before = gh.list_diff(attrs_before, [key, fk_ltable, fk_rtable])
        attrs_before.reverse()
        for a in attrs_before:
            feature_vectors.insert(0, a, candset[a])

    # # Insert keys
    feature_vectors.insert(0, fk_rtable, candset[fk_rtable])
    feature_vectors.insert(0, fk_ltable, candset[fk_ltable])
    feature_vectors.insert(0, key, candset[key])

    # # insert attrs after
    if attrs_after:
        if not isinstance(attrs_after, list):
            attrs_after = [attrs_after]
        attrs_after = gh.list_diff(attrs_after, [key, fk_ltable, fk_rtable])
        attrs_after.reverse()
        col_pos = len(feature_vectors.columns)
        for a in attrs_after:
            feature_vectors.insert(col_pos, a, candset[a])
            col_pos += 1

    # Reset the index
    # feature_vectors.reset_index(inplace=True, drop=True)

    # # Update the catalog
    cm.init_properties(feature_vectors)
    cm.copy_properties(candset, feature_vectors)

    # Finally, return the feature vectors
    return feature_vectors
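Because the extractor class is injected as a parameter, callers can substitute
their own implementation. The sketch below assumes only what the call site
above shows: the constructor keywords and an extract_from(candset) method.
The subclass name and the logging it adds are illustrative.

# Hypothetical extractor subclass; relies only on the interface visible above.
class LoggingFeatureExtractor(ParallelFeatureExtractor):
    def extract_from(self, candset):
        # Log the workload size, then defer to the parallel implementation.
        logger.info('Extracting features for %d candidate pairs', len(candset))
        return super(LoggingFeatureExtractor, self).extract_from(candset)

# Usage (sketch): pass the class itself, not an instance.
# H = extract_feature_vecs(G, feature_table=match_f,
#                          FeatureExtractor=LoggingFeatureExtractor)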
def _vis_debug_rf(matcher, train, test, exclude_attrs, target_attr,
                  show_window=True):
    """
    Wrapper function for debugging the Random Forest matcher visually.
    """
    try:
        from PyQt5 import QtWidgets
        from py_entitymatching.gui.debug_gui_base import MainWindowManager
    except ImportError:
        raise ImportError('PyQt5 is not installed. Please install PyQt5 to use '
                          'GUI related functions in py_entitymatching.')


    # Validate the input parameters
    # # We expect the matcher to be of type RFMatcher
    if not isinstance(matcher, RFMatcher):
        logger.error('Input matcher is not of type '
                     'Random Forest matcher')
        raise AssertionError('Input matcher is not of type '
                             'Random Forest matcher')

    # # We expect the target attribute to be of type string.
    validate_object_type(target_attr, six.string_types, error_prefix='Target attribute')

    # # Check whether the exclude attributes are indeed present in the train
    #  DataFrame.
    if not check_attrs_present(train, exclude_attrs):
        logger.error('The exclude attrs are not in train table columns')
        raise AssertionError('The exclude attrs are not in the train table columns')

    # # Check whether the target attribute is indeed present in the train
    #  DataFrame.
    if not check_attrs_present(train, target_attr):
        logger.error('The target attr is not in train table columns')
        raise AssertionError('The target attr is not in the train table columns')

    # # Check whether the exclude attributes are indeed present in the test
    #  DataFrame.
    if not check_attrs_present(test, exclude_attrs):
        logger.error('The exclude attrs are not in test table columns')
        raise AssertionError('The exclude attrs are not in the test table columns')


    # The exclude attributes are expected to be of type list; if not,
    # explicitly convert them into a list.
    if not isinstance(exclude_attrs, list):
        exclude_attrs = [exclude_attrs]

    # Drop the duplicates from the exclude attributes
    exclude_attrs = list_drop_duplicates(exclude_attrs)

    # If the target attribute is not present in the exclude attributes,
    # then explicitly add it to the exclude attributes.
    if target_attr not in exclude_attrs:
        exclude_attrs.append(target_attr)

    # Now, fit using training data
    matcher.fit(table=train, exclude_attrs=exclude_attrs,
                target_attr=target_attr)

    # Get a column name to store the predictions.
    predict_attr_name = get_name_for_predict_column(test.columns)

    # Predict using the test data
    predicted = matcher.predict(table=test, exclude_attrs=exclude_attrs,
                                target_attr=predict_attr_name, append=True,
                                inplace=False)

    # Get the evaluation summary.
    eval_summary = em.eval_matches(predicted, target_attr, predict_attr_name)
    em._viewapp = QtWidgets.QApplication.instance()
    if em._viewapp is None:
        em._viewapp = QtWidgets.QApplication([])

    # Get metric in a form that can be displayed from the evaluation summary
    metric = _get_metric(eval_summary)

    # Get false negatives and false positives as a DataFrame
    fp_dataframe = _get_dataframe(predicted, eval_summary['false_pos_ls'])
    fn_dataframe = _get_dataframe(predicted, eval_summary['false_neg_ls'])

    # Get the main window application

    app = em._viewapp

    m = MainWindowManager(matcher, "rf", exclude_attrs, metric, predicted, fp_dataframe,
                          fn_dataframe)

    # If the show window is true, then display the window.
    if show_window:
        m.show()
        app.exec_()
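A hedged usage sketch for the wrapper above, assuming em.vis_debug_rf is the
public entry point that delegates to _vis_debug_rf; I and J stand for
hypothetical labeled feature-vector tables for training and testing.

# Sketch: visually debug a Random Forest matcher (opens a PyQt5 window).
import py_entitymatching as em

rf = em.RFMatcher()
# I, J: hypothetical labeled feature-vector tables (train and test splits).
em.vis_debug_rf(rf, I, J,
                exclude_attrs=['_id', 'ltable_ID', 'rtable_ID'],
                target_attr='gold_labels')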
Example No. 25
    def test_check_attrs_present_valid_1(self):
        A = pd.read_csv(path_a)
        status = ch.check_attrs_present(A, 'ID')
        self.assertEqual(status, True)
def extract_feature_vecs(candset, attrs_before=None, feature_table=None,
                         attrs_after=None, verbose=False,
                         show_progress=True, n_jobs=1):
    """
    This function extracts feature vectors from a DataFrame (typically a
    labeled candidate set).

    Specifically, this function uses the feature table, together with the
    ltable and rtable present in the `candset`'s metadata, to extract
    feature vectors.

    Args:
        candset (DataFrame): The input candidate set for which the feature
            vectors should be extracted.
        attrs_before (list): The list of attributes from the input candset
            that should be added before the feature vectors (defaults to None).
        feature_table (DataFrame): A DataFrame containing a list of
            features that should be used to compute the feature vectors
            (defaults to None).
        attrs_after (list): The list of attributes from the input candset
            that should be added after the feature vectors (defaults to None).
        verbose (boolean): A flag to indicate whether the debug information
            should be displayed (defaults to False).
        show_progress (boolean): A flag to indicate whether the progress of
            extracting feature vectors must be displayed (defaults to True).


    Returns:
        A pandas DataFrame containing feature vectors.

        The DataFrame will have metadata ltable and rtable, pointing
        to the same ltable and rtable as the input candset.

        Also, the output DataFrame will have three columns copied from the
        input candset: the key, the foreign key to ltable, and the foreign
        key to rtable. These three columns precede the columns mentioned in
        `attrs_before`.



    Raises:
        AssertionError: If `candset` is not of type pandas
            DataFrame.
        AssertionError: If `attrs_before` has attributes that
            are not present in the input candset.
        AssertionError: If `attrs_after` has attributes that
            are not present in the input candset.
        AssertionError: If `feature_table` is set to None.


    Examples:
        >>> import py_entitymatching as em
        >>> A = em.read_csv_metadata('path_to_csv_dir/table_A.csv', key='ID')
        >>> B = em.read_csv_metadata('path_to_csv_dir/table_B.csv', key='ID')
        >>> match_f = em.get_features_for_matching(A, B)
        >>> # G is the labeled dataframe which should be converted into feature vectors
        >>> H = em.extract_feature_vecs(G, feature_table=match_f, attrs_before=['title'], attrs_after=['gold_labels'])


    """
    # Validate input parameters

    # # We expect the input candset to be of type pandas DataFrame.
    validate_object_type(candset, pd.DataFrame, error_prefix='Input cand.set')

    # # If the attrs_before is given, Check if the attrs_before are present in
    # the input candset
    if attrs_before is not None:
        if not ch.check_attrs_present(candset, attrs_before):
            logger.error(
                'The attributes mentioned in attrs_before are not present '
                'in the input table')
            raise AssertionError(
                'The attributes mentioned in attrs_before are not present '
                'in the input table')

    # # If the attrs_after is given, Check if the attrs_after are present in
    # the input candset
    if attrs_after is not None:
        if not ch.check_attrs_present(candset, attrs_after):
            logger.error(
                'The attributes mentioned in attrs_after are not present '
                'in the input table')
            raise AssertionError(
                'The attributes mentioned in attrs_after are not present '
                'in the input table')

    # We expect the feature table to be a valid object
    if feature_table is None:
        logger.error('Feature table cannot be null')
        raise AssertionError('The feature table cannot be null')

    # Do metadata checking
    # # Mention what metadata is required to the user
    ch.log_info(logger, 'Required metadata: cand.set key, fk ltable, '
                        'fk rtable, '
                        'ltable, rtable, ltable key, rtable key', verbose)

    # # Get metadata
    ch.log_info(logger, 'Getting metadata from catalog', verbose)

    key, fk_ltable, fk_rtable, ltable, rtable, l_key, r_key = \
        cm.get_metadata_for_candset(
            candset, logger, verbose)

    # # Validate metadata
    ch.log_info(logger, 'Validating metadata', verbose)
    cm._validate_metadata_for_candset(candset, key, fk_ltable, fk_rtable,
                                      ltable, rtable, l_key, r_key,
                                      logger, verbose)

    # Extract features



    # id_list = [(row[fk_ltable], row[fk_rtable]) for i, row in
    #            candset.iterrows()]
    # id_list = [tuple(tup) for tup in candset[[fk_ltable, fk_rtable]].values]

    # # Set index for convenience
    l_df = ltable.set_index(l_key, drop=False)
    r_df = rtable.set_index(r_key, drop=False)

    # # Apply feature functions
    ch.log_info(logger, 'Applying feature functions', verbose)
    col_names = list(candset.columns)
    fk_ltable_idx = col_names.index(fk_ltable)
    fk_rtable_idx = col_names.index(fk_rtable)

    n_procs = get_num_procs(n_jobs, len(candset))

    # pd.np was removed in pandas 1.0; call numpy directly (assumes numpy is
    # imported as np alongside pandas).
    c_splits = np.array_split(candset, n_procs)

    pickled_obj = cloudpickle.dumps(feature_table)

    feat_vals_by_splits = Parallel(n_jobs=n_procs)(
        delayed(get_feature_vals_by_cand_split)(
            pickled_obj, fk_ltable_idx, fk_rtable_idx, l_df, r_df,
            c_splits[i], show_progress and i == len(c_splits) - 1)
        for i in range(len(c_splits)))

    feat_vals = sum(feat_vals_by_splits, [])

    # Construct output table
    feature_vectors = pd.DataFrame(feat_vals, index=candset.index.values)
    # # Rearrange the feature names in the input feature table order
    feature_names = list(feature_table['feature_name'])
    feature_vectors = feature_vectors[feature_names]

    ch.log_info(logger, 'Constructing output table', verbose)
    # print(feature_vectors)
    # # Insert attrs_before
    if attrs_before:
        if not isinstance(attrs_before, list):
            attrs_before = [attrs_before]
        attrs_before = gh.list_diff(attrs_before, [key, fk_ltable, fk_rtable])
        attrs_before.reverse()
        for a in attrs_before:
            feature_vectors.insert(0, a, candset[a])

    # # Insert keys
    feature_vectors.insert(0, fk_rtable, candset[fk_rtable])
    feature_vectors.insert(0, fk_ltable, candset[fk_ltable])
    feature_vectors.insert(0, key, candset[key])

    # # insert attrs after
    if attrs_after:
        if not isinstance(attrs_after, list):
            attrs_after = [attrs_after]
        attrs_after = gh.list_diff(attrs_after, [key, fk_ltable, fk_rtable])
        attrs_after.reverse()
        col_pos = len(feature_vectors.columns)
        for a in attrs_after:
            feature_vectors.insert(col_pos, a, candset[a])
            col_pos += 1

    # Reset the index
    # feature_vectors.reset_index(inplace=True, drop=True)

    # # Update the catalog
    cm.init_properties(feature_vectors)
    cm.copy_properties(candset, feature_vectors)

    # Finally, return the feature vectors
    return feature_vectors
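The same split/dispatch/flatten pattern in isolation: joblib pickles the
function and each split to worker processes and returns results in submission
order. A minimal standalone sketch, with score_chunk as a hypothetical
stand-in for get_feature_vals_by_cand_split.

import numpy as np
import pandas as pd
from joblib import Parallel, delayed

def score_chunk(chunk):
    # Stand-in per-split work: one value per row.
    return (chunk['a'] * chunk['b']).tolist()

df = pd.DataFrame({'a': range(8), 'b': range(8)})
splits = np.array_split(df, 4)

parts = Parallel(n_jobs=4)(delayed(score_chunk)(s) for s in splits)
results = sum(parts, [])  # flatten the per-split lists in order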
Example No. 27
    def test_check_attrs_present_valid_3(self):
        A = pd.read_csv(path_a)
        status = ch.check_attrs_present(A, ['_ID'])
        self.assertEqual(status, False)
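The test methods in these examples appear without their enclosing class. A
minimal sketch of the presumed harness follows; path_a is a hypothetical
fixture path and the import path for the catalog helper module is assumed.

import unittest

import pandas as pd

# Assumed import path for the helper exercised by these tests.
import py_entitymatching.utils.catalog_helper as ch

path_a = 'path_to_csv_dir/table_A.csv'  # hypothetical fixture path

class CheckAttrsPresentTests(unittest.TestCase):
    def test_check_attrs_present_valid_3(self):
        A = pd.read_csv(path_a)
        self.assertEqual(ch.check_attrs_present(A, ['_ID']), False)

if __name__ == '__main__':
    unittest.main()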