コード例 #1
0
def _validate_metadata_for_table(table, key, output_string, lgr, verbose):
    """
    Check that `key` can serve as the key column of `table`.

    The checks run in a fixed order: `table` must be a pandas DataFrame,
    `key` must pass ch.check_attrs_present for `table`, `key` must be a
    string, and finally `key` must pass ch.is_key_attribute.

    Args:
        table: The object expected to be a pandas DataFrame.
        key: The name of the candidate key column.
        output_string: Label for the table, used only inside log and error
            messages.
        lgr: Logger object handed to ch.log_info for progress messages.
        verbose: Flag forwarded to ch.log_info and ch.is_key_attribute.

    Returns:
        True when every check passes.

    Raises:
        AssertionError: If `table` is not a pandas DataFrame, if `key` is
            not a string, or if `key` does not qualify as a key.
        KeyError: If `key` is not present in `table`.
    """
    # The table itself must be a DataFrame before anything else is inspected.
    if not isinstance(table, pd.DataFrame):
        not_df_msg = 'Input object is not of type pandas DataFrame'
        logger.error(not_df_msg)
        raise AssertionError(not_df_msg)

    # The key column has to exist in the table.
    key_present = ch.check_attrs_present(table, key)
    if not key_present:
        missing_msg = 'Input key ( %s ) not in the DataFrame' % key
        logger.error(missing_msg)
        raise KeyError(missing_msg)

    # Announce the key validation step.
    progress_msg = 'Validating ' + output_string + ' key: ' + str(key)
    ch.log_info(lgr, progress_msg, verbose)

    # The key name must be a string.
    if not isinstance(key, six.string_types):
        bad_type_msg = 'Key attribute must be of type string'
        logger.error(bad_type_msg)
        raise AssertionError(bad_type_msg)

    # The column must satisfy the key conditions checked by the helper.
    qualifies = ch.is_key_attribute(table, key, verbose)
    if not qualifies:
        bad_key_msg = ('Attribute %s in the %s table does not '
                       'qualify to be the key' % (str(key), output_string))
        logger.error(bad_key_msg)
        raise AssertionError(bad_key_msg)

    ch.log_info(lgr, '..... Done', verbose)
    return True
コード例 #2
0
def set_key(data_frame, key_attribute):
    """
    Sets the value of 'key' property for a DataFrame in the catalog with the
    given attribute (i.e column name).

    Specifically, this function sets the key attribute for the DataFrame
    if the given attribute satisfies the following two properties:

        The key attribute should have unique values.

        The key attribute should not have missing values. A missing value
        is represented as np.NaN.

    Args:
        data_frame (DataFrame): The DataFrame for which the key must be set in
            the catalog.
        key_attribute (string): The key attribute (column name) in the
            DataFrame.

    Returns:
        The result of `set_property` (documented as a Boolean True on a
        successful update) if the given attribute satisfies the conditions
        for a key. False is returned, and the catalog is left untouched, if
        the attribute does not qualify to be a key.

    Raises:
        AssertionError: If `data_frame` is not of type
            pandas DataFrame.
        AssertionError: If `key_attribute` is not of type string.
        KeyError: If given `key_attribute` is not in the DataFrame columns.

    See Also:
        :meth:`~py_entitymatching.set_property`


    """
    # Validate input parameters

    # # We expect the input object (data_frame) to be of type pandas DataFrame
    if not isinstance(data_frame, pd.DataFrame):
        logger.error('Input object is not of type pandas DataFrame')
        raise AssertionError('Input object is not of type pandas DataFrame')

    # # We expect input key attribute to be of type string. Raise here, as
    # # the docstring promises, instead of only logging and falling through
    # # to the column-presence check with a non-string key.
    if not isinstance(key_attribute, six.string_types):
        logger.error('Input key attribute is not of type string')
        raise AssertionError('Input key attribute is not of type string')

    # Check if the key attribute is present as one of the columns in the
    # DataFrame
    if not ch.check_attrs_present(data_frame, key_attribute):
        logger.error('Input key ( %s ) not in the DataFrame' % key_attribute)
        raise KeyError('Input key ( %s ) not in the DataFrame' % key_attribute)

    # Check if the key attribute satisfies the conditions to be a key. If
    # not, just return False.
    # Note: Currently it is not clear, whether we should return False from
    # here or raise an exception. As of now resorting to just returning
    # False, because this function is used by other computation
    # intensive commands in py_entitymatching and raising an exception might
    # make all the work done in those commands go in vain (or those commands
    # should catch the exception correctly, which may be complicated and
    # require changes to the current code). We need to revisit this
    # later.
    if ch.is_key_attribute(data_frame, key_attribute) is False:
        logger.warning('Attribute (%s ) does not qualify  to be a key; Not '
                       'setting/replacing the key' % key_attribute)
        return False

    # Set the key property for the input DataFrame
    return set_property(data_frame, 'key', key_attribute)
コード例 #3
0
 def test_is_key_attribute_invalid_attr(self):
     # Call is_key_attribute with None in place of the attribute name.
     # NOTE(review): no assertion is visible here; the expected failure is
     # presumably declared by a decorator outside this snippet -- confirm.
     table_a = pd.read_csv(path_a)
     missing_attr = None
     ch.is_key_attribute(table_a, missing_attr)
コード例 #4
0
 def test_is_key_attribute_invalid_df(self):
     # Call is_key_attribute with None in place of the table.
     # NOTE(review): no assertion is visible here; the expected failure is
     # presumably declared by a decorator outside this snippet -- confirm.
     not_a_table = None
     ch.is_key_attribute(not_a_table, 'id')
コード例 #5
0
 def test_is_key_attribute_valid_4(self):
     # A table with columns but no rows is expected to accept 'id' as key.
     empty_table = pd.DataFrame(columns=['id', 'name'])
     self.assertEqual(ch.is_key_attribute(empty_table, 'id'), True)
コード例 #6
0
 def test_is_key_attribute_valid_3(self):
     # 'ID' in A_mvals.csv is expected not to qualify as a key.
     # NOTE(review): the file name suggests missing values -- confirm.
     csv_path = os.sep.join((catalog_datasets_path, 'A_mvals.csv'))
     table_with_mvals = pd.read_csv(csv_path)
     is_key = ch.is_key_attribute(table_with_mvals, 'ID', True)
     self.assertEqual(is_key, False)
コード例 #7
0
 def test_is_key_attribute_valid_2(self):
     # 'zipcode' in table A is expected not to qualify as a key.
     table_a = pd.read_csv(path_a)
     self.assertEqual(ch.is_key_attribute(table_a, 'zipcode', True), False)
コード例 #8
0
 def test_is_key_attribute_valid_1(self):
     # 'ID' in table A is expected to qualify as a key.
     table_a = pd.read_csv(path_a)
     self.assertEqual(ch.is_key_attribute(table_a, 'ID', True), True)