def test_transform_numeric(self):
    """Check that ``transform`` one-hot encodes numeric input.

    The transformer is fitted on a two-value numeric series and the
    transformed output is compared against a matrix holding one
    one-hot row per input item. The ``dummy_encoded`` flag is expected
    to be falsy after the transformation.

    Input:
    - Series with numeric input
    Output:
    - one-hot encoding of the input
    """
    # Setup
    series = pd.Series([1, 2])
    encoder = OneHotEncodingTransformer()
    encoder.fit(series)

    # Run
    result = encoder.transform(series)

    # Assert
    one_hot = np.array([
        [1, 0],
        [0, 1],
    ])
    assert not encoder.dummy_encoded
    np.testing.assert_array_equal(result, one_hot)
def test_fit_single(self):
    """Check ``fit`` on a series holding one repeated category.

    Input:
    - Series with a single categorical value repeated three times
    Expected:
    - ``dummies`` contains only that value
    """
    # Setup
    encoder = OneHotEncodingTransformer()
    series = pd.Series(['a', 'a', 'a'])

    # Run
    encoder.fit(series)

    # Assert
    np.testing.assert_array_equal(encoder.dummies, ['a'])
def _fit_discrete(self, column_name, raw_column_data):
    """Fit a one-hot encoder on a discrete column and describe the result.

    Args:
        column_name: Name recorded in the returned info object.
        raw_column_data: Data the encoder is fitted on.

    Returns:
        A ``ColumnTransformInfo`` carrying the fitted encoder, with a
        single softmax span whose size is the number of entries in the
        encoder's ``dummies``.
    """
    encoder = OneHotEncodingTransformer()
    encoder.fit(raw_column_data)

    # The span size and the output width are both the encoder's dummy count.
    width = len(encoder.dummies)
    softmax_span = SpanInfo(width, 'softmax')

    return ColumnTransformInfo(
        column_name=column_name,
        column_type="discrete",
        transform=encoder,
        transform_aux=None,
        output_info=[softmax_span],
        output_dimensions=width,
    )
def test_one_hot_numerical_nans():
    """Ensure OneHotEncodingTransformer round-trips numerical + nan only columns.

    The series is fitted, transformed and reverse-transformed, and the
    result must equal the original series.
    """
    original = pd.Series([1, 2, float('nan'), np.nan])

    encoder = OneHotEncodingTransformer()
    encoder.fit(original)
    encoded = encoder.transform(original)
    recovered = encoder.reverse_transform(encoded)

    pd.testing.assert_series_equal(recovered, original)
def _fit_discrete(self, column, data):
    """Fit a one-hot encoder on the first column of ``data``.

    Args:
        column: Value stored under the ``"name"`` key of the result.
        data: Two-dimensional indexable; only ``data[:, 0]`` is used.

    Returns:
        dict: The column name, the fitted encoder, a single
        ``(size, "softmax")`` output entry and the output width, where
        the size is the number of entries in the encoder's ``dummies``.
    """
    encoder = OneHotEncodingTransformer()
    first_column = data[:, 0]
    encoder.fit(first_column)

    width = len(encoder.dummies)
    softmax_entry = (width, "softmax")
    return {
        "name": column,
        "encoder": encoder,
        "output_info": [softmax_entry],
        "output_dimensions": width,
    }
def _fit_discrete(self, column, data):
    """Fit a one-hot encoder on the first column of ``data``.

    Args:
        column: Value stored under the ``'name'`` key of the result.
        data: Two-dimensional indexable; only ``data[:, 0]`` is used.

    Returns:
        dict: The column name, the fitted encoder, a single
        ``(size, 'softmax')`` output entry and the output width.
    """
    ohe = OneHotEncodingTransformer()
    data = data[:, 0]
    ohe.fit(data)

    # Take the size from the fitted encoder rather than ``len(set(data))``.
    # NaN never compares equal to NaN, so a set can keep every missing value
    # as its own element and overcount the categories, which would make the
    # declared width disagree with what the encoder actually learned.
    # NOTE(review): assumes ``ohe.dummies`` has one entry per encoded
    # category, as the other ``_fit_discrete`` variants rely on - confirm
    # against the installed encoder version.
    categories = len(ohe.dummies)

    return {
        'name': column,
        'encoder': ohe,
        'output_info': [(categories, 'softmax')],
        'output_dimensions': categories,
    }
def test_reverse_transform_no_nans(self):
    """Check ``reverse_transform`` on one-hot rows without nans.

    Input:
    - 3x3 one-hot matrix, one row per fitted category
    Output:
    - Series with the categories ``'a'``, ``'b'``, ``'c'``
    """
    # Setup
    encoder = OneHotEncodingTransformer()
    encoder.fit(pd.Series(['a', 'b', 'c']))

    # Run
    one_hot = np.array([
        [1, 0, 0],
        [0, 1, 0],
        [0, 0, 1],
    ])
    recovered = encoder.reverse_transform(one_hot)

    # Assert
    wanted = pd.Series(['a', 'b', 'c'])
    pd.testing.assert_series_equal(recovered, wanted)
def test_transform_unknown(self):
    """Check that ``transform`` rejects categories not seen in ``fit``.

    The transformer is fitted on a series containing only ``'a'`` and
    is then asked to transform ``'b'``; a ``ValueError`` is expected.

    Input:
    - list holding a categorical value that was not fitted
    """
    # Setup
    encoder = OneHotEncodingTransformer()
    fitted_on = pd.Series(['a'])
    encoder.fit(fitted_on)

    # Assert
    unseen = ['b']
    with np.testing.assert_raises(ValueError):
        encoder.transform(unseen)
def test_transform_single(self):
    """Check ``transform`` when only one category was fitted.

    The output is expected to be a single column filled with ones,
    one row per input item.

    Input:
    - Series with a single categorical value
    Output:
    - one-hot encoding of the input
    """
    # Setup
    series = pd.Series(['a', 'a', 'a'])
    encoder = OneHotEncodingTransformer()
    encoder.fit(series)

    # Run
    result = encoder.transform(series)

    # Assert
    ones_column = np.array([
        [1],
        [1],
        [1],
    ])
    np.testing.assert_array_equal(result, ones_column)
def test_transform_nans(self):
    """Check ``transform`` on categorical input that contains a null.

    The three distinct inputs (two categories and ``None``) are
    expected to map to the rows of a 3x3 identity matrix.

    Input:
    - Series with categorical values and nans
    Output:
    - one-hot encoding of the input
    """
    # Setup
    series = pd.Series(['a', 'b', None])
    encoder = OneHotEncodingTransformer()
    encoder.fit(series)

    # Run
    result = encoder.transform(series)

    # Assert
    identity = np.array([
        [1, 0, 0],
        [0, 1, 0],
        [0, 0, 1],
    ])
    np.testing.assert_array_equal(result, identity)
def _fit_discrete(self, data):
    """Fit a one-hot encoder on the first column of a dataframe.

    Args:
        data (pd.DataFrame):
            A dataframe containing a column.

    Returns:
        namedtuple:
            A ``ColumnTransformInfo`` object describing the fitted
            encoder, with one softmax span sized by the number of
            entries in the encoder's ``dummies``.
    """
    name = data.columns[0]

    encoder = OneHotEncodingTransformer()
    encoder.fit(data, [name])

    # The span size and the output width are both the encoder's dummy count.
    width = len(encoder.dummies)
    softmax_span = SpanInfo(width, 'softmax')

    return ColumnTransformInfo(
        column_name=name,
        column_type='discrete',
        transform=encoder,
        output_info=[softmax_span],
        output_dimensions=width,
    )
def test_fit_nans_numeric(self):
    """Check the state left by ``fit`` on numeric data with a nan.

    After fitting, ``dummies`` should hold the two numeric values,
    ``decoder`` should hold them followed by nan, ``dummy_encoded``
    should be falsy and ``dummy_na`` truthy.

    Input:
    - Series containing numeric values and a nan
    """
    # Setup
    encoder = OneHotEncodingTransformer()
    series = pd.Series([1, 2, np.nan])

    # Run
    encoder.fit(series)

    # Assert
    wanted_dummies = [1, 2]
    wanted_decoder = [1, 2, np.nan]
    np.testing.assert_array_equal(encoder.dummies, wanted_dummies)
    np.testing.assert_array_equal(encoder.decoder, wanted_decoder)
    assert not encoder.dummy_encoded
    assert encoder.dummy_na
def test_fit_no_nans(self):
    """Check the state left by ``fit`` on categorical data without nans.

    After fitting, both ``dummies`` and ``decoder`` should hold the
    three categories, ``dummy_encoded`` should be truthy and
    ``dummy_na`` falsy.

    Input:
    - Series with values
    """
    # Setup
    encoder = OneHotEncodingTransformer()
    series = pd.Series(['a', 'b', 'c'])

    # Run
    encoder.fit(series)

    # Assert
    categories = ['a', 'b', 'c']
    np.testing.assert_array_equal(encoder.dummies, categories)
    np.testing.assert_array_equal(encoder.decoder, categories)
    assert encoder.dummy_encoded
    assert not encoder.dummy_na