def test_refit(data_raw, data_raw_2):
    """Fit and transform, then refit the same transformer on a second dataset.

    After each fit, the transformed frame must equal the hand-built
    expectation for the dataset that was just fitted.
    """
    def build_expected(columns):
        # One float32 Series per (name, values) pair, joined column-wise.
        return pd.concat(
            [pd.Series(values, dtype='float32', name=name)
             for name, values in columns],
            axis=1,
        )

    # The animal columns are expected to be identical for both datasets.
    animal_columns = [
        ('animal_cat', [1., 0., 0.]),
        ('animal_dog', [0., 1., 0.]),
        ('animal_NaN', [0., 0., 1.]),
    ]

    etl = DataFrameETL(
        cols_to_drop=['pid', 'age', 'djinn_type'],
        cols_to_expand=['fruits', 'animal'],
        dataframe_output=True,
        dummy_na='expanded',
    )

    # First fit/transform round.
    etl.fit(data_raw)
    first_result = etl.transform(data_raw)
    first_expected = build_expected([
        ('fruits_1.0', [1., 0., 0.]),
        ('fruits_3.0', [0., 0., 1.]),
        ('fruits_NaN', [0., 1., 0.]),
    ] + animal_columns)
    assert first_result.equals(first_expected)

    # Second round: refit the very same object on the other dataset.
    etl.fit(data_raw_2)
    second_result = etl.transform(data_raw_2)
    second_expected = build_expected([
        ('fruits_-99999.0', [1., 0., 0.]),
        ('fruits_1.0', [0., 0., 1.]),
        ('fruits_NaN', [0., 1., 0.]),
    ] + animal_columns)
    assert second_result.equals(second_expected)
def test_na_in_transform_but_not_fit_all():
    """Transform data containing NaNs with a transformer fit on NaN-free data.

    The transformer is built with ``dummy_na='all'``. The expected output
    has no NaN indicator columns: the row whose ``djinn_type`` is missing
    is all zeros across the expanded columns, and the missing ``fruits``
    value stays NaN.
    """
    train_frame = pd.concat(
        [
            pd.Series(['marid', 'effrit', 'sila'],
                      dtype='object', name='djinn_type'),
            pd.Series([1.0, 2.0, 3.0], dtype='float', name='fruits'),
        ],
        axis=1,
    )
    etl = DataFrameETL(cols_to_expand=['djinn_type'],
                       dummy_na='all',
                       dataframe_output=True)
    etl.fit(train_frame)

    # Same columns as the training frame, but the first row of both
    # djinn_type and fruits is NaN.
    frame_with_nans = pd.concat(
        [
            pd.Series([np.nan, 'effrit', 'sila'],
                      dtype='object', name='djinn_type'),
            pd.Series([np.nan, 2.0, 3.0], dtype='float', name='fruits'),
        ],
        axis=1,
    )
    result = etl.transform(frame_with_nans)

    expected_columns = [
        ('djinn_type_effrit', [0., 1., 0.]),
        ('djinn_type_marid', [0., 0., 0.]),
        ('djinn_type_sila', [0., 0., 1.]),
        ('fruits', [np.nan, 2.0, 3.0]),
    ]
    expected = pd.concat(
        [pd.Series(values, dtype='float32', name=name)
         for name, values in expected_columns],
        axis=1,
    )
    assert result.equals(expected)
def test_transform_reuse_transformer(data_raw, data_raw_2,
                                     dataframe_2_expected):
    """Fit on one dataset, then transform a different one.

    The frame produced for ``data_raw_2`` by a transformer fitted on
    ``data_raw`` must equal the ``dataframe_2_expected`` fixture.
    """
    etl = DataFrameETL(
        cols_to_expand=['pid', 'fruits'],
        cols_to_drop=['djinn_type', 'age', 'animal'],
        dummy_na=True,
        dataframe_output=True,
    )
    etl.fit(data_raw)

    result = etl.transform(data_raw_2)
    assert result.equals(dataframe_2_expected)
def test_transform_two_levels(data_few_levels, few_levels_expected):
    """Fit and transform the few-levels dataset with ``dummy_na='expanded'``.

    The shape is checked before full equality, so a column-count mismatch
    is reported before the element-wise comparison runs.
    """
    etl = DataFrameETL(
        cols_to_expand=['pid', 'fruits', 'animal'],
        dummy_na='expanded',
        fill_value=99.,
        dataframe_output=True,
    )
    etl.fit(data_few_levels)
    result = etl.transform(data_few_levels)

    assert result.shape == few_levels_expected.shape
    assert result.equals(few_levels_expected)
def test_transform_dataframe(data_raw, dataframe_expected):
    """Fit and transform ``data_raw`` with ``dataframe_output=True``.

    The result is compared to the ``dataframe_expected`` fixture, first by
    shape and then by full equality.
    """
    etl = DataFrameETL(
        cols_to_drop=['pid'],
        cols_to_expand=['djinn_type', 'fruits', 'animal'],
        dummy_na='expanded',
        dataframe_output=True,
    )
    etl.fit(data_raw)
    result = etl.transform(data_raw)

    assert result.shape == dataframe_expected.shape
    assert result.equals(dataframe_expected)
def test_transform(data_raw, dataframe_expected):
    """Fit and transform ``data_raw`` without requesting dataframe output.

    The result is compared against ``dataframe_expected`` converted with
    ``np.asarray``: shapes must match and values must be almost equal.
    """
    etl = DataFrameETL(
        cols_to_drop=['pid'],
        cols_to_expand=['djinn_type', 'fruits', 'animal'],
        dummy_na='expanded',
    )
    etl.fit(data_raw)
    result = etl.transform(data_raw)

    expected = np.asarray(dataframe_expected)
    assert result.shape == expected.shape
    assert_almost_equal(result, expected)
def test_transform_preserve_col_order(data_raw, data_raw_2,
                                      dataframe_2_expected):
    """Transform a frame whose columns are ordered differently than at fit.

    The output for the reordered ``data_raw_2`` must still equal
    ``dataframe_2_expected``.
    """
    etl = DataFrameETL(
        cols_to_expand=['pid', 'fruits'],
        cols_to_drop=['djinn_type', 'age', 'animal'],
        dummy_na='expanded',
        dataframe_output=True,
    )
    etl.fit(data_raw)

    # Present the second dataset with its columns in a swapped order.
    shuffled_order = ['fruits', 'age', 'djinn_type', 'pid', 'animal']
    reordered = data_raw_2[shuffled_order]

    result = etl.transform(reordered)
    assert result.equals(dataframe_2_expected)
def test_transform_no_level_overlap():
    """Check that a warning is issued when no fitted level is seen again.

    The ``age`` column is fitted as strings and then cast to integers
    before ``transform``, so none of its values match the levels recorded
    during fit. Exactly one warning mentioning the missing overlap is
    expected.
    """
    # ``np.nan`` is used rather than the ``np.NaN`` alias, which was
    # removed in NumPy 2.0.
    df = pd.concat([
        pd.Series([1.0, np.nan, 3.0], dtype='float', name='fruits'),
        pd.Series(["2000", "2500", "3000"], dtype='object', name='age'),
    ], axis=1)
    expander = DataFrameETL(cols_to_expand=['age'], dummy_na=False)
    expander.fit(df)

    # The dtype mismatch will cause incorrect categorical expansion;
    # DataFrameETL should issue a warning.
    df['age'] = df['age'].astype('int')
    with warnings.catch_warnings(record=True) as w:
        # Record every warning, even ones already triggered elsewhere.
        warnings.simplefilter('always')
        expander.transform(df)

    assert len(w) == 1
    msg = ("No overlap between levels in column 'age' and "
           "levels seen during fit")
    assert msg in w[0].message.args[0]
def test_transform_two_levels_no_dummy(data_few_levels, few_levels_expected):
    """Fit and transform the few-levels dataset with ``dummy_na=False``.

    The expectation is the ``few_levels_expected`` fixture with its three
    NaN indicator columns removed.
    """
    etl = DataFrameETL(
        cols_to_expand=['pid', 'fruits', 'animal'],
        dummy_na=False,
        fill_value=99.,
        dataframe_output=True,
    )
    etl.fit(data_few_levels)
    result = etl.transform(data_few_levels)

    # Strip the NaN indicator columns from the expected frame in place.
    for nan_column in ('pid_NaN_Sentinel', 'fruits_NaN', 'animal_NaN'):
        few_levels_expected.pop(nan_column)

    assert result.shape == few_levels_expected.shape
    assert result.equals(few_levels_expected)
def test_transform_dataframe_no_dummy(data_raw, dataframe_expected):
    """Fit and transform ``data_raw`` as a dataframe with ``dummy_na=False``.

    The expectation is the ``dataframe_expected`` fixture with its three
    NaN indicator columns removed.
    """
    etl = DataFrameETL(
        cols_to_drop=['pid'],
        cols_to_expand=['djinn_type', 'fruits', 'animal'],
        dummy_na=False,
        dataframe_output=True,
    )
    etl.fit(data_raw)
    result = etl.transform(data_raw)

    # Remove the NaN columns from the expected data (in place).
    for nan_column in ('djinn_type_NaN', 'fruits_NaN', 'animal_NaN'):
        dataframe_expected.pop(nan_column)

    assert result.shape == dataframe_expected.shape
    assert result.equals(dataframe_expected)
def test_transform_no_dummy(data_raw, dataframe_expected):
    """Fit and transform ``data_raw`` with ``dummy_na=False``.

    ``dataframe_output`` is not requested. The result is compared with
    ``dataframe_expected`` after its NaN indicator columns are removed and
    it is converted with ``np.asarray``.
    """
    etl = DataFrameETL(
        cols_to_drop=['pid'],
        cols_to_expand=['djinn_type', 'fruits', 'animal'],
        dummy_na=False,
    )
    etl.fit(data_raw)
    result = etl.transform(data_raw)

    # Remove the NaN columns from the expected data (in place).
    for nan_column in ('djinn_type_NaN', 'fruits_NaN', 'animal_NaN'):
        dataframe_expected.pop(nan_column)
    expected = np.asarray(dataframe_expected)

    assert result.shape == expected.shape
    assert_almost_equal(result, expected)
def test_pickle(data_raw):
    """Round-trip a fitted transformer through pickle and transform again.

    The output of the unpickled transformer must match, in shape and
    values, what ``fit_transform`` produced before pickling.
    """
    etl = DataFrameETL(
        cols_to_drop=['pid'],
        cols_to_expand=['djinn_type', 'fruits', 'animal'],
        dummy_na='all',
    )
    reference = etl.fit_transform(data_raw)

    # Serialize the fitted transformer into an in-memory buffer; a fresh
    # BytesIO seeded with the payload is already positioned at the start.
    stream = io.BytesIO(pickle.dumps(etl))

    # Restore the transformer from the buffer and transform the same data.
    restored = pickle.load(stream)
    result = restored.transform(data_raw)

    assert result.shape == reference.shape
    assert_almost_equal(result, reference)
def test_expand_all_na(data_raw):
    """Fit and transform ``data_raw`` with ``dummy_na='all'``.

    The output should only have NaN columns for columns which actually
    had NaNs during fit. After fitting, ``_unexpanded_nans`` is expected
    to be ``{'fruits': True}``.
    """
    etl = DataFrameETL(
        cols_to_expand=['djinn_type', 'animal'],
        cols_to_drop=['pid', 'age'],
        dummy_na='all',
        dataframe_output=True,
    )
    etl.fit(data_raw)
    assert etl._unexpanded_nans == {'fruits': True}

    expected_columns = [
        ('djinn_type_effrit', [0., 1., 0.]),
        ('djinn_type_marid', [1., 0., 0.]),
        ('djinn_type_sila', [0., 0., 1.]),
        ('fruits', [1., np.nan, 3.]),
        ('fruits_NaN', [0., 1., 0.]),
        ('animal_cat', [1., 0., 0.]),
        ('animal_dog', [0., 1., 0.]),
        ('animal_NaN', [0., 0., 1.]),
    ]
    expected = pd.concat(
        [pd.Series(values, dtype='float32', name=name)
         for name, values in expected_columns],
        axis=1,
    )

    result = etl.transform(data_raw)
    assert result.equals(expected)
def test_transform_notfitted(data_raw):
    """Calling ``transform`` before ``fit`` must raise ``NotFittedError``."""
    unfitted_etl = DataFrameETL()
    with pytest.raises(NotFittedError):
        unfitted_etl.transform(data_raw)