def test_fit_drop_only(data_raw):
    """Dropping a column without expanding any leaves names untouched."""
    etl = DataFrameETL(cols_to_drop='fruits', cols_to_expand=None)
    etl.fit(data_raw)
    # With no expansion, both attributes carry the same column names.
    expected = ['pid', 'djinn_type', 'age', 'animal']
    assert etl.columns_ == expected
    assert etl.required_columns_ == expected
def test_fit_none(data_raw):
    """With nothing dropped and nothing expanded, fit is a pass-through."""
    etl = DataFrameETL(cols_to_drop=None, cols_to_expand=None)
    etl.fit(data_raw)
    # Column names should be identical to the input dataframe's names.
    original_names = data_raw.columns.tolist()
    assert etl.columns_ == original_names
    assert etl.required_columns_ == original_names
def test_expand_col_numeric(data_raw):
    """A numeric column expands into one indicator column per level."""
    etl = DataFrameETL(cols_to_drop=['pid', 'djinn_type', 'animal'],
                       cols_to_expand=['fruits'],
                       dummy_na='all',
                       fill_value=0.0)
    etl.fit(data_raw)
    result = etl._expand_col(data_raw, 'fruits')
    expected = np.array([[1., 0., 0.],
                         [0., 0., 1.],
                         [0., 1., 0.]])
    assert_almost_equal(result, expected)
def test_dummy_na_bad_value(data_raw):
    """An unrecognized dummy_na setting raises a descriptive ValueError."""
    # Construction and fit both stay inside the raises block so the test
    # passes regardless of which step performs the validation.
    with pytest.raises(ValueError) as exc:
        etl = DataFrameETL(
            cols_to_drop=['pid'],
            cols_to_expand=['djinn_type', 'fruits', 'animal'],
            dummy_na="bad_value")
        etl.fit(data_raw)
    expected_msg = ("dummy_na must be one of "
                    "[None, False, 'all', 'expanded']")
    assert str(exc.value) == expected_msg
def test_dropped_cols_no_levels(data_raw):
    """Columns the user drops must not have levels computed for them.

    Computing levels for a dropped column risks raising a warning for
    too many levels even though the column never reaches the output.
    """
    etl = DataFrameETL(cols_to_drop=['pid'])
    etl.fit(data_raw)
    assert 'animal' in etl.levels_
    assert 'pid' not in etl.levels_
def test_transform_dataframe(data_raw, dataframe_expected):
    """dataframe_output=True yields the expected DataFrame."""
    etl = DataFrameETL(cols_to_drop='pid',
                       cols_to_expand=['djinn_type', 'fruits', 'animal'],
                       dummy_na=True,
                       dataframe_output=True)
    etl.fit(data_raw)
    result = etl.transform(data_raw)
    assert result.shape == dataframe_expected.shape
    assert result.equals(dataframe_expected)
def test_transform_reuse_transformer(data_raw, data_raw_2,
                                     dataframe_2_expected):
    """A transformer fit on one frame can transform a second frame."""
    etl = DataFrameETL(cols_to_expand=['pid', 'fruits'],
                       cols_to_drop=['djinn_type', 'age', 'animal'],
                       dummy_na=True,
                       dataframe_output=True)
    etl.fit(data_raw)
    result = etl.transform(data_raw_2)
    assert result.equals(dataframe_2_expected)
def test_transform_two_levels(data_few_levels, few_levels_expected):
    """Expansion works when columns have very few distinct levels."""
    etl = DataFrameETL(cols_to_expand=['pid', 'fruits', 'animal'],
                       dummy_na=True,
                       fill_value=99.,
                       dataframe_output=True)
    etl.fit(data_few_levels)
    result = etl.transform(data_few_levels)
    assert result.shape == few_levels_expected.shape
    assert result.equals(few_levels_expected)
def test_transform(data_raw, dataframe_expected):
    """Default (array) output matches the expected frame's values."""
    etl = DataFrameETL(cols_to_drop=['pid'],
                       cols_to_expand=['djinn_type', 'fruits', 'animal'],
                       dummy_na='expanded')
    etl.fit(data_raw)
    result = etl.transform(data_raw)
    expected = np.asarray(dataframe_expected)
    assert result.shape == expected.shape
    assert_almost_equal(result, expected)
def test_fit_list_numeric(data_raw, levels_dict_numeric):
    """Numeric columns in cols_to_expand get level-suffixed names."""
    etl = DataFrameETL(cols_to_drop=['djinn_type', 'animal'],
                       cols_to_expand=['pid', 'fruits'],
                       dummy_na=True)
    etl.fit(data_raw)
    assert etl.levels_ == levels_dict_numeric
    expected_cols = ['pid_a', 'pid_b', 'pid_c', 'pid_NaN',
                     'fruits_1.0', 'fruits_3.0', 'fruits_NaN',
                     'age']
    assert etl.columns_ == expected_cols
    assert etl.required_columns_ == ['pid', 'fruits', 'age']
def test_expand_col_no_dummy(data_raw):
    """Without dummy_na, no NaN column is added and NaNs get fill_value."""
    etl = DataFrameETL(cols_to_drop=['fruits'],
                       dummy_na=None,
                       fill_value=-1.0)
    etl.fit(data_raw)
    # 'pid' has one row per level, so expansion is the identity matrix.
    result = etl._expand_col(data_raw, 'pid')
    assert_almost_equal(result, np.eye(3))
    # The NaN row in 'animal' is filled with fill_value in every column.
    result = etl._expand_col(data_raw, 'animal')
    expected = np.array([[1., 0.],
                         [0., 1.],
                         [-1., -1.]])
    assert_almost_equal(result, expected)
def test_transform_preserve_col_order(data_raw, data_raw_2,
                                      dataframe_2_expected):
    """Output column order follows fit, not the input's column order."""
    etl = DataFrameETL(cols_to_expand=['pid', 'fruits'],
                       cols_to_drop=['djinn_type', 'age', 'animal'],
                       dummy_na='expanded',
                       dataframe_output=True)
    etl.fit(data_raw)
    # Shuffle the second frame's columns before transforming.
    shuffled = data_raw_2[['fruits', 'age', 'djinn_type', 'pid', 'animal']]
    result = etl.transform(shuffled)
    assert result.equals(dataframe_2_expected)
def test_expand_col(data_raw):
    """dummy_na='expanded' adds a NaN indicator even without any NaNs."""
    etl = DataFrameETL(cols_to_drop=['fruits'],
                       dummy_na='expanded',
                       fill_value=-1.0)
    etl.fit(data_raw)
    # 'pid' contains no NaNs but still gets an (all-zero) NaN column.
    result = etl._expand_col(data_raw, 'pid')
    expected = np.array([[1., 0., 0., 0.],
                         [0., 1., 0., 0.],
                         [0., 0., 1., 0.]])
    assert_almost_equal(result, expected)
    # The NaN row in 'animal': fill_value in level columns, 1 in NaN col.
    result = etl._expand_col(data_raw, 'animal')
    expected = np.array([[1., 0., 0.],
                         [0., 1., 0.],
                         [-1., -1., 1.]])
    assert_almost_equal(result, expected)
def test_transform_two_levels_no_dummy(data_few_levels,
                                       few_levels_expected):
    """With dummy_na=False the NaN indicator columns are absent."""
    etl = DataFrameETL(cols_to_expand=['pid', 'fruits', 'animal'],
                       dummy_na=False,
                       fill_value=99.,
                       dataframe_output=True)
    etl.fit(data_few_levels)
    result = etl.transform(data_few_levels)
    # Strip the NaN columns from the expected fixture before comparing.
    for nan_col in ('pid_NaN_Sentinel', 'fruits_NaN', 'animal_NaN'):
        few_levels_expected.pop(nan_col)
    assert result.shape == few_levels_expected.shape
    assert result.equals(few_levels_expected)
def test_transform_dataframe_no_dummy(data_raw, dataframe_expected):
    """DataFrame output with dummy_na=False omits the NaN columns."""
    etl = DataFrameETL(cols_to_drop=['pid'],
                       cols_to_expand=['djinn_type', 'fruits', 'animal'],
                       dummy_na=False,
                       dataframe_output=True)
    etl.fit(data_raw)
    result = etl.transform(data_raw)
    # Remove the NaN indicator columns from the expected fixture.
    for nan_col in ('djinn_type_NaN', 'fruits_NaN', 'animal_NaN'):
        dataframe_expected.pop(nan_col)
    assert result.shape == dataframe_expected.shape
    assert result.equals(dataframe_expected)
def test_transform_no_dummy(data_raw, dataframe_expected):
    """Array output with dummy_na=False omits the NaN columns."""
    etl = DataFrameETL(cols_to_drop=['pid'],
                       cols_to_expand=['djinn_type', 'fruits', 'animal'],
                       dummy_na=False)
    etl.fit(data_raw)
    result = etl.transform(data_raw)
    # Remove the NaN indicator columns from the expected fixture.
    for nan_col in ('djinn_type_NaN', 'fruits_NaN', 'animal_NaN'):
        dataframe_expected.pop(nan_col)
    expected = np.asarray(dataframe_expected)
    assert result.shape == expected.shape
    assert_almost_equal(result, expected)
def test_fit_list(data_raw, levels_dict):
    """An explicit list of columns to expand produces suffixed names."""
    etl = DataFrameETL(cols_to_drop=['fruits'],
                       cols_to_expand=['pid', 'djinn_type', 'animal'],
                       dummy_na='expanded')
    etl.fit(data_raw)
    expected_cols = ['pid_a', 'pid_b', 'pid_c', 'pid_NaN',
                     'djinn_type_effrit', 'djinn_type_marid',
                     'djinn_type_sila', 'djinn_type_NaN',
                     'age',
                     'animal_cat', 'animal_dog', 'animal_NaN']
    assert etl.levels_ == levels_dict
    assert etl.columns_ == expected_cols
    assert etl.required_columns_ == ['pid', 'djinn_type', 'age', 'animal']
def test_fit_auto(data_raw, levels_dict):
    """cols_to_expand='auto' identifies the categorical columns itself."""
    etl = DataFrameETL(cols_to_drop='fruits',
                       cols_to_expand='auto',
                       dummy_na=True)
    etl.fit(data_raw)
    expected_cols = ['pid_a', 'pid_b', 'pid_c', 'pid_NaN',
                     'djinn_type_effrit', 'djinn_type_marid',
                     'djinn_type_sila', 'djinn_type_NaN',
                     'age',
                     'animal_cat', 'animal_dog', 'animal_NaN']
    assert etl.levels_ == levels_dict
    assert etl.columns_ == expected_cols
    assert etl.required_columns_ == ['pid', 'djinn_type', 'age', 'animal']
def test_expand_col_few_levels_no_dummy(data_few_levels,
                                        few_levels_expected):
    """Per-column expansion matches the fixture when NaN cols are off."""
    etl = DataFrameETL(cols_to_expand=['pid', 'fruits', 'animal'],
                       dummy_na=False,
                       fill_value=99.)
    etl.fit(data_few_levels)
    # Each expanded column should match the corresponding fixture slice.
    checks = [
        ('pid', ['pid_NaN', 'pid_a']),
        ('fruits', ['fruits_1.0']),
        ('animal', ['animal_cat']),
    ]
    for col, fixture_cols in checks:
        result = etl._expand_col(data_few_levels, col)
        expected = np.asarray(few_levels_expected[fixture_cols])
        assert_almost_equal(result, expected)
def test_pickle(data_raw, dataframe_expected):
    """A fitted transformer survives a pickle round trip."""
    etl = DataFrameETL(cols_to_drop='pid',
                       cols_to_expand=['djinn_type', 'fruits', 'animal'],
                       dummy_na=True)
    etl.fit(data_raw)
    # Round-trip the fitted transformer through an in-memory pickle.
    buff = io.BytesIO()
    pickle.dump(etl, buff)
    buff.seek(0)
    restored = pickle.load(buff)
    # The unpickled transformer must still transform correctly.
    result = restored.transform(data_raw)
    expected = np.asarray(dataframe_expected)
    assert result.shape == expected.shape
    assert_almost_equal(result, expected)
def test_fit_auto_dummy_all(data_raw):
    """With dummy_na='all', NaN columns (and NaN_Sentinel levels) appear
    only for auto-detected columns that actually contained NaNs.

    NOTE(review): renamed from ``test_fit_auto`` — this module already
    defines a ``test_fit_auto`` earlier, and the duplicate definition
    shadowed it at module level, so only one of the two tests ever ran.
    """
    expander = DataFrameETL(cols_to_drop=['fruits'],
                            cols_to_expand='auto',
                            dummy_na='all')
    expander.fit(data_raw)
    # Only 'animal' had NaNs, so only it gets a NaN indicator column.
    cols_expected = [
        'pid_a', 'pid_b', 'pid_c',
        'djinn_type_effrit', 'djinn_type_marid', 'djinn_type_sila',
        'age',
        'animal_cat', 'animal_dog', 'animal_NaN'
    ]
    levels_expected = {
        'pid': ['a', 'b', 'c'],
        'djinn_type': ['effrit', 'marid', 'sila'],
        'animal': ['cat', 'dog', 'NaN_Sentinel']
    }
    assert expander.levels_ == levels_expected
    assert expander.columns_ == cols_expected
    assert expander.required_columns_ == ['pid', 'djinn_type', 'age',
                                          'animal']
def test_transform_no_level_overlap():
    """Transforming a column whose dtype changed after fit should warn.

    Fix: spell the NaN constant ``np.nan`` instead of ``np.NaN`` — the
    ``np.NaN`` alias was removed in NumPy 2.0 (the rest of this module
    already uses ``np.nan``), so the old spelling breaks under current
    NumPy while the value is identical on older versions.
    """
    df = pd.concat([
        pd.Series([1.0, np.nan, 3.0], dtype='float', name='fruits'),
        pd.Series(["2000", "2500", "3000"], dtype='object', name='age')
    ], axis=1)
    expander = DataFrameETL(cols_to_expand=['age'], dummy_na=False)
    expander.fit(df)
    # Casting to int means no level seen at fit time matches;
    # DataFrameETL should issue a warning rather than fail silently.
    df['age'] = df['age'].astype('int')
    with warnings.catch_warnings(record=True) as w:
        warnings.simplefilter('always')
        expander.transform(df)
    assert len(w) == 1
    msg = ("No overlap between levels in column 'age' and "
           "levels seen during fit")
    assert msg in w[0].message.args[0]
def test_expand_all_na(data_raw):
    """NaN indicator columns appear only where fit actually saw NaNs."""
    expected_series = [
        pd.Series([0., 1., 0.], dtype='float32', name='djinn_type_effrit'),
        pd.Series([1., 0., 0.], dtype='float32', name='djinn_type_marid'),
        pd.Series([0., 0., 1.], dtype='float32', name='djinn_type_sila'),
        pd.Series([1., np.nan, 3.], dtype='float32', name='fruits'),
        pd.Series([0., 1., 0.], dtype='float32', name='fruits_NaN'),
        pd.Series([1., 0., 0.], dtype='float32', name='animal_cat'),
        pd.Series([0., 1., 0.], dtype='float32', name='animal_dog'),
        pd.Series([0., 0., 1.], dtype='float32', name='animal_NaN'),
    ]
    df_expected = pd.concat(expected_series, axis=1)
    etl = DataFrameETL(cols_to_expand=['djinn_type', 'animal'],
                       cols_to_drop=['pid', 'age'],
                       dummy_na='all',
                       dataframe_output=True)
    etl.fit(data_raw)
    # Of the unexpanded columns, only 'fruits' contained NaNs at fit time.
    assert etl._unexpanded_nans == {'fruits': True}
    result = etl.transform(data_raw)
    assert result.equals(df_expected)
def test_fit_exc(dataframe_expected):
    """Fitting on a non-DataFrame input raises TypeError."""
    etl = DataFrameETL()
    not_a_frame = np.asarray(dataframe_expected)
    with pytest.raises(TypeError):
        etl.fit(not_a_frame)