def test_create_levels(data_raw, levels_dict):
    """``_create_levels`` on three expanded columns matches ``levels_dict``.

    The private attributes read by ``_create_levels`` are assigned by hand
    so the method can be called directly on ``data_raw``.
    """
    names = ('pid', 'djinn_type', 'animal')
    etl = DataFrameETL(cols_to_expand=list(names))

    # Every expanded column is flagged 0 (presumably "not numeric").
    etl._is_numeric = dict.fromkeys(names, 0)
    etl._nan_numeric, etl._nan_string = NAN_NUMERIC, NAN_STRING
    etl._cols_to_drop, etl._cols_to_expand = (
        etl.cols_to_drop,
        etl.cols_to_expand,
    )

    assert etl._create_levels(data_raw) == levels_dict
def test_create_levels_no_dummy(data_raw, levels_dict_numeric):
    """``_create_levels`` with ``dummy_na=False`` on 'pid' and 'fruits'.

    The expected mapping is the ``levels_dict_numeric`` fixture with its
    'pid' entry overwritten so that it holds only 'a', 'b' and 'c'.
    """
    etl = DataFrameETL(dummy_na=False, cols_to_expand=['pid', 'fruits'])

    # Drop the NaN level from the expected 'pid' levels (edits the fixture
    # object in place).
    expected = levels_dict_numeric
    expected['pid'] = ['a', 'b', 'c']

    etl._is_numeric = {'pid': 0, 'fruits': 1}
    etl._nan_numeric, etl._nan_string = NAN_NUMERIC, NAN_STRING
    etl._cols_to_drop, etl._cols_to_expand = (
        etl.cols_to_drop,
        etl.cols_to_expand,
    )

    observed = etl._create_levels(data_raw)
    assert observed == expected
def test_create_col_names_numeric(data_raw):
    """``_create_col_names`` with ``dummy_na=True``, expanding 'pid'/'fruits'.

    'djinn_type' and 'animal' are listed in ``cols_to_drop``; the expected
    output names include a ``_NaN`` entry for each expanded column and the
    untouched 'age' column.
    """
    etl = DataFrameETL(
        cols_to_expand=['pid', 'fruits'],
        cols_to_drop=['djinn_type', 'animal'],
        dummy_na=True,
    )
    etl._is_numeric = {'pid': 0, 'djinn_type': 0, 'fruits': 0}
    etl._nan_numeric, etl._nan_string = NAN_NUMERIC, NAN_STRING
    etl._cols_to_drop, etl._cols_to_expand = (
        etl.cols_to_drop,
        etl.cols_to_expand,
    )
    # Column names are derived from the levels, so compute those first.
    etl.levels_ = etl._create_levels(data_raw)

    expanded_names, source_names = etl._create_col_names(data_raw)

    assert expanded_names == [
        'pid_a', 'pid_b', 'pid_c', 'pid_NaN',
        'fruits_1.0', 'fruits_3.0', 'fruits_NaN',
        'age',
    ]
    assert source_names == ['pid', 'fruits', 'age']
def test_create_col_names_no_dummy(data_raw):
    """``_create_col_names`` with ``dummy_na=False`` and 'fruits' dropped.

    The expected output names contain no ``_NaN`` entries, and 'age' is
    passed through between the 'djinn_type' and 'animal' expansions.
    """
    etl = DataFrameETL(
        cols_to_expand=['pid', 'djinn_type', 'animal'],
        cols_to_drop=['fruits'],
        dummy_na=False,
    )
    etl._is_numeric = {'pid': 0, 'djinn_type': 0, 'animal': 0}
    etl._nan_numeric, etl._nan_string = NAN_NUMERIC, NAN_STRING
    etl._cols_to_drop, etl._cols_to_expand = (
        etl.cols_to_drop,
        etl.cols_to_expand,
    )
    # Column names are derived from the levels, so compute those first.
    etl.levels_ = etl._create_levels(data_raw)

    expanded_names, source_names = etl._create_col_names(data_raw)

    assert expanded_names == [
        'pid_a', 'pid_b', 'pid_c',
        'djinn_type_effrit', 'djinn_type_marid', 'djinn_type_sila',
        'age',
        'animal_cat', 'animal_dog',
    ]
    assert source_names == ['pid', 'djinn_type', 'age', 'animal']
def test_add_sentinel(data_raw):
    """Exercise ``_add_sentinel`` on the 'age', 'animal' and 'fruits' columns.

    'age' is expected back as a uint16 series with no sentinel, 'animal'
    with ``NAN_STRING`` in its last position, and 'fruits' with
    ``NAN_NUMERIC`` in its middle position.
    """
    etl = DataFrameETL()
    etl._is_numeric = {
        'pid': 0,
        'djinn_type': 0,
        'animal': 0,
        'fruits': 1,
        'age': 1,
    }
    etl._nan_numeric, etl._nan_string = NAN_NUMERIC, NAN_STRING

    # 'age': no sentinel should be inserted.
    age_out = etl._add_sentinel('age', data_raw['age'])
    age_expected = data_raw['age'].astype('uint16')
    pd.testing.assert_series_equal(age_out, age_expected)

    # 'animal': the string sentinel should be inserted.
    animal_out = etl._add_sentinel('animal', data_raw['animal'])
    animal_expected = pd.Series(
        ['cat', 'dog', NAN_STRING], dtype='object', name='animal')
    pd.testing.assert_series_equal(animal_out, animal_expected)

    # 'fruits': the numeric sentinel should be inserted.
    fruits_out = etl._add_sentinel('fruits', data_raw['fruits'])
    fruits_expected = pd.Series(
        [1.0, NAN_NUMERIC, 3.0], dtype='float', name='fruits')
    pd.testing.assert_series_equal(fruits_out, fruits_expected)
def test_check_sentinels(data_raw):
    """``_check_sentinels`` must replace sentinels that collide with data.

    The sentinels are seeded with 'effrit' and 1.0 before calling
    ``_check_sentinels``; afterwards both must have changed and neither may
    appear in the expanded columns of ``data_raw``.
    """
    expander = DataFrameETL(
        cols_to_expand=['pid', 'djinn_type', 'animal', 'fruits'])
    # fill in necessary parameters
    expander._nan_string = 'effrit'
    expander._nan_numeric = 1.0
    expander._is_numeric = {}
    expander.levels_ = {}
    expander._cols_to_drop = expander.cols_to_drop
    expander._cols_to_expand = expander.cols_to_expand
    for col in expander.cols_to_expand:
        expander._is_numeric[col] = expander._flag_numeric(
            pd.unique(data_raw[col]))

    expander._check_sentinels(data_raw)

    # Compare by value, not identity: `is not` against a literal depends on
    # interpreter constant caching and would wrongly pass for an equal but
    # distinct object (and raises SyntaxWarning on Python 3.8+).
    assert expander._nan_string != 'effrit'
    assert expander._nan_numeric != 1.0
    # The replacement sentinels must not occur anywhere in the data.
    assert not (data_raw[['pid', 'djinn_type', 'animal']] ==
                expander._nan_string).any().any()
    assert not (data_raw['fruits'] == expander._nan_numeric).any().any()