コード例 #1
0
def test_fit_drop_only(data_raw):
    """Fit with one column to drop and no columns to expand."""
    etl = DataFrameETL(cols_to_expand=None, cols_to_drop='fruits')
    etl.fit(data_raw)
    # Nothing was expanded, so the output columns and the required input
    # columns are expected to be the same list.
    remaining = ['pid', 'djinn_type', 'age', 'animal']
    assert etl.columns_ == remaining
    assert etl.required_columns_ == remaining
コード例 #2
0
def test_fit_none(data_raw):
    """Fit with neither dropping nor expansion requested."""
    etl = DataFrameETL(cols_to_expand=None, cols_to_drop=None)
    etl.fit(data_raw)
    # Both fitted column lists should equal the input frame's column names.
    input_names = data_raw.columns.tolist()
    assert etl.columns_ == input_names
    assert etl.required_columns_ == input_names
コード例 #3
0
def test_expand_col_numeric(data_raw):
    """Check ``_expand_col`` on 'fruits' when fit with ``dummy_na='all'``."""
    etl = DataFrameETL(
        cols_to_drop=['pid', 'djinn_type', 'animal'],
        cols_to_expand=['fruits'],
        dummy_na='all',
        fill_value=0.0,
    )
    etl.fit(data_raw)
    expected = np.array([
        [1., 0., 0.],
        [0., 0., 1.],
        [0., 1., 0.],
    ])
    actual = etl._expand_col(data_raw, 'fruits')
    assert_almost_equal(actual, expected)
コード例 #4
0
def test_dummy_na_bad_value(data_raw):
    """An unrecognized ``dummy_na`` value should raise a ValueError."""
    expected_msg = ("dummy_na must be one of "
                    "[None, False, 'all', 'expanded']")
    # Construction and fit both stay inside the context manager, as in the
    # original test, so the error may come from either step.
    with pytest.raises(ValueError) as err_info:
        etl = DataFrameETL(
            cols_to_drop=['pid'],
            cols_to_expand=['djinn_type', 'fruits', 'animal'],
            dummy_na="bad_value",
        )
        etl.fit(data_raw)
    assert str(err_info.value) == expected_msg
コード例 #5
0
def test_dropped_cols_no_levels(data_raw):
    """A column listed in ``cols_to_drop`` should have no entry in levels_.

    Building levels for a column the user asked to drop risks raising a
    warning for too many levels when it doesn't matter.
    """
    etl = DataFrameETL(cols_to_drop=['pid'])
    etl.fit(data_raw)

    fitted_levels = etl.levels_
    assert 'animal' in fitted_levels
    assert 'pid' not in fitted_levels
コード例 #6
0
def test_transform_dataframe(data_raw, dataframe_expected):
    """Fit and transform the same frame with ``dataframe_output=True``."""
    etl = DataFrameETL(
        cols_to_drop='pid',
        cols_to_expand=['djinn_type', 'fruits', 'animal'],
        dummy_na=True,
        dataframe_output=True,
    )
    etl.fit(data_raw)
    result = etl.transform(data_raw)
    # Check the shape first, then the full contents.
    assert result.shape == dataframe_expected.shape
    assert result.equals(dataframe_expected)
コード例 #7
0
def test_transform_reuse_transformer(data_raw, data_raw_2,
                                     dataframe_2_expected):
    """Fit on one frame, then transform a second frame with the same ETL."""
    etl = DataFrameETL(
        cols_to_expand=['pid', 'fruits'],
        cols_to_drop=['djinn_type', 'age', 'animal'],
        dummy_na=True,
        dataframe_output=True,
    )
    etl.fit(data_raw)
    result = etl.transform(data_raw_2)
    assert result.equals(dataframe_2_expected)
コード例 #8
0
def test_transform_two_levels(data_few_levels, few_levels_expected):
    """Fit/transform the few-levels frame with ``dummy_na=True``."""
    etl = DataFrameETL(
        cols_to_expand=['pid', 'fruits', 'animal'],
        dummy_na=True,
        fill_value=99.,
        dataframe_output=True,
    )
    etl.fit(data_few_levels)
    result = etl.transform(data_few_levels)
    # Check the shape first, then the full contents.
    assert result.shape == few_levels_expected.shape
    assert result.equals(few_levels_expected)
コード例 #9
0
def test_transform(data_raw, dataframe_expected):
    """Compare transform output with the expected frame as an array."""
    etl = DataFrameETL(
        cols_to_drop=['pid'],
        cols_to_expand=['djinn_type', 'fruits', 'animal'],
        dummy_na='expanded',
    )
    etl.fit(data_raw)
    actual = etl.transform(data_raw)

    expected = np.asarray(dataframe_expected)
    assert actual.shape == expected.shape
    assert_almost_equal(actual, expected)
コード例 #10
0
def test_fit_list_numeric(data_raw, levels_dict_numeric):
    """Fit with an explicit expand list that includes the 'fruits' column."""
    etl = DataFrameETL(
        cols_to_drop=['djinn_type', 'animal'],
        cols_to_expand=['pid', 'fruits'],
        dummy_na=True,
    )
    etl.fit(data_raw)
    assert etl.levels_ == levels_dict_numeric

    expected_output_cols = [
        'pid_a', 'pid_b', 'pid_c', 'pid_NaN',
        'fruits_1.0', 'fruits_3.0', 'fruits_NaN',
        'age',
    ]
    assert etl.columns_ == expected_output_cols
    assert etl.required_columns_ == ['pid', 'fruits', 'age']
コード例 #11
0
def test_expand_col_no_dummy(data_raw):
    """Check ``_expand_col`` on 'pid' and 'animal' with ``dummy_na=None``."""
    etl = DataFrameETL(cols_to_drop=['fruits'],
                       dummy_na=None,
                       fill_value=-1.0)
    etl.fit(data_raw)

    pid_actual = etl._expand_col(data_raw, 'pid')
    pid_expected = np.array([
        [1., 0., 0.],
        [0., 1., 0.],
        [0., 0., 1.],
    ])
    assert_almost_equal(pid_actual, pid_expected)

    # The last expected 'animal' row holds the fill value (-1.0) in every
    # level column.
    animal_actual = etl._expand_col(data_raw, 'animal')
    animal_expected = np.array([
        [1., 0.],
        [0., 1.],
        [-1., -1.],
    ])
    assert_almost_equal(animal_actual, animal_expected)
コード例 #12
0
def test_transform_preserve_col_order(data_raw, data_raw_2,
                                      dataframe_2_expected):
    """Transform a frame whose columns are ordered differently from fit."""
    etl = DataFrameETL(
        cols_to_expand=['pid', 'fruits'],
        cols_to_drop=['djinn_type', 'age', 'animal'],
        dummy_na='expanded',
        dataframe_output=True,
    )
    etl.fit(data_raw)
    # Reorder the second frame's columns before transforming it.
    shuffled_order = ['fruits', 'age', 'djinn_type', 'pid', 'animal']
    reordered = data_raw_2[shuffled_order]
    result = etl.transform(reordered)
    assert result.equals(dataframe_2_expected)
コード例 #13
0
def test_expand_col(data_raw):
    """Check ``_expand_col`` on 'pid' and 'animal' with dummy_na='expanded'."""
    etl = DataFrameETL(cols_to_drop=['fruits'],
                       dummy_na='expanded',
                       fill_value=-1.0)
    etl.fit(data_raw)

    # A NaN indicator column is expected even when the column has no NaNs.
    pid_actual = etl._expand_col(data_raw, 'pid')
    pid_expected = np.array([
        [1., 0., 0., 0.],
        [0., 1., 0., 0.],
        [0., 0., 1., 0.],
    ])
    assert_almost_equal(pid_actual, pid_expected)

    animal_actual = etl._expand_col(data_raw, 'animal')
    animal_expected = np.array([
        [1., 0., 0.],
        [0., 1., 0.],
        [-1., -1., 1.],
    ])
    assert_almost_equal(animal_actual, animal_expected)
コード例 #14
0
def test_transform_two_levels_no_dummy(data_few_levels, few_levels_expected):
    """Fit/transform the few-levels frame with ``dummy_na=False``."""
    etl = DataFrameETL(
        cols_to_expand=['pid', 'fruits', 'animal'],
        dummy_na=False,
        fill_value=99.,
        dataframe_output=True,
    )
    etl.fit(data_few_levels)
    result = etl.transform(data_few_levels)

    # Remove these columns from the expected frame (in place) before
    # comparing.
    for unwanted in ('pid_NaN_Sentinel', 'fruits_NaN', 'animal_NaN'):
        few_levels_expected.pop(unwanted)
    assert result.shape == few_levels_expected.shape
    assert result.equals(few_levels_expected)
コード例 #15
0
def test_transform_dataframe_no_dummy(data_raw, dataframe_expected):
    """DataFrame-output transform with ``dummy_na=False``."""
    etl = DataFrameETL(
        cols_to_drop=['pid'],
        cols_to_expand=['djinn_type', 'fruits', 'animal'],
        dummy_na=False,
        dataframe_output=True,
    )
    etl.fit(data_raw)
    result = etl.transform(data_raw)

    # Remove the NaN indicator columns from the expected frame (in place).
    for nan_col in ('djinn_type_NaN', 'fruits_NaN', 'animal_NaN'):
        dataframe_expected.pop(nan_col)
    assert result.shape == dataframe_expected.shape
    assert result.equals(dataframe_expected)
コード例 #16
0
def test_transform_no_dummy(data_raw, dataframe_expected):
    """Transform with ``dummy_na=False``, compared as an array."""
    etl = DataFrameETL(
        cols_to_drop=['pid'],
        cols_to_expand=['djinn_type', 'fruits', 'animal'],
        dummy_na=False,
    )
    etl.fit(data_raw)
    actual = etl.transform(data_raw)

    # Remove the NaN indicator columns from the expected frame (in place)
    # before converting it to an array.
    for nan_col in ('djinn_type_NaN', 'fruits_NaN', 'animal_NaN'):
        dataframe_expected.pop(nan_col)
    expected = np.asarray(dataframe_expected)
    assert actual.shape == expected.shape
    assert_almost_equal(actual, expected)
コード例 #17
0
def test_fit_list(data_raw, levels_dict):
    """Fit with an explicit list of columns to expand."""
    etl = DataFrameETL(
        cols_to_drop=['fruits'],
        cols_to_expand=['pid', 'djinn_type', 'animal'],
        dummy_na='expanded',
    )
    etl.fit(data_raw)

    expected_output_cols = [
        'pid_a', 'pid_b', 'pid_c', 'pid_NaN',
        'djinn_type_effrit', 'djinn_type_marid', 'djinn_type_sila',
        'djinn_type_NaN',
        'age',
        'animal_cat', 'animal_dog', 'animal_NaN',
    ]
    expected_required = ['pid', 'djinn_type', 'age', 'animal']
    assert etl.levels_ == levels_dict
    assert etl.columns_ == expected_output_cols
    assert etl.required_columns_ == expected_required
コード例 #18
0
def test_fit_auto(data_raw, levels_dict):
    """Fit with ``cols_to_expand='auto'`` and ``dummy_na=True``."""
    etl = DataFrameETL(cols_to_drop='fruits',
                       cols_to_expand='auto',
                       dummy_na=True)
    etl.fit(data_raw)

    expected_output_cols = [
        'pid_a', 'pid_b', 'pid_c', 'pid_NaN',
        'djinn_type_effrit', 'djinn_type_marid', 'djinn_type_sila',
        'djinn_type_NaN',
        'age',
        'animal_cat', 'animal_dog', 'animal_NaN',
    ]
    expected_required = ['pid', 'djinn_type', 'age', 'animal']
    assert etl.levels_ == levels_dict
    assert etl.columns_ == expected_output_cols
    assert etl.required_columns_ == expected_required
コード例 #19
0
def test_expand_col_few_levels_no_dummy(data_few_levels, few_levels_expected):
    """Check ``_expand_col`` per column on the few-levels frame, no dummies."""
    etl = DataFrameETL(
        cols_to_expand=['pid', 'fruits', 'animal'],
        dummy_na=False,
        fill_value=99.,
    )
    etl.fit(data_few_levels)

    # Each source column is paired with the expected-frame columns that its
    # expansion should reproduce.
    cases = [
        ('pid', ['pid_NaN', 'pid_a']),
        ('fruits', ['fruits_1.0']),
        ('animal', ['animal_cat']),
    ]
    for source_col, expected_cols in cases:
        actual = etl._expand_col(data_few_levels, source_col)
        expected = np.asarray(few_levels_expected[expected_cols])
        assert_almost_equal(actual, expected)
コード例 #20
0
def test_pickle(data_raw, dataframe_expected):
    """A fitted transformer should still transform after a pickle round trip."""
    fitted = DataFrameETL(
        cols_to_drop='pid',
        cols_to_expand=['djinn_type', 'fruits', 'animal'],
        dummy_na=True,
    )
    fitted.fit(data_raw)

    # Serialize the fitted transformer and load it back; only the restored
    # copy is used for the transform below.
    payload = pickle.dumps(fitted)
    restored = pickle.loads(payload)

    actual = restored.transform(data_raw)
    expected = np.asarray(dataframe_expected)
    assert actual.shape == expected.shape
    assert_almost_equal(actual, expected)
コード例 #21
0
def test_fit_auto(data_raw):
    """Fit with ``cols_to_expand='auto'`` and ``dummy_na='all'``."""
    etl = DataFrameETL(cols_to_drop=['fruits'],
                       cols_to_expand='auto',
                       dummy_na='all')
    etl.fit(data_raw)

    expected_levels = {
        'pid': ['a', 'b', 'c'],
        'djinn_type': ['effrit', 'marid', 'sila'],
        'animal': ['cat', 'dog', 'NaN_Sentinel'],
    }
    # Of the expanded columns, only 'animal' is expected to get a NaN column.
    expected_output_cols = [
        'pid_a', 'pid_b', 'pid_c',
        'djinn_type_effrit', 'djinn_type_marid', 'djinn_type_sila',
        'age',
        'animal_cat', 'animal_dog', 'animal_NaN',
    ]
    expected_required = ['pid', 'djinn_type', 'age', 'animal']
    assert etl.levels_ == expected_levels
    assert etl.columns_ == expected_output_cols
    assert etl.required_columns_ == expected_required
コード例 #22
0
def test_transform_no_level_overlap():
    """Transform should warn when a column shares no levels with the fit data.

    The 'age' column is fit as strings and cast to int before transform.
    The test expects exactly one warning whose message mentions the missing
    overlap for 'age'.
    """
    # Use ``np.nan``: the ``np.NaN`` alias was removed in NumPy 2.0 and
    # raises AttributeError there.
    df = pd.concat(
        [
            pd.Series([1.0, np.nan, 3.0], dtype='float', name='fruits'),
            pd.Series(["2000", "2500", "3000"], dtype='object', name='age'),
        ],
        axis=1,
    )

    expander = DataFrameETL(cols_to_expand=['age'], dummy_na=False)
    expander.fit(df)

    # the dtype mismatch will cause incorrect categorical expansion;
    # DataFrameETL should issue a warning
    df['age'] = df['age'].astype('int')

    with warnings.catch_warnings(record=True) as w:
        # 'always' keeps the warning from being suppressed by earlier
        # filters or the once-per-location registry.
        warnings.simplefilter('always')
        expander.transform(df)
        assert len(w) == 1
        msg = "No overlap between levels in column 'age' and " + \
              "levels seen during fit"
        assert msg in w[0].message.args[0]
コード例 #23
0
def test_expand_all_na(data_raw):
    """With ``dummy_na='all'``, check which columns get NaN indicators.

    The output frame should only have NaN columns for columns which
    actually had NaNs during fit.
    """
    expected_columns = [
        ('djinn_type_effrit', [0., 1., 0.]),
        ('djinn_type_marid', [1., 0., 0.]),
        ('djinn_type_sila', [0., 0., 1.]),
        ('fruits', [1., np.nan, 3.]),
        ('fruits_NaN', [0., 1., 0.]),
        ('animal_cat', [1., 0., 0.]),
        ('animal_dog', [0., 1., 0.]),
        ('animal_NaN', [0., 0., 1.]),
    ]
    df_expected = pd.concat(
        [pd.Series(values, dtype='float32', name=col_name)
         for col_name, values in expected_columns],
        axis=1,
    )

    etl = DataFrameETL(
        cols_to_expand=['djinn_type', 'animal'],
        cols_to_drop=['pid', 'age'],
        dummy_na='all',
        dataframe_output=True,
    )
    etl.fit(data_raw)
    # 'fruits' is not expanded; it should be the only column recorded here.
    assert etl._unexpanded_nans == {'fruits': True}

    result = etl.transform(data_raw)
    assert result.equals(df_expected)
コード例 #24
0
def test_fit_exc(dataframe_expected):
    """Fitting on a plain array instead of a DataFrame should raise TypeError."""
    as_array = np.asarray(dataframe_expected)
    etl = DataFrameETL()
    with pytest.raises(TypeError):
        etl.fit(as_array)