コード例 #1
0
def test_create_levels(data_raw, levels_dict):
    """Check that ``_create_levels`` reproduces the ``levels_dict`` fixture.

    The private attributes are assigned by hand instead of going through a
    fit call, so only ``_create_levels`` itself is exercised.
    """
    etl = DataFrameETL(cols_to_expand=['pid', 'djinn_type', 'animal'])
    # Seed the private state by hand (presumably what fitting would
    # normally populate -- confirm against DataFrameETL).
    etl._cols_to_expand = etl.cols_to_expand
    etl._cols_to_drop = etl.cols_to_drop
    etl._nan_sentinel = NAN_STRING
    assert etl._create_levels(data_raw) == levels_dict
コード例 #2
0
def test_create_levels(data_raw, levels_dict):
    """Check that ``_create_levels`` reproduces the ``levels_dict`` fixture.

    Separate NaN placeholders are supplied for string and numeric columns,
    and every expanded column is flagged as non-numeric (``0``).
    """
    expand_cols = ['pid', 'djinn_type', 'animal']
    etl = DataFrameETL(cols_to_expand=expand_cols)
    # Seed the private state by hand (presumably what fitting would
    # normally populate -- confirm against DataFrameETL).
    etl._cols_to_expand = etl.cols_to_expand
    etl._cols_to_drop = etl.cols_to_drop
    etl._nan_string = NAN_STRING
    etl._nan_numeric = NAN_NUMERIC
    etl._is_numeric = dict.fromkeys(expand_cols, 0)
    assert etl._create_levels(data_raw) == levels_dict
コード例 #3
0
def test_create_levels_no_dummy(data_raw, levels_dict_numeric):
    """Check ``_create_levels`` output when ``dummy_na=False``.

    The ``levels_dict_numeric`` fixture is adjusted in place so that its
    ``pid`` entry holds only the three non-NaN levels before comparison.
    """
    etl = DataFrameETL(cols_to_expand=['pid', 'fruits'], dummy_na=False)
    etl._cols_to_expand = etl.cols_to_expand
    etl._cols_to_drop = etl.cols_to_drop
    etl._nan_sentinel = NAN_STRING
    # The expected pid levels must not include a NaN entry in this test.
    levels_dict_numeric.update(pid=['a', 'b', 'c'])
    assert etl._create_levels(data_raw) == levels_dict_numeric
コード例 #4
0
def test_create_col_names_numeric(data_raw):
    """Check ``_create_col_names`` when a numeric column is expanded.

    With ``dummy_na=True`` both ``pid`` and ``fruits`` are expected to get a
    trailing ``_NaN`` column, while ``age`` passes through unexpanded.
    """
    etl = DataFrameETL(
        cols_to_expand=['pid', 'fruits'],
        cols_to_drop=['djinn_type', 'animal'],
        dummy_na=True,
    )
    # Seed the private state by hand (presumably what fitting would
    # normally populate -- confirm against DataFrameETL).
    etl._cols_to_expand = etl.cols_to_expand
    etl._cols_to_drop = etl.cols_to_drop
    etl._nan_string = NAN_STRING
    etl._nan_numeric = NAN_NUMERIC
    etl._is_numeric = dict.fromkeys(['pid', 'djinn_type', 'fruits'], 0)
    etl.levels_ = etl._create_levels(data_raw)

    expected_names = [
        'pid_a', 'pid_b', 'pid_c', 'pid_NaN',
        'fruits_1.0', 'fruits_3.0', 'fruits_NaN',
        'age',
    ]
    expected_unexpanded = ['pid', 'fruits', 'age']

    cnames, unexpanded = etl._create_col_names(data_raw)
    assert cnames == expected_names
    assert unexpanded == expected_unexpanded
コード例 #5
0
def test_create_col_names_no_dummy(data_raw):
    """Check ``_create_col_names`` when ``dummy_na=False``.

    No ``_NaN`` columns are expected for the expanded columns, and the
    dropped ``fruits`` column must not appear in either returned list.
    """
    etl = DataFrameETL(
        cols_to_expand=['pid', 'djinn_type', 'animal'],
        cols_to_drop=['fruits'],
        dummy_na=False,
    )
    etl._cols_to_expand = etl.cols_to_expand
    etl._cols_to_drop = etl.cols_to_drop
    etl._nan_sentinel = NAN_STRING
    etl.levels_ = etl._create_levels(data_raw)

    expected_names = [
        'pid_a', 'pid_b', 'pid_c',
        'djinn_type_effrit', 'djinn_type_marid', 'djinn_type_sila',
        'age',
        'animal_cat', 'animal_dog',
    ]
    expected_unexpanded = ['pid', 'djinn_type', 'age', 'animal']

    cnames, unexpanded = etl._create_col_names(data_raw)
    assert cnames == expected_names
    assert unexpanded == expected_unexpanded
コード例 #6
0
def test_create_col_names_numeric(data_raw):
    """Check ``_create_col_names`` with ``dummy_na='expanded'``.

    Both expanded columns (``pid`` and the numeric ``fruits``) are expected
    to get a ``_NaN`` column; ``age`` passes through as a single column.
    """
    etl = DataFrameETL(
        cols_to_expand=['pid', 'fruits'],
        cols_to_drop=['djinn_type', 'animal'],
        dummy_na='expanded',
    )
    # Seed the private state by hand (presumably what fitting would
    # normally populate -- confirm against DataFrameETL).
    etl._cols_to_expand = etl.cols_to_expand
    etl._cols_to_drop = etl.cols_to_drop
    etl._dummy_na = 'expanded'
    etl._nan_sentinel = NAN_STRING
    # Levels are computed first, then the NaN flags for unexpanded columns,
    # in the same order as the calls that feed _create_col_names.
    etl.levels_ = etl._create_levels(data_raw)
    etl._unexpanded_nans = etl._flag_unexpanded_nans(data_raw)

    expected_names = [
        'pid_a', 'pid_b', 'pid_c', 'pid_NaN',
        'fruits_1.0', 'fruits_3.0', 'fruits_NaN',
        'age',
    ]
    expected_unexpanded = ['pid', 'fruits', 'age']

    cnames, unexpanded = etl._create_col_names(data_raw)
    assert cnames == expected_names
    assert unexpanded == expected_unexpanded
コード例 #7
0
def test_check_sentinels(data_raw):
    """Check that ``_check_sentinels`` moves off a colliding NaN sentinel.

    The sentinel starts as ``'effrit'`` (presumably a value present in the
    data -- confirm against the ``data_raw`` fixture). After the call it
    must differ from ``'effrit'`` and must not occur in any string column.
    """
    etl = DataFrameETL(
        cols_to_expand=['pid', 'djinn_type', 'animal', 'fruits'])
    # Minimal private state required before calling _check_sentinels.
    etl._cols_to_expand = etl.cols_to_expand
    etl._cols_to_drop = etl.cols_to_drop
    etl.levels_ = {}
    etl._nan_sentinel = 'effrit'

    etl._check_sentinels(data_raw)

    sentinel = etl._nan_sentinel
    assert sentinel != 'effrit'
    string_columns = data_raw[['pid', 'djinn_type', 'animal']]
    collisions = string_columns == sentinel
    assert not collisions.any().any()

    # "fruits" cannot contain the sentinel: the column is numeric while
    # the sentinel is not.
    assert np.issubdtype(data_raw['fruits'].dtype, np.number)
    assert not np.issubdtype(type(sentinel), np.number)
コード例 #8
0
def test_create_col_names(data_raw):
    """Check ``_create_col_names`` for string columns with ``dummy_na=True``.

    Each expanded column is expected to end with a ``_NaN`` column, ``age``
    passes through unexpanded, and the dropped ``fruits`` column is absent.
    """
    expand_cols = ['pid', 'djinn_type', 'animal']
    etl = DataFrameETL(
        cols_to_expand=expand_cols,
        cols_to_drop=['fruits'],
        dummy_na=True,
    )
    # Seed the private state by hand (presumably what fitting would
    # normally populate -- confirm against DataFrameETL).
    etl._cols_to_expand = etl.cols_to_expand
    etl._cols_to_drop = etl.cols_to_drop
    etl._nan_string = NAN_STRING
    etl._nan_numeric = NAN_NUMERIC
    etl._is_numeric = dict.fromkeys(expand_cols, 0)
    etl.levels_ = etl._create_levels(data_raw)

    expected_names = [
        'pid_a', 'pid_b', 'pid_c', 'pid_NaN',
        'djinn_type_effrit', 'djinn_type_marid', 'djinn_type_sila',
        'djinn_type_NaN',
        'age',
        'animal_cat', 'animal_dog', 'animal_NaN',
    ]
    expected_unexpanded = ['pid', 'djinn_type', 'age', 'animal']

    cnames, unexpanded = etl._create_col_names(data_raw)
    assert cnames == expected_names
    assert unexpanded == expected_unexpanded
コード例 #9
0
def test_check_sentinels(data_raw):
    """Check that ``_check_sentinels`` replaces colliding NaN sentinels.

    The string sentinel starts as ``'effrit'`` and the numeric sentinel as
    ``1.0`` (presumably both occur in ``data_raw`` -- confirm against the
    fixture). After the call, neither sentinel may keep its starting value
    and neither may appear in the columns of the matching kind.
    """
    expander = DataFrameETL(cols_to_expand=['pid', 'djinn_type',
                                            'animal', 'fruits'])
    # Fill in the private state required before calling _check_sentinels.
    expander._nan_string = 'effrit'
    expander._nan_numeric = 1.0
    expander._is_numeric = {}
    expander.levels_ = {}
    expander._cols_to_drop = expander.cols_to_drop
    expander._cols_to_expand = expander.cols_to_expand
    # Flag each expanded column as numeric or not from its unique values.
    for col in expander.cols_to_expand:
        expander._is_numeric[col] = expander._flag_numeric(
            pd.unique(data_raw[col]))
    expander._check_sentinels(data_raw)
    # Compare by value: an identity check (`is not`) against a literal can
    # pass even when the sentinel still equals the original value.
    assert expander._nan_string != 'effrit'
    assert expander._nan_numeric != 1.0
    assert not (data_raw[['pid', 'djinn_type', 'animal']] ==
                expander._nan_string).any().any()
    assert not (data_raw['fruits'] == expander._nan_numeric).any().any()