Ejemplo n.º 1
0
def test_concatenate_dfs():
    """Concatenating a frame with its duplicate doubles the datetime values."""
    test_df = testing.get_test_example()
    duplicate_df = testing.get_test_example()

    concat_df = dataframes.concatenate_dfs(test_df, duplicate_df)

    # FIX: list(df[["datetime"]]) iterates the DataFrame and yields the column
    # *names* (just ["datetime"]), not the values, and zip truncated to the
    # shorter list — so the old assertion passed vacuously. Compare the actual
    # datetime values, and use == so length mismatches also fail.
    answer = list(test_df["datetime"]) * 2
    result = list(concat_df["datetime"])

    assert result == answer
Ejemplo n.º 2
0
def test_remove_blank_cols():
    """remove_blank_cols drops the auto-generated 'Unnamed' column."""
    df = testing.get_test_example()
    df["Unnamed"] = 0
    # sanity check: the column really is present before cleaning
    assert "Unnamed" in list(df.columns)

    cleaned = dataframes.remove_blank_cols(df)
    assert "Unnamed" not in list(cleaned.columns)
Ejemplo n.º 3
0
def test_calc_rolling_agg():
    """calc_rolling_agg computes rolling aggregates within hierarchy groups.

    The expected series below are hand-computed rolling means over the two
    4-row groups in the test example; windows shorter than `rolling_window`
    at a group start use however many rows are available.
    """
    input_df = testing.get_test_example()
    hierarchy = ['category', 'product', 'state', 'store']

    # test 1: 2-day rolling average
    rolling_mean_df = dataframes.calc_rolling_agg(df=input_df,
                                                  hierarchy=hierarchy,
                                                  rolling_window=2,
                                                  target_var='sales_int',
                                                  agg_func='mean')

    answer = pd.Series([
        113, (10000 + 113) / 2, (10000 + 102) / 2, (102 + 123) / 2, 5,
        (800 + 5) / 2, (800 + 0) / 2, (0 + -20) / 2
    ])
    result = rolling_mean_df['mean_2_sales_int']
    # FIX: the original used `assert cond, print(...)` — print() returns None,
    # so the assertion message was always None. Use a real diagnostic message.
    assert (answer == result).all(), \
        f"2-day rolling mean mismatch:\nexpected\n{answer}\ngot\n{result}"

    # test 2: three-day rolling average
    rolling_mean_df = dataframes.calc_rolling_agg(df=input_df,
                                                  hierarchy=hierarchy,
                                                  rolling_window=3,
                                                  target_var='sales_int',
                                                  agg_func='mean')

    answer = pd.Series([
        113, (10000 + 113) / 2, (10000 + 102 + 113) / 3,
        (10000 + 102 + 123) / 3, 5, (800 + 5) / 2, (800 + 5 + 0) / 3,
        (0 + -20 + 800) / 3
    ])
    result = rolling_mean_df['mean_3_sales_int']
    assert (answer == result).all(), \
        f"3-day rolling mean mismatch:\nexpected\n{answer}\ngot\n{result}"
Ejemplo n.º 4
0
def test_deindex_features():
    """deindex_features maps *_index columns back to their original labels."""
    # run the indexing test first so the required index mapping exists
    test_index_features()

    index_rows = [[0, 0, 0, 0]] * 4 + [[1, 1, 0, 0]] * 4
    index_cols = ["category_index", "product_index",
                  "state_index", "store_index"]
    initial_df = pd.DataFrame(index_rows, columns=index_cols)

    result_df = dataframes.deindex_features(initial_df)

    label_cols = ["category", "product", "state", "store"]
    answer_df = testing.get_test_example()[label_cols]

    assert (result_df.values == answer_df.values).all()
Ejemplo n.º 5
0
def test_auto_convert_datetime():
    """auto_convert_datetime converts an object-dtype column to datetimes."""
    # load the example without dtype conversion so 'datetime' stays object
    raw_df = testing.get_test_example(convert_dtypes=False)
    assert raw_df["datetime"].dtype in ["object"]

    converted_df = dataframes.auto_convert_datetime(raw_df)

    assert utils.is_datetime_series(converted_df["datetime"])
Ejemplo n.º 6
0
def test_compress_dataframe():
    """compress_dataframe reduces the frame's memory footprint."""
    df = testing.get_test_example()
    before = dataframes.get_memory_usage(df)

    # NOTE(review): the return value is discarded, so compress_dataframe
    # presumably mutates its argument in place — confirm against the
    # implementation.
    dataframes.compress_dataframe(df)
    after = dataframes.get_memory_usage(df)

    assert after < before
Ejemplo n.º 7
0
def test_distribute_dask_df():
    """After distributing a dask frame, client profile info is available."""
    source_df = testing.get_test_example()
    dask_df = dataframes.convert_pandas_to_dask(source_df)
    dask_df = dataframes.distribute_dask_df(dask_df)

    # TODO fix globals so this isn't necessary
    profile = dataframes.profile_dask_client()
    assert isinstance(profile, dict)
Ejemplo n.º 8
0
def test_convert_pandas_to_dask():
    """convert_pandas_to_dask yields a dask dataframe."""
    # also tested as part of integration tests in test_utils.py
    source_df = testing.get_test_example()

    converted = dataframes.convert_pandas_to_dask(source_df)

    assert utils.is_dask_df(converted)
Ejemplo n.º 9
0
def test_filter_using_multiindex():
    """filter_using_multiindex reproduces an equivalent boolean-mask subset."""
    full_df = testing.get_test_example()
    expected_df = full_df[full_df["product"] == "Prod_3"]

    filtered_df = dataframes.filter_using_multiindex(
        full_df, expected_df, ["product"])

    assert filtered_df.equals(expected_df)
Ejemplo n.º 10
0
def test_correct_suffixes_in_list():
    """correct_suffixes_in_list maps '_index' names back to bare columns."""
    df = testing.get_test_example()
    expected = list(df.columns).copy()

    # names without the suffix pass through unchanged
    assert utils.correct_suffixes_in_list(df, df.columns) == expected

    # '_index'-suffixed names are corrected to the bare column names
    suffixed = [f"{col}_index" for col in df.columns]
    assert utils.correct_suffixes_in_list(df, suffixed) == expected
Ejemplo n.º 11
0
def test_filter_using_dict():
    """filter_using_dict matches an equivalent boolean-mask filter."""
    df = testing.get_test_example()
    mask = (df["product"] == "Prod_3") & (df["category"] == "Cat_1")
    expected_df = df[mask]

    criteria = {"product": "Prod_3", "category": "Cat_1"}
    result_df = dataframes.filter_using_dict(df, criteria)

    assert result_df.equals(expected_df)
Ejemplo n.º 12
0
def test_get_columns_of_type():
    """get_columns_of_type selects columns by broad dtype family."""
    df = testing.get_test_example()

    assert dataframes.get_columns_of_type(df, "float") == ["float_col"]
    assert dataframes.get_columns_of_type(df, "integer") == ["sales_int"]

    # object columns may come back in any order; compare as sets
    expected_objects = {"category", "product", "state", "store"}
    assert set(dataframes.get_columns_of_type(df, "object")) == expected_objects
Ejemplo n.º 13
0
def test_merge_by_concat():
    """merge_by_concat joins a small lookup frame onto the base frame."""
    base_df = testing.get_test_example()
    lookup_rows = [["Cat_1", "A"], ["Cat_2", "B"]]
    lookup_df = pd.DataFrame(lookup_rows, columns=["category", "mapping"])

    merged_df = dataframes.merge_by_concat(base_df,
                                           lookup_df,
                                           index_cols=["category"])

    # each category spans four rows in the test example
    expected = ["A"] * 4 + ["B"] * 4
    assert list(merged_df["mapping"].values) == expected
Ejemplo n.º 14
0
def test_fill_missing_timeseries():
    """fill_missing_timeseries inserts rows for missing dates per group."""
    df = testing.get_test_example()
    # punch a hole in the series by jumping one row ahead to 2020-01-05
    df.loc[3, 'datetime'] = '2020-01-05'
    df['datetime'] = pd.to_datetime(df['datetime'])

    group_cols = ['category', 'product', 'state', 'store']
    filled = timeseries.fill_missing_timeseries(df,
                                                grouping_cols=group_cols,
                                                datetime_col='datetime')

    # both groups should now cover the full contiguous date range
    dates = ['2019-12-30', '2019-12-31', '2020-01-01', '2020-01-02',
             '2020-01-03', '2020-01-04', '2020-01-05']
    expected = pd.to_datetime(dates * 2)

    assert (filled['datetime'] == expected).all()
Ejemplo n.º 15
0
def test_create_outlier_mask():
    """create_outlier_mask returns a boolean mask: True = keep, False = outlier."""
    input_df = testing.get_test_example()

    # use custom sales column since the outliers are too pronounced
    input_df['sales_int'] = [4, 5, 1, 2, 4, 6, 100, 4]

    # test 1: flag values more than one std dev from their group (product) mean
    grouped_flag_mask = dataframes.create_outlier_mask(input_df,
                                                       target_var='sales_int',
                                                       grouping_cols='product',
                                                       number_of_stds=1)

    answer = [True, False, False, True, True, True, False, True]
    assert ((grouped_flag_mask == answer).all())

    # test 2: flag outliers for sales overall (no grouping)
    flag_mask = dataframes.create_outlier_mask(input_df,
                                               target_var='sales_int',
                                               number_of_stds=1)

    answer = [True] * 6 + [False, True]
    assert ((flag_mask == answer).all())
Ejemplo n.º 16
0
def test_index_features():
    """index_features appends integer-coded *_index columns to the frame."""
    source_df = testing.get_test_example()
    indexed_df = dataframes.index_features(source_df)

    expected_rows = [[0, 0, 0, 0]] * 4 + [[1, 1, 0, 0]] * 4
    expected_cols = ["category_index", "product_index",
                     "state_index", "store_index"]
    answer_df = pd.DataFrame(expected_rows, columns=expected_cols)

    # compare only the generated *_index columns
    index_cols = [col for col in indexed_df.columns if "_index" in col]
    assert (indexed_df[index_cols].values == answer_df.values).all()
Ejemplo n.º 17
0
def test_check_key_mismatches():
    """check_key_mismatches reports which key tuples appear in which frame."""
    keys = ['product', 'state', 'store']

    first_test_df = testing.get_test_example()
    second_test_df = pd.DataFrame([
        ["Prod_3", "CA", "Store_1", 2],
        ["Prod_3", "TX", "Store_1", 23.12],
    ],
                                  columns=keys + ['price'])

    # FIX: the expected frame's column was misspelled 'stage' instead of the
    # 'state' key; harmless to the values-only comparison below, but
    # misleading. Build the columns from `keys` so they cannot drift.
    answer = pd.DataFrame([
        ["Prod_3", "CA", "Store_1", 'both'],
        ["Prod_4", "CA", "Store_1", 'left_only'],
        ["Prod_3", "TX", "Store_1", 'right_only'],
    ],
                          columns=keys + ['_merge'])

    result = dataframes.check_key_mismatches(
        df1=first_test_df,
        df2=second_test_df,
        keys=keys,
    )

    # compare values only; column labels are not part of the contract here
    assert np.array_equal(answer.values, result.values)
Ejemplo n.º 18
0
def test_is_pandas_df():
    """is_pandas_df accepts DataFrames and rejects other objects."""
    example_df = testing.get_test_example()
    assert utils.is_pandas_df(example_df)
    assert not utils.is_pandas_df([12, 123])
Ejemplo n.º 19
0
def test_update_time():
    """update_time shifts a datetime series by the requested signed offset."""
    base_series = pd.to_datetime(testing.get_test_example()["datetime"])

    # (amount, unit, expected dates) — one case per supported unit plus a
    # negative shift
    cases = [
        (1, "weeks", ["2020-01-08", "2020-01-09", "2020-01-10", "2020-01-11",
                      "2020-01-06", "2020-01-07", "2020-01-08", "2020-01-09"]),
        (1, "months", ["2020-02-01", "2020-02-02", "2020-02-03", "2020-02-04",
                       "2020-01-30", "2020-01-31", "2020-02-01", "2020-02-02"]),
        (1, "days", ["2020-01-02", "2020-01-03", "2020-01-04", "2020-01-05",
                     "2019-12-31", "2020-01-01", "2020-01-02", "2020-01-03"]),
        (-1, "months", ["2019-12-01", "2019-12-02", "2019-12-03", "2019-12-04",
                        "2019-11-30", "2019-11-30", "2019-12-01", "2019-12-02"]),
    ]

    for amount, unit, expected_dates in cases:
        shifted = timeseries.update_time(base_series, amount, unit)
        expected = pd.to_datetime(pd.Series(expected_dates))
        assert (shifted == expected).all()
Ejemplo n.º 20
0
def test_get_memory_usage():
    """Memory usage of the test example falls inside the expected KB band."""
    usage = dataframes.get_memory_usage(testing.get_test_example())
    assert 1000 < usage < 3000
Ejemplo n.º 21
0
def test_print_memory_usage():
    """print_memory_usage returns a human-readable string ending in 'KB'."""
    report = dataframes.print_memory_usage(testing.get_test_example())
    assert utils.is_string(report)
    assert report.endswith("KB")
Ejemplo n.º 22
0
def test_is_dask_df():
    """is_dask_df distinguishes dask frames from pandas frames."""
    pandas_frame = testing.get_test_example()
    dask_frame = dataframes.convert_pandas_to_dask(pandas_frame)

    assert utils.is_dask_df(dask_frame)
    assert not utils.is_dask_df(pandas_frame)