Example 1
def bucket():
    # Computes the normalized order frequency for every (kunag, matnr)
    # pair and writes the result to CSV.
    df = load_data()
    rows = []
    for (kunag, matnr), _group in df.groupby(["kunag", "matnr"]):
        ts = select_series(df, kunag, matnr)
        rows.append([kunag, matnr, normalized_frequency(ts)])
    # DataFrame.append was removed in pandas 2.0; building the frame from
    # a list of rows is the idiomatic replacement.
    frequency_cleaveland = pd.DataFrame(
        rows, columns=["kunag", "matnr", "frequency"])
    frequency_cleaveland.to_csv(
        "/home/aman/PycharmProjects/seasonality_hypothesis/data_generated/frequency_cleaveland.csv",
        index=False)
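
The helpers load_data, select_series, and normalized_frequency live elsewhere in the project and are not shown. A minimal sketch of hypothetical stand-ins, only so the snippet can run in isolation (the real definitions may differ):

import pandas as pd

def load_data():
    # Stand-in: the real loader reads the transaction dump from disk.
    return pd.DataFrame({"kunag": [1, 1, 2], "matnr": [10, 10, 20],
                         "date": [20160703, 20160710, 20160703],
                         "quantity": [3.0, 0.0, 5.0]})

def select_series(df, kunag, matnr):
    # All rows belonging to one (customer, material) pair.
    return df[(df["kunag"] == kunag) & (df["matnr"] == matnr)]

def normalized_frequency(ts):
    # Assumed definition: share of rows with a positive quantity.
    return (ts["quantity"] > 0).mean()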
Example 2
def bucket():
    # Same frequency computation as above, but writes to and reads back
    # from the CSO_drug project folder.
    df = load_data()
    rows = []
    for (kunag, matnr), _group in df.groupby(["kunag", "matnr"]):
        ts = select_series(df, kunag, matnr)
        rows.append([kunag, matnr, normalized_frequency(ts)])
    frequency_cleaveland = pd.DataFrame(
        rows, columns=["kunag", "matnr", "frequency"])
    # index=False keeps the read-back below from picking up a spurious
    # "Unnamed: 0" column.
    frequency_cleaveland.to_csv(
        "/home/aman/Desktop/CSO_drug/file_generated/frequency_cleaveland.csv",
        index=False)
    frequency_cleaveland = pd.read_csv(
        "/home/aman/Desktop/CSO_drug/file_generated/frequency_cleaveland.csv")
Example 3
def samples_aggregate_seas():
    # Aggregates the sampled bucket-1 series after moving-average outlier
    # replacement and returns the seasonal component of the weekly totals.
    df = load_data()
    bucket_1_sample = pd.read_csv(
        "/home/aman/PycharmProjects/seasonality_hypothesis/data_generated/bucket_1_sample.csv"
    )
    cleaned = []
    for index, row in bucket_1_sample.iterrows():
        df_series = df[(df["kunag"] == row["kunag"])
                       & (df["matnr"] == row["matnr"])]
        df_series = df_series[df_series["quantity"] >= 0]
        df_series = df_series[df_series["date"] >= 20160703]
        df_series = get_weekly_aggregate(df_series)
        _testing = df_series[["quantity", "dt_week"]].copy()
        aggregated_data = _testing.rename(columns={
            'dt_week': 'ds',
            'quantity': 'y'
        })

        aggregated_data.ds = aggregated_data.ds.apply(str).apply(parser.parse)
        aggregated_data.y = aggregated_data.y.apply(float)
        aggregated_data = aggregated_data.sort_values('ds')
        aggregated_data = aggregated_data.reset_index(drop=True)

        _result = ma_replace_outlier(data=aggregated_data,
                                     n_pass=3,
                                     aggressive=True,
                                     window_size=12,
                                     sigma=3.0)
        result = _result[0].rename(columns={'ds': 'dt_week', 'y': 'quantity'})
        cleaned.append(result)
    final = pd.concat(cleaned)
    final = final.groupby("dt_week")["quantity"].sum().reset_index()
    final = final.set_index("dt_week")
    #plt.figure(figsize=(16, 8))
    #plt.plot(final["quantity"], label='quantity', marker=".")
    #plt.title("200 sample aggregated plot")
    #plt.xlabel("dt_weeks")
    #plt.ylabel("aggregated quantities")
    #plt.show()
    result = seasonal_decompose(final["quantity"], model="additive")
    #result.plot()
    #plt.show()
    return result.seasonal
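
seasonal_decompose here is the statsmodels function; it needs a series with a DatetimeIndex whose frequency can be inferred, or an explicit period. A minimal sketch on synthetic weekly data (the yearly period of 52 is an assumption, not something stated in the project code):

import numpy as np
import pandas as pd
from statsmodels.tsa.seasonal import seasonal_decompose

idx = pd.date_range("2016-07-03", periods=156, freq="W")
y = pd.Series(10 + 3 * np.sin(2 * np.pi * np.arange(156) / 52), index=idx)
decomp = seasonal_decompose(y, model="additive", period=52)
print(decomp.seasonal.head())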
Example 4
def aggregate_seasonal_comp():
    # Seasonal component of the quantity series aggregated over the whole
    # data set, with quantities clipped to the 0-10 range.
    df = load_data()[["date", "quantity"]]
    df = df[(df["quantity"] >= 0) & (df["quantity"] <= 10)]
    aggregate_data = df.groupby("date")["quantity"].sum()
    aggregate_data = aggregate_data.reset_index()
    aggregate_data["kunag"] = 1
    aggregate_data["matnr"] = 2
    aggregate_data["price"] = 3
    aggregate_data = get_weekly_aggregate(aggregate_data)
    aggregate_data["dt_week"] = aggregate_data["dt_week"].apply(
        lambda x: pd.to_datetime(x, format="%Y-%m-%d"))
    aggregate_data = aggregate_data.set_index("dt_week")
    # plt.figure(figsize=(16, 8))
    # plt.plot(aggregate_data["quantity"], label='quantity')
    # plt.title("aggregated plot")
    # plt.show()
    result = seasonal_decompose(aggregate_data["quantity"], model="additive")
    # result.plot()
    # plt.show()
    return result.seasonal
Example 5
from stl_decompose import product_seasonal_comp_7_point
from selection import load_data
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
import matplotlib.pyplot as plt
import pandas as pd
import time
import os


project = "/home/aman/PycharmProjects/seasonality_hypothesis/"
folder = "category_7_seasonality"
file = "seasonality.csv"
folder_address = os.path.join(project, folder)
file_address = os.path.join(folder_address, file)
# os.mkdir(folder_address)

df = load_data()
frequency_cleaveland = pd.read_csv(
    "/home/aman/PycharmProjects/seasonality_hypothesis/data_generated/frequency_days_4200_C005.csv")
# bucket_1_sample = frequency_cleaveland[(frequency_cleaveland["frequency"] > 26) & (frequency_cleaveland["days"] > 730)].sample(400, random_state=1)
# bucket_1_sample.to_csv(
#     "/home/aman/PycharmProjects/seasonality_hypothesis/data_generated/bucket_1_sample.csv", index=False)
sample = frequency_cleaveland[(frequency_cleaveland["frequency"] >= 26) & (frequency_cleaveland["days"] > 92) &
                              (frequency_cleaveland["days"] <= 365 + 183)]
sample = sample["matnr"].unique()
# sample.to_csv(folder_address+"/sample.csv")
# sample = pd.read_csv(folder_address+"/sample.csv")
report = pd.DataFrame()
count = 0
# start = time.time()
for matnr in sample:
        if count < 321:
Example 6
from selection import load_data
from selection import remove_negative_rows
import pandas as pd
from preprocess import splitter_2
from hypothesis import arima
from selection import individual_series


def individual_series_2(input_df, kunag=500057582, matnr=103029):
    """
    selects a dataframe corresponding to a particular kunag and matnr
    param: a pandas dataframe
    return: a pandas dataframe
    """
    df_copy = input_df.copy()
    df_copy = remove_negative_rows(df_copy)
    df_copy = df_copy[df_copy["date"] >= 20160703]
    output_df = df_copy[(df_copy["kunag"] == kunag)
                        & (df_copy["matnr"] == matnr)]
    output_df["dt_week"] = output_df["date"].apply(
        lambda x: pd.to_datetime(x, format="%Y%m%d"))
    output_df = output_df.sort_values("dt_week")
    output_df = output_df.set_index("dt_week")
    return output_df


if __name__ == "__main__":
    print(individual_series_2(load_data()))
    df_series = individual_series(load_data(), 500057582, 103029)
    train, validation, test = splitter_2(df_series)
    print(arima(train, validation, test)[1])
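
The .copy() added after the boolean filter in individual_series_2 avoids pandas' SettingWithCopyWarning, which can fire when a new column is assigned on a filtered slice. A minimal illustration:

import pandas as pd

df = pd.DataFrame({"a": [1, 2, 3]})
sub = df[df["a"] > 1].copy()   # without .copy(), the next line may warn
sub["b"] = sub["a"] * 2
print(sub)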
Example 7
from selection import load_data
from data_transform import get_weekly_aggregate
import matplotlib.pyplot as plt
import pandas as pd
df = load_data()[["date", "quantity"]]
df = df[(df["quantity"] >= 0) & (df["quantity"] <= 10)]
print(df["quantity"].value_counts()/df.shape[0])
aggregate_data = df.groupby("date")["quantity"].sum()
aggregate_data = aggregate_data.reset_index()
aggregate_data["date"] = aggregate_data["date"].apply(lambda x: pd.to_datetime(x, format="%Y%m%d"))
aggregate_data = aggregate_data.sort_values("date")
aggregate_data = aggregate_data.set_index("date")
plt.figure(figsize=(16, 8))
plt.plot(aggregate_data["quantity"], label='quantity')
plt.title("aggregated plot")
plt.show()
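
The manual division by df.shape[0] above computes relative frequencies; value_counts(normalize=True) is the built-in equivalent:

import pandas as pd

s = pd.Series([0, 1, 1, 2])
print(s.value_counts() / s.shape[0])
print(s.value_counts(normalize=True))   # same result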
Example 8
def overall_aggregate_seas():
    """
    This function aggregates whole cleaveland data with ma outliers removing different categories series outliers
    First week has been removed
    :return: pandas_df : seasonal component of the aggregated data
    """
    df = load_data()
    overall = pd.read_csv(
        "~/PycharmProjects/seasonality_hypothesis/data_generated/frequency_days_cleaveland.csv"
    )
    cleaned = []
    for index, row in tqdm(overall.iterrows()):
        frequency = row["frequency"]
        days = row["days"]
        df_series = df[(df["kunag"] == row["kunag"])
                       & (df["matnr"] == row["matnr"])]
        df_series = df_series[df_series["quantity"] >= 0]
        df_series = df_series[df_series["date"] >= 20160703]
        if frequency == 0:
            continue
        df_series = get_weekly_aggregate(df_series)
        _testing = df_series[["quantity", "dt_week"]].copy()
        aggregated_data = _testing.rename(columns={
            'dt_week': 'ds',
            'quantity': 'y'
        })

        aggregated_data.ds = aggregated_data.ds.apply(str).apply(parser.parse)
        aggregated_data.y = aggregated_data.y.apply(float)
        aggregated_data = aggregated_data.sort_values('ds')
        aggregated_data = aggregated_data.reset_index(drop=True)
        outlier = True
        if (frequency >= 26) & (days > 365 + 183):
            n_pass = 3
            window_size = 12
            sigma = 4.0
        elif (frequency >= 20) & (frequency < 26):
            n_pass = 3
            window_size = 12
            sigma = 5.0
        elif (frequency >= 26) & (days <= 365 + 183):
            if len(aggregated_data) >= 26:
                n_pass = 3
                window_size = 12
                sigma = 4.0
            else:
                outlier = False
        elif (frequency >= 12) & (frequency < 20):
            if len(aggregated_data) >= 26:
                n_pass = 3
                window_size = 24
                sigma = 5.0
            else:
                outlier = False
        elif frequency < 12:
            outlier = False

        if outlier:
            _result = ma_replace_outlier(data=aggregated_data,
                                         n_pass=n_pass,
                                         aggressive=True,
                                         window_size=window_size,
                                         sigma=sigma)
            result = _result[0].rename(columns={
                'ds': 'dt_week',
                'y': 'quantity'
            })
        else:
            result = aggregated_data.rename(columns={
                'ds': 'dt_week',
                'y': 'quantity'
            })
        cleaned.append(result)
    final = pd.concat(cleaned)
    final = final.groupby("dt_week")["quantity"].sum().reset_index()
    final = final.set_index("dt_week")
    final.to_csv(
        "~/PycharmProjects/seasonality_hypothesis/data_generated/aggregated_complete_outliers_removed.csv"
    )
    result = seasonal_decompose(final["quantity"], model="additive")
    result.seasonal.to_csv(
        "~/PycharmProjects/seasonality_hypothesis/data_generated/aggregated_complete_outliers_removed_seas.csv"
    )
    return result.seasonal
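
ma_replace_outlier is a project helper whose source is not shown; from the call site it returns a tuple whose first element is the cleaned frame. Purely as an illustration of the general technique (not the project's actual implementation), a sigma-clipping moving-average pass might look like:

import pandas as pd

def ma_outlier_sketch(data, n_pass=3, window_size=12, sigma=3.0):
    # Hypothetical sketch: pull points further than `sigma` rolling
    # standard deviations from the centered rolling mean back to the mean.
    y = data["y"].astype(float).copy()
    for _ in range(n_pass):
        mean = y.rolling(window_size, min_periods=1, center=True).mean()
        std = y.rolling(window_size, min_periods=1, center=True).std().fillna(0.0)
        y = y.where((y - mean).abs() <= sigma * std, mean)
    return (data.assign(y=y),)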