def bucket():
    """Compute the normalized frequency of every (kunag, matnr) series and
    write the result to frequency_cleaveland.csv.

    :return: None (the output is written to disk)
    """
    df = load_data()
    rows = []
    for index, group in df.groupby(["kunag", "matnr"]):
        # `group` is unused: select_series re-filters df for this pair.
        ts = select_series(df, index[0], index[1])
        freq = normalized_frequency(ts)
        rows.append([index[0], index[1], freq])
    # Build the frame in one shot: DataFrame.append inside a loop is
    # quadratic and the method was removed in pandas 2.0.
    frequency_cleaveland = pd.DataFrame(
        rows, columns=["kunag", "matnr", "frequency"])
    frequency_cleaveland.to_csv(
        "/home/aman/PycharmProjects/seasonality_hypothesis/data_generated/frequency_cleaveland.csv",
        index=False)
def bucket():
    """Compute the normalized frequency of every (kunag, matnr) series,
    persist it to frequency_cleaveland.csv and read it back.

    :return: None (the output is persisted to disk)
    """
    df = load_data()
    rows = []
    for index, group in df.groupby(["kunag", "matnr"]):
        # `group` is unused: select_series re-filters df for this pair.
        ts = select_series(df, index[0], index[1])
        freq = normalized_frequency(ts)
        rows.append([index[0], index[1], freq])
    # Build the frame in one shot: DataFrame.append inside a loop is
    # quadratic and the method was removed in pandas 2.0.
    frequency_cleaveland = pd.DataFrame(
        rows, columns=["kunag", "matnr", "frequency"])
    frequency_cleaveland.to_csv(
        "/home/aman/Desktop/CSO_drug/file_generated/frequency_cleaveland.csv")
    # NOTE(review): to_csv above is called without index=False, so the file
    # gains an unnamed index column that reappears on this read — confirm
    # whether that round-trip is intended.
    frequency_cleaveland = pd.read_csv(
        "/home/aman/Desktop/CSO_drug/file_generated/frequency_cleaveland.csv")
def samples_aggregate_seas():
    """Aggregate the bucket-1 sample series (with moving-average outliers
    removed per series) into one weekly series and return its additive
    seasonal component.

    Only non-negative quantities dated on/after 20160703 are kept.

    :return: pandas series : seasonal component of the aggregated series
    """
    df = load_data()
    bucket_1_sample = pd.read_csv(
        "/home/aman/PycharmProjects/seasonality_hypothesis/data_generated/bucket_1_sample.csv"
    )
    cleaned = []
    for index, row in bucket_1_sample.iterrows():
        df_series = df[(df["kunag"] == row["kunag"])
                       & (df["matnr"] == row["matnr"])]
        df_series = df_series[df_series["quantity"] >= 0]
        df_series = df_series[df_series["date"] >= 20160703]
        df_series = get_weekly_aggregate(df_series)
        _testing = df_series[["quantity", "dt_week"]].copy()
        aggregated_data = _testing.rename(columns={
            'dt_week': 'ds',
            'quantity': 'y'
        })
        # ma_replace_outlier expects parsed datetimes and float quantities,
        # sorted chronologically.
        aggregated_data.ds = aggregated_data.ds.apply(str).apply(parser.parse)
        aggregated_data.y = aggregated_data.y.apply(float)
        aggregated_data = aggregated_data.sort_values('ds')
        aggregated_data = aggregated_data.reset_index(drop=True)
        _result = ma_replace_outlier(data=aggregated_data,
                                     n_pass=3,
                                     aggressive=True,
                                     window_size=12,
                                     sigma=3.0)
        cleaned.append(
            _result[0].rename(columns={'ds': 'dt_week', 'y': 'quantity'}))
    # Concatenate once: the previous k-flag pattern re-concatenated the
    # running frame on every iteration (quadratic) and left `final`
    # undefined for an empty sample file.
    final = pd.concat(cleaned)
    final = final.groupby("dt_week")["quantity"].sum().reset_index()
    final = final.set_index("dt_week")
    result = seasonal_decompose(final["quantity"], model="additive")
    return result.seasonal
def aggregate_seasonal_comp():
    """Aggregate quantities in [0, 10] across all series into one weekly
    series and return its additive seasonal component.

    :return: pandas series : seasonal component of the weekly aggregate
    """
    df = load_data()[["date", "quantity"]]
    df = df[(df["quantity"] >= 0) & (df["quantity"] <= 10)]
    aggregate_data = df.groupby("date")["quantity"].sum()
    aggregate_data = aggregate_data.reset_index()
    # get_weekly_aggregate appears to require these identifier/price
    # columns; dummy constants suffice for a single aggregated series
    # (TODO confirm against data_transform.get_weekly_aggregate).
    aggregate_data["kunag"] = 1
    aggregate_data["matnr"] = 2
    aggregate_data["price"] = 3
    aggregate_data = get_weekly_aggregate(aggregate_data)
    aggregate_data["dt_week"] = aggregate_data["dt_week"].apply(
        lambda x: pd.to_datetime(x, format="%Y-%m-%d"))
    aggregate_data = aggregate_data.set_index("dt_week")
    result = seasonal_decompose(aggregate_data["quantity"], model="additive")
    # Bug fix: a stray plt.show() survived after the surrounding plotting
    # code was commented out; it opened a blocking, empty figure window as
    # a side effect of this function.
    return result.seasonal
from stl_decompose import product_seasonal_comp_7_point from sklearn.metrics import mean_squared_error from sklearn.metrics import mean_absolute_error import matplotlib.pyplot as plt import time import os project = "/home/aman/PycharmProjects/seasonality_hypothesis/" folder = "category_7_seasonality" file = "seasonality.csv" folder_address = os.path.join(project, folder) file_address = os.path.join(folder_address, file) # os.mkdir(folder_address) df = load_data() frequency_cleaveland = pd.read_csv( "/home/aman/PycharmProjects/seasonality_hypothesis/data_generated/frequency_days_4200_C005.csv") # bucket_1_sample = frequency_cleaveland[(frequency_cleaveland["frequency"] > 26) & (frequency_cleaveland["days"] > 730)].sample(400, random_state=1) # bucket_1_sample.to_csv( # "/home/aman/PycharmProjects/seasonality_hypothesis/data_generated/bucket_1_sample.csv", index=False) sample = frequency_cleaveland[(frequency_cleaveland["frequency"] >= 26) & (frequency_cleaveland["days"] > 92) & (frequency_cleaveland["days"] <= 365 + 183)] sample = sample["matnr"].unique() # sample.to_csv(folder_address+"/sample.csv") # sample = pd.read_csv(folder_address+"/sample.csv") report = pd.DataFrame() count = 0 # start = time.time() for matnr in sample: if count<321:
from selection import load_data
from selection import remove_negative_rows
import pandas as pd
from preprocess import splitter_2
from hypothesis import arima
from selection import individual_series


def individual_series_2(input_df, kunag=500057582, matnr=103029):
    """Select the weekly-date-indexed series for one (kunag, matnr) pair.

    Negative rows are removed and only dates on/after 20160703 are kept.

    param: input_df: a pandas dataframe of raw transactions
    param: kunag: customer number to filter on
    param: matnr: material number to filter on
    return: a pandas dataframe sorted by and indexed on "dt_week"
    """
    df_copy = input_df.copy()
    df_copy = remove_negative_rows(df_copy)
    df_copy = df_copy[df_copy["date"] >= 20160703]
    # Bug fix: take an explicit copy of the slice so the "dt_week" column
    # assignment below does not hit pandas' SettingWithCopyWarning (writing
    # through a view of df_copy can silently fail).
    output_df = df_copy[(df_copy["kunag"] == kunag)
                        & (df_copy["matnr"] == matnr)].copy()
    output_df["dt_week"] = output_df["date"].apply(
        lambda x: pd.to_datetime(x, format="%Y%m%d"))
    output_df = output_df.sort_values("dt_week")
    output_df = output_df.set_index("dt_week")
    return output_df


if __name__ == "__main__":
    print(individual_series_2(load_data()))
    df_series = individual_series(load_data(), 500057582, 103029)
    train, validation, test = splitter_2(df_series)
    print(arima(train, validation, test)[1])
from selection import load_data
from data_transform import get_weekly_aggregate
import matplotlib.pyplot as plt
import pandas as pd

# Keep only small, non-negative quantities (0..10) from the raw data.
df = load_data()[["date", "quantity"]]
df = df[(df["quantity"] >= 0) & (df["quantity"] <= 10)]

# Print the share of each quantity value in the filtered data.
print(df["quantity"].value_counts() / df.shape[0])

# Daily totals, converted to datetimes, chronologically ordered and
# indexed by date for plotting.
aggregate_data = df.groupby("date")["quantity"].sum().reset_index()
aggregate_data["date"] = aggregate_data["date"].apply(
    lambda day: pd.to_datetime(day, format="%Y%m%d"))
aggregate_data = aggregate_data.sort_values("date").set_index("date")

plt.figure(figsize=(16,8))
plt.plot(aggregate_data["quantity"], label='quantity')
plt.title("aggregated plot")
plt.show()
def overall_aggregate_seas(): """ This function aggregates whole cleaveland data with ma outliers removing different categories series outliers First week has been removed :return: pandas_df : seasonal component of the aggregated data """ df = load_data() overall = pd.read_csv( "~/PycharmProjects/seasonality_hypothesis/data_generated/frequency_days_cleaveland.csv" ) k = 0 for index, row in tqdm(overall.iterrows()): frequency = row["frequency"] days = row["days"] df_series = df[(df["kunag"] == row["kunag"]) & (df["matnr"] == row["matnr"])] df_series = df_series[df_series["quantity"] >= 0] df_series = df_series[df_series["date"] >= 20160703] if frequency == 0: continue df_series = get_weekly_aggregate(df_series) _testing = df_series[["quantity", "dt_week"]].copy() aggregated_data = _testing.rename(columns={ 'dt_week': 'ds', 'quantity': 'y' }) aggregated_data.ds = aggregated_data.ds.apply(str).apply(parser.parse) aggregated_data.y = aggregated_data.y.apply(float) aggregated_data = aggregated_data.sort_values('ds') aggregated_data = aggregated_data.reset_index(drop=True) outlier = True if (frequency >= 26) & (days > 365 + 183): n_pass = 3 window_size = 12 sigma = 4.0 elif (frequency >= 20) & (frequency < 26): n_pass = 3 window_size = 12 sigma = 5.0 elif (frequency >= 26) & (days <= 365 + 183): if len(aggregated_data) >= 26: n_pass = 3 window_size = 12 sigma = 4.0 else: outlier = False elif (frequency >= 12) & (frequency < 20): if len(aggregated_data) >= 26: n_pass = 3 window_size = 24 sigma = 5.0 else: outlier = False elif frequency < 12: outlier = False if outlier: _result = ma_replace_outlier(data=aggregated_data, n_pass=n_pass, aggressive=True, window_size=window_size, sigma=sigma) result = _result[0].rename(columns={ 'ds': 'dt_week', 'y': 'quantity' }) else: result = aggregated_data.rename(columns={ 'ds': 'dt_week', 'y': 'quantity' }) if k == 1: final = pd.concat([final, result]) if k == 0: final = result k = 1 final = 
final.groupby("dt_week")["quantity"].sum().reset_index() final = final.set_index("dt_week") final.to_csv( "~/PycharmProjects/seasonality_hypothesis/data_generated/aggregated_complete_outliers_removed.csv" ) result = seasonal_decompose(final["quantity"], model="additive") result.seasonal.to_csv( "~/PycharmProjects/seasonality_hypothesis/data_generated/aggregated_complete_outliers_removed_seas.csv" ) return result.seasonal