def outlier(df_series, category):
    """Clean one weekly series with category-specific moving-average outlier removal.

    Categories map to (window_size, sigma) presets; category 7 additionally
    requires at least 26 observations.  Unknown categories (e.g. 6) are
    returned untouched after the usual ds/y normalisation round-trip.

    :param df_series: frame with 'quantity' and 'dt_week' columns
    :param category: integer series category selecting the outlier preset
    :return: frame with 'dt_week'/'quantity' columns, outliers replaced when a
        preset applied
    """
    # (window_size, sigma) preset, or None when no outlier pass should run.
    preset = None
    if category in (1, 2, 3):
        preset = (12, 3)
    elif category in (4, 5, 8):
        preset = (12, 5)
    elif category == 7 and df_series.shape[0] >= 26:
        preset = (12, 4)
    elif category == 9:
        preset = (24, 5)
    # Normalise to the ds/y layout expected by ma_replace_outlier.
    prepared = df_series[["quantity", "dt_week"]].copy().rename(
        columns={'dt_week': 'ds', 'quantity': 'y'})
    prepared.ds = prepared.ds.apply(str).apply(parser.parse)
    prepared.y = prepared.y.apply(float)
    prepared = prepared.sort_values('ds').reset_index(drop=True)
    if preset is None:
        # No preset matched: hand back the normalised series unchanged.
        return prepared.rename(columns={'ds': 'dt_week', 'y': 'quantity'})
    window_size, sigma = preset
    cleaned = ma_replace_outlier(data=prepared, n_pass=3, aggressive=True,
                                 window_size=window_size, sigma=sigma)
    return cleaned[0].rename(columns={'ds': 'dt_week', 'y': 'quantity'})
def samples_aggregate_seas():
    """Aggregate the bucket-1 sample series and return the seasonal component.

    Each (kunag, matnr) series listed in the sample CSV is weekly-aggregated,
    outlier-cleaned (window 12, sigma 3.0), summed across customers per week,
    and decomposed additively.

    :return: seasonal component of the aggregated weekly series
    """
    df = load_data()
    sample = pd.read_csv(
        "/home/aman/PycharmProjects/seasonality_hypothesis/data_generated/bucket_1_sample.csv"
    )
    combined = None  # running concatenation of all cleaned series
    for _, row in sample.iterrows():
        series = df[(df["kunag"] == row["kunag"]) & (df["matnr"] == row["matnr"])]
        series = series[series["quantity"] >= 0]
        series = series[series["date"] >= 20160703]  # drop the first week
        series = get_weekly_aggregate(series)
        # Normalise to the ds/y layout expected by ma_replace_outlier.
        prepared = series[["quantity", "dt_week"]].copy().rename(
            columns={'dt_week': 'ds', 'quantity': 'y'})
        prepared.ds = prepared.ds.apply(str).apply(parser.parse)
        prepared.y = prepared.y.apply(float)
        prepared = prepared.sort_values('ds').reset_index(drop=True)
        cleaned = ma_replace_outlier(data=prepared, n_pass=3, aggressive=True,
                                     window_size=12, sigma=3.0)
        cleaned = cleaned[0].rename(columns={'ds': 'dt_week', 'y': 'quantity'})
        combined = cleaned if combined is None else pd.concat([combined, cleaned])
    weekly = combined.groupby("dt_week")["quantity"].sum().reset_index()
    weekly = weekly.set_index("dt_week")
    decomposition = seasonal_decompose(weekly["quantity"], model="additive")
    return decomposition.seasonal
def outlier_material(df_series):
    """Run the default MA outlier pass (window 12, sigma 3) on one weekly series.

    :param df_series: frame with 'quantity' and 'dt_week' columns
    :return: frame with 'dt_week'/'quantity' columns, outliers replaced
    """
    # Normalise to the ds/y layout expected by ma_replace_outlier.
    prepared = df_series[["quantity", "dt_week"]].copy().rename(
        columns={'dt_week': 'ds', 'quantity': 'y'})
    prepared.ds = prepared.ds.apply(str).apply(parser.parse)
    prepared.y = prepared.y.apply(float)
    prepared = prepared.sort_values('ds').reset_index(drop=True)
    cleaned = ma_replace_outlier(data=prepared, n_pass=3, aggressive=True,
                                 window_size=12, sigma=3)
    return cleaned[0].rename(columns={'ds': 'dt_week', 'y': 'quantity'})
def outlier_on_aggregated(aggregated_df):
    """Run the MA outlier pass (3 passes, window 12, sigma 3.0) on an aggregated series.

    :param aggregated_df: frame with 'quantity' and 'dt_week' columns
    :return: frame with 'dt_week'/'quantity' columns, outliers replaced
    """
    # Normalise to the ds/y layout expected by ma_replace_outlier.
    prepared = aggregated_df[["quantity", "dt_week"]].copy().rename(
        columns={'dt_week': 'ds', 'quantity': 'y'})
    prepared.ds = prepared.ds.apply(str).apply(parser.parse)
    prepared.y = prepared.y.apply(float)
    prepared = prepared.sort_values('ds').reset_index(drop=True)
    cleaned = ma_replace_outlier(data=prepared, n_pass=3, aggressive=True,
                                 window_size=12, sigma=3.0)
    return cleaned[0].rename(columns={'ds': 'dt_week', 'y': 'quantity'})
def dickey_fuller_test(input_df, matnr=112260):
    """Print an Augmented Dickey-Fuller stationarity report for one material.

    Every (kunag, matnr) series for ``matnr`` is weekly-aggregated,
    outlier-cleaned with frequency/span-dependent settings, summed into one
    weekly series, detrended, monthly-aggregated, and tested with ``adfuller``.
    The first week of data (dates before 20160703) is dropped.

    :param input_df: raw transactions with kunag/matnr/quantity/date columns
    :param matnr: material number to analyse
    :return: None — results are printed to stdout
    """
    df = input_df.copy()
    df = df[df["matnr"] == matnr]
    # Per-series frequency/span statistics drive the outlier parameter choice.
    overall = pd.read_csv(
        "~/PycharmProjects/seasonality_hypothesis/data_generated/frequency_days_cleaveland.csv"
    )
    overall = overall[overall["matnr"] == matnr]
    product = pd.read_csv(
        "~/PycharmProjects/seasonality_hypothesis/data/material_list.tsv", sep="\t")
    # NOTE(review): product_name is computed but unused here (the savefig that
    # consumed it is disabled).
    product_name = product[product["matnr"] == str(
        matnr)]["description"].values[0]
    k = 0  # 0 until the first cleaned series has been stored in `final`
    for index, row in tqdm(overall.iterrows()):
        frequency = row["frequency"]
        days = row["days"]
        df_series = df[(df["kunag"] == row["kunag"]) & (df["matnr"] == row["matnr"])]
        df_series = df_series[df_series["quantity"] >= 0]
        df_series = df_series[df_series["date"] >= 20160703]
        if frequency == 0:
            continue
        df_series = get_weekly_aggregate(df_series)
        # Normalise to the ds/y layout expected by ma_replace_outlier.
        _testing = df_series[["quantity", "dt_week"]].copy()
        aggregated_data = _testing.rename(columns={
            'dt_week': 'ds',
            'quantity': 'y'
        })
        aggregated_data.ds = aggregated_data.ds.apply(str).apply(parser.parse)
        aggregated_data.y = aggregated_data.y.apply(float)
        aggregated_data = aggregated_data.sort_values('ds')
        aggregated_data = aggregated_data.reset_index(drop=True)
        # Pick outlier settings from sales frequency and observed span.
        # NOTE(review): this local `outlier` flag shadows the module-level
        # outlier() function within this body.
        outlier = True
        if (frequency >= 26) & (days > 365 + 183):
            n_pass = 3
            window_size = 12
            sigma = 4.0
        elif (frequency >= 20) & (frequency < 26):
            n_pass = 3
            window_size = 12
            sigma = 5.0
        elif (frequency >= 26) & (days <= 365 + 183):
            if len(aggregated_data) >= 26:
                n_pass = 3
                window_size = 12
                sigma = 4.0
            else:
                outlier = False
        elif (frequency >= 12) & (frequency < 20):
            if len(aggregated_data) >= 26:
                n_pass = 3
                window_size = 24
                sigma = 5.0
            else:
                outlier = False
        elif frequency < 12:
            outlier = False
        if outlier:
            _result = ma_replace_outlier(data=aggregated_data, n_pass=n_pass,
                                         aggressive=True, window_size=window_size,
                                         sigma=sigma)
            result = _result[0].rename(columns={
                'ds': 'dt_week',
                'y': 'quantity'
            })
        else:
            result = aggregated_data.rename(columns={
                'ds': 'dt_week',
                'y': 'quantity'
            })
        # Accumulate cleaned series; k flags whether `final` exists yet.
        if k == 1:
            final = pd.concat([final, result])
        if k == 0:
            final = result
            k = 1
    # Sum across customers into one weekly series.
    # NOTE(review): if every row was skipped above, `final` is unbound here
    # and this raises NameError.
    final = final.groupby("dt_week")["quantity"].sum().reset_index()
    final = final.set_index("dt_week")
    # cond_check presumably validates minimum length (see message below) —
    # TODO confirm against its definition.
    final, Flag = cond_check(final)
    if Flag:
        final_detrended = detrend(final)
        final_aggregate = monthly_aggregate(final_detrended)
        result = adfuller(final_aggregate["quantity"], maxlag=10, autolag='t-stat')
        print('ADF Statistic: %f' % result[0])
        print('p-value: %f' % result[1])
        print("No of lags used: %f" % result[2])
        print("No of observations: %f" % result[3])
        print('Critical Values:')
        for key, value in result[4].items():
            print('\t%s: %.3f' % (key, value))
        # p >= 0.05: cannot reject the unit-root null hypothesis.
        if result[1] >= 0.05:
            print("Not stationary")
        else:
            print("Stationary")
    else:
        print("length of series is less than 112")
def overall_aggregate_seas(input_df, matnr=103029):
    """Aggregate all customer series of one material and plot its decomposition.

    Per-customer weekly series are outlier-cleaned with frequency/span-dependent
    settings, summed per week, additively decomposed, and the decomposition
    plot is saved to disk.  The first week (dates before 20160703) is dropped.

    NOTE(review): a zero-argument function with the same name defined later in
    this module shadows this definition at import time — rename one of them.

    :param input_df: raw transactions with kunag/matnr/quantity/date columns
    :param matnr: material number to analyse
    :return: seasonal component of the aggregated series, or None when no
        series qualified (so nothing could be aggregated)
    """
    df = input_df.copy()
    df = df[df["matnr"] == matnr]
    overall = pd.read_csv(
        "~/PycharmProjects/seasonality_hypothesis/data_generated/frequency_days_4200_C025.csv")
    overall = overall[overall["matnr"] == matnr]
    product = pd.read_csv(
        "~/PycharmProjects/seasonality_hypothesis/data/material_list.tsv", sep="\t")
    product_name = product[product["matnr"] == str(matnr)]["description"].values[0]
    k = 0  # 0 until the first cleaned series has been stored in `final`
    for index, row in overall.iterrows():
        frequency = row["frequency"]
        days = row["days"]
        df_series = df[(df["kunag"] == row["kunag"]) & (df["matnr"] == row["matnr"])]
        df_series = df_series[df_series["quantity"] >= 0]
        df_series = df_series[df_series["date"] >= 20160703]
        if int(frequency) == 0:
            continue
        df_series = get_weekly_aggregate(df_series)
        # Normalise to the ds/y layout expected by ma_replace_outlier.
        _testing = df_series[["quantity", "dt_week"]].copy()
        aggregated_data = _testing.rename(columns={'dt_week': 'ds', 'quantity': 'y'})
        aggregated_data.ds = aggregated_data.ds.apply(str).apply(parser.parse)
        aggregated_data.y = aggregated_data.y.apply(float)
        aggregated_data = aggregated_data.sort_values('ds')
        aggregated_data = aggregated_data.reset_index(drop=True)
        # Pick outlier settings from sales frequency and observed span.
        outlier = True
        if (frequency >= 26) & (days > 365 + 183):
            n_pass = 3
            window_size = 12
            sigma = 4.0
        elif (frequency >= 20) & (frequency < 26):
            n_pass = 3
            window_size = 12
            sigma = 5.0
        elif (frequency >= 26) & (days <= 365 + 183):
            if len(aggregated_data) >= 26:
                n_pass = 3
                window_size = 12
                sigma = 4.0
            else:
                outlier = False
        elif (frequency >= 12) & (frequency < 20):
            if len(aggregated_data) >= 26:
                n_pass = 3
                window_size = 24
                sigma = 5.0
            else:
                outlier = False
        elif frequency < 12:
            outlier = False
        if outlier:
            _result = ma_replace_outlier(data=aggregated_data, n_pass=n_pass,
                                         aggressive=True, window_size=window_size,
                                         sigma=sigma)
            result = _result[0].rename(columns={'ds': 'dt_week', 'y': 'quantity'})
        else:
            result = aggregated_data.rename(columns={'ds': 'dt_week', 'y': 'quantity'})
        if k == 1:
            final = pd.concat([final, result])
        if k == 0:
            final = result
            k = 1
    # If every row was skipped, `final` is unbound and groupby raises
    # NameError; keep the best-effort None return but narrow the previously
    # bare `except:` so SystemExit/KeyboardInterrupt are no longer swallowed.
    try:
        final = final.groupby("dt_week")["quantity"].sum().reset_index()
    except Exception:
        return None
    final = final.set_index("dt_week")
    print(final["quantity"])
    print(final["quantity"].shape)
    result = seasonal_decompose(final["quantity"], model="additive")
    result.plot()
    #plt.show()
    plt.savefig(
        "/home/aman/PycharmProjects/seasonality_hypothesis/plots_product_C0025/"+str(matnr)+"_"+product_name+".png")
    return result.seasonal
def ljung_box_test(input_df, matnr=112260):
    """Test one material's aggregated weekly series for seasonality (Ljung-Box).

    Per-customer weekly series are outlier-cleaned, summed per week, cleaned
    again as an aggregate, then (after a missing-data and length check)
    detrended, monthly-aggregated, and tested with ``acorr_ljungbox`` at lag 13
    plus an ADF test.  The first week (dates before 20160703) is dropped.

    :param input_df: raw transactions with kunag/matnr/quantity/date columns
    :param matnr: material number to analyse
    :return: (is_seasonal, ljung_box_p, adf_p, final, final_temp) tuple.
        NOTE(review): return types are inconsistent — the short-series branch
        returns a *list* and the missing-data branch returns sentinel values
        (1, 0) instead of p-values; callers must handle all shapes.
    """
    df = input_df.copy()
    df = df[df["matnr"] == matnr]
    overall = pd.read_csv(
        "~/PycharmProjects/seasonality_hypothesis/data_generated/frequency_days_4200_C005.csv")
    overall = overall[overall["matnr"] == matnr]
    k = 0  # 0 until the first cleaned series has been stored in `final`
    for index, row in overall.iterrows():
        frequency = row["frequency"]
        days = row["days"]
        df_series = df[(df["kunag"] == row["kunag"]) & (df["matnr"] == row["matnr"])]
        df_series = df_series[df_series["quantity"] >= 0]
        df_series = df_series[df_series["date"] >= 20160703]
        if frequency == 0:
            continue
        df_series = get_weekly_aggregate(df_series)
        # Normalise to the ds/y layout expected by ma_replace_outlier.
        _testing = df_series[["quantity", "dt_week"]].copy()
        aggregated_data = _testing.rename(columns={'dt_week': 'ds', 'quantity': 'y'})
        aggregated_data.ds = aggregated_data.ds.apply(str).apply(parser.parse)
        aggregated_data.y = aggregated_data.y.apply(float)
        aggregated_data = aggregated_data.sort_values('ds')
        aggregated_data = aggregated_data.reset_index(drop=True)
        # Pick outlier settings from sales frequency and observed span.
        # NOTE(review): local flag shadows the module-level outlier() function.
        outlier = True
        if (frequency >= 26) & (days > 365 + 183):
            n_pass = 3
            window_size = 12
            sigma = 4.0
        elif (frequency >= 20) & (frequency < 26):
            n_pass = 3
            window_size = 12
            sigma = 5.0
        elif (frequency >= 26) & (days <= 365 + 183):
            if len(aggregated_data) >= 26:
                n_pass = 3
                window_size = 12
                sigma = 4.0
            else:
                outlier = False
        elif (frequency >= 12) & (frequency < 20):
            if len(aggregated_data) >= 26:
                n_pass = 3
                window_size = 24
                sigma = 5.0
            else:
                outlier = False
        elif frequency < 12:
            outlier = False
        if outlier:
            _result = ma_replace_outlier(data=aggregated_data, n_pass=n_pass,
                                         aggressive=True, window_size=window_size,
                                         sigma=sigma)
            result = _result[0].rename(columns={'ds': 'dt_week', 'y': 'quantity'})
        else:
            result = aggregated_data.rename(columns={'ds': 'dt_week', 'y': 'quantity'})
        if k == 1:
            final = pd.concat([final, result])
        if k == 0:
            final = result
            k = 1
    # Sum across customers, then run a second outlier pass on the aggregate.
    final = final.groupby("dt_week")["quantity"].sum().reset_index()
    final = outlier_on_aggregated(final)
    final_temp = final  # keep the pre-indexed copy for the caller
    plt.figure(figsize=(16, 8))
    plt.plot(final.set_index("dt_week"), marker=".", markerfacecolor="red", label="y")
    plt.show()
    final = final.set_index("dt_week")
    # missing_data_detection presumably flags gaps longer than ~6 months —
    # TODO confirm against its definition.
    missing_more_24 = missing_data_detection(final)
    if missing_more_24:
        # Sentinel p-values: 1 (not seasonal) and 0 for the ADF slot.
        return False, 1, 0, final, final_temp
    final, Flag = cond_check(final)
    if Flag:
        final_detrended = detrend(final)
        final_aggregate = monthly_aggregate(final_detrended)
        # NOTE(review): with lags=[13] this indexing assumes the tuple/array
        # return of older statsmodels; newer versions return a DataFrame —
        # confirm the installed statsmodels version.
        result = acorr_ljungbox(final_aggregate["quantity"], lags=[13])
        result_dickey = adfuller(final_aggregate["quantity"])
        if result[1] < 0.02:
            return True, result[1][0], result_dickey[1], final, final_temp
        else:
            return False, result[1][0], result_dickey[1], final, final_temp
    else:
        print("length of series is less than 112")
        return [False, "length is small", 0, final, final_temp]
def ljung_box_test_without_aggregation(input_df, matnr=112260):
    """Ljung-Box seasonality test on the weekly aggregate, without monthly aggregation.

    Same per-customer cleaning and weekly summation as ``ljung_box_test``, but
    the test runs directly on the weekly series at lag 52 (one year) and no
    detrending/monthly step is applied.  The first week (dates before
    20160703) is dropped.

    :param input_df: raw transactions with kunag/matnr/quantity/date columns
    :param matnr: material number to analyse
    :return: (is_seasonal, p_value, final) where is_seasonal uses p < 0.01
    """
    df = input_df.copy()
    df = df[df["matnr"] == matnr]
    overall = pd.read_csv(
        "~/PycharmProjects/seasonality_hypothesis/data_generated/frequency_days_4200_C005.csv")
    overall = overall[overall["matnr"] == matnr]
    k = 0  # 0 until the first cleaned series has been stored in `final`
    for index, row in overall.iterrows():
        frequency = row["frequency"]
        days = row["days"]
        df_series = df[(df["kunag"] == row["kunag"]) & (df["matnr"] == row["matnr"])]
        df_series = df_series[df_series["quantity"] >= 0]
        df_series = df_series[df_series["date"] >= 20160703]
        if frequency == 0:
            continue
        df_series = get_weekly_aggregate(df_series)
        # Normalise to the ds/y layout expected by ma_replace_outlier.
        _testing = df_series[["quantity", "dt_week"]].copy()
        aggregated_data = _testing.rename(columns={'dt_week': 'ds', 'quantity': 'y'})
        aggregated_data.ds = aggregated_data.ds.apply(str).apply(parser.parse)
        aggregated_data.y = aggregated_data.y.apply(float)
        aggregated_data = aggregated_data.sort_values('ds')
        aggregated_data = aggregated_data.reset_index(drop=True)
        # Pick outlier settings from sales frequency and observed span.
        outlier = True
        if (frequency >= 26) & (days > 365 + 183):
            n_pass = 3
            window_size = 12
            sigma = 4.0
        elif (frequency >= 20) & (frequency < 26):
            n_pass = 3
            window_size = 12
            sigma = 5.0
        elif (frequency >= 26) & (days <= 365 + 183):
            if len(aggregated_data) >= 26:
                n_pass = 3
                window_size = 12
                sigma = 4.0
            else:
                outlier = False
        elif (frequency >= 12) & (frequency < 20):
            if len(aggregated_data) >= 26:
                n_pass = 3
                window_size = 24
                sigma = 5.0
            else:
                outlier = False
        elif frequency < 12:
            outlier = False
        if outlier:
            _result = ma_replace_outlier(data=aggregated_data, n_pass=n_pass,
                                         aggressive=True, window_size=window_size,
                                         sigma=sigma)
            result = _result[0].rename(columns={'ds': 'dt_week', 'y': 'quantity'})
        else:
            result = aggregated_data.rename(columns={'ds': 'dt_week', 'y': 'quantity'})
        if k == 1:
            final = pd.concat([final, result])
        if k == 0:
            final = result
            k = 1
    # Sum across customers into one weekly series.
    # NOTE(review): if every row was skipped, `final` is unbound → NameError.
    final = final.groupby("dt_week")["quantity"].sum().reset_index()
    final = final.set_index("dt_week")
    # Lag 52 ≈ one year of weekly observations.
    result = acorr_ljungbox(final["quantity"], lags=[52])
    if result[1] < 0.01:
        return True, result[1], final
    else:
        return False, result[1], final
def overall_aggregate_seas():
    """Aggregate the entire cleaveland dataset and persist its seasonal component.

    Every (kunag, matnr) series is weekly-aggregated, outlier-cleaned with
    frequency/span-dependent settings, summed per week, written to CSV,
    additively decomposed, and the seasonal component is also written to CSV.
    The first week (dates before 20160703) is dropped.

    NOTE(review): this zero-argument definition shadows the earlier
    two-argument ``overall_aggregate_seas(input_df, matnr)`` in this module —
    rename one of them.

    :return: seasonal component of the aggregated weekly series
    """
    df = load_data()
    overall = pd.read_csv(
        "~/PycharmProjects/seasonality_hypothesis/data_generated/frequency_days_cleaveland.csv"
    )
    k = 0  # 0 until the first cleaned series has been stored in `final`
    for index, row in tqdm(overall.iterrows()):
        frequency = row["frequency"]
        days = row["days"]
        df_series = df[(df["kunag"] == row["kunag"]) & (df["matnr"] == row["matnr"])]
        df_series = df_series[df_series["quantity"] >= 0]
        df_series = df_series[df_series["date"] >= 20160703]
        if frequency == 0:
            continue
        df_series = get_weekly_aggregate(df_series)
        # Normalise to the ds/y layout expected by ma_replace_outlier.
        _testing = df_series[["quantity", "dt_week"]].copy()
        aggregated_data = _testing.rename(columns={
            'dt_week': 'ds',
            'quantity': 'y'
        })
        aggregated_data.ds = aggregated_data.ds.apply(str).apply(parser.parse)
        aggregated_data.y = aggregated_data.y.apply(float)
        aggregated_data = aggregated_data.sort_values('ds')
        aggregated_data = aggregated_data.reset_index(drop=True)
        # Pick outlier settings from sales frequency and observed span.
        outlier = True
        if (frequency >= 26) & (days > 365 + 183):
            n_pass = 3
            window_size = 12
            sigma = 4.0
        elif (frequency >= 20) & (frequency < 26):
            n_pass = 3
            window_size = 12
            sigma = 5.0
        elif (frequency >= 26) & (days <= 365 + 183):
            if len(aggregated_data) >= 26:
                n_pass = 3
                window_size = 12
                sigma = 4.0
            else:
                outlier = False
        elif (frequency >= 12) & (frequency < 20):
            if len(aggregated_data) >= 26:
                n_pass = 3
                window_size = 24
                sigma = 5.0
            else:
                outlier = False
        elif frequency < 12:
            outlier = False
        if outlier:
            _result = ma_replace_outlier(data=aggregated_data, n_pass=n_pass,
                                         aggressive=True, window_size=window_size,
                                         sigma=sigma)
            result = _result[0].rename(columns={
                'ds': 'dt_week',
                'y': 'quantity'
            })
        else:
            result = aggregated_data.rename(columns={
                'ds': 'dt_week',
                'y': 'quantity'
            })
        if k == 1:
            final = pd.concat([final, result])
        if k == 0:
            final = result
            k = 1
    # Sum across customers and persist both the aggregate and its seasonality.
    final = final.groupby("dt_week")["quantity"].sum().reset_index()
    final = final.set_index("dt_week")
    final.to_csv(
        "~/PycharmProjects/seasonality_hypothesis/data_generated/aggregated_complete_outliers_removed.csv"
    )
    result = seasonal_decompose(final["quantity"], model="additive")
    result.seasonal.to_csv(
        "~/PycharmProjects/seasonality_hypothesis/data_generated/aggregated_complete_outliers_removed_seas.csv"
    )
    return result.seasonal
"/home/aman/PycharmProjects/seasonality_hypothesis/115584.csv") y = data.quantity from outlier import ma_replace_outlier from dateutil import parser aggregated_df = data print(aggregated_df) _testing = aggregated_df[["quantity", "dt_week"]].copy() aggregated_data = _testing.rename(columns={'dt_week': 'ds', 'quantity': 'y'}) aggregated_data.ds = aggregated_data.ds.apply(str).apply(parser.parse) aggregated_data.y = aggregated_data.y.apply(float) aggregated_data = aggregated_data.sort_values('ds') aggregated_data = aggregated_data.reset_index(drop=True) n_pass = 3 window_size = 12 sigma = 3.0 _result = ma_replace_outlier(data=aggregated_data, n_pass=n_pass, aggressive=True, window_size=window_size, sigma=sigma) result = _result[0].rename(columns={'ds': 'dt_week', 'y': 'quantity'}) print(result.iloc[31:34]) # import numpy as np # a = np.ones(10) # # a[7] = 2 # print(a) # print(np.where(a == 2))