def save_predictions(self, header="NumberOfSales", csv="final_sales_predictions.csv"):
    """Wrap ``self.final_predictions`` in a one-column DataFrame, persist it
    to *csv* via the dataset helper, and return the frame."""
    out_frame = pd.DataFrame()
    out_frame[header] = pd.Series(self.final_predictions)
    ds.save_dataset(out_frame, csv)
    return out_frame
def full_prep_test_ds_to_cust_pred():
    """Build and persist the test dataset used for customer prediction.

    Loads the imputed one-hot test set and the imputed one-hot train set,
    derives each train row's weekday name from its 'Date' column, runs
    ``prepare_ds_to_customer_prediction`` and saves the result to CSV.
    """
    test_datas = d.read_imputed_onehot_test_dataset()
    data_from = d.read_imputed_onehot_dataset()
    data_from['Date'] = p.to_datetime(data_from['Date'], format='%d/%m/%Y')
    # .dt.weekday_name was removed in pandas 0.25; .dt.day_name() yields the
    # same 'Monday'..'Sunday' strings on every supported pandas version.
    data_from['Day'] = data_from['Date'].dt.day_name()
    datas = prepare_ds_to_customer_prediction(test_datas, data_from)
    d.save_dataset(datas, "test_dataset_for_customers_prediction.csv")
def build_sales_predictor_test_dataset(name):
    """Prepare the sales-only test dataset, aligned with the sales-only
    train set, and save it under *name*."""
    train_frame = d.read_dataset("final_sales_only_train.csv")
    test_frame = d.read_imputed_onehot_test_dataset()
    test_frame = __prepare_sales_only_test_ds(test_frame, train_frame)
    d.save_dataset(test_frame, name)
def select_features(name, featlist, fname):
    """Load dataset *name*, keep only the columns listed in *featlist*,
    and save the projection as *fname*."""
    frame = d.read_dataset(name)[featlist]
    d.save_dataset(frame, fname)
if not new_cols.__contains__(s): new_cols.append(s) else: new_cols = vals for new in new_cols: ds[header + new] = p.Series(np.zeros(len(ds)), ds.index) for i in ds.index.tolist(): if d.content_of(ds, attr, i).find(new) != -1: ds.set_value(i, header + new, 1) return ds def one_hot_numeric(ds, attr, header): """Transforms the given attribute of the given DataFrame object into one hot encoding. If you plan to use this, don't use split attribute. Returns a DataFrame object.""" vals = d.values_of(ds, attr) new_cols = vals for new in new_cols: ds[header + str(new)] = p.Series(np.zeros(len(ds)), ds.index) for i in ds.index.tolist(): if d.content_of(ds, attr, i) == new: ds.set_value(i, header + str(new), 1) return ds if __name__ == '__main__': ds = d.read_test_dataset() ds = full_preprocess(ds) d.save_dataset(ds, "imputed_test_ds_one_hot.csv")
# NOTE(review): script fragment — `preds`, `regions`, `ids`, `dates` and the
# module aliases (eva, d, preu, pd, np) are defined above this chunk.
# Passing `preds` twice makes real == predicted; presumably deliberate to
# reuse the per-region aggregation for submission building — verify.
error, totp, totr, shops, months = eva.region_error(preds, preds, regions, ids, dates, True)
print(error)
print(totp)
print(shops)
print(months)
sh, mo, sa = gen_pandas_cols()
print(len(shops))
# np.int was removed in NumPy 1.24 — the builtin int is the documented spelling
# and yields the same default integer dtype.
sa = np.array(sa, int)
print(sa.min())
final_sub = pd.DataFrame()
final_sub['StoreID'] = pd.Series(sh)
final_sub['Month'] = pd.Series(mo)
final_sub['NumberOfSales'] = pd.Series(sa)
print(final_sub)
d.save_dataset(final_sub, "final_sub_new1.csv")
print(d.to_numpy(final_sub['NumberOfSales']).sum())
# ADDING MORE STATISTICS
trainset = "final_for_sales_train.csv"
trainds = d.read_dataset(trainset)
final_sub_stats = preu.mean_sales_per_month_per_shop(final_sub, trainds)
final_sub_stats['Ratio'] = final_sub_stats['NumberOfSales'] / final_sub_stats['MeanSalesPerShopPerMonth']
print(final_sub_stats)
# NOTE(review): chunk begins mid-function — the `def` owning these first lines
# is above this view; their indentation is inferred.
    preds += p
    preds[preds < 0] = 0  # clamp: negative sales predictions make no sense
    for i in range(min(len(preds), number_print)):
        print("PRED: ", preds[i], " y: ", y[i])
    print("R2: ", eva.r2(ds, preds, 'NumberOfSales'))


if __name__ == '__main__':
    # Training driver for the sales NN ensemble.
    TRAIN = True    # fit the models (vs. only loading/predicting)
    LOAD = False    # load saved models instead of building fresh nets
    SAVE_DF = False  # persist the held-out frame used for prediction
    name = "test"
    ds = d.read_imputed_onehot_dataset()
    ds = prepare_ds(ds)
    d.save_dataset(ds, "fully_preprocessed_ds.csv")
    # train on Mar-2016..Dec-2017, hold out Jan/Feb-2018
    ds_train = utils.get_frame_in_range(ds, 3, 2016, 12, 2017)
    ds_test = utils.get_frame_in_range(ds, 1, 2018, 2, 2018)
    y = prepare_out(ds_train)
    real_y = np.array(y)
    dy = np.zeros(y.shape)
    x = drop_useless(ds_train)
    y_test = prepare_out(ds_test)
    if SAVE_DF:
        d.save_dataset(ds_test, "dataset_to_predict_sales.csv")
    x_test = drop_useless(ds_test)
    models = []
    for i in range(number_of_model):
        if not LOAD:
            # second arg `i == 0` presumably toggles a one-time verbose/summary
            # flag in the model builder — TODO confirm
            models.append(m.nonsequentialNNDropout(x.shape[1], i == 0))
            # NOTE(review): chunk ends here, mid-loop — the `else`/training
            # branches continue outside this view.
def reorder_datas_cols(name, attrlist, nameout):
    """Rewrite dataset *name* with its columns ordered as in *attrlist*,
    saving the result to *nameout*."""
    frame = reorder_attributes(d.read_dataset(name), attrlist)
    d.save_dataset(frame, nameout)
# NOTE(review): chunk starts mid-expression — `autoexclude=True)` closes a
# sb.SetBuilder(...) call whose opening is above this view.
                 autoexclude=True).exclude('NumberOfSales', 'Month').build()
#data = sb.SetBuilder(target='NumberOfSales').exclude('Day').build()
# Fit a 2-hidden-layer MLP regressor on the train split and write the
# Jan/Feb customer predictions to CSV.
nn = neural_network.MLPRegressor(hidden_layer_sizes=(100, 5), activation='relu',
                                 solver='adam', batch_size='auto',
                                 learning_rate='adaptive', learning_rate_init=0.001,
                                 max_iter=50, shuffle=True, random_state=9,
                                 tol=0.000001, verbose=True, warm_start=False,
                                 momentum=0.9, nesterovs_momentum=True,
                                 early_stopping=False, validation_fraction=0.1,
                                 beta_1=0.9, beta_2=0.999, epsilon=1e-08)
n = nn.fit(data.xtr, data.ytr.ravel())
ypred = nn.predict(data.xts)
ds.save_dataset(pd.DataFrame(ypred), 'customer_pred_jan_feb_NN.csv')
print('R2 = %s' % eval.evaluate(data.yts, ypred))
# NOTE(review): script fragment — `yy`, `data`, `models`, `it`, `regions`,
# `ids`, `dates` come from code above this view.
yy = np.array(yy)
pred = yy.mean(axis=0)  # bagging: average the per-model predictions
print('Bagging R2 = %s' % eval.evaluate(data.ytr, pred))
re, totr, totp = ev_cust.region_error(data.ytr, pred, regions, ids, dates)
diff = totr - totp  # per-region real minus predicted totals
print("REG_ERR: ", re * 100)
print("REG_MEAN_ERR: ", re.mean() * 100)
print("REAL_SUM: ", totr.sum())
print("PRED_SUM: ", totp.sum())
print("SUM_OF_DIFFS: ", diff.sum())
# Re-run the bagged ensemble on the final test frame and save predictions.
data_t = d.read_dataset("final_sales_only_test_r.csv")
data_t = sb.SetBuilder(target='NumberOfSales', autoexclude=True, df=data_t.copy(),
                       split=(3, 2016, 2, 2018, 3, 2018, 4, 2018))\
    .exclude_list(excluded_feats())\
    .build()
yy = []
for i in range(it):
    y = models[i].predict(data_t.xts)
    yy.append(y)
pred = np.array(yy).mean(axis=0)
tosave = pandas.DataFrame()
tosave['NumberOfSales'] = pandas.Series(pred)
d.save_dataset(tosave, "sales_only_final_pred_nn.csv")
if __name__ == '__main__':
    # Train on Mar-2016..Dec-2016 plus Mar-2017..Feb-2018; test on Jan/Feb-2017.
    split = [[(3, 2016, 12, 2016), (3, 2017, 2, 2018)], [(1, 2017, 2, 2017)]]
    datas = build_cust_predictor_train_dataset(1, 2017, 2, 2017)
    datas = sb.SetBuilder(target='NumberOfCustomers', autoexclude=True, df=datas, split=split)\
        .exclude("Month", "NumberOfSales")\
        .build()
    n = 10
    mods = []
    # Bag n models, each fitted on a fresh random resampling of the train set.
    for round_idx in range(n):
        print(round_idx + 1)
        x, y = datas.random_sampling(1.0)
        mod = skc.LinearSklearn(1, model)
        mod.train(x, y)
        mods.append(mod)
        train_pred = mod.predict(datas.xtr).squeeze()
        print("TRAIN R2: ", eval.r2_score(datas.ytr, train_pred))
        print("TEST R2: ", eval.r2_score(datas.yts, mod.predict(datas.xts)))
        print("##########################")
    # Ensemble prediction: mean of all bagged models on the test split.
    preds = [bagged.predict(datas.xts) for bagged in mods]
    custpred = np.array(preds).mean(axis=0)
    print("TEST R2: ", eval.r2_score(datas.yts, custpred))
    new = pandas.DataFrame()
    new['NumberOfCustomers'] = pandas.Series(custpred)
    ds.save_dataset(new, "genfeb17.csv")
# NOTE(review): chunk starts mid-loop — `datas`, `x`, `y`, `models`, `sum`,
# `test`, `types`, `i`, `d_reg`, `d_reg_t`, `model` are defined above this
# view, so the leading indentation is inferred.  Also note `sum` shadows the
# builtin of the same name.
    y_t = prepare_out(d_reg_t)
    x_t = drop_useless(d_reg_t, 1)
    mod = skc.LinearSklearn(1, model)
    mod.train(x, y)
    # presumably `i` iterates the same region keys used as `models[val]`
    # below — TODO confirm
    models[i] = mod
    p = mod.predict(x).squeeze()
    pt = mod.predict(x_t).squeeze()
    r2_t = eval.r2_score(y_t, pt)
    # weight each region's R2 by its total row count for the running average
    sum += r2_t * (len(d_reg) + len(d_reg_t))
    print("TRAIN R2: ", eval.r2_score(y, p))
    print("TEST R2: ", r2_t)
    print("##########################")
# NOTE(review): the accumulated weights are row counts but the divisor is the
# number of datasets — the normalization looks inconsistent; verify intended.
print("AVG TEST R2: ", sum / len(datas))
custpred = []
for i in test.index.tolist():
    row = test.loc[i]
    val = ""
    for t in types:
        if row[t] == 1:  # find which one-hot region flag is set for this row
            val = t
    row = drop_useless(row).reshape([1, -1])
    # dispatch each test row to the model trained for its region
    custpred.append(models[val].predict(row).squeeze())
custpred = np.array(custpred)
new = pandas.DataFrame(index=test.index)
new['NumberOfCustomers'] = pandas.Series(custpred, test.index)
ds.save_dataset(new, "cust_ensemble_per_region_predictions1.csv")
if __name__ == '__main__':
    # Training driver for the customer-count NN ensemble.
    TRAIN = True    # fit the models (vs. only loading/predicting)
    LOAD = False    # load "mod<name><i>.h5" instead of building fresh nets
    SAVE_DS = True  # persist the held-out frame used for prediction
    name = "test2_"
    ds = d.read_imputed_onehot_dataset()
    ds = prepare_ds(ds)
    # train on Mar-2016..Dec-2017, hold out Jan/Feb-2018
    ds_train = utils.get_frame_in_range(ds, 3, 2016, 12, 2017)
    ds_test = utils.get_frame_in_range(ds, 1, 2018, 2, 2018)
    y = prepare_out(ds_train)
    real_y = np.array(y)
    dy = np.zeros(y.shape)
    x = drop_useless(ds_train)
    y_test = prepare_out(ds_test)
    if SAVE_DS:
        d.save_dataset(ds_test, "dataset_to_predict_customers.csv")
    x_test = drop_useless(ds_test)
    models = []
    for i in range(number_of_model):
        if not LOAD:
            # second arg `i == 0` presumably toggles a one-time verbose/summary
            # flag in the model builder — TODO confirm
            models.append(m.nonsequentialNNtest(x.shape[1], i == 0))
        else:
            models.append(k.models.load_model("mod" + name + str(i) + ".h5"))
        # NOTE(review): k.optimizers.adam / lr= is the legacy Keras spelling;
        # newer Keras wants Adam(learning_rate=...) — confirm pinned version.
        opt = k.optimizers.adam(lr=3e-5)
        models[i].compile(optimizer=opt, loss='mean_squared_error', metrics=['mae'])
        models[i].summary()
        if TRAIN:
            models[i].fit(x=x, y=y, batch_size=10000, epochs=50, verbose=2,
                          validation_data=(x_test, y_test))
            models[i].save("mod" + name + str(i) + ".h5")
        # loop-body placement of this predict is inferred from the mangled
        # source — TODO confirm it belongs inside the per-model loop
        dy = models[i].predict(x, 500)
from sklearn.neighbors import KNeighborsRegressor as knn
from sklearn import preprocessing
import pandas as pd


def model():
    """Factory for the KNN regressor handed to LinearSklearn:
    10 neighbours, uniform weights, ball-tree index."""
    return knn(n_neighbors=10, weights='uniform', algorithm='ball_tree', leaf_size=15)


if __name__ == '__main__':
    # Fit a single KNN model on the customer dataset and save its test
    # predictions to CSV.
    datas = ds.read_dataset("best_for_customers.csv")
    datas = sb.SetBuilder(target='NumberOfCustomers', autoexclude=True,
                          df=datas).exclude('NumberOfSales', 'Month').build()
    mod = skc.LinearSklearn(1, model)
    x = datas.xtr
    y = datas.ytr
    mod.train(x, y)
    custpred = mod.predict(datas.xts)
    print("TEST R2: ", eval.r2_score(datas.yts, custpred))
    print("##########################")
    # Use the `pd` alias declared above — the bare name `pandas` is not
    # imported in this chunk and would raise NameError unless imported
    # elsewhere in the file.
    new = pd.DataFrame()
    new['NumberOfCustomers'] = pd.Series(custpred)
    ds.save_dataset(new, "knncustpreds1.csv")
def build_cust_predictor_test_dataset(name):
    """Build the customer-predictor test dataset, aligned with the customer
    train set, and save it under *name*."""
    train_frame = d.read_dataset("final_for_customer_train.csv")
    test_frame = d.read_imputed_onehot_test_dataset()
    test_frame = __prepare_customers_test_ds(test_frame, train_frame)
    d.save_dataset(test_frame, name)
def build_sales_predictor_train_dataset(name):
    """Prepare the sales training dataset from the imputed one-hot data and
    save it under *name*."""
    frame = __prepare_sales_train_ds(d.read_imputed_onehot_dataset())
    d.save_dataset(frame, name)
# NOTE(review): script fragment — `modpreds`, `mods`, `n`, `datas`, `model`
# and `final` are defined above this view.
modpreds_t = []
for i in range(n):
    print(i + 1)
    x, y = datas.xtr, datas.ytr
    mod = skc.LinearSklearn(1, model[i])  # `model` appears to be a list of per-slot factories — confirm
    mod.train(x, y)
    mods.append(mod)
    p = mod.predict(x).squeeze()
    print(p)
    p_t = mod.predict(datas.xts).squeeze()
    modpreds.append(p)
    modpreds_t.append(p_t)
    print("TRAIN R2: ", eval.r2_score(y, p))
    print("TEST R2: ", eval.r2_score(datas.yts, p_t))
    print("##########################")
# Stacking: append each base model's predictions as extra feature columns,
# then fit a final meta-model on the augmented matrices.
modpreds = np.array(modpreds).transpose()
modpreds_t = np.array(modpreds_t).transpose()
x = np.hstack((datas.xtr, modpreds))
x_t = np.hstack((datas.xts, modpreds_t))
fin = skc.LinearSklearn(1, final)
fin.train(x, datas.ytr)
custpred = fin.predict(x_t)
print(custpred)
print("TEST R2: ", eval.r2_score(datas.yts, custpred))
new = pandas.DataFrame()
new['NumberOfCustomers'] = pandas.Series(custpred)
ds.save_dataset(new, "meta_mod2.csv")