def train(self, ds):
    """Train the two bagged ensembles: one for shopping-center stores, one
    for every other store type.

    `ds` is split on the 'StoreType_Shopping Center' one-hot column; each
    half is wrapped in a SetBuilder (dropping 'NumberOfSales' and 'Month')
    and a bag of models is fit on full-size random resamples.

    The original duplicated the whole bagging loop for the two store
    groups; the shared logic is factored into one inner helper.
    """

    def fit_bag(frame, n_models, model_factory, bag, tag, header):
        # Shared bagging loop: build the feature set once, then fit
        # n_models fresh models, each on a 100% bootstrap resample.
        data = sb.SetBuilder(target='NumberOfCustomers',
                             autoexclude=True,
                             df=frame) \
            .exclude('NumberOfSales', 'Month') \
            .build()
        print(header)
        for i in range(n_models):
            print(tag + " :", i)
            x, y = data.random_sampling(1.0)
            mod = model_factory()
            mod.fit(x, y)
            # R2 is reported on the full training rows, not the resample.
            pr = mod.predict(data.xtr)
            bag.append(mod)
            print(tag + " ", i, " TRAIN R2: ", eval.evaluate(data.ytr, pr))

    fit_bag(ds[ds['StoreType_Shopping Center'] == 1], self.nsc,
            self.model_shopc, self.models_sc, "SC",
            "SHOPPING CENTERS MODEL TRAINING: ")
    fit_bag(ds[ds['StoreType_Shopping Center'] == 0], self.noth,
            self.model_others, self.models_o, "OTH",
            "OTHER SHOPS MODEL TRAINING: ")
def test(self, ds):
    """Evaluate both bagged ensembles on their held-out test split.

    Mirrors train(): `ds` is split on 'StoreType_Shopping Center', each
    half is rebuilt through a SetBuilder, and the group-specific ensemble
    predictor is scored with R2 on the test rows.  The duplicated
    evaluation code is factored into one inner helper.
    """

    def score_bag(frame, predict_fn, header, tag):
        # Rebuild the feature set for this store group and score its bag.
        data = sb.SetBuilder(target='NumberOfCustomers',
                             autoexclude=True,
                             df=frame) \
            .exclude('NumberOfSales', 'Month') \
            .build()
        print(header)
        preds = predict_fn(data.xts)
        print(tag + " TEST R2: ", eval.evaluate(data.yts, preds))

    score_bag(ds[ds['StoreType_Shopping Center'] == 1], self.predict_sc,
              "SHOPPING CENTERS MODEL EVALUATION", "SC")
    score_bag(ds[ds['StoreType_Shopping Center'] == 0], self.predict_oth,
              "OTHERS MODEL EVALUATION", "OTH")
def __init__(self, count):
    """Customer-count bagging ensemble of `count` models.

    Wires up the fixed train/test datasets: training rows span Mar 2016 -
    Feb 2018, test rows span Mar-Apr 2018; 'NumberOfSales' and 'Month'
    are withheld from the features in both sets.
    """
    Bagging.__init__(self, count)
    self.description = "customers"

    # Training set: Mar 2016 - Feb 2018, no test partition.
    train_builder = sb.SetBuilder(
        target='NumberOfCustomers',
        autoexclude=True,
        split=[[(3, 2016, 2, 2018)], []],
        dataset="final_for_customer_train.csv")
    self.training_ds = train_builder.exclude('NumberOfSales',
                                             'Month').build()

    # Test set: Mar - Apr 2018, no training partition.
    test_builder = sb.SetBuilder(
        target='NumberOfCustomers',
        autoexclude=True,
        split=[[], [(3, 2018, 4, 2018)]],
        dataset="final_for_customer_test_r.csv")
    self.testing_ds = test_builder.exclude('NumberOfSales',
                                           'Month').build()
def __init__(self, count, target):
    """Sales bagging ensemble of `count` models predicting on `target`.

    `target` is a DataFrame used as the test set; its columns must be in
    the same order as final_for_sales_train.csv (hence the warning).
    Training rows span Mar 2016 - Feb 2018; the test split covers
    Mar-Apr 2018.  'Month' is withheld from the features.
    """
    Bagging.__init__(self, count)
    # FIX: corrected typo in the warning message ("IS IS" -> "IT IS").
    print(
        "Remember: IT IS MANDATORY THAT THE COLUMN ORDER OF TARGET SET IS THE SAME OF final_for_sales_train.csv"
    )
    self.description = "sales"
    self.target = target
    self.training_ds = sb.SetBuilder(
        target='NumberOfSales',
        autoexclude=True,
        split=[[(3, 2016, 2, 2018)], []],
        dataset="final_for_sales_train.csv").exclude('Month').build()
    # Test set is built from the caller-supplied frame, not a CSV.
    self.testing_ds = sb.SetBuilder(target='NumberOfSales',
                                    autoexclude=True,
                                    split=[[], [(3, 2018, 4, 2018)]],
                                    df=target).exclude('Month').build()
from numpy import loadtxt
from xgboost import XGBRegressor
from sklearn import linear_model
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures

import dataset.setbuilder as sb
import models.sklearn.evaluator as eval

# Gradient-boosted regression (XGBoost) on the customer-count target.
# 'NumberOfSales' and 'Month' are withheld so the model cannot peek at them.
data = sb.SetBuilder(target='NumberOfCustomers',
                     autoexclude=True).exclude('NumberOfSales',
                                               'Month').build()

model = XGBRegressor(n_estimators=100,
                     learning_rate=0.2,
                     colsample_bytree=1,
                     max_depth=4,
                     silent=False,
                     n_jobs=8)
model.fit(data.xtr, data.ytr)

# Report R2 on both splits to expose over/under-fitting.
train_predictions = model.predict(data.xtr)
test_predictions = model.predict(data.xts)
print('R2 TRAIN = %s' % eval.evaluate(data.ytr, train_predictions))
print('R2 TEST = %s' % eval.evaluate(data.yts, test_predictions))
# NOTE(review): this chunk opens mid-scope — "preds += p" is the tail of an
# ensemble-accumulation loop/function whose header is not visible here;
# `preds`, `p`, `y`, `ds`, `number_print`, `eva`, `number_of_model`, `m`,
# `k`, `d`, `np` are all defined outside this chunk.
preds += p
# Customer counts cannot be negative: clamp the ensemble prediction at zero.
preds[preds < 0] = 0
for i in range(min(len(preds), number_print)):
    print("PRED: ", preds[i], " y: ", y[i])
print("R2: ", eva.r2(ds, preds, 'NumberOfCustomers'))

if __name__ == '__main__':
    TRAIN = True   # fit the networks (vs. evaluate only)
    LOAD = False   # load previously saved .h5 models instead of new ones
    SAVE_DS = True
    name = "lin_"  # filename tag for persisted models
    dts = d.read_dataset("best_for_customers.csv")
    ds = sb.SetBuilder(target='NumberOfCustomers', autoexclude=True, df=dts)\
        .exclude('NumberOfSales', 'Month')\
        .build()
    models = []
    x, y = ds.xtr, ds.ytr
    real_y = np.array(y)
    x_test, y_test = ds.xts, ds.yts
    for i in range(number_of_model):
        # Either construct a fresh single-ReLU network or reload a saved one.
        if not LOAD:
            models.append(m.single_relu(x.shape[1], i == 0))
        else:
            models.append(k.models.load_model("mod" + name + str(i) + ".h5"))
        opt = k.optimizers.adam(lr=2e-4)
        models[i].compile(optimizer=opt,
                          loss='mean_squared_error',
                          metrics=['mae'])
        models[i].summary()
        if TRAIN:
            models[i].fit(x=x,
                          y=y,
                          batch_size=20000,
                          epochs=80,
                          verbose=2,
                          validation_data=(x_test, y_test))
# NOTE(review): fragment — the "return" below is the tail of a model-factory
# function (it returns a GradientBoostingRegressor) whose "def" header lies
# outside this chunk; `ridge`, `lasso`, `regtree`, `gradboostreg` and the
# imported modules are likewise defined elsewhere in the file.
    return ensemble.GradientBoostingRegressor(max_depth=8, n_estimators=5)


if __name__ == '__main__':
    datas = ds.read_dataset("mean_var_on_customers_from_tain.csv")
    # Derive a month column from the ISO date string ("YYYY-MM-DD" -> "MM").
    datas['Month'] = datas['Date']
    datas['Month'] = datas['Month'].apply(lambda x: x.split("-")[1])
    datas = imp.one_hot_numeric(datas, 'Month', 'Month_')
    datas = imp.one_hot_numeric(datas, 'Region', 'Region_')
    # Aggregate customer statistics are computed only from the Mar 2016 -
    # Dec 2017 window, presumably to avoid leaking later-period information.
    datas = preprocessing_utils.mean_cust_per_month_per_region(
        datas, utils.get_frame_in_range(datas, 3, 2016, 12, 2017))
    datas = preprocessing_utils.mean_cust_per_month_per_shop(
        datas, utils.get_frame_in_range(datas, 3, 2016, 12, 2017))
    # Drop the sales target and most raw weather columns from the features.
    datas = sb.SetBuilder(target='NumberOfCustomers', autoexclude=True, df=datas)\
        .exclude('NumberOfSales', 'Month', 'Max_Humidity',
                 'Max_Sea_Level_PressurehPa', 'Max_TemperatureC',
                 'Max_VisibilityKm', 'Max_Wind_SpeedKm_h', 'Mean_Humidity',
                 'Mean_Sea_Level_PressurehPa', 'Mean_VisibilityKm',
                 'Mean_Wind_SpeedKm_h', 'Min_Dew_PointC', 'Min_Humidity',
                 'Min_Sea_Level_PressurehPa', 'Min_TemperatureC',
                 'Min_VisibilitykM')\
        .build()
    # Candidate base learners; `final` looks like the intended blender model.
    model = [
        ridge, linear_model.LinearRegression, lasso, regtree, gradboostreg
    ]
    final = ridge
    n = len(model)
    mods = []
    modpreds = []
    modpreds_t = []
    # NOTE(review): chunk ends mid-script — the loop continues past this view.
    for i in range(n):
        print(i + 1)
        x, y = datas.xtr, datas.ytr
        mod = skc.LinearSklearn(1, model[i])
        mod.train(x, y)
def excluded_feats():
    """Columns withheld from the sales model's feature set."""
    # Weather extremes plus the customer count (itself a predicted quantity
    # elsewhere in this project) and the raw Month column.
    return [
        "Month", 'Max_Humidity', 'Max_TemperatureC', 'Max_VisibilityKm',
        'Max_Wind_SpeedKm_h', 'Min_Dew_PointC', 'Min_Humidity',
        'Min_TemperatureC', 'Min_VisibilitykM', 'NumberOfCustomers'
    ]


datas = d.read_dataset("final_sales_only_train.csv")
# Keep identifying columns aside before they are excluded from the features.
regions = d.to_numpy(datas[['Region']]).squeeze()
dates = d.to_numpy(datas[['Date']]).squeeze()
ids = d.to_numpy(datas[['StoreID']]).squeeze()

# Train window Mar 2016 - Feb 2018; Dec 2018 appears twice in the split tuple
# — presumably a placeholder test window; verify against SetBuilder's API.
data = sb.SetBuilder(target='NumberOfSales', autoexclude=True,
                     df=datas.copy(),
                     split=(3, 2016, 2, 2018, 12, 2018, 12, 2018))\
    .exclude_list(excluded_feats())\
    .build()

it = 1
yy = []
models = []
for i in range(it):
    bagx, bagy = data.xtr, data.ytr
    # NOTE(review): chunk ends mid-call — the MLPRegressor argument list is
    # truncated here; the remaining kwargs (and the rest of the loop body)
    # lie outside this chunk.
    dt = nn.MLPRegressor(hidden_layer_sizes=(400, 3),
                         activation='identity',
                         solver='adam',
                         batch_size=50000,
                         learning_rate='adaptive',
                         learning_rate_init=0.002,
                         max_iter=50,
                         shuffle=True,
# NOTE(review): fragment — these "das = ..." statements are the tail of a
# dataset-building function (invoked below as
# build_cust_predictor_train_dataset) whose "def" header is outside this
# chunk; `das`, `dfrom`, `preu`, `model`, `skc`, `eval`, `sb` come from there.
    das = preu.eliminate_IsOpen_zeros(das)
    # Per-shop / per-region customer statistics derived from the `dfrom`
    # reference frame.
    das = preu.mean_std_cust_per_shop_per_day(das, dfrom)
    das = preu.add_avg_cust_per_shop(das, dfrom)
    das = preu.add_std_cust_per_shop(das, dfrom)
    das = preu.add_max_cust_per_shop(das, dfrom)
    das = preu.add_min_cust_per_shop(das, dfrom)
    das = preu.mean_cust_per_month_per_shop(das, dfrom)
    das = preu.mean_cust_per_month_per_region(das, dfrom)
    return das


if __name__ == '__main__':
    # Train on Mar-Dec 2016 plus Mar 2017-Feb 2018; validate on Jan-Feb 2017.
    split = [[(3, 2016, 12, 2016), (3, 2017, 2, 2018)], [(1, 2017, 2, 2017)]]
    datas = build_cust_predictor_train_dataset(1, 2017, 2, 2017)
    datas = sb.SetBuilder(target='NumberOfCustomers', autoexclude=True,
                          df=datas, split=split)\
        .exclude("Month", "NumberOfSales")\
        .build()
    n = 10
    mods = []
    for i in range(n):
        print(i + 1)
        # Full-size (100%) bootstrap resample for each bagged model.
        x, y = datas.random_sampling(1.0)
        mod = skc.LinearSklearn(1, model)
        mod.train(x, y)
        mods.append(mod)
        p = mod.predict(datas.xtr).squeeze()
        print("TRAIN R2: ", eval.r2_score(datas.ytr, p))
        print("TEST R2: ", eval.r2_score(datas.yts, mod.predict(datas.xts)))
        print("##########################")
    # NOTE(review): chunk ends here — `preds` is presumably filled with the
    # per-model test predictions in code past this view.
    preds = []
ds_name = "fully_preprocessed_ds.csv"
NMODELS = 3            # models inside each LinearSklearn bag
SET_OF_MODELS_DIM = 1  # number of bags in the outer ensemble
models = []

# Load the frame and keep identifying columns aside for later inspection.
datas = ds.read_dataset(ds_name)
datas = utils.get_frame_in_range(datas, 1, 2018, 2, 2018)
regions = ds.to_numpy(datas[['Region']]).squeeze()
dates = ds.to_numpy(datas[['Date']]).squeeze()
ids = ds.to_numpy(datas[['StoreID']]).squeeze()


def mod():
    """Factory for a ridge regressor (alpha=200) used by each bag."""
    return skl.linear_model.Ridge(200)


# Train SET_OF_MODELS_DIM independent bags of ridge models.
for k in range(SET_OF_MODELS_DIM):
    data = sb.SetBuilder(target='NumberOfSales', dataset=ds_name).build()
    # NOTE(review): the sample returned here is discarded and training uses
    # data.xtr — confirm whether random_sampling mutates the set in place.
    data.random_sampling(1.0)
    model = skc.LinearSklearn(NMODELS, mod)  # simple linear regression bag
    model.train(data.xtr, data.ytr)
    models.append(model)

# Rebuild the set and average the bags' test predictions.
data = sb.SetBuilder(target='NumberOfSales', dataset=ds_name).build()
ypred = np.zeros(shape=data.yts.shape[0])
for k in range(SET_OF_MODELS_DIM):
    ypred += models[k].predict(data.xts)
ypred = ypred / SET_OF_MODELS_DIM

print('R2 = %s' % eval.evaluate(data.yts, ypred))
from sklearn import tree
import dataset.setbuilder as sb
import models.sklearn.evaluator as eval
import numpy as np
import models.sklearn.persistence as pr

print("Plain Decision regression tree without bagging")

# Build training & test sets on the customer-count target; the sales target
# and the raw Month column are withheld from the features.
data = sb.SetBuilder(
    target='NumberOfCustomers',
    autoexclude=True,
    dataset='best_for_customers.csv',
).exclude('NumberOfSales', 'Month').build()
# Alternative dataset kept for reference:
# data = sb.SetBuilder(target='NumberOfSales', autoexclude=True, dataset='mean_var_on_cust_from_tain.csv').build()

# Fit a single depth-limited regression tree (no bagging).
max_depth = 8
dtree = tree.DecisionTreeRegressor(max_depth=max_depth)
dtree.fit(data.xtr, data.ytr)
ypred = dtree.predict(data.xts)

# Round-trip the fitted tree through the persistence layer, then re-predict
# with the reloaded model to exercise save/load.
pr.save_model(dtree, 'decision_tree_cust')
dtree = pr.load_model('decision_tree_cust')
ypred = dtree.predict(data.xts)
import dataset.setbuilder as sb
import models.sklearn.evaluator as eval
from sklearn import svm

# Support-vector regression on a 10% subsample (SVR training scales poorly
# with the number of rows).
#
# FIX: the original chained `.build().random_sampling(percentage=0.1)` and
# bound random_sampling's *return value* to `data`; elsewhere in this project
# random_sampling returns an (x, y) sample pair rather than the set object,
# so the subsequent `data.xtr` / `data.yts` accesses would fail.  Build the
# set first, keep it, and fit on the returned sample.
data = sb.SetBuilder(target='NumberOfSales').build()
x, y = data.random_sampling(percentage=0.1)

clf = svm.SVR()
clf.fit(x, y.ravel())

# Evaluate on the full test split.
ypred = clf.predict(data.xts)
print('R2 = %s' % eval.evaluate(data.yts, ypred))
import dataset.utility as dsutil
import dataset.dataset as ds
import seaborn as sea
import dataset.setbuilder as sb
from sklearn import linear_model
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import models.sklearn.evaluator as eval
from sklearn.preprocessing import PolynomialFeatures

# Polynomial regression of sales on the single 'NearestCompetitor' feature.
data = sb.SetBuilder(target="NumberOfSales",
                     dataset="final_for_sales_train.csv",
                     autoexclude=False,
                     split=[[(3, 2016, 1, 2018)], [(3, 2016, 2, 2018)]
                            ]).only('NearestCompetitor').build()

poly_degree = 2

print("Linear regression started, polynomial degree = %s" % poly_degree)
# Expand the feature into polynomial terms.
# FIX: fit the transformer on the training split only and reuse it for the
# test split (the original called fit_transform on both splits).
poly = PolynomialFeatures(degree=poly_degree)
xtr_ = poly.fit_transform(data.xtr)
xts_ = poly.transform(data.xts)

# FIX: the original fit and evaluated on the raw features, so the polynomial
# expansion computed above was never used; fit/predict on xtr_/xts_ instead.
model = linear_model.LinearRegression()
model.fit(xtr_, data.ytr)
print(eval.evaluate(data.ytr, model.predict(xtr_)))
print(eval.evaluate(data.yts, model.predict(xts_)))