def best_pair_forecast(conf, forecasts_data, y_actual, submission_data,
                       submissions_ids, feild_names):
    """Two-round search for the best pairwise model ensembles.

    Round 1 trains on the first 30% of the data and keeps the 10 best model
    pairs; round 2 re-splits the round-1 survivors 50/50 and keeps the best 2.
    The surviving pair forecasts are averaged, scored, and written out as a
    submission file.
    """
    # Round 1: 30% train split, keep the 10 best pairs.
    split_at = int(forecasts_data.shape[0] * 0.3)
    X_train, X_test, y_train, y_test = train_test_split(
        split_at, forecasts_data, y_actual)
    round1_forecasts, submissions_pair_forecasts = find_n_best_pairs(
        X_train, y_train, X_test, y_test, submission_data, feild_names,
        pair_count=10)

    # Round 2: re-split the survivors 50/50 (note y_test from round 1 becomes
    # the target pool here), keep the 2 best pairs.
    split_at = int(round1_forecasts.shape[0] * 0.5)
    X_train, X_test, y_train, y_test = train_test_split(
        split_at, round1_forecasts, y_test)
    round2_forecasts, submissions_pair_forecasts = find_n_best_pairs(
        X_train, y_train, X_test, y_test, submissions_pair_forecasts,
        feild_names, pair_count=2)

    # Final forecast = plain mean over the surviving pair forecasts.
    final_test_forecast = np.mean(round2_forecasts, axis=1)
    calculate_accuracy("final_pair_forecast", y_test, final_test_forecast)

    best_pair_ensamble_submission = np.mean(submissions_pair_forecasts, axis=1)
    save_submission_file("best_pair_submission.csv", submissions_ids,
                         best_pair_ensamble_submission)
    sys.stdout.flush()
def best_pair_forecast(conf, forecasts_data, y_actual, submission_data, submissions_ids, feild_names):
    # Two-round pairwise-ensemble search: round 1 keeps the 10 best model
    # pairs (trained on a 30% split); round 2 re-splits the survivors 50/50
    # and keeps the 2 best. The survivors are averaged and saved as a
    # submission. NOTE: duplicate of the identically named function above.
    no_of_training_instances = int(forecasts_data.shape[0]*0.3)
    X_train, X_test, y_train, y_test = train_test_split(no_of_training_instances, forecasts_data, y_actual)
    test_forecasts_r1, submissions_pair_forecasts = find_n_best_pairs(X_train, y_train, X_test, y_test, submission_data, feild_names, pair_count=10)
    # Round 2 uses the round-1 test forecasts as inputs and y_test as targets.
    no_of_training_instances = int(test_forecasts_r1.shape[0]*0.5)
    X_train, X_test, y_train, y_test = train_test_split(no_of_training_instances, test_forecasts_r1, y_test)
    test_forecasts_r2, submissions_pair_forecasts = find_n_best_pairs(X_train, y_train, X_test, y_test, submissions_pair_forecasts, feild_names, pair_count=2)
    # Final forecast is the mean over the surviving pair forecasts.
    final_test_forecast = np.mean(test_forecasts_r2, axis=1)
    calculate_accuracy("final_pair_forecast", y_test, final_test_forecast)
    best_pair_ensamble_submission = np.mean(submissions_pair_forecasts, axis=1)
    save_submission_file("best_pair_submission.csv", submissions_ids, best_pair_ensamble_submission)
    sys.stdout.flush()
def run_regression_ensamble(models, y_test, parmsFromNormalization):
    """Train a linear-regression ensemble over the base-model forecasts.

    Splits on the first 70% of the targets and delegates the fit/evaluation
    to regression_with_LR.
    """
    split_point = int(len(y_test) * .7)
    X_train, X_test, y_train, y_test = train_test_split(
        split_point, models, y_test)
    print("results for combined Models")
    y_pred_lr = regression_with_LR(X_train, y_train, X_test, y_test,
                                   parmsFromNormalization)
def vote_with_lr(conf, forecasts, best_model_index, y_actual):
    """Ensemble per-model forecasts with a Lasso "voter".

    Builds per-row statistics (RMSLE distance to the best model, harmonic
    mean of pairwise errors, hmean/median/stdev of the remaining models),
    trains a Lasso on the first 25% of rows and evaluates the rest.

    Returns the Lasso forecast transformed back from log space.
    """
    start = time.time()
    best_forecast = forecasts[:, best_model_index]
    # Drop the best model's column and sort each row so the derived features
    # do not depend on model order.
    forecasts = np.sort(np.delete(forecasts, best_model_index, axis=1), axis=1)
    forecasts = np.where(forecasts <= 0, 0.1, forecasts)  # hmean/rmsle need positives
    data_train = []
    for i in range(forecasts.shape[0]):
        f_row = forecasts[i, ]
        min_diff_to_best = np.min([cal_rmsle(best_forecast[i], f) for f in f_row])
        comb = list(itertools.combinations(f_row, 2))
        avg_error = scipy.stats.hmean([cal_rmsle(x, y) for (x, y) in comb])
        data_train.append([min_diff_to_best, avg_error, scipy.stats.hmean(f_row),
                           np.median(f_row), np.std(f_row)])
    X_all = np.column_stack([np.row_stack(data_train), best_forecast])
    # BUG FIX: keep the original-scale targets for evaluation BEFORE the log
    # transform. Previously y_actual_test was sliced from the transformed
    # y_actual, so RMSLE compared log-space actuals with retransformed
    # forecasts. avg_models() uses the y_actual_saved pattern correctly.
    y_actual_saved = y_actual
    if conf.target_as_log:
        y_actual = transfrom_to_log(y_actual)
    # 25% of the data trains the ensemble; the remainder evaluates it.
    no_of_training_instances = int(round(len(y_actual) * 0.25))
    X_train, X_test, y_train, y_test = train_test_split(
        no_of_training_instances, X_all, y_actual)
    y_actual_test = y_actual_saved[no_of_training_instances:]
    lr_model = linear_model.Lasso(alpha=0.2)
    lr_model.fit(X_train, y_train)
    lr_forecast = lr_model.predict(X_test)
    lr_forcast_revered = retransfrom_from_log(lr_forecast)
    calculate_accuracy("vote__lr_forecast " + str(conf.command), y_actual_test,
                       lr_forcast_revered)
    print_time_took(start, "vote_with_lr")
    return lr_forcast_revered
def predict_using_veriation(forecasts_data, best_forecast, y_actual, frac = 1.0):
    # Blend per-model forecasts using spread statistics: a hand-written rule
    # picks between best/mean/median per row, then a Lasso and an XGBoost
    # model are trained on the same statistics for comparison.
    # frac: fraction of rows (from the top) to keep.
    size_to_keep = int(forecasts_data.shape[0]*frac)
    forecasts_data = forecasts_data[:size_to_keep]
    y_actual = y_actual[:size_to_keep]
    forecasts_data = transfrom_to_log2d(forecasts_data)
    forecasts_stdev = np.std(forecasts_data, axis=1)
    forecasts_mean = np.mean(forecasts_data, axis=1)
    #forecasts_robust_mean = forecasts_data[abs(forecasts_data - forecasts_mean.reshape((-1,1))) < 2 * forecasts_stdev.reshape((-1,1))]
    # hmean requires strictly positive values, hence the 0.1 floor.
    forecasts_hmean = fillna_and_inf(scipy.stats.hmean(np.where(forecasts_data <= 0, 0.1, forecasts_data), axis=1))
    forecasts_median = np.median(forecasts_data, axis=1)
    # NOTE(review): forecasts_data is log-transformed above but best_forecast
    # is used untransformed here and below — confirm this mixing is intended.
    min_diff_to_best = np.min(np.abs(forecasts_data - best_forecast.reshape((-1,1))), axis=1)
    diff_best_to_mean = np.abs(best_forecast - forecasts_mean)
    print "forecasts_stdev", basic_stats_as_str(forecasts_stdev)
    print "forecasts_mean", basic_stats_as_str(forecasts_mean)
    print "diff_best_to_mean", basic_stats_as_str(diff_best_to_mean)
    print "min_diff_to_best", basic_stats_as_str(min_diff_to_best)
    # Recorded stats from a prior run (kept for reference):
    #forecasts_stdev >>count=2.967219e+07,mean=2.072201e-01,std=1.256776e-01,min=3.159570e-03,50%=1.758286e-01,25%=1.243373e-01,50%=1.758286e-01,75%=2.542546e-01,95%=4.540049e-01,max=2.138248e+00,dtype:=float64
    #forecasts_mean >>count=2.967219e+07,mean=1.594980e+00,std=6.205789e-01,min=8.732127e-02,50%=1.462738e+00,25%=1.151854e+00,50%=1.462738e+00,75%=1.893289e+00,95%=2.765280e+00,max=7.470179e+00,dtype:=float64
    #diff_best_to_mean >>count=2.967219e+07,mean=4.604835e+00,std=1.701537e+01,min=2.758960e-07,50%=1.744317e+00,25%=8.145370e-01,50%=1.744317e+00,75%=3.950076e+00,95%=1.473034e+01,max=3.871046e+03,dtype:=float64
    #min_diff_to_best >>count=2.967219e+07,mean=4.284890e+00,std=1.698368e+01,min=1.200819e-12,50%=1.479762e+00,25%=4.711246e-01,50%=1.479762e+00,75%=3.638548e+00,95%=1.426816e+01,max=3.870050e+03,dtype:=float64
    # Rule-based blend: trust the best model when it is close to the pack,
    # the mean when models agree, otherwise average median and best.
    final_forecast = np.zeros(size_to_keep)
    for i in range(size_to_keep):
        if min_diff_to_best[i] < 0.2 or diff_best_to_mean[i] < 0.3:
            final_forecast[i] = best_forecast[i]
        elif forecasts_stdev[i] < 0.3:
            final_forecast[i] = forecasts_mean[i]
        else:
            final_forecast[i] = (forecasts_median[i] + best_forecast[i])/2
    calculate_accuracy("predict_using_veriation", y_actual, final_forecast)
    # Learned blends over the same statistics (Lasso, then XGBoost).
    X_all = np.column_stack([best_forecast, forecasts_mean, forecasts_hmean, forecasts_stdev, min_diff_to_best, diff_best_to_mean, forecasts_median])
    y_actual = transfrom_to_log(y_actual)
    no_of_training_instances = int(round(len(y_actual)*0.50))
    X_train, X_test, y_train, y_test = train_test_split(no_of_training_instances, X_all, y_actual)
    lr_model = linear_model.Lasso(alpha=0.2)
    lr_model.fit(X_train, y_train)
    lr_forecast = lr_model.predict(X_test)
    lr_forecast = retransfrom_from_log(lr_forecast)
    calculate_accuracy("predict_using_veriation_lr_forecast ", retransfrom_from_log(y_test), lr_forecast)
    xgb_params = {"objective": "reg:linear", "booster":"gbtree", "eta":0.1, "nthread":4 }
    model, y_pred = regression_with_xgboost(X_train, y_train, X_test, y_test,
                                            features=['best_forecast', 'forecasts_mean', 'forecasts_hmean', 'forecasts_stdev', 'min_diff_to_best', 'diff_best_to_mean', 'forecasts_median'],
                                            use_cv=True, use_sklean=False, xgb_params=xgb_params)
    xgb_forecast = model.predict(X_test)
    xgb_forecast_actual = retransfrom_from_log(xgb_forecast)
    calculate_accuracy(str(xgb_params) + "predict_using_veriation_xgb_forecast", retransfrom_from_log(y_test), xgb_forecast_actual)
    # Returns the rule-based blend, not the learned ones.
    return final_forecast
def vote_with_lr(conf, forecasts, best_model_index, y_actual):
    """Ensemble per-model forecasts with a Lasso "voter".

    Builds per-row statistics (RMSLE distance to the best model, harmonic
    mean of pairwise errors, hmean/median/stdev of the remaining models),
    trains a Lasso on the first 25% of rows and evaluates the rest.

    Returns the Lasso forecast transformed back from log space.
    NOTE: duplicate of the identically named function earlier in this file.
    """
    start = time.time()
    best_forecast = forecasts[:, best_model_index]
    # Drop the best model's column and sort each row so the derived features
    # do not depend on model order.
    forecasts = np.sort(np.delete(forecasts, best_model_index, axis=1), axis=1)
    forecasts = np.where(forecasts <= 0, 0.1, forecasts)  # hmean/rmsle need positives
    data_train = []
    for i in range(forecasts.shape[0]):
        f_row = forecasts[i, ]
        min_diff_to_best = np.min([cal_rmsle(best_forecast[i], f) for f in f_row])
        comb = list(itertools.combinations(f_row, 2))
        avg_error = scipy.stats.hmean([cal_rmsle(x, y) for (x, y) in comb])
        data_train.append([min_diff_to_best, avg_error, scipy.stats.hmean(f_row),
                           np.median(f_row), np.std(f_row)])
    X_all = np.column_stack([np.row_stack(data_train), best_forecast])
    # BUG FIX: keep the original-scale targets for evaluation BEFORE the log
    # transform. Previously y_actual_test was sliced from the transformed
    # y_actual, so RMSLE compared log-space actuals with retransformed
    # forecasts. avg_models() uses the y_actual_saved pattern correctly.
    y_actual_saved = y_actual
    if conf.target_as_log:
        y_actual = transfrom_to_log(y_actual)
    # 25% of the data trains the ensemble; the remainder evaluates it.
    no_of_training_instances = int(round(len(y_actual) * 0.25))
    X_train, X_test, y_train, y_test = train_test_split(
        no_of_training_instances, X_all, y_actual)
    y_actual_test = y_actual_saved[no_of_training_instances:]
    lr_model = linear_model.Lasso(alpha=0.2)
    lr_model.fit(X_train, y_train)
    lr_forecast = lr_model.predict(X_test)
    lr_forcast_revered = retransfrom_from_log(lr_forecast)
    calculate_accuracy("vote__lr_forecast " + str(conf.command), y_actual_test,
                       lr_forcast_revered)
    print_time_took(start, "vote_with_lr")
    return lr_forcast_revered
# Script fragment: drop flagged rows from the feature DataFrame, stack the
# non-sales features onto X_all, split, and run time-series forecasts.
# Depends on df, index2remove, X_all, Y_all, training_set_size and
# window_size defined earlier (not visible in this chunk).
print("index2remove", index2remove)
df = df.drop(df.index[index2remove])
print("Df", df.head())
#covert data frame to numpy array
X_without_sales = df.values.copy()
print("X_all.shape", X_all.shape, "X_without_sales.shape", X_without_sales.shape)
#join two arrays ( also can do np.row_stack )
X_all = np.column_stack((X_without_sales,X_all))
print("X_all.shape", X_all.shape)
#X_train_raw = preprocessing.normalize(X_train_raw.astype("float32"), norm='l2')
X_train, X_test, y_train, y_test = train_test_split(training_set_size, X_all, Y_all)
run_timeseries_froecasts(X_train, y_train, X_test, y_test, window_size, 10)
# Deep-learning hyperparameters (currently only used by the commented-out run).
nodes_in_layer = 500
number_of_hidden_layers = 2
droput = 0.05
activation_fn='relu'
#y_pred_dl = regression_with_dl(X_train, y_train, X_test, y_test, nodes_in_layer,
#        number_of_hidden_layers, droput, activation_fn, 100)
#print_regression_model_summary("DL", y_test, y_pred_dl)
# y_pred_dl = regression_with_dl(X_train, y_train, X_test, y_test, c) # print_regression_model_summary("DL" + str(c.tostr()), y_test, y_pred_dl) size = 100000 x = np.random.zipf(2, size) y = np.random.normal(10, 1, size) xy = [x[i] * y[i] for i in range(size)] xbyy = [x[i] / y[i] if y[i] != 0 else 1 for i in range(size)] #target = [ 2*(2*x[i] + y[i])/3*y[i] for i in range(size)] target = [2 * (2 * x[i] + y[i]) / 3 * y[i] for i in range(size)] train_set_size = int(size * 0.7) X_all, Y_all = normlaize_data(np.column_stack((x, y, xy, xbyy)), np.array(target)) X_all, Y_all = shuffle_data(X_all, Y_all) X_train, X_test, y_train, y_test = train_test_split(train_set_size, X_all, Y_all) c = MLConfigs(nodes_in_layer=4, number_of_hidden_layers=3, dropout=0, activation_fn='relu', loss="mse", epoch_count=15, optimizer=Adam()) y_pred_dl = regression_with_dl(X_train, y_train, X_test, y_test, c) print_regression_model_summary("DL" + str(c.tostr()), y_test, y_pred_dl)
# Script fragment: per-store feature construction (presumably inside a loop
# over stores — TODO confirm; x_train_list etc. are accumulated and stacked).
# Depends on timeSincePromotion, sales_data, dfS, X_all_t, X_without_sales,
# Y_all_t and window_size defined earlier (not visible in this chunk).
timeSincePromotion = timeSincePromotion[window_size-1:-1]
wfeatures = create_window_based_features(sales_data, window_size)
wfeatures = wfeatures[window_size - 1:-1,]
dayOfWeekCos = dfS['dayOfWeekCos'].values
# Interactions between the first/last window column and the day-of-week cosine.
w1cosratio = [X_all_t[i][0]/dayOfWeekCos[i] for i in range(len(dayOfWeekCos))]
w1cosproduct = [X_all_t[i][0] * dayOfWeekCos[i] for i in range(len(dayOfWeekCos))]
wecosratio = [X_all_t[i][window_size-1] / dayOfWeekCos[i] for i in range(len(dayOfWeekCos))]
wecosproduct = [X_all_t[i][window_size-1] * dayOfWeekCos[i] for i in range(len(dayOfWeekCos))]
#print(X_without_sales.shape, 1, wfeatures.shape, 1, 1, 1, 1, X_all_t.shape)
X_all_t = np.column_stack((X_without_sales, timeSincePromotion, wfeatures, w1cosratio, w1cosproduct, wecosratio, wecosproduct, X_all_t))
#print("X_all.shape", X_all_t.shape)
training_set_size = int(0.7*X_all_t.shape[0])
X_train_s, X_test_s, y_train_s, y_test_s = train_test_split(training_set_size, X_all_t, Y_all_t)
x_train_list.append(X_train_s)
x_test_list.append(X_test_s)
y_train_list.append(y_train_s)
y_test_list.append(y_test_s)
#put the store data back togehter
X_train = np.row_stack(x_train_list)
X_test = np.row_stack(x_test_list)
Y_train = np.concatenate(y_train_list, axis=0)
Y_test = np.concatenate(y_test_list, axis=0)
#X_all = storeData[0]
#Y_all = target[0]
def run_regression_ensamble(models, y_test, parmsFromNormalization):
    # Train a linear-regression ensemble over the base-model forecasts
    # (70/30 split). NOTE: duplicate of the identically named function above.
    training_set_size = int(len(y_test)*.7)
    X_train, X_test, y_train, y_test = train_test_split(training_set_size, models, y_test)
    print("results for combined Models")
    y_pred_lr = regression_with_LR(X_train, y_train, X_test, y_test, parmsFromNormalization)
def avg_models(conf, blend_forecasts_df, y_actual, submission_forecasts_df,
               submission_ids=None, xgb_params=None, do_cv=True, frac=1.0,
               sec_test_data=None):
    # Train several level-2 ensembles (random forest, Lasso, XGBoost) on the
    # blended forecasts, pick the one with the lowest RMSLE, and use it to
    # produce the submission forecast (and optionally a secondary test-set
    # forecast). Targets and features are log-transformed for training;
    # evaluation uses the saved original-scale targets.
    print "start avg models"
    start = time.time()
    forecasting_feilds = list(blend_forecasts_df)  # column names as feature names
    print "Using features", forecasting_feilds
    X_all = blend_forecasts_df.values
    sub_X_all = submission_forecasts_df.values
    # frac < 1 keeps only the top fraction of rows (faster experiments).
    if frac < 1:
        data_size = int(blend_forecasts_df.shape[0]*frac)
        X_all = X_all[:data_size, :]
        y_actual = y_actual[:data_size]
    #removing NaN and inf if there is any
    X_all = fillna_and_inf(X_all)
    y_actual_saved = y_actual  # original-scale targets, kept for evaluation
    target_as_log = True
    if target_as_log:
        X_all = transfrom_to_log2d(X_all)
        y_actual = transfrom_to_log(y_actual)
    #X_all = scipy.stats.mstats.zscore(X_all, axis=1)
    #we use 10% full data to train the ensamble and 30% for evalaution
    # NOTE(review): comment above disagrees with the 50/50 split used here.
    no_of_training_instances = int(round(len(y_actual)*0.5))
    X_train, X_test, y_train, y_test = train_test_split(no_of_training_instances, X_all, y_actual)
    y_actual_test = y_actual_saved[no_of_training_instances:]
    # Candidate ensembles collected as (rmsle, model, label).
    ensambles = []
    rfr = RandomForestRegressor(n_jobs=4, oob_score=True)
    rfr.fit(X_train, y_train)
    print_feature_importance(rfr.feature_importances_, forecasting_feilds)
    rfr_forecast = rfr.predict(X_test)
    rmsle = calculate_accuracy("rfr_forecast", y_actual_test, retransfrom_from_log(rfr_forecast))
    ensambles.append((rmsle, rfr, "rfr ensamble"))
    lr_model = linear_model.Lasso(alpha=0.2)
    lr_model.fit(X_train, y_train)
    lr_forecast = lr_model.predict(X_test)
    rmsle = calculate_accuracy("lr_forecast", y_actual_test, retransfrom_from_log(lr_forecast))
    ensambles.append((rmsle, lr_model, "lr ensamble"))
    do_xgb = True
    if do_xgb:
        if xgb_params is None:
            xgb_params = {"objective": "reg:linear", "booster":"gbtree", "eta":0.1, "nthread":4 }
        if do_cv:
            model, y_pred = regression_with_xgboost(X_train, y_train, X_test, y_test, features=forecasting_feilds,
                                                    use_cv=True, use_sklean=False, xgb_params=xgb_params)
        else:
            model, y_pred = regression_with_xgboost_no_cv(X_train, y_train, X_test, y_test,
                                                          features=forecasting_feilds, xgb_params=xgb_params, num_rounds=200)
        xgb_forecast = model.predict(X_test)
        xgb_forecast_actual = retransfrom_from_log(xgb_forecast)
        rmsle = calculate_accuracy(str(xgb_params) + "[IDF]xgb_forecast", y_actual_test, xgb_forecast_actual)
        ensambles.append((rmsle, model, "xgboost ensamble"))
    # Pick the candidate with the smallest RMSLE.
    best_ensamble_index = np.argmin([t[0] for t in ensambles])
    best_ensamble = ensambles[best_ensamble_index][1]
    print "[IDF]Best Ensamble", ensambles[best_ensamble_index][2], ensambles[best_ensamble_index][0]
    if sub_X_all is not None:
        ensamble_forecast = best_ensamble.predict(transfrom_to_log2d(sub_X_all))
        ensamble_forecast = retransfrom_from_log(ensamble_forecast)
        #becouse forecast cannot be negative
        ensamble_forecast = np.where(ensamble_forecast < 0, 0, ensamble_forecast)
        to_save = np.column_stack((submission_ids, ensamble_forecast))
        to_saveDf = pd.DataFrame(to_save, columns=["id","Demanda_uni_equil"])
        to_saveDf = to_saveDf.fillna(0)
        to_saveDf["id"] = to_saveDf["id"].astype(int)
        submission_file = 'xgb_ensamble_submission_'+ str(time.time()) +'.csv'
        to_saveDf.to_csv(submission_file, index=False)
        print "Best Ensamble Submission Stats", submission_file
        print to_saveDf.describe()
    if sec_test_data is not None:
        sec_test_data = fillna_and_inf(sec_test_data)
        sec_y_forecast = best_ensamble.predict(sec_test_data)
        sec_y_forecast = retransfrom_from_log(sec_y_forecast)
        sec_y_forecast = np.where(sec_y_forecast < 0, 0, sec_y_forecast)
    else:
        sec_y_forecast = None
    print "avg_models took ", (time.time() - start), "s"
    # NOTE(review): xgb_forecast_actual / ensamble_forecast are unbound when
    # do_xgb is False / sub_X_all is None — this return would then raise.
    # With the current constants those paths are unreachable.
    return xgb_forecast_actual, y_actual_test, ensamble_forecast, sec_y_forecast
def blend_models(conf, forecasts, model_index_by_acc, y_actual, submissions_ids,
                 submissions, blend_data, blend_data_submission):
    # Blend base-model forecasts plus extra blend features with an XGBoost
    # model (scored and used for the submission), then additionally train a
    # small deep-learning model on a (sub)sample for comparison.
    use_complex_features = True
    if use_complex_features:
        X_all, forecasting_feilds = generate_forecast_features(forecasts, model_index_by_acc)
    else:
        X_all, forecasting_feilds = forecasts, ["f"+str(f) for f in range(forecasts.shape[1])]
    X_all = np.column_stack([X_all, blend_data])
    forecasting_feilds = forecasting_feilds + get_blend_features()
    #removing NaN and inf if there is any
    y_actual_saved = y_actual  # original-scale targets, kept for evaluation
    if conf.target_as_log:
        X_all = transfrom_to_log2d(X_all)
        y_actual = transfrom_to_log(y_actual)
    X_all = fillna_and_inf(X_all)
    y_actual = fillna_and_inf(y_actual)
    #we use 10% full data to train the ensamble and 30% for evalaution
    # NOTE(review): comment above disagrees with the 50/50 split used here.
    no_of_training_instances = int(round(len(y_actual)*0.50))
    X_train, X_test, y_train, y_test = train_test_split(no_of_training_instances, X_all, y_actual)
    y_actual_test = y_actual_saved[no_of_training_instances:]
    '''
    rfr = RandomForestRegressor(n_jobs=4, oob_score=True)
    rfr.fit(X_train, y_train)
    print_feature_importance(rfr.feature_importances_, forecasting_feilds)
    rfr_forecast_as_log = rfr.predict(X_test)
    rfr_forecast = retransfrom_from_log(rfr_forecast_as_log)
    rmsle = calculate_accuracy("rfr_forecast", y_actual_test, rfr_forecast)

    lr_model =linear_model.Lasso(alpha = 0.1)
    lr_model.fit(X_train, y_train)
    lr_forecast = lr_model.predict(X_test)
    lr_forcast_revered = retransfrom_from_log(lr_forecast)
    calculate_accuracy("vote__lr_forecast " + str(conf.command), y_actual_test, lr_forcast_revered)
    '''
    xgb_params = {"objective": "reg:linear", "booster":"gbtree", "eta":0.1, "nthread":4, 'min_child_weight':5}
    model, y_pred = regression_with_xgboost(X_train, y_train, X_test, y_test, features=forecasting_feilds,
                                            use_cv=True, use_sklean=False, xgb_params=xgb_params)
    #model, y_pred = regression_with_xgboost_no_cv(X_train, y_train, X_test, y_test, features=forecasting_feilds,
    #    xgb_params=xgb_params,num_rounds=100)
    xgb_forecast = model.predict(X_test)
    xgb_forecast = retransfrom_from_log(xgb_forecast)
    calculate_accuracy("xgb_forecast", y_actual_test, xgb_forecast)
    # Build submission features the same way and forecast with the XGB model.
    if submissions_ids is not None and submissions is not None:
        if use_complex_features:
            submissions, _ = generate_forecast_features(submissions, model_index_by_acc)
        submissions = np.column_stack([submissions, blend_data_submission])
        # Replace NaN with 0 and +/-inf with 10000 before predicting.
        submissions = np.where(np.isnan(submissions), 0, np.where(np.isinf(submissions), 10000, submissions))
        rfr_ensamble_forecasts = model.predict(submissions)
        if conf.target_as_log:
            rfr_ensamble_forecasts = retransfrom_from_log(rfr_ensamble_forecasts)
        save_submission_file("rfr_blend_submission.csv", submissions_ids, rfr_ensamble_forecasts)
    else:
        print "submissions not found"
    #we randomly select 5 million values
    x_size = X_train.shape[0]
    sample_indexes = np.random.randint(0, X_train.shape[0], min(5000000, x_size))
    X_train = X_train[sample_indexes]
    y_train = y_train[sample_indexes]
    dlconf = MLConfigs(nodes_in_layer=10, number_of_hidden_layers=2, dropout=0.3, activation_fn='relu', loss="mse",
                       epoch_count=4, optimizer=Adam(lr=0.0001), regularization=0.2)
    # Normalize targets and features to zero-mean/unit-variance for the DL model.
    y_train, parmsFromNormalization = preprocess1DtoZeroMeanUnit(y_train)
    y_test = apply_zeroMeanUnit(y_test, parmsFromNormalization)
    X_train, parmsFromNormalization2D = preprocess2DtoZeroMeanUnit(X_train)
    X_test = apply_zeroMeanUnit2D(X_test, parmsFromNormalization2D)
    model, y_forecast = regression_with_dl(X_train, y_train, X_test, y_test, dlconf)
    y_forecast = undoPreprocessing(y_forecast, parmsFromNormalization)
    y_forecast = retransfrom_from_log(y_forecast)
    rmsle = calculate_accuracy("ml_forecast", y_actual_test, y_forecast)
def predict_using_veriation(forecasts_data, best_forecast, y_actual, frac=1.0):
    # Blend per-model forecasts using spread statistics: a hand-written rule
    # picks between best/mean/median per row, then a Lasso and an XGBoost
    # model are trained on the same statistics for comparison.
    # NOTE: duplicate of the identically named function earlier in this file.
    # frac: fraction of rows (from the top) to keep.
    size_to_keep = int(forecasts_data.shape[0] * frac)
    forecasts_data = forecasts_data[:size_to_keep]
    y_actual = y_actual[:size_to_keep]
    forecasts_data = transfrom_to_log2d(forecasts_data)
    forecasts_stdev = np.std(forecasts_data, axis=1)
    forecasts_mean = np.mean(forecasts_data, axis=1)
    #forecasts_robust_mean = forecasts_data[abs(forecasts_data - forecasts_mean.reshape((-1,1))) < 2 * forecasts_stdev.reshape((-1,1))]
    # hmean requires strictly positive values, hence the 0.1 floor.
    forecasts_hmean = fillna_and_inf(
        scipy.stats.hmean(np.where(forecasts_data <= 0, 0.1, forecasts_data), axis=1))
    forecasts_median = np.median(forecasts_data, axis=1)
    # NOTE(review): forecasts_data is log-transformed above but best_forecast
    # is used untransformed here and below — confirm this mixing is intended.
    min_diff_to_best = np.min(np.abs(forecasts_data - best_forecast.reshape((-1, 1))), axis=1)
    diff_best_to_mean = np.abs(best_forecast - forecasts_mean)
    print "forecasts_stdev", basic_stats_as_str(forecasts_stdev)
    print "forecasts_mean", basic_stats_as_str(forecasts_mean)
    print "diff_best_to_mean", basic_stats_as_str(diff_best_to_mean)
    print "min_diff_to_best", basic_stats_as_str(min_diff_to_best)
    # Recorded stats from a prior run (kept for reference):
    #forecasts_stdev >>count=2.967219e+07,mean=2.072201e-01,std=1.256776e-01,min=3.159570e-03,50%=1.758286e-01,25%=1.243373e-01,50%=1.758286e-01,75%=2.542546e-01,95%=4.540049e-01,max=2.138248e+00,dtype:=float64
    #forecasts_mean >>count=2.967219e+07,mean=1.594980e+00,std=6.205789e-01,min=8.732127e-02,50%=1.462738e+00,25%=1.151854e+00,50%=1.462738e+00,75%=1.893289e+00,95%=2.765280e+00,max=7.470179e+00,dtype:=float64
    #diff_best_to_mean >>count=2.967219e+07,mean=4.604835e+00,std=1.701537e+01,min=2.758960e-07,50%=1.744317e+00,25%=8.145370e-01,50%=1.744317e+00,75%=3.950076e+00,95%=1.473034e+01,max=3.871046e+03,dtype:=float64
    #min_diff_to_best >>count=2.967219e+07,mean=4.284890e+00,std=1.698368e+01,min=1.200819e-12,50%=1.479762e+00,25%=4.711246e-01,50%=1.479762e+00,75%=3.638548e+00,95%=1.426816e+01,max=3.870050e+03,dtype:=float64
    # Rule-based blend: trust the best model when it is close to the pack,
    # the mean when models agree, otherwise average median and best.
    final_forecast = np.zeros(size_to_keep)
    for i in range(size_to_keep):
        if min_diff_to_best[i] < 0.2 or diff_best_to_mean[i] < 0.3:
            final_forecast[i] = best_forecast[i]
        elif forecasts_stdev[i] < 0.3:
            final_forecast[i] = forecasts_mean[i]
        else:
            final_forecast[i] = (forecasts_median[i] + best_forecast[i]) / 2
    calculate_accuracy("predict_using_veriation", y_actual, final_forecast)
    # Learned blends over the same statistics (Lasso, then XGBoost).
    X_all = np.column_stack([best_forecast, forecasts_mean, forecasts_hmean, forecasts_stdev,
                             min_diff_to_best, diff_best_to_mean, forecasts_median])
    y_actual = transfrom_to_log(y_actual)
    no_of_training_instances = int(round(len(y_actual) * 0.50))
    X_train, X_test, y_train, y_test = train_test_split(no_of_training_instances, X_all, y_actual)
    lr_model = linear_model.Lasso(alpha=0.2)
    lr_model.fit(X_train, y_train)
    lr_forecast = lr_model.predict(X_test)
    lr_forecast = retransfrom_from_log(lr_forecast)
    calculate_accuracy("predict_using_veriation_lr_forecast ", retransfrom_from_log(y_test), lr_forecast)
    xgb_params = {"objective": "reg:linear", "booster": "gbtree", "eta": 0.1, "nthread": 4}
    model, y_pred = regression_with_xgboost(
        X_train, y_train, X_test, y_test,
        features=['best_forecast', 'forecasts_mean', 'forecasts_hmean', 'forecasts_stdev',
                  'min_diff_to_best', 'diff_best_to_mean', 'forecasts_median'],
        use_cv=True, use_sklean=False, xgb_params=xgb_params)
    xgb_forecast = model.predict(X_test)
    xgb_forecast_actual = retransfrom_from_log(xgb_forecast)
    calculate_accuracy(str(xgb_params) + "predict_using_veriation_xgb_forecast",
                       retransfrom_from_log(y_test), xgb_forecast_actual)
    # Returns the rule-based blend, not the learned ones.
    return final_forecast
# Script fragment (reformatted duplicate of the per-store feature-building
# code above): day-of-week interaction features, per-store split, then the
# per-store pieces are stacked back together. Depends on X_all_t,
# dayOfWeekCos, window_size, X_without_sales, timeSincePromotion, wfeatures,
# Y_all_t and the *_list accumulators defined earlier (not visible here).
wecosratio = [
    X_all_t[i][window_size - 1] / dayOfWeekCos[i]
    for i in range(len(dayOfWeekCos))
]
wecosproduct = [
    X_all_t[i][window_size - 1] * dayOfWeekCos[i]
    for i in range(len(dayOfWeekCos))
]
#print(X_without_sales.shape, 1, wfeatures.shape, 1, 1, 1, 1, X_all_t.shape)
X_all_t = np.column_stack(
    (X_without_sales, timeSincePromotion, wfeatures, w1cosratio, w1cosproduct,
     wecosratio, wecosproduct, X_all_t))
#print("X_all.shape", X_all_t.shape)
training_set_size = int(0.7 * X_all_t.shape[0])
X_train_s, X_test_s, y_train_s, y_test_s = train_test_split(
    training_set_size, X_all_t, Y_all_t)
x_train_list.append(X_train_s)
x_test_list.append(X_test_s)
y_train_list.append(y_train_s)
y_test_list.append(y_test_s)
#put the store data back togehter
X_train = np.row_stack(x_train_list)
X_test = np.row_stack(x_test_list)
Y_train = np.concatenate(y_train_list, axis=0)
Y_test = np.concatenate(y_test_list, axis=0)
#X_all = storeData[0]
#Y_all = target[0]
#for i in range(1,len(storeData)):
def avg_models(conf, blend_forecasts_df, y_actual, submission_forecasts_df, submission_ids=None,
               xgb_params=None, do_cv=True, frac=1.0, sec_test_data=None):
    # Train several level-2 ensembles (random forest, Lasso, XGBoost) on the
    # blended forecasts, pick the one with the lowest RMSLE, and use it to
    # produce the submission forecast (and optionally a secondary test-set
    # forecast). NOTE: duplicate of the identically named function above.
    print "start avg models"
    start = time.time()
    forecasting_feilds = list(blend_forecasts_df)  # column names as feature names
    print "Using features", forecasting_feilds
    X_all = blend_forecasts_df.values
    sub_X_all = submission_forecasts_df.values
    # frac < 1 keeps only the top fraction of rows (faster experiments).
    if frac < 1:
        data_size = int(blend_forecasts_df.shape[0]*frac)
        X_all = X_all[:data_size, :]
        y_actual = y_actual[:data_size]
    #removing NaN and inf if there is any
    X_all = fillna_and_inf(X_all)
    y_actual_saved = y_actual  # original-scale targets, kept for evaluation
    target_as_log = True
    if target_as_log:
        X_all = transfrom_to_log2d(X_all)
        y_actual = transfrom_to_log(y_actual)
    #X_all = scipy.stats.mstats.zscore(X_all, axis=1)
    #we use 10% full data to train the ensamble and 30% for evalaution
    # NOTE(review): comment above disagrees with the 50/50 split used here.
    no_of_training_instances = int(round(len(y_actual)*0.5))
    X_train, X_test, y_train, y_test = train_test_split(no_of_training_instances, X_all, y_actual)
    y_actual_test = y_actual_saved[no_of_training_instances:]
    # Candidate ensembles collected as (rmsle, model, label).
    ensambles = []
    rfr = RandomForestRegressor(n_jobs=4, oob_score=True)
    rfr.fit(X_train, y_train)
    print_feature_importance(rfr.feature_importances_, forecasting_feilds)
    rfr_forecast = rfr.predict(X_test)
    rmsle = calculate_accuracy("rfr_forecast", y_actual_test, retransfrom_from_log(rfr_forecast))
    ensambles.append((rmsle, rfr, "rfr ensamble"))
    lr_model =linear_model.Lasso(alpha = 0.2)
    lr_model.fit(X_train, y_train)
    lr_forecast = lr_model.predict(X_test)
    rmsle = calculate_accuracy("lr_forecast", y_actual_test, retransfrom_from_log(lr_forecast))
    ensambles.append((rmsle, lr_model, "lr ensamble"))
    do_xgb = True
    if do_xgb:
        if xgb_params is None:
            xgb_params = {"objective": "reg:linear", "booster":"gbtree", "eta":0.1, "nthread":4 }
        if do_cv:
            model, y_pred = regression_with_xgboost(X_train, y_train, X_test, y_test, features=forecasting_feilds,
                                                    use_cv=True, use_sklean=False, xgb_params=xgb_params)
        else:
            model, y_pred = regression_with_xgboost_no_cv(X_train, y_train, X_test, y_test,
                                                          features=forecasting_feilds, xgb_params=xgb_params,num_rounds=200)
        xgb_forecast = model.predict(X_test)
        xgb_forecast_actual = retransfrom_from_log(xgb_forecast)
        rmsle = calculate_accuracy(str(xgb_params) + "[IDF]xgb_forecast", y_actual_test, xgb_forecast_actual)
        ensambles.append((rmsle, model, "xgboost ensamble"))
    # Pick the candidate with the smallest RMSLE.
    best_ensamble_index = np.argmin([t[0] for t in ensambles])
    best_ensamble = ensambles[best_ensamble_index][1]
    print "[IDF]Best Ensamble", ensambles[best_ensamble_index][2], ensambles[best_ensamble_index][0]
    if sub_X_all is not None:
        ensamble_forecast = best_ensamble.predict(transfrom_to_log2d(sub_X_all))
        ensamble_forecast = retransfrom_from_log(ensamble_forecast)
        #becouse forecast cannot be negative
        ensamble_forecast = np.where(ensamble_forecast < 0, 0, ensamble_forecast)
        to_save = np.column_stack((submission_ids, ensamble_forecast))
        to_saveDf = pd.DataFrame(to_save, columns=["id","Demanda_uni_equil"])
        to_saveDf = to_saveDf.fillna(0)
        to_saveDf["id"] = to_saveDf["id"].astype(int)
        submission_file = 'xgb_ensamble_submission_'+ str(time.time()) +'.csv'
        to_saveDf.to_csv(submission_file, index=False)
        print "Best Ensamble Submission Stats", submission_file
        print to_saveDf.describe()
    if sec_test_data is not None:
        sec_test_data = fillna_and_inf(sec_test_data)
        sec_y_forecast = best_ensamble.predict(sec_test_data)
        sec_y_forecast = retransfrom_from_log(sec_y_forecast)
        sec_y_forecast = np.where(sec_y_forecast < 0, 0, sec_y_forecast)
    else:
        sec_y_forecast = None
    print "avg_models took ", (time.time() - start), "s"
    # NOTE(review): xgb_forecast_actual / ensamble_forecast are unbound when
    # do_xgb is False / sub_X_all is None — this return would then raise.
    # With the current constants those paths are unreachable.
    return xgb_forecast_actual, y_actual_test, ensamble_forecast, sec_y_forecast
def blend_models(conf, forecasts, model_index_by_acc, y_actual, submissions_ids, submissions,
                 blend_data, blend_data_submission):
    # Blend base-model forecasts plus extra blend features with an XGBoost
    # model (scored and used for the submission), then additionally train a
    # small deep-learning model on a (sub)sample for comparison.
    # NOTE: duplicate of the identically named function earlier in this file.
    use_complex_features = True
    if use_complex_features:
        X_all, forecasting_feilds = generate_forecast_features(forecasts, model_index_by_acc)
    else:
        X_all,forecasting_feilds = forecasts, ["f"+str(f) for f in range(forecasts.shape[1])]
    X_all = np.column_stack([X_all, blend_data])
    forecasting_feilds = forecasting_feilds + get_blend_features()
    #removing NaN and inf if there is any
    y_actual_saved = y_actual  # original-scale targets, kept for evaluation
    if conf.target_as_log:
        X_all = transfrom_to_log2d(X_all)
        y_actual = transfrom_to_log(y_actual)
    X_all = fillna_and_inf(X_all)
    y_actual = fillna_and_inf(y_actual)
    #we use 10% full data to train the ensamble and 30% for evalaution
    # NOTE(review): comment above disagrees with the 50/50 split used here.
    no_of_training_instances = int(round(len(y_actual)*0.50))
    X_train, X_test, y_train, y_test = train_test_split(no_of_training_instances, X_all, y_actual)
    y_actual_test = y_actual_saved[no_of_training_instances:]
    '''
    rfr = RandomForestRegressor(n_jobs=4, oob_score=True)
    rfr.fit(X_train, y_train)
    print_feature_importance(rfr.feature_importances_, forecasting_feilds)
    rfr_forecast_as_log = rfr.predict(X_test)
    rfr_forecast = retransfrom_from_log(rfr_forecast_as_log)
    rmsle = calculate_accuracy("rfr_forecast", y_actual_test, rfr_forecast)

    lr_model =linear_model.Lasso(alpha = 0.1)
    lr_model.fit(X_train, y_train)
    lr_forecast = lr_model.predict(X_test)
    lr_forcast_revered = retransfrom_from_log(lr_forecast)
    calculate_accuracy("vote__lr_forecast " + str(conf.command), y_actual_test, lr_forcast_revered)
    '''
    xgb_params = {"objective": "reg:linear", "booster":"gbtree", "eta":0.1, "nthread":4, 'min_child_weight':5}
    model, y_pred = regression_with_xgboost(X_train, y_train, X_test, y_test, features=forecasting_feilds,
                                            use_cv=True, use_sklean=False, xgb_params=xgb_params)
    #model, y_pred = regression_with_xgboost_no_cv(X_train, y_train, X_test, y_test, features=forecasting_feilds,
    #    xgb_params=xgb_params,num_rounds=100)
    xgb_forecast = model.predict(X_test)
    xgb_forecast = retransfrom_from_log(xgb_forecast)
    calculate_accuracy("xgb_forecast", y_actual_test, xgb_forecast)
    # Build submission features the same way and forecast with the XGB model.
    if submissions_ids is not None and submissions is not None:
        if use_complex_features:
            submissions, _ = generate_forecast_features(submissions, model_index_by_acc)
        submissions = np.column_stack([submissions, blend_data_submission])
        # Replace NaN with 0 and +/-inf with 10000 before predicting.
        submissions = np.where(np.isnan(submissions), 0, np.where(np.isinf(submissions), 10000, submissions))
        rfr_ensamble_forecasts = model.predict(submissions)
        if conf.target_as_log:
            rfr_ensamble_forecasts = retransfrom_from_log(rfr_ensamble_forecasts)
        save_submission_file("rfr_blend_submission.csv", submissions_ids, rfr_ensamble_forecasts)
    else:
        print "submissions not found"
    #we randomly select 5 million values
    x_size = X_train.shape[0]
    sample_indexes = np.random.randint(0, X_train.shape[0], min(5000000, x_size))
    X_train = X_train[sample_indexes]
    y_train = y_train[sample_indexes]
    dlconf = MLConfigs(nodes_in_layer=10, number_of_hidden_layers=2, dropout=0.3, activation_fn='relu', loss="mse",
                       epoch_count=4, optimizer=Adam(lr=0.0001), regularization=0.2)
    # Normalize targets and features to zero-mean/unit-variance for the DL model.
    y_train, parmsFromNormalization = preprocess1DtoZeroMeanUnit(y_train)
    y_test = apply_zeroMeanUnit(y_test, parmsFromNormalization)
    X_train, parmsFromNormalization2D = preprocess2DtoZeroMeanUnit(X_train)
    X_test = apply_zeroMeanUnit2D(X_test, parmsFromNormalization2D)
    model, y_forecast = regression_with_dl(X_train, y_train, X_test, y_test, dlconf)
    y_forecast = undoPreprocessing(y_forecast, parmsFromNormalization)
    y_forecast = retransfrom_from_log(y_forecast)
    rmsle = calculate_accuracy("ml_forecast", y_actual_test, y_forecast)