#X_all = np.column_stack((cycle_data, zscore_vals, entropy_vals, mavg1_vals, mavg2_vals, mavg4_vals, mavg8_vals, mavg16_vals))
X_all, Y_all = shuffle_data(X_all, Y_all)
X_train, X_test, y_train, y_test = train_test_split(training_set_size, X_all, Y_all)

print ("normalizing_factor", normalizing_factor)
#run_timeseries_froecasts(X_train, y_train, X_test, y_test, window_size, epoch_count=10, parmsFromNormalization=parmsFromNormalization)

configs = [
    #lr=0.01
    MLConfigs(nodes_in_layer=20, number_of_hidden_layers=3, dropout=0, activation_fn='relu', loss="mse",
              epoch_count=200, optimizer=Adam(lr=0.001)),
    #MLConfigs(nodes_in_layer=20, number_of_hidden_layers=3, dropout=0, activation_fn='relu', loss="mse",
    #          epoch_count=200, optimizer=Adam(lr=0.001), regularization=0.005),
]
#configs = create_rondomsearch_configs4DL((1,2,3), (5,10,15,20), (0, 0.1, 0.2, 0.4),
#                                         (0, 0.01, 0.001), (0.01, 0.001, 0.0001), 50)

for index, c in enumerate(configs):
    c.epoch_count = 200
    #c.nodes_in_layer = c.nodes_in_layer/(1-c.dropout)
    y_pred_dl = regression_with_dl(X_train, y_train, X_test, y_test, c)
    print ">> %d %s" % (index, str(c.tostr()))
    print_regression_model_summary("DL", y_test, y_pred_dl, parmsFromNormalization)
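# shuffle_data and the count-based train_test_split used above are defined elsewhere in
# this project. A minimal sketch of the assumed behavior (shuffle_data permutes rows, and
# train_test_split takes the number of training rows as its first argument) could look
# like the following. This is an illustrative assumption, not the project's implementation.
import numpy as np

def shuffle_data(X, Y):
    # apply the same random permutation to features and targets
    permutation = np.random.permutation(len(Y))
    return X[permutation], Y[permutation]

def train_test_split(no_of_training_instances, X, Y):
    # first no_of_training_instances rows become the training set, the rest the test set
    return (X[:no_of_training_instances], X[no_of_training_instances:],
            Y[:no_of_training_instances], Y[no_of_training_instances:])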
def blend_models(conf, forecasts, model_index_by_acc, y_actual, submissions_ids, submissions,
                 blend_data, blend_data_submission):
    use_complex_features = True
    if use_complex_features:
        X_all, forecasting_feilds = generate_forecast_features(forecasts, model_index_by_acc)
    else:
        X_all, forecasting_feilds = forecasts, ["f" + str(f) for f in range(forecasts.shape[1])]

    X_all = np.column_stack([X_all, blend_data])
    forecasting_feilds = forecasting_feilds + get_blend_features()

    # move to log space if configured, then remove any NaN and inf values
    y_actual_saved = y_actual
    if conf.target_as_log:
        X_all = transfrom_to_log2d(X_all)
        y_actual = transfrom_to_log(y_actual)
        X_all = fillna_and_inf(X_all)
        y_actual = fillna_and_inf(y_actual)

    # use 50% of the data to train the ensemble and the remaining 50% for evaluation
    no_of_training_instances = int(round(len(y_actual) * 0.50))
    X_train, X_test, y_train, y_test = train_test_split(no_of_training_instances, X_all, y_actual)
    y_actual_test = y_actual_saved[no_of_training_instances:]

    '''
    rfr = RandomForestRegressor(n_jobs=4, oob_score=True)
    rfr.fit(X_train, y_train)
    print_feature_importance(rfr.feature_importances_, forecasting_feilds)
    rfr_forecast_as_log = rfr.predict(X_test)
    rfr_forecast = retransfrom_from_log(rfr_forecast_as_log)
    rmsle = calculate_accuracy("rfr_forecast", y_actual_test, rfr_forecast)

    lr_model = linear_model.Lasso(alpha=0.1)
    lr_model.fit(X_train, y_train)
    lr_forecast = lr_model.predict(X_test)
    lr_forcast_revered = retransfrom_from_log(lr_forecast)
    calculate_accuracy("vote__lr_forecast " + str(conf.command), y_actual_test, lr_forcast_revered)
    '''

    # XGBoost ensemble over the base-model forecasts plus blend features
    xgb_params = {"objective": "reg:linear", "booster": "gbtree", "eta": 0.1, "nthread": 4,
                  'min_child_weight': 5}
    model, y_pred = regression_with_xgboost(X_train, y_train, X_test, y_test, features=forecasting_feilds,
                                            use_cv=True, use_sklean=False, xgb_params=xgb_params)
    #model, y_pred = regression_with_xgboost_no_cv(X_train, y_train, X_test, y_test, features=forecasting_feilds,
    #                                              xgb_params=xgb_params, num_rounds=100)
    xgb_forecast = model.predict(X_test)
    xgb_forecast = retransfrom_from_log(xgb_forecast)
    calculate_accuracy("xgb_forecast", y_actual_test, xgb_forecast)

    if submissions_ids is not None and submissions is not None:
        if use_complex_features:
            submissions, _ = generate_forecast_features(submissions, model_index_by_acc)
        submissions = np.column_stack([submissions, blend_data_submission])
        submissions = np.where(np.isnan(submissions), 0, np.where(np.isinf(submissions), 10000, submissions))
        rfr_ensamble_forecasts = model.predict(submissions)
        if conf.target_as_log:
            rfr_ensamble_forecasts = retransfrom_from_log(rfr_ensamble_forecasts)
        save_submission_file("rfr_blend_submission.csv", submissions_ids, rfr_ensamble_forecasts)
    else:
        print "submissions not found"

    # randomly sample up to 5 million training instances for the DL ensemble
    x_size = X_train.shape[0]
    sample_indexes = np.random.randint(0, X_train.shape[0], min(5000000, x_size))
    X_train = X_train[sample_indexes]
    y_train = y_train[sample_indexes]

    dlconf = MLConfigs(nodes_in_layer=10, number_of_hidden_layers=2, dropout=0.3, activation_fn='relu',
                       loss="mse", epoch_count=4, optimizer=Adam(lr=0.0001), regularization=0.2)
    y_train, parmsFromNormalization = preprocess1DtoZeroMeanUnit(y_train)
    y_test = apply_zeroMeanUnit(y_test, parmsFromNormalization)
    X_train, parmsFromNormalization2D = preprocess2DtoZeroMeanUnit(X_train)
    X_test = apply_zeroMeanUnit2D(X_test, parmsFromNormalization2D)

    model, y_forecast = regression_with_dl(X_train, y_train, X_test, y_test, dlconf)
    y_forecast = undoPreprocessing(y_forecast, parmsFromNormalization)
    y_forecast = retransfrom_from_log(y_forecast)
    rmsle = calculate_accuracy("ml_forecast", y_actual_test, y_forecast)
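# blend_models relies on transfrom_to_log / retransfrom_from_log and fillna_and_inf from
# the surrounding project. A minimal sketch of the assumed semantics (log1p/expm1 with
# NaN/inf replacement, consistent with the RMSLE-style evaluation and the inline
# submission clean-up above), for illustration only:
import numpy as np

def transfrom_to_log(values):
    # log-space target, matching the log/retransform pattern used above
    return np.log1p(values)

def retransfrom_from_log(log_values):
    # inverse of transfrom_to_log
    return np.expm1(log_values)

def fillna_and_inf(values, inf_replacement=10000):
    # replace NaN with 0 and +/-inf with a large finite value, as done for the submission rows
    values = np.where(np.isnan(values), 0, values)
    return np.where(np.isinf(values), inf_replacement, values)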
# y_pred_dl = regression_with_dl(X_train, y_train, X_test, y_test, c)
# print_regression_model_summary("DL" + str(c.tostr()), y_test, y_pred_dl)

size = 100000
x = np.random.zipf(2, size)
y = np.random.normal(10, 1, size)
xy = [x[i] * y[i] for i in range(size)]
xbyy = [x[i] / y[i] if y[i] != 0 else 1 for i in range(size)]
target = [2 * (2 * x[i] + y[i]) / 3 * y[i] for i in range(size)]

train_set_size = int(size * 0.7)
X_all, Y_all = normlaize_data(np.column_stack((x, y, xy, xbyy)), np.array(target))
X_all, Y_all = shuffle_data(X_all, Y_all)
X_train, X_test, y_train, y_test = train_test_split(train_set_size, X_all, Y_all)

c = MLConfigs(nodes_in_layer=4, number_of_hidden_layers=3, dropout=0, activation_fn='relu', loss="mse",
              epoch_count=15, optimizer=Adam())
y_pred_dl = regression_with_dl(X_train, y_train, X_test, y_test, c)
print_regression_model_summary("DL" + str(c.tostr()), y_test, y_pred_dl)
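# normlaize_data is also defined elsewhere in the project. A plausible sketch, assuming it
# scales each feature column and the target to zero mean and unit variance; this is an
# illustrative assumption, not the project's actual normalization:
import numpy as np

def normlaize_data(X, Y):
    # column-wise standardization of features and standardization of the target
    X_scaled = (X - X.mean(axis=0)) / (X.std(axis=0) + 1e-8)
    Y_scaled = (Y - Y.mean()) / (Y.std() + 1e-8)
    return X_scaled, Y_scaled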