#X_all = np.column_stack((cycle_data, zscore_vals, entropy_vals, mavg1_vals, mavg2_vals, mavg4_vals, mavg8_vals, mavg16_vals))

X_all, Y_all = shuffle_data(X_all, Y_all)

X_train, X_test, y_train, y_test = train_test_split(training_set_size, X_all, Y_all)

print ("normalizing_factor", normalizing_factor)
#run_timeseries_froecasts(X_train, y_train, X_test, y_test, window_size, epoch_count=10, parmsFromNormalization=parmsFromNormalization)


configs = [
    #lr=0.01
    MLConfigs(nodes_in_layer=20, number_of_hidden_layers=3, dropout=0, activation_fn='relu', loss="mse",
              epoch_count=200, optimizer=Adam(lr=0.001)),
    #MLConfigs(nodes_in_layer=20, number_of_hidden_layers=3, dropout=0, activation_fn='relu', loss="mse",
    #          epoch_count=200, optimizer=Adam(lr=0.001), regularization=0.005),
    ]

#configs = create_rondomsearch_configs4DL((1,2,3), (5,10,15,20), (0, 0.1, 0.2, 0.4),
#                                        (0, 0.01, 0.001), (0.01, 0.001, 0.0001), 50)

index = 0
for c in configs:
    c.epoch_count = 200
    #c.nodes_in_layer = c.nodes_in_layer/(1-c.dropout)
    y_pred_dl = regression_with_dl(X_train, y_train, X_test, y_test, c)
    print ">> %d %s" %(index, str(c.tostr()))
    print_regression_model_summary("DL", y_test, y_pred_dl, parmsFromNormalization)
    index = index + 1

# Example #2 (score: 0)
def blend_models(conf, forecasts, model_index_by_acc, y_actual,
                 submissions_ids, submissions, blend_data,
                 blend_data_submission):
    """Train ensemble ("blend") models on top of base-model forecasts.

    Builds a feature matrix from the base forecasts (optionally enriched via
    generate_forecast_features) plus extra blend features, fits an XGBoost
    regressor and a small deep-learning model, prints accuracy for each, and
    - when submission inputs are given - writes "rfr_blend_submission.csv".

    Args:
        conf: run configuration; only conf.target_as_log is read here.
        forecasts: 2-D array, one column of predictions per base model.
        model_index_by_acc: model ordering consumed by
            generate_forecast_features.
        y_actual: ground-truth target aligned with the rows of forecasts.
        submissions_ids: row ids for the submission set, or None to skip
            writing a submission file.
        submissions: base-model predictions for the submission set, or None.
        blend_data: extra feature columns stacked onto the training matrix.
        blend_data_submission: extra feature columns for the submission set.

    Returns:
        None. Results are printed and optionally saved to disk.
    """
    use_complex_features = True
    if use_complex_features:
        X_all, forecasting_feilds = generate_forecast_features(
            forecasts, model_index_by_acc)
    else:
        # Raw forecasts with synthetic feature names f0..f(N-1).
        X_all, forecasting_feilds = forecasts, [
            "f" + str(f) for f in range(forecasts.shape[1])
        ]

    X_all = np.column_stack([X_all, blend_data])
    forecasting_feilds = forecasting_feilds + get_blend_features()

    #removing NaN and inf if there is any
    y_actual_saved = y_actual  # untransformed copy, used for accuracy checks
    if conf.target_as_log:
        X_all = transfrom_to_log2d(X_all)
        y_actual = transfrom_to_log(y_actual)

    X_all = fillna_and_inf(X_all)
    y_actual = fillna_and_inf(y_actual)

    # Train on the first 50% of rows, evaluate on the remainder.
    # NOTE(review): an earlier comment claimed a 10%/30% split; the code
    # actually uses 0.50.
    no_of_training_instances = int(round(len(y_actual) * 0.50))
    X_train, X_test, y_train, y_test = train_test_split(
        no_of_training_instances, X_all, y_actual)
    y_actual_test = y_actual_saved[no_of_training_instances:]
    '''
    rfr = RandomForestRegressor(n_jobs=4, oob_score=True)
    rfr.fit(X_train, y_train)
    print_feature_importance(rfr.feature_importances_, forecasting_feilds)
    rfr_forecast_as_log = rfr.predict(X_test)
    rfr_forecast = retransfrom_from_log(rfr_forecast_as_log)
    rmsle = calculate_accuracy("rfr_forecast", y_actual_test, rfr_forecast)


    lr_model =linear_model.Lasso(alpha = 0.1)
    lr_model.fit(X_train, y_train)
    lr_forecast = lr_model.predict(X_test)
    lr_forcast_revered = retransfrom_from_log(lr_forecast)
    calculate_accuracy("vote__lr_forecast " + str(conf.command), y_actual_test, lr_forcast_revered)
    '''

    xgb_params = {
        "objective": "reg:linear",
        "booster": "gbtree",
        "eta": 0.1,
        "nthread": 4,
        'min_child_weight': 5
    }
    model, y_pred = regression_with_xgboost(X_train,
                                            y_train,
                                            X_test,
                                            y_test,
                                            features=forecasting_feilds,
                                            use_cv=True,
                                            use_sklean=False,
                                            xgb_params=xgb_params)
    #model, y_pred = regression_with_xgboost_no_cv(X_train, y_train, X_test, y_test, features=forecasting_feilds,
    #                                                  xgb_params=xgb_params,num_rounds=100)
    # Score the XGBoost blend on the held-out half.
    # NOTE(review): retransfrom_from_log is applied even when
    # conf.target_as_log is False - confirm that is intended.
    xgb_forecast = model.predict(X_test)
    xgb_forecast = retransfrom_from_log(xgb_forecast)
    calculate_accuracy("xgb_forecast", y_actual_test, xgb_forecast)

    if submissions_ids is not None and submissions is not None:
        if use_complex_features:
            submissions, _ = generate_forecast_features(
                submissions, model_index_by_acc)
        submissions = np.column_stack([submissions, blend_data_submission])
        # Sanitize submission features: NaN -> 0, +/-inf -> 10000.
        submissions = np.where(
            np.isnan(submissions), 0,
            np.where(np.isinf(submissions), 10000, submissions))
        rfr_ensamble_forecasts = model.predict(submissions)
        if conf.target_as_log:
            rfr_ensamble_forecasts = retransfrom_from_log(
                rfr_ensamble_forecasts)
        save_submission_file("rfr_blend_submission.csv", submissions_ids,
                             rfr_ensamble_forecasts)
    else:
        print "submissions not found"

    #we randomly select 5 million values
    # Sampling with replacement; caps the DL training set at 5M rows.
    x_size = X_train.shape[0]
    sample_indexes = np.random.randint(0, X_train.shape[0],
                                       min(5000000, x_size))
    X_train = X_train[sample_indexes]
    y_train = y_train[sample_indexes]

    dlconf = MLConfigs(nodes_in_layer=10,
                       number_of_hidden_layers=2,
                       dropout=0.3,
                       activation_fn='relu',
                       loss="mse",
                       epoch_count=4,
                       optimizer=Adam(lr=0.0001),
                       regularization=0.2)
    # Normalize target and features to zero mean / unit variance for the DL
    # model; the same fitted parameters are applied to the test split.
    y_train, parmsFromNormalization = preprocess1DtoZeroMeanUnit(y_train)
    y_test = apply_zeroMeanUnit(y_test, parmsFromNormalization)
    X_train, parmsFromNormalization2D = preprocess2DtoZeroMeanUnit(X_train)
    X_test = apply_zeroMeanUnit2D(X_test, parmsFromNormalization2D)

    model, y_forecast = regression_with_dl(X_train, y_train, X_test, y_test,
                                           dlconf)

    # Undo normalization and the log transform before scoring.
    y_forecast = undoPreprocessing(y_forecast, parmsFromNormalization)
    y_forecast = retransfrom_from_log(y_forecast)
    rmsle = calculate_accuracy("ml_forecast", y_actual_test, y_forecast)
# Example #3 (score: 0)
# Synthetic benchmark: generate Zipf/normal inputs, derive product and ratio
# features, and fit a small DL regressor to target = 2*(2x + y)/3 * y.
size = 100000
x = np.random.zipf(2, size)
y = np.random.normal(10, 1, size)

# Element-wise product and (zero-safe) ratio features.
xy = [a * b for a, b in zip(x, y)]
xbyy = [a / b if b != 0 else 1 for a, b in zip(x, y)]

target = [2 * (2 * a + b) / 3 * b for a, b in zip(x, y)]

# 70/30 train/test split.
train_set_size = int(size * 0.7)

X_all, Y_all = normlaize_data(np.column_stack((x, y, xy, xbyy)),
                              np.array(target))
X_all, Y_all = shuffle_data(X_all, Y_all)

X_train, X_test, y_train, y_test = train_test_split(train_set_size, X_all,
                                                    Y_all)

c = MLConfigs(nodes_in_layer=4, number_of_hidden_layers=3, dropout=0,
              activation_fn='relu', loss="mse", epoch_count=15,
              optimizer=Adam())
y_pred_dl = regression_with_dl(X_train, y_train, X_test, y_test, c)
print_regression_model_summary("DL" + str(c.tostr()), y_test, y_pred_dl)
# Example #4 (score: 0)
def blend_models(conf, forecasts, model_index_by_acc, y_actual, submissions_ids, submissions,
                 blend_data, blend_data_submission):
    """Train ensemble ("blend") models on top of base-model forecasts.

    Builds a feature matrix from the base forecasts (optionally enriched via
    generate_forecast_features) plus extra blend features, fits an XGBoost
    regressor and a small deep-learning model, prints accuracy for each, and
    - when submission inputs are given - writes "rfr_blend_submission.csv".

    Args:
        conf: run configuration; only conf.target_as_log is read here.
        forecasts: 2-D array, one column of predictions per base model.
        model_index_by_acc: model ordering consumed by
            generate_forecast_features.
        y_actual: ground-truth target aligned with the rows of forecasts.
        submissions_ids: row ids for the submission set, or None to skip
            writing a submission file.
        submissions: base-model predictions for the submission set, or None.
        blend_data: extra feature columns stacked onto the training matrix.
        blend_data_submission: extra feature columns for the submission set.

    Returns:
        None. Results are printed and optionally saved to disk.
    """
    use_complex_features = True
    if use_complex_features:
        X_all, forecasting_feilds = generate_forecast_features(forecasts, model_index_by_acc)
    else:
        # Raw forecasts with synthetic feature names f0..f(N-1).
        X_all,forecasting_feilds = forecasts, ["f"+str(f) for f in range(forecasts.shape[1])]

    X_all = np.column_stack([X_all, blend_data])
    forecasting_feilds = forecasting_feilds + get_blend_features()

    #removing NaN and inf if there is any
    y_actual_saved = y_actual  # untransformed copy, used for accuracy checks
    if conf.target_as_log:
        X_all = transfrom_to_log2d(X_all)
        y_actual = transfrom_to_log(y_actual)

    X_all = fillna_and_inf(X_all)
    y_actual = fillna_and_inf(y_actual)

    # Train on the first 50% of rows, evaluate on the remainder.
    # NOTE(review): an earlier comment claimed a 10%/30% split; the code
    # actually uses 0.50.
    no_of_training_instances = int(round(len(y_actual)*0.50))
    X_train, X_test, y_train, y_test = train_test_split(no_of_training_instances, X_all, y_actual)
    y_actual_test = y_actual_saved[no_of_training_instances:]

    '''
    rfr = RandomForestRegressor(n_jobs=4, oob_score=True)
    rfr.fit(X_train, y_train)
    print_feature_importance(rfr.feature_importances_, forecasting_feilds)
    rfr_forecast_as_log = rfr.predict(X_test)
    rfr_forecast = retransfrom_from_log(rfr_forecast_as_log)
    rmsle = calculate_accuracy("rfr_forecast", y_actual_test, rfr_forecast)


    lr_model =linear_model.Lasso(alpha = 0.1)
    lr_model.fit(X_train, y_train)
    lr_forecast = lr_model.predict(X_test)
    lr_forcast_revered = retransfrom_from_log(lr_forecast)
    calculate_accuracy("vote__lr_forecast " + str(conf.command), y_actual_test, lr_forcast_revered)
    '''




    xgb_params = {"objective": "reg:linear", "booster":"gbtree", "eta":0.1, "nthread":4, 'min_child_weight':5}
    model, y_pred = regression_with_xgboost(X_train, y_train, X_test, y_test, features=forecasting_feilds, use_cv=True,
                            use_sklean=False, xgb_params=xgb_params)
    #model, y_pred = regression_with_xgboost_no_cv(X_train, y_train, X_test, y_test, features=forecasting_feilds,
    #                                                  xgb_params=xgb_params,num_rounds=100)
    # Score the XGBoost blend on the held-out half.
    # NOTE(review): retransfrom_from_log is applied even when
    # conf.target_as_log is False - confirm that is intended.
    xgb_forecast = model.predict(X_test)
    xgb_forecast = retransfrom_from_log(xgb_forecast)
    calculate_accuracy("xgb_forecast", y_actual_test, xgb_forecast)

    if submissions_ids is not None and submissions is not None:
        if use_complex_features:
            submissions, _ = generate_forecast_features(submissions, model_index_by_acc)
        submissions = np.column_stack([submissions, blend_data_submission])
        # Sanitize submission features: NaN -> 0, +/-inf -> 10000.
        submissions = np.where(np.isnan(submissions), 0, np.where(np.isinf(submissions), 10000, submissions))
        rfr_ensamble_forecasts = model.predict(submissions)
        if conf.target_as_log:
            rfr_ensamble_forecasts = retransfrom_from_log(rfr_ensamble_forecasts)
        save_submission_file("rfr_blend_submission.csv", submissions_ids, rfr_ensamble_forecasts)
    else:
        print "submissions not found"

    #we randomly select 5 million values
    # Sampling with replacement; caps the DL training set at 5M rows.
    x_size = X_train.shape[0]
    sample_indexes = np.random.randint(0, X_train.shape[0], min(5000000, x_size))
    X_train = X_train[sample_indexes]
    y_train = y_train[sample_indexes]

    dlconf = MLConfigs(nodes_in_layer=10, number_of_hidden_layers=2, dropout=0.3, activation_fn='relu', loss="mse",
                epoch_count=4, optimizer=Adam(lr=0.0001), regularization=0.2)
    # Normalize target and features to zero mean / unit variance for the DL
    # model; the same fitted parameters are applied to the test split.
    y_train, parmsFromNormalization = preprocess1DtoZeroMeanUnit(y_train)
    y_test = apply_zeroMeanUnit(y_test, parmsFromNormalization)
    X_train, parmsFromNormalization2D = preprocess2DtoZeroMeanUnit(X_train)
    X_test = apply_zeroMeanUnit2D(X_test, parmsFromNormalization2D)

    model, y_forecast = regression_with_dl(X_train, y_train, X_test, y_test, dlconf)

    # Undo normalization and the log transform before scoring.
    y_forecast = undoPreprocessing(y_forecast, parmsFromNormalization)
    y_forecast = retransfrom_from_log(y_forecast)
    rmsle = calculate_accuracy("ml_forecast", y_actual_test, y_forecast)