def best_pair_forecast(conf, forecasts_data, y_actual, submission_data,
                       submissions_ids, feild_names):
    """Run two rounds of pair selection and save the averaged submission.

    Round one passes ``int(rows * 0.3)`` as the training size to
    ``train_test_split`` and asks ``find_n_best_pairs`` for 10 pairs.  Round
    two repeats this on the round-one test forecasts with a 0.5 factor and
    2 pairs.  The row-wise mean of the final test forecasts is scored with
    ``calculate_accuracy`` and the row-wise mean of the final submission
    forecasts is written to ``best_pair_submission.csv``.

    ``conf`` is accepted but not used by this function.
    """
    # (training fraction, number of pairs requested) for each round.
    rounds = ((0.3, 10), (0.5, 2))

    round_forecasts = forecasts_data
    round_targets = y_actual
    round_submissions = submission_data
    for train_fraction, pairs_wanted in rounds:
        split_at = int(round_forecasts.shape[0] * train_fraction)
        fit_X, eval_X, fit_y, eval_y = train_test_split(
            split_at, round_forecasts, round_targets)
        round_forecasts, round_submissions = find_n_best_pairs(
            fit_X,
            fit_y,
            eval_X,
            eval_y,
            round_submissions,
            feild_names,
            pair_count=pairs_wanted)
        # The next round (and the final scoring) works on this round's
        # held-out targets only.
        round_targets = eval_y

    calculate_accuracy("final_pair_forecast", round_targets,
                       np.mean(round_forecasts, axis=1))

    save_submission_file("best_pair_submission.csv", submissions_ids,
                         np.mean(round_submissions, axis=1))
    sys.stdout.flush()
def best_pair_forecast(conf, forecasts_data, y_actual, submission_data, submissions_ids, feild_names):
    """Select forecast pairs in two rounds, score the result and save a submission.

    ``conf`` is part of the signature but is not read here.
    """
    # Round 1: training size is 30% of the rows; request the 10 best pairs.
    r1_train_rows = int(forecasts_data.shape[0] * 0.3)
    r1_X_fit, r1_X_eval, r1_y_fit, r1_y_eval = train_test_split(
        r1_train_rows, forecasts_data, y_actual)
    r1_eval_forecasts, r1_submissions = find_n_best_pairs(
        r1_X_fit, r1_y_fit, r1_X_eval, r1_y_eval,
        submission_data, feild_names, pair_count=10)

    # Round 2: split the round-1 evaluation forecasts in half; request 2 pairs.
    r2_train_rows = int(r1_eval_forecasts.shape[0] * 0.5)
    r2_X_fit, r2_X_eval, r2_y_fit, r2_y_eval = train_test_split(
        r2_train_rows, r1_eval_forecasts, r1_y_eval)
    r2_eval_forecasts, r2_submissions = find_n_best_pairs(
        r2_X_fit, r2_y_fit, r2_X_eval, r2_y_eval,
        r1_submissions, feild_names, pair_count=2)

    # Score the row-wise average of the surviving pair forecasts.
    averaged_eval = np.mean(r2_eval_forecasts, axis=1)
    calculate_accuracy("final_pair_forecast", r2_y_eval, averaged_eval)

    # Same averaging for the submission rows, then persist.
    averaged_submission = np.mean(r2_submissions, axis=1)
    save_submission_file("best_pair_submission.csv", submissions_ids, averaged_submission)
    sys.stdout.flush()
Exemple #3
0
def run_regression_ensamble(models, y_test, parmsFromNormalization):
    """Fit a linear-regression ensemble on top of the individual model outputs.

    Args:
        models: per-model forecasts used as the ensemble's input features.
        y_test: targets matching the rows of ``models``.
        parmsFromNormalization: passed through unchanged to
            ``regression_with_LR``.

    Returns:
        Whatever ``regression_with_LR`` returns.  The original code computed
        this value and then discarded it, so callers had no way to use the
        ensemble result.
    """
    # 70% of the available rows are handed to train_test_split as the
    # training-set size.
    training_set_size = int(len(y_test) * .7)
    X_train, X_test, y_train, y_holdout = train_test_split(
        training_set_size, models, y_test)
    print("results for combined Models")
    return regression_with_LR(X_train, y_train, X_test, y_holdout,
                              parmsFromNormalization)
def vote_with_lr(conf, forecasts, best_model_index, y_actual):
    """Train a Lasso model on agreement features between the model forecasts.

    For every row the features are: the smallest ``cal_rmsle`` between the
    best model's forecast and any other forecast, the harmonic mean of the
    pairwise ``cal_rmsle`` among the other forecasts, and the harmonic mean,
    median and standard deviation of the other forecasts.  The best model's
    forecast is appended as a final column.

    Args:
        conf: configuration object; ``target_as_log`` and ``command`` are read.
        forecasts: 2-D array, one column per model.
        best_model_index: column index of the best model in ``forecasts``.
        y_actual: actual target values, one per row of ``forecasts``.

    Returns:
        The Lasso forecast for the test rows, on the original target scale.
    """
    start = time.time()
    best_forecast = forecasts[:, best_model_index]
    forecasts = np.sort(np.delete(forecasts, best_model_index, axis=1), axis=1)
    # Non-positive forecasts are replaced with 0.1 before the harmonic mean.
    forecasts = np.where(forecasts <= 0, 0.1, forecasts)

    data_train = []
    for best_value, f_row in zip(best_forecast, forecasts):
        min_diff_to_best = np.min([cal_rmsle(best_value, f) for f in f_row])
        pair_errors = [cal_rmsle(x, y)
                       for (x, y) in itertools.combinations(f_row, 2)]
        data_train.append([
            min_diff_to_best,
            scipy.stats.hmean(pair_errors),
            scipy.stats.hmean(f_row),
            np.median(f_row),
            np.std(f_row),
        ])

    # np.vstack is the non-deprecated name for np.row_stack.
    X_all = np.column_stack([np.vstack(data_train), best_forecast])

    # Keep the untransformed targets: accuracy is measured on the original
    # scale.  Previously the evaluation slice was taken after the log
    # transform, so log-space actuals were compared with back-transformed
    # forecasts.
    y_actual_saved = y_actual
    if conf.target_as_log:
        y_actual = transfrom_to_log(y_actual)

    # 25% of the rows are given to train_test_split as the training size; the
    # slice below assumes it takes the leading rows for training.
    no_of_training_instances = int(round(len(y_actual) * 0.25))
    X_train, X_test, y_train, y_test = train_test_split(
        no_of_training_instances, X_all, y_actual)
    y_actual_test = y_actual_saved[no_of_training_instances:]

    lr_model = linear_model.Lasso(alpha=0.2)
    lr_model.fit(X_train, y_train)
    lr_forecast = lr_model.predict(X_test)
    # Only undo the log transform when it was applied to the targets.
    if conf.target_as_log:
        lr_forcast_revered = retransfrom_from_log(lr_forecast)
    else:
        lr_forcast_revered = lr_forecast
    calculate_accuracy("vote__lr_forecast " + str(conf.command), y_actual_test,
                       lr_forcast_revered)
    print_time_took(start, "vote_with_lr")
    return lr_forcast_revered
def predict_using_veriation(forecasts_data, best_forecast, y_actual, frac = 1.0):
    size_to_keep = int(forecasts_data.shape[0]*frac)
    forecasts_data = forecasts_data[:size_to_keep]
    y_actual = y_actual[:size_to_keep]

    forecasts_data = transfrom_to_log2d(forecasts_data)
    forecasts_stdev = np.std(forecasts_data, axis=1)
    forecasts_mean = np.mean(forecasts_data, axis=1)
    #forecasts_robust_mean = forecasts_data[abs(forecasts_data - forecasts_mean.reshape((-1,1))) < 2 * forecasts_stdev.reshape((-1,1))]
    forecasts_hmean = fillna_and_inf(scipy.stats.hmean(np.where(forecasts_data <= 0, 0.1, forecasts_data), axis=1))
    forecasts_median = np.median(forecasts_data, axis=1)
    min_diff_to_best = np.min(np.abs(forecasts_data - best_forecast.reshape((-1,1))), axis=1)
    diff_best_to_mean = np.abs(best_forecast - forecasts_mean)



    print "forecasts_stdev", basic_stats_as_str(forecasts_stdev)
    print "forecasts_mean", basic_stats_as_str(forecasts_mean)
    print "diff_best_to_mean", basic_stats_as_str(diff_best_to_mean)
    print "min_diff_to_best", basic_stats_as_str(min_diff_to_best)

    #forecasts_stdev >>count=2.967219e+07,mean=2.072201e-01,std=1.256776e-01,min=3.159570e-03,50%=1.758286e-01,25%=1.243373e-01,50%=1.758286e-01,75%=2.542546e-01,95%=4.540049e-01,max=2.138248e+00,dtype:=float64
    #forecasts_mean >>count=2.967219e+07,mean=1.594980e+00,std=6.205789e-01,min=8.732127e-02,50%=1.462738e+00,25%=1.151854e+00,50%=1.462738e+00,75%=1.893289e+00,95%=2.765280e+00,max=7.470179e+00,dtype:=float64
    #diff_best_to_mean >>count=2.967219e+07,mean=4.604835e+00,std=1.701537e+01,min=2.758960e-07,50%=1.744317e+00,25%=8.145370e-01,50%=1.744317e+00,75%=3.950076e+00,95%=1.473034e+01,max=3.871046e+03,dtype:=float64
    #min_diff_to_best >>count=2.967219e+07,mean=4.284890e+00,std=1.698368e+01,min=1.200819e-12,50%=1.479762e+00,25%=4.711246e-01,50%=1.479762e+00,75%=3.638548e+00,95%=1.426816e+01,max=3.870050e+03,dtype:=float64

    final_forecast = np.zeros(size_to_keep)
    for i in range(size_to_keep):
        if min_diff_to_best[i] < 0.2 or diff_best_to_mean[i] < 0.3:
            final_forecast[i] = best_forecast[i]
        elif forecasts_stdev[i] < 0.3:
            final_forecast[i] = forecasts_mean[i]
        else:
            final_forecast[i] = (forecasts_median[i] + best_forecast[i])/2

    calculate_accuracy("predict_using_veriation", y_actual, final_forecast)

    X_all = np.column_stack([best_forecast, forecasts_mean, forecasts_hmean, forecasts_stdev, min_diff_to_best, diff_best_to_mean, forecasts_median])
    y_actual = transfrom_to_log(y_actual)
    no_of_training_instances = int(round(len(y_actual)*0.50))
    X_train, X_test, y_train, y_test = train_test_split(no_of_training_instances, X_all, y_actual)
    lr_model =linear_model.Lasso(alpha = 0.2)
    lr_model.fit(X_train, y_train)
    lr_forecast = lr_model.predict(X_test)
    lr_forecast = retransfrom_from_log(lr_forecast)
    calculate_accuracy("predict_using_veriation_lr_forecast ", retransfrom_from_log(y_test), lr_forecast)


    xgb_params = {"objective": "reg:linear", "booster":"gbtree", "eta":0.1, "nthread":4 }
    model, y_pred = regression_with_xgboost(X_train, y_train, X_test, y_test, features=['best_forecast', 'forecasts_mean', 'forecasts_hmean', 'forecasts_stdev', 'min_diff_to_best', 'diff_best_to_mean', 'forecasts_median'], use_cv=True,
                                use_sklean=False, xgb_params=xgb_params)
    xgb_forecast = model.predict(X_test)
    xgb_forecast_actual = retransfrom_from_log(xgb_forecast)
    calculate_accuracy(str(xgb_params) + "predict_using_veriation_xgb_forecast", retransfrom_from_log(y_test), xgb_forecast_actual)

    return final_forecast
Exemple #6
0
def vote_with_lr(conf, forecasts, best_model_index, y_actual):
    """Train a Lasso model on agreement features between the model forecasts.

    For every row the features are: the smallest ``cal_rmsle`` between the
    best model's forecast and any other forecast, the harmonic mean of the
    pairwise ``cal_rmsle`` among the other forecasts, and the harmonic mean,
    median and standard deviation of the other forecasts.  The best model's
    forecast is appended as a final column.

    Args:
        conf: configuration object; ``target_as_log`` and ``command`` are read.
        forecasts: 2-D array, one column per model.
        best_model_index: column index of the best model in ``forecasts``.
        y_actual: actual target values, one per row of ``forecasts``.

    Returns:
        The Lasso forecast for the test rows, on the original target scale.
    """
    start = time.time()
    best_forecast = forecasts[:, best_model_index]
    forecasts = np.sort(np.delete(forecasts, best_model_index, axis=1), axis=1)
    # Non-positive forecasts are replaced with 0.1 before the harmonic mean.
    forecasts = np.where(forecasts <= 0, 0.1, forecasts)

    data_train = []
    for best_value, f_row in zip(best_forecast, forecasts):
        min_diff_to_best = np.min(
            [cal_rmsle(best_value, f) for f in f_row])
        pair_errors = [
            cal_rmsle(x, y) for (x, y) in itertools.combinations(f_row, 2)
        ]
        data_train.append([
            min_diff_to_best,
            scipy.stats.hmean(pair_errors),
            scipy.stats.hmean(f_row),
            np.median(f_row),
            np.std(f_row),
        ])

    # np.vstack is the non-deprecated name for np.row_stack.
    X_all = np.column_stack([np.vstack(data_train), best_forecast])

    # Keep the untransformed targets: accuracy is measured on the original
    # scale.  Previously the evaluation slice was taken after the log
    # transform, so log-space actuals were compared with back-transformed
    # forecasts.
    y_actual_saved = y_actual
    if conf.target_as_log:
        y_actual = transfrom_to_log(y_actual)

    # 25% of the rows are given to train_test_split as the training size; the
    # slice below assumes it takes the leading rows for training.
    no_of_training_instances = int(round(len(y_actual) * 0.25))
    X_train, X_test, y_train, y_test = train_test_split(
        no_of_training_instances, X_all, y_actual)
    y_actual_test = y_actual_saved[no_of_training_instances:]

    lr_model = linear_model.Lasso(alpha=0.2)
    lr_model.fit(X_train, y_train)
    lr_forecast = lr_model.predict(X_test)
    # Only undo the log transform when it was applied to the targets.
    if conf.target_as_log:
        lr_forcast_revered = retransfrom_from_log(lr_forecast)
    else:
        lr_forcast_revered = lr_forecast
    calculate_accuracy("vote__lr_forecast " + str(conf.command), y_actual_test,
                       lr_forcast_revered)
    print_time_took(start, "vote_with_lr")
    return lr_forcast_revered
# Script fragment: drop the rows listed in index2remove, prepend the remaining
# data-frame columns to X_all, then split and run the time-series forecasts.
# index2remove, df, X_all, Y_all, training_set_size and window_size are
# defined outside this fragment.
print("index2remove", index2remove)
df = df.drop(df.index[index2remove])


print("Df", df.head())
# Convert the data frame to a numpy array (copied, so df is left untouched).
X_without_sales = df.values.copy()
print("X_all.shape", X_all.shape, "X_without_sales.shape", X_without_sales.shape)
# Join the two arrays side by side (column-wise); both must have the same
# number of rows.
X_all = np.column_stack((X_without_sales,X_all))

print("X_all.shape", X_all.shape)

#X_train_raw = preprocessing.normalize(X_train_raw.astype("float32"), norm='l2')

# NOTE: train_test_split here is a project helper taking the training-set size
# as its first argument.
X_train, X_test, y_train, y_test = train_test_split(training_set_size, X_all, Y_all)

run_timeseries_froecasts(X_train, y_train, X_test, y_test, window_size, 10)


# Settings for the deep-learning run below; in this fragment they are only
# referenced by the commented-out code.
nodes_in_layer = 500
number_of_hidden_layers = 2
droput = 0.05
activation_fn='relu'
#y_pred_dl = regression_with_dl(X_train, y_train, X_test, y_test, nodes_in_layer,
#                      number_of_hidden_layers, droput, activation_fn, 100)
#print_regression_model_summary("DL", y_test, y_pred_dl)



#    y_pred_dl = regression_with_dl(X_train, y_train, X_test, y_test, c)
#    print_regression_model_summary("DL" + str(c.tostr()), y_test, y_pred_dl)

# Synthetic regression experiment: two random inputs (Zipf and normal), two
# derived features (product and guarded ratio) and a non-linear target.
size = 100000
x = np.random.zipf(2, size)
y = np.random.normal(10, 1, size)

# Element-wise product, and ratio with 1 substituted where y is zero.
xy = [xi * yi for xi, yi in zip(x, y)]
xbyy = [xi / yi if yi != 0 else 1 for xi, yi in zip(x, y)]

# target = 2 * (2x + y) / 3 * y, evaluated left to right.
target = [2 * (2 * xi + yi) / 3 * yi for xi, yi in zip(x, y)]

# 70% of the samples form the training set.
train_set_size = int(size * 0.7)

X_all, Y_all = normlaize_data(
    np.column_stack((x, y, xy, xbyy)),
    np.array(target),
)
X_all, Y_all = shuffle_data(X_all, Y_all)

X_train, X_test, y_train, y_test = train_test_split(
    train_set_size, X_all, Y_all)

# Small network: 3 hidden layers of 4 nodes, no dropout, 15 epochs.
c = MLConfigs(
    nodes_in_layer=4,
    number_of_hidden_layers=3,
    dropout=0,
    activation_fn='relu',
    loss="mse",
    epoch_count=15,
    optimizer=Adam(),
)
y_pred_dl = regression_with_dl(X_train, y_train, X_test, y_test, c)
print_regression_model_summary("DL" + str(c.tostr()), y_test, y_pred_dl)
    timeSincePromotion = timeSincePromotion[window_size-1:-1]

    wfeatures = create_window_based_features(sales_data, window_size)
    wfeatures = wfeatures[window_size - 1:-1,]

    dayOfWeekCos = dfS['dayOfWeekCos'].values
    w1cosratio = [X_all_t[i][0]/dayOfWeekCos[i] for i in range(len(dayOfWeekCos))]
    w1cosproduct = [X_all_t[i][0] * dayOfWeekCos[i] for i in range(len(dayOfWeekCos))]
    wecosratio = [X_all_t[i][window_size-1] / dayOfWeekCos[i] for i in range(len(dayOfWeekCos))]
    wecosproduct = [X_all_t[i][window_size-1] * dayOfWeekCos[i] for i in range(len(dayOfWeekCos))]
    #print(X_without_sales.shape, 1, wfeatures.shape, 1, 1, 1, 1, X_all_t.shape)
    X_all_t = np.column_stack((X_without_sales, timeSincePromotion, wfeatures, w1cosratio, w1cosproduct, wecosratio, wecosproduct, X_all_t))
    #print("X_all.shape", X_all_t.shape)

    training_set_size = int(0.7*X_all_t.shape[0])
    X_train_s, X_test_s, y_train_s, y_test_s = train_test_split(training_set_size, X_all_t, Y_all_t)

    x_train_list.append(X_train_s)
    x_test_list.append(X_test_s)
    y_train_list.append(y_train_s)
    y_test_list.append(y_test_s)

# Put the store data back together: stack the train/test pieces collected in
# the lists above into single arrays (features row-wise, targets concatenated
# along axis 0).
X_train = np.row_stack(x_train_list)
X_test = np.row_stack(x_test_list)
Y_train = np.concatenate(y_train_list, axis=0)
Y_test = np.concatenate(y_test_list, axis=0)


#X_all = storeData[0]
#Y_all = target[0]
def run_regression_ensamble(models, y_test, parmsFromNormalization):
    """Fit a linear-regression ensemble on top of the individual model outputs.

    Args:
        models: per-model forecasts used as the ensemble's input features.
        y_test: targets matching the rows of ``models``.
        parmsFromNormalization: passed through unchanged to
            ``regression_with_LR``.

    Returns:
        Whatever ``regression_with_LR`` returns.  The original code computed
        this value and then discarded it, so callers had no way to use the
        ensemble result.
    """
    # 70% of the available rows are handed to train_test_split as the
    # training-set size.
    training_set_size = int(len(y_test)*.7)
    X_train, X_test, y_train, y_holdout = train_test_split(training_set_size, models, y_test)
    print("results for combined Models")
    return regression_with_LR(X_train, y_train, X_test, y_holdout, parmsFromNormalization)
Exemple #11
0
def avg_models(conf,
               blend_forecasts_df,
               y_actual,
               submission_forecasts_df,
               submission_ids=None,
               xgb_params=None,
               do_cv=True,
               frac=1.0,
               sec_test_data=None):
    print "start avg models"
    start = time.time()

    forecasting_feilds = list(blend_forecasts_df)
    print "Using features", forecasting_feilds

    X_all = blend_forecasts_df.values
    sub_X_all = submission_forecasts_df.values

    if frac < 1:
        data_size = int(blend_forecasts_df.shape[0] * frac)
        X_all = X_all[:data_size, :]
        y_actual = y_actual[:data_size]

    #removing NaN and inf if there is any
    X_all = fillna_and_inf(X_all)

    y_actual_saved = y_actual

    target_as_log = True
    if target_as_log:
        X_all = transfrom_to_log2d(X_all)
        y_actual = transfrom_to_log(y_actual)

    #X_all = scipy.stats.mstats.zscore(X_all, axis=1)

    #we use 10% full data to train the ensamble and 30% for evalaution
    no_of_training_instances = int(round(len(y_actual) * 0.5))
    X_train, X_test, y_train, y_test = train_test_split(
        no_of_training_instances, X_all, y_actual)
    y_actual_test = y_actual_saved[no_of_training_instances:]

    ensambles = []

    rfr = RandomForestRegressor(n_jobs=4, oob_score=True)
    rfr.fit(X_train, y_train)
    print_feature_importance(rfr.feature_importances_, forecasting_feilds)
    rfr_forecast = rfr.predict(X_test)
    rmsle = calculate_accuracy("rfr_forecast", y_actual_test,
                               retransfrom_from_log(rfr_forecast))
    ensambles.append((rmsle, rfr, "rfr ensamble"))

    lr_model = linear_model.Lasso(alpha=0.2)
    lr_model.fit(X_train, y_train)
    lr_forecast = lr_model.predict(X_test)
    rmsle = calculate_accuracy("lr_forecast", y_actual_test,
                               retransfrom_from_log(lr_forecast))
    ensambles.append((rmsle, lr_model, "lr ensamble"))

    do_xgb = True

    if do_xgb:
        if xgb_params is None:
            xgb_params = {
                "objective": "reg:linear",
                "booster": "gbtree",
                "eta": 0.1,
                "nthread": 4
            }
        if do_cv:
            model, y_pred = regression_with_xgboost(
                X_train,
                y_train,
                X_test,
                y_test,
                features=forecasting_feilds,
                use_cv=True,
                use_sklean=False,
                xgb_params=xgb_params)
        else:
            model, y_pred = regression_with_xgboost_no_cv(
                X_train,
                y_train,
                X_test,
                y_test,
                features=forecasting_feilds,
                xgb_params=xgb_params,
                num_rounds=200)
        xgb_forecast = model.predict(X_test)
        xgb_forecast_actual = retransfrom_from_log(xgb_forecast)
        rmsle = calculate_accuracy(
            str(xgb_params) + "[IDF]xgb_forecast", y_actual_test,
            xgb_forecast_actual)
        ensambles.append((rmsle, model, "xgboost ensamble"))

        best_ensamble_index = np.argmin([t[0] for t in ensambles])
        best_ensamble = ensambles[best_ensamble_index][1]
        print "[IDF]Best Ensamble", ensambles[best_ensamble_index][
            2], ensambles[best_ensamble_index][0]

        if sub_X_all is not None:
            ensamble_forecast = best_ensamble.predict(
                transfrom_to_log2d(sub_X_all))
            ensamble_forecast = retransfrom_from_log(ensamble_forecast)

            #becouse forecast cannot be negative
            ensamble_forecast = np.where(ensamble_forecast < 0, 0,
                                         ensamble_forecast)

            to_save = np.column_stack((submission_ids, ensamble_forecast))
            to_saveDf = pd.DataFrame(to_save,
                                     columns=["id", "Demanda_uni_equil"])
            to_saveDf = to_saveDf.fillna(0)
            to_saveDf["id"] = to_saveDf["id"].astype(int)
            submission_file = 'xgb_ensamble_submission_' + str(
                time.time()) + '.csv'
            to_saveDf.to_csv(submission_file, index=False)

            print "Best Ensamble Submission Stats", submission_file
            print to_saveDf.describe()

        if sec_test_data is not None:
            sec_test_data = fillna_and_inf(sec_test_data)
            sec_y_forecast = best_ensamble.predict(sec_test_data)
            sec_y_forecast = retransfrom_from_log(sec_y_forecast)
            sec_y_forecast = np.where(sec_y_forecast < 0, 0, sec_y_forecast)
        else:
            sec_y_forecast = None

        print "avg_models took ", (time.time() - start), "s"
        return xgb_forecast_actual, y_actual_test, ensamble_forecast, sec_y_forecast
Exemple #12
0
def blend_models(conf, forecasts, model_index_by_acc, y_actual,
                 submissions_ids, submissions, blend_data,
                 blend_data_submission):
    use_complex_features = True
    if use_complex_features:
        X_all, forecasting_feilds = generate_forecast_features(
            forecasts, model_index_by_acc)
    else:
        X_all, forecasting_feilds = forecasts, [
            "f" + str(f) for f in range(forecasts.shape[1])
        ]

    X_all = np.column_stack([X_all, blend_data])
    forecasting_feilds = forecasting_feilds + get_blend_features()

    #removing NaN and inf if there is any
    y_actual_saved = y_actual
    if conf.target_as_log:
        X_all = transfrom_to_log2d(X_all)
        y_actual = transfrom_to_log(y_actual)

    X_all = fillna_and_inf(X_all)
    y_actual = fillna_and_inf(y_actual)

    #we use 10% full data to train the ensamble and 30% for evalaution
    no_of_training_instances = int(round(len(y_actual) * 0.50))
    X_train, X_test, y_train, y_test = train_test_split(
        no_of_training_instances, X_all, y_actual)
    y_actual_test = y_actual_saved[no_of_training_instances:]
    '''
    rfr = RandomForestRegressor(n_jobs=4, oob_score=True)
    rfr.fit(X_train, y_train)
    print_feature_importance(rfr.feature_importances_, forecasting_feilds)
    rfr_forecast_as_log = rfr.predict(X_test)
    rfr_forecast = retransfrom_from_log(rfr_forecast_as_log)
    rmsle = calculate_accuracy("rfr_forecast", y_actual_test, rfr_forecast)


    lr_model =linear_model.Lasso(alpha = 0.1)
    lr_model.fit(X_train, y_train)
    lr_forecast = lr_model.predict(X_test)
    lr_forcast_revered = retransfrom_from_log(lr_forecast)
    calculate_accuracy("vote__lr_forecast " + str(conf.command), y_actual_test, lr_forcast_revered)
    '''

    xgb_params = {
        "objective": "reg:linear",
        "booster": "gbtree",
        "eta": 0.1,
        "nthread": 4,
        'min_child_weight': 5
    }
    model, y_pred = regression_with_xgboost(X_train,
                                            y_train,
                                            X_test,
                                            y_test,
                                            features=forecasting_feilds,
                                            use_cv=True,
                                            use_sklean=False,
                                            xgb_params=xgb_params)
    #model, y_pred = regression_with_xgboost_no_cv(X_train, y_train, X_test, y_test, features=forecasting_feilds,
    #                                                  xgb_params=xgb_params,num_rounds=100)
    xgb_forecast = model.predict(X_test)
    xgb_forecast = retransfrom_from_log(xgb_forecast)
    calculate_accuracy("xgb_forecast", y_actual_test, xgb_forecast)

    if submissions_ids is not None and submissions is not None:
        if use_complex_features:
            submissions, _ = generate_forecast_features(
                submissions, model_index_by_acc)
        submissions = np.column_stack([submissions, blend_data_submission])
        submissions = np.where(
            np.isnan(submissions), 0,
            np.where(np.isinf(submissions), 10000, submissions))
        rfr_ensamble_forecasts = model.predict(submissions)
        if conf.target_as_log:
            rfr_ensamble_forecasts = retransfrom_from_log(
                rfr_ensamble_forecasts)
        save_submission_file("rfr_blend_submission.csv", submissions_ids,
                             rfr_ensamble_forecasts)
    else:
        print "submissions not found"

    #we randomly select 5 million values
    x_size = X_train.shape[0]
    sample_indexes = np.random.randint(0, X_train.shape[0],
                                       min(5000000, x_size))
    X_train = X_train[sample_indexes]
    y_train = y_train[sample_indexes]

    dlconf = MLConfigs(nodes_in_layer=10,
                       number_of_hidden_layers=2,
                       dropout=0.3,
                       activation_fn='relu',
                       loss="mse",
                       epoch_count=4,
                       optimizer=Adam(lr=0.0001),
                       regularization=0.2)
    y_train, parmsFromNormalization = preprocess1DtoZeroMeanUnit(y_train)
    y_test = apply_zeroMeanUnit(y_test, parmsFromNormalization)
    X_train, parmsFromNormalization2D = preprocess2DtoZeroMeanUnit(X_train)
    X_test = apply_zeroMeanUnit2D(X_test, parmsFromNormalization2D)

    model, y_forecast = regression_with_dl(X_train, y_train, X_test, y_test,
                                           dlconf)

    y_forecast = undoPreprocessing(y_forecast, parmsFromNormalization)
    y_forecast = retransfrom_from_log(y_forecast)
    rmsle = calculate_accuracy("ml_forecast", y_actual_test, y_forecast)
Exemple #13
0
def predict_using_veriation(forecasts_data, best_forecast, y_actual, frac=1.0):
    size_to_keep = int(forecasts_data.shape[0] * frac)
    forecasts_data = forecasts_data[:size_to_keep]
    y_actual = y_actual[:size_to_keep]

    forecasts_data = transfrom_to_log2d(forecasts_data)
    forecasts_stdev = np.std(forecasts_data, axis=1)
    forecasts_mean = np.mean(forecasts_data, axis=1)
    #forecasts_robust_mean = forecasts_data[abs(forecasts_data - forecasts_mean.reshape((-1,1))) < 2 * forecasts_stdev.reshape((-1,1))]
    forecasts_hmean = fillna_and_inf(
        scipy.stats.hmean(np.where(forecasts_data <= 0, 0.1, forecasts_data),
                          axis=1))
    forecasts_median = np.median(forecasts_data, axis=1)
    min_diff_to_best = np.min(np.abs(forecasts_data -
                                     best_forecast.reshape((-1, 1))),
                              axis=1)
    diff_best_to_mean = np.abs(best_forecast - forecasts_mean)

    print "forecasts_stdev", basic_stats_as_str(forecasts_stdev)
    print "forecasts_mean", basic_stats_as_str(forecasts_mean)
    print "diff_best_to_mean", basic_stats_as_str(diff_best_to_mean)
    print "min_diff_to_best", basic_stats_as_str(min_diff_to_best)

    #forecasts_stdev >>count=2.967219e+07,mean=2.072201e-01,std=1.256776e-01,min=3.159570e-03,50%=1.758286e-01,25%=1.243373e-01,50%=1.758286e-01,75%=2.542546e-01,95%=4.540049e-01,max=2.138248e+00,dtype:=float64
    #forecasts_mean >>count=2.967219e+07,mean=1.594980e+00,std=6.205789e-01,min=8.732127e-02,50%=1.462738e+00,25%=1.151854e+00,50%=1.462738e+00,75%=1.893289e+00,95%=2.765280e+00,max=7.470179e+00,dtype:=float64
    #diff_best_to_mean >>count=2.967219e+07,mean=4.604835e+00,std=1.701537e+01,min=2.758960e-07,50%=1.744317e+00,25%=8.145370e-01,50%=1.744317e+00,75%=3.950076e+00,95%=1.473034e+01,max=3.871046e+03,dtype:=float64
    #min_diff_to_best >>count=2.967219e+07,mean=4.284890e+00,std=1.698368e+01,min=1.200819e-12,50%=1.479762e+00,25%=4.711246e-01,50%=1.479762e+00,75%=3.638548e+00,95%=1.426816e+01,max=3.870050e+03,dtype:=float64

    final_forecast = np.zeros(size_to_keep)
    for i in range(size_to_keep):
        if min_diff_to_best[i] < 0.2 or diff_best_to_mean[i] < 0.3:
            final_forecast[i] = best_forecast[i]
        elif forecasts_stdev[i] < 0.3:
            final_forecast[i] = forecasts_mean[i]
        else:
            final_forecast[i] = (forecasts_median[i] + best_forecast[i]) / 2

    calculate_accuracy("predict_using_veriation", y_actual, final_forecast)

    X_all = np.column_stack([
        best_forecast, forecasts_mean, forecasts_hmean, forecasts_stdev,
        min_diff_to_best, diff_best_to_mean, forecasts_median
    ])
    y_actual = transfrom_to_log(y_actual)
    no_of_training_instances = int(round(len(y_actual) * 0.50))
    X_train, X_test, y_train, y_test = train_test_split(
        no_of_training_instances, X_all, y_actual)
    lr_model = linear_model.Lasso(alpha=0.2)
    lr_model.fit(X_train, y_train)
    lr_forecast = lr_model.predict(X_test)
    lr_forecast = retransfrom_from_log(lr_forecast)
    calculate_accuracy("predict_using_veriation_lr_forecast ",
                       retransfrom_from_log(y_test), lr_forecast)

    xgb_params = {
        "objective": "reg:linear",
        "booster": "gbtree",
        "eta": 0.1,
        "nthread": 4
    }
    model, y_pred = regression_with_xgboost(
        X_train,
        y_train,
        X_test,
        y_test,
        features=[
            'best_forecast', 'forecasts_mean', 'forecasts_hmean',
            'forecasts_stdev', 'min_diff_to_best', 'diff_best_to_mean',
            'forecasts_median'
        ],
        use_cv=True,
        use_sklean=False,
        xgb_params=xgb_params)
    xgb_forecast = model.predict(X_test)
    xgb_forecast_actual = retransfrom_from_log(xgb_forecast)
    calculate_accuracy(
        str(xgb_params) + "predict_using_veriation_xgb_forecast",
        retransfrom_from_log(y_test), xgb_forecast_actual)

    return final_forecast
Exemple #14
0
    wecosratio = [
        X_all_t[i][window_size - 1] / dayOfWeekCos[i]
        for i in range(len(dayOfWeekCos))
    ]
    wecosproduct = [
        X_all_t[i][window_size - 1] * dayOfWeekCos[i]
        for i in range(len(dayOfWeekCos))
    ]
    #print(X_without_sales.shape, 1, wfeatures.shape, 1, 1, 1, 1, X_all_t.shape)
    X_all_t = np.column_stack(
        (X_without_sales, timeSincePromotion, wfeatures, w1cosratio,
         w1cosproduct, wecosratio, wecosproduct, X_all_t))
    #print("X_all.shape", X_all_t.shape)

    training_set_size = int(0.7 * X_all_t.shape[0])
    X_train_s, X_test_s, y_train_s, y_test_s = train_test_split(
        training_set_size, X_all_t, Y_all_t)

    x_train_list.append(X_train_s)
    x_test_list.append(X_test_s)
    y_train_list.append(y_train_s)
    y_test_list.append(y_test_s)

# Put the store data back together: stack the train/test pieces collected in
# the lists above into single arrays (features row-wise, targets concatenated
# along axis 0).
X_train = np.row_stack(x_train_list)
X_test = np.row_stack(x_test_list)
Y_train = np.concatenate(y_train_list, axis=0)
Y_test = np.concatenate(y_test_list, axis=0)

#X_all = storeData[0]
#Y_all = target[0]
#for i in range(1,len(storeData)):
def avg_models(conf, blend_forecasts_df, y_actual, submission_forecasts_df, submission_ids=None, xgb_params=None, do_cv=True, frac=1.0, sec_test_data=None):
    print "start avg models"
    start = time.time()

    forecasting_feilds = list(blend_forecasts_df)
    print "Using features", forecasting_feilds

    X_all = blend_forecasts_df.values
    sub_X_all = submission_forecasts_df.values

    if frac < 1:
        data_size = int(blend_forecasts_df.shape[0]*frac)
        X_all = X_all[:data_size, :]
        y_actual = y_actual[:data_size]


    #removing NaN and inf if there is any
    X_all = fillna_and_inf(X_all)

    y_actual_saved = y_actual

    target_as_log = True
    if target_as_log:
        X_all = transfrom_to_log2d(X_all)
        y_actual = transfrom_to_log(y_actual)

    #X_all = scipy.stats.mstats.zscore(X_all, axis=1)

    #we use 10% full data to train the ensamble and 30% for evalaution
    no_of_training_instances = int(round(len(y_actual)*0.5))
    X_train, X_test, y_train, y_test = train_test_split(no_of_training_instances, X_all, y_actual)
    y_actual_test = y_actual_saved[no_of_training_instances:]

    ensambles = []


    rfr = RandomForestRegressor(n_jobs=4, oob_score=True)
    rfr.fit(X_train, y_train)
    print_feature_importance(rfr.feature_importances_, forecasting_feilds)
    rfr_forecast = rfr.predict(X_test)
    rmsle = calculate_accuracy("rfr_forecast", y_actual_test, retransfrom_from_log(rfr_forecast))
    ensambles.append((rmsle, rfr, "rfr ensamble"))


    lr_model =linear_model.Lasso(alpha = 0.2)
    lr_model.fit(X_train, y_train)
    lr_forecast = lr_model.predict(X_test)
    rmsle = calculate_accuracy("lr_forecast", y_actual_test, retransfrom_from_log(lr_forecast))
    ensambles.append((rmsle, lr_model, "lr ensamble"))


    do_xgb = True

    if do_xgb:
        if xgb_params is None:
            xgb_params = {"objective": "reg:linear", "booster":"gbtree", "eta":0.1, "nthread":4 }
        if do_cv:
            model, y_pred = regression_with_xgboost(X_train, y_train, X_test, y_test, features=forecasting_feilds, use_cv=True,
                                use_sklean=False, xgb_params=xgb_params)
        else:
            model, y_pred = regression_with_xgboost_no_cv(X_train, y_train, X_test, y_test, features=forecasting_feilds,
                                                          xgb_params=xgb_params,num_rounds=200)
        xgb_forecast = model.predict(X_test)
        xgb_forecast_actual = retransfrom_from_log(xgb_forecast)
        rmsle = calculate_accuracy(str(xgb_params) + "[IDF]xgb_forecast", y_actual_test, xgb_forecast_actual)
        ensambles.append((rmsle, model, "xgboost ensamble"))

        best_ensamble_index = np.argmin([t[0] for t in ensambles])
        best_ensamble = ensambles[best_ensamble_index][1]
        print "[IDF]Best Ensamble", ensambles[best_ensamble_index][2], ensambles[best_ensamble_index][0]

        if sub_X_all is not None:
            ensamble_forecast = best_ensamble.predict(transfrom_to_log2d(sub_X_all))
            ensamble_forecast = retransfrom_from_log(ensamble_forecast)

            #becouse forecast cannot be negative
            ensamble_forecast = np.where(ensamble_forecast < 0, 0, ensamble_forecast)

            to_save = np.column_stack((submission_ids, ensamble_forecast))
            to_saveDf =  pd.DataFrame(to_save, columns=["id","Demanda_uni_equil"])
            to_saveDf = to_saveDf.fillna(0)
            to_saveDf["id"] = to_saveDf["id"].astype(int)
            submission_file = 'xgb_ensamble_submission_'+ str(time.time()) +'.csv'
            to_saveDf.to_csv(submission_file, index=False)

            print "Best Ensamble Submission Stats", submission_file
            print to_saveDf.describe()

        if sec_test_data is not None:
            sec_test_data = fillna_and_inf(sec_test_data)
            sec_y_forecast = best_ensamble.predict(sec_test_data)
            sec_y_forecast = retransfrom_from_log(sec_y_forecast)
            sec_y_forecast = np.where(sec_y_forecast < 0, 0, sec_y_forecast)
        else:
            sec_y_forecast = None

        print "avg_models took ", (time.time() - start), "s"
        return xgb_forecast_actual, y_actual_test, ensamble_forecast, sec_y_forecast
def blend_models(conf, forecasts, model_index_by_acc, y_actual, submissions_ids, submissions,
                 blend_data, blend_data_submission):
    """Blend base-model forecasts with XGBoost, then evaluate a neural-network blender.

    Features derived from ``forecasts`` are column-stacked with ``blend_data``.
    Half of the rows are handed to ``train_test_split`` as training instances and
    the remaining rows are used to report accuracy through ``calculate_accuracy``.
    When both ``submissions_ids`` and ``submissions`` are given, the XGBoost
    blender's predictions for the submission rows are written with
    ``save_submission_file`` to "rfr_blend_submission.csv".

    Args:
        conf: run configuration; only ``conf.target_as_log`` is read here. When
            it is true, features and target are log-transformed before fitting
            and forecasts are transformed back before scoring/saving.
        forecasts: base-model forecasts for the training rows (2-D; presumably
            one column per base model -- confirm against the caller).
        model_index_by_acc: passed through to ``generate_forecast_features``.
        y_actual: actual target values aligned with the rows of ``forecasts``.
        submissions_ids: ids of the submission rows, or None.
        submissions: base-model forecasts for the submission rows, or None.
        blend_data: extra feature columns appended to the training features.
        blend_data_submission: the same extra columns for the submission rows.
    """
    use_complex_features = True
    if use_complex_features:
        X_all, forecasting_feilds = generate_forecast_features(forecasts, model_index_by_acc)
    else:
        X_all, forecasting_feilds = forecasts, ["f" + str(f) for f in range(forecasts.shape[1])]

    X_all = np.column_stack([X_all, blend_data])
    forecasting_feilds = forecasting_feilds + get_blend_features()

    # Keep the untransformed target: accuracy is always reported on the original scale.
    y_actual_saved = y_actual
    if conf.target_as_log:
        X_all = transfrom_to_log2d(X_all)
        y_actual = transfrom_to_log(y_actual)

    # Replace NaN and inf values, if there are any.
    X_all = fillna_and_inf(X_all)
    y_actual = fillna_and_inf(y_actual)

    # Half of the data trains the blenders; the other half is used for evaluation.
    no_of_training_instances = int(round(len(y_actual) * 0.50))
    X_train, X_test, y_train, y_test = train_test_split(no_of_training_instances, X_all, y_actual)
    # Assumes train_test_split puts the first no_of_training_instances rows in the
    # training part, so the untransformed test targets are the remaining rows.
    y_actual_test = y_actual_saved[no_of_training_instances:]

    xgb_params = {"objective": "reg:linear", "booster": "gbtree", "eta": 0.1, "nthread": 4,
                  'min_child_weight': 5}
    model, y_pred = regression_with_xgboost(X_train, y_train, X_test, y_test, features=forecasting_feilds,
                                            use_cv=True, use_sklean=False, xgb_params=xgb_params)
    xgb_forecast = model.predict(X_test)
    # Undo the log transform only if it was applied to the target above.
    if conf.target_as_log:
        xgb_forecast = retransfrom_from_log(xgb_forecast)
    calculate_accuracy("xgb_forecast", y_actual_test, xgb_forecast)

    if submissions_ids is not None and submissions is not None:
        if use_complex_features:
            submissions, _ = generate_forecast_features(submissions, model_index_by_acc)
        submissions = np.column_stack([submissions, blend_data_submission])
        if conf.target_as_log:
            # The model was fitted on log-transformed features, so the submission
            # features must go through the same transform before predicting.
            submissions = transfrom_to_log2d(submissions)
        submissions = np.where(np.isnan(submissions), 0,
                               np.where(np.isinf(submissions), 10000, submissions))
        rfr_ensamble_forecasts = model.predict(submissions)
        if conf.target_as_log:
            rfr_ensamble_forecasts = retransfrom_from_log(rfr_ensamble_forecasts)
        save_submission_file("rfr_blend_submission.csv", submissions_ids, rfr_ensamble_forecasts)
    else:
        # Parenthesized single-argument form prints the same text on Python 2 and 3.
        print("submissions not found")

    # Draw at most 5 million training rows at random for the neural network.
    # NOTE(review): np.random.randint samples with replacement, so rows can repeat
    # and some rows are left out even when the data has fewer than 5 million rows.
    x_size = X_train.shape[0]
    sample_indexes = np.random.randint(0, X_train.shape[0], min(5000000, x_size))
    X_train = X_train[sample_indexes]
    y_train = y_train[sample_indexes]

    dlconf = MLConfigs(nodes_in_layer=10, number_of_hidden_layers=2, dropout=0.3, activation_fn='relu', loss="mse",
                       epoch_count=4, optimizer=Adam(lr=0.0001), regularization=0.2)
    # Normalisation parameters are obtained from the training part and then
    # reused for the test part, for both the target and the features.
    y_train, parmsFromNormalization = preprocess1DtoZeroMeanUnit(y_train)
    y_test = apply_zeroMeanUnit(y_test, parmsFromNormalization)
    X_train, parmsFromNormalization2D = preprocess2DtoZeroMeanUnit(X_train)
    X_test = apply_zeroMeanUnit2D(X_test, parmsFromNormalization2D)

    model, y_forecast = regression_with_dl(X_train, y_train, X_test, y_test, dlconf)

    # Map the network output back to the original target scale before scoring.
    y_forecast = undoPreprocessing(y_forecast, parmsFromNormalization)
    if conf.target_as_log:
        y_forecast = retransfrom_from_log(y_forecast)
    rmsle = calculate_accuracy("ml_forecast", y_actual_test, y_forecast)