Example #1
x_features = x.drop(['Zillow Price Index'], axis=1)
x_target = x['Zillow Price Index']
y_features = y.drop(['Zillow Price Index'], axis=1)
y_target = y['Zillow Price Index']

print(x_features.shape)
print(x_target.shape)
print(y_features.shape)
print(y_target.shape)
print('-' * 100)

print("Most Important Features in Order: ",
      x_features.columns[[17, 37, 36, 155]])

model = RFR(n_jobs=-1)
# model = xgb.XGBRegressor()

model.fit(x_features, x_target)
y_pred = model.predict(y_features)
mse = mean_squared_error(y_target, y_pred)
print("MSE: ", mse)
rmse = np.sqrt(mse)
print("RMSE: ", rmse)
print('-' * 100)

print(model.feature_importances_)
num_objects = np.arange(len(model.feature_importances_))
plt.bar(num_objects, model.feature_importances_)

z = model.feature_importances_
Example #2
# Splitting the dataset into the Training set and Test set
"""from sklearn.cross_validation import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)"""

# Feature Scaling
"""from sklearn.preprocessing import StandardScaler
sc_X = StandardScaler()
X_train = sc_X.fit_transform(X_train)
X_test = sc_X.transform(X_test)
sc_y = StandardScaler()
y_train = sc_y.fit_transform(y_train)"""

# Fitting the Random Forest Regression Model to the dataset
from sklearn.ensemble import RandomForestRegressor as RFR
regressor = RFR(
    n_estimators=300,
    random_state=0,
)  # n_estimators is the number of trees
regressor.fit(X, y)
# Predicting a new result
y_pred = regressor.predict([[6.5]])

# Visualising the Random Forest Regression results (for higher resolution and smoother curve)
#Need this because the random forest isn't constant
X_grid = np.arange(min(X), max(X), 0.01)
X_grid = X_grid.reshape((len(X_grid), 1))
plt.scatter(X, y, color='red')
plt.plot(X_grid, regressor.predict(X_grid), color='blue')
plt.title('Truth or Bluff (Random Forest Regression  Model)')
plt.xlabel('Position level')
plt.ylabel('Salary')
plt.show()
Example #3
from sklearn.svm import SVR
model = SVR(kernel='rbf', C=20)
model.fit(num_feat_train, y_train)

# Model 3 : Linear/Polynomial Regression
from sklearn.linear_model import ElasticNet
model = ElasticNet(fit_intercept=True,
                   normalize=True,
                   alpha=0.1,
                   l1_ratio=1,
                   precompute=True)
model.fit(num_feat_train, y_train)

# Model 4 : Random Forest Regression
from sklearn.ensemble import RandomForestRegressor as RFR
model = RFR(n_estimators=300, max_depth=8)
model.fit(num_feat_train_pca, y_train)

# Model 5 : ANN
import keras
from keras import backend as K
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.layers import BatchNormalization, Activation
from keras.optimizers import SGD


# Defining custom R2 metric for ANN
def r2_metric(y_true, y_pred):
    SS_res = K.sum(K.square(y_true - y_pred))
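    # (the excerpt ends here; the two lines below are an assumed completion of
    #  the standard Keras R^2 metric, not part of the original source)
    SS_tot = K.sum(K.square(y_true - K.mean(y_true)))
    return 1 - SS_res / (SS_tot + K.epsilon())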
Example #4
    ]

    for algorithm_name in ["rf", "etr"]:
        #     ["xgBoost", "rf", "etr"]
        for estimator_output_length in [4, 5, 6]:
            iterOrCopy = "iterative"
            required_prediction_length = 14
            if algorithm_name != "xgBoost":
                for n_estimators in [500]:
                    for min_samples_split in [2]:
                        for min_samples_leaf in [1]:
                            if algorithm_name == "rf":
                                estimator_withParams = RFR(
                                    n_estimators=n_estimators,
                                    max_features="auto",
                                    min_samples_split=min_samples_split,
                                    min_samples_leaf=min_samples_leaf,
                                    oob_score=False,
                                    n_jobs=-1,
                                    random_state=2017)
                            if algorithm_name == "etr":
                                estimator_withParams = ETR(
                                    n_estimators=n_estimators,
                                    max_features="auto",
                                    min_samples_split=min_samples_split,
                                    min_samples_leaf=min_samples_leaf,
                                    oob_score=False,
                                    n_jobs=-1,
                                    random_state=2017)

                            Model_for_competition(
                                algorithm_name=algorithm_name,
Example #5
def test_ml(stock='F',
            forecast_out=5,
            month=None,
            day=None,
            year=2019,
            plot=False,
            volume=False):
    # Assume input day is valid trading day
    # Want to separate 1 percent of the data to forecast
    # Today info
    if (month == None or day == None):
        today = datetime.datetime.now()
        month = today.month
        day = today.day

    end_date = dt(year, month, day)
    trading_days = get_trading_days([2017, 2018, 2019])

    end_idx = np.where(end_date == trading_days)[0][0]
    end = trading_days[end_idx - forecast_out]
    new_start = trading_days[end_idx - forecast_out]
    new_end = trading_days[end_idx]

    # For prediction
    start = datetime.datetime(2016, 4, 1)

    df = read_data(stock, start, end)

    #df = web.DataReader(stock, 'yahoo', start, end)
    #print(df.index)
    if (df.empty):
        #print("SHOULD BE EMPTY")
        return [0] * 10, "ERROR"

    df = df[df.index <= end]
    #print(df.tail(forecast_out))
    dfreg = df.loc[:, ['adjusted close', 'volume']]
    dfreg['HL_PCT'] = (df['high'] - df['low']) / df['adjusted close'] * 100.0
    dfreg['PCT_change'] = (df['adjusted close'] -
                           df['open']) / df['open'] * 100.0

    # For volume testing
    if (volume):
        dfreg['adjusted close'] = dfreg['volume']

    dfreg['EMA'] = get_ema(dfreg, forecast_out)
    if (dfreg['EMA'].empty):
        return [0] * 10, "ERROR"

    dfreg['old close'] = dfreg['adjusted close']
    dfreg['adjusted close'] = dfreg['EMA']

    # For validation
    #print("NEW START: \t{}".format(new_start))
    #print("NEW END: \t{}".format(new_end))
    #print("VALIDATION START: {} END: {}\n".format(new_start, new_end))
    #new_df = web.DataReader(stock, 'yahoo', new_start, new_end)
    new_df = read_data(stock, new_start, new_end)
    #print("TESTING VALIDATION DATA")
    if (new_df.empty):
        return [0] * 10, "ERROR"
    #print(new_end)
    new_df = new_df[new_df.index <= new_end]
    #print(new_df)
    #exit(1)
    new_dfreg = new_df.loc[:, ['adjusted close', 'volume']]
    new_dfreg['HL_PCT'] = (new_df['high'] -
                           new_df['low']) / new_df['adjusted close'] * 100.0
    new_dfreg['PCT_change'] = (new_df['adjusted close'] -
                               new_df['open']) / new_df['open'] * 100.0

    # Drop missing value
    dfreg.fillna(value=-99999, inplace=True)
    new_dfreg.fillna(value=-99999, inplace=True)

    # Separating the label here; we want to predict the adjusted close
    forecast_col = 'adjusted close'
    dfreg['label'] = dfreg[forecast_col].shift(-forecast_out)
    X = np.array(dfreg.drop(['label'], axis=1))

    # Scale X for linear regression
    X = preprocessing.scale(X)

    # Finally want late X and early X for model
    X_lately = X[-forecast_out:]
    X = X[:-forecast_out]

    # Separate label and identify it as y
    y = np.array(dfreg['label'])
    y = y[:-forecast_out]

    # Training and testing sets
    X_train = X[:len(X) - forecast_out]
    X_test = X[len(X) - forecast_out:]

    y_train = y[:len(y) - forecast_out]
    y_test = y[len(y) - forecast_out:]

    # LinReg
    clfreg = LinearRegression(n_jobs=-1)

    # QuadReg2
    clfpoly2 = make_pipeline(PolynomialFeatures(2), Ridge())

    # QuadReg3
    clfpoly3 = make_pipeline(PolynomialFeatures(3), Ridge())

    # QuadReg4
    clfpoly4 = make_pipeline(PolynomialFeatures(4), Ridge())

    # QuadReg5
    clfpoly5 = make_pipeline(PolynomialFeatures(5), Ridge())

    # KNN Regression
    clfknn = KNeighborsRegressor(n_neighbors=2)

    # Bayesian Ridge
    clfbayr = BayesianRidge()

    # Neural Network
    clfmlp = MLPRegressor(hidden_layer_sizes=(100, 100, 100),
                          learning_rate='adaptive',
                          solver='adam',
                          max_iter=5,
                          verbose=False)

    # Random Forest Regressor
    clfrfr = RFR(n_estimators=15)

    # Support Vector Regressor
    clfsvr = SVR(gamma='auto')

    threads = []
    models = [
        clfreg, clfpoly2, clfpoly3, clfpoly4, clfpoly5, clfknn, clfbayr,
        clfrfr, clfsvr
    ]
    fits = [''] * len(models)
    for i in range(len(models)):
        process = Thread(target=fitting,
                         args=[models[i], X_train, y_train, fits, i],
                         name=stock)
        process.start()
        threads.append(process)

    for process in threads:
        process.join()

    start = time.time()
    try:
        reg_forecast = fits[0].predict(X_lately)
        poly2_forecast = fits[1].predict(X_lately)
        poly3_forecast = fits[2].predict(X_lately)
        poly4_forecast = fits[3].predict(X_lately)
        poly5_forecast = fits[4].predict(X_lately)
        try:
            knn_forecast = fits[5].predict(X_lately)
        except ValueError:
            #print("KNN ERROR: {}".format(stock))
            #print("F*****g really: {}".format(stock))
            #print(X_lately)
            #print(X_lately.shape)
            knn_forecast = np.zeros(poly5_forecast.shape)
            #exit(1)
        bayr_forecast = fits[6].predict(X_lately)
        rfr_forecast = fits[7].predict(X_lately)
        svr_forecast = fits[8].predict(X_lately)
        mlp_forecast = fits[6].predict(X_lately)  # clfmlp is never fitted; the Bayesian ridge forecast is reused as a stand-in
    except AttributeError:
        #print("ISSUES WITH {}".format(stock))
        return [0] * 10, {}
        #print(fits)
        #print(threads)
        #print(X_train, y_train)
        #print(X, y)
        #print(stock)
        #print(dfreg)
        #exit(1)
    #mlp_forecast = clfmlp.predict(X_lately)

    # Set up dataframe
    dfreg['reg_forecast'] = np.nan
    dfreg['poly2_forecast'] = np.nan
    dfreg['poly3_forecast'] = np.nan
    dfreg['poly4_forecast'] = np.nan
    dfreg['poly5_forecast'] = np.nan
    dfreg['knn_forecast'] = np.nan
    dfreg['bayr_forecast'] = np.nan
    dfreg['mlp_forecast'] = np.nan
    dfreg['rfr_forecast'] = np.nan
    dfreg['svr_forecast'] = np.nan

    last_date = dfreg.iloc[-1].name
    last_unix = last_date
    next_unix = last_unix + datetime.timedelta(days=1)
    for i in zip(reg_forecast, poly2_forecast, poly3_forecast, poly4_forecast,
                 poly5_forecast, knn_forecast, bayr_forecast, mlp_forecast,
                 rfr_forecast, svr_forecast):
        next_date = next_unix
        next_unix += datetime.timedelta(days=1)
        dfreg.loc[next_date] = list(
            [np.nan for _ in range(len(dfreg.columns) - 10)] + list(i))

    #dfreg['mean_forecast'] = dfreg[['poly2_forecast', 'poly3_forecast']].mean(axis=1)
    #print(dfreg.tail(forecast_out+1))
    dfreg['mean_forecast'] = dfreg[[
        'reg_forecast',
        'poly2_forecast',
        'poly3_forecast',
        'knn_forecast',
        'bayr_forecast',  # mlp_forecast,
        'rfr_forecast',
        'svr_forecast'
    ]].mean(axis=1)

    as_list = dfreg.index.tolist()
    # I THINK THIS IS FIXED
    #print(as_list[-forecast_out-5:])
    #for asd in as_list[-forecast_out-1:]:
    #    print(asd)
    #print()
    #for asd in new_df.index.tolist():#[:forecast_out]:
    #    print(asd)
    as_list[-forecast_out:] = new_df.index.tolist()[1:]
    try:
        dfreg.index = as_list
    except:
        print("DATA MISALIGNMENT FOR: {}".format(stock))
        #print(new_df)
        #print(dfreg.tail(forecast_out+1))
        #exit(1)
        return [0] * 10, {}
    #for asd in as_list[-forecast_out-5:]:
    #    print(asd)
    dfreg[-forecast_out:].index = new_df.index.tolist()[:forecast_out]
    #print(dfreg.tail(forecast_out+1))
    #return [None]*10, None
    #exit(1)

    #
    # Trying to do all combinations
    #
    forecasts = [
        'reg_forecast', 'poly2_forecast', 'poly3_forecast', 'poly4_forecast',
        'poly5_forecast', 'knn_forecast', 'bayr_forecast', 'rfr_forecast',
        'svr_forecast'
    ]

    if (plot):
        dfreg['old close'].tail(20).plot(figsize=(20, 12), lw=2)
        dfreg['adjusted close'].tail(20).plot(figsize=(20, 12), lw=2)
        dfreg['reg_forecast'].tail(20).plot(lw=0.5)
        dfreg['poly2_forecast'].tail(20).plot(lw=0.5)
        dfreg['poly3_forecast'].tail(20).plot(lw=0.5)
        dfreg['poly4_forecast'].tail(20).plot(lw=0.5)
        dfreg['poly5_forecast'].tail(20).plot(lw=0.5)
        dfreg['knn_forecast'].tail(20).plot(lw=0.5)
        dfreg['bayr_forecast'].tail(20).plot(lw=0.5)
        dfreg['mean_forecast'].tail(20).plot(c='k')
        #dfreg['mlp_forecast'].tail(20).plot()
        dfreg['rfr_forecast'].tail(20).plot(lw=0.5)
        dfreg['svr_forecast'].tail(20).plot(lw=0.5)

    new_dfreg['Actual close'] = new_df['adjusted close']
    if (plot):
        new_dfreg['Actual close'].tail(20).plot(c='g', lw=2)
    fit = np.polyfit([i for i in range(forecast_out)],
                     dfreg['mean_forecast'].values[-forecast_out:],
                     deg=1)

    #print("CALCULATING CORRELATION BETWEEN METHOD AND ACTUAL")
    actual = new_dfreg['Actual close'].tail(forecast_out)

    highest_corr = 0
    best_comb = ''
    num_combs = 0
    correlations = []
    good_combinations = []
    #for j in range(1,9):
    #    for comb in combinations(forecasts, j):
    #        num_combs += 1
    #        comb_dat = dfreg[[*list(comb)]].mean(axis=1).tail(forecast_out)
    #        new_correlation = corr(comb_dat, actual)[0]
    #        correlations.append(new_correlation)
    #        if(new_correlation > 0.4):
    #            good_combinations.append(comb)

    #        if(new_correlation > highest_corr):
    #            highest_corr = new_correlation
    #            best_comb = comb
    for comb in all_combinations:
        num_combs += 1
        comb_dat = dfreg[[*list(comb)]].mean(axis=1).tail(forecast_out)
        new_correlation = corr(comb_dat, actual)[0]
        correlations.append(new_correlation)
        if (new_correlation > 0.4):
            good_combinations.append(comb)

        if (new_correlation > highest_corr):
            highest_corr = new_correlation
            best_comb = comb

    reg_dat = dfreg['reg_forecast'].tail(forecast_out)
    reg_corr = corr(reg_dat, actual)
    #print("Linear Regression: {}".format(reg_corr))

    poly2_dat = dfreg['poly2_forecast'].tail(forecast_out)
    poly2_corr = corr(poly2_dat, actual)
    #print("Poly2: {}".format(poly2_corr))

    poly3_dat = dfreg['poly3_forecast'].tail(forecast_out)
    poly3_corr = corr(poly3_dat, actual)
    #print("Poly3: {}".format(poly3_corr))

    poly4_dat = dfreg['poly4_forecast'].tail(forecast_out)
    poly4_corr = corr(poly4_dat, actual)
    #print("Poly3: {}".format(poly3_corr))

    poly5_dat = dfreg['poly5_forecast'].tail(forecast_out)
    poly5_corr = corr(poly5_dat, actual)
    #print("Poly3: {}".format(poly3_corr))

    knn_dat = dfreg['knn_forecast'].tail(forecast_out)
    knn_corr = corr(knn_dat, actual)
    #print("K Nearest Neighbors: {}".format(knn_corr))

    bayr_dat = dfreg['bayr_forecast'].tail(forecast_out)
    bayr_corr = corr(bayr_dat, actual)
    #print("Bayesian: {}".format(bayr_corr))

    rfr_dat = dfreg['rfr_forecast'].tail(forecast_out)
    rfr_corr = corr(rfr_dat, actual)
    #print("Random Forest: {}".format(rfr_corr))

    svr_dat = dfreg['svr_forecast'].tail(forecast_out)
    svr_corr = corr(svr_dat, actual)
    #print("Support Vector: {}".format(rfr_corr))

    mean_dat = dfreg['mean_forecast'].tail(forecast_out)
    mean_corr = corr(mean_dat, actual)

    if (plot):
        plt.legend(loc='best')
        plt.xlabel('Date')
        plt.ylabel('Price')
        plt.title(stock)
        plt.savefig("./test_plots/{1}_{2}/{0}_{1}_{2}_{3}".format(
            stock, month, day, forecast_out))
        plt.close()

    return (reg_corr[0], poly2_corr[0], poly3_corr[0], poly4_corr[0], poly5_corr[0],\
           knn_corr[0], bayr_corr[0], rfr_corr[0], mean_corr[0], svr_corr[0]), good_combinations
Example #6
def garch(input_col=['p_var', 'mean_return_square', 'sum_abs_sent_square'],
          file_name='modified_garch'):
    # data = resample_data.process_all_codes()
    data = pd.read_csv(settings.get_home_path() + 'data/week_data/total.csv',
                       index_col='date')
    indexs = data.index.drop_duplicates()
    file = open(settings.get_home_path() + 'data/{}.csv'.format(file_name),
                'w')
    file.write(
        "time_winodw_forecast_output,time_window_forecast_input,number of train,number of test,adj_svr_R^2,adj_rfr_R^2,svr_trend,rfr_trend\n"
    )

    svr = SVR(kernel='rbf', C=64, gamma=1 / 3)
    rfr = RFR(max_features=1, warm_start=False)
    origin_col = ['p_var', 'mean_return', 'sum_abs_sent']
    # the first train data
    train_input = data[data.index == indexs[0]].set_index('code')[origin_col]
    train_output = data[data.index == indexs[1]].set_index('code')['p_var']
    train = train_input.join(train_output, rsuffix='_out')
    train = train.dropna()
    train['mean_return_square'] = np.square(train['mean_return'])
    train['sum_abs_sent_square'] = np.square(train['sum_abs_sent'])
    for i in range(1, len(indexs) - 1):
        # the number of companies and features in training set
        num = len(train.index)
        num_features = len(input_col)
        # train the model
        svr.fit(train[input_col], train.p_var_out.values)
        rfr.fit(train[input_col], train.p_var_out.values)
        # predict data
        test_input = data[data.index == indexs[i]].set_index(
            'code')[origin_col]
        test_output = data[data.index == indexs[i +
                                                1]].set_index('code')['p_var']
        test = test_input.join(test_output, rsuffix='_out')
        test = test.dropna()
        test['mean_return_square'] = np.square(test['mean_return'])
        test['sum_abs_sent_square'] = np.square(test['sum_abs_sent'])
        num_test = len(test.index)
        # predict
        test['p_var_pre'] = svr.predict(test[input_col])
        test['p_var_pre2'] = rfr.predict(test[input_col])
        # get the R squared
        r1 = svr.score(train[input_col], train.p_var_out.values).round(4)
        r2 = rfr.score(train[input_col], train.p_var_out.values).round(4)
        # get the adjust R squared
        adj_r1 = 1 - (1 - r1) * (num - 1) / (num - num_features - 1)
        adj_r2 = 1 - (1 - r2) * (num - 1) / (num - num_features - 1)
        # calculate the right rate for predicting trend
        test = test.assign(trend=0)
        test.loc[(test.p_var_out >= test.p_var) &
                 (test.p_var_pre >= test.p_var), 'trend'] = 1
        test.loc[(test.p_var_out <= test.p_var) &
                 (test.p_var_pre <= test.p_var), 'trend'] = 1
        test = test.assign(trend2=0)
        test.loc[(test.p_var_out >= test.p_var) &
                 (test.p_var_pre2 >= test.p_var), 'trend2'] = 1
        test.loc[(test.p_var_out <= test.p_var) &
                 (test.p_var_pre2 <= test.p_var), 'trend2'] = 1
        t1 = test.trend.mean().round(4)
        t2 = test.trend2.mean().round(4)
        # output to file
        file.write("{},{},{},{},{},{},{},{}\n".format(indexs[i + 1], indexs[i],
                                                      num, num_test, adj_r1,
                                                      adj_r2, t1, t2))
        # save all predict vol
        out = test[['p_var_pre', 'p_var_pre2', 'p_var_out']]
        out = out.assign(date=indexs[i + 1])
        out.to_csv(settings.get_home_path() +
                   'data/total_predict_{}.csv'.format(file_name),
                   mode='a',
                   header=False,
                   float_format='%.3f')
        # training set at next time window
        train = test[input_col + ['p_var_out']]
        print("finish: ", indexs[i])
    file.close()
Example #7
XtCV = []
XvCV = []
YtCV = []
YvCV = []
for tr_idx, va_idx in Xsp0:
    XtCV.append(X[tr_idx])
    XvCV.append(X[va_idx])
    YtCV.append(Y[tr_idx])
    YvCV.append(Y[va_idx])

errTLF = []
errVLF = []
leaves = [2**k for k in range(1,11)]
for lf in leaves:
    errti=[]
    errvi=[]
    for i in range(5):
        rfr = RFR(n_estimators=50,max_features=8,min_samples_leaf=lf)
        rfr.fit(XtCV[0],YtCV[0])
        errti.append(mse(YtCV[0],rfr.predict(XtCV[0])))
        errvi.append(mse(YvCV[0],rfr.predict(XvCV[0])))
    errti = np.array(errti)
    errvi = np.array(errvi)
    errTLF.append(np.mean(errti))
    errVLF.append(np.mean(errvi))

#%%
plt.semilogx(leaves, errTLF,'*-', label='Train Err')
plt.semilogx(leaves, errVLF,'*-', label='Valid Err')
plt.legend()
plt.title('RandomForest Err vs MinLeaf')
plt.xticks(leaves,leaves)
plt.xlabel('min leaves')
# In[12]:

CVS(reg, xtrain, ytrain, cv=5, scoring='neg_mean_squared_error').mean()

# In[13]:

# List all the scoring metrics sklearn provides for model evaluation
import sklearn
sorted(sklearn.metrics.SCORERS.keys())

# In[14]:

# Use random forest and linear regression as a comparison

# Random forest
rfr = RFR(n_estimators=100)
CVS(rfr, xtrain, ytrain, cv=5).mean()

# In[15]:

CVS(rfr, xtrain, ytrain, cv=5, scoring='neg_mean_squared_error').mean()

# In[16]:

# Linear regression
lr = LinearR()
CVS(lr, xtrain, ytrain, cv=5, scoring='neg_mean_squared_error').mean()

# In[17]:

# Turn on the silent/verbosity parameter: when the data set is huge and training is slow, it can be used to monitor the model's progress (see the sketch below)
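
# A minimal sketch of that idea (an illustration, not from the original source;
# it assumes the xgboost package and the xtrain / ytrain arrays from the cells
# above, and newer xgboost releases expose `verbosity` in place of `silent`):
import xgboost as xgb
xgb_reg = xgb.XGBRegressor(n_estimators=100, verbosity=2)
# an eval_set together with verbose=True prints the eval metric after each boosting round
xgb_reg.fit(xtrain, ytrain, eval_set=[(xtrain, ytrain)], verbose=True)
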
 def __init__(self, featureset=None, target=None, mode='predict', path=''):
     if (mode == 'train'):
         self.__svm = SVC(C=1.0,
                          cache_size=200,
                          class_weight=None,
                          coef0=0.0,
                          decision_function_shape='ovr',
                          degree=3,
                          gamma='auto',
                          kernel='rbf',
                          max_iter=-1,
                          probability=False,
                          random_state=None,
                          shrinking=True,
                          tol=0.001,
                          verbose=False)
         self.__svr = SVR(C=1.0,
                          cache_size=200,
                          coef0=0.0,
                          degree=3,
                          epsilon=0.1,
                          gamma='auto',
                          kernel='rbf',
                          max_iter=-1,
                          shrinking=True,
                          tol=0.001,
                          verbose=False)
         self.__nusvm = NuSVC(cache_size=200,
                              class_weight=None,
                              coef0=0.0,
                              decision_function_shape='ovr',
                              degree=3,
                              gamma='auto',
                              kernel='rbf',
                              max_iter=-1,
                              nu=0.5,
                              probability=False,
                              random_state=None,
                              shrinking=True,
                              tol=0.001,
                              verbose=False)
         self.__nusvr = NuSVR(C=1.0,
                              cache_size=200,
                              coef0=0.0,
                              degree=3,
                              gamma='auto',
                              kernel='rbf',
                              max_iter=-1,
                              nu=0.5,
                              shrinking=True,
                              tol=0.001,
                              verbose=False)
         self.__linsvm = LinearSVC(C=1.0,
                                   class_weight=None,
                                   dual=True,
                                   fit_intercept=True,
                                   intercept_scaling=1,
                                   loss='squared_hinge',
                                   max_iter=1000,
                                   multi_class='ovr',
                                   penalty='l2',
                                   random_state=None,
                                   tol=0.0001,
                                   verbose=0)
         self.__linsvr = LinearSVR(C=1.0,
                                   dual=True,
                                   epsilon=0.0,
                                   fit_intercept=True,
                                   intercept_scaling=1.0,
                                   loss='epsilon_insensitive',
                                   max_iter=1000,
                                   random_state=None,
                                   tol=0.0001,
                                   verbose=0)
         self.__mlpc = MLPC(activation='relu',
                            alpha=1e-05,
                            batch_size='auto',
                            beta_1=0.9,
                            beta_2=0.999,
                            early_stopping=False,
                            epsilon=1e-08,
                            hidden_layer_sizes=(100, 25),
                            learning_rate='constant',
                            learning_rate_init=0.001,
                            max_iter=200,
                            momentum=0.9,
                            nesterovs_momentum=True,
                            power_t=0.5,
                            random_state=1,
                            shuffle=True,
                            solver='lbfgs',
                            tol=0.0001,
                            validation_fraction=0.1,
                            verbose=False,
                            warm_start=False)
         self.__mlpr = MLPR(activation='relu',
                            alpha=0.0001,
                            batch_size='auto',
                            beta_1=0.9,
                            beta_2=0.999,
                            early_stopping=False,
                            epsilon=1e-08,
                            hidden_layer_sizes=(100, 25),
                            learning_rate='constant',
                            learning_rate_init=0.001,
                            max_iter=200,
                            momentum=0.9,
                            nesterovs_momentum=True,
                            power_t=0.5,
                            random_state=None,
                            shuffle=True,
                            solver='adam',
                            tol=0.0001,
                            validation_fraction=0.1,
                            verbose=False,
                            warm_start=False)
         self.__dtc = DTC(class_weight=None,
                          criterion='gini',
                          max_depth=None,
                          max_features=None,
                          max_leaf_nodes=None,
                          min_impurity_decrease=0.0,
                          min_impurity_split=None,
                          min_samples_leaf=1,
                          min_samples_split=2,
                          min_weight_fraction_leaf=0.0,
                          presort=False,
                          random_state=None,
                          splitter='best')
         self.__dtr = DTR(criterion='mse',
                          max_depth=None,
                          max_features=None,
                          max_leaf_nodes=None,
                          min_impurity_decrease=0.0,
                          min_impurity_split=None,
                          min_samples_leaf=1,
                          min_samples_split=2,
                          min_weight_fraction_leaf=0.0,
                          presort=False,
                          random_state=None,
                          splitter='best')
         self.__rfc = RFC(bootstrap=True,
                          class_weight=None,
                          criterion='gini',
                          max_depth=100,
                          max_features='auto',
                          max_leaf_nodes=None,
                          min_impurity_decrease=0.0,
                          min_impurity_split=None,
                          min_samples_leaf=1,
                          min_samples_split=2,
                          min_weight_fraction_leaf=0.0,
                          n_estimators=50,
                          n_jobs=1,
                          oob_score=False,
                          random_state=None,
                          verbose=0,
                          warm_start=False)
         self.__rfr = RFR(bootstrap=True,
                          criterion='mse',
                          max_depth=None,
                          max_features='auto',
                          max_leaf_nodes=None,
                          min_impurity_decrease=0.0,
                          min_impurity_split=None,
                          min_samples_leaf=1,
                          min_samples_split=2,
                          min_weight_fraction_leaf=0.0,
                          n_estimators=10,
                          n_jobs=1,
                          oob_score=False,
                          random_state=None,
                          verbose=0,
                          warm_start=False)
         (self.__svm, self.__svr, self.__nusvm, self.__nusvr, self.__linsvm,
          self.__linsvr, self.__mlpc, self.__mlpr, self.__dtc, self.__dtr,
          self.__rfc, self.__rfr) = self.__trainAll(X=list(featureset),
                                                    Y=list(target))
         self.__saveModelsToFile(path)
     else:
         self.__svm = joblib.load(path + 'Mel_SVM.pkl')
         self.__svr = joblib.load(path + 'Mel_SVR.pkl')
         self.__nusvm = joblib.load(path + 'Mel_NuSVM.pkl')
         self.__nusvr = joblib.load(path + 'Mel_NuSVR.pkl')
         self.__linsvm = joblib.load(path + 'Mel_LinSVM.pkl')
         self.__linsvr = joblib.load(path + 'Mel_LinSVR.pkl')
         self.__mlpc = joblib.load(path + 'Mel_MLPC.pkl')
         self.__mlpr = joblib.load(path + 'Mel_MLPR.pkl')
         self.__dtc = joblib.load(path + 'Mel_DTC.pkl')
         self.__dtr = joblib.load(path + 'Mel_DTR.pkl')
         self.__rfc = joblib.load(path + 'Mel_RFC.pkl')
         self.__rfr = joblib.load(path + 'Mel_RFR.pkl')
Example #10
    dpaperi['collab_prestige'] = int(
        np.nan_to_num(
            np.median(np.exp(50) /
                      (np.exp(50) + np.exp(prestige_ca_vec)))) >= 0.35)
    dpaperi['collab_citation'] = np.nan_to_num(np.mean(citation_ca_vec))
    dpaperi['review'] = scipy.special.expit(20 * (review - 0.5))
    df_s_a_emb[index] = dpaperi

dsa = pd.DataFrame.from_dict(df_s_a_emb, orient='index')

dsa11 = dsa.loc[dsa['prestige'] == 1].loc[dsa['collab_prestige'] == 1]
dsa10 = dsa.loc[dsa['prestige'] == 1].loc[dsa['collab_prestige'] == 0]
dsa01 = dsa.loc[dsa['prestige'] == 0].loc[dsa['collab_prestige'] == 1]
dsa00 = dsa.loc[dsa['prestige'] == 0].loc[dsa['collab_prestige'] == 0]

rfs11 = RFR(n_estimators=50)
rfs10 = RFR(n_estimators=50)
rfs01 = RFR(n_estimators=50)
rfs00 = RFR(n_estimators=50)

rfs11 = rfs11.fit(dsa11[['citation', 'collab_citation']], dsa11['review'])
rfs10 = rfs10.fit(dsa10[['citation', 'collab_citation']], dsa10['review'])
rfs01 = rfs01.fit(dsa01[['citation', 'collab_citation']], dsa01['review'])
rfs00 = rfs00.fit(dsa00[['citation', 'collab_citation']], dsa00['review'])

ie1 = rfs11.predict(dsa[['citation', 'collab_citation']]) - rfs01.predict(
    dsa[['citation', 'collab_citation']])
ie0 = rfs10.predict(dsa[['citation', 'collab_citation']]) - rfs00.predict(
    dsa[['citation', 'collab_citation']])

re1 = rfs11.predict(dsa[['citation', 'collab_citation']]) - rfs10.predict(
Example #11
print("Accuracy: %0.2f (± %0.2f)" % (scores.mean(), scores.std() * 2))

keys = list(range(0, len(confs.keys())))
conf_keys = list(confs.keys())

acc = np.zeros((len(confs.keys()), ))
blind = np.zeros((len(confs.keys()), ))
for i in range(0, len(confs.keys())):
    d_temp = []

    for j in range(0, len(data)):
        if data[j, 1] == i:
            d_temp.append(list(data[j, :]) + [target[j]])
    if len(d_temp) > 0:
        d_temp = np.array(d_temp)
        lr = RFR()
        lr = lr.fit(
            np.array(d_temp)[:, 2].reshape(-1, 1),
            np.array(d_temp)[:, 3])
        scores = lr.score(
            np.array(d_temp)[:, 2].reshape(-1, 1),
            np.array(d_temp)[:, 3])  # , cv=5)
        acc[i] = 1 - scores
        blind[i] = np.mean(d_temp[:, 0])
        fig = plt.figure(figsize=(8.75, 7))
        plt.scatter(np.array(d_temp)[:, 2], np.array(d_temp)[:, 3])
        plt.title("Conference %s" % (conf_keys[i]))
        fig.savefig("output/status_review_conference_%d.png" % (keys[i]))

# plt.scatter(range(0,len(acc)),acc,c=blind)
Example #12
# display_score(rmse_scores)
# Mean: 71227.31692492112
# Standard deviation:  2926.49161963209

# Compare with the cross-validation scores of linear regression (LR)
lr_scores = cvs(lr, housing_prepared, housing_labels, scoring = "neg_mean_squared_error", cv = 10)
lr_rmse = np.sqrt(-lr_scores)
# display_score(lr_rmse)
# Mean: 69052.46136345083
# Standard deviation:  2731.6740017983425

# Comparing the two results, the decision tree regressor has clearly overfit badly and ends up worse than linear regression

# So try one last model: RandomForestRegressor
from sklearn.ensemble import RandomForestRegressor as RFR
rfr = RFR()
# rfr.fit(housing_prepared, housing_labels)
# housing_predicted = rfr.predict(housing_prepared)
# rmse = np.sqrt(mean_squared_error(housing_labels, housing_predicted))
# print("RMSE: ", rmse) # RMSE:  18620.70199601925
# rfr_scores = cvs(rfr, housing_prepared, housing_labels, scoring = "neg_mean_squared_error", cv = 10)
# rfr_rmse = np.sqrt(-rfr_scores)
# display_score(rfr_rmse)
# Mean: 50243.380660403775
# Stardard deviation:  1997.2178724397745
# The result is still overfit, since the training score (rmse) is far below the validation score (rfr_rmse)
# Note: don't spend too much time tuning hyperparameters here; the goal is to shortlist a few (2-5) promising models and then fine-tune them

# 6. Fine-tune the model
# Grid search
# Use Scikit-Learn GridSearchCV (see the sketch below)
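
# A minimal sketch of that grid search step (an illustration, not from the
# original source: the parameter grid values are assumptions, and it reuses
# housing_prepared / housing_labels from above).
from sklearn.model_selection import GridSearchCV
param_grid = {"n_estimators": [30, 100], "max_features": [4, 6, 8]}
grid_search = GridSearchCV(RFR(random_state=42), param_grid,
                           scoring="neg_mean_squared_error", cv=5)
grid_search.fit(housing_prepared, housing_labels)
print("Best params: ", grid_search.best_params_)
print("Best RMSE: ", np.sqrt(-grid_search.best_score_))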
Example #13
 def RandomForest_regression(self):
     model = RFR(n_estimators=1000, max_depth=10)
     model.fit(self.train_X, self.train_y)
     path = model.decision_path(self.train_X)
     self.y_pre_train = model.predict(self.train_X)
     self.y_pre_valid = model.predict(self.valid_X)
Example #14
def train_model():
    data = get_data()
    X_train, X_test, y_train, y_test = split_data(data)
    X_train, y_train = remove_county_state(X_train, y_train)
    X_test, y_test = remove_county_state(X_test, y_test)

    print('y_train', list(y_train))
    print('y_test', list(y_test))
    print('all y', list(y_train)+list(y_test))

    # data preprocessing (removing mean and scaling to unit variance with StandardScaler)
    pipeline = make_pipeline(StandardScaler(),
                             RFR())

    # set hyperparameters
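    # note: the 'randomforestregressor__' prefix below addresses the RFR step
    # inside the make_pipeline object (pipeline step names are the lowercased
    # class names)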
    hyperparameters = {
                        # 'randomforestregressor__max_features' : ['auto', 'sqrt'],
                        # 'randomforestregressor__max_depth': [3, 5, None],
                        # 'randomforestregressor__bootstrap': [True, False],
                        # 'randomforestregressor__min_samples_leaf': [3, 5, 7],
                        # 'randomforestregressor__min_samples_split': [5, 10, 15],
                        # 'randomforestregressor__n_estimators': [5, 8, 10, 15],

                        # 'randomforestregressor__max_features' : ['sqrt'],
                        # 'randomforestregressor__max_depth': [100],
                        # 'randomforestregressor__bootstrap': [ False],
                        # 'randomforestregressor__min_samples_leaf': [1],
                        # 'randomforestregressor__min_samples_split': [2],
                        # 'randomforestregressor__n_estimators': [200],

                        #
                        # 'randomforestregressor__max_leaf_nodes': [None],
                        # 'randomforestregressor__min_impurity_decrease': [0.0],
                        # 'randomforestregressor__min_impurity_split':[None],
                        # 'randomforestregressor__min_weight_fraction_leaf':[0.0],

                        'randomforestregressor__max_features': ['auto'],
                        'randomforestregressor__max_depth': [None],
                        'randomforestregressor__bootstrap': [True],
                        'randomforestregressor__min_samples_leaf': [5],
                        'randomforestregressor__min_samples_split': [10],
                        'randomforestregressor__n_estimators':[10],

                        # 'randomforestregressor__max_features': ['auto'],
                        # 'randomforestregressor__max_depth': [None],
                        # 'randomforestregressor__bootstrap': [True],
                        # 'randomforestregressor__min_samples_leaf': [5],
                        # 'randomforestregressor__min_samples_split': [10],
                        # 'randomforestregressor__n_estimators':[10, 30, 50, 70, 100],
                        }




    # tune model via pipeline
    clf = GridSearchCV(pipeline, hyperparameters, cv=3)

    clf.fit(X_train, y_train)

    pred = clf.predict(X_test)
    # print('feature importances:', clf.feature_importances_)
    print('r2 score:', r2_score(y_test, pred))
    print('mse:', mean_squared_error(y_test, pred))
    print('*'*20)
    print('best params:',clf.best_params_)
    print('best grid:', clf.best_estimator_)
    print('^'*20)
    eval_model(clf.best_estimator_, X_train, y_train, X_test, y_test)
    print('#'*20)
    print('score', clf.score(X_test, y_test))
    return clf
Example #15
for file in files:
    # Load Data
    with open(file, "rb") as f:
        datas = pickle.load(f)

    results = {}

    # boxcox-shift params
    lambda_ = datas['cat_data']['lambda']
    shift = datas['cat_data']['shift']

    # models
    models = {}
    models["RF"] = GridSearchCV(
        RFR(n_jobs=-1),
        param_grid={
            "n_estimators": [10, 100, 1000, 10000],
            "max_features": [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
        },
        cv=5,
        n_jobs=20)
    models["LASSO"] = LassoCV(max_iter=100000, cv=5, n_jobs=20)
    models["RIDGE"] = RidgeCV(cv=5)
    models["LASSOLARS"] = LassoLarsCV(max_iter=5000, cv=5, n_jobs=-1)
    models["SVR_POLY2"] = GridSearchCV(
        SVR(kernel='poly', degree=2),
        param_grid={
            "C": [0.001, 0.01, 0.1, 1, 10, 100, 1000, 10000],
            "gamma": [0.00001, 0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000],
            "epsilon": [0.01, 0.1, 0.5, 1, 2, 4]
Example #16
                          y_train,
                          cv=2).mean()
    return val


rfrBO = BayesianOptimization(
    rfrcv, {
        'n_estimators': (100, 400),
        'min_samples_split': (20, 100),
        'max_features': (0.1, 0.999)
    })
gp_params = {"alpha": 1e-5}
rfrBO.maximize(n_iter=10, **gp_params)

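# (the fixed values below were presumably read off the optimizer's best result,
#  e.g. rfrBO.max['params'] in recent bayes_opt versions; this is an assumption)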
rf = RFR(n_estimators=268,
         min_samples_split=20,
         max_features=9,
         random_state=42)
rf.fit(X_train, y_train)
pred = rf.predict(X_test)


def ridgecv(alpha):
    val = cross_val_score(Ridge(alpha=alpha, random_state=42),
                          X_train,
                          y_train,
                          cv=2).mean()
    return val


ridgeBO = BayesianOptimization(ridgecv, {'alpha': (0.01, 11)})
ridgeBO.maximize(n_iter=20, **gp_params)
Example #17
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.2,
                                                    random_state=0)

#Applying PCA
from sklearn.decomposition import PCA
pca = PCA(n_components=1)
X_train = pca.fit_transform(X_train)
X_test = pca.transform(X_test)
var = pca.explained_variance_ratio_

#Regressor
from sklearn.ensemble import RandomForestRegressor as RFR

regressor = RFR(n_estimators=100)
regressor.fit(X_train, y_train)

#prediction
y_pred = regressor.predict(X_test)

#Plotting
plt.scatter(X_train, y_train, color="red")
plt.plot(X_train, regressor.predict(X_train), color="blue")
plt.show()

plt.scatter(X_test, y_test, color="red")
plt.plot(X_test, y_pred, color="blue")
plt.show()
Example #18
train_test_combine.name = train_test_combine.name.astype('category')
train_test_combine.brand_name = train_test_combine.brand_name.astype(
    'category')
train_test_combine.general_cat = train_test_combine.general_cat.astype(
    'category')
train_test_combine.subcat_1 = train_test_combine.subcat_1.astype('category')
train_test_combine.subcat_2 = train_test_combine.subcat_2.astype('category')
train_test_combine.name = train_test_combine.name.cat.codes
train_test_combine.brand_name = train_test_combine.brand_name.cat.codes
train_test_combine.general_cat = train_test_combine.general_cat.cat.codes
train_test_combine.subcat_1 = train_test_combine.subcat_1.cat.codes
train_test_combine.subcat_2 = train_test_combine.subcat_2.cat.codes
# modeling
train_test_combine = train_test_combine.drop(["test_id", "train_id"], axis=1)
train_test_combined = pd.concat([
    train_test_combine.reset_index(drop=True),
    train_test_tfidf.reset_index(drop=True)
],
                                axis=1)
df_train = train_test_combined.loc[train_test_combined["is_train"] == 1]
df_test = train_test_combined.loc[train_test_combined["is_train"] == 0]
df_test = df_test.drop(["is_train"], axis=1)
df_train = df_train.drop(["is_train"], axis=1)
df_train["log_price"] = train.log_price.values
x_train, y_train = df_train.drop(['log_price'], axis=1), df_train.log_price
model = RFR(n_estimators=4)
model.fit(x_train, y_train)
y_test = model.predict(df_test)
submission = pd.DataFrame({"test_id": list(test["test_id"])})
submission["price"] = y_test
submission["price"] = submission["price"].apply(lambda x: np.exp(x) - 1)
Example #19
 def fun_rfr(x):
     clf = RFR(n_estimators=500, oob_score=True)
     rf_fit = clf.fit(X=x, y=pheno)
     return rf_fit.oob_score_
Example #20
print([*zip(poly.get_feature_names(),reg.coef_)][:10])
# Put the coefficients into a DataFrame and sort them
coeff = pd.DataFrame([poly.get_feature_names(),reg.coef_.tolist()]).T
coeff.columns = ["feature","coef"]
coeff.sort_values(by="coef", inplace=True)

# In[]:
from time import time
time0 = time()
print("R2:{}".format(reg.score(X_,y)))
print("time:{}".format(time()-time0))


# In[]:
# What if we use a different model?
from sklearn.ensemble import RandomForestRegressor as RFR

time0 = time()
print("R2:{}".format(RFR(n_estimators=100).fit(X,y).score(X,y))) # R2:0.9743205003727138
print("time:{}".format(time()-time0))

Example #21
XtCV = []
XvCV = []
YtCV = []
YvCV = []
for tr_idx, va_idx in Xsp0:
    XtCV.append(X[tr_idx])
    XvCV.append(X[va_idx])
    YtCV.append(Y[tr_idx])
    YvCV.append(Y[va_idx])

errTP = []
errVP = []
#%%
parents = [2**k for k in range(1,11)]
#%%
for pr in parents:
    errti=[]
    errvi=[]
    for i in range(5):
        rfr = RFR(n_estimators=50,max_features=8,min_samples_split=pr)
        rfr.fit(XtCV[0],YtCV[0])
        errti.append(mse(YtCV[0],rfr.predict(XtCV[0])))
        errvi.append(mse(YvCV[0],rfr.predict(XvCV[0])))
    errti = np.array(errti)
    errvi = np.array(errvi)
    errTP.append(np.mean(errti))
    errVP.append(np.mean(errvi))

#%%
plt.semilogx(parents, errTP,'*-', label='Train Err')
plt.legend()
plt.title('RandomForest Train Err vs MinParent')
plt.xticks(parents,parents)
plt.xlabel('min parent')
plt.ylabel('err')
Example #22
#KNeighbors Regression
knr = KNR(n_neighbors = 4, weights = 'distance', p = 4)

#training
knr.fit(X_train, y_train)

#testing
y_pred_knr = knr.predict(X_test)

#r_square
r_2_knr = r2_score(y_test, y_pred_knr)



#Random Forest Regression
rfr = RFR(n_estimators = 100, max_features = 'auto', random_state = 1)

#training
rfr.fit(X_train, y_train)

#testing
y_pred_rfr = rfr.predict(X_test)

#r_square
r_2_rfr = r2_score(y_test, y_pred_rfr)

#feature importance
fet = rfr.feature_importances_


Example #23
            url_data['station'],
            np.dot(url_data['day'], 24) + url_data['hour']
        ]).T
        if city == 'bj':
            y_train_data = np.array(
                [url_data['pm25'], url_data['pm10'], url_data['o3']]).T
        else:
            y_train_data = np.array([url_data['pm25'], url_data['pm10']]).T

        for i in range(2):
            station_list = []
            for station in range(STATION_NUM[city]):
                station_list = np.concatenate(
                    (station_list, [station] * HOUR_NUM))
            day_delta_list = [url_data['day_delta'] + i
                              ] * HOUR_NUM * STATION_NUM[city]
            hour_list = list(range(HOUR_NUM)) * STATION_NUM[city]
            x_data = np.array([station_list, day_delta_list, hour_list]).T
            hour_list = list(range(HOUR_NUM)) * STATION_NUM[city] + np.dot(
                day_delta_list, 24)
            x_predict_data = np.array([station_list, hour_list]).T

            regr_rf = RFR(max_depth=MAX_DEPTH, random_state=2)
            regr_rf.fit(x_train_data, y_train_data)

            y_rf = regr_rf.predict(x_predict_data)
            y_rf[y_rf < 0] = 0
            filename = 'sub.csv'
            csv_saver = utils.CsvSaver(x_data, y_rf, city, filename, day=i)
            csv_saver.save()
Example #24
def testPCA(components):

    #pca_trans=PCA(n_components=components,random_state=1)
    pca_trans = tsvd(n_components=components, random_state=7, n_iter=10)

    pca_trans.fit(data)

    data2 = pca_trans.transform(data)

    #MinMax Normalizer
    scaler = MinMaxScaler()
    scaler.fit(data2)
    data2 = scaler.transform(data2)

    y["target"] = np.log1p(y["target"])

    #train test split
    x_train, x_test, y_train, y_test = tts(data2, y["target"], test_size=0.20)

    #######################----------Algos--------------------#######################
    ranfor = RFR(n_estimators=500, verbose=0, n_jobs=-1, random_state=7)
    extratrees = ETR(n_estimators=500, random_state=7)
    bagging = BR(ETR(n_estimators=10, random_state=1),
                 n_estimators=100,
                 random_state=7)
    """---XGBOOST---"""
    xgb_train = xgb.DMatrix(x_train, label=y_train)
    xgb_validate = xgb.DMatrix(x_test, label=y_test)
    xgb_test_pred = xgb.DMatrix(x_test)

    param = {}
    param['objective'] = 'reg:linear'
    param['eta'] = 0.001
    param['max_depth'] = 6
    param['alpha'] = 0.001
    param['colsample_bytree'] = 0.6
    param['subsample'] = 0.6
    param['silent'] = 0
    param['nthread'] = 4
    param['random_state'] = 42
    param['eval_metric'] = 'rmse'

    watchlist = [(xgb_train, 'train'), (xgb_validate, 'validation')]
    """-fit-"""
    ranfor.fit(x_train, y_train)
    extratrees.fit(x_train, y_train)

    bst = xgb.train(param,
                    xgb_train,
                    10000,
                    watchlist,
                    early_stopping_rounds=100,
                    verbose_eval=100,
                    maximize=False)

    y_pred = ranfor.predict(x_test)
    y_pred_ada = extratrees.predict(x_test)
    y_pred_xgb = bst.predict(xgb_test_pred, ntree_limit=bst.best_ntree_limit)

    #blending
    blending_X = pd.DataFrame()
    blending_X['xgb'] = bst.predict(xgb.DMatrix(x_train),
                                    ntree_limit=bst.best_ntree_limit)
    blending_X['ExtraTrees'] = extratrees.predict(x_train)
    blending_X['ranfor'] = ranfor.predict(x_train)

    bagging.fit(blending_X, y_train)

    blending_test = pd.DataFrame()
    blending_test['xgb'] = y_pred_xgb
    blending_test['ExtraTrees'] = y_pred_ada
    blending_test['ranfor'] = y_pred

    y_pred_grad = bagging.predict(blending_test)
    ###############################################

    y_pred_2best = (0.6 * y_pred_ada) + (0.4 * y_pred_xgb)

    print("PCA: %s --- Ranfor RMSE is : %s" %
          (components, np.sqrt(mse(y_test, y_pred))))
    print("PCA: %s --- ExtraTrees RMSE is : %s" %
          (components, np.sqrt(mse(y_test, y_pred_ada))))
    print("PCA: %s --- XGBoost RMSE is : %s" %
          (components, np.sqrt(mse(y_test, y_pred_xgb))))

    print("PCA: %s --- blended bagging RMSE is : %s" %
          (components, np.sqrt(mse(y_test, y_pred_grad))))
    print("PCA: %s --- XGBoost+ExtraTrees RMSE is : %s" %
          (components, np.sqrt(mse(y_test, y_pred_2best))))

    return {
        "pca": pca_trans,
        "scaler": scaler,
        "ranfor": ranfor,
        'extratrees': extratrees,
        'bagging': bagging,
        'xgboost': bst
    }
Example #25
def buy_ml_vol(stock,
               forecast_out=5,
               month=None,
               day=None,
               plot=False,
               year=2019,
               best_combination=None):
    # Want to separate 1 percent of the data to forecast
    days = [31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31]

    # Today info
    if ((month == None) or (day == None)):
        today = datetime.datetime.now()
        month = today.month if ((today.day + forecast_out) <=
                                days[today.month - 1]) else today.month + 1
        day = today.day+forecast_out if((today.day-forecast_out)<=days[today.month-1]) else \
              today.day+forecast_out-days[today.month-1]
        day = today.day+forecast_out if(today.day+forecast_out == days[today.month-1]) else \
              (today.day+forecast_out)%days[today.month-1]

    # For prediction
    start = datetime.datetime(2016, 4, 1)
    end = datetime.datetime(year, month, day)
    #df = web.DataReader(stock, 'yahoo', start, end)
    df = read_data(stock, start, end)
    #print("BUYING")
    if (df.empty):
        return [0] * 10, "ERROR"
    dfreg = df.loc[:, ['adjusted close', 'volume']]
    dfreg['HL_PCT'] = (df['high'] - df['low']) / df['adjusted close'] * 100.0
    dfreg['PCT_change'] = (df['adjusted close'] -
                           df['open']) / df['open'] * 100.0

    # Drop missing value
    dfreg.fillna(value=-99999, inplace=True)

    # Separating the label here; we want to predict the volume
    forecast_col = 'volume'
    dfreg['label'] = dfreg[forecast_col].shift(-forecast_out)
    X = np.array(dfreg.drop(['label'], axis=1))

    # Scale X for linear regression
    try:
        X = preprocessing.scale(X)
    except ValueError:
        print("DATA: {}".format(X))
        print("STOCK: {}".format(stock))
        print("START PERIOD: {}".format(start))
        print("END PERIOD: {}".format(end))

    # Finally want late X and early X for model
    X_lately = X[-forecast_out:]
    X = X[:-forecast_out]

    # Separate label and identify it as y
    y = np.array(dfreg['label'])
    y = y[:-forecast_out]

    # Training and testing sets
    X_train = X[:len(X) - forecast_out]
    X_test = X[len(X) - forecast_out:]

    y_train = y[:len(y) - forecast_out]
    y_test = y[len(y) - forecast_out:]

    # LinReg
    clfreg = LinearRegression(n_jobs=-1)

    # QuadReg2
    clfpoly2 = make_pipeline(PolynomialFeatures(2), Ridge())

    # QuadReg3
    clfpoly3 = make_pipeline(PolynomialFeatures(3), Ridge())

    # QuadReg4
    clfpoly4 = make_pipeline(PolynomialFeatures(4), Ridge())

    # QuadReg5
    clfpoly5 = make_pipeline(PolynomialFeatures(5), Ridge())

    # KNN Regression
    clfknn = KNeighborsRegressor(n_neighbors=2)

    # Bayesian Ridge
    clfbayr = BayesianRidge()

    # Neural Network
    #clfmlp = MLPRegressor(hidden_layer_sizes=(100,100,100), learning_rate='adaptive',
    #          solver='adam', max_iter=5, verbose=False)
    #clfmlp.fit(X_train, y_train)

    # Random Forest Regressor
    clfrfr = RFR(n_estimators=15, random_state=0)

    # Support Vector Regressor
    clfsvr = SVR(gamma='auto')

    # Fitting
    threads = []
    models = [
        clfreg, clfpoly2, clfpoly3, clfpoly4, clfpoly5, clfknn, clfbayr,
        clfrfr, clfsvr
    ]
    fits = [''] * len(models)
    for i in range(len(models)):
        process = Thread(target=fitting,
                         args=[models[i], X_train, y_train, fits, i],
                         name=stock)
        process.start()
        threads.append(process)

    for process in threads:
        process.join()

    # Evaluation
    #confidencereg = clfreg.score(X_train, y_train)
    #confidencepoly2 = clfpoly2.score(X_train, y_train)
    #confidencepoly3 = clfpoly3.score(X_train, y_train)
    #confidenceknn = clfknn.score(X_train, y_train)
    #confidencebayr = clfbayr.score(X_train, y_train)

    # Predictions
    reg_forecast = fits[0].predict(X_lately)
    poly2_forecast = fits[1].predict(X_lately)
    poly3_forecast = fits[2].predict(X_lately)
    poly4_forecast = fits[3].predict(X_lately)
    poly5_forecast = fits[4].predict(X_lately)
    knn_forecast = fits[5].predict(X_lately)
    bayr_forecast = fits[6].predict(X_lately)
    #mlp_forecast = clfmlp.predict(X_lately)
    rfr_forecast = fits[7].predict(X_lately)
    svr_forecast = fits[8].predict(X_lately)

    # Set up dataframe
    dfreg['reg_forecast'] = np.nan
    dfreg['poly2_forecast'] = np.nan
    dfreg['poly3_forecast'] = np.nan
    dfreg['poly4_forecast'] = np.nan
    dfreg['poly5_forecast'] = np.nan
    dfreg['knn_forecast'] = np.nan
    dfreg['bayr_forecast'] = np.nan
    # (no mlp_forecast column here: clfmlp is never fitted and only nine
    #  forecasts are appended per row below)
    dfreg['rfr_forecast'] = np.nan
    dfreg['svr_forecast'] = np.nan

    last_date = dfreg.iloc[-1].name
    last_unix = last_date
    next_unix = last_unix + datetime.timedelta(days=1)
    for i in zip(reg_forecast, poly2_forecast, poly3_forecast, poly4_forecast,
                 poly5_forecast, knn_forecast, bayr_forecast, rfr_forecast,
                 svr_forecast):
        next_date = next_unix
        next_unix += datetime.timedelta(days=1)
        dfreg.loc[next_date] = list(
            [np.nan for _ in range(len(dfreg.columns) - 9)] + list(i))

    dfreg['mean_forecast'] = dfreg[[
        'reg_forecast',
        'poly2_forecast',
        'poly3_forecast',
        'knn_forecast',
        'bayr_forecast',  # 'mlp_forecast',
        'rfr_forecast'
    ]].mean(axis=1)
    if (plot):
        dfreg['volume'].tail(50).plot(lw=2, figsize=(20, 12))
        dfreg['mean_forecast'].tail(50).plot(lw=2, c='k')
        dfreg['bayr_forecast'].tail(50).plot(lw=0.5)
        dfreg['knn_forecast'].tail(50).plot(lw=0.5)
        dfreg['reg_forecast'].tail(50).plot(lw=0.5)
        dfreg['poly2_forecast'].tail(50).plot(lw=0.5)
        dfreg['poly3_forecast'].tail(50).plot(lw=0.5)
        #dfreg['mlp_forecast'].tail(50).plot(lw=0.5)
        dfreg['rfr_forecast'].tail(50).plot(lw=0.5)
        plt.legend(loc='best')
        plt.xlabel('Date')
        plt.ylabel('Volume')
        plt.title(stock)
        plt.savefig("./pred_plots/{}_{}/volume/{}_{}_{}".format(
            today.day, today.month, stock, today.day, today.month))
        #plt.show()
        plt.close()
        #dfreg['volume'].tail(200).plot()
        #plt.title(stock)
        #plt.show()

    #if(not(best_combination==None)):
    #    dfreg['best_mean_forecast'] = dfreg[[*list(best_combination)]].mean(axis=1)
    #    fit = np.polyfit([i for i in range(forecast_out)],
    #                      dfreg['best_mean_forecast'].values[-forecast_out:], deg=1)
    #else:
    try:
        fit = np.polyfit([i for i in range(forecast_out)],
                         dfreg['mean_forecast'].values[-forecast_out:],
                         deg=1)
    except:
        print("FORECASTING {} DAY OUT".format(forecast_out))
        fit = [
            dfreg['mean_forecast'].values[-1] -
            dfreg['adjusted close'].values[-1], 2
        ]

    string = "VOLUME SHOULD GO UP" if (fit[0] > 0) else "VOlUME SHOULD GO DOWN"
    #print("{} {}".format(stock, string))
    #print("VOLUME HAS BEEN FIT: {}".format(fit[0]))
    return fit[0], dfreg['volume'].values[-forecast_out - 1]
Example #26
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

dataset = pd.read_csv('Position_Salaries.csv')
X = dataset.iloc[:, 1:2].values
y = dataset.iloc[:, 2].values

from sklearn.ensemble import RandomForestRegressor as RFR
regressor = RFR(n_estimators=300, random_state=0)
regressor.fit(X, y)

y_pred = regressor.predict([[6.5]])

X_grid = np.arange(min(X), max(X), 0.01)
X_grid = X_grid.reshape((len(X_grid), 1))
plt.scatter(X, y, color='red')
plt.plot(X_grid, regressor.predict(X_grid), color='blue')
plt.title('Truth or Bluff (Regression Model)')
plt.xlabel('Position level')
plt.ylabel('Salary')
plt.show()

# There seems to be a large amount of noise in this model, perhaps due to overfitting.

# ## Random Forest Algorithm

# In[35]:


from sklearn.ensemble import RandomForestRegressor as RFR

# set seed for consistency
np.random.seed(171)

# fit random forest
random_forest = RFR()
random_forest.fit(X_train, y_train)


# In[36]:


random_forest.score(X=X_test, y=y_test)


# In[37]:


# predict using the random forest model
y_pred = random_forest.predict(X_test)
Example #28
def render(img,
           features='coordinates',
           ratio=0.00025,
           iterations=1,
           lab=True,
           depth=None,
           npxs=5e5,
           anti_aliasing=False,
           verbose=False):
    """
	Features decides the inputs for the model, current options are 'coordinates' for coordinate based features
	and 'landmarks' for distance to landmarks features. Render time and memory usage is about 2x using landmarks features.
	Default is 'coordinates'.

	ratio corresponds to the ratio of the size of the smallest details the model is allowed to use compared to the whole image.
	Default is 0.001, 1 would correspond to not fitting to anything while 0 would fit down to individual pixels.

	iterations is how many randomized runs of the base model to use for averaging in the final prediction.
	1 is default is corresponds to sharp boundaries. 10-100 would result in a much smoother more painterly result.
	Render time and memory usage increases linearly with number of iterations.

	lab decides whether to fit the model in lab color space instead of rgb color space. 
	Default is True.

	depth decides how many levels of splits the regressor is allowed to have.
	Default is None which corresponds to as many as needed.

	np decides how many pixels to resize the source image to internally for fitting.
	Default is 500,000. 

	anti_aliasing decides whether or not to use 2x grid super sampling.
	Default is False. Render time and memory usage will be increased over 2x.

	verbose controls whether to print info about a render
	Default is False.
	"""
    t = time()

    w, h = img.shape[:2]

    wrender, hrender = w, h
    if anti_aliasing:
        wrender, hrender = w * 2, h * 2

    img_o = pixel_scale(img, npxs)
    wfit, hfit = img_o.shape[:2]

    if lab:
        img = rgb2lab(img_o)

    if features == 'landmarks':
        locations = list(np.linspace(0, 1, 7))
        landmarks = list(product(locations, locations))
    else:
        landmarks = None

    X, Y = gen_xy(img, landmarks)
    xrender = gen_x(wrender, hrender, landmarks)

    min_samples = int(round(ratio * len(X)))
    model = RFR(n_estimators=iterations,
                n_jobs=-1,
                max_depth=depth,
                random_state=42,
                min_samples_leaf=min_samples)
    model.fit(X[:, 2:], Y)

    pred = model.predict(xrender[:, 2:])
    pred_img = pred_to_img(pred, xrender, wrender, hrender)

    if lab:
        pred_img = lab2rgb(pred_img)

    error = np.mean(np.square(resize(pred_img, (wfit, hfit)) - img_o)) * 255.

    if anti_aliasing:
        pred_img = resize(pred_img, (w, h))

    if verbose:
        s = "%08.3f seconds to render\n" % (time() - t)
        s += "%08.3f error (0-255 scaled)\n" % (error)
        s += "%08.3f min pixels considered\n" % (min_samples)
        print(s)

    return pred_img
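
# A minimal usage sketch for render() (an illustration, not from the original
# source: 'photo.jpg' is a placeholder path, the parameter values are arbitrary,
# and scikit-image is assumed for image I/O).
from skimage.io import imread, imsave
from skimage import img_as_ubyte

source = imread('photo.jpg')                     # RGB image as a numpy array
painted = render(source, ratio=0.001, iterations=10, verbose=True)
imsave('render_rf.png', img_as_ubyte(painted))   # render output is assumed to lie in [0, 1]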
Example #29
    mean_squared_error(y_pred_knr, y_test_energy)))

#Decision Tree Regressor
from sklearn.tree import DecisionTreeRegressor as DTR
dtr_energy = DTR(max_depth=11,
                 min_samples_split=16,
                 min_samples_leaf=2,
                 random_state=37).fit(X_train_energy_stand, y_train_energy)
y_pred_dtr = dtr_energy.predict(X_test_energy_stand)
print("Mean squared error for DTR: {:.3f}.".format(
    mean_squared_error(y_pred_dtr, y_test_energy)))

#Random Forest Regressor
from sklearn.ensemble import RandomForestRegressor as RFR
rfr_energy = RFR(n_estimators=100,
                 min_samples_leaf=2,
                 max_leaf_nodes=1000,
                 random_state=37).fit(X_train_energy, y_train_energy)
y_pred_rfr = rfr_energy.predict(X_test_energy)
print("Mean squared error for RFR: {:.3f}.".format(
    mean_squared_error(y_pred_rfr, y_test_energy)))

#Support Vector
from sklearn.svm import SVR
svr_energy = SVR().fit(X_train_energy_stand, y_train_energy)
y_pred_svr = svr_energy.predict(X_test_energy_stand)
print("Mean squared error for SVR: {:.3f}.".format(
    mean_squared_error(y_pred_svr, y_test_energy)))

from sklearn.neural_network import MLPRegressor as MLPR
mlpr_energy = MLPR(hidden_layer_sizes=(100, 100),
                   alpha=.3,
Example #30
XtCV = []
XvCV = []
YtCV = []
YvCV = []
for tr_idx, va_idx in Xsp0:
    XtCV.append(X[tr_idx])
    XvCV.append(X[va_idx])
    YtCV.append(Y[tr_idx])
    YvCV.append(Y[va_idx])

errTD = []
errVD = []
D = list(range(5, 60, 5))
for d in D:
    errti = []
    errvi = []
    for i in range(5):
        rfr = RFR(n_estimators=50, max_depth=d)
        rfr.fit(XtCV[0], YtCV[0])
        YtHat = rfr.predict(XtCV[0])
        YvHat = rfr.predict(XvCV[0])
        errti.append(mse(YtCV[0], YtHat))
        errvi.append(mse(YvCV[0], YvHat))
    errti = np.array(errti)
    errvi = np.array(errvi)
    errTD.append(np.mean(errti))
    errVD.append(np.mean(errvi))

#%%
plt.plot(D, errTD, '*-', label='Train Err')
plt.plot(D, errVD, '*-', label='Valid Err')
plt.legend()
plt.title('RandomForest Err vs MaxDepth')