Example #1
ols.fit(train, spamtrain)



expected = spamtest

predicted = clf.predict(test)
predicted1 = ols.predict(test)


#print(spamtrain)

#print(predicted)


print(clf.score(test, spamtest))
print(ols.score(test, spamtest))


# Create a blank figure with labels
p = figure(plot_width = 600, plot_height = 600, 
           title = 'Example Glyphs',
           x_axis_label = 'X', y_axis_label = 'Y')



# Add square glyphs
p.square(clf.X_offset_, ols.coef_, size = 12, color = 'navy', alpha = 0.6)

bokehShow(p)
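
# `bokehShow` is not part of the Bokeh API; a minimal sketch of the helper
# assumed above, under the assumption that it simply delegates to
# bokeh.plotting.show:
from bokeh.plotting import show

def bokehShow(fig):
    # Display the figure (in a browser, or inline if output_notebook() was called).
    show(fig)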
#        cv=ShuffleSplit(X_train.shape[0], n_splits=10, test_fraction=0.6,
#            random_state=0))
t1 = time()
sc.fit(X_train, y_train)
sc_time = time() - t1
computed_coefs = sc.inverse_transform()
computed_coefs = np.reshape(computed_coefs, [size, size, size])
score = sc.score(X_test, y_test)


###############################################################################
# Compute the results for simple BayesianRidge
t1 = time()
clf.fit(X_train, y_train)
bayes_time = time() - t1
bayes_coefs = clf.coef_
bayes_score = clf.score(X_test, y_test)
bayes_coefs = bayes_coefs.reshape((size, size, size))


###############################################################################
# Plot the results

pl.close('all')
pl.figure()
pl.title('Scores of the supervised clustering')
pl.subplot(2, 1, 1)
pl.plot(np.arange(len(sc.scores_)), sc.scores_)
pl.xlabel('iteration')
pl.ylabel('score')
pl.title('Score of the best parcellation of each iteration')
pl.subplot(2, 1, 2)
Example #3
class IndividualTest:
    def __init__(self):
        self.test = Test()
        self.file_io = FileIO()
        self.lr = LinearRegression(normalize=True)
        self.br = BayesianRidge()
        #self.svr_lin = SVR(kernel='linear', C=1e5)
        self.svr_poly = SVR(kernel='poly', C=1e5, degree=2)
        self.svr_rbf = SVR(kernel='rbf', C=5e4, gamma='scale')
        self.svr_sig = SVR(kernel='sigmoid', C=1e3)
        #self.gridsearch = GridSearchCV(SVR(kernel='rbf'), scoring="r2", return_train_score=True)
        self.sc = StandardScaler()
        self.ms = MinMaxScaler()
        self.chart = DrawChart2()

    def lin_reg(self, X, Y, train_test_ratio, col_list, out_path):

        # Create an empty DataFrame
        df = pd.DataFrame(
            index=['coefficient', 'intercept', 'train_score', 'test_score'],
            columns=[])
        #print(df.head())

        for col in col_list:
            s_X = pd.DataFrame(X[col])
            s_Y = Y

            # Split into training and test data (30%)
            s_X_train, s_X_test, s_Y_train, s_Y_test = self.test.make_train_test_data(
                s_X, s_Y, train_test_ratio)

            # Simple linear regression per column
            self.lr.fit(s_X_train, s_Y_train)

            # Partial regression coefficients
            coef = self.lr.coef_

            # Intercept (error term)
            intercept = self.lr.intercept_

            # Training score
            train_score = self.lr.score(s_X_train, s_Y_train)

            # Test score
            test_score = self.lr.score(s_X_test, s_Y_test)

            # Append to the DataFrame
            df[col] = [coef, intercept, train_score, test_score]

            # Regression curve
            lin_pred = self.lr.predict(s_X_test)

            plt.plot(s_X_test, s_Y_test, 'bo-', s_X_test, lin_pred, 'go-')
            plt.show()

            #if col in ['売上単価','コース受諾回数_なし','数量','施術時間','指名回数_あり','治療送客回数_あり','治療送客回数_なし']:
            # Draw the chart
            #self.chart.draw(self.lr, s_X_test, s_Y_test, col, 'score is {}'.format(test_score))

        # Write out to a CSV file
        self.file_io.export_csv_from_pandas(df, out_path)

    def bayesian_reg(self, X, Y, train_test_ratio, col_list, out_path):

        # Create an empty DataFrame
        df = pd.DataFrame(
            index=['coefficient', 'intercept', 'train_score', 'test_score'],
            columns=[])
        #print(df.head())

        for col in col_list:
            s_X = pd.DataFrame(X[col])
            s_Y = Y

            # Split into training and test data (30%)
            s_X_train, s_X_test, s_Y_train, s_Y_test = self.test.make_train_test_data(
                s_X, s_Y, train_test_ratio)

            # Simple regression per column
            self.br.fit(s_X_train, s_Y_train)

            # Partial regression coefficients
            coef = self.br.coef_

            # Intercept (error term)
            intercept = self.br.intercept_

            # Training score
            train_score = self.br.score(s_X_train, s_Y_train)

            # Test score
            test_score = self.br.score(s_X_test, s_Y_test)

            # Append to the DataFrame
            df[col] = [coef, intercept, train_score, test_score]

            if col in [
                    '売上単価', 'コース受諾回数_なし', '数量', '施術時間', '指名回数_あり', '治療送客回数_あり',
                    '治療送客回数_なし'
            ]:
                # Draw the chart
                self.chart.draw(self.br, s_X_test, s_Y_test, col,
                                'score is {}'.format(test_score))

        # Write out to a CSV file
        self.file_io.export_csv_from_pandas(df, out_path)

    def svr_rbf_reg(self, X, Y, train_test_ratio, col_list, out_path):

        # Create an empty DataFrame
        df = pd.DataFrame(index=[
            'coefficient', 'support_vector', 'intercept', 'train_score',
            'test_score'
        ],
                          columns=[])
        #print(df.head())

        for col in col_list:
            s_X = pd.DataFrame(X[col])
            s_Y = Y

            # Split into training and test data (30%)
            s_X_train, s_X_test, s_Y_train, s_Y_test = self.test.make_train_test_data(
                s_X, s_Y, train_test_ratio)

            # Regression per column
            #self.svr_lin.fit(s_X_train, s_Y_train)
            #self.svr_poly.fit(s_X_train, s_Y_train)
            self.svr_rbf.fit(s_X_train, s_Y_train)
            #self.gridsearch.fit(s_X_train, s_Y_train)

            # Dual coefficients
            coef = self.svr_rbf.dual_coef_

            # Support vectors
            support_vec = self.svr_rbf.support_vectors_

            # Intercept (error term)
            intercept = self.svr_rbf.intercept_

            # Scores
            train_score = self.svr_rbf.score(s_X_train, s_Y_train)
            test_score = self.svr_rbf.score(s_X_test, s_Y_test)

            # Append to the DataFrame
            df[col] = [coef, support_vec, intercept, train_score, test_score]

            #lin_pred = self.svr_lin.predict(s_X_test)
            #poly_pred = self.svr_poly.predict(s_X_test)
            rbf_pred = self.svr_rbf.predict(s_X_test)

            plt.plot(s_X_test, s_Y_test, 'bo-', s_X_test, rbf_pred, 'go-')
            plt.show()

            if col in ['生年月日']:
                #plt.plot(s_X_test, s_Y_test, 'bo-', s_X_test, lin_pred, 'ro-')
                #plt.plot(s_X_test, s_Y_test, 'bo-', s_X_test, poly_pred, 'yo-')
                plt.plot(s_X_test, s_Y_test, 'bo-', s_X_test, rbf_pred, 'go-')
                plt.show()

        # Write out to a CSV file
        self.file_io.export_csv_from_pandas(df, out_path)

    def svr_poly_reg(self, X, Y, train_test_ratio, col_list, out_path):

        # Create an empty DataFrame
        df = pd.DataFrame(index=[
            'coefficient', 'support_vector', 'intercept', 'train_score',
            'test_score'
        ],
                          columns=[])
        #print(df.head())

        for col in col_list:
            s_X = pd.DataFrame(X[col])
            s_Y = Y

            # Split into training and test data (30%)
            s_X_train, s_X_test, s_Y_train, s_Y_test = self.test.make_train_test_data(
                s_X, s_Y, train_test_ratio)

            # Regression per column
            self.svr_poly.fit(s_X_train, s_Y_train)

            # Dual coefficients
            coef = self.svr_poly.dual_coef_

            # Support vectors
            support_vec = self.svr_poly.support_vectors_

            # Intercept (error term)
            intercept = self.svr_poly.intercept_

            # Scores
            train_score = self.svr_poly.score(s_X_train, s_Y_train)
            test_score = self.svr_poly.score(s_X_test, s_Y_test)

            # Append to the DataFrame
            df[col] = [coef, support_vec, intercept, train_score, test_score]

            #lin_pred = self.svr_lin.predict(s_X_test)
            poly_pred = self.svr_poly.predict(s_X_test)

            if col in ['生年月日']:
                #plt.plot(s_X_test, s_Y_test, 'bo-', s_X_test, lin_pred, 'ro-')
                plt.plot(s_X_test, s_Y_test, 'bo-', s_X_test, poly_pred, 'yo-')
                #plt.plot(s_X_test, s_Y_test, 'bo-', s_X_test, rbf_pred, 'go-')
                plt.show()

        # Write out to a CSV file
        self.file_io.export_csv_from_pandas(df, out_path)

    def svr_sig_reg(self, X, Y, train_test_ratio, col_list, out_path):

        # Create an empty DataFrame
        df = pd.DataFrame(index=[
            'coefficient', 'support_vector', 'intercept', 'train_score',
            'test_score'
        ],
                          columns=[])
        #print(df.head())

        for col in col_list:
            s_X = pd.DataFrame(X[col])
            s_Y = Y

            # Split into training and test data (30%)
            s_X_train, s_X_test, s_Y_train, s_Y_test = self.test.make_train_test_data(
                s_X, s_Y, train_test_ratio)

            # Regression per column
            self.svr_sig.fit(s_X_train, s_Y_train)

            # Dual coefficients
            coef = self.svr_sig.dual_coef_

            # Support vectors
            support_vec = self.svr_sig.support_vectors_

            # Intercept (error term)
            intercept = self.svr_sig.intercept_

            # Scores
            train_score = self.svr_sig.score(s_X_train, s_Y_train)
            test_score = self.svr_sig.score(s_X_test, s_Y_test)

            # Append to the DataFrame
            df[col] = [coef, support_vec, intercept, train_score, test_score]

            sig_pred = self.svr_sig.predict(s_X_test)

            plt.plot(s_X_test, s_Y_test, 'bo-', s_X_test, sig_pred, 'go-')
            plt.show()

            if col in ['生年月日', '閲覧ページ総数', '閲覧ページ数/セッション', '滞在時間']:
                plt.plot(s_X_test, s_Y_test, 'bo-', s_X_test, sig_pred, 'go-')
                plt.show()

        # Write out to a CSV file (use the out_path parameter, matching the other methods)
        self.file_io.export_csv_from_pandas(df, out_path)
Example #4
def test3():
    name = request.form["name"]
    target = request.form["target"]
    test_size = request.form["test_size"]
    dataset = request.files["dataset"]
    df = pd.read_csv(dataset)

    # Directory setup
    rootdirectory = name
    parent_dir = "/home/sanfer/Documents/ml-examples-vuejs-flask/web-app/src/assets/"
    path = os.path.join(parent_dir, rootdirectory)
    working = path  #working path
    os.mkdir(path)

    plotdirectory = "plots"
    plot_parent_dir = parent_dir + rootdirectory + '/'
    path = os.path.join(plot_parent_dir, plotdirectory)
    plots_dir = path  #plot path
    os.mkdir(path)

    modeldirectory = "models"
    model_parent_dir = parent_dir + rootdirectory + "/"
    path = os.path.join(model_parent_dir, modeldirectory)
    model_dir = path  #model path
    os.mkdir(path)

    # Pre-processing plots

    snsdist = sns.distplot(df[target])
    snsdist = snsdist.get_figure()
    snsdist.savefig(plots_dir + "/dist.png")
    snsdist.clf()

    features = {}

    dataTypes = df.dtypes
    for items in dataTypes.items():
        # print(items)
        # print((items[1].name))
        if (items[1].name != 'float64' and items[1].name != 'int64'):
            df.drop(labels=items[0], axis=1, inplace=True)
        else:
            features.update({items[0]: items[1].name})

    del features[target]
    features = json.dumps(features)
    y = df[target]
    df.drop(labels=target, axis=1, inplace=True)
    # replace() returns a copy; rebind instead of calling inplace on the temporary
    df = df.replace(0, np.nan).fillna(df.mean())
    X = df[list(df.columns)]

    from sklearn.model_selection import train_test_split
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=float(test_size), random_state=101)
    from sklearn.linear_model import BayesianRidge
    lm = BayesianRidge()
    lm.fit(X_train, y_train)

    print("Linear model intercept")
    print(lm.intercept_)
    coeff_df = pd.DataFrame(lm.coef_, X.columns, columns=['Coefficient'])
    print(coeff_df)

    predictions = lm.predict(X_test)
    # plt.figure()
    plt.scatter(y_test, predictions)
    plt.savefig(plots_dir + "/scatter.png")
    plt.clf()

    sn = sns.distplot((y_test - predictions), bins=50)
    sn = sn.get_figure()
    sn.savefig(plots_dir + "/residual.png")
    sn.clf()

    # plt.show()
    # cv2.waitKey(0)
    # sns.distplot((y_test-predictions),bins=50);
    from sklearn import metrics
    print('MAE:', metrics.mean_absolute_error(y_test, predictions))
    print('MSE:', metrics.mean_squared_error(y_test, predictions))
    print('RMSE:', np.sqrt(metrics.mean_squared_error(y_test, predictions)))

    print(test_size)
    print(name)
    print(features)
    pkl_filename = model_dir + "/" + name + ".pkl"
    with open(pkl_filename, 'wb') as file:
        pickle.dump(lm, file)

    #metrics to return
    r_square = lm.score(X, y)
    MAE = metrics.mean_absolute_error(y_test, predictions)
    MSE = metrics.mean_squared_error(y_test, predictions)
    RMSE = np.sqrt(MSE)

    #plot paths to return
    scatterplotpath = name + "/plots/scatter.png"
    distpath = name + "/plots/dist.png"
    residualpath = name + "/plots/residual.png"

    #path to model
    modelpath = name + "/models/" + name + ".pkl"

    return jsonify({
        "status": "success LinearReg",
        "metrics": {
            "mae": MAE,
            "mse": MSE,
            "rmse": RMSE,
            "r_square": r_square
        },
        "ploturl": {
            "scatterplotpath": scatterplotpath,
            "distpath": distpath,
            "residualpath": residualpath
        },
        "feature_names": features,
        "model_path": modelpath
    }), 201
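
# A hypothetical client call for the endpoint above (the /test3 route path is
# an assumption; the handler reads the form fields name/target/test_size plus
# an uploaded CSV file field named "dataset" and returns JSON):
import requests

resp = requests.post(
    "http://localhost:5000/test3",
    data={"name": "demo", "target": "price", "test_size": "0.3"},
    files={"dataset": open("data.csv", "rb")},
)
print(resp.status_code, resp.json())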
Example #5
logistic_model = LogisticRegression()  # logistic regression predicts discrete categories
gaussian_nb = GaussianNB()
#error rmse - r2
degree = 5
poly_reg_model = make_pipeline(PolynomialFeatures(degree=degree), LinearRegression())

linear_model.fit(X_train, y_train)
ridge_model.fit(X_train, y_train)
lasso_model.fit(X_train, y_train)
poly_reg_model.fit(X_train, y_train)
logistic_model.fit(X_train, y_train)
elastic_model.fit(X_train, y_train)
bayesian_model.fit(X_train, y_train)
gaussian_nb.fit(X_train, y_train)

y_pred = gaussian_nb.predict(X_test)
r2_score = linear_model.score(X_test, y_test)
print("linear",linear_model.score(X_test, y_test)*100)
print("ridge",ridge_model.score(X_test, y_test)*100)
print("lasso",lasso_model.score(X_test, y_test)*100)
print("polynomial reg model",poly_reg_model.score(X_test, y_test)*100)
print("logistic  reg",logistic_model.score(X_test, y_test)*100)
print("elastic net",elastic_model.score(X_test, y_test)*100)
print("bayesian",bayesian_model.score(X_test, y_test)*100)
print("gaussian-nb",gaussian_nb.score(X_test, y_test)*100)
# print(r2_score*100,'%')

print(y_pred)
plt.scatter(X, y, s=15)
plt.plot(X_test, y_pred, color = 'r')
# plt.show()
def prediction_BayesianRidge(X_train, Y_train, X_test, Y_test, normalize):

    # Print shapes of the training and testing data sets
    #print ("Shapes of the training and testing data sets")
    #print(X_train.shape, X_test.shape, Y_train.shape, Y_test.shape)
    #Create our regression object

    lreg = BayesianRidge(normalize=normalize)

    # fit the regression on the training data only
    lreg.fit(X_train,Y_train)

    #print("The estimated intercept coefficient is %.2f " %lreg.intercept_)
    #print("The number of coefficients used was %d " % len(lreg.coef_))



    # Set a DataFrame from the Facts
    coeff_df = DataFrame(X_train.columns)
    coeff_df.columns = ["Fact"]


    # Set a new column lining up the coefficients from the linear regression
    coeff_df["Coefficient"] = pd.Series(lreg.coef_)


    # Show
    #coeff_df

    #highest correlation between a fact and fraction votes
    #print ("Highest correlation fact: %s is %.9f" % (cf_dict.loc[coeff_df.iloc[coeff_df["Coefficient"].idxmax()]["Fact"],"description"], coeff_df.iloc[coeff_df["Coefficient"].idxmax()]["Coefficient"]) )

    #sns_plot = sns.jointplot(coeff_df.iloc[coeff_df["Coefficient"].idxmax()]["Fact"],"Fraction Votes",pd.merge(X_test,pd.DataFrame(Y_test), right_index=True, left_index=True),kind="scatter")


    #Predictions on training and testing sets
    pred_train = lreg.predict(X_train)
    pred_test = lreg.predict(X_test)

    # The mean square error
    #print("MSE with X_train and Y_train: %.6f"  % np.mean((Y_train - pred_train) ** 2))
    #print("MSE with X_test and Y_test: %.6f"  %np.mean((Y_test - pred_test) ** 2))

    #Explained variance score: 1 is perfect prediction
    #print("Variance score: %.2f" % lreg.score(X_test, Y_test))

    result={}
    result["method"]="BayesianRidge"
    if normalize :
        result["normalize"]="Y"
    else:
        result["normalize"]="N"
    result["X_train_shape"]=X_train.shape
    result["Y_train_shape"]=Y_train.shape
    result["X_test_shape"]=X_test.shape
    result["Y_test_shape"]=Y_test.shape
    result["intercept"]=lreg.intercept_
    result["num_coef"]=len(lreg.coef_)
    result["max_fact"]=cf_dict.loc[coeff_df.iloc[coeff_df["Coefficient"].idxmax()]["Fact"],"description"]
    result["max_fact_value"]=coeff_df.iloc[coeff_df["Coefficient"].idxmax()]["Coefficient"]
    result["MSE_train"]=np.mean((Y_train - pred_train) ** 2)
    result["MSE_test"]=np.mean((Y_test - pred_test) ** 2)
    result["variance"]=lreg.score(X_test, Y_test)
    return pred_test,coeff_df,pred_train,result
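
# A hypothetical call to prediction_BayesianRidge; cf_dict (read from the
# enclosing scope) is assumed to be a DataFrame mapping fact names to
# descriptions, and every name and number below is made up for illustration.
cf_dict = pd.DataFrame({"description": ["Median age", "Population"]},
                       index=["age", "population"])
facts = pd.DataFrame({"age": [35, 42, 29, 51], "population": [1200, 800, 1500, 950]})
votes = pd.Series([0.42, 0.55, 0.38, 0.61], name="Fraction Votes")
pred_test, coeff_df, pred_train, result = prediction_BayesianRidge(
    facts.iloc[:3], votes.iloc[:3], facts.iloc[3:], votes.iloc[3:], normalize=False)
print(result["method"], result["variance"])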
Example #7
#MSE
print(metrics.mean_squared_error(t_test, prediction_hu))

#RMSE
print(np.sqrt(metrics.mean_squared_error(t_test, prediction_hu)))

model_hu.score(s_test, t_test)

from sklearn.linear_model import BayesianRidge

model_br = BayesianRidge()
fit = model_br.fit(X_train, y_train)
prediction_br = model_br.predict(X_test)

from matplotlib import pyplot as plt
plt.plot(t, y_test, 'bs', t, prediction_br, 'g^')
plt.xlabel('Samples')
plt.ylabel('prediction')
plt.title('BayesianRidge regressor')

#MAE
print(metrics.mean_absolute_error(y_test, prediction_br))

#MSE
print(metrics.mean_squared_error(y_test, prediction_br))

#RMSE
print(np.sqrt(metrics.mean_squared_error(y_test, prediction_br)))

model_br.score(X_test, y_test)
Example #8
 def __bayesian_ridge_regression(self, X_train, X_test, y_train, y_test):
     lm = BayesianRidge()
     lm.fit(X_train, y_train)
     print('BayesianRidge R^2 score:', lm.score(X_test, y_test))
Example #9
y_test = y[ind_split:]

#  Lasso Regressor
reg_1 = Lasso()
reg_1.fit(X_train, y_train)
print("Lasso Score:", reg_1.score(X_test, y_test))

# Ridge Regressor
reg_2 = Ridge()
reg_2.fit(X_train, y_train)
print("Ridge Score:", reg_2.score(X_test, y_test))

# Bayesian Ridge Regressor
reg_3 = BayesianRidge()
reg_3.fit(X_train, y_train)
print("BayesianRidge Score:", reg_3.score(X_test, y_test))

# ElasticNet Regressor
reg_4 = ElasticNet()
reg_4.fit(X_train, y_train)
print("ElasticNet Score:", reg_4.score(X_test, y_test))

# Let us predict the stock market for the next `days` days
days = 20

data_seed = df['Adj Close'].values[-window_size:][None]

input_values = {
    'Lasso': data_seed,
    'Ridge': data_seed,
    'BayesianRidge': data_seed,
    'ElasticNet': data_seed,  # assumed: completes the truncated dict for the four fitted models
}
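
# A minimal sketch of how the seeded window could drive a recursive multi-step
# forecast (mapping names to the fitted regressors and the sliding-window
# update are assumptions based on the snippet above).
models = {'Lasso': reg_1, 'Ridge': reg_2, 'BayesianRidge': reg_3, 'ElasticNet': reg_4}
forecasts = {name: [] for name in models}
for name, model in models.items():
    window = input_values[name].copy()
    for _ in range(days):
        next_value = model.predict(window)[0]  # one-step-ahead prediction
        forecasts[name].append(next_value)
        window = np.roll(window, -1, axis=1)   # slide the window forward
        window[0, -1] = next_value             # feed the prediction back in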
Example #10
# Determining accuracy

rf_accuracy = rf.score(x_test, y_test)
rf_evs = evs(y_test, rf_yhat)

print("Random Forest Training Accuracy:", rf.score(x_train, y_train))
print("Random Forest Testing Accuracy:", rf_accuracy)
print("Random Forest Explained Variance Score:", rf_evs)

dt_accuracy = dt.score(x_test, y_test)
dt_evs = evs(y_test, dt_yhat)

print("Decision Tree Training Accuracy:", dt.score(x_train, y_train))
print("Decision Tree Testing Accuracy:", dt_accuracy)
print("Decision Tree Explained Variance Score:", dt_evs)

lr_accuracy = lr.score(x_test, y_test)
lr_evs = evs(y_test, lr_yhat)

print("Linear Regression Training Accuracy:", lr.score(x_train, y_train))
print("Linear Regression Testing Accuracy:", lr_accuracy)
print("Linear Regression Explained Variance Score:", lr_evs)

bayesian_accuracy = bayesian.score(x_test, y_test)
bayesian_evs = evs(y_test, bayesian_yhat)

print("Bayesian Training Accuracy:", bayesian.score(x_train, y_train))
print("Bayesian Testing Accuracy:", bayesian_accuracy)
print("Bayesian Explained Variance Score:", bayesian_evs)
while (j < len(X_valid) - 2):
    xx1.append(j)
    for t in range(3):
        if t == 0:
            yy1.append(pred4[j + t])
        elif t == 1:
            yy2.append(pred4[j + t])
        else:
            yy3.append(pred4[j + t])
    j += 3
plt.plot(xx1, yy1)
plt.plot(xx1, yy2)
plt.plot(xx1, yy3)
plt.show()

print('Model score is: ', br.score(X_train, y_train))
print('R2 score for bayesian ridge is: ', br.score(X_valid, y_valid))
print('MSE For validation set for bayesian ridge is: ',
      mean_squared_error(pred4, y_valid))

pred5 = clf.predict(X_valid)

xx1 = []
yy1 = []
yy2 = []
yy3 = []

j = 0
while (j < len(X_valid) - 2):
    xx1.append(j)
    for t in range(3):
x = data.loc[:, ['carat', 'cut', 'color', 'depth']].values
y = data.loc[:, 'price'].values

xtrain, xtest, ytrain, ytest = train_test_split(x, y, test_size=0.20)
model = BayesianRidge(compute_score=True)
model.fit(xtrain, ytrain)

yprd = model.predict(xtest)

wt = float(input("Enter the weight\n"))
ct = float(input("Enter the Cut( 0-4)"))
cl = float(input("Enter the color(0-6)"))
dt = float(input("Enter the Depth\n"))

xnew = [[wt, ct, cl, dt]]
ynew = model.predict(xnew)
print("Diamond Price", ynew[0])

print("Mean sqaured Error", mean_squared_error(ytest, yprd))
print("Variance Score", r2_score(ytest, yprd))
print("Coefficent", model.coef_)
print("Intercept ", model.intercept_)
print("Accuracy", model.score(x, y) * 100)
'''plt.scatter(ytest,yprd)
plt.plot(ytest,yprd)
plt.title("Expected Price and Predict Output")
plt.xlabel("Excpected Value")
plt.ylabel("Predict output")
plt.show()
'''
Example #13
    print "ESS(Explained Sum of Squares): ", ess
    print "R^2: ", r2

    print "\n**********测试BayesianRidge类**********"
    bayesianRidge = BayesianRidge()
    # 拟合训练集
    bayesianRidge.fit(train_X, train_Y.values.ravel())
    # 打印模型的系数
    print "系数:", bayesianRidge.coef_
    print "截距:", bayesianRidge.intercept_
    print '训练集R2: ', r2_score(train_Y, bayesianRidge.predict(train_X))

    # 对于线性回归模型, 一般使用均方误差(Mean Squared Error,MSE)或者
    # 均方根误差(Root Mean Squared Error,RMSE)在测试集上的表现来评该价模型的好坏.
    test_Y_pred = bayesianRidge.predict(test_X)
    print "测试集得分:", bayesianRidge.score(test_X, test_Y)
    print "测试集MSE:", mean_squared_error(test_Y, test_Y_pred)
    print "测试集RMSE:", np.sqrt(mean_squared_error(test_Y, test_Y_pred))
    print "测试集R2:", r2_score(test_Y, test_Y_pred)

    tss, rss, ess, r2 = xss(Y, bayesianRidge.predict(X))
    print "TSS(Total Sum of Squares): ", tss
    print "RSS(Residual Sum of Squares): ", rss
    print "ESS(Explained Sum of Squares): ", ess
    print "R^2: ", r2

    print "\n**********测试ARDRegression类**********"
    ardRegression = ARDRegression()
    # 拟合训练集
    ardRegression.fit(train_X, train_Y.values.ravel())
    # 打印模型的系数
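
# A hypothetical sketch of the xss helper used above: the classic
# sum-of-squares decomposition (the exact signature is an assumption).
import numpy as np

def xss(Y, Y_pred):
    Y, Y_pred = np.ravel(Y), np.ravel(Y_pred)
    tss = np.sum((Y - Y.mean()) ** 2)       # total sum of squares
    rss = np.sum((Y - Y_pred) ** 2)         # residual sum of squares
    ess = np.sum((Y_pred - Y.mean()) ** 2)  # explained sum of squares
    r2 = 1 - rss / tss
    return tss, rss, ess, r2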
Example #14
def main():
    usage = 'usage: %prog [options] <repr_hdf5> <data_hdf5> <target_index>'
    parser = OptionParser(usage)
    parser.add_option('-a', dest='add_only', default=False, action='store_true', help='Use additional features only; no sequence features')
    parser.add_option('-b', dest='balance', default=False, action='store_true', help='Downsample the negative set to balance [Default: %default]')
    parser.add_option('-o', dest='out_dir', default='postmodel', help='Output directory [Default: %default]')
    parser.add_option('-r', dest='regression', default=False, action='store_true', help='Regression mode [Default: %default]')
    parser.add_option('-s', dest='seq_only', default=False, action='store_true', help='Use sequence features only; no additional features [Default: %default]')
    parser.add_option('--sample', dest='sample', default=None, type='int', help='Sample from the training set [Default: %default]')
    parser.add_option('-t', dest='target_hdf5', default=None, help='Extract targets from this HDF5 rather than data_hdf5 argument')
    parser.add_option('-x', dest='regex_add', default=None, help='Filter additional features using a comma-separated list of regular expressions')
    (options,args) = parser.parse_args()

    if len(args) != 3:
        parser.error('Must provide full data HDF5, representation HDF5, and target index or filename')
    else:
        repr_hdf5_file = args[0]
        data_hdf5_file = args[1]
        target_i = args[2]

    if not os.path.isdir(options.out_dir):
        os.mkdir(options.out_dir)

    random.seed(1)

    #######################################################
    # preprocessing
    #######################################################

    # load training targets
    data_hdf5_in = h5py.File(data_hdf5_file, 'r')
    if options.target_hdf5:
        target_hdf5_in = h5py.File(options.target_hdf5, 'r')
    else:
        target_hdf5_in = data_hdf5_in
    train_y = np.array(target_hdf5_in['train_out'])[:,target_i]
    test_y = np.array(target_hdf5_in['test_out'])[:,target_i]

    # load training representations
    if not options.add_only:
        repr_hdf5_in = h5py.File(repr_hdf5_file, 'r')
        train_x = np.array(repr_hdf5_in['train_repr'])
        test_x = np.array(repr_hdf5_in['test_repr'])
        repr_hdf5_in.close()

    if options.seq_only:
        add_labels = []

    else:
        # load additional features
        train_a = np.array(data_hdf5_in['train_add'])
        test_a = np.array(data_hdf5_in['test_add'])
        add_labels = np.array(data_hdf5_in['add_labels'])

        if options.regex_add:
            fi = filter_regex(options.regex_add, add_labels)
            train_a, test_a, add_labels = train_a[:,fi], test_a[:,fi], add_labels[fi]

        # append additional features
        if options.add_only:
            add_i = 0
            train_x, test_x = train_a, test_a
        else:
            add_i = train_x.shape[1]
            train_x = np.concatenate((train_x,train_a), axis=1)
            test_x = np.concatenate((test_x,test_a), axis=1)

    data_hdf5_in.close()
    if options.target_hdf5:
        target_hdf5_in.close()

    # balance
    if options.balance:
        train_x, train_y = balance(train_x, train_y)

    # sample
    if options.sample is not None and options.sample < train_x.shape[0]:
        sample_indexes = random.sample(range(train_x.shape[0]), options.sample)
        train_x = train_x[sample_indexes]
        train_y = train_y[sample_indexes]


    #######################################################
    # model
    #######################################################
    if options.regression:
        # fit
        model = BayesianRidge(fit_intercept=True)
        model.fit(train_x, train_y)

        # accuracy
        acc_out = open('%s/r2.txt' % options.out_dir, 'w')
        print(model.score(test_x, test_y), file=acc_out)
        acc_out.close()

        test_preds = model.predict(test_x)

        # plot a sample of predictions versus actual
        plt.figure()
        sns.jointplot(test_preds[:5000], test_y[:5000], joint_kws={'alpha':0.3})
        plt.savefig('%s/scatter.pdf' % options.out_dir)
        plt.close()

        # plot the distribution of residuals
        plt.figure()
        sns.distplot(test_y-test_preds)
        plt.savefig('%s/residuals.pdf' % options.out_dir)
        plt.close()

    else:
        # fit
        model = LogisticRegression(penalty='l2', C=1000)
        model.fit(train_x, train_y)

        # accuracy
        test_preds = model.predict_proba(test_x)[:,1].flatten()
        acc_out = open('%s/auc.txt' % options.out_dir, 'w')
        print(roc_auc_score(test_y, test_preds), file=acc_out)
        acc_out.close()

        # compute and print ROC curve
        fpr, tpr, thresholds = roc_curve(test_y, test_preds)

        roc_out = open('%s/roc.txt' % options.out_dir, 'w')
        for i in range(len(fpr)):
            print('%f\t%f\t%f' % (fpr[i], tpr[i], thresholds[i]), file=roc_out)
        roc_out.close()

        # compute and print precision-recall curve
        precision, recall, thresholds = precision_recall_curve(test_y, test_preds)

        prc_out = open('%s/prc.txt' % options.out_dir, 'w')
        for i in range(len(precision)):
        print('%f\t%f' % (precision[i], recall[i]), file=prc_out)
        prc_out.close()

    # save model
    joblib.dump(model, '%s/model.pkl' % options.out_dir)

    #######################################################
    # analyze
    #######################################################
    # print coefficients table
    coef_out = open('%s/add_coefs.txt' % options.out_dir, 'w')
    for ai in range(len(add_labels)):
        if options.regression:
            coefi = model.coef_[add_i+ai]
        else:
            coefi = model.coef_[0,add_i+ai]
        print(add_labels[ai], coefi, file=coef_out)
    coef_out.close()
Example #15
feature_selector = FeatureSelector(feature_corr_min=0.45,
                                   feature_x_corr_max=0.5)

feature_selector.fit(X_train, y_train)

X_train = feature_selector.transform(X_train)
X_test = feature_selector.transform(X_test)

# corr = data.corr()

# param_grid = {'C': [4.7, 4.8, 4.9, 5.0], 'gamma': [ 0.000009, 0.000010, 0.000011, 0.000012]}

#print(X_train)
#print(y_train)

# regressor = LinearRegression()
regressor = BayesianRidge()
#regressor.fit(X_train, y_train.squeeze().tolist())
regressor.fit(X_train, y_train)

print('Fit=' + str(regressor.score(X_train, y_train)))
print('Score=' + str(regressor.score(X_test, y_test)))
print(feature_selector.features_)

print(regressor.get_params())
y_predict = regressor.predict(X_test)

plt.plot(y_test, y_predict, 'o')
plt.show()
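
# FeatureSelector is a custom class not shown in the snippet; a hypothetical
# reconstruction, assuming it keeps features whose absolute correlation with
# the target is at least feature_corr_min and drops features that correlate
# with an already-kept feature above feature_x_corr_max.
import numpy as np
import pandas as pd

class FeatureSelector:
    def __init__(self, feature_corr_min, feature_x_corr_max):
        self.feature_corr_min = feature_corr_min
        self.feature_x_corr_max = feature_x_corr_max
        self.features_ = []

    def fit(self, X, y):
        X = pd.DataFrame(X)
        y = pd.Series(np.ravel(y), index=X.index)
        target_corr = X.corrwith(y).abs()
        candidates = target_corr[target_corr >= self.feature_corr_min].index
        kept = []
        for col in candidates:
            # Skip features that are too correlated with one already kept.
            if all(abs(X[col].corr(X[k])) <= self.feature_x_corr_max for k in kept):
                kept.append(col)
        self.features_ = kept
        return self

    def transform(self, X):
        return pd.DataFrame(X)[self.features_]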
Example #16
class BayesianRegressor:
    def __init__(self, X, y):
        self.X = X
        self.y = y
        self.model = BayesianRidge(normalize=True, copy_X=True)
        self.train = self.transform(self.X)

    def transform(self, X):
        return X

    def fit(self):
        self.model.fit(self.train, self.y)
        self.V = self.Vn()

    def score(self):
        return self.model.score(self.train, self.y)

    def plot(self, file=None):
        import matplotlib.pyplot as plt
        if file is not None:
            plt.ioff()
        else:
            plt.ion()
        fig, ax = plt.subplots(nrows=1, ncols=1)
        X_axis = np.linspace(self.X.min(), self.X.max() + 20, 100)
        X_axis_transformed = self.transform(X_axis)
        ax.scatter(self.X, self.y)
        ax.plot(X_axis, self.model.predict(X_axis_transformed))
        if file is not None:
            fig.savefig(file)
            plt.close(fig)
        else:
            fig.show()

    def Vn(self):
        try:
            return np.linalg.inv(
                (1 / self.y.std()**2) * (self.train.T).dot(self.train))
        except np.linalg.LinAlgError:
            return np.linalg.pinv(
                (1 / self.y.std()**2) * (self.train.T).dot(self.train))

    # computes estimator for the posterior variance
    def posterior_variance(self, x):
        x_t = self.transform(x)[0]
        return ((self.y.std()**2) + (x_t.T).dot(self.V).dot(x_t))

    # computes estimator for the posterior mean
    def posterior_mean(self, x):
        return self.model.predict(self.transform(x)[0].reshape(1, -1))[0]

    # return tuple of (posterior_mean,sqrt(posterior_variance))
    def posterior_distribution(self, x):
        return self.posterior_mean(x), np.sqrt(self.posterior_variance(x))

    def posterior_cdf(self, y_query, x):
        return norm.cdf(y_query, self.posterior_mean(x),
                        np.sqrt(self.posterior_variance(x)))

    def print_stats(self, t, y_query):
        print("R^2:{}".format(self.score()))
        print("posterior std on t = {}: {:.2f}  ".format(
            t, np.sqrt(self.posterior_variance(t))))
        print("posterior mean on t = {}: {:.0f}".format(
            t, self.posterior_mean(t)))
        print("goal of {} achieved with probability:{:.2f} ".format(
            y_query, (1 - self.posterior_cdf(y_query, t)) * 100))
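
# A hypothetical usage of the wrapper above on made-up one-dimensional data
# (the query point t and the goal y_query are arbitrary illustration values).
import numpy as np

X = np.linspace(0, 100, 50).reshape(-1, 1)
y = 2.5 * X.ravel() + np.random.normal(0, 5, size=50)

reg = BayesianRegressor(X, y)
reg.fit()
print("R^2:", reg.score())
reg.print_stats(np.array([[120.0]]), y_query=320.0)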
Example #17
def task2(data):

	df = data

	dfreg = df.loc[:,['Adj Close','Volume']]
	dfreg['HL_PCT'] = (df['High'] - df['Low']) / df['Close'] * 100.0
	dfreg['PCT_change'] = (df['Close'] - df['Open']) / df['Open'] * 100.0

	# Drop missing value
	dfreg.fillna(value=-99999, inplace=True)
	# We want to separate 1 percent of the data to forecast
	forecast_out = int(math.ceil(0.01 * len(dfreg)))
	# Separating the label here, we want to predict the AdjClose
	forecast_col = 'Adj Close'
	dfreg['label'] = dfreg[forecast_col].shift(-forecast_out)
	X = np.array(dfreg.drop(['label'], axis=1))
	# Scale X so that all features share the same distribution for linear regression
	X = preprocessing.scale(X)
	# Finally, separate the recent X values (to forecast) from the earlier X values (for training and evaluation)
	X_lately = X[-forecast_out:]
	X = X[:-forecast_out]
	# Separate label and identify it as y
	y = np.array(dfreg['label'])
	y = y[:-forecast_out]
	
	#Split data
	X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

	##################
	##################
	##################


	# Linear regression
	clfreg = LinearRegression(n_jobs=-1)
	# 1 - First save the models to local device in models folder
	# filename = 'models/clfreg_model.sav'
	# pickle.dump(clfreg, open(filename, 'wb'))

	# 2 - load the model from disk once the save step above has been run once.
	# clfreg = pickle.load(open(filename, 'rb'))
	clfreg.fit(X_train, y_train)


	# Quadratic Regression 2
	clfpoly2 = make_pipeline(PolynomialFeatures(2), Ridge())
	#Save model to a pickle
	# filename1 = 'models/clfpoly2_model.sav'
	# pickle.dump(clfpoly2, open(filename1, 'wb'))
	
	# 2 - load the model from disk once the save step above has been run once.
	# clfpoly2 = pickle.load(open(filename1, 'rb'))
	clfpoly2.fit(X_train, y_train)


	# Quadratic Regression 3
	clfpoly3 = make_pipeline(PolynomialFeatures(3), Ridge())
	#Save model to a pickle
	# filename2 = 'models/clfpoly3_model.sav'
	# pickle.dump(clfpoly3, open(filename2, 'wb'))
	
	# 2 - load the model from disk once the save step above has been run once.
	# clfpoly3 = pickle.load(open(filename2, 'rb'))
	clfpoly3.fit(X_train, y_train)


	# KNN Regression
	clfknn = KNeighborsRegressor(n_neighbors=2)
	#Save model to a pickle
	# filename3 = 'models/clfknn_model.sav'
	# pickle.dump(clfknn, open(filename3, 'wb'))
	
	# 2 - load the model from disk once the save step above has been run once.
	# clfknn = pickle.load(open(filename3, 'rb'))
	clfknn.fit(X_train, y_train)


	# Lasso Regression
	clflas = Lasso()
	#Save model to a pickle
	# filename4 = 'models/clflas_model.sav'
	# pickle.dump(clflas, open(filename4, 'wb'))
	
	# 2 - load the model from disk once the save step above has been run once.
	# clflas = pickle.load(open(filename4, 'rb'))
	clflas.fit(X_train, y_train)


	# Multitask Lasso Regression
	# clfmtl = MultiTaskLasso(alpha=1.)
	# clfmtl.fit(X_train, y_train).coef_


	# Bayesian Ridge Regression
	clfbyr = BayesianRidge()
	clfbyr.fit(X_train, y_train)
	#Save model to a pickle
	# filename5 = 'models/clfbyr_model.sav'
	# pickle.dump(clfbyr, open(filename5, 'wb'))
	
	# 2 - load the model from disk once the save step above has been run once.
	# clfbyr = pickle.load(open(filename5, 'rb'))


	# Lasso LARS Regression
	clflar = LassoLars(alpha=.1)
	clflar.fit(X_train, y_train)
	#Save model to a pickle
	# filename6 = 'models/clflar_model.sav'
	# pickle.dump(clflar, open(filename6, 'wb'))
	
	# 2 - load the model from disk once the save step above has been run once.
	# clflar = pickle.load(open(filename6, 'rb'))


	# Orthogonal Matching Pursuit Regression
	clfomp = OrthogonalMatchingPursuit(n_nonzero_coefs=2)
	clfomp.fit(X_train, y_train)
	#Save model to a pickle
	# filename7 = 'models/clfomp_model.sav'
	# pickle.dump(clfomp, open(filename7, 'wb'))
	
	# 2 - load the model from disk once the save step above has been run once.
	# clfomp = pickle.load(open(filename7, 'rb'))


	# Automatic Relevance Determination Regression
	clfard = ARDRegression(compute_score=True)
	clfard.fit(X_train, y_train)
	#Save model to a pickle
	# filename8 = 'models/clfard_model.sav'
	# pickle.dump(clfard, open(filename8, 'wb'))
	
	# 2 - load the model from disk once the save step above has been run once.
	# clfard = pickle.load(open(filename8, 'rb'))


	# Logistic Regression
	# clflgr = linear_model.LogisticRegression(penalty='l1', solver='saga', tol=1e-6, max_iter=int(1e6), warm_start=True)
	# coefs_ = []
	# for c in cs:
	#   clflgr.set_params(C=c)
	#   clflgr.fit(X_train, y_train)
	#   coefs_.append(clflgr.coef_.ravel().copy())


	#SGD Regression
	clfsgd = SGDRegressor(random_state=0, max_iter=1000, tol=1e-3)
	clfsgd.fit(X_train, y_train)
	#Save model to a pickle
	# filename9 = 'models/clfsgd_model.sav'
	# pickle.dump(clfsgd, open(filename9, 'wb'))
	
	# 2 - load the model from disk once the save step above has been run once.
	# clfsgd = pickle.load(open(filename9, 'rb'))


	##################
	##################
	##################


	#Create confidence scores
	confidencereg = clfreg.score(X_test, y_test)
	confidencepoly2 = clfpoly2.score(X_test,y_test)
	confidencepoly3 = clfpoly3.score(X_test,y_test)
	confidenceknn = clfknn.score(X_test, y_test)
	confidencelas = clflas.score(X_test, y_test)
	# confidencemtl = clfmtl.score(X_test, y_test)
	confidencebyr = clfbyr.score(X_test, y_test)
	confidencelar = clflar.score(X_test, y_test)
	confidenceomp = clfomp.score(X_test, y_test)
	confidenceard = clfard.score(X_test, y_test)
	confidencesgd = clfsgd.score(X_test, y_test)

	# results
	print('The linear regression confidence is:',confidencereg*100)
	print('The quadratic regression 2 confidence is:',confidencepoly2*100)
	print('The quadratic regression 3 confidence is:',confidencepoly3*100)
	print('The knn regression confidence is:',confidenceknn*100)
	print('The lasso regression confidence is:',confidencelas*100)
	# print('The lasso regression confidence is:',confidencemtl*100)
	print('The Bayesian Ridge regression confidence is:',confidencebyr*100)
	print('The Lasso LARS regression confidence is:',confidencelar*100)
	print('The OMP regression confidence is:',confidenceomp*100)
	print('The ARD regression confidence is:',confidenceard*100)
	print('The SGD regression confidence is:',confidencesgd*100)

	#Create new columns
	forecast_reg = clfreg.predict(X_lately)
	forecast_pol2 = clfpoly2.predict(X_lately)
	forecast_pol3 = clfpoly3.predict(X_lately)
	forecast_knn = clfknn.predict(X_lately)
	forecast_las = clflas.predict(X_lately)
	forecast_byr = clfbyr.predict(X_lately)
	forecast_lar = clflar.predict(X_lately)
	forecast_omp = clfomp.predict(X_lately)
	forecast_ard = clfard.predict(X_lately)
	forecast_sgd = clfsgd.predict(X_lately)

	#Process all new columns data
	dfreg['Forecast_reg'] = np.nan

	last_date = dfreg.iloc[-1].name
	last_unix = last_date
	next_unix = last_unix + datetime.timedelta(days=1)

	for i in forecast_reg:
	    next_date = next_unix
	    next_unix += datetime.timedelta(days=1)
	    dfreg.loc[next_date] = [np.nan for _ in range(len(dfreg.columns))]
	    dfreg['Forecast_reg'].loc[next_date] = i
	    
	dfreg['Forecast_pol2'] = np.nan

	last_date = dfreg.iloc[-26].name
	last_unix = last_date
	next_unix = last_unix + datetime.timedelta(days=1)
	    
	for i in forecast_pol2:
	    next_date = next_unix
	    next_unix += datetime.timedelta(days=1)
	    dfreg['Forecast_pol2'].loc[next_date] = i

	dfreg['Forecast_pol3'] = np.nan

	last_date = dfreg.iloc[-26].name
	last_unix = last_date
	next_unix = last_unix + datetime.timedelta(days=1)
	    
	for i in forecast_pol3:
	    next_date = next_unix
	    next_unix += datetime.timedelta(days=1)
	    dfreg['Forecast_pol3'].loc[next_date] = i
	    
	dfreg['Forecast_knn'] = np.nan

	last_date = dfreg.iloc[-26].name
	last_unix = last_date
	next_unix = last_unix + datetime.timedelta(days=1)
	    
	for i in forecast_knn:
	    next_date = next_unix
	    next_unix += datetime.timedelta(days=1)
	    dfreg['Forecast_knn'].loc[next_date] = i
	        
	dfreg['Forecast_las'] = np.nan

	last_date = dfreg.iloc[-26].name
	last_unix = last_date
	next_unix = last_unix + datetime.timedelta(days=1)
	    
	for i in forecast_las:
	    next_date = next_unix
	    next_unix += datetime.timedelta(days=1)
	    dfreg['Forecast_las'].loc[next_date] = i
	    
	dfreg['Forecast_byr'] = np.nan

	last_date = dfreg.iloc[-26].name
	last_unix = last_date
	next_unix = last_unix + datetime.timedelta(days=1)
	    
	for i in forecast_byr:
	    next_date = next_unix
	    next_unix += datetime.timedelta(days=1)
	    dfreg['Forecast_byr'].loc[next_date] = i
	    
	dfreg['Forecast_lar'] = np.nan

	last_date = dfreg.iloc[-26].name
	last_unix = last_date
	next_unix = last_unix + datetime.timedelta(days=1)
	    
	for i in forecast_lar:
	    next_date = next_unix
	    next_unix += datetime.timedelta(days=1)
	    dfreg['Forecast_lar'].loc[next_date] = i
	    
	dfreg['Forecast_omp'] = np.nan

	last_date = dfreg.iloc[-26].name
	last_unix = last_date
	next_unix = last_unix + datetime.timedelta(days=1)
	    
	for i in forecast_omp:
	    next_date = next_unix
	    next_unix += datetime.timedelta(days=1)
	    dfreg['Forecast_omp'].loc[next_date] = i
	    
	dfreg['Forecast_ard'] = np.nan

	last_date = dfreg.iloc[-26].name
	last_unix = last_date
	next_unix = last_unix + datetime.timedelta(days=1)
	    
	for i in forecast_ard:
	    next_date = next_unix
	    next_unix += datetime.timedelta(days=1)
	    dfreg['Forecast_ard'].loc[next_date] = i
	    
	dfreg['Forecast_sgd'] = np.nan

	last_date = dfreg.iloc[-26].name
	last_unix = last_date
	next_unix = last_unix + datetime.timedelta(days=1)
	    
	for i in forecast_sgd:
	    next_date = next_unix
	    next_unix += datetime.timedelta(days=1)
	    dfreg['Forecast_sgd'].loc[next_date] = i

	return dfreg.index.format(formatter=lambda x: x.strftime('%Y-%m-%d')), dfreg['Adj Close'].to_list(), dfreg['Forecast_reg'].to_list(), dfreg['Forecast_pol2'].to_list(), dfreg['Forecast_pol3'].to_list(), dfreg['Forecast_knn'].to_list(), dfreg['Forecast_las'].to_list(), dfreg['Forecast_byr'].to_list(), dfreg['Forecast_lar'].to_list(), dfreg['Forecast_omp'].to_list(), dfreg['Forecast_ard'].to_list(), dfreg['Forecast_sgd'].to_list()
Example #18
def calc_bayesian_ridge_regression(X_train, X_test, y_train, y_test):
    reg = BayesianRidge().fit(X_train, y_train)
    reg.score(X_train, y_train)
    return calc_spearmanr_from_regressor(reg, X_test, y_test)
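
# A hypothetical sketch of the calc_spearmanr_from_regressor helper assumed
# above, using scipy.stats.spearmanr (the name and return value are assumptions).
from scipy.stats import spearmanr

def calc_spearmanr_from_regressor(reg, X_test, y_test):
    # Rank correlation between the regressor's predictions and the true targets.
    preds = reg.predict(X_test)
    rho, pvalue = spearmanr(preds, y_test)
    return rho, pvalue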
Example #19
na_mask_train = ~X_train.loc[X_train_odds.index].isna().T.any()
X_train_odds_comp = X_train.loc[X_train_odds.index].dropna()
# X_train_odds_comp = X_train_odds_comp.fillna(X_train_odds_comp.mean())
na_mask_val = ~X_val.loc[X_val_odds.index].isna().T.any()
X_val_odds_comp = X_val.loc[X_val_odds.index].dropna()
# X_val_odds_comp = X_val_odds_comp.fillna(X_val_odds_comp.mean())
X_train_odds = X_train_odds[na_mask_train]
X_val_odds = X_val_odds[na_mask_val]
y_train_odds =  y_train_odds[na_mask_train]
y_val_odds = y_val_odds[na_mask_val]


lm = BayesianRidge().fit(X_train_odds.median(axis=1).values.reshape(-1,1), y_train_odds)
predictions = lm.predict(X_val_odds.median(axis=1).values.reshape(-1,1))
print(mean_squared_error(y_val_odds, predictions))
lm.score(X_val_odds.median(axis=1).values.reshape(-1,1), y_val_odds)
# X_train_odds_comp_tot = pd.concat([X_train.loc[X_train_odds.index], X_train_odds], axis=1)
# X_val_odds_comp_tot = pd.concat([X_val.loc[X_val_odds.index], X_val_odds], axis=1)

####### Scale data select features
standardscaler = StandardScaler()
X_trainscaled_odds_comp = standardscaler.fit_transform(X_train_odds_comp[featurestouse])
X_valscaled_odds_comp = standardscaler.transform(X_val_odds_comp[featurestouse])

# standardscaler = StandardScaler()
# X_trainscaled_odds_comp_tot = standardscaler.fit_transform(X_train_odds_comp_tot[featurestouse])
# X_valscaled_odds_comp_tot = standardscaler.transform(X_val_odds_comp_tot[featurestouse])

####### Setup grid search
def do_grid_search(X_train, y_train, X_val, y_val):
    X_train_val = np.vstack((X_train, X_val))
 
# PCA + Orthogonal Matching Pursuit
omp = OrthogonalMatchingPursuit()
omp.fit(reduced_training_features, training_labels)
preds = omp.predict(reduced_testing_features)
score = omp.score(reduced_testing_features,testing_labels)
print('PCA + Orthogonal Matching Pursuit Results:')
print('R2 score:', score)
print('MAE:', mean_absolute_error(testing_labels, preds))
 
# Bayesian Ridge Regression
from sklearn.linear_model import BayesianRidge
br = BayesianRidge()
br.fit(training_features, training_labels)
preds = br.predict(testing_features)
score = br.score(testing_features,testing_labels)
print('Bayesian Ridge Regression Results:')
print('R2 score:', score)
print('MAE:', mean_absolute_error(testing_labels, preds), '\n')
 
# PCA + Bayesian Ridge Regression
br = BayesianRidge()
br.fit(reduced_training_features, training_labels)
preds = br.predict(reduced_testing_features)
score = br.score(reduced_testing_features,testing_labels)
print('PCA + Bayesian Ridge Regression Results:')
print('R2 score:', score)
print('MAE:', mean_absolute_error(testing_labels, preds))
 
# Stochastic Gradient Descent Regression
from sklearn.linear_model import SGDRegressor
Example #21
df['Prediction'] = df_close.shift(-forecast_out) #  label column with data shifted 30 units up

# print(df.tail())

X = np.array(df.drop(['Prediction'], axis=1))
X = preprocessing.scale(X)



X_forecast = X[-forecast_out:] # set X_forecast equal to last 30
X = X[:-forecast_out] # remove last 30 from X


y = np.array(df['Prediction'])
y = y[:-forecast_out]



X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)  # sklearn.model_selection

# Training
clf = BayesianRidge()
clf.fit(X_train,y_train)
# Testing
confidence = clf.score(X_test, y_test)
print("confidence: ", confidence)


forecast_prediction = clf.predict(X_forecast)
print(forecast_prediction)
Example #22
def make_predictions(df):
    ## Volatility
    #high to low percent
    df['HL_PCT'] = (df['high'] - df['low']) / df['close'] * 100.0

    #Change percent in close to open
    df['PCT_change'] = (df['close'] - df['open']) / df['open'] * 100.0

    # Drop missing value
    df.fillna(value=-99999, inplace=True)

    # separate 1 percent of the data to forecast
    forecast_out = int(math.ceil(0.01 * len(df)))

    # Separating the label here, we want to predict the AdjClose
    forecast_col = 'adjusted_close'
    df['label'] = df[forecast_col].shift(-forecast_out)
    X = np.array(df.drop(['label'], axis=1))

    # Scale X - so all have the same distribution for Linear regression
    X = preprocessing.scale(X)

    # Finally, separate the recent X values (to forecast) from the earlier X values (for training and evaluation)
    X_forecast = X[-forecast_out:]
    X = X[:-forecast_out]

    # Separate label and identify it as y
    y = np.array(df['label'])
    y = y[:-forecast_out]

    #Split data
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.2,
                                                        random_state=0)

    # Linear regression
    model = LinearRegression(n_jobs=-1)
    model.fit(X_train, y_train)

    # KNN Regression
    model_knn = KNeighborsRegressor(n_neighbors=2)
    model_knn.fit(X_train, y_train)

    # Bayesian Ridge Regression
    model_by = BayesianRidge()
    model_by.fit(X_train, y_train)

    #Create confidence scores
    confidencereg = model.score(X_test, y_test)
    confidence_model_knn = model_knn.score(X_test, y_test)
    confidence_model_by = model_by.score(X_test, y_test)

    reg = confidencereg * 100
    knn = confidence_model_knn * 100
    by = confidence_model_by * 100

    score = " Regression {}\n KNN {}\n Bayesian {}\n ".format(reg, knn, by)

    #Create new columns
    forecast_reg = model.predict(X_forecast)
    forecast_knn = model_knn.predict(X_forecast)
    forecast_by = model_by.predict(X_forecast)

    #Process all new columns data
    df['Forecast_reg'] = np.nan

    last_date = df.iloc[-1].name
    # last_unix = datetime.strptime(last_date, '%Y-%m-%d')
    last_unix = last_date
    next_unix = last_unix + timedelta(days=1)

    for i in forecast_reg:
        next_date = next_unix
        next_unix += timedelta(days=1)
        df.loc[next_date] = [np.nan for _ in range(len(df.columns))]
        df['Forecast_reg'].loc[next_date] = i

    df['Forecast_knn'] = np.nan

    last_date = df.iloc[-40].name
    # last_date = df.iloc[-26].name
    last_unix = last_date
    next_unix = last_unix + timedelta(days=1)

    for i in forecast_knn:
        next_date = next_unix
        next_unix += timedelta(days=1)
        df['Forecast_knn'].loc[next_date] = i

    df['forecast_by'] = np.nan

    last_date = df.iloc[-40].name
    last_unix = last_date
    next_unix = last_unix + timedelta(days=1)

    for i in forecast_by:
        next_date = next_unix
        next_unix += timedelta(days=1)
        df['forecast_by'].loc[next_date] = i

    return df
Example #23
#!/usr/bin/env python

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import BayesianRidge
from sklearn import datasets
from sklearn.utils import shuffle
import numpy as np

boston = datasets.load_boston()
X, Y = shuffle(boston.data, boston.target, random_state=13)
X = X.astype(np.float32)
offset = int(X.shape[0] * 0.9)
X_train, Y_train = X[:offset], Y[:offset]
X_test, Y_test = X[offset:], Y[offset:]

regressor = BayesianRidge(compute_score=True)
regressor.fit(X_train, Y_train)
score = regressor.score(X_test, Y_test)
print(score)
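
# Note: load_boston was removed in scikit-learn 1.2. On newer versions a
# drop-in substitute (a different dataset; shown here as an assumption) is:
# from sklearn.datasets import fetch_california_housing
# housing = fetch_california_housing()
# X, Y = shuffle(housing.data, housing.target, random_state=13)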
Example #24
# corr = data.corr()

# param_grid = {'C': [4.7, 4.8, 4.9, 5.0], 'gamma': [ 0.000009, 0.000010, 0.000011, 0.000012]}

print(X_train)
print(y_train)

# regressor = LinearRegression()
# regressor = SVR(C=5, gamma=0.00001)
regressor = BayesianRidge(normalize=True,
                          n_iter=5,
                          tol=0.01,
                          fit_intercept=True)
# regressor = ARDRegression(normalize=True, n_iter=5, tol=0.01)
# regressor = SGDRegressor()
# regressor = MLPRegressor(hidden_layer_sizes=(200, 50, 10))
# regressor = RANSACRegressor(min_samples=80, max_trials=1000)
# regressor = Lasso()

regressor.fit(X_train, y_train.squeeze().tolist())

print(regressor.score(X_train, y_train.squeeze().tolist()))
print(regressor.score(X_test, y_test.squeeze().tolist()))

print(regressor.get_params())
y_predict = regressor.predict(X_test)
print(y_predict)

plt.plot(y_test.squeeze().tolist(), y_predict, 'o')
plt.show()
Example #25
#        cv=ShuffleSplit(X_train.shape[0], n_splits=10, test_fraction=0.6,
#            random_state=0))
t1 = time()
sc.fit(X_train, y_train)
sc_time = time() - t1
computed_coefs = sc.inverse_transform()
computed_coefs = np.reshape(computed_coefs, [size, size, size])
score = sc.score(X_test, y_test)

###############################################################################
# Compute the results for simple BayesianRidge
t1 = time()
clf.fit(X_train, y_train)
bayes_time = time() - t1
bayes_coefs = clf.coef_
bayes_score = clf.score(X_test, y_test)
bayes_coefs = bayes_coefs.reshape((size, size, size))

###############################################################################
# Plot the results

pl.close('all')
pl.figure()
pl.title('Scores of the supervised clustering')
pl.subplot(2, 1, 1)
pl.plot(np.arange(len(sc.scores_)), sc.scores_)
pl.xlabel('iteration')
pl.ylabel('score')
pl.title('Score of the best parcellation of each iteration')
pl.subplot(2, 1, 2)
pl.plot(np.arange(len(sc.delta_scores_)), sc.delta_scores_)
Example #26
import os, sys


full_path = os.path.realpath(__file__)
file = os.path.join(os.path.dirname(full_path), "data", "housingSample.csv")
(X,Y,records)=getData(file)
X_train, X_test, price_train, price_test = train_test_split(X, Y, test_size = 0.1, random_state = 42)
model=BayesianRidge()
model.fit(X_train, price_train.ravel())
predPrices=model.predict(X_train)
print(model)
# Summarize the fit of the model

#print(model.intercept_, model.coef_, mse)
print(model.score(X_train, price_train))

predPrices=model.predict(X_train)
mse=mean_squared_error(price_train, predPrices)
rs=r2_score(price_train, predPrices)

print("training mse:",mse)
print("training score:",rs)

# testing
testing_pred_price_results=model.predict(X_test)
mse=mean_squared_error(price_test, testing_pred_price_results)
rs=r2_score(price_test, testing_pred_price_results)
print("median_house_value"+" Predicted_median_house_value")
print(np.c_[price_test, testing_pred_price_results])
print("testing mse:", mse)
Example #27
def main():
    usage = 'usage: %prog [options] <model_file>'
    parser = OptionParser(usage)
    parser.add_option('-c', dest='center_dist', default=10, type='int', help='Distance between the motifs and sequence center [Default: %default]')
    parser.add_option('-d', dest='model_hdf5_file', default=None, help='Pre-computed model output as HDF5 [Default: %default]')
    parser.add_option('-g', dest='cuda', default=False, action='store_true', help='Run on the GPGPU [Default: %default]')
    parser.add_option('-l', dest='seq_length', default=600, type='int', help='Sequence length [Default: %default]')
    parser.add_option('-o', dest='out_dir', default='heat', help='Output directory [Default: %default]')
    parser.add_option('-t', dest='targets', default='0', help='Comma-separated list of target indexes to plot (or -1 for all) [Default: %default]')
    (options,args) = parser.parse_args()

    if len(args) != 1:
        parser.error('Must provide Basset model file')
    else:
        model_file = args[0]

    out_targets = [int(ti) for ti in options.targets.split(',')]

    if not os.path.isdir(options.out_dir):
        os.mkdir(options.out_dir)

    random.seed(1)

    # torch options
    cuda_str = ''
    if options.cuda:
        cuda_str = '-cuda'

    #################################################################
    # place filter consensus motifs
    #################################################################
    # determine filter consensus motifs
    filter_consensus = get_filter_consensus(model_file, options.out_dir, cuda_str)

    seqs_1hot = []
    num_filters = len(filter_consensus)
    # num_filters = 40
    filter_len = filter_consensus[0].shape[1]

    # position the motifs
    left_i = options.seq_length // 2 - options.center_dist - filter_len
    right_i = options.seq_length // 2 + options.center_dist

    ns_1hot = np.zeros((4,options.seq_length)) + 0.25
    # ns_1hot = np.zeros((4,options.seq_length))
    # for i in range(options.seq_length):
    #     nt_i = random.randint(0,3)
    #     ns_1hot[nt_i,i] = 1

    for i in range(num_filters):
        for j in range(num_filters):
            # copy the sequence of N's
            motifs_seq = np.copy(ns_1hot)

            # write them into the one hot coding
            motifs_seq[:,left_i:left_i+filter_len] = filter_consensus[i]
            motifs_seq[:,right_i:right_i+filter_len] = filter_consensus[j]

            # save
            seqs_1hot.append(motifs_seq)

    # make a full array
    seqs_1hot = np.array(seqs_1hot)

    # reshape for spatial
    seqs_1hot = seqs_1hot.reshape((seqs_1hot.shape[0],4,1,options.seq_length))


    #################################################################
    # place filter consensus motifs
    #################################################################
    # save to HDF5
    seqs_file = '%s/motif_seqs.h5' % options.out_dir
    h5f = h5py.File(seqs_file, 'w')
    h5f.create_dataset('test_in', data=seqs_1hot)
    h5f.close()

    # predict scores
    scores_file = '%s/motif_seqs_scores.h5' % options.out_dir
    torch_cmd = 'th basset_place2_predict.lua %s %s %s %s' % (cuda_str, model_file, seqs_file, scores_file)
    subprocess.call(torch_cmd, shell=True)

    # load in scores
    hdf5_in = h5py.File(scores_file, 'r')
    motif_seq_scores = np.array(hdf5_in['scores'])
    hdf5_in.close()

    #################################################################
    # analyze
    #################################################################
    for ti in out_targets:
        #################################################################
        # compute pairwise expectations
        #################################################################
        # X = np.zeros((motif_seq_scores.shape[0],num_filters))
        # xi = 0
        # for i in range(num_filters):
        #     for j in range(num_filters):
        #         X[xi,i] += 1
        #         X[xi,j] += 1
        #         xi += 1

        X = np.zeros((motif_seq_scores.shape[0],2*num_filters))
        xi = 0
        for i in range(num_filters):
            for j in range(num_filters):
                X[xi,i] += 1
                X[xi,num_filters+j] += 1
                xi += 1

        # fit model
        model = BayesianRidge()
        model.fit(X, motif_seq_scores[:,ti])

        # predict pairwise expectations
        motif_seq_preds = model.predict(X)
        print(model.score(X, motif_seq_scores[:,ti]))

        # print filter coefficients
        coef_out = open('%s/coefs_t%d.txt' % (options.out_dir,ti), 'w')
        for i in range(num_filters):
            print('%3d  %6.2f' % (i,model.coef_[i]), file=coef_out)
        coef_out.close()

        #################################################################
        # normalize pairwise predictions
        #################################################################
        filter_interaction = np.zeros((num_filters,num_filters))
        table_out = open('%s/table_t%d.txt' % (options.out_dir,ti), 'w')

        si = 0
        for i in range(num_filters):
            for j in range(num_filters):
                filter_interaction[i,j] = motif_seq_scores[si,ti] - motif_seq_preds[si]
                cols = (i, j, motif_seq_scores[si,ti], motif_seq_preds[si], filter_interaction[i,j])
                print('%3d  %3d  %6.3f  %6.3f  %6.3f' % cols, file=table_out)
                si += 1

        table_out.close()

        scores_abs = abs(filter_interaction.flatten())
        max_score = stats.quantile(scores_abs, .999)
        print('Limiting scores to +-%f' % max_score)
        # clip the interactions symmetrically at the 99.9th percentile magnitude
        filter_interaction_max = np.clip(filter_interaction, -max_score, max_score)

        # plot heat map
        plt.figure()
        sns.heatmap(filter_interaction_max, xticklabels=False, yticklabels=False)
        plt.savefig('%s/heat_t%d.pdf' % (options.out_dir,ti))
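# A minimal, self-contained sketch of the additive pairwise-expectation idea
# used above: each ordered filter pair (i, j) is encoded as two one-hot
# blocks, a BayesianRidge fit to the pair scores learns per-filter additive
# effects, and the residual (observed - predicted) is read as the
# interaction. Names and toy data here are hypothetical, not from the script.
import numpy as np
from sklearn.linear_model import BayesianRidge

rng = np.random.RandomState(1)
num_filters = 5
effects = rng.randn(num_filters)          # hidden per-filter additive effects

X = np.zeros((num_filters**2, 2*num_filters))
scores = np.zeros(num_filters**2)
xi = 0
for i in range(num_filters):
    for j in range(num_filters):
        X[xi, i] = 1                      # left-motif block
        X[xi, num_filters + j] = 1        # right-motif block
        scores[xi] = effects[i] + effects[j] + 0.01*rng.randn()
        xi += 1

model = BayesianRidge().fit(X, scores)
interaction = scores - model.predict(X)   # near zero: this toy data is additive
print('max |interaction| on purely additive data: %.4f' % np.abs(interaction).max())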

#World

reg_world_deaths=BayesianRidge(alpha_1=1e-06, alpha_2=1e-06, alpha_init=None,
              compute_score=False, copy_X=True, fit_intercept=True,
              lambda_1=1e-06, lambda_2=1e-06, lambda_init=None, n_iter=300,
              normalize=False, tol=0.001, verbose=False)
reg_world_deaths.fit(xtrain_world_deaths,ytrain_world_deaths)
print(reg_world_deaths.get_params())
reg_world_deaths_test = reg_world_deaths.predict(xtest_world_deaths)
reg_world_deaths_predict_days = reg_world_deaths.predict(prediction_days)
print('MAE:', metrics.mean_absolute_error(ytest_world_deaths, reg_world_deaths_test))
print('MSE:', metrics.mean_squared_error(ytest_world_deaths, reg_world_deaths_test))
print('R2 :', metrics.r2_score(ytest_world_deaths, reg_world_deaths_test))
print('Training score:',reg_world_deaths.score(xtrain_world_deaths,ytrain_world_deaths))
print('Testing score:',reg_world_deaths.score(xtest_world_deaths,ytest_world_deaths))

#Graph for Bayesian Predicted deaths in World

plt.figure(figsize=(12, 8))
plt.plot(days,world_deaths)
plt.plot(prediction_days,reg_world_deaths_predict_days,linestyle='dashed')
plt.title('Predicted Coronavirus Deaths Over Time Worldwide', size=30)
plt.xlabel('Days Since 1/22/2020', size=20)
plt.ylabel('No. of Cases (in Crores)', size=20)
plt.legend(['Death Cases', 'Bayesian Ridge Predictions'])
plt.xticks(size=15)
plt.show()

reg_world_deaths_predict_days = reg_world_deaths_predict_days.reshape(1,-1)[0]
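# A minimal sketch of the predictive uncertainty that BayesianRidge offers on
# top of a point forecast like the one above: predict(return_std=True)
# returns the posterior predictive mean and standard deviation, so the
# forecast can be drawn with a confidence band. Toy data and names are
# hypothetical, not from the snippet above.
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import BayesianRidge

rng = np.random.RandomState(0)
days_toy = np.arange(60).reshape(-1, 1)
cases_toy = 0.5*days_toy.ravel()**2 + 20*rng.randn(60)

br = BayesianRidge().fit(days_toy, cases_toy)
future = np.arange(60, 90).reshape(-1, 1)
mean, std = br.predict(future, return_std=True)

plt.plot(days_toy, cases_toy, label='observed')
plt.plot(future, mean, linestyle='dashed', label='forecast')
plt.fill_between(future.ravel(), mean - 2*std, mean + 2*std, alpha=0.2)
plt.legend()
plt.show()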
ls.fit(X_trn, y_trn)
print('intercept:', ls.intercept_)
print('coefficients:', ls.coef_)
scoreOfModel6 = ls.score(X_trn, y_trn)
print('train score:', scoreOfModel6)

pred6 = ls.predict(X_tst)
pred6 = pd.DataFrame(pred6)
print('r2 score:', r2_score(y_tst, pred6))
############### Bayesian regression ###############################################
Bs = BayesianRidge()
Bs.fit(X_trn, y_trn)
print('intercept:', Bs.intercept_)
print('coefficients:', Bs.coef_)
scoreOfModel7 = Bs.score(X_trn, y_trn)

pred7 = Bs.predict(X_tst)
pred7 = pd.DataFrame(pred7)
print('r2 score BSR:', r2_score(y_tst, pred7))  # model evaluation

############## ElasticNet Regression (L1 + L2 penalized model) ###########
## alpha is the hyperparameter (lambda) that sets the strength of the Ridge,
## Lasso, or ElasticNet penalty; l1_ratio sets the L1 (Lasso) share of the mix
enet = ElasticNet(alpha=0.005, l1_ratio=0.7)
enet.fit(X_trn, y_trn)
print('alpha:', enet.alpha)
scoreOfModel8 = enet.score(X_trn, y_trn)

pred8 = enet.predict(X_tst)
pred8 = pd.DataFrame(pred8)
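# A minimal sketch of choosing alpha and l1_ratio by cross-validation with
# sklearn's ElasticNetCV instead of hard-coding them as above. Toy data and
# names are hypothetical, not from the snippet above.
import numpy as np
from sklearn.linear_model import ElasticNetCV

rng = np.random.RandomState(0)
X_toy = rng.randn(200, 10)
y_toy = X_toy[:, 0] - 2*X_toy[:, 1] + 0.1*rng.randn(200)

enet_cv = ElasticNetCV(l1_ratio=[0.1, 0.5, 0.7, 0.9], cv=5)
enet_cv.fit(X_toy, y_toy)
print('chosen alpha:', enet_cv.alpha_, 'chosen l1_ratio:', enet_cv.l1_ratio_)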
Beispiel #30
0
def main():
    usage = "usage: %prog [options] <model_file>"
    parser = OptionParser(usage)
    parser.add_option(
        "-c",
        dest="center_dist",
        default=10,
        type="int",
        help="Distance between the motifs and sequence center [Default: %default]",
    )
    parser.add_option(
        "-d", dest="model_hdf5_file", default=None, help="Pre-computed model output as HDF5 [Default: %default]"
    )
    parser.add_option(
        "-g", dest="cuda", default=False, action="store_true", help="Run on the GPGPU [Default: %default]"
    )
    parser.add_option("-l", dest="seq_length", default=600, type="int", help="Sequence length [Default: %default]")
    parser.add_option("-o", dest="out_dir", default="heat", help="Output directory [Default: %default]")
    parser.add_option(
        "-t",
        dest="targets",
        default="0",
        help="Comma-separated list of target indexes to plot (or -1 for all) [Default: %default]",
    )
    (options, args) = parser.parse_args()

    if len(args) != 1:
        parser.error("Must provide Basset model file")
    else:
        model_file = args[0]

    out_targets = [int(ti) for ti in options.targets.split(",")]

    if not os.path.isdir(options.out_dir):
        os.mkdir(options.out_dir)

    random.seed(1)

    # torch options
    cuda_str = ""
    if options.cuda:
        cuda_str = "-cuda"

    #################################################################
    # place filter consensus motifs
    #################################################################
    # determine filter consensus motifs
    filter_consensus = get_filter_consensus(model_file, options.out_dir, cuda_str)

    seqs_1hot = []
    # num_filters = len(filter_consensus)
    num_filters = 20
    filter_len = filter_consensus[0].shape[1]

    # position the motifs
    left_i = options.seq_length // 2 - options.center_dist - filter_len
    right_i = options.seq_length // 2 + options.center_dist

    ns_1hot = np.zeros((4, options.seq_length)) + 0.25
    # ns_1hot = np.zeros((4,options.seq_length))
    # for i in range(options.seq_length):
    #     nt_i = random.randint(0,3)
    #     ns_1hot[nt_i,i] = 1

    for i in range(num_filters):
        for j in range(num_filters):
            # copy the sequence of N's
            motifs_seq = np.copy(ns_1hot)

            # write them into the one hot coding
            motifs_seq[:, left_i : left_i + filter_len] = filter_consensus[i]
            motifs_seq[:, right_i : right_i + filter_len] = filter_consensus[j]

            # save
            seqs_1hot.append(motifs_seq)

    # make a full array
    seqs_1hot = np.array(seqs_1hot)

    # reshape for spatial
    seqs_1hot = seqs_1hot.reshape((seqs_1hot.shape[0], 4, 1, options.seq_length))

    #################################################################
    # save placed sequences and score them with the model
    #################################################################
    # save to HDF5
    seqs_file = "%s/motif_seqs.h5" % options.out_dir
    h5f = h5py.File(seqs_file, "w")
    h5f.create_dataset("test_in", data=seqs_1hot)
    h5f.close()

    # predict scores
    scores_file = "%s/motif_seqs_scores.h5" % options.out_dir
    torch_cmd = "th basset_place2_predict.lua %s %s %s %s" % (cuda_str, model_file, seqs_file, scores_file)
    subprocess.call(torch_cmd, shell=True)

    # load in scores
    hdf5_in = h5py.File(scores_file, "r")
    motif_seq_scores = np.array(hdf5_in["scores"])
    hdf5_in.close()

    #################################################################
    # analyze
    #################################################################
    for ti in out_targets:
        #################################################################
        # compute pairwise expectations
        #################################################################
        # X = np.zeros((motif_seq_scores.shape[0],num_filters))
        # xi = 0
        # for i in range(num_filters):
        #     for j in range(num_filters):
        #         X[xi,i] += 1
        #         X[xi,j] += 1
        #         xi += 1

        X = np.zeros((motif_seq_scores.shape[0], 2 * num_filters))
        xi = 0
        for i in range(num_filters):
            for j in range(num_filters):
                X[xi, i] += 1
                X[xi, num_filters + j] += 1
                xi += 1

        # fit model
        model = BayesianRidge()
        model.fit(X, motif_seq_scores[:, ti])

        # predict pairwise expectations
        motif_seq_preds = model.predict(X)
        print(model.score(X, motif_seq_scores[:, ti]))

        # print filter coefficients
        coef_out = open("%s/coefs_t%d.txt" % (options.out_dir, ti), "w")
        for i in range(num_filters):
            print("%3d  %6.2f" % (i, model.coef_[i]), file=coef_out)
        coef_out.close()

        #################################################################
        # normalize pairwise predictions
        #################################################################
        filter_interaction = np.zeros((num_filters, num_filters))
        table_out = open("%s/table_t%d.txt" % (options.out_dir, ti), "w")

        si = 0
        for i in range(num_filters):
            for j in range(num_filters):
                filter_interaction[i, j] = motif_seq_scores[si, ti] - motif_seq_preds[si]
                cols = (i, j, motif_seq_scores[si, ti], motif_seq_preds[si], filter_interaction[i, j])
                print("%3d  %3d  %6.3f  %6.3f  %6.3f" % cols, file=table_out)
                si += 1

        table_out.close()

        # plot heat map
        plt.figure()
        sns.heatmap(filter_interaction)
        plt.savefig("%s/heat_t%d.pdf" % (options.out_dir, ti))
x_test = test['Open'].to_numpy()
y_test = test['Future'].to_numpy()

# reshape into (rows, columns) for sklearn
x = x.reshape((len(x), 1))
y = y.reshape((len(y), 1))

x_test = x_test.reshape(len(x_test), 1)
y_test = y_test.reshape(len(y_test), 1)

# fit the regressors
ols = LinearRegression()
ols = ols.fit(x, y)
predict_ols = ols.predict(x_test)
score_ols = ols.score(x_test, y_test)

clf = BayesianRidge(compute_score=True)
clf = clf.fit(x, y.ravel())  # BayesianRidge expects a 1-D target
predict_b = clf.predict(x_test)
score_b = clf.score(x_test, y_test)

print("Accuracy: OLS %lf, Bayes %lf" % (score_ols, score_b))

# plot results
plt.plot(y_test, 'r+', label="actual")
plt.plot(predict_ols, 'bx', label="ols")
plt.plot(predict_b, 'g1', label="bayesian")
plt.legend()
plt.title("Predict DJIA 1 year ahead ( 2016 )")
plt.savefig('OLS_vs_BayesianRegression.png')
plt.show()
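# A minimal sketch of what compute_score=True (used for clf above) exposes:
# BayesianRidge records the log marginal likelihood at each optimization
# iteration in its scores_ attribute, which is useful for checking
# convergence. Toy data and names are hypothetical.
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import BayesianRidge

rng = np.random.RandomState(0)
X_toy = rng.randn(100, 3)
y_toy = X_toy @ np.array([1.0, -2.0, 0.5]) + 0.1*rng.randn(100)

br = BayesianRidge(compute_score=True).fit(X_toy, y_toy)
plt.plot(br.scores_)
plt.xlabel('iteration')
plt.ylabel('log marginal likelihood')
plt.show()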
Beispiel #32
0
                                          test_size=test_size, random_state=0)
    

#    k = int(0.5 * n_features)
#    print("-----------------------------------------------")
#    print("Perform chi2 feature selection k=", k)   
#    print("-----------------------------------------------")
#    X_train, X_test = selectFeatures(X_train, X_test, y_train, k)

    print("-----------------------------------------------")
    print("SVM Classification of training set")   
    print("-----------------------------------------------")
    class_weight = {0:5}
    print("Class weight=", class_weight)
    clf = BayesianRidge(compute_score=True).fit(X_train, y_train)
    print("Test svm.SVC score=", clf.score(X_test, y_test))
    print("Train svm.SVC score=", clf.score(X_train, y_train))
    
    print("-----------------------------------------------")
    print("Metrics on TEST SET")   
    print("-----------------------------------------------")    
    y_pred = clf.predict(X_test)
    # BayesianRidge predictions are continuous; round them to integer labels
    # so the classification metrics below do not raise (assumes integer-coded
    # class targets)
    y_pred = [int(round(v)) for v in y_pred]
    
    print(metrics.classification_report(y_test, y_pred, target_names=label_names))
    print(metrics.confusion_matrix(y_test, y_pred))       
    
    print("-----------------------------------------------")
    print("Metrics on TRAIN SET")   
    print("-----------------------------------------------")    
    y_predTrain = clf.predict(X_train)
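    # A minimal sketch of the classifier the print labels originally named
    # (svm.SVC), with the class_weight actually taking effect; BayesianRidge,
    # being a regressor, ignores it. Toy data and names are hypothetical.
    from sklearn import svm, metrics
    import numpy as np

    rng_toy = np.random.RandomState(0)
    X_imb = rng_toy.randn(200, 5)
    y_imb = (X_imb[:, 0] > 1).astype(int)      # imbalanced labels
    svc = svm.SVC(class_weight={0: 5}).fit(X_imb, y_imb)
    print(metrics.classification_report(y_imb, svc.predict(X_imb)))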
    
    # walk backwards so deletions do not shift the remaining indexes
    topIndex = len(y_test) - 1
    for i in range(topIndex, -1, -1):
        if math.isnan(y_test[i]):
            del y_test[i]
            del X_test[i]

    # CHOOSING THE MODEL
    model = BayesianRidge()
    model2 = SVR()

    model.fit(X_train, y_train)
    model2.fit(X_train, y_train)

    print("Bayesian Ridge")
    print("R2:" + str(model.score(X_test, y_test)))
    print("Mean Squared Error: " +
          str(mean_squared_error(y_test, model.predict(X_test))))
    df.loc[index, 'Bayesian Ridge'] = mean_squared_error(y_test,
                                                         model.predict(X_test))
    print("Mean Absolute Error: " +
          str(mean_absolute_error(y_test, model.predict(X_test))))
    print("Median Absolute Error: " +
          str(median_absolute_error(y_test, model.predict(X_test))))

    print("")
    print("Support Vector Regression")
    print("R2:" + str(model2.score(X_test, y_test)))
    print("Mean Squared Error: " +
          str(mean_squared_error(y_test, model2.predict(X_test))))
    df.loc[index, 'SVR'] = mean_squared_error(y_test, model2.predict(X_test))
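# A minimal sketch of comparing the two models above with k-fold
# cross-validation rather than a single train/test split. Toy data and names
# are hypothetical.
import numpy as np
from sklearn.linear_model import BayesianRidge
from sklearn.svm import SVR
from sklearn.model_selection import cross_val_score

rng = np.random.RandomState(0)
X_toy = rng.randn(150, 4)
y_toy = X_toy[:, 0] + 0.5*X_toy[:, 1] + 0.1*rng.randn(150)

for name, est in [('Bayesian Ridge', BayesianRidge()), ('SVR', SVR())]:
    mse = -cross_val_score(est, X_toy, y_toy, cv=5,
                           scoring='neg_mean_squared_error')
    print('%s  CV MSE: %.4f (+/- %.4f)' % (name, mse.mean(), mse.std()))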