# (assumes earlier in the source: from sklearn import datasets, linear_model
#  and boston = datasets.load_boston())
X_multiple = boston.data[:, 5:8]
print(X_multiple)

# Define the target data
y_multiple = boston.target

# MULTIPLE LINEAR REGRESSION IMPLEMENTATION
from sklearn.model_selection import train_test_split

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X_multiple,
                                                    y_multiple,
                                                    test_size=0.2)

# Define the algorithm to use
lr_multiple = linear_model.LinearRegression()

# Train the model
lr_multiple.fit(X_train, y_train)

# Make a prediction
Y_pred_multiple = lr_multiple.predict(X_test)

# Display the model coefficients
print('MULTIPLE LINEAR REGRESSION MODEL DATA')
print()
print('Slope coefficients "a":')
print(lr_multiple.coef_)
print()
print('Intercept "b":')
print(lr_multiple.intercept_)
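A quick way to check the fit above is to score the held-out split; a minimal sketch reusing the names from this example (the metrics are standard sklearn):

from sklearn.metrics import mean_squared_error, r2_score

print('MSE: %.2f' % mean_squared_error(y_test, Y_pred_multiple))
print('R2 score: %.2f' % r2_score(y_test, Y_pred_multiple))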
Example #2
st.markdown('_Please see left sidebar for more details._')

# currentStats = pd.read_csv('https://raw.githubusercontent.com/neelganta/neel_project/master/alltimeDynasty.csv') #Dynasty
currentStats = pd.read_csv(
    'https://raw.githubusercontent.com/neelganta/neel_project/master/2020stats_salary.csv'
)  #Current
regModel = pd.read_csv(
    'https://raw.githubusercontent.com/neelganta/neel_project/master/githubRegression.csv'
)
regModel = regModel.fillna(0)
# regModel = regModel.drop(columns=['Unnamed: 0'])

y = regModel['NET_RATING']
X = regModel.drop(['NET_RATING'], axis=1)
# Fit the model below
model1 = lm.LinearRegression()
# (note: "higher alpha (penalty parameter), fewer predictors" applies to
#  penalized models such as Lasso/Ridge; LinearRegression takes no alpha)
model1.fit(X, y)
model1_y = model1.predict(X)

players = []
players = currentStats['Player']
players = deque(players)
# players.appendleft('1980-Present NBA Players')
players.appendleft('2020 NBA Players')  #Current
players = list(players)

player1 = st.selectbox(
    'Select first player: (Example: Type "BOS" to find all Celtics)', players)
player2 = st.selectbox(
    'Select second player: (Example: Type "PG" to find all Point Guards)',
    players)
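The penalty comment in this example describes regularized models rather than plain LinearRegression; a hedged sketch of the Lasso variant it alludes to, reusing X and y from above:

import sklearn.linear_model as lm

lasso = lm.Lasso(alpha=0.1)  # higher alpha (penalty parameter) -> fewer nonzero predictors
lasso.fit(X, y)
print((lasso.coef_ != 0).sum(), 'predictors kept')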
Example #3
from sklearn import linear_model as lm
from sklearn.tree import DecisionTreeRegressor
from lightgbm import LGBMRegressor

MAX_ITER = int(1e5)  # max_iter must be an integer

MODELS = [
    (
        'regression',
        lm.LinearRegression(),
        None,
    ),
    (
        'ridge',
        lm.Ridge(
            random_state=0,
            solver='saga',
            max_iter=MAX_ITER
        ),
        {'clf__alpha': [1e-3, 1e-2, 1e-1, 1e0]},
    ),
    (
        'lasso',
        lm.Lasso(
            random_state=0,
            max_iter=MAX_ITER
        ),
        {'clf__alpha': [1e-3, 1e-2, 1e-1, 1e0]},
    ),
    (
        'elastic_net',
        # the source snippet is truncated here; this entry is completed
        # following the Ridge/Lasso pattern above (grid values are assumptions)
        lm.ElasticNet(
            random_state=0,
            max_iter=MAX_ITER
        ),
        {'clf__alpha': [1e-3, 1e-2, 1e-1, 1e0]},
    ),
]

import numpy as np
import matplotlib.pyplot as plt
from sklearn import linear_model
from sklearn.preprocessing import PolynomialFeatures

datasets_X = []
datasets_Y = []
fr = open('prices.txt', 'r')
lines = fr.readlines()
for line in lines:
    items = line.strip().split(',')
    datasets_X.append(int(items[0]))
    datasets_Y.append(int(items[1]))


datasets_X = np.array(datasets_X).reshape([-1,1])
datasets_Y = np.array(datasets_Y)

minX = min(datasets_X)
maxX = max(datasets_X)
X = np.arange(minX,maxX).reshape([-1,1])


poly_reg = PolynomialFeatures(degree = 2)
X_poly = poly_reg.fit_transform(datasets_X)
# Use a linear model to learn the mapping (i.e. the parameters) between X_poly and datasets_Y
lin_reg_2 = linear_model.LinearRegression()
lin_reg_2.fit(X_poly, datasets_Y)

# Plot the results
plt.scatter(datasets_X, datasets_Y, color='red', label='origin data')
plt.plot(X, lin_reg_2.predict(poly_reg.fit_transform(X)), color='blue',
         label='Polynomial regression prediction')
plt.legend()  # make the labels take effect
plt.xlabel('Area')
plt.ylabel('Price')
plt.show()
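Once fitted, the same pipeline can score a single new area value; a small sketch (the area 150 is illustrative, not from prices.txt):

new_area = np.array([[150]])
print(lin_reg_2.predict(poly_reg.transform(new_area)))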
# (assumes: import sklearn.preprocessing as pp, import sklearn.model_selection as ms,
#  import numpy as np, import pandas as pd, and a main_data DataFrame loaded earlier)
X = pp.normalize(main_data[main_data.columns[0:-1]])

print(X.shape)
Y = np.asarray(main_data[main_data.columns[-1]])
print(Y.shape)

X_train, X_test, Y_train, Y_test = ms.train_test_split(X,
                                                       Y,
                                                       test_size=0.33,
                                                       random_state=40)
X_train.shape
Y_train.shape

import sklearn.linear_model as lm

model = lm.LinearRegression()

model.fit(X_train, Y_train)

y_pred = model.predict(X_test)

import sklearn.metrics as m

m.mean_squared_error(Y_test, y_pred)
X_train[1]

main_data

# displaying coefficients of each feature
features = main_data.columns[0:-1]
coefficients = pd.DataFrame(model.coef_, features)
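To make that table easier to scan, the single unnamed column can be labeled and sorted; a sketch:

coefficients.columns = ['coefficient']
print(coefficients.sort_values('coefficient', ascending=False))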
Example #6
    #5 years ago
    #attributes
    att_train_fiveyear_df = attributes_df[:fiveyear]
    att_test_fiveyear_df = attributes_df[fiveyear:fouryear]
    #target groupings
    tar_train_fiveyear_df = targets_df[:fiveyear]
    tar_test_fiveyear_df = targets_df[fiveyear:fouryear]
    # food 17 alone
    food17_train_fiveyear_df = food17_target_df[:fiveyear]
    food17_test_fiveyear_df = food17_target_df[fiveyear:fouryear]

    #LAST 12 MONTHS!
    #code below modified version of:
    #http://scikit-learn.org/stable/auto_examples/linear_model/plot_ols.html
    # Create linear regression object for one year
    regr_oneyear = linear_model.LinearRegression()
    # Train the model using the training sets
    regr_oneyear.fit(att_train_oneyear_df, food17_train_oneyear_df)
    # Make predictions using the testing set
    food17_pred_oneyear = regr_oneyear.predict(att_test_oneyear_df)
    # The coefficients
    #print("2016-08-01 to 2017-07-01")
    print("2016")
    #print('Coefficients: \n', regr_oneyear.coef_)
    # The mean absolute error
    print("Mean absolute error: %.2f" %
          mean_absolute_error(food17_test_oneyear_df, food17_pred_oneyear))
    # (note: despite its name, the MSE list collects mean absolute errors)
    MSE.append(mean_absolute_error(food17_test_oneyear_df,
                                   food17_pred_oneyear))
    # Explained variance score: 1 is perfect prediction
    print('Variance score: %.2f' %
          r2_score(food17_test_oneyear_df, food17_pred_oneyear))
    # (statement completed following the sklearn OLS example cited above)
Example #7
plt.tight_layout()
filename='mfr_predict/scatter_matrix.png'
plt.savefig(filename)

############### Btot average




#split into training and test data
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=8)

##linear model
import sklearn.linear_model as lm
lr = lm.LinearRegression()
lr.fit(np.array(X_train).reshape(-1, 1), y_train)  # lr.fit needs a 2-D feature array
y_pred = lr.predict(np.array(X_test).reshape(-1, 1))

#print('R**2 score linear fit')
#print(lr.score(X,y))

from sklearn.metrics import mean_absolute_error, mean_squared_error, median_absolute_error, r2_score
print('Scores for <Btot>: mean absolute error, mean squared, median absolute, R2: Btot in nT')
print(mean_absolute_error(y_test, y_pred))
print(mean_squared_error(y_test, y_pred))
print(median_absolute_error(y_test, y_pred))
print(r2_score(y_test, y_pred))  # sklearn metrics take (y_true, y_pred); the order matters for R2


#these are the fit coefficients

sizeHR = len(HR_norm)
sizepttsis = len(pttsis_norm)
sizepttdia = len(pttdia_norm)
sizevecsis = len(vec_sis)
sizevecdia = len(vec_dia)
ma = np.min([sizeHR, sizepttsis, sizepttdia])
HR_norm = HR_norm[0:ma]
pttsis = pttsis_norm[0:ma]
pttdia = pttdia_norm[0:ma]
vec_dian1 = vec_dia[0:ma]
vec_sisn1 = vec_sis[0:ma]
vec_dia0 = vec_dia[1:ma + 1]
vec_sis0 = vec_sis[1:ma + 1]

regSIS = linear_model.LinearRegression()
xsis = np.transpose(np.array([HR_norm, pttsis, vec_sisn1]))
regSIS.fit(xsis, vec_sis0)

coefsis.append(regSIS.coef_)

regDIA = linear_model.LinearRegression()
xdia = np.transpose(np.array([HR_norm, pttdia, vec_dian1]))
regDIA.fit(xdia, vec_dia0)  # fit on xdia; the original fit on xsis looks like a copy-paste slip

coefdia.append(regDIA.coef_)  # likewise, store the diastolic model's coefficients

regSIS2 = linear_model.LinearRegression()
xsis = np.transpose(np.array([HR_norm, pttsis]))
regSIS2.fit(xsis, vec_sis0)

coefsis2.append(regSIS2.coef_)
# Method 1 - statsmodels (assumes: from statsmodels.formula.api import ols)
MTmodel1 = ols("mpg ~ wt + hp", data=df1).fit()
print(MTmodel1.summary())
predictionM1 = MTmodel1.predict()
predictionM1
# Method 2 - sklearn
IV = df1[['wt', 'hp']].values
IV
DV = df1['mpg'].values
DV
IV_train, IV_test, DV_train, DV_test = train_test_split(IV,
                                                        DV,
                                                        test_size=0.2,
                                                        random_state=123)
IV_train, IV_test, DV_train, DV_test
from sklearn import linear_model
MTmodel2a = linear_model.LinearRegression()
MTmodel2a.fit(IV_train, DV_train)  #putting data to model
#MTmodel2a.summary()  #no summary in sklearn
MTmodel2a.intercept_
MTmodel2a.coef_
predicted2a = MTmodel2a.predict(IV_test)
predicted2a
DV_test
r2_score(DV_train, MTmodel2a.predict(IV_train))
#The mean squared error
from sklearn.metrics import mean_squared_error, r2_score
mean_squared_error(DV_test, predicted2a)
r2_score(DV_test, predicted2a)
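Since MTmodel1 was fit on the full data while MTmodel2a used only the training split, their coefficients differ slightly; refitting sklearn on the full IV/DV should reproduce the statsmodels estimates. A quick check (a sketch):

MTmodel2b = linear_model.LinearRegression().fit(IV, DV)
print(MTmodel2b.intercept_, MTmodel2b.coef_)  # compare with MTmodel1.params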

#%%% Logistic Regression
def generateLinearRegression(target, features):
    regression = linear_model.LinearRegression()
    regression.fit(features, target)

    return regression
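Hypothetical usage of the helper above (toy arrays; assumes numpy and sklearn's linear_model are imported as in the surrounding examples):

import numpy as np
reg = generateLinearRegression(np.array([1.0, 2.0, 3.0]),        # target
                               np.array([[1.0], [2.0], [3.0]]))  # features
print(reg.coef_, reg.intercept_)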
import pandas as pd
from sklearn import linear_model
import matplotlib.pyplot as plt
import numpy as np
#read data
dataframe = pd.read_csv('challenge_dataset.txt', header=None)  # no header row: columns are 0 and 1
print(dataframe.head())
x_values = dataframe[[0]]
y_values = dataframe[[1]]
#train model on data
body_reg = linear_model.LinearRegression()
body_reg.fit(x_values, y_values)

#visualize results
plt.scatter(x_values, y_values)
plt.plot(x_values, body_reg.predict(x_values))
plt.show()
# The coefficients
print('Coefficients: ', body_reg.coef_)
# The mean squared error
print('Mean squared error: %.2f ' % np.mean(
    (body_reg.predict(x_values) - y_values)**2))
# Explained variance score: 1 is perfect prediction
print('Variance score: %.2f' % body_reg.score(x_values, y_values))
    print "Features name:", list(df.columns.values)
    print "Selected features:", features
    y = df["price"]
    X = df[features]

    # split data-set into training (70%) and testing set (30%)
    x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

    # plotting features, target relationships
    plotting_features_vs_target(features, x_train, y_train)

    """
    DEFAULT MODEL
    """
    # training model
    linear = linear_model.LinearRegression()
    linear.fit(x_train, y_train)

    # evaluating model
    score_trained = linear.score(x_test, y_test)
    print "Model scored:", score_trained

    """
    LASSO MODEL
    """
    # L1 regularization
    lasso_linear = linear_model.Lasso(alpha=1.0)
    lasso_linear.fit(x_train, y_train)

    # evaluating L1 regularized model
    score_lasso_trained = lasso_linear.score(x_test, y_test)
Example #13
print("... start linear regression\n")

all_errMSE = []

for idx in range(0, len(coexpr_mats_train) - batch_size, batch_size):

    print("..." + " - batch: " + str(idx + 1) + "/" +
          str(len(coexpr_mats_train) - batch_size))

    b_imgs_input_X_train = hic_mats_train[
        idx:idx + batch_size]  # input: low-resol -> hic-data
    b_imgs_target_Y_train = coexpr_mats_train[
        idx:idx + batch_size]  # output: high-resol -> coexpr

    # create linear regression object
    lreg = linear_model.LinearRegression()  # sklearn.linear_model

    # train the model using training set # => should be of shape (n, 1)
    lreg.fit(b_imgs_input_X_train.flatten()[:, np.newaxis],
             b_imgs_target_Y_train.flatten()[:, np.newaxis])

    ## ITERATE OVER THE TEST DATA

    assert len(hic_mats_test) == len(coexpr_mats_test)

    # ??? TEST ALL TEST DATA IN ONE RUN ???

    b_imgs_input_X_test = hic_mats_test[
        idx:idx + batch_size]  # input: low-resol -> hic-data
    b_imgs_target_Y_test = coexpr_mats_test[
        idx:idx + batch_size]  # output: high-resol -> coexpr
Y_Carr_test = Y_Carr[Y_Carr['CARRIER_DELAY'] == 999]
Y_Carr_train = Y_Carr[Y_Carr['CARRIER_DELAY'] < 999]

# In[35]:

Y_Carr_test
Y_Carr_train

# In[37]:

from sklearn import linear_model, svm
from sklearn import model_selection  # sklearn.cross_validation was renamed to model_selection
X_train, X_test, y_train, y_test = X_Carr_train, X_Carr_test, Y_Carr_train, Y_Carr_test

clf = linear_model.LinearRegression()
clf.fit(X_train, y_train)
y_Carr_res = clf.predict(X_test)

# In[38]:

X_weather_train = X_weather[X_weather['WEATHER_DELAY'] < 999]
X_weather_train = X_weather_train.drop(['WEATHER_DELAY'], axis=1)
X_weather_train

# In[39]:

X_weather_test = X_weather[X_weather['WEATHER_DELAY'] == 999]
X_weather_test = X_weather_test.drop(['WEATHER_DELAY'], axis=1)
X_weather_test
Example #15
def Learning(t):
    from sklearn.model_selection import KFold
    K = 2
    kf = KFold(n_splits=K, random_state=t, shuffle=True)

    sim_Q_obs, sim_Potential_obs, sim_X_obs, sim_N_obs, sim_A_obs, sim_A_True_obs = generate_data(
        sample_size, obs_p, seed=t)
    sim_Q_ran, sim_Potential_ran, sim_X_ran, sim_N_ran, sim_A_ran, sim_A_True_ran = generate_data(
        10000, ran_p, seed=t, obs=False)

    # inclusion criterion
    np.random.seed(t)
    subgroup = np.where((sim_X_ran[:, 0] >= crit_point1)
                        & (sim_X_ran[:, 0] <= crit_point2))
    ran_index = np.random.choice(len(subgroup[0]), ran_size)
    sim_Q_ran = sim_Q_ran[subgroup][ran_index]
    sim_Potential_ran = sim_Potential_ran[subgroup][ran_index]
    sim_A_ran = sim_A_ran[subgroup][ran_index]
    sim_X_ran = sim_X_ran[subgroup][ran_index]

    # Prognostic score
    from sklearn import linear_model
    X_1 = sim_X_ran[np.where(sim_A_ran == -1)]
    Q_1 = sim_Q_ran[np.where(sim_A_ran == -1)]
    lin = linear_model.LinearRegression()
    lin.fit(X_1, Q_1)
    prog = lin.predict(sim_X_ran)
    prog = scale(prog)

    ####### strategy 3:
    My_model_out1, My_model_out2, My_model_trt1 = learn_obs(
        sim_X_obs[:, 0:(unobs_latent)], sim_Q_obs, sim_A_obs, seed=t * t)
    My_RCT_benefit, My_RCT_prob1 = RCT_scores(sim_X_ran[:, 0:(unobs_latent)],
                                              My_model_out1, My_model_out2,
                                              My_model_trt1)
    np.random.seed(t)
    My_RCT_prob1 = My_RCT_prob1 + np.random.uniform(
        -1e-10, 1e-10, size=ran_size)

    if strat == 'prob':
        My_RCT_scores1 = np.column_stack(
            (scale(My_RCT_benefit), prog, scale(My_RCT_prob1)))
    elif strat == 'benefit':
        My_RCT_scores1 = np.column_stack(
            (scale(My_RCT_prob1), prog, scale(My_RCT_benefit)))

    cutoff = 0
    My_RCT_X3 = np.append(sim_X_ran[:, 0:(unobs_latent)],
                          My_RCT_scores1,
                          axis=1)

    My_RCT_X3_hi = My_RCT_X3[np.where(My_RCT_scores1[:, 2] >= cutoff)]
    sim_Q_ran_hi = sim_Q_ran[np.where(My_RCT_scores1[:, 2] >= cutoff)]
    sim_A_ran_hi = sim_A_ran[np.where(My_RCT_scores1[:, 2] >= cutoff)]

    My_RCT_X3_lo = My_RCT_X3[np.where(My_RCT_scores1[:, 2] < cutoff)]
    sim_Q_ran_lo = sim_Q_ran[np.where(My_RCT_scores1[:, 2] < cutoff)]
    sim_A_ran_lo = sim_A_ran[np.where(My_RCT_scores1[:, 2] < cutoff)]

    My_RCT_X3_hi = My_RCT_X3_hi[:, :-1]
    My_RCT_X3_lo = My_RCT_X3_lo[:, :-1]

    TEST_RCT_benefit, TEST_RCT_prob1 = RCT_scores(
        sim_X_TEST[:, 0:(unobs_latent)], My_model_out1, My_model_out2,
        My_model_trt1)
    np.random.seed(t * t)
    TEST_RCT_prob1 = TEST_RCT_prob1 + np.random.uniform(
        -1e-10, 1e-10, size=10000)
    if strat == 'prob':
        TEST_RCT_scores1 = np.column_stack(
            (scale(TEST_RCT_benefit), scale(TEST_RCT_prob1)))
    elif strat == 'benefit':
        TEST_RCT_scores1 = np.column_stack(
            (scale(TEST_RCT_prob1), scale(TEST_RCT_benefit)))
    TEST_RCT_X3 = np.append(sim_X_TEST[:, 0:(unobs_latent)],
                            TEST_RCT_scores1,
                            axis=1)

    TEST_RCT_X3_hi = TEST_RCT_X3[np.where(TEST_RCT_scores1[:, 1] >= cutoff)]
    sim_N_TEST_hi = sim_N_TEST[np.where(TEST_RCT_scores1[:, 1] >= cutoff)]
    sim_X_TEST_hi = sim_X_TEST[np.where(TEST_RCT_scores1[:, 1] >= cutoff)]

    TEST_RCT_X3_lo = TEST_RCT_X3[np.where(TEST_RCT_scores1[:, 1] < cutoff)]
    sim_N_TEST_lo = sim_N_TEST[np.where(TEST_RCT_scores1[:, 1] < cutoff)]
    sim_X_TEST_lo = sim_X_TEST[np.where(TEST_RCT_scores1[:, 1] < cutoff)]

    TEST_RCT_X3_hi = TEST_RCT_X3_hi[:, :-2]
    TEST_RCT_X3_lo = TEST_RCT_X3_lo[:, :-2]

    # HIGH GROUP
    MAX_EST3 = -9999
    for C in Cs:
        for gamma in gammas:
            for kernel in kernels:
                cv_res = []
                for train_index, test_index in kf.split(My_RCT_X3_hi):
                    # print("TRAIN:", train_index, "TEST:", test_index)
                    X_train, X_test = My_RCT_X3_hi[train_index], My_RCT_X3_hi[
                        test_index]
                    Q_train, Q_test = sim_Q_ran_hi[train_index], sim_Q_ran_hi[
                        test_index]
                    A_train, A_test = sim_A_ran_hi[train_index], sim_A_ran_hi[
                        test_index]
                    model_all = MatchOLearn_KW(C=C,
                                               gamma=gamma,
                                               kernel=kernel,
                                               metric='mahalanobis',
                                               propensity=0.5)
                    model_all.fit(X_train,
                                  Q_train,
                                  A_train,
                                  match=np.array([
                                      X_train.shape[1] - 2,
                                      X_train.shape[1] - 1
                                  ]),
                                  learn=np.array(range(X_train.shape[1] - 2)),
                                  bandC=1,
                                  size=my_size)
                    est_all = model_all.estimate(
                        X_test,
                        Q_test,
                        A_test,
                        learn=np.array(range(X_test.shape[1] - 2)),
                        normalize=True)

                    cv_res.append(est_all)
                cv_res_all = np.mean(cv_res)

                if cv_res_all > (MAX_EST3 + 1e-5):
                    MAX_EST3 = cv_res_all
                    PARAM_EST3 = (C, gamma, kernel)

    best_model_s3_hi = MatchOLearn_KW(C=PARAM_EST3[0],
                                      gamma=PARAM_EST3[1],
                                      kernel=PARAM_EST3[2],
                                      metric='mahalanobis',
                                      propensity=0.5)
    best_model_s3_hi.fit(
        My_RCT_X3_hi,
        sim_Q_ran_hi,
        sim_A_ran_hi,
        match=np.array([My_RCT_X3_hi.shape[1] - 2, My_RCT_X3_hi.shape[1] - 1]),
        learn=np.array(range(My_RCT_X3_hi.shape[1] - 2)),
        bandC=1,
        size=my_size)

    TEST_Pred_ML_s3_hi = best_model_s3_hi.predict(TEST_RCT_X3_hi)
    phi_TEST3_hi, eta_TEST3_hi = phi_eta(sim_X_TEST_hi)

    TEST_Potential_ML_s3_hi = np.mean(eta_TEST3_hi +
                                      phi_TEST3_hi * TEST_Pred_ML_s3_hi +
                                      sim_N_TEST_hi)
    Ben3_hi = np.mean(
        phi_TEST3_hi * TEST_Pred_ML_s3_hi) * len(TEST_RCT_X3_hi) / 10000

    TEST_Potential_ML_s3_hi = TEST_Potential_ML_s3_hi * len(
        TEST_RCT_X3_hi) / 10000

    # LOW GROUP
    MAX_EST4 = -9999
    for C in Cs:
        for gamma in gammas:
            for kernel in kernels:
                cv_res = []
                for train_index, test_index in kf.split(My_RCT_X3_lo):
                    # print("TRAIN:", train_index, "TEST:", test_index)
                    X_train, X_test = My_RCT_X3_lo[train_index], My_RCT_X3_lo[
                        test_index]
                    Q_train, Q_test = sim_Q_ran_lo[train_index], sim_Q_ran_lo[
                        test_index]
                    A_train, A_test = sim_A_ran_lo[train_index], sim_A_ran_lo[
                        test_index]
                    model_all = MatchOLearn_KW(C=C,
                                               gamma=gamma,
                                               kernel=kernel,
                                               metric='mahalanobis',
                                               propensity=0.5)
                    model_all.fit(X_train,
                                  Q_train,
                                  A_train,
                                  match=np.array([
                                      X_train.shape[1] - 2,
                                      X_train.shape[1] - 1
                                  ]),
                                  learn=np.array(range(X_train.shape[1] - 2)),
                                  bandC=1,
                                  size=my_size)
                    est_all = model_all.estimate(
                        X_test,
                        Q_test,
                        A_test,
                        learn=np.array(range(X_test.shape[1] - 2)),
                        normalize=True)

                    cv_res.append(est_all)
                cv_res_all = np.mean(cv_res)

                if cv_res_all > (MAX_EST4 + 1e-5):
                    MAX_EST4 = cv_res_all
                    PARAM_EST4 = (C, gamma, kernel)

    best_model_s3_lo = MatchOLearn_KW(C=PARAM_EST4[0],
                                      gamma=PARAM_EST4[1],
                                      kernel=PARAM_EST4[2],
                                      metric='mahalanobis',
                                      propensity=0.5)
    best_model_s3_lo.fit(
        My_RCT_X3_lo,
        sim_Q_ran_lo,
        sim_A_ran_lo,
        match=np.array([My_RCT_X3_lo.shape[1] - 2, My_RCT_X3_lo.shape[1] - 1]),
        learn=np.array(range(My_RCT_X3_lo.shape[1] - 2)),
        bandC=1,
        size=my_size)

    TEST_Pred_ML_s3_lo = best_model_s3_lo.predict(TEST_RCT_X3_lo)
    phi_TEST3_lo, eta_TEST3_lo = phi_eta(sim_X_TEST_lo)

    TEST_Potential_ML_s3_lo = np.mean(eta_TEST3_lo +
                                      phi_TEST3_lo * TEST_Pred_ML_s3_lo +
                                      sim_N_TEST_lo)
    TEST_Potential_ML_s3_lo = TEST_Potential_ML_s3_lo * len(
        TEST_RCT_X3_lo) / 10000
    Ben3_lo = np.mean(
        phi_TEST3_lo * TEST_Pred_ML_s3_lo) * len(TEST_RCT_X3_lo) / 10000

    TEST_Potential_ML_s3 = TEST_Potential_ML_s3_hi + TEST_Potential_ML_s3_lo
    Ben3 = Ben3_hi + Ben3_lo

    ####### strategy 1:
    My_model_out1, My_model_out2, My_model_trt1 = learn_obs(
        sim_X_obs[:, 0:(unobs_latent)], sim_Q_obs, sim_A_obs, seed=t * t)
    My_RCT_benefit, My_RCT_prob1 = RCT_scores(sim_X_ran[:, 0:(unobs_latent)],
                                              My_model_out1, My_model_out2,
                                              My_model_trt1)

    My_RCT_scores0 = np.column_stack(
        (prog, scale(My_RCT_benefit), scale(My_RCT_prob1)))
    My_RCT_X1 = np.append(sim_X_ran[:, 0:(unobs_latent)],
                          My_RCT_scores0,
                          axis=1)
    My_RCT_X1 = My_RCT_X1[:, :-2]
    TEST_RCT_X1 = sim_X_TEST[:, 0:(unobs_latent)]

    MAX_EST1 = -9999
    for C in Cs:
        for gamma in gammas:
            for kernel in kernels:
                cv_res = []
                for train_index, test_index in kf.split(My_RCT_X1):
                    # print("TRAIN:", train_index, "TEST:", test_index)
                    X_train, X_test = My_RCT_X1[train_index], My_RCT_X1[
                        test_index]
                    Q_train, Q_test = sim_Q_ran[train_index], sim_Q_ran[
                        test_index]
                    A_train, A_test = sim_A_ran[train_index], sim_A_ran[
                        test_index]
                    model_all = MatchOLearn_KW(C=C,
                                               gamma=gamma,
                                               kernel=kernel,
                                               metric='mahalanobis',
                                               propensity=0.5)
                    model_all.fit(X_train,
                                  Q_train,
                                  A_train,
                                  match=np.array([X_train.shape[1] - 1]),
                                  learn=np.array(range(X_train.shape[1] - 1)),
                                  bandC=1,
                                  size=my_size)
                    est_all = model_all.estimate(
                        X_test,
                        Q_test,
                        A_test,
                        learn=np.array(range(X_test.shape[1] - 1)),
                        normalize=True)

                    cv_res.append(est_all)
                cv_res_all = np.mean(cv_res)

                if cv_res_all > (MAX_EST1 + 1e-5):
                    MAX_EST1 = cv_res_all
                    PARAM_EST1 = (C, gamma, kernel)

    best_model_s1 = MatchOLearn_KW(C=PARAM_EST1[0],
                                   gamma=PARAM_EST1[1],
                                   kernel=PARAM_EST1[2],
                                   metric='mahalanobis',
                                   propensity=0.5)
    best_model_s1.fit(My_RCT_X1,
                      sim_Q_ran,
                      sim_A_ran,
                      match=np.array([My_RCT_X1.shape[1] - 1]),
                      learn=np.array(range(My_RCT_X1.shape[1] - 1)),
                      bandC=1,
                      size=my_size)
    TEST_Pred_ML_s1 = best_model_s1.predict(TEST_RCT_X1)

    phi_TEST1, eta_TEST1 = phi_eta(sim_X_TEST)
    TEST_Potential_ML_s1 = np.mean(eta_TEST1 + phi_TEST1 * TEST_Pred_ML_s1 +
                                   sim_N_TEST)
    Ben1 = 2 * np.mean(phi_TEST1 * TEST_Pred_ML_s1)

    ####### Strategy 2
    My_RCT_X2 = np.append(sim_X_ran[:, 0:(unobs_latent)],
                          My_RCT_scores0,
                          axis=1)
    TEST_RCT_X1 = sim_X_TEST[:, 0:(unobs_latent)]

    MAX_EST2 = -9999
    for C in Cs:
        for gamma in gammas:
            for kernel in kernels:
                cv_res = []
                for train_index, test_index in kf.split(My_RCT_X2):
                    # print("TRAIN:", train_index, "TEST:", test_index)
                    X_train, X_test = My_RCT_X2[train_index], My_RCT_X2[
                        test_index]
                    Q_train, Q_test = sim_Q_ran[train_index], sim_Q_ran[
                        test_index]
                    A_train, A_test = sim_A_ran[train_index], sim_A_ran[
                        test_index]
                    model_all = MatchOLearn_KW(C=C,
                                               gamma=gamma,
                                               kernel=kernel,
                                               metric='mahalanobis',
                                               propensity=0.5)
                    model_all.fit(X_train,
                                  Q_train,
                                  A_train,
                                  match=np.array([
                                      X_train.shape[1] - 3,
                                      X_train.shape[1] - 2,
                                      X_train.shape[1] - 1
                                  ]),
                                  learn=np.array(range(X_train.shape[1] - 3)),
                                  bandC=1,
                                  size=my_size)
                    est_all = model_all.estimate(
                        X_test,
                        Q_test,
                        A_test,
                        learn=np.array(range(X_test.shape[1] - 3)),
                        normalize=True)

                    cv_res.append(est_all)
                cv_res_all = np.mean(cv_res)

                if cv_res_all > (MAX_EST2 + 1e-5):
                    MAX_EST2 = cv_res_all
                    PARAM_EST2 = (C, gamma, kernel)

    best_model_s2 = MatchOLearn_KW(C=PARAM_EST2[0],
                                   gamma=PARAM_EST2[1],
                                   kernel=PARAM_EST2[2],
                                   metric='mahalanobis',
                                   propensity=0.5)
    best_model_s2.fit(My_RCT_X2,
                      sim_Q_ran,
                      sim_A_ran,
                      match=np.array([
                          My_RCT_X2.shape[1] - 3, My_RCT_X2.shape[1] - 2,
                          My_RCT_X2.shape[1] - 1
                      ]),
                      learn=np.array(range(My_RCT_X2.shape[1] - 3)),
                      bandC=1,
                      size=my_size)
    TEST_Pred_ML_s2 = best_model_s2.predict(TEST_RCT_X1)

    TEST_Potential_ML_s2 = np.mean(eta_TEST1 + phi_TEST1 * TEST_Pred_ML_s2 +
                                   sim_N_TEST)
    Ben2 = 2 * np.mean(phi_TEST1 * TEST_Pred_ML_s2)

    print('iteration_time: ', t, ' results: ', np.mean(sim_Potential_ran))
    print("strategy 1: ", MAX_EST1, TEST_Potential_ML_s1, PARAM_EST1,
          Counter(TEST_Pred_ML_s1))
    print("strategy 2: ", MAX_EST2, TEST_Potential_ML_s2, PARAM_EST2,
          Counter(TEST_Pred_ML_s2))
    print("strategy 3: ", TEST_Potential_ML_s3_hi, TEST_Potential_ML_s3_lo,
          Counter(TEST_Pred_ML_s3_hi), Counter(TEST_Pred_ML_s3_lo),
          len(TEST_RCT_X3_hi), len(TEST_RCT_X3_lo))
    print(
        "----------------------------------------------------------------------------------"
    )

    return TEST_Potential_ML_s1, TEST_Potential_ML_s2, TEST_Potential_ML_s3, Ben1, Ben2, Ben3
si_series = np.array(merge_df["open_si"])
gazp_series = np.array(merge_df["open_gazp"])
gmkn_series = np.array(merge_df["open_gmkn"])
lkoh_series = np.array(merge_df["open_lkoh"])
#mgnt_series = np.array(merge_df["OPEN_MGNT"])
#rosn_series = np.array(merge_df["OPEN_SBER"])
sber_series = np.array(merge_df["open_sber"])
#sngsp_series = np.array(merge_df["OPEN_SNGSP"])
vtbr_series = np.array(merge_df["open_vtbr"])

#nvtk_series = np.array(merge_df["OPEN_NVTK"])
#sngs_series = np.array(merge_df["OPEN_SNGS"])
#trnfp_series = np.array(merge_df["OPEN_TRNFP"])
#rtsi_series = 100.0*np.array(merge_df["OPEN_RTSI"])

linreg = linear_model.LinearRegression(fit_intercept=True)
series_list = [gazp_series / si_series, gmkn_series / si_series,
               lkoh_series / si_series, sber_series / si_series,
               vtbr_series / si_series]
#series_list = [si_series, gazp_series, gmkn_series, lkoh_series, mgnt_series, rosn_series, sber_series, sngsp_series, vtbr_series, nvtk_series, sngs_series, trnfp_series]
#rts_index_series =  rts_index_series/(0.001*si_series)
#series_list = [np.log(100.0*rts_index_series), np.arange(0, len(rts_index_series)) ]
series_array = np.array(series_list).transpose()
linreg.fit(series_array, np.log(rts_series))

print(linreg.coef_)
print(linreg.intercept_)

rts_index = np.exp(linreg.predict(series_array))

residual = rts_series - rts_index

rts_index_data = {"date": merge_df["date"], "time": merge_df["time"],
                  "index_residual": residual,
                  "index_residual_ma": moving_average(residual, 2000)}
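moving_average is not defined in this fragment; a simple rolling-mean stand-in (an assumption, not necessarily the author's implementation):

def moving_average(x, w):
    # same-length rolling mean via convolution with a uniform window
    return np.convolve(x, np.ones(w) / w, mode='same')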
#also compute lagged correlation between THF and AMO
#todo: compute lagged correlation between SLP and AMO
print('calculating correlations between AMO and THF, SLP...')
for i in range(nlat):

    print('latitude', lats[i])

    sstprime_g = sstprime[:, i, :]
    thfprime_g = thfprime[:, i, :]
    psprime_g = psprime[:, i, :]
    thf_lt_g = thf_lt[:, i, :]
    thf_st_g = thf_st[:, i, :]
    ps_lt_g = ps_lt[:, i, :]
    ps_st_g = ps_st[:, i, :]

    clf = linear_model.LinearRegression()
    clf.fit(AMOstd.reshape(-1, 1), sstprime_g)
    sstcorrs[i, :] = np.squeeze(clf.coef_)

    clf = linear_model.LinearRegression()
    clf.fit(AMOstd.reshape(-1, 1), thfprime_g)
    thfcorrs[i, :] = np.squeeze(clf.coef_)

    clf = linear_model.LinearRegression()
    clf.fit(AMOstd.reshape(-1, 1), psprime_g)
    pscorrs[i, :] = np.squeeze(clf.coef_)

    clf = linear_model.LinearRegression()
    clf.fit(AMOstd_lt.reshape(-1, 1), thf_lt_g)
    thfcorrs_lt[i, :] = np.squeeze(clf.coef_)
Example #18
    def __init__(self, maker, MAX_DF=0.1, MAX_FEATURES=300, LSA_DIM=10):

        estimator.__init__(self, maker, MAX_DF, MAX_FEATURES, LSA_DIM)
        self.model = linear_model.LinearRegression(fit_intercept=False)
def regression(other_args: List[str], s_ticker: str, df_stock: pd.DataFrame,
               polynomial: int):
    """
    Train a regression model
    Parameters
    ----------
    other_args: List[str]
        Argparse arguments
    s_ticker: str
        Stock ticker
    df_stock: pd.DataFrame
        Dataframe of stock prices
    polynomial: int
        Order of polynomial

    """
    parser = argparse.ArgumentParser(
        add_help=False,
        prog="regression",
        description="""
            Regression attempts to model the relationship between
            two variables by fitting a linear/quadratic/cubic/other equation to
            observed data. One variable is considered to be an explanatory variable,
            and the other is considered to be a dependent variable.
        """,
    )

    parser.add_argument(
        "-i",
        "--input",
        action="store",
        dest="n_inputs",
        type=check_positive,
        default=40,
        help="number of days to use for prediction.",
    )
    parser.add_argument(
        "-d",
        "--days",
        action="store",
        dest="n_days",
        type=check_positive,
        default=5,
        help="prediction days.",
    )
    parser.add_argument(
        "-j",
        "--jumps",
        action="store",
        dest="n_jumps",
        type=check_positive,
        default=1,
        help="number of jumps in training data.",
    )
    parser.add_argument(
        "-e",
        "--end",
        action="store",
        type=valid_date,
        dest="s_end_date",
        default=None,
        help="The end date (format YYYY-MM-DD) to select - Backtesting",
    )

    if polynomial == USER_INPUT:
        parser.add_argument(
            "-p",
            "--polynomial",
            action="store",
            dest="n_polynomial",
            type=check_positive,
            required=True,
            help="polynomial associated with regression.",
        )

    try:
        ns_parser = parse_known_args_and_warn(parser, other_args)
        if not ns_parser:
            return

        # BACKTESTING
        if ns_parser.s_end_date:
            if ns_parser.s_end_date < df_stock.index[0]:
                print(
                    "Backtesting not allowed, since End Date is older than Start Date of historical data\n"
                )
                return

            if ns_parser.s_end_date < get_next_stock_market_days(
                    last_stock_day=df_stock.index[0],
                    n_next_days=ns_parser.n_inputs + ns_parser.n_days,
            )[-1]:
                print(
                    "Backtesting not allowed, since End Date is too close to Start Date to train model\n"
                )
                return

            future_index = get_next_stock_market_days(
                last_stock_day=ns_parser.s_end_date,
                n_next_days=ns_parser.n_days)

            if future_index[-1] > datetime.datetime.now():
                print(
                    "Backtesting not allowed, since End Date + Prediction days is in the future\n"
                )
                return

            df_future = df_stock[future_index[0]:future_index[-1]]
            df_stock = df_stock[:ns_parser.s_end_date]

        # Split training data
        stock_x, stock_y = splitTrain.split_train(
            df_stock["Adj Close"].values,
            ns_parser.n_inputs,
            ns_parser.n_days,
            ns_parser.n_jumps,
        )

        if not stock_x:
            print("Given the model parameters more training data is needed.\n")
            return

        # Machine Learning model
        if polynomial == LINEAR:
            model = linear_model.LinearRegression(n_jobs=-1)
        else:
            if polynomial == USER_INPUT:
                polynomial = ns_parser.n_polynomial
            model = pipeline.make_pipeline(
                preprocessing.PolynomialFeatures(polynomial),
                linear_model.Ridge())

        model.fit(stock_x, stock_y)
        l_predictions = [
            i if i > 0 else 0 for i in model.predict(
                df_stock["Adj Close"].values[-ns_parser.n_inputs:].reshape(
                    1, -1))[0]
        ]

        # Prediction data
        l_pred_days = get_next_stock_market_days(
            last_stock_day=df_stock["Adj Close"].index[-1],
            n_next_days=ns_parser.n_days,
        )
        df_pred = pd.Series(l_predictions, index=l_pred_days, name="Price")

        # Plotting
        plt.figure(figsize=plot_autoscale(), dpi=PLOT_DPI)
        plt.plot(df_stock.index, df_stock["Adj Close"], lw=2)
        # BACKTESTING
        if ns_parser.s_end_date:
            plt.title(
                f"BACKTESTING: Regression (polynomial {polynomial}) on {s_ticker} - {ns_parser.n_days} days prediction"
            )
        else:
            plt.title(
                f"Regression (polynomial {polynomial}) on {s_ticker} - {ns_parser.n_days} days prediction"
            )
        plt.xlim(df_stock.index[0],
                 get_next_stock_market_days(df_pred.index[-1], 1)[-1])
        plt.xlabel("Time")
        plt.ylabel("Share Price ($)")
        plt.grid(b=True, which="major", color="#666666", linestyle="-")
        plt.minorticks_on()
        plt.grid(b=True,
                 which="minor",
                 color="#999999",
                 linestyle="-",
                 alpha=0.2)
        plt.plot(
            [df_stock.index[-1], df_pred.index[0]],
            [df_stock["Adj Close"].values[-1], df_pred.values[0]],
            lw=1,
            c="tab:green",
            linestyle="--",
        )
        plt.plot(df_pred.index, df_pred, lw=2, c="tab:green")
        plt.axvspan(df_stock.index[-1],
                    df_pred.index[-1],
                    facecolor="tab:orange",
                    alpha=0.2)
        _, _, ymin, ymax = plt.axis()
        plt.vlines(df_stock.index[-1],
                   ymin,
                   ymax,
                   linewidth=1,
                   linestyle="--",
                   color="k")

        # BACKTESTING
        if ns_parser.s_end_date:
            plt.plot(
                df_future.index,
                df_future["Adj Close"],
                lw=2,
                c="tab:blue",
                ls="--",
            )
            plt.plot(
                [df_stock.index[-1], df_future.index[0]],
                [
                    df_stock["Adj Close"].values[-1],
                    df_future["Adj Close"].values[0],
                ],
                lw=1,
                c="tab:blue",
                linestyle="--",
            )

        if gtff.USE_ION:
            plt.ion()

        plt.show()

        # BACKTESTING
        if ns_parser.s_end_date:
            plt.figure(figsize=plot_autoscale(), dpi=PLOT_DPI)
            plt.subplot(211)
            plt.plot(
                df_future.index,
                df_future["Adj Close"],
                lw=2,
                c="tab:blue",
                ls="--",
            )
            plt.plot(df_pred.index, df_pred, lw=2, c="green")
            plt.scatter(df_future.index,
                        df_future["Adj Close"],
                        c="tab:blue",
                        lw=3)
            plt.plot(
                [df_stock.index[-1], df_future.index[0]],
                [
                    df_stock["Adj Close"].values[-1],
                    df_future["Adj Close"].values[0],
                ],
                lw=2,
                c="tab:blue",
                ls="--",
            )
            plt.scatter(df_pred.index, df_pred, c="green", lw=3)
            plt.plot(
                [df_stock.index[-1], df_pred.index[0]],
                [df_stock["Adj Close"].values[-1], df_pred.values[0]],
                lw=2,
                c="green",
                ls="--",
            )
            plt.title("BACKTESTING: Real data price versus Prediction")
            plt.xlim(df_stock.index[-1],
                     df_pred.index[-1] + datetime.timedelta(days=1))
            plt.xticks(
                [
                    df_stock.index[-1],
                    df_pred.index[-1] + datetime.timedelta(days=1)
                ],
                visible=True,
            )
            plt.ylabel("Share Price ($)")
            plt.grid(b=True, which="major", color="#666666", linestyle="-")
            plt.minorticks_on()
            plt.grid(b=True,
                     which="minor",
                     color="#999999",
                     linestyle="-",
                     alpha=0.2)
            plt.legend(["Real data", "Prediction data"])
            plt.xticks([])

            plt.subplot(212)
            plt.axhline(y=0, color="k", linestyle="--", linewidth=2)
            plt.plot(
                df_future.index,
                100 * (df_pred.values - df_future["Adj Close"].values) /
                df_future["Adj Close"].values,
                lw=2,
                c="red",
            )
            plt.scatter(
                df_future.index,
                100 * (df_pred.values - df_future["Adj Close"].values) /
                df_future["Adj Close"].values,
                c="red",
                lw=5,
            )
            plt.title(
                "BACKTESTING: Error between Real data and Prediction [%]")
            plt.plot(
                [df_stock.index[-1], df_future.index[0]],
                [
                    0,
                    100 *
                    (df_pred.values[0] - df_future["Adj Close"].values[0]) /
                    df_future["Adj Close"].values[0],
                ],
                lw=2,
                ls="--",
                c="red",
            )
            plt.xlim(df_stock.index[-1],
                     df_pred.index[-1] + datetime.timedelta(days=1))
            plt.xticks(
                [
                    df_stock.index[-1],
                    df_pred.index[-1] + datetime.timedelta(days=1)
                ],
                visible=True,
            )
            plt.xlabel("Time")
            plt.ylabel("Prediction Error (%)")
            plt.grid(b=True, which="major", color="#666666", linestyle="-")
            plt.minorticks_on()
            plt.grid(b=True,
                     which="minor",
                     color="#999999",
                     linestyle="-",
                     alpha=0.2)
            plt.legend(["Real data", "Prediction data"])

            if gtff.USE_ION:
                plt.ion()

            plt.show()

            # Refactor prediction dataframe for backtesting print
            df_pred.name = "Prediction"
            df_pred = df_pred.to_frame()
            df_pred["Real"] = df_future["Adj Close"]

            if gtff.USE_COLOR:

                patch_pandas_text_adjustment()

                print("Time         Real [$]  x  Prediction [$]")
                print(
                    df_pred.apply(price_prediction_backtesting_color,
                                  axis=1).to_string())
            else:
                print(df_pred[["Real", "Prediction"]].round(2).to_string())

            print("")
            print_prediction_kpis(df_pred["Real"].values,
                                  df_pred["Prediction"].values)

        else:
            # Print prediction data
            print_pretty_prediction(df_pred, df_stock["Adj Close"].values[-1])
        print("")

    except SystemExit:
        print("")
    except Exception as e:
        print(e)
        print("")
    def update(self):

        # # Linear Regression using features from Correlation Matrix
        U = coormatrix_features.drop('price', axis=1)
        V = coormatrix_features['price']

        # split the dataset into train and test
        U_train, U_test, V_train, V_test = train_test_split(U,
                                                            V,
                                                            test_size=0.2,
                                                            random_state=10)

        # Standardize (normalize) the features and the target
        ss = StandardScaler()
        U_train = ss.fit_transform(U_train)
        U_test = ss.transform(U_test)  # borrowing parameters from train
        U_train.shape, U_test.shape
        V_train = ss.fit_transform(V_train.values.reshape(-1, 1))
        V_test = ss.transform(V_test.values.reshape(-1, 1))

        regr = linear_model.LinearRegression()
        regr.fit(U_train, V_train)
        airbnb_V_pred = regr.predict(U_test)

        vtitle = r"V1 Prices vs Predicted Prices: $Y_i$ vs $\hat{Y}_i$"
        self.ax1.plot(V_test, airbnb_V_pred, 'bo')
        self.ax1.set_title(vtitle)
        self.ax1.set_xlabel("Prices: $Y_i$")
        self.ax1.set_ylabel(r"Predicted prices: $\hat{Y}_i$")
        self.ax1.grid(True)

        self.fig1.tight_layout()
        self.fig1.canvas.draw_idle()

        #self.label1.setText('V1 Coefficients: \n' % regr.coef_)
        self.label2.setText("V1 Mean squared error: %.2f" %
                            mean_squared_error(V_test, airbnb_V_pred))
        self.label3.setText('V1 R2 score: %.2f' %
                            r2_score(V_test, airbnb_V_pred))

        #LR - V2
        X = FeaturesFINAL.drop('price', axis=1)
        y = FeaturesFINAL['price']

        # split the dataset into train and test
        X_train, X_test, y_train, y_test = train_test_split(X,
                                                            y,
                                                            test_size=0.2,
                                                            random_state=10)

        # Standardize (normalize) the features and the target
        ss = StandardScaler()
        X_train = ss.fit_transform(X_train)
        X_test = ss.transform(X_test)  # borrowing parameters from train

        y_train = ss.fit_transform(y_train.values.reshape(-1, 1))
        y_test = ss.transform(y_test.values.reshape(-1, 1))

        regr = linear_model.LinearRegression()
        regr.fit(X_train, y_train)
        airbnb_y_pred = regr.predict(X_test)

        self.ax2.plot(y_test, airbnb_y_pred, 'bo')
        self.ax2.set_title(
            r"V2 Prices vs Predicted Prices: $Y_i$ vs $\hat{Y}_i$")
        self.ax2.set_xlabel("Prices: $Y_i$")
        self.ax2.set_ylabel(r"Predicted prices: $\hat{Y}_i$")
        self.ax2.grid(True)

        self.fig2.tight_layout()
        self.fig2.canvas.draw_idle()

        #self.label5.setText('V2 Coefficients: \n' % regr.coef_)
        self.label6.setText("V2 Mean squared error: %.2f" %
                            mean_squared_error(y_test, airbnb_y_pred))
        self.label7.setText('V2 R2 score: %.2f' %
                            r2_score(y_test, airbnb_y_pred))
Example #21
# ymon = np.array(ymon, dtype='float64')  # tail of a commented-out float64 variant (its opening quotes are truncated in the source)
# (assumes: from sklearn.svm import SVC and from sklearn import linear_model)

dataw = np.array(dataw, dtype='int64')
datas = np.array(datas, dtype='int64')
datam = np.array(datam, dtype='int64')
ywin = np.array(ywin, dtype='int64')
ysum = np.array(ysum, dtype='int64')
ymon = np.array(ymon, dtype='int64')

clf = SVC(kernel='rbf')
clf.fit(dataw, ywin)
ysvc = clf.predict(pre)
print(ysvc)

clfw = linear_model.LinearRegression()
clfw.fit(dataw, ywin)
wpre = clfw.predict(pre)
print(wpre)

clf = SVC(kernel='rbf')
clf.fit(datas, ysum)
ysvc = clf.predict(pre)
print(ysvc)

clfs = linear_model.LinearRegression()
clfs.fit(datas, ysum)
spre = clfs.predict(pre)
print(spre)

clf = SVC(kernel='rbf')
Example #22
def lr(x: numpy.ndarray, y: numpy.ndarray) -> LRRes:
    regr = linear_model.LinearRegression()
    regr.fit(x, y)
    score = regr.score(x, y)
    diff = numpy.abs(regr.predict(x) - y).sum() / y.sum()
    # _residues is a private sklearn attribute (sum of squared residuals of the fit)
    return LRRes(regr.coef_, regr.intercept_, regr._residues, score, diff)
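LRRes is not shown in this fragment; a plausible container plus toy usage (both are assumptions, and in a real script LRRes would be defined before lr):

from collections import namedtuple
LRRes = namedtuple('LRRes', ['coef', 'intercept', 'residues', 'score', 'diff'])

x = numpy.arange(10.0).reshape(-1, 1)
y = 3 * x.ravel() + 1
print(lr(x, y))  # perfect fit: score 1.0, diff ~0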
def plot_ols():
    """
    =========================================================
    Linear Regression Example
    =========================================================
    This example uses only the first feature of the `diabetes` dataset, in
    order to illustrate a two-dimensional plot of this regression technique. The
    straight line seen in the plot shows how linear regression attempts
    to draw a line that best minimizes the residual sum of squares
    between the observed responses in the dataset and the responses predicted by
    the linear approximation.

    The coefficients, the residual sum of squares and the variance score are also
    calculated.

    """
    print(__doc__)

    # Code source: Jaques Grobler
    # License: BSD 3 clause

    import matplotlib.pyplot as plt
    import numpy as np
    from sklearn import datasets, linear_model
    from sklearn.metrics import mean_squared_error, r2_score

    # Load the diabetes dataset
    diabetes = datasets.load_diabetes()

    # Use only one feature
    diabetes_X = diabetes.data[:, np.newaxis, 2]

    # Split the data into training/testing sets
    diabetes_X_train = diabetes_X[:-20]
    diabetes_X_test = diabetes_X[-20:]

    # Split the targets into training/testing sets
    diabetes_y_train = diabetes.target[:-20]
    diabetes_y_test = diabetes.target[-20:]

    # Create linear regression object
    regr = linear_model.LinearRegression()

    # Train the model using the training sets
    regr.fit(diabetes_X_train, diabetes_y_train)

    # Make predictions using the testing set
    diabetes_y_pred = regr.predict(diabetes_X_test)

    # The coefficients
    print('Coefficients: \n', regr.coef_)
    # The mean squared error
    print("Mean squared error: %.2f" %
          mean_squared_error(diabetes_y_test, diabetes_y_pred))
    # Explained variance score: 1 is perfect prediction
    print('Variance score: %.2f' % r2_score(diabetes_y_test, diabetes_y_pred))

    # Plot outputs
    plt.scatter(diabetes_X_test, diabetes_y_test, color='black')
    plt.plot(diabetes_X_test, diabetes_y_pred, color='blue', linewidth=3)

    plt.xticks(())
    plt.yticks(())

    plt.show()
Example #24
import numpy as np
import pandas as pd
data = pd.read_csv('./house_price.csv')
data1 = data.dropna()
data2 = pd.get_dummies(data1[['dist', 'floor']])
pd.set_option('display.max_columns', None)
data3 = data2.drop(['dist_shijingshan', 'floor_high'], axis=1)
data4 = pd.concat(
    [data3, data1[['roomnum', 'halls', 'AREA', 'subway', 'school', 'price']]],
    axis=1)
x = data4.iloc[:, :-1]
y = data4.iloc[:, -1:]
from sklearn import linear_model
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x,
                                                    y,
                                                    test_size=0.3,
                                                    random_state=42)
model = linear_model.LinearRegression().fit(x_train, y_train)
result = model.predict(np.array([[0, 0, 0, 0, 0, 0, 0, 2, 1, 60, 1, 1]]))
# print(result)
# print(model.coef_)  # model coefficients
# print(model.intercept_)  # model intercept
# print(model.score(x_test, y_test))
Q1 = df.quantile(0.25)
Q3 = df.quantile(0.75)
IQR = Q3 - Q1
df = df[~((df < (Q1 - 1.5 * IQR)) | (df > (Q3 + 1.5 * IQR))).any(axis=1)]
df.describe()

# Split the data into training and test sets
X1_train, X1_test, Y_train, Y_test = train_test_split(df[['Age']],
                                                      df[['YearsCode']],
                                                      test_size=0.33,
                                                      random_state=42)
X1_X2_train, X1_X2_test, Y_train, Y_test = train_test_split(
    df[['Age', 'Age1stCode']],
    df[['YearsCode']],
    test_size=0.33,
    random_state=42)

regr1 = linear_model.LinearRegression()
regr1.fit(X1_train, Y_train)

print('Coefficients: \n', regr1.coef_)
print("Residual sum of squares: %.2f" %
      mean_squared_error(Y_test, regr1.predict(X1_test)))

regr2 = linear_model.LinearRegression()
regr2.fit(X1_X2_train, Y_train)

print('Coefficients: \n', regr2.coef_)
print("Residual sum of squares: %.2f" %
      mean_squared_error(Y_test, regr2.predict(X1_X2_test)))
Example #26
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import GradientBoostingRegressor
from xgboost import XGBRegressor
from sklearn.preprocessing import PolynomialFeatures as pf
from sklearn import linear_model as lm

train = pd.read_csv('C:\\Users\\Preetham G\\Downloads\\train.csv')
test = pd.read_csv('C:\\Users\\Preetham G\\Downloads\\test.csv')
train = train.drop(columns=['Index', 'District'])
test = test.drop(columns=['Index', 'District'])
base = [
    RandomForestRegressor(n_estimators=100, max_depth=10),
    ExtraTreesRegressor(n_estimators=90, max_depth=15),
    GradientBoostingRegressor(n_estimators=60, max_depth=5),
    XGBRegressor(n_estimators=50, max_depth=5),
    BaggingRegressor(n_estimators=50, base_estimator=lm.LinearRegression())
]
name = ['RFR', 'ETR', 'GBR', 'XGBR', 'BAR']
df1 = pd.DataFrame()
c = 0
train_x = train.drop(columns=['Rainfall'])
train_y = train['Rainfall']
test_x = test.drop(columns=['Rainfall'])
test_y = test['Rainfall']
d1 = {}
for i, j in zip(base, name):
    print(j, c)
    if j == 'BAR':
        poly = pf(degree=4)
        train_x = poly.fit_transform(train_x)
        test_x = poly.fit_transform(test_x)
Example #27
print("Mean squared error =",
      round(sm.mean_squared_error(y_test, y_test_pred), 2))
print("Median abs error =",
      round(sm.median_absolute_error(y_test, y_test_pred), 2))
print("Explain var scr =",
      round(sm.explained_variance_score(y_test, y_test_pred), 2))
print("R2 score =", round(sm.r2_score(y_test, y_test_pred), 2))

input_file = 'D:/ProgramData/praktika2/Mul_linear.txt'  # renamed so as not to shadow the built-in input()
input_data = np.loadtxt(input_file, delimiter=',')
X, y = input_data[:, :-1], input_data[:, -1]
training_samples = int(0.6 * len(X))
testing_samples = len(X) - 10
X_train, y_train = X[:training_samples], y[:training_samples]
X_test, y_test = X[training_samples:], y[training_samples:]
reg_linear_mul = linear_model.LinearRegression()
reg_linear_mul.fit(X_train, y_train)
y_test_pred = reg_linear_mul.predict(X_test)
print("Performance of Linear regressor:")
print("Mean absolute error =",
      round(sm.mean_absolute_error(y_test, y_test_pred), 2))
print("Mean squared error =",
      round(sm.mean_squared_error(y_test, y_test_pred), 2))
print("Median abs error =",
      round(sm.median_absolute_error(y_test, y_test_pred), 2))
print("Explain var scr=",
      round(sm.explained_variance_score(y_test, y_test_pred), 2))
print("R2 score =", round(sm.r2_score(y_test, y_test_pred), 2))
polynomial = PolynomialFeatures(degree=10)
X_train_transformed = polynomial.fit_transform(X_train)
datapoint = [[2.23, 1.35, 1.12]]
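The snippet stops after building the degree-10 features and a sample datapoint; a plausible continuation fitting the polynomial model and comparing it with the linear one (the variable names below are assumptions):

poly_linear_model = linear_model.LinearRegression()
poly_linear_model.fit(X_train_transformed, y_train)
poly_datapoint = polynomial.fit_transform(datapoint)
print("Linear regression:", reg_linear_mul.predict(datapoint))
print("Polynomial regression:", poly_linear_model.predict(poly_datapoint))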
# (the start of this snippet is truncated; the x split presumably mirrors the
#  y split below -- x_dataFrame is an assumed name)
x_df_train, x_df_test = train_test_split(x_dataFrame,
                                         test_size=0.2,
                                         shuffle=True)
y_df_train, y_df_test = train_test_split(y_dataFrame,
                                         test_size=0.2,
                                         shuffle=True)
#
# print(y_df_train.loc[:, 0:6])
predict_x = None
predict_y = None
train_for_x = True
train_for_y = True
if train_for_x:
    poly_reg_x = PolynomialFeatures(degree=1)
    X_ploy = poly_reg_x.fit_transform(x_df_train.loc[:, 1:])
    X_ploy_predict = poly_reg_x.fit_transform(x_df_test.loc[:, 1:])
    lin_reg_2_x = linear_model.LinearRegression()
    lin_reg_2_x.fit(X_ploy, x_df_train.loc[:, 0] / 1920)
    predict_x = lin_reg_2_x.predict(X_ploy_predict)
    output = open('linear_x.pkl', 'wb')
    pickle.dump(lin_reg_2_x, output, 0)  # 将训练后的线性模型保存
    output.close()

if train_for_y:
    poly_reg_y = PolynomialFeatures(degree=1)
    Y_poly = poly_reg_y.fit_transform(y_df_train.loc[:, 1:])
    Y_poly_predict = poly_reg_y.fit_transform(y_df_test.loc[:, 1:])
    lin_reg_2_y = linear_model.LinearRegression()
    # lin_reg_2 = svm.SVR(kernel='linear', C=1e3)
    lin_reg_2_y.fit(Y_poly, y_df_train.loc[:, 0] / 1080)
    predict_y = lin_reg_2_y.predict(Y_poly_predict)
    # output = open('svr_y.pkl', 'wb')
# Load the built-in diabetes dataset
diabetes = datasets.load_diabetes()

# Use only one of the features
diabetes_X = diabetes.data[:, np.newaxis, 2]

# Split the data into a training set and a test set
diabetes_X_train = diabetes_X[:-20]
diabetes_X_test = diabetes_X[-20:]

# Split the targets (y values) into a training set and a test set
diabetes_y_train = diabetes.target[:-20]
diabetes_y_test = diabetes.target[-20:]

# Use linear regression
regr = linear_model.LinearRegression()

# Fit on the training set, i.e. the actual training step
regr.fit(diabetes_X_train, diabetes_y_train)

# Print the coefficients, intercept, and related information
print('Coefficients: \n', regr.coef_)
# The mean squared error
print("Mean squared error: %.2f" % np.mean(
    (regr.predict(diabetes_X_test) - diabetes_y_test)**2))
# Explained variance score: 1 is perfect prediction
print('Variance score: %.2f' % regr.score(diabetes_X_test, diabetes_y_test))

# Plot with pyplot
plt.scatter(diabetes_X_test, diabetes_y_test, color='black')
plt.plot(diabetes_X_test,
         regr.predict(diabetes_X_test),
         color='blue',
         linewidth=3)  # call completed following the sklearn example this snippet mirrors
def Linear_Regression():
    clf = linear_model.LinearRegression()
    clf.fit([[0, 0], [1, 1], [2, 2]], [1, 2, 4])
    print(clf.predict([[0.2, 0.3], [0.4, 0.5]]))
    return clf.coef_, clf.intercept_
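A short call of the toy function above (assumes from sklearn import linear_model is in scope):

coef, intercept = Linear_Regression()
print('coef:', coef, 'intercept:', intercept)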