Example #1
def test_simple_vs_refined_algorithm(theta, fit_path):
    # Test that the two versions of the algorithm (two sequential
    # LassoLars fits vs. RelaxedLassoLars) give consistent results.

    # Simple Algorithm (2 steps of Lasso Lars)
    lasso1 = LassoLars(alpha=alpha)
    lasso1.fit(X_train, y_train)
    X1 = X_train.copy()
    X1[:, lasso1.coef_ == 0] = 0

    lasso2 = LassoLars(alpha=alpha*theta)
    lasso2.fit(X1, y_train)
    pred_simple = lasso2.predict(X_test)

    # Refined Algorithm
    relasso = RelaxedLassoLars(alpha=alpha, theta=theta, fit_path=fit_path)
    relasso.fit(X_train, y_train)
    pred_refined = relasso.predict(X_test)

    assert_array_almost_equal(pred_simple, pred_refined)
    assert_array_almost_equal(lasso2.coef_, relasso.coef_)
    assert_almost_equal(lasso2.score(X_test, y_test),
                        relasso.score(X_test, y_test),
                        decimal=2)
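The test above assumes module-level fixtures (X_train, X_test, y_train, y_test and alpha) plus a RelaxedLassoLars estimator defined elsewhere in the test module. A minimal sketch of such a fixture, with illustrative sizes and alpha (not part of the original test file):

# Hypothetical fixture for the test above; sizes, noise and alpha are illustrative only
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split

X, y = make_regression(n_samples=200, n_features=20, noise=1.0, random_state=0)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
alpha = 0.1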
Example #2
# LassoLars Regression
import numpy as np
from sklearn import datasets
from sklearn.linear_model import LassoLars
# load the diabetes dataset
dataset = datasets.load_diabetes()
# fit a LASSO model to the data using the LARS algorithm
model = LassoLars(alpha=0.1)
model.fit(dataset.data, dataset.target)
print(model)
# make predictions
expected = dataset.target
predicted = model.predict(dataset.data)
# summarize the fit of the model
mse = np.mean((predicted-expected)**2)
print(mse)
print(model.score(dataset.data, dataset.target))
Example #3
# LassoLars Regression
# Least Angle Regression (LARS) can be used as an alternative method for computing the
# Least Absolute Shrinkage and Selection Operator (LASSO) fit.
import numpy as np
from sklearn import datasets
from sklearn.linear_model import LassoLars

# load the diabetes dataset
dataset = datasets.load_diabetes()

# fit a LASSO model to the data using the LARS algorithm
model = LassoLars(alpha=0.1)
model.fit(dataset.data, dataset.target)
print(model)

# make predictions
expected = dataset.target
predicted = model.predict(dataset.data)

# summarize the fit of the model
mse = np.mean((predicted - expected)**2)
print(mse)
print(model.score(dataset.data, dataset.target))
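The mean squared error computed by hand above can be cross-checked against scikit-learn's own metric; a short sanity check reusing the expected/predicted arrays from the example:

# Equivalent MSE via sklearn.metrics (expected and predicted are defined above)
from sklearn.metrics import mean_squared_error
print(mean_squared_error(expected, predicted))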
Example #4
import math
import datetime

import numpy as np
from sklearn import preprocessing
from sklearn.linear_model import (LinearRegression, Ridge, Lasso, BayesianRidge,
                                  LassoLars, OrthogonalMatchingPursuit,
                                  ARDRegression, SGDRegressor)
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import train_test_split


def task2(data):

    df = data

    dfreg = df.loc[:, ['Adj Close', 'Volume']]
    dfreg['HL_PCT'] = (df['High'] - df['Low']) / df['Close'] * 100.0
    dfreg['PCT_change'] = (df['Close'] - df['Open']) / df['Open'] * 100.0

    # Replace missing values with a sentinel value
    dfreg.fillna(value=-99999, inplace=True)
    # We want to separate 1 percent of the data to forecast
    forecast_out = int(math.ceil(0.01 * len(dfreg)))
    # Create the label column: 'Adj Close' shifted forecast_out days into the future
    forecast_col = 'Adj Close'
    dfreg['label'] = dfreg[forecast_col].shift(-forecast_out)
    X = np.array(dfreg.drop(['label'], axis=1))
    # Scale the X so that everyone can have the same distribution for linear regression
    X = preprocessing.scale(X)
    # Hold out the most recent rows (X_lately) for forecasting; the earlier rows are used for model training and evaluation
    X_lately = X[-forecast_out:]
    X = X[:-forecast_out]
    # Separate label and identify it as y
    y = np.array(dfreg['label'])
    y = y[:-forecast_out]

    #Split data
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.2,
                                                        random_state=0)

    ##################
    ##################
    ##################

    # Linear regression
    clfreg = LinearRegression(n_jobs=-1)
    clfreg.fit(X_train, y_train)
    # Quadratic Regression 2
    clfpoly2 = make_pipeline(PolynomialFeatures(2), Ridge())
    clfpoly2.fit(X_train, y_train)

    # Quadratic Regression 3
    clfpoly3 = make_pipeline(PolynomialFeatures(3), Ridge())
    clfpoly3.fit(X_train, y_train)

    # KNN Regression
    clfknn = KNeighborsRegressor(n_neighbors=2)
    clfknn.fit(X_train, y_train)

    # Lasso Regression
    clflas = Lasso()
    clflas.fit(X_train, y_train)

    # Multitask Lasso Regression
    # clfmtl = MultiTaskLasso(alpha=1.)
    # clfmtl.fit(X_train, y_train).coef_

    # Bayesian Ridge Regression
    clfbyr = BayesianRidge()
    clfbyr.fit(X_train, y_train)

    # Lasso LARS Regression
    clflar = LassoLars(alpha=.1)
    clflar.fit(X_train, y_train)

    # Orthogonal Matching Pursuit Regression
    clfomp = OrthogonalMatchingPursuit(n_nonzero_coefs=2)
    clfomp.fit(X_train, y_train)

    # Automatic Relevance Determination Regression
    clfard = ARDRegression(compute_score=True)
    clfard.fit(X_train, y_train)

    # Logistic Regression
    # clflgr = linear_model.LogisticRegression(penalty='l1', solver='saga', tol=1e-6, max_iter=int(1e6), warm_start=True)
    # coefs_ = []
    # for c in cs:
    #   clflgr.set_params(C=c)
    #   clflgr.fit(X_train, y_train)
    #   coefs_.append(clflgr.coef_.ravel().copy())

    clfsgd = SGDRegressor(random_state=0, max_iter=1000, tol=1e-3)
    clfsgd.fit(X_train, y_train)

    ##################
    ##################
    ##################

    # Compute confidence (R^2) scores
    confidencereg = clfreg.score(X_test, y_test)
    confidencepoly2 = clfpoly2.score(X_test, y_test)
    confidencepoly3 = clfpoly3.score(X_test, y_test)
    confidenceknn = clfknn.score(X_test, y_test)
    confidencelas = clflas.score(X_test, y_test)
    # confidencemtl = clfmtl.score(X_test, y_test)
    confidencebyr = clfbyr.score(X_test, y_test)
    confidencelar = clflar.score(X_test, y_test)
    confidenceomp = clfomp.score(X_test, y_test)
    confidenceard = clfard.score(X_test, y_test)
    confidencesgd = clfsgd.score(X_test, y_test)

    # results
    print('The linear regression confidence is:', confidencereg * 100)
    print('The quadratic regression 2 confidence is:', confidencepoly2 * 100)
    print('The quadratic regression 3 confidence is:', confidencepoly3 * 100)
    print('The knn regression confidence is:', confidenceknn * 100)
    print('The lasso regression confidence is:', confidencelas * 100)
    # print('The lasso regression confidence is:',confidencemtl*100)
    print('The Bayesian Ridge regression confidence is:', confidencebyr * 100)
    print('The Lasso LARS regression confidence is:', confidencelar * 100)
    print('The OMP regression confidence is:', confidenceomp * 100)
    print('The ARD regression confidence is:', confidenceard * 100)
    print('The SGD regression confidence is:', confidencesgd * 100)

    #Create new columns
    forecast_reg = clfreg.predict(X_lately)
    forecast_pol2 = clfpoly2.predict(X_lately)
    forecast_pol3 = clfpoly3.predict(X_lately)
    forecast_knn = clfknn.predict(X_lately)
    forecast_las = clflas.predict(X_lately)
    forecast_byr = clfbyr.predict(X_lately)
    forecast_lar = clflar.predict(X_lately)
    forecast_omp = clfomp.predict(X_lately)
    forecast_ard = clfard.predict(X_lately)
    forecast_sgd = clfsgd.predict(X_lately)

    # Populate the forecast columns with predictions on future dates
    dfreg['Forecast_reg'] = np.nan

    last_date = dfreg.iloc[-1].name
    last_unix = last_date
    next_unix = last_unix + datetime.timedelta(days=1)

    for i in forecast_reg:
        next_date = next_unix
        next_unix += datetime.timedelta(days=1)
        dfreg.loc[next_date] = [np.nan for _ in range(len(dfreg.columns))]
        dfreg.loc[next_date, 'Forecast_reg'] = i

    dfreg['Forecast_pol2'] = np.nan

    last_date = dfreg.iloc[-26].name
    last_unix = last_date
    next_unix = last_unix + datetime.timedelta(days=1)

    for i in forecast_pol2:
        next_date = next_unix
        next_unix += datetime.timedelta(days=1)
        dfreg.loc[next_date, 'Forecast_pol2'] = i

    dfreg['Forecast_pol3'] = np.nan

    last_date = dfreg.iloc[-26].name
    last_unix = last_date
    next_unix = last_unix + datetime.timedelta(days=1)

    for i in forecast_pol3:
        next_date = next_unix
        next_unix += datetime.timedelta(days=1)
        dfreg.loc[next_date, 'Forecast_pol3'] = i

    dfreg['Forecast_knn'] = np.nan

    last_date = dfreg.iloc[-26].name
    last_unix = last_date
    next_unix = last_unix + datetime.timedelta(days=1)

    for i in forecast_knn:
        next_date = next_unix
        next_unix += datetime.timedelta(days=1)
        dfreg.loc[next_date, 'Forecast_knn'] = i

    dfreg['Forecast_las'] = np.nan

    last_date = dfreg.iloc[-26].name
    last_unix = last_date
    next_unix = last_unix + datetime.timedelta(days=1)

    for i in forecast_las:
        next_date = next_unix
        next_unix += datetime.timedelta(days=1)
        dfreg.loc[next_date, 'Forecast_las'] = i

    dfreg['Forecast_byr'] = np.nan

    last_date = dfreg.iloc[-26].name
    last_unix = last_date
    next_unix = last_unix + datetime.timedelta(days=1)

    for i in forecast_byr:
        next_date = next_unix
        next_unix += datetime.timedelta(days=1)
        dfreg.loc[next_date, 'Forecast_byr'] = i

    dfreg['Forecast_lar'] = np.nan

    last_date = dfreg.iloc[-26].name
    last_unix = last_date
    next_unix = last_unix + datetime.timedelta(days=1)

    for i in forecast_lar:
        next_date = next_unix
        next_unix += datetime.timedelta(days=1)
        dfreg.loc[next_date, 'Forecast_lar'] = i

    dfreg['Forecast_omp'] = np.nan

    last_date = dfreg.iloc[-26].name
    last_unix = last_date
    next_unix = last_unix + datetime.timedelta(days=1)

    for i in forecast_omp:
        next_date = next_unix
        next_unix += datetime.timedelta(days=1)
        dfreg.loc[next_date, 'Forecast_omp'] = i

    dfreg['Forecast_ard'] = np.nan

    last_date = dfreg.iloc[-26].name
    last_unix = last_date
    next_unix = last_unix + datetime.timedelta(days=1)

    for i in forecast_ard:
        next_date = next_unix
        next_unix += datetime.timedelta(days=1)
        dfreg.loc[next_date, 'Forecast_ard'] = i

    dfreg['Forecast_sgd'] = np.nan

    last_date = dfreg.iloc[-26].name
    last_unix = last_date
    next_unix = last_unix + datetime.timedelta(days=1)

    for i in forecast_sgd:
        next_date = next_unix
        next_unix += datetime.timedelta(days=1)
        dfreg.loc[next_date, 'Forecast_sgd'] = i

    return (dfreg.index.format(formatter=lambda x: x.strftime('%Y-%m-%d')),
            dfreg['Adj Close'].to_list(),
            dfreg['Forecast_reg'].to_list(),
            dfreg['Forecast_pol2'].to_list(),
            dfreg['Forecast_pol3'].to_list(),
            dfreg['Forecast_knn'].to_list(),
            dfreg['Forecast_las'].to_list(),
            dfreg['Forecast_byr'].to_list(),
            dfreg['Forecast_lar'].to_list(),
            dfreg['Forecast_omp'].to_list(),
            dfreg['Forecast_ard'].to_list(),
            dfreg['Forecast_sgd'].to_list())
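A minimal sketch of how the tuple returned by task2 might be consumed; the unpacking order follows the return statement above, and df is assumed to be a date-indexed price DataFrame with the 'Open', 'High', 'Low', 'Close', 'Adj Close' and 'Volume' columns the function expects:

# Hypothetical usage of task2 (df is an assumed OHLCV DataFrame, see the lead-in above)
import matplotlib.pyplot as plt

(dates, adj_close, f_reg, f_pol2, f_pol3, f_knn,
 f_las, f_byr, f_lar, f_omp, f_ard, f_sgd) = task2(df)

plt.plot(dates, adj_close, label='Adj Close')
plt.plot(dates, f_lar, label='LassoLars forecast')
plt.legend()
plt.show()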
Example #5
#!/usr/bin/env python

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LassoLars

data = pd.read_csv("dataset.csv", header=0)

X = data.loc[:, ["Commune", "Etage", "Superficie", "Piece"]].values
Y = data.loc[:, "Prix"].values

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2)

regressor = LassoLars(alpha=0.1)
regressor.fit(X_train, Y_train)
score = regressor.score(X_test, Y_test)
print(score)
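The script above passes the 'Commune' column to LassoLars as-is; if that column is categorical rather than numeric, it would need to be encoded first. A minimal sketch of one way to do that with the same dataset.csv columns, using a scikit-learn pipeline (illustrative, not part of the original script):

# Hypothetical variant: one-hot encode 'Commune' before fitting LassoLars
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder

features = data.loc[:, ["Commune", "Etage", "Superficie", "Piece"]]
target = data.loc[:, "Prix"]
X_train, X_test, Y_train, Y_test = train_test_split(features, target, test_size=0.2)

encode = ColumnTransformer(
    [("commune", OneHotEncoder(handle_unknown="ignore"), ["Commune"])],
    remainder="passthrough",
)
pipe = make_pipeline(encode, LassoLars(alpha=0.1))
pipe.fit(X_train, Y_train)
print(pipe.score(X_test, Y_test))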
Example #6

l_reg = LinearRegression()
lasso_reg = LassoLars()

# In[380]:

l_reg.fit(X_train, y_train)

# In[382]:

l_reg.score(X_test, y_test)

# In[383]:

lasso_reg.fit(X_train, y_train)

# In[384]:

lasso_reg.score(X_test, y_test)

model_EN.fit(X_train, y_train)
model_EN.score(X_train, y_train)
print("Score of Elastic-net on train data: ", model_EN.score(X_train, y_train))
print("Score of Elastic-net on test data: ", model_EN.score(X_test, y_test))
print("L1 ratio: ", models.l1_ratio_)
print("Alpha: ", models.alpha_)

# At this point we should fit our final model on the entire dataset
# using the previously tuned parameters

# Lasso-Lars
models = LassoLarsCV(max_n_alphas=40, verbose=1, cv=folds)
models.fit(X_train, y_train)
model_LL = LassoLars(alpha=models.alpha_)
model_LL.fit(X_train, y_train)
print("Score of Lasso-Lars on train data: ", model_LL.score(X_train, y_train))
print("Score of Lasso-Lars on test data: ", model_LL.score(X_test, y_test))
'''
=============== PCA ==================
The first 3 components explain only ~33% of the variance when applied to non-normalized data.

You can test it by copying the following code into preprocessing.py:

from sklearn.decomposition import PCA
pca = PCA(3)
X = pca.fit_transform(data.iloc[:, 1:])
print(pca.explained_variance_ratio_)


Actually I was surprised by how poorly it performed, so I even wrote a simple MatLab script
because it didn't feel right, but it outputs exactly the same numbers.
'''
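The note above computes the explained-variance ratio on non-normalized data; a minimal sketch of the same check after standardizing the features, assuming data is the same DataFrame referenced in the comment:

# Hypothetical variant of the PCA check above, standardizing the features first
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

pca = PCA(3)
pca.fit(StandardScaler().fit_transform(data.iloc[:, 1:]))
print(pca.explained_variance_ratio_)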
Example #8
mse = mean_squared_error(y_test, lasso_pred)
print("Root Mean Squared Error: ", np.sqrt(mse))

fig = plt.figure(figsize=[10, 8])
ax = plt.subplot(111)
ax.plot(y_test.index, lasso_pred, label='Predicted')
ax.plot(y_test, label='Test')
ax.legend()
plt.show()

# Evaluation
confidence_lr = lr.score(X_test, y_test)
confidence_poly2 = poly2.score(X_test, y_test)
confidence_poly3 = poly3.score(X_test, y_test)
confidence_knn = knn.score(X_test, y_test)
confidence_lasso = lasso.score(X_test, y_test)

print("Results: ", confidence_lr, confidence_poly2, confidence_poly3,
      confidence_knn, confidence_lasso)

# all on one graph

fig = plt.figure(figsize=[10, 8])
ax = plt.subplot(111)
ax.plot(y_test.index, lasso_pred, label='Lasso', color='red')
ax.plot(y_test.index, knn_pred, label='KNN', color='blue')
ax.plot(y_test.index, poly2_pred, label='Poly2', color='green')
ax.plot(y_test.index, poly3_pred, label='Poly3', color='orange')
ax.plot(y_test.index, y_pred_lr, label='LR', color='cyan')
ax.plot(y_test, label='Test', color='magenta')
ax.legend()

from sklearn.datasets import load_boston   # note: load_boston was removed in scikit-learn 1.2
from sklearn.linear_model import LassoLars
from sklearn.model_selection import train_test_split

# Prepare the data
boston = load_boston()
X, Y = boston.data, boston.target
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=.3)

'''
    Lasso regression using LARS:
        not sure what to say here, as I am not familiar with it at all
'''

# The arguments below spell out the old default values; note that the
# 'normalize' parameter was removed in scikit-learn 1.2.
rg = LassoLars(alpha=1.0, fit_intercept=True, verbose=False, normalize=True,
               precompute='auto', max_iter=500, eps=2.2204460492503131e-16,
               copy_X=True, fit_path=True, positive=False)
rg.fit(X_train, Y_train)
Y_pre = rg.predict(X_test)
print(rg.score(X_test, Y_test))
print(rg.coef_)
print(rg.intercept_)

'''
    LassoLars constructor parameters (see the scikit-learn docs for details):
    alpha, fit_intercept, verbose, normalize, precompute,
    max_iter, eps, copy_X, fit_path, positive
'''
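Rather than hard-coding alpha=1.0 as above, the regularization strength can also be chosen by cross-validation; a minimal sketch using LassoLarsCV on the same split (illustrative only):

# Hypothetical alternative to the fixed-alpha fit above
from sklearn.linear_model import LassoLarsCV

rg_cv = LassoLarsCV(cv=5)
rg_cv.fit(X_train, Y_train)
print(rg_cv.alpha_)                 # alpha selected by cross-validation
print(rg_cv.score(X_test, Y_test))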
Example #10
import numpy as np
from sklearn.preprocessing import PolynomialFeatures, scale
from sklearn.model_selection import train_test_split

# X, y: feature matrix and target defined earlier in the script
poly = PolynomialFeatures(include_bias=False)
X_poly = poly.fit_transform(scale(X))
print(X_poly.shape)  # 104 features

X_train, X_test, y_train, y_test = train_test_split(X_poly, y, random_state=0)

################ Lasso-Lars ################

from sklearn.linear_model import LassoLars
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_val_score

lasso_lars = LassoLars(alpha=0.01)

lasso_lars.fit(X_train, y_train)
print(lasso_lars.score(X_test, y_test))

print(np.mean(cross_val_score(lasso_lars, X_train, y_train, cv=10)))

###### Tuning alpha parameter

from sklearn.model_selection import GridSearchCV

param_grid = {'alpha': np.logspace(-3, 3, 13)}

grid = GridSearchCV(lasso_lars, param_grid, cv=10)

grid.fit(X_train, y_train)

print(grid.best_params_)
print(grid.best_score_)
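Since GridSearchCV refits the best estimator on the full training split by default, the tuned model can be scored directly on the held-out data:

# Evaluate the refit best estimator on the held-out split
print(grid.score(X_test, y_test))
print(grid.best_estimator_)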
Example #11
lm = lm.fit(X_final, y)

# plot the regression coefficients
fig2, ax2 = plt.subplots()
plt.bar(ivs, lm.coef_)
plt.tight_layout()

# visualize the dim-reduced data along the three dimensions with the largest (absolute) coefficients
indices = np.argsort(np.abs(lm.coef_))
y_tmp = y - np.min(y)
y_tmp /= np.max(y_tmp)
cmap = plt.cm.inferno
colors = cmap(y_tmp)
#colors[:, 3] = y_tmp
fig, axes = plt.subplots(nrows=3)
plt.sca(axes[0])
plt.scatter(X_final[:, indices[-1]], X_final[:, indices[-2]], c=colors)
axes[0].set_xlabel(ivs[indices[-1]])
axes[0].set_ylabel(ivs[indices[-2]])
plt.sca(axes[1])
plt.scatter(X_final[:, indices[-1]], X_final[:, indices[-3]], c=colors)
axes[1].set_xlabel(ivs[indices[-1]])
axes[1].set_ylabel(ivs[indices[-3]])
plt.sca(axes[2])
plt.scatter(X_final[:, indices[-2]], X_final[:, indices[-3]], c=colors)
axes[2].set_xlabel(ivs[indices[-2]])
axes[2].set_ylabel(ivs[indices[-3]])
plt.tight_layout()
print(lm.score(X_final, y))
plt.show()
Example #12
    print "R^2: ", r2

    print "\n**********测试LassoLars类**********"
    # 在初始化LassoLars类时, 指定超参数α, 默认值是1.0.
    lassoLars = LassoLars(alpha=0.005)
    # 拟合训练集
    lassoLars.fit(train_X, train_Y)
    # 打印模型的系数
    print "系数:", lassoLars.coef_
    print "截距:", lassoLars.intercept_
    print '训练集R2: ', r2_score(train_Y, lassoLars.predict(train_X))

    # 对于线性回归模型, 一般使用均方误差(Mean Squared Error,MSE)或者
    # 均方根误差(Root Mean Squared Error,RMSE)在测试集上的表现来评该价模型的好坏.
    test_Y_pred = lassoLars.predict(test_X)
    print "测试集得分:", lassoLars.score(test_X, test_Y)
    print "测试集MSE:", mean_squared_error(test_Y, test_Y_pred)
    print "测试集RMSE:", np.sqrt(mean_squared_error(test_Y, test_Y_pred))
    print "测试集R2:", r2_score(test_Y, test_Y_pred)

    tss, rss, ess, r2 = xss(Y, lassoLars.predict(X))
    print "TSS(Total Sum of Squares): ", tss
    print "RSS(Residual Sum of Squares): ", rss
    print "ESS(Explained Sum of Squares): ", ess
    print "R^2: ", r2

    print "\n**********测试LassoLarsCV类**********"
    lassoLarscv = LassoLarsCV(cv=5)
    # 拟合训练集
    lassoLarscv.fit(train_X, train_Y.values.ravel())
    # 打印模型的系数