def lasso_regr(wine_set):

    pred = wine_set[["density", 'alcohol', 'sulphates', 'pH', 'volatile_acidity', 'chlorides', 'fixed_acidity',
                    'citric_acid', 'residual_sugar', 'free_sulfur_dioxide', 'total_sulfur_dioxide']]
    predictors = pred.copy()
    targets = wine_set.quality

    # standardize predictors to have mean=0 and sd=1
    predictors = pd.DataFrame(preprocessing.scale(predictors))
    predictors.columns = pred.columns
    # print(predictors.head())

    # split into training and testing sets
    pred_train, pred_test, tar_train, tar_test = train_test_split(predictors, targets, test_size=.3, random_state=123)

    # specify the lasso regression model
    model = LassoLarsCV(cv=10, precompute=False).fit(pred_train, tar_train)

    print('Predictors and their regression coefficients:')
    d = dict(zip(predictors.columns, model.coef_))
    for k in d:
        print(k, ':', d[k])

    # plot coefficient progression
    m_log_alphas = -np.log10(model.alphas_)
    # ax = plt.gca()
    plt.plot(m_log_alphas, model.coef_path_.T)
    print('\nAlpha:', model.alpha_)
    plt.axvline(-np.log10(model.alpha_), linestyle="dashed", color='k', label='alpha CV')
    plt.ylabel("Regression coefficients")
    plt.xlabel("-log(alpha)")
    plt.title('Regression coefficients progression for Lasso paths')
    plt.show()

    # plot mean squared error for each fold
    m_log_alphascv = -np.log10(model.cv_alphas_)
    plt.plot(m_log_alphascv, model.cv_mse_path_, ':')
    plt.plot(m_log_alphascv, model.cv_mse_path_.mean(axis=-1), 'k', label='Average across the folds', linewidth=2)
    plt.legend()
    plt.xlabel('-log(alpha)')
    plt.ylabel('Mean squared error')
    plt.title('Mean squared error on each fold')
    plt.show()

    # Mean squared error from training and test data
    train_error = mean_squared_error(tar_train, model.predict(pred_train))
    test_error = mean_squared_error(tar_test, model.predict(pred_test))
    print('\nMean squared error for training data:', train_error)
    print('Mean squared error for test data:', test_error)

    rsquared_train = model.score(pred_train, tar_train)
    rsquared_test = model.score(pred_test, tar_test)
    print('\nR-square for training data:', rsquared_train)
    print('R-square for test data:', rsquared_test)
Example #2
def lassolarscv():
    print ("Doing cross-validated LassoLars")
    cross_val = cross_validation.ShuffleSplit(len(base_X), n_iter=5, test_size=0.2, random_state=0)
    clf5 = LassoLarsCV(cv=cross_val)
    clf5.fit(base_X, base_Y)
    print ("Score = %f" % clf5.score(base_X, base_Y))
    clf5_pred = clf5.predict(X_test)
    write_to_file("lassolars.csv", clf5_pred)
Example #3
class LassoLarsCVImpl:
    def __init__(self, **hyperparams):
        self._hyperparams = hyperparams
        self._wrapped_model = Op(**self._hyperparams)

    def fit(self, X, y=None):
        if y is not None:
            self._wrapped_model.fit(X, y)
        else:
            self._wrapped_model.fit(X)
        return self

    def predict(self, X):
        return self._wrapped_model.predict(X)
def lassovar(data, lag=1, n_samples=None):
    Y = data.T[:, lag:]
    d = Y.shape[0]
    Z = np.vstack([data.T[:, lag - k:-k] for k in range(1, lag + 1)])
    Y, Z = Y.T, Z.T
    if n_samples is not None:
        Y, Z = resample(Y, Z, replace=False, n_samples=n_samples)

    scores = np.zeros((d, d * lag))

    ls = LassoLarsCV(cv=10, n_jobs=1)

    residuals = np.zeros(Y.shape)

    # one variable after the other as target
    for j in range(d):
        target = np.copy(Y[:, j])
        selectedparents = np.full(d * lag, False)
        # we include one lag after the other
        for l in range(1, lag + 1):
            ind_a = d * (l - 1)
            ind_b = d * l
            ls.fit(Z[:, ind_a:ind_b], target)
            selectedparents[ind_a:ind_b] = ls.coef_ > 0
            target -= ls.predict(Z[:, ind_a:ind_b])

        residuals[:, j] = np.copy(target)

        # refit to get rid of the bias
        ZZ = Z[:, selectedparents]
        B = np.linalg.lstsq(ZZ.T.dot(ZZ), ZZ.T.dot(Y[:, j]), rcond=None)[0]
        scores[j, selectedparents] = B

    # the more uncorrelated the residuals the higher the weight
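    # (the determinant of a correlation matrix equals 1 when the residuals are
    #  uncorrelated and shrinks towards 0 as they become more correlated)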
    weight = 1
    res = np.corrcoef(residuals.T)
    if np.linalg.matrix_rank(res) == res.shape[0]:
        weight = np.linalg.det(res)
    return scores * weight
Example #5
def lassovar(data, maxlags=1, n_samples=None, cv=5):
    # Stack data to perform regression of present on past values
    Y = data.T[:, maxlags:]
    d = Y.shape[0]
    Z = np.vstack([data.T[:, maxlags - k:-k] for k in range(1, maxlags + 1)])
    Y, Z = Y.T, Z.T

    # Subsample data
    if n_samples is not None:
        Y, Z = resample(Y, Z, n_samples=n_samples)

    scores = np.zeros((d, d * maxlags))

    ls = LassoLarsCV(cv=cv, n_jobs=1)

    residuals = np.zeros(Y.shape)

    # Consider one variable after the other as target
    for j in range(d):
        target = np.copy(Y[:, j])
        selectedparents = np.full(d * maxlags, False)
        # Include one lag after the other
        for l in range(1, maxlags + 1):
            ind_a = d * (l - 1)
            ind_b = d * l
            ls.fit(Z[:, ind_a:ind_b], target)
            selectedparents[ind_a:ind_b] = ls.coef_ > 0
            target -= ls.predict(Z[:, ind_a:ind_b])

        residuals[:, j] = np.copy(target)

        # Refit OLS using the selected variables to get rid of the bias
        ZZ = Z[:, selectedparents]
        B = np.linalg.lstsq(ZZ.T.dot(ZZ), ZZ.T.dot(Y[:, j]), rcond=None)[0]
        scores[j, selectedparents] = B

    return scores
Example #6
        [0.067154, 3.190612], [0.925577, 4.631504], [0.717733, 4.295890],
        [0.015371, 3.085028], [0.335070, 3.448080], [0.040486, 3.167440],
        [0.212575, 3.364266], [0.617218, 3.993482], [0.541196, 3.891471]]

# Build the X and y matrices
dataMat = np.array(data)
X = dataMat[:, 0:1]  # variable x
y = dataMat[:, 1]  # variable y

# ======== Lasso regression ========
# model = Lasso(alpha=0.01)  # adjusting alpha controls how tightly the model fits
# model = LassoCV()  # LassoCV tunes alpha automatically to select the best value
model = LassoLarsCV()  # LassoLarsCV tunes alpha automatically to select the best value
model.fit(X, y)  # fit the linear regression model
print('Coefficient matrix:\n', model.coef_)
print('Linear regression model:\n', model)
# print('Best alpha:', model.alpha_)  # only available when using LassoCV / LassoLarsCV
# Predict with the model
predicted = model.predict(X)

# Scatter plot: x on the horizontal axis, y on the vertical axis
plt.scatter(X, y, marker='x')
plt.plot(X, predicted, c='r')

# Label the x and y axes
plt.xlabel("x")
plt.ylabel("y")

# Show the figure
plt.show()
Example #7
plt.legend()

plt.xlabel('-log(alpha)')
plt.ylabel('Mean square error')
plt.title('Mean square error on each fold: coordinate descent '
          '(train time: %.2fs)' % t_lasso_cv)
plt.axis('tight')
plt.ylim(ymin, ymax)

# LassoLarsCV: least angle regression
from sklearn.linear_model import LassoLarsCV
# Compute paths
print("Computing regularization path using the Lars lasso...")
LassoLarsCV_fit = LassoLarsCV(cv=20).fit(X, y)
LassoLarsCV_pred = LassoLarsCV_fit.predict(X_test)
R2_LassoLarsCV = metrics.r2_score(y_test, LassoLarsCV_pred)
# 0.776, so slightly better

# Display results
m_log_alphas = -np.log10(model.cv_alphas_ + EPSILON)

plt.figure()
plt.plot(m_log_alphas, model.mse_path_, ':')
plt.plot(m_log_alphas,
         model.mse_path_.mean(axis=-1),
         'k',
         label='Average across the folds',
         linewidth=2)
plt.axvline(-np.log10(model.alpha_),
            linestyle='--',
Example #8
from sklearn.linear_model import LassoLarsCV
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
X, y = make_regression(n_features=1, noise=4.0, random_state=0)
y = y.reshape(-1, 1)
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.33,
                                                    random_state=100)
reg = LassoLarsCV(cv=5).fit(X_train, y_train)
print(reg.score(X_train, y_train))
print(reg.score(X_test, y_test))
print(reg.alpha_)
y_pred = reg.predict(X)
print(X_train.shape, y_train.shape)
plt.scatter(X_train, y_train, label='train')
plt.scatter(X_test, y_test, label='test')
plt.plot(X, y_pred)
plt.show()
class LassoPredictor(Persistent):
    @contract(hypers='dict')
    def __init__(self, hypers):
        modelHypers = self.extract_model_hypers(hypers)
        self.model = LassoLarsCV(**modelHypers)

    @timing
    def fit(self, df, features, targetCol, validationSplit=0.2):

        print("Running fit function:")
        print(df)
        XTrain, yTrain = df2xy(df, features, targetCol)
        if XTrain.shape[0] < 3:
            print("not enough data to form a model!")
            return False

        success = True
        try:
            self.model.fit(XTrain, yTrain)
        #try:
        #Parallel(n_jobs=2, verbose=10, batch_size=20)(delayed(self.fit_helper)(date) for date in self.dates)
        except ValueError:
            traceback.print_exc()
            success = False
        return success

    def predict(self, df, features, targetCol):
        XPred, _ = df2xy(df, features, targetCol)
        try:
            yPred = self.model.predict(XPred)
        except ValueError:
            traceback.print_exc()
            return None

        #df['pred' + targetCol] = yPred
        return yPred

    #def score (self, userXTest):
    #    # *** Needs reworking!
    #    '''
    #    :returns: Score calculated by taking the last yTrain (all data)
    #    and comparing to predicted result.
    #    '''
    #    if self.modelScore is None:
    #        lastDate = self.dates[-1]
    #        actualY = self.yTrains[lastDate]
    #        #preddf = self.predict(userXTest)
    #        preddf = loads(preddf, preserve_order=True)
    #        preddf = pd.DataFrame(preddf['arr'], columns = [self.targetCol])
    #        predY = preddf[self.targetCol]
    #        predY = predY.shift(-self.batchSize)
    #        predY = predY.iloc[:-self.batchSize]

    #        score = metrics.r2_score(actualY, predY)
    #        self.modelScore = score
    #    else:
    #        score = self.modelScore
    #    return score

    def lc(self):
        '''
        Makes learning curve for a player
        '''
        if self.lcScores is None:

            self.lcModel = LassoLarsCV()
            lastDate = self.dates[-1]
            X = self.XTrains[lastDate]
            y = self.yTrains[lastDate]

            N = len(X)
            chopOff = N - (N % 7)
            X = X.iloc[:chopOff]
            y = y.iloc[:chopOff]
            idxs = np.arange(chopOff)

            cvSplits = [(idxs[:i], idxs[i:]) for i in range(7, chopOff, 7)]

            trainSizes, trainScores, testScores = \
                    learning_curve(estimator=self.lcModel,
                                    X=X.as_matrix(),
                                    y=np.array(y),
                                    cv=cvSplits,
                                    train_sizes=[7],
                                    n_jobs=2,
                                    )
            trainSizes = [len(t[0]) for t in cvSplits]
            self.lcScores = dumps((trainSizes, trainScores, testScores))
            result = self.lcScores
        else:
            result = self.lcScores

        return result

    def get_params(self):
        for i, model in self.models.items():
            params = order_dict(model.get_params())
            break
        return params

    def extract_model_hypers(self, hypers):
        '''
        Extracts the parameters that are relevant to the model
        and are not other meta params.
        '''
        params = ['verbose']
        modelHypers = {}
        for param in params:
            paramVal = hypers.get(param)
            if paramVal is not None:
                modelHypers[param] = paramVal
        modelHypers = order_dict(modelHypers)
        return modelHypers
y_test_rmse = sqrt(metrics.mean_squared_error(y_test, y_test_pred))
y_test_score = rd.score(x_test, y_test)
print('Training set RMSE: {0}, R-squared: {1}'.format(y_train_rmse, y_train_score))
print('Test set RMSE: {0}, R-squared: {1}'.format(y_test_rmse, y_test_score))
'''======== 9. Lasso regression ========'''
import numpy as np
import matplotlib.pyplot as plt  # plotting
from sklearn.linear_model import Lasso, LassoCV, LassoLarsCV  # Lasso regression; LassoCV picks alpha by cross-validation; LassoLarsCV picks alpha by cross-validated least angle regression

#model = Lasso(alpha=0.01)  # adjusting alpha controls how tightly the model fits
# model = LassoCV()  # LassoCV tunes alpha automatically to select the best value
model = LassoLarsCV()  # LassoLarsCV tunes alpha automatically to select the best value
model.fit(x_train, y_train)  # fit the linear regression model
print('Coefficient matrix:\n', model.coef_, model.intercept_)

print('Linear regression model:\n', model)
print('Best alpha:', model.alpha_)  # only available when using LassoCV / LassoLarsCV

# Predict with the model
# Predict on the training and test data separately
y_train_pred = model.predict(x_train)
y_test_pred = model.predict(x_test)
# Compute the RMSE and goodness of fit for each

y_train_rmse = sqrt(metrics.mean_squared_error(y_train, y_train_pred))
y_train_score = model.score(x_train, y_train)
y_test_rmse = sqrt(metrics.mean_squared_error(y_test, y_test_pred))
y_test_score = model.score(x_test, y_test)
print('Training set RMSE: {0}, R-squared: {1}'.format(y_train_rmse, y_train_score))
print('Test set RMSE: {0}, R-squared: {1}'.format(y_test_rmse, y_test_score))
plt.plot(m_log_alphascv,
         model.cv_mse_path_.mean(axis=-1),
         'k',
         label='Average across the folds',
         linewidth=2)
plt.axvline(-np.log10(model.alpha_),
            linestyle='--',
            color='k',
            label='alpha CV')
plt.legend()
plt.xlabel('-log(alpha)')
plt.ylabel('Mean squared error')
plt.title('Mean squared error on each fold')

# MSE from training and test data
from sklearn.metrics import mean_squared_error
train_error = mean_squared_error(training_target, model.predict(training_data))
test_error = mean_squared_error(test_target, model.predict(test_data))
print('training data MSE')
print(train_error)
print('test data MSE')
print(test_error)

# R-square from training and test data
rsquared_train = model.score(training_data, training_target)
rsquared_test = model.score(test_data, test_target)
print('training data R-square')
print(rsquared_train)
print('test data R-square')
print(rsquared_test)
    for pyear in pt_projs_curr.keys():
        ivars2.append([pt_projs_curr[pyear][system] for system in proj_systems])

    x = numpy.array(ivars)
    x2 = numpy.array(ivars2)
    y = numpy.array(depvars)
    model_pt = LassoLarsCV(cv=cv_num)
    model_pt.fit(x,y)

    print("Rough PT model, to choose sample")
    for system, coef in zip(proj_systems, model_pt.coef_):
        print("%40s : %f" % (system, coef))
    print("%40s : %f" % ('intercept', model_pt.intercept_))

    sample_proj_pt_arr = model_pt.predict(x)

    curr_proj_pt_arr = model_pt.predict(x2)

    sample_proj_pt = dict(zip(player_years,sample_proj_pt_arr))
    curr_proj_pt = dict(zip(pt_projs_curr.keys(),curr_proj_pt_arr))

    models = {}
    final_projs = {}

    ivars = {}
    depvars = {}
    ptvars = {}

    player_lists = {}
         model.cv_mse_path_.mean(axis=-1),
         'k',
         label='Average across the folds',
         linewidth=2)
plt.axvline(-np.log10(model.alpha_),
            linestyle='--',
            color='k',
            label='alpha CV')
plt.legend()
plt.xlabel('-log(alpha)')
plt.ylabel('Mean squared error')
plt.title('Mean squared error on each fold')

# MSE from training and test data
from sklearn.metrics import mean_squared_error
train_error = mean_squared_error(tar_train, model.predict(pred_train))
test_error = mean_squared_error(tar_test, model.predict(pred_test))
print('training data MSE')
print(train_error)
print('test data MSE')
print(test_error)

# R-square from training and test data
rsquared_train = model.score(pred_train, tar_train)
rsquared_test = model.score(pred_test, tar_test)
print('training data R-square')
print(rsquared_train)
print('test data R-square')
print(rsquared_test)

#-------------------------------------------------------------------------------
Example #14
    for pyear in pt_projs_curr.keys():
        ivars2.append([pt_projs_curr[pyear][system] for system in proj_systems])

    x = numpy.array(ivars)
    x2 = numpy.array(ivars2)
    y = numpy.array(depvars)
    model_pt = LassoLarsCV(cv=cv_num,fit_intercept=False)
    model_pt.fit(x,y)

    print("Rough PT model, to choose sample")
    for system, coef in zip(proj_systems, model_pt.coef_):
        print("%40s : %f" % (system, coef))
    print("%40s : %f" % ('intercept', model_pt.intercept_))

    sample_proj_pt_arr = model_pt.predict(x)

    curr_proj_pt_arr = model_pt.predict(x2)

    sample_proj_pt = dict(zip(player_years,sample_proj_pt_arr))
    curr_proj_pt = dict(zip(pt_projs_curr.keys(),curr_proj_pt_arr))

    models = {}
    final_projs = {}

    ivars = {}
    depvars = {}
    ptvars = {}

    player_lists = {}
Example #15
            color='k',
            label='alpha CV')
plt.legend()
plt.xlabel('-log(alpha)')
plt.ylabel('Mean squared error')
plt.title('Mean squared error on each fold')
#plt.savefig('Fig02')
#print(pred_train.head())

#plt.show()
print(model.alpha_)
print(model.coef_)
print(model.intercept_)
print(pred_train.head())

np.unique(model.predict(pred_test))
model
########################################################################################
###########################   Part 1 RESPONSE            ##############################
#######################################################################################
#(1)
#a.             TABLE:
#AGE,        SYS,         HRA,          RACE_1,     RACE_2,      RACE_3,      TYP_1,       CPR_1S
#0.01946696, -0.01645696, -0.00596813, -0.2566194,  -0.23701148, -0.04399663, 1.12856158,  0.87772558

#b.             Exponentiating the CPR_1S coefficient gives exp(0.8777) ≈ 2.41, i.e. odds of survival of about 2.41 to 1, which corresponds to roughly a 71% chance of survival (checked in the snippet after this comment block).

#c.             Optimal alpha value from the Lasso section: 0.0013716207531124826

#d.
#The coefficients are:
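
# A quick check of the arithmetic in (b) above, assuming the CPR_1S value in the
# table is a logistic-regression coefficient: exponentiating it gives the odds
# ratio, and odds / (1 + odds) converts that to a probability.
import math

odds_cpr = math.exp(0.87772558)        # ~2.41
prob_cpr = odds_cpr / (1 + odds_cpr)   # ~0.71, i.e. about a 71% chance
print(odds_cpr, prob_cpr)
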
def lasso_single_prediction(city, state, lookback, horizon, predictors):
    clusters = pd.read_pickle('../analysis/clusters_{}.pkl'.format(state))
    data, group = get_cluster_data(geocode=city,
                                   clusters=clusters,
                                   data_types=DATA_TYPES,
                                   cols=predictors)

    target = 'casos_est_{}'.format(city)
    casos_est_columns = ['casos_est_{}'.format(i) for i in group]
    # casos_columns = ['casos_{}'.format(i) for i in group]

    # data = data_full.drop(casos_columns, axis=1)
    data_lag = build_lagged_features(data, lookback)
    data_lag = data_lag.dropna()
    targets = {}
    for d in range(1, horizon + 1):
        if d == 1:
            targets[d] = data_lag[target].shift(-(d - 1))
        else:
            targets[d] = data_lag[target].shift(-(d - 1))[:-(d - 1)]

    X_data = data_lag.drop(casos_est_columns, axis=1)
    X_train, X_test, y_train, y_test = train_test_split(X_data,
                                                        data_lag[target],
                                                        train_size=0.7,
                                                        test_size=0.3,
                                                        shuffle=False)

    if sum(y_train) == 0:
        print('aaaah', city)
        return None
    city_name = get_city_names([city, 0])[0][1]
    preds = np.empty((len(data_lag), horizon))
    metrics = pd.DataFrame(index=('mean_absolute_error',
                                  'explained_variance_score',
                                  'mean_squared_error',
                                  'mean_squared_log_error',
                                  'median_absolute_error', 'r2_score'))
    for d in range(1, horizon + 1):
        model = LassoLarsCV(max_iter=5, n_jobs=-1, normalize=False)

        tgt = targets[d][:len(X_train)]
        tgtt = targets[d][len(X_train):]
        try:
            model.fit(X_train, tgt)
            print(city, 'done')
        except ValueError as err:
            print('-----------------------------------------------------')
            print(city, 'ERRO')
            print('-----------------------------------------------------')
            break
        pred = model.predict(X_data[:len(targets[d])])

        dif = len(data_lag) - len(pred)
        if dif > 0:
            pred = list(pred) + ([np.nan] * dif)
        preds[:, (d - 1)] = pred
        pred_m = model.predict(X_test[:(len(tgtt))])
        metrics[d] = calculate_metrics(pred_m, tgtt)

    metrics.to_pickle('{}/{}/lasso_metrics_{}.pkl'.format(
        'saved_models/lasso', state, city))
    plot_prediction(preds, targets[1], city_name, len(X_train))
    return None
def lasso_single_state_prediction(state, lookback, horizon, predictors):
    ##LASSO WITHOUT CLUSTER SERIES
    cities = list(get_cities_from_state('Ceará'))

    for city in cities:
        if os.path.isfile(
                '/home/elisa/Documentos/InfoDenguePredict/infodenguepredict/models/saved_models/lasso_no_cluster/{}/lasso_metrics_{}.pkl'
                .format(state, city)):
            print(city, 'done')
            continue
        data = combined_data(city, DATA_TYPES)
        data = data[predictors]
        data.drop('casos', axis=1, inplace=True)

        target = 'casos_est'
        data_lag = build_lagged_features(data, lookback)
        data_lag = data_lag.dropna()
        targets = {}
        for d in range(1, horizon + 1):
            if d == 1:
                targets[d] = data_lag[target].shift(-(d - 1))
            else:
                targets[d] = data_lag[target].shift(-(d - 1))[:-(d - 1)]

        X_data = data_lag.drop(target, axis=1)
        X_train, X_test, y_train, y_test = train_test_split(X_data,
                                                            data_lag[target],
                                                            train_size=0.7,
                                                            test_size=0.3,
                                                            shuffle=False)

        city_name = get_city_names([city, 0])[0][1]
        preds = np.empty((len(data_lag), horizon))
        metrics = pd.DataFrame(index=('mean_absolute_error',
                                      'explained_variance_score',
                                      'mean_squared_error',
                                      'mean_squared_log_error',
                                      'median_absolute_error', 'r2_score'))
        for d in range(1, horizon + 1):
            model = LassoLarsCV(max_iter=15, n_jobs=-1, normalize=False)

            tgt = targets[d][:len(X_train)]
            tgtt = targets[d][len(X_train):]
            try:
                model.fit(X_train, tgt)
            except ValueError as err:
                print('-----------------------------------------------------')
                print(city, 'ERRO')
                print('-----------------------------------------------------')
                break
            pred = model.predict(X_data[:len(targets[d])])

            dif = len(data_lag) - len(pred)
            if dif > 0:
                pred = list(pred) + ([np.nan] * dif)
            preds[:, (d - 1)] = pred
            pred_m = model.predict(X_test[:(len(tgtt))])
            metrics[d] = calculate_metrics(pred_m, tgtt)

            metrics.to_pickle('{}/{}/lasso_metrics_{}.pkl'.format(
                'saved_models/lasso_no_cluster', state, city))
        plot_prediction(preds,
                        targets[1],
                        city_name,
                        len(X_train),
                        path='lasso_no_cluster')
        # plt.show()
    return None
# plot mean square error for each fold
m_log_alphascv = -np.log10(model.cv_alphas_)
plt.figure()
plt.plot(m_log_alphascv, model.cv_mse_path_, ':')
plt.plot(m_log_alphascv, model.cv_mse_path_.mean(axis=-1), 'k',
         label='Average across the folds', linewidth=2)
plt.axvline(-np.log10(model.alpha_), linestyle='--', color='k',
            label='alpha CV')
plt.legend()
plt.xlabel('-log(alpha)')
plt.ylabel('Mean squared error')
plt.title('Mean squared error on each fold')
         

# MSE from training and test data
from sklearn.metrics import mean_squared_error
train_error = mean_squared_error(tar_train, model.predict(pred_train))
test_error = mean_squared_error(tar_test, model.predict(pred_test))
print ('training data MSE')
print(train_error)
print ('test data MSE')
print(test_error)

# R-square from training and test data
rsquared_train=model.score(pred_train,tar_train)
rsquared_test=model.score(pred_test,tar_test)
print ('training data R-square')
print(rsquared_train)
print ('test data R-square')
print(rsquared_test)
import numpy as np
import pandas as pd
from sklearn.linear_model import LassoLarsCV
from sklearn.model_selection import train_test_split

# NOTE: Make sure that the class is labeled 'target' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE',
                        sep='COLUMN_SEPARATOR',
                        dtype=np.float64)
features = tpot_data.drop('target', axis=1).values
training_features, testing_features, training_target, testing_target = \
            train_test_split(features, tpot_data['target'].values, random_state=55)

# Average CV score on the training set was:-832843188.6270168
exported_pipeline = LassoLarsCV(normalize=True)

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
class determine_attribute_quality(object):
    
    def __init__(self,red,white):
        self.red=red
        self.white=white
    
    def remove_column_spaces(self,wine_data):
        wine_data.columns = [x.strip().replace(' ', '_') for x in wine_data.columns]
        return wine_data
    
    def regression(self,wine_data):

        self.pred = wine_data[['density', 
                               'alcohol', 
                               'sulphates', 
                               'pH', 
                               'volatile_acidity', 
                               'chlorides', 
                               'fixed_acidity',
                               'citric_acid', 
                               'residual_sugar', 
                               'free_sulfur_dioxide', 
                               'total_sulfur_dioxide']]
        self.predictors = self.pred.copy()
        self.targets = wine_data.quality

        # Standardize predictors to mean 0 and unit variance
        self.predictors = pd.DataFrame(preprocessing.scale(self.predictors))
        self.predictors.columns = self.pred.columns
    
        # Split into Training and Testing sets
        (self.pred_train, 
         self.pred_test, 
         self.target_train, 
         self.target_test) = train_test_split(self.predictors, 
                                             self.targets, 
                                             test_size=.2, 
                                             random_state=123)

        # Lasso Regression Model
        self.model = LassoLarsCV(cv=10, precompute=False).fit(self.pred_train, self.target_train)

        print('Predictors and their Regression coefficients:')
        d = dict(zip(self.predictors.columns, self.model.coef_))
        for k in d:
            print(k, ':', d[k])

        # Plot Coefficient Progression
        m_log_alphas = -np.log10(self.model.alphas_)
    
        plt.plot(m_log_alphas, self.model.coef_path_.T)
        print('\nAlpha:', self.model.alpha_)
        plt.axvline(-np.log10(self.model.alpha_), linestyle="dashed", color='k', label='alpha CV')
        plt.ylabel("Regression coefficients")
        plt.xlabel("-log(alpha)")
        plt.title('Regression coefficients progression for Lasso paths')
        plt.show()

        # Plot MSE for each fold
        m_log_alphascv = -np.log10(self.model.cv_alphas_)
        plt.plot(m_log_alphascv, self.model.cv_mse_path_, ':')
        plt.plot(m_log_alphascv, self.model.cv_mse_path_.mean(axis=-1), 'k', label='Average across the folds', linewidth=2)
        plt.legend()
        plt.xlabel('-log(alpha)')
        plt.ylabel('Mean Squared Error')		
        plt.title('Mean Squared Error on Each Fold')
        plt.show()

        # Mean Squared Error from Training and Test data
        self.train_error = mean_squared_error(self.target_train, self.model.predict(self.pred_train))
        self.test_error = mean_squared_error(self.target_test, self.model.predict(self.pred_test))
        print('\nMean squared error for training data:', self.train_error)
        print('Mean squared error for test data:', self.test_error)

        self.rsquared_train = self.model.score(self.pred_train, self.target_train)
        self.rsquared_test = self.model.score(self.pred_test, self.target_test)
        print('\nR-square for training data:', self.rsquared_train)
        print('R-square for test data:', self.rsquared_test)
plt.title('Regression Coefficients Progression for Lasso Paths')

# plot mean square error for each fold
m_log_alphascv = -np.log10(model.cv_alphas_)
plt.figure()
plt.plot(m_log_alphascv, model.cv_mse_path_, ':')
plt.plot(m_log_alphascv, model.cv_mse_path_.mean(axis=-1), 'k', label='Average across the folds', linewidth=2)
plt.axvline(-np.log10(model.alpha_), linestyle='--', color='k', label='alpha CV')
plt.legend()
plt.xlabel('-log(alpha)')
plt.ylabel('Mean squared error')
plt.title('Mean squared error on each fold')


# MSE from training and test data
from sklearn.metrics import mean_squared_error
train_error = mean_squared_error(training_target, model.predict(training_data))
test_error = mean_squared_error(test_target, model.predict(test_data))
print ('training data MSE')
print(train_error)
print ('test data MSE')
print(test_error)

# R-square from training and test data
rsquared_train=model.score(training_data, training_target)
rsquared_test=model.score(test_data, test_target)
print ('training data R-square')
print(rsquared_train)
print ('test data R-square')
print(rsquared_test)
                        )

                errors = pd.Series()

                if run_type == 'train':
                    target_validate = target_train.copy()
                    baseline_target_validate = baseline_target_train.copy()

                if run_type == 'test':
                    target_validate = target_test.copy()
                    baseline_target_validate = baseline_target_test.copy()

                for key in metrics.keys():
                    # Transfer Error
                    X = np.asarray(target_validate['rep'].values.tolist())
                    yhat = model_trained_on_S.predict(X)
                    np.save(
                        f"./predictions/transfer__{d}__{s}__{rep}__{run_type}__predictions_S_T.npy",
                        yhat)
                    errors.loc[f'{key}_S_T'] = metrics[key](
                        target_validate['target'], yhat)

                    # In-domain Error
                    X = np.asarray(target_validate['rep'].values.tolist())
                    yhat = model_trained_on_T.predict(X)
                    np.save(
                        f"./predictions/transfer__{d}__{s}__{rep}__{run_type}__predictions_T_T.npy",
                        yhat)
                    errors.loc[f'{key}_T_T'] = metrics[key](
                        target_validate['target'], yhat)
Example #23
        g.set_ylabel("Features", fontsize=12)
        g.tick_params(labelsize=9)
        g.set_title(name + " regression coefs")
        nregressors += 1
plt.tight_layout()
plt.show()
plt.gcf().clear()

# Here are the feature coefficients (showing only the top 40 features). It appears that GrLivArea has an important weight in the 4 models.
#
# According to the RMSE scores, I chose the LassoCV, LassoLarsCV and ElasticNetCV models.

# In[ ]:

Y_pred_lassocv = np.expm1(lassocv.predict(test))
Y_pred_lassolarscv = np.expm1(lassolarscv.predict(test))
Y_pred_elasticnetcv = np.expm1(elasticnetcv.predict(test))

# Don't forget to transform the log1p(SalePrice) to their real values using expm1.

# ### 6.2 Tree based modeling
# #### 6.2.1 Cross validate models

# Next I wanted to combine the linear models with tree-based models. I've tested the random forest; it shows poor performance (~0.14 with hyperparameter tuning).
#
# I decided to focus on the Kaggle "darling" algorithm :p XGBoost, LightGBM and the Gradient Boosting algorithm.
#
# Thanks to the excellent @Serigne kernel, I got near-optimal parameters for these 3 algorithms.
#
# This spares us a lot of hyperparameter tuning :D!
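#
# As a minimal sketch of that cross-validation step (the hyperparameters and the
# X_train / y_train names below are placeholders, not values taken from this
# kernel), the tree-based models can be scored with the same RMSE criterion as
# the linear ones:

# In[ ]:

import numpy as np
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import cross_val_score

gbr = GradientBoostingRegressor(n_estimators=3000, learning_rate=0.05,
                                max_depth=4, random_state=5)
# 5-fold CV RMSE on the training matrix (placeholder names X_train / y_train)
rmse_cv = np.sqrt(-cross_val_score(gbr, X_train, y_train,
                                   scoring='neg_mean_squared_error', cv=5))
print(rmse_cv.mean(), rmse_cv.std())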
Example #24
test_data = load_data.load_supervised(1986, 1999, args.lat, args.lon, 50, which='test')

lasso_file = os.path.join(os.path.dirname(__file__), "models/lasso_%2.2f_%2.2f.pkl" % (args.lat, args.lon))
if os.path.exists(lasso_file):
    print("Reading Lasso model from file")
    L = pickle.load(open(lasso_file, 'rb'))
else:
    print("Fitting Lasso")
    L = LassoLarsCV(cv=5)
    L.fit(train_data.X, train_data.y[:,0])
    pickle.dump(L, open(lasso_file, 'wb'))


## Print Fit stats
print "Alpha", L.alpha_ 
print "Training Pearson Corr:", pearsonr(train_data.y[:,0], L.predict(train_data.X))
print "Training Spearman Corr:", spearmanr(train_data.y[:,0], L.predict(train_data.X))

yhat = L.predict(test_data.X)
print "Pearson Corr", pearsonr(test_data.y[:,0], yhat)
print "Spearman Corr", spearmanr(test_data.y[:,0], yhat)
print "SSE", sum((yhat - test_data.y[:,0])**2)


## Compute monthly data
import datetime
import pandas

t0 = datetime.date(1986, 1, 1)
t1 = datetime.date(1999, 12, 31)
Example #25
    # Create the pipeline for the model
    est = LassoLarsCV()

    #fit model
    # pdb.set_trace()
    t0 = time.time()
    est.fit(X[train],y[train])
    #get fit time
    runtime = time.time()-t0
    # print("training done")
    # pdb.set_trace()
    # predict on test set

    y_true = y[test]
    y_pred = est.predict(X[test])

    if problem in scale_these:
        test_mse = mean_squared_error(sc_y.inverse_transform(y_true),
                                      sc_y.inverse_transform(y_pred))
        test_r2 = r2_score(sc_y.inverse_transform(y_true),
                                    sc_y.inverse_transform(y_pred))
    else:
        test_mse = mean_squared_error(y_true,y_pred)
        test_r2 = r2_score(y_true,y_pred)

    # print results
    out_text = '\t'.join([dataset.split('/')[-1][2:-4],
                          'lasso',
                          str(i),
                          str(test_mse),
def main():
    u"""Main function for assignment 03."""
    # Load prepared data.
    df = return_proc_and_transf_data_set()
    # Mass is already included as mass in SI units.
    df.drop(['carat'], inplace=True, axis=1)
    # Those are dummy variables not needed in our data set anymore.
    df.drop(['price_expensive', 'price_expensive_binary'], inplace=True, axis=1)

    # A bit of error checking.
    if df.isnull().sum().sum() != 0:
        raise ValueError('Your data has unintended nulls.')

    # Cast our dataframe into float type.
    df = df.astype('float64')

    # Scale our dataframe so that the sparsity control (the L1 penalty) is not
    # biased against variables that happen to be on larger scales.
    print('Prior to scaling:')
    print(df.describe())
    df = df.apply(preprocessing.scale)
    print('After scaling:')
    print(df.describe())
    print_separator()
    if (df.mean().abs() > 1e-3).sum() > 0:
        raise ValueError('Scaling of your dataframe went wrong.')

    # Split into training and testing sets
    # The predictors should not include any price variable, since price was used
    # to create the output variable.
    predictors = [x for x in df.columns.tolist() if 'price' not in x]
    print('Input variables:')
    pprint(predictors, indent=4)
    input_variables = df[predictors].copy()
    output_variable = df.price.copy()  # Categorized price
    print_separator()

    input_training, input_test, output_training, output_test = train_test_split(
        input_variables, output_variable, test_size=0.3, random_state=0)

    # A few words about the LassoLarsCV:

        # LASSO: least absolute shrinkage and selection operator (discussed in
        # the course material).

        # LARS: least angle regression: an algorithm for fitting linear regression
        # models to high-dimensional data (i.e., data with many predictors).
        # Compared to simple LASSO this model uses the LARS algorithm instead of
        # the 'vanilla' coordinate descent of simple LASSO.

        # CV: cross validation: this sets the alpha parameter (referred to as the
        # lambda parameter in the course video) by cross validation.
        # In simple LARS this alpha (the penalty factor) is an input of the
        # function.
        # The alpha parameter controls the degree of sparsity of the
        # coefficients estimated.
        # If alpha = zero then the method is the same as OLS.
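
        # Illustrative note (an addition, not part of the original assignment):
        # with the penalty fixed at zero, LassoLars reduces to ordinary least
        # squares, so its coefficients should essentially match LinearRegression
        # on the same data. A quick way to check, if desired:
        #
        #   from sklearn.linear_model import LassoLars, LinearRegression
        #   ols_via_lars = LassoLars(alpha=0.0).fit(input_training, output_training)
        #   ols = LinearRegression().fit(input_training, output_training)
        #   print(np.allclose(ols_via_lars.coef_, ols.coef_, atol=1e-6))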

    model = LassoLarsCV(
        cv=10,  # Number of folds.
        precompute=False,  # Do not precompute Gram matrix.
        # precompute=True,  # Do not precompute Gram matrix.
        # verbose=3,
    ).fit(input_training, output_training)

    dict_var_lin_coefs = dict(zip(
        predictors,
        model.coef_))

    print('Result of linear model:')
    pprint(sorted([(k, v) for k, v in dict_var_lin_coefs.items()],
                  key=lambda x: abs(x[1]))
           )
    print_separator()

    # Plot coefficient progression.
    # TODO: plot those on 4 different subplots.
    model_log_alphas = -np.log10(model.alphas_)
    ax = plt.gca()
    plt.plot(model_log_alphas, model.coef_path_.T)
    plt.axvline(-np.log10(model.alpha_), linestyle='--', color='k',
                label='alpha CV')
    plt.ylabel('Regression Coefficients')
    plt.xlabel('-log(alpha)')
    plt.title('Regression Coefficients Progression for Lasso Paths')
    plt.legend(predictors,
        loc='best',)
    plt.tight_layout()
    plt.savefig('result00.png', dpi=600)
    plt.close()
    # TODO: why are the coefficients in the result very different than the
    # coefficient path?
    #
    # There seems to be a scaling of the coefficient paths by an arbitrary,
    # roughly constant factor (194 in this case).
    #
    # print('Resulting alpha is not different than path alpha (difference):')
    # difference = model.alpha_ - model.alphas_
    # pprint(model.alpha_ - model.alphas_)
    # print('Resulting coefficients are very different than path coefficients (difference):')
    # pprint(model.coef_ - model.coef_path_.T)
    # print_separator()


    # Plot mean square error for each fold.
    # To avoid a divide-by-zero warning in log10, map any zero alpha to np.inf.
    model.cv_alphas_ = list(
        map(lambda x: x if x != 0 else np.inf,
            model.cv_alphas_))
    model_log_alphas = -np.log10(model.cv_alphas_)
    plt.figure()
    plt.plot(model_log_alphas, model.cv_mse_path_, ':')
    plt.plot(model_log_alphas, model.cv_mse_path_.mean(axis=-1), 'k',
            label='Average across the folds', linewidth=2)
    plt.axvline(-np.log10(model.alpha_), linestyle='--', color='k',
                label='alpha CV')
    plt.xlabel('-log(alpha)')
    plt.ylabel('Mean squared error')
    plt.title('Mean squared error on each fold')
    plt.legend()
    plt.tight_layout()
    plt.savefig('result01.png', dpi=600)
    plt.close()

    # Mean squared error of our model.
    train_error = mean_squared_error(output_training,
                                     model.predict(input_training))
    test_error = mean_squared_error(output_test,
                                    model.predict(input_test))
    print ('Training data MSE')
    print(train_error)
    print ('Test data MSE')
    print(test_error)
    print_separator()


    # R-square from training and test data.
    rsquared_train = model.score(
        input_training,
        output_training)
    rsquared_test = model.score(
        input_test,
        output_test)
    print ('Training data R-square')
    print(rsquared_train)
    print ('Test data R-square')
    print(rsquared_test)
    print_separator()

    return {'model': model, 'dataframe': df}
def lasso(X, y, value):
    regressor = LassoLarsCV(cv=10, precompute=False)
    regressor.fit(X, y)
    y_pred = regressor.predict(value)
    return y_pred
Example #28
                doc = content[0]
                score = float(content[1])
                test_set.setdefault(doc, score)
                if cnt == 0:
                    X_test = np.array(weight[idx[doc]]).reshape(cnt + 1, d)
                    Y_test = np.array([score]).reshape(cnt + 1, 1)
                else:
                    X_test = np.concatenate((X_test, np.array(weight[idx[doc]]).reshape(1, d)), axis=0).reshape(cnt + 1, d)
                    Y_test = np.concatenate((Y_test, np.array([score]).reshape(1, 1)), axis=0).reshape(cnt + 1, 1)
                cnt += 1
                line = next(f)



            print('predicting...')
            Y_hat = clflars.predict(X_test)                                                      #predict
            MAE = np.mean(np.abs(Y_hat - Y_test))
            print('MAE: %f' % MAE)
            # print(Y_hat)

            for idx, doc in enumerate(test_set.keys()):
                if idx >= cnt:
                    break
                res.write(QID + ' ' + doc + ' ')
                res.write(str(float(Y_hat[idx])))
                res.write('\n')
            MAE_TOTAL += MAE / 50
            print(QID + ' MAE: %f' % MAE)
            print('===================================\n')
        res.write('\nMAE: %f' % MAE_TOTAL)
Example #29
    if y_trainset[i] >= 0.01 and y_trainset[i] < 1:
        X_trainset_1.append(X_trainset[i])
        y_trainset_1.append(y_trainset[i])
reg_1 = LassoLarsCV(max_n_alphas=10, positive=True)
reg_1.fit(X_trainset_1, y_trainset_1)

## Prediction
mse = 0.0
for i in range(0, y_testset.__len__(), 1):
    predict_x = 0.0
    test_x = X_testset[i]
    test_x = scaler.transform(test_x)
    one_classify_pro = classify_model_0003.predict_proba(test_x)
    probe = one_classify_pro[0]
    if probe[0] - probe[1] > 0.3:
        predict_x = reg_0003.predict(test_x)
    elif probe[1] - probe[0] > 0.3:
        two_classify_pro = classify_model_001.predict_proba(test_x)
        probe_two = two_classify_pro[0]
        if probe_two[0] - probe_two[1] > 0.3:
            predict_x = reg_001.predict(test_x)
        elif probe_two[1] - probe_two[0] > 0.3:
            predict_x = reg_1.predict(test_x)
        else:
            predict_x = probe_two[0] * reg_001.predict(
                test_x) + probe_two[1] * reg_1.predict(test_x)
    else:
        predict_x = probe[0] * reg_0003.predict(
            test_x) + probe[1] * reg_001.predict(test_x)
    print(predict_x, y_testset[i])
    mse += abs(predict_x - y_testset[i])
Example #30
lines = ''

from sklearn.datasets import load_svmlight_file
filename = "data/trainingset/oneThousandProperties.txt"
data = load_svmlight_file(filename)
X_testset, y_testset = data[0], data[1]
X_testset = X_testset.toarray()

for i in range(0, y_testset.__len__(), 1):
    predict_x = 0.0
    test_x = X_testset[i]
    test_x = scaler.transform(test_x)
    one_classify_pro = classify_model_0003.predict_proba(test_x)
    probe = one_classify_pro[0]
    if probe[0] - probe[1] > 0.4:
        predict_x = reg_0003.predict(test_x)
    elif probe[1] - probe[0] > 0.4:
        two_classify_pro = classify_model_001.predict_proba(test_x)
        probe_two = two_classify_pro[0]
        if probe_two[0] - probe_two[1] > 1:
            predict_x = reg_001.predict(test_x)
        elif probe_two[1] - probe_two[0] > 1:
            predict_x = reg_1.predict(test_x)
        else:
            if probe_two[1] > probe_two[0]:
                predict_x = 0.000 * probe_two[0] * reg_001.predict(
                    test_x) + probe_two[1] * reg_1.predict(test_x)
            else:
                predict_x = probe_two[0] * reg_001.predict(
                    test_x) + 0.45 * probe_two[1] * reg_1.predict(test_x)
Example #31
     normalize=False, scoring=None, store_cv_values=False)
 RG.fit(trainx[k][0:int(len(trainx[k])/2)],trainy[k][1:int(len(trainy[k])/2)+1])
 result=RG.predict(trainx[k])
 acc = 0
 for i in range(int(len(trainx[k])/2),len(result)):
     acc=acc+(result[i-1]/trainy[k][i]-trainy[k][i]/trainy[k][i])**2    
 acc=acc/int(len(result)/2)
 acc=acc**(1/2.0)
 print(1-acc)
 
 
 LL = LassoLarsCV(copy_X=True, cv=None, eps=2.2204460492503131e-16,
       fit_intercept=True, max_iter=500, max_n_alphas=1000, n_jobs=1,
       normalize=True, positive=False, precompute='auto', verbose=False)
 LL.fit(trainx[k][0:int(len(trainx[k])/2)],trainy[k][1:int(len(trainy[k])/2)+1])
 result=LL.predict(trainx[k])
 acc = 0
 for i in range(int(len(trainx[k])/2),len(result)):
     acc=acc+(result[i-1]/trainy[k][i]-trainy[k][i]/trainy[k][i])**2    
 acc=acc/int(len(result)/2)
 acc=acc**(1/2.0)
 print(1-acc)
 
 
 LSC = LassoCV(alphas=None, copy_X=True, cv=None, eps=0.001, fit_intercept=True,
     max_iter=1000, n_alphas=100, n_jobs=1, normalize=False, positive=False,
     precompute='auto', random_state=None, selection='cyclic', tol=0.0001,
     verbose=False)
 LSC.fit(trainx[k][0:int(len(trainx[k])/2)],trainy[k][1:int(len(trainy[k])/2)+1])
 result=LSC.predict(trainx[k])
 acc = 0
Example #32
# Prediction for each clusters
from sklearn.linear_model import LassoLarsCV

results = pd.DataFrame(columns=["Id", "SalePrice"])
for cluster in range(0, kmeans.n_clusters):
    X_clus = total[total["Cluster"] == cluster].drop("SalePrice", axis=1)
    y_clus = total[total["Cluster"] == cluster]
    mean_clus = np.mean(y_clus["SalePrice"])
    y_clus = y_clus["SalePrice"] - mean_clus
    model = LassoLarsCV(cv=3, max_iter=199999999).fit(X_clus, y_clus)

    X_test_clus = X_test[X_test["Cluster"] == cluster]
    X_test_id = X_test_clus.loc[:, X_test_clus.columns == 'Id']

    pred = model.predict(X_test_clus.drop("Id", axis=1))
    X_test_id.loc[:, 1] = pred
    X_test_id.columns = ["Id", "SalePrice"]
    X_test_id["SalePrice"] += mean_clus
    results = pd.concat([results, X_test_id])

test_final = results
# Re-mean the prediction
#test_final["SalePrice"] += mean_y

test_final.head(5)
test_final.tail(5)
# Rename
submission = test_final

submission.to_csv(r'Submission_Daphne.csv', index=False)
Example #33
plt.xlabel('-log(alpha)')
plt.title('Regression Coefficients Progression for Lasso Paths of Selected Variables')

#plot mean square error for each fold
m_log_alphascv = -np.log10(model.cv_alphas_)
plt.figure()
plt.plot(m_log_alphascv, model.cv_mse_path_, ':')
plt.plot(m_log_alphascv, model.cv_mse_path_.mean(axis=-1), 'k',
            label='Average across the folds',linewidth=2)
plt.axvline(-np.log10(model.alpha_), linestyle='--', color='k', label='alpha CV')
plt.legend()
plt.xlabel('-log(alpha)')
plt.ylabel('Mean squared error')
plt.title('Mean squared error on each fold')
plt.xlim(1.95,4.0)

#MSE from training and test data
training_error = mean_squared_error(target_train,model.predict(predictors_train))
test_error = mean_squared_error(target_test,model.predict(predictors_test))
print('Training data MSE')
print(training_error)
print('Test data MSE')
print(test_error)

#R-squared from training and test data
rsquared_train=model.score(predictors_train,target_train)
rsquared_test=model.score(predictors_test,target_test)
print('Training data R**2')
print(rsquared_train)
print('Test data R**2')
print(rsquared_test)
Example #34
#train["outlier"] = LocalOutlierFactor().fit_predict(train)
#test["outlier"] = LocalOutlierFactor().fit_predict(test)

########################################################################################################################
##################                             LEAST ANGLE REGRESSION                                 ##################
########################################################################################################################

from sklearn.linear_model import LassoLarsCV
from sklearn.linear_model import Lasso
from sklearn.model_selection import GridSearchCV
#import xgboost as xgb

y_LARS_train = train["SalePrice"] - np.mean(train["SalePrice"])
model = LassoLarsCV(cv=10, max_iter=199999999).fit(X_train, y_LARS_train)
model.alpha_
pred = model.predict(X_test)
pred += np.median(train["SalePrice"])
from modules.modelaccuracy import allyouneedtoknow
allyouneedtoknow(pred, y_test)

from sklearn import linear_model
y_other_train = np.log1p(train["SalePrice"])

from sklearn.svm import SVR
svr_rbf = SVR(kernel='rbf', C=100, gamma=0.1, epsilon=.1)
svr_lin = SVR(kernel='linear', C=100, gamma='auto')
svr_poly = SVR(kernel='poly',
               C=100,
               gamma='auto',
               degree=3,
               epsilon=.1,
GB.fit(x_train, y_train)
y_pred = GB.predict(x_test)
mape = mean_absolute_percentage_error(y_test, y_pred)
print('GB', mape)
#################MLPRegressor##################ANN
from sklearn.neural_network import MLPRegressor

Neural_MLP = MLPRegressor(hidden_layer_sizes=(50, ), max_iter=250)
#Neural_MLP = MLPRegressor()
Neural_MLP.fit(x_train, y_train)  #Fitting the Model
y_pred = Neural_MLP.predict(x_test)  #Predicting on Test DataSet
mape = mean_absolute_percentage_error(y_test, y_pred)
print('MAPE', mape)
#################################LASSO###############
from sklearn.linear_model import LassoLarsCV
#from sklearn import preprocessing
Lasso_model = LassoLarsCV()
Lasso_model.fit(x_train, y_train)
y_pred = Lasso_model.predict(
    x_test)  # we are predicting the values from the test dataset
mape = mean_absolute_percentage_error(y_test, y_pred)
print('MAPE', mape)

#########linear regression
from sklearn.linear_model import LinearRegression
lm = LinearRegression()  #creatingan object of linear regression model
lm.fit(x_train, y_train)  #running the model.
y_pred = lm.predict(x_test)
mape = mean_absolute_percentage_error(y_test, y_pred)
print('MAPE', mape)
Example #36
         model.cv_mse_path_.mean(axis=-1),
         'k',
         label='Average across the folds',
         linewidth=2)
plt.axvline(-np.log10(model.alpha_),
            linestyle='--',
            color='k',
            label='alpha CV')
plt.legend()
plt.xlabel('-log(alpha)')
plt.ylabel('Mean squared error')
plt.title('Mean squared error on each fold')
# There is variability across the individual CV folds, but variables are added in the same pattern:
# MSE decreases rapidly and then levels off to a point where adding more predictors no longer reduces it

# 3.5 MSE from training and test data
from sklearn.metrics import mean_squared_error
train_error = mean_squared_error(tar_train, model.predict(pred_train))
test_error = mean_squared_error(tar_test, model.predict(pred_test))
print('training data MSE')
print(train_error)
print('test data MSE')
print(test_error)  #similar accuracy

# 3.6 R-square from training and test data
rsquared_train = model.score(pred_train, tar_train)
rsquared_test = model.score(pred_test, tar_test)
print('training data R-square')
print(rsquared_train)
print('test data R-square')
print(rsquared_test)  #more accurate than training data
Example #37
plt.title('Regression Coefficients Progression for Lasso Paths')
plt.show()

m_log_alphascv = -np.log10(model.cv_alphas_)
plt.figure()
plt.plot(m_log_alphascv, model.cv_mse_path_, ':')
plt.plot(m_log_alphascv, model.cv_mse_path_.mean(axis=-1), 'k',
         label='Average across the folds', linewidth=2)
plt.axvline(-np.log10(model.alpha_), linestyle='--', color='k',
            label='alpha CV')
plt.legend()
plt.xlabel('-log(alpha)')
plt.ylabel('Mean squared error')
plt.title('Mean squared error on each fold')
plt.show()         

from sklearn.metrics import mean_squared_error
train_error = mean_squared_error(tar_train, 
                                 model.predict(pred_train))
test_error = mean_squared_error(tar_test, 
                                model.predict(pred_test))
print ('training data MSE %s'%(train_error))
print ('test data MSE %s'%(test_error))

rsquared_train = model.score(pred_train,
                             tar_train)
rsquared_test=model.score(pred_test,
                          tar_test)
print ('training data R-square %s'%(rsquared_train))
print ('test data R-square %s'%(rsquared_test))