def train_model_cv(self,
                       train_file,
                       normalize,
                       is_bool_value,
                       is_percentage,
                       cv=10,
                       save_model=False):
        # training
        features_array, label_array, feature_names = self.get_features_array_label_array_from_file(
            train_file,
            normalize=normalize,
            is_bool_value=is_bool_value,
            is_percentage=is_percentage)
        # TODO: you can change the model here. For now we use 10-fold cross-validation for the model.
        # self.model = LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)
        # self.model = linear_model.Lasso(alpha = 0.1)
        self.model = linear_model.LassoCV(cv=cv,
                                          normalize=False,
                                          verbose=True,
                                          max_iter=10000)
        print("Model Settings:", self.model)
        self.model.fit(features_array, label_array)

        self.print_linear_regression_formular(feature_names)
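# For reference, a minimal standalone sketch of the same LassoCV fit on synthetic
# data (illustrative only; the enclosing class and its data loader are not shown above):
import numpy as np
from sklearn import linear_model

rng = np.random.RandomState(0)
features_array = rng.normal(size=(100, 4))
label_array = features_array @ np.array([2.0, 0.0, -1.0, 0.5]) + 0.1 * rng.normal(size=100)

model = linear_model.LassoCV(cv=10, max_iter=10000)
model.fit(features_array, label_array)
print("chosen alpha:", model.alpha_, "coefficients:", model.coef_)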
Example #2
def model_lasso(s, t, s_, t_, flagCV, nFeat):
    if flagCV:
        #bad r2
        #http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LassoCV.html#sklearn.linear_model.LassoCV
        clf = sklm.LassoCV(eps=0.001, n_alphas=100, alphas=None, fit_intercept=True, normalize=False, \
                           precompute='auto', max_iter=1000, tol=0.0001, copy_X=True, cv=None, \
                           verbose=False, n_jobs=1, positive=False, random_state=None, \
                           selection='cyclic')
    else:
        #http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.Lasso.html#sklearn.linear_model.Lasso
        clf = sklm.Lasso(alpha=1.0, fit_intercept=True, normalize=False, precompute=False, \
                         copy_X=True, max_iter=1000, tol=0.0001, warm_start=False, positive=False,\
                         random_state=None, selection='cyclic')
    clf.fit(s, t)
    print('coeffs =', clf.coef_, ' intercept =', clf.intercept_)

    feature_imp = clf.coef_
    feature_imp_ind_neg = feature_imp.argsort()[0:nFeat]
    feature_imp_ind_pos = feature_imp.argsort()[-nFeat:][::-1]
    print('feature_imp_ind_neg=', feature_imp_ind_neg, 'feature_imp_ind_pos=', feature_imp_ind_pos)

    r2_train = clf.score(s, t)
    r2_test = clf.score(s_, t_)
    print('r2_train=', r2_train, ' r2_test=', r2_test)
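# A usage sketch for model_lasso on toy NumPy arrays (illustrative only; assumes
# sklearn.linear_model is imported as sklm and that the installed sklearn still
# accepts the normalize argument used in the function above):
import numpy as np
import sklearn.linear_model as sklm

rng = np.random.RandomState(0)
true_coef = np.array([1.5, -2.0, 0.0, 0.0, 0.5])
s = rng.normal(size=(100, 5))                       # training features
t = s @ true_coef + 0.1 * rng.normal(size=100)      # training target
s_ = rng.normal(size=(50, 5))                       # test features
t_ = s_ @ true_coef + 0.1 * rng.normal(size=50)     # test target

model_lasso(s, t, s_, t_, flagCV=True, nFeat=2)     # LassoCV branch
model_lasso(s, t, s_, t_, flagCV=False, nFeat=2)    # plain Lasso branch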
Example #3
def franke_lasso(n=10000, eps=0.0):
    max_order = 5
    x = rng.uniform(size=n)
    y = rng.uniform(size=n)
    err = eps * rng.normal(size=n)

    x_train = x[:int(n / 2)]
    y_train = y[:int(n / 2)]
    err_train = err[:int(n / 2)]
    x_valid = x[int(n / 2):]
    y_valid = y[int(n / 2):]
    err_valid = err[int(n / 2):]

    z_train = FrankeFunction(x_train, y_train) + err_train
    z_valid = FrankeFunction(x_valid, y_valid) + err_valid
    xb = np.column_stack(sol_tup(x_train, y_train, max_order))

    lasso = linear_model.LassoCV(max_iter=100000, cv=5)
    lasso.fit(xb, z_train)
    predl = lasso.predict(np.column_stack(sol_tup(x_valid, y_valid,
                                                  max_order)))

    return lasso.coef_, mean_squared_error(z_valid,
                                           predl), r2_score(z_valid, predl)
Example #4
def fs_lasso_cv(X,
                y,
                feat_list,
                n_alphas=1000,
                cv=10,
                tol=0.00001,
                max_iter=10000,
                hard_shrink=None):
    '''Wrapper function to build a LassoCV model from sklearn and return important features'''

    lcv = linear_model.LassoCV(n_jobs=max(1,
                                          mp.cpu_count() - 1),
                               n_alphas=n_alphas,
                               cv=cv,
                               tol=tol,
                               max_iter=max_iter)
    coefs = lcv.fit(X, y).coef_

    # force shrinkage to zero if hard_shrink is provided
    if hard_shrink is not None: np.place(coefs, np.abs(coefs) < hard_shrink, 0)

    selected_feats = list(it.compress(feat_list, coefs))

    return selected_feats
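# A minimal usage sketch for fs_lasso_cv (illustrative only; assumes the module-level
# imports the function relies on: numpy as np, multiprocessing as mp, itertools as it,
# and sklearn's linear_model):
import itertools as it
import multiprocessing as mp
import numpy as np
from sklearn import linear_model
from sklearn.datasets import make_regression

X, y = make_regression(n_samples=200, n_features=10, n_informative=3, random_state=0)
feat_list = ['f%d' % i for i in range(X.shape[1])]

selected = fs_lasso_cv(X, y, feat_list, n_alphas=100, cv=5, hard_shrink=1e-3)
print('selected features:', selected)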
def lasso_regression(X_train, y_train, X_test, y_test, normalize=False):
    print("-------------------------- Lasso Regression")
    clf = linear_model.LassoCV(alphas=np.arange(0.1, 2, 0.1), max_iter=5000)
    clf.fit(X_train, y_train)

    # Make predictions using the testing set
    y_pred = clf.predict(X_test)

    # The intercept
    print("Intercept: %.4f" % clf.intercept_)
    # The mean squared error
    print("Mean squared error: %.2f"
          % mean_squared_error(y_test, y_pred))
    # Explained variance score: 1 is perfect prediction
    print('Coefficient of determination(R^2): %.2f' % r2_score(y_test, y_pred))
    # The coefficients
    cols = X_train.columns.tolist()
    coef = clf.coef_.tolist()
    coef = list(zip(cols, coef))
    df_coef = pd.DataFrame.from_records(coef)
    print('Coefficients: \n', df_coef.T)
    print('Alpha: \n', clf.alpha_)

    return clf
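# A minimal usage sketch for lasso_regression on synthetic data (illustrative only;
# assumes the imports the function relies on: numpy as np, pandas as pd, sklearn's
# linear_model, mean_squared_error, and r2_score):
import numpy as np
import pandas as pd
from sklearn import linear_model
from sklearn.datasets import make_regression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split

X, y = make_regression(n_samples=300, n_features=5, noise=5.0, random_state=0)
X = pd.DataFrame(X, columns=['x%d' % i for i in range(X.shape[1])])
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)

clf = lasso_regression(X_train, y_train, X_test, y_test)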
import numpy as np
from sklearn import cross_validation, datasets, linear_model

diabetes = datasets.load_diabetes()
X = diabetes.data
y = diabetes.target
alphas = np.logspace(-4, -.5, 30)
lasso_cv = linear_model.LassoCV(alphas=alphas)
k_fold = cross_validation.KFold(len(X), 5)

for k, (train, test) in enumerate(k_fold):
    lasso_cv.fit(X[train], y[train])
    print("[fold {0}] alpha: {1:.5f}, score: {2:.5f}".
          format(k, lasso_cv.alpha_, lasso_cv.score(X[test], y[test])))

Example #7
speed_corr_neurons = template_info.loc[speed_corr_neurons_index]

#   Standardize X, Y
Y = preprocessing.StandardScaler().\
    fit_transform(np.reshape(speeds_0p25[:-1], (-1, 1))).reshape(-1)

X = preprocessing.StandardScaler().\
    fit_transform(spike_rates_0p25[speed_corr_neurons_index].transpose())

#   Or not
Y = np.array(speeds_0p25)
X = spike_rates_0p25[speed_corr_neurons_index].transpose()

#   Set up the regressors
model_linear = linear_model.LinearRegression(fit_intercept=True)
model_lassoCV = linear_model.LassoCV(cv=5, fit_intercept=True)
model_lasso = linear_model.Lasso(alpha=0.02,
                                 fit_intercept=True,
                                 max_iter=10000,
                                 normalize=False)
model_gam = pg.LinearGAM()

Y = np.exp(Y) / (np.exp(Y) + 1)
model_gam = pg.GammaGAM()

#   Split the data to train and test sets
X_train, X_test, Y_train, Y_test = model_selection.train_test_split(
    X, Y, test_size=0.2, random_state=0)

# Fit
model = model_gam
# boston_X_test_scaled = boston_X_scaled[-20:]
# boston_y_train = boston.target[:-20]
# boston_y_test = boston.target[-20:]

# Prepare ensemble regressors

regressors = (
    linear_model.LinearRegression(fit_intercept=True),
    Pipeline(
        [('poly', PolynomialFeatures(degree=2)),
         ('linear', linear_model.LinearRegression(fit_intercept=False))]
    ),
    linear_model.Ridge(alpha=.1, fit_intercept=True),
    linear_model.RidgeCV(alphas=[.01, .1, .3, .5, 1], fit_intercept=True),
    linear_model.Lasso(alpha=1, fit_intercept=True),
    linear_model.LassoCV(n_alphas=100, fit_intercept=True),
    linear_model.ElasticNet(alpha=1),
    linear_model.ElasticNetCV(n_alphas=100, l1_ratio=.5),
    linear_model.OrthogonalMatchingPursuit(),
    linear_model.BayesianRidge(),
    linear_model.ARDRegression(),
    linear_model.SGDRegressor(),
    linear_model.PassiveAggressiveRegressor(loss='squared_epsilon_insensitive'),
    linear_model.RANSACRegressor(),
    LinearSVR(max_iter=1e4, fit_intercept=True, loss='squared_epsilon_insensitive', C=0.5),
    SVR(max_iter=1e4, kernel='poly', C=1, degree=4),
    SVR(max_iter=1e4, kernel='rbf', C=1, gamma=0.1),
    SVR(kernel='linear', C=1),
    SVR(kernel='linear', C=0.5),
    SVR(kernel='linear', C=0.1),
    DecisionTreeRegressor(max_depth=5),
Example #9
l_eye = l[idx] * np.eye(X.shape[1])
H_ridge = np.linalg.inv(X.T.dot(X) + l_eye)
beta_ridge = H_ridge.dot(X.T).dot(z_flat)
z_tilde_ridge = X @ beta_ridge

plt.figure()
plt.title('Terrain data from Norway after ridge regression')

plt.imshow(np.reshape(z_tilde_ridge, np.shape(z)), cmap='gray')
plt.xlabel('X')
plt.ylabel('Y')
save_fig('SRTM_data_Norway_1_ridge_regression')

# lasso

lasso=linear_model.LassoCV(max_iter=1000000, cv=5)
lasso.fit(X[:,1:], z_flat)
predl=lasso.predict(X[:,1:])

plt.figure()
plt.title('Terrain data from Norway after lasso')

plt.imshow(np.reshape(predl, np.shape(z)), cmap='gray')
plt.xlabel('X')
plt.ylabel('Y')
save_fig('SRTM_data_Norway_1_lasso')

## compressing image data (unused)
r = 20
U, S, V = np.linalg.svd(z)        #using SVD method to decompose image
Example #10
    def __init__(
        self,
        method,
        yrange,
        params,
        i=0
    ):  #TODO: yrange doesn't currently do anything. Remove or do something with it!
        self.algorithm_list = [
            'PLS',
            'GP',
            'OLS',
            'OMP',
            'Lasso',
            'Elastic Net',
            'Ridge',
            'Bayesian Ridge',
            'ARD',
            'LARS',
            'LASSO LARS',
            'SVR',
            'KRR',
        ]
        self.method = method
        self.outliers = None
        self.ransac = False

        print(params)
        if self.method[i] == 'PLS':
            self.model = PLSRegression(**params[i])

        if self.method[i] == 'OLS':
            self.model = linear.LinearRegression(**params[i])

        if self.method[i] == 'OMP':
            # check whether to do CV or not
            self.do_cv = params[i]['CV']
            # create a temporary set of parameters
            params_temp = copy.copy(params[i])
            # Remove CV parameter
            params_temp.pop('CV')
            if self.do_cv is False:
                self.model = linear.OrthogonalMatchingPursuit(**params_temp)
            else:
                params_temp.pop('precompute')
                self.model = linear.OrthogonalMatchingPursuitCV(**params_temp)

        if self.method[i] == 'LASSO':
            # create a temporary set of parameters
            params_temp = copy.copy(params[i])
            # check whether to do CV or not
            try:
                self.do_cv = params[i]['CV']
                # Remove CV parameter
                params_temp.pop('CV')
            except:
                self.do_cv = False

            if self.do_cv is False:
                self.model = linear.Lasso(**params_temp)
            else:
                params_temp.pop('alpha')
                self.model = linear.LassoCV(**params_temp)

        if self.method[i] == 'Elastic Net':
            params_temp = copy.copy(params[i])
            try:
                self.do_cv = params[i]['CV']
                params_temp.pop('CV')
            except:
                self.do_cv = False

            if self.do_cv is False:
                self.model = linear.ElasticNet(**params_temp)
            else:
                params_temp['l1_ratio'] = [.1, .5, .7, .9, .95, .99, 1]
                self.model = linear.ElasticNetCV(**params_temp)

        if self.method[i] == 'Ridge':
            # create a temporary set of parameters
            params_temp = copy.copy(params[i])
            try:
                # check whether to do CV or not
                self.do_cv = params[i]['CV']

                # Remove CV parameter
                params_temp.pop('CV')
            except:
                self.do_cv = False

            if self.do_cv:
                self.model = linear.RidgeCV(**params_temp)
            else:
                self.model = linear.Ridge(**params_temp)

        if self.method[i] == 'BRR':
            self.model = linear.BayesianRidge(**params[i])

        if self.method[i] == 'ARD':
            self.model = linear.ARDRegression(**params[i])

        if self.method[i] == 'LARS':
            # create a temporary set of parameters
            params_temp = copy.copy(params[i])
            try:
                # check whether to do CV or not
                self.do_cv = params[i]['CV']

                # Remove CV parameter
                params_temp.pop('CV')
            except:
                self.do_cv = False

            if self.do_cv is False:
                self.model = linear.Lars(**params_temp)
            else:
                self.model = linear.LarsCV(**params_temp)

        if self.method[i] == 'LASSO LARS':
            model = params[i]['model']
            params_temp = copy.copy(params[i])
            params_temp.pop('model')

            if model == 0:
                self.model = linear.LassoLars(**params_temp)
            elif model == 1:
                self.model = linear.LassoLarsCV(**params_temp)
            elif model == 2:
                self.model = linear.LassoLarsIC(**params_temp)
            else:
                print("Something went wrong, \'model\' should be 0, 1, or 2")

        if self.method[i] == 'SVR':
            self.model = svm.SVR(**params[i])

        if self.method[i] == 'KRR':
            self.model = kernel_ridge.KernelRidge(**params[i])

        if self.method[i] == 'GP':
            # get the method for dimensionality reduction and the number of components
            self.reduce_dim = params[i]['reduce_dim']
            self.n_components = params[i]['n_components']
            # create a temporary set of parameters
            params_temp = copy.copy(params[i])
            # Remove parameters not accepted by Gaussian Process
            params_temp.pop('reduce_dim')
            params_temp.pop('n_components')
            self.model = GaussianProcess(**params_temp)
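# For reference, the LASSO branch above resolves to one of two estimators depending on
# the 'CV' flag in params; a standalone illustration with a toy parameter dict
# (the dict contents are assumptions, not taken from the original caller):
import copy
from sklearn import linear_model as linear

params_i = {'alpha': 1.0, 'CV': True}   # toy parameters shaped like the branch expects
params_temp = copy.copy(params_i)
do_cv = params_temp.pop('CV')
if do_cv is False:
    model = linear.Lasso(**params_temp)             # fixed alpha
else:
    params_temp.pop('alpha')                        # LassoCV picks alpha via CV
    model = linear.LassoCV(**params_temp)
print(model)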
Example #11
# print('Final prediction score with optimal: [%.8f](use RF)' % mean_absolute_error(y_test, y_pred))
#
# '''5. just use the EXTRA'''
# model = ExtraTreesRegressor(random_state=0, n_estimators=100)
# model.fit(X_train, y_train)
# y_pred = model.predict(X_test)
# print('Final prediction score with optimal: [%.8f](use EXTRA)' % mean_absolute_error(y_test, y_pred))
#
# '''6. just use the XGB'''
# model = XGBRegressor(random_state=0,n_estimators=100, n_jobs=-1)
# model.fit(X_train, y_train)
# y_pred = model.predict(X_test)
# print('Final prediction score with optimal: [%.8f](use XGB)' % mean_absolute_error(y_test, y_pred))

models = [
    linear_model.LassoCV(random_state=0, n_jobs=1),
    GradientBoostingRegressor(random_state=0),
    SVR(),
    RandomForestRegressor(random_state=0,
                          n_jobs=1,
                          n_estimators=100,
                          max_depth=3),
    ExtraTreesRegressor(random_state=0, n_estimators=100),
    XGBRegressor(random_state=0, n_estimators=100, n_jobs=1)
]
# # '''With optimal'''
model = Superknn(models=models,
                 metric=mean_absolute_error,
                 n_jobs=10,
                 random_state=0,
                 folds=5)
def selectByLassoL1(X, y, value, diagnose=False):
    '''
    Select the features using lasso regression

    Parameters
    ----------
    X : TYPE pandas dataframe
        DESCRIPTION. dataframe of the attributes 
    y : TYPE pandas dataframe
        DESCRIPTION. dataframe with the target variable
    value : TYPE float
        DESCRIPTION. coefficient threshold for feature selection, 0->1
    diagnose : TYPE bool, optional. If True, plots the number of selected features at each coefficient threshold
        DESCRIPTION. The default is False.


    Returns
    -------
    res : TYPE pandas dataframe
        DESCRIPTION. output dataframe with selected variables
    output_figure : TYPE dictionary
        DESCRIPTION. dictionary containing the figure that plots the number of selected features against the coefficient threshold


    '''

    output_figure = {}
    #lasso feature selection works for regression models
    Q = dummyColumns(X)
    clf = linear_model.LassoCV(cv=5)

    if diagnose:
        numFeatSelected = []
        for i in range(1, 101):
            val = i / 100
            sfm = SelectFromModel(clf, threshold=val)
            sfm.fit(Q, y)
            nn = len(Q.columns[sfm.get_support()])
            #print(nn)
            numFeatSelected.append(nn)
        fig1 = plt.figure()
        plt.plot(range(1, 101), numFeatSelected)
        plt.title('Lasso graph')
        plt.xlabel('Coeff threshold %')
        plt.ylabel('Num. selected features')
        output_figure['LassoChart'] = fig1
        plt.close('all')

    #_, _, _, alphaValue, _= LassoRegressionCV(Q,y,'',nFolds=5, saveFig=False) #previous version with lasso from ZO_RegressionLinearModel
    #las=LassoRegressionComplete(Q,y,alphaValue)

    sfm = SelectFromModel(clf, threshold=value)
    sfm.fit(Q, y)
    #n_features = sfm.transform(X).shape[1]
    #model = SelectFromModel(las, prefit=True,threshold=0.25)

    Z = sfm.transform(Q)
    feature_idx = sfm.get_support()
    feature_name = Q.columns[feature_idx]
    res = pd.DataFrame(Z, columns=feature_name)
    return res, output_figure
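# A minimal usage sketch for selectByLassoL1 (illustrative only). dummyColumns is a
# helper not shown above; the stand-in below is an assumption that simply one-hot
# encodes categorical columns, and the other imports are those the function relies on:
import numpy as np
import pandas as pd
from sklearn import linear_model
from sklearn.feature_selection import SelectFromModel
from sklearn.datasets import make_regression

def dummyColumns(df):
    # assumed behaviour: one-hot encode any object/categorical columns
    return pd.get_dummies(df)

Xr, yr = make_regression(n_samples=200, n_features=8, n_informative=3, random_state=0)
X_df = pd.DataFrame(Xr, columns=['v%d' % i for i in range(Xr.shape[1])])
y_sr = pd.Series(yr)

res, figs = selectByLassoL1(X_df, y_sr, value=0.25, diagnose=False)
print(res.columns.tolist())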
def organize_data(ManagerID, data_mktbeta, data_indubeta, data_FAndMdata,
                  startdatestr, enddatestr, ob_win):
    """
    give ManagerID, organize data to store in sql
    ob_win: observation window length
    return result_df
    """
    # print(datetime.datetime.now())
    # print('Start organize data of: ' + ManagerID)
    # read mng record data
    data_allrecord = data_FAndMdata[data_FAndMdata.ManagerID == ManagerID]
    # store
    cols = ['ID', 'EndDate', 'InvestAdvisor', 'ManagerID', 'Ret']
    result_df = pd.DataFrame(columns=cols)  # store here
    # First compute the composite (weighted) return
    time_array = data_allrecord.EndDate.unique()
    for date in time_array:
        data_subrecord = data_allrecord[data_allrecord.EndDate == date]
        wgted_ret = 0
        wgt = 0
        InAdv = ''
        for index, row in data_subrecord.iterrows():
            wgt += 1 / row.ManagersofFund
            wgted_ret += row.dailyreturn / row.ManagersofFund
            InAdv = row.InvestAdvisorAbbrName
        ret = wgted_ret / wgt
        if np.isnan(ret):
            continue
        else:
            IDstr = ManagerID + pd.to_datetime(date).strftime('%Y%m%d')
            result_df = result_df.append(dict(
                zip(cols, [IDstr, date, InAdv, ManagerID, ret])),
                                         ignore_index=True)
    # Performance decomposition
    # fetch index data and construct np.array
    addcols = [
        'beta_mkt1',
        'beta_mkt2',
        'beta_mkt3',
        'name_mkt1',
        'name_mkt2',
        'name_mkt3',
        'intercept_mkt',
        'score_mkt',
        'bias_ret_mkt',
        'bias_var_mkt',
        'bias_score_mkt',  # bias mkt: deviation from the style (market) factors
        'beta_indu1',
        'beta_indu2',
        'beta_indu3',
        'name_indu1',
        'name_indu2',
        'name_indu3',
        'intercept_indu',
        'score_indu',
        'bias_ret_indu',
        'bias_var_indu',
        'bias_score_indu'
    ]  # bias indu: deviation from the industry factors
    newcols = cols + addcols
    result_df = result_df.reindex(columns=newcols)
    idx = 0
    # total = len(result_df)
    while idx < len(result_df):
        if idx < ob_win - 1:
            # not enough data
            # print('line:' + str(idx) + '/' + str(total) + ', skip')
            idx += 1
            continue
        else:
            # get date
            obdates = result_df.iloc[(idx - ob_win + 1):(idx + 1)].EndDate
            timegap = (obdates.iloc[-1] - obdates.iloc[0]).days
            if timegap / ob_win > 9 / 5:
                # dates not continuous
                # print('line:' + str(idx) + '/' + str(total) + ', skip')
                idx += 1
                continue
            else:
                # calc
                mng_ret = result_df.iloc[(idx - ob_win + 1):(idx + 1)].Ret
                mkt_ret = data_mktbeta.loc[obdates]
                indu_ret = data_indubeta.loc[obdates]
                mng_ret = mng_ret.values
                mkt_ret = mkt_ret.values
                indu_ret = indu_ret.values
                # remove NaN rows
                isnanrow = np.isnan(mkt_ret[:, 1])
                mng_ret = mng_ret[~isnanrow]
                mkt_ret = mkt_ret[~isnanrow]
                indu_ret = indu_ret[~isnanrow]
                # define mkt model
                model = linear_model.LassoCV(
                    positive=True,
                    cv=int(ob_win / 30),  # subsample size = 30
                    selection='random',
                    fit_intercept=True,
                    normalize=False)
                # mkt
                model.fit(mkt_ret, mng_ret)  # fit(X, y)
                beta_mkt = model.coef_
                name_mkt = data_mktbeta.columns.values
                sortedidx = np.argsort(beta_mkt)
                result_df.ix[idx, 'beta_mkt1'] = beta_mkt[sortedidx[-1]]
                result_df.ix[idx, 'name_mkt1'] = name_mkt[sortedidx[-1]]
                result_df.ix[idx, 'beta_mkt2'] = beta_mkt[sortedidx[-2]]
                result_df.ix[idx, 'name_mkt2'] = name_mkt[sortedidx[-2]]
                result_df.ix[idx, 'beta_mkt3'] = beta_mkt[sortedidx[-3]]
                result_df.ix[idx, 'name_mkt3'] = name_mkt[sortedidx[-3]]
                result_df.ix[idx, 'intercept_mkt'] = model.intercept_
                result_df.ix[idx, 'score_mkt'] = model.score(mkt_ret, mng_ret)
                # bias mkt
                # calc ret
                b_avg = np.mean(beta_mkt)
                b_adj = beta_mkt - b_avg
                ct = 1 / np.sum(b_adj[b_adj > 0])  # scale factor
                b_adj = b_adj * ct
                bias_retts_mkt = np.dot(mkt_ret, b_adj)  # dot product gives a weighted return series
                temp = np.mean(bias_retts_mkt) * 250  # annualize the daily return
                if np.isnan(temp):
                    temp = 0
                result_df.ix[idx, 'bias_ret_mkt'] = temp
                temp = np.std(bias_retts_mkt) * 250**0.5  # annualize the std of daily returns
                if np.isnan(temp):
                    temp = 0
                result_df.ix[idx, 'bias_var_mkt'] = temp
                # calc score
                # std coef
                result_df.ix[idx, 'bias_score_mkt'] = np.std(beta_mkt)
                # define indu model
                model = linear_model.LassoCV(
                    positive=True,
                    cv=int(ob_win / 30),  # subsample size = 30
                    selection='random',
                    fit_intercept=True,
                    normalize=False)
                # indu
                model.fit(indu_ret, mng_ret)
                beta_indu = model.coef_
                name_indu = data_indubeta.columns.values
                sortedidx = np.argsort(beta_indu)
                result_df.ix[idx, 'beta_indu1'] = beta_indu[sortedidx[-1]]
                result_df.ix[idx, 'name_indu1'] = name_indu[sortedidx[-1]]
                result_df.ix[idx, 'beta_indu2'] = beta_indu[sortedidx[-2]]
                result_df.ix[idx, 'name_indu2'] = name_indu[sortedidx[-2]]
                result_df.ix[idx, 'beta_indu3'] = beta_indu[sortedidx[-3]]
                result_df.ix[idx, 'name_indu3'] = name_indu[sortedidx[-3]]
                result_df.ix[idx, 'intercept_indu'] = model.intercept_
                result_df.ix[idx,
                             'score_indu'] = model.score(indu_ret, mng_ret)
                # bias indu
                # calc ret
                b_avg = np.mean(beta_indu)
                b_adj = beta_indu - b_avg
                ct = 1 / np.sum(b_adj[b_adj > 0])  # scale factor
                b_adj = b_adj * ct
                bias_retts_indu = np.dot(indu_ret, b_adj)  # dot product gives a weighted return series
                temp = np.mean(bias_retts_indu) * 250  # annualize the daily return
                if np.isnan(temp):
                    temp = 0
                result_df.ix[idx, 'bias_ret_indu'] = temp
                temp = np.std(bias_retts_indu) * 250**0.5  # annualize the std of daily returns
                if np.isnan(temp):
                    temp = 0
                result_df.ix[idx, 'bias_var_indu'] = temp
                # calc score
                # std coef
                result_df.ix[idx, 'bias_score_indu'] = np.std(beta_indu)
                # end of calc
                idx += 1
                # print('line:' + str(idx) + '/' + str(total) + ', done')
    # end of while loop
    # keep only the results between startdatestr and enddatestr
    sdtime = datetime.datetime.strptime(startdatestr, '%Y-%m-%d')
    edtime = datetime.datetime.strptime(enddatestr, '%Y-%m-%d')
    result_df = result_df[(result_df.EndDate >= sdtime)
                          & (result_df.EndDate <= edtime)]
    # print(datetime.datetime.now())
    print('End organize data of: ' + ManagerID)
    return result_df
Example #14
def computeRscores(product_features_list, product_ratings_list, num_features,
                   file_name):
    from sklearn import linear_model
    result = []
    # create the output directory
    import os
    new_file_dir = path + "data\\feature_coefficient_list_" + file_name.split(
        ".")[0] + "\\"
    isExists = os.path.exists(new_file_dir)
    if not isExists:
        os.makedirs(new_file_dir)
    new_file_name = new_file_dir + file_name.split(".")[0] + "_"

    # Method 1: plain linear regression
    print("start linear regression")
    copy_product_features_list = copy.deepcopy(product_features_list)
    copy_product_ratings_list = copy.deepcopy(product_ratings_list)
    reg = linear_model.LinearRegression()
    reg.fit(copy_product_features_list, copy_product_ratings_list)
    del copy_product_features_list
    del copy_product_ratings_list
    file_path = new_file_name + "linear_regression_" + str(
        num_features) + ".xls"
    save_coefficients_new(reg.coef_, reg.intercept_, file_path)
    copy_product_features_list = copy.deepcopy(product_features_list)
    copy_product_ratings_list = copy.deepcopy(product_ratings_list)
    r2_linearregression = reg.score(copy_product_features_list,
                                    copy_product_ratings_list)
    print "r2 score: ", r2_linearregression
    del copy_product_features_list
    del copy_product_ratings_list

    # Method 2: lasso
    print("start lasso regression")
    copy_product_features_list = copy.deepcopy(product_features_list)
    copy_product_ratings_list = copy.deepcopy(product_ratings_list)
    reg = linear_model.LassoCV(cv=5, random_state=0)
    reg.fit(copy_product_features_list, copy_product_ratings_list)
    del copy_product_features_list
    del copy_product_ratings_list
    file_path = new_file_name + "linear_lassocv_regression_" + str(
        num_features) + ".xls"
    save_coefficients_new(reg.coef_, reg.intercept_, file_path)
    copy_product_features_list = copy.deepcopy(product_features_list)
    copy_product_ratings_list = copy.deepcopy(product_ratings_list)
    r2_lasso = reg.score(copy_product_features_list, copy_product_ratings_list)
    print "r2 score: ", r2_lasso
    del copy_product_features_list
    del copy_product_ratings_list
    result = reg.coef_

    # Method 3: ridge
    print("start ridge regression")
    copy_product_features_list = copy.deepcopy(product_features_list)
    copy_product_ratings_list = copy.deepcopy(product_ratings_list)
    reg = linear_model.RidgeCV(cv=5)
    reg.fit(copy_product_features_list, copy_product_ratings_list)
    del copy_product_features_list
    del copy_product_ratings_list
    file_path = new_file_name + "linear_ridge_regression_" + str(
        num_features) + ".xls"
    save_coefficients_new(reg.coef_, reg.intercept_, file_path)
    copy_product_features_list = copy.deepcopy(product_features_list)
    copy_product_ratings_list = copy.deepcopy(product_ratings_list)
    r2_ridge = reg.score(copy_product_features_list, copy_product_ratings_list)
    print "r2 score: ", r2_ridge
    del copy_product_features_list
    del copy_product_ratings_list

    return result
Example #15
#%%
import sklearn.linear_model as sk_lm
import sklearn.ensemble as sk_ens
import xgboost as xgb
import sklearn.neural_network as sk_nn

from sklearn.metrics import r2_score
from sklearn.model_selection import StratifiedKFold, KFold

df_X_train = df_non_obj_feats[:train_len]
df_X_test = df_non_obj_feats[train_len:]
y = target_log1p

df_importance = pd.DataFrame(data=None, index=df_non_obj_feats.columns)
#%%
model_lasso = sk_lm.LassoCV(alphas=[3e-4, 3e-3, 3e-2, 3e-1, 3, 30])
model_lasso.fit(df_X_train, y)
model_lasso.score(df_X_train, y)
r2_score_lasso = r2_score(y, model_lasso.predict(df_X_train))
print('r2_score of Lasso:', r2_score_lasso)
lasso_importance = pd.DataFrame(model_lasso.coef_,
                             df_X_train.columns,['LS_feat_importance'])
lasso_importance.plot(); plt.show()

df_importance['LassoCV'] = r2_score_lasso*lasso_importance/np.max(np.abs(lasso_importance))
#%%
model_elen = sk_lm.ElasticNetCV(alphas=[3e-4, 3e-3, 3e-2, 3e-1, 3, 30])
model_elen.fit(df_X_train, y)
model_elen.score(df_X_train, y)
r2_score_elen = r2_score(y, model_elen.predict(df_X_train))
print('r2_score of ElasticNet:', r2_score_elen)
def main():
    pd.set_option('display.max_columns', None)
    from sklearn.neighbors import KNeighborsClassifier
    data = pd.read_csv('Life Expectancy Data.csv')
    # print(data.columns)
    data.rename(columns={'Life expectancy ': "Life_expectancy"}, inplace=True)
    data.rename(columns={'Adult Mortality': "Adult_Mortality"}, inplace=True)
    data.rename(columns={'infant deaths': "infant_deaths"}, inplace=True)
    data.rename(columns={'percentage expenditure': 'percentage_expenditure'}, inplace=True)
    data.rename(columns={'Hepatitis B': "Hepatitis_B"}, inplace=True)
    data.rename(columns={' BMI ': "BMI"}, inplace=True)
    data.rename(columns={'under-five deaths ': "under-five_deaths"}, inplace=True)
    data.rename(columns={'Total expenditure': "Total_expenditure"}, inplace=True)
    data.rename(columns={' HIV/AIDS': "HIV/AIDS"}, inplace=True)
    data.rename(columns={' thinness  1-19 years': "thinness_1-19_years"}, inplace=True)
    data.rename(columns={' thinness 5-9 years': "thinness_5-9_years"}, inplace=True)
    data.rename(columns={'Income composition of resources': "Income_composition_of_resources"}, inplace=True)
    data.rename(columns={'HIV/AIDS': "HIV_AIDS"}, inplace=True)
    data.rename(columns={'Measles ': "Measles"}, inplace=True)
    data.rename(columns={'Diphtheria ': "Diphtheria"}, inplace=True)

    # delete the rows with a null life expectancy value
    data['Life_expectancy'] = data['Life_expectancy'].fillna(999)
    drop_index = data[(data.Life_expectancy == 999)].index.tolist()
    data = data.drop(drop_index)
    # print(data['Life_expectancy'])

    # make life_expectancy as our output
    labels = data.loc[:, ['Life_expectancy']]

    # del data['Life_expectancy']

    # deal with the categorical feature "Status" by encoding it numerically
    data['Status'] = data['Status'].str.replace('Developed', '2', case=False)
    data['Status'] = data['Status'].str.replace('Developing', '1', case=False)
    # print('data=',data)
    #    data=pd.get_dummies(data, prefix=['Country'], columns=['Country'])
    #    print('size after dummy', data.shape)

    # Split the data into train, val and test sets
    x, x_test, y, y_test = train_test_split(data, labels, test_size=0.2, train_size=0.8, random_state=1)
    x_train, x_val, y_train, y_val = train_test_split(x, y, test_size=0.2, train_size=0.8, random_state=50)
    # print(x_train)
    # calculate the null value in each column
    NoNull_train = x_train.isnull().sum(axis=0)
    # print('--------Null data in train----------')
    # print(NoNull_train)
    # print('--------Null data in train----------')

    '''
    sns.distplot(y_train['Life_expectancy'])
    mu=y_train['Life_expectancy'].mean()
    sigma=y_train['Life_expectancy'].std()
    #Now plot the distribution
    plt.legend(['Original dist. ($\mu=$ {:.2f} and $\sigma=$ {:.2f} )'.format(mu, sigma)],loc='best')
    plt.ylabel('Frequency')
    plt.title('Life_expectancy')

    #Get also the QQ-plot
    fig = plt.figure()
    res = stats.probplot(y_train['Life_expectancy'], plot=plt)
    plt.show()
    '''

    # make the target_train data closer to a normal distribution
    # ----------------------------------------------
    # We use the numpy function log1p which applies log(1+x) to all elements of the column
    '''
    #Check the new distribution
    sns.distplot(y_train['Life_expectancy'] , fit=norm);
    
    # Get the fitted parameters used by the function
    (mu, sigma) = norm.fit(y_train['Life_expectancy'])
    print( '\n mu = {:.2f} and sigma = {:.2f}\n'.format(mu, sigma))

    #Now plot the distribution
    plt.legend(['Normal dist. ($\mu=$ {:.2f} and $\sigma=$ {:.2f} )'.format(mu, sigma)],loc='best')
    plt.ylabel('Frequency')
    plt.title('Life Expectancy')

    #Get also the QQ-plot
    fig = plt.figure()
    res = stats.probplot(y_train['Life_expectancy'], plot=plt)
    plt.show()
    '''
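    # A minimal sketch of the log1p transform described above (not applied in the
    # original flow; shown commented out, assuming numpy is imported as np):
    # y_train['Life_expectancy'] = np.log1p(y_train['Life_expectancy'])
    # y_val['Life_expectancy'] = np.log1p(y_val['Life_expectancy'])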
    Null_train_ratio = (x_train.isnull().sum() / len(x_train)) * 100
    Null_train_ratio = Null_train_ratio.sort_values(ascending=False)
    AllNull_train_ratio = Null_train_ratio.drop(Null_train_ratio[Null_train_ratio == 0].index)
    missing_train_ratio = pd.DataFrame({'Missing train data ratio': AllNull_train_ratio})
    # print(missing_train_ratio)

    f, ax = plt.subplots(figsize=(15, 12))
    plt.xticks(rotation='90')  # rotate the label of each feature
    sns.barplot(x=Null_train_ratio.index, y=Null_train_ratio)
    plt.xlabel('Features', fontsize=15)
    plt.ylabel('Percent of missing values', fontsize=15)
    plt.title('missing data percentage by feature', fontsize=15)
    plt.show()

    # print("--------------train data description-------------")
    # print(x_train.describe())
    # draw to recognize the outliers
    # Show the boxplot before dealing with outliers
    # feature "Adult_Mortality"
    # showplot(x_train,'before')

    # try to remove the outliers
    # print('len of x train=',len(x_train))
    Q1 = x_train.quantile(0.25)
    Q3 = x_train.quantile(0.75)
    IQR = Q3 - Q1
    x_train = outlier_remove_traindata(x_train, Q1, Q3, IQR)
    x_val = outlier_remove_traindata(x_val, Q1, Q3, IQR)
    # print('finish removing the outliers')

    # ----------finished dealing with the outlier in each features-------------

    # Show the boxplot after dealing with outliers
    # feature "Adult_Mortality"
    # showplot(x_train,'after')

    ####fill the missing data with mean value
    x_train = x_train.fillna(x_train.mean())
    x_val = x_val.fillna(x_train.mean())
    x = x.fillna(x.mean())
    x_test = x_test.fillna(x.mean())
    ####fill the missing data with median
    # x_train=x_train.fillna(x_train.median())
    # x_val=x_val.fillna(x_train.median())

    ####fill the missing data with -1
    # x_train=x_train.fillna(-1)
    # x_val=x_val.fillna(-1)

    ####fill the missing data with 0
    # x_train=x_train.fillna(0)
    # x_val=x_val.fillna(0)

    NoNull_train = x_train.isnull().sum(axis=0)
    # print('--------Null data in train----------')
    # print('NoNull_train',NoNull_train)

    # separate the string and numerical data
    x_train_str = x_train.loc[:, ["Country"]]
    x_train_num = x_train.loc[:, ['Year', 'Status', 'Life_expectancy', 'Adult_Mortality', 'infant_deaths', 'Alcohol',
                                  'percentage_expenditure', 'Hepatitis_B', 'Measles', 'BMI', 'under-five_deaths',
                                  'Polio', 'Total_expenditure', 'Diphtheria', 'HIV_AIDS', 'GDP', 'Population',
                                  'thinness_1-19_years', 'thinness_5-9_years', 'Income_composition_of_resources',
                                  'Schooling']]
    x_val_str = x_val.loc[:, ["Country"]]
    x_val_num = x_val.loc[:, ['Year', 'Status', 'Life_expectancy', 'Adult_Mortality', 'infant_deaths', 'Alcohol',
                              'percentage_expenditure', 'Hepatitis_B', 'Measles', 'BMI', 'under-five_deaths', 'Polio',
                              'Total_expenditure', 'Diphtheria', 'HIV_AIDS', 'GDP', 'Population', 'thinness_1-19_years',
                              'thinness_5-9_years', 'Income_composition_of_resources', 'Schooling']]
    x_str = x.loc[:, ["Country"]]
    x_num = x.loc[:, ['Year', 'Status', 'Life_expectancy', 'Adult_Mortality', 'infant_deaths', 'Alcohol',
                      'percentage_expenditure', 'Hepatitis_B', 'Measles', 'BMI', 'under-five_deaths', 'Polio',
                      'Total_expenditure', 'Diphtheria', 'HIV_AIDS', 'GDP', 'Population', 'thinness_1-19_years',
                      'thinness_5-9_years', 'Income_composition_of_resources', 'Schooling']]
    x_test_num = x_test.loc[:, ['Year', 'Status', 'Life_expectancy', 'Adult_Mortality', 'infant_deaths', 'Alcohol',
                                'percentage_expenditure', 'Hepatitis_B', 'Measles', 'BMI', 'under-five_deaths', 'Polio',
                                'Total_expenditure', 'Diphtheria', 'HIV_AIDS', 'GDP', 'Population',
                                'thinness_1-19_years', 'thinness_5-9_years', 'Income_composition_of_resources',
                                'Schooling']]

    x_train_str = pd.get_dummies(x_train_str)
    # Try to see the correlation between country(after get dummy) and life expectancy
    country_col = x_train_str.columns
    # print('country column =',len(country_col))
    x_train_str["Life_expectancy"] = y_train

    country_corrmat = x_train_str.corr()
    cols = abs(country_corrmat).nlargest(10, 'Life_expectancy')['Life_expectancy'].index
    cm = np.corrcoef(x_train_str[cols].values.T)
    sns.set(font_scale=1.25)
    plt.subplots(figsize=(15, 12))
    hm = sns.heatmap(cm, cbar=True, annot=True, square=True, fmt='.2f', annot_kws={'size': 10}, yticklabels=cols.values,
                     xticklabels=cols.values)
    bottom, top = hm.get_ylim()
    hm.set_ylim(bottom + 0.5, top - 0.5)
    plt.title('The country that is most related to life expectancy')
    plt.show()
    # Since the highest correlation between country and life expectancy is 0.17, we decide not to use the feature "country"

    # standardize the data
    standar = preprocessing.StandardScaler().fit(x_train_num)
    x_train = standar.transform(x_train_num)
    x_val = standar.transform(x_val_num)
    x_train1 = pd.DataFrame(data=x_train, columns=x_train_num.columns)
    x_val1 = pd.DataFrame(data=x_val, columns=x_train_num.columns)
    # Correlation map to see how features are correlated with LifeExpectancy
    corrmat = x_train1.corr()
    plt.subplots(figsize=(18, 15))
    ax = sns.heatmap(corrmat, vmax=1, annot=True, square=True, vmin=0)
    bottom, top = ax.get_ylim()
    ax.set_ylim(bottom + 0.5, top - 0.5)
    plt.title('Correlation Heatmap Between Each Feature')
    plt.show()

    cols = abs(corrmat).nlargest(19, 'Life_expectancy')['Life_expectancy'].index
    cm = np.corrcoef(x_train1[cols].values.T)
    sns.set(font_scale=1.25)
    plt.subplots(figsize=(15, 12))
    plt.title('18 Features that are most related to Life Expectancy')
    hm = sns.heatmap(cm, cbar=True, annot=True, square=True, fmt='.2f', annot_kws={'size': 10}, yticklabels=cols.values,
                     xticklabels=cols.values)
    bottom, top = hm.get_ylim()
    hm.set_ylim(bottom + 0.5, top - 0.5)

    plt.show()

    cols = abs(corrmat).nlargest(21, 'Life_expectancy')['Life_expectancy'].index
    related_col = cols.drop(['Life_expectancy']).drop(['Status']).drop(['Hepatitis_B']).drop(['infant_deaths']).drop(
        ['GDP']).drop(['Measles']).drop(['Population']).drop(['percentage_expenditure']).drop(['Diphtheria'])
    # related_col = related_col.drop(['Status'])

    # related_col = related_col.drop(['under-five_deaths'])
    # print("The columns most related to Life expectancy=", related_col)

    # transform the test data
    standar_all = preprocessing.StandardScaler().fit(x_num)
    x = standar_all.transform(x_num)
    x_test = standar_all.transform(x_test_num)
    x1 = pd.DataFrame(data=x, columns=x_train_num.columns)
    x_test1 = pd.DataFrame(data=x_test, columns=x_train_num.columns)

    x_train = x_train1[related_col]
    x_val = x_val1[related_col]
    x = x1[related_col]
    x_test = x_test1[related_col]
    '''
    # Choose the optimal no of features => 18 features (k=19, we need to deduct 'life expectancy')
    meanerror_NoFeature_val = []
    mse_NoFeature_val = []
    for k in range(5, 21):
        cols = abs(corrmat).nlargest(k, 'Life_expectancy')['Life_expectancy'].index
        related_col = cols.drop(['Life_expectancy'])
        x_train = x_train1[related_col]
        x_val = x_val1[related_col]
        mean_train = np.mean(y_train)
        mean_train_array = [x * mean_train for x in np.ones(y_train.shape[0])]
        np.ones(y_train.shape)
        # We found that the min MSE happened when n_estimators=25
        error_train = []
        error_val = []
        mse_train = []
        mse_val = []
        for j in range(10):
            pre_train, pre_X_train_pick, pre_y_train, pre_y_train_pick = train_test_split(x_train, y_train,
                                                                                          test_size=1 / 3)
            RF = RandomForestRegressor(n_estimators=25, bootstrap=True, max_features=3)
            RF.fit(pre_X_train_pick, pre_y_train_pick.values.ravel())
            predict_train = RF.predict(x_train)
            predict_val = RF.predict(x_val)
            acc_train = RF.score(x_train, y_train)
            acc_val = RF.score(x_val, y_val)
            error_train = np.append(error_train, 1 - acc_train)
            error_val = np.append(error_val, 1 - acc_val)
            mse_train = np.append(mse_train, mean_squared_error(predict_train, y_train))
            mse_val = np.append(mse_val, mean_squared_error(predict_val, y_val))
        meanerror_val = np.mean(error_val)
        mean_mse_val = np.mean(mse_val)
        meanerror_NoFeature_val = np.append(meanerror_NoFeature_val, meanerror_val)
        mse_NoFeature_val = np.append(mse_NoFeature_val, mean_mse_val)
    # print('mean error in validation set when 4~19 features')
    # print(meanerror_NoFeature_val)
    # print(' ')
    # print('mse in validation set when 4~19 features')
    # print(mse_NoFeature_val)
    print('When we set the number of features from 4-19,')
    print('min mean error = %.2f and min MSE = %.2f in validation set when we chose %.0f correlated features' % (
    min(meanerror_NoFeature_val), min(mse_NoFeature_val), np.argmin(meanerror_NoFeature_val) + 4))
    X = np.arange(4, 20)
    plt.plot(X, meanerror_NoFeature_val, label='Mean Error')
    plt.title('Number of Features vs Mean Error')
    plt.ylabel('Mean Error')
    plt.xlabel('Number of features')
    plt.show()
    plt.plot(X, mse_NoFeature_val, label='MSE')
    plt.title('Number of features vs MSE')
    plt.ylabel('MSE')
    plt.xlabel('Number of features')
    plt.show()
    '''
    ################Regression Tree#########################
    # Find the best way to fill the missing data
    print('###################Random Forest Regression########################')
    meanerror_train = []
    meanerror_val = []
    mean_mse_train = []
    mean_mse_val = []
    for i in range(50):
        error_train = []
        error_val = []
        mse_train = []
        mse_val = []
        for j in range(10):
            pre_train, pre_X_train_pick, pre_y_train, pre_y_train_pick = train_test_split(x_train, y_train,
                                                                                          test_size=1 / 3)
            RF = RandomForestRegressor(n_estimators=i + 1, bootstrap=True, random_state=0)
            RF.fit(pre_X_train_pick, pre_y_train_pick.values.ravel())
            predict_train = RF.predict(x_train)
            predict_val = RF.predict(x_val)
            acc_train = RF.score(x_train, y_train)
            acc_val = RF.score(x_val, y_val)
            error_train = np.append(error_train, 1 - acc_train)
            error_val = np.append(error_val, 1 - acc_val)
            mse_train = np.append(mse_train, mean_squared_error(predict_train, y_train))
            mse_val = np.append(mse_val, mean_squared_error(predict_val, y_val))
        meanerror_train = np.append(meanerror_train, np.mean(error_train))
        meanerror_val = np.append(meanerror_val, np.mean(error_val))
        mean_mse_train = np.append(mean_mse_train, np.mean(mse_train))
        mean_mse_val = np.append(mean_mse_val, np.mean(mse_val))
    # print('When fill the missing data with mean:')
    # print('mean error in training set =', meanerror_train)
    # print('mean error in validation set =', meanerror_val)
    # print('MSE in training set =',mean_mse_train)
    # print('MSE in validation set =',mean_mse_val)
    print("we got the min MSE value=%.3f and error rate=%.3f in validation set when there are %.0f trees" % (
    min(mean_mse_val), min(meanerror_val), np.argmin(mean_mse_val) + 1))

    fi = pd.DataFrame({'feature': list(x_train.columns),
                       'importance': RF.feature_importances_}). \
        sort_values('importance', ascending=False)
    #print('importance=', fi)

    # plot the figure
    X = np.arange(1, 51)
    fig, (ax1, ax2) = plt.subplots(nrows=2, ncols=1)
    fig.suptitle('Random Forest Regression')
    ax1.plot(X, meanerror_train, label='train data')
    ax1.plot(X, meanerror_val, color='r', label='val data')
    ax1.set_ylabel('Error Rate')
    ax1.plot(np.argmin(meanerror_val) + 1, min(meanerror_val), '*', label='minimum', color='b', markersize=15)
    ax1.legend(loc='best')

    ax2.plot(X, mean_mse_train, label='train data')
    ax2.plot(X, mean_mse_val, color='r', label='val data')
    ax2.set_ylabel('MSE')
    ax2.set_xlabel('Number of trees')
    ax2.plot(np.argmin(mean_mse_val) + 1, min(mean_mse_val), '*', label='minimum', color='b', markersize=15)
    ax2.legend(loc='best')
    plt.show()
    ''' 
    #We found that the min MSE happened when n_estimators=25
    error_train = []
    error_val = []
    mse_train = []
    mse_val = []
    for j in range(10):
        pre_train, pre_X_train_pick, pre_y_train, pre_y_train_pick = train_test_split(x_train, y_train, test_size=1 / 3)
        RF = RandomForestRegressor(n_estimators=25, bootstrap=True, max_features=3)
        RF.fit(pre_X_train_pick, pre_y_train_pick.values.ravel())
        predict_train = RF.predict(x_train)
        predict_val = RF.predict(x_val)
        acc_train = RF.score(x_train, y_train)
        acc_val = RF.score(x_val, y_val)
        error_train = np.append(error_train, 1 - acc_train)
        error_val = np.append(error_val, 1 - acc_val)
        mse_train = np.append(mse_train, mean_squared_error(predict_train, y_train))
        mse_val = np.append(mse_val, mean_squared_error(predict_val, y_val))
    meanerror_train = np.mean(error_train)
    meanerror_val = np.mean(error_val)
    mean_mse_train = np.mean(mse_train)
    mean_mse_val = np.mean(mse_val)
    print('When fill the missing data with 0 and n_estimator = 25:')
    print('mean error in training set =', meanerror_train)
    print('mean error in validation set =', meanerror_val)
    print('MSE in training set =',mean_mse_train)
    print('MSE in validation set =',mean_mse_val)
    '''
    #############Linear Regression################
    # linear regression (general)
    print('###################linear regression (general)########################')
    lin_mse_val = []
    lin_error_val = []
    lin_mse_train = []
    lin_error_train = []
    corrmat = x_train1.corr()
    for k in range(5, 21):
        cols = abs(corrmat).nlargest(k, 'Life_expectancy')['Life_expectancy'].index
        related_col = cols.drop(['Life_expectancy'])
        x_train = x_train1[related_col]
        x_val = x_val1[related_col]
        reg = linear_model.LinearRegression()
        reg.fit(x_train, y_train)
        reg_predict_val = reg.predict(x_val)
        reg_predict_train = reg.predict(x_train)
        reg_acc_val = reg.score(x_val, y_val)
        reg_acc_train = reg.score(x_train, y_train)
        lin_mse_val = np.append(lin_mse_val, mean_squared_error(y_val, reg_predict_val))
        lin_mse_train = np.append(lin_mse_train, mean_squared_error(y_train, reg_predict_train))
        lin_error_val = np.append(lin_error_val, 1 - reg_acc_val)
        lin_error_train = np.append(lin_error_train, 1 - reg_acc_train)
    print('Linear regression when features=4-19')
    # print('error rate in validation set =')
    # print(lin_error_val)
    # print('MSE in validation set')
    # print(lin_mse_val)
    print('We can find the min error rate= %.4f and min MSE= %.4f when there are %.0f features' % (
    min(lin_error_val), min(lin_mse_val), np.argmin(lin_mse_val) + 4))

    X = np.arange(4, 20, 1)
    fig, (ax1, ax2) = plt.subplots(nrows=2, ncols=1)
    fig.suptitle('Linear Regression')
    ax1.plot(X, lin_error_val, color='r', label='Validation set')
    ax1.plot(X, lin_error_train, label='Training set')
    ax1.set_ylabel('Error Rate')
    ax1.plot(np.argmin(lin_error_val) + 4, min(lin_error_val), '*', label='minimum', color='b', markersize=15)
    ax1.legend(loc='best')

    ax2.plot(X, lin_mse_val, color='r', label='Validation set')
    ax2.plot(X, lin_mse_train, label='Training set')
    ax2.set_ylabel('MSE')
    ax2.set_xlabel('Number of Features')
    ax2.plot(np.argmin(lin_mse_val) + 4, min(lin_mse_val), '*', label='minimum', color='b', markersize=15)
    ax2.legend(loc='best')
    plt.show()
    ############Ridge Regression###############
    print('##############Ridge Regression(without CV)###############')

    X = np.linspace(-3, 1, 30)
    cols = abs(corrmat).nlargest(19, 'Life_expectancy')[
        'Life_expectancy'].index  # Select the 18 features most correlated with life expectancy
    related_col = cols.drop(['Life_expectancy'])
    x_train = x_train1[related_col]
    x_val = x_val1[related_col]
    rid_mse_val = []
    rid_mse_train = []
    rid_error_val = []
    rid_error_train = []
    for i in X:
        ridge = linear_model.Ridge(alpha=10 ** i, normalize=True)
        ridge.fit(x_train, y_train)
        ridge_predict_val = ridge.predict(x_val)
        ridge_predict_train = ridge.predict(x_train)
        ridge_acc_val = ridge.score(x_val, y_val)
        ridge_acc_train = ridge.score(x_train, y_train)
        rid_mse_val = np.append(rid_mse_val, mean_squared_error(y_val, ridge_predict_val))
        rid_mse_train = np.append(rid_mse_train, mean_squared_error(y_train, ridge_predict_train))
        rid_error_val = np.append(rid_error_val, 1 - ridge_acc_val)
        rid_error_train = np.append(rid_error_train, 1 - ridge_acc_train)
    # print('error rate in validation set =')
    # print(rid_error_val)
    # print('MSE in validation set')
    # print(rid_mse_val)
    print('We can find the min error rate= %.4f and min MSE= %.4f when alpha= %.6f ' % (
    min(rid_error_val), min(rid_mse_val), 10 ** X[np.argmin(rid_mse_val)]))
    # print('alpha=',X)
    fig, (ax1, ax2) = plt.subplots(nrows=2, ncols=1)
    fig.suptitle('Ridge Regression (Without CV)')
    ax1.plot(X, rid_error_train, label='Training set')
    ax1.plot(X, rid_error_val, color='r', label='Validation set')
    ax1.set_ylabel('Error Rate')
    ax1.set_xlabel('log(alpha)')
    ax1.plot(X[np.argmin(rid_error_val)], min(rid_error_val), '*', label='minimum', color='b', markersize=15)
    ax1.legend(loc='best')

    ax2.plot(X, rid_mse_train, label='Training set')
    ax2.plot(X, rid_mse_val, color='r', label='Validation set')
    ax2.set_ylabel('MSE')
    ax2.set_xlabel('log(alpha)')
    ax2.plot(X[np.argmin(rid_mse_val)], min(rid_mse_val), '*', label='minimum', color='b', markersize=15)
    ax2.legend(loc='best')
    plt.show()

    print('##############Ridge Regression(with CV)###############')
    X = np.linspace(-3, 1, 30)
    cols = abs(corrmat).nlargest(19, 'Life_expectancy')[
        'Life_expectancy'].index  # Select the 18 features most correlated with life expectancy
    related_col = cols.drop(['Life_expectancy']).drop(['Status'])

    #print('in ridge regression')
    #print('related col=', related_col)
    xx = x1[related_col]
    ridCV_err = np.zeros([len(X), 6])
    ridCV_mse = np.zeros([len(X), 6])
    for i in range(5, 11):  # column
        kfold = KFold(n_splits=i, shuffle=True)
        for j in range(len(X)):  # row
            ridgeCV = linear_model.RidgeCV(alphas=10 ** X, normalize=True)
            ridCV_neg_mse = cross_val_score(ridgeCV, xx, y, cv=kfold, scoring='neg_mean_squared_error')
            ridCV_score = cross_val_score(ridgeCV, xx, y, cv=kfold, scoring='r2')
            # ridCV_err[j][i-5] = 1- np.mean(ridCV_score)
            ridCV_mse[j][i - 5] = np.mean(ridCV_neg_mse) * (-1)
    min_err_index = np.unravel_index(ridCV_err.argmin(), ridCV_err.shape)
    min_mse_index = np.unravel_index(ridCV_mse.argmin(), ridCV_mse.shape)
    print('When we use Ridge Regression with cross validation')
    print('We got the min MSE value= %.3f when we applied %.0f fold and alpha = %.5f' % (
    ridCV_mse.min(), min_mse_index[1] + 5, 10 ** X[min_mse_index[0]]))
    print('')
    bestK_alpha_mse = ridCV_mse[:, min_mse_index[1]].reshape((ridCV_mse[:, min_mse_index[1]].shape[0], 1))
    # bestK_alpha_err = ridCV_err[:,min_err_index[1]].reshape((ridCV_err[:,min_err_index[1]].shape[0],1))
    fig, ax2 = plt.subplots(nrows=1, ncols=1)
    fig.suptitle('Ridge Regression (With CV when K= %.0f)' % (min_mse_index[1] + 5))

    # ax1.plot(X, bestK_alpha_err)
    ax1.set_ylabel('Error Rate')
    ax1.set_xlabel('log(alpha)')
    # ax1.plot(X[min_err_index[0]],bestK_alpha_err.min(),'*', label='minimum',color='b',markersize=15)
    ax1.legend(loc='best')

    ax2.plot(X, bestK_alpha_mse)
    ax2.set_ylabel('MSE')
    ax2.set_xlabel('log(alpha)')
    ax2.plot(X[min_mse_index[0]], bestK_alpha_mse.min(), '*', label='minimum', color='b', markersize=15)
    ax2.legend(loc='best')
    # plt.show()

    ###########Lasso Regression###############
    print('##############Lasso Regression(without CV)###############')
    X = np.linspace(-3, 1, 30)
    x_train = x_train1[related_col]
    x_val = x_val1[related_col]
    lasso_mse_val = []
    lasso_mse_train = []
    lasso_error_val = []
    lasso_error_train = []
    for i in X:
        lasso = linear_model.Lasso(alpha=10 ** i, normalize=True)
        lasso.fit(x_train, y_train)
        lasso_predict_val = lasso.predict(x_val)
        lasso_predict_train = lasso.predict(x_train)
        lasso_acc_val = lasso.score(x_val, y_val)
        lasso_acc_train = lasso.score(x_train, y_train)
        lasso_mse_val = np.append(lasso_mse_val, mean_squared_error(y_val, lasso_predict_val))
        lasso_mse_train = np.append(lasso_mse_train, mean_squared_error(y_train, lasso_predict_train))
        lasso_error_val = np.append(lasso_error_val, 1 - lasso_acc_val)
        lasso_error_train = np.append(lasso_error_train, 1 - lasso_acc_train)
    # print('error rate in validation set =')
    # print(lasso_error_val)
    # print('MSE in validation set')
    # print(lasso_mse_val)
    print('We can find the min error rate= %.4f and min MSE= %.4f when alpha= %.6f ' % (
    min(lasso_error_val), min(lasso_mse_val), 10 ** X[np.argmin(lasso_mse_val)]))

    fig, (ax1, ax2) = plt.subplots(nrows=2, ncols=1)
    fig.suptitle('Lasso Regression (Without CV)')
    ax1.plot(X, lasso_error_train, label='Training set')
    ax1.plot(X, lasso_error_val, color='r', label='Validation set')
    ax1.set_ylabel('Error Rate')
    ax1.plot(X[np.argmin(lasso_error_val)], min(lasso_error_val), '*', label='minimum', color='b', markersize=15)
    ax1.legend(loc='lower right')

    ax2.plot(X, lasso_mse_train, label='Training set')
    ax2.plot(X, lasso_mse_val, color='r', label='Validation set')
    ax2.set_ylabel('MSE')
    ax2.set_xlabel('log(alpha)')
    ax2.plot(X[np.argmin(lasso_mse_val)], min(lasso_mse_val), '*', label='minimum', color='b', markersize=15)
    plt.show()

    print('##############Lasso Regression(with CV)###############')
    X = np.linspace(-3, 1, 30)
    xx = x1[related_col]
    lassoCV_err = np.zeros([len(X), 6])
    lassoCV_mse = np.zeros([len(X), 6])
    for i in range(5, 11):  # column
        kfold = KFold(n_splits=i, shuffle=True)
        for j in range(len(X)):  # row
            lassoCV = linear_model.LassoCV(alphas=10 ** X, normalize=True)
            lassoCV_neg_mse = cross_val_score(lassoCV, xx, y, cv=kfold, scoring='neg_mean_squared_error')
            lassoCV_score = cross_val_score(lassoCV, xx, y, cv=kfold, scoring='r2')
            # lassoCV_err[j][i-5] = 1- np.mean(lassoCV_score)
            lassoCV_mse[j][i - 5] = np.mean(lassoCV_neg_mse) * (-1)
    # min_err_index=np.unravel_index(lassoCV_err.argmin(), lassoCV_err.shape)
    min_mse_index = np.unravel_index(lassoCV_mse.argmin(), lassoCV_mse.shape)
    print('When we use Lasso Regression with cross validation')
    print('We got the min MSE value = %.3f when we applied %.0f folds and alpha = %.5f' % (
        lassoCV_mse.min(), min_mse_index[1] + 5, 10 ** X[min_mse_index[0]]))

    bestK_alpha_mse = lassoCV_mse[:, min_mse_index[1]].reshape((lassoCV_mse[:, min_mse_index[1]].shape[0], 1))
    # bestK_alpha_err = lassoCV_err[:,min_err_index[1]].reshape((lassoCV_err[:,min_err_index[1]].shape[0],1))
    fig, ax2 = plt.subplots(nrows=1, ncols=1)
    fig.suptitle('Lasso Regression (With CV when K= %.0f)' % (min_mse_index[1] + 5))

    # error-rate panel disabled here as well (ax1 belongs to the previous figure)
    # ax1.plot(X, bestK_alpha_err)
    # ax1.set_ylabel('Error Rate')
    # ax1.set_xlabel('log(alpha)')
    # ax1.plot(X[min_err_index[0]],bestK_alpha_err.min(),'*', label='minimum',color='b',markersize=15)
    # ax1.legend(loc='best')

    ax2.plot(X, bestK_alpha_mse)
    ax2.set_ylabel('MSE')
    ax2.set_xlabel('log(alpha)')
    ax2.plot(X[min_mse_index[0]], bestK_alpha_mse.min(), '*', label='minimum', color='b', markersize=15)
    ax2.legend(loc='best')

    # plt.show()

    avg_ytest = np.mean(y_test)
    one_array = np.ones([len(y_test), 1])
    mean_arr = avg_ytest['Life_expectancy'] * one_array
    baseline_mse = mean_squared_error(y_test, mean_arr)
    baseline_err = 1 - r2_score(y_test, mean_arr)
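    # note: since mean_arr is the mean of y_test itself, r2_score(y_test, mean_arr) is exactly 0,
    # so baseline_err comes out as 1 by construction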

    print('Number of training data (original):', len(y))
    print('Number of test data:', len(y_test))
    print('Number of training data (New):', len(y_train))
    print('Number of validation data:', len(y_val))

    print('###################Final model: Random Forest Regression########################')
    cols = abs(corrmat).nlargest(21, 'Life_expectancy')['Life_expectancy'].index
    related_col = cols.drop(['Life_expectancy', 'Status', 'Hepatitis_B', 'infant_deaths', 'GDP',
                             'Measles', 'Population', 'percentage_expenditure', 'Diphtheria'])

    error_x = []
    error_test = []
    mse_x = []
    mse_test = []
    oob_error = []
    for j in range(10):
        pre_train, pre_X_train_pick, pre_y_train, pre_y_train_pick = train_test_split(x, y, test_size=1 / 3)
        RF = RandomForestRegressor(n_estimators=38, bootstrap=True, random_state=0, oob_score=True)
        RF.fit(pre_X_train_pick, pre_y_train_pick.values.ravel())
        predict_x = RF.predict(x)
        predict_test = RF.predict(x_test)
        acc_x = RF.score(x, y)
        acc_test = RF.score(x_test, y_test)
        error_x = np.append(error_x, 1 - acc_x)
        error_test = np.append(error_test, 1 - acc_test)
        oob_error = np.append(oob_error, 1 - RF.oob_score_)
        mse_x = np.append(mse_x, mean_squared_error(predict_x, y))
        mse_test = np.append(mse_test, mean_squared_error(predict_test, y_test))
    mean_oob_err = np.mean(oob_error)
    meanerror_x = np.mean(error_x)
    meanerror_test = np.mean(error_test)
    mean_mse_x = np.mean(mse_x)
    mean_mse_test = np.mean(mse_test)
    var_mse_test = np.var(mse_test)
    var_err_test = np.var(error_test)
    print('In our final model (38 trees):')
    print('In the whole training data')
    print('Mean MSE =', mean_mse_x)
    print('Mean Error Rate =', meanerror_x)
    print('In the test data')
    print('Mean MSE = %.3f with variance = %.3f ' % (mean_mse_test, var_mse_test))
    print('Mean Error Rate = %.5f with variance = %.5f ' % (meanerror_test, var_err_test))
    print('Out-Of-Bag (OOB) Error = ', mean_oob_err)
    print('####################Baseline######################')
    print('The baseline for the test data:')
    print('MSE = ', baseline_mse)
    print('Error Rate=', baseline_err)

    #Draw the 2D plot
    feature = ['HIV_AIDS', 'Income_composition_of_resources', 'Adult_Mortality', 'Schooling']
    for i in feature:
        # a single split and fit per feature suffices here; only this fitted model is plotted below
        pre_train, pre_X_train_pick, pre_y_train, pre_y_train_pick = train_test_split(x, y, test_size=1 / 3)
        RF = RandomForestRegressor(n_estimators=38, bootstrap=True, random_state=0)
        RF.fit(pre_X_train_pick[[i]], pre_y_train_pick.values.ravel())
        predict_x = RF.predict(x[[i]])
        predict_test = RF.predict(x_test[[i]])
        X_grid = np.arange(min(x[i]), max(x[i]), 0.001)
        # reshape the data into a len(X_grid) x 1 array,
        # i.e. make a column vector out of the X_grid values
        X_grid = X_grid.reshape((len(X_grid), 1))

        # Scatter plot for original data
        plt.scatter(x[i], y, color='blue', label='training data points')
        # plot predicted data
        plt.plot(X_grid, RF.predict(X_grid), color='green', label='regression function')
        plt.title('Random Forest Regression')
        plt.xlabel(i)
        plt.ylabel('Life expectancy')
        plt.legend(loc='best')
        plt.show()
pylab.ylabel("Ridge_error_Tst")

pylab.figure(4)
pylab.plot([0, 0.1, 1, 10, 100, 1000], lasso_error_Tst)
pylab.xlabel("lambda")
pylab.ylabel("lasso_error_Tst")

pylab.show()

# running Cross Validation for Ridge and lasso and extracting the best fitted lambda/alpha
R_Trn = linear_model.RidgeCV(fit_intercept=False, cv=5)
Ridge_Trn = R_Trn.fit(var_Trn, price_Trn)
Ridge_Trn.alpha = Ridge_Trn.alpha_
print "The best alpha for Ridge_train is: \n", Ridge_Trn.alpha, "\n"

l_Trn = linear_model.LassoCV(fit_intercept=False, cv=5)
lasso_Trn = l_Trn.fit(var_Trn, price_Trn)
lasso_Trn.alpha = lasso_Trn.alpha_
print "The best alpha for lasso_train is: \n", lasso_Trn.alpha, "\n"

# extracting the w using the best fitted lambda for both Ridge and lasso
R_Trn = linear_model.Ridge(alpha=Ridge_Trn.alpha, fit_intercept=False)
Ridge_Trn = R_Trn.fit(var_Trn, price_Trn)
print "The best fitted coefs for Ridge_train is: \n", Ridge_Trn.coef_, "\n"

l_Trn = linear_model.Lasso(alpha=lasso_Trn.alpha, fit_intercept=False)
lasso_Trn = l_Trn.fit(var_Trn, price_Trn)
print "The best fitted coefs for lasso_train is: \n", lasso_Trn.coef_, "\n"

# fitting the obtained models to the test dataset and measuring errors
R_Tst = linear_model.Ridge(alpha=Ridge_Trn.alpha, fit_intercept=False)
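# (Sketch, not from the original snippet: one way to measure test error with the
#  CV-selected alphas. var_Tst and price_Tst are assumed names for the held-out design
#  matrix and prices; they do not appear above, so this is left commented out.)
# Ridge_Tst = R_Tst.fit(var_Trn, price_Trn)
# print "Ridge test MSE: ", np.mean((Ridge_Tst.predict(var_Tst) - price_Tst) ** 2)
# l_Tst = linear_model.Lasso(alpha=lasso_Trn.alpha, fit_intercept=False)
# lasso_Tst = l_Tst.fit(var_Trn, price_Trn)
# print "Lasso test MSE: ", np.mean((lasso_Tst.predict(var_Tst) - price_Tst) ** 2)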
def regression(XTrain, betaTrain, XTest):
    model = linear_model.LassoCV(
        cv=10, alphas=[0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1, 5, 10])
    model.fit(XTrain, betaTrain)
    Beta = model.predict(XTest)
    return [i for i in Beta]
def model_selection():
    # This is to avoid division by zero while doing np.log10
    EPSILON = 1e-5

    # #############################################################################
    # LassoLarsIC: least angle regression with BIC/AIC criterion

    model_bic = linear_model.LassoLarsIC(criterion='bic')
    model_bic.fit(data, label)
    # alpha_bic_ = model_bic.alpha_

    model_aic = linear_model.LassoLarsIC(criterion='aic')
    model_aic.fit(data, label)
    # alpha_aic_ = model_aic.alpha_

    plt.figure()
    plot_ic_criterion(model_aic, 'AIC', 'b', EPSILON)
    plot_ic_criterion(model_bic, 'BIC', 'r', EPSILON)
    plt.legend()
    plt.title('Information-criterion for model selection')
    plt.savefig('information_criterion_model_selection.png')

    # #############################################################################
    # LassoCV: coordinate descent

    # Compute paths
    model = linear_model.LassoCV(cv=10).fit(data, label)

    # Display results
    m_log_alphas = -np.log10(model.alphas_ + EPSILON)

    plt.figure()
    ymin, ymax = 20, 300
    plt.plot(m_log_alphas, model.mse_path_, ':')
    plt.plot(m_log_alphas,
             model.mse_path_.mean(axis=-1),
             'k',
             label='Average across the folds',
             linewidth=2)
    plt.axvline(-np.log10(model.alpha_ + EPSILON),
                linestyle='--',
                color='k',
                label='alpha: CV estimate')

    plt.legend()

    plt.xlabel('-log(alpha)')
    plt.ylabel('Mean square error')
    plt.title('Mean square error on each fold: coordinate descent ')
    plt.axis('tight')
    plt.ylim(ymin, ymax)
    plt.savefig('lasso_model_selection.png')

    # #############################################################################
    # LassoLarsCV: least angle regression

    # Compute paths
    model = linear_model.LassoLarsCV(cv=10).fit(data, label)

    # Display results
    m_log_alphas = -np.log10(model.cv_alphas_ + EPSILON)

    plt.figure()
    plt.plot(m_log_alphas, model.mse_path_, ':')
    plt.plot(m_log_alphas,
             model.mse_path_.mean(axis=-1),
             'k',
             label='Average across the folds',
             linewidth=2)
    plt.axvline(-np.log10(model.alpha_),
                linestyle='--',
                color='k',
                label='alpha CV')
    plt.legend()

    plt.xlabel('-log(alpha)')
    plt.ylabel('Mean square error')
    plt.title('Mean square error on each fold: Lars')
    plt.axis('tight')
    plt.ylim(ymin, ymax)
    plt.savefig('lasso_Lars_model_selection.png')
Exemple #20
0
     ),
    ])

X, y = make_xy_data('./data/merged_data.csv', ['surface_m2', 'piece'])

X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size=0.2,
                                                    random_state=2)

X_tr = features.fit_transform(X_train, None)
X_te = features.transform(X_test)

###############################################################################

t1 = time.time()
model = lm.LassoCV(cv=20, verbose=2).fit(X_tr, y_train)
t = time.time() - t1

# Display results
m_log_alphas = -np.log10(model.alphas_)
plt.figure()
plt.plot(m_log_alphas, model.mse_path_, ':')
plt.plot(m_log_alphas, model.mse_path_.mean(axis=-1), 'k',
         label='Average across the folds', linewidth=2)
plt.axvline(-np.log10(model.alpha_), linestyle='--', color='k',
            label='alpha: CV estimate')
plt.legend()
plt.xlabel('-log(alpha)')
plt.ylabel('Mean square error')
plt.title('Mean square error on each fold: coordinate descent'
          ' (train time: %.2fs)' % t)
Exemple #21
0
# number of features = number of columns - 1 (the 1 being the output label)
numFeatures = dataframe.shape[1] - 1
print(numFeatures)
X = dataframe[features].values
Y = dataframe[output_label]
# prepare configuration for cross validation test harness
num_folds = 10
seed = 7
# prepare models
models = []
models.append(('LR', LinearRegression()))
models.append(('Ridge', Ridge()))
#models.append(('ARDRegression', linear_model.ARDRegression()))
models.append(('Lasso', linear_model.Lasso()))
models.append(('LassoCV', linear_model.LassoCV()))
models.append(('LassoLars', linear_model.LassoLars()))
# Decision tree
models.append(('Dec tree', tree.DecisionTreeRegressor()))

# sanity check
models.append(('Dummy', DummyRegressor("median")))


def keras_baseline_model():
    # create model
    model = Sequential()
    model.add(
        Dense(128, input_dim=numFeatures, init='normal', activation='relu'))
    model.add(Dense(1, init='normal', activation="relu"))
    # Compile model
Exemple #22
0
 def _get_cv_model(self, alphas=None, kfold=None, l1_ratio=None, **kargs):
     return linear_model.LassoCV(alphas=alphas, cv=kfold, **kargs)
 
# load data
#path=#pwd#'path to the file/'
df=pd.read_csv('ex1data2.txt',header=None)
df.columns=['Size','Bedrooms','Price'] # rename columns


## Inputs (X) and labels (y) (house size and number of bedrooms vs. price)
y=np.array(df['Price'])
X=np.array(df.drop(['Price'],axis=1))
X=X.astype('float64')
Sscaler=preprocessing.StandardScaler()
Xs=Sscaler.fit_transform(X)

# Robust scaler is very helpful in handling outliers
#Rscaler=preprocessing.RobustScaler()
#Xr=Rscaler.fit_transform(X)

# Lasso regression model with cross-validated alpha
Lreg=linear_model.LassoCV(eps=0.08,max_iter=400,tol=1e-5)
#
Lreg.fit(Xs,y)
#

#print('------ Multivariate Linear Regression------------')
print('R^2 of the Lasso model (in %) is ',round(Lreg.score(Xs,y)*100,2))
#
# predicting the price of a 1650 sq. feet house with 3 bedrooms
Predict1=Lreg.predict(Sscaler.transform(np.reshape([1650,3],(1,-1))))
print('Predicted price of a house with 1650 sq. feet and 3 bed room is $', round(Predict1[0],2))
 def __fit(self):
     X_train, X_test, y_train, y_test = train_test_split(self.data, self.target, test_size=0.33, random_state=42)
     self.lr = linear_model.LassoCV()
     self.lr.fit(X_train, y_train)
Exemple #25
0
fig1 = plt.figure(figsize=(12, 8))
ax1 = fig1.add_subplot(111)
ax1.set_xscale('log')
ax1.plot(penalisations, coeffs)
plt.xlabel("Penalisations")
plt.ylabel("thetas")

# Question 5

# Determination of the penalisation factor with cross-validation

#lasscv = linear_model.LassoCV(cv=20)

lassCV = linear_model.LassoCV(alphas=penalisations,
                              fit_intercept=False,
                              normalize=False)
lassCV.fit(X, y)

# Question 5b
print "Lasso with CV : "
print "Penalisation trouvée par CV : " + str(lassCV.alpha_)

# Determining the smallest penalisation factor for which all coefficients are equal to 0.
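# A minimal sketch (not in the original snippet), assuming the same X and y as above and
# fit_intercept=False: for scikit-learn's Lasso objective
#   (1 / (2 * n_samples)) * ||y - Xw||^2 + alpha * ||w||_1,
# every coefficient is driven to 0 as soon as alpha >= max|X^T y| / n_samples.
alpha_max = np.max(np.abs(np.dot(np.transpose(X), y))) / X.shape[0]
print "Smallest penalisation zeroing all coefficients : " + str(alpha_max)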

# Question 5c
x_test = np.array([6, 0.3, 0.2, 6, 0.053, 25, 149, 0.9934, 3.24, 0.35, 10]).reshape(1, -1)

print "Prediction : "
print lassCV.predict(x_test)
Exemple #26
0
lr.score(XDF.values, y)
from sklearn import linear_model as lm
r = lm.Ridge().fit(XDF.values, y)
get_ipython().run_line_magic('pinfo', 'r.score')
r.score(XDF.values, y)
r = lm.Ridge(alpha=0.5).fit(XDF.values, y)
r.score(XDF.values, y)
get_ipython().run_line_magic('pinfo', 'lm.RidgeCV')
get_ipython().run_line_magic('pinfo', 'lm.RidgeCV')
rcv = lm.RidgeCV(alphas=[0.001, 0.01, 0.1, 1, 10, 100, 1000], cv=5)
rcv.fit(XDF.values, y)
rcv
rcv.score(XDF.values, y)
get_ipython().set_next_input('lasso = lm.LassoCV')
get_ipython().run_line_magic('pinfo', 'lm.LassoCV')
lasso = lm.LassoCV(n_jobs=-1, cv=5)
lasso.fit(XDF.values, y)
lasso.score(XDF.values, y)
lasso.coef_
rcv.coef_
ecv = lm.ElasticNetCV()
ecv.fit(XDF.values, y)
ecv.score(XDF.values, y)
from sklearn.feature_selection import RFECV
RFECV.head()
RFECV
lr = lm.LinearRegression()
rfecv = RFECV()
rfecv = RFECV(lr, cv=5, n_jobs=-1)
rfecv.fit(XDF.values, y)
rfecv.grid_scores_
Exemple #27
0
    df = pd.read_csv("input_data.csv")  # read the csv with pandas
    df = pd.DataFrame(df)               # wrap it as a DataFrame
    df = df.loc[df["play_time"] >= df["play_time"].median()]  # keep the upper half of the data by play time
    data_y = df["probability_6man"]     # extract the column used as y from the DataFrame
    drop_idx = ["player_id", "frag_starting", "play_time", "frag_6man",
                "period", "position", "probability_6man"]  # columns not used as x
    data_x = df.drop(drop_idx, axis=1)  # extract the columns used as x from the DataFrame

    data_y = np.array(data_y, dtype=float)  # convert to numpy arrays so they can be used in computations
    data_x = np.array(data_x, dtype=float)

    data_x = (data_x - data_x.mean(axis=0)) / data_x.std(axis=0)  # standardize x

    # Estimate the hyperparameter by cross-validation
    # Define lasso_cv as LassoCV from sklearn's linear_model (one of the cross-validation
    # approaches for Lasso; there is another one besides LassoCV, but its function name escapes me...)
    lasso_cv = linear_model.LassoCV()
    lasso_cv.fit(data_x, data_y)  # fit lasso_cv on data_x and data_y
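    # (Hedged aside, not in the original: the other cross-validation helper the comment
    #  above cannot recall is possibly linear_model.LassoLarsCV, which selects alpha along
    #  the LARS path and is used elsewhere in this collection. A commented sketch:)
    # lasso_lars_cv = linear_model.LassoLarsCV(cv=5)
    # lasso_lars_cv.fit(data_x, data_y)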

    # Use the alpha estimated by cross-validation as alpha
    # (note: the active line below actually sets alpha=0, i.e. an unpenalized fit)
    # lasso = linear_model.Lasso(alpha=lasso_cv.alpha_)
    lasso = linear_model.Lasso(alpha=0)
    lasso.fit(data_x, data_y)
    c = np.array(lasso.coef_)  # c is the vector of coefficients

    # Display the results
    print("Appropriate hyperparameter alpha estimated by cross-validation:")
    print(lasso_cv.alpha_)  # show the cross-validation result, i.e. the hyperparameter alpha
    print()  # blank line
    print("Parameters estimated by LASSO:")
    print(c)
    print()  # blank line
Exemple #28
0
# -------------------------------------------------
# <editor-fold desc="COMMON VARIABLES">

smoothing_time = 0.2  # The smoothing window for both the firing rates and the distance to poke time series
smoothing_frames = int(smoothing_time / 0.00833)

fr_final_smoothing_time = 1  # The final smoothing window of the firing rates
fr_extra_smoothing_frames = int(fr_final_smoothing_time / smoothing_time)

leave_percentage_out = 0.005

model = pipeline.make_pipeline(PolynomialFeatures(2),
                               linear_model.LinearRegression())
model_npb_dtp = pipeline.make_pipeline(
    PolynomialFeatures(2), linear_model.LassoCV(cv=None, fit_intercept=True))

common_pb_npb_dtp_neuron_indices = np.intersect1d(
    correlated_neuron_indices['pb_dtp'], correlated_neuron_indices['npb_dtp'])

correlated_neuron_indices_unique_pb_dtp = np.delete(
    correlated_neuron_indices['pb_dtp'],
    np.argwhere(
        np.isin(correlated_neuron_indices['pb_dtp'],
                common_pb_npb_dtp_neuron_indices)))
correlated_neuron_indices_unique_npb_dtp = np.delete(
    correlated_neuron_indices['npb_dtp'],
    np.argwhere(
        np.isin(correlated_neuron_indices['npb_dtp'],
                common_pb_npb_dtp_neuron_indices)))
Exemple #29
0
The original derivation is not given here, because LASSO regression differs only slightly
from ridge regression; the main difference between the two is the regularization term.
In terms of the cost function:
    Ridge regression: 1/(2*m)[sum(i=1->m)(hθ(xi)-yi)^2] + λ*sum(j=0->n)θj^2
    LASSO regression: 1/(2*m)[sum(i=1->m)(hθ(xi)-yi)^2] + λ*sum(j=0->n)|θj|
LASSO regression is more interpretable: it sets to 0 the coefficients of features that are
linearly correlated with other features.
The detailed mathematics is not elaborated here.
"""

import numpy as np
from sklearn import linear_model
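
# A small illustrative sketch (not part of the original snippet): the two cost functions
# written out with numpy, following the formulas in the docstring above (which include
# θ0 in the penalty term). X is assumed to already contain a leading column of ones.
def ridge_cost(X, y, theta, lam):
    m = len(y)
    r = X.dot(theta) - y
    return r.dot(r) / (2 * m) + lam * np.sum(theta ** 2)


def lasso_cost(X, y, theta, lam):
    m = len(y)
    r = X.dot(theta) - y
    return r.dot(r) / (2 * m) + lam * np.sum(np.abs(theta))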

# Load the data
data = np.genfromtxt("longley.csv", delimiter=",")
x_data = data[1:, 2:]
y_data = data[1:, 1]

# Train LASSO (alpha chosen by cross-validation)
model = linear_model.LassoCV()
model.fit(x_data, y_data)

# selected regularization strength alpha
print("Lasso alpha: ", model.alpha_)
# regression coefficients
# a zero among the printed coefficients suggests multicollinearity
print("Coefficients: ", model.coef_)

# Make a prediction
predict = model.predict(x_data[-2, np.newaxis])
print(predict)
Exemple #30
0
X_test = np.array(X_test)
Y_test = []
for y in raw_test:
    Y_test.append(y['Value'])
Y_test = np.array(Y_test, np.double)

# train
cvParams = [
    0.000003, 0.00001, 0.00003, 0.0001, 0.0003, 0.001, 0.003, 0.01, 0.03, 0.1,
    0.3
]
clf1 = linear_model.RidgeCV(alphas=cvParams,
                            normalize=True,
                            scoring='mean_squared_error')
clf1.fit(X, Y)
clf2 = linear_model.LassoCV(alphas=cvParams, normalize=True, max_iter=2000)
clf2.fit(X, Y)
clf3 = linear_model.ElasticNetCV(max_iter=2000, eps=0.0001)
clf3.fit(X, Y)
print 'Ridge:', clf1.coef_, clf1.intercept_, clf1.alpha_
print 'Lasso:', clf2.coef_, clf2.intercept_, clf2.alpha_, np.min(
    clf2.mse_path_)
print 'ElasticNet:', clf3.coef_, clf3.intercept_, clf3.alpha_, np.min(
    clf3.mse_path_)

# test
print 'Ridge:', np.mean((clf1.predict(X_test) - Y_test)**2)
print 'Lasso:', np.mean((clf2.predict(X_test) - Y_test)**2)
print 'ElasticNet:', np.mean((clf3.predict(X_test) - Y_test)**2)

# plot