Example #1
YData = df_train.SalePrice

#Applying Lasso Model

from sklearn.model_selection import cross_val_score  # sklearn.cross_validation was removed in scikit-learn 0.20
from sklearn.linear_model import Ridge, RidgeCV, ElasticNet, Lasso, LassoCV, LassoLarsCV, LinearRegression


def rmse_cv(model):
    rmse = np.sqrt(-cross_val_score(
        model, trainData, YData, scoring="neg_mean_squared_error", cv=5))
    return (rmse)


model_lasso = LassoCV(alphas=[1, 0.1, 0.001, 0.0005],
                      selection='random',
                      max_iter=15000).fit(trainData, YData)
res = rmse_cv(model_lasso)
#print(res)
print("Lasso Mean:", res.mean())
print("Lasso Min: ", res.min())

coef = pd.Series(model_lasso.coef_, index=trainData.columns)
#print("Lasso picked " + str(sum(coef != 0)) + " variables and eliminated the other " +  str(sum(coef == 0)) + " variables")

imp_coef = pd.concat(
    [coef.sort_values().head(10),
     coef.sort_values().tail(10)])
matplotlib.rcParams['figure.figsize'] = (8.0, 10.0)
imp_coef.plot(kind="barh", color='r')
plt.title("Coefficients used in the Lasso Model")
#trainFilled.plot.scatter(x='TotalBsmtSF', y='SalePrice')
#trainFilled.plot.scatter(x='MasVnrArea', y='SalePrice')

numRuns = 1
errors = [0] * numRuns
ridgeCV = RidgeCV(alphas=[21, 22, 23, 24, 25, 26, 27, 28, 29, 30],
                  scoring="neg_mean_squared_error",
                  normalize=False,
                  cv=nFolds)
ridge = Ridge(normalize=False)
alphasRidge = {"alpha": [13, 14, 15, 16, 17, 20, 21, 22, 23, 24, 25]}

alphasLasso = {"alpha": [0.0005, 0.001, 0.005]}
lasso = Lasso(max_iter=1000)
lassoCV = LassoCV(alphas=[0.0005, 0.001, 0.005],
                  normalize=False,
                  cv=nFolds,
                  max_iter=1000)
#lassoCV.fit(trainFilled.drop(columns='SalePrice'), trainFilled['SalePrice'])

testModel = ridge
testParams = alphasRidge
predModel = ridgeCV

Xtrain = trainFilled.drop(columns=['SalePrice', 'Id'])
ytrain = trainFilled['SalePrice']

testFilled = testFilled.drop(columns=['Id'])
print(trainFilled.shape)
print(testFilled.shape)

# error = nestedCrossValidation(Xtrain, ytrain,
Example #3
def Lasso_Mode(X_train, y_train, X_test, y_test, num_class):
    algo_name = 'Lasso Regression'
    lasso_model = LassoCV(alphas=[0.01, 0.05, 0.10, 0.20, 0.50, 1])
    lasso_model.fit(X_train, y_train)
    y_pred_lm = lasso_model.predict(X_test)
    PRAF(y_test, y_pred_lm, num_class, algo_name)  # PRAF: external metrics/reporting helper defined elsewhere in the source project
# (New snippet; its opening was truncated. It defines the list of stage-1 base
#  models for the stacking ensemble used below; the constructor and leading
#  arguments of this first entry were lost and are reconstructed here as an
#  assumption, based on the entries that follow.)
base_models = [
    XGBRegressor(max_depth=15,
                 learning_rate=0.01,
                 subsample=1),
    XGBRegressor(seed=0,
                 n_estimators=800,
                 max_depth=15,
                 learning_rate=0.01,
                 subsample=1,
                 colsample_bytree=0.75),
    XGBRegressor(seed=0,
                 n_estimators=800,
                 max_depth=12,
                 learning_rate=0.01,
                 subsample=0.8,
                 colsample_bytree=0.75),
    LassoCV(alphas=[1, 0.1, 0.001, 0.0005, 0.0002, 0.0001, 0.00005]),
    KNeighborsRegressor(n_neighbors=5),
    KNeighborsRegressor(n_neighbors=10),
    KNeighborsRegressor(n_neighbors=15),
    KNeighborsRegressor(n_neighbors=25),
    LassoLarsCV(),
    ElasticNet(),
    SVR()
]

ensem = ensemble(n_folds=10,
                 stacker=linear_model.LinearRegression(),
                 base_models=base_models)

X_train, X_test, y_train = data_preprocess(train, test)
y_pred, score = ensem.fit_predict(X_train, X_test, y_train)
Example #5
def QuickML_Stacking(X_train, y_train, X_test='', modeltype='Regression', Boosting_Flag=False,
                     scoring='', verbose=0):
    """
    Quickly build Stacks of multiple model results
    Input must be a clean data set (only numeric variables, no categorical or string variables).
    """
    X_train = copy.deepcopy(X_train)
    X_test = copy.deepcopy(X_test)
    y_train = copy.deepcopy(y_train)
    start_time = time.time()
    seed = 99
    if len(X_train) <= 100000 or X_train.shape[1] < 50:
        NUMS = 100
        FOLDS = 5
    else:
        NUMS = 200
        FOLDS = 10
    ## create Stacking models
    estimators = []
    ### This keeps track of the number of predict_proba columns generated by each model ####
    estimator_length = []
    if isinstance(X_test, str):
        no_fit = True
    else:
        no_fit = False
    if no_fit:
        #### This is where you don't fit the model but just do cross_val_predict ####
        if modeltype == 'Regression':
            if scoring == '':
                scoring = 'neg_mean_squared_error'
            scv = KFold(n_splits=FOLDS, shuffle=False)
            if Boosting_Flag:
                ######    Linear SVR model if Boosting is chosen ####
                #model4 = BaggingRegressor(DecisionTreeRegressor(random_state=seed),
                #                            n_estimators=NUMS,random_state=seed)
                model4 = LinearSVR()
                results = cross_val_predict(model4,X_train,y_train, cv=scv,n_jobs=-1)
                estimators.append(('Linear_SVR',model4))
                estimator_length.append(1)
            elif Boosting_Flag is None:
                ####   Tree models if Linear chosen #####
                model5 = DecisionTreeRegressor(random_state=seed,min_samples_leaf=2)
                results = cross_val_predict(model5,X_train,y_train, cv=scv,n_jobs=-1)
                estimators.append(('Decision Trees',model5))
                estimator_length.append(1)
            else:
                ####   Linear Models if Bagging is chosen #####
                model6 = LassoCV(alphas=np.logspace(-5,-1,20), cv=scv,random_state=seed)
                results = cross_val_predict(model6,X_train,y_train, cv=scv,n_jobs=-1)
                estimators.append(('LassoCV Regularization',model6))
                estimator_length.append(1)
        else:
            n_classes = len(Counter(y_train))
            if n_classes > 2:
                #### In multi-class setting, it makes sense to turn it into binary class in stage-1
                #### In stage 2, a complex model will take the inputs of this model and try to predict
                rare_class = find_rare_class(y_train)
                if rare_class == 0:
                    majority_class = 1
                else:
                    majority_class = 0
                y_train = y_train.map(lambda x: rare_class if x==rare_class else majority_class)
            if scoring == '':
                scoring = 'accuracy'
            scv = StratifiedKFold(n_splits=FOLDS, random_state=seed, shuffle=True)
            if Boosting_Flag:
                ####   Linear Models if Boosting is chosen #####
                model4 = LinearDiscriminantAnalysis()
                results = cross_val_predict(model4,X_train,y_train, cv=scv,n_jobs=-1,
                                            method='predict_proba')
                estimators.append(('Linear Discriminant',model4))
                estimator_length.append(results.shape[1])
            elif Boosting_Flag is None:
                ####   Tree models if Linear chosen #####
                model6 = DecisionTreeClassifier(min_samples_leaf=2)
                results = cross_val_predict(model6,X_train,y_train, cv=scv,n_jobs=-1,
                                            method='predict_proba')
                estimators.append(('Decision Tree',model6))
                estimator_length.append(results.shape[1])
            else:
                ######    Naive Bayes models if Bagging is chosen ####
                if n_classes <= 2:
                    try:
                        model7 = GaussianNB()
                    except:
                        model7 = DecisionTreeClassifier(min_samples_leaf=2)
                else:
                    try:
                        model7 = MultinomialNB()
                    except:
                        model7 = DecisionTreeClassifier(min_samples_leaf=2)
                results = cross_val_predict(model7,X_train,y_train, cv=scv,n_jobs=-1,
                                            method='predict_proba')
                estimators.append(('Naive Bayes',model7))
                estimator_length.append(results.shape[1])
    else:
        #### This is where you fit the model and then predict ########
        if modeltype == 'Regression':
            if scoring == '':
                scoring = 'neg_mean_squared_error'
            scv = KFold(n_splits=FOLDS, shuffle=False)
            if Boosting_Flag:
                ######    Linear SVR model if Boosting is chosen ####
                #model4 = BaggingRegressor(DecisionTreeRegressor(random_state=seed),
                #                            n_estimators=NUMS,random_state=seed)
                model4 = LinearSVR()
                results = model4.fit(X_train,y_train).predict(X_test)
                estimators.append(('Linear_SVR',model4))
                estimator_length.append(1)
            elif Boosting_Flag is None:
                ####   Tree models if Linear chosen #####
                model5 = DecisionTreeRegressor(random_state=seed,min_samples_leaf=2)
                results = model5.fit(X_train,y_train).predict(X_test)
                estimators.append(('Decision Trees',model5))
                estimator_length.append(1)
            else:
                ####   Linear Models if Bagging is chosen #####
                model6 = LassoCV(alphas=np.logspace(-5,-1,20), cv=scv,random_state=seed)
                results = model6.fit(X_train,y_train).predict(X_test)
                estimators.append(('LassoCV Regularization',model6))
                estimator_length.append(1)
        else:
            n_classes = len(Counter(y_train))
            if n_classes > 2:
                #### In multi-class setting, it makes sense to turn it into binary class in stage-1
                #### In stage 2, a complex model will take the inputs of this model and try to predict
                rare_class = find_rare_class(y_train)
                if rare_class == 0:
                    majority_class = 1
                else:
                    majority_class = 0
                y_train = y_train.map(lambda x: rare_class if x==rare_class else majority_class)
            if scoring == '':
                scoring = 'accuracy'
            scv = StratifiedKFold(n_splits=FOLDS, random_state=seed, shuffle=True)
            if Boosting_Flag:
                ####   Linear Models if Boosting is chosen #####
                model4 = LinearDiscriminantAnalysis()
                results = model4.fit(X_train,y_train).predict_proba(X_test)
                estimators.append(('Linear Discriminant',model4))
                estimator_length.append(results.shape[1])
            elif Boosting_Flag is None:
                ####   Tree models if Linear chosen #####
                model6 = DecisionTreeClassifier(min_samples_leaf=2)
                results = model6.fit(X_train,y_train).predict_proba(X_test)
                estimators.append(('Decision Tree',model6))
                estimator_length.append(results.shape[1])
            else:
                ######    Naive Bayes models if Bagging is chosen ####
                if n_classes <= 2:
                    try:
                        model7 = GaussianNB()
                    except:
                        model7 = DecisionTreeClassifier(min_samples_leaf=2)
                else:
                    try:
                        model7 = MultinomialNB()
                    except:
                        model7 = DecisionTreeClassifier(min_samples_leaf=2)
                results = model7.fit(X_train,y_train).predict_proba(X_test)
                estimators.append(('Naive Bayes',model7))
                estimator_length.append(results.shape[1])
    #stacks = np.c_[results1,results2,results3]
    estimators_list = [(tuples[0],tuples[1]) for tuples in estimators]
    estimator_names = [tuples[0] for tuples in estimators]
    #### Here is where we consolidate the estimator names and their results into one common list ###
    ls = []
    for x,y in dict(zip(estimator_names,estimator_length)).items():
        els = [x+'_'+str(eachy) for eachy in range(y)]
        ls += els
    if verbose == 1:
        print('    Time taken for Stacking: %0.1f seconds' %(time.time()-start_time))
    return ls, results
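
# Usage sketch (not part of the original snippet): a minimal call to
# QuickML_Stacking on toy data. It assumes numpy/pandas and the scikit-learn
# imports used inside the function are available in the module; the column
# names and data below are purely illustrative.
import numpy as np
import pandas as pd

rng = np.random.RandomState(0)
X_demo = pd.DataFrame(rng.randn(200, 5), columns=['f0', 'f1', 'f2', 'f3', 'f4'])
y_demo = pd.Series(2.0 * X_demo['f0'] - X_demo['f3'] + 0.1 * rng.randn(200))

# With X_test='' the function does not fit on a test set; it only produces
# out-of-fold predictions (cross_val_predict) for a single stage-1 model.
stack_cols, oof_preds = QuickML_Stacking(X_demo, y_demo, X_test='',
                                         modeltype='Regression',
                                         Boosting_Flag=False, verbose=1)
print(stack_cols, oof_preds.shape)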
Example #6
def train_linear_model(X,
                       y,
                       random_state=1,
                       test_size=0.2,
                       regularization_type='elasticnet',
                       k_fold=5,
                       max_iter=1000000,
                       tol=0.0001,
                       l1_ratio=None):
    """
    Function to train linear model with regularization and cross-validation.

    Args:
        X (pandas.DataFrame): dataframe of descriptors.
        y (pandas.DataFrame): dataframe of cycle lifetimes.
        random_state (int): seed for train/test split.
        test_size (float): proportion of the dataset reserved for model evaluation.
        regularization_type (str): lasso or ridge or elastic-net (with cv).
        k_fold (int): k in k-fold cross-validation.
        max_iter (int): maximum number of iterations for model fitting.
        tol (float): tolerance for optimization.
        l1_ratio ([float]): list of lasso to ridge ratios for elasticnet.

    Returns:
        linear_model: the fitted scikit-learn linear model.
        mu (pandas.Series): mean of each descriptor in the training split.
        s (pandas.Series): standard deviation of each descriptor in the training split.
        relative_prediction_error (numpy.ndarray): half-width of the 95% confidence
            interval on the prediction/actual ratio.
        Rsq (float): R^2 of the fitted model on the held-out test split.
        hyperparameters (dict): settings used, including the selected alpha (and l1_ratio).

    """
    if l1_ratio is None:
        l1_ratio = [.1, .5, .7, .9, .95, 1]
    X_train, X_test, y_train, y_test = \
        train_test_split(X, y, test_size=test_size, random_state=random_state)

    # Standardize (training) data after train/test split
    mu = np.mean(X_train, axis=0)
    s = np.std(X_train, axis=0)
    X_scaled = (X_train - mu) / s
    hyperparameters = {
        'random_state': random_state,
        'test_size': test_size,
        'k_fold': k_fold,
        'tol': tol,
        'max_iter': max_iter
    }
    if regularization_type == 'lasso' and y.shape[1] == 1:
        lassocv = LassoCV(fit_intercept=True,
                          alphas=None,
                          tol=tol,
                          cv=k_fold,
                          max_iter=max_iter)
        lassocv.fit(X_scaled, y_train.values.ravel())
        # Set optimal alpha and refit model
        alpha_opt = lassocv.alpha_
        linear_model = Lasso(fit_intercept=True,
                             alpha=alpha_opt,
                             max_iter=max_iter)
        linear_model.fit(X_scaled, y_train.values)
        hyperparameters['l1_ratio'] = 1

    elif regularization_type == 'ridge' and y.shape[1] == 1:
        ridgecv = RidgeCV(fit_intercept=True, alphas=None, cv=k_fold)
        ridgecv.fit(X_scaled, y_train.values.ravel())
        # Set optimal alpha and refit model
        alpha_opt = ridgecv.alpha_
        linear_model = Ridge(fit_intercept=True, alpha=alpha_opt)
        linear_model.fit(X_scaled, y_train)
        hyperparameters['l1_ratio'] = 0

    elif regularization_type == 'elasticnet' and y.shape[1] == 1:
        elasticnetcv = ElasticNetCV(fit_intercept=True,
                                    normalize=False,
                                    alphas=None,
                                    cv=k_fold,
                                    l1_ratio=l1_ratio,
                                    max_iter=max_iter)
        elasticnetcv.fit(X_scaled, y_train.values.ravel())

        # Set optimal alpha and l1_ratio. Refit model
        alpha_opt = elasticnetcv.alpha_
        l1_ratio_opt = elasticnetcv.l1_ratio_
        linear_model = ElasticNet(fit_intercept=True,
                                  normalize=False,
                                  l1_ratio=l1_ratio_opt,
                                  alpha=alpha_opt,
                                  max_iter=max_iter)
        linear_model.fit(X_scaled, y_train)
        hyperparameters['l1_ratio'] = l1_ratio_opt

    # If more than 1 outcome present, perform multitask regression
    elif regularization_type == 'elasticnet' and y.shape[1] > 1:
        multi_elasticnet_CV = MultiTaskElasticNetCV(fit_intercept=True,
                                                    cv=k_fold,
                                                    normalize=False,
                                                    l1_ratio=l1_ratio,
                                                    max_iter=max_iter)
        multi_elasticnet_CV.fit(X_scaled, y_train)
        # Set optimal alpha and l1_ratio. Refit model
        alpha_opt = multi_elasticnet_CV.alpha_
        l1_ratio_opt = multi_elasticnet_CV.l1_ratio_
        linear_model = MultiTaskElasticNet(fit_intercept=True,
                                           normalize=False,
                                           max_iter=max_iter)
        linear_model.set_params(alpha=alpha_opt, l1_ratio=l1_ratio_opt)
        linear_model.fit(X_scaled, y_train)
        hyperparameters['l1_ratio'] = l1_ratio_opt
    else:
        raise NotImplementedError

    y_pred = linear_model.predict((X_test - mu) / s)
    Rsq = linear_model.score((X_test - mu) / s, y_test)
    # Compute 95% confidence interval
    # Multioutput = 'raw_values' provides prediction error per output
    pred_actual_ratio = [x / y for x, y in zip(y_pred, np.array(y_test))]
    relative_prediction_error = 1.96 * np.sqrt(
        mean_squared_error(np.ones(y_pred.shape),
                           pred_actual_ratio,
                           multioutput='raw_values') / y_pred.shape[0])
    hyperparameters['alpha'] = alpha_opt
    return linear_model, mu, s, relative_prediction_error, Rsq, hyperparameters
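
# Usage sketch (not in the original file): fit a cross-validated lasso on
# synthetic descriptors. It assumes numpy, pandas, train_test_split and the
# scikit-learn linear models are already imported by the module; the data and
# column names here are illustrative only.
import numpy as np
import pandas as pd

rng = np.random.RandomState(1)
X_demo = pd.DataFrame(rng.randn(120, 6), columns=['d0', 'd1', 'd2', 'd3', 'd4', 'd5'])
y_demo = pd.DataFrame({'cycle_life': 3.0 * X_demo['d0'] - X_demo['d2'] + 0.2 * rng.randn(120)})

model, mu, s, rel_err, rsq, hyper = train_linear_model(
    X_demo, y_demo, regularization_type='lasso', k_fold=5)
print(hyper['alpha'], rsq)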
Example #7
rcv.fit(X_train, Y_train)

print('\nBest RidgeCV alpha value:')
print(rcv.alpha_)

#Ridge regression using best alpha#
rbest = Ridge(alpha=rcv.alpha_, normalize=True)
rbest.fit(X_train, Y_train)

print('\nBest Ridge MSE:')
print(mean_squared_error(Y_test, rbest.predict(X_test)))

###Lasso regression###

#LassoCV with 10-fold cross-validation (similar to ISLR)#
lcv = LassoCV(alphas=None, max_iter=100000, normalize=True, cv=kfcv, n_jobs=2)
lcv.fit(X_train, Y_train)

print('\nBest LassoCV alpha value:')
print(lcv.alpha_)

#Lasso regression using best alpha#
lbest = Lasso(alpha=lcv.alpha_, normalize=True)
lbest.fit(X_train, Y_train)

print('\nBest Lasso MSE:')
print(mean_squared_error(Y_test, lbest.predict(X_test)))

print('\nLasso Coefficients:')
print(pd.Series(lbest.coef_, index=xcols))
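
# Note (added): the normalize=True argument used above was deprecated in
# scikit-learn 1.0 and removed in 1.2. On current versions a roughly
# equivalent setup scales the features explicitly in a pipeline, e.g.:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LassoCV

lcv_pipe = make_pipeline(StandardScaler(),
                         LassoCV(max_iter=100000, cv=10, n_jobs=2))
lcv_pipe.fit(X_train, Y_train)
print(lcv_pipe[-1].alpha_)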
Example #8
def test_linreg():
    for m in [LinearRegression(), Ridge(), Lars(), LarsCV(), ElasticNet(), ElasticNetCV(), \
              Lasso(), LassoCV(), LassoLars(), LassoLarsCV(), LassoLarsIC(), SVR(kernel='linear')]:
        verify(m.fit(X, y), ['predict'])
        if not isinstance(m, SVR):
            verify(m.set_params(fit_intercept=False).fit(X, y), ['predict'])
#
# Another possibility for taking correlated variables in the dataset into account
# is to estimate sparse coefficients. In some way we already did this manually
# when we dropped the AGE column in a previous Ridge estimation.
#
# Lasso models (see the :ref:`lasso` User Guide section) estimate sparse
# coefficients. LassoCV applies cross-validation in order to
# determine which value of the regularization parameter (`alpha`) is best
# suited for the model estimation.

from sklearn.linear_model import LassoCV

model = make_pipeline(
    preprocessor,
    TransformedTargetRegressor(
        regressor=LassoCV(alphas=np.logspace(-10, 10, 21), max_iter=100000),
        func=np.log10,
        inverse_func=sp.special.exp10,
    ),
)

_ = model.fit(X_train, y_train)

# %%
# First we verify which value of :math:`\alpha` has been selected.

model[-1].regressor_.alpha_

# %%
# Then we check the quality of the predictions.
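# (The snippet ends here; below is a minimal sketch of that check, assuming the
#  X_train/X_test/y_train/y_test split created earlier in the example.)
from sklearn.metrics import median_absolute_error

mae_train = median_absolute_error(y_train, model.predict(X_train))
mae_test = median_absolute_error(y_test, model.predict(X_test))
print(f"MedAE train: {mae_train:.2f}, MedAE test: {mae_test:.2f}")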
Example #10
print(X_train.shape)
X_test0 = X_test0[:, i_keep_columns]
print(X_test0.shape)
X_test1 = X_test1[:, i_keep_columns]
print(X_test1.shape)

sys.exit()  # NOTE: execution stops here, so the LASSO section below never runs in this script as written

################################################################################################################
### linear LASSO Model ###
################################################################################################################
# from sklearn import linear_model
from sklearn.linear_model import LassoCV
print("training")
# should be cv = 5
model = LassoCV(cv=5, verbose=1, n_jobs=-1).fit(X_train, y_train)
print("done training")

coef = model.coef_
print(type(coef))
print(len(coef))
np.savetxt(dir_lasso + "coef_lasso_linear_pronoun.txt", coef)

# predicted probability for a post being Female
'''ypred_train=model.predict_proba(X_train)[:,1] # Pr(female=1)
ypred_test0=model.predict_proba(X_test0)[:,1]
ypred_test1=model.predict_proba(X_test1)[:,1]

np.savetxt("linear_ypred_train.txt",ypred_train)
np.savetxt("linear_ypred_test0.txt",ypred_test0)
np.savetxt("linear_ypred_test1.txt",ypred_test1)'''
Example #11
            score_mae = metrics.mean_absolute_error(Y_cv, one_result)
            print('Fold [%s] norm. Gini = %0.5f, MAE = %0.5f' %
                  (i, cv_score, score_mae))
            blend_test_j[:, i] = clf.predict(X_test)
        blend_test[:, j] = blend_test_j.mean(1)
        print('Clf_%d Mean norm. Gini = %0.5f (%0.5f)' %
              (j, cv_results[j, ].mean(), cv_results[j, ].std()))

    end_time = datetime.now()
    time_taken = (end_time - start_time)
    print("Time taken for pre-blending calculations: {0}".format(time_taken))
    print("CV-Results", cv_results)
    print("Blending models.")

    bclf = LassoCV(n_alphas=100,
                   alphas=None,
                   normalize=True,
                   cv=5,
                   fit_intercept=True,
                   max_iter=10000,
                   positive=True)
    bclf.fit(blend_train, Y_dev)

    Y_test_predict = bclf.predict(blend_test)

    cv_score = cv_results.mean()
    print('Avg. CV-Score = %s' % (cv_score))
    submission = pd.DataFrame({"Id": test_ids, "Hazard": Y_test_predict})
    submission = submission.set_index('Id')
    submission.to_csv("farons_solution.csv")
Example #12
def regression_regularization(Xs, y):
    #---------------------------------------------
    # RidgeCV
    #----------------------------------------------
    # optimal value for Ridge regression alpha using RidgeCV
    ridge_alphas = np.logspace(0, 5, 200)

    optimal_ridge = RidgeCV(alphas=ridge_alphas, cv=30)
    optimal_ridge.fit(Xs, y)

    print("Ridge Alpha: ", optimal_ridge.alpha_)

    # Cross-validate the Ridge regression
    ridge = Ridge(alpha=optimal_ridge.alpha_)

    ridge_scores = cross_val_score(ridge, Xs, y, cv=10)

    print(ridge_scores)
    print("RidgeCV Mean: ", np.mean(ridge_scores))
    print("-----------------------------------------------------")
    print("\n")

    #----------------------------------------------
    # LassoCV
    #----------------------------------------------
    # Optimal value for Lasso regression alpha using LassoCV
    optimal_lasso = LassoCV(n_alphas=500, cv=10, verbose=1)
    optimal_lasso.fit(Xs, y)

    print("Lasso Alpha: ",optimal_lasso.alpha_)

    # Cross-validate the Lasso regression
    lasso = Lasso(alpha=optimal_lasso.alpha_)

    lasso_scores = cross_val_score(lasso, Xs, y, cv=30)

    print(lasso_scores)
    print("LassoCV Mean: ", np.mean(lasso_scores))
    print("-----------------------------------------------------")
    print("\n")


    #----------------------------------------------
    # ElasticNetCV
    #----------------------------------------------
    # Optimal value for Elastic Net regression alpha using ElasticNetCV
    l1_ratios = np.linspace(0.01, 1.0, 25)

    optimal_enet = ElasticNetCV(l1_ratio=l1_ratios, n_alphas=100, cv=10,
                                verbose=1)
    optimal_enet.fit(Xs, y)

    print("ElasticNet Alpha: ",optimal_enet.alpha_)
    print(optimal_enet.l1_ratio_)

    # Cross-validate the ElasticNet with L1_ratio
    enet = ElasticNet(alpha=optimal_enet.alpha_, l1_ratio=optimal_enet.l1_ratio_)

    enet_scores = cross_val_score(enet, Xs, y, cv=10)

    print(enet_scores)
    print("ElasticCV Mean: ", np.mean(enet_scores))
    print("-----------------------------------------------------")
    print("\n")
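
# Usage sketch (not part of the original snippet): the function expects a
# standardized feature matrix Xs and a target y, and assumes the module's
# scikit-learn imports (RidgeCV, Ridge, LassoCV, Lasso, ElasticNetCV,
# ElasticNet, cross_val_score, numpy). The data here is illustrative only.
import numpy as np
from sklearn.datasets import make_regression
from sklearn.preprocessing import StandardScaler

X_raw, y_demo = make_regression(n_samples=300, n_features=20, noise=5.0, random_state=0)
Xs_demo = StandardScaler().fit_transform(X_raw)
regression_regularization(Xs_demo, y_demo)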
Example #13
def sklearnLassoCV(params, alphas=(0.1, 1.0, 10.0), folds=10):
    X = params['X_train']
    Y = params['y_train']
    name = params['name']
    model = LassoCV(alphas=alphas, cv=folds)
    return MachinLearningModel(model, X, Y, modelType="Linear", name=name)
Example #14
        Pipeline([
            ('scaler', RobustScaler()),
            ('knn', KNeighborsRegressor(5, weights='distance', n_jobs=-1))
        ]),
        Pipeline([
            ('scaler', RobustScaler()),
            ('svr', SVR(
                C=160,
                gamma=0.1,
                epsilon=0.1
            ))
        ]),
        Pipeline([
            ('scaler', RobustScaler()),
            ('lasso', LassoCV(
                cv=KFold(n_splits=4, shuffle=True, random_state=0),
                n_jobs=-1
            ))
        ])
    ]

    model = Model(estimators, [.45, .05, .45, .05])

    routine(
        dir_path=os.path.dirname(__file__),
        task_id=2,
        model=model,
        mode=os.environ.get('KDD_MODE', 'train')
    )
    print('task2 train done')
Example #15
    from sklearn.linear_model import Lasso
    lasso = Lasso(alpha=float(i),  # np.float was removed in NumPy 1.24; plain float(i) is equivalent
                  max_iter=1000000).fit(house_price_train_X,
                                        house_price_train_y)  # the penalty parameter (lambda/alpha) is i
    # score of the trained Lasso model
    predict_result_lasso = lasso.predict(house_price_test_X)
    predict_result_lasso1 = lasso.score(house_price_train_X,
                                        house_price_train_y)
    predict_result_lasso0 = lasso.score(house_price_test_X, house_price_test_y)
    print('Lasso with penalty {}: training-set score:'.format(i), predict_result_lasso1)
    print('Lasso with penalty {}: test-set score:'.format(i), predict_result_lasso0)
    print('Lasso with penalty {}: number of features used: {}'.format(i, np.sum(lasso.coef_ != 0)))

    # import the cross-validated Lasso model
    from sklearn.linear_model import LassoCV
    # note: the original passed np.logspace(-3, 1, 2, 50), which feeds 50 into the
    # endpoint argument; a grid of candidate alphas was almost certainly intended
    lasso_cv = LassoCV(alphas=np.logspace(-3, 1, 50), max_iter=1000000).fit(
        house_price_train_X, house_price_train_y)
    # score of the cross-validated Lasso model
    predict_result_lasso_cv = lasso_cv.predict(house_price_test_X)
    predict_result_lasso_cv1 = lasso_cv.score(house_price_train_X,
                                              house_price_train_y)
    predict_result_lasso_cv0 = lasso_cv.score(house_price_test_X,
                                              house_price_test_y)
    print('Cross-validated Lasso: training-set score:', predict_result_lasso_cv1)
    print('Cross-validated Lasso: test-set score:', predict_result_lasso_cv0)

from sklearn.linear_model import LinearRegression
# train a multiple linear regression model
lr = LinearRegression().fit(house_price_train_X, house_price_train_y)
# predict house prices on the test set
predict_result_lr = lr.predict(house_price_test_X)
# training score of the model
Example #16
           compact=False)
build_auto(
    BaggingRegressor(DecisionTreeRegressor(random_state=13,
                                           min_samples_leaf=5),
                     random_state=13,
                     n_estimators=3,
                     max_features=0.5), "DecisionTreeEnsembleAuto")
build_auto(DummyRegressor(strategy="median"), "DummyAuto")
build_auto(ElasticNetCV(random_state=13), "ElasticNetAuto")
build_auto(ExtraTreesRegressor(random_state=13, min_samples_leaf=5),
           "ExtraTreesAuto")
build_auto(GradientBoostingRegressor(random_state=13, init=None),
           "GradientBoostingAuto")
build_auto(HuberRegressor(), "HuberAuto")
build_auto(LarsCV(), "LarsAuto")
build_auto(LassoCV(random_state=13), "LassoAuto")
build_auto(LassoLarsCV(), "LassoLarsAuto")
build_auto(
    OptimalLGBMRegressor(objective="regression",
                         n_estimators=17,
                         num_iteration=11), "LGBMAuto")
build_auto(LinearRegression(), "LinearRegressionAuto")
build_auto(
    BaggingRegressor(LinearRegression(), random_state=13, max_features=0.75),
    "LinearRegressionEnsembleAuto")
build_auto(OrthogonalMatchingPursuitCV(), "OMPAuto")
build_auto(RandomForestRegressor(random_state=13, min_samples_leaf=3),
           "RandomForestAuto",
           flat=True)
build_auto(RidgeCV(), "RidgeAuto")
build_auto(TheilSenRegressor(n_subsamples=15, random_state=13), "TheilSenAuto")
def exp(n):

    COV_CLIP = 10 / n

    X_data = {
        colname: gen_data(datatype, n)
        for colname, datatype in X_colnames.items()
    }
    X_data = pd.DataFrame({**X_data})
    # Turn strings into categories for numeric mapping
    X_data['os_type'] = X_data.os_type.astype('category').cat.codes
    X_pre = X_data.values.astype('float')

    true_fn = lambda X: (.8 + .5 * X[:, 0] - 3 * X[:, 6])

    y, T, Z = dgp_binary(X_pre, n, true_fn)
    X = QuantileTransformer(subsample=n // 10).fit_transform(X_pre)

    true_ate = np.mean(true_fn(X_pre))
    print("True ATE: {:.3f}".format(true_ate))
    print("New members: in treatment = {:f}, in control = {:f}".format(
        T[Z == 1].sum() / Z.sum(), T[Z == 0].sum() / (1 - Z).sum()))
    print("Z treatment proportion: {:.5f}".format(np.mean(Z)))

    # ### Defining some generic regressors and classifiers

    # This a generic non-parametric regressor
    # model = lambda: GradientBoostingRegressor(n_estimators=20, max_depth=3, min_samples_leaf=20,
    #                                        n_iter_no_change=5, min_impurity_decrease=.001, tol=0.001)
    #model = lambda: XGBWrapper(XGBRegressor(gamma=0.001, n_estimators=50, min_child_weight=50, n_jobs=10),
    #                        early_stopping_rounds=5, eval_metric='rmse', binary=False)

    # model = lambda: RandomForestRegressor(n_estimators=100)
    # model = lambda: Lasso(alpha=0.0001) #CV(cv=5)
    # model = lambda: GradientBoostingRegressor(n_estimators=60)
    # model = lambda: LinearRegression(n_jobs=-1)
    model = lambda: LassoCV(cv=5, n_jobs=-1)

    # This is a generic non-parametric classifier. We have to wrap it with the RegWrapper, because
    # we want to use predict_proba and not predict. The RegWrapper calls predict_proba of the
    # underlying model whenever predict is called.
    # model_clf = lambda: RegWrapper(GradientBoostingClassifier(n_estimators=20, max_depth=3, min_samples_leaf=20,
    #                                        n_iter_no_change=5, min_impurity_decrease=.001, tol=0.001))
    # model_clf = lambda: RegWrapper(XGBWrapper(XGBClassifier(gamma=0.001, n_estimators=50, min_child_weight=50, n_jobs=10),
    #                                        early_stopping_rounds=5, eval_metric='logloss', binary=True))
    # model_clf = lambda: RandomForestClassifier(n_estimators=100)
    # model_clf = lambda: RegWrapper(GradientBoostingClassifier(n_estimators=60))
    # model_clf = lambda: RegWrapper(LogisticRegression(C=10, penalty='l1', solver='liblinear'))
    model_clf = lambda: RegWrapper(
        LogisticRegressionCV(n_jobs=-1, cv=5, scoring='neg_log_loss'))

    model_clf_dummy = lambda: RegWrapper(DummyClassifier(strategy='prior'))

    # We need to specify models to be used for each of these residualizations
    model_Y_X = lambda: model()  # model for E[Y | X]
    model_T_X = lambda: model_clf()  # model for E[T | X]. We use a classifier since T is binary

    # model_Z_X = lambda: model_clf() # model for E[Z | X]. We use a classifier since Z is binary
    model_Z_X = lambda: model_clf_dummy()  # model for E[Z | X]. We use a classifier since Z is binary

    # E[T | X, Z]
    model_T_XZ = lambda: SeparateModel(model_clf(), model_clf())

    # E[TZ | X]
    model_TZ_X = lambda: model_clf()

    # We fit DMLATEIV with these models and then we call effect() to get the ATE.
    # n_splits determines the number of splits to be used for cross-fitting.

    # # Algorithm 2 - Current Method

    # In[121]:

    dmlateiv_obj = DMLATEIV(
        model_Y_X(),
        model_T_X(),
        model_Z_X(),
        n_splits=10,  # number of splits to be used for cross-fitting
        binary_instrument=True,  # whether to stratify cross-fitting by instrument
        binary_treatment=True  # whether to stratify cross-fitting by treatment
    )

    dmlateiv_obj.fit(y, T, X, Z)

    ta_effect = dmlateiv_obj.effect()
    ta_effect_conf = dmlateiv_obj.normal_effect_interval(lower=2.5, upper=97.5)

    print("{:.3f}, ({:.3f}, {:.3f})".format(ta_effect, ta_effect_conf[0],
                                            ta_effect_conf[1]))

    # # Algorithm 3 - DRIV ATE

    driv_model_effect = lambda: Pipeline(
        [('poly', PolynomialFeatures(degree=0, include_bias=True)),
         ('reg', StatsModelLinearRegression())])

    dmliv_featurizer = lambda: PolynomialFeatures(degree=1, include_bias=True)
    dmliv_model_effect = lambda: SelectiveLasso(np.arange(1, X.shape[1] + 1),
                                                LassoCV(cv=5, n_jobs=-1))
    prel_model_effect = DMLIV(model_Y_X(),
                              model_T_X(),
                              model_T_XZ(),
                              dmliv_model_effect(),
                              dmliv_featurizer(),
                              n_splits=1)
    #dmliv_model_effect = lambda: model()
    #prel_model_effect = GenericDMLIV(model_Y_X(), model_T_X(), model_T_XZ(),
    #                                 dmliv_model_effect(),
    #                                 n_splits=1)
    dr_cate = DRIV(
        model_Y_X(),
        model_T_X(),
        model_Z_X(),  # same as in DMLATEIV
        prel_model_effect,  # preliminary model for CATE, must support fit(y, T, X, Z) and effect(X)
        model_TZ_X(),  # model for E[T * Z | X]
        driv_model_effect(),  # model for final stage of fitting theta(X)
        cov_clip=COV_CLIP,  # covariance clipping to avoid large values in the final regression from weak instruments
        n_splits=10,  # number of splits to use for cross-fitting
        binary_instrument=True,  # whether to stratify cross-fitting by instrument
        binary_treatment=True  # whether to stratify cross-fitting by treatment
    )
    dr_cate.fit(y, T, X, Z)
    dr_effect = dr_cate.effect_model.named_steps['reg'].coef_[0]
    dr_effect_conf = dr_cate.effect_model.named_steps['reg'].model.conf_int(
        alpha=0.05)[0]
    print("{:.3f}, ({:.3f}, {:.3f})".format(dr_effect, dr_effect_conf[0],
                                            dr_effect_conf[1]))
    return true_ate, ta_effect, ta_effect_conf[0], ta_effect_conf[
        1], dr_effect, dr_effect_conf[0], dr_effect_conf[1]
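
# Usage sketch (added): run the experiment a few times, assuming the helpers and
# econml-style classes imported by the original script (gen_data, dgp_binary,
# DMLATEIV, DMLIV, DRIV, SelectiveLasso, RegWrapper, X_colnames, ...) exist.
runs = [exp(10000) for _ in range(3)]
for true_ate, ate, ate_lo, ate_hi, driv, driv_lo, driv_hi in runs:
    print(true_ate, ate, (ate_lo, ate_hi), driv, (driv_lo, driv_hi))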
Example #18
    N = 9
    x = np.linspace(0, 6, N) + np.random.randn(N)
    x = np.sort(x)
    y = x**2 - 4*x - 3 + np.random.randn(N)
    x.shape = -1, 1
    y.shape = -1, 1

    models = [Pipeline([
        ('poly', PolynomialFeatures()),
        ('linear', LinearRegression(fit_intercept=False))]),
        Pipeline([
            ('poly', PolynomialFeatures()),
            ('linear', RidgeCV(alphas=np.logspace(-3, 2, 50), fit_intercept=False))]),
        Pipeline([
            ('poly', PolynomialFeatures()),
            ('linear', LassoCV(alphas=np.logspace(-3, 2, 50), fit_intercept=False))]),
        Pipeline([
            ('poly', PolynomialFeatures()),
            ('linear', ElasticNetCV(alphas=np.logspace(-3, 2, 50), l1_ratio=[.1, .5, .7, .9, .95, .99, 1],
                                    fit_intercept=False))])
    ]
    mpl.rcParams['font.sans-serif'] = [u'simHei']
    mpl.rcParams['axes.unicode_minus'] = False
    np.set_printoptions(suppress=True)

    plt.figure(figsize=(18, 12), facecolor='w')
    d_pool = np.arange(1, N, 1)  # polynomial degrees
    m = d_pool.size
    clrs = []  # colours
    for c in np.linspace(16711680, 255, m):
        clrs.append('#%06x' % int(c))
# (New snippet; its opening was truncated. It configures an XGBoost regressor —
#  the constructor call and any leading arguments were lost, so they are
#  reconstructed here as an assumption; the XGBRegressor import is assumed to
#  come from the truncated part.)
model_xgb = XGBRegressor(learning_rate=0.07,
                         max_depth=20,
                         min_child_weight=1.5,
                         n_estimators=300,
                         reg_alpha=0.65,
                         reg_lambda=0.45,
                         subsample=0.95)

model_xgb.fit(X_train, y)
xgb_p = np.expm1(model_xgb.predict(X_test))

#-------------------------------------Use Lasso CV to tune Parameters----------------------------#

lasso = LassoCV(alphas=[
    0.0001, 0.0003, 0.0006, 0.001, 0.003, 0.006, 0.01, 0.03, 0.06, 0.1, 0.3,
    0.6, 1
],
                max_iter=50000,
                cv=10)
lasso.fit(X_train, y)
alpha = lasso.alpha_
print("Best alpha :", alpha)

print("Trying alphas centered around " + str(alpha))
lasso = LassoCV(alphas=[
    alpha * .6, alpha * .65, alpha * .7, alpha * .75, alpha * .8, alpha * .85,
    alpha * .9, alpha * .95, alpha, alpha * 1.05, alpha * 1.1, alpha * 1.15,
    alpha * 1.25, alpha * 1.3, alpha * 1.35, alpha * 1.4
],
                max_iter=50000,
                cv=10)
lasso.fit(X_train, y)
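
# Sketch of a natural next step (not in the original snippet): predict with the
# re-tuned lasso the same way the XGBoost model is used above.
lasso_p = np.expm1(lasso.predict(X_test))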
Example #20
def main(args):

    logfile = args['logfile']
    directory = args['directory']  # full fly func path
    pca_subfolder = args['pca_subfolder']
    glm_date = args['glm_date']
    printlog = getattr(flow.Printlog(logfile=logfile), 'print_to_log')

    ### Load PCA ###
    pca_directory = os.path.join(directory, 'pca', pca_subfolder)
    printlog("performing glm on {}".format(pca_directory))
    file = os.path.join(pca_directory, 'scores_(spatial).npy')
    pca_spatial = np.load(file)
    file = os.path.join(pca_directory, 'loadings_(temporal).npy')
    pca_loadings = np.load(file)

    ### Load Fictrac and Timstamps###
    timestamps = bbb.load_timestamps(os.path.join(directory, 'imaging'))
    fictrac_raw = bbb.load_fictrac(os.path.join(directory, 'fictrac'))

    ### Prepare Fictrac ###
    resolution = 100  #desired resolution in ms
    expt_len = 1000 * 30 * 60
    fps = 50  #of fictrac camera
    behaviors = ['dRotLabY', 'dRotLabZ']
    fictrac = {}
    for behavior in behaviors:
        if behavior == 'dRotLabY': short = 'Y'
        elif behavior == 'dRotLabZ': short = 'Z'
        fictrac[short] = bbb.smooth_and_interp_fictrac(fictrac_raw,
                                                       fps,
                                                       resolution,
                                                       expt_len,
                                                       behavior,
                                                       timestamps=timestamps,
                                                       smoothing=51)
        fictrac[short] = fictrac[short] / np.std(fictrac[short])
    xnew = np.arange(0, expt_len, resolution)

    ### Fit GLM ###
    Y_glm = {}
    Y_glm['Y'] = fictrac['Y'].copy()
    Y_glm['Z'] = np.abs(fictrac['Z'].copy())

    models = {}
    num_pcs = 1000
    behaviors = ['Y', 'Z']
    for behavior in behaviors:
        t0 = time.time()
        models[behavior] = {'num_pcs': num_pcs, 'model': LassoCV()}
        X_glm = pca_loadings[:, :num_pcs]
        models[behavior]['model'].fit(X_glm, Y_glm[behavior])
        models[behavior]['score'] = models[behavior]['model'].score(
            X_glm, Y_glm[behavior])

        ### Construct Spatial Map ###
        coef = models[behavior]['model'].coef_
        spatial_map = np.tensordot(coef, pca_spatial[:1000, :, :, :], axes=1)

        ### Save map ###
        glm_directory = os.path.join(directory, 'glm')
        if not os.path.exists(glm_directory):
            os.mkdir(glm_directory)

        save_file = os.path.join(glm_directory,
                                 '{}_{}.nii'.format(glm_date, behavior))
        nib.Nifti1Image(spatial_map, np.eye(4)).to_filename(save_file)

        ### Save scores ###
        score_file = os.path.join(glm_directory,
                                  '{}_score_{}.txt'.format(glm_date, behavior))
        with open(score_file, "a") as f:
            f.write("{}:{}\n".format(behavior, models[behavior]['score']))  # newline so appended scores stay on separate lines
Example #21
print(__doc__)

import matplotlib.pyplot as plt
import numpy as np

from sklearn.datasets import load_boston
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LassoCV

# Load the boston dataset.
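# (Note: load_boston was deprecated in scikit-learn 1.0 and removed in 1.2; on
#  current versions substitute another dataset such as fetch_california_housing.)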
boston = load_boston()
X, y = boston['data'], boston['target']

# We use the base estimator LassoCV since the L1 norm promotes sparsity of features.
clf = LassoCV()

# Set a minimum threshold of 0.25
sfm = SelectFromModel(clf, threshold=0.25)
sfm.fit(X, y)
n_features = sfm.transform(X).shape[1]

# Reset the threshold till the number of features equals two.
# Note that the attribute can be set directly instead of repeatedly
# fitting the metatransformer.
while n_features > 2:
    sfm.threshold += 0.1
    X_transform = sfm.transform(X)
    n_features = X_transform.shape[1]

# Plot the selected two features from X.
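# (Sketch of the plotting step the snippet stops before, following the layout of
#  the corresponding scikit-learn example: scatter the two selected features.)
import matplotlib.pyplot as plt

plt.title("Features selected from Boston using SelectFromModel with "
          "threshold %0.3f." % sfm.threshold)
feature1 = X_transform[:, 0]
feature2 = X_transform[:, 1]
plt.plot(feature1, feature2, 'r.')
plt.xlabel("Feature number 1")
plt.ylabel("Feature number 2")
plt.ylim([np.min(feature2), np.max(feature2)])
plt.show()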
Example #22
## EDA for the categorical predictors
#for g_i in cat_var_names:
#    p1 = ggplot(all[-all["SalePrice"].isnull()], aes(x = g_i, y = "SalePrice")) + geom_boxplot()
#    p1 = p1 + geom_hline(yintercept=all["SalePrice"].median(), linetype = "dashed", colour = "red")
#    p1 = p1 + ggtitle("Boxplot of SalePrice ~ " + g_i) + theme(title=element_text(hjust=0.5))
#    p1 = p1 + theme(axis_text_x = element_text(angle = 45, hjust = 1))
#    p1.save("./OUT/EDA_BOX_SalePrice~"+g_i+".png", width = 7, height = 3)
    

#%% Feature selection

# pick numeric predictors with Lasso
train_x = all[~all["SalePrice"].isnull()][num_var_names]
train_y = all[~all["SalePrice"].isnull()]["SalePrice"]

lasso = LassoCV(normalize=True, random_state=2019)
lasso.fit(train_x, train_y)
lasso_coef = lasso.coef_
imp_num_var_names = pd.DataFrame(num_var_names)[lasso_coef!=0][0].tolist()

# pick categorical predictors with a random forest
train_x = all[~all["SalePrice"].isnull()][cat_var_names]
# label-encode the categorical predictors
for ind in cat_var_names:
    a = LabelEncoder()
    a.fit(train_x[ind])
    train_x[ind] = a.transform(train_x[ind])

train_y = all[~all["SalePrice"].isnull()]["SalePrice"]

Example #23
y_pred = ridgeregcv.predict(X_test)
print(np.sqrt(metrics.mean_squared_error(y_test, y_pred)))

# Lasso regression
# try alpha=0.001 and examine coefficients
from sklearn.linear_model import Lasso
lassoreg = Lasso(alpha=0.001, normalize=True)
lassoreg.fit(X_train, y_train)
print(lassoreg.coef_)

# try alpha=0.01 and examine coefficients
lassoreg = Lasso(alpha=0.01, normalize=True)
lassoreg.fit(X_train, y_train)
print(lassoreg.coef_)

# calculate RMSE (for alpha=0.01)
y_pred = lassoreg.predict(X_test)
print(np.sqrt(metrics.mean_squared_error(y_test, y_pred)))

# select the best alpha with LassoCV
from sklearn.linear_model import LassoCV
lassoregcv = LassoCV(n_alphas=100, normalize=True, random_state=1)
lassoregcv.fit(X_train, y_train)
print(lassoregcv.alpha_)

# examine the coefficients
print(lassoregcv.coef_)

# predict method uses the best alpha value
y_pred = lassoregcv.predict(X_test)
print(np.sqrt(metrics.mean_squared_error(y_test, y_pred)))
Example #24

plt.figure()
plot_ic_criterion(model_aic, 'AIC', 'b')
plot_ic_criterion(model_bic, 'BIC', 'r')
plt.legend()
plt.title('Information-criterion for model selection (training time %.3fs)' %
          t_bic)

# #############################################################################
# LassoCV: coordinate descent
from sklearn.linear_model import LassoCV

# Compute paths
print("Computing regularization path using the coordinate descent lasso...")
LassoCV_fit = LassoCV(cv=20).fit(X, y)
LassoCV_pred = LassoCV_fit.predict(X_test)
R2_LassoCV = metrics.r2_score(y_test, LassoCV_pred)  # r2_score expects (y_true, y_pred)
# 0.60, not great at all

# Display results
m_log_alphas = -np.log10(model.alphas_ + EPSILON)

plt.figure()
ymin, ymax = 2300, 3800
plt.plot(m_log_alphas, model.mse_path_, ':')
plt.plot(m_log_alphas,
         model.mse_path_.mean(axis=-1),
         'k',
         label='Average across the folds',
         linewidth=2)
Example #25
# Linear regression
clfreg = LinearRegression(n_jobs=-2)
clfreg.fit(X_train, y_train)

# Quadratic Regression 2
clfpoly2 = make_pipeline(PolynomialFeatures(2), Ridge())
clfpoly2.fit(X_train, y_train)

# Quadratic Regression 3
clfpoly3 = make_pipeline(PolynomialFeatures(3), Ridge())
clfpoly3.fit(X_train, y_train)

# LassoCV
clfLasso = LassoCV(eps=0.002,
                   n_alphas=100,
                   fit_intercept=True,
                   normalize=False)
clfLasso.fit(X_train, y_train)

# Score models
scores = []
modelNames = ["LinearRegression", "Quadratic2", "Quadratic3", "LassoCV"]
confidencereg = clfreg.score(X_test, y_test)
confidencepoly2 = clfpoly2.score(X_test, y_test)
confidencepoly3 = clfpoly3.score(X_test, y_test)
confLasso = clfLasso.score(X_test, y_test)
scores.append(confidencereg)
scores.append(confidencepoly2)
scores.append(confidencepoly3)
scores.append(confLasso)
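
# Sketch of a natural follow-up (not in the original snippet): report each
# model's test-set R^2 side by side.
for name, model_score in zip(modelNames, scores):
    print("{}: {:.4f}".format(name, model_score))

# (The indented lines below belong to a separate, truncated snippet — the same
#  polynomial-fit demo that appears in Example #18.)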
    x = np.linspace(0, 6, N) + np.random.randn(N)
    x = np.sort(x)
    y = x**2 - 4 * x - 3 + np.random.randn(N)
    x.shape = -1, 1
    y.shape = -1, 1

    models = [
        Pipeline([('poly', PolynomialFeatures()),
                  ('linear', LinearRegression(fit_intercept=False))]),
        Pipeline([('poly', PolynomialFeatures()),
                  ('linear',
                   RidgeCV(alphas=np.logspace(-3, 2, 10),
                           fit_intercept=False))]),
        Pipeline([('poly', PolynomialFeatures()),
                  ('linear',
                   LassoCV(alphas=np.logspace(-3, 2, 10),
                           fit_intercept=False))]),
        Pipeline([('poly', PolynomialFeatures()),
                  ('linear',
                   ElasticNetCV(alphas=np.logspace(-3, 2, 10),
                                l1_ratio=[.1, .5, .7, .9, .95, .99, 1],
                                fit_intercept=False))])
    ]
    mpl.rcParams['font.sans-serif'] = ['simHei']
    mpl.rcParams['axes.unicode_minus'] = False
    np.set_printoptions(suppress=True)

    plt.figure(figsize=(18, 12), facecolor='w')
    d_pool = np.arange(1, N, 1)  # polynomial degrees
    m = d_pool.size
    clrs = []  # colours
    for c in np.linspace(16711680, 255, m, dtype=int):
Example #27
        beta_est = ridCV.coef_
    else:
        # Estimating beta using Ordinary Least Squares
        linReg = LinearRegression().fit(X, y_true)
        beta_est = linReg.coef_[0]


    # Weights
    gam_w = 2
    w = 1/np.abs(beta_est)**gam_w
    X_prim = np.empty(shape=np.shape(X))

    for c in range(np.shape(X)[1]):
        X_prim[:, c] = X[:, c]/w[c]

    lasPCV = LassoCV(max_iter=10000).fit(X_prim, y_true.ravel())
    betaP_las = lasPCV.coef_
    beta_adp = betaP_las/w
    lasCV = LassoCV(max_iter=10000).fit(X, y_true.ravel())

    coefAdp = beta_adp
    coefLas = lasCV.coef_

    coefAdpIX = np.where(coefAdp != 0)[0]
    coefLasIX = np.where(coefLas != 0)[0]

    # print(lasPCV.alpha_)

    # print(A_true)
    # print(np.where(coefAdp != 0)[0])
    # print(np.where(coefLas != 0)[0])
Example #28
imprtc = model.feature_importances_
imprtc = pd.DataFrame(imprtc, index=keys[2:], columns=["Importance"])
imprtc["Std"] = np.std(
    [tree.feature_importances_ for tree in model.estimators_], axis=0)
x = range(imprtc.shape[0])
y = imprtc.iloc[:, 0]  # .ix was removed from pandas; use positional .iloc instead
yerr = imprtc.iloc[:, 1]
plt.xticks(x, keys[2:], rotation=90)
plt.bar(x, y, yerr=yerr, align="center")
fig = plt.gcf()
fig.set_size_inches(18.5, 10.5)
plt.show()

######### Filter out Important Variables ############

model = LassoCV()
rfe1 = RFE(model, n_features_to_select=25)
rfe2 = rfe1.fit(df[keys[2:]], df[keys[1]])
x = rfe2.transform(df[keys[2:]])
new_features = df[keys[2:]].columns[rfe2.get_support()]

######### Create dummy variables ############

dummy = []
for i in df[new_features]:
    ct = df[i].nunique()
    if ct > 2 and ct <= 10:
        dummy.append(i)

df_with_dummies = pd.get_dummies(df[keys[2:]], columns=dummy, drop_first=True)
df_with_dummies = df_with_dummies.drop('target', axis=1)
Example #29
## The difference between RidgeCV and Ridge: the former performs cross-validation
models = [
    Pipeline([
        #  include_bias: whether to include the constant 1 term in the polynomial expansion
            ('Poly', PolynomialFeatures(include_bias=True)),
            ('Linear', LinearRegression(fit_intercept=False))
        ]),
    Pipeline([
            ('Poly', PolynomialFeatures(include_bias=True)),
            # alpha is the weight of the L2 penalty in Ridge (the lambda in the slides)
            # alphas gives the range of candidate alpha values searched during cross-validation
            ('Linear', RidgeCV(alphas=np.logspace(-3,2,50), fit_intercept=False))
        ]),
    Pipeline([
            ('Poly', PolynomialFeatures(include_bias=True)),
            ('Linear', LassoCV(alphas=np.logspace(0,1,10), fit_intercept=False))
        ]),
    Pipeline([
            ('Poly', PolynomialFeatures(include_bias=True)),
            # l1_ratio: the share of the L1 penalty in the ElasticNet penalty; a list is given here,
            # i.e. the candidate values of that ratio searched during cross-validation
            ('Linear', ElasticNetCV(alphas=np.logspace(0,1,10), l1_ratio=[.1, .5, .7, .9, .95, 1], fit_intercept=False))
        ])
]

## Compare LinearRegression, Lasso, Ridge and ElasticNet
N = 2
plt.figure(facecolor='w')
degree = np.arange(1, N, 2)  # polynomial degrees allowed in the expansion
dm = degree.size
colors = []  # colours
Example #30
# is to estimate sparse coefficients. In some way we already did it manually
# when we dropped the AGE column in a previous ridge estimation.
#
# Lasso models (see the :ref:`lasso` User Guide section) estimate sparse
# coefficients. :class:`~sklearn.linear_model.LassoCV` applies cross
# validation in order to determine which value of the regularization parameter
# (`alpha`) is best suited for the model estimation.

from sklearn.linear_model import LassoCV

alphas = np.logspace(-10, 10,
                     21)  # alpha values to be chosen from by cross-validation
model = make_pipeline(
    preprocessor,
    TransformedTargetRegressor(
        regressor=LassoCV(alphas=alphas, max_iter=100_000),
        func=np.log10,
        inverse_func=sp.special.exp10,
    ),
)

_ = model.fit(X_train, y_train)

# %%
# First we verify which value of :math:`\alpha` has been selected.

model[-1].regressor_.alpha_

# %%
# Then we check the quality of the predictions.