Example #1
def lassolarscv():
    print ("Doing cross-validated LassoLars")
    cross_val = cross_validation.ShuffleSplit(len(base_X), n_iter=5, test_size=0.2, random_state=0)
    clf5 = LassoLarsCV(cv=cross_val)
    clf5.fit(base_X, base_Y)
    print ("Score = %f" % clf5.score(base_X, base_Y))
    clf5_pred = clf5.predict(X_test)
    write_to_file("lassolars.csv", clf5_pred)
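# Note on the snippet above: it relies on the old sklearn.cross_validation module (removed in
# scikit-learn 0.20), where ShuffleSplit took the number of samples and an n_iter argument.
# A rough modern equivalent, assuming base_X/base_Y are the training arrays used above, might be:
from sklearn.linear_model import LassoLarsCV
from sklearn.model_selection import ShuffleSplit

cross_val = ShuffleSplit(n_splits=5, test_size=0.2, random_state=0)
clf5 = LassoLarsCV(cv=cross_val)
clf5.fit(base_X, base_Y)
print("Score = %f" % clf5.score(base_X, base_Y))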
Example #2
def lasso_regr(wine_set):

    pred = wine_set[["density", 'alcohol', 'sulphates', 'pH', 'volatile_acidity', 'chlorides', 'fixed_acidity',
                    'citric_acid', 'residual_sugar', 'free_sulfur_dioxide', 'total_sulfur_dioxide']]
    predictors = pred.copy()
    targets = wine_set.quality

    # standardize predictors to have mean=0 and sd=1
    predictors = pd.DataFrame(preprocessing.scale(predictors))
    predictors.columns = pred.columns
    # print(predictors.head())

    # split into training and testing sets
    pred_train, pred_test, tar_train, tar_test = train_test_split(predictors, targets, test_size=.3, random_state=123)

    # specify the lasso regression model
    model = LassoLarsCV(cv=10, precompute=False).fit(pred_train, tar_train)

    print('Predictors and their regression coefficients:')
    d = dict(zip(predictors.columns, model.coef_))
    for k in d:
        print(k, ':', d[k])

    # plot coefficient progression
    m_log_alphas = -np.log10(model.alphas_)
    # ax = plt.gca()
    plt.plot(m_log_alphas, model.coef_path_.T)
    print('\nAlpha:', model.alpha_)
    plt.axvline(-np.log10(model.alpha_), linestyle="dashed", color='k', label='alpha CV')
    plt.ylabel("Regression coefficients")
    plt.xlabel("-log(alpha)")
    plt.title('Regression coefficients progression for Lasso paths')
    plt.show()

    # plot mean squared error for each fold
    m_log_alphascv = -np.log10(model.cv_alphas_)
    plt.plot(m_log_alphascv, model.cv_mse_path_, ':')
    plt.plot(m_log_alphascv, model.cv_mse_path_.mean(axis=-1), 'k', label='Average across the folds', linewidth=2)
    plt.legend()
    plt.xlabel('-log(alpha)')
    plt.ylabel('Mean squared error')
    plt.title('Mean squared error on each fold')
    plt.show()

    # Mean squared error from training and test data
    train_error = mean_squared_error(tar_train, model.predict(pred_train))
    test_error = mean_squared_error(tar_test, model.predict(pred_test))
    print('\nMean squared error for training data:', train_error)
    print('Mean squared error for test data:', test_error)

    rsquared_train = model.score(pred_train, tar_train)
    rsquared_test = model.score(pred_test, tar_test)
    print('\nR-square for training data:', rsquared_train)
    print('R-square for test data:', rsquared_test)
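# A side note on the scaling step above: preprocessing.scale() standardizes with statistics
# computed on the full data set before the train/test split, which leaks test-set information
# into the fit. A common alternative (sketch only, reusing the raw `pred` and `targets` built
# above) is to keep the scaler inside a pipeline so it is fit on the training portion alone:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LassoLarsCV
from sklearn.model_selection import train_test_split

raw_train, raw_test, tar_train, tar_test = train_test_split(pred, targets, test_size=.3, random_state=123)
pipe = make_pipeline(StandardScaler(), LassoLarsCV(cv=10, precompute=False))
pipe.fit(raw_train, tar_train)
print(pipe.score(raw_test, tar_test))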
Example #3
def lasso(X,y,value):
    regressor = LassoLarsCV(cv = 10, precompute = False)
    regressor.fit(X,y)
    y_pred = regressor.predict(value)
    return y_pred
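# Hypothetical usage of the helper above (data and names invented for illustration; assumes the
# LassoLarsCV import used by the snippet is in scope). Note that `value` must be 2-D, shaped
# (n_samples, n_features), just like X:
import numpy as np

X = np.random.rand(100, 5)
y = X @ np.array([1.5, 0.0, -2.0, 0.0, 0.5]) + 0.1 * np.random.randn(100)
print(lasso(X, y, X[:3]))   # predictions for the first three rows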
Example #4
plt.legend()

plt.xlabel('-log(alpha)')
plt.ylabel('Mean square error')
plt.title('Mean square error on each fold: coordinate descent '
          '(train time: %.2fs)' % t_lasso_cv)
plt.axis('tight')
plt.ylim(ymin, ymax)

##############################################################################
# LassoLarsCV: least angle regression

# Compute paths
print("Computing regularization path using the Lars lasso...")
t1 = time.time()
model = LassoLarsCV(cv=20).fit(X, y)
t_lasso_lars_cv = time.time() - t1

# Display results
m_log_alphas = -np.log10(model.cv_alphas_)

plt.figure()
plt.plot(m_log_alphas, model.cv_mse_path_, ':')
plt.plot(m_log_alphas, model.cv_mse_path_.mean(axis=-1), 'k',
         label='Average across the folds', linewidth=2)
plt.axvline(-np.log10(model.alpha_), linestyle='--', color='k',
            label='alpha CV')
plt.legend()

plt.xlabel('-log(alpha)')
plt.ylabel('Mean square error')
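# Version note for the plotting code above: older scikit-learn releases exposed the per-fold
# errors as model.cv_mse_path_; newer releases expose the same information as model.mse_path_
# (the corresponding alphas are still in model.cv_alphas_). A defensive sketch supporting both:
mse_path = getattr(model, 'mse_path_', None)
if mse_path is None:
    mse_path = model.cv_mse_path_
plt.plot(-np.log10(model.cv_alphas_), mse_path, ':')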
Example #5
def QuickML_Ensembling(X_train,
                       y_train,
                       X_test,
                       y_test='',
                       modeltype='Regression',
                       Boosting_Flag=False,
                       scoring='',
                       verbose=0):
    """
    Quickly builds and runs multiple models for a clean data set (numeric features only).
    """
    start_time = time.time()
    seed = 99
    if len(X_train) <= 100000 or X_train.shape[1] < 50:
        NUMS = 100
        FOLDS = 5
    else:
        NUMS = 200
        FOLDS = 10
    ## create Voting models
    estimators = []
    if modeltype == 'Regression':
        if scoring == '':
            scoring = 'neg_mean_squared_error'
        scv = ShuffleSplit(n_splits=FOLDS, random_state=seed)
        if Boosting_Flag is None:
            model5 = BaggingRegressor(DecisionTreeRegressor(random_state=seed),
                                      n_estimators=NUMS,
                                      random_state=seed)
            results1 = model5.fit(X_train, y_train).predict(X_test)
            if not isinstance(y_test, str):
                metrics1 = rmse(results1, y_test).mean()
            else:
                metrics1 = 0
            estimators.append(('Bagging1', model5, metrics1))
        else:
            model5 = LassoLarsCV(cv=scv)
            results1 = model5.fit(X_train, y_train).predict(X_test)
            if not isinstance(y_test, str):
                metrics1 = rmse(results1, y_test).mean()
            else:
                metrics1 = 0
            estimators.append(('LassoLarsCV Regression', model5, metrics1))
        model6 = LassoCV(alphas=np.logspace(-10, -1, 50),
                         cv=scv,
                         random_state=seed)
        results2 = model6.fit(X_train, y_train).predict(X_test)
        if not isinstance(y_test, str):
            metrics2 = rmse(results2, y_test).mean()
        else:
            metrics2 = 0
        estimators.append(('LassoCV Regularization', model6, metrics2))
        model7 = RidgeCV(alphas=np.logspace(-10, -1, 50), cv=scv)
        results3 = model7.fit(X_train, y_train).predict(X_test)
        if not isinstance(y_test, str):
            metrics3 = rmse(results3, y_test).mean()
        else:
            metrics3 = 0
        estimators.append(('RidgeCV Regression', model7, metrics3))
        ## Create an ensemble model ####
        if Boosting_Flag:
            model8 = BaggingRegressor(DecisionTreeRegressor(random_state=seed),
                                      n_estimators=NUMS,
                                      random_state=seed)
            results4 = model8.fit(X_train, y_train).predict(X_test)
            if not isinstance(y_test, str):
                metrics4 = rmse(results4, y_test).mean()
            else:
                metrics4 = 0
            estimators.append(('Bagging2', model8, metrics4))
        else:
            model8 = AdaBoostRegressor(base_estimator=DecisionTreeRegressor(
                min_samples_leaf=2, max_depth=1, random_state=seed),
                                       n_estimators=NUMS,
                                       random_state=seed)
            results4 = model8.fit(X_train, y_train).predict(X_test)
            if not isinstance(y_test, str):
                metrics4 = rmse(results4, y_test).mean()
            else:
                metrics4 = 0
            estimators.append(('Boosting', model8, metrics4))
        estimators_list = [(tuples[0], tuples[1]) for tuples in estimators]
        estimator_names = [tuples[0] for tuples in estimators]
        if verbose >= 2:
            print('QuickML_Ensembling Model results:')
            print(
                '    %s = %0.4f \n    %s = %0.4f\n    %s = %0.4f \n    %s = %0.4f'
                % (estimator_names[0], metrics1, estimator_names[1], metrics2,
                   estimator_names[2], metrics3, estimator_names[3], metrics4))
    else:
        if scoring == '':
            scoring = 'accuracy'
        scv = StratifiedKFold(n_splits=FOLDS, random_state=seed)
        if Boosting_Flag is None:
            model5 = ExtraTreesClassifier(n_estimators=NUMS,
                                          min_samples_leaf=2,
                                          random_state=seed)
            results1 = model5.fit(X_train, y_train).predict(X_test)
            if not isinstance(y_test, str):
                metrics1 = accu(results1, y_test).mean()
            else:
                metrics1 = 0
            estimators.append(('Bagging', model5, metrics1))
        else:
            model5 = LogisticRegressionCV(Cs=np.linspace(0.01, 100, 20),
                                          cv=scv,
                                          scoring=scoring,
                                          random_state=seed)
            results1 = model5.fit(X_train, y_train).predict(X_test)
            if not isinstance(y_test, str):
                metrics1 = accu(results1, y_test).mean()
            else:
                metrics1 = 0
            estimators.append(('Logistic Regression', model5, metrics1))
        model6 = LinearDiscriminantAnalysis()
        results2 = model6.fit(X_train, y_train).predict(X_test)
        if not isinstance(y_test, str):
            metrics2 = accu(results2, y_test).mean()
        else:
            metrics2 = 0
        estimators.append(('Linear Discriminant', model6, metrics2))
        if modeltype == 'Binary_Classification':
            if (X_train < 0).astype(int).sum().sum() > 0:
                model7 = DecisionTreeClassifier(max_depth=5)
            else:
                model7 = GaussianNB()
        else:
            if (X_train < 0).astype(int).sum().sum() > 0:
                model7 = DecisionTreeClassifier(max_depth=5)
            else:
                model7 = MultinomialNB()
        results3 = model7.fit(X_train, y_train).predict(X_test)
        if not isinstance(y_test, str):
            metrics3 = accu(results3, y_test).mean()
        else:
            metrics3 = 0
        estimators.append(('Naive Bayes', model7, metrics3))
        if Boosting_Flag:
            #### If the Boosting_Flag is True, it means Boosting model is present. So choose a Bagging here.
            model8 = ExtraTreesClassifier(n_estimators=NUMS,
                                          min_samples_leaf=2,
                                          random_state=seed)
            results4 = model8.fit(X_train, y_train).predict(X_test)
            if not isinstance(y_test, str):
                metrics4 = accu(results4, y_test).mean()
            else:
                metrics4 = 0
            estimators.append(('Bagging', model8, metrics4))
        else:
            ## Create an ensemble model ####
            model8 = AdaBoostClassifier(base_estimator=DecisionTreeClassifier(
                random_state=seed, max_depth=1, min_samples_leaf=2),
                                        n_estimators=NUMS,
                                        random_state=seed)
            results4 = model8.fit(X_train, y_train).predict(X_test)
            if not isinstance(y_test, str):
                metrics4 = accu(results4, y_test).mean()
            else:
                metrics4 = 0
            estimators.append(('Boosting', model8, metrics4))
        estimators_list = [(tuples[0], tuples[1]) for tuples in estimators]
        estimator_names = [tuples[0] for tuples in estimators]
        if not isinstance(y_test, str):
            if verbose >= 2:
                print('QuickML_Ensembling Model results:')
                print(
                    '    %s = %0.4f \n    %s = %0.4f\n    %s = %0.4f \n    %s = %0.4f'
                    % (estimator_names[0], metrics1, estimator_names[1],
                       metrics2, estimator_names[2], metrics3,
                       estimator_names[3], metrics4))
        else:
            if verbose >= 1:
                print('QuickML_Ensembling completed.')
    stacks = np.c_[results1, results2, results3, results4]
    if verbose == 1:
        print('    Time taken for Ensembling: %0.1f seconds' %
              (time.time() - start_time))
    return estimator_names, stacks
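# A minimal usage sketch for QuickML_Ensembling (assuming the rmse/accu helpers and the
# scikit-learn imports used inside the function are in scope, as in the source module):
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

X = pd.DataFrame(np.random.rand(500, 8), columns=['f%d' % i for i in range(8)])
y = X.sum(axis=1) + 0.1 * np.random.randn(500)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
names, stacks = QuickML_Ensembling(X_train, y_train, X_test, y_test,
                                   modeltype='Regression', Boosting_Flag=True, verbose=2)
print(names)          # model names
print(stacks.shape)   # (n_test_samples, 4) array of stacked predictions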


Example #6
#########################################################
def main():
    u"""Main function for assignment 03."""
    # Load prepared data.
    df = return_proc_and_transf_data_set()
    # Drop carat: mass is already included in SI units.
    df.drop(['carat'], inplace=True, axis=1)
    # Those are dummy variables not needed in our data set anymore.
    df.drop(['price_expensive', 'price_expensive_binary'], inplace=True, axis=1)

    # A bit of error checking.
    if df.isnull().sum().sum() != 0:
        raise ValueError('Your data has unintended nulls.')

    # Cast our dataframe into float type.
    df = df.astype('float64')

    # Scale our dataframe so that the sparsity control (the L1 penalty) is not
    # biased against variables with larger scales.
    print('Prior to scaling:')
    print(df.describe())
    df = df.apply(preprocessing.scale)
    print('After scaling:')
    print(df.describe())
    print_separator()
    if (df.mean().abs() > 1e-3).sum() > 0:
        raise ValueError('Scaling of your dataframe went wrong.')

    # Split into training and testing sets
    # The predictors should not include any price variable, since price was used
    # to create the output variable.
    predictors = [x for x in df.columns.tolist() if 'price' not in x]
    print('Input variables:')
    pprint(predictors, indent=4)
    input_variables = df[predictors].copy()
    output_variable = df.price.copy()  # Categorized price
    print_separator()

    input_training, input_test, output_training, output_test = train_test_split(
        input_variables, output_variable, test_size=0.3, random_state=0)

    # A few words about the LassoLarsCV:

        # LASSO: least absolute shrinkage and selection operator (discussed in
        # the course material).

        # LARS: least angle regression: an algorithm for fitting linear
        # regression models to high-dimensional data (i.e. many predictors).
        # Compared to plain LASSO, this model uses the LARS algorithm instead of
        # the 'vanilla' coordinate descent of plain LASSO.

        # CV: cross validation: this sets the alpha parameter (referred to as the
        # lambda parameter in the course video) by cross validation.
        # In plain LassoLars this alpha (the penalty factor) is an input of the
        # function.
        # The alpha parameter controls the degree of sparsity of the estimated
        # coefficients.
        # If alpha = zero then the method is the same as OLS.

    model = LassoLarsCV(
        cv=10,  # Number of folds.
        precompute=False,  # Do not precompute Gram matrix.
        # precompute=True,  # Precompute the Gram matrix instead.
        # verbose=3,
    ).fit(input_training, output_training)

    dict_var_lin_coefs = dict(zip(
        predictors,
        model.coef_))

    print('Result of linear model:')
    pprint(sorted([(k, v) for k, v in dict_var_lin_coefs.items()],
                  key=lambda x: abs(x[1]))
           )
    print_separator()

    # Plot coefficient progression.
    # TODO: plot those on 4 different subplots.
    model_log_alphas = -np.log10(model.alphas_)
    ax = plt.gca()
    plt.plot(model_log_alphas, model.coef_path_.T)
    plt.axvline(-np.log10(model.alpha_), linestyle='--', color='k',
                label='alpha CV')
    plt.ylabel('Regression Coefficients')
    plt.xlabel('-log(alpha)')
    plt.title('Regression Coefficients Progression for Lasso Paths')
    plt.legend(predictors,
        loc='best',)
    plt.tight_layout()
    plt.savefig('result00.png', dpi=600)
    plt.close()
    # TODO: why are the coefficients in the result so different from the
    # coefficient path?
    #
    # The coefficient paths appear to be scaled by a roughly constant factor
    # (about 194 in this case).
    #
    # print('Resulting alpha is not different than path alpha (difference):')
    # difference = model.alpha_ - model.alphas_
    # pprint(model.alpha_ - model.alphas_)
    # print('Resulting coefficients are very different than path coefficients (difference):')
    # pprint(model.coef_ - model.coef_path_.T)
    # print_separator()
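    # A plausible (but unverified) explanation for the TODO above: with the old default
    # normalize=True, LassoLarsCV rescales each column by its norm before running LARS, so
    # coef_path_ lives in that rescaled space while coef_ is converted back to the original
    # units. With predictors standardized to unit variance, each column norm is roughly
    # sqrt(n_train), which is on the order of the constant observed here.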


    # Plot mean square error for each fold.
    # To avoid a divide-by-zero warning in log10, map any zero alpha to np.inf.
    model.cv_alphas_ = list(
        map(lambda x: x if x != 0 else np.inf,
            model.cv_alphas_))
    model_log_alphas = -np.log10(model.cv_alphas_)
    plt.figure()
    plt.plot(model_log_alphas, model.cv_mse_path_, ':')
    plt.plot(model_log_alphas, model.cv_mse_path_.mean(axis=-1), 'k',
            label='Average across the folds', linewidth=2)
    plt.axvline(-np.log10(model.alpha_), linestyle='--', color='k',
                label='alpha CV')
    plt.xlabel('-log(alpha)')
    plt.ylabel('Mean squared error')
    plt.title('Mean squared error on each fold')
    plt.legend()
    plt.tight_layout()
    plt.savefig('result01.png', dpi=600)
    plt.close()

    # Mean squared error of our model.
    train_error = mean_squared_error(output_training,
                                     model.predict(input_training))
    test_error = mean_squared_error(output_test,
                                    model.predict(input_test))
    print ('Training data MSE')
    print(train_error)
    print ('Test data MSE')
    print(test_error)
    print_separator()


    # R-square from training and test data.
    rsquared_train = model.score(
        input_training,
        output_training)
    rsquared_test = model.score(
        input_test,
        output_test)
    print ('Training data R-square')
    print(rsquared_train)
    print ('Test data R-square')
    print(rsquared_test)
    print_separator()

    return {'model': model, 'dataframe': df}
Example #7
if "Auto" in datasets:
	build_auto(AdaBoostRegressor(DecisionTreeRegressor(min_samples_leaf = 5, random_state = 13), random_state = 13, n_estimators = 17), "AdaBoostAuto")
	build_auto(ARDRegression(normalize = True), "BayesianARDAuto")
	build_auto(BayesianRidge(normalize = True), "BayesianRidgeAuto")
	build_auto(DecisionTreeRegressor(min_samples_leaf = 2, random_state = 13), "DecisionTreeAuto", compact = False)
	build_auto(BaggingRegressor(DecisionTreeRegressor(min_samples_leaf = 5, random_state = 13), n_estimators = 3, max_features = 0.5, random_state = 13), "DecisionTreeEnsembleAuto")
	build_auto(DummyRegressor(strategy = "median"), "DummyAuto")
	build_auto(ElasticNetCV(cv = 3, random_state = 13), "ElasticNetAuto")
	build_auto(ExtraTreesRegressor(n_estimators = 10, min_samples_leaf = 5, random_state = 13), "ExtraTreesAuto")
	build_auto(GBDTLMRegressor(RandomForestRegressor(n_estimators = 7, max_depth = 6, random_state = 13), LinearRegression()), "GBDTLMAuto")
	build_auto(GBDTLMRegressor(XGBRFRegressor(n_estimators = 17, max_depth = 6, random_state = 13), ElasticNet(random_state = 13)), "XGBRFLMAuto")
	build_auto(GradientBoostingRegressor(init = None, random_state = 13), "GradientBoostingAuto")
	build_auto(HuberRegressor(), "HuberAuto")
	build_auto(LarsCV(cv = 3), "LarsAuto")
	build_auto(LassoCV(cv = 3, random_state = 13), "LassoAuto")
	build_auto(LassoLarsCV(cv = 3), "LassoLarsAuto")
	build_auto(LinearRegression(), "LinearRegressionAuto")
	build_auto(BaggingRegressor(LinearRegression(), max_features = 0.75, random_state = 13), "LinearRegressionEnsembleAuto")
	build_auto(OrthogonalMatchingPursuitCV(cv = 3), "OMPAuto")
	build_auto(RandomForestRegressor(n_estimators = 10, min_samples_leaf = 3, random_state = 13), "RandomForestAuto", flat = True)
	build_auto(RidgeCV(), "RidgeAuto")
	build_auto(StackingRegressor([("ridge", Ridge(random_state = 13)), ("lasso", Lasso(random_state = 13))], final_estimator = GradientBoostingRegressor(n_estimators = 7, random_state = 13)), "StackingEnsembleAuto")
	build_auto(TheilSenRegressor(n_subsamples = 31, random_state = 13), "TheilSenAuto")
	build_auto(VotingRegressor([("dt", DecisionTreeRegressor(random_state = 13)), ("knn", KNeighborsRegressor()), ("lr", LinearRegression())], weights = [3, 1, 2]), "VotingEnsembleAuto")
	build_auto(XGBRFRegressor(n_estimators = 31, max_depth = 6, random_state = 13), "XGBRFAuto")

if "Auto" in datasets:
	build_auto(TransformedTargetRegressor(DecisionTreeRegressor(random_state = 13)), "TransformedDecisionTreeAuto")
	build_auto(TransformedTargetRegressor(LinearRegression(), func = numpy.log, inverse_func = numpy.exp), "TransformedLinearRegressionAuto")

def build_auto_hist(regressor, name):
Example #8
    def fit(self):

        # 1. construct a placeholder called 'qhat_k_container' for the list of all q_hat^k (defined in Algorithm 2) of each subsample
        qhat_k_container = list()

        # 2. estimate q_hat^k (for the solution path) on each subsample and save them as elements of the placeholder
        for j in range(self.n_repeat):

            # a. randomly choose a subset of sample points (whose index is 'index_subsample') that is used to generate a subsample in each repeat
            index_subsample = np.random.choice(self.train_size,
                                               self.subsample_size,
                                               replace=False)
            # b. based on 'index_subsample', take the corresponding observations of X out and save them as the subample
            X_subsample = self.X_so[index_subsample]
            # c. based on 'index_subsample', take the corresponding observations of Y out and save them as the subample
            y_subsample = self.y_so[index_subsample]

            # d. scikit-learn requires 'y_subsample' to be a one-dimensional array
            y_subsample.shape = (y_subsample.shape[0], )

            # e. given a subsample, compute q_hat^k (the solution path) using lars

            # e(1). call the class 'Lars'
            trial_1 = Lars(n_nonzero_coefs=min(X_subsample.shape[1] +
                                               1, X_subsample.shape[0] + 1))
            # e(2). fit lars on the subsample
            trial_1.fit(X_subsample, y_subsample)
            # e(3). save the active set of lars (indices of variables selected by lars) as 'active'.
            active = trial_1.active_

            # f. The active set of lars is ranked based on the chronology of variable inclusion at different stages of lars. For example [2,1,3] means x_2 is included at stage 1, x_1 is included at stage 2 and x_3 is included at stage 3. Based on the active set of lars, we compute q_hat^k (defined as 'qhat_k' in code) as defined in Algorithm 2

            # f(1). we generate 'qhat_k' as an array of zeros;
            qhat_k = np.zeros((1, self.n_dim))
            # f(2). we compute the i-th value of q_hat^k for the corresponding variable based on Algorithm 2; replace i-th term in 'qhat_k' with the value we just compute
            for i in active:

                qhat_k[0, i] = 1 - \
                    (np.where(np.array(active) == i)[0][0]) / (self.n_dim)

            # f(3). we append the result into 'qhat_k_container' as one element of the list
            qhat_k_container.append(qhat_k)

        # 3. if self.lasso == True, we compute CV-lars-lasso and CV-cd on the original sample X and Y (not on the subsample)
        if (self.lasso == True):

            # a(1). call the class for CV-lars-lasso (called LassoLarsCV in Scikit-learn)
            # a(2). we set the number of folds in CV as 10
            trial_2 = LassoLarsCV(cv=10)
            # b. change y into one-dimensional array (required by Scikit-learn)
            yy = self.y
            yy.shape = (self.sample_size, )
            # c.  fit CV-lars-lasso on X and Y
            trial_2.fit(self.X, yy)

            # d. save 'la_list' as the number of variables in the active set of CV-lars-lasso
            la_list = len(trial_2.active_)
            # e. save 'la_vari_list' as the active set of CV-lars-lasso
            la_vari_list = trial_2.active_

            # f. call the class for CV-cd (called LassoCV in Scikit-learn)
            # f(1). we set the number of folds in CV as 10
            # f(2). for reproduction, we fix the random seed of training-validation split in CV (random_state=0)
            trial_3 = LassoCV(cv=10, random_state=0)

            # g.  fit cv-cd on X and Y
            trial_3.fit(self.X, yy)

            # h. save 'cd_list' as the number of variables in the active set of CV-cd
            cd_list = np.count_nonzero(trial_3.coef_)
            # i. save 'cd_vari_list' as the active set of CV-cd
            cd_vari_list = np.nonzero(trial_3.coef_)[0]

        # 4. compute q_hat and Q(c) (defined in Algorithm 2)
        # a(1). we transform the list of all q_hat^k ('qhat_k_container') into a matrix ('qhat_k_container_matrix')
        # a(2). row of the matrix: the q_hat^k on a given subsample for all variables
        # a(3). column of the matrix: the corresponding value of q_hat^k for a given variable on all subsamples
        qhat_k_container_matrix = np.concatenate(qhat_k_container, axis=0)
        # b.  compute the value of qhat for each variable (qhat defined in Algorithm 2 of the paper)
        qhat_value = np.mean(qhat_k_container_matrix, axis=0)

        # c. set 'Qc_list' as the container of Q(c) for all value of c
        Qc_list = list()
        # d. set 'c_seq' as the sequence of c for the grid search of c* in solar
        c_seq = np.arange(max(qhat_value), 0.1, self.step_size)

        # e. generate Q(c) for each value of c
        for j in c_seq:
            # e(1). define 'container' as the placeholder of Q(c) when c == j;
            container = list()

            for i in range(self.X.shape[1]):
                # e(2). include all variables into 'container' if their corresponding values in q-hat are larger or equal to j;
                if (qhat_value[i] >= j):

                    container.append(i)
            # e(3). append 'container' (Q(c) when c == j) into 'Qc_list' (the container of Q(c) for all value of c);
            Qc_list.append(container)

        # 5. compute the test error of each value of c
        # we use grid search on test set to choose c*;
        # for each value of c in the grid search, train an OLS of Y_so on the variables of Q(c) in X_so (Y_so and X_so defined at the beginning);

        # a. container for test errors
        test_error = list()

        # b. compute the test error of each Q(c) on test set
        # b(0). set i as the indices of all variables in Q(c) for a given value of c;
        for i in Qc_list:
            # b(1). call the LinearRegression class;
            OLS_1 = LinearRegression()
            # b(2). compute OLS of Y_so on the variables in Q(c) in X_so;
            OLS_1.fit(self.X_so[:, i], self.y_so)
            # b(3). compute the L2 prediction error of OLS on test set (X_test, y_test);
            s1 = costs_com(self.X_test[:, i], self.y_test, OLS_1)
            loss_test_1, _ = s1.L2()
            # b(4). save the L2 error as the test error of Q(c) for each value of c; append it into the container of test errors;
            test_error.append(loss_test_1)

        # 6. tuning c via grid search
        # 6(a). transform 'test_error' from a list into an array;
        test_error = np.asarray(test_error)
        # 6(b). save the location of minimum of 'test_error' as 'min_loc_val';
        min_loc_val = np.where(test_error == min(test_error))[0]
        # 6(c). save the corresponding value of c (c*) as 'opt_c';
        opt_c = c_seq[min_loc_val]
        # 6(d). find Q(c*) and save it as 'Q_opt_c';
        Q_opt_c = Qc_list[max(min_loc_val)]

        # 7. Regression of Y onto the selected variables ( Q(c*) ) in X
        # 7(a). call the LinearRegression class;
        OLS_2 = LinearRegression()
        # 7(b). fit OLS of Y on the variables of Q(c*) in X;
        OLS_2.fit(self.X[:, Qc_list[max(min_loc_val)]], self.y)
        # 7(c). set 'solar_coef' (an array of zeros) as the placeholder of solar regression coefficients
        solar_coef = np.zeros([self.n_dim, 1])
        # 7(d). put the estimated regression coefficients into their corresponding place of 'solar_coef'
        solar_coef[Q_opt_c, 0] = OLS_2.coef_

        # 8. define la_list, la_vari_list as empty list if self.lasso != True (if we don't want to compute cv-lars-lasso and cv-cd)
        if (self.lasso != True):

            la_list = []
            la_vari_list = []
            cd_list = []
            cd_vari_list = []

        return solar_coef, opt_c, test_error, Qc_list, la_list, la_vari_list, cd_list, cd_vari_list
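# A small stand-alone illustration of the q_hat^k construction used in step f above (the values
# here are invented for the demo): with n_dim = 5 and a LARS active set [2, 1, 3], the variable
# entering first gets the largest score and unselected variables stay at zero.
import numpy as np

n_dim = 5
active = [2, 1, 3]
qhat_k = np.zeros((1, n_dim))
for i in active:
    qhat_k[0, i] = 1 - (np.where(np.array(active) == i)[0][0]) / n_dim
print(qhat_k)   # [[0.  0.8 1.  0.6 0. ]]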
Example #9
targets_test_data = []
s = wbtesttarget.sheet_by_index(0)
testtarget = s.col(0)
for row in range(0, s.nrows):
    value = (s.cell(row, 0).value)
    targets_test_data.append(value)
test_targets = targets_test_data

###############################################################################

# LassoLarsCV: least angle regression

# Compute paths
print("Computing regularization path using the Lars lasso...")
t1 = time.time()
model = LassoLarsCV(cv=30).fit(train_features, train_targets)
t_lasso_lars_cv = time.time() - t1

# Display results
m_log_alphas = -np.log10(model.cv_alphas_)

plt.figure()
#plt.figure(figsize=(32,18), dpi=1200) # used to expose the figure at higher resolution 
plt.plot(m_log_alphas, model.cv_mse_path_, ':')
plt.plot(m_log_alphas, model.cv_mse_path_.mean(axis=-1), 'k',
         label='Average across the folds', linewidth=2)
plt.axvline(-np.log10(model.alpha_), linestyle='--', color='k',
            label='alpha CV')
plt.legend()

plt.xlabel('-log(alpha)')
Example #10
    else:
        return 0

subset['High Income'] = subset['absolute_deviations'].apply(high_income_flag)

"""
" ==========================  Build LASSO Regression  ==========================
"""
predictors = subset[variables]
targets = subset['High Income']

#Split into training and testing sets
training_data, test_data, training_target, test_target  = train_test_split(predictors, targets, test_size=.3)

# Build the LASSO regression model
model=LassoLarsCV(cv=10, precompute=False).fit(training_data, training_target)

"""
" ==========================  Evaluate LASSO Model  ============================
"""
# print variable names and regression coefficients
feature_name = list(predictors.columns.values)
feature_coefficient = list(model.coef_)
features = pd.DataFrame({'Variable':feature_name, 'Regression Coefficients':feature_coefficient}).sort_values(by='Regression Coefficients', ascending=False)
print(features.head(len(feature_name)))

#print(dict(zip(predictors.columns, model.coef_)))

# plot coefficient progression
m_log_alphas = -np.log10(model.alphas_)
ax = plt.gca()
Example #11
    ivars = []
    ivars2 = []
    depvars = []
    columns = []

    for pyear in player_years:
        ivars.append([pt_projs[pyear][system] for system in proj_systems])
        depvars.append(pt_actuals[pyear]['actual'])

    for pyear in pt_projs_curr.keys():
        ivars2.append([pt_projs_curr[pyear][system] for system in proj_systems])

    x = numpy.array(ivars)
    x2 = numpy.array(ivars2)
    y = numpy.array(depvars)
    model_pt = LassoLarsCV(cv=cv_num)
    model_pt.fit(x,y)

    print("Rough PT model, to choose sample")
    for system, coef in zip(proj_systems, model_pt.coef_):
        print("%40s : %f" % (system, coef))
    print("%40s : %f" % ('intercept', model_pt.intercept_))

    sample_proj_pt_arr = model_pt.predict(x)

    curr_proj_pt_arr = model_pt.predict(x2)

    sample_proj_pt = dict(zip(player_years,sample_proj_pt_arr))
    curr_proj_pt = dict(zip(pt_projs_curr.keys(),curr_proj_pt_arr))

    models = {}
Example #12
    for i, mask in enumerate(MASK):
        X = X_all[mask,:][:,keep]
        y = y_train[mask]
        N_SEG.append(X.shape[0])
        # parameters search range
        #param_ridge_post = list(np.arange(200,400,10))
        #param_ridge_post.append(0.5)
        param_ridge_post= np.concatenate((np.arange(0.1,1,0.1),np.arange(3,5,0.1)))
        #param_ridge_post = [330, 0.5] #p=24489
        #param_ridge_post = [3.7, 0.5] #p=303

        # fit
        from sklearn.linear_model import LassoLarsCV
        from sklearn import linear_model
        lasso_cv = LassoLarsCV(fit_intercept=True, normalize=True, precompute='auto',
                            max_iter=X.shape[1]+1000, max_n_alphas=X.shape[1]+1000,
                            eps= 2.2204460492503131e-16,copy_X=True,
                            cv=5, n_jobs=2)
        lasso_cv.fit(X, y)
        """
        normalize=True, lasso seems to be able to handle itself
        """

        lasso_refit = linear_model.LassoLars(alpha=lasso_cv.alpha_,
                            fit_intercept=True, normalize=True, precompute='auto',
                            max_iter=X.shape[1]+1000,
                            eps=2.2204460492503131e-16, copy_X=True,
                            fit_path=False)
        lasso_refit.fit(X, y)
        active = lasso_refit.coef_
        for i, x in enumerate(active[0]):
            if x != 0 and i > main.shape[1] - 1:
Example #13
def lasso_train(groups, varname='valence', arrayname='norm', alpha=None,
                use_lars=True, fit_intercept=True, normalize=True,
                cv_folds=None, cv_repeats=None, skip_cv=False,
                xmin=-np.inf, xmax=np.inf, _larch=None, **kws):

    """use a list of data groups to train a Lasso/LassoLars model

    Arguments
    ---------
      groups      list of groups to use as components
      varname     name of characteristic value to model ['valence']
      arrayname   string of array name to be fit (see Note 3) ['norm']
      xmin        x-value for start of fit range [-inf]
      xmax        x-value for end of fit range [+inf]
      alpha       alpha parameter for LassoLars (See Note 5) [None]
      use_lars    bool to use LassoLars instead of Lasso [True]
      cv_folds    None or number of Cross-Validation folds (See Note 4) [None]
      cv_repeats  None or number of Cross-Validation repeats (See Note 4) [None]
      skip_cv     bool to skip doing Cross-Validation [False]

    Returns
    -------
      group with trained LassoLars model, to be used with lasso_predict
    Notes
    -----
     1.  The group members for the components must match each other
         in data content and array names.
     2.  all groups must have an attribute (scalar value) for `varname`
     3.  arrayname can be one of `norm` or `dmude`
     4.  Cross-Validation:  if cv_folds is None, sqrt(len(groups)) will be used
            (rounded to integer).  if cv_repeats is None, sqrt(len(groups))-1
            will be used (rounded).
     5.  alpha is the regularization parameter. If alpha is None it will
         be set using LassoLarsCV
    """
    xdat, spectra = groups2matrix(groups, arrayname, xmin=xmin, xmax=xmax)
    groupnames = []
    ydat = []
    for g in groups:
        groupnames.append(getattr(g, 'filename',
                                  getattr(g, 'groupname', repr(g))))
        val = getattr(g, varname, None)
        if val is None:
            raise ValueError("group '%s' does not have attribute '%s'" % (g, varname))
        ydat.append(val)
    ydat = np.array(ydat)

    nvals = len(groups)

    kws.update(dict(fit_intercept=fit_intercept, normalize=normalize))
    creator = LassoLars if use_lars else Lasso
    model = None

    rmse_cv = None
    if not skip_cv:
        if cv_folds is None:
            cv_folds = int(round(np.sqrt(nvals)))
        if  cv_repeats is None:
            cv_repeats = int(round(np.sqrt(nvals)) - 1)

        cv = RepeatedKFold(n_splits=cv_folds, n_repeats=cv_repeats)
        if alpha is None:
            lcvmod = LassoLarsCV(cv=cv, max_n_alphas=1e7,
                                 max_iter=1e7, eps=1.e-12, **kws)
            lcvmod.fit(spectra, ydat)
            alpha = lcvmod.alpha_

        model = creator(alpha=alpha, **kws)
        resid = []
        for ctrain, ctest in cv.split(range(nvals)):
            model.fit(spectra[ctrain, :], ydat[ctrain])
            ypred = model.predict(spectra[ctest, :])
            resid.extend((ypred - ydat[ctest]).tolist())
        resid = np.array(resid)
        rmse_cv = np.sqrt( (resid**2).mean() )

    if alpha is None:
        cvmod = creator(**kws)
        cvmod.fit(spectra, ydat)
        alpha = cvmod.alpha_

    if model is None:
        model = creator(alpha=alpha, **kws)

    # final fit without cross-validation
    out = model.fit(spectra, ydat)

    ypred = model.predict(spectra)

    rmse = np.sqrt(((ydat - ypred)**2).mean())

    return Group(x=xdat, spectra=spectra, ydat=ydat, ypred=ypred,
                 alpha=alpha, active=model.active_, coefs=model.coef_,
                 cv_folds=cv_folds, cv_repeats=cv_repeats,
                 rmse_cv=rmse_cv, rmse=rmse, model=model, varname=varname,
                 arrayname=arrayname, fit_intercept=fit_intercept,
                 normalize=normalize, groupnames=groupnames, keywords=kws)
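# Compatibility note for the function above: the normalize= keyword it forwards to
# Lasso/LassoLars/LassoLarsCV was deprecated in scikit-learn 1.0 and removed in 1.2, so the code
# as written needs an older release. On current releases a commonly suggested substitute is to
# drop normalize and scale the inputs explicitly, e.g. (a sketch, not the author's code):
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LassoLars

model = make_pipeline(StandardScaler(), LassoLars(alpha=0.01, fit_intercept=True))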
Example #14
class LassoPredictor(Persistent):
    @contract(hypers='dict')
    def __init__(self, hypers):
        modelHypers = self.extract_model_hypers(hypers)
        self.model = LassoLarsCV(**modelHypers)

    @timing
    def fit(self, df, features, targetCol, validationSplit=0.2):

        print("Running fit function:")
        print(df)
        XTrain, yTrain = df2xy(df, features, targetCol)
        if XTrain.shape[0] < 3:
            print("not enough data to form a model!")
            return False

        success = True
        try:
            self.model.fit(XTrain, yTrain)
        #try:
        #Parallel(n_jobs=2, verbose=10, batch_size=20)(delayed(self.fit_helper)(date) for date in self.dates)
        except ValueError:
            traceback.print_exc()
            success = False
        return success

    def predict(self, df, features, targetCol):
        XPred, _ = df2xy(df, features, targetCol)
        try:
            yPred = self.model.predict(XPred)
        except ValueError:
            traceback.print_exc()
            return None

        #df['pred' + targetCol] = yPred
        return yPred

    #def score (self, userXTest):
    #    # *** Needs reworking!
    #    '''
    #    :returns: Score calculated by taking the last yTrain (all data)
    #    and comparing to predicted result.
    #    '''
    #    if self.modelScore is None:
    #        lastDate = self.dates[-1]
    #        actualY = self.yTrains[lastDate]
    #        #preddf = self.predict(userXTest)
    #        preddf = loads(preddf, preserve_order=True)
    #        preddf = pd.DataFrame(preddf['arr'], columns = [self.targetCol])
    #        predY = preddf[self.targetCol]
    #        predY = predY.shift(-self.batchSize)
    #        predY = predY.iloc[:-self.batchSize]

    #        score = metrics.r2_score(actualY, predY)
    #        self.modelScore = score
    #    else:
    #        score = self.modelScore
    #    return score

    def lc(self):
        '''
        Makes learning curve for a player
        '''
        if self.lcScores is None:

            self.lcModel = LassoLarsCV()
            lastDate = self.dates[-1]
            X = self.XTrains[lastDate]
            y = self.yTrains[lastDate]

            N = len(X)
            chopOff = N - (N % 7)
            X = X.iloc[:chopOff]
            y = y.iloc[:chopOff]
            idxs = np.arange(chopOff)

            cvSplits = [(idxs[:i], idxs[i:]) for i in range(7, chopOff, 7)]

            trainSizes, trainScores, testScores = \
                    learning_curve(estimator=self.lcModel,
                                    X=X.as_matrix(),
                                    y=np.array(y),
                                    cv=cvSplits,
                                    train_sizes=[7],
                                    n_jobs=2,
                                    )
            trainSizes = [len(t[0]) for t in cvSplits]
            self.lcScores = dumps((trainSizes, trainScores, testScores))
            result = self.lcScores
        else:
            result = self.lcScores

        return result

    def get_params(self):
        for i, model in self.models.items():
            params = order_dict(model.get_params())
            break
        return params

    def extract_model_hypers(self, hypers):
        '''
        Extracts the parameters that are relevant to the model
        and are not other meta params
        '''
        params = ['verbose']
        modelHypers = {}
        for param in params:
            paramVal = hypers.get(param)
            if paramVal is not None:
                modelHypers[param] = paramVal
        modelHypers = order_dict(modelHypers)
        return modelHypers
Example #15
predictors['DAYWED']=preprocessing.scale(predictors['DAYWED'].astype('float64'))
predictors['FFMC']=preprocessing.scale(predictors['FFMC'].astype('float64'))
predictors['DMC']=preprocessing.scale(predictors['DMC'].astype('float64'))
predictors['DC']=preprocessing.scale(predictors['DC'].astype('float64'))
predictors['ISI']=preprocessing.scale(predictors['ISI'].astype('float64'))
predictors['TEMP']=preprocessing.scale(predictors['TEMP'].astype('float64'))
predictors['RH']=preprocessing.scale(predictors['RH'].astype('float64'))
predictors['WIND']=preprocessing.scale(predictors['WIND'].astype('float64'))
predictors['RAIN']=preprocessing.scale(predictors['RAIN'].astype('float64'))

# split data into train and test sets
pred_train, pred_test, tar_train, tar_test = train_test_split(predictors, target, 
                                                              test_size=.3, random_state=123)

# specify the lasso regression model
model=LassoLarsCV(cv=10, precompute=False).fit(pred_train,tar_train)

# print variable names and regression coefficients
dict(zip(predictors.columns, model.coef_))

# plot coefficient progression
m_log_alphas = -np.log10(model.alphas_)
ax = plt.gca()
plt.plot(m_log_alphas, model.coef_path_.T)
plt.axvline(-np.log10(model.alpha_), linestyle='--', color='k',
            label='alpha CV')
plt.ylabel('Regression Coefficients')
plt.xlabel('-log(alpha)')
plt.title('Regression Coefficients Progression for Lasso Paths')

# plot mean square error for each fold
Example #16
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LarsCV, Lasso, LassoCV, ElasticNet, ElasticNetCV
from sklearn.linear_model import LassoLars, LassoLarsCV, Ridge, RidgeCV

from sklearn.model_selection import cross_val_score, KFold, GridSearchCV

import xgboost as xgb

models = []
models.append(("LrE", LinearRegression()))
models.append(("RidCV", RidgeCV()))
models.append(("LarCV", LarsCV()))
models.append(("LasCV", LassoCV()))
models.append(("ElNCV", ElasticNetCV()))
models.append(("LaLaCV", LassoLarsCV()))
models.append(("XGB", xgb.XGBRegressor()))

kfold = KFold(n_splits=10)


def getCVResult(models, X_learning, Y_learning):

    for name, model in models:
        cv_results = cross_val_score(model,
                                     X_learning,
                                     Y_learning,
                                     scoring='neg_mean_squared_error',
                                     cv=kfold)
        rmsd_scores = np.sqrt(-cv_results)
        print("\n[%s] Mean: %.8f Std. Dev.: %8f" %
Example #17
ax2 = pl.axes([.08, .5, .05, .47])
cb = pl.colorbar(cax=ax2, ax=ax1)
cb.ax.yaxis.set_ticks_position('left')
cb.ax.yaxis.set_tick_params(labelcolor='white')
cb.ax.yaxis.set_tick_params(labelsize=20)
cb.set_ticks(np.arange(0., .8, .2))
pl.savefig(os.path.join('miyawaki', 'encoding_scores.pdf'))
pl.savefig(os.path.join('miyawaki', 'encoding_scores.png'))
pl.savefig(os.path.join('miyawaki', 'encoding_scores.eps'))
pl.clf()

### Compute receptive fields

from sklearn.linear_model import LassoLarsCV

lasso = LassoLarsCV(max_iter=10,)

p = (4, 2)
# Mask for chosen pixel
pixmask = np.zeros((10, 10), dtype=bool)
pixmask[p] = 1

for index in [1780, 1951, 2131, 1935]:
    rf = lasso.fit(y_train, X_train[:, index]).coef_.reshape(10, 10)
    pl.figure(figsize=(8, 8))
    # Black background
    pl.imshow(np.zeros_like(rf), vmin=0., vmax=1., cmap='gray')
    pl.imshow(np.ma.masked_equal(rf, 0.), vmin=0., vmax=0.75,
            interpolation="nearest", cmap=cm.bluegreen)
    plot_lines(pixmask, linewidth=6, color='r')
    pl.axis('off')
Example #18
def _fit_model(x, y, names, operators, **kw):
    steps = [("trafo", LibTrafo(names, operators)),
             ("lasso", LassoLarsCV(**kw))]
    model = Pipeline(steps).fit(x, y)
    return model, model.score(x, y)
Example #19
from sklearn.grid_search import GridSearchCV
from sklearn.feature_selection import SelectFromModel
from scipy.stats import gmean
import xgboost as xgb
from xgboost.sklearn import XGBClassifier
from xgboost import DMatrix


df = pd.read_csv("processed.csv", header=0, index_col="ID")
#df.TARGET.describe()

y = df["TARGET"].values
X = df.ix[:, "var3":"var38"].values
X_labels = df.ix[:, "var3":"var38"].columns.values

lr = LassoLarsCV()
sfm = SelectFromModel(lr, threshold=1e-3)
X_std = StandardScaler().fit_transform(X, y)
sfm.fit(X_std,y)
lr.fit(X_std, y)

#feat_imp = pd.DataFrame(lr.coef_, index=X_labels)
#feat_imp.plot(kind="bar", title="Feature Importance", use_index=False)

chosen_feat = [ f for i,f in enumerate(X_labels) if sfm.get_support()[i] ]
#chosen_feat = pickle.load(open("feat", "rb"))
print(len(chosen_feat))
chosen_feat

# kaggle forum
df.var3 = df.var3.replace(-999999,2)
Example #20
# ---------------------------
# Now we take a closer look at the receptive fields of the four marked voxels.
# A voxel's `receptive field <http://en.wikipedia.org/wiki/Receptive_field>`_
# is the region of a stimulus (like an image) where the presence of an object,
# like a white instead of a black pixel, results in a change in activity
# in the voxel. In our case the receptive field is just the vector of 100
# regression  coefficients (one for each pixel) reshaped into the 10x10
# form of the original images. Some voxels are receptive to only very few
# pixels, so we use `Lasso regression
# <http://en.wikipedia.org/wiki/Lasso_(statistics)>`_ to estimate a sparse
# set of regression coefficients.

from sklearn.linear_model import LassoLarsCV

# automatically estimate the sparsity by cross-validation
lasso = LassoLarsCV(max_iter=10)

# Mark the same pixel in each receptive field
marked_pixel = (4, 2)

from matplotlib import gridspec
from matplotlib.patches import Rectangle

fig = plt.figure(figsize=(12, 8))
fig.suptitle('Receptive fields of the marked voxels', fontsize=25)

# GridSpec allows us to do subplots with more control of the spacing
gs1 = gridspec.GridSpec(2, 3)

# we fit the Lasso for each of the three voxels of the upper row
for i, index in enumerate([1780, 1951, 2131]):
Example #21
    lambda_ = datas['cat_data']['lambda']
    shift = datas['cat_data']['shift']

    # models
    models = {}
    models["RF"] = GridSearchCV(
        RFR(n_jobs=-1),
        param_grid={
            "n_estimators": [10, 100, 1000, 10000],
            "max_features": [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
        },
        cv=5,
        n_jobs=20)
    models["LASSO"] = LassoCV(max_iter=100000, cv=5, n_jobs=20)
    models["RIDGE"] = RidgeCV(cv=5)
    models["LASSOLARS"] = LassoLarsCV(max_iter=5000, cv=5, n_jobs=-1)
    models["SVR_POLY2"] = GridSearchCV(
        SVR(kernel='poly', degree=2),
        param_grid={
            "C": [0.001, 0.01, 0.1, 1, 10, 100, 1000, 10000],
            "gamma": [0.00001, 0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000],
            "epsilon": [0.01, 0.1, 0.5, 1, 2, 4]
        },
        cv=5,
        n_jobs=20)
    models["SVR_RBF"] = GridSearchCV(
        SVR(kernel='rbf'),
        param_grid={
            "C": [0.001, 0.01, 0.1, 1, 10, 100, 1000, 10000],
            "gamma": [0.00001, 0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000],
            "epsilon": [0.01, 0.1, 0.5, 1, 2, 4]
Example #22
    def fit(self, X, y):
        """
        Variable Selection and Prediction.

        Variable Selection Model: lasso
        Prediction Models: see self.predict()

        Parameters
        ----------
        X : numpy array or sparse matrix of shape [n_samples,n_features]
            Training data
        y : numpy array of shape [n_samples, n_targets]
            Target values

        Returns
        -------
        self : returns an instance of self.
        """


        ##################################
        ## OLS Train
        ##################################
        #ols_train = linear_model.LinearRegression(fit_intercept=True,
        #                                         normalize=False,
        #                                          copy_X=True)
        #ols_train.fit(X, y)
        #self.rss_ols_train = np.sum((ols_train.predict(X) - y) ** 2)
        """
        fit_intercept=True, center the data
        copy=True, because centering data involves X -= X_mean

        CAUTION:
        normalize=False, otherwise it involves taking squares of X and loses precision

        self.rss_ols_train.shape = (1,1)
        """

        ##################################
        ## Pre Variable Selection Predictions
        ##################################
        self.pre_pred = False
        if self.pre_pred:
            print "Computing ... "
            param_ridge_pre = list(np.arange(1e9,2e9,1e8))
            self.pls_pre, self.ridge_pre = \
                self.run_models(X, y, param_ridge_pre)

        ##################################
        ## Lasso Variable Selection
        ##################################
        self.lasso_cv = LassoLarsCV(fit_intercept=True, normalize=True, precompute='auto',
                            max_iter=X.shape[1]+1000, max_n_alphas=X.shape[1]+1000,
                            eps= 2.2204460492503131e-16,copy_X=True,
                            cv=self.cv, n_jobs=self.n_jobs)
        self.lasso_cv.fit(X, y)
        """
        normalize=True, lasso seems to be able to handle itself
        """

        if self.rlasso_selection_threshold == 0:
            self.lasso_refit = linear_model.LassoLars(alpha=self.lasso_cv.alpha_,
                                fit_intercept=True, normalize=True, precompute='auto',
                                max_iter=X.shape[1]+1000,
                                eps=2.2204460492503131e-16, copy_X=True,
                                fit_path=False)
            self.lasso_refit.fit(X, y)
            self.active = self.lasso_refit.coef_ != 0
            self.active = self.active[0,:]
            X_selected = X[:, self.active]
        else:
            self.rlasso = RandomizedLasso(alpha=self.lasso_cv.alpha_, scaling=0.5,
                                          sample_fraction=0.75, n_resampling=200,
                                          selection_threshold=self.rlasso_selection_threshold, fit_intercept=True,
                                          verbose=False, normalize=True, precompute='auto',
                                          max_iter=500, eps=2.2204460492503131e-16,
                                          random_state=None, n_jobs=self.n_jobs, pre_dispatch='3*n_jobs',)
            self.rlasso.fit(X, y)
            X_selected = self.rlasso.transform(X)

        ##################################
        ## Post Variable Selection Predictions
        ##################################
        self.pls_post, self.ridge_post = \
            self.run_models(X_selected, y, self.param_ridge_post)


        return self
Example #23
#!/usr/bin/env python

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LassoLarsCV
from sklearn import datasets
from sklearn.utils import shuffle
import numpy as np

boston = datasets.load_boston()
X, Y = shuffle(boston.data, boston.target, random_state=13)
X = X.astype(np.float32)
offset = int(X.shape[0] * 0.9)
X_train, Y_train = X[:offset], Y[:offset]
X_test, Y_test = X[offset:], Y[offset:]

regressor = LassoLarsCV(cv=15)
regressor.fit(X_train, Y_train)
score = regressor.score(X_test, Y_test)
print(score)
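# Note on the example above: sklearn.datasets.load_boston was deprecated in scikit-learn 1.0 and
# removed in 1.2. To run the same pattern on a current release, one option (sketch) is to swap in
# another regression data set:
from sklearn.datasets import fetch_california_housing
from sklearn.utils import shuffle

X, Y = fetch_california_housing(return_X_y=True)
X, Y = shuffle(X, Y, random_state=13)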
Example #24
    prediction = grid_search.predict(validation_features)
    dio.save_prediction(model_name, prediction, type_v)


parameters = {
    "alpha": [0.1, 1, 10],
}

make_grid_search(Ridge(tol=1e-2, solver="lsqr"), parameters, "Ridge_tfidf_05d",
                 param)
#make_grid_search(Lasso(), parameters, "Lasso_tfidf_05d", param)
#make_grid_search(LassoLars(), parameters, "LassoLars_tfidf_05d", param)

a = 5 / 0
benchmark(LassoCV(max_iter=100, verbose=1))
benchmark(LassoLarsCV(n_jobs=-1, max_iter=100, max_n_alphas=50, verbose=1))

n_trees = 20
min_samples_split = 2
name = "ExtraTrees_min_sample%d_%dtrees_tfidf-05d_BoW-titleFullRaw-AllColumns_new_log" % (
    min_samples_split, n_trees)
classifier = ExtraTreesRegressor(
    n_estimators=n_trees,
    #classifier = RandomForestRegressor(n_estimators=n_trees,
    verbose=2,
    n_jobs=4,  # 2 jobs on submission / 4 on valid test
    oob_score=True,
    min_samples_split=min_samples_split,
    random_state=3465343)
classifier.fit(features, salaries)
#classifier = dio.load_model(name)
Example #25
model_ridge.fit(train_X, train_y)
print('R^2 on the training set: ', model_ridge.score(train_X, train_y))
print('R^2 on the validation set: ', model_ridge.score(test_X, test_y))
pred_1 = model_ridge.predict(test_X)
print('Model MSE: ', mean_squared_error(test_y, pred_1))

# RidgeCV accepts several candidate alpha values and uses cross-validation to pick the best one
model = RidgeCV(alphas=[0.001, 0.01, 0.1, 1.0])
model.fit(train_X, train_y)
print("Model parameters:", model.get_params())
print("Model details:", model)
print('Best alpha:', model.alpha_)  # Ridge() has no alpha_ attribute; only RidgeCV does
print('R^2 on the training set: ', model.score(train_X, train_y))
print('R^2 on the validation set: ', model.score(test_X, test_y))

pred_2 = model.predict(test_X)
print('Ridge model MSE: ', mean_squared_error(test_y, pred_2))

# Lasso regression
model_lasso = Lasso(alpha=0.01)
model_lasso = LassoCV()
model_lasso = LassoLarsCV()
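# NOTE: model_lasso is reassigned above; only the last assignment (LassoLarsCV()) is fit and
# evaluated below, while the Lasso(alpha=0.01) and LassoCV() instances are discarded.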
model_lasso.fit(train_X, train_y)
print("Model parameters:", model_lasso.get_params())
print("Model details:", model_lasso)
#print('Best alpha:', model_lasso.alpha_)
print('R^2 on the training set: ', model_lasso.score(train_X, train_y))
print('R^2 on the validation set: ', model_lasso.score(test_X, test_y))

pred_3 = model_lasso.predict(test_X)
print('Lasso model MSE: ', mean_squared_error(test_y, pred_3))
Example #26
predictors['ESTEEM1']=preprocessing.scale(predictors['ESTEEM1'].astype('float64'))
predictors['VIOL1']=preprocessing.scale(predictors['VIOL1'].astype('float64'))
predictors['PASSIST']=preprocessing.scale(predictors['PASSIST'].astype('float64'))
predictors['DEVIANT1']=preprocessing.scale(predictors['DEVIANT1'].astype('float64'))
predictors['GPA1']=preprocessing.scale(predictors['GPA1'].astype('float64'))
predictors['EXPEL1']=preprocessing.scale(predictors['EXPEL1'].astype('float64'))
predictors['FAMCONCT']=preprocessing.scale(predictors['FAMCONCT'].astype('float64'))
predictors['PARACTV']=preprocessing.scale(predictors['PARACTV'].astype('float64'))
predictors['PARPRES']=preprocessing.scale(predictors['PARPRES'].astype('float64'))

# split data into train and test sets
pred_train, pred_test, tar_train, tar_test = train_test_split(predictors, target, 
                                                              test_size=.3, random_state=123)

# specify the lasso regression model
model=LassoLarsCV(cv=10, precompute=False).fit(pred_train,tar_train)

# print variable names and regression coefficients
coef = dict(zip(predictors.columns, model.coef_))

#sort by value
import operator
sorted(coef.items(), key=operator.itemgetter(1), reverse=True) 
#most significant positive coefficients: ESTEEM1, GPA1, FAMCONCT
#most significant negative coefficients: DEP1, BLACK, VIOL1

# plot coefficient progression
# show the order of selected cofficient and its value when new predictors are added
m_log_alphas = -np.log10(model.alphas_) #alpha = penalty parameter = lambda through the model selection process
ax = plt.gca()
plt.plot(m_log_alphas, model.coef_path_.T) #.T = transpose
Example #27
models_out = pd.DataFrame(columns=columns)

# Fit models for each season separately
for l, ssn in enumerate(data['seasons']):

    print('Training years:', data['trn_yrs'])
    n_estimators = data['n_smpls']

    # Indexes for selecting data from the training period
    trn_idx = fcts.bool_index_to_int_index(
        np.isin(data['Y']['time.season'], ssn)
        & np.isin(data['Y']['time.year'], data['trn_yrs']))

    # Fit LassoLarsCV models using the handy BaggingRegressor meta-estimator
    cv = KFold(n_splits=5, shuffle=True)
    base_estimator = LassoLarsCV(eps=2e-10, max_iter=200, cv=cv, n_jobs=1)
    ensemble = fcts.bagging_metaestimator(
        data['X'].values[trn_idx], data['Y'][data['y_var']].values[trn_idx],
        data['vrbl_names'], data['n_smpls'], data['p_smpl'], data['p_feat'],
        data['n_jobs'], base_estimator)

    # Append the models to the output table, including also the season information
    for i, mdl in enumerate(ensemble.estimators_[:n_estimators]):
        feature_idxs = ensemble.estimators_features_[i]
        posit_features = np.abs(mdl.coef_) > 0
        feature_names = list(data['vrbl_names'][feature_idxs][posit_features])
        n_features = len(feature_names)
        fcs = mdl.predict(data['X'].values[trn_idx][:, feature_idxs])
        obs = data['Y'][data['y_var']].values[trn_idx]
        train_period_acc = fcts.calc_corr(fcs, obs)
        df = pd.DataFrame([[
Exemple #28
0
mu = np.repeat(0, 100)
dists = np.arange(100)
powers = [[np.abs(i-j) for j in dists] for i in dists]
r = np.power(.5, powers)
X = np.random.multivariate_normal(mu, r, size=50)
y = 7*X[:, 0] + \
    5*X[:, 10] + \
    3*X[:, 20] + \
    1*X[:, 30] + \
    .5*X[:, 40] + \
    .2*X[:, 50] + \
    np.random.normal(0, 2, 50)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.20)

lasso = LassoLarsCV(cv=5).fit(X_train, y_train)
alpha = lasso.alpha_
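
# A hedged check (not in the original): with the sparse ground truth above,
# the CV-chosen LARS-lasso fit should place most weight near the true
# support (columns 0, 10, 20, 30, ...); the weakest effects may be zeroed out.
print('CV-selected alpha:', alpha)
print('non-zero coefficient columns:', np.flatnonzero(lasso.coef_))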

# For testing when X input has a single feature
Xa, ya = make_regression(n_samples=50,
                         n_features=1,
                         random_state=0,
                         coef=False)

# For testing when the y output vector is multidimensional
Xb, yb = make_regression(n_samples=50,
                         n_features=10,
                         n_informative=3,
                         n_targets=2,
                         noise=2,
                         random_state=0,
Exemple #29
0
    value: float
        Returns a float in the range (0., 1.)
    """
    try:
        value = float(value)
    except (TypeError, ValueError):
        raise argparse.ArgumentTypeError(
            'Invalid float value: \'{}\''.format(value))
    if value < 0.0 or value > 1.0:
        raise argparse.ArgumentTypeError(
            'Invalid float value: \'{}\''.format(value))
    return value
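
# A hedged usage sketch: the validator's actual name is not visible in this
# fragment, so `float_range` below is a hypothetical stand-in. Wiring it
# into argparse makes out-of-range values fail at parse time.
#
#   parser = argparse.ArgumentParser()
#   parser.add_argument('--test-size', type=float_range, default=0.25)
#   print(parser.parse_args(['--test-size', '0.3']).test_size)  # 0.3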

# dictionary of ml options
ml_dict = {
        'lasso': LassoLarsCV(),
        'svr': SVR(),
        'lsvr': LinearSVR(),
        'lr': LogisticRegression(solver='sag'),
        'sgd': SGDClassifier(loss='log',penalty='l1'),
        'svc': SVC(),
        'lsvc': LinearSVC(),
        'rfc': RandomForestClassifier(),
        'rfr': RandomForestRegressor(),
        'dtc': DecisionTreeClassifier(),
        'dtr': DecisionTreeRegressor(),
        'dc': DistanceClassifier(),
        'knc': KNeighborsClassifier(),
        'knr': KNeighborsRegressor(),
        None: None
}
Exemple #30
0
import numpy as np
import pandas as pd

from sklearn.linear_model import LassoLarsCV
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from tpot.builtins import ZeroCount

# NOTE: Make sure that the class is labeled 'class' in the data file
tpot_data = np.recfromcsv('../../input/train.csv',
                          delimiter=',',
                          dtype=np.float64)
features = np.delete(tpot_data.view(np.float64).reshape(tpot_data.size, -1),
                     tpot_data.dtype.names.index('class'),
                     axis=1)
training_features, testing_features, training_target, testing_target = \
    train_test_split(features, tpot_data['class'], random_state=42)

exported_pipeline = make_pipeline(ZeroCount(), LassoLarsCV(normalize=True))

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
Exemple #31
0
    def __init__(self, population_size=50, generations=100,
                 mutation_rate=0.5, crossover_rate=0.5,
                 ml = None, min_depth = 1, max_depth = 2, max_depth_init = 2,
                 sel = 'epsilon_lexicase', tourn_size = 2, fit_choice = None,
                 op_weight = False, max_stall=100, seed_with_ml = True, erc = False,
                 random_state=None, verbosity=0,
                 scoring_function=None, disable_update_check=False,
                 elitism=True, boolean = False,classification=False,clean=False,
                 track_diversity=False,mdr=False,otype='f',c=True,
                 weight_parents=True):
                # sets up GP.

        # Save params to be recalled later by get_params()
        self.params = locals()  # placed before any local variable definitions
        self.params.pop('self')

        # # Do not prompt the user to update during this session if they
        # ever disabled the update check
        # if disable_update_check:
        #     FEW.update_checked = True
        #
        # # Prompt the user if their version is out of date
        # if not disable_update_check and not FEW.update_checked:
        #     update_check('FEW', __version__)
        #     FEW.update_checked = True

        self._best_estimator = None
        self._training_features = None
        self._training_labels = None
        self._best_inds = None

        self.population_size = population_size
        self.generations = generations
        self.mutation_rate = mutation_rate
        self.crossover_rate = crossover_rate
        self.min_depth = min_depth
        self.max_depth = max_depth
        self.max_depth_init = max_depth_init
        self.sel = sel
        self.tourn_size = tourn_size
        self.fit_choice = fit_choice
        self.op_weight = op_weight
        self.max_stall = max_stall
        self.weight_parents = weight_parents
        self.seed_with_ml = seed_with_ml
        self.erc = erc
        self.random_state = check_random_state(random_state)
        self.verbosity = verbosity
        self.scoring_function = scoring_function
        self.gp_generation = 0
        self.elitism = elitism
        self.max_fit = 99999999.666
        self.boolean = boolean
        self.classification = classification
        self.clean = clean
        self.ml = Pipeline([('standardScaler',StandardScaler()), ('ml', ml)])
        self.ml_type = type(self.ml.named_steps['ml']).__name__
        self.track_diversity = track_diversity
        self.mdr = mdr
        self.otype = otype

        # if otype is b, boolean functions must be turned on
        if self.otype=='b':
            self.boolean = True

        # instantiate sklearn estimator according to specified machine learner
        if self.ml.named_steps['ml'] is None:
            if self.classification:
                self.ml = Pipeline([('standardScaler',StandardScaler()),
                                    ('ml',LogisticRegression(solver='sag'))])
            else:
                self.ml = Pipeline([('standardScaler',StandardScaler()),
                                    ('ml',LassoLarsCV())])
        if not self.scoring_function:
            if self.classification:
                self.scoring_function = accuracy_score
            else:
                self.scoring_function = r2_score

        # set default fitness metrics for various learners
        if not self.fit_choice:
            tmp_dict =  defaultdict(lambda: 'r2', {
                            #regression
                            type(LassoLarsCV()): 'mse',
                            type(SVR()): 'mae',
                            type(LinearSVR()): 'mae',
                            type(KNeighborsRegressor()): 'mse',
                            type(DecisionTreeRegressor()): 'mse',
                            type(RandomForestRegressor()): 'mse',
                            #classification
                            type(DistanceClassifier()): 'silhouette',
            })
            self.fit_choice = tmp_dict[type(self.ml.named_steps['ml'])]

        # Columns to always ignore when in an operator
        self.non_feature_columns = ['label', 'group', 'guess']

        # function set
        self.func_set = [node('+'), node('-'), node('*'), node('/'),
                         node('sin'), node('cos'), node('exp'), node('log'),
                         node('^2'), node('^3'), node('sqrt')]

        # terminal set
        self.term_set = []
        # diversity
        self.diversity = []
        # use cython
        self.c = c
    def __init__(self, hypers):
        modelHypers = self.extract_model_hypers(hypers)
        self.model = LassoLarsCV(**modelHypers)

    def fit(self, X, y):
        from sklearn.linear_model import LassoLarsCV
        self.estimator = LassoLarsCV(cv=5)
        self.estimator.fit(X, y)
        return self
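
    # A hedged companion sketch (not in the original fragment): the usual
    # scikit-learn wrapper pattern delegates predict to the inner estimator
    # stored by fit() above.
    def predict(self, X):
        return self.estimator.predict(X)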
                        dtype=np.float64)
features = tpot_data.drop('target', axis=1).values
training_features, testing_features, training_target, testing_target = \
            train_test_split(features, tpot_data['target'].values, random_state=42)

# Score on the training set was:-17.53982123860686
exported_pipeline = make_pipeline(
    make_union(
        StackingEstimator(estimator=make_pipeline(
            StackingEstimator(
                estimator=ElasticNetCV(l1_ratio=0.55, tol=0.001)),
            StackingEstimator(estimator=GradientBoostingRegressor(
                alpha=0.8,
                learning_rate=1.0,
                loss="lad",
                max_depth=4,
                max_features=0.7000000000000001,
                min_samples_leaf=11,
                min_samples_split=20,
                n_estimators=100,
                subsample=0.1)), LassoLarsCV(normalize=True))),
        FunctionTransformer(copy)),
    LinearSVR(C=5.0,
              dual=True,
              epsilon=1.0,
              loss="epsilon_insensitive",
              tol=0.1))

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
Exemple #35
0
def QuickML_Ensembling(X_train, y_train, X_test, y_test='', modeltype='Regression', 
                       Boosting_Flag=False,
                       scoring='', verbose=0):
    """
    Quickly builds and runs multiple models for a clean data set (only numerics).
    """
    start_time = time.time()
    seed = 99
    FOLDS = 5
    model_dict = {}
    model_tuples = []
    if len(X_train) <= 100000 or X_train.shape[1] < 50:
        NUMS = 100
    else:
        try:
            X_train = X_train.sample(frac=0.30,random_state=99)
            y_train = y_train[X_train.index]
        except:
            pass
        NUMS = 200
    if modeltype == 'Regression':
        if scoring == '':
            scoring = 'neg_mean_squared_error'
        scv = ShuffleSplit(n_splits=FOLDS,random_state=seed)
        if Boosting_Flag is None:
            ## Create an ensemble model ####
            model5 = AdaBoostRegressor(base_estimator=DecisionTreeRegressor(
                                    random_state=seed, max_depth=1, min_samples_leaf=2
                                    ), n_estimators=NUMS, random_state=seed)
            model_tuples.append(('Adaboost',model5))
        elif not Boosting_Flag:
            model5 = LassoLarsCV(cv=scv)
            model_tuples.append(('LassoLarsCV',model5))
        else:
            model5 = BaggingRegressor(DecisionTreeRegressor(random_state=seed),
                                        n_estimators=NUMS,random_state=seed)
            model_tuples.append(('Bagging',model5))
        if Boosting_Flag is None:
            model6 = DecisionTreeRegressor(max_depth=5,min_samples_leaf=2)
            model_tuples.append(('Decision_Tree',model6))
        elif not Boosting_Flag:
            model6 = LinearSVR()
            model_tuples.append(('Linear_SVR',model6))
        else:
            model6 = DecisionTreeRegressor(max_depth=5,min_samples_leaf=2)
            model_tuples.append(('Decision_Tree',model6))
        sgd_best_model = SGDRegressor(alpha=1e-06,
                            loss='squared_loss',
                           max_iter=1000,
                           penalty='l2',
                           learning_rate = 'constant',
                           eta0 = .1,
                           random_state = 3,
                           tol=None)
        model7 = BaggingRegressor(sgd_best_model)
        model_tuples.append(('SGD_Regressor',model7))
        if Boosting_Flag is None:
            #### If the Boosting_Flag is True, it means Boosting model is present. 
            ###   So choose a different kind of classifier here
            model8 = RandomForestRegressor(bootstrap = False,
                                       max_depth = 10,
                                       max_features = 'auto',
                                       min_samples_leaf = 2,
                                       n_estimators = 200,
                                       random_state=99)
            model_tuples.append(('Bagging_Regressor',model8))
        elif not Boosting_Flag:
            #### If the Boosting_Flag is True, it means Boosting model is present. 
            ###   So choose a different kind of classifier here
            model8 = AdaBoostRegressor(base_estimator=DecisionTreeRegressor(
                                    random_state=seed, max_depth=1, min_samples_leaf=2
                                    ), n_estimators=NUMS, random_state=seed)
            model_tuples.append(('Adaboost',model8))
        else:
            model8 = RandomForestRegressor(bootstrap = False,
                                       max_depth = 10,
                                       max_features = 'auto',
                                       min_samples_leaf = 2,
                                       n_estimators = 200,
                                       random_state=99)
            model_tuples.append(('Bagging_Regressor',model8))
    else:
        if scoring == '':
            scoring = 'accuracy'
        scv = StratifiedKFold(n_splits=FOLDS,random_state=seed)
        if Boosting_Flag is None:
            ## Create an ensemble model ####
            model5 = AdaBoostClassifier(base_estimator=DecisionTreeClassifier(
                                    random_state=seed, max_depth=1, min_samples_leaf=2
                                    ), n_estimators=NUMS, random_state=seed)
            model_tuples.append(('Adaboost',model5))
        elif not Boosting_Flag:
            model5 = LinearDiscriminantAnalysis()
            model_tuples.append(('Linear_Discriminant',model5))
        else:
            model5 = LogisticRegression(C=0.01,solver='liblinear',
                                          random_state=seed)
            model_tuples.append(('Logistic_Regression_Model',model5))
        if Boosting_Flag is None:
            model6 = DecisionTreeClassifier(max_depth=5,min_samples_leaf=2)
            model_tuples.append(('Decision_Tree',model6))
        elif not Boosting_Flag:
            model6 = LinearSVC()
            model_tuples.append(('Linear_SVC',model6))
        else:
            model6 = DecisionTreeClassifier(max_depth=5,min_samples_leaf=2)
            model_tuples.append(('Decision_Tree',model6))
        if modeltype == 'Binary_Classification':
            model7 = GaussianNB()
        else:
            model7 = MultinomialNB()
        model_tuples.append(('Naive_Bayes',model7))
        if Boosting_Flag is None:
            #### If the Boosting_Flag is True, it means Boosting model is present. 
            ###   So choose a different kind of classifier here
            model8 = RandomForestClassifier(bootstrap = False,
                                       max_depth = 10,
                                       max_features = 'auto',
                                       min_samples_leaf = 2,
                                       n_estimators = 200,
                                       random_state=99)
            model_tuples.append(('Bagging_Classifier',model8))
        elif not Boosting_Flag:
            #### If the Boosting_Flag is True, it means Boosting model is present. 
            ###   So choose a different kind of classifier here
            sgd_best_model = SGDClassifier(alpha=1e-06,
                                loss='log',
                               max_iter=1000,
                               penalty='l2',
                               learning_rate = 'constant',
                               eta0 = .1,
                               random_state = 3,
                               tol=None)
            model8 = OneVsRestClassifier(sgd_best_model)
            model_tuples.append(('One_vs_Rest_Classifier',model8))
        else:
            model8 = RandomForestClassifier(bootstrap = False,
                                       max_depth = 10,
                                       max_features = 'auto',
                                       min_samples_leaf = 2,
                                       n_estimators = 200,
                                       random_state=99)
            model_tuples.append(('Bagging_Classifier',model8))
    model_dict = dict(model_tuples)
    models, results = run_ensemble_models(model_dict, X_train, y_train, X_test, y_test, 
                                          scoring, modeltype)
    return models, results
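
# A hedged usage sketch (not part of the original module; it assumes the
# run_ensemble_models helper called inside QuickML_Ensembling is importable
# and that the sklearn imports used above are present):
if __name__ == '__main__':
    from sklearn.datasets import make_regression
    from sklearn.model_selection import train_test_split
    X_demo, y_demo = make_regression(n_samples=500, n_features=12, noise=10.0,
                                     random_state=99)
    X_tr, X_te, y_tr, y_te = train_test_split(X_demo, y_demo, test_size=0.25,
                                              random_state=99)
    models, results = QuickML_Ensembling(X_tr, y_tr, X_te, y_te,
                                         modeltype='Regression',
                                         Boosting_Flag=False,
                                         scoring='neg_mean_squared_error')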
    RobustScaler(), MinMaxScaler(),
    StackingEstimator(estimator=LinearSVR(C=25.0,
                                          dual=True,
                                          epsilon=0.01,
                                          loss="epsilon_insensitive",
                                          tol=0.0001)),
    StackingEstimator(estimator=DecisionTreeRegressor(
        max_depth=8, min_samples_leaf=17, min_samples_split=9)),
    FeatureAgglomeration(affinity="l2", linkage="average"),
    RBFSampler(gamma=0.75),
    StackingEstimator(estimator=LinearSVR(C=1.0,
                                          dual=True,
                                          epsilon=1.0,
                                          loss="squared_epsilon_insensitive",
                                          tol=0.1)),
    StackingEstimator(
        estimator=KNeighborsRegressor(n_neighbors=9, p=1, weights="uniform")),
    StackingEstimator(estimator=LassoLarsCV(normalize=True)),
    SelectPercentile(score_func=f_regression, percentile=26), StandardScaler(),
    PCA(iterated_power=7, svd_solver="randomized"),
    StackingEstimator(estimator=LinearSVR(C=10.0,
                                          dual=True,
                                          epsilon=0.01,
                                          loss="squared_epsilon_insensitive",
                                          tol=1e-05)), ZeroCount(),
    SelectFwe(score_func=f_regression, alpha=0.039),
    PCA(iterated_power=5, svd_solver="randomized"), RidgeCV())

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
# Compute the RMSE and R^2 for the training and test sets

y_train_rmse = sqrt(metrics.mean_squared_error(y_train, y_train_pred))
y_train_score = rd.score(x_train, y_train)
y_test_rmse = sqrt(metrics.mean_squared_error(y_test, y_test_pred))
y_test_score = rd.score(x_test, y_test)
print('Training set RMSE: {0}, R^2: {1}'.format(y_train_rmse, y_train_score))
print('Test set RMSE: {0}, R^2: {1}'.format(y_test_rmse, y_test_score))
'''======== 9. Lasso regression ========'''
import numpy as np
import matplotlib.pyplot as plt  # plotting
from sklearn.linear_model import Lasso, LassoCV, LassoLarsCV  # Lasso regression; LassoCV picks alpha by cross-validation; LassoLarsCV picks alpha by cross-validated least-angle regression

#model = Lasso(alpha=0.01)  # tune alpha to control the amount of shrinkage
# model = LassoCV()  # LassoCV selects the best alpha automatically
model = LassoLarsCV()  # LassoLarsCV selects the best alpha automatically
model.fit(x_train, y_train)  # fit the linear model
print('Coefficients:\n', model.coef_, model.intercept_)

print('Linear regression model:\n', model)
print('Best alpha:', model.alpha_)  # only available with LassoCV / LassoLarsCV

# Use the model for prediction
# Predict on the training and test sets separately
y_train_pred = model.predict(x_train)
y_test_pred = model.predict(x_test)
# Compute the RMSE and R^2 for each

y_train_rmse = sqrt(metrics.mean_squared_error(y_train, y_train_pred))
y_train_score = model.score(x_train, y_train)
y_test_rmse = sqrt(metrics.mean_squared_error(y_test, y_test_pred))
Exemple #38
0
import numpy as np
import pandas as pd
from sklearn.linear_model import LassoLarsCV
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MinMaxScaler, StandardScaler

# NOTE: Make sure that the class is labeled 'target' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE',
                        sep='COLUMN_SEPARATOR',
                        dtype=np.float64)
features = tpot_data.drop('target', axis=1).values
training_features, testing_features, training_target, testing_target = \
            train_test_split(features, tpot_data['target'].values, random_state=None)

# Average CV score on the training set was:-1.758311648032997e-26
exported_pipeline = make_pipeline(MinMaxScaler(), StandardScaler(),
                                  StandardScaler(),
                                  LassoLarsCV(normalize=True))

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
Exemple #39
0
        X_trainset_001.append(X_trainset[i])
        y_trainset_001.append(1.0)
        num_2 += 1
print(num_1, num_2)
classify_model_001 = RandomForestClassifier(n_estimators=55, random_state=1)
classify_model_001.fit(X_trainset_001, y_trainset_001)

### Build the regression model for targets below 0.003
from sklearn.linear_model import LassoLarsCV, BayesianRidge
X_trainset_0003 = []
y_trainset_0003 = []
for i in range(0, y_trainset.__len__(), 1):
    if y_trainset[i] < 0.003:
        X_trainset_0003.append(X_trainset[i])
        y_trainset_0003.append(y_trainset[i])
reg_0003 = LassoLarsCV(max_n_alphas=100, positive=True)
reg_0003.fit(X_trainset_0003, y_trainset_0003)

### Build the regression model for the 0.003-0.01 range
from sklearn.linear_model import LassoLarsCV
X_trainset_001 = []
y_trainset_001 = []
for i in range(0, y_trainset.__len__(), 1):
    if y_trainset[i] >= 0.003 and y_trainset[i] < 0.015:
        X_trainset_001.append(X_trainset[i])
        y_trainset_001.append(y_trainset[i])
reg_001 = LassoLarsCV(max_n_alphas=100, cv=10)
reg_001.fit(X_trainset_001, y_trainset_001)

### Build the regression model for targets above 0.01
from sklearn.linear_model import BayesianRidge, RANSACRegressor, RidgeCV, Ridge, LassoLarsCV
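
# A hedged illustration (not in the original, which is truncated before the
# large-value model is built): at prediction time the piecewise setup above
# would route each sample through the classifier and then to the regressor
# trained for that target range.
def predict_piecewise(x_row):
    # classify_model_001 was trained above to flag which target range a
    # sample falls in; treating label 0.0 as the "small target" class is an
    # assumption of this sketch, since the label coding is partly elided.
    if classify_model_001.predict([x_row])[0] == 0.0:
        return reg_0003.predict([x_row])[0]
    return reg_001.predict([x_row])[0]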
Exemple #40
0
#print(LogReg.coef_)
#print(icu.head())

#############   LASSO   ##############
predvar = icu.copy()
target = predvar.STA
predictors = predvar[[
    'AGE', 'SYS', 'HRA', 'RACE_1', 'RACE_2', 'RACE_3', 'CPR_1', 'TYP_1'
]].copy()

for i in list(predictors.columns.values):
    predictors[i] = preprocessing.scale(predictors[i].astype('float64'))

pred_train, pred_test, resp_train, resp_test = train_test_split(
    predictors, target, test_size=.3, random_state=123)
model = LassoLarsCV(cv=10, precompute=True).fit(pred_train, resp_train)
dict(zip(predictors.columns, model.coef_))
m_log_alphascv = -np.log10(model.cv_alphas_)

plt.figure()
plt.plot(m_log_alphascv, model.mse_path_, ':')
plt.plot(m_log_alphascv,
         model.mse_path_.mean(axis=-1),
         'k',
         label='Average across the folds',
         linewidth=2)
plt.axvline(-np.log10(model.alpha_),
            linestyle='--',
            color='k',
            label='alpha CV')
plt.legend()
model.fit(X, y)  # fit the linear model
print('Coefficients:\n', model.coef_)
print('Linear regression model details:\n', model)
print('Best alpha from cross-validation:', model.alpha_)  # Ridge() does not expose this; only RidgeCV does

pred_2 = model.predict(X)
# Scatter plot of the data and the fitted line
plt.scatter(X, y, marker='x')
plt.plot(X, pred_1, c='r')
plt.xlabel("x")
plt.ylabel("y")
plt.show()

# Lasso regression
# model = Lasso(alpha=0.01)  # tune alpha to control the amount of shrinkage
# model = LassoCV()          # LassoCV selects the best alpha automatically
model = LassoLarsCV()        # LassoLarsCV selects the best alpha automatically
model.fit(X, y)  # fit the linear model
print('Coefficients:\n', model.coef_)
print('Linear regression model details:\n', model)
# print('Best alpha:', model.alpha_)  # only available with LassoCV / LassoLarsCV

pred = model.predict(X)

# Scatter plot of the data and the fitted line
plt.scatter(X, y, marker='x')
plt.plot(X, pred, c='r')
plt.xlabel("x")
plt.ylabel("y")
plt.show()
Exemple #42
0
    hg = pl.plot(alpha_grid[1:]**.333, scores_path[coef != 0].T[1:], 'r')
    hb = pl.plot(alpha_grid[1:]**.333, scores_path[coef == 0].T[1:], 'k')
    ymin, ymax = pl.ylim()
    pl.xlabel(r'$(\alpha / \alpha_{max})^{1/3}$')
    pl.ylabel('Stability score: proportion of times selected')
    pl.title('Stability Scores Path - Mutual incoherence: %.1f' % mi)
    pl.axis('tight')
    pl.legend((hg[0], hb[0]), ('relevant features', 'irrelevant features'),
              loc='best')

    ###########################################################################
    # Plot the estimated stability scores for a given alpha

    # Use 6-fold cross-validation rather than the default 3-fold: it leads to
    # a better choice of alpha:
    lars_cv = LassoLarsCV(cv=6).fit(X, y)

    # Run the RandomizedLasso: we use paths going down to .1*alpha_max
    # to avoid exploring the regime in which very noisy variables enter
    # the model
    alphas = np.linspace(lars_cv.alphas_[0], .1 * lars_cv.alphas_[0], 6)
    clf = RandomizedLasso(alpha=alphas, random_state=42).fit(X, y)
    trees = ExtraTreesRegressor(100, compute_importances=True).fit(X, y)
    # Compare with F-score
    F, _ = f_regression(X, y)

    pl.figure()
    for name, score in [
        ('F-test', F),
        ('Stability selection', clf.scores_),
        ('Lasso coefs', np.abs(lars_cv.coef_)),
Exemple #43
0
parser = argparse.ArgumentParser()
parser.add_argument("--lat", help="Training Latitude", type=float)
parser.add_argument("--lon", help="Training Longitude", type=float)

args = parser.parse_args()

train_data = load_data.load_supervised(1950, 1985, args.lat, args.lon, 50, which='train')
test_data = load_data.load_supervised(1986, 1999, args.lat, args.lon, 50, which='test')

lasso_file = os.path.join(os.path.dirname(__file__), "models/lasso_%2.2f_%2.2f.pkl" % (args.lat, args.lon))
if os.path.exists(lasso_file):
	print("Reading Lasso from file")
	L = pickle.load(open(lasso_file, 'rb'))
else:
	print("Fitting Lasso")
	L = LassoLarsCV(cv=5)
	L.fit(train_data.X, train_data.y[:,0])
	pickle.dump(L, open(lasso_file, 'wb'))


## Print fit stats
print("Alpha", L.alpha_)
print("Training Pearson Corr:", pearsonr(train_data.y[:,0], L.predict(train_data.X)))
print("Training Spearman Corr:", spearmanr(train_data.y[:,0], L.predict(train_data.X)))

yhat = L.predict(test_data.X)
print("Pearson Corr", pearsonr(test_data.y[:,0], yhat))
print("Spearman Corr", spearmanr(test_data.y[:,0], yhat))
print("SSE", sum((yhat - test_data.y[:,0])**2))

X_aging, y_aging = aging[col], aging[interest]
X_sa, y_sa       = superagers[col], superagers[interest]
X_mci, y_mci     = mci[col], mci[interest]
X_train, y_train = train_set[col], train_set[interest]
X_test, y_test   = test_set[col], test_set[interest]

score = 'mean_squared_error'
tuned_params_lasso = [{'alpha': np.linspace(-1, 1, 100),
                       'normalize': [True, False]}]

### ACROSS WHOLE DATASET
### With StratifiedKFold, we're stratifying according to the interest variable.
### This ensures that there will be an even proportion of RAVLT_DEL (or whatever
### the interest variable is) values across all folds.
skf = cross_validation.StratifiedKFold(y_aging, n_folds=6)
model = LassoLarsCV(max_iter=100000, cv=skf).fit( X_aging, y_aging )

# print("Best estimator for WHOLE DATASET: \n{0}\n".format(model.best_estimator_))
print("Percent variance explained: {0}".format(model.score( X_aging, y_aging)))
print("Coefficients found: \n{0}\n".format(prettyprint(model.coef_, col, sort=True)))

# plot coefficient progression
m_log_alphas = -np.log10(model.alphas_)
ax = plt.gca()
plt.plot(m_log_alphas, model.coef_path_.T)
plt.axvline(-np.log10(model.alpha_), linestyle='--', color='k',
            label='alpha CV')
plt.ylabel('Regression Coefficients')
plt.xlabel('-log(alpha)')
plt.title('Regression Coefficients Progression for Lasso Paths')
Exemple #45
0
	store_csv(mpg, name)

if "Auto" in datasets:
	build_auto(AdaBoostRegressor(DecisionTreeRegressor(random_state = 13, min_samples_leaf = 5), random_state = 13, n_estimators = 17), "AdaBoostAuto")
	build_auto(ARDRegression(normalize = True), "BayesianARDAuto")
	build_auto(BayesianRidge(normalize = True), "BayesianRidgeAuto")
	build_auto(DecisionTreeRegressor(random_state = 13, min_samples_leaf = 2), "DecisionTreeAuto", compact = False)
	build_auto(BaggingRegressor(DecisionTreeRegressor(random_state = 13, min_samples_leaf = 5), random_state = 13, n_estimators = 3, max_features = 0.5), "DecisionTreeEnsembleAuto")
	build_auto(DummyRegressor(strategy = "median"), "DummyAuto")
	build_auto(ElasticNetCV(random_state = 13), "ElasticNetAuto")
	build_auto(ExtraTreesRegressor(random_state = 13, min_samples_leaf = 5), "ExtraTreesAuto")
	build_auto(GradientBoostingRegressor(random_state = 13, init = None), "GradientBoostingAuto")
	build_auto(HuberRegressor(), "HuberAuto")
	build_auto(LarsCV(), "LarsAuto")
	build_auto(LassoCV(random_state = 13), "LassoAuto")
	build_auto(LassoLarsCV(), "LassoLarsAuto")
	build_auto(OptimalLGBMRegressor(objective = "regression", n_estimators = 17, num_iteration = 11), "LGBMAuto", num_iteration = 11)
	build_auto(LinearRegression(), "LinearRegressionAuto")
	build_auto(BaggingRegressor(LinearRegression(), random_state = 13, max_features = 0.75), "LinearRegressionEnsembleAuto")
	build_auto(OrthogonalMatchingPursuitCV(), "OMPAuto")
	build_auto(RandomForestRegressor(random_state = 13, min_samples_leaf = 3), "RandomForestAuto", flat = True)
	build_auto(RidgeCV(), "RidgeAuto")
	build_auto(TheilSenRegressor(n_subsamples = 15, random_state = 13), "TheilSenAuto")
	build_auto(OptimalXGBRegressor(objective = "reg:linear", ntree_limit = 31), "XGBAuto", ntree_limit = 31)

if "Auto" in datasets:
	build_auto(TransformedTargetRegressor(DecisionTreeRegressor(random_state = 13)), "TransformedDecisionTreeAuto")
	build_auto(TransformedTargetRegressor(LinearRegression(), func = numpy.log, inverse_func = numpy.exp), "TransformedLinearRegressionAuto")

def build_auto_h2o(regressor, name):
	transformer = ColumnTransformer(
Exemple #46
0
class LinearAll:
    """
    A repertoire of Linear Variable Selection and Prediction Models

    Parameters
    ----------
    n_jobs : int, optional
        Number of jobs to run in parallel (default 1).
        If -1 all CPUs are used. This will only provide speedup for
        n_targets > 1 and sufficient large problems
    pre_dispatch : int or string, optional
        Controls the number of jobs that get dispatched during parallel
        execution. Reducing this number can be useful to avoid an explosion
        of memory consumption when more jobs get dispatched than CPUs can
        process. This parameter can be:
        - None, in which case all the jobs are immediately created and
          spawned. Use this for lightweight and fast-running jobs, to avoid
          delays due to on-demand spawning of the jobs.
        - An int, giving the exact number of total jobs that are spawned.
        - A string, giving an expression as a function of n_jobs,
          as in '2*n_jobs'.
    refit : boolean
        Refit the best estimator with the entire dataset. If False,
        it is impossible to make predictions using this GridSearchCV
        instance after fitting.
    iid : boolean, optional
        If True, the data is assumed to be identically distributed across
        the folds, and the score is computed from all samples individually
        rather than as the mean loss across the folds. (If the number of
        data points is the same across folds, the two give the same result.)

    Attributes
    ----------
    ols_train : OLS fit on the training data (disabled in this version)
    pls_pre, ridge_pre : prediction models fitted before variable selection
    pls_post, ridge_post : prediction models fitted after variable selection
    """

    def __init__ (self, cv=20, scoring = 'mean_squared_error',
                  n_jobs=1, refit=False, iid=False, pre_pred=True,
                  param_ridge_post=list(np.arange(1,3,0.1)),
                    rlasso_selection_threshold = 0.5):
        #self.__name__ = '__main__'
        """
        CAUTION: we changed to __main__ so that parallelization works
        """
        self.cv = cv
        self.scoring = scoring
        self.n_jobs = n_jobs
        self.refit = refit
        self.iid = iid
        self.pre_pred =pre_pred
        self.param_ridge_post = param_ridge_post
        self.rlasso_selection_threshold = rlasso_selection_threshold

    def run_models(self, X, y, param_ridge):
        """

        Prediction Models.

        OLS, PLS, Ridge

        """

        ##################################
        ## OLS CV
        ##################################
        #ols = linear_model.LinearRegression(fit_intercept=True,
        #                                          normalize=False,
        #                                          copy_X=True)
        #ols_cv_score = cross_validation.cross_val_score(
        #        ols, X, y,
        #        cv=self.cv, scoring=self.scoring,
        #        n_jobs=self.n_jobs)
        """
        self.ols_cv_score.shape = (cv,)
        """

        ##################################
        ## PLS CV
        ##################################
        tuned_parameters = [{'n_components': range(1, 5)}]
        pls = PLSRegression()
        pls_cv = GridSearchCV(pls, tuned_parameters,
                                cv=self.cv, scoring=self.scoring,
                                n_jobs=self.n_jobs,
                                refit=self.refit, iid=self.iid)
        pls_cv.fit(X, y)


        ##################################
        ## Ridge CV
        ##################################
        tuned_parameters = [{'alpha': param_ridge}]
        ridge = linear_model.Ridge(alpha = 1)
        ridge_cv = GridSearchCV(ridge, tuned_parameters,
                                     cv=self.cv, scoring=self.scoring,
                                     n_jobs=self.n_jobs,
                                     refit=self.refit, iid=self.iid)
        ridge_cv.fit(X, y)

        return (pls_cv, ridge_cv)

    def fit(self, X, y):
        """
        Variable Selection and Prediction.

        Variable Selection Model: lasso
        Prediction Models: see self.predict()

        Parameters
        ----------
        X : numpy array or sparse matrix of shape [n_samples,n_features]
            Training data
        y : numpy array of shape [n_samples, n_targets]
            Target values

        Returns
        -------
        self : returns an instance of self.
        """


        ##################################
        ## OLS Train
        ##################################
        #ols_train = linear_model.LinearRegression(fit_intercept=True,
        #                                         normalize=False,
        #                                          copy_X=True)
        #ols_train.fit(X, y)
        #self.rss_ols_train = np.sum((ols_train.predict(X) - y) ** 2)
        """
        fit_intercept=True, center the data
        copy=True, because centering data involves X -= X_mean

        CAUTION:
        normalize=False, otherwise taking squares of X loses precision

        self.rss_ols_train.shape = (1,1)
        """

        ##################################
        ## Pre Variable Selection Predictions
        ##################################
        self.pre_pred = False
        if self.pre_pred:
            print "Computing ... "
            param_ridge_pre = list(np.arange(1e9,2e9,1e8))
            self.pls_pre, self.ridge_pre = \
                self.run_models(X, y, param_ridge_pre)

        ##################################
        ## Lasso Variable Selection
        ##################################
        self.lasso_cv = LassoLarsCV(fit_intercept=True, normalize=True, precompute='auto',
                            max_iter=X.shape[1]+1000, max_n_alphas=X.shape[1]+1000,
                            eps= 2.2204460492503131e-16,copy_X=True,
                            cv=self.cv, n_jobs=self.n_jobs)
        self.lasso_cv.fit(X, y)
        """
        normalize=True, lasso seems to be able to handle itself
        """

        if self.rlasso_selection_threshold == 0:
            self.lasso_refit = linear_model.LassoLars(alpha=self.lasso_cv.alpha_,
                                fit_intercept=True, normalize=True, precompute='auto',
                                max_iter=X.shape[1]+1000,
                                eps=2.2204460492503131e-16, copy_X=True,
                                fit_path=False)
            self.lasso_refit.fit(X, y)
            self.active = self.lasso_refit.coef_ != 0
            self.active = self.active[0,:]
            X_selected = X[:, self.active]
        else:
            self.rlasso = RandomizedLasso(alpha=self.lasso_cv.alpha_, scaling=0.5,
                                          sample_fraction=0.75, n_resampling=200,
                                          selection_threshold=self.rlasso_selection_threshold, fit_intercept=True,
                                          verbose=False, normalize=True, precompute='auto',
                                          max_iter=500, eps=2.2204460492503131e-16,
                                          random_state=None, n_jobs=self.n_jobs, pre_dispatch='3*n_jobs',)
            self.rlasso.fit(X, y)
            X_selected = self.rlasso.transform(X)

        ##################################
        ## Post Variable Selection Predictions
        ##################################
        self.pls_post, self.ridge_post = \
            self.run_models(X_selected, y, self.param_ridge_post)


        return self

    def predict(self, X_test):
        assert(self.refit == True)
        if self.pls_post.best_score_ > self.ridge_post.best_score_:
            self.best_model = self.pls_post
            print "Chosen Model: pls"
        else:
            self.best_model = self.ridge_post
            print "Chosen Model: ridge"

        if self.rlasso_selection_threshold == 0:
            X_test_selected = X_test[:, self.active]
        else:
            X_test_selected = self.rlasso.transform(X_test)
        return self.best_model.best_estimator_.predict(X_test_selected)
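
# A hedged usage sketch for LinearAll (not part of the original class; it
# assumes an older scikit-learn that still ships RandomizedLasso and accepts
# the 'mean_squared_error' scoring string used above):
if __name__ == '__main__':
    from sklearn.datasets import make_regression
    X_demo, y_demo = make_regression(n_samples=200, n_features=30, n_informative=5,
                                     noise=1.0, random_state=0)
    la = LinearAll(cv=5, refit=True)  # refit=True is required by predict()
    la.fit(X_demo, y_demo)
    print(la.predict(X_demo)[:5])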
Exemple #47
0
    block = delay * num_filter
    chan = num // block
    f = (num % block) // delay
    t = (num % block) % delay
    return (chan, f, t)


if __name__ == "__main__":
    os.chdir(os.path.dirname(__file__))
    subj = 'sub1'
    finger = 1
    with h5py.File('ECoG_data.h5', 'r+') as f:
        u = f[subj]['unmixing_matrix'][:]
        X = f[subj]['train_data'][:]
        X -= X.mean(0)
        X = X.dot(u)
        Y = f[subj]['cleaned_train_dg'][:]
    X1, y1, _ = preprocessing(X, Y[:, finger])
    ls = LassoLarsCV()
    ls.fit(X1, y1[:, 0])
    pickle.dump(ls, open('linear_model_'+subj+'_'+str(finger), 'wb'))
    channel_count = Counter([num2info(c)[0] for c in ls.coef_.nonzero()[0]])
    X2, _, yb = preprocessing(X[:, list(set(channel_count.keys()))],
                              Y[:, finger])
    ls2 = LogisticRegressionCV()
    ls2.fit(X2, yb[:, 0])
    pickle.dump(ls2, open('logistic_model_'+subj+'_'+str(finger), 'wb'))
    with h5py.File('selected_channel.h5', 'w') as f:
            f.create_dataset('selected_channel',
                             data=list(set(channel_count.keys())))