Exemple #1
0
def test_few_at_least_as_good_as_default():
    """test_few.py: few performs at least as well as the default ML

    Fits FEW (wrapping LassoLarsCV) and a stand-alone LassoLarsCV on the
    first 300 rows of the shuffled Boston data, then asserts that FEW's
    training score, rounded to 8 decimals, is not below the stand-alone
    model's. Held-out scores are printed but not asserted on.
    """
    # Fixed seed so the in-place row shuffle below is reproducible.
    np.random.seed(1006987)
    boston = load_boston()
    table = np.column_stack((boston.data, boston.target))
    np.random.shuffle(table)
    # The target was stacked last; every column before it is a feature.
    inputs, outputs = table[:, 0:-1], table[:, -1]

    print("feature shape:", boston.data.shape)

    n_train = 300
    train_x, train_y = inputs[:n_train], outputs[:n_train]
    held_x, held_y = inputs[n_train:], outputs[n_train:]

    few_model = FEW(generations=1, population_size=5, ml=LassoLarsCV(),
                    min_depth=1, max_depth=3, sel='tournament')
    few_model.fit(train_x, train_y)
    few_score = few_model.score(train_x, train_y)
    few_test_score = few_model.score(held_x, held_y)

    baseline = LassoLarsCV()
    baseline.fit(train_x, train_y)
    lasso_score = baseline.score(train_x, train_y)
    lasso_test_score = baseline.score(held_x, held_y)

    print("few score:", few_score, "lasso score:", lasso_score)
    print("few test score:", few_test_score,
          "lasso test score:", lasso_test_score)
    # Rounding keeps floating-point noise from failing an effective tie.
    assert round(few_score, 8) >= round(lasso_score, 8)

    print("lasso coefficients:", baseline.coef_)
Exemple #2
0
def test_few_at_least_as_good_as_default():
    """test_few.py: few performs at least as well as the default ML

    Variant with every FEW option spelled out. The reference LassoLarsCV is
    fitted on the training features/labels that FEW stored on itself, and
    both models are scored on the first 300 shuffled Boston rows.
    """
    # Fixed seed so the in-place row shuffle below is reproducible.
    np.random.seed(1006987)
    boston = load_boston()
    data = np.column_stack((boston.data, boston.target))
    np.random.shuffle(data)
    X, y = data[:, 0:-1], data[:, -1]
    X_fit, y_fit = X[:300], y[:300]
    X_holdout, y_holdout = X[300:], y[300:]

    print("feature shape:", boston.data.shape)

    few_options = dict(
        generations=1,
        population_size=5,
        mutation_rate=1,
        crossover_rate=1,
        ml=LassoLarsCV(),
        min_depth=1,
        max_depth=3,
        sel='tournament',
        fit_choice='r2',
        tourn_size=2,
        random_state=0,
        verbosity=0,
        disable_update_check=False,
    )
    learner = FEW(**few_options)

    learner.fit(X_fit, y_fit)
    few_score = learner.score(X_fit, y_fit)
    test_score = learner.score(X_holdout, y_holdout)

    lasso = LassoLarsCV()
    lasso.fit(learner._training_features, learner._training_labels)
    lasso_score = lasso.score(X_fit, y_fit)
    print("few score:", few_score, "lasso score:", lasso_score)
    print("few test score:", test_score,
          "lasso test score:", lasso.score(X_holdout, y_holdout))
    assert few_score >= lasso_score

    print("lasso coefficients:", lasso.coef_)
Exemple #3
0
def lasso_regr(wine_set):
    """Fit a 10-fold cross-validated LassoLars model of wine quality.

    Standardizes eleven predictor columns of ``wine_set``, holds out 30% of
    the rows (``random_state=123``), fits ``LassoLarsCV`` on the rest and
    prints the coefficients, the selected alpha, and the train/test MSE and
    score (labelled R-square). Two matplotlib figures are shown on the way:
    the coefficient paths and the cross-validation MSE per fold.

    Args:
        wine_set: table offering the predictor columns named below plus a
            ``quality`` attribute (used like a pandas DataFrame).

    Returns:
        None.
    """
    feature_names = ["density", 'alcohol', 'sulphates', 'pH',
                     'volatile_acidity', 'chlorides', 'fixed_acidity',
                     'citric_acid', 'residual_sugar', 'free_sulfur_dioxide',
                     'total_sulfur_dioxide']
    raw = wine_set[feature_names]
    targets = wine_set.quality

    # Standardize a copy of the predictors; scale() hands back a bare array,
    # so the column labels are restored afterwards.
    scaled = pd.DataFrame(preprocessing.scale(raw.copy()))
    scaled.columns = raw.columns

    # 70/30 train/test split with a fixed seed.
    pred_train, pred_test, tar_train, tar_test = train_test_split(
        scaled, targets, test_size=.3, random_state=123)

    model = LassoLarsCV(cv=10, precompute=False).fit(pred_train, tar_train)

    print('Predictors and their regression coefficients:')
    coef_by_name = dict(zip(scaled.columns, model.coef_))
    for name, coef in coef_by_name.items():
        print(name, ':', coef)

    # Figure 1: coefficient paths against -log10(alpha).
    path_x = -np.log10(model.alphas_)
    plt.plot(path_x, model.coef_path_.T)
    print('\nAlpha:', model.alpha_)
    plt.axvline(-np.log10(model.alpha_), linestyle="dashed", color='k',
                label='alpha CV')
    plt.ylabel("Regression coefficients")
    plt.xlabel("-log(alpha)")
    plt.title('Regression coefficients progression for Lasso paths')
    plt.show()

    # Figure 2: cross-validation MSE for every fold plus the fold average.
    cv_x = -np.log10(model.cv_alphas_)
    fold_mse = model.cv_mse_path_
    plt.plot(cv_x, fold_mse, ':')
    plt.plot(cv_x, fold_mse.mean(axis=-1), 'k',
             label='Average across the folds', linewidth=2)
    plt.legend()
    plt.xlabel('-log(alpha)')
    plt.ylabel('Mean squared error')
    plt.title('Mean squared error on each fold')
    plt.show()

    # Error and score on both sides of the split.
    train_error = mean_squared_error(tar_train, model.predict(pred_train))
    test_error = mean_squared_error(tar_test, model.predict(pred_test))
    print('\nMean squared error for training data:', train_error)
    print('Mean squared error for test data:', test_error)

    rsquared_train = model.score(pred_train, tar_train)
    rsquared_test = model.score(pred_test, tar_test)
    print('\nR-square for training data:', rsquared_train)
    print('R-square for test data:', rsquared_test)
def lasso_regr(wine_set):
    """Run a cross-validated LARS lasso of ``quality`` on wine chemistry.

    Steps: select the eleven predictor columns, standardize them, split
    70/30 (seed 123), fit ``LassoLarsCV(cv=10)``, print the coefficients and
    chosen alpha, show the coefficient-path and per-fold MSE figures, then
    print MSE and score (labelled R-square) for the train and test parts.

    Args:
        wine_set: table with the predictor columns listed in the body and a
            ``quality`` attribute (used like a pandas DataFrame).

    Returns:
        None.
    """
    pred = wine_set[[
        "density", 'alcohol', 'sulphates', 'pH', 'volatile_acidity',
        'chlorides', 'fixed_acidity', 'citric_acid', 'residual_sugar',
        'free_sulfur_dioxide', 'total_sulfur_dioxide',
    ]]
    targets = wine_set.quality

    # scale() returns a plain array: wrap it and put the column names back.
    predictors = pd.DataFrame(preprocessing.scale(pred.copy()))
    predictors.columns = pred.columns

    split = train_test_split(predictors, targets, test_size=.3,
                             random_state=123)
    pred_train, pred_test, tar_train, tar_test = split

    estimator = LassoLarsCV(cv=10, precompute=False)
    model = estimator.fit(pred_train, tar_train)

    print('Predictors and their regression coefficients:')
    for column, weight in dict(zip(predictors.columns, model.coef_)).items():
        print(column, ':', weight)

    # Coefficient progression along the regularization path.
    minus_log_alphas = -np.log10(model.alphas_)
    plt.plot(minus_log_alphas, model.coef_path_.T)
    print('\nAlpha:', model.alpha_)
    chosen = -np.log10(model.alpha_)
    plt.axvline(chosen, linestyle="dashed", color='k', label='alpha CV')
    plt.ylabel("Regression coefficients")
    plt.xlabel("-log(alpha)")
    plt.title('Regression coefficients progression for Lasso paths')
    plt.show()

    # Mean squared error per fold, with the across-fold mean on top.
    minus_log_cv_alphas = -np.log10(model.cv_alphas_)
    plt.plot(minus_log_cv_alphas, model.cv_mse_path_, ':')
    mean_over_folds = model.cv_mse_path_.mean(axis=-1)
    plt.plot(minus_log_cv_alphas, mean_over_folds, 'k',
             label='Average across the folds', linewidth=2)
    plt.legend()
    plt.xlabel('-log(alpha)')
    plt.ylabel('Mean squared error')
    plt.title('Mean squared error on each fold')
    plt.show()

    # Mean squared error on the training and the test part.
    train_error = mean_squared_error(tar_train, model.predict(pred_train))
    test_error = mean_squared_error(tar_test, model.predict(pred_test))
    print('\nMean squared error for training data:', train_error)
    print('Mean squared error for test data:', test_error)

    # Estimator score on the training and the test part.
    rsquared_train = model.score(pred_train, tar_train)
    rsquared_test = model.score(pred_test, tar_test)
    print('\nR-square for training data:', rsquared_train)
    print('R-square for test data:', rsquared_test)
Exemple #5
0
def lassolarscv():
    """Fit LassoLarsCV on the module-level training data and save predictions.

    Uses a 5-iteration ShuffleSplit (20% test, seed 0) as the CV scheme,
    prints the score on ``base_X``/``base_Y`` and passes the predictions for
    ``X_test`` to ``write_to_file`` under the name ``lassolars.csv``.
    """
    print("Doing cross-validated LassoLars")
    splitter = cross_validation.ShuffleSplit(
        len(base_X), n_iter=5, test_size=0.2, random_state=0)
    model = LassoLarsCV(cv=splitter)
    model.fit(base_X, base_Y)
    # Scored on the same data the model was fitted on.
    fit_score = model.score(base_X, base_Y)
    print("Score = %f" % fit_score)
    predictions = model.predict(X_test)
    write_to_file("lassolars.csv", predictions)
Exemple #6
0
def fit_linear_model(X, Y, verbose=True):
    """
        Fit a 20-fold cross-validated LassoLars model to the observations.

        X, Y are divided by split_obs; the model is fitted on the training
        part and scored on the test part.

        verbose: when truthy, print the target name (Y.name), the test
            score rounded to 4 decimals (labelled r-squared) and the
            selected alpha.

        return: fitted model
    """
    X_train, X_test, Y_train, Y_test = split_obs(X, Y)
    estimator = LassoLarsCV(cv=20).fit(X_train, Y_train)
    # Scored even in quiet mode, exactly as before.
    test_score = estimator.score(X_test, Y_test)

    if not verbose:
        return estimator

    print('***** Linear Regression Stats *****')
    print('target: {}'.format(Y.name))
    print('r-squared: {}'.format(round(test_score, 4)))
    print('alpha: {}\n'.format(estimator.alpha_))
    return estimator
class determine_attribute_quality(object):
    """Lasso-based look at which wine attributes relate to quality.

    Holds a red and a white data set and offers a column-name clean-up plus
    a cross-validated LassoLars regression that reports and plots its
    results. Every intermediate of the regression is kept on the instance.
    """

    def __init__(self, red, white):
        """Store the red and white wine data sets unchanged."""
        self.red = red
        self.white = white

    def remove_column_spaces(self, wine_data):
        """Strip each column name and turn inner spaces into underscores.

        The ``columns`` attribute of ``wine_data`` is reassigned in place and
        the same object is returned.
        """
        cleaned = []
        for label in wine_data.columns:
            cleaned.append(label.strip().replace(' ', '_'))
        wine_data.columns = cleaned
        return wine_data

    def regression(self, wine_data):
        """Fit LassoLarsCV (10 folds) of ``quality`` on eleven attributes.

        Standardizes the predictors, splits 80/20 (``random_state=123``),
        fits the model, prints coefficients, alpha, MSE and score (labelled
        R-square), and shows the coefficient-path and per-fold MSE figures.
        Sets ``pred``, ``predictors``, ``targets``, the four split parts,
        ``model``, ``train_error``, ``test_error``, ``rsquared_train`` and
        ``rsquared_test`` on the instance. Returns None.
        """
        attribute_names = [
            'density',
            'alcohol',
            'sulphates',
            'pH',
            'volatile_acidity',
            'chlorides',
            'fixed_acidity',
            'citric_acid',
            'residual_sugar',
            'free_sulfur_dioxide',
            'total_sulfur_dioxide',
        ]
        self.pred = wine_data[attribute_names]
        self.predictors = self.pred.copy()
        self.targets = wine_data.quality

        # Normalization: scale() returns a bare array, so it is wrapped in a
        # DataFrame again and the original column labels are restored.
        standardized = pd.DataFrame(preprocessing.scale(self.predictors))
        standardized.columns = self.pred.columns
        self.predictors = standardized

        # Split into Training and Testing sets
        parts = train_test_split(self.predictors, self.targets,
                                 test_size=.2, random_state=123)
        (self.pred_train, self.pred_test,
         self.target_train, self.target_test) = parts

        # Lasso Regression Model
        estimator = LassoLarsCV(cv=10, precompute=False)
        self.model = estimator.fit(self.pred_train, self.target_train)

        print('Predictors and their Regression coefficients:')
        coef_by_name = dict(zip(self.predictors.columns, self.model.coef_))
        for name, coef in coef_by_name.items():
            print(name, ':', coef)

        # Plot Coefficient Progression
        path_x = -np.log10(self.model.alphas_)
        plt.plot(path_x, self.model.coef_path_.T)
        print('\nAlpha:', self.model.alpha_)
        plt.axvline(-np.log10(self.model.alpha_), linestyle="dashed",
                    color='k', label='alpha CV')
        plt.ylabel("Regression coefficients")
        plt.xlabel("-log(alpha)")
        plt.title('Regression coefficients progression for Lasso paths')
        plt.show()

        # Plot MSE for each fold
        cv_x = -np.log10(self.model.cv_alphas_)
        fold_mse = self.model.cv_mse_path_
        plt.plot(cv_x, fold_mse, ':')
        plt.plot(cv_x, fold_mse.mean(axis=-1), 'k',
                 label='Average across the folds', linewidth=2)
        plt.legend()
        plt.xlabel('-log(alpha)')
        plt.ylabel('Mean Squared Error')
        plt.title('Mean Squared Error on Each Fold')
        plt.show()

        # Mean Squared Error from Training and Test data
        self.train_error = mean_squared_error(
            self.target_train, self.model.predict(self.pred_train))
        self.test_error = mean_squared_error(
            self.target_test, self.model.predict(self.pred_test))
        print('\nMean squared error for training data:', self.train_error)
        print('Mean squared error for test data:', self.test_error)

        # Estimator score on both parts of the split
        self.rsquared_train = self.model.score(self.pred_train,
                                               self.target_train)
        self.rsquared_test = self.model.score(self.pred_test,
                                              self.target_test)
        print('\nR-square for training data:', self.rsquared_train)
        print('R-square for test data:', self.rsquared_test)
# Plot the cross-validation mean squared error for each fold.
# NOTE(review): `model`, `np`, `plt` and the train/test splits used below are
# defined outside this fragment; `model` is presumably a fitted LassoLarsCV
# (it exposes cv_alphas_, cv_mse_path_ and alpha_) - confirm.
m_log_alphascv = -np.log10(model.cv_alphas_)  # x-axis: -log10 of the CV alphas
plt.figure()
plt.plot(m_log_alphascv, model.cv_mse_path_, ':')  # dotted per-fold curves
# Solid black curve: cv_mse_path_ averaged over its last axis.
plt.plot(m_log_alphascv, model.cv_mse_path_.mean(axis=-1), 'k',
         label='Average across the folds', linewidth=2)
# Dashed vertical line at the alpha stored on the model as alpha_.
plt.axvline(-np.log10(model.alpha_), linestyle='--', color='k',
            label='alpha CV')
plt.legend()
plt.xlabel('-log(alpha)')
plt.ylabel('Mean squared error')
plt.title('Mean squared error on each fold')


# MSE from training and test data
from sklearn.metrics import mean_squared_error
train_error = mean_squared_error(tar_train, model.predict(pred_train))
test_error = mean_squared_error(tar_test, model.predict(pred_test))
print ('training data MSE')
print(train_error)
print ('test data MSE')
print(test_error)

# R-square from training and test data (value returned by model.score)
rsquared_train=model.score(pred_train,tar_train)
rsquared_test=model.score(pred_test,tar_test)
print ('training data R-square')
print(rsquared_train)
print ('test data R-square')
print(rsquared_test)
Exemple #9
0
# Tail of the coefficient-progression figure; the statements that draw it
# lie before this fragment.
plt.xlabel('-log(alpha)')
plt.title('Regression Coefficients Progression for Lasso Paths of Selected Variables')

# Plot the cross-validation mean squared error for each fold.
# NOTE(review): `model` is presumably a fitted LassoLarsCV created outside
# this fragment - confirm.
m_log_alphascv = -np.log10(model.cv_alphas_)
plt.figure()
plt.plot(m_log_alphascv, model.cv_mse_path_, ':')
# Solid black curve: cv_mse_path_ averaged over its last axis.
plt.plot(m_log_alphascv, model.cv_mse_path_.mean(axis=-1), 'k',
            label='Average across the folds',linewidth=2)
# Dashed vertical line at the alpha stored on the model as alpha_.
plt.axvline(-np.log10(model.alpha_), linestyle='--', color='k', label='alpha CV')
plt.legend()
plt.xlabel('-log(alpha)')
plt.ylabel('Mean squared error')
plt.title('Mean squared error on each fold')
# Restrict the visible x-range to a hard-coded window of -log10(alpha).
plt.xlim(1.95,4.0)

# MSE from training and test data
training_error = mean_squared_error(target_train,model.predict(predictors_train))
test_error = mean_squared_error(target_test,model.predict(predictors_test))
print('Training data MSE')
print(training_error)
print('Test data MSE')
print(test_error)

# R-squared from training and test data (value returned by model.score)
rsquared_train=model.score(predictors_train,target_train)
rsquared_test=model.score(predictors_test,target_test)
print('Training data R**2')
print(rsquared_train)
print('Test data R**2')
print(rsquared_test)
Exemple #10
0
# Tail of the coefficient-progression figure; the statements that draw it
# lie before this fragment.
plt.xlabel('-log(alpha)')
plt.title('Regression Coefficients Progression for Lasso Paths')

# Plot mean square error for each fold
# NOTE(review): `LassoRegModel` is presumably a fitted LassoLarsCV created
# outside this fragment - confirm.
m_log_alphascv = -np.log10(LassoRegModel.cv_alphas_)
plt.figure()
plt.plot(m_log_alphascv, LassoRegModel.cv_mse_path_, ':')
# Solid black curve: cv_mse_path_ averaged over its last axis.
plt.plot(m_log_alphascv, LassoRegModel.cv_mse_path_.mean(axis=-1), 'k',
         label='Average across the folds', linewidth=2)
# Dashed vertical line at the alpha stored on the model as alpha_.
plt.axvline(-np.log10(LassoRegModel.alpha_), linestyle='--', color='k',
            label='alpha CV')
plt.legend()
plt.xlabel('-log(alpha)')
plt.ylabel('Mean squared error')
plt.title('Mean squared error on each fold')

# MSE from training and test data
train_error = mean_squared_error(tar_train, LassoRegModel.predict(pred_train))
test_error = mean_squared_error(tar_test, LassoRegModel.predict(pred_test))
print ('training data MSE')
print(train_error)
print ('test data MSE')
print(test_error)

# R-square from training and test data (value returned by the model's score)
rsquared_train=LassoRegModel.score(pred_train,tar_train)
rsquared_test=LassoRegModel.score(pred_test,tar_test)
print ('training data R-square')
print(rsquared_train)
print ('test data R-square')
print(rsquared_test)
# Select the predictor columns (`col`) and the target column (`interest`)
# from the train and test tables.
# NOTE(review): `train_set`, `test_set`, `col`, `interest`, `X_aging`,
# `y_aging` and `prettyprint` are defined outside this fragment.
X_train, y_train = train_set[col], train_set[interest]
X_test, y_test   = test_set[col], test_set[interest]

# NOTE(review): `score` and `tuned_params_lasso` are not used in the visible
# part of this script, and the alpha grid spans negative values - confirm
# whether a later grid search consumes them and whether that range is meant.
score = 'mean_squared_error'
tuned_params_lasso = [{'alpha': np.linspace(-1, 1, 100),
                       'normalize': [True, False]}]

### ACROSS WHOLE DATASET
### With StratifiedKFold, we're stratifying according to the interest variable.
### This ensures that there will be an even proportion of RAVLT_DEL (or whatever
### the interest variable is) values across all folds.
skf = cross_validation.StratifiedKFold(y_aging, n_folds=6)
model = LassoLarsCV(max_iter=100000, cv=skf).fit( X_aging, y_aging )

# print("Best estimator for WHOLE DATASET: \n{0}\n".format(model.best_estimator_))
# The score is computed on the same data the model was fitted on.
print("Percent variance explained: {0}".format(model.score( X_aging, y_aging)))
print("Coefficients found: \n{0}\n".format(prettyprint(model.coef_, col, sort=True)))

# plot coefficient progression
m_log_alphas = -np.log10(model.alphas_)
ax = plt.gca()  # not referenced again in the visible part of the script
# coef_path_ is transposed so its first axis lines up with the alphas.
plt.plot(m_log_alphas, model.coef_path_.T)
plt.axvline(-np.log10(model.alpha_), linestyle='--', color='k',
            label='alpha CV')
plt.ylabel('Regression Coefficients')
plt.xlabel('-log(alpha)')
plt.title('Regression Coefficients Progression for Lasso Paths')

plt.show()

### ACROSS SUPERAGERS
def main():
    u"""Main function for assignment 03.

    Loads the prepared data set, standardizes every column and fits a
    10-fold cross-validated LassoLars model of ``price`` on all columns whose
    name does not contain ``price`` (70/30 train/test split,
    ``random_state=0``). Prints the coefficients, MSE and score (labelled
    R-square), and saves the coefficient-path plot to ``result00.png`` and
    the per-fold MSE plot to ``result01.png``.

    Returns:
        dict: ``{'model': <fitted LassoLarsCV>, 'dataframe': <scaled data>}``.
        The model is returned exactly as fitted; plotting no longer rewrites
        its ``cv_alphas_`` attribute.

    Raises:
        ValueError: if the loaded data contains nulls, or if any column mean
            is further than 1e-3 from zero after scaling.
    """
    # Load prepared data.
    df = return_proc_and_transf_data_set()
    # Mass is already included as mass in SI units.
    df.drop(['carat'], inplace=True, axis=1)
    # Those are dummy variables not needed in our data set anymore.
    df.drop(['price_expensive', 'price_expensive_binary'], inplace=True, axis=1)

    # A bit of error checking.
    if df.isnull().sum().sum() != 0:
        raise ValueError('Your data has unintended nulls.')

    # Cast our dataframe into float type.
    df = df.astype('float64')

    # Scale our dataframe so that the sparsity control of the lasso is not
    # biased against some variables.
    print('Prior to scaling:')
    print(df.describe())
    df = df.apply(preprocessing.scale)
    print('After scaling:')
    print(df.describe())
    print_separator()
    # After standardization every column mean should be (numerically) zero.
    if (df.mean().abs() > 1e-3).any():
        raise ValueError('Scaling of your dataframe went wrong.')

    # Split into training and testing sets.
    # The predictors should not include any price variable since this was
    # used to create the output variable.
    predictors = [x for x in df.columns.tolist() if 'price' not in x]
    print('Input variables:')
    pprint(predictors, indent=4)
    input_variables = df[predictors].copy()
    output_variable = df.price.copy()  # Categorized price
    print_separator()

    input_training, input_test, output_training, output_test = train_test_split(
        input_variables, output_variable, test_size=0.3, random_state=0)

    # A few words about the LassoLarsCV:
    #
    # LASSO: least absolute shrinkage and selection operator (discussed in
    # the course material).
    #
    # LARS: least angle regression: algorithm for fitting linear regression
    # models to high-dimensional data (aka 'a lot of categories').
    # Compared to simple LASSO this model uses the LARS algorithm instead of
    # the 'vanilla' 'coordinate_descent' of simple LASSO.
    #
    # CV: cross validation: this sets the alpha parameter (referred to as
    # lambda parameter in the course video) by cross validation.
    # In the simple LARS this alpha (the penalty factor) is an input of the
    # function.
    # The alpha parameter controls the degree of sparsity of the
    # coefficients estimated.
    # If alpha = zero then the method is the same as OLS.
    model = LassoLarsCV(
        cv=10,  # Number of folds.
        precompute=False,  # Do not precompute Gram matrix.
        # precompute=True,  # Precompute Gram matrix.
        # verbose=3,
    ).fit(input_training, output_training)

    dict_var_lin_coefs = dict(zip(predictors, model.coef_))

    # Coefficients sorted by magnitude, smallest first.
    print('Result of linear model:')
    pprint(sorted(dict_var_lin_coefs.items(), key=lambda x: abs(x[1])))
    print_separator()

    # Plot coefficient progression.
    # TODO: plot those on 4 different subplots.
    model_log_alphas = -np.log10(model.alphas_)
    plt.plot(model_log_alphas, model.coef_path_.T)
    plt.axvline(-np.log10(model.alpha_), linestyle='--', color='k',
                label='alpha CV')
    plt.ylabel('Regression Coefficients')
    plt.xlabel('-log(alpha)')
    plt.title('Regression Coefficients Progression for Lasso Paths')
    plt.legend(predictors,
        loc='best',)
    plt.tight_layout()
    plt.savefig('result00.png', dpi=600)
    plt.close()
    # TODO: why are the coefficients in the result very different than the
    # coefficient path?
    #
    # There seems to be a scaling of the coefficient paths with an arbitrary
    # almost the same constant (194 in this case)
    #
    # print('Resulting alpha is not different than path alpha (difference):')
    # difference = model.alpha_ - model.alphas_
    # pprint(model.alpha_ - model.alphas_)
    # print('Resulting coefficients are very different than path coefficients (difference):')
    # pprint(model.coef_ - model.coef_path_.T)
    # print_separator()

    # Plot mean square error for each fold.
    # log10(0) would raise a divide-by-zero warning, so an alpha of exactly
    # zero is replaced by +inf, whose -log10 is -inf. This is done on a local
    # copy: the fitted model is returned to the caller, so its cv_alphas_
    # attribute must stay as the estimator produced it.
    cv_alphas = np.asarray(model.cv_alphas_, dtype=float)
    cv_alphas = np.where(cv_alphas == 0, np.inf, cv_alphas)
    model_log_alphas = -np.log10(cv_alphas)
    plt.figure()
    plt.plot(model_log_alphas, model.cv_mse_path_, ':')
    plt.plot(model_log_alphas, model.cv_mse_path_.mean(axis=-1), 'k',
            label='Average across the folds', linewidth=2)
    plt.axvline(-np.log10(model.alpha_), linestyle='--', color='k',
                label='alpha CV')
    plt.xlabel('-log(alpha)')
    plt.ylabel('Mean squared error')
    plt.title('Mean squared error on each fold')
    plt.legend()
    plt.tight_layout()
    plt.savefig('result01.png', dpi=600)
    plt.close()

    # Mean squared error of our model.
    train_error = mean_squared_error(output_training,
                                     model.predict(input_training))
    test_error = mean_squared_error(output_test,
                                    model.predict(input_test))
    print('Training data MSE')
    print(train_error)
    print('Test data MSE')
    print(test_error)
    print_separator()

    # R-square from training and test data.
    rsquared_train = model.score(input_training, output_training)
    rsquared_test = model.score(input_test, output_test)
    print('Training data R-square')
    print(rsquared_train)
    print('Test data R-square')
    print(rsquared_test)
    print_separator()

    return {'model': model, 'dataframe': df}
# Tail of the previous section: test-set RMSE and score for the estimator
# `rd`, which is defined outside this fragment (as are `sqrt`, `metrics`,
# the data splits and `y_train_rmse` / `y_train_score`).
y_test_rmse = sqrt(metrics.mean_squared_error(y_test, y_test_pred))
y_test_score = rd.score(x_test, y_test)
print('训练集RMSE: {0}, R方: {1}'.format(y_train_rmse, y_train_score))
print('测试集RMSE: {0}, R方: {1}'.format(y_test_rmse, y_test_score))
'''========9.Lasso回归========'''
import numpy as np
import matplotlib.pyplot as plt  # plotting / visualization
from sklearn.linear_model import Lasso, LassoCV, LassoLarsCV  # Lasso regression; LassoCV selects alpha by cross-validation; LassoLarsCV selects alpha by cross-validation based on least-angle regression

#model = Lasso(alpha=0.01)  # tuning alpha controls how tightly the model fits
# model = LassoCV()  # LassoCV tunes alpha automatically to select the best alpha.
model = LassoLarsCV()  # LassoLarsCV tunes alpha automatically to select the best alpha
model.fit(x_train, y_train)  # fit the linear regression model
print('系数矩阵:\n', model.coef_, model.intercept_)

print('线性回归模型:\n', model)
print('最佳的alpha:', model.alpha_)  # only available when using LassoCV or LassoLarsCV

# Predict with the model
# Predict on the training data and on the test data separately
y_train_pred = model.predict(x_train)
y_test_pred = model.predict(x_test)
# Compute the root mean squared error and the goodness of fit for each

y_train_rmse = sqrt(metrics.mean_squared_error(y_train, y_train_pred))
y_train_score = model.score(x_train, y_train)
y_test_rmse = sqrt(metrics.mean_squared_error(y_test, y_test_pred))
y_test_score = model.score(x_test, y_test)
print('训练集RMSE: {0}, R方: {1}'.format(y_train_rmse, y_train_score))
print('测试集RMSE: {0}, R方: {1}'.format(y_test_rmse, y_test_score))
Exemple #14
0
         model.cv_mse_path_.mean(axis=-1),
         'k',
         label='Average across the folds',
         linewidth=2)
# Finish the per-fold MSE figure whose plot calls lie before this fragment:
# mark the alpha stored on the model as alpha_ with a dashed vertical line.
plt.axvline(-np.log10(model.alpha_),
            linestyle='--',
            color='k',
            label='alpha CV')
plt.legend()
plt.xlabel('-log(alpha)')
plt.ylabel('Mean squared error')
plt.title('Mean squared error on each fold')
plt.savefig('Fig02')

# MSE from training and test data
# NOTE(review): `model`, `pred_train`/`pred_test` and `resp_train`/`resp_test`
# are defined outside this fragment.
from sklearn.metrics import mean_squared_error
train_error = mean_squared_error(resp_train, model.predict(pred_train))
test_error = mean_squared_error(resp_test, model.predict(pred_test))
print('training data MSE')
print(train_error)
print('test data MSE')
print(test_error)

# R-square from training and test data (value returned by model.score)
rsquared_train = model.score(pred_train, resp_train)
rsquared_test = model.score(pred_test, resp_test)
print('training data R-square')
print(rsquared_train)
print('test data R-square')
print(rsquared_test)

# Plot coefficient progression.
# NOTE(review): `model`, `pyplot`, `np`, `x`, `y` and the train/test splits
# are defined outside this fragment. Despite its name, m_log_alphascv holds
# -log10 of model.alphas_ here, not of cv_alphas_.
m_log_alphascv = -np.log10(model.alphas_)
ax = pyplot.gca()  # not referenced again in the visible part of the script
pyplot.plot(m_log_alphascv, model.coef_path_.T) # .T transposes the coef_path_ matrix so its first dimension matches the array of alpha values
pyplot.axvline(-np.log10(model.alpha_), linestyle='--', color='k', label='alpha_CV')
#print("Alpha Value:",m_log_alphascv, "Coefficients:",model.cv_mse_path_)
pyplot.ylabel('Regression Coefficients')
pyplot.xlabel('-log(alpha)')
pyplot.title('Regression coefficients for lasso plots')
pyplot.show()

# Indicate the lasso parameter that minimizes the average MSE across folds.
# NOTE(review): this calls fit again on `model` with x, y; if fit returns the
# estimator itself (as scikit-learn estimators do), the MSE values further
# down come from this refit - confirm whether x, y include the test rows.
lasso_fit = model.fit(x, y)
lasso_path = model.score(x, y)  # a single score value, not a path
# NOTE(review): drawn after pyplot.show(), and at the raw alpha rather than
# -log10(alpha) as used for the x-axis of the previous figure - confirm intent.
pyplot.axvline(lasso_fit.alpha_, color = 'red')
pyplot.title("Lasso parameter")
print('Deg. Coefficient')
print(lasso_fit.intercept_)
print(dict(zip(X_train.columns, lasso_fit.coef_)))
#print("Lasso parameter:",lasso_fit.alpha_)
pyplot.show()

# MSE for training and testing data
train_error=mean_squared_error(y_train, model.predict(X_train))
test_error=mean_squared_error(y_test, model.predict(X_test))
print ("Traiing data MSE")
print (train_error)
print ("Testing data MSE")
print (test_error)
# Per-fold cross-validation MSE figure.
# NOTE(review): `m_log_alphascv`, `lassomodel` and the train/test splits are
# defined before this fragment; `lassomodel` is presumably a fitted
# LassoLarsCV - confirm.
plt.plot(m_log_alphascv, lassomodel.cv_mse_path_, ':')
# Solid black curve: cv_mse_path_ averaged over its last axis.
plt.plot(m_log_alphascv, lassomodel.cv_mse_path_.mean(axis=-1), 'k',
         label='Average across the folds', linewidth=2)
# Dashed vertical line at the alpha stored on the model as alpha_.
plt.axvline(-np.log10(lassomodel.alpha_), linestyle='--', color='k',
            label='alpha CV')
plt.legend()
plt.xlabel('-log(alpha)')
plt.ylabel('Mean squared error')
plt.title('Mean squared error on each fold')


# MSE from training and test data
from sklearn.metrics import mean_squared_error
train_error = mean_squared_error(y_train, lassomodel.predict(X_train))
test_error = mean_squared_error(y_test, lassomodel.predict(X_test))
print ('training data MSE')
print(train_error)
print ('test data MSE')
print(test_error)

# R-square from training and test data (value returned by lassomodel.score)
rsquared_train= lassomodel.score(X_train,y_train)
rsquared_test= lassomodel.score(X_test,y_test)
print ('training data R-square')
print(rsquared_train)
print ('test data R-square')
print(rsquared_test)



Exemple #17
0
## Performs Extremely Badly ##
import pandas as pd
import numpy as np

from sklearn.linear_model import LassoLarsCV
from sklearn.model_selection import train_test_split

# Load the US-equities table; four factor columns are the inputs and the
# 'return' column is the target.
USE = pd.read_csv(r"C:\Users\chias\source\repos\FFM-MA\US equities.csv")
x = USE[['growth','inflation','liquidity','risk app']]
y = USE['return']

# Hold out 20% of the rows, with a fixed seed.
use_train_x, use_test_x, use_train_y, use_test_y = train_test_split(
    x, y, test_size=0.2, random_state=6)

# 10-fold cross-validated LassoLars, fitted on the training part only.
lm = LassoLarsCV(max_iter=1000, precompute='auto', cv=10, eps=0.0001)
lm.fit(use_train_x, use_train_y)

# Report the score on the held-out rows.
print("USE SCORE:")
print(lm.score(use_test_x, use_test_y))
Exemple #18
0
    # NOTE(review): Python 2 print statements; this is the interior of a
    # function whose header, and the names it uses (train_X, test_X, xss, ...),
    # lie outside this fragment.
    print "ESS(Explained Sum of Squares): ", ess
    print "R^2: ", r2

    print "\n**********测试LassoLarsCV类**********"
    lassoLarscv = LassoLarsCV(cv=5)
    # Fit on the training set
    lassoLarscv.fit(train_X, train_Y.values.ravel())
    # Print the model's coefficients
    print "系数:", lassoLarscv.coef_
    print "截距:", lassoLarscv.intercept_
    print '训练集R2: ', r2_score(train_Y, lassoLarscv.predict(train_X))

    # For linear regression models, performance on the test set measured by
    # mean squared error (MSE) or root mean squared error (RMSE) is generally
    # used to judge how good the model is.
    test_Y_pred = lassoLarscv.predict(test_X)
    print "测试集得分:", lassoLarscv.score(test_X, test_Y)
    print "测试集MSE:", mean_squared_error(test_Y, test_Y_pred)
    print "测试集RMSE:", np.sqrt(mean_squared_error(test_Y, test_Y_pred))
    print "测试集R2:", r2_score(test_Y, test_Y_pred)

    # Sum-of-squares breakdown from the helper xss, computed on X, Y.
    tss, rss, ess, r2 = xss(Y, lassoLarscv.predict(X))
    print "TSS(Total Sum of Squares): ", tss
    print "RSS(Residual Sum of Squares): ", rss
    print "ESS(Explained Sum of Squares): ", ess
    print "R^2: ", r2

    print "\n**********测试LassoLarsIC类**********"
    lassoLarsIC = LassoLarsIC()
    # lassoLarsIC = LassoLarsIC(criterion='bic')
    # Fit on the training set
    lassoLarsIC.fit(train_X, train_Y.values.ravel())
# Solid black curve of cv_mse_path_ averaged over its last axis.
# NOTE(review): `m_log_alphascv`, `model` and the training/test data are
# defined before this fragment; `model` is presumably a fitted LassoLarsCV -
# confirm.
plt.plot(m_log_alphascv,
         model.cv_mse_path_.mean(axis=-1),
         'k',
         label='Average across the folds',
         linewidth=2)
# Dashed vertical line at the alpha stored on the model as alpha_.
plt.axvline(-np.log10(model.alpha_),
            linestyle='--',
            color='k',
            label='alpha CV')
plt.legend()
plt.xlabel('-log(alpha)')
plt.ylabel('Mean squared error')
plt.title('Mean squared error on each fold')

# MSE from training and test data
from sklearn.metrics import mean_squared_error
train_error = mean_squared_error(training_target, model.predict(training_data))
test_error = mean_squared_error(test_target, model.predict(test_data))
print('training data MSE')
print(train_error)
print('test data MSE')
print(test_error)

# R-square from training and test data (value returned by model.score)
rsquared_train = model.score(training_data, training_target)
rsquared_test = model.score(test_data, test_target)
print('training data R-square')
print(rsquared_train)
print('test data R-square')
print(rsquared_test)
Exemple #20
0
#!/usr/bin/env python

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LassoLarsCV
from sklearn import datasets
from sklearn.utils import shuffle
import numpy as np

# Shuffle the Boston data with a fixed seed and cast the features to float32.
boston = datasets.load_boston()
X, Y = shuffle(boston.data, boston.target, random_state=13)
X = X.astype(np.float32)

# First 90% of the shuffled rows train the model, the remainder tests it.
offset = int(len(X) * 0.9)
X_train, X_test = X[:offset], X[offset:]
Y_train, Y_test = Y[:offset], Y[offset:]

# 15-fold cross-validated LassoLars; print its score on the held-out rows.
regressor = LassoLarsCV(cv=15)
regressor.fit(X_train, Y_train)
score = regressor.score(X_test, Y_test)
print(score)
Exemple #21
0

# Plot coefficient progression.
# NOTE(review): `model`, `pyplot`, `np`, `pd`, `predictors`, `y` and the
# train/test splits are defined outside this fragment. Despite its name,
# m_log_alphascv holds -log10 of model.alphas_ here, not of cv_alphas_.
m_log_alphascv = -np.log10(model.alphas_)
ax = pyplot.gca()  # not referenced again in the visible part of the script
pyplot.plot(m_log_alphascv, model.coef_path_.T) # .T transposes the coef_path_ matrix so its first dimension matches the array of alpha values
pyplot.axvline(-np.log10(model.alpha_), linestyle='--', color='k', label='alpha_CV')
# The value printed under "Coefficients:" is model.cv_mse_path_.
print("Alpha Value:",m_log_alphascv, "Coefficients:",model.cv_mse_path_)
pyplot.ylabel('Regression Coefficients')
pyplot.xlabel('-log(alpha)')
pyplot.title('Regression coefficients for lasso plots')
pyplot.show()

# Indicate the lasso parameter that minimizes the average MSE across folds.
# NOTE(review): this calls fit again on `model` with predictors, y; if fit
# returns the estimator itself (as scikit-learn estimators do), the MSE
# values further down come from this refit - confirm whether predictors, y
# include the test rows.
lasso_fit = model.fit(predictors, y)
lasso_path = model.score(predictors, y)  # a single score value, not a path
# NOTE(review): drawn after pyplot.show(), and at the raw alpha rather than
# -log10(alpha) as used for the x-axis of the previous figure - confirm intent.
pyplot.axvline(lasso_fit.alpha_, color = 'red')
pyplot.title("Lasso parameter")
print('Deg. Coefficient')
# Intercept first, then the coefficients, as one Series.
print(pd.Series(np.r_[lasso_fit.intercept_, lasso_fit.coef_]))
print("Lasso parameter:",lasso_fit.alpha_)
pyplot.show()

# MSE for training and testing data
train_error=mean_squared_error(y_train, model.predict(X_train))
test_error=mean_squared_error(y_test, model.predict(X_test))
print ("Traiing data MSE")
print (train_error)
print ("Testing data MSE")
print (test_error)
Exemple #22
0
class FEW(SurvivalMixin, VariationMixin, EvaluationMixin, BaseEstimator):
    """FEW uses GP to find a set of transformations from the original feature space
    that produces the best performance for a given machine learner.
    """
    update_checked = False

    def __init__(self,
                 population_size=50,
                 generations=100,
                 mutation_rate=0.5,
                 crossover_rate=0.5,
                 ml=None,
                 min_depth=1,
                 max_depth=2,
                 max_depth_init=2,
                 sel='epsilon_lexicase',
                 tourn_size=2,
                 fit_choice=None,
                 op_weight=False,
                 seed_with_ml=True,
                 erc=False,
                 random_state=np.random.randint(9999999),
                 verbosity=0,
                 scoring_function=None,
                 disable_update_check=False,
                 elitism=True,
                 boolean=False,
                 classification=False,
                 clean=False,
                 track_diversity=False,
                 mdr=False,
                 otype='f'):
        """Store the settings and build the default function set.

        Every argument is recorded in ``self.params`` (returned by
        ``get_params``) and, except ``disable_update_check``, copied to an
        attribute of the same name.

        Parameters
        ----------
        population_size : int or str
            Number of individuals.  ``fit`` also accepts a string: either a
            plain number or ``'<k>x'``, meaning ``k`` times the number of
            input features.
        generations : int
            Number of iterations of the main loop in ``fit``.
        ml : estimator or None
            Learner fit on the engineered features.  ``None`` selects
            ``LogisticRegression(solver='sag')`` when ``classification`` is
            true and ``LassoLarsCV()`` otherwise.
        min_depth, max_depth : int
            Bounds of the random program depth drawn in ``init_pop``.
        fit_choice : str or None
            Fitness metric name.  ``None`` picks a default from the table
            below based on the exact type of ``ml``; a learner type missing
            from the table raises ``KeyError``.
        scoring_function : callable or None
            Called as ``scoring_function(y_true, y_pred)`` by ``score``.
            ``None`` selects ``accuracy_score`` for classification and
            ``r2_score`` otherwise.
        random_state : int
            Seed passed to ``np.random.seed`` at the start of ``fit``.
        otype : str
            Output type; ``'b'`` forces ``boolean`` on.
        disable_update_check : bool
            Recorded in ``self.params`` only; not read anywhere else here.

        The remaining arguments (``mutation_rate``, ``crossover_rate``,
        ``max_depth_init``, ``sel``, ``tourn_size``, ``op_weight``,
        ``seed_with_ml``, ``erc``, ``verbosity``, ``elitism``, ``boolean``,
        ``classification``, ``clean``, ``track_diversity``, ``mdr``) are
        stored unchanged for use by the other methods of the class.
        """
        # Snapshot of the constructor arguments for get_params().  This has
        # to run before anything else is bound in this scope.
        self.params = locals()
        self.params.pop('self')

        # results of the search, filled in by fit()
        self._best_estimator = None
        self._best_inds = None
        self._training_features = None
        self._training_labels = None

        # population / variation settings
        self.population_size = population_size
        self.generations = generations
        self.mutation_rate = mutation_rate
        self.crossover_rate = crossover_rate
        self.elitism = elitism
        self.sel = sel
        self.tourn_size = tourn_size
        self.op_weight = op_weight
        self.seed_with_ml = seed_with_ml
        self.gp_generation = 0

        # program shape
        self.min_depth = min_depth
        self.max_depth = max_depth
        self.max_depth_init = max_depth_init
        self.erc = erc
        self.mdr = mdr
        self.otype = otype
        # boolean functions must be turned on when the output type is 'b'
        self.boolean = True if otype == 'b' else boolean

        # learner, scoring and fitness
        self.ml = ml
        self.classification = classification
        self.scoring_function = scoring_function
        self.fit_choice = fit_choice
        # fitness value assigned to invalid (negative, NaN, infinite) outputs
        self.max_fit = 99999999.666

        # miscellaneous
        self.random_state = random_state
        self.verbosity = verbosity
        self.clean = clean
        self.track_diversity = track_diversity

        # default learner for the chosen task
        if self.ml is None:
            self.ml = (LogisticRegression(solver='sag')
                       if self.classification else LassoLarsCV())

        # default scoring function for the chosen task
        if not self.scoring_function:
            self.scoring_function = (accuracy_score
                                     if self.classification else r2_score)

        # default fitness metric, looked up by the learner's exact type
        if not self.fit_choice:
            self.fit_choice = {
                # regression
                LassoLarsCV: 'mse',
                SVR: 'mae',
                LinearSVR: 'mae',
                KNeighborsRegressor: 'mse',
                DecisionTreeRegressor: 'mse',
                RandomForestRegressor: 'mse',
                # classification
                SGDClassifier: 'r2',
                LogisticRegression: 'r2',
                SVC: 'r2',
                LinearSVC: 'r2',
                RandomForestClassifier: 'r2',
                DecisionTreeClassifier: 'r2',
                DistanceClassifier: 'silhouette',
                KNeighborsClassifier: 'r2',
            }[type(self.ml)]

        # Columns to always ignore when in an operator
        self.non_feature_columns = ['label', 'group', 'guess']

        # function set available to every run; fit() appends the boolean
        # and mdr operators when they are enabled
        self.func_set = [
            node(op) for op in ('+', '-', '*', '/', 'sin', 'cos', 'exp',
                                'log', '^2', '^3', 'sqrt')
        ]
        # terminal set, filled in by fit()
        self.term_set = []
        # diversity history, filled in by get_diversity()
        self.diversity = []

    #@profile
    def fit(self, features, labels):
        """Evolve engineered features and fit the ML model on them.

        The data are split 75/25 into an internal training part and an
        internal validation part.  ``self.ml`` is first fit on the raw
        training features to obtain a baseline validation score.  Then, for
        ``self.generations`` generations, ``self.ml`` is refit on the outputs
        of the valid individuals of the population; whenever the validation
        score improves, a copy of the model and of those individuals is
        kept as the best result.

        Parameters
        ----------
        features : array-like {n_samples, n_features}
            Feature matrix.
        labels : array-like {n_samples}
            Target values.

        Returns
        -------
        self
            ``_best_inds`` stays ``None`` and ``_best_estimator`` is the
            model fit on the raw features when no generation beat the
            baseline validation score.

        Raises
        ------
        ValueError
            Re-raised from ``self.ml.fit`` after diagnostics were printed.
        RuntimeError
            If two individuals of the initial population share an id.
        """
        np.random.seed(self.random_state)

        # Every call starts from scratch.  Without this, a second fit()
        # would keep the previous best features/model and would append the
        # new terminals to the old ones (duplicates, or stale column
        # indices when the number of features changed).
        # NOTE(review): the boolean/mdr operators below are still appended
        # to func_set on every call, as before.
        self._best_estimator = None
        self._best_inds = None
        self.term_set = []

        # setup data
        # imputation
        if self.clean:
            features = self.impute_data(features)

        # Train/validation split for internal validation.  Rows are
        # selected through a DataFrame index.
        train_val_data = pd.DataFrame(data=features)
        train_val_data['labels'] = labels
        new_col_names = {}
        for column in train_val_data.columns.values:
            if type(column) != str:
                new_col_names[column] = str(column).zfill(10)
        train_val_data.rename(columns=new_col_names, inplace=True)
        train_i, val_i = train_test_split(train_val_data.index,
                                          stratify=None,
                                          train_size=0.75,
                                          test_size=0.25)

        x_t = train_val_data.loc[train_i].drop('labels', axis=1).values
        x_v = train_val_data.loc[val_i].drop('labels', axis=1).values
        y_t = train_val_data.loc[train_i, 'labels'].values
        y_v = train_val_data.loc[val_i, 'labels'].values

        # Store the training features and classes for later use
        self._training_features = x_t
        self._training_labels = y_t

        # population size given as a string: '<k>x' means k times the
        # number of features, anything else is parsed as an integer
        if type(self.population_size) is str:
            if 'x' in self.population_size:
                self.population_size = int(
                    float(self.population_size[:-1]) * features.shape[1])
            else:
                self.population_size = int(self.population_size)

        if self.verbosity > 0:
            print("population size:", self.population_size)
        # print few settings
        if self.verbosity > 1:
            for arg, value in self.get_params().items():
                print('{}\t=\t{}'.format(arg, value))
            print('')

        # baseline: the learner on the raw features
        initial_estimator = copy.deepcopy(self.ml.fit(x_t, y_t))
        self._best_score = self.ml.score(x_v, y_v)
        if self.verbosity > 2:
            print("initial estimator size:", self.ml.coef_.shape)
        if self.verbosity > 0:
            print("initial ML CV: {:1.3f}".format(self._best_score))

        # create terminal set: one node per input column, optionally
        # followed by an ephemeral random constant
        for i in np.arange(x_t.shape[1]):
            self.term_set.append(node('x', loc=i))
            if self.erc:
                self.term_set.append(node('k', value=np.random.rand()))

        # edit function set if boolean
        if self.boolean or self.otype == 'b':
            self.func_set += [
                node('!'),
                node('&'),
                node('|'),
                node('=='),
                node('>_f'),
                node('<_f'),
                node('>=_f'),
                node('<=_f'),
                node('>_b'),
                node('<_b'),
                node('>=_b'),
                node('<=_b'),
                node('xor_b'),
                node('xor_f')
            ]

        # add mdr if specified
        if self.mdr:
            self.func_set += [node('mdr2')]

        # Create initial population.
        # For now, force seed_with_ml to be off if otype is 'b', since data
        # types are assumed to be float.
        if self.otype == 'b':
            self.seed_with_ml = False
        # NOTE(review): the number of training rows is passed as init_pop's
        # ``num_features`` argument -- confirm against Pop.
        pop = self.init_pop(self._training_features.shape[0])

        # ids must be unique in the population; fail loudly instead of
        # dropping into a debugger
        uuids = [p.id for p in pop.individuals]
        if len(uuids) != len(set(uuids)):
            raise RuntimeError(
                'initial population contains duplicate individual ids')

        # Evaluate the entire population.  After the transpose each row of
        # pop.X is the output of one individual.
        pop.X = self.transform(x_t, pop.individuals, y_t).transpose()

        # calculate and assign fitness of individuals
        fitnesses = self.calc_fitness(pop.X, y_t, self.fit_choice, self.sel)
        self._assign_fitness(pop.individuals, fitnesses)

        ####################
        ### Main GP loop
        self.diversity = []
        # progress bar
        pbar = tqdm(total=self.generations,
                    disable=self.verbosity == 0,
                    desc='Internal CV: {:1.3f}'.format(self._best_score))
        # for each generation g
        for g in np.arange(self.generations):

            if self.track_diversity:
                self.get_diversity(pop.X)

            if self.verbosity > 1: print(".", end='')
            if self.verbosity > 1: print(str(g) + ".)", end='')
            if self.verbosity > 2:
                print("pop fitnesses:",
                      ["%0.2f" % x.fitness for x in pop.individuals])
            if self.verbosity > 1:
                print("median fitness pop: %0.2f" %
                      np.median([x.fitness for x in pop.individuals]))
            if self.verbosity > 1:
                print("best fitness pop: %0.2f" %
                      np.min([x.fitness for x in pop.individuals]))
            if self.verbosity > 1 and self.track_diversity:
                print("feature diversity: %0.2f" % self.diversity[-1])
            if self.verbosity > 1: print("ml fitting...")

            # individuals with a usable fitness, and their rows in pop.X;
            # the population does not change until the variation step
            valid_inds = self.valid(pop.individuals)
            valid_rows = self.valid_loc(pop.individuals)

            # fit ml model on the outputs of the valid individuals
            with warnings.catch_warnings():
                warnings.simplefilter("ignore")
                try:
                    if valid_inds:
                        self.ml.fit(pop.X[valid_rows, :].transpose(), y_t)
                except ValueError as detail:
                    # rows of pop.X are individuals, so the valid subset is
                    # selected on axis 0 here as well
                    print("warning: ValueError in ml fit. X.shape:",
                          pop.X[valid_rows, :].transpose().shape,
                          "y_t shape:", y_t.shape)
                    print("First ten entries X:",
                          pop.X[valid_rows, :].transpose()[:10])
                    print("First ten entries y_t:", y_t[:10])
                    print("equations:", stacks_2_eqns(pop.individuals))
                    print("FEW parameters:", self.get_params())
                    if self.verbosity > 1:
                        print("---\ndetailed error message:", detail)
                    # keep the learner's own message and traceback
                    raise

            # validation score of this generation; a failure while scoring
            # counts as a score of 0 and is only reported when verbose
            tmp_score = 0
            try:
                if valid_inds:
                    tmp_score = self.ml.score(
                        self.transform(x_v, pop.individuals)[:, valid_rows],
                        y_v)
            except Exception as detail:
                if self.verbosity > 1: print(detail)

            if self.verbosity > 1:
                print("current ml validation score:", tmp_score)

            # keep best model
            if valid_inds and tmp_score > self._best_score:
                self._best_estimator = copy.deepcopy(self.ml)
                self._best_score = tmp_score
                self._best_inds = copy.deepcopy(valid_inds)
                if self.verbosity > 1:
                    print("updated best internal validation score:",
                          self._best_score)

            # Variation
            if self.verbosity > 2: print("variation...")
            offspring, elite, elite_index = self.variation(pop.individuals)

            # evaluate offspring
            if self.verbosity > 2: print("output...")
            X_offspring = self.transform(x_t, offspring).transpose()
            if self.verbosity > 2: print("fitness...")
            F_offspring = self.calc_fitness(X_offspring, y_t, self.fit_choice,
                                            self.sel)
            self._assign_fitness(offspring, F_offspring)

            # Survival
            if self.verbosity > 2: print("survival..")
            survivors, survivor_index = self.survival(pop.individuals,
                                                      offspring, elite,
                                                      elite_index)
            pop.individuals[:] = survivors
            # survivor_index addresses parents followed by offspring
            pop.X = np.vstack((pop.X, X_offspring))[survivor_index, :]

            if self.verbosity > 2:
                print("median fitness survivors: %0.2f" %
                      np.median([x.fitness for x in pop.individuals]))
            if self.verbosity > 2:
                print(
                    "best features:",
                    stacks_2_eqns(self._best_inds)
                    if self._best_inds else 'original')
            pbar.set_description('Internal CV: {:1.3f}'.format(
                self._best_score))
            pbar.update(1)
        # end of main GP loop
        ####################
        pbar.close()

        if self.verbosity > 0:
            print('finished. best internal val score: {:1.3f}'.format(
                self._best_score))
        if self.verbosity > 0: print("final model:\n", self.print_model())
        # nothing beat the baseline: fall back to the raw-feature model
        if self._best_estimator is None:
            self._best_estimator = initial_estimator
        return self

    def _assign_fitness(self, individuals, fitnesses):
        """Store fitness values on individuals, capping invalid ones.

        A list/array entry is treated as a vector of raw fitness values:
        negative, NaN and infinite elements are replaced by ``self.max_fit``,
        the vector is stored as ``fitness_vec`` and its mean as ``fitness``.
        A scalar entry is stored as ``fitness`` after being capped at
        ``self.max_fit`` (NaN becomes ``self.max_fit``).
        """
        for ind, fit in zip(individuals, fitnesses):
            if isinstance(fit, (list, np.ndarray)):
                # float array so that plain lists and integer vectors can
                # take the max_fit replacement value
                fit = np.asarray(fit, dtype=float)
                fit[(fit < 0) | ~np.isfinite(fit)] = self.max_fit
                ind.fitness_vec = fit
                ind.fitness = np.mean(ind.fitness_vec)
            else:
                ind.fitness = np.nanmin([fit, self.max_fit])

    def transform(self, x, inds=None, labels=None):
        """Return x transformed by the outputs of a set of individuals.

        Uses ``inds`` when it is given and non-empty, otherwise the best
        individuals stored in ``self._best_inds``.  ``self.out`` is called
        once per individual; the stacked outputs are transposed so that each
        individual contributes one column.
        """
        chosen = inds if inds else self._best_inds
        outputs = []
        for individual in chosen:
            outputs.append(self.out(individual, x, labels, self.otype))
        return np.asarray(outputs).transpose()

    def impute_data(self, x):
        """Return x with NaN entries replaced by a mean-strategy ``Imputer``.

        The imputer is configured with ``axis=0`` and is fit on ``x`` itself.
        """
        imputer = Imputer(strategy='mean', missing_values='NaN', axis=0)
        imputed = imputer.fit_transform(x)
        return imputed

    def clean(self, x):
        """Return the rows of x that contain neither NaN nor infinity."""
        # a row is kept only when every entry in it is finite
        keep = np.isfinite(x).all(axis=1)
        return x[keep]

    def clean_with_zeros(self, x):
        """Set every row of x that contains NaN or infinity to zero.

        ``x`` is modified in place and also returned.  Rows without NaN or
        infinite entries are left untouched.
        """
        # rows with at least one NaN/inf entry (the previous mask was
        # negated and zeroed the clean rows instead)
        bad_rows = np.any(np.isnan(x) | np.isinf(x), axis=1)
        x[bad_rows] = 0
        return x

    def predict(self, testing_features):
        """Predict on a holdout data set with the best model found by fit.

        When engineered features were kept (``self._best_inds`` is set) the
        input is passed through ``transform`` first; otherwise the model is
        applied to the input as given.  The input is imputed first when
        ``self.clean`` is true.

        Parameters
        ----------
        testing_features : array-like {n_samples, n_features}
            Feature matrix.

        Returns
        -------
        Whatever ``self._best_estimator.predict`` returns.

        Raises
        ------
        ValueError
            If no model has been fit yet, or re-raised from the estimator's
            ``predict`` after diagnostics were printed.
        """
        # same message as export() uses for an unfitted model
        if self._best_estimator is None:
            raise ValueError(
                'A model has not been optimized. Please call fit() first.')

        if self.clean:
            testing_features = self.impute_data(testing_features)

        if not self._best_inds:
            return self._best_estimator.predict(testing_features)

        # transform once and reuse the result for prediction and diagnostics
        X_transform = self.transform(testing_features)
        try:
            return self._best_estimator.predict(X_transform)
        except ValueError:
            print('shape of X:', np.shape(testing_features))
            print('shape of X_transform:', X_transform.transpose().shape)
            print('best inds:', stacks_2_eqns(self._best_inds))
            print('valid locs:', self.valid_loc(self._best_inds))
            raise

    def fit_predict(self, features, labels):
        """Fit on the given data, then predict on the same features.

        Equivalent to calling ``fit(features, labels)`` followed by
        ``predict(features)``.

        Parameters
        ----------
        features: array-like {n_samples, n_features}
            Feature matrix
        labels: array-like {n_samples}
            Labels used for fitting

        Returns
        ----------
        array-like: {n_samples}
            Predictions for the provided features

        """
        # fit() is called for its side effects; its return value is unused
        self.fit(features, labels)
        predictions = self.predict(features)
        return predictions

    def score(self, testing_features, testing_labels):
        """Score predictions for a testing set with ``self.scoring_function``.

        The scoring function is called as
        ``scoring_function(testing_labels, predictions)``.
        """
        return self.scoring_function(testing_labels,
                                     self.predict(testing_features))

    def export(self, output_file_name):
        """Write the text returned by ``print_model`` to a file.

        When the class name of ``self.ml`` contains ``'DecisionTree'`` the
        best estimator is additionally exported with ``export_graphviz`` to
        ``output_file_name + '.dot'``.

        Parameters
        ----------
        output_file_name: string
            String containing the path and file name of the desired output file

        Returns
        -------
        None

        Raises
        ------
        ValueError
            If no model has been fit yet.

        """
        if self._best_estimator is None:
            raise ValueError(
                'A model has not been optimized. Please call fit() first.')

        # Write print_model() to file
        with open(output_file_name, 'w') as handle:
            handle.write(self.print_model())

        # only decision trees get the extra dot file
        if 'DecisionTree' not in type(self.ml).__name__:
            return

        # engineered features provide the node labels when there are any
        if self._best_inds:
            feature_names = stacks_2_eqns(self._best_inds)
        else:
            feature_names = None
        export_graphviz(self._best_estimator,
                        out_file=output_file_name + '.dot',
                        feature_names=feature_names,
                        class_names=['True', 'False'],
                        filled=False,
                        impurity=True,
                        rotate=True)

    def init_pop(self, num_features=1):
        """Initialize the population of features as GP stacks.

        Without ``self.seed_with_ml`` every individual receives a random
        program.  With it, individuals are seeded with single raw-feature
        nodes and any individuals left over receive random programs:

        * ``LassoLarsCV`` learner: the features whose fitted coefficient is
          non-zero, in column order;
        * other learners with a 1-D ``coef_`` longer than the population:
          the features with the largest absolute coefficients, largest first;
        * otherwise: the raw features in column order.

        Parameters
        ----------
        num_features : int
            Passed unchanged as the second argument of ``Pop``.

        Returns
        -------
        Pop
            The initialized population.
        """
        pop = Pop(self.population_size, num_features)

        def grow_random_program(individual):
            # random depth in [min_depth, max_depth]; make_program fills
            # the stack in place and the result is stored reversed
            depth = np.random.randint(self.min_depth, self.max_depth + 1)
            make_program(individual.stack, self.func_set, self.term_set,
                         depth, self.otype)
            individual.stack = list(reversed(individual.stack))

        if not self.seed_with_ml:
            for individual in pop.individuals:
                grow_random_program(individual)
            return pop

        # Column indices to seed with.  The index must be the feature's own
        # column, not its position in the filtered or sorted list, otherwise
        # the seeds are always the first few raw features.
        if type(self.ml) is LassoLarsCV:
            # all model components with non-zero coefficients
            seed_locs = np.flatnonzero(self.ml.coef_)
        else:
            try:
                coef = np.asarray(self.ml.coef_)
                rank_by_coef = (coef.ndim == 1
                                and self.population_size < coef.shape[0])
            except Exception:
                # learner has no usable coef_ (presumably some learners
                # raise something other than AttributeError here, hence
                # the broad catch -- TODO confirm)
                rank_by_coef = False
            if rank_by_coef:
                # seed pop with highest coefficients
                seed_locs = np.argsort(np.abs(coef))[::-1]
            else:
                # seed pop with raw features
                seed_locs = np.arange(self._training_features.shape[1])

        for loc, individual in it.zip_longest(seed_locs,
                                              pop.individuals,
                                              fillvalue=None):
            if individual is None:
                # more seeds than individuals: the population is full
                break
            if loc is not None:
                individual.stack = [node('x', loc=int(loc))]
            else:
                # population is bigger than the number of seeds
                grow_random_program(individual)

        # print initial population
        if self.verbosity > 2:
            print("seeded initial population:",
                  stacks_2_eqns(pop.individuals))

        return pop

    def print_model(self, sep='\n'):
        """Describe the model built on the best individuals.

        Parameters
        ----------
        sep : str
            Separator placed between terms (single decision function),
            between classes (several decision functions) or between
            feature-importance lines.

        Returns
        -------
        str or the return value of ``stacks_2_eqns``
            * ``'original features'`` when no engineered features were kept;
            * for estimators with ``coef_``: the terms ``coef*feature``
              ordered by decreasing absolute coefficient, one group per
              decision function;
            * for estimators with ``feature_importances_``: one
              ``importance : feature`` line per feature, most important
              first;
            * otherwise (and always for SVC/SVR) the result of
              ``stacks_2_eqns`` on the best individuals.
        """
        if not self._best_inds:
            return 'original features'

        # SVC/SVR are handled before any attribute check: the check itself
        # is reported to raise ValueError on svm estimators.
        if type(self.ml).__name__ in ('SVC', 'SVR'):
            return stacks_2_eqns(self._best_inds)

        if hasattr(self.ml, 'coef_'):
            # one row per decision function, also for a 1-D coef_ (which
            # may legitimately have a single element)
            coef_rows = np.atleast_2d(self._best_estimator.coef_)
            if coef_rows.shape[0] == 1:
                order = np.argsort(np.abs(coef_rows[0]))[::-1]
                return (' +' + sep).join([
                    str(round(coef_rows[0][k], 3)) + '*' +
                    stack_2_eqn(self._best_inds[k])
                    for k in order
                    if round(coef_rows[0][k], 3) != 0
                ])

            # more than one decision function is fit. print all.
            model = ''
            for j, coef in enumerate(coef_rows):
                order = np.argsort(np.abs(coef))[::-1]
                # each feature is paired with its own coefficient
                model += sep + 'class' + str(j) + ' :' + ' + '.join([
                    str(round(coef[k], 3)) + '*' +
                    stack_2_eqn(self._best_inds[k])
                    for k in order
                    if coef[k] != 0
                ])
            return model

        if hasattr(self._best_estimator, 'feature_importances_'):
            importances = self._best_estimator.feature_importances_
            order = np.argsort(importances)[::-1]
            return 'importance : feature\n' + sep.join([
                str(round(importances[k], 3)) + '\t:\t' +
                stack_2_eqn(self._best_inds[k])
                for k in order
                if round(importances[k], 3) != 0
            ])

        return stacks_2_eqns(self._best_inds)

    def representation(self):
        """Return ``stacks_2_eqns`` applied to the best individuals."""
        best = self._best_inds
        return stacks_2_eqns(best)

    def valid_loc(self, individuals):
        """Return the positions of the individuals with valid fitness.

        A fitness is valid when it is at least 0 and strictly below
        ``self.max_fit``.
        """
        positions = []
        for position, individual in enumerate(individuals):
            if 0 <= individual.fitness < self.max_fit:
                positions.append(position)
        return positions

    def valid(self, individuals):
        """Return the individuals with valid fitness, in their given order.

        A fitness is valid when it is at least 0 and strictly below
        ``self.max_fit``.
        """
        kept = []
        for individual in individuals:
            if 0 <= individual.fitness < self.max_fit:
                kept.append(individual)
        return kept

    def get_params(self, deep=None):
        """Return the constructor arguments recorded in ``self.params``.

        Provided so that FEW can be used where the sklearn estimator
        interface is expected, e.g. sklearn.model_selection.cross_val_score.

        Parameters
        ----------
        deep: unused
            Accepted only to match the sklearn signature

        Returns
        -------
        params: mapping of string to any
            The stored mapping itself (not a copy) of parameter names to
            their values
        """
        recorded = self.params
        return recorded

    def get_diversity(self, X):
        """Append the diversity of the rows of X to ``self.diversity``.

        Every row after the first is compared with the first row using
        ``r2_score``; negative scores count as 0.  The value appended is
        ``1 - mean`` of those scores.  With fewer than two rows there is
        nothing to compare and NaN is appended.

        Parameters
        ----------
        X : 2-D array
            One row per individual output (``fit`` passes ``pop.X``).
        """
        n_rows = X.shape[0]
        if n_rows < 2:
            self.diversity.append(np.nan)
            return
        feature_correlations = np.zeros(n_rows - 1)
        # row i is stored in slot i - 1 so that every slot is filled and
        # the last row is included
        for i in range(1, n_rows):
            feature_correlations[i - 1] = max(0.0, r2_score(X[0], X[i]))
        self.diversity.append(1 - np.mean(feature_correlations))
# Title for the figure that is current at this point (created before this
# fragment).
plt.title('Regression Coefficients Progression for Lasso Paths')

# Plot the mean squared error for each fold.
# x axis: -log10 of the alphas in model.cv_alphas_.
m_log_alphascv = -np.log10(model.cv_alphas_)
plt.figure()
# dotted lines: the columns of model.cv_mse_path_
plt.plot(m_log_alphascv, model.cv_mse_path_, ':')
# solid black line: mean over the last axis of cv_mse_path_
plt.plot(m_log_alphascv, model.cv_mse_path_.mean(axis=-1), 'k', label='Average across the folds', linewidth=2)
# dashed vertical line at the alpha stored in model.alpha_
plt.axvline(-np.log10(model.alpha_), linestyle='--', color='k', label='alpha CV')
plt.legend()
plt.xlabel('-log(alpha)')
plt.ylabel('Mean squared error')
plt.title('Mean squared error on each fold')


# Mean squared error of the model's predictions on the training and test data.
from sklearn.metrics import mean_squared_error
train_error = mean_squared_error(training_target, model.predict(training_data))
test_error = mean_squared_error(test_target, model.predict(test_data))
print ('training data MSE')
print(train_error)
print ('test data MSE')
print(test_error)

# model.score() on the training and test data (reported as R-square).
rsquared_train=model.score(training_data, training_target)
rsquared_test=model.score(test_data, test_target)
print ('training data R-square')
print(rsquared_train)
print ('test data R-square')
print(rsquared_test)
Exemple #24
0
from sklearn.linear_model import LassoLarsCV
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
# Synthetic one-feature regression problem; the target is reshaped into a
# single column.
X, y = make_regression(n_features=1, noise=4.0, random_state=0)
y = y.reshape(-1, 1)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=100)

# Lasso fit by LARS with the penalty chosen by 5-fold cross-validation.
reg = LassoLarsCV(cv=5)
reg.fit(X_train, y_train)

# Scores on both splits, then the selected penalty and the training shapes.
train_score = reg.score(X_train, y_train)
print(train_score)
test_score = reg.score(X_test, y_test)
print(test_score)
print(reg.alpha_)
print(X_train.shape, y_train.shape)

# Both splits as scatter plots, with the model's prediction over all of X.
y_pred = reg.predict(X)
plt.scatter(X_train, y_train, label='train')
plt.scatter(X_test, y_test, label='test')
plt.plot(X, y_pred)
plt.show()
    y_pred_lasso = lasso.fit(train_features, train_targets).predict(test_features)
    r2_score_lasso = r2_score(test_targets, y_pred_lasso)
    r2_values_store.append(r2_score_lasso)
#print(r2_values_store)
# R^2 values collected above, plotted against m_log_alphas_modified.
plt.figure()
#plt.figure(figsize=(32,18), dpi=1200) # used to render the figure at higher resolution
plt.plot(m_log_alphas_modified, r2_values_store)
plt.xlabel('alpha values')
plt.ylabel('R^2 values')
plt.show()

# The estimator chose its penalty automatically; report it as -log10(alpha).
tune_parameter = -np.log10(model.alpha_)
print("Tuned parameter obtained using cross validation : %f" % tune_parameter)
# Score of the cross-validated model on the test split.
Cross_validation_perfomance = model.score(test_features, test_targets)
print("Cross validation perfomance : %f" % Cross_validation_perfomance)

# Refit a plain Lasso at the penalty selected by cross-validation.  The
# penalty is model.alpha_ itself; -log10(model.alpha_) is only the transform
# used for reporting above and must not be passed as the penalty (it is not
# the selected value and can even be negative).
alpha = model.alpha_
lasso = Lasso(alpha=alpha)

y_pred_lasso = lasso.fit(train_features, train_targets).predict(test_features)
r2_score_lasso = r2_score(test_targets, y_pred_lasso)
print(lasso)
print("R^2 on test data : %f" % r2_score_lasso)
Exemple #26
0
         model.cv_mse_path_.mean(axis=-1),
         'k',
         label='Average across the folds',
         linewidth=2)
# Draw a dashed black vertical line at -log10(model.alpha_), labelled
# 'alpha CV', i.e. on the same -log(alpha) scale as the x-axis label below.
plt.axvline(-np.log10(model.alpha_),
            linestyle='--',
            color='k',
            label='alpha CV')
# Show the labelled artists, then name the axes and title the figure.
plt.legend()
plt.xlabel('-log(alpha)')
plt.ylabel('Mean squared error')
plt.title('Mean squared error on each fold')
# There is variability across the individual CV folds, but the error changes in
# the same pattern as variables are added: it decreases rapidly and then levels
# off to the point where adding more predictors no longer reduces the MSE.

# 3.5 Mean squared error of the fitted model on the training and test splits.
from sklearn.metrics import mean_squared_error
train_error = mean_squared_error(tar_train, model.predict(pred_train))
test_error = mean_squared_error(tar_test, model.predict(pred_test))
for caption, value in (('training data MSE', train_error),
                       ('test data MSE', test_error)):
    # Caption on one line, the number on the next.
    print(caption)
    print(value)
# Author's observation: similar accuracy on both splits.

# 3.6 R-square (as returned by model.score) on the same two splits.
rsquared_train = model.score(pred_train, tar_train)
rsquared_test = model.score(pred_test, tar_test)
for caption, value in (('training data R-square', rsquared_train),
                       ('test data R-square', rsquared_test)):
    print(caption)
    print(value)
# Author's observation: the test split scores more accurately than the
# training split.
Exemple #27
0
# Models that performed substantially worse
# model = LinearSVC()
# model = KNeighborsClassifier(n_neighbors = 3)
# model = GaussianNB()
# model = LogisticRegression()
# model = SVC()

# ## Fit/Accuracy

# In[32]:

# Train the selected model on the training split.
model.fit(train_X, train_y)

# Show the training score next to the validation score; a large gap between
# the two points to overfitting.
print(model.score(train_X, train_y), model.score(valid_X, valid_y))

# In[33]:

# NOTE(review): `id` shadows the builtin of the same name; the name is kept
# here because code outside this cell may read it.
id = test_X.Id
result = model.predict(test_X)

# Assemble the submission table and pin the column order in one expression.
output = pd.DataFrame({'id': id, 'SalePrice': result})[['id', 'SalePrice']]

# Write the submission without the index column, then preview the first rows.
output.to_csv("solution.csv", index=False)
output.head(10)

# ## Conclusion
#