Example #1
def test_few_at_least_as_good_as_default():
    """test_few.py: few performs at least as well as the default ML """
    np.random.seed(1006987)
    boston = load_boston()
    d = np.column_stack((boston.data,boston.target))
    np.random.shuffle(d)
    features = d[:,0:-1]
    target = d[:,-1]

    print("feature shape:",boston.data.shape)

    learner = FEW(generations=1, population_size=5,
                mutation_rate=1, crossover_rate=1,
                ml = LassoLarsCV(), min_depth = 1, max_depth = 3,
                sel = 'tournament', fit_choice = 'r2',tourn_size = 2, random_state=0, verbosity=0,
                disable_update_check=False)

    learner.fit(features[:300], target[:300])
    few_score = learner.score(features[:300], target[:300])
    test_score = learner.score(features[300:],target[300:])

    lasso = LassoLarsCV()
    lasso.fit(learner._training_features,learner._training_labels)
    lasso_score = lasso.score(features[:300], target[:300])
    print("few score:",few_score,"lasso score:",lasso_score)
    print("few test score:",test_score,"lasso test score:",lasso.score(features[300:],target[300:]))
    assert few_score >= lasso_score

    print("lasso coefficients:",lasso.coef_)
Example #2
def main():
    print('load data...')
    train, test = data_util.load_dataset()

    y_train_all = train['y']
    del train['ID']
    del train['y']
    id_test = test['ID']
    del test['ID']
    print('train:', train.shape, ', test:', test.shape)

    model = LassoLarsCV(fit_intercept=True,
                        verbose=False,
                        max_iter=500,
                        normalize=True,
                        precompute='auto',
                        cv=5,
                        max_n_alphas=1000,
                        n_jobs=-1,
                        eps=2.2204460492503131e-16,
                        copy_X=True,
                        positive=False)
    model.fit(train.values, y_train_all)

    print('predict submit...')
    y_pred = model.predict(test.values)
    df_sub = pd.DataFrame({'ID': id_test, 'y': y_pred})
    df_sub.to_csv('lassolars_model_result.csv', index=False)  # 0.55827
Example #3
class OwnLassoLarsCV(AutoSklearnRegressionAlgorithm):
    def __init__(self, random_state=None):
        self.estimator = None

    def fit(self, X, y):
        from sklearn.linear_model import LassoLarsCV
        self.estimator = LassoLarsCV(cv=5)
        self.estimator.fit(X, y)
        return self

    def predict(self, X):
        if self.estimator is None:
            raise NotImplementedError
        return self.estimator.predict(X)

    @staticmethod
    def get_properties(dataset_properties=None):
        return {
            'shortname': 'LL',
            'name': 'LassoLarsCV',
            'handles_regression': True,
            'handles_classification': False,
            'handles_multiclass': False,
            'handles_multilabel': False,
            'is_deterministic': True,
            'input': (DENSE, SPARSE, UNSIGNED_DATA),
            'output': (PREDICTIONS, )
        }

    @staticmethod
    def get_hyperparameter_search_space(dataset_properties=None):
        cs = ConfigurationSpace()
        return cs
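A registration sketch for completeness: auto-sklearn's extension examples add custom components with add_regressor (treat the exact import path and call as an assumption about the auto-sklearn version in use):

# Hypothetical registration, following auto-sklearn's "extending regression" pattern
import autosklearn.pipeline.components.regression
autosklearn.pipeline.components.regression.add_regressor(OwnLassoLarsCV)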
Example #4
def cv_train_lasso_lars_with_sparse_refit(x_train,
                                          y_train,
                                          pval_cutoff=0.001,
                                          do_sparse_refit=True):
    model = LassoLarsCV(n_jobs=-1, cv=min(x_train.shape[0], 10))
    model.fit(x_train, y_train)
    best_alpha_idx = int(np.argwhere(model.alpha_ == model.cv_alphas_))

    if do_sparse_refit:
        sparse_alpha_idx = -1
        for i in range(best_alpha_idx + 1, len(model.cv_alphas_)):
            pval = ttest_ind(model.mse_path_[best_alpha_idx],
                             model.mse_path_[i]).pvalue

            if pval < pval_cutoff:
                sparse_alpha_idx = i - 1
                break

        if sparse_alpha_idx == -1:
            # take the sparsest solution
            sparse_alpha_idx = len(model.cv_alphas_) - 1

        model_sparse = LassoLars(alpha=model.cv_alphas_[sparse_alpha_idx])
        model_sparse.fit(x_train, y_train)

        return model_sparse
    else:
        return model
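A minimal usage sketch for the helper above on synthetic data (assumes the numpy/scipy/scikit-learn imports the snippet relies on are in scope; cv_alphas_ and mse_path_ come from the older scikit-learn releases this snippet targets):

import numpy as np
from sklearn.datasets import make_regression

X, y = make_regression(n_samples=200, n_features=30, n_informative=5,
                       noise=10.0, random_state=0)
sparse_model = cv_train_lasso_lars_with_sparse_refit(X, y)
print(np.count_nonzero(sparse_model.coef_), "features kept")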
Example #5
def test_few_at_least_as_good_as_default():
    """test_few.py: few performs at least as well as the default ML """
    np.random.seed(1006987)
    boston = load_boston()
    d = np.column_stack((boston.data,boston.target))
    np.random.shuffle(d)
    features = d[:,0:-1]
    target = d[:,-1]

    print("feature shape:",boston.data.shape)

    learner = FEW(generations=1, population_size=5,
                ml = LassoLarsCV(), min_depth = 1, max_depth = 3,
                sel = 'tournament')

    learner.fit(features[:300], target[:300])
    few_score = learner.score(features[:300], target[:300])
    few_test_score = learner.score(features[300:],target[300:])

    lasso = LassoLarsCV()
    lasso.fit(features[:300], target[:300])
    lasso_score = lasso.score(features[:300], target[:300])
    lasso_test_score = lasso.score(features[300:],target[300:])
    print("few score:",few_score,"lasso score:",lasso_score)
    print("few test score:",few_test_score,"lasso test score:",
          lasso_test_score)
    assert round(few_score,8) >= round(lasso_score,8)

    print("lasso coefficients:",lasso.coef_)
Example #6
def lassolarscv():
    print ("Doing cross-validated LassoLars")
    cross_val = cross_validation.ShuffleSplit(len(base_X), n_iter=5, test_size=0.2, random_state=0)
    clf5 = LassoLarsCV(cv=cross_val)
    clf5.fit(base_X, base_Y)
    print ("Score = %f" % clf5.score(base_X, base_Y))
    clf5_pred = clf5.predict(X_test)
    write_to_file("lassolars.csv", clf5_pred)
Example #7
def train_test_lasso(input_data, output_data, train_key, test_key, n_cv=3):
    """
    Training/prediction with lasso regression
    """
    # Edge case: fewer training points than CV folds
    if len(train_key) < n_cv:
        n_cv = len(train_key)

    #-------------
    # Training
    #-------------
    x = input_data[train_key,:]
    y = output_data[train_key]

    # Create scaler and model instances
    x_scaler = StandardScaler()
    y_scaler = StandardScaler()

    clf = LassoLarsCV(cv=n_cv, positive=True)
    #clf = LassoLarsIC(criterion='bic', positive=True)

    # Build the model
    x_scaler.fit(x)  # standardize
    y_scaler.fit(y.reshape(-1,1))  # standardize

    y_ = y_scaler.transform(y.reshape(-1,1))
    y_ = y_.reshape(-1)

    try:
        clf.fit(x_scaler.transform(x), y_)
    except ValueError:
        # fall back to information-criterion-based alpha selection
        clf = LassoLarsIC(criterion='bic', positive=True)
        clf.fit(x_scaler.transform(x), y_)

    # Get the model parameters
    #alpha = clf.alpha_  # hyperparameter
    a = clf.coef_  # coefficients
    b = clf.intercept_  # intercept
    p = np.append(a, b)

    #-------------
    # Prediction
    #-------------
    x = input_data[test_key,:]

    # Edge case: a single test sample => reshape to a 2-D array
    if x.ndim == 1:
        x = x.reshape(1,-1)

    # Predict
    tmp = clf.predict(x_scaler.transform(x))
    y_pred = y_scaler.inverse_transform(tmp)  # undo the standardization
    return y_pred, p
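A hypothetical call of the function above (assumes numpy plus the StandardScaler/LassoLarsCV/LassoLarsIC imports the snippet expects, on an era-appropriate scikit-learn):

import numpy as np

X = np.random.rand(100, 5)
y = X @ np.array([1.0, 0.5, 0.0, 0.0, 2.0]) + 0.01 * np.random.randn(100)
# train on the first 80 rows, predict the remaining 20
y_hat, params = train_test_lasso(X, y, np.arange(80), np.arange(80, 100))
print(y_hat.shape, params.shape)  # (20,) predictions, (6,) parameters: 5 coefs + intercept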
Example #8
def LassoLarsTest(dataMat, labelMat):
    clf1 = LassoLars(alpha=1, max_iter=100)
    clf1.fit(dataMat[0:99], labelMat[0:99])
    labelTest1 = clf1.predict(dataMat[100:199])
    print('LassoLars ', ((labelTest1 - labelMat[100:199])**2).sum())
    clf2 = LassoLarsCV(max_n_alphas=10, max_iter=100)
    clf2.fit(dataMat[0:99], labelMat[0:99])
    labelTest2 = clf2.predict(dataMat[100:199])
    print('LassoLarsCV', ((labelTest2 - labelMat[100:199])**2).sum())
Example #9
    def _LASSOLars_Regression(self):

        # initialize a model object
        LassoLarsReg = LassoLarsCV(cv=10)
        # train model
        LassoLarsReg.fit(self.X_train, self.y_train)
        # optimal alpha
        print("The best alpha in LAR is: ", LassoLarsReg.alpha_)

        return LassoLarsReg
Example #10
def lasso(x_train, y_train, x_test):
    #model = Lasso(alpha=0.01)  # tune alpha to control the amount of shrinkage
    #model = LassoCV(max_iter=3000)  # LassoCV picks the best alpha automatically (0.0295 here)
    model = LassoLarsCV()  # LassoLarsCV picks the best alpha automatically
    print(x_train.shape)
    print(y_train.shape)
    model.fit(x_train, y_train)  # fit the linear regression model
    print('Coefficient matrix:\n', model.coef_)
    print('Linear regression model:\n', model)
    print('Best alpha:', model.alpha_)
    predicted = model.predict(x_test)
    print(predicted.shape)
    return predicted
Example #11
def test():
    X, y = genClassificationData(n_features=10, n_strel=2, n_redundant=0)
    model = LassoLarsCV(cv=5)
    normal_score = model.fit(X, y).score(X, y)

    X_NF = add_NFeature_to_X(X, 1, np.random.RandomState())
    model = LassoLarsCV(cv=5)
    assert model.fit(X_NF, y).score(X_NF, y) > 0.5

    stats = Stats(model, X, y, n_resampling=50, fpr=1e-3, check_importances=False)
    bounds = stats.score_stat
    assert type(bounds) is tuple
    assert bounds[0] < normal_score < bounds[1]
Example #12
def run_lasso_lars_cv(X_train, y_train, X_test, y_test):
    """
    :param X_train: 
    :param y_train: 
    :param X_test: 
    :param y_test: 
    :return: 
    """
    model_lars_cv = LassoLarsCV(cv=10)
    model_lars_cv.fit(X_train, y_train)
    print(model_lars_cv.alpha_)
    print(model_lars_cv.cv_alphas_)
    print(model_lars_cv.cv_mse_path_)
Example #13
class _LassoLarsCVImpl:
    def __init__(self, **hyperparams):
        self._hyperparams = hyperparams
        self._wrapped_model = Op(**self._hyperparams)

    def fit(self, X, y=None):
        if y is not None:
            self._wrapped_model.fit(X, y)
        else:
            self._wrapped_model.fit(X)
        return self

    def predict(self, X):
        return self._wrapped_model.predict(X)
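The wrapper leaves Op undefined; it is presumably bound to the wrapped scikit-learn estimator in the generating module. A self-contained sketch under that assumption:

import numpy as np
from sklearn.linear_model import LassoLarsCV as Op  # assumed binding for Op

impl = _LassoLarsCVImpl(cv=5)
X, y = np.random.rand(50, 4), np.random.rand(50)
print(impl.fit(X, y).predict(X).shape)  # (50,)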
Example #14
def UFS(data, K, d):
    '''
    Cai D, Zhang C, He X. Unsupervised feature selection for multi-cluster data[C]
    //Proceedings of the 16th ACM SIGKDD international conference on Knowledge discovery and data mining. ACM, 2010: 333-342.
    Performs unsupervised feature selection (MCFS).
    data: an N*M matrix; each row is a sample, each column a feature
    K: number of clusters
    d: number of features to select
    :return:  data: the N*d data after feature selection (rows are samples, columns are features)
              seq: 1*d array with the indices of the selected features in the original data
    '''
    k = int(max(0.2 * data.shape[0], 10))  # number of nearest neighbors
    data = np.array(data)
    M = data.shape[1]  # number of features
    N = data.shape[0]  # number of samples
    data = data.transpose()  # store samples as columns, matching the paper's notation
    # find the k nearest neighbors of each sample
    dist = np.zeros((N, N))
    delta = 2  # heat-kernel parameter from the paper
    W = np.zeros((N, N))  # adjacency (weight) matrix
    for i in range(N):  # pairwise distances between samples
        for j in range(N):
            dist[i, j] = np.linalg.norm(data[:, i] - data[:, j], ord=2)
    D = np.zeros((N, N))  # degree matrix
    for i in range(N):  # find the k nearest neighbors of each sample
        neigbors = np.argsort(dist[i, :], axis=0)
        neigbors = neigbors[1:k + 1]  # skip index 0: the sample itself, at distance 0
        for j in neigbors:
            W[i, j] = math.exp(
                -math.pow(np.linalg.norm(data[:, i] - data[:, j]), 2) /
                delta)  # heat-kernel weight
            W[j, i] = W[i, j]
        D[i, i] = sum(W[i, :])  # diagonal entry of the degree matrix
    L = D - W  # graph Laplacian
    feature_values, vectors = scipy.linalg.eig(L, D)  # generalized eigenproblem of Eq.(1)
    seq = np.argsort(feature_values)  # sort the eigenvalues
    seq = seq[1:K + 1]  # take the K smallest, skipping the trivial smallest one
    Y = vectors[:, seq]  # embedding vectors
    Y = np.real(Y)  # discard numerical imaginary parts from the eigensolver
    # use least angle regression to estimate the coefficients a
    score = np.zeros((1, M))  # per-feature scores
    model = LassoLarsCV()
    for i in range(K):
        model.fit(data.transpose(), Y[:, i])
        a = model.coef_  # coefficients of the linear model
        # the MCFS score of feature j is max_k |a_{j,k}|: accumulate the max over the K embeddings
        score[0, :] = np.maximum(score[0, :], np.abs(a))
    seq = np.argsort(-score)  # sort scores in descending order
    seq = seq[0, 0:d]  # indices of the d highest-scoring features
    data = data.transpose()
    data = data[:, seq]  # final result
    return data, seq
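A hypothetical run of UFS on random data, keeping d=5 of 20 features (assumes numpy, math, scipy and LassoLarsCV are imported as the function expects):

import numpy as np

X = np.random.rand(60, 20)
X_sel, idx = UFS(X, K=3, d=5)
print(X_sel.shape)  # (60, 5)
print(idx)          # indices of the selected features in the original data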
Example #15
def lassolars():
    lassolars = LassoLarsCV()
    X_train, X_test, Y_train, Y_test = train_test_split(train_pca_value,
                                                        train_pro,
                                                        test_size=0.1,
                                                        random_state=9)
    lassolars.fit(X_train, Y_train)
    pre = lassolars.predict(X_test)
    loss = mean_squared_error(pre, Y_test)
    print(loss)
    pre = lassolars.predict(test_pca_data)
    write = open('data/lassolars.txt', 'w')
    for i in range(len(pre)):
        write.write("%f\r" % pre[i])
    write.close()
Example #16
def _lassolarscv(*,
                 train,
                 test,
                 x_predict=None,
                 metrics,
                 fit_intercept=True,
                 verbose=False,
                 max_iter=500,
                 normalize=True,
                 precompute='auto',
                 cv=None,
                 max_n_alphas=1000,
                 n_jobs=None,
                 eps=2.220446049250313e-16,
                 copy_X=True,
                 positive=False):
    """For more info visit : 
        https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LassoLarsCV.html#sklearn.linear_model.LassoLarsCV
    """

    model = LassoLarsCV(fit_intercept=fit_intercept,
                        verbose=verbose,
                        max_iter=max_iter,
                        normalize=normalize,
                        precompute=precompute,
                        cv=cv,
                        max_n_alphas=max_n_alphas,
                        n_jobs=n_jobs,
                        eps=eps,
                        copy_X=copy_X,
                        positive=positive)
    model.fit(train[0], train[1])
    model_name = 'LassoLarsCV'
    y_hat = model.predict(test[0])

    if metrics == 'mse':
        accuracy = _mse(test[1], y_hat)
    elif metrics == 'rmse':
        accuracy = _rmse(test[1], y_hat)
    elif metrics == 'mae':
        accuracy = _mae(test[1], y_hat)

    if x_predict is None:
        return (model_name, accuracy, None)

    y_predict = model.predict(x_predict)
    return (model_name, accuracy, y_predict)
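A sketch of calling the wrapper above (the _mse/_rmse/_mae helpers and the LassoLarsCV import are assumed to be defined in the same module, on a scikit-learn version that still accepts normalize=):

import numpy as np

X, y = np.random.rand(100, 8), np.random.rand(100)
name, err, preds = _lassolarscv(train=(X[:80], y[:80]),
                                test=(X[80:], y[80:]),
                                x_predict=X[80:],
                                metrics='mse', cv=5)
print(name, err, preds.shape)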
Example #17
def LASSO_EXAMPLE(inputfile):
    # import the required modules
    import os
    import numpy as np
    import pandas as pd
    from sklearn.linear_model import Lasso,LassoCV,LassoLarsCV
    data = pd.read_csv(inputfile)
    des = data.describe()
    r = des.T
    r = r[['min', 'max', 'mean', 'std']]
    np.round(r, 2)  # keep 2 decimals, using banker's rounding (round half to even)
    np.round(data.corr(method='pearson'), 2)  # method={'pearson','spearman','kendall'}; correlation analysis
    # LASSO fits the linear model while selecting variables and controlling complexity,
    # dropping collinear features; CV picks alpha (LassoLarsCV takes no alpha argument)
    model = LassoLarsCV()
    model.fit(data.iloc[:, :data.shape[1] - 1], data.iloc[:, data.shape[1] - 1])
    model_coef = pd.DataFrame(pd.DataFrame(model.coef_).T)
    model_coef.columns = ['x%d' % i for i in np.arange(13) + 1]
    print(model_coef)
Example #18
def polynomial():
    poly = PolynomialFeatures(degree=2)
    X_train, X_test, Y_train, Y_test = train_test_split(train_pca_value,
                                                        train_pro,
                                                        test_size=0.1,
                                                        random_state=9)
    X_train_poly = poly.fit_transform(X_train)
    X_test_poly = poly.transform(X_test)
    test_pca_data_poly = poly.transform(test_pca_data)
    regressor_poly = LassoLarsCV()
    regressor_poly.fit(X_train_poly, Y_train)
    pre = regressor_poly.predict(X_test_poly)
    loss = mean_squared_error(pre, Y_test)
    print(loss)
    pre = regressor_poly.predict(test_pca_data_poly)
    write = open('data/poly.txt', 'w')
    for i in range(len(pre)):
        write.write("%f\r" % pre[i])
    write.close()
Example #19
def select_features_lasso_lars(X, Y):
    clf = LassoLarsCV()
    fit = clf.fit(X, Y)
    # keep the features whose coefficients pass SelectFromModel's default threshold
    sfm = SelectFromModel(fit, prefit=True)
    values = sfm.get_support(indices=True)
    for i in range(0, len(values)):
        print(values[i])
    new_features = sfm.transform(X)
    return new_features, values
Example #20
def lassovar(data, lag=1, n_samples=None):
    Y = data.T[:, lag:]
    d = Y.shape[0]
    Z = np.vstack([data.T[:, lag - k:-k] for k in range(1, lag + 1)])
    Y, Z = Y.T, Z.T
    if n_samples is not None:
        Y, Z = resample(Y, Z, replace=False, n_samples=n_samples)

    scores = np.zeros((d, d * lag))

    ls = LassoLarsCV(cv=10, n_jobs=1)

    residuals = np.zeros(Y.shape)

    # one variable after the other as target
    for j in range(d):
        target = np.copy(Y[:, j])
        selectedparents = np.full(d * lag, False)
        # we include one lag after the other
        for l in range(1, lag + 1):
            ind_a = d * (l - 1)
            ind_b = d * l
            ls.fit(Z[:, ind_a:ind_b], target)
            selectedparents[ind_a:ind_b] = ls.coef_ != 0  # any nonzero coefficient marks a selected parent
            target -= ls.predict(Z[:, ind_a:ind_b])

        residuals[:, j] = np.copy(target)

        # refit to get rid of the bias
        ZZ = Z[:, selectedparents]
        B = np.linalg.lstsq(ZZ.T.dot(ZZ), ZZ.T.dot(Y[:, j]), rcond=None)[0]
        scores[j, selectedparents] = B

    # the more uncorrelated the residuals the higher the weight
    weight = 1
    res = np.corrcoef(residuals.T)
    if np.linalg.matrix_rank(res) == res.shape[0]:
        weight = np.linalg.det(res)
    return scores * weight
Example #21
def train_lars(train_features, train_labels, num_alphas, skip_cross_validation,
               alpha, num_jobs):
    """
  Performs the cross validation of lars model, and returns the trained model with best
  params. Assume features are scaled/normalized
  """
    best_alpha = alpha
    max_iter = 10000
    if not skip_cross_validation:
        # use 5 fold cross validation
        model = LassoLarsCV(max_iter=max_iter,
                            cv=5,
                            max_n_alphas=min(num_alphas, 2000),
                            n_jobs=num_jobs,
                            normalize=False)
        model.fit(train_features, train_labels)
        best_alpha = model.alpha_
        #print("number of iterations were {}".format(model.n_iter_))

    model = LassoLars(alpha=best_alpha, normalize=False, max_iter=max_iter)
    model.fit(train_features, train_labels)

    return (model, {'alpha': best_alpha})
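A usage sketch for train_lars (assumes numpy and the LassoLars/LassoLarsCV imports the snippet relies on; normalize= was removed in recent scikit-learn, so this targets the versions the snippet was written for):

import numpy as np
from sklearn.preprocessing import scale

X = scale(np.random.rand(120, 10))
y = X[:, 0] + 0.1 * np.random.randn(120)
model, params = train_lars(X, y, num_alphas=100,
                           skip_cross_validation=False, alpha=None, num_jobs=1)
print(params['alpha'])  # the CV-selected alpha used for the final refit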
Example #22
def lassovar(data, maxlags=1, n_samples=None, cv=5):
    # Stack data to perform regression of present on past values
    Y = data.T[:, maxlags:]
    d = Y.shape[0]
    Z = np.vstack([data.T[:, maxlags - k:-k] for k in range(1, maxlags + 1)])
    Y, Z = Y.T, Z.T

    # Subsample data
    if n_samples is not None:
        Y, Z = resample(Y, Z, n_samples=n_samples)

    scores = np.zeros((d, d * maxlags))

    ls = LassoLarsCV(cv=cv, n_jobs=1)

    residuals = np.zeros(Y.shape)

    # Consider one variable after the other as target
    for j in range(d):
        target = np.copy(Y[:, j])
        selectedparents = np.full(d * maxlags, False)
        # Include one lag after the other
        for l in range(1, maxlags + 1):
            ind_a = d * (l - 1)
            ind_b = d * l
            ls.fit(Z[:, ind_a:ind_b], target)
            selectedparents[ind_a:ind_b] = ls.coef_ != 0  # any nonzero coefficient marks a selected parent
            target -= ls.predict(Z[:, ind_a:ind_b])

        residuals[:, j] = np.copy(target)

        # Refit OLS using the selected variables to get rid of the bias
        ZZ = Z[:, selectedparents]
        B = np.linalg.lstsq(ZZ.T.dot(ZZ), ZZ.T.dot(Y[:, j]), rcond=None)[0]
        scores[j, selectedparents] = B

    return scores
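A toy run of lassovar (assumes numpy, sklearn.utils.resample and LassoLarsCV are in scope):

import numpy as np

data = np.random.randn(200, 3)  # 200 time points, 3 variables
B = lassovar(data, maxlags=2)
print(B.shape)  # (3, 6): one row per target variable, d * maxlags columns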
Example #23
#encoding=utf-8
import numpy as np
import pandas as pd

data = pd.read_csv('train2.csv', index_col='id')

#print np.round(data.corr(method='pearson'),2)
from sklearn.linear_model import LassoLarsCV
#print len(data.columns)
#print data.iloc[:,52]
x = data.iloc[:, 0:52].values
y = data.iloc[:, 52].values

ll = LassoLarsCV()
ll.fit(x, y)

a = ll.coef_
b = [i != 0 for i in a]

data1 = data.iloc[:, 0:52]
data1 = data1[data1.columns[b]]
x = data1.iloc[:, :].values

from xgboost import XGBRegressor

reg = XGBRegressor()
reg.fit(x, y)
y_pred = reg.predict(x)
print('Selected features: %s' % data1.columns)
testdata = pd.read_csv('test2.csv', index_col='id')
testdata = testdata[testdata.columns[b]].values
Example #24
RMSE(lassolarscv, X_train, Y_train)  #0.1154

# In[ ]:

RMSE(elasticnetcv, X_train, Y_train)  #0.1140

# What the hell! Who could believe that a simple linear regression model performs so well?! (I'm behaving like I just discovered that: WOOOOAAAWW :p)
#
# The 4 models have very similar scores except for the ridge; if we average their predictions we could probably reduce the error slightly (see the sketch after the fits below)!

# In[ ]:

lassocv.fit(X_train, Y_train)
ridge.fit(X_train, Y_train)
lassolarscv.fit(X_train, Y_train)
elasticnetcv.fit(X_train, Y_train)
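
# As a sketch of that averaging idea (X_test is assumed to be a held-out feature matrix prepared exactly like X_train):

# In[ ]:

import numpy as np

models = [lassocv, ridge, lassolarscv, elasticnetcv]
Y_pred_avg = np.mean([m.predict(X_test) for m in models], axis=0)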

# #### 6.1.2 Features coefficients

# In[ ]:

print("LassoCV regression has conserved %d features over %d" %
      (len(features[lassocv.coef_ != 0]), X_train.shape[1]))
print("Ridge regression has conserved %d features over %d" %
      (len(features[ridge.coef_ != 0]), X_train.shape[1]))
print("LassoLarsCV regression has conserved %d features over %d" %
      (len(features[lassolarscv.coef_ != 0]), X_train.shape[1]))
print("ElasticNetCV regression has conserved %d features over %d" %
      (len(features[elasticnetcv.coef_ != 0]), X_train.shape[1]))
Example #25
File: few.py  Project: codeaudit/few
class FEW(SurvivalMixin, VariationMixin, EvaluationMixin, BaseEstimator):
    """FEW uses GP to find a set of transformations from the original feature space
    that produces the best performance for a given machine learner.
    """
    update_checked = False

    def __init__(self,
                 population_size=50,
                 generations=100,
                 mutation_rate=0.5,
                 crossover_rate=0.5,
                 ml=None,
                 min_depth=1,
                 max_depth=2,
                 max_depth_init=2,
                 sel='epsilon_lexicase',
                 tourn_size=2,
                 fit_choice=None,
                 op_weight=False,
                 seed_with_ml=True,
                 erc=False,
                 random_state=np.random.randint(9999999),
                 verbosity=0,
                 scoring_function=None,
                 disable_update_check=False,
                 elitism=True,
                 boolean=False,
                 classification=False,
                 clean=False,
                 track_diversity=False,
                 mdr=False,
                 otype='f'):
        # sets up GP.

        # Save params to be recalled later by get_params()
        self.params = locals()  # must be placed before any local variable definitions
        self.params.pop('self')

        # # Do not prompt the user to update during this session if they ever disabled the update check
        # if disable_update_check:
        #     FEW.update_checked = True
        #
        # # Prompt the user if their version is out of date
        # if not disable_update_check and not FEW.update_checked:
        #     update_check('FEW', __version__)
        #     FEW.update_checked = True

        self._best_estimator = None
        self._training_features = None
        self._training_labels = None
        self._best_inds = None

        self.population_size = population_size
        self.generations = generations
        self.mutation_rate = mutation_rate
        self.crossover_rate = crossover_rate
        self.min_depth = min_depth
        self.max_depth = max_depth
        self.max_depth_init = max_depth_init
        self.sel = sel
        self.tourn_size = tourn_size
        self.fit_choice = fit_choice
        self.op_weight = op_weight
        self.seed_with_ml = seed_with_ml
        self.erc = erc
        self.random_state = random_state
        self.verbosity = verbosity
        self.scoring_function = scoring_function
        self.gp_generation = 0
        self.elitism = elitism
        self.max_fit = 99999999.666
        self.boolean = boolean
        self.classification = classification
        self.clean = clean
        self.ml = ml
        self.track_diversity = track_diversity
        self.mdr = mdr
        self.otype = otype

        # if otype is b, boolean functions must be turned on
        if self.otype == 'b':
            self.boolean = True

        # instantiate sklearn estimator according to specified machine learner
        if self.ml is None:
            if self.classification:
                self.ml = LogisticRegression(solver='sag')
            else:
                self.ml = LassoLarsCV()
        if not self.scoring_function:
            if self.classification:
                self.scoring_function = accuracy_score
            else:
                self.scoring_function = r2_score

        # set default fitness metrics for various learners
        if not self.fit_choice:
            self.fit_choice = {
                #regression
                type(LassoLarsCV()): 'mse',
                type(SVR()): 'mae',
                type(LinearSVR()): 'mae',
                type(KNeighborsRegressor()): 'mse',
                type(DecisionTreeRegressor()): 'mse',
                type(RandomForestRegressor()): 'mse',
                #classification
                type(SGDClassifier()): 'r2',
                type(LogisticRegression()): 'r2',
                type(SVC()): 'r2',
                type(LinearSVC()): 'r2',
                type(RandomForestClassifier()): 'r2',
                type(DecisionTreeClassifier()): 'r2',
                type(DistanceClassifier()): 'silhouette',
                type(KNeighborsClassifier()): 'r2',
            }[type(self.ml)]

        # Columns to always ignore when in an operator
        self.non_feature_columns = ['label', 'group', 'guess']

        # function set
        self.func_set = [
            node('+'),
            node('-'),
            node('*'),
            node('/'),
            node('sin'),
            node('cos'),
            node('exp'),
            node('log'),
            node('^2'),
            node('^3'),
            node('sqrt')
        ]

        # if boolean operators are included but the output type is set to float, then
        # # include the if and if-else operations that allow use of both stacks
        # if self.boolean and self.otype=='f':
        #     self.func_set += [
        #     {'name:','if','arity':2,'in_type':}
        #     ]
        # terminal set
        self.term_set = []
        # diversity
        self.diversity = []

    #@profile
    def fit(self, features, labels):
        """Fit model to data"""

        np.random.seed(self.random_state)
        # setup data
        # imputation
        if self.clean:
            features = self.impute_data(features)
        # Train-test split routine for internal validation
        ####
        train_val_data = pd.DataFrame(data=features)
        train_val_data['labels'] = labels
        # print("train val data:",train_val_data[::10])
        new_col_names = {}
        for column in train_val_data.columns.values:
            if type(column) != str:
                new_col_names[column] = str(column).zfill(10)
        train_val_data.rename(columns=new_col_names, inplace=True)
        # internal training/validation split
        train_i, val_i = train_test_split(train_val_data.index,
                                          stratify=None,
                                          train_size=0.75,
                                          test_size=0.25)

        x_t = train_val_data.loc[train_i].drop('labels', axis=1).values
        x_v = train_val_data.loc[val_i].drop('labels', axis=1).values
        y_t = train_val_data.loc[train_i, 'labels'].values
        y_v = train_val_data.loc[val_i, 'labels'].values

        # Store the training features and classes for later use
        self._training_features = x_t
        self._training_labels = y_t
        ####

        # set population size
        if type(self.population_size) is str:
            if 'x' in self.population_size:  #
                self.population_size = int(
                    float(self.population_size[:-1]) * features.shape[1])
            else:
                self.population_size = int(self.population_size)

        if self.verbosity > 0: print("population size:", self.population_size)
        # print few settings
        if self.verbosity > 1:
            for arg in self.get_params():
                print('{}\t=\t{}'.format(arg, self.get_params()[arg]))
            print('')

        # initial model
        initial_estimator = copy.deepcopy(self.ml.fit(x_t, y_t))
        # self._best_estimator = copy.deepcopy(self.ml.fit(x_t,y_t))

        self._best_score = self.ml.score(x_v, y_v)
        initial_score = self._best_score
        if self.verbosity > 2:
            print("initial estimator size:", self.ml.coef_.shape)
        if self.verbosity > 0:
            print("initial ML CV: {:1.3f}".format(self._best_score))

        # create terminal set
        for i in np.arange(x_t.shape[1]):
            # dictionary of node name, arity, feature column index, output type and input type
            self.term_set.append(node('x', loc=i))  # features
            # add ephemeral random constants if flag
            if self.erc:
                self.term_set.append(node(
                    'k', value=np.random.rand()))  # ephemeral random constants

        # edit function set if boolean
        if self.boolean or self.otype == 'b':  # include boolean functions
            self.func_set += [
                node('!'),
                node('&'),
                node('|'),
                node('=='),
                node('>_f'),
                node('<_f'),
                node('>=_f'),
                node('<=_f'),
                node('>_b'),
                node('<_b'),
                node('>=_b'),
                node('<=_b'),
                node('xor_b'),
                node('xor_f')
            ]

        # add mdr if specified
        if self.mdr:
            self.func_set += [node('mdr2')]

        # Create initial population
        # for now, force seed_with_ml to be off if otype is 'b', since data types
        # are assumed to be float
        if self.otype == 'b':
            self.seed_with_ml = False
        pop = self.init_pop(self._training_features.shape[0])
        # check that uuids are unique in population
        uuids = [p.id for p in pop.individuals]
        if len(uuids) != len(set(uuids)):
            pdb.set_trace()
        # Evaluate the entire population
        # X represents a matrix of the population outputs (number of samples x population size)
        # single thread
        pop.X = self.transform(x_t, pop.individuals, y_t).transpose()
        # parallel:
        # pop.X = np.asarray(Parallel(n_jobs=-1)(delayed(out)(I,x_t,self.otype,y_t) for I in pop.individuals), order = 'F')

        # calculate fitness of individuals
        # fitnesses = list(map(lambda I: fitness(I,y_t,self.ml),pop.X))
        fitnesses = self.calc_fitness(pop.X, y_t, self.fit_choice, self.sel)

        # max_fit = self.max_fit
        # while len([np.mean(f) for f in fitnesses if np.mean(f) < max_fit and np.mean(f)>=0])<self.population_size and max_count < 100:
        #     pop = self.init_pop()
        #     pop.X = self.transform(x_t,pop.individuals,y_t)
        #     fitnesses = self.calc_fitness(pop.X,y_t,self.fit_choice,self.sel)
        #
        #     max_count+= 1
        # print("fitnesses:",fitnesses)
        # Assign fitnesses to individuals in population
        for ind, fit in zip(pop.individuals, fitnesses):
            if isinstance(
                    fit,
                (list,
                 np.ndarray)):  # calc_fitness returned raw fitness values
                fit[fit < 0] = self.max_fit
                fit[np.isnan(fit)] = self.max_fit
                fit[np.isinf(fit)] = self.max_fit
                ind.fitness_vec = fit
                ind.fitness = np.mean(ind.fitness_vec)
            else:
                ind.fitness = np.nanmin([fit, self.max_fit])

        #with Parallel(n_jobs=10) as parallel:
        ####################
        ### Main GP loop
        self.diversity = []
        # progress bar
        pbar = tqdm(total=self.generations,
                    disable=self.verbosity == 0,
                    desc='Internal CV: {:1.3f}'.format(self._best_score))
        # for each generation g
        for g in np.arange(self.generations):

            if self.track_diversity:
                self.get_diversity(pop.X)

            if self.verbosity > 1: print(".", end='')
            if self.verbosity > 1: print(str(g) + ".)", end='')
            # if self.verbosity > 1: print("population:",stacks_2_eqns(pop.individuals))
            if self.verbosity > 2:
                print("pop fitnesses:",
                      ["%0.2f" % x.fitness for x in pop.individuals])
            if self.verbosity > 1:
                print("median fitness pop: %0.2f" %
                      np.median([x.fitness for x in pop.individuals]))
            if self.verbosity > 1:
                print("best fitness pop: %0.2f" %
                      np.min([x.fitness for x in pop.individuals]))
            if self.verbosity > 1 and self.track_diversity:
                print("feature diversity: %0.2f" % self.diversity[-1])
            if self.verbosity > 1: print("ml fitting...")
            # fit ml model
            with warnings.catch_warnings():
                warnings.simplefilter("ignore")
                try:
                    # if len(self.valid_loc(pop.individuals)) > 0:
                    if self.valid(pop.individuals):
                        self.ml.fit(
                            pop.X[self.valid_loc(pop.individuals), :].
                            transpose(), y_t)
                    # else:
                    #     self.ml.fit(pop.X.transpose(),y_t)

                except ValueError as detail:
                    # pdb.set_trace()
                    print(
                        "warning: ValueError in ml fit. X.shape:",
                        pop.X[:, self.valid_loc(pop.individuals)].transpose(
                        ).shape, "y_t shape:", y_t.shape)
                    print(
                        "First ten entries X:",
                        pop.X[self.valid_loc(pop.individuals), :].transpose()
                        [:10])
                    print("First ten entries y_t:", y_t[:10])
                    print("equations:", stacks_2_eqns(pop.individuals))
                    print("FEW parameters:", self.get_params())
                    if self.verbosity > 1:
                        print("---\ndetailed error message:", detail)
                    raise

            # if self.verbosity > 1: print("number of non-zero regressors:",self.ml.coef_.shape[0])
            # keep best model
            tmp_score = 0
            try:
                # if len(self.valid_loc(pop.individuals)) > 0:
                if self.valid(pop.individuals):
                    tmp_score = self.ml.score(
                        self.transform(
                            x_v,
                            pop.individuals)[:,
                                             self.valid_loc(pop.individuals)],
                        y_v)
                # else:
                #     tmp_score = 0
                # tmp = self.ml.score(self.transform(x_v,pop.individuals),y_v)
            except Exception as detail:
                if self.verbosity > 1: print(detail)

            if self.verbosity > 1:
                print("current ml validation score:", tmp_score)

            if self.valid(pop.individuals) and tmp_score > self._best_score:
                self._best_estimator = copy.deepcopy(self.ml)
                self._best_score = tmp_score
                self._best_inds = copy.deepcopy(self.valid(pop.individuals))
                if self.verbosity > 1:
                    print("updated best internal validation score:",
                          self._best_score)

            # Variation
            if self.verbosity > 2: print("variation...")
            offspring, elite, elite_index = self.variation(pop.individuals)

            # evaluate offspring
            if self.verbosity > 2: print("output...")
            X_offspring = self.transform(x_t, offspring).transpose()
            #parallel:
            # X_offspring = np.asarray(Parallel(n_jobs=-1)(delayed(out)(O,x_t,y_t,self.otype) for O in offspring), order = 'F')
            if self.verbosity > 2: print("fitness...")
            F_offspring = self.calc_fitness(X_offspring, y_t, self.fit_choice,
                                            self.sel)
            # F_offspring = parallel(delayed(f[self.fit_choice])(y_t,yhat) for yhat in X_offspring)
            # print("fitnesses:",fitnesses)
            # Assign fitnesses to individuals in population
            for ind, fit in zip(offspring, F_offspring):
                if isinstance(
                        fit,
                    (list,
                     np.ndarray)):  # calc_fitness returned raw fitness values
                    fit[fit < 0] = self.max_fit
                    fit[np.isnan(fit)] = self.max_fit
                    fit[np.isinf(fit)] = self.max_fit
                    ind.fitness_vec = fit
                    ind.fitness = np.mean(ind.fitness_vec)
                else:
                    ind.fitness = np.nanmin([fit, self.max_fit])

            # Survival
            if self.verbosity > 2: print("survival..")
            survivors, survivor_index = self.survival(pop.individuals,
                                                      offspring, elite,
                                                      elite_index)
            pop.individuals[:] = survivors
            pop.X = np.vstack((pop.X, X_offspring))[survivor_index, :]

            if self.verbosity > 2:
                print("median fitness survivors: %0.2f" %
                      np.median([x.fitness for x in pop.individuals]))
            if self.verbosity > 2:
                print(
                    "best features:",
                    stacks_2_eqns(self._best_inds)
                    if self._best_inds else 'original')
            pbar.set_description('Internal CV: {:1.3f}'.format(
                self._best_score))
            pbar.update(1)
        # end of main GP loop
        ####################
        if self.verbosity > 0:
            print('finished. best internal val score: {:1.3f}'.format(
                self._best_score))
        if self.verbosity > 0: print("final model:\n", self.print_model())
        if not self._best_estimator:
            self._best_estimator = initial_estimator
        return self

    def transform(self, x, inds=None, labels=None):
        """return a transformation of x using population outputs"""
        if inds:
            # return np.asarray(Parallel(n_jobs=10)(delayed(self.out)(I,x,labels,self.otype) for I in inds)).transpose()
            return np.asarray([
                self.out(I, x, labels, self.otype) for I in inds
            ]).transpose()
        else:
            # return np.asarray(Parallel(n_jobs=10)(delayed(self.out)(I,x,labels,self.otype) for I in self._best_inds)).transpose()
            return np.asarray([
                self.out(I, x, labels, self.otype) for I in self._best_inds
            ]).transpose()

    def impute_data(self, x):
        """Imputes data set containing Nan values"""
        imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
        return imp.fit_transform(x)

    def clean(self, x):
        """remove nan and inf rows from x"""
        return x[~np.any(np.isnan(x) | np.isinf(x), axis=1)]

    def clean_with_zeros(self, x):
        """ set nan and inf rows from x to zero"""
        x[~np.any(np.isnan(x) | np.isinf(x), axis=1)] = 0
        return x

    def predict(self, testing_features):
        """predict on a holdout data set."""
        # print("best_inds:",self._best_inds)
        # print("best estimator size:",self._best_estimator.coef_.shape)
        if self.clean:
            testing_features = self.impute_data(testing_features)

        if self._best_inds:
            X_transform = self.transform(testing_features)
            try:
                return self._best_estimator.predict(
                    self.transform(testing_features))
            except ValueError as detail:
                pdb.set_trace()
                print('shape of X:', testing_features.shape)
                print('shape of X_transform:', X_transform.transpose().shape)
                print('best inds:', stacks_2_eqns(self._best_inds))
                print('valid locs:', self.valid_loc(self._best_inds))
                raise ValueError(detail)
        else:
            return self._best_estimator.predict(testing_features)

    def fit_predict(self, features, labels):
        """Convenience function that fits a pipeline then predicts on the provided features

        Parameters
        ----------
        features: array-like {n_samples, n_features}
            Feature matrix
        labels: array-like {n_samples}
            List of class labels for prediction

        Returns
        ----------
        array-like: {n_samples}
            Predicted labels for the provided features

        """
        self.fit(features, labels)
        return self.predict(features)

    def score(self, testing_features, testing_labels):
        """estimates accuracy on testing set"""
        # print("test features shape:",testing_features.shape)
        # print("testing labels shape:",testing_labels.shape)
        yhat = self.predict(testing_features)
        return self.scoring_function(testing_labels, yhat)

    def export(self, output_file_name):
        """exports engineered features

        Parameters
        ----------
        output_file_name: string
            String containing the path and file name of the desired output file

        Returns
        -------
        None

        """
        if self._best_estimator is None:
            raise ValueError(
                'A model has not been optimized. Please call fit() first.')

        # Write print_model() to file
        with open(output_file_name, 'w') as output_file:
            output_file.write(self.print_model())
        # if decision tree, print tree into dot file
        if 'DecisionTree' in type(self.ml).__name__:
            export_graphviz(self._best_estimator,
                            out_file=output_file_name + '.dot',
                            feature_names=stacks_2_eqns(self._best_inds)
                            if self._best_inds else None,
                            class_names=['True', 'False'],
                            filled=False,
                            impurity=True,
                            rotate=True)

    def init_pop(self, num_features=1):
        """initializes population of features as GP stacks."""
        pop = Pop(self.population_size, num_features)
        # make programs
        if self.seed_with_ml:
            # initial population is the components of the default ml model
            if type(self.ml) == type(LassoLarsCV()):
                # add all model components with non-zero coefficients
                for i, (c, p) in enumerate(
                        it.zip_longest([c for c in self.ml.coef_ if c != 0],
                                       pop.individuals,
                                       fillvalue=None)):
                    if c is not None and p is not None:
                        p.stack = [node('x', loc=i)]
                    elif p is not None:
                        # make program if pop is bigger than model components
                        make_program(
                            p.stack, self.func_set, self.term_set,
                            np.random.randint(self.min_depth,
                                              self.max_depth + 1), self.otype)
                        p.stack = list(reversed(p.stack))
            else:  # seed with raw features
                # if list(self.ml.coef_):
                #pdb.set_trace()
                try:
                    if self.population_size < self.ml.coef_.shape[0]:
                        # seed pop with highest coefficients
                        coef_order = np.argsort(self.ml.coef_[::-1])
                        for i, (c, p) in enumerate(
                                zip(coef_order, pop.individuals)):
                            p.stack = [node('x', loc=i)]
                    else:
                        raise (AttributeError)
                except Exception:  # seed pop with raw features
                    for i, p in it.zip_longest(range(
                            self._training_features.shape[1]),
                                               pop.individuals,
                                               fillvalue=None):
                        if p is not None:
                            if i is not None:
                                p.stack = [node('x', loc=i)]
                            else:
                                make_program(
                                    p.stack, self.func_set, self.term_set,
                                    np.random.randint(self.min_depth,
                                                      self.max_depth + 1),
                                    self.otype)
                                p.stack = list(reversed(p.stack))

            # print initial population
            if self.verbosity > 2:
                print("seeded initial population:",
                      stacks_2_eqns(pop.individuals))

        else:
            for I in pop.individuals:
                depth = np.random.randint(self.min_depth, self.max_depth + 1)
                # print("hex(id(I)):",hex(id(I)))
                # depth = 2;
                # print("initial I.stack:",I.stack)

                make_program(I.stack, self.func_set, self.term_set, depth,
                             self.otype)
                # print(I.stack)
                I.stack = list(reversed(I.stack))

            # print(I.stack)

        return pop

    def print_model(self, sep='\n'):
        """prints model contained in best inds, if ml has a coefficient property.
        otherwise, prints the features generated by FEW."""
        model = ''

        if self._best_inds:
            if type(self.ml).__name__ != 'SVC' and type(
                    self.ml).__name__ != 'SVR':
                # this check is needed because SVM has a bug that throws a ValueError on attribute access:

                if hasattr(self.ml, 'coef_'):
                    if self._best_estimator.coef_.shape[0] == 1 or len(
                            self._best_estimator.coef_.shape) == 1:
                        if self._best_estimator.coef_.shape[0] == 1:
                            s = np.argsort(
                                np.abs(self._best_estimator.coef_[0]))[::-1]
                            scoef = self._best_estimator.coef_[0][s]
                        else:
                            s = np.argsort(np.abs(
                                self._best_estimator.coef_))[::-1]
                            scoef = self._best_estimator.coef_[s]
                        bi = [self._best_inds[k] for k in s]
                        model = (' +' + sep).join([
                            str(round(c, 3)) + '*' + stack_2_eqn(f)
                            for i, (f, c) in enumerate(zip(bi, scoef))
                            if round(scoef[i], 3) != 0
                        ])
                    else:
                        # more than one decision function is fit. print all.
                        for j, coef in enumerate(self._best_estimator.coef_):
                            s = np.argsort(np.abs(coef))[::-1]
                            scoef = coef[s]
                            bi = [self._best_inds[k] for k in s]
                            model += sep + 'class' + str(
                                j) + ' :' + ' + '.join([
                                    str(round(c, 3)) + '*' + stack_2_eqn(f)
                                    for i, (f, c) in enumerate(zip(bi, coef))
                                    if coef[i] != 0
                                ])
                elif hasattr(self._best_estimator, 'feature_importances_'):
                    s = np.argsort(
                        self._best_estimator.feature_importances_)[::-1]
                    sfi = self._best_estimator.feature_importances_[s]
                    bi = [self._best_inds[k] for k in s]
                    model = 'importance : feature\n'

                    model += sep.join([
                        str(round(c, 3)) + '\t:\t' + stack_2_eqn(f)
                        for i, (f, c) in enumerate(zip(bi, sfi))
                        if round(sfi[i], 3) != 0
                    ])
                else:
                    return stacks_2_eqns(self._best_inds)
            else:
                return stacks_2_eqns(self._best_inds)
        else:
            return 'original features'

        return model

    def representation(self):
        """return stacks_2_eqns output"""
        return stacks_2_eqns(self._best_inds)

    def valid_loc(self, individuals):
        """returns the indices of individuals with valid fitness."""

        return [
            index for index, i in enumerate(individuals)
            if i.fitness < self.max_fit and i.fitness >= 0
        ]

    def valid(self, individuals):
        """returns the sublist of individuals with valid fitness."""

        return [
            i for i in individuals
            if i.fitness < self.max_fit and i.fitness >= 0
        ]

    def get_params(self, deep=None):
        """Get parameters for this estimator

        This function is necessary for FEW to work as a drop-in feature constructor in,
        e.g., sklearn.model_selection.cross_val_score

        Parameters
        ----------
        deep: unused
            Only implemented to maintain interface for sklearn

        Returns
        -------
        params: mapping of string to any
            Parameter names mapped to their values
        """
        return self.params

    def get_diversity(self, X):
        """compute mean diversity of individual outputs"""
        # diversity in terms of R^2 between the population's feature outputs
        feature_correlations = np.zeros(X.shape[0] - 1)
        for i in np.arange(1, X.shape[0] - 1):
            feature_correlations[i] = max(0.0, r2_score(X[0], X[i]))
        # pdb.set_trace()
        self.diversity.append(1 - np.mean(feature_correlations))
Example #26
marked_pixel = (4, 2)

from matplotlib import gridspec
from matplotlib.patches import Rectangle

fig = plt.figure(figsize=(12, 8))
fig.suptitle('Receptive fields of the marked voxels', fontsize=25)

# GridSpec allows us to do subplots with more control of the spacing
gs1 = gridspec.GridSpec(2, 3)

# we fit the Lasso for each of the three voxels of the upper row
for i, index in enumerate([1780, 1951, 2131]):
    ax = plt.subplot(gs1[0, i])
    # we reshape the coefficients into the form of the original images
    rf = lasso.fit(stimuli, fmri_data[:, index]).coef_.reshape((10, 10))
    # add a black background
    ax.imshow(np.zeros_like(rf), vmin=0., vmax=1., cmap='gray')
    ax_im = ax.imshow(np.ma.masked_less(rf, 0.1), interpolation="nearest",
                      cmap=['Blues', 'Greens', 'Reds'][i], vmin=0., vmax=0.75)
    # add the marked pixel
    ax.add_patch(Rectangle(
        (marked_pixel[1] - .5, marked_pixel[0] - .5), 1, 1,
        facecolor='none', edgecolor='r', lw=4))
    plt.axis('off')
    plt.colorbar(ax_im, ax=ax)

# and then for the voxel at the bottom

gs1.update(left=0., right=1., wspace=0.1)
ax = plt.subplot(gs1[1, 1])
Example #27
parser.add_argument("--lat", help="Training Latitude", type=float)
parser.add_argument("--lon", help="Training Longitude", type=float)

args = parser.parse_args()

train_data = load_data.load_supervised(1950, 1985, args.lat, args.lon, 50, which='train')
test_data = load_data.load_supervised(1986, 1999, args.lat, args.lon, 50, which='test')

lasso_file = os.path.join(os.path.dirname(__file__), "models/lasso_%2.2f_%2.2f.pkl" % (args.lat, args.lon))
if os.path.exists(lasso_file):
	print("Reading fitted Lasso model from file")
	L = pickle.load(open(lasso_file, 'rb'))
else:
	print("Fitting Lasso")
	L = LassoLarsCV(cv=5)
	L.fit(train_data.X, train_data.y[:,0])
	pickle.dump(L, open(lasso_file, 'wb'))


## Print fit stats
print("Alpha", L.alpha_)
print("Training Pearson Corr:", pearsonr(train_data.y[:,0], L.predict(train_data.X)))
print("Training Spearman Corr:", spearmanr(train_data.y[:,0], L.predict(train_data.X)))

yhat = L.predict(test_data.X)
print("Pearson Corr", pearsonr(test_data.y[:,0], yhat))
print("Spearman Corr", spearmanr(test_data.y[:,0], yhat))
print("SSE", sum((yhat - test_data.y[:,0])**2))


## Compute monthly data
Example #28
def num2info(num):
    # recover (channel, filter, delay) indices from a flat coefficient index;
    # `delay` and `num_filter` are module-level globals
    block = delay * num_filter
    chan = num // block
    f = (num % block) // delay
    t = (num % block) % delay
    return (chan, f, t)


if __name__ == "__main__":
    os.chdir(os.path.dirname(__file__))
    subj = 'sub1'
    finger = 1
    with h5py.File('ECoG_data.h5', 'r+') as f:
        u = f[subj]['unmixing_matrix'][:]
        X = f[subj]['train_data'][:]
        X -= X.mean(0)
        X = X.dot(u)
        Y = f[subj]['cleaned_train_dg'][:]
    X1, y1, _ = preprocessing(X, Y[:, finger])
    ls = LassoLarsCV()
    ls.fit(X1, y1[:, 0])
    pickle.dump(ls, open('linear_model_'+subj+'_'+str(finger), 'wb'))
    channel_count = Counter([num2info(c)[0] for c in ls.coef_.nonzero()[0]])
    X2, _, yb = preprocessing(X[:, list(set(channel_count.keys()))],
                              Y[:, finger])
    ls2 = LogisticRegressionCV()
    ls2.fit(X2, yb[:, 0])
    pickle.dump(ls2, open('logistic_model_'+subj+'_'+str(finger), 'wb'))
    with h5py.File('selected_channel.h5', 'w') as f:
            f.create_dataset('selected_channel',
                             data=list(set(channel_count.keys())))
Example #29
pl.savefig(os.path.join('miyawaki', 'encoding_scores.eps'))
pl.clf()

### Compute receptive fields

from sklearn.linear_model import LassoLarsCV

lasso = LassoLarsCV(max_iter=10)

p = (4, 2)
# Mask for chosen pixel
pixmask = np.zeros((10, 10), dtype=bool)
pixmask[p] = 1

for index in [1780, 1951, 2131, 1935]:
    rf = lasso.fit(y_train, X_train[:, index]).coef_.reshape(10, 10)
    pl.figure(figsize=(8, 8))
    # Black background
    pl.imshow(np.zeros_like(rf), vmin=0., vmax=1., cmap='gray')
    pl.imshow(np.ma.masked_equal(rf, 0.),
              vmin=0.,
              vmax=0.75,
              interpolation="nearest",
              cmap=cm.bluegreen)
    plot_lines(pixmask, linewidth=6, color='r')
    pl.axis('off')
    pl.subplots_adjust(left=0., right=1., bottom=0., top=1.)
    pl.savefig(os.path.join('miyawaki', 'encoding_%d.pdf' % index))
    pl.savefig(os.path.join('miyawaki', 'encoding_%d.eps' % index))
    pl.clf()
Example #30
def lasso(X, y, value):
    regressor = LassoLarsCV(cv=10, precompute=False)
    regressor.fit(X, y)
    y_pred = regressor.predict(value)
    return y_pred
Example #31
        y_trainset_001.append(1.0)
        num_2 += 1
print(num_1, num_2)
classify_model_001 = RandomForestClassifier(n_estimators=55, random_state=1)
classify_model_001.fit(X_trainset_001, y_trainset_001)

### Build the regression model for y < 0.003
from sklearn.linear_model import LassoLarsCV, BayesianRidge
X_trainset_0003 = []
y_trainset_0003 = []
for i in range(len(y_trainset)):
    if y_trainset[i] < 0.003:
        X_trainset_0003.append(X_trainset[i])
        y_trainset_0003.append(y_trainset[i])
reg_0003 = LassoLarsCV(max_n_alphas=100, positive=True)
reg_0003.fit(X_trainset_0003, y_trainset_0003)

### Build the regression model for 0.003 <= y < 0.015
from sklearn.linear_model import LassoLarsCV
X_trainset_001 = []
y_trainset_001 = []
for i in range(len(y_trainset)):
    if y_trainset[i] >= 0.003 and y_trainset[i] < 0.015:
        X_trainset_001.append(X_trainset[i])
        y_trainset_001.append(y_trainset[i])
reg_001 = LassoLarsCV(max_n_alphas=100, cv=10)
reg_001.fit(X_trainset_001, y_trainset_001)

### Build the regression model for y > 0.01
from sklearn.linear_model import BayesianRidge, RANSACRegressor, RidgeCV, Ridge, LassoLarsCV
X_trainset_1 = []
Example #32
def lasso_train(groups, varname='valence', arrayname='norm', alpha=None,
                use_lars=True, fit_intercept=True, normalize=True,
                cv_folds=None, cv_repeats=None, skip_cv=False,
                xmin=-np.inf, xmax=np.inf, _larch=None, **kws):

    """use a list of data groups to train a Lasso/LassoLars model

    Arguments
    ---------
      groups      list of groups to use as components
      varname     name of characteristic value to model ['valence']
      arrayname   string of array name to be fit (see Note 3) ['norm']
      xmin        x-value for start of fit range [-inf]
      xmax        x-value for end of fit range [+inf]
      alpha       alpha parameter for LassoLars (See Note 5) [None]
      use_lars    bool to use LassoLars instead of Lasso [True]
      cv_folds    None or number of Cross-Validation folds (see Note 4) [None]
      cv_repeats  None or number of Cross-Validation repeats (see Note 4) [None]
      skip_cv     bool to skip doing Cross-Validation [False]

    Returns
    -------
      group with trained LassoLars model, to be used with lasso_predict

    Notes
    -----
     1.  The group members for the components must match each other
         in data content and array names.
     2.  all groups must have an attribute (scalar value) for `varname`
     3.  arrayname can be one of `norm` or `dmude`
     4.  Cross-Validation: if cv_folds is None, sqrt(len(groups)) will be used
            (rounded to an integer).  If cv_repeats is None, sqrt(len(groups))-1
            will be used (rounded).
     5.  alpha is the regularization parameter. If alpha is None it will
         be set using LassoLarsCV.
    """
    xdat, spectra = groups2matrix(groups, arrayname, xmin=xmin, xmax=xmax)
    groupnames = []
    ydat = []
    for g in groups:
        groupnames.append(getattr(g, 'filename',
                                  getattr(g, 'groupname', repr(g))))
        val = getattr(g, varname, None)
        if val is None:
            raise ValueError("group '%s' does not have attribute '%s'" % (g, varname))
        ydat.append(val)
    ydat = np.array(ydat)

    nvals = len(groups)

    kws.update(dict(fit_intercept=fit_intercept, normalize=normalize))
    creator = LassoLars if use_lars else Lasso
    model = None

    rmse_cv = None
    if not skip_cv:
        if cv_folds is None:
            cv_folds = int(round(np.sqrt(nvals)))
        if cv_repeats is None:
            cv_repeats = int(round(np.sqrt(nvals)) - 1)

        cv = RepeatedKFold(n_splits=cv_folds, n_repeats=cv_repeats)
        if alpha is None:
            lcvmod = LassoLarsCV(cv=cv, max_n_alphas=1e7,
                                 max_iter=1e7, eps=1.e-12, **kws)
            lcvmod.fit(spectra, ydat)
            alpha = lcvmod.alpha_

        model = creator(alpha=alpha, **kws)
        resid = []
        for ctrain, ctest in cv.split(range(nvals)):
            model.fit(spectra[ctrain, :], ydat[ctrain])
            ypred = model.predict(spectra[ctest, :])
            resid.extend((ypred - ydat[ctest]).tolist())
        resid = np.array(resid)
        rmse_cv = np.sqrt( (resid**2).mean() )

    if alpha is None:
        # reached only when skip_cv=True; note that plain Lasso/LassoLars,
        # unlike the CV estimators, does not expose an alpha_ attribute
        cvmod = creator(**kws)
        cvmod.fit(spectra, ydat)
        alpha = cvmod.alpha_

    if model is None:
        model = creator(alpha=alpha, **kws)

    # final fit without cross-validation
    out = model.fit(spectra, ydat)

    ypred = model.predict(spectra)

    rmse = np.sqrt(((ydat - ypred)**2).mean())

    return Group(x=xdat, spectra=spectra, ydat=ydat, ypred=ypred,
                 alpha=alpha, active=model.active_, coefs=model.coef_,
                 cv_folds=cv_folds, cv_repeats=cv_repeats,
                 rmse_cv=rmse_cv, rmse=rmse, model=model, varname=varname,
                 arrayname=arrayname, fit_intercept=fit_intercept,
                 normalize=normalize, groupnames=groupnames, keywords=kws)
Ejemplo n.º 33
0
    ivars = []
    ivars2 = []
    depvars = []
    columns = []

    for pyear in player_years:
        ivars.append([pt_projs[pyear][system] for system in proj_systems])
        depvars.append(pt_actuals[pyear]['actual'])

    for pyear in pt_projs_curr.keys():
        ivars2.append([pt_projs_curr[pyear][system] for system in proj_systems])

    x = numpy.array(ivars)
    x2 = numpy.array(ivars2)
    y = numpy.array(depvars)
    model_pt = LassoLarsCV(cv=cv_num)
    model_pt.fit(x, y)

    print("Rough PT model, to choose sample")
    for system, coef in zip(proj_systems, model_pt.coef_):
        print("%40s : %f" % (system, coef))
    print("%40s : %f" % ('intercept', model_pt.intercept_))

    sample_proj_pt_arr = model_pt.predict(x)

    curr_proj_pt_arr = model_pt.predict(x2)

    sample_proj_pt = dict(zip(player_years, sample_proj_pt_arr))
    curr_proj_pt = dict(zip(pt_projs_curr.keys(), curr_proj_pt_arr))

    models = {}
    final_projs = {}
Ejemplo n.º 34
0
    y = sc_y.fit_transform(input_data['label'].values.reshape(-1,1))
else:
    X = input_data.drop('label', axis=1).values.astype(float)
    y = input_data['label'].values

sss = ShuffleSplit(n_splits=50, train_size=0.7, test_size=0.3, random_state=63)

for i, (train, test) in enumerate(sss.split(X, y)):

    # Create the pipeline for the model
    est = LassoLarsCV()

    #fit model
    # pdb.set_trace()
    t0 = time.time()
    est.fit(X[train], y[train])
    #get fit time
    runtime = time.time()-t0
    # print("training done")
    # pdb.set_trace()
    # predict on test set

    y_true = y[test]
    y_pred = est.predict(X[test])

    if problem in scale_these:
        test_mse = mean_squared_error(sc_y.inverse_transform(y_true),
                                      sc_y.inverse_transform(y_pred))
        test_r2 = r2_score(sc_y.inverse_transform(y_true),
                           sc_y.inverse_transform(y_pred))
    else:
        test_mse = mean_squared_error(y_true, y_pred)
        test_r2 = r2_score(y_true, y_pred)
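Since this target-scaling pattern recurs, a minimal self-contained sketch of the idea (synthetic numbers, not from the benchmark): the scaler is fit on the raw targets, and both truth and predictions are mapped back through inverse_transform so the error is reported in the target's original units.

import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error

sc_y = StandardScaler()
y_raw = np.array([100., 150., 200., 250.]).reshape(-1, 1)
y_scaled = sc_y.fit_transform(y_raw)
y_pred_scaled = y_scaled + 0.1          # stand-in for model predictions
mse = mean_squared_error(sc_y.inverse_transform(y_scaled),
                         sc_y.inverse_transform(y_pred_scaled))
print(mse)                               # error back in the original units of y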
Ejemplo n.º 36
0
class LinearAll:
    """
    A repertoire of Linear Variable Selection and Prediction Models

    Parameters
    ----------
    n_jobs : int, optional
        Number of jobs to run in parallel (default 1).
        If -1 all CPUs are used. This will only provide speedup for
        n_targets > 1 and sufficient large problems
    pre_dispatch : int or string, optional
        Controls the number of jobs that get dispatched during parallel
        execution. Reducing this number can be useful to avoid an explosion
        of memory consumption when more jobs get dispatched than CPUs can
        process. This parameter can be:
            - None, in which case all the jobs are immediately created and
              spawned. Use this for lightweight and fast-running jobs, to
              avoid delays due to on-demand spawning of the jobs.
            - An int, giving the exact number of total jobs that are spawned.
            - A string, giving an expression as a function of n_jobs, as in
              '2*n_jobs'.
    refit : boolean
        Refit the best estimator with the entire dataset. If False,
        it is impossible to make predictions using this GridSearchCV
        instance after fitting.
    iid : boolean, optional
        If True, the data is assumed to be identically distributed across
        the folds, and the score is computed from all samples individually
        rather than as the mean loss across the folds. (If the number of
        data points is the same across folds, the two give the same result.)

    Attributes
    ----------
    ols_train : OLS fit on the training data (currently commented out in fit())
    pls_pre, ridge_pre : prediction models fit before variable selection
    pls_post, ridge_post : prediction models fit after variable selection
    """

    def __init__(self, cv=20, scoring='mean_squared_error',
                 n_jobs=1, refit=False, iid=False, pre_pred=True,
                 param_ridge_post=list(np.arange(1, 3, 0.1)),
                 rlasso_selection_threshold=0.5):
        # self.__name__ = '__main__'
        """
        CAUTION: we changed to __main__ so that parallelization works
        """
        self.cv = cv
        self.scoring = scoring
        self.n_jobs = n_jobs
        self.refit = refit
        self.iid = iid
        self.pre_pred = pre_pred
        self.param_ridge_post = param_ridge_post
        self.rlasso_selection_threshold = rlasso_selection_threshold

    def run_models(self, X, y, param_ridge):
        """

        Prediction Models.

        OLS, PLS, Ridge

        """

        ##################################
        ## OLS CV
        ##################################
        #ols = linear_model.LinearRegression(fit_intercept=True,
        #                                          normalize=False,
        #                                          copy_X=True)
        #ols_cv_score = cross_validation.cross_val_score(
        #        ols, X, y,
        #        cv=self.cv, scoring=self.scoring,
        #        n_jobs=self.n_jobs)
        """
        self.ols_cv_score.shape = (cv,)
        """

        ##################################
        ## PLS CV
        ##################################
        tuned_parameters = [{'n_components': range(1, 5)}]
        pls = PLSRegression()
        pls_cv = GridSearchCV(pls, tuned_parameters,
                                cv=self.cv, scoring=self.scoring,
                                n_jobs=self.n_jobs,
                                refit=self.refit, iid=self.iid)
        pls_cv.fit(X, y)


        ##################################
        ## Ridge CV
        ##################################
        tuned_parameters = [{'alpha': param_ridge}]
        ridge = linear_model.Ridge(alpha = 1)
        ridge_cv = GridSearchCV(ridge, tuned_parameters,
                                     cv=self.cv, scoring=self.scoring,
                                     n_jobs=self.n_jobs,
                                     refit=self.refit, iid=self.iid)
        ridge_cv.fit(X, y)

        return (pls_cv, ridge_cv)

    def fit(self, X, y):
        """
        Variable Selection and Prediction.

        Variable Selection Model: lasso
        Prediction Models: see self.predict()

        Parameters
        ----------
        X : numpy array or sparse matrix of shape [n_samples,n_features]
            Training data
        y : numpy array of shape [n_samples, n_targets]
            Target values

        Returns
        -------
        self : returns an instance of self.
        """


        ##################################
        ## OLS Train
        ##################################
        #ols_train = linear_model.LinearRegression(fit_intercept=True,
        #                                         normalize=False,
        #                                          copy_X=True)
        #ols_train.fit(X, y)
        #self.rss_ols_train = np.sum((ols_train.predict(X) - y) ** 2)
        """
        fit_intercept=True, center the data
        copy=True, because centering data invovles X -= X_mean

        CAUTION:
        normalization=False, otherwise involves taking squares of X, lose precision

        self.rss_ols_train.shape = (1,1)
        """

        ##################################
        ## Pre Variable Selection Predictions
        ##################################
        # NOTE: hard-coded to False, so the pre-selection models below are
        # skipped regardless of the constructor's pre_pred argument
        self.pre_pred = False
        if self.pre_pred:
            print("Computing ... ")
            param_ridge_pre = list(np.arange(1e9, 2e9, 1e8))
            self.pls_pre, self.ridge_pre = \
                self.run_models(X, y, param_ridge_pre)

        ##################################
        ## Lasso Variable Selection
        ##################################
        self.lasso_cv = LassoLarsCV(fit_intercept=True, normalize=True, precompute='auto',
                            max_iter=X.shape[1] + 1000, max_n_alphas=X.shape[1] + 1000,
                            eps=2.2204460492503131e-16, copy_X=True,
                            cv=self.cv, n_jobs=self.n_jobs)
        self.lasso_cv.fit(X, y)
        """
        normalize=True, lasso seems to be able to handle itself
        """

        if self.rlasso_selection_threshold == 0:
            self.lasso_refit = linear_model.LassoLars(alpha=self.lasso_cv.alpha_,
                                fit_intercept=True, normalize=True, precompute='auto',
                                max_iter=X.shape[1]+1000,
                                eps=2.2204460492503131e-16, copy_X=True,
                                fit_path=False)
            self.lasso_refit.fit(X, y)
            self.active = self.lasso_refit.coef_ != 0
            self.active = self.active[0,:]
            X_selected = X[:, self.active]
        else:
            self.rlasso = RandomizedLasso(alpha=self.lasso_cv.alpha_, scaling=0.5,
                                          sample_fraction=0.75, n_resampling=200,
                                          selection_threshold=self.rlasso_selection_threshold, fit_intercept=True,
                                          verbose=False, normalize=True, precompute='auto',
                                          max_iter=500, eps=2.2204460492503131e-16,
                                          random_state=None, n_jobs=self.n_jobs, pre_dispatch='3*n_jobs',)
            self.rlasso.fit(X, y)
            X_selected = self.rlasso.transform(X)

        ##################################
        ## Post Variable Selection Predictions
        ##################################
        self.pls_post, self.ridge_post = \
            self.run_models(X_selected, y, self.param_ridge_post)


        return self

    def predict(self, X_test):
        assert self.refit, "predict() requires refit=True"
        if self.pls_post.best_score_ > self.ridge_post.best_score_:
            self.best_model = self.pls_post
            print("Chosen Model: pls")
        else:
            self.best_model = self.ridge_post
            print("Chosen Model: ridge")

        if self.rlasso_selection_threshold == 0:
            X_test_selected = X_test[:, self.active]
        else:
            X_test_selected = self.rlasso.transform(X_test)
        return self.best_model.best_estimator_.predict(X_test_selected)
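A hedged usage sketch for LinearAll on synthetic data (it assumes an sklearn version old enough to provide RandomizedLasso and GridSearchCV's iid argument; predict() requires refit=True, and rlasso_selection_threshold=0 takes the plain LassoLars refit path instead of RandomizedLasso):

import numpy as np

rng = np.random.RandomState(0)
X = rng.randn(120, 30)
y = X[:, :3].sum(axis=1) + 0.1 * rng.randn(120)

est = LinearAll(cv=5, scoring='neg_mean_squared_error',  # newer scorer name
                refit=True, rlasso_selection_threshold=0)
est.fit(X[:100], y[:100])
print(est.predict(X[100:])[:5])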
from xgboost.sklearn import XGBClassifier
from xgboost import DMatrix


df = pd.read_csv("processed.csv", header=0, index_col="ID")
#df.TARGET.describe()

y = df["TARGET"].values
X = df.loc[:, "var3":"var38"].values
X_labels = df.loc[:, "var3":"var38"].columns.values

lr = LassoLarsCV()
sfm = SelectFromModel(lr, threshold=1e-3)
X_std = StandardScaler().fit_transform(X, y)
sfm.fit(X_std, y)
lr.fit(X_std, y)

#feat_imp = pd.DataFrame(lr.coef_, index=X_labels)
#feat_imp.plot(kind="bar", title="Feature Importance", use_index=False)

chosen_feat = [ f for i,f in enumerate(X_labels) if sfm.get_support()[i] ]
#chosen_feat = pickle.load(open("feat", "rb"))
print(len(chosen_feat))
chosen_feat

# kaggle forum
df.var3 = df.var3.replace(-999999,2)
y = df["TARGET"].values
X = df.loc[:, "var3":"var38"].values
X_labels = df.loc[:, "var3":"var38"].columns.values
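The example imports XGBClassifier but breaks off before using it; a hedged sketch of a plausible next step on the selected features (the hyperparameters are placeholders, not from the original):

# Hypothetical continuation: train a gradient-boosted classifier on the
# LassoLarsCV-selected columns only.
from xgboost.sklearn import XGBClassifier

X_sel = df.loc[:, chosen_feat].values
clf = XGBClassifier(n_estimators=100, max_depth=4)  # placeholder parameters
clf.fit(X_sel, y)
print(clf.predict_proba(X_sel[:5])[:, 1])  # P(TARGET=1) for the first rows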
Ejemplo n.º 38
0
        [0.607492, 3.965162], [0.358622, 3.514900], [0.147846, 3.125947],
        [0.637820, 4.094115], [0.230372, 3.476039], [0.070237, 3.210610],
        [0.067154, 3.190612], [0.925577, 4.631504], [0.717733, 4.295890],
        [0.015371, 3.085028], [0.335070, 3.448080], [0.040486, 3.167440],
        [0.212575, 3.364266], [0.617218, 3.993482], [0.541196, 3.891471]]

# Build the X and y matrices
dataMat = np.array(data)
X = dataMat[:, 0:1]  # predictor x
y = dataMat[:, 1]  # target y

# ======== Lasso regression ========
# model = Lasso(alpha=0.01)  # tune alpha to control the amount of shrinkage
# model = LassoCV()  # LassoCV tunes alpha automatically to pick the best value
model = LassoLarsCV()  # LassoLarsCV tunes alpha automatically to pick the best value
model.fit(X, y)  # fit the linear regression model
print('Coefficient matrix:\n', model.coef_)
print('Linear regression model:\n', model)
# print('Best alpha:', model.alpha_)  # only available with LassoCV / LassoLarsCV
# Predict with the fitted model
predicted = model.predict(X)

# Scatter plot: x on the horizontal axis, y on the vertical axis
plt.scatter(X, y, marker='x')
plt.plot(X, predicted, c='r')

# Label the x and y axes
plt.xlabel("x")
plt.ylabel("y")

# Show the figure
plt.show()
Ejemplo n.º 39
0
        N_SEG.append(X.shape[0])
        # parameters search range
        #param_ridge_post = list(np.arange(200,400,10))
        #param_ridge_post.append(0.5)
        param_ridge_post= np.concatenate((np.arange(0.1,1,0.1),np.arange(3,5,0.1)))
        #param_ridge_post = [330, 0.5] #p=24489
        #param_ridge_post = [3.7, 0.5] #p=303

        # fit
        from sklearn.linear_model import LassoLarsCV
        from sklearn import linear_model
        lasso_cv = LassoLarsCV(fit_intercept=True, normalize=True, precompute='auto',
                            max_iter=X.shape[1] + 1000, max_n_alphas=X.shape[1] + 1000,
                            eps=2.2204460492503131e-16, copy_X=True,
                            cv=5, n_jobs=2)
        lasso_cv.fit(X, y)
        """
        normalize=True, lasso seems to be able to handle itself
        """

        lasso_refit = linear_model.LassoLars(alpha=lasso_cv.alpha_,
                            fit_intercept=True, normalize=True, precompute='auto',
                            max_iter=X.shape[1]+1000,
                            eps=2.2204460492503131e-16, copy_X=True,
                            fit_path=False)
        lasso_refit.fit(X, y)
        active = lasso_refit.coef_
        for i, x in enumerate(active[0]):
            if x != 0 and i > main.shape[1] - 1:
                collect[j].add(i)
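One small aside on the eps=2.2204460492503131e-16 literal that recurs in these LassoLarsCV calls: it is simply double-precision machine epsilon, which can be written more readably:

# The recurring magic constant is double-precision machine epsilon.
import numpy as np
print(np.finfo(float).eps)  # 2.220446049250313e-16
# so the calls above could pass eps=np.finfo(float).eps instead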