from sklearn.linear_model import lasso_stability_path

def perform_lasso_stability_path(df, target):
    # Wrapper with fixed resampling settings: 1000 subsamples of 75% of the
    # rows, a 300-point alpha grid, and a fixed seed for reproducibility.
    return lasso_stability_path(df,
                                target,
                                scaling=0.5,
                                random_state=2703,
                                n_resampling=1000,
                                n_grid=300,
                                sample_fraction=0.75)
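A minimal driver for the wrapper above, assuming a pre-0.19 scikit-learn (where lasso_stability_path still exists; it was later deprecated and removed) and the diabetes toy dataset as input:

import numpy as np
from sklearn.datasets import load_diabetes

diabetes = load_diabetes()
alpha_grid, scores_path = perform_lasso_stability_path(diabetes.data,
                                                       diabetes.target)
# scores_path has shape (n_features, n_grid): one selection-frequency
# curve per feature along the 300-point alpha grid.
print('grid points:', alpha_grid.shape[0])
print('features ever selected:', int(np.sum(scores_path.max(axis=1) > 0)))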
Example No. 2
    def test_lasso_stability_path(self):
        diabetes = datasets.load_diabetes()
        df = pdml.ModelFrame(diabetes)

        result = df.linear_model.lasso_stability_path(random_state=self.random_state)
        expected = lm.lasso_stability_path(diabetes.data, diabetes.target,
                                           random_state=self.random_state)

        self.assertEqual(len(result), 2)
        tm.assert_numpy_array_equal(result[0], expected[0])

        self.assertIsInstance(result[1], pdml.ModelFrame)
        tm.assert_index_equal(result[1].index, df.data.columns)
        tm.assert_numpy_array_equal(result[1].values, expected[1])
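The pandas-ml accessor returns the scores already wrapped in a ModelFrame indexed by the feature names; the same per-feature view can be built by hand with plain scikit-learn (pre-0.19). A minimal sketch with generic column names, since feature names are not guaranteed on the dataset bunch:

import pandas as pd
from sklearn import datasets
from sklearn.linear_model import lasso_stability_path

diabetes = datasets.load_diabetes()
alpha_grid, scores_path = lasso_stability_path(diabetes.data, diabetes.target,
                                               random_state=0)
# One row per feature, one column per alpha grid point.
names = ['feat%d' % i for i in range(scores_path.shape[0])]
scores = pd.DataFrame(scores_path, index=names)
print(scores.max(axis=1).sort_values(ascending=False))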
Example No. 4
    def lasso_stability(X_scaled, Y, labels, X_test):
        print("Features sorted by their stability score using lasso stability paths:")
        if debug:
            print(X_scaled.shape)
            alpha_grid, scores_path = linear_model.lasso_stability_path(
                X_scaled,
                Y[:, 1],
                n_jobs=-1,
                random_state=42,
                eps=0.05,
                sample_fraction=0.50,
                verbose=debug)
            plt.figure(num=1)
            # Plot each feature's stability score as a function of
            # (alpha / alpha_max)**(1/3)
            variables = plt.plot(alpha_grid[1:]**0.333, scores_path.T[1:], 'k')
            ymin, ymax = plt.ylim()
            plt.xlabel(r'$(\alpha / \alpha_{max})^{1/3}$')
            plt.ylabel('Stability score: proportion of times selected')
            plt.title('Stability Scores Path')
            plt.axis('tight')
            plt.figure(num=2)
            # Area under each feature's stability curve, as a scalar ranking.
            auc = scores_path.dot(alpha_grid)
            auc_plot = plt.plot(auc)
            plt.xlabel(r'Features')
            plt.ylabel(r'Area under stability curve')
            plt.title('Overall stability of features')
            plt.show()
            # Keep the top third of features for wide data, the top half otherwise.
            if X_scaled.shape[1] > 500:
                k = X_scaled.shape[1] // 3
            else:
                k = X_scaled.shape[1] // 2
            print("Top %d performing features" % k)
            # Indices of the k largest AUC values (in arbitrary order).
            ind = np.argpartition(auc, -k)[-k:]
            for arg, value in sorted(zip(labels[ind], auc[ind]),
                                     key=lambda pair: pair[1],
                                     reverse=True):
                print(arg, value)
            print(ind)
            # Index directly with `ind`: np.where(ind) would return the
            # positions of non-zero entries of `ind`, which is not intended.
            labels = labels[ind]
            X_scaled = X_scaled[:, ind]
            X_test = X_test[:, ind]
            printSizes('lasso_stability end', X_scaled, Y, X_test)

        else:
            print('Debug option not set, suppress plotting')
        return (X_scaled, Y, labels, X_test)
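np.argpartition only guarantees that the k largest values occupy the last k slots, in arbitrary order; ranking them still needs a sort. A self-contained sketch of the idiom used above, on dummy data:

import numpy as np

auc = np.array([0.10, 0.42, 0.07, 0.91, 0.33])
labels = np.array(['f0', 'f1', 'f2', 'f3', 'f4'])
k = 3

# argpartition puts the k largest values in the last k slots, unordered...
ind = np.argpartition(auc, -k)[-k:]
# ...so sort those k indices by score to get a ranked list.
ind = ind[np.argsort(auc[ind])[::-1]]
for name, value in zip(labels[ind], auc[ind]):
    print(name, value)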
Example No. 5
import data
from sklearn import linear_model
import numpy as np
import pandas as pd
import os

if __name__ == "__main__" or not os.path.exists('bow_selected.txt'):
    nonzero, = np.where(data.BBC_x.loc[:,data.isbow].std() != 0)
    BBC_nonzero = data.BBC_x.loc[:,data.isbow].iloc[:,nonzero]
    alphas_grid, scores_path = linear_model.lasso_stability_path(BBC_nonzero.values, data.BBC_y)

    # Column 99 is the last grid point (smallest alpha) on the default 100-point grid.
    selected = scores_path[:, 99] != 0
    bow_selected = np.zeros(data.isbow.sum(), bool)
    bow_selected[nonzero[selected]] = True
    np.savetxt('bow_selected.txt', bow_selected, fmt='%d')

    # Borrowed from sklearn plot_sparse_recovery.py
    import pylab
    pylab.ion()
    sel = pylab.plot(alphas_grid[1:] ** .333, scores_path[selected, 1:].T, 'r')
    nsel = pylab.plot(alphas_grid[1:] ** .333, scores_path[~selected, 1:].T, 'k')
    pylab.xlabel(r'$(\alpha / \alpha_{max})^{1/3}$')
    pylab.ylabel('Stability score: proportion of lasso models where feature selected')
    pylab.axis('tight')
    pylab.legend((sel[0], nsel[0]), ('selected features', 'other features'))
    pylab.savefig('lasso_select2.pdf')
else:
    bow_selected = np.loadtxt('bow_selected.txt', dtype=bool)
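Thresholding the score at the final grid point is one selection rule; stability selection as usually described instead thresholds each feature's peak selection frequency along the whole path. A sketch of that alternative, with the 0.6 cutoff as an assumption rather than something the snippet uses:

import numpy as np

def select_stable(scores_path, threshold=0.6):
    # scores_path: (n_features, n_grid) selection frequencies along the path.
    # Keep a feature if it was selected in at least `threshold` of the
    # resampled lasso fits at some point on the alpha grid.
    return np.where(scores_path.max(axis=1) >= threshold)[0]

# e.g. stable = select_stable(scores_path); BBC_nonzero.iloc[:, stable]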

def remove_bad_words(X):
    # Keep every non-bag-of-words column, plus only the bag-of-words
    # columns that stability selection kept.
    Xnonbow = X.loc[:, ~data.isbow]
    Xbow = X.loc[:, data.isbow].loc[:, bow_selected]
    return pd.concat([Xnonbow, Xbow], axis=1)
Example No. 6
    # Keep [Wainwright2006] (26c) constant
    X[:n_relevant_features] /= np.abs(
        linalg.svdvals(X[:n_relevant_features])).max()
    X = StandardScaler().fit_transform(X.copy())

    # The output variable
    y = np.dot(X, coef)
    y /= np.std(y)
    # We scale the added noise as a function of the average correlation
    # between the design and the output variable
    y += noise_level * rng.normal(size=n_samples)
    mi = mutual_incoherence(X[:, :n_relevant_features],
                            X[:, n_relevant_features:])

    ###########################################################################
    # Plot stability selection path, using a high eps for early stopping
    # of the path, to save computation time
    alpha_grid, scores_path = lasso_stability_path(X, y, random_state=42,
                                                   eps=0.05)

    plt.figure()
    # We plot the path as a function of alpha/alpha_max to the power 1/3: the
    # power 1/3 scales the path less brutally than the log and makes it
    # possible to see the progression along the path
    hg = plt.plot(alpha_grid[1:] ** .333, scores_path[coef != 0].T[1:], 'r')
    hb = plt.plot(alpha_grid[1:] ** .333, scores_path[coef == 0].T[1:], 'k')
    ymin, ymax = plt.ylim()
    plt.xlabel(r'$(\alpha / \alpha_{max})^{1/3}$')
    plt.ylabel('Stability score: proportion of times selected')
    plt.title('Stability Scores Path - Mutual incoherence: %.1f' % mi)
    plt.axis('tight')
    plt.legend((hg[0], hb[0]), ('relevant features', 'irrelevant features'),
               loc='best')
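Both this example and the next call a mutual_incoherence helper that the snippets do not define. A sketch consistent with the scikit-learn plot_sparse_recovery example these excerpts derive from, assuming scipy's pinvh for the pseudo-inverse:

import numpy as np
from scipy.linalg import pinvh

def mutual_incoherence(X_relevant, X_irelevant):
    """Mutual incoherence, as defined by formula (26a) of [Wainwright2006]."""
    # Project the irrelevant columns onto the span of the relevant ones;
    # small values mean the lasso can tell the two groups apart.
    projector = np.dot(np.dot(X_irelevant.T, X_relevant),
                       pinvh(np.dot(X_relevant.T, X_relevant)))
    return np.max(np.abs(projector).sum(axis=1))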
Example No. 7
    X = Scaler().fit_transform(X.copy())

    # The output variable
    y = np.dot(X, coef)
    y /= np.std(y)
    # We scale the added noise as a function of the average correlation
    # between the design and the output variable
    y += noise_level * rng.normal(size=n_samples)
    mi = mutual_incoherence(X[:, :n_relevant_features],
                            X[:, n_relevant_features:])

    ###########################################################################
    # Plot stability selection path, using a high eps for early stopping
    # of the path, to save computation time
    alpha_grid, scores_path = lasso_stability_path(X,
                                                   y,
                                                   random_state=42,
                                                   eps=0.05)

    pl.figure()
    # We plot the path as a function of alpha/alpha_max to the power 1/3: the
    # power 1/3 scales the path less brutally than the log and makes it
    # possible to see the progression along the path
    hg = pl.plot(alpha_grid[1:]**.333, scores_path[coef != 0].T[1:], 'r')
    hb = pl.plot(alpha_grid[1:]**.333, scores_path[coef == 0].T[1:], 'k')
    ymin, ymax = pl.ylim()
    pl.xlabel(r'$(\alpha / \alpha_{max})^{1/3}$')
    pl.ylabel('Stability score: proportion of times selected')
    pl.title('Stability Scores Path - Mutual incoherence: %.1f' % mi)
    pl.axis('tight')
    pl.legend((hg[0], hb[0]), ('relevant features', 'irrelevant features'),
              loc='best')
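Both of the last two snippets assume X, coef, rng, n_samples, noise_level, and n_relevant_features are already defined. A data setup in the spirit of the scikit-learn example they come from; the exact sizes and noise level here are assumptions:

import numpy as np

# Toy sizes; the original example's values may differ.
n_samples, n_features, n_relevant_features = 100, 50, 10
noise_level = 0.2

rng = np.random.RandomState(42)
X = rng.randn(n_samples, n_features)

# Only the first n_relevant_features columns carry signal.
coef = np.zeros(n_features)
coef[:n_relevant_features] = rng.rand(n_relevant_features) + 1.0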
Example No. 8
# T = np.vstack((T, tmp))
#
# df = pd.DataFrame(data=T, columns=F)
# # df.dropna(inplace=True)

y_train = y_train.ravel()
y_test = y_test.ravel()

###########################################################################
# Plot stability selection path, using a high eps for early stopping
# of the path, to save computation time
with warnings.catch_warnings():
    warnings.filterwarnings("ignore", category=DeprecationWarning)
    alpha_grid, scores_path = lasso_stability_path(X_train,
                                                   y_train,
                                                   random_state=42,
                                                   eps=0.5,
                                                   verbose=1)

print(alpha_grid)
print(scores_path.shape)
# print scores_path.T[1:]

plt.figure()
# We plot the path as a function of alpha/alpha_max to the power 1/3: the
# power 1/3 scales the path less brutally than the log and makes it
# possible to see the progression along the path
# hg = plt.plot(alpha_grid[1:] ** .333, scores_path[coef != 0].T[1:], 'r')
hb = plt.plot(alpha_grid[1:]**.333, scores_path.T[1:], 'k')
ymin, ymax = plt.ylim()
plt.xlabel(r'$(\alpha / \alpha_{max})^{1/3}$')
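Without a ground-truth coef to colour the curves, the path can still be summarised per feature. A sketch ranking the X_train columns by their peak stability score, reusing scores_path from above:

import numpy as np

peak = scores_path.max(axis=1)        # best selection frequency per feature
order = np.argsort(peak)[::-1]        # descending by peak score
for i in order[:10]:
    print('feature %d: peak stability %.2f' % (i, peak[i]))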
Example No. 9
import pickle
X = np.load(open('/home/vincentli2010/Desktop/train_main.npy', 'rb'))
_, _, y, _ = pickle.load(open('features.p', 'rb'))

with warnings.catch_warnings():
    warnings.simplefilter('ignore', UserWarning)
    lars_cv = LassoLarsCV(cv=6).fit(X, y)

# Run the RandomizedLasso: we use a path going down to .1*alpha_max
# to avoid exploring the regime in which very noisy variables enter
# the model


alpha_grid, scores_path = lasso_stability_path(X, y, scaling=0.5,
                                               random_state=None,
                                               n_resampling=200, n_grid=100,
                                               sample_fraction=0.75,
                                               eps=8.8817841970012523e-16,
                                               n_jobs=-1, verbose=False)


lars_cv = LassoLarsCV(fit_intercept=True, normalize=True, precompute='auto',
                      max_iter=X.shape[1] + 1000,
                      max_n_alphas=X.shape[1] + 1000,
                      eps=2.2204460492503131e-16, copy_X=True,
                      cv=5, n_jobs=-1).fit(X, y)

alphas = np.linspace(lars_cv.alphas_[0], .1 * lars_cv.alphas_[0], 6)
clf = RandomizedLasso(alpha=lars_cv.alpha_, random_state=42, n_jobs=-1).fit(X, y)
trees = ExtraTreesRegressor(100).fit(X, y)
# Compare with F-score
F, _ = f_regression(X, y)

pl.figure()
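The snippet collects stability scores, RandomizedLasso scores, tree importances, and univariate F-scores but stops before the plot. One way to put the four rankings on a common scale (normalising each to its own maximum is an assumption, not the original figure):

import numpy as np

rankings = {
    'stability (max over path)': scores_path.max(axis=1),
    'randomized lasso': clf.scores_,
    'extra-trees importance': trees.feature_importances_,
    'F-score': F,
}
for name, s in rankings.items():
    s = np.asarray(s, dtype=float)
    s /= s.max()                                  # normalise to [0, 1]
    print(name, '-> top 5 features:', np.argsort(s)[::-1][:5])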
Example No. 10
def run_simple_model(train_x, train_y, dev_x, dev_y, test_x, test_y, model_type, out_dir=None, class_weight=None):
    from sklearn import datasets, neighbors, linear_model, svm

    totalTime = 0

    startTrainTime = time()
    logger.info("Start training...")
    if model_type == 'ARDRegression':
        model = linear_model.ARDRegression().fit(train_x, train_y)
    elif model_type == 'BayesianRidge':
        model = linear_model.BayesianRidge().fit(train_x, train_y)
    elif model_type == 'ElasticNet':
        model = linear_model.ElasticNet().fit(train_x, train_y)
    elif model_type == 'ElasticNetCV':
        model = linear_model.ElasticNetCV().fit(train_x, train_y)
    elif model_type == 'HuberRegressor':
        model = linear_model.HuberRegressor().fit(train_x, train_y)
    elif model_type == 'Lars':
        model = linear_model.Lars().fit(train_x, train_y)
    elif model_type == 'LarsCV':
        model = linear_model.LarsCV().fit(train_x, train_y)
    elif model_type == 'Lasso':
        model = linear_model.Lasso().fit(train_x, train_y)
    elif model_type == 'LassoCV':
        model = linear_model.LassoCV().fit(train_x, train_y)
    elif model_type == 'LassoLars':
        model = linear_model.LassoLars().fit(train_x, train_y)
    elif model_type == 'LassoLarsCV':
        model = linear_model.LassoLarsCV().fit(train_x, train_y)
    elif model_type == 'LassoLarsIC':
        model = linear_model.LassoLarsIC().fit(train_x, train_y)
    elif model_type == 'LinearRegression':
        model = linear_model.LinearRegression().fit(train_x, train_y)
    elif model_type == 'LogisticRegression':
        model = linear_model.LogisticRegression(class_weight=class_weight).fit(train_x, train_y)
    elif model_type == 'LogisticRegressionCV':
        model = linear_model.LogisticRegressionCV(class_weight=class_weight).fit(train_x, train_y)
    elif model_type == 'MultiTaskLasso':
        model = linear_model.MultiTaskLasso().fit(train_x, train_y)
    elif model_type == 'MultiTaskElasticNet':
        model = linear_model.MultiTaskElasticNet().fit(train_x, train_y)
    elif model_type == 'MultiTaskLassoCV':
        model = linear_model.MultiTaskLassoCV().fit(train_x, train_y)
    elif model_type == 'MultiTaskElasticNetCV':
        model = linear_model.MultiTaskElasticNetCV().fit(train_x, train_y)
    elif model_type == 'OrthogonalMatchingPursuit':
        model = linear_model.OrthogonalMatchingPursuit().fit(train_x, train_y)
    elif model_type == 'OrthogonalMatchingPursuitCV':
        model = linear_model.OrthogonalMatchingPursuitCV().fit(train_x, train_y)
    elif model_type == 'PassiveAggressiveClassifier':
        model = linear_model.PassiveAggressiveClassifier(class_weight=class_weight).fit(train_x, train_y)
    elif model_type == 'PassiveAggressiveRegressor':
        model = linear_model.PassiveAggressiveRegressor().fit(train_x, train_y)
    elif model_type == 'Perceptron':
        model = linear_model.Perceptron(class_weight=class_weight).fit(train_x, train_y)
    elif model_type == 'RandomizedLasso':
        model = linear_model.RandomizedLasso().fit(train_x, train_y)
    elif model_type == 'RandomizedLogisticRegression':
        model = linear_model.RandomizedLogisticRegression().fit(train_x, train_y)
    elif model_type == 'RANSACRegressor':
        model = linear_model.RANSACRegressor().fit(train_x, train_y)
    elif model_type == 'Ridge':
        model = linear_model.Ridge().fit(train_x, train_y)
    elif model_type == 'RidgeClassifier':
        model = linear_model.RidgeClassifier(class_weight=class_weight).fit(train_x, train_y)
    elif model_type == 'RidgeClassifierCV':
        model = linear_model.RidgeClassifierCV(class_weight=class_weight).fit(train_x, train_y)
    elif model_type == 'RidgeCV':
        model = linear_model.RidgeCV().fit(train_x, train_y)
    elif model_type == 'SGDClassifier':
        model = linear_model.SGDClassifier(class_weight=class_weight).fit(train_x, train_y)
    elif model_type == 'SGDRegressor':
        model = linear_model.SGDRegressor().fit(train_x, train_y)
    elif model_type == 'TheilSenRegressor':
        model = linear_model.TheilSenRegressor().fit(train_x, train_y)
    elif model_type in ('lars_path', 'lasso_path', 'lasso_stability_path',
                        'logistic_regression_path', 'orthogonal_mp',
                        'orthogonal_mp_gram'):
        # These linear_model names are functions, not estimator classes: they
        # take (X, y) directly and return paths/coefficients, so they have no
        # fit/predict interface and cannot be dispatched like the models above.
        raise NotImplementedError('%s is a function, not an estimator' % model_type)
    elif model_type == 'LinearSVC':
        model = svm.LinearSVC(class_weight=class_weight).fit(train_x, train_y)
    elif model_type == 'SVC':
        model = svm.SVC(class_weight=class_weight, degree=3).fit(train_x, train_y)
    else:
        raise NotImplementedError('Model not implemented')

        
    logger.info("Finished training.")
    endTrainTime = time()
    trainTime = endTrainTime - startTrainTime
    logger.info("Training time : %d seconds" % trainTime)


    logger.info("Start predicting train set...")
    train_pred_y = model.predict(train_x)
    logger.info("Finished predicting train set.")
    logger.info("Start predicting test set...")
    test_pred_y = model.predict(test_x)
    logger.info("Finished predicting test set.")
    endTestTime = time()
    testTime = endTestTime - endTrainTime
    logger.info("Testing time : %d seconds" % testTime)
    totalTime += trainTime + testTime

    train_pred_y = np.round(train_pred_y)
    test_pred_y = np.round(test_pred_y)

    np.savetxt(out_dir + '/preds/best_test_pred' + '.txt', test_pred_y, fmt='%i')

    logger.info('[TRAIN] Acc: %.3f' % (accuracy_score(train_y, train_pred_y)))
    logger.info('[TEST]  Acc: %.3f' % (accuracy_score(test_y, test_pred_y)))

    return accuracy_score(test_y, test_pred_y)
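A hypothetical driver for run_simple_model on synthetic data; it assumes the surrounding module already provides logger, time, np, and accuracy_score, and it creates the preds/ subfolder the function writes into:

import os
import numpy as np
from sklearn.model_selection import train_test_split

# Synthetic binary-classification data.
rng = np.random.RandomState(0)
X = rng.randn(400, 10)
y = (X[:, 0] + 0.5 * X[:, 1] > 0).astype(int)

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
os.makedirs('out/preds', exist_ok=True)   # the function writes predictions here
acc = run_simple_model(X_train, y_train, None, None, X_test, y_test,
                       model_type='LogisticRegression', out_dir='out')
print('test accuracy: %.3f' % acc)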