def perform_lasso_stability_path(df, target, scaling=0.5, random_state=2703,
                                 n_resampling=1000, n_grid=300,
                                 sample_fraction=0.75):
    """Run ``lasso_stability_path`` on ``df`` / ``target``.

    The resampling settings used to be hard-coded; they are now keyword
    parameters whose defaults reproduce the previous behaviour exactly, so
    existing two-argument calls are unaffected.

    Parameters
    ----------
    df, target
        Passed through unchanged as the first two positional arguments
        (presumably the design matrix and the response -- confirm against
        the ``lasso_stability_path`` in scope).
    scaling, random_state, n_resampling, n_grid, sample_fraction
        Forwarded by keyword to ``lasso_stability_path``.

    Returns
    -------
    Whatever ``lasso_stability_path`` returns.
    """
    return lasso_stability_path(
        df,
        target,
        scaling=scaling,
        random_state=random_state,
        n_resampling=n_resampling,
        n_grid=n_grid,
        sample_fraction=sample_fraction,
    )
def test_lasso_stability_path(self):
    """Compare the ModelFrame accessor with a direct ``lm`` call.

    Both calls receive the same ``random_state``; the first returned item
    must match as an array, the second must be a ``ModelFrame`` indexed by
    the data columns and carrying the same values.
    """
    diabetes = datasets.load_diabetes()
    df = pdml.ModelFrame(diabetes)

    seed = self.random_state
    result = df.linear_model.lasso_stability_path(random_state=seed)
    expected = lm.lasso_stability_path(
        diabetes.data, diabetes.target, random_state=seed)

    # The accessor must return a pair
    self.assertEqual(len(result), 2)

    # First element: compared directly as arrays
    actual_grid = result[0]
    tm.assert_numpy_array_equal(actual_grid, expected[0])

    # Second element: wrapped in a ModelFrame indexed by feature name
    actual_scores = result[1]
    self.assertIsInstance(actual_scores, pdml.ModelFrame)
    tm.assert_index_equal(actual_scores.index, df.data.columns)
    tm.assert_numpy_array_equal(actual_scores.values, expected[1])
def lasso_stability(X_scaled, Y, labels, X_test):
    """Rank features with a lasso stability path and keep the best ones.

    When the module-level ``debug`` flag is truthy the function computes the
    stability path of ``X_scaled`` against ``Y[:, 1]``, plots the path and a
    per-feature score (``scores_path.dot(alpha_grid)``), then keeps the ``k``
    highest-scoring columns, where ``k`` is a third of the feature count if
    there are more than 500 features and half of it otherwise.

    NOTE(review): the feature selection only happens when ``debug`` is set;
    otherwise the inputs are returned untouched. Confirm this coupling is
    intended.

    Parameters
    ----------
    X_scaled : 2-D array, indexed as ``[:, feature]``
    Y : 2-D array; column 1 is used as the regression target
    labels : array of feature labels, indexable by an integer array
    X_test : 2-D array with the same feature columns as ``X_scaled``

    Returns
    -------
    tuple
        ``(X_scaled, Y, labels, X_test)`` with the feature axis reduced to
        the selected columns when ``debug`` is set.
    """
    print("Features sorted by their stability score using lasso stability paths:")
    if not debug:
        print('Debug option not set, supress plotting')
        return (X_scaled, Y, labels, X_test)

    print(X_scaled.shape)
    alpha_grid, scores_path = linear_model.lasso_stability_path(
        X_scaled, Y[:, 1], n_jobs=-1, random_state=42, eps=0.05,
        sample_fraction=0.50, verbose=debug)

    plt.figure(num=1)
    # Plot as a function of alpha/alpha_max
    plt.plot(alpha_grid[1:]**0.333, scores_path.T[1:], 'k')
    plt.xlabel(r'$(\alpha / \alpha_{max})^{1/3}$')
    plt.ylabel('Stability score: proportion of times selected')
    plt.title('Stability Scores Path')
    plt.axis('tight')

    plt.figure(num=2)
    # One score per feature: the stability scores weighted by the alpha grid
    auc = scores_path.dot(alpha_grid)
    plt.plot(auc)
    plt.xlabel(r'Features')
    plt.ylabel(r'Area under stability curve')
    plt.title('Overall stability of features')
    plt.show()

    n_features = X_scaled.shape[1]
    # Floor division keeps k an integer on Python 3 as well as Python 2
    if n_features > 500:
        k = n_features // 3
    else:
        k = n_features // 2
    print("Top %d performing features" % (k))

    # Integer column indices of the k largest scores (unordered)
    ind = np.argpartition(auc, -k)[-k:]
    ranked = sorted(zip(labels[ind], auc[ind]),
                    key=lambda pair: pair[1], reverse=True)
    for arg, value in ranked:
        print("%s %s" % (arg, value))
    print(ind)

    # ``ind`` already holds column indices. The previous ``np.where(ind)``
    # returned the positions of its non-zero entries instead, which selected
    # the leading columns rather than the top-scoring ones.
    labels = labels[ind]
    X_scaled = X_scaled[:, ind]
    X_test = X_test[:, ind]
    printSizes('lasso_stability end', X_scaled, Y, X_test)
    return (X_scaled, Y, labels, X_test)
# Select columns with a lasso stability path and cache the resulting boolean
# mask in 'bow_selected.txt'. The selection is recomputed when this file is
# run as a script or when the cache file does not exist yet; otherwise the
# cached mask is loaded.
# NOTE(review): ``data.isbow`` presumably flags bag-of-words columns --
# confirm against the ``data`` module.
import data
from sklearn import linear_model
import numpy as np
import pandas as pd
import os
if __name__ == "__main__" or not os.path.exists('bow_selected.txt'):
    # Positions (within the isbow columns) whose standard deviation is non-zero
    nonzero, = np.where(data.BBC_x.loc[:,data.isbow].std() != 0)
    BBC_nonzero = data.BBC_x.loc[:,data.isbow].iloc[:,nonzero]
    alphas_grid, scores_path = linear_model.lasso_stability_path(BBC_nonzero.values, data.BBC_y)
    # Keep features whose score at grid index 99 is non-zero
    # NOTE(review): 99 is the last grid point only if the path has 100 points -- confirm
    selected = scores_path[:,99] != 0
    # Mask over all isbow columns; map the selection back through ``nonzero``
    bow_selected = np.zeros(data.isbow.sum(), bool)
    bow_selected[nonzero[selected]] = True
    # Written as integers (0/1), one per line
    np.savetxt('bow_selected.txt', bow_selected, fmt='%d')
    # Borrowed from sklearn plot_sparse_recovery.py
    import pylab
    pylab.ion()
    # Selected features in red, the others in black; the first grid point is skipped
    sel = pylab.plot(alphas_grid[1:] ** .333, scores_path[selected,1:].T, 'r')
    nsel= pylab.plot(alphas_grid[1:] ** .333, scores_path[~selected,1:].T, 'k')
    pylab.xlabel(r'$(\alpha / \alpha_{max})^{1/3}$')
    pylab.ylabel('Stability score: proportion of lasso models where feature selected')
    pylab.axis('tight')
    pylab.legend((sel[0], nsel[0]), ('selected features', 'other features'))
    pylab.savefig('lasso_select2.pdf')
else:
    # Reuse the mask cached by an earlier run
    bow_selected = np.loadtxt('bow_selected.txt', dtype=bool)
def remove_bad_words(X):
    Xnonbow = X.loc[:, ~data.isbow]
linalg.svdvals(X[:n_relevant_features])).max() X = StandardScaler().fit_transform(X.copy()) # The output variable y = np.dot(X, coef) y /= np.std(y) # We scale the added noise as a function of the average correlation # between the design and the output variable y += noise_level * rng.normal(size=n_samples) mi = mutual_incoherence(X[:, :n_relevant_features], X[:, n_relevant_features:]) ########################################################################### # Plot stability selection path, using a high eps for early stopping # of the path, to save computation time alpha_grid, scores_path = lasso_stability_path(X, y, random_state=42, eps=0.05) plt.figure() # We plot the path as a function of alpha/alpha_max to the power 1/3: the # power 1/3 scales the path less brutally than the log, and enables to # see the progression along the path hg = plt.plot(alpha_grid[1:] ** .333, scores_path[coef != 0].T[1:], 'r') hb = plt.plot(alpha_grid[1:] ** .333, scores_path[coef == 0].T[1:], 'k') ymin, ymax = plt.ylim() plt.xlabel(r'$(\alpha / \alpha_{max})^{1/3}$') plt.ylabel('Stability score: proportion of times selected') plt.title('Stability Scores Path - Mutual incoherence: %.1f' % mi) plt.axis('tight') plt.legend((hg[0], hb[0]), ('relevant features', 'irrelevant features'), loc='best')
# Work on a standardized copy of the design matrix
X = Scaler().fit_transform(X.copy())

# Response: projection of the design on the coefficients, rescaled to unit
# standard deviation, plus noise drawn from ``rng`` and scaled by
# ``noise_level``
y = np.dot(X, coef)
y /= np.std(y)
y += noise_level * rng.normal(size=n_samples)

# Mutual incoherence between the first ``n_relevant_features`` columns and
# the remaining ones
relevant_columns = X[:, :n_relevant_features]
irrelevant_columns = X[:, n_relevant_features:]
mi = mutual_incoherence(relevant_columns, irrelevant_columns)

###########################################################################
# Stability selection path; the high eps stops the path early, which keeps
# the computation cheap
alpha_grid, scores_path = lasso_stability_path(X, y, random_state=42,
                                               eps=0.05)

pl.figure()
# The x axis is (alpha / alpha_max) raised to roughly 1/3: this spreads the
# path less brutally than a log scale and keeps its progression visible.
# The first grid point is skipped.
alpha_axis = alpha_grid[1:]**.333
hg = pl.plot(alpha_axis, scores_path[coef != 0].T[1:], 'r')
hb = pl.plot(alpha_axis, scores_path[coef == 0].T[1:], 'k')
ymin, ymax = pl.ylim()
pl.xlabel(r'$(\alpha / \alpha_{max})^{1/3}$')
pl.ylabel('Stability score: proportion of times selected')
title_text = 'Stability Scores Path - Mutual incoherence: %.1f' % mi
pl.title(title_text)
pl.axis('tight')
legend_handles = (hg[0], hb[0])
legend_labels = ('relevant features', 'irrelevant features')
pl.legend(legend_handles, legend_labels, loc='best')
# T = np.vstack((T, tmp))
#
# df = pd.DataFrame(data=T, columns=F)
#
# df.dropna(inplace=True)

# Flatten the targets to 1-D
y_train = y_train.ravel()
y_test = y_test.ravel()

###########################################################################
# Plot stability selection path, using a high eps for early stopping
# of the path, to save computation time
with warnings.catch_warnings():
    # Silence DeprecationWarning only while the path is computed
    warnings.filterwarnings("ignore", category=DeprecationWarning)
    alpha_grid, scores_path = lasso_stability_path(X_train, y_train,
                                                   random_state=42, eps=0.5,
                                                   verbose=1)

# Single-argument print() behaves the same on Python 2 and Python 3
print(alpha_grid)
print(scores_path.shape)
# print scores_path.T[1:]

plt.figure()
# We plot the path as a function of alpha/alpha_max to the power 1/3: the
# power 1/3 scales the path less brutally than the log, and makes it
# possible to see the progression along the path
# hg = plt.plot(alpha_grid[1:] ** .333, scores_path[coef != 0].T[1:], 'r')
# 'k' draws every feature in black. The previous 'k_feat' is not a valid
# matplotlib format string and made plt.plot raise.
hb = plt.plot(alpha_grid[1:]**.333, scores_path.T[1:], 'k')
ymin, ymax = plt.ylim()
plt.xlabel(r'$(\alpha / \alpha_{max})^{1/3}$')
import pickle

# Load the design matrix; the context manager closes the handle that the
# bare open() call used to leak
with open('/home/vincentli2010/Desktop/train_main.npy', 'rb') as design_file:
    X = np.load(design_file)

# Only the third item of the pickled tuple (the target) is used here
# NOTE(review): pickle.load can execute arbitrary code -- only use it on
# trusted files
with open('features.p', 'rb') as features_file:
    _, _, y, _ = pickle.load(features_file)

with warnings.catch_warnings():
    warnings.simplefilter('ignore', UserWarning)
    # NOTE(review): this fit is replaced by the second LassoLarsCV fit below
    # before ``lars_cv`` is read -- confirm whether it is still needed
    lars_cv = LassoLarsCV(cv=6).fit(X, y)

# Run the RandomizedLasso: we use a path going down to .1*alpha_max
# to avoid exploring the regime in which very noisy variables enter
# the model
alpha_grid, scores_path = lasso_stability_path(X, y, scaling=0.5, random_state=None, n_resampling=200, n_grid=100, sample_fraction=0.75, eps=8.8817841970012523e-16, n_jobs=-1, verbose=False)

lars_cv = LassoLarsCV(fit_intercept=True, normalize=True, precompute='auto', max_iter=X.shape[1]+1000, max_n_alphas=X.shape[1]+1000, eps= 2.2204460492503131e-16,copy_X=True, cv=5, n_jobs=-1).fit(X,y)

# Six values from the first entry of lars_cv.alphas_ down to a tenth of it
alphas = np.linspace(lars_cv.alphas_[0], .1 * lars_cv.alphas_[0], 6)
clf = RandomizedLasso(alpha=lars_cv.alpha_, random_state=42, n_jobs=-1).fit(X, y)
trees = ExtraTreesRegressor(100).fit(X, y)

# Compare with F-score
F, _ = f_regression(X, y)
pl.figure()
def run_simple_model(train_x, train_y, dev_x, dev_y, test_x, test_y, model_type, out_dir=None, class_weight=None):
    """Fit the scikit-learn model named by ``model_type`` and score it.

    The model is fitted on the training set, used to predict the training
    and test sets, and the rounded predictions are scored with
    ``accuracy_score``. Rounded test predictions are written to
    ``<out_dir>/preds/best_test_pred.txt`` when ``out_dir`` is given.

    Parameters
    ----------
    train_x, train_y : training features and targets
    dev_x, dev_y : accepted for interface compatibility; not used
    test_x, test_y : test features and targets
    model_type : str
        Name of a ``sklearn.linear_model`` estimator, or ``'LinearSVC'`` /
        ``'SVC'`` from ``sklearn.svm``.
    out_dir : str or None
        Output directory. With ``None`` the prediction file is skipped
        (this previously failed with a ``TypeError`` after training).
    class_weight
        Forwarded to the classifiers that were built with it before.

    Returns
    -------
    float
        Accuracy of the rounded test predictions.

    Raises
    ------
    NotImplementedError
        If ``model_type`` is unknown, or names one of the ``linear_model``
        path/solver functions, which are plain functions rather than
        estimators and could never be fitted by this routine.
    """
    from sklearn import linear_model, svm

    # linear_model estimators constructed with their default arguments
    default_models = frozenset([
        'ARDRegression', 'BayesianRidge', 'ElasticNet', 'ElasticNetCV',
        'HuberRegressor', 'Lars', 'LarsCV', 'Lasso', 'LassoCV', 'LassoLars',
        'LassoLarsCV', 'LassoLarsIC', 'LinearRegression', 'MultiTaskLasso',
        'MultiTaskElasticNet', 'MultiTaskLassoCV', 'MultiTaskElasticNetCV',
        'OrthogonalMatchingPursuit', 'OrthogonalMatchingPursuitCV',
        'PassiveAggressiveRegressor',
        # NOTE(review): the two Randomized* models are feature selectors and
        # presumably lack predict(), so they fail further down -- confirm
        'RandomizedLasso', 'RandomizedLogisticRegression',
        'RANSACRegressor', 'Ridge', 'RidgeCV', 'SGDRegressor',
        'TheilSenRegressor',
    ])
    # linear_model classifiers that receive ``class_weight``
    weighted_models = frozenset([
        'LogisticRegression', 'LogisticRegressionCV',
        'PassiveAggressiveClassifier', 'Perceptron', 'RidgeClassifier',
        'RidgeClassifierCV', 'SGDClassifier',
    ])
    # Functions, not estimators: calling them without data and chaining
    # .fit() always raised a TypeError
    function_names = frozenset([
        'lars_path', 'lasso_path', 'lasso_stability_path',
        'logistic_regression_path', 'orthogonal_mp', 'orthogonal_mp_gram',
    ])

    start_train_time = time()
    logger.info("Start training...")

    # getattr resolves only the requested name, so a scikit-learn version
    # that lacks some other estimator does not break this lookup
    if model_type in default_models:
        estimator = getattr(linear_model, model_type)()
    elif model_type in weighted_models:
        estimator = getattr(linear_model, model_type)(class_weight=class_weight)
    elif model_type == 'LinearSVC':
        estimator = svm.LinearSVC(class_weight=class_weight)
    elif model_type == 'SVC':
        estimator = svm.SVC(class_weight=class_weight, degree=3)
    elif model_type in function_names:
        raise NotImplementedError(
            "'%s' is a function, not an estimator with fit/predict"
            % model_type)
    else:
        raise NotImplementedError('Model not implemented')
    model = estimator.fit(train_x, train_y)

    logger.info("Finished training.")
    end_train_time = time()
    train_time = end_train_time - start_train_time
    logger.info("Training time : %d seconds", train_time)

    logger.info("Start predicting train set...")
    train_pred_y = model.predict(train_x)
    logger.info("Finished predicting train set.")
    logger.info("Start predicting test set...")
    test_pred_y = model.predict(test_x)
    logger.info("Finished predicting test set.")
    end_test_time = time()
    # Covers prediction on both the train and the test set
    test_time = end_test_time - end_train_time
    logger.info("Testing time : %d seconds", test_time)

    # Round so that regressor outputs can be scored as class labels
    train_pred_y = np.round(train_pred_y)
    test_pred_y = np.round(test_pred_y)

    if out_dir is not None:
        np.savetxt(out_dir + '/preds/best_test_pred' + '.txt', test_pred_y, fmt='%i')

    test_accuracy = accuracy_score(test_y, test_pred_y)
    logger.info('[TRAIN] Acc: %.3f', accuracy_score(train_y, train_pred_y))
    logger.info('[TEST] Acc: %.3f', test_accuracy)

    return test_accuracy