def plot_stable_features(X_train,y_train,featnames,**kwargs):
    """Plot RandomizedLasso selection frequencies as a sorted bar chart.

    Args:
        X_train: training design matrix passed to the sklearn estimators.
        y_train: training target passed to the sklearn estimators.
        featnames: feature labels; indexed with an integer index array, so
            it must support NumPy-style fancy indexing.
        **kwargs: ``n_resampling`` (default 200) and ``n_jobs`` (default -1)
            are consumed; any other keyword is ignored.

    The chart is drawn on the current ``pl`` figure; nothing is returned.
    """
    from sklearn.linear_model import LassoLarsCV,RandomizedLasso

    n_resampling = kwargs.pop('n_resampling',200)
    n_jobs = kwargs.pop('n_jobs',-1)

    with warnings.catch_warnings():
        warnings.simplefilter('ignore', UserWarning)
        # Estimate the alpha grid via cross-validation: six values from the
        # first LassoLarsCV alpha down to a tenth of it.
        lars_cv = LassoLarsCV(cv=6,n_jobs=n_jobs).fit(X_train,y_train)
        first_alpha = lars_cv.alphas_[0]
        alphas = np.linspace(first_alpha, .1 * first_alpha, 6)
        clf = RandomizedLasso(alpha=alphas, random_state=42, n_jobs=n_jobs,
                              n_resampling=n_resampling)
        clf.fit(X_train,y_train)

    importances = clf.scores_
    indices = np.argsort(importances)[::-1]  # highest score first
    n_features = len(featnames)

    pl.bar(range(n_features), importances[indices], color="r", align="center")
    pl.xticks(np.arange(n_features)+0.5,featnames[indices],
              rotation=45,horizontalalignment='right')
    pl.xlim(-0.5,n_features-0.5)
    pl.subplots_adjust(bottom=0.2)
    pl.ylim(0,np.max(importances)*1.01)
    # The literal percent sign must be doubled: a bare "%)" in a %-format
    # string raises "ValueError: unsupported format character ')'".
    pl.ylabel('Selection frequency (%%) for %d resamplings '%n_resampling)
    pl.title("Stability Selection: Selection Frequencies")
def plot_stable_features(X_train, y_train, featnames, **kwargs):
    """Draw a bar chart of stability-selection scores, best feature first.

    Args:
        X_train: training design matrix.
        y_train: training target.
        featnames: feature labels; must accept an integer index array
            (it is indexed with the argsort result).
        **kwargs: optional ``n_resampling`` (default 200) and ``n_jobs``
            (default -1); other keywords are ignored.

    Draws on the current ``pl`` figure and returns ``None``.
    """
    from sklearn.linear_model import LassoLarsCV, RandomizedLasso

    n_resampling = kwargs.pop('n_resampling', 200)
    n_jobs = kwargs.pop('n_jobs', -1)

    with warnings.catch_warnings():
        warnings.simplefilter('ignore', UserWarning)
        # estimate alphas via cross-validation
        lars_cv = LassoLarsCV(cv=6, n_jobs=n_jobs).fit(X_train, y_train)
        alphas = np.linspace(lars_cv.alphas_[0], .1 * lars_cv.alphas_[0], 6)
        clf = RandomizedLasso(alpha=alphas,
                              random_state=42,
                              n_jobs=n_jobs,
                              n_resampling=n_resampling)
        clf.fit(X_train, y_train)

    importances = clf.scores_
    indices = np.argsort(importances)[::-1]
    positions = np.arange(len(featnames))

    pl.bar(range(len(featnames)), importances[indices], color="r",
           align="center")
    pl.xticks(positions + 0.5, featnames[indices], rotation=45,
              horizontalalignment='right')
    pl.xlim(-0.5, len(featnames) - 0.5)
    pl.subplots_adjust(bottom=0.2)
    pl.ylim(0, np.max(importances) * 1.01)
    # "%%" renders a literal percent sign; the unescaped "(%)" made the
    # %-formatting below fail with ValueError before the label was set.
    pl.ylabel('Selection frequency (%%) for %d resamplings ' % n_resampling)
    pl.title("Stability Selection: Selection Frequencies")
def compute_randomizedlasso(F_train, X_train, config, out_dir, feat_names):
    """Run RandomizedLasso feature selection separately inside each cluster.

    For every distinct value of ``X_train.clusters`` the rows whose ``DX_bl``
    is ``'CN'`` or ``'AD'`` are inner-joined with ``F_train`` on ``RID``; a
    RandomizedLasso is then fitted on the ``feat_names`` columns against a
    target of +1.0 for ``'AD'`` and -1.0 otherwise.

    ``config`` and ``out_dir`` are accepted but not used by this function.

    Returns:
        tuple: ``(normalize(scores), clusters)`` where ``scores`` holds one
        ``scores_`` vector per cluster and ``clusters`` the matching ids.
    """
    per_cluster_scores = []
    cluster_ids = []

    for cluster_id in X_train.clusters.unique():
        members = X_train[X_train.clusters == cluster_id]
        subjects = members[["RID", "DX_bl"]]

        # Keep only the two diagnosis groups used as classes.
        is_control = subjects.DX_bl == 'CN'
        is_patient = subjects.DX_bl == 'AD'
        subjects = subjects[(is_control | is_patient)]

        joined = subjects.merge(F_train, 'inner', on='RID')
        X = np.array(joined[feat_names])
        diagnoses = np.array(joined["DX_bl"])
        Y = np.array([1.0 if dx == 'AD' else -1.0 for dx in diagnoses])

        selector = RandomizedLasso(alpha='bic',
                                   n_resampling=500,
                                   fit_intercept=False,
                                   sample_fraction=0.85,
                                   scaling=0.1,
                                   random_state=1714)
        selector.fit(X, Y)

        per_cluster_scores.append(selector.scores_)
        cluster_ids.append(cluster_id)

    return normalize(per_cluster_scores), cluster_ids
def run_rndlasso(X, y, alpha, n_resampling=500, sample_fraction=0.1, n_threads=1):
    """Fit sklearn's RandomizedLasso on a logit-transformed response.

    Args:
        X (np.array): scaled X.
        y (pd.df): response table; the ``nMut``, ``length`` and ``N``
            columns are read here.
        alpha (float): parameter trained from lassoCV.
        n_resampling (int): number of times for resampling.
        sample_fraction (float): fraction of data to use at each resampling.
        n_threads (int): forwarded to RandomizedLasso as ``n_jobs``.

    Returns:
        np.array: feature importance scores (``scores_`` of the fitted model).
    """
    template = 'Implementing Randomized Lasso with alpha={}, n_resampling={} and sample_fraction={}'
    logger.info(template.format(alpha, n_resampling, sample_fraction))

    # Logit response; the 0.5 offset presumably keeps the ratio away from
    # zero so the logit stays finite -- TODO confirm with the data owner.
    rate = (y.nMut + 0.5) / (y.length * y.N)
    y_logit = logit(rate)

    model = RandomizedLasso(alpha=alpha,
                            n_resampling=n_resampling,
                            sample_fraction=sample_fraction,
                            selection_threshold=1e-3,
                            max_iter=3000,
                            normalize=False,
                            n_jobs=n_threads)
    fitted = model.fit(X, y_logit)
    return fitted.scores_
def lasso_fs(X, y):
    """Fit a default RandomizedLasso and print features ranked by score.

    Features are identified by their column index in ``X``. Each printed
    entry is a ``(score rounded to 4 places, column index)`` tuple, highest
    score first. Nothing is returned.
    """
    rlasso = RandomizedLasso()
    rlasso.fit(X, y)
    classes = range(0, X.shape[1])
    ranked = sorted(
        ((round(score, 4), idx) for score, idx in zip(rlasso.scores_, classes)),
        reverse=True)
    # print() with a single argument behaves the same on Python 2 and 3;
    # the former print statements were a SyntaxError on Python 3.
    print("Features sorted by their score:")
    print(ranked)
def featureSelection(train_x, train_y):
    """Compare RandomizedLasso support with RFECV rankings and plot CV scores.

    Fits a default RandomizedLasso and an RFECV (LinearSVC, 5-fold, accuracy
    scoring) on the same data, prints the features that both methods keep,
    and plots the RFECV cross-validation score against the number of
    selected features. Feature labels are read from the module-level
    ``feats`` sequence. Nothing is returned.
    """
    # Create the RFE object and compute a cross-validated score.
    svc = LinearSVC(C=1, class_weight='balanced')

    lasso = RandomizedLasso()
    lasso.fit(train_x, train_y)

    # The "accuracy" scoring is proportional to the number of correct
    # classifications
    rfecv = RFECV(estimator=svc, step=1, cv=5, scoring='accuracy')
    rfecv.fit(train_x, train_y)
    print("Optimal number of features : %d" % rfecv.n_features_)

    rankings = rfecv.ranking_
    lasso_ranks = lasso.get_support()

    lassoFeats = []        # kept by RandomizedLasso
    recursiveFeats = []    # ranked 1 by RFECV
    shouldUseFeats = []    # kept by both
    for i, rank in enumerate(rankings):
        if lasso_ranks[i]:
            lassoFeats.append(feats[i])
        if rank == 1:
            recursiveFeats.append(feats[i])
            if lasso_ranks[i]:
                shouldUseFeats.append(feats[i])

    # NOTE(review): keyboard() looks like an interactive debugging hook
    # defined elsewhere; kept so the three lists above can still be
    # inspected, but it probably should not ship -- confirm.
    keyboard()
    # Single-argument print() works on both Python 2 and 3.
    print('Should use ' + ', '.join(shouldUseFeats))

    # Plot number of features VS. cross-validation scores
    plt.figure()
    plt.xlabel("Number of features selected")
    plt.ylabel("Cross validation score (nb of correct classifications)")
    plt.plot(range(1, len(rfecv.grid_scores_) + 1), rfecv.grid_scores_)
    plt.show()
def _stab_select(x_train, y_train):
    '''Perform stability selection and print one "score<TAB>column" line
    per feature, highest score first.

    ``x_train`` must expose ``.columns`` (the labels paired with the scores).
    '''
    rlasso = RandomizedLasso(alpha=0.025)
    rlasso.fit(x_train, y_train)

    for vals in reversed(sorted(zip(rlasso.scores_, x_train.columns))):
        # Function-call form of print is valid on Python 2 and 3 alike.
        print('\t'.join([str(val) for val in vals]))
def stability_randomizedlasso(X,y,**rl_parameters):
    """
    Score predictors with `scikit-learn`_ RandomizedLasso stability selection.

    Args:
        X (pandas.DataFrame): Transcription factor gene expressions where
            rows are experimental conditions and columns are transcription
            factors
        y (pandas.Series): Target gene expression vector where rows are
            experimental conditions
        **rl_parameters: Named parameters forwarded to sklearn RandomizedLasso

    Returns:
        numpy.array: co-regulation scores.
        The i-th element of the score array is the absolute value of the
        score assigned by RandomizedLasso to the regulatory relationship
        between the target gene and transcription factor i.

    Examples:
        >>> import pandas as pd
        >>> import numpy as np
        >>> np.random.seed(0)
        >>> tfs = pd.DataFrame(np.random.randn(5,3),
                               index =["c1","c2","c3","c4","c5"],
                               columns=["tf1","tf2","tf3"])
        >>> tg = pd.Series(np.random.randn(5),index=["c1","c2","c3","c4","c5"])
        >>> scores = stability_randomizedlasso(tfs,tg)
        >>> scores
        array([0.11 , 0.17 , 0.085])
    """
    selector = RandomizedLasso(**rl_parameters)
    selector.fit(X,y)
    return np.abs(selector.scores_)
def feature_scoring(X, Y):
    """Rank every feature with several methods and print a comparison table.

    Methods: linear regression, ridge, lasso, RandomizedLasso stability
    selection, RFE, random forest, univariate F-test and MIC. Each method's
    scores are converted with the file's ``rank_to_dict`` helper; a "Mean"
    column averages all methods per feature.

    Args:
        X: DataFrame of predictors (``.values`` is taken). Features are
            labelled ``x1 .. xN`` by column position.
        Y: target vector.

    Prints the table; returns ``None``.
    """
    X = X.values[:, :]
    # Derive the labels from the data instead of assuming exactly 36
    # columns; for 36-column input this yields the same x1..x36 names.
    names = ["x%s" % i for i in range(1, X.shape[1] + 1)]
    ranks = {}

    lr = LinearRegression(normalize=True)
    lr.fit(X, Y)
    ranks["Linear reg"] = rank_to_dict(np.abs(lr.coef_), names)

    ridge = Ridge(alpha=7)
    ridge.fit(X, Y)
    ranks["Ridge"] = rank_to_dict(np.abs(ridge.coef_), names)

    lasso = Lasso(alpha=.05)
    lasso.fit(X, Y)
    ranks["Lasso"] = rank_to_dict(np.abs(lasso.coef_), names)

    rlasso = RandomizedLasso(alpha=0.04)
    rlasso.fit(X, Y)
    ranks["Stability"] = rank_to_dict(np.abs(rlasso.scores_), names)

    # stop the search when 5 features are left (they will get equal scores)
    rfe = RFE(lr, n_features_to_select=5)
    rfe.fit(X, Y)
    # Materialise the map: on Python 3 it is a one-shot iterator, not a
    # sequence, so passing it on directly hands the helper no usable data.
    ranks["RFE"] = rank_to_dict(list(map(float, rfe.ranking_)), names, order=-1)

    rf = RandomForestRegressor()
    rf.fit(X, Y)
    ranks["RF"] = rank_to_dict(rf.feature_importances_, names)

    f, pval = f_regression(X, Y, center=True)
    ranks["Corr."] = rank_to_dict(f, names)

    print('startMIC')
    mine = MINE()
    mic_scores = []
    for i in range(X.shape[1]):
        mine.compute_score(X[:, i], Y)
        mic_scores.append(mine.mic())
        print(i)  # progress indicator: one line per processed column
    ranks["MIC"] = rank_to_dict(mic_scores, names)
    print('finish MIc')

    # Per-feature mean over all methods collected so far.
    r = {}
    for name in names:
        r[name] = round(
            np.mean([ranks[method][name] for method in ranks]), 2)

    methods = sorted(ranks)
    ranks["Mean"] = r
    methods.append("Mean")

    print("\t%s" % "\t".join(methods))
    for name in names:
        print("%s\t%s" % (name, "\t".join(
            map(str, [ranks[method][name] for method in methods]))))
def randomLassoFeatSelect(data, target):
    """Fit RandomizedLasso(alpha=0.1) and print columns ranked by score.

    Args:
        data: DataFrame of predictors; its column labels name the features.
        target: response passed to ``fit``.

    Prints a list of ``(score rounded to 4 places, column)`` tuples, highest
    score first. Returns ``None``.
    """
    column_names = list(data.columns.values)
    rlasso = RandomizedLasso(alpha=0.1)
    rlasso.fit(data, target)

    ranked = sorted(
        ((round(score, 4), name)
         for score, name in zip(rlasso.scores_, column_names)),
        reverse=True)
    # print() calls replace Python 2 print statements (SyntaxError on 3).
    print("Features sorted by their score:")
    print(ranked)
def run_rlasso(ranks):
    """Add the RandomizedLasso stability ranking to ``ranks`` and return it.

    Reads the module-level ``X``, ``Y`` and ``colnames`` and stores the
    result under the key ``"rlasso/Stability"``. ``ranks`` is modified in
    place and also returned.
    """
    print('>> run rlasso/Stability')

    # Selection Stability method with Randomized Lasso
    stability_model = RandomizedLasso(alpha=0.04, verbose=3)
    stability_model.fit(X, Y)
    stability_scores = np.abs(stability_model.scores_)
    ranks["rlasso/Stability"] = ranking(stability_scores, colnames)

    print('finished')
    print_memory()
    return ranks
def stability_selection(option, opt, value, parser):
    """Print features ranked by default RandomizedLasso stability scores.

    The four parameters are accepted but not used; the data comes from the
    module-level ``X``, ``y`` and ``feature_names``.
    """
    rlasso = RandomizedLasso()
    rlasso.fit(X, y)

    ranked = sorted(
        ((round(score, 4), name)
         for score, name in zip(rlasso.scores_, feature_names)),
        reverse=True)
    # print() form is valid on Python 2 and 3; the statement form was not.
    print("\nStability Selection: Features sorted by rank:")
    pprint(ranked)
    print("")  # trailing blank line, as before
def select_feature_importance():
    """Print features ranked by RandomizedLasso(alpha=0.025) scores.

    Works on module-level state: column labels come from ``dataset`` with
    the 'Max overdue' column dropped, while the model is fitted on
    ``scaler.fit_transform(data)`` against ``target``.
    """
    # NOTE(review): labels are taken from `dataset` but the model is fitted
    # on `data`; confirm both hold the same columns in the same order.
    data4columns = dataset.drop(['Max overdue'], axis=1)
    column_names = np.asarray(data4columns.columns.values)

    lasso = RandomizedLasso(alpha=0.025)
    scaled_data = scaler.fit_transform(data)
    lasso.fit(scaled_data, target)

    ranked = sorted(
        ((round(score, 4), name)
         for score, name in zip(lasso.scores_, column_names)),
        reverse=True)
    # print() replaces the Python 2 print statement.
    print(ranked)
def linear_regression_weight(df, label, black_list=None):
    """Return a ``{column: stability score}`` dict from RandomizedLasso.

    Args:
        df: DataFrame of candidate predictors.
        label: response passed to ``fit``.
        black_list: column labels to drop before fitting; ``None`` (the
            default) drops nothing.

    Returns:
        dict: maps each remaining column of ``df`` to its ``scores_`` value.
    """
    # Stability selection is a relatively new method that combines
    # subsampling with a selection algorithm (regression, SVM or similar).
    # The idea is to run the selection algorithm repeatedly on different
    # subsets of the data and of the features and aggregate the results,
    # e.g. by counting how often a feature was judged important (times
    # selected divided by times its subset was tested). Ideally important
    # features score close to 100%, weaker ones get a non-zero score and
    # useless ones score close to 0.

    # A None sentinel replaces the former mutable default list.
    excluded = [] if black_list is None else black_list
    X = df.drop(excluded, axis=1)

    rlasso = RandomizedLasso(alpha=0.025)
    rlasso.fit(X.values, label)
    return dict(zip(X.columns, rlasso.scores_))
def stability(self, X, y):
    """Rank the columns of ``X`` by normalised RandomizedLasso scores.

    The absolute ``scores_`` are divided by their sum and handed to
    ``self.rank_to_dict`` together with ``X.columns.values``; that helper's
    result is returned.
    """
    print("Performing stability (rlasso) analysis")
    from sklearn.linear_model import RandomizedLasso

    selector = RandomizedLasso(alpha=0.04)
    selector.fit(X, y)

    magnitudes = np.absolute(selector.scores_)
    normalised = magnitudes / magnitudes.sum()
    return self.rank_to_dict(np.abs(normalised), X.columns.values)
def feature_selection(Xnew, Y):
    """Select columns whose RandomizedLasso(alpha=0.005) score exceeds 0.01.

    Returns:
        tuple: ``(selectedFeats, featureRanks)`` where ``featureRanks`` is a
        list of ``(score rounded to 4 places, column)`` sorted in descending
        order and ``selectedFeats`` the columns whose rounded score is
        greater than 0.01, in that same order.
    """
    column_labels = Xnew.columns.tolist()

    selector = RandomizedLasso(alpha=0.005)
    selector.fit(Xnew, Y)

    print("features sorted by their socre:")
    featureRanks = sorted(
        ((round(score, 4), label)
         for score, label in zip(selector.scores_, column_labels)),
        reverse=True)
    print(featureRanks)

    selectedFeats = [label for score, label in featureRanks if score > 0.01]
    return selectedFeats, featureRanks
def rlassoSS(data, labels):
    """Print one ``(score, column)`` tuple per feature, best score first.

    Scores come from RandomizedLasso(alpha=0.025) fitted on ``data`` and
    ``labels`` and are rounded to 4 decimal places. Returns ``None``.
    """
    column_labels = data.columns

    selector = RandomizedLasso(alpha=0.025)
    selector.fit(data, labels)

    print("Features sorted by their score:")
    ranked = sorted(
        ((round(score, 4), label)
         for score, label in zip(selector.scores_, column_labels)),
        reverse=True)
    for entry in ranked:
        print(entry)
def run_rndlasso(X_train, ybinom_train, alpha, n_resampling=200, sample_fraction=0.4):
    '''
    Fit the RandomizedLasso provided by sklearn and return the fitted model.

    The response is the logit of column 0 of ``ybinom_train`` divided by
    the row sums of ``ybinom_train``.
    '''
    template = 'Implementing Randomized Lasso with alpha={}, n_resampling={} and sample_fraction={}'
    logger.info(template.format(alpha, n_resampling, sample_fraction))

    # generate logit response
    proportion = ybinom_train[:,0]/ybinom_train.sum(1)
    ylogit_train = logit(proportion)

    selector = RandomizedLasso(alpha=alpha,
                               n_resampling=n_resampling,
                               sample_fraction=sample_fraction,
                               selection_threshold=1e-3,
                               max_iter=3000,
                               normalize=False)
    return selector.fit(X_train, ylogit_train)
def featureRankingMatrix(data, x, y):
    """Rank features with six methods and plot their mean ranking.

    Methods: RandomizedLasso stability, RFE, linear regression, ridge,
    lasso and random forest. Every score vector is min-max scaled and
    rounded to 2 places; the per-feature mean across methods is drawn as a
    horizontal bar chart with seaborn. Feature labels are ``data.columns``.
    Returns ``None``.
    """
    ranks = {}
    colnames = data.columns

    def ranking(scores, names, order=1):
        # Min-max scale (optionally sign-flipped by `order`), round to 2
        # places and pair each value with its feature label.
        scaled = MinMaxScaler().fit_transform(order * np.array([scores]).T).T[0]
        return dict(zip(names, map(lambda v: round(v, 2), scaled)))

    stability_model = RandomizedLasso(alpha=0.04)
    stability_model.fit(x, y)
    ranks["rlasso/Stability"] = ranking(np.abs(stability_model.scores_),
                                        colnames)

    rfe_base = LinearRegression(normalize=True)
    rfe_base.fit(x, y)
    rfe = RFE(rfe_base, n_features_to_select=1, verbose=3)
    rfe.fit(x, y)
    ranks["RFE"] = ranking(list(map(float, rfe.ranking_)), colnames, order=-1)

    linreg = LinearRegression(normalize=True)
    linreg.fit(x, y)
    ranks["LinReg"] = ranking(np.abs(linreg.coef_), colnames)

    ridge_model = Ridge(alpha=7)
    ridge_model.fit(x, y)
    ranks['Ridge'] = ranking(np.abs(ridge_model.coef_), colnames)

    lasso_model = Lasso(alpha=.05)
    lasso_model.fit(x, y)
    ranks["Lasso"] = ranking(np.abs(lasso_model.coef_), colnames)

    forest = RandomForestRegressor(n_jobs=-1, n_estimators=50, verbose=3)
    forest.fit(x, y)
    ranks["RF"] = ranking(forest.feature_importances_, colnames)

    # Mean ranking per feature over the six methods above.
    mean_rank = {
        name: round(np.mean([ranks[method][name] for method in ranks.keys()]), 2)
        for name in colnames
    }
    ranks["Mean"] = mean_rank

    meanplot = pd.DataFrame(list(mean_rank.items()),
                            columns=['Feature', 'Mean Ranking'])
    meanplot = meanplot.sort_values('Mean Ranking', ascending=False)
    sns.factorplot(x="Mean Ranking", y="Feature", data=meanplot, kind="bar",
                   size=14, aspect=1.9, palette='coolwarm')
def Randomlasso(self,data):
    """Print RandomizedLasso(alpha=0.025) scores for columns 5..26 of data.

    Rows containing NaN are dropped first. The target is the 'price'
    column; the predictors are the columns at positions 5 up to (not
    including) 27. Features are labelled 1..N by position within that
    slice. Prints ``(score rounded to 4 places, label)`` tuples, highest
    score first; returns ``None``.
    """
    cleaned = data.dropna()
    Y = cleaned['price'].values
    X = cleaned[cleaned.columns[5:27]].values

    # One label per selected column. The former fixed range(1, 22) gave 21
    # labels for a 22-column slice, so zip() silently dropped the last one.
    names = list(range(1, X.shape[1] + 1))

    rlasso = RandomizedLasso(alpha=0.025)
    rlasso.fit(X, Y)

    print("Features sorted by their score:")
    print(sorted(zip(map(lambda x: round(x, 4), rlasso.scores_), names),
                 reverse=True))
def predict_features(self, df_features, df_target, idx=0, **kwargs):
    """Return RandomizedLasso ``scores_`` for ``df_features`` vs ``df_target``.

    Recognised keyword arguments and their defaults: ``alpha`` ('aic'),
    ``scaling`` (0.5), ``sample_fraction`` (0.75), ``n_resampling`` (10).
    ``idx`` is accepted but not used here.
    """
    settings = {
        'alpha': kwargs.get("alpha", 'aic'),
        'scaling': kwargs.get("scaling", 0.5),
        'sample_fraction': kwargs.get("sample_fraction", 0.75),
        'n_resampling': kwargs.get("n_resampling", 10),
    }
    selector = RandomizedLasso(**settings)

    # The target is flattened to 1-D before fitting.
    target = np.ravel(df_target.values)
    selector.fit(df_features.values, target)
    return selector.scores_
def auto_add_lasso(self, threshold):
    """
    Add features whose randomized-lasso score reaches ``threshold``.

    Column names of ``self.train_x`` are split into a letters part and a
    digits part; both parts are printed for every column and passed to
    ``self.add`` when the rounded score is at least ``threshold``.
    """
    logging.info("[DataSelector] Starting randomized lasso...")
    column_names = self.train_x.columns.tolist()

    selector = RandomizedLasso(alpha=0.005)
    selector.fit(self.train_x, self.train_y)

    ranked = sorted(
        zip(map(lambda s: round(s, 4), selector.scores_), column_names),
        reverse=True)

    name_pattern = re.compile('([a-zA-Z]+)([0-9]+)')
    for score, column in ranked:
        parts = name_pattern.match(column).groups()
        feature = parts[0]
        idx = parts[1]
        print("Feature: {}, idx: {}".format(feature, idx))
        if score >= threshold:
            self.add(feature, idx)
def get_feature_selection_model_from_name(type_of_estimator, model_name):
    """Return the feature-selection object registered under ``model_name``.

    ``type_of_estimator`` is 'classifier' or 'regressor'. Every candidate
    is instantiated on each call; the 'KeepAll' entry is the plain string
    'KeepAll'. Unknown keys raise ``KeyError``.
    """
    # TODO(PRESTON): eventually let threshold be user-configurable (or grid_searchable)
    # TODO(PRESTON): optimize the params used here
    classifier_selectors = {
        'SelectFromModel': SelectFromModel(RandomForestClassifier(n_jobs=-1)),
        'RFECV': RFECV(estimator=RandomForestClassifier(n_jobs=-1), step=0.1),
        'GenericUnivariateSelect': GenericUnivariateSelect(),
        'RandomizedSparse': RandomizedLogisticRegression(),
        'KeepAll': 'KeepAll',
    }
    regressor_selectors = {
        'SelectFromModel': SelectFromModel(RandomForestRegressor(n_jobs=-1)),
        'RFECV': RFECV(estimator=RandomForestRegressor(n_jobs=-1), step=0.1),
        'GenericUnivariateSelect': GenericUnivariateSelect(),
        'RandomizedSparse': RandomizedLasso(),
        'KeepAll': 'KeepAll',
    }
    model_map = {
        'classifier': classifier_selectors,
        'regressor': regressor_selectors,
    }
    selectors = model_map[type_of_estimator]
    return selectors[model_name]
def feature_selection(path, target):
    """Fit impute -> RandomizedLasso -> LinearRegression and report results.

    Args:
        path: passed to the file's ``load_df`` helper to load the data.
        target: name of the response column inside the loaded frame.

    Prints R^2 and MSE on a 30% hold-out split (random_state=0).

    Returns:
        list: labels of the columns kept by the RandomizedLasso step.
    """
    X = load_df(path)
    y = X[target]
    X = X.drop(target, axis=1)

    model = Pipeline([("imputer",
                       Imputer(missing_values='NaN', strategy="mean", axis=1)),
                      ('feature', RandomizedLasso()),
                      ("model", LinearRegression())])

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3,
                                                        random_state=0)
    model.fit(X_train, y_train)

    R2 = model.score(X_test, y_test)
    ypred = model.predict(X_test)
    mse = mean_squared_error(y_test, ypred)
    # %-formatting keeps the exact text the Python 2 print statement
    # produced (label, two spaces, value) while being valid on Python 3.
    print("R^2 (Linear Regression + feature selection):  %s" % R2)
    print("mse (Linear Regression + feature selection):  %s" % mse)

    # Read the selection mask directly instead of pushing a 1-D index
    # vector through transform(), which only worked while sklearn still
    # tolerated 1-D input.
    selector = model.named_steps['feature']
    selected_features = X.columns[selector.get_support()].tolist()
    return selected_features
def get_feature_selection_model_from_name(type_of_estimator, model_name):
    """Look up a freshly built feature-selection object by family and name.

    ``type_of_estimator`` selects the 'classifier' or 'regressor' family;
    ``model_name`` selects the entry inside it. All entries are constructed
    on every call. 'KeepAll' maps to the string 'KeepAll'. Unknown keys
    raise ``KeyError``.
    """
    classifier_forest = RandomForestClassifier(n_jobs=-1, max_depth=10,
                                               n_estimators=15)
    for_classifiers = {
        'SelectFromModel': SelectFromModel(classifier_forest,
                                           threshold='20*mean'),
        'RFECV': RFECV(estimator=RandomForestClassifier(n_jobs=-1), step=0.1),
        'GenericUnivariateSelect': GenericUnivariateSelect(),
        'RandomizedSparse': RandomizedLogisticRegression(),
        'KeepAll': 'KeepAll',
    }

    regressor_forest = RandomForestRegressor(n_jobs=-1, max_depth=10,
                                             n_estimators=15)
    for_regressors = {
        'SelectFromModel': SelectFromModel(regressor_forest,
                                           threshold='0.7*mean'),
        'RFECV': RFECV(estimator=RandomForestRegressor(n_jobs=-1), step=0.1),
        'GenericUnivariateSelect': GenericUnivariateSelect(),
        'RandomizedSparse': RandomizedLasso(),
        'KeepAll': 'KeepAll',
    }

    model_map = {'classifier': for_classifiers, 'regressor': for_regressors}
    return model_map[type_of_estimator][model_name]
def feature_selection_regression(predictors, responses, test_predictors,
                                 selectfeattech):
    """Reduce train and test predictors to the features a selector keeps.

    Args:
        predictors: training DataFrame.
        responses: training target.
        test_predictors: test DataFrame with the same columns.
        selectfeattech: 0 for ``SelectKBest(f_regression, k=10)``,
            1 for ``RandomizedLasso`` (aic, scaling 0.3, 60% samples,
            200 resamplings, threshold 0.15).

    Returns:
        tuple: ``(predictors_new, predictors_test_new)`` DataFrames holding
        only the selected columns, indexed like their inputs.

    Raises:
        ValueError: if ``selectfeattech`` is neither 0 nor 1.
    """
    if selectfeattech == 0:
        # The number of features to keep is fixed for now.
        model = SelectKBest(f_regression, k=10)
        label = "SelectKBest"
    elif selectfeattech == 1:
        model = RandomizedLasso(alpha='aic', scaling=0.3,
                                sample_fraction=0.60, n_resampling=200,
                                selection_threshold=0.15)
        label = "Randomized Lasso"
    else:
        # Previously an unknown value skipped both branches and failed
        # later on unbound local names.
        raise ValueError(
            "selectfeattech must be 0 (SelectKBest) or 1 (RandomizedLasso), "
            "got %r" % (selectfeattech,))

    model = model.fit(predictors, responses)
    predictors_new = model.transform(predictors)
    predictors_test_new = model.transform(test_predictors)
    indices = model.get_support(indices=True)
    # print() call works on Python 2 and 3.
    print(label + " -> " + str(len(indices)))

    column_names = predictors.columns[indices]
    predictors_new = pd.DataFrame(predictors_new, index=predictors.index,
                                  columns=column_names)
    predictors_test_new = pd.DataFrame(predictors_test_new,
                                       index=test_predictors.index,
                                       columns=column_names)
    return predictors_new, predictors_test_new
def run_lasso_on_input(df, target):
    """Fit LassoLarsCV on a 70% sample, then RandomizedLasso on all rows.

    The alpha values found by ``LassoLarsCV(cv=10)`` on the sample are
    passed as the ``alpha`` grid of the RandomizedLasso fitted on the
    full, scaled data. Sampling and scaling use the file's
    ``sample_data_frame_return_x_y_column_name`` and ``scale_input_data``
    helpers.

    Returns:
        tuple: ``(clf, column_list_for_sampled)`` -- the fitted
        RandomizedLasso and the column list reported by the sampling helper.
    """
    rule = "#######################################"

    def banner(message):
        # Three-line progress banner; print() form is Python 2/3 neutral.
        print(rule)
        print(message)
        print(rule)

    X_part, y_part, _ = sample_data_frame_return_x_y_column_name(
        df, True, target, int(0.7*df.shape[0]))
    X_part, _ = scale_input_data(X_part)

    banner("Starting LARS CV")
    lars_cv = LassoLarsCV(cv=10).fit(X_part, y_part)
    banner("Done with LARS CV")

    X, y, column_list_for_sampled = sample_data_frame_return_x_y_column_name(
        df, True, target, df.shape[0])
    X, _ = scale_input_data(X)

    banner("Starting main lasso")
    clf = RandomizedLasso(alpha=lars_cv.alphas_, random_state=12,
                          n_resampling=400, normalize=True).fit(X, y)
    banner("Done with main lasso")

    return clf, column_list_for_sampled
def stability(frame):
    """Print the shape of ``frame`` after RandomizedLasso feature selection.

    The 'target' column is the response; all remaining columns are the
    predictors handed to ``fit_transform``.
    """
    response = frame['target']
    predictors = frame.drop('target', axis=1)

    selector = RandomizedLasso(alpha=0.0011,
                               scaling=0.8,
                               sample_fraction=0.6,
                               max_iter=100000)
    reduced = selector.fit_transform(predictors, response)
    print(reduced.shape)
def _keep_top_stability_features(X_train, y_train, NUM):
    """Fit a default RandomizedLasso and keep the NUM best-scoring columns.

    Prints the raw scores, the ranked (score, column) list and the reduced
    shape; raises NotImplementedError when the reduced frame does not have
    exactly NUM columns.
    """
    from sklearn.linear_model import RandomizedLasso

    randomLasso = RandomizedLasso()
    randomLasso.fit(X_train, y_train)
    features = randomLasso.scores_
    score = X_train.columns
    print(features)

    ranked = sorted(zip(map(lambda x:round(x,4),features),score),reverse = True)
    print(ranked)

    featureList = [pair[1] for pair in ranked][:NUM]
    X_train = X_train[featureList]
    print(X_train.shape)
    if X_train.shape[1]!= NUM:
        raise NotImplementedError("稳定性选择法提取特征处理失败")
    print("稳定性选择法提取特征结束...")
    return X_train


def feature_extraction_RandomLasso(flag = True):
    """Select the top features by stability selection and save them to CSV.

    With ``flag == True`` the input is 'feature_001.csv', columns are cast
    to float16, the 20 best columns are kept and written to
    'feature_tree_end.csv'. Otherwise the input is 'feature_linear_end.csv',
    the 30 best columns are kept and written to 'feature_linear_best.csv'.
    The target is read from 'target.csv' in both cases.

    Returns the reduced DataFrame.
    """
    if flag == True:
        X_train = pd.read_csv('feature_001.csv')
        X_train.drop('id',axis = 1,inplace = True)
        X_train = parse_nan(X_train)
        y_train = pd.read_csv('target.csv')
        print(type(X_train))
        # Down-cast every column to float16 one by one.
        for column in X_train.columns:
            X_train[column] = X_train[column].astype('float16')
        print(X_train.info(memory_usage = 'deep'))
        print(y_train.info(memory_usage = 'deep'))
        print("稳定性选择法提取特征开始...")
        X_train = _keep_top_stability_features(X_train, y_train, 20)
        X_train.to_csv('feature_tree_end.csv')
    else:
        X_train = pd.read_csv('feature_linear_end.csv')
        y_train = pd.read_csv('target.csv')
        X_train.drop('id',axis = 1,inplace = True)
        X_train = parse_nan(X_train)
        print("稳定性选择法提取特征开始...")
        print(X_train.isnull().sum().sort_values(ascending=False).head())
        X_train = _keep_top_stability_features(X_train, y_train, 30)
        X_train.to_csv('feature_linear_best.csv')
    return X_train
def rank_features(algorithm, X, y):
    """Print feature names ordered by the chosen ranking algorithm.

    Args:
        algorithm: one of 'random_forest_rfe', 'svm_rfe',
            'random_logistic_regression', 'random_lasso' or 'anova'.
        X: predictor matrix.
        y: target vector.

    Feature labels come from the module-level ``features`` sequence and the
    seed from ``R_SEED``. One name is printed per line. An unknown
    algorithm prints an error message and exits the process with status 1.
    """
    import sys

    # The RFE approach can be used with various different classifiers
    if algorithm == 'random_forest_rfe':
        from sklearn.ensemble import RandomForestClassifier
        from sklearn.feature_selection import RFE
        estimator = RandomForestClassifier(n_estimators=50,
                                           random_state=R_SEED, n_jobs=1)
        selector = RFE(estimator, 5, step=0.1)
        selector.fit(X, y)
        # RFE ranking: lower is better, so sort ascending.
        for x in sorted(
                zip(map(lambda x: round(x, 4), selector.ranking_), features)):
            print(x[1])
    elif algorithm == 'svm_rfe':
        from sklearn.svm import SVC
        from sklearn.feature_selection import RFE
        estimator = SVC(random_state=R_SEED, kernel='linear')
        selector = RFE(estimator, 5, step=0.1)
        selector.fit(X, y)
        for x in sorted(
                zip(map(lambda x: round(x, 4), selector.ranking_), features)):
            print(x[1])
    elif algorithm == 'random_logistic_regression':
        # See http://blog.datadive.net/selecting-good-features-part-iv-stability-selection-rfe-and-everything-side-by-side/
        from sklearn.linear_model import RandomizedLogisticRegression
        rlasso = RandomizedLogisticRegression(random_state=R_SEED)
        rlasso.fit(X, y)
        # Stability scores: higher is better, so sort descending.
        for x in sorted(zip(map(lambda x: round(x, 4), rlasso.scores_),
                            features), reverse=True):
            print(x[1])
    elif algorithm == 'random_lasso':
        from sklearn.linear_model import RandomizedLasso
        rlasso = RandomizedLasso(random_state=R_SEED)
        rlasso.fit(X, y)
        for x in sorted(zip(map(lambda x: round(x, 4), rlasso.scores_),
                            features), reverse=True):
            print(x[1])
    elif algorithm == 'anova':
        from sklearn.feature_selection import f_classif
        F, pval = f_classif(X, y)
        random_array = random.random(len(pval))
        order = lexsort((random_array, pval))  # will break ties by random
        for i in order:
            print(features[i])
    else:
        print("Invalid algorithm: %s" % algorithm)
        # sys.exit instead of the site-provided exit(), which is not
        # guaranteed to exist (e.g. under `python -S`).
        sys.exit(1)
def featureSelect(select_fun, train_data, train_label, threshold_num=0, alpha=0.000015):
    """Return the feature names whose importance exceeds ``threshold_num``.

    Args:
        select_fun: 'MeanDecreaseImpurity' (random forest importances) or
            'StabilitySelection' (RandomizedLasso scores).
        train_data: DataFrame of predictors; its columns name the features.
        train_label: training target.
        threshold_num: a feature is kept when its score, rounded to 4
            places, is strictly greater than this value.
        alpha: first positional argument of RandomizedLasso
            (StabilitySelection only).

    Returns:
        list: kept feature names in alphabetical order. For any other
        ``select_fun`` a list of method names is printed and returned
        instead.
    """
    column_names = list(train_data.columns)  # existing feature names

    if select_fun == 'MeanDecreaseImpurity':
        # mean decrease impurity
        forest = RandomForestRegressor(random_state=2019)
        forest.fit(train_data, train_label)
        raw_scores = forest.feature_importances_
    elif select_fun == 'StabilitySelection':
        # stability selection
        # Original author's note: too large an alpha drives every feature
        # score to 0; "1 is best" (meaning unclear -- confirm).
        stability_model = RandomizedLasso(alpha, random_state=2019)
        stability_model.fit(train_data, train_label)
        raw_scores = stability_model.scores_
    else:
        available = [
            'MeanDecreaseImpurity', 'StabilitySelection',
            'RecursiveFeatureElimination', 'MeanDecreaseAccuracy'
        ]
        print("可选挑选特征的方法名:", available)
        return available

    scored = sorted(
        (name, round(value, 4)) for name, value in zip(column_names, raw_scores))
    return [name for name, value in scored if value > threshold_num]
def lasso():
    """Print RandomizedLasso and LinearSVC feature rankings for ``data``.

    Uses the module-level ``data`` frame and ``FEATURES_CORR`` list. The
    identifier/text columns and every column listed in ``FEATURES_CORR``
    are excluded; missing values are replaced with 0; the target is
    ``data.is_duplicate``. Two lists are printed, each sorted in
    descending order: rounded stability scores, then absolute rounded SVM
    coefficients.
    """
    non_feature_columns = [
        'id', 'qid1', 'qid2', 'question1', 'question2', 'is_duplicate',
        'question1_tk', 'question2_tk'
    ]
    columns = [
        col for col in data.columns
        if col not in non_feature_columns and col not in FEATURES_CORR
    ]

    # Non-inplace fillna: filling a column subset in place triggers
    # pandas' SettingWithCopyWarning; the resulting X is the same.
    X = data[columns].fillna(0)
    Y = data.is_duplicate

    rlasso = RandomizedLasso(alpha=0.025)
    rlasso.fit(X, Y)
    print(
        sorted(zip(map(lambda x: round(x, 4), rlasso.scores_), columns),
               reverse=True))

    svm = LinearSVC(C=0.75)
    svm.fit(X, Y)
    print(
        sorted(zip(map(lambda x: abs(round(x, 4)), svm.coef_[0]), columns),
               reverse=True))
def stable_select(df, y, rd_reg_columns, threshold=0.2, model='rlr'):
    """Score the given columns with a stability/importance model and plot.

    Args:
        df: DataFrame holding predictors and target.
        y: name of the target column in ``df``.
        rd_reg_columns: predictor columns to score.
        threshold: ``selection_threshold`` for the two randomized models.
        model: 'rlr' (``RLR``, randomized logistic regression), 'rls'
            (``RLS``, randomized lasso) or 'rfr' (``RFR``; its
            ``feature_importances_`` are used).

    Returns:
        pandas.Series: scores named 'score', indexed by column, sorted in
        descending order. A horizontal bar chart is drawn as a side effect.

    Raises:
        ValueError: if ``model`` is not one of the three supported names.
    """
    if model not in ('rlr', 'rls', 'rfr'):
        # Previously an unknown name fell through to a NameError on the
        # unbound `scores` variable.
        raise ValueError(
            "model must be 'rlr', 'rls' or 'rfr', got %r" % (model,))

    X = df.loc[:, rd_reg_columns]
    Y = df[y]

    if model == 'rlr':
        # randomized logistic regression
        rlr = RLR(scaling=0.5, sample_fraction=0.75, n_resampling=300,
                  selection_threshold=threshold)
        rlr.fit(X, Y)
        scores = rlr.scores_
    elif model == 'rls':
        # randomized lasso regression
        rls = RLS(scaling=0.5, sample_fraction=0.75, n_resampling=300,
                  selection_threshold=threshold)
        rls.fit(X, Y)
        scores = rls.scores_
    else:
        rf = RFR()
        rf.fit(X, Y)
        scores = rf.feature_importances_

    result = pd.Series(dict(zip(X.columns, scores))).rename('score')
    result = result.sort_values(ascending=False)

    plt.figure(figsize=(20, 10))
    result.plot.barh(title='Feature Importances', color='lightblue')
    plt.ylabel('Feature Importance Score')
    return result
def lass_varselect(train, num_vars, target, alpha):
    """Return the RandomizedLasso support mask for ``num_vars``.

    A RandomizedLasso with the given ``alpha`` and 5 resamplings is fitted
    on ``train[num_vars]`` against ``train[target]``; the result of its
    ``get_support()`` is returned.
    """
    selector = RandomizedLasso(alpha=alpha, n_resampling=5)
    predictors = train[num_vars]
    response = train[target]
    selector.fit(predictors, response)
    return selector.get_support()
def feature_selection(df,dfo,target_column,id_column):
    """
    Select features with a genetic algorithm (binary target) or by
    aggregated model rankings (numeric target).

    df = The training dataframe
    dfo = The test dataframe (not used in this function)
    target_column = The column containing the target variable
    id_column = The column containing the id variable (not used in this
        function)

    Columns are classified by name prefix: ``ib_`` input binary, ``icn_``
    input categorical nominal, ``ico_`` input categorical ordinal, ``if_``
    input float, ``ob_`` output. A column starting with ``ob_`` replaces
    ``target_column`` as the output variable.

    If every value of the output column is 0 or 1 the problem is treated
    as 'categorical' and a genetic algorithm over the input columns is run,
    scoring candidates by the Gini (|2*AUC - 1|) of a statsmodels Logit fit.
    Otherwise it is treated as 'numerical' and several regressors are used
    to rank the features.

    Returns:
        'categorical': the bit string of the best-scoring individual.
        'numerical': the first five index labels of the ranking table
        after sorting on the "RF" column.
    """
    print("IDENTIFYING TYPES...")
    in_model = []
    list_ib = set() #input binary
    list_icn = set() #input categorical nominal
    list_ico = set() #input categorical ordinal
    list_if = set() #input numerical continuous (input float)
    list_inputs = set()
    output_var = target_column
    for var_name in df.columns:
        if re.search('^ib_',var_name):
            list_inputs.add(var_name)
            list_ib.add(var_name)
            print (var_name,"is input binary")
        elif re.search('^icn_',var_name):
            list_inputs.add(var_name)
            list_icn.add(var_name)
            print (var_name,"is input categorical nominal")
        elif re.search('^ico_',var_name):
            list_inputs.add(var_name)
            list_ico.add(var_name)
            print (var_name,"is input categorical ordinal")
        elif re.search('^if_',var_name):
            # Float inputs are tracked but deliberately left out of
            # list_inputs (the add below is disabled).
            #list_inputs.add(var_name)
            list_if.add(var_name)
            print (var_name,"is input numerical continuos (input float)")
        elif re.search('^ob_',var_name):
            output_var = var_name
        else:
            print ("ERROR: unable to identify the type of:", var_name)
    # A target made only of 0/1 values means classification.
    if (df[output_var].isin([0,1]).all()):
        method_type = 'categorical'
    else:
        method_type = 'numerical'
    print(method_type)
    if method_type == "categorical":
        methods = ["SVM","Decision Trees","KNNs","Logistic Regression","Naive Bayes"]
    elif method_type == "numerical":
        methods = ["SVM","Ridge","Lasso"]
    if method_type == "categorical":
        print ("GENETIC ALGORITHM FOR FEATURE SELECTION (CLASSIFICATION):")
        #####
        # Setting up the genetic algorithm and calculating the starting
        # pool (starting candidate population)
        #####
        creator.create("FitnessMax", base.Fitness, weights=(1.0,))
        creator.create("Individual", list, fitness=creator.FitnessMax)
        toolbox = base.Toolbox()
        toolbox.register("attr_bool", random.randint, 0, 1)
        toolbox.register("individual", tools.initRepeat, creator.Individual, toolbox.attr_bool, n=len(list_inputs))
        toolbox.register("population", tools.initRepeat, list, toolbox.individual)
        # The fitness registered with the toolbox is the number of 1-bits
        # of an individual (returned as a one-element tuple).
        def evalOneMax(individual):
            return sum(individual),
        toolbox.register("evaluate", evalOneMax)
        toolbox.register("mate", tools.cxTwoPoint)
        toolbox.register("mutate", tools.mutFlipBit, indpb=0.05)
        toolbox.register("select", tools.selTournament, tournsize=3)
        NPOPSIZE = 50 #random starting pool size
        population = toolbox.population(n=NPOPSIZE)
        #####
        # Assessing Gini on the starting pool
        #####
        dic_gini={}
        for i in range(np.shape(population)[0]):
            # Translating DNA into a list of variables
            var_model = []
            # NOTE(review): this inner range is the population size, not
            # the individual's length (len(list_inputs)); looks like it
            # over- or under-runs when the two differ -- confirm.
            # NOTE(review): list(list_inputs) is rebuilt from a set on each
            # access; positions are only meaningful if that order is stable.
            for j in range(np.shape(population)[0]):
                if (population[i])[j]==1:
                    var_model.append(list(list_inputs)[j])
            # Assessing the Gini index for each individual in the initial pool
            X_train=df[var_model]
            Y_train=df[output_var]
            ######
            # CHANGE_HERE - START: you are very likely using a different
            # technique by now, so change to yours.
            #####
            if "Logistic Regression" in methods:
                lr = sm.Logit(Y_train, X_train)
                model=lr.fit()
                Y_predict=model.predict(X_train)
            ######
            # CHANGE_HERE - END
            #####
            ######
            # CHANGE_HERE - START: this uses the development Gini to select
            # variables; you should use a different Gini, either the OOT
            # Gini or sqrt(DEV_GINI*OOT_GINI).
            #####
            fpr, tpr, thresholds = metrics.roc_curve(Y_train, Y_predict)
            auc = metrics.auc(fpr, tpr)
            gini_power = abs(2*auc-1)
            ######
            # CHANGE_HERE - END
            #####
            # Key format: "<gini>;<bit string>".
            # NOTE(review): population[j] uses the inner loop's final j,
            # not i, so the key/value may not belong to the individual that
            # was just scored -- presumably population[i] was meant; confirm.
            gini=str(gini_power)+";"+str(population[j]).replace('[','').replace(', ','').replace(']','')
            dic_gini[gini]=population[j]
        # Keys are strings, so this is a lexicographic sort.
        list_gini=sorted(dic_gini.keys(),reverse=True)
    ####
    # Assessing RMSE on the starting pool
    ####
    if method_type == "numerical":
        # NOTE(review): var_model is only assigned inside the categorical
        # branch above, so this line looks like it raises NameError on the
        # numerical path -- confirm the intended column list.
        X_train=df[var_model]
        Y_train=df[output_var]
        names = list(X_train)
        ranks = {}
        lr = LinearRegression(normalize=True)
        lr.fit(X_train, Y_train)
        ranks["Linear reg"] = rank_to_dict(np.abs(lr.coef_), names)
        ridge = Ridge(alpha=7)
        ridge.fit(X_train, Y_train)
        ranks["Ridge"] = rank_to_dict(np.abs(ridge.coef_), names)
        lasso = Lasso(alpha=.05)
        lasso.fit(X_train, Y_train)
        ranks["Lasso"] = rank_to_dict(np.abs(lasso.coef_), names)
        rlasso = RandomizedLasso(alpha=0.04)
        rlasso.fit(X_train, Y_train)
        ranks["Stability"] = rank_to_dict(np.abs(rlasso.scores_), names)
        rf = RandomForestRegressor()
        rf.fit(X_train,Y_train)
        ranks["RF"] = rank_to_dict(rf.feature_importances_, names)
        f, pval = f_regression(X_train, Y_train, center=True)
        ranks["Corr."] = rank_to_dict(f, names)
        # Mean score per feature across all methods collected so far.
        r = {}
        for name in names:
            r[name] = round(np.mean([ranks[method][name] for method in ranks.keys()]), 2)
        methods = sorted(ranks.keys())
        ranks["Mean"] = r
        methods.append("Mean")
        print(ranks["Mean"])
        print("\t\t%s" % "\t".join(methods))
        for name in names:
            print ("%s\t%s" % (name, "\t".join(map(str, [ranks[method][name] for method in methods]))))
        ranks_f = pd.DataFrame(ranks)
        # Positional arguments after "RF" are presumably axis=0 and
        # ascending=0 (descending) -- verify against the pandas version.
        ranks_f.sort_values("RF",0,0,inplace = True)
        print(ranks_f)
        featureset = ranks_f.index.values[0:5]
        print(featureset)
    if method_type == "categorical":
        # Genetic algorithm main loop - START
        # Iterate until no improvement happens, to find the optimal set of
        # characteristics (variables)
        #####
        sum_current_gini=0.0
        sum_current_gini_1=0.0
        sum_current_gini_2=0.0
        first=0
        OK = 1
        a=0
        while OK: #repeat until the Gini stops improving, at least a little, over 2 generations
            a=a+1
            print('loop ', a)
            OK=0
            ####
            # Generating offspring - START
            ####
            offspring = algorithms.varAnd(population, toolbox, cxpb=0.5, mutpb=0.1) #cross-over probability = 50%, mutation probability = 10%
            fits = toolbox.map(toolbox.evaluate, offspring)
            for fit, ind in zip(fits, offspring):
                ind.fitness.values = fit
            population =toolbox.select(offspring, k=len(population))
            ####
            # Generating offspring - END
            ####
            # Shift the Gini totals of the previous two generations.
            sum_current_gini_2=sum_current_gini_1
            sum_current_gini_1=sum_current_gini
            sum_current_gini=0.0
            #####
            # Assessing Gini on the offspring - START
            #####
            for j in range(np.shape(population)[0]):
                # Only individuals not scored before are evaluated.
                if population[j] not in dic_gini.values():
                    var_model = []
                    # NOTE(review): same range concern as in the starting
                    # pool -- iterates over the population size rather
                    # than the individual's length; confirm.
                    for i in range(np.shape(population)[0]):
                        if (population[j])[i]==1:
                            var_model.append(list(list_inputs)[i])
                    X_train=df[var_model]
                    Y_train=df[output_var]
                    ######
                    # CHANGE_HERE - START: you are very likely using a
                    # different technique by now, so change to yours.
                    #####
                    lr = sm.Logit(Y_train, X_train)
                    model=lr.fit()
                    Y_predict=model.predict(X_train)
                    ######
                    # CHANGE_HERE - END
                    #####
                    ######
                    # CHANGE_HERE - START: this uses the development Gini
                    # to select variables; you should use a different
                    # Gini, either the OOT Gini or sqrt(DEV_GINI*OOT_GINI).
                    #####
                    fpr, tpr, thresholds = metrics.roc_curve(Y_train, Y_predict)
                    auc = metrics.auc(fpr, tpr)
                    gini_power = abs(2*auc-1)
                    ######
                    # CHANGE_HERE - END
                    #####
                    gini=str(gini_power)+";"+str(population[j]).replace('[','').replace(', ','').replace(']','')
                    dic_gini[gini]=population[j]
            #####
            # Assessing Gini on the offspring - END
            #####
            #####
            # Selecting the best fitted among all ever created populations
            # and the current offspring - START
            #####
            list_gini=sorted(dic_gini.keys(),reverse=True)
            population=[]
            for i in list_gini[:NPOPSIZE]:
                population.append(dic_gini[i])
                gini=float(i.split(';')[0])
                sum_current_gini+=gini
            #####
            # Selecting the best fitted - END
            #####
            # Has the Gini improved at least a little in the last 2 generations?
            print ('sum_current_gini=', sum_current_gini, 'sum_current_gini_1=', sum_current_gini_1, 'sum_current_gini_2=', sum_current_gini_2)
            if(sum_current_gini>sum_current_gini_1+0.0001 or sum_current_gini>sum_current_gini_2+0.0001):
                OK=1
        #####
        # Genetic algorithm main loop - END
        #####
    if method_type == "categorical":
        gini_max=list_gini[0]
        gini=float(gini_max.split(';')[0])
        features=gini_max.split(';')[1]
        ####
        # Printing out the list of features
        #####
        f=0
        for i in range(len(features)):
            if features[i]=='1':
                f+=1
                print('feature ', f, ':', list(list_inputs)[i])
        print ('gini: ', gini)
        featureset = features
    return featureset
u'ACID', u'WARM', u'MUSKY', u'SWEATY', u'AMMONIA/URINOUS', u'DECAYED', u'WOOD', u'GRASS', u'FLOWER', u'CHEMICAL']): attribute[idx] = attr # In[ ]: # select the best features with true values and save them features = pd.read_csv('all_features.csv',index_col=0).sort() target = pd.read_csv('targets_for_feature_selection.csv',index_col=0).sort()#replace this with targets_for_feature_selection_LB_incl.csv if LB data is included for i in range(21): print(attribute[i]) sys.stdout.flush() Y = target[attribute[i]].dropna() X = features.loc[Y.index] selector = RandomizedLasso(alpha=0.025,selection_threshold=0.025,n_resampling=200, random_state=25).fit(X,Y) selected = pd.DataFrame(selector.transform(features)) selected.index = features.index print('shape ', selected.shape) selected.to_csv('...path to features folder/selected_features/features_'+str(i)+'.csv') # In[ ]:
# 4. Two top-level feature selection algorithms
# 4.1 Stability selection -- scores lie in [0, 1]
# The main idea is to run a feature selection algorithm over and over on
# different subsets of the data and of the features, and finally aggregate
# the selection results, e.g. by counting how often a feature was judged
# important (number of times it was selected divided by the number of tested
# subsets it appeared in).
from sklearn.linear_model import RandomizedLasso  # randomized Lasso
from sklearn.datasets import load_boston

boston = load_boston()

# using the Boston housing data.
# Data gets scaled automatically by sklearn's implementation
X = boston["data"]
Y = boston["target"]
names = boston["feature_names"]

# NOTE(review): the original comment said alpha is chosen automatically, but
# a fixed value (0.025) is what is actually passed here.
rlasso = RandomizedLasso(alpha=0.025)
rlasso.fit(X, Y)

# The per-feature scores are exposed as rlasso.scores_.
print("Features sorted by their score:")
print(sorted(zip([round(s, 4) for s in rlasso.scores_], names), reverse=True))

# Conclusion: a good feature does not get a score of 0 merely because similar
# or correlated features exist, which is different from plain Lasso.
# For feature selection, stability selection is often among the best
# performing methods across many datasets and settings.

# 4.2 Recursive feature elimination (RFE): a greedy search for the best
# feature subset.
# Repeatedly build a model (e.g. an SVM or a regression model), pick the best
# (or worst) feature (e.g. by coefficient), set it aside, and repeat on the
# remaining features until all of them have been visited. The order in which
# features are eliminated is the feature ranking.
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression, Ridge

boston = load_boston()
def _write_text(path, text):
    """Write *text* to *path* as UTF-8 and close the file afterwards."""
    with codecs.open(path, 'w', "utf-8") as handle:
        handle.write(text)


def run(args):
    """Train an ExtraTrees regressor and evaluate it on every test set.

    Steps, in order: load and optionally standardise the training data,
    optionally select features with RandomizedLasso, tune an
    ExtraTreesRegressor with a randomized search, persist the fitted
    objects, then write MAE/RMSE, a mean-prediction baseline and the
    predictions for each test file.

    Parameters
    ----------
    args : object
        Must expose the attributes read below: ``training_data``,
        ``training_labels``, ``delimiter``, ``scale``, ``output_folder``,
        ``select_features``, ``iterations``, ``seed``, ``folds``,
        ``models_dir``, ``test_data`` and ``test_labels``.

    Notes
    -----
    ``scaler.pkl`` and ``sel_est.pkl`` are only written when scaling /
    feature selection were actually performed; the previous code dumped
    both unconditionally and raised NameError when either step was off.
    """
    X_train = np.nan_to_num(
        np.genfromtxt(args.training_data, delimiter=args.delimiter))
    y_train = np.clip(np.genfromtxt(args.training_labels), 0, 1)

    # Both stay None when the corresponding step is disabled.
    scaler = None
    sel_est = None

    X_trains = X_train
    if args.scale:
        print("Scaling features (mean removal divided by std)...")
        scaler = StandardScaler().fit(X_train)
        X_trains = scaler.transform(X_train)

    train_name = os.path.basename(args.training_data)

    # create output folders
    outF = (args.output_folder + "/" + train_name + "--FS_"
            + str(args.select_features) + "--i_" + str(args.iterations))
    buildDir(outF)
    maskF = outF + "/masks/"
    buildDir(maskF)
    paramF = outF + "/parameters/"
    buildDir(paramF)

    # initializes numpy random seed
    np.random.seed(args.seed)

    # performs feature selection
    featsel_str = ".all-feats"
    if args.select_features:
        print("Performing feature selection ...")
        sel_est = RandomizedLasso(alpha="bic", verbose=True, max_iter=1000,
                                  n_jobs=8, random_state=args.seed,
                                  n_resampling=1000)
        sel_est.fit(X_trains, y_train)
        X_trains = sel_est.transform(X_trains)
        selected_mask = sel_est.get_support()
        selected_features = sel_est.get_support(indices=True)
        sel_feats_path = os.sep.join([maskF, train_name])
        # saves indices
        np.savetxt(sel_feats_path + ".idx", selected_features, fmt="%d")
        # saves mask
        np.save(sel_feats_path + ".mask", selected_mask)
        featsel_str = ".randcv"

    estimator = ExtraTreesRegressor(random_state=args.seed, n_jobs=1)
    mae_scorer = make_scorer(mean_absolute_error, greater_is_better=False)

    # performs parameter optimization using random search
    print("Performing parameter optimization ... ")
    param_distributions = {
        "n_estimators": [5, 10, 50, 100, 200, 500],
        "max_depth": [3, 2, 1, None],
        "max_features": ["auto", "sqrt", "log2",
                         int(X_trains.shape[1] / 2.0)],
        "min_samples_split": sp_randint(1, 11),
        "min_samples_leaf": sp_randint(1, 11),
        "bootstrap": [True, False],
    }
    search = RandomizedSearchCV(estimator, param_distributions,
                                n_iter=args.iterations,
                                scoring=mae_scorer, n_jobs=8, refit=True,
                                cv=KFold(X_train.shape[0], args.folds,
                                         shuffle=True,
                                         random_state=args.seed),
                                verbose=1, random_state=args.seed)

    # fits model using best parameters found
    search.fit(X_trains, y_train)

    # Refit a separate estimator with the best parameters and persist it
    # together with whichever preprocessing objects were fitted.
    best = search.best_params_
    estimator2 = ExtraTreesRegressor(
        bootstrap=best["bootstrap"],
        max_depth=best["max_depth"],
        max_features=best["max_features"],
        min_samples_leaf=best["min_samples_leaf"],
        min_samples_split=best["min_samples_split"],
        n_estimators=best["n_estimators"],
        verbose=1, random_state=42, n_jobs=8)
    estimator2.fit(X_trains, y_train)

    from sklearn.externals import joblib
    print("koooonnn %s" % args.models_dir)
    joblib.dump(estimator2, args.models_dir + "/XRT.pkl")
    if scaler is not None:
        joblib.dump(scaler, args.models_dir + "/scaler.pkl")
    if sel_est is not None:
        joblib.dump(sel_est, args.models_dir + "/sel_est.pkl")

    print("Best parameters:  %s" % (search.best_params_,))

    # saves parameters on yaml file
    param_path = os.sep.join([paramF, train_name]) + featsel_str \
        + ".params.yaml"
    with codecs.open(param_path, "w", "utf-8") as param_file:
        yaml.dump(search.best_params_, stream=param_file)

    testF = os.sep.join([outF, "/test/"])
    buildDir(testF)

    # Mean training label, used as a constant baseline prediction below.
    m = y_train.mean()

    # evaluates model on the different test sets
    test_features = sorted(glob.glob(args.test_data + os.sep + "*"))
    test_labels = sorted(glob.glob(args.test_labels + os.sep + "*"))
    for test_feature, test_label in zip(test_features, test_labels):
        print("Evaluating on %s" % test_label)
        X_test = np.nan_to_num(
            np.genfromtxt(test_feature, delimiter=args.delimiter))
        y_test = np.clip(np.genfromtxt(test_label), 0, 1)

        # Apply the same preprocessing that was fitted on the training data.
        X_tests = X_test
        if scaler is not None:
            X_tests = scaler.transform(X_test)
        if sel_est is not None:
            X_tests = sel_est.transform(X_tests)

        # gets predictions on test set
        y_pred = np.clip(search.predict(X_tests), 0, 1)

        # evaluates on test set
        mae = mean_absolute_error(y_test, y_pred)
        rmse = np.sqrt(mean_squared_error(y_test, y_pred))
        print("Test MAE = %2.8f" % mae)
        print("Test RMSE = %2.8f" % rmse)
        print("Prediction range: [%2.4f, %2.4f]" % (y_pred.min(),
                                                    y_pred.max()))

        # saves evaluation
        label_name = os.path.basename(test_label)
        testFX = testF + "/" + label_name
        buildDir(testFX)
        buildDir(testFX + "/evaluation/")
        eval_path = os.sep.join([testFX, "evaluation", train_name]) \
            + featsel_str + "--" + label_name
        _write_text(eval_path + ".mae", str(mae) + "\n")
        _write_text(eval_path + ".rmse", str(rmse) + "\n")

        # baseline on test set
        mu = m * np.ones(y_test.shape[0])
        maeB = mean_absolute_error(y_test, mu)
        rmseB = np.sqrt(mean_squared_error(y_test, mu))
        print("Test MAE Baseline= %2.8f" % maeB)
        print("Test RMSE Baseline= %2.8f" % rmseB)
        _write_text(eval_path + ".mae.Base", str(maeB) + "\n")
        _write_text(eval_path + ".rmse.Base", str(rmseB) + "\n")

        # saves predictions
        buildDir(testFX + "/predictions/")
        preds_path = os.sep.join([testFX, "predictions", train_name]) \
            + featsel_str + "--" + label_name + ".preds"
        np.savetxt(preds_path, y_pred, fmt="%2.15f")
def fit(self, X, y):
    """Select variables with a lasso model, then fit the prediction models.

    A cross-validated LARS lasso is fitted first. Its ``alpha_`` is then
    used either for a plain LassoLars refit (when
    ``self.rlasso_selection_threshold`` is 0) or for a RandomizedLasso
    (otherwise). The columns kept by that step are handed to
    ``self.run_models`` together with ``self.param_ridge_post``.

    Parameters
    ----------
    X : numpy array or sparse matrix of shape [n_samples, n_features]
        Training data
    y : numpy array of shape [n_samples, n_targets]
        Target values

    Returns
    -------
    self : returns an instance of self.
    """
    # NOTE(review): the flag is forced to False right before it is tested,
    # so the pre-selection models below never run, whatever value the
    # attribute held before -- confirm this is intentional.
    self.pre_pred = False
    if self.pre_pred:
        print("Computing ... ")
        param_ridge_pre = list(np.arange(1e9, 2e9, 1e8))
        pre_models = self.run_models(X, y, param_ridge_pre)
        self.pls_pre, self.ridge_pre = pre_models

    # ---- Lasso variable selection -------------------------------------
    lars_budget = X.shape[1] + 1000
    tiny = 2.2204460492503131e-16
    self.lasso_cv = LassoLarsCV(
        fit_intercept=True,
        normalize=True,
        precompute='auto',
        max_iter=lars_budget,
        max_n_alphas=lars_budget,
        eps=tiny,
        copy_X=True,
        cv=self.cv,
        n_jobs=self.n_jobs,
    )
    self.lasso_cv.fit(X, y)

    if self.rlasso_selection_threshold != 0:
        # Stability selection at the cross-validated alpha.
        self.rlasso = RandomizedLasso(
            alpha=self.lasso_cv.alpha_,
            scaling=0.5,
            sample_fraction=0.75,
            n_resampling=200,
            selection_threshold=self.rlasso_selection_threshold,
            fit_intercept=True,
            verbose=False,
            normalize=True,
            precompute='auto',
            max_iter=500,
            eps=tiny,
            random_state=None,
            n_jobs=self.n_jobs,
            pre_dispatch='3*n_jobs',
        )
        self.rlasso.fit(X, y)
        X_selected = self.rlasso.transform(X)
    else:
        # Threshold 0: keep the columns with a non-zero coefficient in a
        # single LassoLars fit at the cross-validated alpha.
        self.lasso_refit = linear_model.LassoLars(
            alpha=self.lasso_cv.alpha_,
            fit_intercept=True,
            normalize=True,
            precompute='auto',
            max_iter=lars_budget,
            eps=tiny,
            copy_X=True,
            fit_path=False,
        )
        self.lasso_refit.fit(X, y)
        self.active = self.lasso_refit.coef_ != 0
        # Only the first row of the coefficient mask is kept.
        self.active = self.active[0, :]
        X_selected = X[:, self.active]

    # ---- Post variable selection predictions --------------------------
    post_models = self.run_models(X_selected, y, self.param_ridge_post)
    self.pls_post, self.ridge_post = post_models
    return self
def train_and_analyse(_X, _y, features):
    """Rank features with several methods and keep those with a high mean rank.

    Each method's raw scores are converted by ``rank_to_dict`` into a
    ``{feature name: score}`` mapping. The per-feature mean over all methods
    is added as the "Mean" column, and features whose mean exceeds 0.12 are
    returned as the selection.

    Parameters
    ----------
    _X : 2-D array
        Feature matrix; columns are indexed as ``_X[:, i]``.
    _y : array
        Target values.
    features : sequence
        Feature names, passed to ``rank_to_dict`` and used as row labels.

    Returns
    -------
    ranks : pandas.DataFrame
        One column per method plus "Mean", one row per feature.
    selection_feature : numpy array
        Index values of the rows whose "Mean" is greater than 0.12.
    """
    X, Y = _X, _y
    folds = cross_validation.KFold(X.shape[0], n_folds=10, shuffle=True,
                                   random_state=1)
    ranks = {}

    lr = LinearRegression(normalize=True)
    lr.fit(X, Y)
    ranks["Linear reg"] = rank_to_dict(np.abs(lr.coef_), features)

    ridge = RidgeCV(cv=folds)
    ridge.fit(X, Y)
    ranks["Ridge"] = rank_to_dict(np.abs(ridge.coef_), features)

    lasso = LassoCV(cv=folds, n_jobs=2, normalize=True, tol=0.0001,
                    max_iter=170000)
    lasso.fit(X, Y)
    ranks["Lasso"] = rank_to_dict(np.abs(lasso.coef_), features)

    # The randomized lasso is run at the single alpha that LassoCV selected.
    rlasso = RandomizedLasso(alpha=lasso.alpha_, random_state=42)
    rlasso.fit(X, Y)
    ranks["Stability"] = rank_to_dict(np.abs(rlasso.scores_), features)

    rfe = RFE(lr, n_features_to_select=1)
    rfe.fit(X, Y)
    ranks["RFE"] = rank_to_dict(np.array(rfe.ranking_).astype(float),
                                features, order=-1)

    rf = RandomForestRegressor(n_estimators=500)
    rf.fit(X, Y)
    ranks["RF"] = rank_to_dict(rf.feature_importances_, features)

    f_scores, _ = f_regression(X, Y, center=True)
    ranks["Corr."] = rank_to_dict(np.nan_to_num(f_scores), features)

    # MIC is computed one column at a time with a single MINE instance.
    mine = MINE()
    mic_scores = []
    for column in range(X.shape[1]):
        mine.compute_score(X[:, column], Y)
        mic_scores.append(mine.mic())
    ranks["MIC"] = rank_to_dict(mic_scores, features)

    # Average over the methods collected so far ("Mean" is added afterwards,
    # so it does not take part in its own average).
    mean_rank = {}
    for name in features:
        per_method = [ranks[method][name] for method in ranks]
        mean_rank[name] = round(np.mean(per_method), 2)
    ranks["Mean"] = mean_rank

    ranks = pd.DataFrame(ranks)
    selection_feature = ranks[ranks.Mean > 0.12].index.values
    return ranks, selection_feature
'from_poi_to_this_person', 'from_messages', \ 'from_this_person_to_poi', 'shared_receipt_with_poi','from_poi_fraction','to_poi_fraction',\ 'tot_to_salary','tot_to_bonus','restr_to_total'] data = featureFormat(data_dict, features_list) labels, features = targetFeatureSplit(data) #SCALE FEATURES: #For RandomForest and DecisionTree, scaling is not necessary. #scaler = MinMaxScaler() #features = scaler.fit_transform(features) #Stability Selection: #http://blog.datadive.net/selecting-good-features-part-iv-stability-selection-rfe-and-everything-side-by-side/ rlasso = RandomizedLasso(random_state=2) rlasso.fit(features,labels) scores = rlasso.scores_ print scores for j in range(len(scores)): print features_list[j+1],": ",scores[j] features_list_selected = ['poi'] for j in np.where(scores > 0.3)[0]: features_list_selected.append(features_list[j+1]) print "-------------Selected features:-------------" print features_list_selected
def main():
    """Build feature masks, fit two regressors and write two submissions.

    Reads ``data/train.csv``, ``data/test.csv`` and
    ``data/sampleSubmission.csv``; one-hot encodes the categorical columns;
    computes several boolean feature masks plus an ``'ensemble'`` mask of
    features chosen by at least five of them; then writes
    ``submissions/gbm_sub.csv`` (gradient boosting predictions) and
    ``submissions/mean_sub.csv`` (mean of the rank orders of both models).
    """
    print("read train")
    df_train = pd.read_csv('data/train.csv')
    print("read test")
    df_test = pd.read_csv('data/test.csv')
    sample = pd.read_csv('data/sampleSubmission.csv')

    cats = ['var1', 'var2', 'var3', 'var4', 'var5',
            'var6', 'var7', 'var8', 'var9', 'dummy']

    print("convert mixed columns to strings")
    df_train.loc[:, cats] = df_train[cats].applymap(str)
    df_test.loc[:, cats] = df_test[cats].applymap(str)

    print("one-hot encoding")
    df_train = make_dummies(df_train, cats)
    df_test = make_dummies(df_test, cats)

    print("fill missing values")
    df_train = df_train.fillna(df_train.mean())
    df_test = df_test.fillna(df_test.mean())

    print("set binary labels")
    df_train['target_class'] = (df_train.target > 0).astype(int)
    classes = df_train.target_class.values
    loss = df_train.target.values
    df_train = df_train.drop(['target', 'id', 'target_class'], axis=1)
    df_test = df_test.drop(['id'], axis=1)

    # flag, determines whether features will be trained or read from file
    build_features = True
    if not build_features:
        feats = joblib.load('features/feats.pkl')
    else:
        train_values = df_train.values

        print("univariate feature selectors")
        selector_clf = SelectKBest(score_func=f_classif, k='all')
        selector_reg = SelectKBest(score_func=f_regression, k='all')
        selector_clf.fit(train_values, classes)
        selector_reg.fit(train_values, loss)
        pvalues_clf = selector_clf.pvalues_
        pvalues_reg = selector_reg.pvalues_
        # NaN p-values are set to 1 so they fail every threshold below.
        pvalues_clf[np.isnan(pvalues_clf)] = 1
        pvalues_reg[np.isnan(pvalues_reg)] = 1

        # put feature vectors into dictionary
        feats = {
            'univ_sub01': (pvalues_clf < 0.1) & (pvalues_reg < 0.1),
            'univ_sub005': (pvalues_clf < 0.05) & (pvalues_reg < 0.05),
            'univ_reg_sub005': (pvalues_reg < 0.05),
            'univ_clf_sub005': (pvalues_clf < 0.05),
        }

        print("randomized lasso feature selector")
        sel_lasso = RandomizedLasso(random_state=42, n_jobs=4)
        sel_lasso.fit(train_values, loss)
        # put rand_lasso feats into feature dict
        feats['rand_lasso'] = sel_lasso.get_support()

        print("l1-based feature selectors")
        X_sp = sparse.coo_matrix(train_values)
        sel_svc = LinearSVC(C=0.1, penalty="l1", dual=False,
                            random_state=42)
        sel_svc.fit(X_sp, classes)
        # NOTE(review): only strictly positive coefficients count as
        # selected here and for LogReg below -- confirm negative ones are
        # meant to be dropped.
        feats['LinearSVC'] = np.ravel(sel_svc.coef_ > 0)
        sel_log = LogisticRegression(C=0.01, random_state=42)
        sel_log.fit(X_sp, classes)
        feats['LogReg'] = np.ravel(sel_log.coef_ > 0)

        # One vote per mask; take features which get 5 or more votes.
        votes = np.zeros(len(feats['rand_lasso']))
        for mask in list(feats.values()):
            votes += mask.astype(int)
        feats['ensemble'] = votes >= 5

        joblib.dump(feats, 'features/feats.pkl', compress=3)

    xtrain = df_train.values
    xtest = df_test.values
    ensemble_mask = feats['ensemble']
    lasso_mask = feats['rand_lasso']

    print("fitting gb-regressor")
    reg_gbr = GradientBoostingRegressor(n_estimators=3000,
                                        learning_rate=0.001,
                                        max_depth=5,
                                        random_state=42,
                                        verbose=100,
                                        min_samples_leaf=5)
    reg_gbr.fit(xtrain[:, ensemble_mask], loss)
    gbr_preds = reg_gbr.predict(xtest[:, ensemble_mask])
    sample['target'] = gbr_preds
    sample.to_csv('submissions/gbm_sub.csv', index=False)

    # The linear model is fitted on standardised features.
    reg_lin = LinearRegression()
    scaler = StandardScaler()
    xtrain = scaler.fit_transform(xtrain)
    xtest = scaler.transform(xtest)
    print("fitting linear regressor")
    reg_lin.fit(xtrain[:, lasso_mask], loss)
    lin_preds = reg_lin.predict(xtest[:, lasso_mask])

    # argsort twice maps smallest value to 0, second-smallest to 1 etc.
    gbr_order = gbr_preds.argsort().argsort()
    lin_order = lin_preds.argsort().argsort()

    # averaging
    sample['target'] = np.vstack((gbr_order, lin_order)).mean(0)
    sample.to_csv('submissions/mean_sub.csv', index=False)
def main():
    """Fit an xgboost and a linear regressor on Hazard; write three CSVs.

    Reads ``./data/train.csv``, ``./data/test.csv`` and
    ``./data/sample_submission.csv``; one-hot encodes the categorical
    columns; obtains the feature masks (recomputed, or loaded from
    ``features/feats.pkl`` when ``build_features`` is False); then writes
    ``./submissions/xgb.csv``, ``./submissions/lin.csv`` and
    ``./submissions/mean.csv`` (mean of the rank orders of both models).
    """
    print("read train")
    df_train = pd.read_csv('./data/train.csv')
    print("read test")
    df_test = pd.read_csv('./data/test.csv')
    sample = pd.read_csv('./data/sample_submission.csv')

    cats = ['T1_V4', 'T1_V5', 'T1_V6', 'T1_V7', 'T1_V8', 'T1_V9',
            'T1_V11', 'T1_V12', 'T1_V15', 'T1_V16', 'T1_V17',
            'T2_V3', 'T2_V5', 'T2_V11', 'T2_V12', 'T2_V13']

    print("convert mixed columns to strings")
    df_train.loc[:, cats] = df_train[cats].applymap(str)
    df_test.loc[:, cats] = df_test[cats].applymap(str)

    print("one-hot encoding")
    df_train = make_dummies(df_train, cats)
    df_test = make_dummies(df_test, cats)

    print("set binary labels")
    df_train['hazard_class'] = (df_train.Hazard == 1).astype(int)
    classes = df_train.hazard_class.values
    hazard = df_train.Hazard.values
    df_train = df_train.drop(['Hazard', 'Id', 'hazard_class'], axis=1)
    df_test = df_test.drop(['Id'], axis=1)

    # flag, determines whether features will be trained or read from file
    build_features = False
    if not build_features:
        feats = joblib.load('features/feats.pkl')
    else:
        train_values = df_train.values

        print("univariate feature selectors")
        selector_clf = SelectKBest(score_func=f_classif, k='all')
        selector_reg = SelectKBest(score_func=f_regression, k='all')
        selector_clf.fit(train_values, classes)
        selector_reg.fit(train_values, hazard)
        pvalues_clf = selector_clf.pvalues_
        pvalues_reg = selector_reg.pvalues_
        # NaN p-values are set to 1 so they fail every threshold below.
        pvalues_clf[np.isnan(pvalues_clf)] = 1
        pvalues_reg[np.isnan(pvalues_reg)] = 1

        # put feature vectors into dictionary
        feats = {
            'univ_sub01': (pvalues_clf < 0.1) & (pvalues_reg < 0.1),
            'univ_sub005': (pvalues_clf < 0.05) & (pvalues_reg < 0.05),
            'univ_reg_sub005': (pvalues_reg < 0.05),
            'univ_clf_sub005': (pvalues_clf < 0.05),
        }

        print("randomized lasso feature selector")
        sel_lasso = RandomizedLasso(random_state=42)
        sel_lasso.fit(train_values, hazard)
        # put rand_lasso feats into feature dict
        feats['rand_lasso'] = sel_lasso.get_support()

        print("l1-based feature selectors")
        X_sp = sparse.coo_matrix(train_values)
        sel_svc = LinearSVC(C=0.1, penalty="l1", dual=False,
                            random_state=42)
        sel_svc.fit(X_sp, classes)
        feats['LinearSVC'] = np.ravel(sel_svc.coef_ > 0)
        sel_log = LogisticRegression(C=0.01, random_state=42)
        sel_log.fit(X_sp, classes)
        feats['LogReg'] = np.ravel(sel_log.coef_ > 0)

        # One vote per mask; take features which get 5 or more votes.
        votes = np.zeros(len(feats['rand_lasso']))
        for mask in list(feats.values()):
            votes += mask.astype(int)
        feats['ensemble'] = votes >= 5

        joblib.dump(feats, './features/feats.pkl', compress=3)

    xtrain = df_train.values
    xtest = df_test.values
    ensemble_mask = feats['ensemble']
    lasso_mask = feats['rand_lasso']

    print("fitting xgb-regressor")
    params = {}
    params["objective"] = "reg:linear"
    params["eta"] = 0.01
    params["max_depth"] = 7
    params["subsample"] = 0.8
    params["colsample_bytree"] = 0.8
    params["min_child_weight"] = 5
    params["silent"] = 1
    plst = list(params.items())
    num_rounds = 600

    # create the train and test dmatrices
    xgtrain = xgb.DMatrix(xtrain[:, ensemble_mask], label=hazard)
    xgtest = xgb.DMatrix(xtest[:, ensemble_mask])
    reg_xgb = xgb.train(plst, xgtrain, num_rounds)
    xgb_preds = reg_xgb.predict(xgtest)
    sample['Hazard'] = xgb_preds
    sample.to_csv('./submissions/xgb.csv', index=False)

    # The linear model is fitted on standardised features.
    reg_lin = LinearRegression()
    scaler = StandardScaler()
    xtrain = scaler.fit_transform(xtrain)
    xtest = scaler.transform(xtest)
    print("fitting linear regressor")
    reg_lin.fit(xtrain[:, lasso_mask], hazard)
    lin_preds = reg_lin.predict(xtest[:, lasso_mask])
    sample['Hazard'] = lin_preds
    sample.to_csv('./submissions/lin.csv', index=False)

    # argsort twice maps smallest value to 0, second-smallest to 1 etc.
    xgb_order = xgb_preds.argsort().argsort()
    lin_order = lin_preds.argsort().argsort()

    # averaging
    sample['Hazard'] = np.vstack((xgb_order, lin_order)).mean(0)
    sample.to_csv('./submissions/mean.csv', index=False)
def _accuracy(y_true, y_pred):
    """Return the fraction of positions at which the two label lists agree."""
    total = len(y_pred)
    if not total:
        return 0.0
    matches = sum(1 for yi, y_hati in zip(y_true, y_pred) if yi == y_hati)
    return matches / float(total)


def main():
    """Select features with RandomizedLasso, fit a decision tree, predict.

    The first ``train_size`` rows returned by ``get_data('data')`` are the
    training set; the last column holds the label. Features are reduced
    with RandomizedLasso, a DecisionTreeClassifier is fitted on the reduced
    data, the training accuracy is printed, and the predictions for the
    second array returned by ``get_data`` are passed to ``store_csv``.

    Fixes relative to the earlier version:
    * elapsed times are computed where they are printed (``duration`` used
      to be read before assignment, raising NameError);
    * accuracy counts start at 0 and use float division;
    * validation rows go through the fitted selector before prediction.
    """
    start = time.time()
    MAX_TRAIN_SIZE = 126838
    train_size = 20000
    val_size = MAX_TRAIN_SIZE - train_size

    data, test_data = get_data('data')
    X = data[0:train_size, 0:-1]
    y = list(data[0:train_size, -1])
    print(X.shape)
    print(len(y))

    # use randomized lasso for feature selection
    clfR = RandomizedLasso(
        alpha='aic',
        scaling=0.5,
        sample_fraction=0.75,
        n_resampling=200,
        selection_threshold=0.25,
        fit_intercept=True,
        verbose=False,
        normalize=True,
        precompute='auto',
        max_iter=500,
        eps=2.2204460492503131e-16,
        random_state=None,
        n_jobs=1,
        pre_dispatch='3*n_jobs',
    )
    clfR.fit(X, y)

    # Transform train data to the selected features. The copy is a little
    # hack to fix an "assignment destination is read-only" error.
    X = clfR.transform(np.array(X).copy())

    # Transform the quiz dataset the same way.
    test_data = clfR.transform(np.array(test_data).copy())

    print('Dimensions after feature Reduction: ' + str(X.shape))
    print("Elapsed Time For Feature Reduction: "
          + str(time.time() - start))

    # Training classifier
    clf1 = DecisionTreeClassifier(criterion='gini', splitter='best',
                                  max_depth=None, min_samples_split=2,
                                  min_samples_leaf=1,
                                  min_weight_fraction_leaf=0.0,
                                  max_features=None, random_state=None,
                                  max_leaf_nodes=None, class_weight=None,
                                  presort=False)
    clf1.fit(X, y)
    print("Elapsed Time For Classifier Training: "
          + str(time.time() - start))

    # predict & calculate training accuracy
    y_hat = clf1.predict(X)
    print("train: " + str(_accuracy(y, y_hat)))

    # validation data - calculate validation accuracy
    val_start = train_size
    val_end = train_size + val_size
    # TODO: put this back in. val_size equals MAX_TRAIN_SIZE - train_size,
    # so this condition is currently never true and validation is skipped.
    if MAX_TRAIN_SIZE - train_size > val_size:
        print("Beginning test validation...")
        X_val = np.array(data[val_start:val_end, 0:-1]).copy()
        y_val = list(data[val_start:val_end, -1])
        # The classifier was fitted on reduced features, so validation rows
        # need the same reduction.
        X_val = clfR.transform(X_val)
        y_val_hat = clf1.predict(X_val)
        print("val: " + str(_accuracy(y_val, y_val_hat)))

    # quiz data
    print("Beginning quiz validation...")
    X_test = test_data[:, :]
    print(X_test.shape)
    y_test_hat = clf1.predict(X_test)
    store_csv(y_test_hat, "prediction")

    duration = time.time() - start
    print("Took this many seconds: " + str(duration))
for key in final_feats: final_inputs[x][count] = final_feats[key][x] count = count+1 inputs = [input for input in final_inputs.values()] # Recursive feature elimination svr = SVR(kernel="linear") rfe = RFE(svr, step=1) rfe = rfe.fit(inputs,outputs[1]) rfe.support_ rfe.ranking_ # selected features by RFE selected_features = [] count = 0 for key in final_feats.keys(): if (rfe.support_[count] == True): selected_features.append(key) count = count + 1 # Randomized Lasso for feature selection rlasso = RandomizedLasso(alpha=1) rlasso.fit(inputs, outputs[2]) rlasso.scores_
def main(train_label, train_feat, modelsdir, selfeat):
    """Train an ExtraTrees regressor and persist it with its preprocessing.

    The training features are standardised, optionally reduced with
    RandomizedLasso, and used to tune an ExtraTreesRegressor via randomized
    search. A fresh estimator built from the best parameters is fitted and
    dumped to ``modelsdir`` next to the scaler and, when feature selection
    ran, the selector.

    Parameters
    ----------
    train_label : str
        Path of the space-delimited label file.
    train_feat : str
        Path of the space-delimited feature file.
    modelsdir : str
        Directory that receives ``XRT.pkl``, ``scaler.pkl``, and when
        selection ran, ``sel_est.pkl`` plus the ``.idx`` / ``.mask`` files.
    selfeat : int or str
        Feature selection is performed when ``int(selfeat)`` is non-zero.

    Notes
    -----
    Reads ``n_jobs``, ``RR_Iter`` and ``folds`` from the module-level
    ``config`` mapping. ``sel_est.pkl`` is only written when feature
    selection ran; it used to be dumped unconditionally, raising NameError
    when ``selfeat`` was 0.
    """
    n_jobs = int(config['n_jobs'])

    X_train = np.nan_to_num(np.genfromtxt(train_feat, delimiter=' '))
    y_train = np.nan_to_num(np.genfromtxt(train_label, delimiter=' '))

    scaler = StandardScaler().fit(X_train)
    X_trains = scaler.transform(X_train)

    # performs feature selection
    sel_est = None
    if int(selfeat):
        print("Performing feature selection ...")
        sel_est = RandomizedLasso(alpha="bic", verbose=True, max_iter=1000,
                                  n_jobs=n_jobs, random_state=42,
                                  n_resampling=1000)
        sel_est.fit(X_trains, y_train)
        X_trains = sel_est.transform(X_trains)
        selected_mask = sel_est.get_support()
        selected_features = sel_est.get_support(indices=True)
        sel_feats_path = os.sep.join([modelsdir,
                                      os.path.basename(train_feat)])
        # saves indices
        np.savetxt(sel_feats_path + ".idx", selected_features, fmt="%d")
        # saves mask
        np.save(sel_feats_path + ".mask", selected_mask)

    estimator = ExtraTreesRegressor(random_state=42, n_jobs=n_jobs)
    mae_scorer = make_scorer(mean_absolute_error, greater_is_better=False)

    # performs parameter optimization using random search
    print("Performing parameter optimization ... ")
    param_distributions = {
        "n_estimators": [5, 10, 50, 100, 200, 500],
        "max_depth": [3, 2, 1, None],
        "max_features": ["auto", "sqrt", "log2",
                         int(X_trains.shape[1] / 2.0)],
        "min_samples_split": sp_randint(1, 11),
        "min_samples_leaf": sp_randint(1, 11),
        "bootstrap": [True, False],
    }
    search = RandomizedSearchCV(estimator, param_distributions,
                                n_iter=int(config['RR_Iter']),
                                scoring=mae_scorer, n_jobs=n_jobs,
                                refit=True,
                                cv=KFold(X_train.shape[0],
                                         int(config['folds']),
                                         shuffle=True, random_state=42),
                                verbose=1, random_state=42)

    # fits model using best parameters found
    search.fit(X_trains, y_train)

    best = search.best_params_
    estimator2 = ExtraTreesRegressor(
        bootstrap=best["bootstrap"],
        max_depth=best["max_depth"],
        max_features=best["max_features"],
        min_samples_leaf=best["min_samples_leaf"],
        min_samples_split=best["min_samples_split"],
        n_estimators=best["n_estimators"],
        verbose=1, random_state=42, n_jobs=n_jobs)
    print("Train the model with the best parameters ...")
    estimator2.fit(X_trains, y_train)

    from sklearn.externals import joblib
    joblib.dump(estimator2, modelsdir + "/XRT.pkl")
    joblib.dump(scaler, modelsdir + "/scaler.pkl")
    if sel_est is not None:
        joblib.dump(sel_est, modelsdir + "/sel_est.pkl")
if attribute is "_all": continue else: # select the columns containing the attribute attribute_columns=filter(lambda x:re.search(attribute,x), data.iloc[:,10:].columns) X = data[attribute_columns[:20]] # use only 20 mode paramteres remove_highly_correlated(X,threshold=0.98) print(X.columns.values) list_dicts = list() for train_index, test_index in skf: X_train, X_test = X.iloc[train_index], X.iloc[test_index] y_train, y_test = y[train_index], y[test_index] print(X_train.shape) if feature_selection == "randomized_lasso": feature_selector=RandomizedLasso(sample_fraction=0.5,n_resampling=50,verbose=False,n_jobs=-1) elif feature_selection == "RFECV_linearSVM": # print(feature_selection % "selected") feature_selector = RFECV(SVC(kernel="linear"),step=1,cv=StratifiedKFold(y,5),scoring="accuracy") else: print("Options are: randomized_lasso, RFECV_linearSVM") feature_selector.fit(X_train,y_train) result = {'X_train':X_train,'y_train':y_train,'X_test':X_test,'y_test':y_test,'feature_selector':feature_selector} list_dicts.append(result) dict_for_attribute[attribute] = list_dicts print("done in %0.3fs" % (time()-t0))
class LinearAll:
    """
    A repertoire of linear variable selection and prediction models.

    Variables are selected with a lasso (see ``fit``); PLS and ridge
    regressions are then tuned by grid search on the selected columns (see
    ``run_models``), and ``predict`` uses whichever of the two scored best.

    Parameters
    ----------
    cv :
        Passed as ``cv`` to every cross-validated estimator built here.
    scoring : string
        Passed as ``scoring`` to the grid searches.
    n_jobs : int, optional
        Passed as ``n_jobs`` to the cross-validated estimators and to the
        randomized lasso (default 1).
    refit : boolean
        Passed to the grid searches. ``predict`` asserts that it is True,
        since it relies on the refitted ``best_estimator_``.
    iid : boolean, optional
        Passed as ``iid`` to the grid searches.
    pre_pred : boolean
        Stored on the instance. NOTE(review): ``fit`` resets it to False
        before testing it, so it currently has no effect.
    param_ridge_post : list
        Ridge ``alpha`` grid used after variable selection.
    rlasso_selection_threshold : float
        0 selects variables with a single LassoLars refit; any other value
        is used as the ``selection_threshold`` of a RandomizedLasso.
    """

    def __init__ (self, cv=20, scoring = 'mean_squared_error',
                  n_jobs=1, refit=False, iid=False, pre_pred=True,
                  param_ridge_post=list(np.arange(1,3,0.1)),
                  rlasso_selection_threshold = 0.5):
        # Every argument is stored unchanged under the same name.
        self.cv = cv
        self.scoring = scoring
        self.n_jobs = n_jobs
        self.refit = refit
        self.iid = iid
        self.pre_pred = pre_pred
        self.param_ridge_post = param_ridge_post
        self.rlasso_selection_threshold = rlasso_selection_threshold

    def run_models(self, X, y, param_ridge):
        """
        Grid-search a PLS and a ridge regression on (X, y).

        PLS is searched over ``n_components`` in ``range(1, 5)``; ridge is
        searched over the ``alpha`` values in ``param_ridge``.

        Returns
        -------
        (pls_cv, ridge_cv) : tuple of the two fitted GridSearchCV objects.
        """
        def grid_search(estimator, grid):
            # Both searches share the instance-level settings.
            searcher = GridSearchCV(estimator, [grid],
                                    cv=self.cv, scoring=self.scoring,
                                    n_jobs=self.n_jobs,
                                    refit=self.refit, iid=self.iid)
            searcher.fit(X, y)
            return searcher

        pls_cv = grid_search(PLSRegression(),
                             {'n_components': range(1, 5)})
        ridge_cv = grid_search(linear_model.Ridge(alpha = 1),
                               {'alpha': param_ridge})
        return (pls_cv, ridge_cv)

    def fit(self, X, y):
        """
        Select variables with a lasso model, then fit the prediction models.

        Parameters
        ----------
        X : numpy array or sparse matrix of shape [n_samples,n_features]
            Training data
        y : numpy array of shape [n_samples, n_targets]
            Target values

        Returns
        -------
        self : returns an instance of self.
        """
        # NOTE(review): forced to False immediately before the test, so the
        # pre-selection models never run -- confirm this is intentional.
        self.pre_pred = False
        if self.pre_pred:
            print("Computing ... ")
            param_ridge_pre = list(np.arange(1e9,2e9,1e8))
            pre_models = self.run_models(X, y, param_ridge_pre)
            self.pls_pre, self.ridge_pre = pre_models

        # ---- Lasso variable selection ---------------------------------
        lars_budget = X.shape[1]+1000
        tiny = 2.2204460492503131e-16
        self.lasso_cv = LassoLarsCV(fit_intercept=True, normalize=True,
                                    precompute='auto',
                                    max_iter=lars_budget,
                                    max_n_alphas=lars_budget,
                                    eps=tiny, copy_X=True,
                                    cv=self.cv, n_jobs=self.n_jobs)
        self.lasso_cv.fit(X, y)

        if self.rlasso_selection_threshold != 0:
            # Stability selection at the cross-validated alpha.
            self.rlasso = RandomizedLasso(
                alpha=self.lasso_cv.alpha_,
                scaling=0.5,
                sample_fraction=0.75,
                n_resampling=200,
                selection_threshold=self.rlasso_selection_threshold,
                fit_intercept=True,
                verbose=False,
                normalize=True,
                precompute='auto',
                max_iter=500,
                eps=tiny,
                random_state=None,
                n_jobs=self.n_jobs,
                pre_dispatch='3*n_jobs',
            )
            self.rlasso.fit(X, y)
            X_selected = self.rlasso.transform(X)
        else:
            # Threshold 0: keep the columns with a non-zero coefficient in
            # one LassoLars fit at the cross-validated alpha.
            self.lasso_refit = linear_model.LassoLars(
                alpha=self.lasso_cv.alpha_,
                fit_intercept=True,
                normalize=True,
                precompute='auto',
                max_iter=lars_budget,
                eps=tiny,
                copy_X=True,
                fit_path=False,
            )
            self.lasso_refit.fit(X, y)
            self.active = self.lasso_refit.coef_ != 0
            # Only the first row of the coefficient mask is kept.
            self.active = self.active[0,:]
            X_selected = X[:, self.active]

        # ---- Post variable selection predictions ----------------------
        post_models = self.run_models(X_selected, y, self.param_ridge_post)
        self.pls_post, self.ridge_post = post_models
        return self

    def predict(self, X_test):
        """
        Predict with the better-scoring of the two post-selection models.

        PLS is chosen only when its ``best_score_`` is strictly greater
        than ridge's. The chosen search object is kept in
        ``self.best_model``, and ``X_test`` is reduced to the selected
        columns before prediction.
        """
        assert(self.refit == True)

        pls_wins = self.pls_post.best_score_ > self.ridge_post.best_score_
        if pls_wins:
            self.best_model = self.pls_post
            print("Chosen Model: pls")
        else:
            self.best_model = self.ridge_post
            print("Chosen Model: ridge")

        if self.rlasso_selection_threshold != 0:
            X_test_selected = self.rlasso.transform(X_test)
        else:
            X_test_selected = X_test[:, self.active]
        return self.best_model.best_estimator_.predict(X_test_selected)
from sklearn.cross_validation import train_test_split
from sklearn.linear_model import RandomizedLasso
from scipy import io as sio
from tensorflow.python.framework import ops
from dfs2 import DeepFeatureSelectionNew
import numpy as np
from sklearn.datasets import make_classification
from sklearn.preprocessing import normalize

# Alternative locations of the same .mat file on other machines:
# ourdataB = sio.loadmat("/Volumes/TONY/Regeneron/Data/OriginalData/newDataB_2labels.mat")
# ourdataB = sio.loadmat("/home/REGENERON/xupeng.tong/newDataB_2labels.mat")
ourdataB = sio.loadmat("/Users/xupeng.tong/Documents/Data/OriginalData/newDataB_2labels.mat")

# Columns of X are normalised (axis=0); Y is taken from the first row of
# the stored label matrix.
inputX = ourdataB['X']
inputX = normalize(inputX, axis=0)
inputY = ourdataB['Y'][0, :]
columnNames = ourdataB['columnNames']

X_train, X_test, y_train, y_test = train_test_split(
    inputX, inputY, test_size=0.2, random_state=42)

# Fit the selector on the training split only and reuse its mask for both
# splits.
randomized_lasso = RandomizedLasso()
randomized_lasso.fit(X_train, y_train)
featureMask = randomized_lasso.get_support()

X_train_lasso = X_train[:, featureMask]
# Fixed: the test matrix used to be sliced from X_train.
X_test_lasso = X_test[:, featureMask]

# Names of the selected columns (previously a bare, discarded expression).
# NOTE(review): the [:100] slice presumably matches the feature count --
# confirm against the data file.
selectedColumnNames = columnNames[0][:100][featureMask]

# Fixed: both matrices were stored under the key 'X_train_lasso', so the
# second silently replaced the first.
sio.savemat('RandomLasso-result', {'X_train_lasso': X_train_lasso,
                                   'X_test_lasso': X_test_lasso,
                                   'featureMask': featureMask})