def naive_bayes_classifier(data, labels, columns):
    print('Applying Naive Bayes classification')
    # create param grid over the number of PCA components
    n_numeric = len([c.TYPE for c in columns
                     if c.TYPE is Types.NUMERICAL and c.CATEGORIES is None])
    n_components = list(range(1, data.shape[1] + 1))
    parameters = dict(pca__n_components=n_components)
    # create model pipeline
    ns = NumericScaler(n_numeric, with_std=False)
    rf = RandomForestClassifier()  # random_state=2
    rfe = feature_selection.RFE(rf)  # note: built but unused in this pipeline
    pca = decomposition.PCA()
    gnb = GaussianNB()
    pipe = Pipeline(steps=[('ns', ns), ('pca', pca), ('gnb', gnb)])
    # run grid search with 10-fold cross validation
    clf = GridSearchCV(pipe, parameters, cv=10, verbose=1)
    clf.fit(data, labels)
    pred = clf.predict(data)
    print('accuracy: %0.3f' % clf.best_score_)
    print('Best parameters set:')
    best_parameters = clf.best_estimator_.get_params()
    for param_name in sorted(parameters.keys()):
        print('\t%s: %r' % (param_name, best_parameters[param_name]))
    print()
    print(classification_report(labels, pred))
    return clf
def main():
    # company_names, X, Y, Y_citation = load_data(network_folder)
    # X = normalize(X, axis=1)
    company_names, X, Y = load_data_combined("../data/networks/",
                                             "../data/citation_networks/", 17)
    # company_names, X, Y = load_data("../data/citation_networks/", 8)
    print(X[1, :])
    lr = linear_model.HuberRegressor()
    sel = feature_selection.RFE(lr, n_features_to_select=11)
    # sel = feature_selection.SelectKBest(feature_selection.f_regression, k=8)  # 3.856
    X = sel.fit_transform(X, Y)
    print("sup", sel.get_support())
    # lr = linear_model.RANSACRegressor()
    if use_pca:
        pca = PCA(n_components=pca_components)
        X = pca.fit_transform(X)
    # run k-fold cross validation and prediction simultaneously
    Y_pred = cross_val_predict(lr, X, Y, cv=8)
    if use_ranking:
        Y = convert_to_rank(Y)
        Y_pred = convert_to_rank(Y_pred)
    # F_scores, p_values = f_regression(X, Y)
    # print(F_scores)
    # print(p_values)
    # for pred_pair in zip(Y, Y_pred):
    #     print("Actual: %s, Predicted: %s" % pred_pair)
    print("Mean Absolute Error: %s" % mean_absolute_error(Y, Y_pred))
    print("Ground Truth StdDev: %s" % np.std(Y))
    lr.fit(X, Y_pred)  # note: fits to the CV predictions, not Y, before reading coef_
    print(lr.coef_)
def svm_classifier(data, labels, columns):
    print('Applying SVM classification with RBF kernel')
    # create param grid over C and gamma
    n_numeric = len([c.TYPE for c in columns
                     if c.TYPE is Types.NUMERICAL and c.CATEGORIES is None])
    C = [0.1, 1, 10, 100, 1000]
    gamma = ['auto', 1, 0.1, 0.001, 0.0001]
    parameters = dict(svm__C=C, svm__gamma=gamma)
    # create model pipeline: scale, RFE with a random forest, then SVM
    ns = NumericScaler(n_numeric)
    rf = RandomForestClassifier()  # random_state=2
    rfe = feature_selection.RFE(rf)
    svm = SVC(kernel='rbf')  # random_state=17
    pipe = Pipeline(steps=[('ns', ns), ('rfe', rfe), ('svm', svm)])
    # run grid search with 10-fold cross validation
    clf = GridSearchCV(pipe, parameters, cv=10, verbose=1)
    clf.fit(data, labels)
    pred = clf.predict(data)
    print('accuracy: %0.3f' % clf.best_score_)
    print('Best parameters set:')
    best_parameters = clf.best_estimator_.get_params()
    for param_name in sorted(parameters.keys()):
        print('\t%s: %r' % (param_name, best_parameters[param_name]))
    print()
    print(classification_report(labels, pred))
    return clf
def knn_classifier(data, labels, columns):
    print('Applying k-nearest neighbor classification')
    # create param grid over the number of neighbors
    n_numeric = len([c.TYPE for c in columns
                     if c.TYPE is Types.NUMERICAL and c.CATEGORIES is None])
    n_neighbors = list(range(1, 51))
    parameters = dict(knn__n_neighbors=n_neighbors)
    # create model pipeline: scale, RFE with a random forest, then k-NN
    ns = NumericScaler(n_numeric)
    rf = RandomForestClassifier()  # random_state=8
    knn = KNeighborsClassifier()
    rfe = feature_selection.RFE(rf)
    pipe = Pipeline(steps=[('ns', ns), ('rfe', rfe), ('knn', knn)])
    # run grid search with 10-fold cross validation
    clf = GridSearchCV(pipe, parameters, cv=10, verbose=1)
    clf.fit(data, labels)
    pred = clf.predict(data)
    print('accuracy: %0.3f' % clf.best_score_)
    print('Best parameters set:')
    best_parameters = clf.best_estimator_.get_params()
    for param_name in sorted(parameters.keys()):
        print('\t%s: %r' % (param_name, best_parameters[param_name]))
    print()
    print(classification_report(labels, pred))
    return clf
def main():
    df_train = pd.read_csv('data/train_data.csv')
    df_valid = pd.read_csv('data/valid_data.csv')
    df_test = pd.read_csv('data/test_data.csv')
    feature_cols = [f for f in list(df_train) if "feature" in f]
    target_col = df_train.columns[-1]
    X_train = df_train[feature_cols].values
    y_train = df_train[target_col].values
    X_valid = df_valid[feature_cols].values
    y_valid = df_valid[target_col].values
    X_test = df_test[feature_cols].values
    estimator = LogisticRegression(C=10.0)
    rfe = feature_selection.RFE(estimator=estimator,
                                n_features_to_select=reduction_dim,
                                verbose=1)
    print('Fitting Recursive Feature Elimination on data...')
    X_train_rfe = rfe.fit_transform(X_train, y_train)
    # transform (not refit) the validation split so all splits share the
    # feature subset selected on the training data
    X_valid_rfe = rfe.transform(X_valid)
    X_test_rfe = rfe.transform(X_test)
    print('Saving...')
    save_path = 'data/rfe_selection_data_{}d.npz'.format(reduction_dim)
    np.savez(save_path,
             train=X_train_rfe,
             valid=X_valid_rfe,
             test=X_test_rfe)
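# Hedged usage sketch (not part of the original): reload the reduced splits
# saved above; assumes the same free variable reduction_dim used when saving.
import numpy as np
arrays = np.load('data/rfe_selection_data_{}d.npz'.format(reduction_dim))
X_train_rfe = arrays['train']
X_valid_rfe = arrays['valid']
X_test_rfe = arrays['test']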
def get_rfe(X_train, y_train, step=5):
    BEST_FITTED = get_best_fitted(X_train, y_train)
    # KNN exposes neither coef_ nor feature_importances_, so RFE cannot rank
    # features with it; the knn entries therefore rank with the random forest
    # while keeping KNN as the downstream estimator.
    RFE = {
        "svm-rfe": {
            "model": feature_selection.RFE(CLASSIFIERS['svm'], step=step),
            "estimator": CLASSIFIERS['svm']
        },
        "rf-rfe": {
            "model": feature_selection.RFE(CLASSIFIERS['rf'], step=step),
            "estimator": CLASSIFIERS['rf']
        },
        "knn-rfe": {
            "model": feature_selection.RFE(CLASSIFIERS['rf'], step=step),
            "estimator": CLASSIFIERS['knn']
        },
        "nb-bernoulli-rfe": {
            "model": feature_selection.RFE(CLASSIFIERS['nb-bernoulli'], step=step),
            "estimator": CLASSIFIERS['nb-bernoulli']
        },
        "svm-grid-rfe": {
            "model": feature_selection.RFE(BEST_FITTED['svm-grid'], step=step),
            "estimator": BEST_FITTED['svm-grid']
        },
        "rf-grid-rfe": {
            "model": feature_selection.RFE(BEST_FITTED['rf-grid'], step=step),
            "estimator": BEST_FITTED['rf-grid']
        },
        "knn-grid-rfe": {
            "model": feature_selection.RFE(BEST_FITTED['rf-grid'], step=step),
            "estimator": BEST_FITTED['knn-grid']
        },
    }
    return RFE
def get_fs_model(model, method, train, target=None, cv=None):
    """Connects the given model with the specified feature selection method
    and trains the final structure.
    """
    if method == "RFE":
        model = fs_scikit.RFE(model, n_features_to_select=2, step=5)
        # RFE is supervised, so a target is needed to fit
        if target is not None:
            return model.fit(train, target)
        else:
            return model.fit(train)
    elif method == "RFECV":
        model = fs_scikit.RFECV(model, step=3, cv=cv)
        if target is not None:
            return model.fit(train, target)
        else:
            return model.fit(train)
    elif method == "linearSVC":
        sel = SelectFromModel(LinearSVC(penalty='l1', dual=False))
        model = Pipeline([('feature_selection', sel), ('data_mining', model)])
    elif method == "fromModel":
        fm = fs_scikit.SelectFromModel(model)
        if target is not None:
            fm.fit(train, target)
        else:
            fm.fit(train)
        model = Pipeline([('feature_selection', fm), ('data_mining', model)])
    # elif method == "Anova":
    #     # ANOVA SVM-C
    #     anova_filter = fs_scikit.SelectKBest(f_regression, k=5)
    #     model = Pipeline([('feature_selection', anova_filter),
    #                       ('data_mining', model)])
    elif method == "VarianceThreshold":
        sel = fs_scikit.VarianceThreshold(threshold=(.8 * (1 - .8)))
        model = Pipeline([('feature_selection', sel), ('data_mining', model)])
    elif method == "SelectPercentile":
        sel = fs_scikit.SelectPercentile(fs_scikit.f_classif, percentile=30)
        model = Pipeline([('feature_selection', sel), ('data_mining', model)])
    elif method == "SelectFpr":
        sel = fs_scikit.SelectFpr(alpha=0.2)
        model = Pipeline([('feature_selection', sel), ('data_mining', model)])
    elif method == "SelectFdr":
        sel = fs_scikit.SelectFdr(alpha=0.2)
        model = Pipeline([('feature_selection', sel), ('data_mining', model)])
    elif method == "SelectFwe":
        sel = fs_scikit.SelectFwe(alpha=0.2)
        model = Pipeline([('feature_selection', sel), ('data_mining', model)])
    elif method == "ch2":  # chi-squared
        sel = fs_scikit.SelectKBest(fs_scikit.chi2, k=2)
        model = Pipeline([('feature_selection', sel), ('data_mining', model)])
    else:
        print("Feature selection method was not found: " + method)
        sys.exit(1)
    return model
def select_ests(X, y, nfeats, clf):
    rfe = feature_selection.RFE(estimator=clf,
                                n_features_to_select=nfeats,
                                step=10)
    rfe.fit(X, y)
    ranking = rfe.ranking_
    # rank 1 marks the features that are kept
    rks = ranking[ranking == 1]
    return rks
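# Sketch (a variant, not the original API): ranking[ranking == 1] above yields
# an array of ones whose length equals the number of selected features; when
# the selected column indices are wanted instead, the fitted selector's
# boolean support mask is more direct.
import numpy as np

def selected_indices(rfe):
    # indices of the columns RFE kept (rank 1)
    return np.where(rfe.support_)[0]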
def logreg_model_result(self, X_train, Y_train, X_test, Y_test, iter):
    logreg_model = LogisticRegression(max_iter=iter)
    rfe = feature_selection.RFE(logreg_model, n_features_to_select=20)
    # t0 = time.time()
    rfe = rfe.fit(X_train, Y_train)
    # print("training time:", round(time.time() - t0, 3), "s")
    logreg_y_pred = rfe.predict(X_test)
    return (metrics.accuracy_score(Y_test, logreg_y_pred),
            metrics.f1_score(Y_test, logreg_y_pred, average='macro'),
            metrics.recall_score(Y_test, logreg_y_pred, average='macro'),
            metrics.precision_score(Y_test, logreg_y_pred, average='macro'))
def learning_by_target_lasso(self, X, y, alpha, input_c=None):
    # alphas = np.logspace(-3, 0, 20)
    print("Finding the most important half of {} features".format(len(X[0])))
    # despite the name, this uses an L2-penalised logistic regression, not lasso
    regr = linear_model.LogisticRegression(penalty="l2", C=input_c,
                                           n_jobs=-1, solver="newton-cg")
    rfe = feature_selection.RFE(regr)
    rfe.fit(X, y)
    new_dict = dict()
    for index, code in enumerate(list(self.code_dict)):
        # new_dict[code] = regr.coef_[0][index]
        new_dict[code] = rfe.ranking_[index]
    return new_dict
def __init__(self):
    f5 = feature_selection.RFE(estimator=MultinomialNB(),
                               n_features_to_select=100000,
                               step=100,
                               verbose=1)
    pipeline = Pipeline([
        ('rfe_feature_selection', f5),
        ('clf', MultinomialNB()),
    ])
    self.clf = pipeline
def ref(arr0, target, n_features):  # recursive feature elimination (RFE)
    from sklearn.linear_model import LogisticRegression
    matrix = np.array(arr0)
    target = np.array(target)
    temp = feature_selection.RFE(
        estimator=LogisticRegression(),
        n_features_to_select=n_features).fit(matrix, target)
    scores = temp.ranking_.tolist()
    indx = temp.support_.tolist()
    # result = data_utility.retrieve_nan_index(temp.transform(matrix).tolist(), index)
    result = temp.transform(matrix).tolist()
    return scores, indx, result
def rfe_svm(X, y):
    clf = linear_model.SGDClassifier(loss='hinge',
                                     penalty='elasticnet',
                                     max_iter=1000,
                                     alpha=1e-9,
                                     tol=1e-3,
                                     random_state=123456,
                                     class_weight={0: 0.044, 1: 1 - 0.044})
    cv = model_selection.ShuffleSplit(n_splits=10,
                                      test_size=0.1,
                                      random_state=123456)
    nb_features = X.shape[1]
    print(nb_features)
    scores = model_selection.cross_validate(
        clf, X, y, cv=cv,
        scoring=['precision', 'recall', 'f1'],
        return_train_score=True)
    print(scores)
    if nb_features > 1:
        # drop one feature, then recurse on the reduced matrix
        rfe = feature_selection.RFE(clf,
                                    n_features_to_select=nb_features - 1,
                                    step=1)
        rfe.fit(X, y)
        output = rfe_svm(rfe.transform(X), y)
        output.append([nb_features,
                       np.mean(scores['test_precision']),
                       np.mean(scores['test_recall']),
                       np.mean(scores['test_f1']),
                       rfe.support_, rfe.ranking_])
        return output
    else:
        return [[nb_features,
                 np.mean(scores['test_precision']),
                 np.mean(scores['test_recall']),
                 np.mean(scores['test_f1']),
                 [True], [1]]]
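# Alternative sketch (an option, not the author's method): sklearn's RFECV
# runs the same shrink-by-one scan as the recursion in rfe_svm, scoring each
# feature count by cross-validation in a single call. It returns a fitted
# selector rather than the per-size score table built above.
from sklearn import feature_selection

def rfe_svm_cv(X, y, clf, cv):
    rfecv = feature_selection.RFECV(clf, step=1, cv=cv, scoring='f1')
    rfecv.fit(X, y)
    return rfecv  # rfecv.n_features_ holds the CV-best feature count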
def calcFoldScores(nFold=10):
    # kFold = sk_ms.KFold(nFold).split(X, y)
    foldReturns = []
    # penalty='l1' needs a compatible solver in current scikit-learn
    lr = sk_lm.LogisticRegression(penalty='l1', C=10000, solver='liblinear')
    for trainIndex, testIndex in sk_ms.KFold(nFold).split(X, y):
        xTrain, yTrain = X[trainIndex], y[trainIndex]
        xTest, yTest = X[testIndex], y[testIndex]
        featureReturns = []
        for nFeatures in range(1, 14):
            rfe = sk_fs.RFE(lr, n_features_to_select=nFeatures)
            rfe.fit(xTrain, yTrain)
            score = rfe.score(xTest, yTest)
            featureReturns.append((nFeatures, score))
        foldReturns.append(featureReturns)
    returns = np.array(foldReturns)
    return returns
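# Follow-up sketch: average calcFoldScores' per-fold scores to pick the
# feature count with the best mean accuracy (assumes the (nFold, 13, 2)
# array shape returned above).
import numpy as np
returns = calcFoldScores(10)
mean_scores = returns[:, :, 1].mean(axis=0)  # mean score per feature count
best_n = int(returns[0, int(mean_scores.argmax()), 0])
print("best n_features_to_select:", best_n)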
def selectfeatures(dfn, toexclude, topredict):
    assert isinstance(dfn, pd.DataFrame), 'Argument of wrong type!'
    assert isinstance(toexclude, list), 'Argument of wrong type!'
    assert isinstance(topredict, str), 'Argument of wrong type!'
    dfn = dfn.select_dtypes(include=[np.number]).copy()
    # exclude the features we are not predicting from
    feature_cols = [x for x in dfn.columns.values.tolist()
                    if x not in toexclude]
    print(feature_cols)
    XO = dfn[feature_cols]
    YO = dfn[topredict]
    estimator = svm.SVR(kernel="linear")
    selector = feature_selection.RFE(estimator, n_features_to_select=5, step=1)
    selector = selector.fit(XO, YO)
    # from the ranking, select the predictors with rank 1
    select_features = np.array(feature_cols)[selector.ranking_ == 1].tolist()
    print("Features: ", select_features)
    return select_features
def rfeCV(self, n_folds=5):
    # run RFE once per fold and keep only the features selected in every fold
    self.important_features = [True] * (len(self.train_X.columns) - 1)
    for fold in range(n_folds):
        rfe_obj = feature_selection.RFE(
            self.estimator, n_features_to_select=self.n_features)
        fold_index = self.train_X[self.train_X.kfold != fold].index
        df = self.train_X[self.train_X.kfold != fold].copy()
        df.pop('kfold')
        rfe_obj.fit(df, self.train_y.loc[fold_index, :].values.ravel())
        assert len(self.important_features) == len(rfe_obj.get_support())
        tup = zip(self.important_features, rfe_obj.get_support())
        self.important_features = [i[0] and i[1] for i in tup]
        print("Completed for fold {}".format(fold))
    self.important_features = df.columns[self.important_features]
    self.train_X = self.train_X[self.important_features]
    self.test_df = self.test_df[self.important_features]
def linear_regressor_test(features, target, testing_data, solutions):
    # rank features with a linear SVC, keep the top 7
    svc = SVC(kernel="linear")
    dim = feature_selection.RFE(estimator=svc, n_features_to_select=7)
    feat = dim.fit_transform(features, target)
    print(dim.n_features_to_select)
    lr = LR()
    # np.matrix is deprecated; plain arrays work with scikit-learn
    lr.fit(np.asarray(feat), np.ravel(target))
    testing_data = dim.transform(testing_data)
    predictions = lr.predict(np.asarray(testing_data)).tolist()
    predictions = list(map(constraints, predictions))
    score = metrics.mean_squared_error(list(solutions), predictions)
    print("Accuracy: %f" % score)  # note: this is MSE, so lower is better
def recursive_feature_elimination(xs, ys, xnames, cutoff):
    estimator = ensemble.RandomForestRegressor()
    selector = feature_selection.RFE(
        estimator,
        n_features_to_select=math.floor(len(xs[0]) * cutoff),
        step=1)
    selector.fit(xs, ys)
    bool_arr = selector.support_
    new_xs = xs
    new_features = xnames
    # walk the columns backwards and drop every feature RFE rejected
    i = len(xs[0]) - 1
    while i > -1:
        if not bool_arr[i]:
            new_xs = np.delete(new_xs, i, axis=1)
            new_features = np.delete(new_features, i, axis=0)
        i -= 1
    if len(new_xs[0]) > 0:
        return new_xs, new_features
    else:
        return xs, xnames
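# Sketch (same result, fewer moving parts): the fitted selector's boolean
# support mask already encodes the column-deletion loop above; assuming xs and
# xnames are array-like, boolean indexing reproduces new_xs / new_features.
import numpy as np

def apply_support(xs, xnames, selector):
    mask = selector.support_  # True for the features RFE kept
    return np.asarray(xs)[:, mask], np.asarray(xnames)[mask]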
def get_model(self, resume=False):
    if not resume:
        if self.method == 'variance':
            # unsupervised
            p = .5
            selector = feature_selection.VarianceThreshold(
                threshold=(p * (1 - p)))
        elif self.method == 'rfe':
            estimator = LogisticRegression()
            selector = feature_selection.RFE(
                estimator,
                n_features_to_select=self.feat_limit,
                step=1,
                verbose=0)
        elif self.method == 'forward':
            estimator = ExtraTreesClassifier(n_estimators=100)
            selector = SelectFromModel(estimator)
        elif self.method == 'seq_bwd':
            estimator = LogisticRegression(solver='lbfgs')
            selector = SFS(estimator,
                           k_features=self.feat_limit,
                           forward=False,
                           floating=False,
                           scoring='roc_auc',
                           cv=4,
                           n_jobs=-1)
        elif self.method == 'seq_fwd':
            estimator = LogisticRegression(solver='lbfgs')
            selector = SFS(estimator,
                           k_features=self.feat_limit,
                           forward=True,
                           floating=False,
                           scoring='roc_auc',
                           cv=4,
                           n_jobs=-1)
    else:
        selector = joblib.load(self.model_save_path)
    if self.verbose > 2:
        print(selector)
    return selector
def __init__(self, method='skb', clf=None, n_vars=None):
    self.n_vars = n_vars
    self.method = method
    # print("Chosen method:", self.method)
    if method == 'sfm':
        if clf is None:
            self.clf = sk_en.RandomForestClassifier(n_estimators=100,
                                                    max_features='auto')
        else:
            self.clf = clf
    elif self.method == 'rfo':
        if clf is None:
            self.clf = sk_nb.GaussianNB()
        else:
            self.clf = clf
    elif self.method == 'rfs':
        if clf is None:
            self.clf = sk_nb.GaussianNB()
        else:
            self.clf = clf
    elif self.method == 'skb':
        if self.n_vars is None:
            self.n_vars = 10
        self.clf = sk_fs.SelectKBest(score_func=sk_fs.f_classif,
                                     k=self.n_vars)
    elif self.method == 'eli5_rfe':
        if self.n_vars is None:
            self.n_vars = 10
        if clf is None:
            base_clf = sk_lm.LogisticRegression()
        else:
            base_clf = clf
        eli5_estimator = eli5.sklearn.PermutationImportance(base_clf, cv=10)
        self.clf = sk_fs.RFE(eli5_estimator,
                             n_features_to_select=self.n_vars,
                             step=1)
    elif self.method == 'biofes':
        pass
def reverse_feature_elimination(df, k, model, target_name=None, y=None,
                                verbose=False):
    """
    Use the sklearn function for recursively eliminating the least important
    features.

    How does it pick the best features? By the absolute value of model.coef_
    (p-values are not considered).
    """
    if y is None:
        X, y = pml.X_y(df, target_name)
    else:
        X, y = df, y
    selector = fs.RFE(model, n_features_to_select=k, step=1, verbose=verbose)
    selector.fit(X, y)
    return pml.feature_names(X)[selector.support_]
_feature_selectors.append((feature_selection.SelectKBest(k=1),
                           pd_feature_selection.SelectKBest(k=1), True))
_feature_selectors.append(
    (feature_selection.SelectKBest(k=1),
     pickle.loads(pickle.dumps(pd_feature_selection.SelectKBest(k=1))),
     True))
_feature_selectors.append((feature_selection.SelectKBest(k=2),
                           pd_feature_selection.SelectKBest(k=2), True))
_feature_selectors.append((feature_selection.SelectPercentile(),
                           pd_feature_selection.SelectPercentile(), True))
_feature_selectors.append(
    (feature_selection.SelectFdr(), pd_feature_selection.SelectFdr(), True))
_feature_selectors.append(
    (feature_selection.SelectFwe(), pd_feature_selection.SelectFwe(), True))
# Tmp Ami
if False:
    _feature_selectors.append(
        (feature_selection.RFE(linear_model.LogisticRegression()),
         pd_feature_selection.RFE(pd_linear_model.LogisticRegression()),
         True))

_keras_estimators = []
if _level > 0:
    _keras_estimators.append(
        (KerasClassifier(_build_classifier_nn, verbose=0),
         PdKerasClassifier(_build_classifier_nn,
                           _load_iris()[0]['class'].unique(),
                           verbose=0),
         False))
    _keras_estimators.append(
        (KerasRegressor(_build_regressor_nn, verbose=0),
         PdKerasRegressor(_build_regressor_nn, verbose=0),
         False))


class _EstimatorTest(unittest.TestCase):
df = data_initialModel.select_dtypes(include=[np.number]).copy()
feature_cols = df.columns.values.tolist()
feature_cols.remove('SALE_PRICE')
XO = df[feature_cols]
YO = df['SALE_PRICE']
estimator = svm.SVR(kernel="linear")
selector = feature_selection.RFE(estimator, n_features_to_select=5, step=1)
selector = selector.fit(XO, YO)
# keep the predictors RFE ranked first
select_features = np.array(feature_cols)[selector.ranking_ == 1].tolist()
print(select_features)

# In[38]:

X = df[select_features]
Y = df['SALE_PRICE']
trainX, testX, trainY, testY = train_test_split(X, Y, test_size=0.2)
lm = linear_model.LinearRegression()
lm.fit(trainX, trainY)

# Inspect the calculated model equations
    titanic_train[imputable_cont_features])
titanic_train.loc[titanic_train['Embarked'].isnull(), 'Embarked'] = 'S'
encodable_columns = ['Sex', 'Embarked', 'Pclass']
feature_defs = [(col_name, preprocessing.LabelEncoder())
                for col_name in encodable_columns]
mapper = DataFrameMapper(feature_defs)
mapper.fit(titanic_train)
titanic_train[encodable_columns] = mapper.transform(titanic_train)
titanic_train1 = titanic_train.drop(
    ['PassengerId', 'Name', 'Cabin', 'Ticket', 'Survived'], axis=1)
features = ['Pclass', 'Sex', 'Embarked']
titanic_train2 = pd.get_dummies(titanic_train1, columns=features)
y_train = titanic_train['Survived']
X_train = titanic_train2
dt_estimator = tree.DecisionTreeClassifier(random_state=100)
rfe = feature_selection.RFE(dt_estimator, n_features_to_select=5, step=1)
rfe.fit(X_train, y_train)
X_new = rfe.transform(X_train)
print(rfe.support_)
# best_est is assumed to be a fitted estimator defined elsewhere
model = feature_selection.SelectFromModel(best_est, prefit=True)
X_new = model.transform(X_train)
X_new.shape
# build model on X_new
        penalty='l2', multi_class='multinomial', solver='lbfgs',
        max_iter=500))])

# nearest neighbour
c_1NN = sklnn.KNeighborsClassifier(n_neighbors=1,
                                   algorithm='brute',
                                   metric='correlation')

# cross-validation scheme
cv_schem = skms.StratifiedShuffleSplit(n_splits=1, test_size=0.2)
n_rep = 10  # number of repetitions

# RFE wrappers
RFE_pow = skfs.RFE(c_MLR, n_features_to_select=3)
RFE_FC = skfs.RFE(c_MLR, n_features_to_select=90)

# record classification performance
perf = np.zeros([n_bands, n_measures, n_rep, 2])       # (last index: MLR/1NN)
perf_shuf = np.zeros([n_bands, n_measures, n_rep, 2])  # (last index: MLR/1NN)
conf_matrix = np.zeros([n_bands, n_measures, n_rep, 2, n_motiv, n_motiv])
# (fourth index: MLR/1NN)
# np.int is deprecated; plain int is equivalent here
rk_pow = np.zeros([n_bands, n_rep, N],
                  dtype=int)  # RFE rankings for power (N features)
rk_FC = np.zeros([n_bands, 2, n_rep, int(N * (N - 1) / 2)],
                 dtype=int)  # RFE rankings for FC-type measures (N(N-1)/2 features)
pearson_corr_rk = np.zeros([n_bands, n_measures,
                            int(n_rep * (n_rep - 1) / 2)
def classify(X, y, verbose=False, nfolds=2, dim_red=None,
             n_components=[5, 10, 20], scale=True, fs=None, njobs=1,
             LR_C=[.01, .1, 1, 10, 100], LR_class_weight=[None, 'balanced'],
             SVC_C=[.01, .1, 1, 10, 100], SVC_class_weight=[None, 'balanced'],
             SVC_kernels=['rbf', 'linear', 'poly'],
             n_estimators=[10, 20, 30], max_features=['auto', 'log2', None],
             **kwargs):
    # spit out to the screen the function parameters, for logging
    if verbose:
        import inspect
        frame = inspect.currentframe()
        args, _, _, values = inspect.getargvalues(frame)
        print('function name "%s"' % inspect.getframeinfo(frame)[2])
        for i in args[2:]:
            print("    %s = %s" % (i, values[i]))

    # prepare configuration for cross validation test harness
    seed = 8

    # prepare models; all of these support multiclass:
    # http://scikit-learn.org/stable/modules/multiclass.html
    models = []
    models.append(('LR',
                   LogisticRegression(multi_class='multinomial',
                                      solver='newton-cg'),
                   {"C": LR_C, "class_weight": LR_class_weight}))
    models.append(('LDA', LinearDiscriminantAnalysis(), {}))
    models.append(('RndFor', RandomForestClassifier(),
                   {'n_estimators': n_estimators,
                    'max_features': max_features}))
    models.append(('NB', GaussianNB(), {}))
    models.append(('SVC', SVC(),
                   {"C": SVC_C, "class_weight": SVC_class_weight,
                    'kernel': SVC_kernels}))
    models.append(('Most frequent',
                   DummyClassifier(strategy='most_frequent'), {}))
    models.append(('Stratified', DummyClassifier(strategy='stratified'), {}))

    # spit out to the screen the parameters to be tried in each classifier
    if verbose:
        print('Trying these parameters:')
        for m in models:
            print(m[0], ':', m[2])

    # evaluate each model in turn
    results = []
    names = []
    for name, model, params in models:
        # need to create the CV objects inside the loop because they get
        # used and do not get reset!
        inner_cv = StratifiedShuffleSplit(n_splits=nfolds, test_size=.1,
                                          random_state=seed)
        outer_cv = StratifiedShuffleSplit(n_splits=nfolds, test_size=.1,
                                          random_state=seed)
        # # do this if no shuffling is wanted
        # inner_cv = StratifiedKFold(n_splits=num_folds, random_state=seed)
        # outer_cv = StratifiedKFold(n_splits=num_folds, random_state=seed)

        steps = [('clf', model)]
        pipe_params = {}
        for key, val in params.items():
            pipe_params['clf__%s' % key] = val

        # keep the selector in its own variable so the fs argument is not
        # clobbered on the first iteration of the model loop
        if fs == 'l1':
            lsvc = LinearSVC(C=0.1, penalty="l1", dual=False)
            fs_est = feature_selection.SelectFromModel(lsvc)
            steps = [('feat_sel', fs_est)] + steps
        elif fs == 'rfe':
            fs_est = feature_selection.RFE(estimator=model)
            pipe_params['feat_sel__n_features_to_select'] = n_components
            steps = [('feat_sel', fs_est)] + steps

        if dim_red is not None:
            if dim_red == 'pca':
                dr = decomposition.PCA()
                pipe_params['dim_red__n_components'] = n_components
            elif dim_red == 'ica':
                dr = decomposition.FastICA()
                pipe_params['dim_red__n_components'] = n_components
            steps = [('dim_red', dr)] + steps

        if scale:
            steps = [('scale', preprocessing.RobustScaler())] + steps

        pipe = Pipeline(steps)

        cv_results = []
        cnt = 0
        for train_idx, test_idx in outer_cv.split(X, y):
            X_train, X_test = X[train_idx], X[test_idx]
            y_train, y_test = y[train_idx], y[test_idx]
            opt_model = GridSearchCV(estimator=pipe,
                                     param_grid=pipe_params,
                                     verbose=0,
                                     n_jobs=njobs,
                                     cv=inner_cv)
            opt_model.fit(X_train, y_train)
            if verbose and len(params) > 0:
                print('Best parameters for', name,
                      '(%d/%d):' % (cnt + 1, outer_cv.n_splits))
                print(opt_model.best_params_)
            predictions = opt_model.predict(X_test)
            cv_results.append(metrics.accuracy_score(y_test, predictions))
            cnt += 1
        results.append(cv_results)
        names.append(name)

    if verbose:
        print('\n======')
        for model, res in zip(models, results):
            msg = "%s: %f (%f)" % (model[0], np.mean(res), np.std(res))
            print(msg)
        print('Chance: %f' % (1 / float(len(np.unique(y)))))
        print('======\n')
    return results, models
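# Hedged example call (argument values are illustrative, not from the source):
# run the harness with RFE-based feature selection and robust scaling.
# results, models = classify(X, y, verbose=True, nfolds=5, fs='rfe',
#                            n_components=[5, 10, 20], scale=True)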
print(X[:, 1])
Y = (10 * np.sin(np.pi * X[:, 0] * X[:, 1]) + 20 * (X[:, 2] - .5)**2 +
     10 * X[:, 3] + 5 * X[:, 4]**5 + np.random.normal(0, 1))
X[:, 10:] = X[:, :4] + np.random.normal(0, .025, (size, 4))
lin = linear_model.LinearRegression()
lin.fit(X, Y)
ridge = Ridge()  # alpha=0.1
ridge.fit(X, Y)
lasso = linear_model.Lasso()  # alpha=0.1
lasso.fit(X, Y)
# note: RandomizedLasso was removed from scikit-learn in 0.21; this line
# requires an older release
randLasso = linear_model.RandomizedLasso()
randLasso.fit(X, Y)
rfe = feature_selection.RFE(estimator=linear_model.LinearRegression())
rfe.fit(X=X, y=Y)
rfr = RandomForestRegressor()
rfr.fit(X, Y)
freg = feature_selection.f_regression(X, Y)
# normalise each coefficient vector by its maximum (14 features)
ans_lin = abs(lin.coef_)
mx = [max(ans_lin)] * 14
ans_lin = ans_lin / mx
ans_ridge = abs(ridge.coef_)
mx = [max(ans_ridge)] * 14
ans_ridge = ans_ridge / mx
ans_lasso = abs(lasso.coef_)  # may be all zeros: beware dividing by 0
ans_randLasso = abs(randLasso.scores_)
import tabulate
import pickle

df_wine = pd.read_csv('wine.data', header=None)
rIndex = sk_utils.shuffle(list(range(len(df_wine))))
X, y = df_wine.iloc[:, 1:].values[rIndex], df_wine.iloc[:, 0].values[rIndex]

if False:
    data = []
    # penalty='l1' needs a compatible solver in current scikit-learn
    lr = sk_lm.LogisticRegression(penalty='l1', C=10000, solver='liblinear')
    for nToSelect in range(1, 14):
        rfe = sk_fs.RFE(lr, n_features_to_select=nToSelect)
        rfe.fit(X, y)
        # RFE keeps a ranking of the selected features in rfe.ranking_
        data.append(rfe.ranking_)
    xx = pd.DataFrame(data)
    xx.index = range(1, 14)
    with open("../tex/RFE_Features.tbl", 'w') as f:
        print(tabulate.tabulate(xx, tablefmt='latex', floatfmt=".3f",
                                headers="keys"), file=f)
    print(" %d Selected Features : %s " % (nToSelect, rfe.ranking_))
    # print("Features sorted by their rank:")
    # print(sorted(zip(map(lambda x: round(x, 4), rfe.ranking_),)))


def calcFoldScores(nFold=10):
    # kFold = sk_ms.KFold(nFold).split(X, y)
    ...
# Splitting the dataset into a training subset (70%) and a testing subset (30%)
X_train, X_test, Y_train, Y_test = skl_ms.train_test_split(X, Y,
                                                           test_size=0.3,
                                                           random_state=0)

# initiating the score list
score_list = []
selected_feature_masks = []

# iterating over different numbers of features to be selected
for n in range(1, len(X.columns)):
    # constructing the regression model
    model = skl_lm.LinearRegression()
    # initiating the RFE selector
    rfe_selector = skl_fs.RFE(model, n_features_to_select=n)
    # fitting the RFE selector on the training data (which recursively
    # refits "model") and dropping the non-selected features from X_train
    X_train_rfe = rfe_selector.fit_transform(X_train, Y_train)
    # removing the non-selected features from X_test
    X_test_rfe = rfe_selector.transform(X_test)
    # fitting the regression model only with the selected features
    model.fit(X_train_rfe, Y_train)
    # scoring the model with the test data
    score = model.score(X_test_rfe, Y_test)
    # storing the score value
    score_list.append(score)
    # storing the feature mask
    selected_feature_masks.append(rfe_selector.support_)

# retrieving the names of the features
features = np.array(X.columns)
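# Follow-up sketch using the lists built above: pick the feature count with
# the best held-out score and recover the matching column names.
best_idx = int(np.argmax(score_list))
print("best n_features_to_select:", best_idx + 1)
print("selected features:", features[selected_feature_masks[best_idx]].tolist())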
def gen_test_estimators():
    """
    Generate a couple of estimators for tests.
    """
    test_folder = Path(__file__).parent
    test_folder = test_folder.joinpath('tools', 'test-data')
    with open(test_folder.joinpath('LinearRegression01.zip'), 'wb') as f:
        estimator = linear_model.LinearRegression()
        pickle.dump(estimator, f, pickle.HIGHEST_PROTOCOL)
    with open(test_folder.joinpath('RandomForestRegressor01.zip'), 'wb') as f:
        estimator = ensemble.RandomForestRegressor(n_estimators=10,
                                                   random_state=10)
        pickle.dump(estimator, f, pickle.HIGHEST_PROTOCOL)
    with open(test_folder.joinpath('XGBRegressor01.zip'), 'wb') as f:
        estimator = xgboost.XGBRegressor(learning_rate=0.1,
                                         n_estimators=100,
                                         random_state=0)
        pickle.dump(estimator, f, pickle.HIGHEST_PROTOCOL)
    with open(test_folder.joinpath('pipeline10'), 'wb') as f:
        estimator = ensemble.AdaBoostRegressor(learning_rate=1.0,
                                               n_estimators=50)
        pipe = pipeline.make_pipeline(estimator)
        pickle.dump(pipe, f, pickle.HIGHEST_PROTOCOL)

    dump_model_to_h5 = try_get_attr('galaxy_ml.model_persist',
                                    'dump_model_to_h5')
    estimator = linear_model.LinearRegression()
    dump_model_to_h5(estimator,
                     test_folder.joinpath('LinearRegression01.h5mlm'))
    estimator = ensemble.RandomForestRegressor(n_estimators=10,
                                               random_state=10)
    dump_model_to_h5(estimator,
                     test_folder.joinpath('RandomForestRegressor01.h5mlm'))
    estimator = xgboost.XGBRegressor(learning_rate=0.1,
                                     n_estimators=100,
                                     random_state=0)
    dump_model_to_h5(estimator, test_folder.joinpath('XGBRegressor01.h5mlm'))
    estimator = ensemble.AdaBoostRegressor(learning_rate=1.0,
                                           n_estimators=50)
    pipe = pipeline.make_pipeline(estimator)
    dump_model_to_h5(pipe, test_folder.joinpath('pipeline10'))

    import pandas as pd
    X_path = test_folder.joinpath('regression_X.tabular')
    X = pd.read_csv(X_path, sep='\t').values
    y_path = test_folder.joinpath('regression_y.tabular')
    y = pd.read_csv(y_path, sep='\t').values.ravel()
    estimator = ensemble.RandomForestRegressor(n_estimators=10,
                                               random_state=10)
    searcher = model_selection.GridSearchCV(estimator, {})
    searcher.fit(X, y)
    dump_model_to_h5(searcher, test_folder.joinpath('GridSearchCV01.h5mlm'))
    rfe = feature_selection.RFE(estimator)
    rfe.fit(X, y)
    dump_model_to_h5(rfe, test_folder.joinpath('RFE.h5mlm'))