def fine_tune_gradient_boosting_hyper_params(data_train_x, data_test_x, data_train_y, data_test_y):
    from time import time
    from scipy.stats import randint as sp_randint
    from sklearn.ensemble import GradientBoostingRegressor
    from sklearn.grid_search import RandomizedSearchCV
    print "-- {} --".format("Fine-tuning Gradient Boosting Regression")
    rf = GradientBoostingRegressor(n_estimators=1000)
    param_dist = {
        "learning_rate": [0.01, 0.03, 0.05, 0.07, 0.09, 0.1, 0.15, 0.2],
        "max_depth": sp_randint(1, 15),
        "min_samples_split": sp_randint(1, 15),
        "min_samples_leaf": sp_randint(1, 15),
        "subsample": [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
        "max_features": sp_randint(1, 15)
    }
    n_iter_search = 300
    random_search = RandomizedSearchCV(rf, param_distributions=param_dist,
                                       n_iter=n_iter_search, n_jobs=-1, cv=5,
                                       verbose=1)
    start = time()
    random_search.fit(data_train_x, data_train_y)
    print("RandomizedSearchCV took %.2f seconds for %d candidate"
          " parameter settings." % ((time() - start), n_iter_search))
    report(random_search.grid_scores_)
def Decision_tree(Xtrain, Ytrain, Xtest):
    tuned_parameters = {
        'splitter': ['best', 'random'],
        'max_features': ['log2', 'sqrt'],
        'min_samples_split': np.arange(30, 60, 5),
        'min_samples_leaf': np.arange(7, 14),
        'max_depth': np.arange(700, 1389, 10)
    }
    """Randomized search, which uses cross-validation to find the best
    parameters for the estimator. In contrast to GridSearchCV, not all
    parameter values are tried out; instead, a fixed number of parameter
    settings is sampled from the specified distributions. The number of
    parameter settings that are tried is given by n_iter.
    """
    Multreg = RandomizedSearchCV(DecisionTreeRegressor(random_state=0),
                                 param_distributions=tuned_parameters,
                                 cv=10, n_iter=int(args[1]), n_jobs=-1,
                                 random_state=0)
    # Fit the decision tree model
    Multreg.fit(Xtrain, Ytrain)
    # Predict on the unseen test set
    YMultreg = Multreg.predict(Xtest)
    # Save the fitted search object to disk
    filename = 'finalized_DC.sav'
    pickle.dump(Multreg, open(filename, 'wb'))
    return YMultreg
def grid_search(symbol='MSFT'):
    """Find optimal SVC parameters."""
    from scipy.stats import randint as sp_randint

    X, y = build_data()
    X_train, X_test, y_train, y_test = cross_validation.train_test_split(
        X, y, test_size=0.4, random_state=0)

    param_grid = [
        {'C': [.1, 1, 10, 100, 1000],
         'gamma': [1e-2, 1e-3, 1e-4],
         'kernel': ['linear', 'rbf']},
        # {'C': [1, 10, 100, 1000], 'gamma': [.001, .0001], 'kernel': ['linear', 'rbf']}
    ]
    param_dist = {
        'C': [.001, .01, .1, 1, 10, 100],
        'gamma': [1e2, 1e-1, 1e-2, 1e-3, 1e-4],
        'kernel': ['linear', 'rbf']
    }

    n_iter_search = 20
    clf = svm.SVC()
    random_search = RandomizedSearchCV(clf, param_distributions=param_dist,
                                       n_iter=n_iter_search)
    # clf = GridSearchCV(estimator=svm.SVC(C=1), param_grid=param_grid, cv=5)
    # clf.fit(X_train, y_train)
    # return clf
    random_search.fit(X_train, y_train)
    return random_search
def test_randomized_search_grid_scores():
    # Make a dataset with a lot of noise to get various kinds of prediction
    # errors across CV folds and parameter settings
    X, y = make_classification(n_samples=200, n_features=100, n_informative=3,
                               random_state=0)

    # XXX: as of today (scipy 0.12) it's not possible to set the random seed
    # of scipy.stats distributions: the assertions in this test should thus
    # not depend on the randomization
    params = dict(C=distributions.expon(scale=10),
                  gamma=distributions.expon(scale=0.1))
    n_cv_iter = 3
    n_search_iter = 30
    search = RandomizedSearchCV(SVC(), n_iter=n_search_iter, cv=n_cv_iter,
                                param_distributions=params, iid=False)
    search.fit(X, y)
    assert_equal(len(search.grid_scores_), n_search_iter)

    # Check consistency of the structure of each cv_score item
    for cv_score in search.grid_scores_:
        assert_equal(len(cv_score.cv_validation_scores), n_cv_iter)
        # Because we set iid to False, the mean_validation score is the
        # mean of the fold mean scores instead of the aggregate sample-wise
        # mean score
        assert_almost_equal(np.mean(cv_score.cv_validation_scores),
                            cv_score.mean_validation_score)
        assert_equal(list(sorted(cv_score.parameters.keys())),
                     list(sorted(params.keys())))

    # Check the consistency with the best_score_ and best_params_ attributes
    sorted_grid_scores = list(sorted(search.grid_scores_,
                                     key=lambda x: x.mean_validation_score))
    best_score = sorted_grid_scores[-1].mean_validation_score
    assert_equal(search.best_score_, best_score)

    tied_best_params = [s.parameters for s in sorted_grid_scores
                        if s.mean_validation_score == best_score]
    assert_true(search.best_params_ in tied_best_params,
                "best_params_={0} is not part of the"
                " tied best models: {1}".format(search.best_params_,
                                                tied_best_params))
def evalModel(train_data, eval_data, train_labels, eval_labels, seed):
    joined_data = np.concatenate((train_data, eval_data), axis=0)
    joined_labels = np.concatenate((train_labels, eval_labels), axis=0)
    # Rows marked -1 are never used as validation data; the eval rows (0)
    # form the single held-out fold.
    train_mask = np.zeros(train_data.shape[0]) - 1.0
    eval_mask = np.zeros(eval_data.shape[0])
    joined_mask = np.concatenate((train_mask, eval_mask), axis=0)
    ps = PredefinedSplit(test_fold=joined_mask)
    loss = make_scorer(get_rmsle, greater_is_better=False)
    train_data = sparse.csr_matrix(train_data)
    eval_data = sparse.csr_matrix(eval_data)

    clf = RandomForestRegressor(random_state=seed, verbose=1)
    # clf.fit(train_data, train_labels)
    # preds = clf.predict(eval_data)
    # print(get_rmsle(eval_labels, preds))  ## achieves 0.263

    # specify parameters and distributions to sample from
    param_dist = {"n_estimators": sp_randint(300, 800),
                  "max_depth": sp_randint(10, 50),
                  "max_features": ['auto', 'sqrt', 'log2'],
                  "min_samples_split": sp_randint(1, 11),
                  "min_samples_leaf": sp_randint(1, 11)}

    # run randomized search
    n_iter_search = 60
    random_search = RandomizedSearchCV(clf, param_distributions=param_dist,
                                       cv=ps, scoring=loss,
                                       n_iter=n_iter_search, n_jobs=-1,
                                       pre_dispatch='n_jobs', verbose=2)
    start = time()
    random_search.fit(joined_data, joined_labels)
    print("RandomizedSearchCV took %.2f seconds for %d candidate"
          " parameter settings." % ((time() - start), n_iter_search))
    report(random_search.grid_scores_)
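# A minimal sketch (assuming the modern sklearn.model_selection API) of the
# test_fold trick used in evalModel above: rows marked -1 are never placed in
# a validation fold, so the eval rows form the single held-out split.
import numpy as np
from sklearn.model_selection import PredefinedSplit

test_fold = np.array([-1, -1, -1, 0, 0])  # 3 train-only rows, 2 eval rows
ps = PredefinedSplit(test_fold=test_fold)
for train_idx, test_idx in ps.split():
    print(train_idx, test_idx)  # -> [0 1 2] [3 4]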
def train_cv():
    # ---------------------- load the data
    train_df = pd.read_csv("train_processed.csv", index_col="PassengerId")
    Xtrain = train_df[feature_names]
    ytrain = train_df["Survived"]

    # ---------------------- train
    loss = ['deviance', 'exponential']
    learning_rate = np.logspace(-5, 1)
    n_estimate_dist = sp_randint(1000, 4800)
    max_depth_dist = sp_randint(1, 10)
    param_dist = dict(loss=loss,
                      learning_rate=learning_rate,
                      n_estimators=n_estimate_dist,
                      max_depth=max_depth_dist)

    gbdt = GradientBoostingClassifier(verbose=1)
    searchcv = RandomizedSearchCV(estimator=gbdt,
                                  param_distributions=param_dist,
                                  n_iter=210, verbose=1, n_jobs=-1)

    print "--------------------- RandomizedSearchCV begins"
    searchcv.fit(Xtrain, ytrain)
    print "--------------------- RandomizedSearchCV ends"
    print "best score: ", searchcv.best_score_
    print "best parameters: ", searchcv.best_params_

    common.dump_predictor('gbdt-cv.pkl', searchcv.best_estimator_)
    print "--------------------- GBDT saved into file"
def scale_pca_rf_pipe_new_import():
    from h2o.estimators.pca import H2OPrincipalComponentAnalysisEstimator

    iris = h2o.import_file(path=pyunit_utils.locate("smalldata/iris/iris_wheader.csv"))

    # build transformation pipeline using sklearn's Pipeline and H2O transforms
    pipe = Pipeline([
        ("standardize", H2OScaler()),
        ("pca", H2OPrincipalComponentAnalysisEstimator().init_for_pipeline()),
        ("rf", H2ORandomForestEstimator())
    ])

    # parameters to test
    params = {"standardize__center": [True, False],
              "standardize__scale": [True, False],
              "pca__k": randint(2, iris[1:].shape[1]),
              "pca__transform": ["none", "standardize"],
              "rf__ntrees": randint(50, 60),
              "rf__max_depth": randint(4, 8),
              "rf__min_rows": randint(5, 10)}

    custom_cv = H2OKFold(iris, n_folds=5, seed=42)
    random_search = RandomizedSearchCV(pipe, params,
                                       n_iter=5,
                                       scoring=make_scorer(h2o_r2_score),
                                       cv=custom_cv,
                                       random_state=42,
                                       n_jobs=1)
    random_search.fit(iris[1:], iris[0])
    print(random_search.best_estimator_)
def _compute_thresh(this_data, ch_type, cv=10):
    """Compute the rejection threshold for one channel.

    Parameters
    ----------
    this_data : array (n_epochs, n_times)
        Data for one channel.
    ch_type : str
        'mag', 'grad' or 'eeg'.
    cv : iterator
        Iterator for cross-validation.
    """
    est = ChannelAutoReject()

    Limits = namedtuple('Limits', 'low high')
    limits = dict(eeg=Limits(low=20e-7, high=400e-6),
                  grad=Limits(low=400e-13, high=20000e-13),
                  mag=Limits(low=400e-15, high=20000e-15))
    param_dist = dict(thresh=uniform(limits[ch_type].low,
                                     limits[ch_type].high))

    rs = RandomizedSearchCV(est,  # XXX : is random really better than grid?
                            param_distributions=param_dist,
                            n_iter=20, cv=cv)
    rs.fit(this_data)
    best_thresh = rs.best_estimator_.thresh

    return best_thresh
def optimized_classifier(X, y, classifier, distributions, scorer='f1_weighted', n_iter=30, cv=3):
    """
    Return the best classifier and its score for X, y from a randomized
    search over parameters.

    X -- Features for each sample
    y -- Class label for each sample
    classifier -- An estimator class or pipeline from sklearn
    distributions -- The parameter distributions to search for that estimator
    scorer -- Scoring function (e.g. accuracy or f1)
    n_iter -- The number of random iterations to try
    """
    # Make a pipeline out of the classifier, to allow for feature scaling in
    # the first step. Prefix the parameter names so they address the right
    # pipeline step.
    class_name = classifier.__class__.__name__.lower()
    distributions = dict((class_name + "__" + key, val)
                         for key, val in distributions.iteritems())

    # It is important to handle scaling here so we don't accidentally overfit
    # to the test data by scaling using that information as well.
    classifier = make_pipeline(preprocessing.RobustScaler(), classifier)

    randomized_search = RandomizedSearchCV(classifier,
                                           param_distributions=distributions,
                                           n_iter=n_iter, scoring=scorer,
                                           cv=cv, n_jobs=1)
    randomized_search.fit(X, y)

    print randomized_search.best_estimator_
    print "Validation Score ({}): {:.2f}".format(scorer, randomized_search.best_score_)
    print ""

    return randomized_search.best_estimator_, randomized_search.best_score_
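# A hypothetical usage sketch for optimized_classifier; the estimator, the C
# distribution, and the X/y data are illustrative assumptions, not part of
# the original code.
from scipy.stats import expon
from sklearn.linear_model import LogisticRegression

best_clf, best_score = optimized_classifier(
    X, y, LogisticRegression(),
    distributions={"C": expon(scale=10)})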
def svc_appr():
    """
    Best params: {'C': 0.022139881953014046}

    Submission:
    E_val:
    E_in:
    E_out:
    """
    from sklearn.svm import LinearSVC
    from sklearn.preprocessing import StandardScaler
    from sklearn.pipeline import Pipeline
    from sklearn.cross_validation import StratifiedKFold
    from sklearn.grid_search import RandomizedSearchCV
    from scipy.stats import expon

    X, y = dataset.load_train()

    raw_scaler = StandardScaler()
    raw_scaler.fit(X)
    X_scaled = raw_scaler.transform(X)

    svc = LinearSVC(dual=False, class_weight='auto')
    rs = RandomizedSearchCV(svc, n_iter=50, scoring='roc_auc', n_jobs=-1,
                            cv=StratifiedKFold(y, 5), verbose=2,
                            param_distributions={'C': expon()})
    rs.fit(X_scaled, y)

    logger.debug('Got best SVC.')
    logger.debug('Best params: %s', rs.best_params_)
    logger.debug('Grid scores:')
    for i, grid_score in enumerate(rs.grid_scores_):
        print('\t%s' % grid_score)
    logger.debug('Best score (E_val): %s', rs.best_score_)
    logger.debug('E_in: %f', Util.auc_score(rs, X_scaled, y))
def rf_cv(fv_train, target_train, fv_test, target_test):
    # ---- cross-validation on the train dataset: randomized search for the
    # best random forest parameters
    # Set the parameters by cross-validation
    tuned_parameters = {'n_estimators': [1000, 2000],
                        "max_depth": [3, 6, 9, None],
                        "max_features": ["auto", "log2", None],
                        "class_weight": [None, 'balanced']}

    scores = ['recall_macro']
    n_iter_search = 20

    for score in scores:
        print("# Tuning hyper-parameters for %s" % score)
        print()

        mycv = StratifiedKFold(target_train, n_folds=5)
        clf = RandomizedSearchCV(RandomForestClassifier(n_jobs=-1),
                                 tuned_parameters, cv=mycv,
                                 n_iter=n_iter_search,
                                 scoring='%s' % score)
        clf.fit(fv_train, target_train)
        report_cv(clf, fv_test, target_test)
def best_ExtraTree(X, y):
    from sklearn.grid_search import RandomizedSearchCV
    from scipy.stats import randint as sp_randint
    from sklearn.metrics import accuracy_score

    X_train, X_test, y_train, y_test = train_test_split(X, y.ravel(),
                                                        random_state=42)
    clf = ExtraTreesClassifier(max_depth=None, bootstrap=False)
    grid = {'n_estimators': sp_randint(250, 400),
            'min_samples_leaf': sp_randint(1, 12),
            'max_features': sp_randint(5, 50)}
    clf_rfc = RandomizedSearchCV(clf, n_jobs=4, n_iter=10,
                                 param_distributions=grid,
                                 scoring='accuracy')
    y_hat = clf_rfc.fit(X_train, y_train.ravel()).predict(X_test)
    print('Best Params: \n', clf_rfc.best_params_)
    print("Accuracy with Extra Trees = %4.4f" % accuracy_score(y_test.ravel(), y_hat))
    binarize_y_confustion_matrix(y_test.ravel(), y_hat)
    return clf_rfc.best_params_
def Cv3(X_train, y_train):
    crf = sklearn_crfsuite.CRF(
        algorithm='lbfgs',
        max_iterations=100,
        all_possible_transitions=True
    )
    params_space = {
        'c1': scipy.stats.expon(scale=0.5),
        'c2': scipy.stats.expon(scale=0.05),
    }

    f1_scorer = make_scorer(metrics.flat_f1_score, average='weighted')

    # search
    rs = RandomizedSearchCV(crf, params_space,
                            cv=3,
                            verbose=1,
                            n_jobs=4,
                            n_iter=50,
                            scoring=f1_scorer)
    rs.fit(X_train, y_train)

    # labels = list(crf.classes_)
    # labels.remove('O')
    print('best params:', rs.best_params_)
    return rs.best_estimator_
def buildRandomForest(self, X_train, X_test, y_train, cv=3, n_iter=5, save=False):
    rf = RandomForestClassifier(random_state=9)

    # Tune the model
    param_distributions = {
        'n_estimators': range(1, 50, 1),
        'max_depth': range(1, 70, 1),
        'max_features': range(6, 15, 1),
        'min_samples_split': [2, 3, 4],
        'min_samples_leaf': [1, 2, 3, 4],
        'n_jobs': [-1]
    }
    rf_optimized = RandomizedSearchCV(estimator=rf,
                                      param_distributions=param_distributions,
                                      n_iter=n_iter,
                                      scoring='f1',
                                      cv=cv,
                                      random_state=1)
    rf_optimized.fit(X_train, y_train)

    if save == True:
        joblib.dump(value=rf_optimized, filename="rf_optimized.pkl", compress=1)

    print "Best parameters: %s" % rf_optimized.best_params_
    print "Best average cross-validated F1 score: %0.4f" % rf_optimized.best_score_
    print "--------------------------------------------"

    # predictions
    predicted_y_train = rf_optimized.predict(X_train)
    predicted_y_test = rf_optimized.predict(X_test)

    return predicted_y_train, predicted_y_test
def gridsearch():
    labels = ['T', 'D']

    # define fixed parameters and parameters to search
    crf = sklearn_crfsuite.CRF(algorithm='lbfgs',
                               max_iterations=100,
                               all_possible_transitions=True)
    params_space = {
        # 'algorithm': ['lbfgs', 'l2sgd', 'ap', 'pa', 'arow'],
        'c1': scipy.stats.expon(scale=0.5),
        'c2': scipy.stats.expon(scale=0.05),
    }

    # use the same metric for evaluation
    f1_scorer = make_scorer(metrics.flat_f1_score, average='macro',
                            labels=labels)

    # search
    rs = RandomizedSearchCV(crf, params_space,
                            cv=3,
                            verbose=1,
                            n_jobs=-1,
                            n_iter=50,
                            scoring=f1_scorer)
    rs.fit(train_data, train_labels)

    return rs.best_params_, rs.best_estimator_, rs.best_score_
def tuneSGD(data, labels, clf=None):
    from sklearn.cross_validation import StratifiedShuffleSplit
    from sklearn.linear_model import SGDClassifier

    sss = StratifiedShuffleSplit(labels, n_iter=10, test_size=.2,
                                 random_state=42)
    clf = Pipeline([  # ('num_features', SelectPercentile(f_classif, percentile=5)),
        ('sgd', SGDClassifier(random_state=11, penalty='elasticnet',
                              n_jobs=1, alpha=10**-4))])
    param_grid = {
        # 'num_features__percentile': list(range(1, 101)),
        'sgd__loss': ['modified_huber', 'squared_hinge'],  # , 'hinge', 'log'],
        'sgd__class_weight': ['balanced', None],
        'sgd__l1_ratio': list(np.arange(0, 1.0, .01)),
        'sgd__alpha': list(10.**np.arange(-6, -3, .1))
    }
    grid_search = RandomizedSearchCV(clf, param_grid,
                                     n_iter=250,
                                     random_state=42,
                                     cv=sss,
                                     scoring='roc_auc',
                                     n_jobs=-2,
                                     pre_dispatch='2*n_jobs')
    grid_search.fit(data, labels)
    print("Best score: %0.3f" % grid_search.best_score_)
    print("Best parameters set:")
    best_parameters = grid_search.best_estimator_.get_params()
    for p in param_grid.keys():
        print(p, best_parameters[p])
    return grid_search
    # plot_cs(grid_search)  # unreachable after return
def searchBestModelParameters(algorithm, trainingData):
    if algorithm == 'multinomialnb':
        # model the data using multinomial naive Bayes
        # define the parameter values that should be searched
        alpha = [0, 0.2, 0.4, 0.6, 0.8, 1]
        fitPrior = [True, False]
        # specify "parameter distributions" rather than a "parameter grid"
        paramDistribution = dict(alpha=alpha, fit_prior=fitPrior)
        model = MultinomialNB()

    bestRun = []
    for _ in range(1):
        rand = RandomizedSearchCV(model, paramDistribution, cv=10,
                                  scoring='precision', n_iter=5)
        rand.fit(trainingData, trainingData['isSpam'])
        # examine the best model
        bestRun.append({'score': round(rand.best_score_, 3),
                        'params': rand.best_params_})
    print(max(bestRun, key=lambda x: x['score']))
    return max(bestRun, key=lambda x: x['score'])
def randomized_search_ksvm():
    clf = SVC(random_state=1)

    # specify parameters and distributions to sample from
    param_dist = {
        'clf__C': [0.01, 0.1, 1, 10, 100, 1000],
        'clf__gamma': [0.01, 0.1, 1, 10, 100, 1000],
        'clf__kernel': ['rbf', 'linear'],
    }

    steps = [('scl', StandardScaler()),
             ('clf', SVC())]
    pipeline = Pipeline(steps)

    # run randomized search
    n_iter_search = 50
    random_search = RandomizedSearchCV(pipeline,
                                       param_distributions=param_dist,
                                       n_iter=n_iter_search)
    start = time()
    random_search.fit(X_train, y_train)
    print("RandomizedSearchCV took %.2f seconds for %d candidate"
          " parameter settings.\n" % ((time() - start), n_iter_search))
    print('best score: %f\n' % random_search.best_score_)
    print('best estimator: %s\n' % random_search.best_estimator_)
    print('best params: %s\n' % random_search.best_params_)

    clf = random_search.best_estimator_
    clf.fit(X_train, y_train)
    print('Test accuracy: %.3f' % clf.score(X_test, y_test))
def pick_best_features(df):
    """
    Randomized search for the best regularization of each output model.
    TODO refactor
    :param df: train data
    :return: dict of best estimators, one per output
    """
    # X = sample_data_random(df, .25)
    X = df[0:int(df.shape[0] * .25)]
    overfit_models = dict()
    for out in outputs:
        print out
        pipe_clf = CustomPipeline.get_transforms()
        clf = SGDClassifier(loss='log')
        tuned_parameters = {'alpha': sp_rand()}
        score = 'log_loss'
        tran_x = pipe_clf.fit_transform(X)
        grid = RandomizedSearchCV(clf, tuned_parameters, cv=5, scoring=score)
        grid.fit(tran_x, X[out])
        print grid.best_estimator_
        overfit_models[out] = grid.best_estimator_
    return overfit_models
def run_randomsearch(X, y, clf, param_dist, cv=5, n_iter_search=20):
    """Run a random search for the best Decision Tree parameters.

    Args
    ----
    X -- features
    y -- targets (classes)
    clf -- scikit-learn Decision Tree
    param_dist -- [dict] distributions of parameters to sample
    cv -- folds of cross-validation, default 5
    n_iter_search -- number of random parameter sets to try, default 20

    Returns
    -------
    top_params -- [dict] from report()
    """
    # the parameter was originally named para_dist while the body used
    # param_dist; the names must match
    random_search = RandomizedSearchCV(clf,
                                       param_distributions=param_dist,
                                       n_iter=n_iter_search)

    start = time()
    random_search.fit(X, y)
    print(("\nRandomizedSearchCV took {:.2f} seconds "
           "for {:d} candidate parameter "
           "settings.").format((time() - start), n_iter_search))

    top_params = report(random_search.grid_scores_, 3)
    return top_params
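# A hypothetical call to run_randomsearch; the DecisionTreeClassifier and the
# sampled distributions are illustrative, and X, y, time() and report() are
# assumed to be in scope as in the snippet above.
from scipy.stats import randint as sp_randint
from sklearn.tree import DecisionTreeClassifier

top_params = run_randomsearch(X, y, DecisionTreeClassifier(),
                              {"max_depth": sp_randint(1, 20),
                               "min_samples_leaf": sp_randint(1, 20)},
                              cv=5, n_iter_search=30)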
def RandomFo(self):
    parameters_forest = {'n_estimators': randint(10, self.n_estimators_max),
                         "bootstrap": [True, False]}
    X_train, y_train = self.X_train, self.y_train
    forest_reg = RandomizedSearchCV(RandomForestRegressor(),
                                    param_distributions=parameters_forest,
                                    cv=self.cv, n_iter=self.n_iter, n_jobs=-1)
    forest_reg.fit(X_train, y_train)
    self.forest_reg = forest_reg.best_estimator_
def find_best_parameters_and_get_fitted_model(self, **kwargs):
    """
    Finds the best set of hyperparameters for a Random Forest on the
    provided data. The best hyperparameters are found by repeatedly drawing
    random samples from a distribution of parameters and evaluating them
    with cross-validation.
    """
    # load data
    data = kwargs['data']
    X = data['features']
    y = data['targets']
    out_args = {}

    # we choose the Random Forest Classifier as the machine learning
    # algorithm for this DPModel
    rc = RandomForestClassifier()

    # here we define the space of parameters over which we want to perform
    # the random search
    param_distributions = {}
    param_distributions["n_estimators"] = [50, 100, 150]

    # do random search
    random_search_outer = RandomizedSearchCV(rc,
                                             param_distributions=param_distributions,
                                             cv=5, n_iter=3)
    random_search_outer.fit(X, y)

    predictor = random_search_outer.best_estimator_
    return predictor, out_args
def Gradient(self):
    X_train, y_train = self.X_train, self.y_train
    parameters_boost = {'max_depth': randint(3, self.max_depth_max + 1),
                        'n_estimators': randint(80, 100 + self.n_estimators_max)}
    boost_reg = RandomizedSearchCV(GradientBoostingRegressor(loss=self.loss),
                                   param_distributions=parameters_boost,
                                   cv=self.cv, n_iter=self.n_iter, n_jobs=-1)
    boost_reg.fit(X_train, y_train)
    self.boost_reg = boost_reg.best_estimator_
def test_sklearn_cv():
    model = LightFM(loss='warp', random_state=42)

    # Set distributions for hyperparameters
    randint = stats.randint(low=1, high=65)
    randint.random_state = 42
    gamma = stats.gamma(a=1.2, loc=0, scale=0.13)
    gamma.random_state = 42
    distr = {'no_components': randint, 'learning_rate': gamma}

    # Custom score function
    def scorer(est, x, y=None):
        return precision_at_k(est, x).mean()

    # Custom CV which sets train_index = test_index
    class CV(KFold):
        def __iter__(self):
            ind = np.arange(self.n)
            for test_index in self._iter_test_masks():
                train_index = np.logical_not(test_index)
                train_index = ind[train_index]
                yield train_index, train_index

    cv = CV(n=train.shape[0], random_state=42)
    search = RandomizedSearchCV(estimator=model,
                                param_distributions=distr,
                                n_iter=10,
                                scoring=scorer,
                                random_state=42,
                                cv=cv)
    search.fit(train)
    assert search.best_params_['no_components'] == 52
def fit(self, drop_features=[], segments=["adopted", "sporadic", "low"]):
    """
    :param drop_features: list, which features to drop before fit
    :param segments: which user segments to consider while fitting
    :return:
    """
    if not self.transformed:
        self.transform_features()

    # pass the segments argument through instead of re-hardcoding the list
    user_id, class_labels, features = self._prep_for_fit(drop_features,
                                                         segments=segments)

    # this is a user-based model, therefore we want to avoid including the
    # same user in both train and test
    cv_strat = LabelKFold(user_id, n_folds=self.cv_params["folds"])

    # RandomizedSearchCV is vastly faster than GridSearchCV with a tolerable
    # loss of optimization.
    # NB: RandomForest doesn't generally require heavy parameter
    # optimization; this is somewhat for posterity here.
    random_search = RandomizedSearchCV(self.clf,
                                       param_distributions=self.param_grid,
                                       n_iter=self.n_iter,
                                       cv=cv_strat,
                                       scoring=self.cv_params["scorer"],
                                       n_jobs=self.n_jobs)

    print("running random param search on {} ".format(
        self.clf.__class__.__name__))

    random_search.fit(features, class_labels)
    self._handle_result(random_search, list(features.columns))
def optimize_hyperparameters(df):
    n_samples = df.shape[0]
    random_test = {
        'n_estimators': np.linspace(n_samples * 2, n_samples * 10, 5).astype(int),
        'criterion': ['gini', 'entropy'],
        'max_features': [None, 'sqrt', 'log2'],
        'min_samples_split': np.linspace(2, n_samples / 50, 10).astype(int),
        'min_samples_leaf': np.linspace(1, n_samples / 200, 10).astype(int),
        'max_leaf_nodes': np.linspace(10, n_samples / 50, 10).astype(int)
    }

    clf = RandomForestClassifier(oob_score=True, n_jobs=-1, random_state=42)
    X = df.values[:, :-1]
    y = df.values[:, -1]

    random_search = RandomizedSearchCV(clf, random_test, n_jobs=-1, cv=10,
                                       n_iter=500, random_state=42)
    random_search.fit(X, y)
    best_params = report(random_search.grid_scores_, verbose=False)

    # save best hyperparameters to csv
    with open('./temp/best_params.csv', 'wt') as f:
        w = csv.DictWriter(f, best_params.keys())
        w.writeheader()
        w.writerow(best_params)
class CVSearcher(SearcherBase):
    '''
    Cross-validation searcher; not specific to time series.
    '''

    def __init__(self, sklearn_model_class, params, scoring=None, method=None,
                 n_randomized_search=200, cv=5):
        super(CVSearcher, self).__init__(sklearn_model_class, params,
                                         method=method,
                                         n_randomized_search=n_randomized_search,
                                         cv=cv, scoring=scoring)

    def fit(self, X, Y):
        if self.method == 'Grid':
            self.__searcher = GridSearchCV(estimator=self.ml_class(),
                                           param_grid=self.search_space,
                                           scoring=self.scoring,
                                           cv=self.cv, refit=True)
        elif self.method == 'Randomized' or self.method is None:
            self.__searcher = RandomizedSearchCV(estimator=self.ml_class(),
                                                 param_distributions=self.search_space,
                                                 scoring=self.scoring,
                                                 n_iter=self.n_randomized_search,
                                                 cv=self.cv, refit=True)
        else:
            raise ValueError('CVSearcher only supports GridSearch and RandomizedSearch')
        self.__searcher.fit(X, Y)
        print("Best: %s" % self.__searcher.best_estimator_)
        return self

    def predict(self, X):
        return self.__searcher.predict(X)

    def get_scores(self):
        return self.__searcher.grid_scores_
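# A hypothetical usage sketch for CVSearcher; Ridge, the alpha grid, and the
# train/test arrays are illustrative assumptions (SearcherBase is assumed to
# store ml_class, search_space, scoring, cv as used in fit above).
from sklearn.linear_model import Ridge

searcher = CVSearcher(Ridge, {'alpha': [0.1, 1.0, 10.0]}, method='Grid', cv=5)
searcher.fit(X_train, Y_train)
preds = searcher.predict(X_test)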
def search_classifier(n_iter):
    assignments = load_structure()['ASS_ASSIGNMENT']
    features = load_featurized_training_set("files/train_featurized.pkl")
    # print(len(features.columns))

    X = features.drop(['DATE', 'n_calls'], axis=1).as_matrix().astype(float)
    y = (features.n_calls > 0).astype(int).as_matrix()
    calls = features.n_calls.as_matrix()

    X = StandardScaler().fit_transform(X)

    pipe = Pipeline([
        # ('scaler', StandardScaler()),
        # ('pca', RandomizedPCA()),
        ('clf', SGDClassifier())
    ])
    params = {
        # 'pca__n_components': [30, 50, 70, 86],
        'clf__class_weight': ['balanced'],
        'clf__loss': ['hinge'],
        'clf__penalty': ['l1'],
        'clf__alpha': st.uniform(0, 0.0003),
        'clf__fit_intercept': [False]
        # 'clf__alpha': [0.0001]
    }

    kf = KFold(len(X), n_folds=3, shuffle=True)
    grid_search = RandomizedSearchCV(pipe, params, scoring='accuracy', cv=kf,
                                     verbose=1000, n_iter=n_iter)
    grid_search.fit(X, y)

    print("\n")
    print(grid_search.best_params_)
    print(grid_search.best_score_)

    joblib.dump(grid_search.best_estimator_, "files/best_classifier.pkl")
def run(src_dir, mod, random_state=1234):
    if isinstance(src_dir, str):
        mat, labels_arr = load_mat_and_labels(src_dir, mod)
    else:
        mat, labels_arr = (src_dir, mod)

    masker = SimpleMaskerPipeline(threshold=.2)
    svc = SVC(kernel='linear')

    pipeline = Pipeline([('masker', masker),
                         ('anova', SelectKBest(k=500)),
                         ('svc', svc)])

    c_range = gamma.rvs(size=100, a=1.99, random_state=random_state)
    param_dist = {"svc__C": c_range}

    n_iter = 100
    cv = StratifiedShuffleSplit(labels_arr, n_iter=n_iter, test_size=1/6.0,
                                random_state=random_state)

    total_runs = n_iter
    scorer = verbose_scorer(total_runs)

    search = RandomizedSearchCV(pipeline, param_distributions=param_dist,
                                cv=cv, scoring=scorer,
                                random_state=random_state)
    search.fit(mat, labels_arr)

    return search
def K_NN(Xtrain, Ytrain, Xtest):
    KNNoptparam = {
        "n_neighbors": np.arange(20, 200, 10),
        "weights": ['uniform', 'distance'],
        "algorithm": ['auto', 'ball_tree', 'kd_tree', 'brute'],
        # "leaf_size": np.arange(30, 150, 15),
        "p": [2, 3]
    }
    # Randomized-search parameter optimization
    RF1 = RandomizedSearchCV(KNeighborsRegressor(),
                             param_distributions=KNNoptparam,
                             cv=10, n_iter=int(args[1]), n_jobs=-1,
                             random_state=0)
    RF1.fit(Xtrain, Ytrain)
    # Predict on unseen data
    KNN_predict = RF1.predict(Xtest)
    # save the model to disk
    filename = 'finalized_KNN.sav'
    pickle.dump(RF1, open(filename, 'wb'))
    return KNN_predict
def train_dataset_crf(self, X, Y, ratio):
    crf = sklearn_crfsuite.CRF(algorithm='lbfgs',
                               max_iterations=100,
                               all_possible_transitions=True)
    params_space = {
        'c1': scipy.stats.expon(scale=0.5),
        'c2': scipy.stats.expon(scale=0.05)
    }

    import multiprocessing
    cpus = multiprocessing.cpu_count()
    rs = RandomizedSearchCV(crf, params_space, cv=3, verbose=1,
                            n_jobs=cpus - 1, n_iter=50)

    assert len(X) == len(Y)
    # subset of indexes to use in training
    r_indexes = randint(low=0, high=len(X) - 1, size=round(ratio * (len(X) - 1)))
    X_subset = [X[i] for i in r_indexes]
    Y_subset = [Y[i] for i in r_indexes]

    rs.fit(X_subset, Y_subset)
    return rs
def tune(data, labels, clf=None):
    from sklearn.cross_validation import StratifiedShuffleSplit

    sss = StratifiedShuffleSplit(labels, n_iter=10, test_size=.1,
                                 random_state=42)
    clf = Pipeline([('num_features', SelectKBest(f_classif, k=100)),
                    ('svm', svm.SVC(C=.01, kernel='linear',
                                    probability=True, random_state=11))])
    param_grid = {
        'num_features__k': range(250, 2500, 250),
        'svm__C': 10.**np.arange(-3, 4),
        # 'svm__loss': ['hinge', 'squared_hinge'],
        'svm__class_weight': ['balanced', None]
    }
    grid_search = RandomizedSearchCV(clf, param_grid,
                                     n_iter=100,
                                     cv=sss,
                                     scoring='f1',
                                     n_jobs=-1,
                                     pre_dispatch='2*n_jobs',
                                     random_state=42)
    grid_search.fit(data, labels)
    print("Best score: %0.3f" % grid_search.best_score_)
    print("Best parameters set:")
    best_parameters = grid_search.best_estimator_.get_params()
    for p in param_grid.keys():
        print(p, best_parameters[p])
    # plot_cs(grid_search)
    return grid_search
def randomized_search_rfc(txt_lst, y):
    # build a classifier pipeline
    pipeline = Pipeline([
        ('vect', CountVectorizer(stop_words='english', analyzer=analyzer,
                                 ngram_range=(1, 3))),
        ('tfidf', TfidfTransformer()),
        ('clf', RandomForestClassifier(n_estimators=100))
    ])

    # specify parameters and distributions to sample from
    param_dist = {
        'vect__ngram_range': [None, (1, 2), (1, 3), (1, 4)],
        "clf__max_depth": map(lambda x: int(x), np.logspace(1, 4, 10)),  # sp.stats.randint(10, 1000),
        "clf__max_features": map(lambda x: int(x), np.logspace(0, 3, 10)),
        "clf__min_samples_split": sp.stats.randint(1, 11),
        "clf__min_samples_leaf": sp.stats.randint(2, 11),
        "clf__criterion": ["gini", "entropy"]
    }

    n_iter_search = 50
    grid_search = RandomizedSearchCV(pipeline, param_distributions=param_dist,
                                     verbose=1, n_iter=n_iter_search,
                                     cv=5, n_jobs=1, scoring='accuracy')

    start = time.time()
    grid_search.fit(txt_lst, y)
    print("RandomizedSearchCV took %.2f seconds for %d candidate"
          " parameter settings." % ((time.time() - start), n_iter_search))
    report(grid_search.grid_scores_, n_top=5)
    return grid_search
def doRandomSearch(self, clfName, clf, param_dist, X, Y):
    if self._custRandomSearchFlag == True:
        return self.doCustRandomSearch(clfName, clf, param_dist, X, Y)
    else:
        start = time.time()
        multiCores = -1
        if clfName == "Logistic_Regression":
            multiCores = 1
        if self._setXgboostTheradToOne == True and clfName == "Xgboost":
            multiCores = 1

        random_search = RandomizedSearchCV(clf,
                                           param_distributions=param_dist,
                                           n_iter=self._n_iter_search,
                                           n_jobs=multiCores,
                                           scoring='log_loss',
                                           verbose=10)
        random_search.fit(X, Y)
        log(clfName + " randomized search cost: ", time.time() - start, " sec")
        self._bestClf[clfName] = random_search.best_estimator_
        # self._bestLoglossDict[clfName] = self.getLogloss(self._bestClf[clfName], X, Y)
        self._bestLoglossDict[clfName] = self.validation(
            self._bestClf[clfName], X, Y, test_size=0.3)
        log("customized logloss: ", self._bestLoglossDict[clfName])
        self.report(random_search.grid_scores_, clfName)
        dumpModel(random_search.best_estimator_, clfName, self._expInfo,
                  self._subFolderName)
        self._lastRandomSearchBestParam = random_search.best_params_
        return random_search.best_estimator_
def auto_tune_paras_random_search(model, param_dist, x_input_train, y_input_train,
                                  n_iter_search=1, num_folds=5):
    """
    Execute a randomized search over the input model according to the
    parameter dictionary.
    # Credit: adapted from the scikit-learn example at
    # http://scikit-learn.org/stable/auto_examples/model_selection/plot_randomized_search.html#sphx-glr-auto-examples-model-selection-plot-randomized-search-py

    :param model: a sklearn model (an Estimator)
    :param param_dist: parameter dictionary
    :param x_input_train: a pandas data frame of input features for the train set
    :param y_input_train: a numpy array or pandas series of ground truth for the train set
    :param n_iter_search: number of iterations to search
    :param num_folds: number of folds for cross-validation
    :return: trained search object from the cross-validation
    """
    random_search_pipe = RandomizedSearchCV(model,
                                            param_distributions=param_dist,
                                            scoring=f1_scorer,
                                            n_iter=n_iter_search,
                                            verbose=10,
                                            cv=num_folds,
                                            random_state=0)
    start = time()
    random_search_pipe.fit(x_input_train, y_input_train)
    print("RandomizedSearchCV took %.2f seconds for %d candidate"
          " parameter settings." % ((time() - start), n_iter_search))
    report_search_scores(random_search_pipe.cv_results_)
    return random_search_pipe
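# A hypothetical call to auto_tune_paras_random_search; the SVC and the C
# distribution are illustrative, and f1_scorer, report_search_scores and the
# training data are assumed to exist as in the snippet above.
from scipy.stats import expon
from sklearn.svm import SVC

tuned_search = auto_tune_paras_random_search(SVC(), {"C": expon(scale=10)},
                                             x_input_train, y_input_train,
                                             n_iter_search=20, num_folds=5)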
def randomized_search_forest():
    clf = RandomForestClassifier(n_estimators=20)

    # specify parameters and distributions to sample from
    param_dist = {
        "max_depth": [3, None],
        "max_features": sp_randint(1, 7),
        "min_samples_split": sp_randint(1, 11),
        "min_samples_leaf": sp_randint(1, 11),
        "bootstrap": [True, False],
        "criterion": ["gini", "entropy"]
    }

    # run randomized search
    n_iter_search = 100
    random_search = RandomizedSearchCV(clf,
                                       param_distributions=param_dist,
                                       n_iter=n_iter_search)
    start = time()
    random_search.fit(X_train, y_train)
    print("RandomizedSearchCV took %.2f seconds for %d candidate"
          " parameter settings.\n" % ((time() - start), n_iter_search))
    print('best score: %f\n' % random_search.best_score_)
    print('best estimator: %s\n' % random_search.best_estimator_)
    print('best parameters: %s\n' % random_search.best_params_)

    clf = random_search.best_estimator_
    clf.fit(X_train, y_train)
    print('Test accuracy: %.3f' % clf.score(X_test, y_test))
def train(model_id, train_x, train_y, valid_x, valid_y, test_x):
    train_x, train_y = shuffle(train_x, train_y)
    random_state = random.randint(0, 1000000)

    rf = RandomForestClassifier(n_jobs=8)
    param_dist = {
        "n_estimators": sp_randint(100, 300),
        "criterion": ["gini"],
        # "max_depth": sp_randint(3, 10000),
        # "min_samples_split": sp_randint(1, 300),
        # "min_samples_leaf": sp_randint(1, 300),
        "max_features": sp_randint(10, 26),
        "bootstrap": [True, False],
        'random_state': sp_randint(1, 1000000),
    }
    clf = RandomizedSearchCV(rf, param_distributions=param_dist,
                             n_iter=50, cv=10, scoring='roc_auc')
    clf.fit(train_x, train_y)

    valid_predictions = clf.predict_proba(valid_x)[:, 1]
    test_predictions = clf.predict_proba(test_x)[:, 1]

    loss = roc_auc_score(valid_y, valid_predictions)
    print('loss:')
    print(loss)
    print(clf.best_estimator_)

    data.saveData(valid_id, valid_predictions,
                  "./valid_results/valid_" + str(model_id) + ".csv")
    data.saveData(test_id, test_predictions,
                  "./results/results_" + str(model_id) + ".csv")
def get_best_model(X_train, y_train, labels):
    '''
    :param X_train: train features
    :param y_train: train labels
    :param labels: list of all labels to be evaluated
    :return: the best CRF estimator found by the randomized search
    '''
    crf = sklearn_crfsuite.CRF(
        algorithm='lbfgs',
        max_iterations=100,
        all_possible_transitions=True
    )
    params_space = {
        'c1': scipy.stats.expon(scale=0.5),
        'c2': scipy.stats.expon(scale=0.05),
    }

    # use the same metric for evaluation
    f1_scorer = make_scorer(metrics.flat_f1_score,
                            average='weighted', labels=labels)

    # search
    rs = RandomizedSearchCV(crf, params_space,
                            cv=3,
                            verbose=1,
                            n_jobs=-1,
                            n_iter=50,
                            scoring=f1_scorer)
    rs.fit(X_train, y_train)
    return rs.best_estimator_
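# A hypothetical usage sketch for get_best_model; the label list and the
# X_train/y_train/X_test sequences (in sklearn-crfsuite format) are
# illustrative assumptions.
best_crf = get_best_model(X_train, y_train, labels=['B-PER', 'I-PER', 'B-LOC'])
y_pred = best_crf.predict(X_test)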
def crf_tune_hyperparam(data, index, label, word_set_suffix, word_set_prefix,
                        max_iterations=500):
    train_data = [data[i] for i in index]
    X = [sent2features(s, word_set_suffix, word_set_prefix)
         for s in train_data]
    y = [sent2labels(s) for s in train_data]

    crf = sklearn_crfsuite.CRF(algorithm='lbfgs',
                               max_iterations=max_iterations,
                               all_possible_transitions=True)
    params_space = {
        'c1': scipy.stats.expon(scale=0.5),
        'c2': scipy.stats.expon(scale=0.05),
    }
    # note: this mutates the caller's label list
    label.remove("O")
    # use the same metric for evaluation
    f1_scorer = make_scorer(metrics.flat_f1_score,
                            average='weighted', labels=label)
    # search
    rs = RandomizedSearchCV(crf, params_space,
                            cv=3,
                            verbose=1,
                            n_jobs=8,
                            n_iter=50,
                            scoring=f1_scorer)
    rs.fit(X, y)
    # best_params_ (not best_params) holds the winning settings
    return rs.best_params_['c1'], rs.best_params_['c2']
def run(full, target_col, random_state=1234, c_range_alpha=.05,
        c_range_size=100, normalize=False, score_fn=r2_score):
    svr = linearSVRPermuteCoefFactory()

    pipeline_steps = [('svr', svr)]
    pipeline = Pipeline(pipeline_steps)

    c_range = gamma.rvs(size=c_range_size, a=c_range_alpha,
                        random_state=random_state)
    param_dist = {"svr__C": c_range}

    data, target = separate(full, target_col)
    if normalize:
        data = scale(data)

    n_iter = 100
    cv = ShuffleSplit(len(target), n_iter=n_iter, test_size=1/6.0,
                      random_state=random_state)

    total_runs = n_iter
    scorer = verbose_scorer(total_runs, score_fn)

    search = RandomizedSearchCV(pipeline, param_distributions=param_dist,
                                cv=cv, scoring=scorer,
                                random_state=random_state)
    search.fit(data, target)

    return search
def optimize_parameter(self):
    self.console.output("[CTG] OPTIMIZATION START...", "\n")
    # Compute the cross-validation accuracy of the old (i.e. initial) model
    old_scores = cross_validation.cross_val_score(
        estimator=self.evaluator.pipeline,
        X=self.x_train, y=self.y_train,
        scoring='accuracy', cv=10, n_jobs=-1)
    old_score = np.mean(old_scores)

    # Compute the best cross-validation accuracy among the candidate new models
    new_score = -1.0
    self.new_estimator = None
    for clf, param_grid in RandomParameterSettings.possible_models:
        self.console.output("[CTG] SEARCH MODEL:", str(clf) + "\n")
        estimator = Pipeline([('scl', StandardScaler()),
                              ('pca', PCA()),
                              ('clf', clf)])
        gs = RandomizedSearchCV(estimator=estimator,
                                param_distributions=param_grid,
                                scoring='accuracy', cv=10, n_jobs=-1)
        gs = gs.fit(self.x_train, self.y_train)
        if new_score < gs.best_score_:
            new_score = gs.best_score_
            self.new_estimator = gs.best_estimator_

    if new_score > old_score:
        self.label_tips.config(
            text='Found a new model with improvement: %.2f%%' %
                 (100.0 * (new_score - old_score) / old_score))
        self.button_opt.config(text='应用', command=self.apply_new_estimator)  # '应用' = "Apply"
    else:
        self.label_tips.config(text="No better model found.")

    self.console.output("[CTG] OPTIMIZATION COMPLETE !", "\n")
    self.console.output("[CTG] RESULT: ",
                        "old_model_accuracy=%f, new_model_accuracy=%f, improvement=%.2f%%\n" % (
                            old_score, new_score,
                            100.0 * (new_score - old_score) / old_score) + "\n")
def training_op(X_train, y_train):
    crf = sklearn_crfsuite.CRF(algorithm='lbfgs',
                               max_iterations=10,
                               all_possible_transitions=True)
    crf.fit(X_train, y_train)
    filename = 'crf_withoutCV'
    pickle.dump(crf, open(filename, 'wb'), protocol=pickle.HIGHEST_PROTOCOL)

    labels = list(crf.classes_)
    labels.remove('O')

    params_space = {
        'c1': scipy.stats.expon(scale=0.5),
        'c2': scipy.stats.expon(scale=0.05),
    }
    # use the same metric for evaluation
    f1_scorer = make_scorer(metrics.flat_f1_score,
                            average='weighted', labels=labels)
    # search
    rs = RandomizedSearchCV(crf, params_space,
                            cv=5,
                            verbose=1,
                            n_jobs=1,
                            n_iter=5,
                            scoring=f1_scorer)
    rs.fit(X_train, y_train)

    crf = rs.best_estimator_
    filename = 'CRF_model'
    pickle.dump(crf, open(filename, 'wb'), protocol=pickle.HIGHEST_PROTOCOL)
def searchBestModelParameters(algorithm, trainingData):
    # using random forest
    if algorithm == 'rf':
        numTrees = range(10, 100, 10)
        numMinLeafSamples = range(2, 20, 2)
        numMinSamplesSplit = range(1, 20, 3)
        paramDistribution = dict(n_estimators=numTrees,
                                 min_samples_leaf=numMinLeafSamples,
                                 min_samples_split=numMinSamplesSplit)
        model = RandomForestClassifier()
    elif algorithm == 'knn':
        # model the data using knn
        # define the parameter values that should be searched
        k_range = range(1, 50)
        weight_options = ['uniform', 'distance']
        # specify "parameter distributions" rather than a "parameter grid"
        paramDistribution = dict(n_neighbors=k_range, weights=weight_options)
        model = KNeighborsClassifier()
    elif algorithm == 'logr':
        # model the data using logistic regression
        model = LogisticRegression()
        get_ipython().magic("time print(np.sqrt(-cross_val_score(model, trainingData, trainingData['isSpam'], cv=10, scoring='mean_squared_error')).mean())")
        return

    bestRun = []
    for _ in range(20):
        rand = RandomizedSearchCV(model, paramDistribution, cv=10,
                                  scoring='accuracy', n_iter=10)
        rand.fit(trainingData, trainingData['isSpam'])
        # examine the best model
        bestRun.append({'score': round(rand.best_score_, 3),
                        'params': rand.best_params_})
    print(max(bestRun, key=lambda x: x['score']))
    return max(bestRun, key=lambda x: x['score'])
def randomSearch(classifier, parameters, XTr, yTr, cv, n_iter):
    print("***** Random Search *****")
    print("Cross-Validation:{0} and number of iterations:{1}".format(
        cv, n_iter))
    scores = ['accuracy']

    for score in scores:
        print("# Tuning hyper-parameters for %s" % score)
        print()
        if score == 'accuracy':
            scoring_method = score
        else:
            scoring_method = score + '_micro'
        clf = RandomizedSearchCV(classifier, param_distributions=parameters,
                                 cv=cv, scoring=scoring_method, n_iter=n_iter)
        clf.fit(XTr, yTr)

        print("Best parameters and scores found on the development set:")
        # print(self.clf.best_estimator_)
        print(clf.best_params_)
        print(clf.best_score_)
        print()
    return clf.best_params_
def makeRandomCV(dataset, dbtype='CATH', level=1, k_iters=10, minsamples=500,
                 clf=ExtraTreesClassifier(n_estimators=5, class_weight='auto')):
    from scipy.stats import randint as sp_randint

    dataDict = dbParser(dataset, level=level, dbtype=dbtype,
                        minsamples=minsamples)
    print dataDict
    labels = dataDict['target_names']

    param_dist = {"max_depth": [3, None],
                  "max_features": sp_randint(1, 11),
                  "min_samples_split": sp_randint(1, 11),
                  "min_samples_leaf": sp_randint(1, 11),
                  "bootstrap": [True, False],
                  "criterion": ["gini", "entropy"]}

    n_iter_search = k_iters
    random_search = RandomizedSearchCV(clf, param_distributions=param_dist,
                                       n_iter=n_iter_search)
    random_search.fit(dataDict['vectors'], dataDict['target_names'])
    report(random_search.grid_scores_)
def test_grid_search_with_multioutput_data():
    # Test search with multi-output estimator

    X, y = make_multilabel_classification(random_state=0)

    est_parameters = {"max_depth": [1, 2, 3, 4]}
    cv = KFold(y.shape[0], random_state=0)

    estimators = [DecisionTreeRegressor(random_state=0),
                  DecisionTreeClassifier(random_state=0)]

    # Test with grid search cv
    for est in estimators:
        grid_search = GridSearchCV(est, est_parameters, cv=cv)
        grid_search.fit(X, y)
        for parameters, _, cv_validation_scores in grid_search.grid_scores_:
            est.set_params(**parameters)

            for i, (train, test) in enumerate(cv):
                est.fit(X[train], y[train])
                correct_score = est.score(X[test], y[test])
                assert_almost_equal(correct_score,
                                    cv_validation_scores[i])

    # Test with a randomized search
    for est in estimators:
        random_search = RandomizedSearchCV(est, est_parameters,
                                           cv=cv, n_iter=3)
        random_search.fit(X, y)
        for parameters, _, cv_validation_scores in random_search.grid_scores_:
            est.set_params(**parameters)

            for i, (train, test) in enumerate(cv):
                est.fit(X[train], y[train])
                correct_score = est.score(X[test], y[test])
                assert_almost_equal(correct_score,
                                    cv_validation_scores[i])
def fitAlgo(clf, Xtrain, Ytrain, opt=False, param_dict=None, opt_metric='roc_auc', n_iter=5):
    '''Return the fitted classifier.

    Keyword arguments:
    clf -- base classifier
    Xtrain -- training feature matrix
    Ytrain -- training target array
    param_dict -- the parameter distribution or grid space; if
                  opt == False, every element should have length 1
    opt_metric -- optimization metric
    opt -- whether to do optimization or not
    '''
    if opt and param_dict is not None:
        assert all(isinstance(param_dict[x], list) for x in param_dict)
        rs = RandomizedSearchCV(estimator=clf, n_iter=n_iter,
                                param_distributions=param_dict,
                                scoring=opt_metric,
                                refit=True,
                                n_jobs=-1, cv=3, verbose=3)
        rs.fit(Xtrain, Ytrain)

        imp = []
        if clf.__class__.__name__ == "RandomForestClassifier":
            imp = rs.best_estimator_.feature_importances_
        return rs.best_estimator_, rs.grid_scores_, imp
    else:
        if param_dict is not None:
            assert all(not isinstance(param_dict[x], list) for x in param_dict)
            for k in param_dict.keys():
                # set_params(k=...) would set a literal parameter named 'k';
                # unpack the name instead
                clf.set_params(**{k: param_dict[k]})
        clf.fit(Xtrain, Ytrain)
        return clf, [], []
def doRandomSearch(self, clfName, clf, param_dist, X, Y):
    start = time.time()
    multiCores = -1
    if clfName == "Logistic_Regression":
        multiCores = 1
    if self._setXgboostTheradToOne == True and clfName == "Xgboost":
        multiCores = 1

    random_search = RandomizedSearchCV(clf,
                                       param_distributions=param_dist,
                                       n_iter=self._n_iter_search,
                                       n_jobs=multiCores,
                                       scoring='log_loss')
    random_search.fit(X, Y)
    log(clfName + " randomized search cost: ", time.time() - start, " sec")

    self._bestClf[clfName] = random_search.best_estimator_
    self._bestLoglossDict[clfName] = self.getLogloss(
        self._bestClf[clfName], X, Y)
    self.report(random_search.grid_scores_, clfName,
                self._bestLoglossDict[clfName])
    dumpModel(random_search.best_estimator_, clfName, self._expInfo,
              self._subFolderName)

    return random_search.best_estimator_
def fit_estimator(estimator,
                  positive_data_matrix=None,
                  negative_data_matrix=None,
                  target=None,
                  cv=10,
                  n_jobs=-1,
                  n_iter_search=40,
                  random_state=1):
    # hyperparameter optimization
    param_dist = {"n_iter": randint(5, 100),
                  "power_t": uniform(0.1),
                  "alpha": uniform(1e-08, 1e-03),
                  "eta0": uniform(1e-03, 1),
                  "penalty": ["l1", "l2", "elasticnet"],
                  "learning_rate": ["invscaling", "constant", "optimal"]}
    scoring = 'roc_auc'
    random_search = RandomizedSearchCV(estimator,
                                       param_distributions=param_dist,
                                       n_iter=n_iter_search,
                                       cv=cv,
                                       scoring=scoring,
                                       n_jobs=n_jobs,
                                       random_state=random_state,
                                       refit=True)
    X, y = make_data_matrix(positive_data_matrix=positive_data_matrix,
                            negative_data_matrix=negative_data_matrix,
                            target=target)
    random_search.fit(X, y)

    logger.debug('\nClassifier:')
    logger.debug('%s' % random_search.best_estimator_)
    logger.debug('\nPredictive performance:')
    # assess the generalization capacity of the model via a 10-fold cross-validation
    for scoring in ['accuracy', 'precision', 'recall', 'f1',
                    'average_precision', 'roc_auc']:
        scores = cross_validation.cross_val_score(
            random_search.best_estimator_, X, y,
            cv=cv, scoring=scoring, n_jobs=n_jobs)
        logger.debug('%20s: %.3f +- %.3f' % (scoring,
                                             np.mean(scores),
                                             np.std(scores)))

    return random_search.best_estimator_
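# A hypothetical call to fit_estimator; SGDClassifier matches the
# n_iter/eta0/penalty space searched above, and the positive/negative data
# matrices are illustrative assumptions.
from sklearn.linear_model import SGDClassifier

best_model = fit_estimator(SGDClassifier(),
                           positive_data_matrix=X_pos,
                           negative_data_matrix=X_neg,
                           cv=10, n_iter_search=40)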
def main():
    NUM_TRAIN = bw_componentrecognition.NUM_TRAIN
    N_BINS = 23
    N_HU_MOMENTS = 7
    N_FEATURES = N_BINS + N_HU_MOMENTS

    X, y = bw_componentrecognition.Data.loadTrain(NUM_TRAIN, N_BINS)

    scaler = preprocessing.StandardScaler().fit(X)
    X = scaler.transform(X)

    clfs = [
        RandomForestClassifier(n_estimators=20),
    ]
    param_dists = [
        {"max_depth": [10, 5, 3, None],
         "max_features": sp_randint(1, 11),
         "min_samples_split": sp_randint(1, 11),
         "min_samples_leaf": sp_randint(1, 11),
         "bootstrap": [True, False],
         "criterion": ["gini", "entropy"]},
    ]

    for clf, param_dist in zip(clfs, param_dists):
        # run randomized search
        n_iter_search = 25
        random_search = RandomizedSearchCV(clf,
                                           param_distributions=param_dist,
                                           n_iter=n_iter_search)
        random_search.fit(X, y)
        report(random_search.grid_scores_)