def search(self, search_space, search_iter, n_estimators, x, y): if 'n_estimators' in search_space: del search_space['n_estimators'] params = { 'boosting_type': ['gbdt'], 'min_child_weight': [5], 'min_split_gain': [1.0], 'subsample': [0.8], 'colsample_bytree': [0.6], 'max_depth': [10], 'n_estimators': n_estimators, 'num_leaves': [70], 'learning_rate': [0.04], } params.update(search_space) if self.verbose: print(params) folds = 3 score_metric, skf = self.get_skf(folds) random_search = RandomizedSearchCV(self.lgbm, param_distributions=params, n_iter=search_iter, scoring=score_metric, n_jobs=1, cv=skf, verbose=0, random_state=1001) random_search.fit(x, y) self.clf = random_search.best_estimator_ return random_search.best_params_
def parameter_search(model, X, y, params, metric, n=10):
    ''' runs a randomized hyperparameter search and returns the fitted search object '''
    random_search = RandomizedSearchCV(model, param_distributions=params,
                                       scoring=metric, n_jobs=3, n_iter=n)
    random_search.fit(X, y)
    return random_search
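# A hedged usage sketch for parameter_search above; the estimator, the synthetic
# data and the parameter distributions below are illustrative assumptions, not
# part of the original code.
from scipy.stats import randint
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier

X_demo, y_demo = make_classification(n_samples=200, n_features=20, random_state=0)
demo_params = {'n_estimators': randint(50, 300), 'max_depth': [3, 5, None]}
demo_search = parameter_search(RandomForestClassifier(random_state=0),
                               X_demo, y_demo, demo_params, metric='f1_macro', n=20)
print(demo_search.best_params_, demo_search.best_score_)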
def pr_curve(i): label = labels[i] statistics_l = Statistics() print('Doing label {}'.format(label)) for train_idx, valid_idx in folds: rng = np.random.RandomState() rng.seed(seeds[i]) training_fold = developement_df.loc[train_idx, ] training_fold = training_fold.reset_index(drop=True) validation_fold = developement_df.loc[valid_idx, ] validation_fold = validation_fold.reset_index(drop=True) base_estimators = make_classifiers(method, balanced, labels, random_state=rng) # Find the best params, then do a final proper calibration. base_estimator = base_estimators[label] estimator = RandomizedSearchCV( estimator=base_estimator, param_distributions=params, n_iter=60, scoring='f1', cv=3, random_state=rng, error_score=0.0, n_jobs=1, pre_dispatch='2*n_jobs', refit=True ) # Set up the vectorizer for the bag-of-words representation if vectorizer_method == 'tf-idf': vectorizer = TfidfVectorizer( stop_words=['go', '', ' '], binary=binary, lowercase=True, sublinear_tf=False, max_df=1.0, min_df=0 ) vectorizer.fit(training_fold['terms'].values) elif vectorizer_method == 'count': vectorizer = CountVectorizer( stop_words=['go', '', ' '], binary=binary, lowercase=True ) vectorizer.fit(training_fold['terms'].values) # Fit an evaluate the performance of the classifier. x_train = vectorizer.transform(training_fold['terms'].values) y_train = np.asarray(training_fold[label].values, dtype=int) x_valid = vectorizer.transform(validation_fold['terms'].values) y_valid = np.asarray(validation_fold[label].values, dtype=int) estimator.fit(x_train, y_train) for t in thresholds: y_pred = [int(p[1] >= t) for p in estimator.predict_proba(x_valid)] precision = precision_score(y_valid, y_pred, labels=[0, 1], pos_label=1) recall = recall_score(y_valid, y_pred, labels=[0, 1], pos_label=1) f1 = f1_score(y_valid, y_pred, labels=[0, 1], pos_label=1) statistics_l.update_statistics(label=t, s_type='Precision', data=precision) statistics_l.update_statistics(label=t, s_type='Recall', data=recall) statistics_l.update_statistics(label=t, s_type='F1-Score', data=f1) statistics_l.frame()['reaction'] = label return statistics_l
def build_nn(x_train, y_train, x_test, y_test, n_features): """ Constructing a regression neural network model from input dataframe :param x_train: features dataframe for model training :param y_train: target dataframe for model training :param x_test: features dataframe for model testing :param y_test: target dataframe for model testing :return: None """ net = NeuralNet(layers=[('input', InputLayer), ('hidden0', DenseLayer), ('hidden1', DenseLayer), ('output', DenseLayer)], input_shape=(None, x_train.shape[1]), # Number of i/p nodes = number of columns in x hidden0_num_units=15, hidden0_nonlinearity=lasagne.nonlinearities.softmax, hidden1_num_units=17, hidden1_nonlinearity=lasagne.nonlinearities.softmax, output_num_units=1, # Number of o/p nodes = number of columns in y output_nonlinearity=lasagne.nonlinearities.softmax, max_epochs=100, update_learning_rate=0.01, regression=True, verbose=0) # Finding the optimal set of params for each variable in the training of the neural network param_dist = {'hidden0_num_units':sp_randint(3, 30), 'hidden1_num_units':sp_randint(3, 30)} clf = RandomizedSearchCV(estimator=net, param_distributions=param_dist, n_iter=15, n_jobs=-1) clf.fit(x_train, y_train) y_pred = clf.predict(x_test) # Mean absolute error regression loss mean_abs = sklearn.metrics.mean_absolute_error(y_test, y_pred) # Mean squared error regression loss mean_sq = sklearn.metrics.mean_squared_error(y_test, y_pred) # Median absolute error regression loss median_abs = sklearn.metrics.median_absolute_error(y_test, y_pred) # R^2 (coefficient of determination) regression score function r2 = sklearn.metrics.r2_score(y_test, y_pred) # Explained variance regression score function exp_var_score = sklearn.metrics.explained_variance_score(y_test, y_pred) with open('../trained_networks/nn_%d_data.pkl' % n_features, 'wb') as results: pickle.dump(clf, results, pickle.HIGHEST_PROTOCOL) pickle.dump(net, results, pickle.HIGHEST_PROTOCOL) pickle.dump(mean_abs, results, pickle.HIGHEST_PROTOCOL) pickle.dump(mean_sq, results, pickle.HIGHEST_PROTOCOL) pickle.dump(median_abs, results, pickle.HIGHEST_PROTOCOL) pickle.dump(r2, results, pickle.HIGHEST_PROTOCOL) pickle.dump(exp_var_score, results, pickle.HIGHEST_PROTOCOL) pickle.dump(y_pred, results, pickle.HIGHEST_PROTOCOL) return
def test_trivial_cv_results_attr():
    # Test search over a "grid" with only one point.
    # Non-regression test: grid_scores_ wouldn't be set by GridSearchCV.
    clf = MockClassifier()
    grid_search = GridSearchCV(clf, {'foo_param': [1]})
    grid_search.fit(X, y)
    assert_true(hasattr(grid_search, "cv_results_"))

    random_search = RandomizedSearchCV(clf, {'foo_param': [0]}, n_iter=1)
    random_search.fit(X, y)
    assert_true(hasattr(random_search, "cv_results_"))
def test_pickle(): # Test that a fit search can be pickled clf = MockClassifier() grid_search = GridSearchCV(clf, {'foo_param': [1, 2, 3]}, refit=True) grid_search.fit(X, y) pickle.dumps(grid_search) # smoke test random_search = RandomizedSearchCV(clf, {'foo_param': [1, 2, 3]}, refit=True, n_iter=3) random_search.fit(X, y) pickle.dumps(random_search) # smoke test
def test_randomgridsearch_slm(make_gaus_data): X, y, Xs, ys = make_gaus_data slm = StandardLinearModel(LinearBasis(onescol=True)) param_dict = { 'var': [Parameter(1.0 / v, Positive()) for v in range(1, 6)] } estimator = RandomizedSearchCV(slm, param_dict, n_jobs=-1, n_iter=2) estimator.fit(X, y) Ey = estimator.predict(Xs) assert len(ys) == len(Ey) # we just want to make sure this all runs
def test_randomgridsearch_glm(make_gaus_data): X, y, Xs, ys = make_gaus_data glm = GeneralizedLinearModel(Gaussian(), LinearBasis(onescol=True), random_state=1, maxiter=100) param_dict = {'batch_size': range(1, 11)} estimator = RandomizedSearchCV(glm, param_dict, verbose=1, n_jobs=-1, n_iter=2) estimator.fit(X, y) Ey = estimator.predict(Xs) assert len(ys) == len(Ey) # we just want to make sure this all runs
def test_pickle(): # Test that a fit search can be pickled clf = MockClassifier() grid_search = GridSearchCV(clf, {'foo_param': [1, 2, 3]}, refit=True) grid_search.fit(X, y) grid_search_pickled = pickle.loads(pickle.dumps(grid_search)) assert_array_almost_equal(grid_search.predict(X), grid_search_pickled.predict(X)) random_search = RandomizedSearchCV(clf, {'foo_param': [1, 2, 3]}, refit=True, n_iter=3) random_search.fit(X, y) random_search_pickled = pickle.loads(pickle.dumps(random_search)) assert_array_almost_equal(random_search.predict(X), random_search_pickled.predict(X))
def test__extract_arfftrace(self): param_grid = {"max_depth": [3, None], "max_features": [1, 2, 3, 4], "bootstrap": [True, False], "criterion": ["gini", "entropy"]} num_iters = 10 task = openml.tasks.get_task(20) clf = RandomizedSearchCV(RandomForestClassifier(), param_grid, num_iters) # just run the task train, _ = task.get_train_test_split_indices(0, 0) X, y = task.get_X_and_y() clf.fit(X[train], y[train]) trace_attribute_list = _extract_arfftrace_attributes(clf) trace_list = _extract_arfftrace(clf, 0, 0) self.assertIsInstance(trace_attribute_list, list) self.assertEquals(len(trace_attribute_list), 5 + len(param_grid)) self.assertIsInstance(trace_list, list) self.assertEquals(len(trace_list), num_iters) # found parameters optimized_params = set() for att_idx in range(len(trace_attribute_list)): att_type = trace_attribute_list[att_idx][1] att_name = trace_attribute_list[att_idx][0] if att_name.startswith("parameter_"): # add this to the found parameters param_name = att_name[len("parameter_"):] optimized_params.add(param_name) for line_idx in range(len(trace_list)): val = json.loads(trace_list[line_idx][att_idx]) legal_values = param_grid[param_name] self.assertIn(val, legal_values) else: # repeat, fold, itt, bool for line_idx in range(len(trace_list)): val = trace_list[line_idx][att_idx] if isinstance(att_type, list): self.assertIn(val, att_type) elif att_name in ['repeat', 'fold', 'iteration']: self.assertIsInstance(trace_list[line_idx][att_idx], int) else: # att_type = real self.assertIsInstance(trace_list[line_idx][att_idx], float) self.assertEqual(set(param_grid.keys()), optimized_params)
def test_large_grid(): """In this test, we purposely overfit a RandomForest to completely random data in order to assert that the test error will far supercede the train error. """ if not SK18: custom_cv = KFold(n=y_train.shape[0], n_folds=3, shuffle=True, random_state=42) else: custom_cv = KFold(n_splits=3, shuffle=True, random_state=42) # define the pipe pipe = Pipeline([ ('scaler', SelectiveScaler()), ('pca', SelectivePCA(weight=True)), ('rf', RandomForestClassifier(random_state=42)) ]) # define hyper parameters hp = { 'scaler__scaler': [StandardScaler(), RobustScaler(), MinMaxScaler()], 'pca__whiten': [True, False], 'pca__weight': [True, False], 'pca__n_components': uniform(0.75, 0.15), 'rf__n_estimators': randint(5, 10), 'rf__max_depth': randint(5, 15) } # define the grid grid = RandomizedSearchCV(pipe, hp, n_iter=2, scoring='accuracy', n_jobs=1, cv=custom_cv, random_state=42) # this will fail because we haven't fit yet assert_fails(grid.score, (ValueError, AttributeError), X_train, y_train) # fit the grid grid.fit(X_train, y_train) # score for coverage -- this might warn... with warnings.catch_warnings(): warnings.simplefilter("ignore") grid.score(X_train, y_train) # coverage: assert grid._estimator_type == 'classifier' # get predictions tr_pred, te_pred = grid.predict(X_train), grid.predict(X_test) # evaluate score (SHOULD be better than random...) accuracy_score(y_train, tr_pred), accuracy_score(y_test, te_pred) # grid score reports: # assert fails for bad percentile assert_fails(report_grid_score_detail, ValueError, **{'random_search': grid, 'percentile': 0.0}) assert_fails(report_grid_score_detail, ValueError, **{'random_search': grid, 'percentile': 1.0}) # assert fails for bad y_axis assert_fails(report_grid_score_detail, ValueError, **{'random_search': grid, 'y_axis': 'bad_axis'}) # assert passes otherwise report_grid_score_detail(grid, charts=True, percentile=0.95) # just ensure percentile works
def fit(x, y, estimator, dataframe, params): vectorizer = CountVectorizer(stop_words=['go', '', ' '], binary=False, lowercase=True) vectorizer.fit(dataframe[x].values) fresh_estimator = clone(estimator) x_np, y_np, feature_names, selector = \ select_features( df = dataframe, vectorizer=vectorizer, feature_col=x, label_col=y, select_method=None, continuous_col=None ) estimator = RandomizedSearchCV(estimator, params, n_iter=60, cv=3, n_jobs=3, refit=True) estimator.fit(x_np, y_np) best_params = estimator.best_params_ if method not in ['lr', 'svm']: print("Calibrating...") estimator = CalibratedClassifierCV(fresh_estimator.set_params(**best_params), 'isotonic', 3) estimator.fit(x_np, y_np) from sklearn.base import _pprint _pprint(estimator.get_params(deep=True), offset=2) return estimator, selector, vectorizer
def model_param_search(estimator, X, y, param_dist, scoring, n_iter=1, n_cv=5, verbose=10, random_state=1, model_id='model', save_search=True): start = time.time() random_search = RandomizedSearchCV(estimator, param_distributions=param_dist, n_iter=n_iter, scoring=scoring, cv=n_cv, verbose=verbose, random_state=random_state) random_search.fit(X, y) print('Best param: ', random_search.best_params_) print('Best score: ', random_search.best_score_) print('Best model: ', random_search.best_estimator_) if save_search: with open(model_id+'.pickle', 'wb') as f: pickle.dump(random_search, f) print('Time searching param for {}: {}'.format( model_id, (time.time() - start) / 60)) return random_search.best_estimator_
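# A hedged usage sketch for model_param_search above; the gradient-boosting
# estimator, the synthetic data and the distributions are illustrative
# assumptions, not part of the original code.
from scipy.stats import randint, uniform
from sklearn.datasets import make_classification
from sklearn.ensemble import GradientBoostingClassifier

X_demo, y_demo = make_classification(n_samples=300, n_features=10, random_state=1)
demo_dist = {'n_estimators': randint(50, 200),
             'learning_rate': uniform(0.01, 0.2),
             'max_depth': randint(2, 6)}
best_gbc = model_param_search(GradientBoostingClassifier(random_state=1),
                              X_demo, y_demo, demo_dist, scoring='roc_auc',
                              n_iter=5, model_id='gbc_demo', save_search=False)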
def Stacking(real_train_tar): predictions_train = pd.DataFrame([np.expm1(y_lasso_predict), np.expm1(y_ridge_predict), np.expm1(y_rf_predict), np.expm1(y_xgb_predict)]).T sns.pairplot(predictions_train) learning_rate = [round(float(x), 2) for x in np.linspace(start = .1, stop = .2, num = 11)] # Minimum for sum of weights for observations in a node min_child_weight = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12] # Maximum nodes in each tree max_depth = [int(x) for x in np.linspace(1, 10, num = 10)] n_estimators=[int(x) for x in np.linspace(start = 100, stop = 2000, num = 20)] subsample=[0.3, 0.4,0.5,0.6, 0.7] stack_model = xgb.XGBRegressor() random_grid = {'learning_rate': learning_rate, 'max_depth': max_depth, 'min_child_weight': min_child_weight, 'subsample': subsample, 'n_estimators':n_estimators } # Make a RandomizedSearchCV object with correct model and specified hyperparams xgb_stack = RandomizedSearchCV(estimator=stack_model, param_distributions=random_grid, n_iter=1000, cv=5, verbose=2, random_state=42, n_jobs=-1) start = time.time() # Fit models xgb_stack.fit(predictions_train, real_train_tar) xgb_stack.best_params_ write_pkl(xgb_stack.best_params_, '/Users/vickywinter/Documents/NYC/Machine Learning Proj/Pickle/stack_params.pkl') model_stacking = XGBRegressor(**xgb_stack.best_params_) #model_xgb = XGBRegressor(**best_params_) start=time.time() model_stacking.fit(predictions_train,real_train_tar) end=time.time() print("MSE for train data is: %f" % mean_squared_error(np.log1p(real_train_tar),np.log1p( model_stacking.predict(predictions_train)))) print('Time elapsed: %.4f seconds' % (end-start)) y_stack_predict=model_stacking.predict(predictions_train) x_line = np.arange(700000) y_line=x_line plt.scatter(real_train_tar,y_stack_predict) plt.plot(x_line, y_line, color='r') plt.xlabel('Actual Sale Price') plt.ylabel('Predict Sle Price')
def test_grid_search_with_multioutput_data(): # Test search with multi-output estimator X, y = make_multilabel_classification(return_indicator=True, random_state=0) est_parameters = {"max_depth": [1, 2, 3, 4]} cv = KFold(random_state=0) estimators = [DecisionTreeRegressor(random_state=0), DecisionTreeClassifier(random_state=0)] # Test with grid search cv for est in estimators: grid_search = GridSearchCV(est, est_parameters, cv=cv) grid_search.fit(X, y) res_params = grid_search.cv_results_['params'] for cand_i in range(len(res_params)): est.set_params(**res_params[cand_i]) for i, (train, test) in enumerate(cv.split(X, y)): est.fit(X[train], y[train]) correct_score = est.score(X[test], y[test]) assert_almost_equal( correct_score, grid_search.cv_results_['split%d_test_score' % i][cand_i]) # Test with a randomized search for est in estimators: random_search = RandomizedSearchCV(est, est_parameters, cv=cv, n_iter=3) random_search.fit(X, y) res_params = random_search.cv_results_['params'] for cand_i in range(len(res_params)): est.set_params(**res_params[cand_i]) for i, (train, test) in enumerate(cv.split(X, y)): est.fit(X[train], y[train]) correct_score = est.score(X[test], y[test]) assert_almost_equal( correct_score, random_search.cv_results_['split%d_test_score' % i][cand_i])
def train_classifier(self, trainvectors, labels, c='1.0', kernel='linear', gamma='0.1', degree='1', class_weight='balanced', jobs=1, iterations=10, scoring='f1_micro', v=2): if len(list(set(labels))) > 2: # more than two classes to distinguish parameters = ['estimator__C', 'estimator__kernel', 'estimator__gamma', 'estimator__degree'] multi = True else: # only two classes to distinguish parameters = ['C', 'kernel', 'gamma', 'degree'] multi = False if len(class_weight.split(':')) > 1: # dictionary class_weight = dict([label_weight.split(':') for label_weight in class_weight.split()]) c_values = [0.001, 0.005, 0.01, 0.5, 1, 5, 10, 50, 100, 500, 1000] if c == 'search' else [float(x) for x in c.split()] kernel_values = ['linear', 'rbf', 'poly'] if kernel == 'search' else [k for k in kernel.split()] gamma_values = [0.0005, 0.002, 0.008, 0.032, 0.128, 0.512, 1.024, 2.048] if gamma == 'search' else [float(x) for x in gamma.split()] degree_values = [1, 2, 3, 4] if degree == 'search' else [int(x) for x in degree.split()] grid_values = [c_values, kernel_values, gamma_values, degree_values] if not False in [len(x) == 1 for x in grid_values]: # only sinle parameter settings settings = {} for i, parameter in enumerate(parameters): settings[parameter] = grid_values[i][0] else: param_grid = {} for i, parameter in enumerate(parameters): param_grid[parameter] = grid_values[i] model = svm.SVC(probability=True) if multi: model = OutputCodeClassifier(model) trainvectors = trainvectors.todense() paramsearch = RandomizedSearchCV(model, param_grid, cv = 5, scoring=scoring, verbose = v, n_iter = iterations, n_jobs = jobs, pre_dispatch = 4) paramsearch.fit(trainvectors, labels) settings = paramsearch.best_params_ # train an SVC classifier with the settings that led to the best performance self.model = svm.SVC( probability = True, C = settings[parameters[0]], kernel = settings[parameters[1]], gamma = settings[parameters[2]], degree = settings[parameters[3]], class_weight = class_weight, cache_size = 1000, verbose = v ) self.model.fit(trainvectors, labels)
def build_lasso(x_train, y_train, x_test, y_test, n_features): """ Constructing a Lasso linear model with cross validation from input dataframe :param x_train: features dataframe for model training :param y_train: target dataframe for model training :param x_test: features dataframe for model testing :param y_test: target dataframe for model testing :return: None """ model = Lasso(random_state=1) # Random state has int value for non-random sampling param_dist = {'alpha': np.arange( 0.0001, 1, 0.001 ).tolist()} clf = RandomizedSearchCV(estimator=model, param_distributions=param_dist, n_iter=15, n_jobs=-1) clf.fit(x_train, y_train) y_pred = clf.predict(x_test) print(clf.best_params_, clf.best_score_) # Mean absolute error regression loss mean_abs = sklearn.metrics.mean_absolute_error(y_test, y_pred) # Mean squared error regression loss mean_sq = sklearn.metrics.mean_squared_error(y_test, y_pred) # Median absolute error regression loss median_abs = sklearn.metrics.median_absolute_error(y_test, y_pred) # R^2 (coefficient of determination) regression score function r2 = sklearn.metrics.r2_score(y_test, y_pred) # Explained variance regression score function exp_var_score = sklearn.metrics.explained_variance_score(y_test, y_pred) with open('../trained_networks/lasso_%d_data.pkl' % n_features, 'wb') as results: pickle.dump(clf, results, pickle.HIGHEST_PROTOCOL) pickle.dump(mean_abs, results, pickle.HIGHEST_PROTOCOL) pickle.dump(mean_sq, results, pickle.HIGHEST_PROTOCOL) pickle.dump(median_abs, results, pickle.HIGHEST_PROTOCOL) pickle.dump(r2, results, pickle.HIGHEST_PROTOCOL) pickle.dump(exp_var_score, results, pickle.HIGHEST_PROTOCOL) pickle.dump(y_pred, results, pickle.HIGHEST_PROTOCOL) return
def test_randomized_search_grid_scores(): # Make a dataset with a lot of noise to get various kind of prediction # errors across CV folds and parameter settings X, y = make_classification(n_samples=200, n_features=100, n_informative=3, random_state=0) # XXX: as of today (scipy 0.12) it's not possible to set the random seed # of scipy.stats distributions: the assertions in this test should thus # not depend on the randomization params = dict(C=expon(scale=10), gamma=expon(scale=0.1)) n_cv_iter = 3 n_search_iter = 30 search = RandomizedSearchCV(SVC(), n_iter=n_search_iter, cv=n_cv_iter, param_distributions=params, iid=False) search.fit(X, y) assert_equal(len(search.grid_scores_), n_search_iter) # Check consistency of the structure of each cv_score item for cv_score in search.grid_scores_: assert_equal(len(cv_score.cv_validation_scores), n_cv_iter) # Because we set iid to False, the mean_validation score is the # mean of the fold mean scores instead of the aggregate sample-wise # mean score assert_almost_equal(np.mean(cv_score.cv_validation_scores), cv_score.mean_validation_score) assert_equal(list(sorted(cv_score.parameters.keys())), list(sorted(params.keys()))) # Check the consistency with the best_score_ and best_params_ attributes sorted_grid_scores = list(sorted(search.grid_scores_, key=lambda x: x.mean_validation_score)) best_score = sorted_grid_scores[-1].mean_validation_score assert_equal(search.best_score_, best_score) tied_best_params = [s.parameters for s in sorted_grid_scores if s.mean_validation_score == best_score] assert_true(search.best_params_ in tied_best_params, "best_params_={0} is not part of the" " tied best models: {1}".format( search.best_params_, tied_best_params))
def build_tree(x_train, y_train, x_test, y_test, n_features): """ Constructing a decision trees regression model from input dataframe :param x_train: features dataframe for model training :param y_train: target dataframe for model training :param x_test: features dataframe for model testing :param y_test: target dataframe for model testing :return: None """ model = DecisionTreeRegressor() param_dist = {'max_depth': sp_randint(1, 15), 'min_samples_split': sp_randint(2, 15)} clf = RandomizedSearchCV(estimator=model, param_distributions=param_dist, n_iter=15, n_jobs=-1) clf.fit(x_train, y_train) y_pred = clf.predict(x_test) print(clf.best_params_, clf.best_score_) # Mean absolute error regression loss mean_abs = sklearn.metrics.mean_absolute_error(y_test, y_pred) # Mean squared error regression loss mean_sq = sklearn.metrics.mean_squared_error(y_test, y_pred) # Median absolute error regression loss median_abs = sklearn.metrics.median_absolute_error(y_test, y_pred) # R^2 (coefficient of determination) regression score function r2 = sklearn.metrics.r2_score(y_test, y_pred) # Explained variance regression score function exp_var_score = sklearn.metrics.explained_variance_score(y_test, y_pred) with open('../trained_networks/dt_%d_data.pkl' % n_features, 'wb') as results: pickle.dump(clf, results, pickle.HIGHEST_PROTOCOL) pickle.dump(mean_abs, results, pickle.HIGHEST_PROTOCOL) pickle.dump(mean_sq, results, pickle.HIGHEST_PROTOCOL) pickle.dump(median_abs, results, pickle.HIGHEST_PROTOCOL) pickle.dump(r2, results, pickle.HIGHEST_PROTOCOL) pickle.dump(exp_var_score, results, pickle.HIGHEST_PROTOCOL) pickle.dump(y_pred, results, pickle.HIGHEST_PROTOCOL) return
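# The regression snippets above pickle several objects into one file in a fixed
# order; this is a minimal sketch of reading such a file back, assuming the same
# dump order used in build_tree (search object first, then metrics, then
# predictions). The path and the n_features value are placeholders.
import pickle

with open('../trained_networks/dt_%d_data.pkl' % n_features, 'rb') as results:
    clf = pickle.load(results)            # fitted RandomizedSearchCV
    mean_abs = pickle.load(results)       # mean absolute error
    mean_sq = pickle.load(results)        # mean squared error
    median_abs = pickle.load(results)     # median absolute error
    r2 = pickle.load(results)             # R^2 score
    exp_var_score = pickle.load(results)  # explained variance
    y_pred = pickle.load(results)         # test-set predictions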
def random_grid_search_tuning(self,cat_param, cat_param_distribution, f_score, n_jobs, n_iter): cat_estimator = cat.CatBoostClassifier(**cat_param) cat_rgs = RandomizedSearchCV( estimator=cat_estimator, param_distributions=cat_param_distribution, cv=self.skf, scoring=make_scorer(f_score, greater_is_better=True, needs_proba=True), n_iter=n_iter, n_jobs=n_jobs, verbose=2, refit=False, ) time_begin = time.time() cat_rgs.fit(self.X, self.y) time_end = time.time() logging.info('Random grid search eat time {0}'.format(time_end - time_begin)) logging.info('best_score_ : {0}'.format(cat_rgs.best_score_)) logging.info('best_params_ : {0}'.format(cat_rgs.best_params_)) for score in cat_rgs.grid_scores_: logging.info('grid_scores_ : {0}'.format(score)) gc.collect() return cat_rgs.best_params_
def evaluate_model(self, pipelines): n,m = pipelines parameters = self.get_params(n, self.optimizer) if self.optimizer == 'GridSearchCV': print("Performing GridSearchCV...", n) grid_search_t = GridSearchCV(m, parameters, verbose=1) grid_search_t.fit(self.evaluator.X_train, self.evaluator.y_train) return [grid_search_t.best_score_,grid_search_t.best_params_] elif self.optimizer == 'RandomizedSearchCV': print("Performing RandomizedSearchCV...", n) random_search_t = RandomizedSearchCV(m, parameters, verbose=1) random_search_t.fit(self.evaluator.X_train, self.evaluator.y_train) return [random_search_t.best_score_,random_search_t.best_params_] elif self.optimizer == 'GeneticSearchCV': print("Performing GeneticSearchCV...", n) genetic_search_t = GeneticSearchCV(m, parameters, scoring=None, cv=KFold(n_splits=5), n_jobs=1, verbose=1, refit=False, population_size=50, gene_mutation_prob=0.10, gene_crossover_prob=0.5, tournament_size=3, generations_number=10) genetic_search_t.fit(self.evaluator.X_train, self.evaluator.y_train) return [genetic_search_t.best_score_,genetic_search_t.best_params_] elif self.optimizer == 'EdasSearch': print("Performing EdasSearch...", n) eda_search_t = EdasSearch(getModelAccuracy, parameters, m,iterations=2, sample_size=15, select_ratio=0.3, debug=False, n_jobs=1) eda_search_t.fit() return [eda_search_t.best_score_,eda_search_t.best_params_]
def train_classifier(self, trainvectors, labels, n_neighbors='3', weights='uniform', algorithm='auto', leaf_size='30', metric='euclidean', p=2, scoring='roc_auc', jobs=1, v=2):
    if len(list(set(labels))) > 2:  # more than two classes to distinguish
        parameters = ['estimator__n_neighbors', 'estimator__weights', 'estimator__leaf_size', 'estimator__metric']
        multi = True
    else:  # only two classes to distinguish
        parameters = ['n_neighbors', 'weights', 'leaf_size', 'metric']
        multi = False
    n_neighbours = [3, 5, 7, 9] if n_neighbors == 'search' else [int(x) for x in n_neighbors.split()]
    weights = ['uniform', 'distance'] if weights == 'search' else weights.split()
    leaf_size = [10, 20, 30, 40, 50] if leaf_size == 'search' else [int(x) for x in leaf_size.split()]
    metric = ['minkowski', 'euclidean', 'manhattan', 'hamming'] if metric == 'search' else metric.split()
    grid_values = [n_neighbours, weights, leaf_size, metric]
    if not False in [len(x) == 1 for x in grid_values]:  # only single parameter settings
        settings = {}
        for i, parameter in enumerate(parameters):
            settings[parameter] = grid_values[i][0]
    else:
        param_grid = {}
        for i, parameter in enumerate(parameters):
            param_grid[parameter] = grid_values[i]
        model = KNeighborsClassifier(algorithm=algorithm, p=p)
        if multi:
            model = OutputCodeClassifier(model)
            trainvectors = trainvectors.todense()
        paramsearch = RandomizedSearchCV(model, param_grid, verbose=v, scoring=scoring, cv=5, n_jobs=jobs)
        paramsearch.fit(trainvectors, labels)
        settings = paramsearch.best_params_
    self.model = KNeighborsClassifier(
        algorithm=algorithm,
        p=p,
        n_neighbors=settings[parameters[0]],
        weights=settings[parameters[1]],
        leaf_size=settings[parameters[2]],
        metric=settings[parameters[3]]
    )
    self.model.fit(trainvectors, labels)
def model(clf_name, features, labels):
    start_time = time.time()
    # specify parameters and distributions to sample from
    clf = make_pipeline(StandardScaler(), PCA(), classifiers[clf_name])  # PCA optional: n_components=2
    '''clf = Pipeline([
        ('reduce_dim', PCA()),
        ('classify', classifiers[clf_name])
    ])'''
    # select correct param set, adjust the pca to current window
    param_dist = param_dict[clf_name]
    param_dist["pca__n_components"] = sp_randint(2, features.shape[1] - 1)
    # if 'randomforestclassifier__max_features' in param_dist:
    #     param_dist['randomforestclassifier__max_features'] = sp_randint(2, features.shape[1])

    # run randomized search
    n_iter_search = 50
    random_search = RandomizedSearchCV(clf, param_distributions=param_dist,
                                       n_iter=n_iter_search,
                                       scoring="f1_weighted")  # , n_jobs=8, pre_dispatch=8
    # start = time.time()
    random_search.fit(features, labels)
    # print("RandomizedSearchCV took %.2f seconds for %d candidates"
    #       " parameter settings." % ((time.time() - start), n_iter_search))
    # report(random_search.cv_results_)
    elapsed_time = time.time() - start_time
    print('Optimizing %s on window %d took %d sec' % (clf_name, features.shape[1] / 6, elapsed_time))
    return random_search
def test_random_search_cv_results(): # Make a dataset with a lot of noise to get various kind of prediction # errors across CV folds and parameter settings X, y = make_classification(n_samples=200, n_features=100, n_informative=3, random_state=0) # scipy.stats dists now supports `seed` but we still support scipy 0.12 # which doesn't support the seed. Hence the assertions in the test for # random_search alone should not depend on randomization. n_splits = 3 n_search_iter = 30 params = dict(C=expon(scale=10), gamma=expon(scale=0.1)) random_search = RandomizedSearchCV(SVC(), n_iter=n_search_iter, cv=n_splits, iid=False, param_distributions=params) random_search.fit(X, y) random_search_iid = RandomizedSearchCV(SVC(), n_iter=n_search_iter, cv=n_splits, iid=True, param_distributions=params) random_search_iid.fit(X, y) param_keys = ('param_C', 'param_gamma') score_keys = ('mean_test_score', 'mean_train_score', 'rank_test_score', 'split0_test_score', 'split1_test_score', 'split2_test_score', 'split0_train_score', 'split1_train_score', 'split2_train_score', 'std_test_score', 'std_train_score', 'mean_fit_time', 'std_fit_time', 'mean_score_time', 'std_score_time') n_cand = n_search_iter for search, iid in zip((random_search, random_search_iid), (False, True)): assert_equal(iid, search.iid) cv_results = search.cv_results_ # Check results structure check_cv_results_array_types(cv_results, param_keys, score_keys) check_cv_results_keys(cv_results, param_keys, score_keys, n_cand) # For random_search, all the param array vals should be unmasked assert_false(any(cv_results['param_C'].mask) or any(cv_results['param_gamma'].mask)) check_cv_results_grid_scores_consistency(search)
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)] max_features = ['auto', 'sqrt'] max_depth = [int(x) for x in np.linspace(10, 110, num = 11)] max_depth.append(None) min_samples_split = [2, 5, 10] min_samples_leaf = [1, 2, 4] bootstrap = [True, False] params = {'n_estimators': n_estimators, 'max_features': max_features, 'max_depth': max_depth, 'min_samples_split': min_samples_split, 'min_samples_leaf': min_samples_leaf, 'bootstrap': bootstrap} cv_result = RandomizedSearchCV(RandomForestClassifier(),params,cv=3,scoring='accuracy',random_state = 5) cv_result.fit(X_resampled, y_resampled) cv_result.best_params_ # In[114]: classifier_op = RandomForestClassifier(n_estimators = 2000, min_samples_split=2,min_samples_leaf=1,max_features='auto', max_depth=70, random_state = 42,bootstrap=False) classifier_op.fit(X_resampled, y_resampled) # In[115]:
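# A hedged alternative to retyping the tuned values above: the fitted search
# already exposes them, so the refit classifier can be built directly from
# cv_result.best_params_ (a sketch, assuming the same X_resampled / y_resampled).
classifier_op = RandomForestClassifier(random_state=42, **cv_result.best_params_)
classifier_op.fit(X_resampled, y_resampled)
# or simply reuse the estimator that RandomizedSearchCV already refit:
# classifier_op = cv_result.best_estimator_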
    return model

def create_hyperparameters():
    batches = [10, 20]
    optimizers = ['rmsprop', 'adam', 'adadelta']
    dropout = [0.1]
    return {'batch_size': batches, 'optimizer': optimizers, 'drop': dropout}

hyperparameters = create_hyperparameters()

# Wrap the Keras model we built so it can be plugged into scikit-learn!
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier
model = KerasClassifier(build_fn=build_model, verbose=0)

# Use scikit-learn's GridSearchCV / RandomizedSearchCV with the wrapped model!
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
# search = GridSearchCV(model, hyperparameters, cv=3)
search = RandomizedSearchCV(model, hyperparameters, cv=3)
search.fit(x_train, y_train)
acc = search.score(x_test, y_test)
print('Best parameters:', search.best_params_)
print('Final score:', acc)
print('hyper_cnn')
'''
Best parameters: {'optimizer': 'adam', 'drop': 0.1, 'batch_size': 20}
Final score: 0.9825000166893005
'''
    batches = [50, 100, 120]
    optimizers = ['rmsprop', 'adam', 'adadelta']  # Pick whichever suits the task.
    dropout = np.linspace(0.1, 0.5, 5)
    epochs = [1, 2]
    return {"kerasclassifier__batch_size": batches, "kerasclassifier__optimizer": optimizers,
            "kerasclassifier__epochs": epochs}  # , "keep_prob": dropout}
    # With make_pipeline below, prefix each parameter with kerasclassifier__;
    # with a plain Pipeline you would use the step name (e.g. svc__) instead!

from keras.wrappers.scikit_learn import KerasClassifier  # Makes the Keras model compatible with scikit-learn (likely used for MNIST).
# from keras.wrappers.scikit_learn import KerasRegressor
# Wrapped so that scikit-learn's cross-validation can be used with Keras.
model = KerasClassifier(build_fn=build_network, verbose=1)  # verbose=0; pulls in the functional model built above.

from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import make_pipeline
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC

hyperparameters = create_hyperparameters()
pipe = make_pipeline(MinMaxScaler(), model)

from sklearn.model_selection import RandomizedSearchCV
search = RandomizedSearchCV(estimator=pipe, param_distributions=hyperparameters,
                            n_iter=10, n_jobs=1, cv=3, verbose=1)
# 10 parameter settings are sampled and 3-fold cross-validation is used (the data is split into 3 folds).
# Look up n_jobs on your own.
# Where KFold runs a fixed set of splits, this samples randomly; the goal is to find the
# hyperparameters above that give the best result.
# search.fit(data["x_train"], data["y_train"])
search.fit(x_train, y_train)  # Feed in the data!
print(search.best_score_)
print(search.best_params_)
                  results['std_test_score'][candidate]))
            print("Parameters: {0}".format(results['params'][candidate]))
            print("")

# specify parameters and distributions to sample from
param_dist = {"max_depth": [3, None],
              "max_features": sp_randint(1, 6),
              "min_samples_split": sp_randint(2, 11),
              "min_samples_leaf": sp_randint(2, 11),
              "bootstrap": [True, False],
              "criterion": ["gini", "entropy"]}

# run randomized search
n_iter_search = 20
random_search = RandomizedSearchCV(clf, param_distributions=param_dist,
                                   n_iter=n_iter_search)

start = time()
random_search.fit(info_train, OL_train)
print("RandomizedSearchCV took %.2f seconds for %d candidates"
      " parameter settings." % ((time() - start), n_iter_search))
report(random_search.cv_results_, n_top=10)

# stop

forest = RandomForestClassifier(n_estimators=100, **random_search.best_params_)
scores = cross_val_score(forest, class_info, stars)
print(scores.mean())

forest.fit(info_train, OL_train)
# joblib.dump(forest, 'trained_forest_classifier.pkl')
X = train_df.drop("Survived", axis=1).copy() y = train_df["Survived"] # In[ ]: param_grid = { 'max_depth': st.randint(6, 11), 'n_estimators': st.randint(300, 500), 'max_features': np.arange(0.5, .81, 0.05), 'max_leaf_nodes': st.randint(6, 10) } grid = RandomizedSearchCV(rfc, param_grid, cv=10, scoring='accuracy', verbose=1, n_iter=20) grid.fit(X, y) # In[ ]: grid.best_estimator_ # In[ ]: grid.best_score_ # Ok so now let's generate our predictions based on the best estimator model.
# specify parameters for hyperparameter search # specify parameters and distributions to sample from param_dist = { "max_depth": [6, None], "max_features": sp_randint(1, max_features), "min_samples_split": sp_randint(2, 30), "min_samples_leaf": sp_randint(1, 30), "bootstrap": [True, False], "criterion": ["gini", "entropy"] } # run randomized search n_iter_search = _RF_iterations random_search = RandomizedSearchCV( clf, param_distributions=param_dist, n_iter=n_iter_search, verbose=_VERBOSITY ) # http://scikit-learn.org/stable/modules/generated/sklearn.model_selection.RandomizedSearchCV.html # let's start training the model. print("Beginning hyper-parameter search") start = time() random_search.fit(X_train, y_train) print("RandomizedSearchCV took %.2f seconds for %d candidates" " parameter settings." % ((time() - start), n_iter_search)) print(random_search.best_params_) # now let's train a model on the entire training set # with those parameters
results['std_test_score'][candidate])) print("Parameters: {0}".format(results['params'][candidate])) print("") # specify parameters and distributions to sample from param_dist = {"max_depth": [3, None], "max_features": sp_randint(1, 11), "min_samples_split": sp_randint(2, 11), "min_samples_leaf": sp_randint(1, 11), "bootstrap": [True, False], "criterion": ["gini", "entropy"]} # run randomized search n_iter_search = 20 random_search = RandomizedSearchCV(clf, param_distributions=param_dist, n_iter=n_iter_search) start = time() random_search.fit(X, y) print("RandomizedSearchCV took %.2f seconds for %d candidates" " parameter settings." % ((time() - start), n_iter_search)) report(random_search.cv_results_) # use a full grid over all parameters param_grid = {"max_depth": [3, None], "max_features": [1, 3, 10], "min_samples_split": [2, 3, 10], "min_samples_leaf": [1, 3, 10], "bootstrap": [True, False], "criterion": ["gini", "entropy"]}
from scipy.stats import uniform as sp_uniform # Create parameters grid for RBF kernel, we have to set C and gamma C_dist = sp_uniform(scale=10) gamma_dist = sp_uniform(scale=1) parameters = {'kernel':['rbf'], 'C':C_dist, 'gamma': gamma_dist } from sklearn.model_selection import RandomizedSearchCV n_iter_search = 8 svm_clsf = svm.SVC() rnd_clsf = RandomizedSearchCV(estimator=svm_clsf, param_distributions=parameters, n_iter=n_iter_search, cv=3, n_jobs=1, verbose=2) # Warning! It takes really long time to compute this about 2 days start_time = dt.datetime.now() print('Start param searching at {}'.format(str(start_time))) rnd_clsf.fit(X_train, y_train) elapsed_time= dt.datetime.now() - start_time print('Elapsed time, param searching {}'.format(str(elapsed_time))) sorted(rnd_clsf.cv_results_.keys()) classifier = rnd_clsf.best_estimator_ params = rnd_clsf.best_params_
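# A short, hedged follow-up: once the best SVC has been selected above, it is
# typically evaluated on the held-out test split; X_test / y_test are assumed to
# exist alongside the X_train / y_train used for the search.
from sklearn.metrics import accuracy_score, classification_report

y_pred = classifier.predict(X_test)
print('Test accuracy: {:.4f}'.format(accuracy_score(y_test, y_pred)))
print(classification_report(y_test, y_pred))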
'bootstrap': bootstrap, 'max_samples': max_samples } pprint(random_grid) #%% # Use the random grid to search for best hyperparameters # First create the base model to tune rf = RandomForestRegressor() # Random search of parameters, using 3 fold cross validation, # search across 10000 different combinations, and use all available cores rf_random = RandomizedSearchCV(estimator=rf, param_distributions=random_grid, scoring='neg_mean_absolute_error', n_iter=10000, cv=5, verbose=2, random_state=42, n_jobs=-1) # Fit the random search model rf_random.fit(train_features, train_labels) #%% # Create dataframe with metrics of every single combination tested random_results = pd.DataFrame(rf_random.cv_results_) random_results = random_results.sort_values('rank_test_score') random_results.to_json( r'/Users/matthew/Desktop/data/RF_randomized_search_CV_mae.json') # set RF with best parameters from random sampling print(rf_random.best_params_)
def run_test(filename, results_dir, models, random_state, external_split, internal_split, optimization_iterations): global df_results print(filename) data_dict['Dataset Name'] = filename.replace('.csv', '') df = pd.read_csv(directory + '/' + filename) X, Y = fix_dataset(df) kf = StratifiedKFold(n_splits=external_split, random_state=random_state, shuffle=True) for fold_index, (train_index, test_index) in enumerate(kf.split(X, Y)): data_dict['Cross Validation[1-10]'] = fold_index print("fold index =", fold_index) x_train = X.iloc[train_index] y_train = Y.iloc[train_index] x_test = X.iloc[test_index] y_test = Y.iloc[test_index] for model_name, model_class, model, model_dict in models: print('Model:', model_name) data_dict['Algorithm Name'] = model_name # distributions = dict(C=uniform(loc=0, scale=4), penalty=['l2', 'l1']) distributions = model_dict start_training_time = time.time() randomSearcher = RandomizedSearchCV( model, distributions, random_state=random_state, cv=internal_split, n_iter=optimization_iterations, scoring=make_scorer(accuracy_score)) randomSearcher.fit(x_train, y_train.values.ravel()) if model_class is wprb: params = { k.replace("estimator__", ""): v for k, v in randomSearcher.best_params_.items() } best_model = OneVsRestClassifier(model_class(**params)) else: params = randomSearcher.best_params_ best_model = model_class(**params) data_dict['Hyper-Parameters Values'] = params best_model.fit(x_train, y_train.values.ravel()) data_dict['Training Time'] = time.time() - start_training_time print("best params:", params) print( "train accuracy:", round(accuracy_score(y_train, best_model.predict(x_train)), 4)) start_inference_time = time.time() test_pred = best_model.predict(x_test) test_pred_proba = best_model.predict_proba(x_test) data_dict['Inference Time'] = ( time.time() - start_inference_time) / (len(x_test)) * 1000 print("test accuracy:", round(accuracy_score(y_test, test_pred), 4)) print() data_dict['Accuracy'] = accuracy_score(y_test, test_pred) data_dict['Precision'] = precision_score( y_test, test_pred, average='macro', labels=np.unique(test_pred)) unique_labels = np.unique(Y.values) if len(unique_labels) == 2: # multiclass vs binary classification data_dict['AUC'] = roc_auc_score(y_true=y_test, y_score=test_pred_proba[:, 1]) else: # plaster = test_pred_proba[:, [np.where(np.unique(Y.values) == x)[0][0] for x in np.unique(y_test)]] # plaster2 = np.array([[x / sum(y) for x in y] for y in plaster]) data_dict['AUC'] = roc_auc_score(y_true=y_test, y_score=test_pred_proba, multi_class='ovr', labels=np.unique(y_test)) all_TPR = [] all_FPR = [] all_PR_CURVE = [] for index, class_label in enumerate(np.unique(y_test)): tn, fp, fn, tp = confusion_matrix( y_test == class_label, test_pred == class_label).ravel() all_FPR.append(fp / (fp + tn)) all_TPR.append(tp / (tp + fn)) precision, recall, _ = precision_recall_curve( y_test == class_label, test_pred_proba[:, index]) all_PR_CURVE.append(auc(recall, precision)) data_dict['FPR'] = np.mean(all_FPR) data_dict['TPR'] = np.mean(all_TPR) data_dict['PR Curve'] = np.mean(all_PR_CURVE) df_results = df_results.append(data_dict, ignore_index=True) df_results.to_csv(results_dir + '/' + filename, index=False) df_results = df_results.iloc[0:0]
def Model(train_linear, test_linear): train_linear_fea=train_linear.drop(columns=['SalePrice']) train_linear_tar=train_linear.SalePrice x_train, x_test, y_train, y_test = train_test_split(train_linear_fea, train_linear_tar,test_size=0.2, random_state=0) def evaluate(model, test_features, test_labels,train_features, train_labels): predictions = model.predict(test_features) errors = abs(predictions - test_labels) mape = 100 * np.mean(errors / test_labels) accuracy = 100 - mape print('Model Performance') print('Average Error: {:0.4f} degrees.'.format(np.mean(errors))) print('Accuracy = {:0.2f}%.'.format(accuracy)) print("MSE for train data is: %f" % mean_squared_error(y_train, model.predict(x_train))) print("MSE for validation data is: %f" % mean_squared_error(y_test, model.predict(x_test))) return accuracy real_train_tar=np.expm1(train_linear_tar) """ . Lasso model """ lassocv = LassoCV(alphas = np.logspace(-5, 4, 400), ) lassocv.fit(train_linear_fea, train_linear_tar) lassocv_score = lassocv.score(train_linear_fea, train_linear_tar) lassocv_alpha = lassocv.alpha_ print("Best alpha : ", lassocv_alpha, "Score: ",lassocv_score) start=time.time() lasso =Lasso(normalize = True) lasso.set_params(alpha=lassocv_alpha,max_iter = 10000) lasso.fit(x_train, y_train) end=time.time() mean_squared_error(y_test, lasso.predict(x_test)) coef_lasso=pd.Series(lassocv.coef_, index=x_train.columns).sort_values(ascending =False) evaluate(lasso,x_test,y_test,x_train,y_train) print('Time elapsed: %.4f seconds' % (end-start)) y_lasso_predict=lasso.predict(train_linear_fea) x_line = np.arange(700000) y_line=x_line plt.scatter(real_train_tar,np.expm1(y_lasso_predict)) plt.plot(x_line, y_line, color='r') plt.xlabel('Actual Sale Price') plt.ylabel('Predict Sle Price') test_prediction_lasso=np.expm1(lasso.predict(test_linear)) """ . Ridge model """ ridgecv = RidgeCV(alphas = np.logspace(-5, 4, 400)) ridgecv.fit(x_train, y_train) ridgecv_score = ridgecv.score(x_train, y_train) ridgecv_alpha = ridgecv.alpha_ print("Best alpha : ", ridgecv_alpha, "Score: ",ridgecv_score) coef=pd.Series(ridgecv.coef_, index=x_train.columns).sort_values(ascending =False) start=time.time() ridge =Ridge(normalize = True) ridge.set_params(alpha=ridgecv_alpha,max_iter = 10000) ridge.fit(x_train, y_train) end=time.time() mean_squared_error(y_test, ridge.predict(x_test)) coef_ridge=pd.Series(ridgecv.coef_, index=x_train.columns).sort_values(ascending =False) evaluate(ridge,x_test,y_test,x_train,y_train) print('Time elapsed: %.4f seconds' % (end-start)) y_ridge_predict=ridge.predict(train_linear_fea) x_line = np.arange(700000) y_line=x_line plt.scatter(real_train_tar,np.expm1(y_ridge_predict)) plt.plot(x_line, y_line, color='r') plt.xlabel('Actual Sale Price') plt.ylabel('Predict Sle Price') test_prediction_ridge=np.expm1(ridge.predict(test_linear)) """ . 
Random Forest """ #train=train.drop(columns=['DateSold']) #test=test.drop(columns=['DateSold']) #X_train=train.drop(columns=['SalePrice']) #Y_train=train['SalePrice'] X_train=train_linear_fea Y_train=train_linear_tar x_train_rf, x_test_rf, y_train_rf, y_test_rf = train_test_split(X_train, Y_train,test_size=0.2, random_state=0) n_estimators = [int(x) for x in np.linspace(start = 100, stop = 2000, num = 20)] max_features = ['auto', 'sqrt'] max_depth = [int(x) for x in np.linspace(10, 110, num = 11)] min_samples_split = [2, 5, 10] min_samples_leaf = [1, 2, 4] bootstrap = [True, False] random_grid = {'n_estimators': n_estimators, 'max_features': max_features, 'max_depth': max_depth, 'min_samples_split': min_samples_split, 'min_samples_leaf': min_samples_leaf, 'bootstrap': bootstrap} rf = RandomForestRegressor() # Random search of parameters, using 3 fold cross validation, # search across 100 different combinations, and use all available cores # rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1) rf_random.fit(X_train, Y_train) #rf_random.fit(x_train_rf, y_train_rf) rf_random.best_params_ #Random search allowed us to narrow down the range for each hyperparameter. Now that we know where to concentrate our search, # we can explicitly specify every combination of settings to try. param_grid = { 'bootstrap': [False], 'max_depth': [80, 90, 100, 110,120,130], 'max_features': [2, 3], 'min_samples_leaf': [1,2,3, 4], 'min_samples_split': [2,4,6,8, 10, 12], 'n_estimators': [600,700, 800, 900, 1000] } # Create a based model rf = RandomForestRegressor() # Instantiate the grid search model grid_search = GridSearchCV(estimator = rf, param_grid = param_grid, cv = 3, n_jobs = -1, verbose = 2) #grid_search.fit(x_train, y_train) grid_search.fit(X_train, Y_train) grid_search.best_params_ best_random = grid_search.best_estimator_ start=time.time() best_random.fit(x_train_rf,y_train_rf) end=time.time() evaluate(best_random, x_test_rf, y_test_rf,x_train_rf,y_train_rf) print('Time elapsed: %.4f seconds' % (end-start)) y_rf_predict=best_random.predict(train_linear_fea) x_line = np.arange(700000) y_line=x_line plt.scatter(real_train_tar,np.expm1(y_rf_predict)) plt.plot(x_line, y_line, color='r') plt.xlabel('Actual Sale Price') plt.ylabel('Predict Sle Price') importance_rf = pd.DataFrame({'features':train_linear_fea.columns, 'imp':best_random.feature_importances_}).\ sort_values('imp',ascending=False) importance_top20_rf = importance_rf.iloc[:20,] plt.barh(importance_top20_rf.features, importance_top20_rf.imp) plt.xlabel('Feature Importance') test_prediction_rf=np.expm1(best_random.predict(test_linear)) """ . 
Xgboost """ learning_rate = [round(float(x), 2) for x in np.linspace(start = .1, stop = .2, num = 11)] # Minimum for sum of weights for observations in a node min_child_weight = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12] # Maximum nodes in each tree max_depth = [int(x) for x in np.linspace(1, 10, num = 10)] n_estimators=[int(x) for x in np.linspace(start = 100, stop = 2000, num = 20)] subsample=[0.3, 0.4,0.5,0.6, 0.7] model = xgb.XGBRegressor() random_grid = {'learning_rate': learning_rate, 'max_depth': max_depth, 'min_child_weight': min_child_weight, 'subsample': subsample, 'n_estimators':n_estimators } # Make a RandomizedSearchCV object with correct model and specified hyperparams xgb_random = RandomizedSearchCV(estimator=model, param_distributions=random_grid, n_iter=1000, cv=5, verbose=2, random_state=42, n_jobs=-1) start = time.time() # Fit models xgb_random.fit(X_train, Y_train) xgb_random.best_params_ """ best_params_={'learning_rate': 0.1, 'max_depth': 2, 'min_child_weight': 4, 'n_estimators': 900, 'subsample': 0.5} """ model_xgb = XGBRegressor(**xgb_random.best_params_) #model_xgb = XGBRegressor(**best_params_) start=time.time() model_xgb.fit(x_train_rf,y_train_rf) end=time.time() evaluate(model_xgb, x_test_rf, y_test_rf,x_train_rf,y_train_rf) print('Time elapsed: %.4f seconds' % (end-start)) y_xgb_predict=model_xgb.predict(train_linear_fea) x_line = np.arange(700000) y_line=x_line plt.scatter(real_train_tar,np.expm1(y_xgb_predict)) plt.plot(x_line, y_line, color='r') plt.xlabel('Actual Sale Price') plt.ylabel('Predict Sle Price') importance_xgb = pd.DataFrame({'features':train_linear_fea.columns, 'imp':model_xgb.feature_importances_}).\ sort_values('imp',ascending=False) importance_top20_xgb = importance_xgb.iloc[:20,] plt.barh(importance_top20_xgb.features, importance_top20_xgb.imp) plt.xlabel('Feature Importance') test_prediction_xgb=np.expm1(model_xgb.predict(test_linear)) return(test_prediction_lasso, test_prediction_ridge, test_prediction_rf, test_prediction_xgb,y_lasso_predict, y_ridge_predict, y_rf_predict, y_xgb_predict)
def _compute_thresh(this_data, method='bayesian_optimization', cv=10, y=None, random_state=None): """Compute the rejection threshold for one channel. Parameters ---------- this_data: array (n_epochs, n_times) Data for one channel. method : str 'bayesian_optimization' or 'random_search' cv : iterator Iterator for cross-validation. random_state : int seed, RandomState instance, or None (default) The seed of the pseudo random number generator to use. Returns ------- best_thresh : float The best threshold. Notes ----- For method='random_search', the random_state parameter gives deterministic results only for scipy versions >= 0.16. This is why we recommend using autoreject with scipy version 0.16 or greater. """ est = _ChannelAutoReject() all_threshes = np.sort(np.ptp(this_data, axis=1)) if method == 'random_search': param_dist = dict(thresh=uniform(all_threshes[0], all_threshes[-1])) rs = RandomizedSearchCV(est, param_distributions=param_dist, n_iter=20, cv=cv, random_state=random_state) rs.fit(this_data, y) best_thresh = rs.best_estimator_.thresh elif method == 'bayesian_optimization': cache = dict() def func(thresh): idx = np.where(thresh - all_threshes >= 0)[0][-1] thresh = all_threshes[idx] if thresh not in cache: est.set_params(thresh=thresh) obj = -np.mean(cross_val_score(est, this_data, y=y, cv=cv)) cache.update({thresh: obj}) return cache[thresh] n_epochs = all_threshes.shape[0] idx = np.concatenate(( np.linspace(0, n_epochs, 40, endpoint=False, dtype=int), [n_epochs - 1])) # ensure last point is in init idx = np.unique(idx) # linspace may be non-unique if n_epochs < 40 initial_x = all_threshes[idx] best_thresh, _ = bayes_opt(func, initial_x, all_threshes, expected_improvement, max_iter=10, debug=False, random_state=random_state) return best_thresh
# Import necessary modules
from scipy.stats import randint
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import RandomizedSearchCV

# Setup the parameters and distributions to sample from: param_dist
param_dist = {"max_depth": [3, None],
              "max_features": randint(1, 9),
              "min_samples_leaf": randint(1, 9),
              "criterion": ["gini", "entropy"]}

# Instantiate a Decision Tree classifier: tree
tree = DecisionTreeClassifier()

# Instantiate the RandomizedSearchCV object: tree_cv
tree_cv = RandomizedSearchCV(tree, param_dist, cv=5)

# Fit it to the data
tree_cv.fit(X, y)

# Print the tuned parameters and score
print("Tuned Decision Tree Parameters: {}".format(tree_cv.best_params_))
print("Best score is {}".format(tree_cv.best_score_))
# Tuned Decision Tree Parameters: {'criterion': 'gini', 'max_depth': 3, 'max_features': 5, 'min_samples_leaf': 2}
# Best score is 0.7395833333333334

# RandomizedSearchCV will never outperform GridSearchCV. Instead, it is valuable
# because it saves on computation time.

# First: Hold-out evaluation data
# How well can the model perform on never-seen data?
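# A minimal sketch of the hold-out idea mentioned above (split sizes are assumed,
# not part of the original exercise): keep a test set aside, tune only on the
# training portion, then report the score on the untouched hold-out data.
from sklearn.model_selection import train_test_split

X_tr, X_hold, y_tr, y_hold = train_test_split(X, y, test_size=0.3, random_state=42)
tree_cv.fit(X_tr, y_tr)  # tune on the training portion only
print("Hold-out accuracy: {}".format(tree_cv.score(X_hold, y_hold)))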
#random grid random_grid = { 'n_estimators': n_estimators, 'max_features': max_features, 'max_depth': max_depth, 'min_samples_split': min_samples_split, 'min_samples_leaf': min_samples_leaf } #Random forest regressor with random grid rf = RandomForestRegressor() rf_random = RandomizedSearchCV(estimator=rf, param_distributions=random_grid, n_iter=100, scoring='neg_mean_squared_error', verbose=2, random_state=7, cv=5) #timer function def timer(start_time=None): if not start_time: start_time = datetime.now() return start_time elif start_time: hour, temp_sec = divmod((datetime.now() - start_time).total_seconds(), 3600) mins, sec = divmod(temp_sec, 60) print("\n Time taken: %i:%i:%s" % (hour, mins, round(sec, 2)))
from sklearn.ensemble import RandomForestRegressor from sklearn.model_selection import RandomizedSearchCV # Create the parameter grid based on the results of random search param_grid = { 'bootstrap': [True], 'max_depth': [80, 90, 100, 110], 'max_features': [2, 3], 'min_samples_leaf': [3, 4, 5], 'min_samples_split': [8, 10, 12], 'n_estimators': [100, 200, 300, 500, 1000] } # Create a based model rf = RandomForestRegressor() # Instantiate the grid search model random_search_rf = RandomizedSearchCV(estimator=rf, param_distributions=param_grid, cv=10, n_jobs=-1, verbose=2) model_fit_RF = random_search_rf.fit(X_train, y_train) print(model_fit_RF.best_params_) ##Testing the model test_predict_RF = model_fit_RF.predict(X_test) from sklearn import metrics print('RF Mean Absolute Error:', metrics.mean_absolute_error(y_test, test_predict_RF)) print('RF Mean Squared Error:', metrics.mean_squared_error(y_test, test_predict_RF)) print('RF R2:', metrics.r2_score(y_test, test_predict_RF)) print('RF Root Mean Squared Error:',
def ml_tests(imputed_data):
    # scikit-learn requirement: numeric values only - transform the categorical columns
    categorical_mask = (imputed_data.dtypes == "category")
    categorical_columns = imputed_data.columns[categorical_mask].tolist()
    category_enc = pd.get_dummies(imputed_data[categorical_columns])
    imputed_data = pd.concat([imputed_data, category_enc], axis=1)
    imputed_data = imputed_data.drop(columns=categorical_columns)
    imputed_data = imputed_data.reset_index()

    # Output
    # print(imputed_data.info())
    # imputed_data.to_excel(excel_writer="Files/Tests/imputed_data.xlsx", sheet_name="Immobilien")

    # XGBoost default model
    print("XGBoost default model:")
    x = imputed_data.drop(columns=["angebotspreis"]).values
    y = imputed_data["angebotspreis"].values
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

    xg_reg = xgb.XGBRegressor(objective="reg:squarederror", n_estimators=20, seed=123)
    xg_reg.fit(x_train, y_train)
    preds = xg_reg.predict(x_test)
    rmse = np.sqrt(mean_squared_error(y_test, preds))
    print("RMSE: %f" % rmse)
    print()
    print_feature_importances(model=xg_reg,
                              data=imputed_data.drop(columns=["angebotspreis"]))

    # Grid search parameter tuning
    print("Grid Search Parameter Tuning:")
    gbm_param_grid = {
        'colsample_bytree': [0.3, 0.7],
        'n_estimators': [50],
        'max_depth': [2, 5]
    }
    gbm = xgb.XGBRegressor(objective="reg:squarederror")
    grid_mse = GridSearchCV(estimator=gbm, param_grid=gbm_param_grid,
                            scoring="neg_mean_squared_error", cv=4, verbose=1)
    grid_mse.fit(x_train, y_train)
    print("Best parameters found: ", grid_mse.best_params_)
    print("Lowest RMSE Grid Search found: ", np.sqrt(np.abs(grid_mse.best_score_)))
    print()

    # Randomized search parameter tuning
    print("Randomized Search Parameter Tuning:")
    gbm_param_grid2 = {'n_estimators': [25], 'max_depth': range(2, 12)}
    gbm2 = xgb.XGBRegressor(objective="reg:squarederror", n_estimators=10)
    randomized_mse = RandomizedSearchCV(estimator=gbm2, param_distributions=gbm_param_grid2,
                                        scoring="neg_mean_squared_error", n_iter=5, cv=4, verbose=1)
    randomized_mse.fit(x_train, y_train)
    print("Best parameters found: ", randomized_mse.best_params_)
    print("Lowest RMSE Randomized Search found: ", np.sqrt(np.abs(randomized_mse.best_score_)))

    dm_train = xgb.DMatrix(data=x_train, label=y_train)
    dm_test = xgb.DMatrix(data=x_test, label=y_test)
    params = {"booster": "gblinear", "objective": "reg:squarederror"}
    xg_reg2 = xgb.train(dtrain=dm_train, params=params, num_boost_round=15)
    preds2 = xg_reg2.predict(dm_test)
    rmse = np.sqrt(mean_squared_error(y_test, preds2))
    print("RMSE: %f" % rmse)

    reg_params = [0.1, 0.3, 0.7, 1, 10, 100]
    params1 = {"objective": "reg:squarederror", "max_depth": 3}
    rmses_l2 = []
    for reg in reg_params:
        params1["lambda"] = reg
        cv_results_rmse = xgb.cv(dtrain=dm_train, params=params1, nfold=3,
                                 num_boost_round=15, metrics="rmse", as_pandas=True)
        rmses_l2.append(cv_results_rmse["test-rmse-mean"].tail(1).values[0])
    print("Best rmse as a function of l2:")
    print(pd.DataFrame(list(zip(reg_params, rmses_l2)), columns=["l2", "rmse"]))
    print()
    print_feature_importances(model=xg_reg2,
                              data=imputed_data.drop(columns=["angebotspreis"]))

    # Stochastic gradient boosting
    print("Stochastic Gradient Boosting:")
    sgbr = GradientBoostingRegressor(max_depth=4, subsample=0.9, max_features=0.75,
                                     n_estimators=200, random_state=2)
    sgbr.fit(x_train, y_train)
    y_pred = sgbr.predict(x_test)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    print("RMSE: %f" % rmse)
    print()
    print_feature_importances(model=sgbr,
                              data=imputed_data.drop(columns=["angebotspreis"]))

    # Random forest
    print("Random Forest:")
    rf = RandomForestRegressor(n_estimators=25, random_state=2)
    rf.fit(x_train, y_train)
    y_pred2 = rf.predict(x_test)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred2))
    print("RMSE: %f" % rmse)
    print()
    print_feature_importances(model=rf,
                              data=imputed_data.drop(columns=["angebotspreis"]))
n_estimators = [1000] max_features = ['auto'] max_depth = [3, 5, 7] min_samples_leaf = [2, 4] random_grid = {'n_estimators': n_estimators, 'max_features': max_features, 'max_depth': max_depth, 'min_samples_leaf': min_samples_leaf } rf = RandomForestClassifier() rf_random = RandomizedSearchCV( estimator=rf, param_distributions=random_grid, n_iter=100, cv=3, scoring='accuracy', verbose=5, n_jobs=-1) rf_random.fit(X_train, y_train) print(rf_random.best_params_) print("Scores for the Train Dataset: ") y_train_pred = rf_random.predict(X_train) accuracy_train = accuracy_score(y_train, y_train_pred) print("Accuracy: %.2f%%" % (accuracy_train * 100.0)) print("- - - - - - - - - - ") print("Scores for the Test Dataset: ")
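# The snippet above stops right after printing the test-set header. A minimal hedged
# completion, mirroring the train-set block and assuming the same `rf_random`, `X_test`
# and `y_test` names exist as in the surrounding code:
y_test_pred = rf_random.predict(X_test)
accuracy_test = accuracy_score(y_test, y_test_pred)
print("Accuracy: %.2f%%" % (accuracy_test * 100.0))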
np.random.seed(SEED) param_space = { "max_depth": [3, 5, 10, 15, 20, 30, None], "min_samples_split" : randint(2, 150), "min_samples_leaf" : randint(2, 150), "criterion" : ["gini", "entropy"] } raw_train_x, validation_x, raw_train_y, validation_y = train_test_split(x, y, test_size=0.25, random_state=SEED, stratify=y) from sklearn.model_selection import RandomizedSearchCV search = RandomizedSearchCV(DecisionTreeClassifier(), param_space, n_iter=100, cv = 5, random_state = SEED) search.fit(raw_train_x, raw_train_y) results = pd.DataFrame(search.cv_results_) results.head() print(len(results)) print(search.best_params_) print(search.best_score_) best = search.best_estimator_ best predictions = best.predict(validation_x)
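# Hedged follow-up sketch: the snippet ends after predicting on the hold-out set, so one
# plausible next step is to score those predictions (assumes the `predictions` and
# `validation_y` names from above).
from sklearn.metrics import accuracy_score, classification_report

print("validation accuracy: %.4f" % accuracy_score(validation_y, predictions))
print(classification_report(validation_y, predictions))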
loss = ['deviance', 'exponential']
grid_random = {
    'n_estimators': n_estimators,
    'max_features': max_features,
    'max_depth': max_depth,
    'min_samples_split': min_samples_split,
    'min_samples_leaf': min_samples_leaf,
    'loss': loss
}

# random search over the parameters; the settings of the randomized search can be modified
GB_random = RandomizedSearchCV(estimator=GD_Classifier, param_distributions=grid_random,
                               n_iter=20, cv=3, verbose=2, random_state=42, n_jobs=-1)
# fit the randomized search CV
GB_random.fit(X_train, Y_train)

print('>>> Starting Grid Search Cross Validation to find optimal Gradient Boosting Classifier...')

# create the parameter grid based on the results of the random search,
# varying slightly the optimised parameters found
grid_parameters = {
    'loss': [GB_random.best_params_['loss']],
'max_leaf_nodes': (1,10,100,), 'min_samples_split': (0.1,0.25,0.5,0.75,1.0,), 'min_samples_leaf': (1,10,100,), }] #est=ensemble.RandomForestRegressor() #est=kernel_ridge.KernelRidge() #est=neighbors.NearestNeighbors() #est=neighbors.KNeighborsRegressor() est=ensemble.ExtraTreesRegressor() # https://stackoverflow.com/questions/37161563/how-to-graph-grid-scores-from-gridsearchcv # run randomized search n_iter_search = 20 rs = RandomizedSearchCV(est, param_distributions=hyper_params, n_iter=n_iter_search) t0 = time.time() rs.fit(x_train, y_train.ravel()) runtime = time.time() - t0 print("RandomizedSearchCV took %.6f seconds for %d candidates" " parameter settings." % (runtime, n_iter_search)) print(rs.cv_results_) #scores = [x[1] for x in rs.grid_scores_] #scores = np.array(scores).reshape(len(Cs), len(Gammas)) # #for ind, i in enumerate(Cs): # plt.plot(Gammas, scores[ind], label='C: ' + str(i)) #plt.legend() #plt.xlabel('Gamma') #plt.ylabel('Mean score') #plt.show()
from sklearn.model_selection import ShuffleSplit

clf_xgb = xgb.XGBClassifier(objective='binary:logistic')
param_dist = {'silent': [False],
              'max_depth': [6, 10, 15, 20],
              'learning_rate': [0.001, 0.01, 0.1, 0.2, 0.3, 0.4],
              'subsample': [0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
              'colsample_bytree': [0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
              'colsample_bylevel': [0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
              'min_child_weight': [0.5, 1.0, 3.0, 5.0, 7.0, 10.0],
              'gamma': [0, 0.25, 0.3, 0.4, 0.5, 1.0],
              'reg_lambda': [0.1, 1.0, 5.0, 10.0, 50.0, 100.0],
              'reg_alpha': 2. ** np.arange(-13, 10, 2),
              'n_estimators': [100, 150, 200]}

clf = RandomizedSearchCV(clf_xgb, param_distributions=param_dist, n_iter=25,
                         scoring='f1', error_score=0, verbose=3, n_jobs=-1)

rs = ShuffleSplit(n_splits=3, test_size=.25, random_state=0)
gc.collect()
estimators = []
results = np.zeros(len(one_hot_encoded_X))
score = []
for train_index, test_index in rs.split(one_hot_encoded_X):
    # print('Iteration:', i)
    X_train, X_test = one_hot_encoded_X.iloc[train_index], one_hot_encoded_X.iloc[test_index]
    y_train, y_test = new_y_binary.iloc[train_index], new_y_binary.iloc[test_index]
    clf.fit(X_train, y_train)
    estimators.append(clf.best_estimator_)
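# Hedged continuation sketch (not in the original): score each fold's best estimator on its
# held-out split with the same 'f1' metric used by the search. It assumes the `estimators`,
# `score`, `rs`, `one_hot_encoded_X` and `new_y_binary` names from the loop above; because
# ShuffleSplit was built with random_state=0, calling split() again reproduces the same folds.
from sklearn.metrics import f1_score

for est, (train_index, test_index) in zip(estimators, rs.split(one_hot_encoded_X)):
    X_hold = one_hot_encoded_X.iloc[test_index]
    y_hold = new_y_binary.iloc[test_index]
    score.append(f1_score(y_hold, est.predict(X_hold)))
print("mean F1 across splits:", np.mean(score))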
'scaler': scalers_to_test, 'red_dim': [PCA()], 'red_dim__n_components': n_features_to_test, 'clf__hidden_layer_sizes': l, 'clf__activation': ['identity', 'logistic', 'tanh', 'relu'], 'clf__solver': ['lbfgs', 'sgd', 'adam'], 'clf__batch_size': b_size, 'clf__learning_rate': ['constant', 'invscaling', 'adaptive'] }] from sklearn.model_selection import GridSearchCV from sklearn.model_selection import RandomizedSearchCV grid = RandomizedSearchCV(pipeline, param_distributions=parameteres, n_iter=100, cv=5, random_state=1) print('a') grid.fit(X_train, y_train) print('b') score = {grid.score(X_test, y_test)} score = grid.score(X_test, y_test) best_p = grid.best_params_ #print(f'score = {grid.score(X_test, y_test)}') #print(grid.best_params_)
# Define the parameters that will be tuned randomly keras_param_options = {'filters' : [4, 8, 16], 'filters_LSTM' : [4, 8, 16], 'strides' : [1], 'padding' : ['valid'], 'activation_convolution' : [None], 'activation_LSTM' : ['tanh'], 'optimizers' : ['Adam', 'Adadelta'], 'number_hidden_units' : [4, 8], 'epochs' : [30], 'batch_size' : [8, 16, 32]} # Using RandomizedSearchCV to find the best model randomly random_search = RandomizedSearchCV(model, param_distributions = keras_param_options, n_iter = 50, cv = 5, verbose = 10) # Fit to the training data random_search.fit(x_train, y_train) df_result_hyper_tuned = pd.DataFrame.from_dict(random_search.cv_results_) df_result_hyper_tuned.to_csv('/hpc-home/kristian/effector-non-effector/scripts-cnn-lstm-separate-group/scripts-scan-multiclass/bacteria/results/all_scan_results_cnn_lstm_scan_bacteria_secreted.csv') # Save all of the params to be used to predict on the test data df_result_hyper_tuned['mean_test_score']= pd.to_numeric(df_result_hyper_tuned['mean_test_score']) param_best_model_dict = dict(df_result_hyper_tuned.nlargest(30, 'mean_test_score')['params']) params = list(param_best_model_dict.values()) print(params) # Get info ahead about the best model obtained
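# Hedged sketch: a quick look at the strongest configurations found above, pairing each of
# the top parameter sets with its mean CV score (assumes the `df_result_hyper_tuned` frame
# built from random_search.cv_results_ above).
top_models = df_result_hyper_tuned.nlargest(30, 'mean_test_score')
for rank, (params_i, score_i) in enumerate(zip(top_models['params'], top_models['mean_test_score']), start=1):
    print(rank, round(score_i, 4), params_i)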
y_dat = data['cancel_1.0'] X_dat = data.drop(['id','train','credit','state','cancel_1.0'], axis=1) X_train, X_test, y_train,y_test = cross_validation.train_test_split(X_dat, y_dat, test_size=0.2, random_state=0) params = { 'min_child_weight': [1, 5, 10], 'gamma': [0.5, 1, 1.5, 2, 5], 'subsample': [0.6, 0.8, 1.0], 'colsample_bytree': [0.6, 0.8, 1.0], 'max_depth': [3, 4, 5] } # The data is huge, so maybe don't need so many cv folds = 3 # Test Enough of these param_comb = 1 skf = StratifiedKFold(n_splits=folds, shuffle = True, random_state = 1001) # early stopping to decrease time xgb = XGBClassifier(learning_rate=0.02, n_estimators=100, objective='binary:logistic', silent=True, nthread=1) random_search = RandomizedSearchCV(xgb, param_distributions=params, n_iter=param_comb, scoring='roc_auc', n_jobs=8, cv=skf.split(X_train,y_train), verbose=3, random_state=1001 ) start_time = timer(None) # timing starts from this point for "start_time" variable random_search.fit(X_train, y_train) timer(start_time) # timing ends here for "start_time" variable results = pd.DataFrame(random_search.cv_results_) results.to_csv('xgb-random-grid-search-results-01.csv', index=False)
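# The snippet above calls a timer() helper that is not defined in this excerpt. A minimal
# stand-in with the same calling convention (start it with timer(None), stop it by passing
# the returned start time back in) might look like this:
from datetime import datetime

def timer(start_time=None):
    if start_time is None:
        return datetime.now()
    elapsed = datetime.now() - start_time
    hours, rem = divmod(elapsed.total_seconds(), 3600)
    minutes, seconds = divmod(rem, 60)
    print('Time taken: %i hours %i minutes and %.2f seconds.' % (hours, minutes, seconds))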
"batch_size": batches, 'opt': optimizer, "drop": droptout, 'lr': lr, 'act': act, 'epochs': epoch, 'validation_split': splt } model = KerasClassifier(build_fn=build_model, verbose=1) hyperparameters = create_hyper() search = RandomizedSearchCV(estimator=model, param_distributions=hyperparameters, n_iter=1, cv=None, n_jobs=1) search.fit(x_train, y_train) # print(search.estimator.fit(x_test,y_test)) pred = search.predict(x_test) print(pred) print(y_train) print("shape", y_test.shape) print("shape", pred.shape) try: pred = np.argmax(pred) score = accuracy_score(y_test, pred) print(score)
#predict_train=clf.predict(X_train) pred_tree_test=decisionTree.predict(X_test) pred_tree_train=decisionTree.predict(X_train) tree_accuracy_train=model_generate_reports(y_train,pred_tree_train) tree_accuracy_test=model_generate_reports(y_test,pred_tree_test) from sklearn.metrics import confusion_matrix confusion_matrixTree = confusion_matrix(y_test, pred_tree_test) ## randomized search cv for decision tree param_dist = {"max_depth": [2,3,4,5,6,7,8,9, None], "max_features": [2,4,6,8,10,12,14,16,18,20], "min_samples_leaf":[2,4,6,8,10,12,14,16,18,20,22,24,26], "criterion": ["gini", "entropy"]} tree_cv = RandomizedSearchCV(decisionTree, param_dist, cv = 5) tree_cv.fit(X_train,y_train) print("Tuned Decision Tree Parameters: {}".format(tree_cv.best_params_)) pred_tree_train=tree_cv.predict(X_train) pred_tree_test=tree_cv.predict(X_test) tree_accuracy_train=model_generate_reports(y_train,pred_tree_train) tree_accuracy_test=model_generate_reports(y_test,pred_tree_test) confusion_matrixTree=confusion_matrix(y_test, pred_tree_test) ############## final decision tree################################# tree_algo=DecisionTreeClassifier(random_state=0,min_samples_leaf=10,max_features=20,max_depth=9,criterion='entropy') tree_algo.fit(X_train,y_train) #predict_train=clf.predict(X_train) pred_tree_test=tree_algo.predict(X_test) pred_tree_train=tree_algo.predict(X_train) tree_accuracy_train=model_generate_reports(y_train,pred_tree_train)
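# Hedged completion sketch, mirroring the evaluation lines earlier in this snippet: score the
# final tree on the test split as well (assumes the model_generate_reports helper and the
# pred_tree_test / y_test names from above).
tree_accuracy_test = model_generate_reports(y_test, pred_tree_test)
confusion_matrixTree = confusion_matrix(y_test, pred_tree_test)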
'learning_rate': [0.1], 'max_depth': [6], 'booster': ['dart'], 'rate_drop': [0.21], 'eval_metric': ['logloss', 'mae'], 'is_training_metric': [True], 'max_leaves': [144], 'colsample_bytree': [0.8], 'subsample': [0.8], 'seed': [66] }] kfold = KFold(n_splits=5, shuffle=True, random_state=66) y_test_pred = [] y_pred = [] search = RandomizedSearchCV(XGBRegressor(n_jobs=6), parameters, cv=kfold, n_iter=1) for i in range(4): fit_params = { 'verbose': True, 'eval_metric': ['logloss', 'mae'], 'eval_set': [(x_train, y_train[:, i]), (x_test, y_test[:, i])], 'early_stopping_rounds': 5 } search.fit(x_train, y_train[:, i], **fit_params) y_pred.append(search.predict(x_pred)) y_test_pred.append(search.predict(x_test)) # print(search.best_score_) #############################
class Trainer(object): # Mlflow parameters identifying the experiment, you can add all the parameters you wish ESTIMATOR = "Linear" EXPERIMENT_NAME = "TaxifareModel" def __init__(self, X, y, **kwargs): """ FYI: __init__ is called every time you instatiate Trainer Consider kwargs as a dict containig all possible parameters given to your constructor Example: TT = Trainer(nrows=1000, estimator="Linear") ==> kwargs = {"nrows": 1000, "estimator": "Linear"} :param X: :param y: :param kwargs: """ self.pipeline = None self.kwargs = kwargs self.grid = kwargs.get("gridsearch", False) # apply gridsearch if True self.local = kwargs.get("local", True) # if True training is done locally self.optimize = kwargs.get( "optimize", False) # Optimizes size of Training Data if set to True self.mlflow = kwargs.get("mlflow", False) # if True log info to nlflow self.upload = kwargs.get("upload", False) # if True log info to nlflow self.experiment_name = kwargs.get("experiment_name", self.EXPERIMENT_NAME) # cf doc above self.model_params = None # for self.X_train = X self.y_train = y del X, y self.split = self.kwargs.get("split", True) # cf doc above if self.split: self.X_train, self.X_val, self.y_train, self.y_val = train_test_split( self.X_train, self.y_train, test_size=0.15) self.nrows = self.X_train.shape[0] # nb of rows to train on self.log_kwargs_params() self.log_machine_specs() def get_estimator(self): estimator = self.kwargs.get("estimator", self.ESTIMATOR) if estimator == "Lasso": model = Lasso() elif estimator == "Ridge": model = Ridge() elif estimator == "Linear": model = LinearRegression() elif estimator == "GBM": model = GradientBoostingRegressor() elif estimator == "RandomForest": model = RandomForestRegressor() self.model_params = { # 'n_estimators': [int(x) for x in np.linspace(start = 50, stop = 200, num = 10)], 'max_features': ['auto', 'sqrt'] } # 'max_depth' : [int(x) for x in np.linspace(10, 110, num = 11)]} elif estimator == "xgboost": model = XGBRegressor(objective='reg:squarederror', n_jobs=-1, max_depth=10, learning_rate=0.05, gamma=3) self.model_params = { 'max_depth': range(10, 20, 2), 'n_estimators': range(60, 220, 40), 'learning_rate': [0.1, 0.01, 0.05] } else: model = Lasso() estimator_params = self.kwargs.get("estimator_params", {}) self.mlflow_log_param("estimator", estimator) model.set_params(**estimator_params) print(colored(model.__class__.__name__, "red")) return model def set_pipeline(self): memory = self.kwargs.get("pipeline_memory", None) dist = self.kwargs.get("distance_type", "euclidian") feateng_steps = self.kwargs.get( "feateng", ["distance", "time_features", 'direction', 'distance_to_center']) if memory: memory = mkdtemp() # Define feature engineering pipeline blocks here pipe_time_features = make_pipeline( TimeFeaturesEncoder(time_column='pickup_datetime'), OneHotEncoder(handle_unknown='ignore')) pipe_distance = make_pipeline( DistanceTransformer(distance_type=dist, **DIST_ARGS), RobustScaler()) pipe_geohash = make_pipeline(AddGeohash(), ce.HashingEncoder()) pipe_direction = make_pipeline(Direction(), RobustScaler()) pipe_distance_to_center = make_pipeline(DistanceToCenter(), RobustScaler()) # Define default feature engineering blocs feateng_blocks = [ ('distance', pipe_distance, list(DIST_ARGS.values())), ('time_features', pipe_time_features, ['pickup_datetime']), ('geohash', pipe_geohash, list(DIST_ARGS.values())), ('direction', pipe_direction, list(DIST_ARGS.values())), ('distance_to_center', pipe_distance_to_center, list(DIST_ARGS.values())), ] # Filter out some 
bocks according to input parameters for bloc in feateng_blocks: if bloc[0] not in feateng_steps: feateng_blocks.remove(bloc) features_encoder = ColumnTransformer(feateng_blocks, n_jobs=None, remainder="drop") self.pipeline = Pipeline(steps=[('features', features_encoder), ('rgs', self.get_estimator())], memory=memory) if self.optimize: self.pipeline.steps.insert( -1, ['optimize_size', OptimizeSize(verbose=False)]) def add_grid_search(self): """" Apply Gridsearch on self.params defined in get_estimator {'rgs__n_estimators': [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)], 'rgs__max_features' : ['auto', 'sqrt'], 'rgs__max_depth' : [int(x) for x in np.linspace(10, 110, num = 11)]} """ # Here to apply ramdom search to pipeline, need to follow naming "rgs__paramname" params = {"rgs__" + k: v for k, v in self.model_params.items()} self.pipeline = RandomizedSearchCV(estimator=self.pipeline, param_distributions=params, n_iter=10, cv=2, verbose=1, random_state=42, n_jobs=None) @simple_time_tracker def train(self, gridsearch=False): tic = time.time() self.set_pipeline() if gridsearch: self.add_grid_search() self.pipeline.fit(self.X_train, self.y_train) # mlflow logs self.mlflow_log_metric("train_time", int(time.time() - tic)) def evaluate(self): rmse_train = self.compute_rmse(self.X_train, self.y_train) self.mlflow_log_metric("rmse_train", rmse_train) if self.split: rmse_val = self.compute_rmse(self.X_val, self.y_val, show=True) self.mlflow_log_metric("rmse_val", rmse_val) print( colored( "rmse train: {} || rmse val: {}".format( rmse_train, rmse_val), "blue")) else: print(colored("rmse train: {}".format(rmse_train), "blue")) def compute_rmse(self, X_test, y_test, show=False): if self.pipeline is None: raise ("Cannot evaluate an empty pipeline") y_pred = self.pipeline.predict(X_test) if show: res = pd.DataFrame(y_test) res["pred"] = y_pred print(colored(res.sample(5), "blue")) rmse = compute_rmse(y_pred, y_test) return round(rmse, 3) def save_model(self): """Save the model into a .joblib and upload it on Google Storage /models folder HINTS : use sklearn.joblib (or jbolib) libraries and google-cloud-storage""" joblib.dump(self.pipeline, 'model.joblib') print(colored("model.joblib saved locally", "green")) if self.upload: storage_upload(model_version=MODEL_VERSION) ### MLFlow methods @memoized_property def mlflow_client(self): mlflow.set_tracking_uri(MLFLOW_URI) return MlflowClient() @memoized_property def mlflow_experiment_id(self): try: return self.mlflow_client.create_experiment(self.experiment_name) except BaseException: return self.mlflow_client.get_experiment_by_name( self.experiment_name).experiment_id @memoized_property def mlflow_run(self): return self.mlflow_client.create_run(self.mlflow_experiment_id) def mlflow_log_param(self, key, value): if self.mlflow: self.mlflow_client.log_param(self.mlflow_run.info.run_id, key, value) def mlflow_log_metric(self, key, value): if self.mlflow: self.mlflow_client.log_metric(self.mlflow_run.info.run_id, key, value) def log_estimator_params(self): reg = self.get_estimator() self.mlflow_log_param('estimator_name', reg.__class__.__name__) params = reg.get_params() for k, v in params.items(): self.mlflow_log_param(k, v) def log_kwargs_params(self): if self.mlflow: for k, v in self.kwargs.items(): self.mlflow_log_param(k, v) def log_machine_specs(self): cpus = multiprocessing.cpu_count() mem = virtual_memory() ram = int(mem.total / 1000000000) self.mlflow_log_param("ram", ram) self.mlflow_log_param("cpus", cpus)
def do_fold(j): print("\tFold " + str(j+1)) train_idx = folds_i[j][0] valid_idx = folds_i[j][1] training_fold = developement_df.loc[train_idx, ] training_fold = training_fold.reset_index(drop=True) validation_fold = developement_df.loc[valid_idx, ] validation_fold = validation_fold.reset_index(drop=True) # shuffle the folds training_stats_i_f = Statistics() validation_stats_i_f = Statistics() testing_stats_i_f = Statistics() # Init the label ranking lists. label_pred_proba_train = [] label_pred_proba_valid = [] label_pred_proba_test = [] label_y_train = [] label_y_valid = [] label_y_test = [] # Set up the vectorizer for the bag-of-words representation if vectorizer_method == 'tf-idf': vectorizer = TfidfVectorizer( stop_words=['go', '', ' '], binary=binary, lowercase=True, sublinear_tf=True, max_df=1.0, min_df=0 ) vectorizer.fit(training_fold['terms'].values) alpha = None percentile = 100 elif vectorizer_method == 'count': vectorizer = CountVectorizer( stop_words=['go', '', ' '], binary=binary, lowercase=True ) vectorizer.fit(training_fold['terms'].values) alpha = None percentile = 100 else: raise TypeError("Vectorizer_method has type {}.".format(type(vectorizer_method))) selectors = generate_selectors(selection, vectorizer.get_feature_names(), dag) base_estimators = make_classifiers(method, balanced, labels, selectors, selection, rng) for label in sorted(labels): print("\t\tFitting for label {}...".format(label)) # SVMs make the assumption of standardised features. Hence we scale the features # avoiding the use of mean to maintain the structure of count sparsity. Scaling # May also help with linear model convergence speed. x_train_l = vectorizer.transform(training_fold['terms'].values) y_train_l = np.asarray(training_fold[label].values, dtype=int) x_valid_l = vectorizer.transform(validation_fold['terms'].values) y_valid_l = np.asarray(validation_fold[label].values, dtype=int) x_test_l = vectorizer.transform(testing_df['terms'].values) y_test_l = np.asarray(test_df_i[label].values, dtype=int) if scale: x_train_l = mean_center(x_train_l, with_mean=False) x_valid_l = mean_center(x_valid_l, with_mean=False) x_test_l = mean_center(x_test_l, with_mean=False) # We generate the folds for randomised search up-front. We hold out one of the folds for # Probability calibration so each sampled param set gets calibrated on the same data. # This leaves cv_folds-2 folds for randomised search cross-validation. # cv_rand = StratifiedKFold(n_splits=3, shuffle=True, random_state=rng) base_estimator_l = base_estimators[label] fresh_estimator = clone(base_estimator_l) # Find the best params, then do a final proper calibration. params = sk_generate_params(method, selection) estimator_l = RandomizedSearchCV( estimator=base_estimator_l, param_distributions=params, n_iter=60, scoring='f1', cv=3, random_state=rng, error_score=0.0, n_jobs=1, pre_dispatch='2*n_jobs', refit=True ) # Test if there's any signal if we permute the labels. # Classifier should do poorly if we do so. if permute: y_train_l = rng.permutation(y_train_l) threshold = 0.5 estimator_l.fit(x_train_l, y_train_l) best_params_l = estimator_l.best_params_ # Calibrate the random forest with the best hyperparameters. if method not in ['lr']: estimator_l = CalibratedClassifierCV(fresh_estimator.set_params(**best_params_l), cv=3, method='sigmoid') estimator_l.fit(x_train_l, y_train_l) # Evaluate Performance characteristics and test on training to check overfitting. 
y_train_prob_l = estimator_l.predict_proba(x_train_l) y_valid_prob_l = estimator_l.predict_proba(x_valid_l) y_test_prob_l = estimator_l.predict_proba(x_test_l) training_stats_i_f.merge(evaluate_model(y_train_l, y_train_prob_l, label, threshold)) validation_stats_i_f.merge(evaluate_model(y_valid_l, y_valid_prob_l, label, threshold)) # Compute independent test data performance testing_stats_i_f.merge(evaluate_model(y_test_l, y_test_prob_l, label, threshold)) # Get label ranking info label_pred_proba_train.append([p[1] for p in y_train_prob_l]) label_pred_proba_valid.append([p[1] for p in y_valid_prob_l]) label_pred_proba_test.append([p[1] for p in y_test_prob_l]) label_y_train.append(y_train_l) label_y_valid.append(y_valid_l) label_y_test.append(y_test_l) print(validation_stats_i_f.frame()) # Compute multi-label performance statistics y = np.vstack(list(zip(*label_y_train))) y_prob = np.vstack(list(zip(*label_pred_proba_train))) training_stats_i_f.merge(multi_label_evaluate(y, y_prob, threshold)) y = np.vstack(list(zip(*label_y_valid))) y_prob = np.vstack(list(zip(*label_pred_proba_valid))) validation_stats_i_f.merge(multi_label_evaluate(y, y_prob, threshold)) y = np.vstack(list(zip(*label_y_test))) y_prob = np.vstack(list(zip(*label_pred_proba_test))) testing_stats_i_f.merge(multi_label_evaluate(y, y_prob, threshold)) return training_stats_i_f, validation_stats_i_f, testing_stats_i_f
bootstrap = [True, False]

# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
pprint(random_grid)

# search across 100 different combinations, and use all available cores
rf_random = RandomizedSearchCV(estimator=rfc, param_distributions=random_grid, n_iter=100,
                               cv=kfold, verbose=2, random_state=42, n_jobs=-1)
rf_random.fit(x_train, y_train)
rf_random.best_params_

rf_best_random = rf_random.best_estimator_
prediction_RF = rf_best_random.predict(x_test)
print("for Random Forest we get " + str(round(accuracy_score(y_test, prediction_RF), 5)))

# confusion_matrix puts the actual classes on the rows and the predicted classes on the columns
CM_RF = confusion_matrix(y_test, prediction_RF)
df_cm = pd.DataFrame(CM_RF, index=["Actual No", "Actual Yes"],
                     columns=["Predicted No", "Predicted Yes"])
plt.figure()
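# Hedged completion sketch: the snippet above opens a figure but the plotting call is not in
# this excerpt; a seaborn heatmap of df_cm is one plausible way to finish it.
import seaborn as sns
sns.heatmap(df_cm, annot=True, fmt='d', cmap='Blues')
plt.ylabel('Actual')
plt.xlabel('Predicted')
plt.show()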
def tune_xgb_params_randomized(estimator_cls,
                               label: np.ndarray,
                               metric_sklearn: str,
                               n_jobs: int,
                               params: dict,
                               strat_folds: KFold,
                               train: np.ndarray,
                               n_iter: int = 20,
                               verbosity_level: int = 0,
                               **kwargs):
    """
    :param estimator_cls: The class type of the estimator to instantiate - either an XGBClassifier or an XGBRegressor.
    :param label: An array-like containing the labels of the classification or regression problem.
    :param metric_sklearn: The evaluation metric to be passed to scikit-learn's RandomizedSearchCV - see
        http://scikit-learn.org/stable/modules/model_evaluation.html for the options this can take - e.g.
        'neg_mean_squared_error' for RMSE.
    :param n_jobs: The number of jobs to run simultaneously.
    :param params: A dictionary of XGB parameters.
    :param strat_folds: A KFold object to cross validate the parameters.
    :param train: An array-like containing the training input samples.
    :param n_iter: An optional parameter to control the number of parameter settings that are sampled.
    :param verbosity_level: An optional parameter to control the verbosity of the searching - defaults to the
        least verbose option.
    :param kwargs: Parameter distributions may be controlled through keyword arguments - e.g. to sample uniformly
        between 0.5 and 0.7 for colsample_bytree, supply colsample_bytree_loc=0.5 and colsample_bytree_scale=0.2.
    :return: A dictionary of tuned parameters and a list of the parameters found at each step with their respective scores.
    """
    params_copy = clean_params_for_sk(params)
    param_distributions = {
        'colsample_bytree': uniform(kwargs.get('colsample_bytree_loc', 0.2), kwargs.get('colsample_bytree_scale', 0.8)),
        'gamma': uniform(kwargs.get('gamma_loc', 0), kwargs.get('gamma_scale', 0.9)),
        'max_depth': sp_randint(kwargs.get('max_depth_low', 2), kwargs.get('max_depth_high', 11)),
        'min_child_weight': sp_randint(kwargs.get('min_child_weight_low', 1), kwargs.get('min_child_weight_high', 11)),
        'reg_alpha': halfnorm(kwargs.get('reg_alpha_loc', 0), kwargs.get('reg_alpha_scale', 5)),
        'reg_lambda': halfnorm(kwargs.get('reg_lambda_loc', 0), kwargs.get('reg_lambda_scale', 5)),
        'subsample': uniform(kwargs.get('subsample_loc', 0.2), kwargs.get('subsample_scale', 0.8))
    }

    rand_search = RandomizedSearchCV(
        cv=strat_folds.split(train, label),
        estimator=estimator_cls(**params_copy),
        n_iter=n_iter,
        n_jobs=n_jobs,
        param_distributions=param_distributions,
        scoring=metric_sklearn,
        verbose=verbosity_level
    )
    rand_search.fit(train, label)
    return rand_search.best_params_, [(rand_search.best_params_, rand_search.best_score_)]
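# Hedged usage sketch for the helper above (illustrative only): tune an XGBRegressor with
# 5-fold CV and RMSE-style scoring. It assumes `train` / `label` arrays and the same imports
# used elsewhere in this file (XGBRegressor, KFold); the keyword arguments follow the
# {param}_loc / {param}_scale convention described in the docstring.
folds = KFold(n_splits=5, shuffle=True, random_state=42)
base_params = {'objective': 'reg:squarederror', 'n_estimators': 200}
best_params, history = tune_xgb_params_randomized(
    XGBRegressor, label, 'neg_mean_squared_error', n_jobs=4,
    params=base_params, strat_folds=folds, train=train,
    n_iter=30, colsample_bytree_loc=0.5, colsample_bytree_scale=0.4)
print(best_params)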
    outputs = Dense(10, activation='softmax', name='outputs')(x)
    model = Model(inputs=inputs, outputs=outputs)
    model.compile(optimizer=optimizer, metrics=['acc'], loss='categorical_crossentropy')
    return model

def create_hyperparameters():
    batchs = [10, 20, 30, 40, 50]
    optimizers = ['rmsprop', 'adam', 'adadelta']
    dropout = [0.1, 0.2, 0.3]
    return {'batch_size': batchs, 'optimizer': optimizers, 'drop': dropout}

hyperparameters = create_hyperparameters()
model2 = build_model()

from tensorflow.keras.wrappers.scikit_learn import KerasClassifier  # scikit-learn predates Keras, so the Keras model has to be wrapped
model2 = KerasClassifier(build_fn=build_model, verbose=1)

from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
search = RandomizedSearchCV(model2, hyperparameters, cv=3)  # cv: cross validation
# search = GridSearchCV(model2, hyperparameters, cv=3)  # cv: cross validation
search.fit(x_train, y_train, verbose=1)

print(search.best_params_)     # {'optimizer': 'rmsprop', 'drop': 0.1, 'batch_size': 50}
print(search.best_estimator_)
print(search.best_score_)      # 0.9588499863942465

acc = search.score(x_test, y_test)
print('final score : ', acc)   # final score : 0.9682999849319458
# it is also fine to change this to {'optimizer': 'rmsprop', 'drop': 0.2, 'batch_size': 30}
print("R-squared:", metrics.r2_score(y_test_pred, test_minmax[:, 1200])) print("-----------------------------------------------------------") print("-----------------------------------------------------------") # ---------- HYPERPARAMETERS TUNING WITH RANDOM-SEARCH ------------ print("TUNED HYPERPARAMETERS WITH RANDOM-SEARCH") print('-------------------------------------------------') # KNN param_dist = {'n_neighbors': sp_randint(2, 20)} knnrs = RandomizedSearchCV(knn, param_distributions=param_dist, scoring='neg_mean_squared_error', cv=tr_val_partition, n_jobs=1, verbose=1) # Training the model with the random-search np.random.seed(123) knnrs.fit(train_closest, train_minmax[:, 1200]) # Making predictions on the testing partition y_test_pred = knnrs.predict(test_closest) # And finally computing the test accuracy print("Mean squared error of KNN with tuned hyperparameters:", metrics.mean_squared_error(y_test_pred, test_minmax[:, 1200])) print("R-squared:", metrics.r2_score(y_test_pred, test_minmax[:, 1200])) print("-----------------------------------------------------------")
def main(): """ Main function """ amigos_data = np.loadtxt('features_all_20s.csv',skiprows=1, delimiter=',') labels = np.loadtxt('Final_Personality_20s.csv',skiprows=1, delimiter=',') ids=np.loadtxt('ids_20s.csv',skiprows=1, delimiter=',') labels=labels[:, 1] kf = KFold(n_splits=2) gkf=GroupKFold(n_splits=5) # tune XGB classifier parameters grid_search_params_xgb = { 'max_depth': [3,4,5], 'n_estimators': [10,15,20] } other_tuning_params_xgb = { 'learning_rate': np.arange(0.01, 0.41, 0.01), 'gamma': np.arange(0, 10.1, 0.5), 'min_child_weight': np.arange(0.80, 1.21, 0.01), 'max_delta_step': np.arange(0, 2.05, 0.05), 'subsample': np.arange(1.00, 0.59, -0.01), 'colsample_bytree': np.arange(1.00, 0.09, -0.01), 'colsample_bylevel': np.arange(1.00, 0.09, -0.01), 'reg_alpha': np.arange(0, 2.05, 0.05), 'reg_lambda': np.arange(0.50, 2.55, 0.05), 'scale_pos_weight': np.arange(0.80, 1.21, 0.01), 'base_score': np.arange(0.40, 0.61, 0.01), 'seed': np.arange(0, 41) } # XGB grid search tuning best_params = { 'max_depth': 3, 'n_estimators': 20 } acc = 0 print('Tuning max_depth and n_estimators') for param in grid_search_params_xgb['max_depth']: print('in grid search') print('max_depth', param) xgb_clf = { 'a': xgb.XGBClassifier(max_depth=param, objective="binary:logistic"), } tuning_params = grid_search_params_xgb['n_estimators'] param, tmp_acc = tuning( xgb_clf, 'n_estimators', tuning_params, amigos_data, labels, kf) print('param',param,'tmp_acc',tmp_acc) if tmp_acc >= acc: best_params['max_depth'] = param best_params['n_estimators'] = param acc = tmp_acc # XGB tune other parameters for param_name, tuning_params in other_tuning_params_xgb.items(): print('Tuning', param_name) xgb_clf = { 'a': xgb.XGBClassifier(objective="binary:logistic"), } xgb_clf['a'].set_params(**best_params) param,_ = tuning( xgb_clf, param_name, tuning_params, amigos_data, labels, kf) best_params[param_name] = param # tune RF parameters grid_search_params_rf = { 'max_features': [10,15,20], 'max_depth': [3,5,10], } rf_clf=RandomForestClassifier() rf_random = RandomizedSearchCV(estimator = rf_clf, param_distributions = grid_search_params_rf, n_iter = 100, cv = gkf, verbose=2, random_state=42, n_jobs = 5) rf_random.fit(amigos_data,np.ravel(labels),groups=ids) #the optimized hyperparameters print('XGBoost best parameters:', best_params) print('Random forest best parameters:',rf_random.best_params_)
from sklearn.model_selection import RandomizedSearchCV from sklearn.linear_model import SGDClassifier from scipy.stats import lognorm as sp_lognormal import cs231n.cifar10 as cf np.random.seed(31337) X_train, y_train, X_test, y_test, scaler = cf.get_normalised_data() basic_svm = SGDClassifier(loss="hinge", penalty="l2", l1_ratio=0.0, random_state=31337, n_jobs=5) # From the Scipy docs: to sample a random variable Y such that Y=exp(X) where X~N(mu,sigma), use # scipy.stats.lognormal(s=sigma, scale=np.exp(mu)) random_search = RandomizedSearchCV(basic_svm, param_distributions={'alpha': sp_lognormal(s=2, scale=np.exp(-4))}, n_iter=20, verbose=1) random_search.fit(X_train, y_train) print("Chosen: ", random_search.best_params_["alpha"]) print("Best CV score: ", random_search.best_score_) chosen_svm = random_search.best_estimator_ os.makedirs("output/svc", exist_ok=True) labels = cf.get_label_names() for i in range(10): # Don't forget to rescale the hyperplanes to get human-readable versions---the l2 penalty makes # them close to the origin, so they look indistinguishable. this_hyperplane = 127*(chosen_svm.coef_[i]/np.max(np.abs(chosen_svm.coef_[i]))) + scaler.mean_ cf.plot_image(this_hyperplane, "output/svc/archetype " + labels[i] + ".png")
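# Hedged follow-up (not in the original snippet): report held-out test accuracy of the chosen
# SVM, using the X_test / y_test arrays loaded above via cf.get_normalised_data().
test_acc = chosen_svm.score(X_test, y_test)
print("Test accuracy: %.4f" % test_acc)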
#%%Ridge Regresion Model model_name = "ridge_poly2" X = train.drop(['energy'], axis=1) cat_cols = ['hour', 'month', 'day_of_week'] cat_cols_idx = [X.columns.get_loc(c) for c in X.columns if c in cat_cols] onehot = OneHotEncoder(categorical_features=cat_cols_idx, sparse=False) regr = Ridge(fit_intercept=False) poly = PolynomialFeatures(2) tscv = TimeSeriesSplit(n_splits=3) param_dist = {'alpha': st.uniform(1e-4, 5.0)} regr_cv = RandomizedSearchCV(estimator=regr, param_distributions=param_dist, n_iter=20, scoring='mean_squared_error', iid=False, cv=tscv, verbose=2, n_jobs=1) regr_pipe = Pipeline([('onehot', onehot), ('poly', poly), ('regr_cv', regr_cv)]) regr_pipe.fit(X, y=train['energy']) cv_results = pd.DataFrame(regr_pipe.named_steps['regr_cv'].cv_results_) cv_results.sort_values(by='rank_test_score').head() #%% Linear regression with recursive feature elimination model_name = "linear_regression" X = train.drop(['timeStamp','energy'], axis=1) cat_cols = ['hour', 'month', 'day_of_week'] cat_cols_idx = [X.columns.get_loc(c) for c in X.columns if c in cat_cols] onehot = OneHotEncoder(categorical_features=cat_cols_idx, sparse=False)