def blending(data, folds):
    names = ["LinearSVM", "LogisticRegression", "RandomForest", "XGBoost", "CatBoost", "LGMBoost"]
    classifiers = [
        svm.SVC(probability=True),
        linear_model.LogisticRegression(max_iter=10000),
        ensemble.RandomForestClassifier(),
        XGBClassifier(),
        CatBoostClassifier(verbose=0),
        LGBMClassifier(n_estimators=400, silent=True)
    ]
    parameters = [
        # LinearSVM
        {'C': loguniform(1e0, 1e3), 'gamma': loguniform(1e-4, 1e-3), 'kernel': ['rbf']},
        # LogisticRegression
        {},
        # RandomForest
        {
            'bootstrap': [True, False],
            'max_depth': [int(x) for x in range(10, 50)],
            'max_features': ['auto', 'sqrt'],
            'min_samples_leaf': [int(x) for x in range(1, 5)],
            'min_samples_split': [int(x) for x in range(2, 10)],
            'n_estimators': [int(x) for x in range(100, 500, 50)]
        },
        # XGBoost
        {
            'min_child_weight': [1, 5, 10],
            'gamma': loguniform(1e-4, 1e-3),
            'subsample': list(np.linspace(0.5, 1, 100)),
            'colsample_bytree': list(np.linspace(0.6, 1, 10)),
            'max_depth': [int(x) for x in range(3, 11)],
            'n_estimators': [int(x) for x in range(100, 500, 50)]
        },
        # CatBoost
        {
            'max_depth': [int(x) for x in range(4, 11)],
            'iterations': [int(x) for x in range(10, 100)]
        },
        # LGMBoost
        {}
    ]
    for name, classifier, param_dist in zip(names, classifiers, parameters):
        print(name)
        print("-" * 50)
        train_data = data.copy()
        n_iter_search = 30
        rs = RandomizedSearchCV(classifier,
                                param_distributions=param_dist,
                                n_iter=n_iter_search,
                                n_jobs=-1)
        best_clf = run_training(train_data, rs, name, folds)
        print("\n")
        joblib.dump(best_clf, f"./model/{name}.bin", compress=5)
def forest():
    vect_and_clf = Pipeline([('vect', TfidfVectorizer(min_df=5)),
                             ('clf', RandomForestClassifier(random_state=0))])
    param_dist = {
        'clf__n_estimators': np.array(np.power(10, np.arange(1, 3, step=0.1)), dtype=int),
        'clf__criterion': ['gini', 'entropy'],
        'clf__max_depth': np.arange(2, 50, dtype=int).tolist() + [None],
        'clf__min_samples_split': loguniform(1e-4, 1e-1),
        'clf__min_samples_leaf': stats.randint(1, 200),
        'clf__max_features': np.linspace(0.01, 1, num=10, dtype=float).tolist() + ['auto', 'sqrt', 'log2', None],
        'clf__max_leaf_nodes': [None] + np.array(np.power(10, np.arange(1, 4, step=0.5)), dtype=int).tolist(),
        'clf__min_impurity_decrease': [0.0] + np.array(np.power(10, np.arange(-10, -4, step=0.5)), dtype=float).tolist(),
        'clf__bootstrap': [True, False],
        'clf__oob_score': [True, False],
        'clf__warm_start': [True, False],
        'clf__max_samples': stats.uniform(0, 1),
        'clf__class_weight': [None, 'balanced', 'balanced_subsample'],
        'clf__verbose': [True, False],
        'clf__min_weight_fraction_leaf': loguniform(1e-4, 1e-1)
    }
    return vect_and_clf, param_dist
def LR(dataset):
    vect_and_clf = Pipeline([('vect', TfidfVectorizer()),
                             ('clf', LogisticRegression(random_state=0))])
    param_dist = {
        'clf__penalty': ['l1', 'l2', 'elasticnet', 'none'],
        'clf__dual': [True, False],
        'clf__C': loguniform(1e-3, 1e3),
        'clf__tol': loguniform(1e-11, 1e-4),
        'clf__fit_intercept': [True, False],
        'clf__solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
        'clf__max_iter': stats.randint(50, 200),
        'clf__warm_start': [True, False],
        'clf__multi_class': ['auto', 'ovr', 'multinomial'],
        'clf__l1_ratio': stats.uniform(0, 1),
        'vect__min_df': loguniform(1e-4, 1e-2),
        'vect__max_df': np.linspace(0.5, 0.9, num=10, dtype=float),
        'vect__stop_words': [None, 'english'],
        'vect__token_pattern': [r'\w{2,}', r'\w{1,}'],
        'vect__ngram_range': [(1, 2), (1, 1)]
    }
    n_iter_search = 200
    random_search = RandomizedSearchCV(vect_and_clf,
                                       param_distributions=param_dist,
                                       n_iter=n_iter_search, cv=5, n_jobs=-1)
    train_data, test_data, train_label, test_label = get_dataset()
    start = time()
    random_search.fit(train_data, train_label)
    print("RandomizedSearchCV took %.2f seconds" % (time() - start))
    results = random_search.cv_results_
    candidates = np.flatnonzero(results['rank_test_score'] == 1)
    with open('/Users/tianchima/Desktop/Trial1/LR.txt', 'w') as f:
        for candidate in candidates:
            print("Mean validation score: {0:.3f} (std: {1:.3f})".format(
                results['mean_test_score'][candidate],
                results['std_test_score'][candidate]))
            print("Parameters: {0}".format(results['params'][candidate]))
            f.write('Mean validation score: ' + str(results['mean_test_score'][candidate]) + '\n')
            f.write('std: ' + str(results['std_test_score'][candidate]) + '\n')
            f.write('Parameters: ' + str(results['params'][candidate]) + '\n')
        test_label_predict = random_search.best_estimator_.predict(test_data)
        accuracy = accuracy_score(test_label, test_label_predict)
        print(results)
        print('accuracy = ', accuracy)
        f.write('accuracy = ' + str(accuracy) + '\n' + '\n')
        f.write(str(random_search.best_params_) + '\n' + '\n')
        f.write(str(results))
    return random_search.best_estimator_
def RandomF(dataset):
    vect_and_clf = Pipeline([('vect', TfidfVectorizer()),
                             ('clf', RandomForestClassifier(random_state=0))])
    param_dist = {
        'clf__n_estimators': np.array(np.power(10, np.arange(1, 3, step=0.1)), dtype=int),
        'clf__criterion': ['gini', 'entropy'],
        'clf__max_depth': np.arange(2, 50, dtype=int).tolist() + [None],
        'clf__min_samples_split': loguniform(1e-4, 1e-1),
        'clf__min_samples_leaf': stats.randint(1, 200),
        'clf__max_features': np.linspace(0.01, 1, num=10, dtype=float).tolist() + ['auto', 'sqrt', 'log2', None],
        'clf__max_leaf_nodes': [None] + np.power(10, np.arange(1, 4, step=0.5)).astype(int).tolist(),
        'clf__min_impurity_decrease': [0.0] + np.array(np.power(10, np.arange(-10, -4, step=0.5)), dtype=float).tolist(),
        'clf__bootstrap': [True, False],
        'clf__oob_score': [True, False],
        'clf__warm_start': [True, False],
        'clf__max_samples': stats.uniform(0, 1),
        'clf__class_weight': [None, 'balanced', 'balanced_subsample'],
        'clf__verbose': [True, False],
        'clf__min_weight_fraction_leaf': loguniform(1e-4, 1e-1),
        'vect__min_df': loguniform(1e-4, 1e-2),
        'vect__max_df': np.linspace(0.5, 0.9, num=10, dtype=float),
        'vect__stop_words': [None, 'english'],
        'vect__token_pattern': [r'\w{2,}', r'\w{1,}'],
        'vect__ngram_range': [(1, 2), (1, 1)]
    }
    n_iter_search = 200
    random_search = RandomizedSearchCV(vect_and_clf,
                                       param_distributions=param_dist,
                                       n_iter=n_iter_search, cv=5, n_jobs=-1)
    train_data, test_data, train_label, test_label = get_dataset()
    start = time()
    random_search.fit(train_data, train_label)
    print("RandomizedSearchCV took %.2f seconds" % (time() - start))
    results = random_search.cv_results_
    candidates = np.flatnonzero(results['rank_test_score'] == 1)
    with open('/Users/tianchima/Desktop/Trial2/RandomF.txt', 'w') as f:
        for candidate in candidates:
            print("Mean validation score: {0:.3f} (std: {1:.3f})".format(
                results['mean_test_score'][candidate],
                results['std_test_score'][candidate]))
            print("Parameters: {0}".format(results['params'][candidate]))
            f.write('Mean validation score: ' + str(results['mean_test_score'][candidate]) + '\n')
            f.write('std: ' + str(results['std_test_score'][candidate]) + '\n')
            f.write('Parameters: ' + str(results['params'][candidate]) + '\n')
        test_label_predict = random_search.best_estimator_.predict(test_data)
        accuracy = accuracy_score(test_label, test_label_predict)
        print(results)
        print('accuracy = ', accuracy)
        f.write('accuracy = ' + str(accuracy) + '\n' + '\n')
        f.write(str(random_search.best_params_) + '\n' + '\n')
        f.write(str(results))
    return random_search.best_estimator_
def svm():
    vect_and_clf = Pipeline([('vect', TfidfVectorizer(min_df=5)),
                             ('clf', LinearSVC(random_state=0, verbose=10))])
    param_dist = {
        'clf__dual': [True, False],
        'clf__loss': ['hinge', 'squared_hinge'],
        'clf__C': loguniform(1e-3, 1e3),
        'clf__tol': loguniform(1e-11, 1e-4),
        'clf__fit_intercept': [True, False]
    }
    return vect_and_clf, param_dist
def est_ET():
    hp = [{
        'n_estimators': (1, 100),
        'min_weight_fraction_leaf': (0.0, 0.25, 0.5),
        'max_features': ('sqrt', 'log2', 'auto', None),
        'max_samples': loguniform(1, 1000),
        'bootstrap': (True, False),
        'oob_score': (True, False),
        'warm_start': (True, False),
        'criterion': ('mse', 'mae'),
        'max_depth': (1, 10, 100, None),
        'max_leaf_nodes': (2, 100),
        'min_samples_split': (10, ),
        'min_samples_leaf': loguniform(1, 100),
    }]
    est = ensemble.ExtraTreesRegressor()
    # regr = MultiOutputRegressor(estimator=est)
    return est, hp
def SVM(x_train, y_train, x_test, y_test):
    params = {
        'C': loguniform(1e0, 1e3),
        'gamma': loguniform(1e-4, 1e-3),
        'kernel': ['rbf', 'linear'],
        'class_weight': ['balanced', None]
    }
    svm = SVC()
    # the continuous loguniform distributions cannot be enumerated by GridSearchCV,
    # so sample them with RandomizedSearchCV instead
    clf = RandomizedSearchCV(svm, param_distributions=params)
    clf.fit(x_train, y_train)
    svm_predictions = clf.predict(x_test)
    accuracy = accuracy_score(y_test, svm_predictions)
    print("Accuracy: ", accuracy)
    print(clf.best_params_)
    return accuracy
def Bag(dataset, estimator1, estimator2, estimator3, estimator4, estimator5):
    vect_and_clf = Pipeline([('vect', TfidfVectorizer()),
                             ('clf', BaggingClassifier(random_state=0))])
    param_dist = {
        'clf__base_estimator': [None, estimator1, estimator2, estimator3, estimator4,
                                estimator5, MultinomialNB()],
        'clf__n_estimators': stats.randint(10, 400),
        'clf__max_features': stats.uniform(0, 1),
        'clf__max_samples': stats.uniform(0, 1),
        'clf__bootstrap': [True, False],
        'clf__bootstrap_features': [True, False],
        'clf__oob_score': [True, False],
        'clf__warm_start': [True, False],
        'vect__min_df': loguniform(1e-4, 1e-2),
        'vect__max_df': np.linspace(0.5, 0.9, num=10, dtype=float),
        'vect__stop_words': [None, 'english'],
        'vect__token_pattern': [r'\w{2,}', r'\w{1,}'],
        'vect__ngram_range': [(1, 2), (1, 1)]
    }
    n_iter_search = 200
    random_search = RandomizedSearchCV(vect_and_clf,
                                       param_distributions=param_dist,
                                       n_iter=n_iter_search, cv=5, n_jobs=-1)
    train_data, test_data, train_label, test_label = get_dataset()
    start = time()
    random_search.fit(train_data, train_label)
    print("RandomizedSearchCV took %.2f seconds" % (time() - start))
    results = random_search.cv_results_
    candidates = np.flatnonzero(results['rank_test_score'] == 1)
    with open('/Users/tianchima/Desktop/Trial2/Bag.txt', 'w') as f:
        for candidate in candidates:
            print("Mean validation score: {0:.3f} (std: {1:.3f})".format(
                results['mean_test_score'][candidate],
                results['std_test_score'][candidate]))
            print("Parameters: {0}".format(results['params'][candidate]))
            f.write('Mean validation score: ' + str(results['mean_test_score'][candidate]) + '\n')
            f.write('std: ' + str(results['std_test_score'][candidate]) + '\n')
            f.write('Parameters: ' + str(results['params'][candidate]) + '\n')
        test_label_predict = random_search.best_estimator_.predict(test_data)
        accuracy = accuracy_score(test_label, test_label_predict)
        print(results)
        print('accuracy = ', accuracy)
        f.write('accuracy = ' + str(accuracy) + '\n' + '\n')
        f.write(str(random_search.best_params_) + '\n' + '\n')
        f.write(str(results))
    return
def get_ensemble_model(params):
    """Output a nonlinear XGBoost regressor with randomised parameter search over nested 5-fold CV.

    params: dict, containing details on PCA if required
    returns:
        model: sklearn estimator
    """
    ss = StandardScaler()
    xgb_reg = xgb.XGBRegressor(objective="reg:squarederror", n_jobs=1, base_score=12,
                               learning_rate=0.05, random_state=42)
    if params['pca']:
        pca = PCA(n_components=params['pca_comps'], whiten=True)
        xgb_model = Pipeline(steps=[('scale', ss), ('pca', pca), ('model', xgb_reg)])  # pipeline
    else:
        xgb_model = Pipeline(steps=[('scale', ss), ('model', xgb_reg)])

    xgb_model_params = {
        "model__n_estimators": [100, 250, 500],
        "model__colsample_bytree": uniform(0.5, 0.5),  # default 1
        "model__min_child_weight": randint(1, 6),  # default 1
        "model__max_depth": randint(2, 5),  # default 3, typical range 3-10
        "model__subsample": uniform(0.5, 0.5),  # default 1
        "model__reg_lambda": loguniform(1e1, 1e2)  # l2 reg, default 1
    }

    # model: regressor with randomised parameter search over nested 5-fold CV
    # (more iters to account for the large space)
    ensemble_model = RandomizedSearchCV(xgb_model, xgb_model_params, n_iter=500, cv=5,
                                        verbose=1, n_jobs=5)
    return clone(ensemble_model)
def get_linear_model(params):
    """Output a sparse linear regressor with randomised parameter search over nested 5-fold CV.

    params: dict, containing details on PCA if required
    returns:
        model: sklearn estimator
    """
    ss = StandardScaler()
    lr = ElasticNet(selection='random', random_state=42)  # EN
    if params['pca']:
        pca = PCA(n_components=params['pca_comps'], whiten=True)
        lr_model = Pipeline(steps=[('scale', ss), ('pca', pca), ('model', lr)])  # pipeline
    else:
        lr_model = Pipeline(steps=[('scale', ss), ('model', lr)])  # pipeline

    lr_model_params = {
        'model__alpha': loguniform(1e-1, 1e3),
        'model__l1_ratio': uniform(0.1, .9)
    }

    # model: regressor with randomised parameter search over nested 5-fold CV
    linear_model = RandomizedSearchCV(lr_model, lr_model_params, n_iter=500, cv=5)
    return clone(linear_model)
def get_ensemble_model():
    """Output a nonlinear XGBoost classifier with randomised parameter search over nested 3-fold CV.

    returns:
        model: sklearn estimator
    """
    ss = StandardScaler()
    xgb_clf = xgb.XGBClassifier(objective="binary:logistic", random_state=42)
    xgb_model = Pipeline(steps=[('scale', ss), ('clf', xgb_clf)])

    xgb_model_params = {
        "clf__colsample_bytree": uniform(0.5, 0.5),  # default 1
        "clf__gamma": loguniform(1e-1, 1e3),  # default 0
        "clf__learning_rate": uniform(0.03, 0.57),  # default 0.3
        "clf__max_depth": randint(2, 5),  # default 3
        "clf__n_estimators": randint(10, 50),  # default 100
        "clf__subsample": uniform(0.5, 0.25),  # default 1
        "clf__min_child_weight": randint(1, 8)  # default 1
    }

    # model: classifier with randomised parameter search over nested 3-fold CV
    # (more iters to account for the large space)
    ensemble_model = RandomizedSearchCV(xgb_model, xgb_model_params, n_iter=250, cv=3)
    return clone(ensemble_model)
def lr():
    vect_and_clf = Pipeline([('vect', TfidfVectorizer(min_df=5)),
                             ('clf', LogisticRegression(random_state=0))])
    param_dist = {
        'clf__penalty': ['l1', 'l2', 'elasticnet', 'none'],
        'clf__dual': [True, False],
        'clf__C': loguniform(1e-3, 1e3),
        'clf__tol': loguniform(1e-11, 1e-4),
        'clf__fit_intercept': [True, False],
        'clf__solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
        'clf__max_iter': stats.randint(50, 200),
        'clf__warm_start': [True, False],
        'clf__multi_class': ['auto', 'ovr', 'multinomial'],
        'clf__l1_ratio': stats.uniform(0, 1)
    }
    return vect_and_clf, param_dist
def test_loguniform(low, high, base):
    rv = loguniform(base**low, base**high)
    assert isinstance(rv, scipy.stats._distn_infrastructure.rv_frozen)
    rvs = rv.rvs(size=2000, random_state=0)

    # Test the basics; right bounds, right size
    assert (base**low <= rvs).all() and (rvs <= base**high).all()
    assert len(rvs) == 2000

    # Test that it's actually (fairly) uniform
    log_rvs = np.array([math.log(x, base) for x in rvs])
    counts, _ = np.histogram(log_rvs)
    assert counts.mean() == 200
    assert np.abs(counts - counts.mean()).max() <= 40

    # Test that random_state works
    assert loguniform(base**low, base**high).rvs(random_state=0) == loguniform(
        base**low, base**high).rvs(random_state=0)
def test_sparse_methods():
    c1 = [0.1, 0.2]
    c2 = [0.1, 0.2]
    param_grid = {"c": [c1, c2]}
    pmd_cv = GridSearchCV(PMD(random_state=rng), param_grid=param_grid).fit([X, Y])
    cv_plot(pmd_cv.cv_results_)

    c1 = [5e-1]
    c2 = [1e-1]
    param_grid = {"c": [c1, c2]}
    scca_cv = GridSearchCV(SCCA(random_state=rng), param_grid=param_grid).fit([X, Y])

    c1 = [1e-1]
    c2 = [1e-1]
    param_grid = {"c": [c1, c2]}
    parkhomenko_cv = GridSearchCV(ParkhomenkoCCA(random_state=rng), param_grid=param_grid).fit([X, Y])

    c1 = [2e-2]
    c2 = [1e-2]
    param_grid = {"c": [c1, c2]}
    admm_cv = GridSearchCV(SCCA_ADMM(random_state=rng), param_grid=param_grid).fit([X, Y])

    c1 = loguniform(1e-1, 2e-1)
    c2 = loguniform(1e-1, 2e-1)
    param_grid = {"c": [c1, c2], "l1_ratio": [[0.9], [0.9]]}
    elastic_cv = RandomizedSearchCV(ElasticCCA(random_state=rng),
                                    param_distributions=param_grid,
                                    n_iter=4).fit([X, Y])

    assert (pmd_cv.best_estimator_.weights[0] == 0).sum() > 0
    assert (pmd_cv.best_estimator_.weights[1] == 0).sum() > 0
    assert (scca_cv.best_estimator_.weights[0] == 0).sum() > 0
    assert (scca_cv.best_estimator_.weights[1] == 0).sum() > 0
    assert (admm_cv.best_estimator_.weights[0] == 0).sum() > 0
    assert (admm_cv.best_estimator_.weights[1] == 0).sum() > 0
    assert (parkhomenko_cv.best_estimator_.weights[0] == 0).sum() > 0
    assert (parkhomenko_cv.best_estimator_.weights[1] == 0).sum() > 0
    assert (elastic_cv.best_estimator_.weights[0] == 0).sum() > 0
    assert (elastic_cv.best_estimator_.weights[1] == 0).sum() > 0
def get_nonlinear_model():
    """Output a nonlinear SVM classifier with randomised parameter search over nested 3-fold CV.

    returns:
        model: sklearn estimator
    """
    ss = StandardScaler()
    svm = SVC(kernel='rbf', probability=True, random_state=42)  # kernel SVM
    svm_model = Pipeline(steps=[('scale', ss), ('clf', svm)])

    svm_model_params = {
        'clf__C': loguniform(1e-3, 1e3),
        'clf__gamma': loguniform(1e-4, 1e1)
    }

    # model: classifier with randomised parameter search over nested 3-fold CV
    nonlinear_model = RandomizedSearchCV(svm_model, svm_model_params, n_iter=100, cv=3)
    return clone(nonlinear_model)
def get_linear_model():
    """Output a linear classifier with randomised parameter search over nested 3-fold CV.

    returns:
        model: sklearn estimator
    """
    ss = StandardScaler()
    lr = LogisticRegression(penalty='l2', max_iter=1000, class_weight=None)  # ridge
    lr_model = Pipeline(steps=[('scale', ss), ('clf', lr)])  # pipeline

    lr_model_params = {'clf__C': loguniform(1e-3, 1e3)}

    # model: classifier with randomised parameter search over nested 3-fold CV
    linear_model = RandomizedSearchCV(lr_model, lr_model_params, n_iter=100, cv=3)
    return clone(linear_model)
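# The get_*_model() helpers above each return a RandomizedSearchCV object, so the
# "nested CV" mentioned in their docstrings just means wrapping that inner search in an
# outer cross-validation loop. A minimal sketch of that pattern under assumed
# placeholder data (X and y are not defined in these snippets):
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score

X, y = make_classification(n_samples=200, n_features=20, random_state=0)  # placeholder data
outer_scores = cross_val_score(get_linear_model(), X, y, cv=5, n_jobs=-1)
print("nested CV accuracy: %.3f +/- %.3f" % (outer_scores.mean(), outer_scores.std()))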
def perform_hyperparameter_tuning(self, X, y, model_name='ridge', n_values=100):
    if model_name == 'ridge':
        # model = Ridge()
        # reg_pipeline = Pipeline([('scaler', MinMaxScaler()),
        #                          ('Ridge', Ridge())])
        # param_grid = [{'alpha': np.logspace(-5, 5, 100)}]
        param_dist = {'alpha': loguniform(1e-5, 1e0)}
        clf = RandomizedSearchCV(estimator=Ridge(normalize=True),
                                 param_distributions=param_dist,
                                 n_iter=50,
                                 n_jobs=10,
                                 random_state=self.random_state)
        clf.fit(X, y)
        return clf.best_params_
    else:
        print("Only supporting Ridge for now")
def dt():
    vect_and_clf = Pipeline([('vect', TfidfVectorizer(min_df=5)),
                             ('clf', DecisionTreeClassifier(random_state=0))])
    param_dist = {
        'clf__criterion': ['gini', 'entropy'],
        'clf__splitter': ['best', 'random'],
        'clf__max_depth': np.arange(2, 50, dtype=int).tolist() + [None],
        'clf__min_samples_split': loguniform(1e-4, 1e-1),
        'clf__min_samples_leaf': stats.randint(1, 200),
        'clf__max_features': np.linspace(0.01, 1, num=10, dtype=float).tolist() + ['auto', 'sqrt', 'log2', None],
        'clf__max_leaf_nodes': [None] + np.array(np.power(10, np.arange(1, 4, step=0.5)), dtype=int).tolist(),
        'clf__min_impurity_decrease': [0.0] + np.array(np.power(10, np.arange(-10, -4, step=0.5)), dtype=float).tolist()
    }
    return vect_and_clf, param_dist
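# Each helper above (forest, svm, lr, dt) returns a (pipeline, param_dist) pair rather
# than running the search itself. A minimal sketch of wiring one pair into
# RandomizedSearchCV, assuming the text training split train_data / train_label used in
# the surrounding snippets:
vect_and_clf, param_dist = dt()
search = RandomizedSearchCV(vect_and_clf,
                            param_distributions=param_dist,
                            n_iter=50, cv=5, n_jobs=-1, random_state=0)
search.fit(train_data, train_label)
print(search.best_score_)
print(search.best_params_)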
# fuzzy options to test:
fuzzy_options = ["normal", "fuzzy_dist", "fuzzy_err"]
# Features:
features = ["N2-N4", "N3-N4", "N2-N3", "Y-N4", "Z-N4", "G-I", "G-R", "I-N4"]
fuzzy_dist_column = ["fuzzy_dist"]
fuzzy_err_column = ["fuzzy_err"]
output_path = "./results"

# ------------------------------------ TRAINING: --------------------------------------
# scale features of the data:
train_X, general_X = training_utils.scale_X_of_the_data(training_data[features],
                                                        general_data[features])
params = {'C': loguniform(1e0, 1e3), 'gamma': loguniform(1e-4, 1e-2)}

for fuzzy_option in fuzzy_options:
    print(fuzzy_option)
    clf = svm.SVC(gamma='scale', kernel='rbf', probability=True,
                  class_weight='balanced', cache_size=5000, random_state=476)
    clf_for_eval = svm.SVC(gamma='scale', kernel='rbf',
# Our kernel has two parameters: the length-scale and the periodicity. For our
# dataset, we use `sin` as the generative process, implying a
# :math:`2 \pi`-periodicity for the signal. The default value of the parameter
# being :math:`1`, it explains the high frequency observed in the predictions of
# our model. Similar conclusions could be drawn for the length-scale parameter.
# This tells us that the kernel parameters need to be tuned. We will use a
# randomized search to tune the different parameters of the kernel ridge model:
# the `alpha` parameter and the kernel parameters.

# %%
from sklearn.model_selection import RandomizedSearchCV
from sklearn.utils.fixes import loguniform

param_distributions = {
    "alpha": loguniform(1e0, 1e3),
    "kernel__length_scale": loguniform(1e-2, 1e2),
    "kernel__periodicity": loguniform(1e0, 1e1),
}
kernel_ridge_tuned = RandomizedSearchCV(
    kernel_ridge,
    param_distributions=param_distributions,
    n_iter=500,
    random_state=0,
)
start_time = time.time()
kernel_ridge_tuned.fit(training_data, training_noisy_target)
print(f"Time for KernelRidge fitting: {time.time() - start_time:.3f} seconds")

# %%
# Fitting the model is now more computationally expensive since we have to try
    'input_scaling': 0.4,
    'bias_scaling': 0.0,
    'spectral_radius': 0.0,
    'reservoir_activation': 'tanh',
    'leakage': 1.0,
    'bidirectional': False,
    'k_rec': 10,
    'alpha': 1e-3,
    'random_state': 42
}

step1_esn_params = {
    'input_scaling': uniform(loc=1e-2, scale=1),
    'spectral_radius': uniform(loc=0, scale=2)
}
step2_esn_params = {'leakage': loguniform(1e-5, 1e0)}
step3_esn_params = {'bias_scaling': np.linspace(0.0, 1.0, 11)}
step4_esn_params = {'alpha': loguniform(1e-5, 1e1)}

kwargs_step1 = {
    'n_iter': 200,
    'random_state': 42,
    'verbose': 1,
    'n_jobs': -1,
    'scoring': make_scorer(mean_squared_error, greater_is_better=False, needs_proba=True)
}
kwargs_step2 = {
    'n_iter': 50,
    'random_state': 42,
    'verbose': 1,
    'n_jobs': -1,
    'scoring': make_scorer(mean_squared_error, greater_is_better=False, needs_proba=True)
}
kwargs_step3 = {
    'verbose': 1,
    'n_jobs': -1,
        gs.best_estimator_,
        "%s_models/%s_%s_regressor_best_estimator.pk" % (method, method, data_type))
    return (gs)


# +
if classification_task:
    model = svm.SVC(max_iter=10000)
else:
    model = svm.SVR(max_iter=10000)

# Grid parameters
param_svm = [
    {
        'C': loguniform(1e-1, 1e4),
        'kernel': ['poly', 'rbf'],
        'gamma': loguniform(1e-4, 1e1)
    },
]

n_iter = 200
scaler = preprocessing.MinMaxScaler()
X_train_copy = scaler.fit_transform(X_train)

if classification_task:
    svm_gs = supervised_learning_steps("svm", "roc_auc", data_type,
                                       classification_task, model, param_svm,
                                       X_train_copy, y_train, n_iter)
else:
    svm_gs = supervised_learning_steps("svm", "r2", data_type,
for i in range(1, n_top + 1):
    candidates = np.flatnonzero(results['rank_test_score'] == i)
    for candidate in candidates:
        print("Model with rank: {0}".format(i))
        print("Mean validation score: {0:.3f} (std: {1:.3f})".format(
            results['mean_test_score'][candidate],
            results['std_test_score'][candidate]))
        print("Parameters: {0}".format(results['params'][candidate]))
        print("")


# specify parameters and distributions to sample from
param_dist = {
    'average': [True, False],
    'l1_ratio': stats.uniform(0, 1),
    'alpha': loguniform(1e-4, 1e0)
}

# run randomized search
n_iter_search = 20
random_search = RandomizedSearchCV(clf, param_distributions=param_dist,
                                   n_iter=n_iter_search)

start = time()
random_search.fit(X, y)
print("RandomizedSearchCV took %.2f seconds for %d candidates"
      " parameter settings." % ((time() - start), n_iter_search))
report(random_search.cv_results_)

# use a full grid over all parameters
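# The snippet above stops at "use a full grid over all parameters"; a minimal sketch of
# that grid-search counterpart, assuming the same clf, X, y and report() as above, that
# GridSearchCV is imported alongside RandomizedSearchCV, and with the continuous
# distributions replaced by explicit value lists:
param_grid = {
    'average': [True, False],
    'l1_ratio': np.linspace(0, 1, num=10),
    'alpha': np.power(10, np.arange(-4, 1, dtype=float))
}
grid_search = GridSearchCV(clf, param_grid=param_grid)
start = time()
grid_search.fit(X, y)
print("GridSearchCV took %.2f seconds for %d candidate parameter settings."
      % (time() - start, len(grid_search.cv_results_['params'])))
report(grid_search.cv_results_)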
    'leakage': 1.0,
    'k_rec': 10,
    'reservoir_activation': 'tanh',
    'bidirectional': False,
    'alpha': 1e-3,
    'random_state': 42
}

step1_esn_params = {
    'input_scaling': uniform(loc=1e-2, scale=1),
    'spectral_radius': uniform(loc=0, scale=2)
}
step2_esn_params = {'leakage': uniform(1e-5, 1e0)}
step3_esn_params = {'bias_scaling': uniform(loc=0, scale=3)}
step4_esn_params = {'alpha': loguniform(1e-5, 1e1)}

kwargs_step1 = {
    'n_iter': 200,
    'random_state': 42,
    'verbose': 1,
    'n_jobs': -1,
    'scoring': gpe_scorer
}
kwargs_step2 = {
    'n_iter': 50,
    'random_state': 42,
    'verbose': 1,
    'n_jobs': -1,
    'scoring': gpe_scorer
}
X = crime_filtered.loc[:, 'population':'PolicBudgPerPop']
imputer = KNNImputer(n_neighbors=10, weights='distance')
X = imputer.fit_transform(X)
Y = crime_filtered.loc[:, 'larcPerPop']
train_x, test_x, train_y, test_y = train_test_split(X, Y, test_size=.3, random_state=42)

bds = [{'name': 'alpha', 'type': 'continuous', 'domain': (1e-10, 10)},
       {'name': 'l1_ratio', 'type': 'continuous', 'domain': (0, 1)}]

param_dist = {"alpha": loguniform(1e-10, 10e0),
              "l1_ratio": uniform(0, 1)}

enet = ElasticNet(max_iter=10000, normalize=True)
baseline = cross_val_score(enet, train_x, train_y, scoring='r2', cv=10).mean()

rs = RandomizedSearchCV(enet,
                        param_distributions=param_dist,
                        scoring='r2',
                        n_jobs=-1,
                        verbose=2,
def getHyperParamBTC(typeofrun=1):
    if typeofrun == 1:
        # BitcoinTransformer
        param_dist = {
            'classify__max_depth': [80, 90, 100, 110],
            'classify__max_features': [2, 3],
            'classify__min_samples_leaf': [3, 4, 5],
            'classify__min_samples_split': [8, 10, 12],
            'classify__n_estimators': [100, 200, 300, 1000],
            'BitcoinTransformer__adosc_fastperiod': loguniform(2, 100),
            'BitcoinTransformer__adosc_slowperiod': loguniform(2, 100),
            'BitcoinTransformer__adx_period': loguniform(2, 100),
            'BitcoinTransformer__adxr_period': loguniform(2, 100),
            'BitcoinTransformer__apo_fastperiod': loguniform(2, 100),
            'BitcoinTransformer__apo_slowperiod': loguniform(2, 100),
            'BitcoinTransformer__aroon_period': loguniform(2, 100),
            'BitcoinTransformer__aroonosc_period': loguniform(2, 100),
            'BitcoinTransformer__bb_periods': loguniform(2, 100),
            'BitcoinTransformer__cci_periods': loguniform(2, 100),
            'BitcoinTransformer__cmo_period': loguniform(2, 100),
            'BitcoinTransformer__dema_period': loguniform(2, 100),
            'BitcoinTransformer__dx_period': loguniform(2, 100),
            'BitcoinTransformer__ema_period': loguniform(2, 100),
            'BitcoinTransformer__kama_period': loguniform(2, 100),
            'BitcoinTransformer__ma_period': loguniform(2, 100),
            'BitcoinTransformer__macd_period_longterm': loguniform(2, 100),
            'BitcoinTransformer__macd_period_shortterm': loguniform(2, 100),
            'BitcoinTransformer__macd_period_to_signal': loguniform(2, 100),
            'BitcoinTransformer__mean_o_c_period': loguniform(2, 100),
            'BitcoinTransformer__mfi_period': loguniform(2, 100),
            'BitcoinTransformer__midpoint_period': loguniform(2, 100),
            'BitcoinTransformer__midprice_period': loguniform(2, 100),
            'BitcoinTransformer__minus_di_period': loguniform(2, 100),
            'BitcoinTransformer__minus_dm_period': loguniform(2, 100),
            'BitcoinTransformer__momentum_period': loguniform(2, 100),
            'BitcoinTransformer__plus_di_period': loguniform(2, 100),
            'BitcoinTransformer__plus_dm_period': loguniform(2, 100),
            'BitcoinTransformer__ppo_fastperiod': loguniform(2, 100),
            'BitcoinTransformer__ppo_slowperiod': loguniform(2, 100),
            'BitcoinTransformer__roc_period': loguniform(2, 100),
            'BitcoinTransformer__rocp_period': loguniform(2, 100),
            'BitcoinTransformer__rocr100_period': loguniform(2, 100),
            'BitcoinTransformer__rocr_period': loguniform(2, 100),
            'BitcoinTransformer__rsi_period': loguniform(2, 100),
            'BitcoinTransformer__sar_acceleration': loguniform(2, 100),
            'BitcoinTransformer__sar_maximum': loguniform(2, 100),
            'BitcoinTransformer__sarext_accelerationinitlong': loguniform(2, 100),
            'BitcoinTransformer__sarext_accelerationinitshort': loguniform(2, 100),
            'BitcoinTransformer__sarext_accelerationlong': loguniform(2, 100),
            'BitcoinTransformer__sarext_accelerationmaxlong': loguniform(2, 100),
            'BitcoinTransformer__sarext_accelerationmaxshort': loguniform(2, 100),
            'BitcoinTransformer__sarext_accelerationshort': loguniform(2, 100),
            'BitcoinTransformer__sarext_offsetonreverse': loguniform(2, 100),
            'BitcoinTransformer__sarext_startvalue': loguniform(2, 100),
            'BitcoinTransformer__sma_close_timeperiod': loguniform(2, 100),
            'BitcoinTransformer__sma_h_l_c_o_period': loguniform(2, 100),
            'BitcoinTransformer__sma_handl_period': loguniform(2, 100),
            'BitcoinTransformer__sma_high_period': loguniform(2, 100),
            'BitcoinTransformer__sma_low_period': loguniform(2, 100),
            'BitcoinTransformer__so_d_n': loguniform(2, 100),
            'BitcoinTransformer__so_n': loguniform(2, 100),
            'BitcoinTransformer__t3_period': loguniform(2, 100),
            'BitcoinTransformer__tema_period': loguniform(2, 100),
            'BitcoinTransformer__trima_period': loguniform(2, 100),
            'BitcoinTransformer__trix_period': loguniform(2, 100),
            'BitcoinTransformer__ultosc_period1': loguniform(2, 100),
            'BitcoinTransformer__ultosc_period2': loguniform(2, 100),
            'BitcoinTransformer__ultosc_period3': loguniform(2, 100),
            'BitcoinTransformer__var_close_period': loguniform(2, 100),
            'BitcoinTransformer__var_open_period': loguniform(2, 100),
            'BitcoinTransformer__wma_period': loguniform(2, 100),
            'BitcoinTransformer__wr_lookback_period': loguniform(2, 100)
        }
    elif typeofrun == 2:
        # ParameterRelationsBTCTrans
        param_dist = {
            'classify__max_depth': [80, 90, 100, 110],
            'classify__max_features': [2, 3],
            'classify__min_samples_leaf': [3, 4, 5],
            'classify__min_samples_split': [8, 10, 12],
            'classify__n_estimators': [100, 200, 300, 1000],
            'BitcoinTransformer__fastperiod': loguniform(2, 100),
            'BitcoinTransformer__longterm': loguniform(2, 100),
            'BitcoinTransformer__midterm': loguniform(2, 100),
            'BitcoinTransformer__shortterm': loguniform(2, 100),
            'BitcoinTransformer__bb_cci': loguniform(2, 100),
            'BitcoinTransformer__var_t3': loguniform(2, 100),
            'BitcoinTransformer__dema_trema': loguniform(2, 100),
            'BitcoinTransformer__zero': loguniform(2, 100),
            'BitcoinTransformer__rocperiod': loguniform(2, 100)
        }
    elif typeofrun == 3:
        # Only Random Forest hyperparameters
        param_dist = {
            'classify__max_depth': [80, 90, 100, 110],
            'classify__max_features': [2, 3],
            'classify__min_samples_leaf': [3, 4, 5],
            'classify__min_samples_split': [8, 10, 12],
            'classify__n_estimators': [100, 200, 300, 1000],
        }
    return param_dist
# Hyperparameter ranges / distributions that should be considered during the random search
PARAM_SEARCH = {
    "kernel": ["RFB", "Matern12", "Matern32", "Matern52", "RQ"],
    "n_neighbors": np.arange(5, 50, 5),
    "n_inducing_points": np.arange(10, 100, 10),
    "coeff": np.linspace(0.5, 4, 10),
    "n_components": range(2, 20),
    "hidden_sizes": [
        [hidden_size] * num_layers
        for hidden_size in [25, 30, 50, 75, 100]
        for num_layers in range(1, 4)
    ],
    "latent_dim": [5, 10, 15, 20],
    "batch_size": [64, 128, 256],
    "lr": loguniform(1e-4, 0.1),
    # Intervals become [loc, loc + scale] for uniform
    "C": [10 ** i for i in range(0, 5)],  # Regularization for logistic regression baseline
    "n_mix_components": range(1, 11),
    # Intervals become [loc, loc + scale] for uniform
    "dropout_rate": uniform(loc=0, scale=0.5),  # [0, 0.5]
    "posterior_rho_init": uniform(loc=-8, scale=6),  # [-8, -2]
    "posterior_mu_init": uniform(loc=-0.6, scale=1.2),  # [-0.6, 0.6]
    "prior_pi": uniform(loc=0.1, scale=0.8),  # [0.1, 0.9]
    "prior_sigma_1": [np.exp(d) for d in np.arange(-0.8, 0, 0.1)],
    "prior_sigma_2": [np.exp(d) for d in np.arange(-0.8, 0, 0.1)],
    "reconstr_error_weight": loguniform(0.01, 0.9),
    "anneal": [True, False],
    "beta": uniform(loc=0.1, scale=2.4),  # [0.1, 2.5]
}
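# The comments above note that scipy's uniform is parameterised as [loc, loc + scale],
# whereas loguniform takes explicit (low, high) bounds. A small sketch of how
# configurations could be drawn from a subset of PARAM_SEARCH; the scipy.stats imports
# and the use of ParameterSampler are assumptions of this sketch, not part of the
# original code:
from scipy.stats import loguniform, uniform
from sklearn.model_selection import ParameterSampler

print(uniform(loc=0, scale=0.5).rvs(random_state=0))   # dropout_rate sample in [0, 0.5]
print(loguniform(1e-4, 0.1).rvs(random_state=0))       # lr sample, log-uniform in [1e-4, 0.1]

subset = {k: PARAM_SEARCH[k] for k in ["lr", "dropout_rate", "batch_size"]}
for config in ParameterSampler(subset, n_iter=3, random_state=0):
    print(config)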
import numpy as np
from vectorizer import load_vectors
from scipy.stats import uniform
from sklearn.utils.fixes import loguniform
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RandomizedSearchCV

X_test_vect, X_train_vect, test, train = load_vectors()

lr_model = LogisticRegression(n_jobs=-1, solver='sag', C=1.55, max_iter=500)
lr_params = dict(tol=loguniform(1e-7, 5e-4))
lr_grid = RandomizedSearchCV(lr_model, lr_params, verbose=5, n_iter=12, n_jobs=-1)
lr_search = lr_grid.fit(X_train_vect, train.label)

print(lr_search.best_params_)
print(lr_search.best_score_)
        #     TransformedTargetRegressor(regressor=SVR(), transformer=StandardScaler()),
        # ),
        # (
        #     "kernel_ridge",
        #     TransformedTargetRegressor(
        #         regressor=KernelRidge(), transformer=StandardScaler()
        #     ),
        # ),
        # ("knn", KNeighborsRegressor()),
        # ("xgb", XGBRegressor(objective="reg:squarederror")),
    ],
    memory="cache",
)

param_distributions = {
    "svr__C": loguniform(50, 200),
    "svr__epsilon": loguniform(1e-4, 1),
    # "knn__n_neighbors": stats.randint(low=2, high=50),
    # "xgb__n_estimators": stats.randint(low=50, high=300),
    # "xgb__max_depth": stats.randint(low=2, high=10),
    # "target_svr__regressor__C": stats.expon(scale=100),
    # "target_svr__regressor__epsilon": stats.expon(),
    # "kernel_ridge__regressor__alpha": loguniform(1, 1e4),
    # "kernel_ridge__regressor__gamma": [0.1],
}

search = RandomizedSearchCV(
    fitting,
    param_distributions,
    n_iter=50,
    n_jobs=2,
).fit(X, y)
candidates = np.flatnonzero(results["rank_test_score"] == i)
for candidate in candidates:
    print("Model with rank: {0}".format(i))
    print("Mean validation score: {0:.3f} (std: {1:.3f})".format(
        results["mean_test_score"][candidate],
        results["std_test_score"][candidate],
    ))
    print("Parameters: {0}".format(results["params"][candidate]))
    print("")


# specify parameters and distributions to sample from
param_dist = {
    "average": [True, False],
    "l1_ratio": stats.uniform(0, 1),
    "alpha": loguniform(1e-2, 1e0),
}

# run randomized search
n_iter_search = 15
random_search = RandomizedSearchCV(clf, param_distributions=param_dist,
                                   n_iter=n_iter_search)

start = time()
random_search.fit(X, y)
print(
    "RandomizedSearchCV took %.2f seconds for %d candidates parameter settings."
    % ((time() - start), n_iter_search))
report(random_search.cv_results_)