def build_params_space(model_type):
    assert_supported(model_type)
    if model_type == 'svr':
        space = {
            'estimator__kernel': ['poly', 'rbf'],
            'estimator__C': loguniform(1e-3, 1e3),
            'estimator__degree': range(1, 8),
        }
    elif model_type == 'lr':
        space = {
            'estimator__alpha': loguniform(1e-3, 1e1),
            'estimator__l1_ratio': uniform(0, 1),
        }
    elif model_type == 'rf':
        space = {
            'estimator__n_estimators': range(50, 200),
            'estimator__max_depth': range(3, 10),
            'estimator__criterion': ['mse', 'mae'],
            'estimator__max_features': ['auto', 'sqrt', 'log2'],
        }
    elif model_type == 'gbm':
        space = {
            'estimator__n_estimators': range(25, 200),
            'estimator__max_depth': range(3, 8),
            'estimator__learning_rate': loguniform(1e-3, 1e-1),
        }
    return space
def otimizar(self):
    # Define the parameters to search over
    parametros = {
        'C': loguniform(2**-5, 2**15),
        'gamma': loguniform(2**-15, 2**3),
        'epsilon': uniform(0.0, 1)
    }
    cv_ = ShuffleSplit(n_splits=1, test_size=0.1, train_size=0.9)
    # Run the hyperparameter optimization
    self.iniciar_tempo()
    randomSCV = RandomizedSearchCV(SVR(kernel='rbf'), parametros,
                                   scoring="neg_mean_absolute_error",
                                   cv=cv_, n_iter=self.num_combinacoes,
                                   n_jobs=-1)
    randomSCV.fit(self.X_treinamento, self.Y_treinamento)
    self.finalizar_tempo()
    # Identify optimal hyperparameter values
    C = randomSCV.best_params_['C']
    gamma = randomSCV.best_params_['gamma']
    epsilon = randomSCV.best_params_['epsilon']
    # Train the final SVM with the parameters found
    self.svm = SVM(gamma, C, epsilon)
    self.svm.treinar(self.X_treinamento, self.Y_treinamento)
    self.svm.testar(self.X_teste, self.Y_teste)
def _get_params_random(model_type, is_cl, with_preprocessing):
    if model_type == "linear":
        if is_cl:
            ml_params = dict(penalty=["l1", "l2"], C=stats.loguniform(1e-5, 10))
        else:
            ml_params = dict(alpha=stats.loguniform(1e-5, 10))
    else:
        ml_params = dict(max_depth=list(range(5, 16)))
    return _convert_ml_params(ml_params) if with_preprocessing else ml_params
def fit_model_Randomize(self, X_train, y_train):
    # Create pipeline
    pipeline = Pipeline([
        ('tfidf', TfidfVectorizer(lowercase=False)),
        ('model', MultinomialNB()),
    ])
    parameters = {
        'tfidf__ngram_range': [(1, 1), (2, 2), (1, 2)],
        'tfidf__min_df': stats.loguniform(0.01, 0.2),
        'tfidf__max_df': stats.loguniform(0.01, 0.3),
        'tfidf__norm': ['l1', 'l2'],
        'model__alpha': stats.uniform(0.5, 1),
    }
    # Perform randomized search on the pipeline
    grid_search = RandomizedSearchCV(pipeline, parameters, n_jobs=-1,
                                     verbose=1, scoring="accuracy",
                                     cv=5, n_iter=200, refit=True)
    grid_search.fit(X_train, y_train)
    return grid_search.best_estimator_
def get_param_dist(model_name):
    """Get the parameter distribution for each machine learning model for random search."""
    if model_name == 'logistic_regression':
        param_dist = dict(C=loguniform(1e-6, 1e+6))
    elif model_name == 'elastic_net':
        param_dist = dict(l1_ratio=uniform(0, 1), C=loguniform(1e-6, 1e+6))
    elif model_name == 'svm':
        kernel = ['linear', 'poly', 'rbf']
        param_dist = dict(C=loguniform(1e-3, 1e+3), kernel=kernel)
    elif model_name == 'random_forest':
        max_features = ["auto", "log2"]
        param_dist = dict(n_estimators=range(100, 1001), max_features=max_features)
    elif model_name == 'gradient_boost':
        param_dist = dict(learning_rate=uniform(0, 1),
                          subsample=uniform(0.1, 0.9),
                          max_depth=range(0, 11),
                          min_child_weight=range(0, 26))
    else:
        raise ValueError(
            f'The entered model "{model_name}" was not found. Please check that you have chosen a valid model.'
        )
    return param_dist
def get_random_size(random_state=None, larger=False):
    rng = check_random_state(random_state)
    rv = loguniform(3, 500)
    x1, x2 = rv.rvs(size=2, random_state=rng).astype(int)
    h1 = loguniform(3, 70).rvs(random_state=rng).astype(int)
    h2 = h1 * int(rng.uniform(1, 3))
    if larger:
        h1, h2, x1, x2 = np.sort([h1, h2, x1, x2])
    return (x1, x2), (h1, h2)
def svm_parameter_space():
    param_grid = [{
        'kernel': ['linear'],
        'C': loguniform(0.001, 10),
    }, {
        'kernel': ['rbf'],
        'C': loguniform(0.1, 1000),
        'gamma': loguniform(0.0001, 1.0),
    }]
    return param_grid
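# A minimal usage sketch for the space above (illustrative only, not part of
# the original snippet): RandomizedSearchCV also accepts a *list* of parameter
# dicts, picking one uniformly at random each iteration before sampling from
# its distributions, so the linear and rbf sub-spaces are explored separately.
from scipy.stats import loguniform
from sklearn.datasets import make_classification
from sklearn.model_selection import RandomizedSearchCV
from sklearn.svm import SVC

X, y = make_classification(n_samples=200, random_state=0)
search = RandomizedSearchCV(SVC(), svm_parameter_space(),
                            n_iter=20, cv=3, random_state=0)
search.fit(X, y)
print(search.best_params_)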
def sweep():
    dataset = pd.read_csv("data/train.csv")
    Y = dataset["Survived"]
    X = dataset.drop(["Survived"], axis=1)
    pipe = Pipeline([('feature_removal', ppcs.get_feature_removal()),
                     ('col_t', ppcs.get_col_transf()),
                     ('model', ensemble.AdaBoostClassifier())])
    param_dist = [{
        'model': [LogisticRegression()],
        'model__C': expon(scale=1),
        'col_t__num__poly': [PolynomialFeatures(degree=2)]
    }, {
        'model': [LinearSVC()],
        'model__C': loguniform(1e-6, 1e6),
        'model__max_iter': [5000]
    }, {
        'model': [SVC()],
        'model__C': loguniform(1e-6, 1e6),
        'model__kernel': ['rbf', 'poly'],
        'model__gamma': ['scale', 'auto']
    }, {
        'model': [KNeighborsClassifier()],
        'model__n_neighbors': range(1, 10),
        'model__weights': ['uniform', 'distance']
    }, {
        'model': [ensemble.RandomForestClassifier()],
        'model__n_estimators': [10, 30, 100, 300, 1000, 3000],
        'model__criterion': ['gini', 'entropy'],
        'model__min_samples_split': range(2, 30),
    }, {
        'model': [ensemble.AdaBoostClassifier()]
    }, {
        'model': [ensemble.GradientBoostingClassifier()],
        'model__loss': ['deviance', 'exponential'],
        'model__n_estimators': [10, 30, 100, 300, 1000, 3000],
        'model__min_samples_split': range(2, 30),
    }]
    search = RandomizedSearchCV(pipe, param_dist, n_iter=100, cv=3, n_jobs=2,
                                verbose=1, random_state=42,
                                return_train_score=True, scoring='accuracy')
    search.fit(X, Y)
    dump(search.cv_results_, "models/results2.joblib")
def test_gp_samples_to_params():
    space = {
        'a': range(10),
        'b': uniform(-10, 20),
        'c': ['cat1', 1, 'cat2'],
        'e': [1, 2, 3],
        'f': ['const'],
        'g': loguniform(0.001, 100),
        'h': [10]
    }
    X = np.array([
        # 4, -8, 'cat2', 1, 'const', 1, 10
        [0.4444, 0.1, 0, 0, 1, 0, 1, 0.6, 0],
        # 0, -10.0, 'cat1', 3, 'const', 0.001, 10
        [0.0, 0.0, 1, 0, 0, 1, 1, 0.0, 0],
        # 9, 10.0, 1, 2, 'const', 100, 10
        [1.0, 1.0, 0, 1, 0, 0.5, 1, 1.0, 0],
    ])
    expected = [
        dict(a=4, b=-8.0, c='cat2', e=1, f='const', g=1, h=10),
        dict(a=0, b=-10.0, c='cat1', e=3, f='const', g=.001, h=10),
        dict(a=9, b=10.0, c=1, e=2, f='const', g=100, h=10),
    ]
    ds = domain_space(space, domain_size=1000)
    params = ds.convert_to_params(X)
    for act, exp in zip(params, expected):
        for k, v in act.items():
            if k == 'g':
                assert np.isclose(v, exp[k])
            else:
                assert v == exp[k]
def test_keras(c, s, a, b):
    # Mirror the mnist dataset
    X, y = make_classification(n_classes=10, n_features=784, n_informative=100)
    X = X.astype("float32")
    assert y.dtype == np.dtype("int64")

    model = KerasClassifier(build_fn=_keras_build_fn, lr=0.01, verbose=False)
    params = {"lr": loguniform(1e-3, 1e-1)}

    search = IncrementalSearchCV(model, params, max_iter=3,
                                 n_initial_parameters=5, decay_rate=None)
    yield search.fit(X, y)
    assert search.best_score_ >= 0

    # Make sure the model trains, and scores aren't constant
    scores = {
        ident: [h["score"] for h in hist]
        for ident, hist in search.model_history_.items()
    }
    assert all(len(hist) == 3 for hist in scores.values())
    nuniq_scores = [pd.Series(v).nunique() for v in scores.values()]
    assert max(nuniq_scores) > 1
async def test_pytorch(c, s, a, b):
    n_features = 10
    defaults = {
        "callbacks": False,
        "warm_start": False,
        "train_split": None,
        "max_epochs": 1,
    }
    model = NeuralNetRegressor(
        module=ShallowNet,
        module__n_features=n_features,
        criterion=nn.MSELoss,
        optimizer=optim.SGD,
        optimizer__lr=0.1,
        batch_size=64,
        **defaults,
    )
    model2 = clone(model)
    assert model.callbacks is False
    assert model.warm_start is False
    assert model.train_split is None
    assert model.max_epochs == 1

    params = {"optimizer__lr": loguniform(1e-3, 1e0)}
    X, y = make_regression(n_samples=100, n_features=n_features)
    X = X.astype("float32")
    y = y.astype("float32").reshape(-1, 1)

    search = IncrementalSearchCV(model2, params, max_iter=5, decay_rate=None)
    await search.fit(X, y)
    assert search.best_score_ >= 0
def _get_geodamps(n_params):
    model = GeoDamp(seed=42)
    # Don't decay the learning rate:
    # damping delay = 250,000 examples (5 epochs).
    # Tune close to that.
    powers = [5, 6, 7]
    param_space = {
        "initial_batch_size": [2**i for i in powers],
        "max_batch_size": [100, 200, 500, 1000, 2000, 5000],
        "dampingfactor": loguniform(1, 10),
        "dampingdelay": loguniform(50e3, 500e3),
        "weight_decay": [1e-3, 1e-4, 1e-5, 1e-6, 0, 0, 0],
    }
    params = ParameterSampler(param_space, n_iter=n_params, seed=42)
    models = [clone(model).set_params(**p) for p in params]
    return models
def _get_data(a, b, ndim, rng, mode):
    # `mode` was referenced but missing from the original signature; it is
    # needed to decide whether the "valid" size constraints apply.
    if ndim == 2:
        s1 = loguniform(min(3, a // 2), a * 2).rvs(random_state=rng).astype(int)
        s2 = loguniform(min(3, b // 2), b * 2).rvs(random_state=rng).astype(int)
        x = rng.randn(a, s1)
        h = rng.randn(b, s2)
        if mode == "valid":
            # In "valid" mode the filter must fit inside the signal, so give
            # h the two smallest sizes and x the two largest
            s = np.sort([a, b, s1, s2])
            h = rng.randn(*s[:2])
            x = rng.randn(*s[2:])
            assert all(h.shape[i] <= x.shape[j] for i in [0, 1] for j in [0, 1])
    elif ndim == 1:
        x = rng.randn(a)
        h = rng.randn(b)
    else:
        raise ValueError(f"ndim={ndim} not supported")
    return x, h
def convert_to_sklearn(self):
    from scipy.stats import loguniform, uniform
    if self.log:
        sampler = loguniform(self.lower, self.upper)
    else:
        # scipy's uniform(loc, scale) spans [loc, loc + scale]
        sampler = uniform(self.lower, self.upper - self.lower)
    return sampler
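# A quick check of the parameterization used above (illustrative only):
# loguniform(a, b) samples log-uniformly on [a, b], while uniform(loc, scale)
# samples uniformly on [loc, loc + scale], hence the `upper - lower` term.
from scipy.stats import loguniform, uniform

lo, hi = 1e-3, 1e1
log_samples = loguniform(lo, hi).rvs(size=1000, random_state=0)
lin_samples = uniform(lo, hi - lo).rvs(size=1000, random_state=0)
assert log_samples.min() >= lo and log_samples.max() <= hi
assert lin_samples.min() >= lo and lin_samples.max() <= hi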
def load_params_svm(p, label):
    c = p[label]['C']
    gamma = p[label]['gamma']
    kernel = p[label]['kernel']
    class_weight = p[label]['class_weight']
    # The config stores None as the string 'None'; convert it back
    for i, cw in enumerate(class_weight):
        if cw == 'None':
            class_weight[i] = None
    params = {
        'C': loguniform(c[0], c[1]),
        'gamma': loguniform(gamma[0], gamma[1]),
        'kernel': kernel,
        'class_weight': class_weight
    }
    return params
def hyper_xgboost_rs():
    cs = {
        'eta': loguniform(1e-5, 1),
        'subsample': uniform(0.1, 0.9),
        'max_depth': list(range(1, 99)),
        'gamma': uniform(0.001, 1.999),
        'min_child_weight': uniform(1, 69)
    }
    return cs
def hyper_catboost_rs():
    cs = {
        'max_depth': list(range(1, 15)),
        'learning_rate': loguniform(0.001, 1),
        'l2_leaf_reg': uniform(1, 29),
        'bagging_temperature': uniform(0.1, 9.9),
        'random_strength': uniform(0.1, 9.9)
    }
    return cs
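# A minimal sampling sketch (illustrative only): either search space above can
# be drawn from directly with sklearn's ParameterSampler, which handles both
# the frozen scipy distributions and the plain lists.
from sklearn.model_selection import ParameterSampler

for config in ParameterSampler(hyper_catboost_rs(), n_iter=3, random_state=0):
    print(config)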
def _time_2d(seed, mode):
    rng = np.random.RandomState(2**31 - seed)
    n = loguniform(1e1, 500).rvs(size=1, random_state=rng).astype(int).item()
    k = loguniform(3, 75).rvs(size=1, random_state=rng).astype(int).item()
    r1, r2 = uniform(1, 2).rvs(size=2, random_state=rng)
    n2, k2 = int(r1 * n), int(r2 * k)
    if mode == "valid":
        k, k2, n, n2 = np.sort([n, n2, k, k2])
    x = rng.randn(n, n2)
    h = rng.randn(k, k2)
    assert x.ndim == 2 and h.ndim == 2
    datum = {"x_shape0": n, "h_shape0": k, "x_shape1": n2, "h_shape1": k2,
             "seed": seed, "mode": mode, "ndim": 2}
    datum["choose_conv_method"] = choose_conv_method(x, h, mode)
    for method in ["fft", "direct", "auto"]:
        start = time()
        y = convolve(x, h, mode=mode, method=method)
        datum[method + "_time"] = time() - start
    return datum
def fit_gbdt(X, y, n_iter):
    """Fit a gradient boosted decision trees model"""
    model = LGBMClassifier(n_estimators=2000, random_state=42)
    model = make_pipeline(columns_transform(), model)
    param_space = {
        "lgbmclassifier__min_data_in_leaf": loguniform_int(5, 500),
        "lgbmclassifier__num_leaves": loguniform_int(31, 500),
        "lgbmclassifier__reg_alpha": st.loguniform(1e-10, 1.0),
        "lgbmclassifier__reg_lambda": st.loguniform(1e-10, 1.0),
        "lgbmclassifier__learning_rate": st.loguniform(1e-4, 1e-1),
    }
    model = dcv.RandomizedSearchCV(model, param_space, scoring="neg_log_loss",
                                   n_iter=n_iter, random_state=42, cv=5)
    model.fit(X, y)
    return model
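# `loguniform_int` is not defined in the snippet above; a common pattern (and
# only an assumption here) is a thin wrapper whose .rvs() rounds log-uniform
# draws to integers, which is all a randomized search needs from it.
from scipy.stats import loguniform

class loguniform_int:
    """Log-uniform distribution over integers in [lo, hi] (hypothetical helper)."""
    def __init__(self, lo, hi):
        self._dist = loguniform(lo, hi)

    def rvs(self, *args, **kwargs):
        # Sample from the continuous distribution, then truncate to int
        return self._dist.rvs(*args, **kwargs).astype(int)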
def _time_1d(seed, mode):
    rng = np.random.RandomState(2**31 - seed)
    n, k = loguniform(3, 5e4).rvs(size=2, random_state=rng).astype(int)
    x = rng.randn(n)
    h = rng.randn(k)
    assert x.ndim == 1 and h.ndim == 1
    datum = {"x_shape": n, "h_shape": k, "seed": seed, "mode": mode, "ndim": 1}
    datum["choose_conv_method"] = choose_conv_method(x, h, mode)
    for method in ["fft", "direct", "auto"]:
        start = time()
        y = convolve(x, h, mode=mode, method=method)
        datum[method + "_time"] = time() - start
    return datum
def fit_mlp(X, y, n_iter):
    """Fit a simple multi-layer perceptron model"""
    model = MLPClassifier(random_state=42, early_stopping=True)
    model = make_pipeline(columns_transform(), model)
    layers_options = [
        [n_units] * n_layers
        for n_units, n_layers in it.product([32, 64, 128, 256, 512], [1, 2])
    ]
    param_space = {
        "mlpclassifier__hidden_layer_sizes": layers_options,
        "mlpclassifier__alpha": st.loguniform(1e-5, 1e-2),
        "mlpclassifier__learning_rate_init": st.loguniform(1e-4, 1e-1),
    }
    model = dcv.RandomizedSearchCV(model, param_space, scoring="neg_log_loss",
                                   n_iter=n_iter, random_state=42, cv=5)
    model.fit(X, y)
    return model
def _get_padadamps(n_params):
    powers = [5, 5.5, 6, 6.5, 7]
    param_space = {
        # Cast to int: fractional powers like 2**5.5 are not valid batch sizes
        "initial_batch_size": [int(2**i) for i in powers],
        "max_batch_size": [100, 200, 500, 1000, 2000, 5000],
        "batch_growth_rate": loguniform(1e-3, 1e-1),
        "dwell": [1, 2, 5, 10, 20, 50, 100, 200, 500, 1000],
        "weight_decay": [1e-3, 1e-4, 1e-5, 1e-6, 0, 0, 0],
    }
    model = PadaDamp(seed=42)
    params = ParameterSampler(param_space, n_iter=n_params, seed=42)
    models = [clone(model).set_params(**p) for p in params]
    return models
def cv(self, x_train, y_train, x_val=None, y_val=None, hpo="random"):
    from sklearn.svm import SVR
    params = self.params
    params["verbose"] = 0
    best_param = None
    best_param_score = None
    if hpo == "grid":
        search_params = [{"kernel": ["rbf"],
                          "gamma": [0.001, 0.01, 0.1, 1],
                          "C": [0.1, 1, 10, 100]},
                         {"kernel": ["sigmoid"],
                          "gamma": [0.001, 0.01, 0.1, 1],
                          "C": [0.1, 1, 10, 100]}]
        search = GridSearchCV(SVR(**params), search_params, n_jobs=-1, cv=3,
                              scoring="neg_mean_squared_error", verbose=1)
        search.fit(x_train, y_train)
        best_param = search.best_params_
        best_param_score = search.best_score_
    elif hpo == "random":
        search_params = {"kernel": ["rbf", "sigmoid", "linear"],
                         "gamma": loguniform(0.001, 1),
                         "C": loguniform(0.1, 100)}
        search = RandomizedSearchCV(SVR(**params), search_params, n_jobs=-1,
                                    cv=3, random_state=1234,
                                    scoring="neg_mean_squared_error", verbose=1)
        search.fit(x_train, y_train)
        best_param = search.best_params_
        best_param_score = search.best_score_
    if best_param is not None:
        print("Best Param: {}, with scores: {}".format(best_param, best_param_score))
        self.params.update(best_param)
    self.build_model(**self.params)
def test_continous_induced_measure_ppf(self):
    degree = 2
    alpha_stat, beta_stat = 3, 3
    ab = jacobi_recurrence(
        degree+1, alpha=beta_stat-1, beta=alpha_stat-1, probability=True)

    tol = 1e-15
    var = stats.beta(alpha_stat, beta_stat, -5, 10)
    can_lb, can_ub = -1, 1
    lb, ub = var.support()
    print(lb, ub)
    cx = np.linspace(can_lb, can_ub, 51)

    def can_pdf(xx):
        # Map the canonical domain [-1, 1] onto the variable's support
        loc, scale = lb+(ub-lb)/2, (ub-lb)/2
        return var.pdf(xx*scale+loc)*scale

    cdf_vals = continuous_induced_measure_cdf(
        can_pdf, ab, degree, can_lb, can_ub, tol, cx)
    assert np.all(cdf_vals <= 1.0)
    ppf_vals = continuous_induced_measure_ppf(
        var, ab, degree, cdf_vals, 1e-10, 1e-8)
    assert np.allclose(cx, ppf_vals)

    try:
        var = stats.loguniform(1.e-5, 1.e-3)
    except AttributeError:
        # scipy.stats.loguniform was added in SciPy 1.4; fall back to the
        # equivalent reciprocal distribution on older versions
        var = stats.reciprocal(1.e-5, 1.e-3)
    ab = get_recursion_coefficients_from_variable(var, degree+5, {})
    can_lb, can_ub = -1, 1
    cx = np.linspace(can_lb, can_ub, 51)
    lb, ub = var.support()

    def can_pdf(xx):
        loc, scale = lb+(ub-lb)/2, (ub-lb)/2
        return var.pdf(xx*scale+loc)*scale

    cdf_vals = continuous_induced_measure_cdf(
        can_pdf, ab, degree, can_lb, can_ub, tol, cx)
    # differences caused by root finding optimization tolerance
    assert np.all(cdf_vals <= 1.0)
    ppf_vals = continuous_induced_measure_ppf(
        var, ab, degree, cdf_vals, 1e-10, 1e-8)
    # import matplotlib.pyplot as plt
    # plt.plot(cx, cdf_vals)
    # plt.plot(ppf_vals, cdf_vals, 'r*', ms=2)
    # plt.show()
    assert np.allclose(cx, ppf_vals)
def cv(self, x_train, y_train, x_val=None, y_val=None, hpo="random"):
    from xgboost import XGBRegressor
    params = self.params
    params["verbosity"] = 0
    if x_val is not None:
        fit_params = {"early_stopping_rounds": 20,
                      "eval_set": [(x_val, y_val)],
                      "verbose": False}
    else:
        fit_params = {"verbose": False}
    best_param = None
    best_param_score = None
    if hpo == "grid":
        search_params = {
            "n_estimators": [100, 500, 1000],
            "max_depth": [5, 10, 15, 20],
            "learning_rate": [0.1, 0.05, 0.01]
        }
        search = GridSearchCV(XGBRegressor(**params), search_params, n_jobs=1,
                              cv=3, scoring="neg_mean_squared_error", verbose=True)
        search.fit(x_train, y_train, **fit_params)
        best_param = search.best_params_
        best_param_score = search.best_score_
    elif hpo == "random":
        search_params = {
            "n_estimators": [100, 500, 1000],
            "max_depth": [5, 10, 15, 20],
            # loguniform takes its bounds (a, b) positionally; the original
            # loguniform(loc=0.01, scale=0.1) call would fail because the
            # required shape parameters were never set
            "learning_rate": loguniform(0.01, 0.1)
        }
        search = RandomizedSearchCV(XGBRegressor(**params), search_params,
                                    n_jobs=1, cv=3, random_state=1234,
                                    scoring="neg_mean_squared_error", verbose=True)
        search.fit(x_train, y_train, **fit_params)
        best_param = search.best_params_
        best_param_score = search.best_score_
    if best_param is not None:
        print("Best Param: {}, with scores: {}".format(best_param, best_param_score))
        self.params.update(best_param)
    self.params["n_jobs"] = 0
    self.build_model(**self.params)
def fit_linear(X, y, n_iter):
    """Fit a logistic regression model"""
    model = LogisticRegression(max_iter=500, penalty="elasticnet", solver="saga")
    model = make_pipeline(columns_transform(), model)
    param_space = {
        "logisticregression__l1_ratio": st.uniform(0, 1),
        "logisticregression__C": st.loguniform(1e-4, 1e4),
    }
    model = dcv.RandomizedSearchCV(model, param_space, scoring="neg_log_loss",
                                   n_iter=n_iter, random_state=42, cv=5)
    model.fit(X, y)
    return model
def create_logistic_regression_model(random_state, tune=True, class_balanced=True):
    """Create a logistic regression model using best hyperparameters or tuning

    Parameters
    ----------
    tune : bool, optional
        tune the hyperparameters or use the best known values, by default True

    Returns
    -------
    Logistic Regression Model
        The model we want to create
    """
    class_weight = "balanced" if class_balanced else None
    model = {
        "clf": LogisticRegression(class_weight=class_weight,
                                  random_state=random_state, max_iter=1000)
    }
    if tune:
        model[PARAM_DIST] = {"logisticregression__C": loguniform(1e-3, 1e3)}
    else:
        # Both branches currently pin C to the same previously tuned value
        if class_balanced:
            model[PARAM_DIST] = {
                "logisticregression__C": [0.008713608033492446]
            }
        else:
            model[PARAM_DIST] = {
                "logisticregression__C": [0.008713608033492446]
            }
    return model
def test_gp_space():
    space = {
        'f': range(10),
        'h': uniform(-10, 20),
        'e': ['cat1', 1, 'cat2'],
        'c': [1, 2, 3],
        'a': ['const'],
        'g': loguniform(0.001, 100),
        'b': [10],
        'd': uniform(0, 1),
        'i': [True, False]
    }
    ds = domain_space(space, domain_size=10000)
    X = ds.sample_gp_space()
    assert (X <= 1.0).all()
    assert (X >= 0.0).all()
    assert (X[:, 0] == 1.).all()  # a
    assert (X[:, 1] == 0.).all()  # b
    assert np.isin(X[:, 2], [0.0, 0.5, 1.0]).all()  # c
    assert np.isin(X[:, 4:7], np.eye(3)).all()  # e
    assert X.shape == (ds.domain_size, 12)

    params = ds.convert_to_params(X)
    for param in params:
        assert param['a'] == 'const'
        assert param['b'] == 10
        assert param['c'] in space['c']
        assert 0.0 <= param['d'] <= 1.0
        assert param['e'] in space['e']
        assert param['f'] in space['f']
        assert 0.001 <= param['g'] <= 100
        assert -10 <= param['h'] <= 10
        assert param['i'] in space['i']

    X2 = ds.convert_to_gp(params)
    assert np.isclose(X2, X).all()
def parse_config(config_file):
    with open(config_file, "r") as f:
        config = load(f, Loader=Loader)

    sbatch_args = []
    for k, v in config["sbatch"].items():
        if len(k) == 1:
            sbatch_args.append(f"-{k} {v}")
        else:
            sbatch_args.append(f"--{k}={v}")

    param_dists = {}
    for k, v in config["hyperparams"].items():
        if isinstance(v, dict):
            lo, hi = v["range"]
            if v["dist"] == "uniform":
                param_dists[k] = uniform(lo, hi)
            elif v["dist"] == "loguniform":
                param_dists[k] = loguniform(lo, hi)
        else:  # list or constant
            if not isinstance(v, (list, tuple)):
                param_dists[k] = [v]
            else:
                param_dists[k] = v
    return sbatch_args, param_dists
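# An illustrative config for parse_config above. The keys (`sbatch`,
# `hyperparams`, `dist`, `range`) follow the structure the parser expects;
# the specific option names and values here are made up for the example.
EXAMPLE_CONFIG = """
sbatch:
  p: gpu
  time: "04:00:00"
hyperparams:
  lr:
    dist: loguniform
    range: [1.0e-4, 1.0e-1]
  momentum:
    dist: uniform
    range: [0.0, 1.0]
  batch_size: [32, 64, 128]
  seed: 42
"""

# e.g. write it out and feed it to the parser:
# with open("config.yaml", "w") as f:
#     f.write(EXAMPLE_CONFIG)
# sbatch_args, param_dists = parse_config("config.yaml")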
def hyper_parameter_search(classifier_type, trainX, RTrain):
    hyper_params = dict()
    space = dict()
    space['solver'] = ['lbfgs']
    space['penalty'] = ['none', 'l2']
    space['C'] = loguniform(1e-5, 100)
    model = LogisticRegression(multi_class='multinomial')
    # Perform topic-conditional hyper-parameter search
    for topic in trainX:
        if topic != 'R175':
            if classifier_type == 'logistic':
                search = RandomizedSearchCV(model, space, n_iter=100,
                                            scoring='accuracy', n_jobs=-1,
                                            random_state=1)
                # Execute search
                result = search.fit(trainX[topic], RTrain[topic])
                # Summarize result
                print('Best Score: %s' % result.best_score_)
                print('Best Hyperparameters: %s' % result.best_params_)
                hyper_params[topic] = result.best_params_
    return hyper_params