def _svm_hp_space(
        name_func,
        kernel,
        n_features=1,
        C=None,
        gamma=None,
        coef0=None,
        degree=None,
        shrinking=None,
        tol=None,
        max_iter=None,
        probability=False,
        verbose=False,
        cache_size=_svm_default_cache_size):
    '''Generate SVM hyperparameters search space
    '''
    if kernel in ['linear', 'rbf', 'sigmoid']:
        degree_ = 1
    else:
        degree_ = (_svm_degree(name_func('degree'))
                   if degree is None else degree)
    if kernel in ['linear']:
        gamma_ = 'auto'
    else:
        gamma_ = (_svm_gamma(name_func('gamma'), n_features=1)
                  if gamma is None else gamma)
        gamma_ /= n_features  # make gamma independent of n_features.
    if kernel in ['linear', 'rbf']:
        coef0_ = 0.0
    elif coef0 is None:
        if kernel == 'poly':
            coef0_ = hp.pchoice(name_func('coef0'), [
                (0.3, 0),
                (0.7, gamma_ * hp.uniform(name_func('coef0val'), 0., 10.))
            ])
        elif kernel == 'sigmoid':
            coef0_ = hp.pchoice(name_func('coef0'), [
                (0.3, 0),
                (0.7, gamma_ * hp.uniform(name_func('coef0val'), -10., 10.))
            ])
        else:
            pass
    else:
        coef0_ = coef0

    hp_space = dict(
        kernel=kernel,
        C=_svm_C(name_func('C')) if C is None else C,
        gamma=gamma_,
        coef0=coef0_,
        degree=degree_,
        shrinking=(hp_bool(name_func('shrinking'))
                   if shrinking is None else shrinking),
        tol=_svm_tol(name_func('tol')) if tol is None else tol,
        max_iter=(_svm_max_iter(name_func('maxiter'))
                  if max_iter is None else max_iter),
        probability=probability,
        verbose=verbose,
        cache_size=cache_size)
    return hp_space
def hyperopt_search(variant: cco.Variant, data, max_evals,
                    mix_algo_ratio=None, random_state=None):
    timer = stopwatch.Timer()
    if variant.is_gb():
        clf_hp_gb.do_fmin(
            data,
            max_evals,
            mix_algo_ratio=mix_algo_ratio,
            max_depth_space=hp.pchoice("max_depth", [(0.40, 1), (0.60, 2)]),
            random_state=random_state,
        )
    elif variant.is_rf():
        clf_hp_rf.do_fmin(
            data,
            max_evals,
            mix_algo_ratio=mix_algo_ratio,
            max_depth_space=hp.pchoice("max_depth", [(0.45, 1), (0.55, 2)]),
            random_state=random_state,
        )
    elif variant.is_cb_native():
        pools = clf_cat_tools.make_pools(data)
        clf_hp_cb.do_fmin(
            pools,
            max_evals,
            mix_algo_ratio=mix_algo_ratio,
            random_state=random_state,
            how="native",
        )
    sec_elapsed = timer.elapsed
    cco.out("{} done for {}".format(variant.name, cco.fmt(sec_elapsed)))
def ridge(name,
          alpha=None,          # default - 1.0
          normalize=None,      # default - False
          tol=None,            # default - 0.001
          solver=None,         # default - 'auto'
          fit_intercept=None,  # default - True
          ):

    def _name(msg):
        return '%s.%s_%s' % (name, 'sgd', msg)

    rval = scope.sklearn_Ridge(
        alpha=hp.loguniform(
            _name('alpha'),
            np.log(1e-3),
            np.log(1e3)) if alpha is None else alpha,
        normalize=hp.pchoice(
            _name('normalize'),
            [(0.8, True), (0.2, False)]) if normalize is None else normalize,
        fit_intercept=hp.pchoice(
            _name('fit_intercept'),
            [(0.8, True), (0.2, False)]) if fit_intercept is None else fit_intercept,
        tol=0.001 if tol is None else tol,
        solver="auto" if solver is None else solver,
    )
    return rval
def test_basic3(self):
    space = hp.pchoice(
        "something",
        [
            (0.2, hp.pchoice("number", [(0.8, 2), (0.2, 1)])),
            (0.8, hp.pchoice("number1", [(0.7, 5), (0.3, 6)])),
        ],
    )
    a, b, c, d = 0, 0, 0, 0
    rng = np.random.RandomState(123)
    for i in range(0, 2000):
        nesto = hyperopt.pyll.stochastic.sample(space, rng=rng)
        if nesto == 2:
            a += 1
        elif nesto == 1:
            b += 1
        elif nesto == 5:
            c += 1
        elif nesto == 6:
            d += 1
        else:
            assert 0, nesto
    print((a, b, c, d))
    assert a + b + c + d == 2000
    assert 300 < a + b < 500
    assert 1500 < c + d < 1700
    assert a * 0.3 > b  # a * 1.2 > 4 * b
    assert c * 3 * 1.2 > d * 7
class DecisionTreeModel(TreeBasedModel):
    @staticmethod
    def build_estimator(args, train_data=None):
        return DecisionTreeRegressor(random_state=RANDOM_STATE, presort=True, **args)

    hp_space = {
        "criterion": hp.choice("criterion", ["mse", "friedman_mse", "mae"]),
        "max_depth": hp.pchoice(
            "max_depth_enabled",
            [
                (0.7, None),
                (0.3, 1 + scope.int(hp.qlognormal("max_depth", np.log(30), 0.5, 3))),
            ],
        ),
        "splitter": hp.choice("splitter_str", ["best", "random"]),
        "max_features": hp.pchoice(
            "max_features_str",
            [
                (0.2, "sqrt"),  # most common choice.
                (0.1, "log2"),  # less common choice.
                (0.1, None),  # all features, less common choice.
                (0.6, hp.uniform("max_features_str_frac", 0.0, 1.0)),
            ],
        ),
        "min_samples_split": scope.int(hp.quniform("min_samples_split_str", 2, 10, 1)),
        "min_samples_leaf": hp.choice(
            "min_samples_leaf_enabled",
            [
                1,
                scope.int(
                    hp.qloguniform("min_samples_leaf", np.log(1.5), np.log(50.5), 1)
                ),
            ],
        ),
    }
class SVMModel(NonTreeBasedModel):
    @staticmethod
    def build_estimator(args, train_data=None, test=False):
        return SVC(
            random_state=RANDOM_STATE,
            # Prevent very long training time for some hyper-parameters
            max_iter=int(1e3) if test else int(2e6),
            # Use 4GB of cache to speed up
            cache_size=4096,
            **args)

    # Hyper-parameter distributions
    C_hp = hp.loguniform('C', np.log(1e-5), np.log(1e4))
    gamma_hp = hp.pchoice(
        'gamma_choice',
        [(0.3, 'auto'), (0.3, 'scale'),
         (0.2, hp.loguniform('gamma', np.log(1e-4), np.log(1e2)))])
    class_weight_hp = hp.pchoice('class_weight', [
        (0.5, None),
        (0.5, 'balanced'),
    ])
    coef0_hp = hp.pchoice(
        'coef0_null',
        [(0.3, 0.0), (0.7, hp.loguniform('coef0', np.log(1e-3), np.log(1e3)))])

    # Definitions of hyper-parameter spaces for each kernel
    linear_hp_space = {
        'kernel': 'linear',
        'C': C_hp,
        'class_weight': class_weight_hp
    }
    poly_hp_space = {
        'kernel': 'poly',
        'C': C_hp,
        'degree': hp.quniform('degree', 1.5, 6.5, 1),
        'gamma': gamma_hp,
        'coef0': coef0_hp,
        'class_weight': class_weight_hp
    }
    rbf_hp_space = {
        'kernel': 'rbf',
        'C': C_hp,
        'gamma': gamma_hp,
        'class_weight': class_weight_hp
    }
    sigmoid_hp_space = {
        'kernel': 'sigmoid',
        'C': C_hp,
        'gamma': gamma_hp,
        'coef0': coef0_hp,
        'class_weight': class_weight_hp
    }

    # Final hyper-parameter space
    hp_space = hp.pchoice('kernel', [(0.5, rbf_hp_space),
                                     (0.2, linear_hp_space),
                                     (0.2, poly_hp_space),
                                     (0.1, sigmoid_hp_space)])
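# Hedged usage sketch (not the project's own runner): search a space like
# SVMModel.hp_space with hyperopt's fmin, scoring by 3-fold cross-validation.
# Assumes the SVMModel class above is importable and that a small (X, y)
# classification dataset such as iris is an acceptable stand-in.
from hyperopt import Trials, fmin, tpe
from sklearn.datasets import load_iris
from sklearn.model_selection import cross_val_score

X, y = load_iris(return_X_y=True)


def objective(args):
    # args is a concrete kwargs dict sampled from hp_space, e.g.
    # {'kernel': 'rbf', 'C': ..., 'gamma': ..., 'class_weight': ...}
    est = SVMModel.build_estimator(args, test=True)  # capped max_iter
    return 1.0 - cross_val_score(est, X, y, cv=3).mean()  # fmin minimizes


trials = Trials()
best = fmin(objective, SVMModel.hp_space, algo=tpe.suggest,
            max_evals=25, trials=trials)
print(best)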
def sgd_regression(name,
                   loss=None,            # default - 'squared_loss'
                   penalty=None,         # default - 'l2'
                   alpha=None,           # default - 0.0001
                   l1_ratio=None,        # default - 0.15, must be within [0, 1]
                   fit_intercept=None,   # default - True
                   n_iter=None,          # default - 5
                   shuffle=None,         # default - False
                   random_state=None,    # default - None
                   epsilon=None,         # default - 0.1
                   learning_rate=None,   # default - 'invscaling'
                   eta0=None,            # default - 0.01
                   power_t=None,         # default - 0.5
                   warm_start=False,
                   verbose=0,
                   ):

    def _name(msg):
        return '%s.%s_%s' % (name, 'sgd', msg)

    rval = scope.sklearn_SGDRegressor(
        loss=hp.pchoice(
            _name('loss'),
            [(0.25, 'squared_loss'),
             (0.25, 'huber'),
             (0.25, 'epsilon_insensitive'),
             (0.25, 'squared_epsilon_insensitive')]
        ) if loss is None else loss,
        penalty=hp.pchoice(
            _name('penalty'),
            [(0.40, 'l2'),
             (0.35, 'l1'),
             (0.25, 'elasticnet')]
        ) if penalty is None else penalty,
        alpha=hp.loguniform(
            _name('alpha'),
            np.log(1e-7),
            np.log(1)) if alpha is None else alpha,
        l1_ratio=hp.uniform(
            _name('l1_ratio'), 0, 1
        ) if l1_ratio is None else l1_ratio,
        fit_intercept=hp.pchoice(
            _name('fit_intercept'),
            [(0.8, True), (0.2, False)]) if fit_intercept is None else fit_intercept,
        epsilon=hp.loguniform(
            _name('epsilon'),
            np.log(1e-7),
            np.log(1)) if epsilon is None else epsilon,
        learning_rate='invscaling' if learning_rate is None else learning_rate,
        eta0=hp.loguniform(
            _name('eta0'),
            np.log(1e-5),
            np.log(1e-1)) if eta0 is None else eta0,
        power_t=hp.uniform(
            _name('power_t'), 0, 1) if power_t is None else power_t,
        verbose=verbose,
        random_state=random_state,
    )
    return rval
class GradientBoostingModel(TreeBasedModel):
    @staticmethod
    def build_estimator(args, train_data=None):
        return GradientBoostingRegressor(
            random_state=RANDOM_STATE, presort=True, **args
        )

    loss_alpha = hp.choice(
        "loss_alpha",
        [
            ("ls", 0.9),
            ("lad", 0.9),
            ("huber", hp.uniform("gbr_alpha", 0.85, 0.95)),
            ("quantile", 0.5),
        ],
    )

    hp_space = {
        "n_estimators": scope.int(
            hp.qloguniform("n_estimators", np.log(10.5), np.log(1000.5), 1)
        ),
        "learning_rate": hp.lognormal("learning_rate", np.log(0.01), np.log(10.0)),
        "criterion": hp.choice("criterion", ["mse", "friedman_mse", "mae"]),
        "max_depth": hp.pchoice("max_depth", [(0.2, 2), (0.5, 3), (0.2, 4), (0.1, 5)]),
        "min_samples_leaf": hp.choice(
            "min_samples_leaf_enabled",
            [
                1,  # most common choice.
                scope.int(
                    hp.qloguniform("min_samples_leaf", np.log(1.5), np.log(50.5), 1)
                ),
            ],
        ),
        "subsample": hp.pchoice(
            "subsample_enabled",
            [
                (0.2, 1.0),  # default choice.
                (0.8, hp.uniform("subsample", 0.5, 1.0)),  # stochastic grad boosting.
            ],
        ),
        "max_features": hp.pchoice(
            "max_features_str",
            [
                (0.1, "sqrt"),  # most common choice.
                (0.2, "log2"),  # less common choice.
                (0.1, None),  # all features, less common choice.
                (0.6, hp.uniform("max_features_str_frac", 0.0, 1.0)),
            ],
        ),
        "loss": loss_alpha[0],
        "alpha": loss_alpha[1],
    }
def sgd(name,
        loss=None,            # default - 'hinge'
        penalty=None,         # default - 'l2'
        alpha=None,           # default - 0.0001
        l1_ratio=None,        # default - 0.15, must be within [0, 1]
        fit_intercept=None,   # default - True
        n_iter=None,          # default - 5
        shuffle=None,         # default - False
        random_state=None,    # default - None
        epsilon=None,
        n_jobs=1,             # default - 1 (-1 means all CPUs)
        learning_rate=None,   # default - 'invscaling'
        eta0=None,            # default - 0.01
        power_t=None,         # default - 0.5
        class_weight=None,
        warm_start=False,
        verbose=False,
        ):

    def _name(msg):
        return '%s.%s_%s' % (name, 'sgd', msg)

    rval = scope.sklearn_SGDClassifier(
        loss=hp.pchoice(_name('loss'),
                        [(0.25, 'hinge'),
                         (0.25, 'log'),
                         (0.25, 'modified_huber'),
                         (0.05, 'squared_hinge'),
                         (0.05, 'perceptron'),
                         (0.05, 'squared_loss'),
                         (0.05, 'huber'),
                         (0.03, 'epsilon_insensitive'),
                         (0.02, 'squared_epsilon_insensitive')]) if loss is None else loss,
        penalty=hp.pchoice(_name('penalty'),
                           [(0.40, 'l2'),
                            (0.35, 'l1'),
                            (0.25, 'elasticnet')]) if penalty is None else penalty,
        alpha=hp.loguniform(_name('alpha'),
                            np.log(1e-7),
                            np.log(1)) if alpha is None else alpha,
        l1_ratio=hp.uniform(_name('l1_ratio'), 0, 1) if l1_ratio is None else l1_ratio,
        fit_intercept=hp.pchoice(_name('fit_intercept'),
                                 [(0.8, True), (0.2, False)]) if fit_intercept is None else fit_intercept,
        learning_rate='invscaling' if learning_rate is None else learning_rate,
        eta0=hp.loguniform(_name('eta0'),
                           np.log(1e-5),
                           np.log(1e-1)) if eta0 is None else eta0,
        power_t=hp.uniform(_name('power_t'), 0, 1) if power_t is None else power_t,
        n_jobs=n_jobs,
        verbose=verbose,
    )
    return rval
def _trees_max_features(name):
    return hp.pchoice(name, [
        (0.2, 'sqrt'),  # most common choice.
        (0.1, 'log2'),  # less common choice.
        (0.1, None),  # all features, less common choice.
        (0.6, hp.uniform(name + '.frac', 0., 1.))
    ])
def setUp(self):
    self.space = hp.pchoice('a', [
        (.1, 0),
        (.2, 1),
        (.3, 2),
        (.4, 3)])
    self.trials = Trials()
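# Standalone sketch (separate from the test fixture above): sampling a
# pchoice space directly shows that the weights control how often each
# option is drawn.
import numpy as np
import hyperopt.pyll.stochastic
from hyperopt import hp

space = hp.pchoice('a', [(.1, 0), (.2, 1), (.3, 2), (.4, 3)])
rng = np.random.RandomState(0)
samples = [hyperopt.pyll.stochastic.sample(space, rng=rng) for _ in range(1000)]
# Empirical frequencies should land near (0.1, 0.2, 0.3, 0.4).
print([samples.count(v) / 1000.0 for v in (0, 1, 2, 3)])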
def test_parse_hyperopt_param_error():
    """Test invalid input to parse_hyperopt_param.
    """
    # This will raise an error since pchoice is a categorical distribution
    parameter = hp.pchoice('test', [(0.5, 'no'), (0.5, 'yes')])
    with pytest.raises(ValueError):
        parse_hyperopt_param(str(parameter))
def test_basic2(self):
    space = hp.choice(
        "normal_choice",
        [
            hp.pchoice("fsd", [(0.1, "first"), (0.8, "second"), (0.1, 2)]),
            hp.choice("something_else", [10, 20]),
        ],
    )
    a, b, c = 0, 0, 0
    rng = np.random.RandomState(123)
    for i in range(0, 1000):
        nesto = hyperopt.pyll.stochastic.sample(space, rng=rng)
        if nesto == "first":
            a += 1
        elif nesto == "second":
            b += 1
        elif nesto == 2:
            c += 1
        elif nesto in (10, 20):
            pass
        else:
            assert 0, nesto
    print((a, b, c))
    assert b > 2 * a
    assert b > 2 * c
class BasicNeuralNetworkModel(NonTreeBasedModel):
    @staticmethod
    def build_estimator(args, train_data=None, test=False):
        return MLPClassifier(random_state=RANDOM_STATE,
                             max_iter=(1 if test else 300),
                             **args)

    """
    In order to facilitate hyper-parameter tuning, we choose not to tune the
    beta hyperparameters used by the Adam optimizer, the power exponent used
    by the invscaling learning rate schedule, and the batch size.
    We choose to use only stochastic gradient-based optimizers because other
    optimizers would not work with some of the big datasets.
    """
    layer_size = scope.int(hp.quniform("layer_size", 10, 100, 5))
    hp_space = {
        "hidden_layer_sizes": hp.choice("n_layers", [
            [layer_size],
            [layer_size] * 2,
            [layer_size] * 3,
        ]),
        "early_stopping": True,
        "activation": hp.pchoice("activation",
                                 [(0.25, "logistic"), (0.25, "tanh"), (0.5, "relu")]),
        "solver": hp.choice("solver", ["sgd", "adam"]),
        "alpha": hp.loguniform("alpha", np.log(1e-7), np.log(1e-2)),
        "batch_size": 128,
        "learning_rate": hp.pchoice(
            "learning_rate_schedule",
            [(0.2, "constant"), (0.3, "invscaling"), (0.5, "adaptive")],
        ),
        "learning_rate_init": hp.loguniform("learning_rate", np.log(1e-4), np.log(1e-1)),
        "momentum": hp.uniform("momentum", 0.87, 0.99),
        "nesterovs_momentum": hp.pchoice("nesterovs_momentum", [(0.7, True), (0.3, False)]),
    }
def _trees_max_depth(name):
    return hp.pchoice(name, [
        (0.7, None),  # most common choice.
        # Try some shallow trees.
        (0.1, 2),
        (0.1, 3),
        (0.1, 4),
    ])
def generic_space(name='space'):
    model = hp.pchoice('%s' % name, [
        (.8, {'preprocessing': [pca(name + '.pca')],
              'classifier': any_classifier(name + '.pca_clsf')
              }),
        (.2, {'preprocessing': [min_max_scaler(name + '.min_max_scaler')],
              'classifier': any_classifier(name + '.min_max_clsf'),
              }),
    ])
    return as_apply({'model': model})
def space_rf(parts, n_features, max_depth_space, random_state):
    if max_depth_space is not None:
        max_depth = max_depth_space
    else:
        max_depth = hp.pchoice(
            "max_depth",
            [
                (0.22, 1),
                (0.22, 2),
                (0.24, 3),
                (0.22, 4),
                (0.10, 5),
            ],
        )
    criterion = hp.pchoice("criterion", [(0.5, "gini"), (0.5, "entropy")])
    max_features = hp.choice("max_features", cco.colsample_ratios(n_features))
    min_samples_leaf = hp.choice("min_samples_leaf", [15, 20, 30, 40])
    space = hp.pchoice(
        "split_parts",
        [
            (
                part.pratio,
                {
                    "random_state": random_state,
                    "criterion": criterion,
                    "max_depth": max_depth,
                    "max_features": max_features,
                    "min_samples_leaf": min_samples_leaf,
                    "n_estimators": hp.choice(
                        "estimators_" + str(idx),
                        list(range(part.n_estim1, part.n_estim2, part.n_estimq)),
                    ),
                },
            )
            for idx, part in enumerate(parts)
        ],
    )
    return space
def test_bug1_anneal():
    space = hp.choice(
        "preprocess_choice",
        [
            {"pwhiten": hp.pchoice("whiten_randomPCA", [(0.3, False), (0.7, True)])},
            {"palgo": False},
            {"pthree": 7},
        ],
    )
    fmin(fn=lambda x: 1, space=space, algo=anneal.suggest, max_evals=50)
def space_gboost(parts, n_features, max_depth_space=None, random_state=None):
    # n_estimators = hp.choice('n_estimators', range(100, 1150, 10))
    if max_depth_space is not None:
        max_depth = max_depth_space
    else:
        max_depth = hp.pchoice(
            "max_depth",
            [
                (0.29, 2),
                (0.27, 3),
                (0.24, 4),
                (0.20, 5),
                # (0.20, 6),
            ],
        )
    loss = hp.choice("loss", ["exponential", "deviance"])
    max_features = hp.choice("max_features", cco.colsample_ratios(n_features))
    space = hp.pchoice(
        "split_parts",
        [
            (
                part.pratio,
                {
                    "random_state": random_state,
                    "max_depth": max_depth,
                    "loss": loss,
                    "max_features": max_features,
                    "n_estimators": hp.choice(
                        "estimators_" + str(idx),
                        list(range(part.n_estim1, part.n_estim2, part.n_estimq)),
                    ),
                    "learning_rate": hp.uniform(
                        "learning_rate_" + str(idx), part.learn_rate1, part.learn_rate2
                    ),
                },
            )
            for idx, part in enumerate(parts)
        ],
    )
    return space
def _starting_space(self) -> Any:
    """Create starting space for sampler.

    Parameters
    ----------
    None

    Returns
    -------
    dict
        Key/value pairs, key is hyperparameter name and value is statistical
        distribution that can be sampled
    """
    params: Dict[str, Any] = {}
    params['n_hidden_layers'] = self.n_hidden_layers

    # Add all hidden layer units
    for i in range(1, self.n_hidden_layers + 1):
        params['n_hidden%s' % i] = hp.quniform('mlp_h%s' % i, 1, self.max_neurons, 1)

    params.update({
        'learning_rate': hp.loguniform('mlp_lr', log(1e-4), log(1)),
        'dropout': hp.uniform('mlp_do', 0, 1),
        'epochs': hp.quniform('mlp_e', 1, self.max_epochs, 1),
        'batch_size': hp.quniform('mlp_bs', 2, 512, 1),
        'batch_norm': hp.pchoice('mlp_bn', [(0.5, 'no'), (0.5, 'yes')]),
        'optimizer': hp.pchoice('mlp_opt', [(0.5, 'adam'), (0.5, 'sgd')]),
        'reg_l1': hp.loguniform('mlp_l1', log(1e-4), log(1)),
        'reg_l2': hp.loguniform('mlp_l2', log(1e-4), log(1))
    })
    return params
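# Hypothetical standalone sketch: a dict-valued space like the one returned
# by _starting_space can be sampled directly with hyperopt's pyll sampler.
# The class that owns _starting_space is not shown above, so the values used
# here (2 hidden layers, 256 max neurons) are assumptions for illustration.
from math import log

import numpy as np
import hyperopt.pyll.stochastic
from hyperopt import hp

params = {'n_hidden_layers': 2}
for i in range(1, 3):
    params['n_hidden%s' % i] = hp.quniform('mlp_h%s' % i, 1, 256, 1)
params.update({
    'learning_rate': hp.loguniform('mlp_lr', log(1e-4), log(1)),
    'batch_norm': hp.pchoice('mlp_bn', [(0.5, 'no'), (0.5, 'yes')]),
    'optimizer': hp.pchoice('mlp_opt', [(0.5, 'adam'), (0.5, 'sgd')]),
})
# One concrete draw of hyperparameter values:
print(hyperopt.pyll.stochastic.sample(params, rng=np.random.RandomState(0)))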
def test_bug1_tpe():
    space = hp.choice(
        'preprocess_choice',
        [{'pwhiten': hp.pchoice('whiten_randomPCA', [(.3, False), (.7, True)])},
         {'palgo': False},
         {'pthree': 7}])
    best = fmin(fn=lambda x: 1, space=space, algo=tpe.suggest, max_evals=50)
class DecisionTreeModel(TreeBasedModel):
    @staticmethod
    def build_estimator(args, train_data=None):
        return DecisionTreeClassifier(random_state=RANDOM_STATE, presort=True, **args)

    hp_space = {
        'max_depth': hp.pchoice(
            'max_depth_enabled',
            [(0.7, None),
             (0.3, 1 + scope.int(hp.qlognormal('max_depth', np.log(30), 0.5, 3)))]),
        'splitter': hp.choice('splitter_str', ['best', 'random']),
        'max_features': hp.pchoice(
            'max_features_str',
            [
                (0.2, 'sqrt'),  # most common choice.
                (0.1, 'log2'),  # less common choice.
                (0.1, None),  # all features, less common choice.
                (0.6, hp.uniform('max_features_str_frac', 0., 1.))
            ]),
        'min_samples_split': scope.int(hp.quniform('min_samples_split_str', 2, 10, 1)),
        'min_samples_leaf': hp.choice('min_samples_leaf_enabled', [
            1,
            scope.int(
                hp.qloguniform('min_samples_leaf', np.log(1.5), np.log(50.5), 1))
        ]),
        'class_weight': hp.pchoice('class_weight', [
            (0.5, None),
            (0.5, 'balanced'),
        ])
    }
class RandomForestsModel(TreeBasedModel):
    @staticmethod
    def build_estimator(args, train_data=None):
        return RandomForestRegressor(random_state=RANDOM_STATE, n_jobs=-1, **args)

    hp_space = {
        "max_depth": hp.pchoice(
            "max_depth_enabled",
            [
                (0.7, None),
                (0.3, 1 + scope.int(hp.qlognormal("max_depth", np.log(30), 0.5, 3))),
            ],
        ),
        "n_estimators": scope.int(
            hp.qloguniform("n_estimators", np.log(9.5), np.log(300), 1)),
        "min_samples_leaf": hp.choice(
            "min_samples_leaf_enabled",
            [
                1,
                scope.int(
                    hp.qloguniform("min_samples_leaf", np.log(1.5), np.log(50.5), 1)),
            ],
        ),
        "max_features": hp.pchoice(
            "max_features_str",
            [
                (0.1, "sqrt"),  # most common choice.
                (0.2, "log2"),  # less common choice.
                (0.1, None),  # all features, less common choice.
                (0.6, hp.uniform("max_features_str_frac", 0.0, 1.0)),
            ],
        ),
    }
def sgd(name,
        loss=None,            # default - 'hinge'
        penalty=None,         # default - 'l2'
        alpha=None,           # default - 0.0001
        l1_ratio=None,        # default - 0.15, must be within [0, 1]
        fit_intercept=True,   # default - True
        n_iter=5,             # default - 5
        shuffle=True,         # default - True
        random_state=None,    # default - None
        epsilon=None,
        n_jobs=1,             # default - 1 (-1 means all CPUs)
        learning_rate=None,   # default - 'optimal'
        eta0=None,            # default - 0.0
        power_t=None,         # default - 0.5
        class_weight='choose',
        warm_start=False,
        verbose=False):

    def _name(msg):
        return '%s.%s_%s' % (name, 'sgdc', msg)

    rval = scope.sklearn_SGDClassifier(
        loss=hp.pchoice(_name('loss'), [
            (0.25, 'hinge'),
            (0.25, 'log'),
            (0.25, 'modified_huber'),
            (0.05, 'squared_hinge'),
            (0.05, 'perceptron'),
            (0.05, 'squared_loss'),
            (0.05, 'huber'),
            (0.03, 'epsilon_insensitive'),
            (0.02, 'squared_epsilon_insensitive')
        ]) if loss is None else loss,
        penalty=_sgd_penalty(_name('penalty')) if penalty is None else penalty,
        alpha=_sgd_alpha(_name('alpha')) if alpha is None else alpha,
        l1_ratio=(_sgd_l1_ratio(_name('l1ratio'))
                  if l1_ratio is None else l1_ratio),
        fit_intercept=fit_intercept,
        n_iter=n_iter,
        learning_rate=(_sgdc_learning_rate(_name('learning_rate'))
                       if learning_rate is None else learning_rate),
        eta0=_sgd_eta0(_name('eta0')) if eta0 is None else eta0,
        power_t=_sgd_power_t(_name('power_t')) if power_t is None else power_t,
        class_weight=(_class_weight(_name('clsweight'))
                      if class_weight == 'choose' else class_weight),
        n_jobs=n_jobs,
        verbose=verbose,
        random_state=_random_state(_name('rstate'), random_state),
    )
    return rval
class RandomForestsModel(TreeBasedModel):
    @staticmethod
    def build_estimator(args, train_data=None):
        return RandomForestClassifier(random_state=RANDOM_STATE, n_jobs=-1, **args)

    hp_space = {
        'max_depth': hp.pchoice(
            'max_depth_enabled',
            [(0.7, None),
             (0.3, 1 + scope.int(hp.qlognormal('max_depth', np.log(30), 0.5, 3)))]),
        'n_estimators': scope.int(
            hp.qloguniform('n_estimators', np.log(9.5), np.log(300), 1)),
        'min_samples_leaf': hp.choice('min_samples_leaf_enabled', [
            1,
            scope.int(
                hp.qloguniform('min_samples_leaf', np.log(1.5), np.log(50.5), 1))
        ]),
        'max_features': hp.pchoice(
            'max_features_str',
            [
                (0.2, 'sqrt'),  # most common choice.
                (0.1, 'log2'),  # less common choice.
                (0.1, None),  # all features, less common choice.
                (0.6, hp.uniform('max_features_str_frac', 0., 1.))
            ]),
        'class_weight': hp.pchoice('class_weight', [(0.5, None),
                                                    (0.3, 'balanced'),
                                                    (0.2, 'balanced_subsample')])
    }
def many_dists():
    a = hp.choice("a", [0, 1, 2])
    b = hp.randint("b", 10)
    bb = hp.randint("bb", 12, 25)
    c = hp.uniform("c", 4, 7)
    d = hp.loguniform("d", -2, 0)
    e = hp.quniform("e", 0, 10, 3)
    f = hp.qloguniform("f", 0, 3, 2)
    g = hp.normal("g", 4, 7)
    h = hp.lognormal("h", -2, 2)
    i = hp.qnormal("i", 0, 10, 2)
    j = hp.qlognormal("j", 0, 2, 1)
    k = hp.pchoice("k", [(0.1, 0), (0.9, 1)])
    z = a + b + bb + c + d + e + f + g + h + i + j + k
    return {"loss": scope.float(scope.log(1e-12 + z ** 2)), "status": base.STATUS_OK}
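# Hedged sketch: many_dists() (defined above) already evaluates to a result
# dict with 'loss' and 'status', so a passthrough objective is enough to
# drive fmin over it. Assumes the function above is in scope; the helper
# name `passthrough` is local to this sketch.
from hyperopt import Trials, fmin, tpe


def passthrough(result):
    # fmin hands us the evaluated space; it is already a valid result dict.
    return result


trials = Trials()
fmin(passthrough, space=many_dists(), algo=tpe.suggest,
     max_evals=30, trials=trials)
print(min(trials.losses()))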
def knn_regression(name,
                   sparse_data=False,
                   n_neighbors=None,
                   weights=None,
                   leaf_size=None,
                   metric=None,
                   p=None,
                   **kwargs):

    def _name(msg):
        return '%s.%s_%s' % (name, 'knn_regression', msg)

    if sparse_data:
        metric_args = {'metric': 'euclidean'}
    else:
        metric_args = hp.pchoice(_name('metric'), [
            (0.05, {'metric': 'euclidean'}),
            (0.10, {'metric': 'manhattan'}),
            (0.10, {'metric': 'chebyshev'}),
            (0.10, {'metric': 'minkowski',
                    'p': scope.int(hp.quniform(_name('minkowski_p'), 1, 5, 1))}),
            # (0.05, {'metric': 'wminkowski',
            #         'p': scope.int(hp.quniform(_name('wminkowski_p'), 1, 5, 1)),
            #         'w': hp.uniform(_name('wminkowski_w'), 0, 100)}),
        ])

    rval = scope.sklearn_KNeighborsRegressor(
        n_neighbors=scope.int(hp.quniform(
            _name('n_neighbors'), 0.5, 50, 1)) if n_neighbors is None else n_neighbors,
        weights=hp.choice(
            _name('weights'),
            ['uniform', 'distance']) if weights is None else weights,
        leaf_size=scope.int(hp.quniform(
            _name('leaf_size'), 0.51, 100, 1)) if leaf_size is None else leaf_size,
        starstar_kwargs=metric_args)
    return rval
def sgd_regression(name,
                   loss=None,            # default - 'squared_loss'
                   penalty=None,         # default - 'l2'
                   alpha=None,           # default - 0.0001
                   l1_ratio=None,        # default - 0.15, must be within [0, 1]
                   fit_intercept=True,   # default - True
                   n_iter=5,             # default - 5
                   shuffle=None,         # default - False
                   random_state=None,    # default - None
                   epsilon=None,         # default - 0.1
                   learning_rate=None,   # default - 'invscaling'
                   eta0=None,            # default - 0.01
                   power_t=None,         # default - 0.5
                   warm_start=False,
                   verbose=False):

    def _name(msg):
        return '%s.%s_%s' % (name, 'sgdr', msg)

    rval = scope.sklearn_SGDRegressor(
        loss=hp.pchoice(_name('loss'), [
            (0.25, 'squared_loss'),
            (0.25, 'huber'),
            (0.25, 'epsilon_insensitive'),
            (0.25, 'squared_epsilon_insensitive')
        ]) if loss is None else loss,
        penalty=_sgd_penalty(_name('penalty')) if penalty is None else penalty,
        alpha=_sgd_alpha(_name('alpha')) if alpha is None else alpha,
        l1_ratio=(_sgd_l1_ratio(_name('l1ratio'))
                  if l1_ratio is None else l1_ratio),
        fit_intercept=fit_intercept,
        n_iter=n_iter,
        # For regression, use the SVM epsilon instead of the SGD one.
        epsilon=_svm_epsilon(_name('epsilon')) if epsilon is None else epsilon,
        learning_rate=(_sgdr_learning_rate(_name('learning_rate'))
                       if learning_rate is None else learning_rate),
        eta0=_sgd_eta0(_name('eta0')) if eta0 is None else eta0,
        power_t=_sgd_power_t(_name('power_t')) if power_t is None else power_t,
        verbose=verbose,
        random_state=_random_state(_name('rstate'), random_state),
    )
    return rval
def test_basic(self):
    space = hp.pchoice('naive_type', [
        (.14, 'gaussian'),
        (.02, 'multinomial'),
        (.84, 'bernoulli')])
    a, b, c = 0, 0, 0
    rng = np.random.RandomState(123)
    for i in range(0, 1000):
        nesto = hyperopt.pyll.stochastic.sample(space, rng=rng)
        if nesto == 'gaussian':
            a += 1
        elif nesto == 'multinomial':
            b += 1
        elif nesto == 'bernoulli':
            c += 1
    print((a, b, c))
    assert a + b + c == 1000
    assert 120 < a < 160
    assert 0 < b < 40
    assert 800 < c < 900
def nystrom(name, n_components=None, kernel=None,
            max_components=np.Inf, copy=True):

    def _name(msg):
        return '%s.%s_%s' % (name, 'nystrom', msg)

    rval = scope.sklearn_Nystrom(
        n_components=4 * scope.int(
            hp.qloguniform(
                name + '.n_components',
                low=np.log(0.51),
                high=np.log(min(max_components / 4, 30.5)),
                q=1.0)) if n_components is None else n_components,
        kernel=hp.pchoice(
            _name('kernel'),
            [(0.35, 'sigmoid'),
             (0.35, 'rbf'),
             (0.30, 'poly')]) if kernel is None else kernel,
        gamma=_svc_gamma('gamma'),
        coef0=hp.uniform(_name('coef0'), 0.0, 1.0)
    )
    return rval
def _knn_metric_p(name, sparse_data=False, metric=None, p=None):
    if sparse_data:
        return ('euclidean', 2)
    elif metric == 'euclidean':
        return (metric, 2)
    elif metric == 'manhattan':
        return (metric, 1)
    elif metric == 'chebyshev':
        return (metric, 0)
    elif metric == 'minkowski':
        assert p is not None
        return (metric, p)
    elif metric is None:
        return hp.pchoice(name, [
            (0.55, ('euclidean', 2)),
            (0.15, ('manhattan', 1)),
            (0.15, ('chebyshev', 0)),
            (0.15, ('minkowski', _knn_p(name + '.p'))),
        ])
    else:
        return (metric, p)  # undefined, simply return user input.
        'cd_epochs_2': hp.qloguniform('cd_epochs_2', np.log(1), np.log(1500), q=1),
        'sample_v0s_2': hp.choice('sample_v0s_2', [0, 1]),  # [False, True]
        'lr_anneal_2': hp.qloguniform('lr_anneal_2', np.log(10), np.log(10000), q=1)})

space = {'preproc': hp.choice('preproc', [{
             'preproc': 0}, {  # 'no'
             'preproc': 1,     # 'zca'
             'pca_energy': hp.uniform('pca_energy', .5, 1),
         }]),
         # Squash is fixed at logistic, no need for a hyperparameter
         'iseed': hp.choice('iseed', [0, 1, 2, 3]),  # [5, 6, 7, 8]
         # This was called nnet_features
         'depth': hp.pchoice('depth', [(0.5, 0),
                                       (0.25, layer1),
                                       (0.125, layer2),
                                       (0.125, layer3)]),
         # This is fine
         # 'nnet_features': hp.pchoice('nnet_features',
         #     [(.5, 0), (.25, 1), (.125, 2), (.125, 3)]),
         'batch_size': hp.choice('batch_size', [0, 1]),  # [20, 100]
         'lr': hp.lognormal('lr', np.log(.01), 3.),
         'lr_anneal_start': hp.qloguniform('lr_anneal_start',
                                           np.log(100), np.log(10000), q=1),
         'l2_penalty': hp.choice('l2_penalty', [{
             'l2_penalty': 0}, {  # Zero
             'l2_penalty': 1,     # notzero
             'l2_penalty_nz': hp.lognormal('l2_penalty_nz', np.log(1.0e-6), 2.)}])
         }
def preproc_space(
        sup_min_epochs=300,
        sup_max_epochs=2000,
        max_seconds=60 * 60,
):
    """
    Return a hyperopt-compatible pyll expression for a trained neural network.

    The trained neural network will have 0, 1, 2, or 3 hidden layers, and may
    have an affine first layer that does column normalization or PCA
    pre-processing.

    Each layer of the network will be pre-trained by some amount of
    contrastive divergence before being fine-tuned by SGD.

    The training program is built using stub literals `pyll_stubs.train_task`
    and `pyll_stubs.valid_task`.  When evaluating the pyll program, these
    literals must be replaced with skdata Task objects with
    `vector_classification` semantics.  See `skdata_learning_algo.py` for how
    to use the `use_obj_for_literal_in_memo` function to swap live Task
    objects in for these stubs.

    The search space described by this function corresponds to the DBN model
    used in [1] and [2].
    """
    train_task_x = scope.getattr(pyll_stubs.train_task, 'x')
    nnet0 = scope.NNet([], n_out=scope.getattr(train_task_x, 'shape')[1])
    nnet1 = hp.choice('preproc', [
        nnet0,                  # -- raw data
        scope.nnet_add_layers(  # -- ZCA of data
            nnet0,
            scope.zca_layer(
                train_task_x,
                energy=hp.uniform('pca_energy', .5, 1),
                eps=1e-14,
            )),
    ])
    param_seed = hp.choice('iseed', [5, 6, 7, 8])

    time_limit = scope.time() + max_seconds

    nnets = [nnet1]
    nnet_i_pt = nnet1
    for ii, cd_epochs_max in enumerate([3000, 2000, 1500]):
        layer = scope.random_sigmoid_layer(
            # -- hack to get different seeds for dif't layers
            seed=param_seed + cd_epochs_max,
            n_in=scope.getattr(nnet_i_pt, 'n_out'),
            n_out=hp.qloguniform('n_hid_%i' % ii,
                                 np.log(2**7),
                                 np.log(2**12),
                                 q=16),
            dist=hp.choice('W_idist_%i' % ii, ['uniform', 'normal']),
            scale_heuristic=hp.choice(
                'W_ialgo_%i' % ii, [
                    ('old', hp.lognormal('W_imult_%i' % ii, 0, 1)),
                    ('Glorot',)]),
            squash='logistic',
        )
        nnet_i_raw = scope.nnet_add_layer(nnet_i_pt, layer)
        # -- repeatedly calculating lower-layers wastes some CPU, but keeps
        #    memory usage much more stable across jobs (good for cluster)
        #    and the wasted CPU is not so much overall.
        nnet_i_pt = scope.nnet_pretrain_top_layer_cd(
            nnet_i_raw,
            train_task_x,
            lr=hp.lognormal('cd_lr_%i' % ii, np.log(.01), 2),
            seed=1 + hp.randint('cd_seed_%i' % ii, 10),
            n_epochs=hp.qloguniform('cd_epochs_%i' % ii,
                                    np.log(1),
                                    np.log(cd_epochs_max),
                                    q=1),
            # -- for whatever reason (?), this was fixed at 100
            batchsize=100,
            sample_v0s=hp.choice('sample_v0s_%i' % ii, [False, True]),
            lr_anneal_start=hp.qloguniform('lr_anneal_%i' % ii,
                                           np.log(10),
                                           np.log(10000),
                                           q=1),
            time_limit=time_limit,
        )
        nnets.append(nnet_i_pt)

    # this prior is not what I would do now, but it is what I did then...
    nnet_features = hp.pchoice(
        'depth',
        [(.5, nnets[0]),
         (.25, nnets[1]),
         (.125, nnets[2]),
         (.125, nnets[3])])

    sup_nnet = scope.nnet_add_layer(
        nnet_features,
        scope.zero_softmax_layer(
            n_in=scope.getattr(nnet_features, 'n_out'),
            n_out=scope.getattr(pyll_stubs.train_task, 'n_classes')))

    nnet4, report = scope.nnet_sgd_finetune_classifier(
        sup_nnet,
        pyll_stubs.train_task,
        pyll_stubs.valid_task,
        fixed_nnet=nnet1,
        max_epochs=sup_max_epochs,
        min_epochs=sup_min_epochs,
        batch_size=hp.choice('batch_size', [20, 100]),
        lr=hp.lognormal('lr', np.log(.01), 3.),
        lr_anneal_start=hp.qloguniform(
            'lr_anneal_start',
            np.log(100),
            np.log(10000),
            q=1),
        l2_penalty=hp.choice('l2_penalty', [
            0,
            hp.lognormal('l2_penalty_nz', np.log(1.0e-6), 2.)]),
        time_limit=time_limit,
    )

    return nnet4, report
import time
import argparse

# remove headers, footers, and citations from 20 newsgroups data
REMOVE_HEADERS = False
# use the default settings of TfidfVectorizer before doing optimization
PRE_VECTORIZE = False
# Record the test score for every evaluation point
# TEST_ALL_EVALS = True

suppress_output = False

optional_pca = hp.pchoice('preproc', [
    (0.8, [pca('pca')]),
    (0.1, [min_max_scaler('mms')]),
    (0.1, [])
])


def score(y1, y2):
    length = len(y1)
    correct = 0.0
    for i in range(length):
        if y1[i] == y2[i]:
            correct += 1.0
    return correct / length


# TODO: currently does not use seed for anything
def sklearn_newsgroups(classifier, algorithm, max_evals=100, seed=1,
                       filename='none', preproc=[], loss=None):
    global suppress_output
from hyperopt import hp
from hyperopt.pyll import scope
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier as DTree

if __name__ == '__main__':
    scope.define(GaussianNB)
    scope.define(SVC)

    C = hp.lognormal('svm_C', 0, 1)
    space = hp.pchoice('estimator', [
        (0.1, scope.GaussianNB()),
        (0.2, scope.SVC(C=C, kernel='linear'))])
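# Hedged standalone sketch reusing the scope.define pattern above: sampling
# the space yields ready-to-fit sklearn estimators. The weights here are
# renormalized to sum to 1, and the iris data is used purely for illustration.
import numpy as np
import hyperopt.pyll.stochastic
from hyperopt import hp
from hyperopt.pyll import scope
from sklearn.datasets import load_iris
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC

scope.define(GaussianNB)
scope.define(SVC)

C = hp.lognormal('svm_C', 0, 1)
space = hp.pchoice('estimator', [
    (0.3, scope.GaussianNB()),
    (0.7, scope.SVC(C=C, kernel='linear'))])

X, y = load_iris(return_X_y=True)
rng = np.random.RandomState(0)
for _ in range(3):
    est = hyperopt.pyll.stochastic.sample(space, rng=rng)
    print(type(est).__name__, est.fit(X, y).score(X, y))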
def sgd(name,
        loss=None,            # default - 'hinge'
        penalty=None,         # default - 'l2'
        alpha=None,           # default - 0.0001
        l1_ratio=None,        # default - 0.15, must be within [0, 1]
        fit_intercept=None,   # default - True
        n_iter=None,          # default - 5
        shuffle=None,         # default - False
        random_state=None,    # default - None
        epsilon=None,
        n_jobs=1,             # default - 1 (-1 means all CPUs)
        learning_rate=None,   # default - 'invscaling'
        eta0=None,            # default - 0.01
        power_t=None,         # default - 0.5
        class_weight=None,
        warm_start=False,
        verbose=False,
        ):

    def _name(msg):
        return '%s.%s_%s' % (name, 'sgd', msg)

    rval = scope.sklearn_SGDClassifier(
        loss=hp.pchoice(
            _name('loss'),
            [
                # (0.00, 'hinge'),  # no probability
                (0.5, 'log'),
                (0.5, 'modified_huber'),
                # (0.00, 'squared_hinge'),  # no probability
                # (0.05, 'perceptron'),
                # (0.05, 'squared_loss'),
                # (0.05, 'huber'),
                # (0.03, 'epsilon_insensitive'),
                # (0.02, 'squared_epsilon_insensitive'),
            ]) if loss is None else loss,
        penalty=hp.pchoice(
            _name('penalty'),
            [(0.60, 'l2'),
             (0.15, 'l1'),
             (0.25, 'elasticnet')]) if penalty is None else penalty,
        alpha=hp.loguniform(
            _name('alpha'),
            np.log(1e-5),
            np.log(1)) if alpha is None else alpha,
        l1_ratio=hp.uniform(
            _name('l1_ratio'), 0, 1) if l1_ratio is None else l1_ratio,
        fit_intercept=hp.pchoice(
            _name('fit_intercept'),
            [(0.8, True), (0.2, False)]) if fit_intercept is None else fit_intercept,
        learning_rate='invscaling' if learning_rate is None else learning_rate,
        eta0=hp.loguniform(
            _name('eta0'),
            np.log(1e-5),
            np.log(1e-1)) if eta0 is None else eta0,
        power_t=hp.uniform(
            _name('power_t'), 0, 1) if power_t is None else power_t,
        n_jobs=n_jobs,
        verbose=verbose,
    )
    return rval
def _grad_boosting_subsample(name):
    return hp.pchoice(name, [
        (0.2, 1.0),  # default choice.
        (0.8, hp.uniform(name + '.sgb', 0.5, 1.0))  # stochastic grad boosting.
    ])
def _sgd_penalty(name):
    return hp.pchoice(name, [
        (0.40, 'l2'),
        (0.35, 'l1'),
        (0.25, 'elasticnet')
    ])
def _sgdr_learning_rate(name):
    return hp.pchoice(name, [
        (0.50, 'invscaling'),
        (0.25, 'optimal'),
        (0.25, 'constant')
    ])