def search_space():
    # Create the search space for optimization
    search_space = {
        "alpha": Real(low=0.001, high=5.0),
        "eta": Real(low=0.001, high=5.0),
    }
    return search_space
def extract_search_space(flat_search_config):
    """
    Find the variable dimensions and convert them to a skopt search space.
    """
    search_space = OrderedDict()
    for k, v in flat_search_config.items():
        # Lists with more than one value are search dimensions
        if isinstance(v, list) and len(v) > 1:
            force_categorical = len(v) > 2
            # Dedupe the list, escaping specials, and sort smallest to largest
            ds = sorted({escape_special(u) for u in v})
            prior = flat_search_config.get(f'{k}__PRIOR', None)
            base = flat_search_config.get(f'{k}__BASE', 10)
            if force_categorical or isinstance(ds[0], str):
                transform = flat_search_config.get(f'{k}__TRANSFORM', 'onehot')
                dim = Categorical(ds, prior=prior, transform=transform, name=k)
            elif isinstance(ds[0], int):
                transform = flat_search_config.get(f'{k}__TRANSFORM', 'normalize')
                dim = Integer(*tuple(ds), prior=prior, transform=transform,
                              base=base, name=k)
            elif isinstance(ds[0], float):
                transform = flat_search_config.get(f'{k}__TRANSFORM', 'normalize')
                dim = Real(*tuple(ds), prior=prior, transform=transform,
                           base=base, name=k)
            search_space[k] = dim
    return search_space
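# --- illustrative usage sketch (not from the original source; the config keys
# below are invented, and escape_special is assumed to be the identity for
# plain values) ---
from collections import OrderedDict
from skopt.space.space import Real, Categorical

flat_config_example = {
    "lr": [1e-4, 1e-1],           # two numeric values -> range dimension
    "lr__PRIOR": "log-uniform",   # optional per-key prior override
    "batch_size": [32, 64, 128],  # more than two values -> forced Categorical
    "optimizer": "adam",          # scalar -> fixed value, not a dimension
}
# extract_search_space(flat_config_example) would return an OrderedDict of
# skopt dimensions, roughly equivalent to:
expected_space = OrderedDict([
    ("lr", Real(1e-4, 1e-1, prior="log-uniform", transform="normalize", name="lr")),
    ("batch_size", Categorical([32, 64, 128], transform="onehot", name="batch_size")),
])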
def load_search_space(search_space):
    """
    Load the search space from the json file

    :param search_space: dictionary of the search space (insertable in a json file)
    :type search_space: dict
    :return: dictionary for the search space (for scikit-optimize)
    :rtype: dict
    """
    from skopt.space.space import Real, Categorical, Integer
    ss = dict()
    for key in list(search_space.keys()):
        if search_space[key][0] == 'Real':
            ss[key] = Real(low=search_space[key][1][0],
                           high=search_space[key][1][1],
                           prior=search_space[key][2])
        elif search_space[key][0] == 'Integer':
            ss[key] = Integer(low=search_space[key][1][0],
                              high=search_space[key][1][1],
                              prior=search_space[key][2])
        elif search_space[key][0] == 'Categorical':
            ss[key] = Categorical(categories=search_space[key][1])
    return ss
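# --- illustrative input sketch (not from the original source) ---
# load_search_space expects each entry as [type, bounds-or-categories, prior];
# a hypothetical JSON-style dict matching that shape:
search_space_json_example = {
    "alpha": ["Real", [1e-3, 5.0], "log-uniform"],
    "num_topics": ["Integer", [10, 100], "uniform"],
    "solver": ["Categorical", ["adam", "sgd"]],  # prior entry unused here
}
# ss = load_search_space(search_space_json_example)
# -> {"alpha": Real(...), "num_topics": Integer(...), "solver": Categorical(...)}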
def create_skopt_space():
    from skopt.space.space import Real
    return [
        Real(1e-10, 1, prior="log-uniform"),
        (0.1, 0.9),
    ], [
        "lr",
        "momentum",
    ]
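# --- illustrative usage sketch (not from the original source) ---
# The function returns parallel lists of dimensions and parameter names;
# zipping them gives a name -> dimension mapping, e.g. for logging or for
# building a keyword search space:
dims, names = create_skopt_space()
named_space = dict(zip(names, dims))  # {"lr": Real(...), "momentum": (0.1, 0.9)}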
def test_warm_start_detection(self):
    parameter_grid = {"alpha": Real(1e-4, 1e-1, 1)}
    from sklearn.ensemble import RandomForestClassifier
    clf = RandomForestClassifier(max_depth=2, random_state=0)
    tune_search = TuneSearchCV(
        clf, parameter_grid, n_jobs=1, max_iters=10, local_dir="./test-result")
    self.assertFalse(tune_search._can_early_stop())

    from sklearn.tree import DecisionTreeClassifier
    clf = DecisionTreeClassifier(random_state=0)
    tune_search2 = TuneSearchCV(
        clf, parameter_grid, n_jobs=1, max_iters=10, local_dir="./test-result")
    self.assertFalse(tune_search2._can_early_stop())

    from sklearn.linear_model import LogisticRegression
    clf = LogisticRegression()
    tune_search3 = TuneSearchCV(
        clf, parameter_grid, n_jobs=1, max_iters=10, local_dir="./test-result")
    self.assertTrue(tune_search3._can_early_stop())
def test_warm_start_detection(self):
    parameter_grid = {"alpha": Real(1e-4, 1e-1, 1)}
    from sklearn.ensemble import VotingClassifier, RandomForestClassifier
    clf = VotingClassifier(estimators=[
        ("rf", RandomForestClassifier(n_estimators=50, random_state=0))])
    tune_search = TuneSearchCV(
        clf, parameter_grid, n_jobs=1, max_iters=10, local_dir="./test-result")
    self.assertEqual(tune_search.early_stop_type, EarlyStopping.NO_EARLY_STOP)

    from sklearn.tree import DecisionTreeClassifier
    clf = DecisionTreeClassifier(random_state=0)
    tune_search2 = TuneSearchCV(
        clf, parameter_grid, n_jobs=1, max_iters=10, local_dir="./test-result")
    self.assertEqual(tune_search2.early_stop_type, EarlyStopping.NO_EARLY_STOP)

    from sklearn.linear_model import LogisticRegression
    clf = LogisticRegression()
    tune_search3 = TuneSearchCV(
        clf, parameter_grid, n_jobs=1, max_iters=10, local_dir="./test-result")
    self.assertEqual(tune_search3.early_stop_type, EarlyStopping.NO_EARLY_STOP)

    tune_search4 = TuneSearchCV(
        clf, parameter_grid, early_stopping=True, n_jobs=1, max_iters=10,
        local_dir="./test-result")
    self.assertEqual(tune_search4.early_stop_type,
                     EarlyStopping.WARM_START_ITER)

    clf = RandomForestClassifier()
    tune_search5 = TuneSearchCV(
        clf, parameter_grid, early_stopping=True, n_jobs=1, max_iters=10,
        local_dir="./test-result")
    self.assertEqual(tune_search5.early_stop_type,
                     EarlyStopping.WARM_START_ENSEMBLE)
def test_local_mode(self):
    digits = datasets.load_digits()
    x = digits.data
    y = digits.target
    clf = SGDClassifier()
    parameter_grid = {
        "alpha": Real(1e-4, 1e-1, 1),
        "epsilon": Real(0.01, 0.1)
    }
    tune_search = TuneSearchCV(
        clf, parameter_grid, n_jobs=1, max_iters=10, local_dir="./test-result")
    import ray
    with patch.object(ray, "init", wraps=ray.init) as wrapped_init:
        tune_search.fit(x, y)
    self.assertTrue(wrapped_init.call_args[1]["local_mode"])
def test_local_dir(self):
    digits = datasets.load_digits()
    x = digits.data
    y = digits.target
    clf = SGDClassifier()
    parameter_grid = {
        "alpha": Real(1e-4, 1e-1, 1),
        "epsilon": Real(0.01, 0.1)
    }
    scheduler = MedianStoppingRule(grace_period=10.0)
    tune_search = TuneSearchCV(
        clf, parameter_grid, early_stopping=scheduler, max_iters=10,
        local_dir="./test-result")
    tune_search.fit(x, y)
    self.assertTrue(len(os.listdir("./test-result")) != 0)
def map_dim(values):
    if isinstance(values, tuple):
        # linear subspace
        low, high, n_steps, value_type = values
        if value_type == 'i':
            return Integer(low, high)
        elif value_type == 'f':
            return Real(low, high)
        else:
            raise ValueError(f'Unknown value type "{value_type}"')
    else:
        # exhaustive list of options
        return Categorical(values)
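# --- illustrative usage sketch (not from the original source) ---
from skopt.space.space import Real, Integer, Categorical  # assumed imports for map_dim

# Tuples describe a typed linear subspace (n_steps is carried but unused here);
# anything else is treated as an exhaustive list of options:
int_dim = map_dim((1, 100, 10, 'i'))           # -> Integer(1, 100)
real_dim = map_dim((0.0, 1.0, 10, 'f'))        # -> Real(0.0, 1.0)
cat_dim = map_dim(['adam', 'sgd', 'rmsprop'])  # -> Categorical(...)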
def test_warn_reduce_maxiters(self):
    parameter_grid = {"alpha": Real(1e-4, 1e-1, 1)}
    from sklearn.ensemble import RandomForestClassifier
    clf = RandomForestClassifier(max_depth=2, random_state=0)
    with self.assertWarnsRegex(UserWarning, "max_iters is set"):
        TuneSearchCV(
            clf, parameter_grid, max_iters=10, local_dir="./test-result")
    with self.assertWarnsRegex(UserWarning, "max_iters is set"):
        TuneSearchCV(
            SGDClassifier(), parameter_grid, max_iters=10,
            local_dir="./test-result")
def create_skopt_space():
    from skopt.space.space import Real
    return [
        Real(1e-10, 1, prior="log-uniform"),
        (0.1, 0.9),
        (0.1, 0.7),
        (32, 700),
        (32, 256),
    ], [
        "lr",
        "momentum",
        "drop_out",
        "hidden_layer1",
        "hidden_layer2",
    ]
def test_warm_start_error(self):
    parameter_grid = {"alpha": Real(1e-4, 1e-1, 1)}
    from sklearn.ensemble import VotingClassifier, RandomForestClassifier
    clf = VotingClassifier(estimators=[
        ("rf", RandomForestClassifier(n_estimators=50, random_state=0))])
    tune_search = TuneSearchCV(
        clf, parameter_grid, n_jobs=1, early_stopping=False, max_iters=10,
        local_dir="./test-result")
    self.assertFalse(tune_search._can_early_stop())
    with self.assertRaises(ValueError):
        tune_search = TuneSearchCV(
            clf, parameter_grid, n_jobs=1, early_stopping=True, max_iters=10,
            local_dir="./test-result")

    from sklearn.linear_model import LogisticRegression
    clf = LogisticRegression()
    with self.assertRaises(ValueError):
        parameter_grid = {"max_iter": [1, 2]}
        TuneSearchCV(
            clf, parameter_grid, early_stopping=True, n_jobs=1, max_iters=10,
            local_dir="./test-result")

    from sklearn.ensemble import RandomForestClassifier
    clf = RandomForestClassifier()
    with self.assertRaises(ValueError):
        parameter_grid = {"n_estimators": [1, 2]}
        TuneSearchCV(
            clf, parameter_grid, early_stopping=True, n_jobs=1, max_iters=10,
            local_dir="./test-result")
def test_warm_start_error(self):
    parameter_grid = {"alpha": Real(1e-4, 1e-1, 1)}
    from sklearn.ensemble import RandomForestClassifier
    clf = RandomForestClassifier(max_depth=2, random_state=0)
    tune_search = TuneSearchCV(
        clf, parameter_grid, n_jobs=1, early_stopping=False, max_iters=10,
        local_dir="./test-result")
    self.assertFalse(tune_search._can_early_stop())
    with self.assertRaises(ValueError):
        tune_search = TuneSearchCV(
            clf, parameter_grid, n_jobs=1, early_stopping=True, max_iters=10,
            local_dir="./test-result")
def create_skopt_space():
    from skopt.space.space import Real
    return [
        Real(1e-3, 1, prior="log-uniform"),
        (0.1, 0.9),
        (0.01, 0.1),
        (0.05, 0.2),
        (32, 128),
        (64, 256),
        (128, 1024),
    ], [
        "lr",
        "momentum",
        "dropout_1",
        "dropout_2",
        "conv_1",
        "conv_2",
        "dense_1",
    ]
def test_mixed_categoricals(initgen):
    space = Space([
        Categorical(name="x", categories=["1", "2", "3"]),
        Categorical(name="y", categories=[4, 5, 6]),
        Real(name="z", low=1.0, high=5.0)
    ])

    def objective(param_list):
        x = param_list[0]
        y = param_list[1]
        z = param_list[2]
        loss = int(x) + y * z
        return loss

    res = gp_minimize(objective, space, n_calls=12, random_state=1,
                      initial_point_generator=initgen)
    assert res["x"] in [['1', 4, 1.0], ['2', 4, 1.0]]
def main(arguments):
    global train_mode

    EVALS = 50
    use_mp = True
    run_all = False
    selected_exp = []
    selected_datasets = []

    if '--build_datasets' in arguments:
        print('Building all necessary datasets required for the experiments. Disregarding other arguments! '
              + 'You will need to run this script again without --build_datasets in order to run experiments!')
        # Make all datasets
        for d in all_datasets:
            load_URMs(d, dataset_kwargs)
        return

    if '--no_mp' in arguments:
        print('No multiprocessing requested! Falling back to serial execution of experiments!')
        use_mp = False
        arguments.remove('--no_mp')

    if '--run_all' in arguments:
        print('All datasets selected for each algorithm!')
        selected_datasets = all_datasets
        run_all = True

    # user-based training
    if '--user' in arguments:
        train_mode = 'user'

    # item-based training
    if '--item' in arguments:
        train_mode = 'item'

    for arg in arguments:
        if not run_all and arg in name_datasets:
            selected_datasets.append(all_datasets[name_datasets.index(arg)])
        if arg in all_recommenders:
            selected_exp.append(arg)

    dict_rec_classes = {}
    dict_dimensions = {}
    dict_fit_params = {}
    dict_init_configs = {}

    # Experiment parameters

    # PureSVD parameters
    puresvd_dimensions = [
        Integer(1, 250, name='num_factors', dtype=int)
    ]
    puresvd_fit_params = [d.name for d in puresvd_dimensions]

    # iALS parameters
    ials_dimensions = [
        Integer(1, 250, name='num_factors', dtype=int),
        Categorical(["linear", "log"], name='confidence_scaling'),
        Real(low=1e-3, high=50, prior='log-uniform', name='alpha', dtype=float),
        Real(low=1e-5, high=1e-2, prior='log-uniform', name='reg', dtype=float),
        Real(low=1e-3, high=10.0, prior='log-uniform', name='epsilon', dtype=float)
    ]
    ials_fit_params = [d.name for d in ials_dimensions]

    # BPR parameters, 150 epochs
    bpr_dimensions = [
        Categorical([150], name='epochs'),
        Integer(1, 250, name='num_factors', dtype=int),
        Categorical([128, 256, 512, 1024], name='batch_size'),
        Categorical(["adagrad", "adam"], name='sgd_mode'),
        Real(low=1e-12, high=1e-3, prior='log-uniform', name='positive_reg'),
        Real(low=1e-12, high=1e-3, prior='log-uniform', name='negative_reg'),
        Real(low=1e-6, high=1e-2, prior='log-uniform', name='learning_rate'),
    ]
    bpr_fit_params = [d.name for d in bpr_dimensions]

    # NMF parameters
    nmf_dimensions = [
        Integer(1, 500, name='num_factors', dtype=int),
        Real(low=1e-5, high=1, prior='log-uniform', name='l1_ratio', dtype=float),
        Categorical(['coordinate_descent', 'multiplicative_update'], name='solver'),
        Categorical(['nndsvda'], name='init_type'),
        Categorical(['frobenius', 'kullback-leibler'], name='beta_loss')
    ]
    nmf_fit_params = [d.name for d in nmf_dimensions]

    # SLIM-BPR parameters, 150 epochs
    slimbpr_dimensions = [
        Integer(low=5, high=1000, prior='uniform', name='topK', dtype=int),
        Categorical([150], name='epochs'),
        Categorical([True, False], name='symmetric'),
        Categorical(["sgd", "adagrad", "adam"], name='sgd_mode'),
        Real(low=1e-9, high=1e-3, prior='log-uniform', name='lambda_i', dtype=float),
        Real(low=1e-9, high=1e-3, prior='log-uniform', name='lambda_j', dtype=float),
        Real(low=1e-4, high=1e-1, prior='log-uniform', name='learning_rate', dtype=float)
    ]
    slimbpr_fit_names = [d.name for d in slimbpr_dimensions]

    # CFGAN parameters
    cfgan_dimensions = [
        Categorical([300], name='epochs'),
        Integer(1, 5, prior='uniform', name='d_steps', dtype=int),
        Integer(1, 5, prior='uniform', name='g_steps', dtype=int),
        Integer(1, 5, prior='uniform', name='d_layers', dtype=int),
        Integer(1, 5, prior='uniform', name='g_layers', dtype=int),
        Categorical(['linear', 'tanh', 'sigmoid'], name='d_hidden_act'),
        Categorical(['linear', 'tanh', 'sigmoid'], name='g_hidden_act'),
        Categorical(['ZR', 'PM', 'ZP'], name='scheme'),
        Categorical([64, 128, 256, 512, 1024], name='d_batch_size'),
        Categorical([64, 128, 256, 512, 1024], name='g_batch_size'),
        Real(low=0, high=1, prior='uniform', name='zr_ratio', dtype=float),
        Real(low=0, high=1, prior='uniform', name='zp_ratio', dtype=float),
        Real(low=0, high=1, prior='uniform', name='zr_coefficient', dtype=float),
        Real(low=1e-4, high=1e-2, prior='log-uniform', name='d_lr', dtype=float),
        Real(low=1e-4, high=1e-2, prior='log-uniform', name='g_lr', dtype=float),
        Real(low=1e-6, high=1e-4, prior='log-uniform', name='d_reg', dtype=float),
        Real(low=1e-6, high=1e-4, prior='log-uniform', name='g_reg', dtype=float),
    ]
    cfgan_fit_params = [d.name for d in cfgan_dimensions]

    # GANMF parameters
    ganmf_dimensions = [
        Categorical([300], name='epochs'),
        Integer(low=1, high=250, name='num_factors', dtype=int),
        Categorical([64, 128, 256, 512, 1024], name='batch_size'),
        Integer(low=1, high=10, name='m', dtype=int),
        Real(low=1e-4, high=1e-2, prior='log-uniform', name='d_lr', dtype=float),
        Real(low=1e-4, high=1e-2, prior='log-uniform', name='g_lr', dtype=float),
        Real(low=1e-6, high=1e-4, prior='log-uniform', name='d_reg', dtype=float),
        Real(low=1e-2, high=0.5, prior='uniform', name='recon_coefficient', dtype=float),
        # Integer(5, 400, name='emb_dim', dtype=int),
        # Integer(1, 10, name='d_steps', dtype=int),
        # Integer(1, 10, name='g_steps', dtype=int),
        # Real(low=1e-6, high=1e-4, prior='log-uniform', name='g_reg', dtype=float),
    ]
    ganmf_fit_params = [d.name for d in ganmf_dimensions]

    # DisGANMF parameters
    disgan_dimensions = [
        Categorical([300], name='epochs'),
        Categorical(['linear', 'tanh', 'relu', 'sigmoid'], name='d_hidden_act'),
        Integer(low=1, high=5, prior='uniform', name='d_layers', dtype=int),
        Integer(low=1, high=250, name='num_factors', dtype=int),
        Categorical([64, 128, 256, 512, 1024], name='batch_size'),
        Real(low=1e-4, high=1e-2, prior='log-uniform', name='d_lr', dtype=float),
        Real(low=1e-4, high=1e-2, prior='log-uniform', name='g_lr', dtype=float),
        Real(low=1e-6, high=1e-4, prior='log-uniform', name='d_reg', dtype=float),
        Real(low=1e-2, high=0.5, prior='uniform', name='recon_coefficient', dtype=float)
    ]
    disgan_fit_params = [d.name for d in disgan_dimensions]

    # DeepGANMF parameters
    deepganmf_dimensions = [
        Categorical([300], name='epochs'),
        Categorical(['linear', 'tanh', 'relu', 'sigmoid'], name='d_hidden_act'),
        Categorical(['linear', 'tanh', 'relu', 'sigmoid'], name='g_hidden_act'),
        Categorical(['linear', 'tanh', 'relu', 'sigmoid'], name='g_output_act'),
        Categorical([1, 3, 5], name='d_layers'),
        Categorical([1, 2, 3, 4, 5], name='g_layers'),
        Categorical([64, 128, 256, 512, 1024], name='batch_size'),
        Integer(low=1, high=10, name='m', dtype=int),
        Real(low=1e-4, high=1e-2, prior='log-uniform', name='d_lr', dtype=float),
        Real(low=1e-4, high=1e-2, prior='log-uniform', name='g_lr', dtype=float),
        Real(low=1e-6, high=1e-4, prior='log-uniform', name='d_reg', dtype=float),
        Real(low=1e-2, high=0.5, prior='uniform', name='recon_coefficient', dtype=float),
    ]
    deepganmf_fit_params = [d.name for d in deepganmf_dimensions]

    dict_rec_classes['TopPop'] = TopPop
    dict_rec_classes['Random'] = Random
    dict_rec_classes['PureSVD'] = PureSVDRecommender
    dict_rec_classes['BPR'] = MatrixFactorization_BPR_Cython
    dict_rec_classes['ALS'] = IALSRecommender
    dict_rec_classes['NMF'] = NMFRecommender
    dict_rec_classes['GANMF'] = GANMF
    dict_rec_classes['CFGAN'] = CFGAN
    dict_rec_classes['DisGANMF'] = DisGANMF
    dict_rec_classes['SLIMBPR'] = SLIM_BPR_Cython
    dict_rec_classes['DeepGANMF'] = DeepGANMF

    dict_dimensions['TopPop'] = []
    dict_dimensions['Random'] = []
    dict_dimensions['PureSVD'] = puresvd_dimensions
    dict_dimensions['BPR'] = bpr_dimensions
    dict_dimensions['ALS'] = ials_dimensions
    dict_dimensions['NMF'] = nmf_dimensions
    dict_dimensions['GANMF'] = ganmf_dimensions
    dict_dimensions['CFGAN'] = cfgan_dimensions
    dict_dimensions['DisGANMF'] = disgan_dimensions
    dict_dimensions['SLIMBPR'] = slimbpr_dimensions
    dict_dimensions['DeepGANMF'] = deepganmf_dimensions

    dict_fit_params['TopPop'] = []
    dict_fit_params['Random'] = []
    dict_fit_params['PureSVD'] = puresvd_fit_params
    dict_fit_params['BPR'] = bpr_fit_params
    dict_fit_params['ALS'] = ials_fit_params
    dict_fit_params['NMF'] = nmf_fit_params
    dict_fit_params['GANMF'] = ganmf_fit_params
    dict_fit_params['CFGAN'] = cfgan_fit_params
    dict_fit_params['DisGANMF'] = disgan_fit_params
    dict_fit_params['SLIMBPR'] = slimbpr_fit_names
    dict_fit_params['DeepGANMF'] = deepganmf_fit_params

    pool_list_experiments = []
    pool_list_dimensions = []

    for exp in selected_exp:
        for d in selected_datasets:
            new_exp = RecSysExp(dict_rec_classes[exp], dataset=d,
                                fit_param_names=dict_fit_params[exp],
                                method='bayesian', seed=seed)
            if use_mp:
                pool_list_experiments.append(new_exp)
                pool_list_dimensions.append(dict_dimensions[exp])
            else:
                new_exp.tune(dict_dimensions[exp], evals=EVALS,
                             init_config=dict_init_configs[exp] if exp in dict_init_configs else None)

    if use_mp:
        # Need to turn off MKL's own threading mechanism in order to use MP
        # https://github.com/joblib/joblib/issues/138
        os.environ['MKL_NUM_THREADS'] = '1'
        os.environ['OMP_NUM_THREADS'] = '1'
        os.environ['MKL_DYNAMIC'] = 'FALSE'

        pool = mp.Pool(initializer=set_affinity_on_worker)
        pool.starmap_async(run_exp, zip(pool_list_experiments, pool_list_dimensions,
                                        [EVALS] * len(pool_list_experiments)))
        pool.close()
        pool.join()
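# --- hypothetical helper sketches (run_exp and set_affinity_on_worker are
# referenced by main() above but not shown in this excerpt; these are assumed
# minimal implementations, not the originals) ---
def run_exp(experiment, dimensions, evals):
    # Each pool worker tunes one experiment over its search dimensions.
    experiment.tune(dimensions, evals=evals)

def set_affinity_on_worker():
    # Pool initializer; commonly used to reset CPU affinity so workers are
    # not pinned to a single core after the MKL/OMP thread restrictions.
    pass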
num_batches = total_songs // batch_size
num_batches

# In[21]:

curr_steps = np.sort(factors(num_batches))
# drop the last factor because it does not make sense as a step count
curr_steps = curr_steps[:-1]
curr_steps = curr_steps[curr_steps >= 10]
curr_steps

# In[22]:

currStepsSpace = Categorical(curr_steps)
learningRateSpace = Real(1e-5, 1e-2, "log-uniform")
inputProbSpace = Real(0.4, 1.0, "uniform")
hiddenProbSpace = Real(0.4, 1.0, "uniform")
l2RegSpace = Real(1e-3, 1., "log-uniform")

space = [
    currStepsSpace,
    learningRateSpace,
    inputProbSpace,
    hiddenProbSpace,
    l2RegSpace
]

# In[23]:

def saveStatsCollection(filename, key, stats):
    statsCollection = np.load(filename)[()] if os.path.isfile(filename) else dict()
    statsCollection[key] = stats
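# --- hypothetical helper sketch (factors() is called above but not defined in
# this excerpt; an assumed minimal implementation) ---
import numpy as np

def factors(n):
    # All positive divisors of n; np.sort above orders them ascending.
    return np.array([i for i in range(1, n + 1) if n % i == 0])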
if plotting:
    fig_1, ax_1, fig_2, ax_2 = plotStats(stats, keys)
    plt.show()

validAccs = stats[:, -1]
length10percent = len(validAccs) // 10
best10percent = np.sort(validAccs)[-length10percent:]
# We want to maximise the MEAN validation accuracy,
# i.e. minimise minus
return -np.mean(best10percent)

# In[14]:

inputKeepProbSpace = Real(0.5, 1.0, "uniform")
hiddenKeepProbSpace = Real(0.5, 1.0, "uniform")
hiddenDimSpace = Integer(20, 2000)
lamda2Space = Real(1e-3, 10, "log-uniform")

space = [inputKeepProbSpace, hiddenKeepProbSpace, hiddenDimSpace, lamda2Space]

# TARGET IS 58% as the original Deep Neural Net

# In[15]:

if jupyterNotebookEnabled:
    get_ipython().magic(u'%time')

# this might crash, so you may need to run it outside as a python file
# (File -> Save as Python)
if not os.path.isfile(res_gp_save_filename):
    if os.path.isfile(statsCollectionFilename):
    model.add(layers.Dropout(dropout))
    model.add(layers.Flatten())
    model.add(layers.Dense(256, activation='relu'))
    model.add(layers.Dense(10))

    adam = optimizers.Adam(lr, beta1, beta2)
    model.compile(optimizer=adam,
                  loss=losses.SparseCategoricalCrossentropy(from_logits=True),
                  metrics=['accuracy'])
    return model


if __name__ == "__main__":
    dim_lr = Real(low=1e-4, high=5e-3, prior='log-uniform', name='lr')
    dim_b1 = Real(low=0.7, high=0.99, name='beta1')
    dim_b2 = Real(low=0.9, high=0.999, name='beta2')
    dim_drop = Real(low=0.25, high=0.75, name='dropout')
    dimensions = [dim_lr, dim_b1, dim_b2, dim_drop]
    default_param = [0.0005, 0.75, 0.95, 0.4]
    best_accuracy = 0

    @use_named_args(dimensions=dimensions)
    def fitness(lr, beta1, beta2, dropout):
        model = create_model(lr, beta1, beta2, dropout)
        history = model.fit(train_data, train_labels, epochs=5,
                            validation_data=(test_data, test_labels))
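    # --- illustrative continuation sketch (not from the original source) ---
    # The snippet above is truncated; with a completed `fitness` returning a
    # negative validation accuracy, the decorated function and `dimensions`
    # would typically be passed to skopt's gp_minimize, e.g.:
    #
    #     from skopt import gp_minimize
    #     res = gp_minimize(func=fitness,
    #                       dimensions=dimensions,
    #                       x0=default_param,  # start from the default configuration
    #                       n_calls=20)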
from tune_sklearn import TuneSearchCV
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from skopt.space.space import Real

digits = datasets.load_digits()
X = digits.data
y = digits.target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

space = {
    "n_estimators": (100, 200),
    "min_weight_fraction_leaf": Real(0.0, 0.5),
    "min_samples_leaf": (1, 5)
}

tune_search = TuneSearchCV(
    RandomForestClassifier(), space, search_optimization="bayesian",
    n_iter=3, max_iters=10)
tune_search.fit(X_train, y_train)

print(tune_search.cv_results_)
print(tune_search.best_params_)
def startExperiment(parameters):
    """
    Starts an experiment with the given parameters

    :param parameters: parameters of the experiment
    :type parameters: Dict
    """
    optimizationPath = str(
        os.path.join(parameters["path"], parameters["experimentId"]))
    json_file = str(
        os.path.join(optimizationPath, parameters["experimentId"] + ".json"))
    if os.path.isfile(json_file):
        Optimizer = importOptimizer()
        optimizer = Optimizer()
        optimizer.resume_optimization(json_file)
    else:
        # Import dataset class and initialize an instance with the chosen dataset
        dataset_class = importDataset()
        dataset = dataset_class()
        dataset_path = str(
            os.path.join(pathDataset, "preprocessed_datasets",
                         parameters["dataset"]))
        dataset.load_custom_dataset_from_folder(dataset_path)

        model_class = importModel(parameters["model"]["name"])
        model = model_class()
        model.hyperparameters.update(parameters["model"]["parameters"])
        model.partitioning(parameters["partitioning"])

        search_space = {}
        for key, value in parameters["optimization"]["search_spaces"].items():
            if "low" in value:
                if isinstance(value["low"], float) or isinstance(
                        value["high"], float):
                    search_space[key] = Real(low=value["low"], high=value["high"])
                else:
                    search_space[key] = Integer(low=value["low"], high=value["high"])
            else:
                search_space[key] = Categorical(value)

        metric_parameters = parameters["optimize_metrics"][0]["parameters"]
        for key in metric_parameters:
            if metric_parameters[key] == "use dataset texts":
                metric_parameters[key] = dataset.get_corpus()
            elif metric_parameters[key] == "use selected dataset":
                metric_parameters[key] = dataset
            elif os.path.isdir(str(metric_parameters[key])):
                metricDataset = dataset_class()
                metricDataset.load_custom_dataset_from_folder(
                    metric_parameters[key])
                metric_parameters[key] = metricDataset.get_corpus()

        metric_class = importMetric(parameters["optimize_metrics"][0]["name"])
        metric = metric_class(**metric_parameters)

        metrics_to_track = []
        for single_metric in parameters["track_metrics"]:
            metric_class = importMetric(single_metric["name"])
            single_metric_parameters = single_metric["parameters"]
            for key in single_metric_parameters:
                if single_metric_parameters[key] == "use dataset texts":
                    single_metric_parameters[key] = dataset.get_corpus()
                elif single_metric_parameters[key] == "use selected dataset":
                    single_metric_parameters[key] = dataset
            new_metric = metric_class(**single_metric_parameters)
            metrics_to_track.append(new_metric)

        vocabulary_path = str(
            os.path.join(parameters["path"], parameters["experimentId"],
                         "models"))
        Path(vocabulary_path).mkdir(parents=True, exist_ok=True)
        vocabulary_path = str(os.path.join(vocabulary_path, "vocabulary.json"))

        file = open(vocabulary_path, "w")
        json.dump(dict(corpora.Dictionary(dataset.get_corpus())), file)
        file.close()

        Optimizer = importOptimizer()
        optimizer = Optimizer()
        optimizer.optimize(
            model, dataset, metric, search_space, metrics_to_track,
            random_state=True,
            initial_point_generator="random",
            surrogate_model=parameters["optimization"]["surrogate_model"],
            model_runs=parameters["optimization"]["model_runs"],
            n_random_starts=parameters["optimization"]["n_random_starts"],
            acq_func=parameters["optimization"]["acquisition_function"],
            number_of_call=parameters["optimization"]["iterations"],
            save_models=True,
            save_name=parameters["experimentId"],
            save_path=optimizationPath)
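# --- illustrative config sketch (not from the original source) ---
# The search-space loop above expects each entry of
# parameters["optimization"]["search_spaces"] to carry either numeric bounds
# or an explicit category list; a hypothetical fragment:
#
#     "search_spaces": {
#         "num_topics": {"low": 10, "high": 100},     # ints -> Integer
#         "alpha": {"low": 0.001, "high": 5.0},       # floats -> Real
#         "activation": ["relu", "tanh", "sigmoid"]   # list -> Categorical
#     }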
# numerical pipeline
numeric_pipeline = Pipeline([('select_numeric', TypeSelector(dtype='number'))])

# processing pipeline
cat_num_featun = FeatureUnion([('categorical', categorical_pipeline),
                               ('numerical', numeric_pipeline)])

# combined pipeline
estimator_pipeline = Pipeline([('Features', feature_pipeline),
                               ('Categorical_Numeric', cat_num_featun),
                               ('Estimator', LogisticRegression(penalty="l1"))])

# search space
search_space = {
    "Estimator__C": Real(.000001, 2),
    "Estimator__class_weight": Categorical(['balanced', None]),
}

# scorer
metric = make_scorer(score_func=log_loss, greater_is_better=False,
                     needs_proba=True, labels=train['Category'].unique())

# cv
kfold_cv = KFold(n_splits=5, shuffle=True, random_state=42)

# bayessearch cv
bayes_tuned_pipeline = BayesSearchCV(estimator=estimator_pipeline,
                                     search_spaces=search_space,
statsCollection[(state_size, num_steps, learning_rate)] = stats
np.save(filename, statsCollection)

if plotting:
    fig_1, ax_1, fig_2, ax_2 = plotStats(stats, DynStats.keys)
    plt.show()

# We want to minimize the amount of epochs required to reach 23% accuracy
return metric

# In[13]:

stateSizeSpace = Integer(15, 1000)
numStepSpace = Categorical(numLens)
learningRateSpace = Real(1e-6, 1e-1, prior="log-uniform")

space = [stateSizeSpace, numStepSpace, learningRateSpace]

# In[14]:

if jupyterNotebookEnabled:
    get_ipython().magic(u'%time')

if not os.path.isfile(best_params_filename):
    if os.path.isfile(stats_coll_filename):
        os.remove(stats_coll_filename)

    res_gp = gp_minimize(
        func=objective_min_epochs,  # function that we wish to minimise
        dimensions=space,  # the search space for the hyper-parameters