def load_search_space(search_space):
    """
    Load the search space from the json file

    :param search_space: dictionary of the search space (insertable in a json file)
    :type search_space: dict
    :return: dictionary of the search space (for scikit-optimize)
    :rtype: dict
    """
    from skopt.space.space import Real, Categorical, Integer

    ss = dict()
    for key in list(search_space.keys()):
        if search_space[key][0] == 'Real':
            ss[key] = Real(low=search_space[key][1][0], high=search_space[key][1][1],
                           prior=search_space[key][2])
        elif search_space[key][0] == 'Integer':
            ss[key] = Integer(low=search_space[key][1][0], high=search_space[key][1][1],
                              prior=search_space[key][2])
        elif search_space[key][0] == 'Categorical':
            ss[key] = Categorical(categories=search_space[key][1])
    return ss
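# Usage sketch with hypothetical values, assuming the JSON layout implied above:
# each entry is [type, bounds_or_categories, prior]; the prior element is only
# read for 'Real' and 'Integer' entries.
raw_space = {
    "learning_rate": ["Real", [1e-5, 1e-1], "log-uniform"],
    "num_layers": ["Integer", [1, 5], "uniform"],
    "activation": ["Categorical", ["relu", "tanh"]],
}
skopt_space = load_search_space(raw_space)  # {'learning_rate': Real(...), ...}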
def extract_search_space(flat_search_config):
    """ Find the variable dimensions and convert them to a skopt search space. """
    search_space = OrderedDict()
    for k, v in flat_search_config.items():
        # Lists with more than one value are search dimensions
        if isinstance(v, list) and len(v) > 1:
            force_categorical = len(v) > 2
            # Dedupe the list, escaping specials, and sort smallest to largest
            ds = sorted({escape_special(u) for u in v})
            prior = flat_search_config.get(f'{k}__PRIOR', None)
            base = flat_search_config.get(f'{k}__BASE', 10)
            if force_categorical or isinstance(ds[0], str):
                transform = flat_search_config.get(f'{k}__TRANSFORM', 'onehot')
                dim = Categorical(ds, prior=prior, transform=transform, name=k)
            elif isinstance(ds[0], int):
                transform = flat_search_config.get(f'{k}__TRANSFORM', 'normalize')
                dim = Integer(*tuple(ds), prior=prior, transform=transform, base=base, name=k)
            elif isinstance(ds[0], float):
                transform = flat_search_config.get(f'{k}__TRANSFORM', 'normalize')
                dim = Real(*tuple(ds), prior=prior, transform=transform, base=base, name=k)
            search_space[k] = dim
    return search_space
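# Usage sketch with a hypothetical flat config; keys with the __PRIOR, __BASE and
# __TRANSFORM suffixes tune the matching dimension. escape_special comes from the
# surrounding module; the placeholder below exists only so this sketch runs
# stand-alone and is assumed to pass plain values through unchanged.
from collections import OrderedDict
from skopt.space import Categorical, Integer, Real

def escape_special(u):  # placeholder stand-in, not the project's helper
    return u

flat_config = {
    "lr": [1e-4, 1e-1],                       # two floats -> Real dimension
    "lr__PRIOR": "log-uniform",               # optional prior via the __PRIOR suffix
    "layers": [1, 4],                         # two ints -> Integer dimension
    "layers__PRIOR": "uniform",
    "optimizer": ["adam", "sgd", "rmsprop"],  # more than two values -> Categorical
    "batch_size": 64,                         # single value -> fixed, not searched
}
space = extract_search_space(flat_config)     # OrderedDict of skopt dimensions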
def test_mixed_categoricals2(initgen):
    space = Space([
        Categorical(name="x", categories=["1", "2", "3"]),
        Categorical(name="y", categories=[4, 5, 6])
    ])

    def objective(param_list):
        x = param_list[0]
        y = param_list[1]
        loss = int(x) + y
        return loss

    res = gp_minimize(objective, space, n_calls=12, random_state=1,
                      initial_point_generator=initgen)
    assert res["x"] == ['1', 4]
def test_custom_dimensions_for_bo():
    """Assert that the BO runs when custom dimensions are provided."""
    trainer = DirectRegressor(
        models="OLS",
        n_calls=5,
        bo_params={"dimensions": [Categorical([True, False], name="fit_intercept")]},
        random_state=1,
    )
    trainer.run(reg_train, reg_test)
    assert not trainer.ols.bo.empty
def test_mixed_categoricals(initgen):
    space = Space([
        Categorical(name="x", categories=["1", "2", "3"]),
        Categorical(name="y", categories=[4, 5, 6]),
        Real(name="z", low=1.0, high=5.0)
    ])

    def objective(param_list):
        x = param_list[0]
        y = param_list[1]
        z = param_list[2]
        loss = int(x) + y * z
        return loss

    res = gp_minimize(objective, space, n_calls=12, random_state=1,
                      initial_point_generator=initgen)
    assert res["x"] in [['1', 4, 1.0], ['2', 4, 1.0]]
def map_dim(values):
    if isinstance(values, tuple):
        # linear subspace
        low, high, n_steps, value_type = values
        if value_type == 'i':
            return Integer(low, high)
        elif value_type == 'f':
            return Real(low, high)
        else:
            raise ValueError(f'Unknown value type "{value_type}"')
    else:
        # exhaustive list of options
        return Categorical(values)
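# Usage sketch with hypothetical values: tuples describe bounded numeric ranges as
# (low, high, n_steps, value_type), where n_steps is unpacked but currently unused;
# plain lists become exhaustive categorical choices.
from skopt.space import Categorical, Integer, Real  # needed by map_dim

dims = [
    map_dim((1, 100, 10, 'i')),       # Integer(1, 100)
    map_dim((1e-4, 1e-1, 10, 'f')),   # Real(0.0001, 0.1)
    map_dim(['adam', 'sgd']),         # Categorical(['adam', 'sgd'])
]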
def main(arguments):
    global train_mode

    EVALS = 50
    use_mp = True
    run_all = False
    selected_exp = []
    selected_datasets = []

    if '--build_datasets' in arguments:
        print('Building all necessary datasets required for the experiments. Disregarding other arguments! ' +
              'You will need to run this script again without --build_datasets in order to run experiments!')
        # Make all datasets
        for d in all_datasets:
            load_URMs(d, dataset_kwargs)
        return

    if '--no_mp' in arguments:
        print('No multiprocessing requested! Falling back to serial execution of experiments!')
        use_mp = False
        arguments.remove('--no_mp')

    if '--run_all' in arguments:
        print('All datasets selected for each algorithm!')
        selected_datasets = all_datasets
        run_all = True

    # User-based training
    if '--user' in arguments:
        train_mode = 'user'

    # Item-based training
    if '--item' in arguments:
        train_mode = 'item'

    for arg in arguments:
        if not run_all and arg in name_datasets:
            selected_datasets.append(all_datasets[name_datasets.index(arg)])
        if arg in all_recommenders:
            selected_exp.append(arg)

    dict_rec_classes = {}
    dict_dimensions = {}
    dict_fit_params = {}
    dict_init_configs = {}

    # Experiment parameters

    # PureSVD hyperparameters
    puresvd_dimensions = [
        Integer(1, 250, name='num_factors', dtype=int)
    ]
    puresvd_fit_params = [d.name for d in puresvd_dimensions]

    # ALS hyperparameters
    ials_dimensions = [
        Integer(1, 250, name='num_factors', dtype=int),
        Categorical(["linear", "log"], name='confidence_scaling'),
        Real(low=1e-3, high=50, prior='log-uniform', name='alpha', dtype=float),
        Real(low=1e-5, high=1e-2, prior='log-uniform', name='reg', dtype=float),
        Real(low=1e-3, high=10.0, prior='log-uniform', name='epsilon', dtype=float)
    ]
    ials_fit_params = [d.name for d in ials_dimensions]

    # BPR hyperparameters, 150 epochs
    bpr_dimensions = [
        Categorical([150], name='epochs'),
        Integer(1, 250, name='num_factors', dtype=int),
        Categorical([128, 256, 512, 1024], name='batch_size'),
        Categorical(["adagrad", "adam"], name='sgd_mode'),
        Real(low=1e-12, high=1e-3, prior='log-uniform', name='positive_reg'),
        Real(low=1e-12, high=1e-3, prior='log-uniform', name='negative_reg'),
        Real(low=1e-6, high=1e-2, prior='log-uniform', name='learning_rate'),
    ]
    bpr_fit_params = [d.name for d in bpr_dimensions]

    # NMF hyperparameters
    nmf_dimensions = [
        Integer(1, 500, name='num_factors', dtype=int),
        Real(low=1e-5, high=1, prior='log-uniform', name='l1_ratio', dtype=float),
        Categorical(['coordinate_descent', 'multiplicative_update'], name='solver'),
        Categorical(['nndsvda'], name='init_type'),
        Categorical(['frobenius', 'kullback-leibler'], name='beta_loss')
    ]
    nmf_fit_params = [d.name for d in nmf_dimensions]

    # SLIM-BPR hyperparameters, 150 epochs
    slimbpr_dimensions = [
        Integer(low=5, high=1000, prior='uniform', name='topK', dtype=int),
        Categorical([150], name='epochs'),
        Categorical([True, False], name='symmetric'),
        Categorical(["sgd", "adagrad", "adam"], name='sgd_mode'),
        Real(low=1e-9, high=1e-3, prior='log-uniform', name='lambda_i', dtype=float),
        Real(low=1e-9, high=1e-3, prior='log-uniform', name='lambda_j', dtype=float),
        Real(low=1e-4, high=1e-1, prior='log-uniform', name='learning_rate', dtype=float)
    ]
    slimbpr_fit_names = [d.name for d in slimbpr_dimensions]

    # CFGAN hyperparameters
    cfgan_dimensions = [
        Categorical([300], name='epochs'),
        Integer(1, 5, prior='uniform', name='d_steps', dtype=int),
        Integer(1, 5, prior='uniform', name='g_steps', dtype=int),
        Integer(1, 5, prior='uniform', name='d_layers', dtype=int),
        Integer(1, 5, prior='uniform', name='g_layers', dtype=int),
        Categorical(['linear', 'tanh', 'sigmoid'], name='d_hidden_act'),
        Categorical(['linear', 'tanh', 'sigmoid'], name='g_hidden_act'),
        Categorical(['ZR', 'PM', 'ZP'], name='scheme'),
        Categorical([64, 128, 256, 512, 1024], name='d_batch_size'),
        Categorical([64, 128, 256, 512, 1024], name='g_batch_size'),
        Real(low=0, high=1, prior='uniform', name='zr_ratio', dtype=float),
        Real(low=0, high=1, prior='uniform', name='zp_ratio', dtype=float),
        Real(low=0, high=1, prior='uniform', name='zr_coefficient', dtype=float),
        Real(low=1e-4, high=1e-2, prior='log-uniform', name='d_lr', dtype=float),
        Real(low=1e-4, high=1e-2, prior='log-uniform', name='g_lr', dtype=float),
        Real(low=1e-6, high=1e-4, prior='log-uniform', name='d_reg', dtype=float),
        Real(low=1e-6, high=1e-4, prior='log-uniform', name='g_reg', dtype=float),
    ]
    cfgan_fit_params = [d.name for d in cfgan_dimensions]

    # GANMF hyperparameters
    ganmf_dimensions = [
        Categorical([300], name='epochs'),
        Integer(low=1, high=250, name='num_factors', dtype=int),
        Categorical([64, 128, 256, 512, 1024], name='batch_size'),
        Integer(low=1, high=10, name='m', dtype=int),
        Real(low=1e-4, high=1e-2, prior='log-uniform', name='d_lr', dtype=float),
        Real(low=1e-4, high=1e-2, prior='log-uniform', name='g_lr', dtype=float),
        Real(low=1e-6, high=1e-4, prior='log-uniform', name='d_reg', dtype=float),
        Real(low=1e-2, high=0.5, prior='uniform', name='recon_coefficient', dtype=float),
        # Integer(5, 400, name='emb_dim', dtype=int),
        # Integer(1, 10, name='d_steps', dtype=int),
        # Integer(1, 10, name='g_steps', dtype=int),
        # Real(low=1e-6, high=1e-4, prior='log-uniform', name='g_reg', dtype=float),
    ]
    ganmf_fit_params = [d.name for d in ganmf_dimensions]

    # DisGANMF hyperparameters
    disgan_dimensions = [
        Categorical([300], name='epochs'),
        Categorical(['linear', 'tanh', 'relu', 'sigmoid'], name='d_hidden_act'),
        Integer(low=1, high=5, prior='uniform', name='d_layers', dtype=int),
        Integer(low=1, high=250, name='num_factors', dtype=int),
        Categorical([64, 128, 256, 512, 1024], name='batch_size'),
        Real(low=1e-4, high=1e-2, prior='log-uniform', name='d_lr', dtype=float),
        Real(low=1e-4, high=1e-2, prior='log-uniform', name='g_lr', dtype=float),
        Real(low=1e-6, high=1e-4, prior='log-uniform', name='d_reg', dtype=float),
        Real(low=1e-2, high=0.5, prior='uniform', name='recon_coefficient', dtype=float)
    ]
    disgan_fit_params = [d.name for d in disgan_dimensions]

    # DeepGANMF hyperparameters
    deepganmf_dimensions = [
        Categorical([300], name='epochs'),
        Categorical(['linear', 'tanh', 'relu', 'sigmoid'], name='d_hidden_act'),
        Categorical(['linear', 'tanh', 'relu', 'sigmoid'], name='g_hidden_act'),
        Categorical(['linear', 'tanh', 'relu', 'sigmoid'], name='g_output_act'),
        Categorical([1, 3, 5], name='d_layers'),
        Categorical([1, 2, 3, 4, 5], name='g_layers'),
        Categorical([64, 128, 256, 512, 1024], name='batch_size'),
        Integer(low=1, high=10, name='m', dtype=int),
        Real(low=1e-4, high=1e-2, prior='log-uniform', name='d_lr', dtype=float),
        Real(low=1e-4, high=1e-2, prior='log-uniform', name='g_lr', dtype=float),
        Real(low=1e-6, high=1e-4, prior='log-uniform', name='d_reg', dtype=float),
        Real(low=1e-2, high=0.5, prior='uniform', name='recon_coefficient', dtype=float),
    ]
    deepganmf_fit_params = [d.name for d in deepganmf_dimensions]

    dict_rec_classes['TopPop'] = TopPop
    dict_rec_classes['Random'] = Random
    dict_rec_classes['PureSVD'] = PureSVDRecommender
    dict_rec_classes['BPR'] = MatrixFactorization_BPR_Cython
    dict_rec_classes['ALS'] = IALSRecommender
    dict_rec_classes['NMF'] = NMFRecommender
    dict_rec_classes['GANMF'] = GANMF
    dict_rec_classes['CFGAN'] = CFGAN
    dict_rec_classes['DisGANMF'] = DisGANMF
    dict_rec_classes['SLIMBPR'] = SLIM_BPR_Cython
    dict_rec_classes['DeepGANMF'] = DeepGANMF
    dict_dimensions['TopPop'] = []
    dict_dimensions['Random'] = []
    dict_dimensions['PureSVD'] = puresvd_dimensions
    dict_dimensions['BPR'] = bpr_dimensions
    dict_dimensions['ALS'] = ials_dimensions
    dict_dimensions['NMF'] = nmf_dimensions
    dict_dimensions['GANMF'] = ganmf_dimensions
    dict_dimensions['CFGAN'] = cfgan_dimensions
    dict_dimensions['DisGANMF'] = disgan_dimensions
    dict_dimensions['SLIMBPR'] = slimbpr_dimensions
    dict_dimensions['DeepGANMF'] = deepganmf_dimensions

    dict_fit_params['TopPop'] = []
    dict_fit_params['Random'] = []
    dict_fit_params['PureSVD'] = puresvd_fit_params
    dict_fit_params['BPR'] = bpr_fit_params
    dict_fit_params['ALS'] = ials_fit_params
    dict_fit_params['NMF'] = nmf_fit_params
    dict_fit_params['GANMF'] = ganmf_fit_params
    dict_fit_params['CFGAN'] = cfgan_fit_params
    dict_fit_params['DisGANMF'] = disgan_fit_params
    dict_fit_params['SLIMBPR'] = slimbpr_fit_names
    dict_fit_params['DeepGANMF'] = deepganmf_fit_params

    pool_list_experiments = []
    pool_list_dimensions = []

    for exp in selected_exp:
        for d in selected_datasets:
            new_exp = RecSysExp(dict_rec_classes[exp], dataset=d,
                                fit_param_names=dict_fit_params[exp],
                                method='bayesian', seed=seed)
            if use_mp:
                pool_list_experiments.append(new_exp)
                pool_list_dimensions.append(dict_dimensions[exp])
            else:
                new_exp.tune(dict_dimensions[exp], evals=EVALS,
                             init_config=dict_init_configs[exp] if exp in dict_init_configs else None)

    if use_mp:
        # Need to turn off MKL's own threading mechanism in order to use MP
        # https://github.com/joblib/joblib/issues/138
        os.environ['MKL_NUM_THREADS'] = '1'
        os.environ['OMP_NUM_THREADS'] = '1'
        os.environ['MKL_DYNAMIC'] = 'FALSE'

        pool = mp.Pool(initializer=set_affinity_on_worker)
        pool.starmap_async(run_exp, zip(pool_list_experiments, pool_list_dimensions,
                                        [EVALS] * len(pool_list_experiments)))
        pool.close()
        pool.join()
# In[20]:

num_batches = total_songs // batch_size
num_batches


# In[21]:

curr_steps = np.sort(factors(num_batches))
# Drop the last factor because it does not make any sense as a step count
curr_steps = curr_steps[:-1]
curr_steps = curr_steps[curr_steps >= 10]
curr_steps


# In[22]:

currStepsSpace = Categorical(curr_steps)
learningRateSpace = Real(1e-5, 1e-2, "log-uniform")
inputProbSpace = Real(0.4, 1.0, "uniform")
hiddenProbSpace = Real(0.4, 1.0, "uniform")
l2RegSpace = Real(1e-3, 1., "log-uniform")

space = [
    currStepsSpace,
    learningRateSpace,
    inputProbSpace,
    hiddenProbSpace,
    l2RegSpace
]


# In[23]:

def saveStatsCollection(filename, key, stats):
    statsCollection = np.load(filename)[()] if os.path.isfile(filename) else dict()
def startExperiment(parameters):
    """
    Starts an experiment with the given parameters

    :param parameters: parameters of the experiment
    :type parameters: Dict
    """
    optimizationPath = str(
        os.path.join(parameters["path"], parameters["experimentId"]))
    json_file = str(
        os.path.join(optimizationPath, parameters["experimentId"] + ".json"))
    if os.path.isfile(json_file):
        Optimizer = importOptimizer()
        optimizer = Optimizer()
        optimizer.resume_optimization(json_file)
    else:
        # Import dataset class and initialize an instance with the chosen dataset
        dataset_class = importDataset()
        dataset = dataset_class()
        dataset_path = str(
            os.path.join(pathDataset, "preprocessed_datasets", parameters["dataset"]))
        dataset.load_custom_dataset_from_folder(dataset_path)

        model_class = importModel(parameters["model"]["name"])
        model = model_class()
        model.hyperparameters.update(parameters["model"]["parameters"])
        model.partitioning(parameters["partitioning"])

        search_space = {}
        for key, value in parameters["optimization"]["search_spaces"].items():
            if "low" in value:
                if isinstance(value["low"], float) or isinstance(value["high"], float):
                    search_space[key] = Real(low=value["low"], high=value["high"])
                else:
                    search_space[key] = Integer(low=value["low"], high=value["high"])
            else:
                search_space[key] = Categorical(value)

        metric_parameters = parameters["optimize_metrics"][0]["parameters"]
        for key in metric_parameters:
            if metric_parameters[key] == "use dataset texts":
                metric_parameters[key] = dataset.get_corpus()
            elif metric_parameters[key] == "use selected dataset":
                metric_parameters[key] = dataset
            elif os.path.isdir(str(metric_parameters[key])):
                metricDataset = dataset_class()
                metricDataset.load_custom_dataset_from_folder(metric_parameters[key])
                metric_parameters[key] = metricDataset.get_corpus()

        metric_class = importMetric(parameters["optimize_metrics"][0]["name"])
        metric = metric_class(**metric_parameters)

        metrics_to_track = []
        for single_metric in parameters["track_metrics"]:
            metric_class = importMetric(single_metric["name"])
            single_metric_parameters = single_metric["parameters"]
            for key in single_metric_parameters:
                if single_metric_parameters[key] == "use dataset texts":
                    single_metric_parameters[key] = dataset.get_corpus()
                elif single_metric_parameters[key] == "use selected dataset":
                    single_metric_parameters[key] = dataset
            new_metric = metric_class(**single_metric_parameters)
            metrics_to_track.append(new_metric)

        vocabulary_path = str(
            os.path.join(parameters["path"], parameters["experimentId"], "models"))
        Path(vocabulary_path).mkdir(parents=True, exist_ok=True)
        vocabulary_path = str(os.path.join(vocabulary_path, "vocabulary.json"))

        file = open(vocabulary_path, "w")
        json.dump(dict(corpora.Dictionary(dataset.get_corpus())), file)
        file.close()

        Optimizer = importOptimizer()
        optimizer = Optimizer()
        optimizer.optimize(
            model, dataset, metric, search_space, metrics_to_track,
            random_state=True,
            initial_point_generator="random",
            surrogate_model=parameters["optimization"]["surrogate_model"],
            model_runs=parameters["optimization"]["model_runs"],
            n_random_starts=parameters["optimization"]["n_random_starts"],
            acq_func=parameters["optimization"]["acquisition_function"],
            number_of_call=parameters["optimization"]["iterations"],
            save_models=True,
            save_name=parameters["experimentId"],
            save_path=optimizationPath)
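# For reference, a minimal (hypothetical) "search_spaces" block that the parsing
# loop above would accept: entries with "low"/"high" become Real or Integer
# depending on value type, anything else becomes Categorical.
example_search_spaces = {
    "num_topics": {"low": 5, "high": 50},   # ints  -> Integer dimension
    "alpha": {"low": 0.01, "high": 1.0},    # float -> Real dimension
    "activation": ["softplus", "relu"],     # no "low" key -> Categorical
}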
    statsCollection = np.load(filename)[()] if os.path.isfile(filename) else dict()
    statsCollection[(state_size, num_steps, learning_rate)] = stats
    np.save(filename, statsCollection)

    if plotting:
        fig_1, ax_1, fig_2, ax_2 = plotStats(stats, DynStats.keys)
        plt.show()

    # We want to minimize the number of epochs required to reach 23% accuracy
    return metric


# In[13]:

stateSizeSpace = Integer(15, 1000)
numStepSpace = Categorical(numLens)
learningRateSpace = Real(1e-6, 1e-1, prior="log-uniform")

space = [stateSizeSpace, numStepSpace, learningRateSpace]


# In[14]:

if jupyterNotebookEnabled:
    get_ipython().magic(u'%time')

if not os.path.isfile(best_params_filename):
    if os.path.isfile(stats_coll_filename):
        os.remove(stats_coll_filename)

    res_gp = gp_minimize(
        func=objective_min_epochs,  # function that we wish to minimise
numeric_pipeline = Pipeline([('select_numeric', TypeSelector(dtype='number'))])

# Processing pipeline
cat_num_featun = FeatureUnion([('categorical', categorical_pipeline),
                               ('numerical', numeric_pipeline)])

# Combined pipeline
estimator_pipeline = Pipeline([('Features', feature_pipeline),
                               ('Categorical_Numeric', cat_num_featun),
                               ('Estimator', LogisticRegression(penalty="l1"))
                               ])

# Search space
search_space = {
    "Estimator__C": Real(.000001, 2),
    "Estimator__class_weight": Categorical(['balanced', None]),
}

# Scorer
metric = make_scorer(score_func=log_loss, greater_is_better=False, needs_proba=True,
                     labels=train['Category'].unique())

# CV
kfold_cv = KFold(n_splits=5, shuffle=True, random_state=42)

# BayesSearchCV
bayes_tuned_pipeline = BayesSearchCV(estimator=estimator_pipeline,
                                     search_spaces=search_space,
                                     n_iter=10,