Example #1
def load_search_space(search_space):
    """
    Load the search space from the json file

    :param search_space: dictionary of the search space (insertable in a json file)
    :type dict:
    :return: dictionary for the search space (for scikit optimize)
    :rtype: dict
    """
    from skopt.space.space import Real, Categorical, Integer

    ss = dict()
    for key, spec in search_space.items():
        kind = spec[0]
        if kind == 'Real':
            ss[key] = Real(low=spec[1][0], high=spec[1][1], prior=spec[2])
        elif kind == 'Integer':
            ss[key] = Integer(low=spec[1][0], high=spec[1][1], prior=spec[2])
        elif kind == 'Categorical':
            ss[key] = Categorical(categories=spec[1])

    return ss
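
A minimal usage sketch (the keys and the [kind, bounds-or-categories, prior] layout below are illustrative, inferred from the parsing code above):

json_space = {
    'learning_rate': ['Real', [1e-5, 1e-1], 'log-uniform'],
    'n_estimators': ['Integer', [10, 500], 'uniform'],
    'criterion': ['Categorical', ['gini', 'entropy']],  # prior slot not needed
}
ss = load_search_space(json_space)
# ss now maps each key to a skopt Real, Integer or Categorical dimension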
Example #2
def extract_search_space(flat_search_config):
    """ Find the variable dimensions and convert them to a skopt search space.
    """
    search_space = OrderedDict()
    for k, v in flat_search_config.items():
        # Lists with more than one value are search dimensions
        if isinstance(v, list) and len(v) > 1:
            force_categorical = len(v) > 2

            # Dedupe the list, escaping specials, and sort smallest to largest
            ds = sorted({escape_special(u) for u in v})
            prior = flat_search_config.get(f'{k}__PRIOR', None)
            base = flat_search_config.get(f'{k}__BASE', 10)

            if force_categorical or isinstance(ds[0], str):
                transform = flat_search_config.get(f'{k}__TRANSFORM', 'onehot')
                dim = Categorical(ds, prior=prior, transform=transform, name=k)
            elif isinstance(ds[0], int):
                transform = flat_search_config.get(f'{k}__TRANSFORM', 'normalize')
                dim = Integer(*tuple(ds), prior=prior, transform=transform, base=base, name=k)
            elif isinstance(ds[0], float):
                transform = flat_search_config.get(f'{k}__TRANSFORM', 'normalize')
                dim = Real(*tuple(ds), prior=prior, transform=transform, base=base, name=k)

            search_space[k] = dim
    return search_space
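
This snippet assumes its project's surrounding imports (OrderedDict and the skopt dimension classes) plus an escape_special helper defined elsewhere. A hedged sketch of the flat-config convention it parses, with illustrative keys: a two-value list becomes numeric bounds, a longer list becomes categorical, and optional {key}__PRIOR, {key}__BASE and {key}__TRANSFORM entries tune the resulting dimension:

flat_search_config = {
    'lr': [1e-4, 1e-1],                      # two floats -> Real(1e-4, 1e-1)
    'lr__PRIOR': 'log-uniform',              # companion entry, read via .get('lr__PRIOR')
    'num_layers': [1, 4],                    # two ints -> Integer(1, 4)
    'activation': ['relu', 'tanh', 'gelu'],  # more than two values -> Categorical
}
space = extract_search_space(flat_search_config)
# -> OrderedDict with Real('lr'), Integer('num_layers'), Categorical('activation')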
Example #3
def test_mixed_categoricals2(initgen):
    space = Space([
        Categorical(name="x", categories=["1", "2", "3"]),
        Categorical(name="y", categories=[4, 5, 6])
    ])

    def objective(param_list):
        x = param_list[0]
        y = param_list[1]
        loss = int(x) + y
        return loss

    res = gp_minimize(objective,
                      space,
                      n_calls=12,
                      random_state=1,
                      initial_point_generator=initgen)
    assert res["x"] == ['1', 4]
Example #4
def test_custom_dimensions_for_bo():
    """Assert that the BO runs when custom dimensions are provided."""
    trainer = DirectRegressor(
        models="OLS",
        n_calls=5,
        bo_params={"dimensions": [Categorical([True, False], name="fit_intercept")]},
        random_state=1,
    )
    trainer.run(reg_train, reg_test)
    assert not trainer.ols.bo.empty
Example #5
def test_mixed_categoricals(initgen):
    space = Space([
        Categorical(name="x", categories=["1", "2", "3"]),
        Categorical(name="y", categories=[4, 5, 6]),
        Real(name="z", low=1.0, high=5.0)
    ])

    def objective(param_list):
        x = param_list[0]
        y = param_list[1]
        z = param_list[2]
        loss = int(x) + y * z
        return loss

    res = gp_minimize(objective,
                      space,
                      n_calls=12,
                      random_state=1,
                      initial_point_generator=initgen)
    assert res["x"] in [['1', 4, 1.0], ['2', 4, 1.0]]
Example #6
def map_dim(values):
    if isinstance(values, tuple):  # linear subspace
        low, high, n_steps, value_type = values

        if value_type == 'i':
            return Integer(low, high)
        elif value_type == 'f':
            return Real(low, high)
        else:
            raise ValueError(f'Unknown value type "{value_type}"')
    else:  # exhaustive list of options
        return Categorical(values)
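
The convention in use (values illustrative; note n_steps is unpacked but not forwarded to the skopt dimension):

map_dim((1, 100, 10, 'i'))    # -> Integer(1, 100)
map_dim((0.1, 1.0, 5, 'f'))   # -> Real(0.1, 1.0)
map_dim(['sgd', 'adam'])      # -> Categorical(['sgd', 'adam'])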
Example #7
def main(arguments):
    global train_mode
    EVALS = 50
    use_mp = True
    run_all = False
    selected_exp = []
    selected_datasets = []

    if '--build_datasets' in arguments:
        print('Building all necessary datasets required for the experiments. Disregarding other arguments! '
              'You will need to run this script again without --build_datasets in order to run experiments!')
        # Make all datasets
        for d in all_datasets:
            load_URMs(d, dataset_kwargs)
        return

    if '--no_mp' in arguments:
        print('No multiprocessing requested! Falling back to serial execution of experiments!')
        use_mp = False
        arguments.remove('--no_mp')

    if '--run_all' in arguments:
        print('All datasets selected for each algorithm!')
        selected_datasets = all_datasets
        run_all = True

    # user-based training
    if '--user' in arguments:
        train_mode = 'user'

    # item-based training
    if '--item' in arguments:
        train_mode = 'item'

    for arg in arguments:
        if not run_all and arg in name_datasets:
            selected_datasets.append(all_datasets[name_datasets.index(arg)])
        if arg in all_recommenders:
            selected_exp.append(arg)


    dict_rec_classes = {}
    dict_dimensions = {}
    dict_fit_params = {}
    dict_init_configs = {}


    # Experiment parameters
    # PureSVD parameters
    puresvd_dimensions = [
        Integer(1, 250, name='num_factors', dtype=int)
    ]
    puresvd_fit_params = [d.name for d in puresvd_dimensions]


    # iALS parameters
    ials_dimensions = [
        Integer(1, 250, name='num_factors', dtype=int),
        Categorical(["linear", "log"], name='confidence_scaling'),
        Real(low=1e-3, high=50, prior='log-uniform', name='alpha', dtype=float),
        Real(low=1e-5, high=1e-2, prior='log-uniform', name='reg', dtype=float),
        Real(low=1e-3, high=10.0, prior='log-uniform', name='epsilon', dtype=float)
    ]
    ials_fit_params = [d.name for d in ials_dimensions]


    # BPR parameters (150 epochs)
    bpr_dimensions = [
        Categorical([150], name='epochs'),
        Integer(1, 250, name='num_factors', dtype=int),
        Categorical([128, 256, 512, 1024], name='batch_size'),
        Categorical(["adagrad", "adam"], name='sgd_mode'),
        Real(low=1e-12, high=1e-3, prior='log-uniform', name='positive_reg'),
        Real(low=1e-12, high=1e-3, prior='log-uniform', name='negative_reg'),
        Real(low=1e-6, high=1e-2, prior='log-uniform', name='learning_rate'),
    ]
    bpr_fit_params = [d.name for d in bpr_dimensions]


    # NMF parameters
    nmf_dimensions = [
        Integer(1, 500, name='num_factors', dtype=int),
        Real(low=1e-5, high=1, prior='log-uniform', name='l1_ratio', dtype=float),
        Categorical(['coordinate_descent', 'multiplicative_update'], name='solver'),
        Categorical(['nndsvda'], name='init_type'),
        Categorical(['frobenius', 'kullback-leibler'], name='beta_loss')
    ]
    nmf_fit_params = [d.name for d in nmf_dimensions]


    # SLIM-BPR parameters (150 epochs)
    slimbpr_dimensions = [
        Integer(low=5, high=1000, prior='uniform', name='topK', dtype=int),
        Categorical([150], name='epochs'),
        Categorical([True, False], name='symmetric'),
        Categorical(["sgd", "adagrad", "adam"], name='sgd_mode'),
        Real(low=1e-9, high=1e-3, prior='log-uniform', name='lambda_i', dtype=float),
        Real(low=1e-9, high=1e-3, prior='log-uniform', name='lambda_j', dtype=float),
        Real(low=1e-4, high=1e-1, prior='log-uniform', name='learning_rate', dtype=float)
    ]
    slimbpr_fit_params = [d.name for d in slimbpr_dimensions]


    # CFGAN parameters
    cfgan_dimensions = [
        Categorical([300], name='epochs'),
        Integer(1, 5, prior='uniform', name='d_steps', dtype=int),
        Integer(1, 5, prior='uniform', name='g_steps', dtype=int),
        Integer(1, 5, prior='uniform', name='d_layers', dtype=int),
        Integer(1, 5, prior='uniform', name='g_layers', dtype=int),
        Categorical(['linear', 'tanh', 'sigmoid'], name='d_hidden_act'),
        Categorical(['linear', 'tanh', 'sigmoid'], name='g_hidden_act'),
        Categorical(['ZR', 'PM', 'ZP'], name='scheme'),
        Categorical([64, 128, 256, 512, 1024], name='d_batch_size'),
        Categorical([64, 128, 256, 512, 1024], name='g_batch_size'),
        Real(low=0, high=1, prior='uniform', name='zr_ratio', dtype=float),
        Real(low=0, high=1, prior='uniform', name='zp_ratio', dtype=float),
        Real(low=0, high=1, prior='uniform', name='zr_coefficient', dtype=float),
        Real(low=1e-4, high=1e-2, prior='log-uniform', name='d_lr', dtype=float),
        Real(low=1e-4, high=1e-2, prior='log-uniform', name='g_lr', dtype=float),
        Real(low=1e-6, high=1e-4, prior='log-uniform', name='d_reg', dtype=float),
        Real(low=1e-6, high=1e-4, prior='log-uniform', name='g_reg', dtype=float),
    ]
    cfgan_fit_params = [d.name for d in cfgan_dimensions]


    # GANMF parameters
    ganmf_dimensions = [
        Categorical([300], name='epochs'),
        Integer(low=1, high=250, name='num_factors', dtype=int),
        Categorical([64, 128, 256, 512, 1024], name='batch_size'),
        Integer(low=1, high=10, name='m', dtype=int),
        Real(low=1e-4, high=1e-2, prior='log-uniform', name='d_lr', dtype=float),
        Real(low=1e-4, high=1e-2, prior='log-uniform', name='g_lr', dtype=float),
        Real(low=1e-6, high=1e-4, prior='log-uniform', name='d_reg', dtype=float),
        Real(low=1e-2, high=0.5, prior='uniform', name='recon_coefficient', dtype=float),
        # Integer(5, 400, name='emb_dim', dtype=int),
        # Integer(1, 10, name='d_steps', dtype=int),
        # Integer(1, 10, name='g_steps', dtype=int),
        # Real(low=1e-6, high=1e-4, prior='log-uniform', name='g_reg', dtype=float),
    ]
    ganmf_fit_params = [d.name for d in ganmf_dimensions]


    # DisGANMF parameters
    disgan_dimensions = [
        Categorical([300], name='epochs'),
        Categorical(['linear', 'tanh', 'relu', 'sigmoid'], name='d_hidden_act'),
        Integer(low=1, high=5, prior='uniform', name='d_layers', dtype=int),
        Integer(low=1, high=250, name='num_factors', dtype=int),
        Categorical([64, 128, 256, 512, 1024], name='batch_size'),
        Real(low=1e-4, high=1e-2, prior='log-uniform', name='d_lr', dtype=float),
        Real(low=1e-4, high=1e-2, prior='log-uniform', name='g_lr', dtype=float),
        Real(low=1e-6, high=1e-4, prior='log-uniform', name='d_reg', dtype=float),
        Real(low=1e-2, high=0.5, prior='uniform', name='recon_coefficient', dtype=float)
    ]
    disgan_fit_params = [d.name for d in disgan_dimensions]


    # DeepGANMF parameters
    deepganmf_dimensions = [
        Categorical([300], name='epochs'),
        Categorical(['linear', 'tanh', 'relu', 'sigmoid'], name='d_hidden_act'),
        Categorical(['linear', 'tanh', 'relu', 'sigmoid'], name='g_hidden_act'),
        Categorical(['linear', 'tanh', 'relu', 'sigmoid'], name='g_output_act'),
        Categorical([1, 3, 5], name='d_layers'),
        Categorical([1, 2, 3, 4, 5], name='g_layers'),
        Categorical([64, 128, 256, 512, 1024], name='batch_size'),
        Integer(low=1, high=10, name='m', dtype=int),
        Real(low=1e-4, high=1e-2, prior='log-uniform', name='d_lr', dtype=float),
        Real(low=1e-4, high=1e-2, prior='log-uniform', name='g_lr', dtype=float),
        Real(low=1e-6, high=1e-4, prior='log-uniform', name='d_reg', dtype=float),
        Real(low=1e-2, high=0.5, prior='uniform', name='recon_coefficient', dtype=float),
    ]
    deepganmf_fit_params = [d.name for d in deepganmf_dimensions]



    dict_rec_classes['TopPop'] = TopPop
    dict_rec_classes['Random'] = Random
    dict_rec_classes['PureSVD'] = PureSVDRecommender
    dict_rec_classes['BPR'] = MatrixFactorization_BPR_Cython
    dict_rec_classes['ALS'] = IALSRecommender
    dict_rec_classes['NMF'] = NMFRecommender
    dict_rec_classes['GANMF'] = GANMF
    dict_rec_classes['CFGAN'] = CFGAN
    dict_rec_classes['DisGANMF'] = DisGANMF
    dict_rec_classes['SLIMBPR'] = SLIM_BPR_Cython
    dict_rec_classes['DeepGANMF'] = DeepGANMF

    dict_dimensions['TopPop'] = []
    dict_dimensions['Random'] = []
    dict_dimensions['PureSVD'] = puresvd_dimensions
    dict_dimensions['BPR'] = bpr_dimensions
    dict_dimensions['ALS'] = ials_dimensions
    dict_dimensions['NMF'] = nmf_dimensions
    dict_dimensions['GANMF'] = ganmf_dimensions
    dict_dimensions['CFGAN'] = cfgan_dimensions
    dict_dimensions['DisGANMF'] = disgan_dimensions
    dict_dimensions['SLIMBPR'] = slimbpr_dimensions
    dict_dimensions['DeepGANMF'] = deepganmf_dimensions

    dict_fit_params['TopPop'] = []
    dict_fit_params['Random'] = []
    dict_fit_params['PureSVD'] = puresvd_fit_params
    dict_fit_params['BPR'] = bpr_fit_params
    dict_fit_params['ALS'] = ials_fit_params
    dict_fit_params['NMF'] = nmf_fit_params
    dict_fit_params['GANMF'] = ganmf_fit_params
    dict_fit_params['CFGAN'] = cfgan_fit_params
    dict_fit_params['DisGANMF'] = disgan_fit_params
    dict_fit_params['SLIMBPR'] = slimbpr_fit_params
    dict_fit_params['DeepGANMF'] = deepganmf_fit_params

    pool_list_experiments = []
    pool_list_dimensions = []

    for exp in selected_exp:
        for d in selected_datasets:
            new_exp = RecSysExp(dict_rec_classes[exp], dataset=d, fit_param_names=dict_fit_params[exp],
                                method='bayesian', seed=seed)
            if use_mp:
                pool_list_experiments.append(new_exp)
                pool_list_dimensions.append(dict_dimensions[exp])
            else:
                new_exp.tune(dict_dimensions[exp], evals=EVALS,
                             init_config=dict_init_configs.get(exp))

    if use_mp:
        # Need to turn off MKL's own threading mechanism in order to use MP
        # https://github.com/joblib/joblib/issues/138
        os.environ['MKL_NUM_THREADS'] = '1'
        os.environ['OMP_NUM_THREADS'] = '1'
        os.environ['MKL_DYNAMIC'] = 'FALSE'
        
        pool = mp.Pool(initializer=set_affinity_on_worker)
        pool.starmap_async(run_exp, zip(pool_list_experiments, pool_list_dimensions, [EVALS]*len(pool_list_experiments)))
        pool.close()
        pool.join()
Example #8
# In[20]:

num_batches = total_songs // batch_size
num_batches

# In[21]:

curr_steps = np.sort(factors(num_batches))
# drop the last factor (num_batches itself) because it is not a useful step count
curr_steps = curr_steps[:-1]
curr_steps = curr_steps[curr_steps >= 10]
curr_steps

# In[22]:

currStepsSpace = Categorical(curr_steps)
learningRateSpace = Real(1e-5, 1e-2, "log-uniform")
inputProbSpace = Real(0.4, 1.0, "uniform")
hiddenProbSpace = Real(0.4, 1.0, "uniform")
l2RegSpace = Real(1e-3, 1., "log-uniform")
space = [
    currStepsSpace, learningRateSpace, inputProbSpace, hiddenProbSpace,
    l2RegSpace
]
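
gp_minimize will call the objective with a plain list of values in the same order as this space. A stand-in objective just to show the calling convention (hypothetical; the notebook's real objective trains the network):

def sketch_objective(params):
    # Order matches `space`: curr_steps, learning rate, input/hidden
    # keep-probabilities, L2 regularization strength.
    curr_step, learning_rate, input_prob, hidden_prob, l2_reg = params
    return learning_rate + l2_reg  # placeholder loss

# res = gp_minimize(sketch_objective, space, n_calls=15, random_state=0)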

# In[23]:


def saveStatsCollection(filename, key, stats):
    # Load the existing stats dict (np.save stores it as a 0-d object array,
    # so allow_pickle=True and [()] unwrap it), add the new entry, and re-save.
    statsCollection = np.load(filename, allow_pickle=True)[()] if os.path.isfile(filename) else dict()
    statsCollection[key] = stats
    np.save(filename, statsCollection)
Example #9
def startExperiment(parameters):
    """
    Starts an experiment with the given parameters

    :param parameters: parameters of the experiment
    :type parameters: Dict
    """

    optimizationPath = str(
        os.path.join(parameters["path"], parameters["experimentId"]))
    json_file = str(
        os.path.join(optimizationPath, parameters["experimentId"] + ".json"))
    if os.path.isfile(json_file):
        Optimizer = importOptimizer()
        optimizer = Optimizer()
        optimizer.resume_optimization(json_file)
    else:
        # Import dataset class and initialize an instance with the chosen dataset
        dataset_class = importDataset()
        dataset = dataset_class()
        dataset_path = str(
            os.path.join(pathDataset, "preprocessed_datasets",
                         parameters["dataset"]))
        dataset.load_custom_dataset_from_folder(dataset_path)

        model_class = importModel(parameters["model"]["name"])
        model = model_class()

        model.hyperparameters.update(parameters["model"]["parameters"])
        model.partitioning(parameters["partitioning"])

        search_space = {}

        for key, value in parameters["optimization"]["search_spaces"].items():
            if "low" in value:
                if isinstance(value["low"], float) or isinstance(
                        value["high"], float):
                    search_space[key] = Real(low=value["low"],
                                             high=value["high"])
                else:
                    search_space[key] = Integer(low=value["low"],
                                                high=value["high"])
            else:
                search_space[key] = Categorical(value)

        metric_parameters = parameters["optimize_metrics"][0]["parameters"]
        for key in metric_parameters:
            if metric_parameters[key] == "use dataset texts":
                metric_parameters[key] = dataset.get_corpus()
            elif metric_parameters[key] == "use selected dataset":
                metric_parameters[key] = dataset
            elif os.path.isdir(str(metric_parameters[key])):
                metricDataset = dataset_class()
                metricDataset.load_custom_dataset_from_folder(
                    metric_parameters[key])
                metric_parameters[key] = metricDataset.get_corpus()

        metric_class = importMetric(parameters["optimize_metrics"][0]["name"])
        metric = metric_class(**metric_parameters)

        metrics_to_track = []
        for single_metric in parameters["track_metrics"]:
            metric_class = importMetric(single_metric["name"])
            single_metric_parameters = single_metric["parameters"]
            for key in single_metric_parameters:
                if single_metric_parameters[key] == "use dataset texts":
                    single_metric_parameters[key] = dataset.get_corpus()
                elif single_metric_parameters[key] == "use selected dataset":
                    single_metric_parameters[key] = dataset
            new_metric = metric_class(**single_metric_parameters)
            metrics_to_track.append(new_metric)

        vocabulary_path = str(
            os.path.join(parameters["path"], parameters["experimentId"],
                         "models"))

        Path(vocabulary_path).mkdir(parents=True, exist_ok=True)

        vocabulary_path = str(os.path.join(vocabulary_path, "vocabulary.json"))

        with open(vocabulary_path, "w") as file:
            json.dump(dict(corpora.Dictionary(dataset.get_corpus())), file)

        Optimizer = importOptimizer()
        optimizer = Optimizer()
        optimizer.optimize(
            model,
            dataset,
            metric,
            search_space,
            metrics_to_track,
            random_state=True,
            initial_point_generator="random",
            surrogate_model=parameters["optimization"]["surrogate_model"],
            model_runs=parameters["optimization"]["model_runs"],
            n_random_starts=parameters["optimization"]["n_random_starts"],
            acq_func=parameters["optimization"]["acquisition_function"],
            number_of_call=parameters["optimization"]["iterations"],
            save_models=True,
            save_name=parameters["experimentId"],
            save_path=optimizationPath)

# Tail of the notebook's objective function (not part of startExperiment above);
# the training code that produces `stats` and `metric` is omitted in this snippet.
    statsCollection = np.load(filename, allow_pickle=True)[()] if os.path.isfile(filename) else dict()
    statsCollection[(state_size, num_steps, learning_rate)] = stats
    np.save(filename, statsCollection)

    if plotting:
        fig_1, ax_1, fig_2, ax_2 = plotStats(stats, DynStats.keys)
        plt.show()

    # We want to minimize the number of epochs required to reach 23% accuracy
    return metric


# In[13]:

stateSizeSpace = Integer(15, 1000)
numStepSpace = Categorical(numLens)
learningRateSpace = Real(1e-6, 1e-1, prior="log-uniform")
space = [stateSizeSpace, numStepSpace, learningRateSpace]
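
The tuple key (state_size, num_steps, learning_rate) saved into statsCollection above matches this space's order, so objective_min_epochs presumably unpacks its argument the same way. A hypothetical skeleton (the real function also trains the network and records stats):

def objective_sketch(params):
    state_size, num_steps, learning_rate = params  # same order as `space`
    # ... train, record stats, compute the metric ...
    return 0.0  # placeholder for the metric gp_minimize minimizes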


# In[14]:

if jupyterNotebookEnabled:
    get_ipython().magic(u'%time')

if not os.path.isfile(best_params_filename):
    if os.path.isfile(stats_coll_filename):
        os.remove(stats_coll_filename)
    
    res_gp = gp_minimize(
            func=objective_min_epochs, # function that we wish to minimise
Example #11
numeric_pipeline = Pipeline([('select_numeric', TypeSelector(dtype='number'))])

# processing pipeline
cat_num_featun = FeatureUnion([('categorical', categorical_pipeline),
                               ('numerical', numeric_pipeline)])

# combined pipeline
estimator_pipeline = Pipeline([('Features', feature_pipeline),
                               ('Categorical_Numeric', cat_num_featun),
                               ('Estimator', LogisticRegression(penalty="l1"))
                               ])

# search space
search_space = {
    "Estimator__C": Real(.000001, 2),
    "Estimator__class_weight": Categorical(['balanced', None]),
}

# scorer
metric = make_scorer(score_func=log_loss,
                     greater_is_better=False,
                     needs_proba=True,
                     labels=train['Category'].unique())

# cv
kfold_cv = KFold(n_splits=5, shuffle=True, random_state=42)

# bayessearch cv
bayes_tuned_pipeline = BayesSearchCV(estimator=estimator_pipeline,
                                     search_spaces=search_space,
                                     n_iter=10,