Example 1
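A minimal sketch of the module-level context this example assumes: the search-space classes (Integer, Real, Categorical, with their prior/dtype arguments) match skopt.space, while the recommender classes, RecSysExp, run_exp, set_affinity_on_worker, load_URMs, all_datasets, dataset_kwargs, train_mode, and seed are assumed to be defined elsewhere in the surrounding project:

import os
import multiprocessing as mp

# Assumed source of the search-space classes used below
from skopt.space import Integer, Real, Categorical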
def main(arguments):
    global train_mode
    EVALS = 50
    use_mp = True
    run_all = False
    selected_exp = []
    selected_datasets = []

    list_experiments = [
        'TopPop', 'Random', 'PureSVD', 'ALS', 'NMF', 'BPR', 'IRGAN', 'CFGAN',
        'GANMF'
    ]
    list_datasets = [
        d if isinstance(d, str) else d.DATASET_NAME for d in all_datasets
    ]

    if '--build_datasets' in arguments:
        print('Building all necessary datasets required for the experiments. '
              'Disregarding other arguments! You will need to run this script '
              'again without --build_datasets in order to run experiments!')
        # Make all datasets
        for d in all_datasets:
            load_URMs(d, dataset_kwargs)
        return

    if '--no_mp' in arguments:
        print(
            'No multiprocessing requested! Falling back to serial execution of experiments!'
        )
        use_mp = False
        arguments.remove('--no_mp')

    if '--run_all' in arguments:
        print('All datasets selected for each algorithm!')
        selected_datasets = all_datasets
        run_all = True

    if '--user' in arguments:
        train_mode = 'user'

    if '--item' in arguments:
        train_mode = 'item'

    for arg in arguments:
        if not run_all and arg in list_datasets:
            selected_datasets.append(all_datasets[list_datasets.index(arg)])
        if arg in list_experiments:
            selected_exp.append(arg)

    dict_rec_classes = {}
    dict_dimensions = {}
    dict_fit_params = {}
    dict_init_configs = {}

    # Experiment parameters
    puresvd_dimensions = [Integer(1, 250, name='num_factors', dtype=int)]
    puresvd_fit_params = [d.name for d in puresvd_dimensions]

    ials_dimensions = [
        Integer(1, 250, name='num_factors', dtype=int),
        Categorical(["linear", "log"], name='confidence_scaling'),
        Real(low=1e-3, high=50, prior='log-uniform', name='alpha',
             dtype=float),
        Real(low=1e-5, high=1e-2, prior='log-uniform', name='reg',
             dtype=float),
        Real(low=1e-3,
             high=10.0,
             prior='log-uniform',
             name='epsilon',
             dtype=float)
    ]
    ials_fit_params = [d.name for d in ials_dimensions]

    bpr_dimensions = [
        Categorical([1500], name='epochs'),
        Integer(1, 250, name='num_factors', dtype=int),
        Categorical([128, 256, 512, 1024], name='batch_size'),
        Categorical(["adagrad", "adam"], name='sgd_mode'),
        Real(low=1e-12, high=1e-3, prior='log-uniform', name='positive_reg'),
        Real(low=1e-12, high=1e-3, prior='log-uniform', name='negative_reg'),
        Real(low=1e-6, high=1e-2, prior='log-uniform', name='learning_rate'),
    ]
    bpr_fit_params = [d.name for d in bpr_dimensions]

    nmf_dimensions = [
        Integer(1, 500, name='num_factors', dtype=int),
        Real(low=1e-5,
             high=1,
             prior='log-uniform',
             name='l1_ratio',
             dtype=float),
        Categorical(['coordinate_descent', 'multiplicative_update'],
                    name='solver'),
        Categorical(['nndsvda'], name='init_type'),
        Categorical(['frobenius', 'kullback-leibler'], name='beta_loss')
    ]
    nmf_fit_params = [d.name for d in nmf_dimensions]

    cfgan_dimensions = [
        Categorical([300], name='epochs'),
        Integer(1, 5, prior='uniform', name='d_steps', dtype=int),
        Integer(1, 5, prior='uniform', name='g_steps', dtype=int),
        Integer(1, 5, prior='uniform', name='d_layers', dtype=int),
        Integer(1, 5, prior='uniform', name='g_layers', dtype=int),
        Categorical(['linear', 'tanh', 'sigmoid'], name='d_hidden_act'),
        Categorical(['linear', 'tanh', 'sigmoid'], name='g_hidden_act'),
        Categorical(['ZR', 'PM', 'ZP'], name='scheme'),
        Categorical([64, 128, 256, 512, 1024], name='d_batch_size'),
        Categorical([64, 128, 256, 512, 1024], name='g_batch_size'),
        Real(low=0, high=1, prior='uniform', name='zr_ratio', dtype=float),
        Real(low=0, high=1, prior='uniform', name='zp_ratio', dtype=float),
        Real(low=0,
             high=1,
             prior='uniform',
             name='zr_coefficient',
             dtype=float),
        Real(low=1e-4,
             high=1e-2,
             prior='log-uniform',
             name='d_lr',
             dtype=float),
        Real(low=1e-4,
             high=1e-2,
             prior='log-uniform',
             name='g_lr',
             dtype=float),
        Real(low=1e-6,
             high=1e-4,
             prior='log-uniform',
             name='d_reg',
             dtype=float),
        Real(low=1e-6,
             high=1e-4,
             prior='log-uniform',
             name='g_reg',
             dtype=float),
    ]
    cfgan_fit_params = [d.name for d in cfgan_dimensions]

    ganmf_dimensions = [
        Categorical([300], name='epochs'),
        Integer(1, 250, name='num_factors', dtype=int),
        Categorical([64, 128, 256, 512, 1024], name='batch_size'),
        Integer(1, 10, name='m', dtype=int),
        Real(low=1e-4,
             high=1e-2,
             prior='log-uniform',
             name='d_lr',
             dtype=float),
        Real(low=1e-4,
             high=1e-2,
             prior='log-uniform',
             name='g_lr',
             dtype=float),
        Real(low=1e-6,
             high=1e-4,
             prior='log-uniform',
             name='d_reg',
             dtype=float),
        Real(low=1e-2,
             high=0.5,
             prior='uniform',
             name='recon_coefficient',
             dtype=float),
        # Integer(5, 400, name='emb_dim', dtype=int),
        # Integer(1, 10, name='d_steps', dtype=int),
        # Integer(1, 10, name='g_steps', dtype=int),
        # Real(low=1e-6, high=1e-4, prior='log-uniform', name='g_reg', dtype=float),
    ]
    ganmf_fit_params = [d.name for d in ganmf_dimensions]

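    # Map each experiment name to its recommender class, its search
    # dimensions, and the names of its fit parameters.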
    dict_rec_classes['TopPop'] = TopPop
    dict_rec_classes['Random'] = Random
    dict_rec_classes['PureSVD'] = PureSVDRecommender
    dict_rec_classes['BPR'] = MatrixFactorization_BPR_Cython
    dict_rec_classes['ALS'] = IALSRecommender
    dict_rec_classes['NMF'] = NMFRecommender
    dict_rec_classes['GANMF'] = GANMF
    dict_rec_classes['CFGAN'] = CFGAN

    dict_dimensions['TopPop'] = []
    dict_dimensions['Random'] = []
    dict_dimensions['PureSVD'] = puresvd_dimensions
    dict_dimensions['BPR'] = bpr_dimensions
    dict_dimensions['ALS'] = ials_dimensions
    dict_dimensions['NMF'] = nmf_dimensions
    dict_dimensions['GANMF'] = ganmf_dimensions
    dict_dimensions['CFGAN'] = cfgan_dimensions

    dict_fit_params['TopPop'] = []
    dict_fit_params['Random'] = []
    dict_fit_params['PureSVD'] = puresvd_fit_params
    dict_fit_params['BPR'] = bpr_fit_params
    dict_fit_params['ALS'] = ials_fit_params
    dict_fit_params['NMF'] = nmf_fit_params
    dict_fit_params['GANMF'] = ganmf_fit_params
    dict_fit_params['CFGAN'] = cfgan_fit_params

    # dict_init_configs['GANMF'] = [300, 10, 128, 3, 1e-3, 1e-3, 1e-4, 0.05, 128]

    pool_list_experiments = []
    pool_list_dimensions = []

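    # Queue one experiment per (algorithm, dataset) pair; when
    # multiprocessing is disabled, tune each experiment serially instead.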
    for exp in selected_exp:
        for d in selected_datasets:
            new_exp = RecSysExp(dict_rec_classes[exp],
                                dataset=d,
                                fit_param_names=dict_fit_params[exp],
                                method='bayesian',
                                seed=seed)
            if use_mp:
                pool_list_experiments.append(new_exp)
                pool_list_dimensions.append(dict_dimensions[exp])
            else:
                new_exp.tune(dict_dimensions[exp],
                             evals=EVALS,
                             init_config=dict_init_configs.get(exp))

    if use_mp:
        # Need to turn off MKL's own threading mechanism in order to use MP
        # https://github.com/joblib/joblib/issues/138
        os.environ['MKL_NUM_THREADS'] = '1'
        os.environ['OMP_NUM_THREADS'] = '1'
        os.environ['MKL_DYNAMIC'] = 'FALSE'

        pool = mp.Pool(initializer=set_affinity_on_worker)
        pool.starmap_async(
            run_exp,
            zip(pool_list_experiments, pool_list_dimensions,
                [EVALS] * len(pool_list_experiments)))
        pool.close()
        pool.join()
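A hedged usage sketch: main receives the raw argument list, so the script is presumably driven from the command line. The script name below is illustrative; the flags and algorithm names come from the parsing logic above:

if __name__ == '__main__':
    import sys
    # e.g. python run_experiments.py GANMF CFGAN --run_all --no_mp
    main(sys.argv[1:])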
Example 2
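A minimal sketch of the imports this example appears to need: os, json, Path, and gensim's corpora are evident from the calls below; the search-space classes are assumed to come from skopt.space; importOptimizer, importDataset, importModel, importMetric, and pathDataset are assumed project-level helpers:

import os
import json
from pathlib import Path

from gensim import corpora
# Assumed source of the search-space classes used below
from skopt.space import Real, Integer, Categorical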
def startExperiment(parameters):
    """
    Starts an experiment with the given parameters

    :param parameters: parameters of the experiment
    :type parameters: Dict
    """

    optimizationPath = str(
        os.path.join(parameters["path"], parameters["experimentId"]))
    json_file = str(
        os.path.join(optimizationPath, parameters["experimentId"] + ".json"))
    if os.path.isfile(json_file):
        Optimizer = importOptimizer()
        optimizer = Optimizer()
        optimizer.resume_optimization(json_file)
    else:
        # Import dataset class and initialize an instance with the chosen dataset
        dataset_class = importDataset()
        dataset = dataset_class()
        dataset_path = str(
            os.path.join(pathDataset, "preprocessed_datasets",
                         parameters["dataset"]))
        dataset.load_custom_dataset_from_folder(dataset_path)

        model_class = importModel(parameters["model"]["name"])
        model = model_class()

        model.hyperparameters.update(parameters["model"]["parameters"])
        model.partitioning(parameters["partitioning"])

        search_space = {}

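        # Infer each hyperparameter's type: a float bound on either end
        # gives a Real, integer bounds give an Integer, and a plain list
        # of values gives a Categorical.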
        for key, value in parameters["optimization"]["search_spaces"].items():
            if "low" in value:
                if isinstance(value["low"], float) or isinstance(
                        value["high"], float):
                    search_space[key] = Real(low=value["low"],
                                             high=value["high"])
                else:
                    search_space[key] = Integer(low=value["low"],
                                                high=value["high"])
            else:
                search_space[key] = Categorical(value)

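        # Replace placeholder strings in the metric parameters with the
        # actual corpus/dataset, or load a separate dataset when the value
        # is a folder path.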
        metric_parameters = parameters["optimize_metrics"][0]["parameters"]
        for key in metric_parameters:
            if metric_parameters[key] == "use dataset texts":
                metric_parameters[key] = dataset.get_corpus()
            elif metric_parameters[key] == "use selected dataset":
                metric_parameters[key] = dataset
            elif os.path.isdir(str(metric_parameters[key])):
                metricDataset = dataset_class()
                metricDataset.load_custom_dataset_from_folder(
                    metric_parameters[key])
                metric_parameters[key] = metricDataset.get_corpus()

        metric_class = importMetric(parameters["optimize_metrics"][0]["name"])
        metric = metric_class(**metric_parameters)

        metrics_to_track = []
        for single_metric in parameters["track_metrics"]:
            metric_class = importMetric(single_metric["name"])
            single_metric_parameters = single_metric["parameters"]
            for key in single_metric_parameters:
                if single_metric_parameters[key] == "use dataset texts":
                    single_metric_parameters[key] = dataset.get_corpus()
                elif single_metric_parameters[key] == "use selected dataset":
                    single_metric_parameters[key] = dataset
            new_metric = metric_class(**single_metric_parameters)
            metrics_to_track.append(new_metric)

        vocabulary_path = str(
            os.path.join(parameters["path"], parameters["experimentId"],
                         "models"))

        Path(vocabulary_path).mkdir(parents=True, exist_ok=True)

        vocabulary_path = str(os.path.join(vocabulary_path, "vocabulary.json"))

        with open(vocabulary_path, "w") as file:
            json.dump(dict(corpora.Dictionary(dataset.get_corpus())), file)

        Optimizer = importOptimizer()
        optimizer = Optimizer()
        optimizer.optimize(
            model,
            dataset,
            metric,
            search_space,
            metrics_to_track,
            random_state=True,
            initial_point_generator="random",
            surrogate_model=parameters["optimization"]["surrogate_model"],
            model_runs=parameters["optimization"]["model_runs"],
            n_random_starts=parameters["optimization"]["n_random_starts"],
            acq_func=parameters["optimization"]["acquisition_function"],
            number_of_call=parameters["optimization"]["iterations"],
            save_models=True,
            save_name=parameters["experimentId"],
            save_path=optimizationPath)
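A hedged sketch of the parameters dict that startExperiment expects; every key is taken from the accesses above, while the concrete values (model name, metric name, bounds, surrogate settings) are purely illustrative:

parameters = {
    "path": "results",
    "experimentId": "exp1",
    "dataset": "my_dataset",  # folder under pathDataset/preprocessed_datasets
    "model": {
        "name": "LDA",  # illustrative model name
        "parameters": {"num_topics": 10},
    },
    "partitioning": False,
    "optimization": {
        "search_spaces": {
            "alpha": {"low": 1e-3, "high": 5.0},   # float bound -> Real
            "num_topics": {"low": 5, "high": 50},  # int bounds -> Integer
        },
        "surrogate_model": "RF",
        "model_runs": 3,
        "n_random_starts": 5,
        "acquisition_function": "EI",
        "iterations": 10,
    },
    "optimize_metrics": [{
        "name": "Coherence",  # illustrative metric name
        "parameters": {"texts": "use dataset texts"},
    }],
    "track_metrics": [],
}
startExperiment(parameters)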