Example #1
    def fit(self, df: pd.DataFrame):
        df_features = self._to_feature_df(df, True)

        df_features = df_features.dropna()
        df_features = df_features.sample(frac=1, random_state=42)

        X = self._get_X(df_features)
        y = self._get_y(df_features)

        if self.optimize_hyperparams:

            def scorer(estimator, X, y):
                y_pred = np.clip(np.squeeze(estimator.predict(X)), 0.0, 1.0)
                return -mean_absolute_error(y, y_pred)

            print(
                'IOUEstimator: optimizing hyperparams with Bayesian Optimization'
            )
            opt = BayesSearchCV(
                LGBMRegressor(),
                {
                    'num_leaves': Integer(2, 128, prior='log-uniform', base=2),
                    'min_child_samples': Integer(2, 512, prior='log-uniform', base=2),
                    'max_bin': Integer(2, 8192, prior='log-uniform', base=2),
                },
                n_iter=60,
                optimizer_kwargs={
                    'n_initial_points': 20,
                    'base_estimator': 'GP',
                },
                scoring=scorer,
                cv=3,
                refit=False,
                random_state=42,
                return_train_score=True,
            )
            opt.fit(X, y)
            print(f'Found hyperparams {opt.best_params_}')
            print(
                f"Train score: {opt.cv_results_['mean_train_score'][opt.best_index_]}"
            )
            print(f'Test score: {opt.best_score_}')
            estimator = LGBMRegressor(**opt.best_params_)
        elif self.hyperparams is not None:
            print(f'IOUEstimator: using hyperparams {self.hyperparams}')
            estimator = LGBMRegressor(**self.hyperparams)
        else:
            print(
                f'IOUEstimator: using default hyperparams {self.DEFAULT_HYPERPARAMS}'
            )
            estimator = LGBMRegressor(**self.DEFAULT_HYPERPARAMS)

        self.estimator_ = estimator.fit(X, y)

        return self
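The same BayesSearchCV pattern can be exercised outside the class. A minimal, self-contained sketch on synthetic data (the dataset, the reduced n_iter, and the single dimension are illustrative, not the author's settings):

from lightgbm import LGBMRegressor
from sklearn.datasets import make_regression
from sklearn.metrics import mean_absolute_error
from skopt import BayesSearchCV
from skopt.space import Integer

X, y = make_regression(n_samples=500, n_features=10, noise=0.1, random_state=42)

# BayesSearchCV accepts any callable with signature (estimator, X, y) as scorer;
# MAE is negated because the optimizer maximizes the score.
def neg_mae(estimator, X, y):
    return -mean_absolute_error(y, estimator.predict(X))

opt = BayesSearchCV(
    LGBMRegressor(),
    {'num_leaves': Integer(2, 128, prior='log-uniform', base=2)},
    n_iter=10,
    scoring=neg_mae,
    cv=3,
    random_state=42,
)
opt.fit(X, y)
print(opt.best_params_, opt.best_score_)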
Example #2
def load_search_space(search_space):
    """
    Load the search space from the json file

    :param search_space: dictionary describing the search space (as stored in a json file)
    :type search_space: dict
    :return: dictionary of skopt dimensions for the search space
    :rtype: dict
    """
    from skopt.space.space import Real, Categorical, Integer

    ss = dict()
    for key in list(search_space.keys()):
        if search_space[key][0] == 'Real':
            ss[key] = Real(low=search_space[key][1][0],
                           high=search_space[key][1][1],
                           prior=search_space[key][2])
        elif search_space[key][0] == 'Integer':
            ss[key] = Integer(low=search_space[key][1][0],
                              high=search_space[key][1][1],
                              prior=search_space[key][2])
        elif search_space[key][0] == 'Categorical':
            ss[key] = Categorical(categories=search_space[key][1])

    return ss
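A usage sketch for load_search_space; the [type, bounds, prior] layout of each entry is read off the branches above, while the parameter names themselves are made up for illustration:

# Assumes load_search_space (defined above) is in scope.
search_space_json = {
    'learning_rate': ['Real', [1e-4, 1e-1], 'log-uniform'],
    'num_layers': ['Integer', [1, 5], 'uniform'],
    'activation': ['Categorical', ['relu', 'tanh']],
}
ss = load_search_space(search_space_json)
# ss now maps each name to a skopt dimension, e.g.
# {'learning_rate': Real(0.0001, 0.1, prior='log-uniform'), ...}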
Example #3
def extract_search_space(flat_search_config):
    """ Find the variable dimensions and convert them to a skopt search space.
    """
    search_space = OrderedDict()
    for k,v in flat_search_config.items():
        # Lists with more than one value are search dimensions
        if isinstance(v, list) and len(v) > 1:
            force_categorical = len(v) > 2

            # Dedupe the list, escaping specials, and sort smallest to largest
            ds = sorted({escape_special(u) for u in v})
            prior = flat_search_config.get(f'{k}__PRIOR', None)
            base = flat_search_config.get(f'{k}__BASE', 10)

            if force_categorical or isinstance(ds[0], str):
                transform = flat_search_config.get(f'{k}__TRANSFORM', 'onehot')
                dim = Categorical(ds, prior=prior, transform=transform, name=k)
            elif isinstance(ds[0], int):
                transform = flat_search_config.get(f'{k}__TRANSFORM', 'normalize')
                dim = Integer(*tuple(ds), prior=prior, transform=transform, base=base, name=k)
            elif isinstance(ds[0], float):
                transform = flat_search_config.get(f'{k}__TRANSFORM', 'normalize')
                dim = Real(*tuple(ds), prior=prior, transform=transform, base=base, name=k)

            search_space[k] = dim
    return search_space
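A usage sketch for extract_search_space. The __PRIOR/__TRANSFORM/__BASE key convention comes from the code above; the config values are invented, and escape_special (not shown in the snippet) is stubbed as a pass-through just so the sketch runs:

from collections import OrderedDict

def escape_special(u):
    # Pass-through stub; the real helper (not shown here) escapes special values before deduping.
    return u

flat_search_config = OrderedDict([
    ('learning_rate', [1e-4, 1e-1]),          # two floats -> Real dimension
    ('learning_rate__PRIOR', 'log-uniform'),  # non-list entries are skipped by the loop
    ('hidden_units', [64, 512]),              # two ints -> Integer dimension
    ('hidden_units__PRIOR', 'uniform'),
    ('activation', ['relu', 'tanh', 'elu']),  # more than two values -> Categorical
])

space = extract_search_space(flat_search_config)
# -> OrderedDict with Real('learning_rate'), Integer('hidden_units'), Categorical('activation')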
Example #4
def test_custom_dimensions_per_model():
    """Assert that the custom dimensions are distributed over the models."""
    trainer = DirectClassifier(
        models=["LR1", "LR2"],
        n_calls=2,
        n_initial_points=2,
        bo_params={
            "dimensions": {
                "lr1": [Integer(100, 200, name="max_iter")],
                "lr2": [Integer(300, 400, name="max_iter")],
            },
        },
        random_state=1,
    )
    trainer.run(bin_train, bin_test)
    assert 100 <= trainer.lr1.best_params["max_iter"] <= 200
    assert 300 <= trainer.lr2.best_params["max_iter"] <= 400
Example #5
def test_custom_dimensions_all_models():
    """Assert that the custom dimensions are for all models if not dict."""
    trainer = DirectClassifier(
        models=["LR1", "LR2"],
        n_calls=2,
        n_initial_points=2,
        bo_params={"dimensions": [Integer(100, 1000, name="max_iter")]},
        random_state=1,
    )
    trainer.run(bin_train, bin_test)
    assert list(trainer.lr1.best_params.keys()) == ["max_iter"]
    assert list(trainer.lr2.best_params.keys()) == ["max_iter"]
Example #6
    def map_dim(values):
        if isinstance(values, tuple):  # linear subspace
            low, high, n_steps, value_type = values

            if value_type == 'i':
                return Integer(low, high)
            elif value_type == 'f':
                return Real(low, high)
            else:
                raise ValueError(f'Unknown value type "{value_type}"')
        else:  # exhaustive list of options
            return Categorical(values)
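A quick sketch of the two input shapes map_dim accepts, assuming map_dim and the skopt dimension classes above are in scope (the values are illustrative):

# A 4-tuple describes a linear subspace: (low, high, n_steps, value_type).
map_dim((1, 100, 10, 'i'))       # -> Integer(1, 100)
map_dim((0.001, 0.1, 10, 'f'))   # -> Real(0.001, 0.1)
# Anything else is treated as an exhaustive list of options.
map_dim(['adam', 'sgd'])         # -> Categorical(['adam', 'sgd'])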
Example #7
 def opt_space(self, groups):
     """Internal - generates optimization space for given groups for the board"""
     space = []
     for g in groups:
         gp = self.board.params.group_params(g)
         for p in gp:
             if not p.no_opt:
                 # Use the explicit optimization bounds when given, otherwise fall back to the parameter's own range
                 min_v = p.opt_minv if p.opt_minv is not None else p.min_v
                 max_v = p.opt_maxv if p.opt_maxv is not None else p.max_v
                 space.append(Integer(min_v, max_v, name=p.key))
     return space
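Because each dimension produced by opt_space carries name=p.key, the resulting list plugs directly into skopt's named-argument helper. A minimal sketch of that consumption pattern with made-up dimensions and a dummy objective:

from skopt import gp_minimize
from skopt.space import Integer
from skopt.utils import use_named_args

# Stand-in for the named space opt_space() would return.
space = [Integer(1, 64, name='kp'), Integer(1, 64, name='ki')]

@use_named_args(space)
def objective(kp, ki):
    # Dummy cost; in practice this would run the board with these parameters.
    return (kp - 10) ** 2 + (ki - 3) ** 2

result = gp_minimize(objective, space, n_calls=15, random_state=0)
print(result.x)  # best [kp, ki] found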
Example #8
def main(arguments):
    global train_mode
    EVALS = 50
    use_mp = True
    run_all = False
    selected_exp = []
    selected_datasets = []

    if '--build_datasets' in arguments:
        print('Building all necessary datasets required for the experiments. Disregarding other arguments! ' +
        'You will need to run this script again without --build_datasets in order to run experiments!')
        # Make all datasets
        for d in all_datasets:
            load_URMs(d, dataset_kwargs)
        return

    if '--no_mp' in arguments:
        print('No multiprocessing requested! Falling back to serial execution of experiments!')
        use_mp = False
        arguments.remove('--no_mp')

    if '--run_all' in arguments:
        print('All datasets selected for each algorithm!')
        selected_datasets = all_datasets
        run_all = True

    # user-based training
    if '--user' in arguments:
        train_mode = 'user'

    # item-based training
    if '--item' in arguments:
        train_mode = 'item'

    for arg in arguments:
        if not run_all and arg in name_datasets:
            selected_datasets.append(all_datasets[name_datasets.index(arg)])
        if arg in all_recommenders:
            selected_exp.append(arg)


    dict_rec_classes = {}
    dict_dimensions = {}
    dict_fit_params = {}
    dict_init_configs = {}


    # Experiment parameters
    # PureSVD parameters
    puresvd_dimensions = [
        Integer(1, 250, name='num_factors', dtype=int)
    ]
    puresvd_fit_params = [d.name for d in puresvd_dimensions]


    # ALS parameters
    ials_dimensions = [
        Integer(1, 250, name='num_factors', dtype=int),
        Categorical(["linear", "log"], name='confidence_scaling'),
        Real(low=1e-3, high=50, prior='log-uniform', name='alpha', dtype=float),
        Real(low=1e-5, high=1e-2, prior='log-uniform', name='reg', dtype=float),
        Real(low=1e-3, high=10.0, prior='log-uniform', name='epsilon', dtype=float)
    ]
    ials_fit_params = [d.name for d in ials_dimensions]


    # BPR parameters, 150 epochs
    bpr_dimensions = [
        Categorical([150], name='epochs'),
        Integer(1, 250, name='num_factors', dtype=int),
        Categorical([128, 256, 512, 1024], name='batch_size'),
        Categorical(["adagrad", "adam"], name='sgd_mode'),
        Real(low=1e-12, high=1e-3, prior='log-uniform', name='positive_reg'),
        Real(low=1e-12, high=1e-3, prior='log-uniform', name='negative_reg'),
        Real(low=1e-6, high=1e-2, prior='log-uniform', name='learning_rate'),
    ]
    bpr_fit_params = [d.name for d in bpr_dimensions]


    # NMF parameters
    nmf_dimensions = [
        Integer(1, 500, name='num_factors', dtype=int),
        Real(low=1e-5, high=1, prior='log-uniform', name='l1_ratio', dtype=float),
        Categorical(['coordinate_descent', 'multiplicative_update'], name='solver'),
        Categorical(['nndsvda'], name='init_type'),
        Categorical(['frobenius', 'kullback-leibler'], name='beta_loss')
    ]
    nmf_fit_params = [d.name for d in nmf_dimensions]


    # SLIM-BPR parameters, 150 epochs
    slimbpr_dimensions = [
        Integer(low=5, high=1000, prior='uniform', name='topK', dtype=int),
        Categorical([150], name='epochs'),
        Categorical([True, False], name='symmetric'),
        Categorical(["sgd", "adagrad", "adam"], name='sgd_mode'),
        Real(low=1e-9, high=1e-3, prior='log-uniform', name='lambda_i', dtype=float),
        Real(low=1e-9, high=1e-3, prior='log-uniform', name='lambda_j', dtype=float),
        Real(low=1e-4, high=1e-1, prior='log-uniform', name='learning_rate', dtype=float)
    ]
    slimbpr_fit_names = [d.name for d in slimbpr_dimensions]


    # CFGAN parameters
    cfgan_dimensions = [
        Categorical([300], name='epochs'),
        Integer(1, 5, prior='uniform', name='d_steps', dtype=int),
        Integer(1, 5, prior='uniform', name='g_steps', dtype=int),
        Integer(1, 5, prior='uniform', name='d_layers', dtype=int),
        Integer(1, 5, prior='uniform', name='g_layers', dtype=int),
        Categorical(['linear', 'tanh', 'sigmoid'], name='d_hidden_act'),
        Categorical(['linear', 'tanh', 'sigmoid'], name='g_hidden_act'),
        Categorical(['ZR', 'PM', 'ZP'], name='scheme'),
        Categorical([64, 128, 256, 512, 1024], name='d_batch_size'),
        Categorical([64, 128, 256, 512, 1024], name='g_batch_size'),
        Real(low=0, high=1, prior='uniform', name='zr_ratio', dtype=float),
        Real(low=0, high=1, prior='uniform', name='zp_ratio', dtype=float),
        Real(low=0, high=1, prior='uniform', name='zr_coefficient', dtype=float),
        Real(low=1e-4, high=1e-2, prior='log-uniform', name='d_lr', dtype=float),
        Real(low=1e-4, high=1e-2, prior='log-uniform', name='g_lr', dtype=float),
        Real(low=1e-6, high=1e-4, prior='log-uniform', name='d_reg', dtype=float),
        Real(low=1e-6, high=1e-4, prior='log-uniform', name='g_reg', dtype=float),
    ]
    cfgan_fit_params = [d.name for d in cfgan_dimensions]


    # GANMF parameters
    ganmf_dimensions = [
        Categorical([300], name='epochs'),
        Integer(low=1, high=250, name='num_factors', dtype=int),
        Categorical([64, 128, 256, 512, 1024], name='batch_size'),
        Integer(low=1, high=10, name='m', dtype=int),
        Real(low=1e-4, high=1e-2, prior='log-uniform', name='d_lr', dtype=float),
        Real(low=1e-4, high=1e-2, prior='log-uniform', name='g_lr', dtype=float),
        Real(low=1e-6, high=1e-4, prior='log-uniform', name='d_reg', dtype=float),
        Real(low=1e-2, high=0.5, prior='uniform', name='recon_coefficient', dtype=float),
        # Integer(5, 400, name='emb_dim', dtype=int),
        # Integer(1, 10, name='d_steps', dtype=int),
        # Integer(1, 10, name='g_steps', dtype=int),
        # Real(low=1e-6, high=1e-4, prior='log-uniform', name='g_reg', dtype=float),
    ]
    ganmf_fit_params = [d.name for d in ganmf_dimensions]


    # DisGANMF parameters
    disgan_dimensions = [
        Categorical([300], name='epochs'),
        Categorical(['linear', 'tanh', 'relu', 'sigmoid'], name='d_hidden_act'),
        Integer(low=1, high=5, prior='uniform', name='d_layers', dtype=int),
        Integer(low=1, high=250, name='num_factors', dtype=int),
        Categorical([64, 128, 256, 512, 1024], name='batch_size'),
        Real(low=1e-4, high=1e-2, prior='log-uniform', name='d_lr', dtype=float),
        Real(low=1e-4, high=1e-2, prior='log-uniform', name='g_lr', dtype=float),
        Real(low=1e-6, high=1e-4, prior='log-uniform', name='d_reg', dtype=float),
        Real(low=1e-2, high=0.5, prior='uniform', name='recon_coefficient', dtype=float)
    ]
    disgan_fit_params = [d.name for d in disgan_dimensions]


    # DeepGANMF parameters
    deepganmf_dimensions = [
        Categorical([300], name='epochs'),
        Categorical(['linear', 'tanh', 'relu', 'sigmoid'], name='d_hidden_act'),
        Categorical(['linear', 'tanh', 'relu', 'sigmoid'], name='g_hidden_act'),
        Categorical(['linear', 'tanh', 'relu', 'sigmoid'], name='g_output_act'),
        Categorical([1, 3, 5], name='d_layers'),
        Categorical([1, 2, 3, 4, 5], name='g_layers'),
        Categorical([64, 128, 256, 512, 1024], name='batch_size'),
        Integer(low=1, high=10, name='m', dtype=int),
        Real(low=1e-4, high=1e-2, prior='log-uniform', name='d_lr', dtype=float),
        Real(low=1e-4, high=1e-2, prior='log-uniform', name='g_lr', dtype=float),
        Real(low=1e-6, high=1e-4, prior='log-uniform', name='d_reg', dtype=float),
        Real(low=1e-2, high=0.5, prior='uniform', name='recon_coefficient', dtype=float),
    ]
    deepganmf_fit_params = [d.name for d in deepganmf_dimensions]



    dict_rec_classes['TopPop'] = TopPop
    dict_rec_classes['Random'] = Random
    dict_rec_classes['PureSVD'] = PureSVDRecommender
    dict_rec_classes['BPR'] = MatrixFactorization_BPR_Cython
    dict_rec_classes['ALS'] = IALSRecommender
    dict_rec_classes['NMF'] = NMFRecommender
    dict_rec_classes['GANMF'] = GANMF
    dict_rec_classes['CFGAN'] = CFGAN
    dict_rec_classes['DisGANMF'] = DisGANMF
    dict_rec_classes['SLIMBPR'] = SLIM_BPR_Cython
    dict_rec_classes['DeepGANMF'] = DeepGANMF

    dict_dimensions['TopPop'] = []
    dict_dimensions['Random'] = []
    dict_dimensions['PureSVD'] = puresvd_dimensions
    dict_dimensions['BPR'] = bpr_dimensions
    dict_dimensions['ALS'] = ials_dimensions
    dict_dimensions['NMF'] = nmf_dimensions
    dict_dimensions['GANMF'] = ganmf_dimensions
    dict_dimensions['CFGAN'] = cfgan_dimensions
    dict_dimensions['DisGANMF'] = disgan_dimensions
    dict_dimensions['SLIMBPR'] = slimbpr_dimensions
    dict_dimensions['DeepGANMF'] = deepganmf_dimensions

    dict_fit_params['TopPop'] = []
    dict_fit_params['Random'] = []
    dict_fit_params['PureSVD'] = puresvd_fit_params
    dict_fit_params['BPR'] = bpr_fit_params
    dict_fit_params['ALS'] = ials_fit_params
    dict_fit_params['NMF'] = nmf_fit_params
    dict_fit_params['GANMF'] = ganmf_fit_params
    dict_fit_params['CFGAN'] = cfgan_fit_params
    dict_fit_params['DisGANMF'] = disgan_fit_params
    dict_fit_params['SLIMBPR'] = slimbpr_fit_names
    dict_fit_params['DeepGANMF'] = deepganmf_fit_params

    pool_list_experiments = []
    pool_list_dimensions = []

    for exp in selected_exp:
        for d in selected_datasets:
            new_exp = RecSysExp(dict_rec_classes[exp], dataset=d, fit_param_names=dict_fit_params[exp],
                                method='bayesian', seed=seed)
            if use_mp:
                pool_list_experiments.append(new_exp)
                pool_list_dimensions.append(dict_dimensions[exp])
            else:
                new_exp.tune(dict_dimensions[exp], evals=EVALS,
                             init_config=dict_init_configs[exp] if exp in dict_init_configs else None)

    if use_mp:
        # Need to turn off MKL's own threading mechanism in order to use MP
        # https://github.com/joblib/joblib/issues/138
        os.environ['MKL_NUM_THREADS'] = '1'
        os.environ['OMP_NUM_THREADS'] = '1'
        os.environ['MKL_DYNAMIC'] = 'FALSE'
        
        pool = mp.Pool(initializer=set_affinity_on_worker)
        pool.starmap_async(run_exp, zip(pool_list_experiments, pool_list_dimensions, [EVALS]*len(pool_list_experiments)))
        pool.close()
        pool.join()
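Each *_fit_params list above is just the dimension names in order, so the best point returned by the optimizer can be turned back into fit keyword arguments by zipping the two. A hedged sketch of that idea with a dummy objective (the dimensions mirror the PureSVD/ALS style above):

from skopt import gp_minimize
from skopt.space import Integer, Real

dimensions = [
    Integer(1, 250, name='num_factors'),
    Real(1e-5, 1e-2, prior='log-uniform', name='reg'),
]
fit_param_names = [d.name for d in dimensions]

def objective(x):
    num_factors, reg = x
    return reg * num_factors  # dummy loss, for the sketch only

result = gp_minimize(objective, dimensions, n_calls=12, random_state=42)

# Rebuild the keyword arguments for recommender.fit(**fit_params)
fit_params = dict(zip(fit_param_names, result.x))
print(fit_params)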
Example #9
    def tune(self, params, evals=10, init_config=None, seed=None):
        """
        Runs the hyperparameter search using a Gaussian Process as surrogate model or Random Search,
        saves the results of the trials and prints the best parameters found.

        Parameters
        ----------
        params: list
            List of skopt.space.space.Dimension instances to be searched.
        evals: int
            Number of evaluations to perform.

        init_config: list, default None
            An initial parameter configuration for seeding the Gaussian Process

        seed: int, default None
            Seed for random_state of `gp_minimize` or `dummy_minimize`.
            Set to a fixed integer for reproducibility.
        """

        msg = 'Started ' + self.recommender_class.RECOMMENDER_NAME + ' ' + self.dataset_name
        subprocess.run(['telegram-send', msg])

        # Shape of the URM_test CSR matrix
        U, I = self.URM_test.shape

        if self.recommender_class == GANMF:
            params.append(Integer(4, int(I * 0.75) if I <= 1024 else 1024, name='emb_dim', dtype=int))
            self.fit_param_names.append('emb_dim')

        if self.recommender_class == CFGAN or self.recommender_class == DeepGANMF:
            params.append(Integer(4, int(I * 0.75) if I <= 1024 else 1024, name='d_nodes', dtype=int))
            params.append(Integer(4, int(I * 0.75) if I <= 1024 else 1024, name='g_nodes', dtype=int))
            self.fit_param_names.append('d_nodes')
            self.fit_param_names.append('g_nodes')

        if self.recommender_class == DisGANMF:
            params.append(Integer(4, int(I * 0.75) if I <= 1024 else 1024, name='d_nodes', dtype=int))
            self.fit_param_names.append('d_nodes')

        self.dimension_names = [p.name for p in params]

        # Make sure the upper bound of the `num_factors` dimension does not exceed min(U, I)
        try:
            idx = self.dimension_names.index('num_factors')
            maxval = params[idx].bounds[1]
            if maxval > min(U, I):
                params[idx] = Integer(1, min(U, I), name='num_factors', dtype=int)
        except ValueError:
            pass

        if len(params) > 0:

            # Check if there is already a checkpoint for this experiment
            checkpoint_path = os.path.join(self.logsdir, 'checkpoint.pkl')
            checkpoint_exists = os.path.exists(checkpoint_path)
            checkpoint_saver = CheckpointSaver(checkpoint_path, compress=3)

            if seed is None:
                seed = self.seed

            t_start = int(time.time())

            if checkpoint_exists:
                previous_run = skopt.load(checkpoint_path)
                if self.method == 'bayesian':
                    results = gp_minimize(self.obj_func, params, n_calls=evals - len(previous_run.func_vals),
                                          x0=previous_run.x_iters, y0=previous_run.func_vals, n_random_starts=0,
                                          random_state=seed, verbose=True, callback=[checkpoint_saver])
                else:
                    results = dummy_minimize(self.obj_func, params, n_calls=evals - len(previous_run.func_vals),
                                             x0=previous_run.x_iters, y0=previous_run.func_vals, random_state=seed,
                                             verbose=True, callback=[checkpoint_saver])
            else:
                # Hyperparameter optimization
                if self.method == 'bayesian':
                    results = gp_minimize(self.obj_func, params, n_calls=evals, random_state=seed, verbose=True,
                                          callback=[checkpoint_saver])
                else:
                    results = dummy_minimize(self.obj_func, params, n_calls=evals, random_state=seed, verbose=True,
                                          callback=[checkpoint_saver])

            t_end = int(time.time())

        # Save best parameters of this experiment
        # best_params = dict(zip(self.dimension_names, results.x))
        # with open(os.path.join(self.logsdir, 'best_params.pkl'), 'wb') as f:
        #     pickle.dump(best_params, f, pickle.HIGHEST_PROTOCOL)

            best_params = self.load_best_params()

            with open(os.path.join(self.logsdir, 'results.txt'), 'a') as f:
                f.write('Experiment ran for {}\n'.format(str(datetime.timedelta(seconds=t_end - t_start))))
                f.write('Best {} score: {}. Best result found at: {}\n'.format(self.metric, results.fun, best_params))

            if self.recommender_class in [IALSRecommender, MatrixFactorization_BPR_Cython]:
                self.dimension_names.append('epochs')
            self.build_fit_params(best_params.values())

        # Retrain with all training data
        set_seed(seed)
        if self.isGAN:
            model = self.recommender_class(self.URM_train, mode=train_mode, is_experiment=True)
            model.logsdir = self.logsdir
            model.fit(**self.fit_params)
            # load_models(model, save_dir='best_model', all_in_folder=True)

        else:
            model = self.recommender_class(self.URM_train)
            model.fit(**self.fit_params)
            # model.loadModel(os.path.join(self.logsdir, 'best_model'))

        _, results_run_string = self.evaluatorTest.evaluateRecommender(model)

        print('\n\nResults on test set:')
        print(results_run_string)
        print('\n\n')

        with open(os.path.join(self.logsdir, 'result_test.txt'), 'w') as f:
            f.write(results_run_string)

        msg = 'Finished ' + self.recommender_class.RECOMMENDER_NAME + ' ' + self.dataset_name
        subprocess.run(['telegram-send', msg])
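The checkpoint/resume logic above can be exercised on its own; a minimal sketch with a toy objective, mirroring the CheckpointSaver + skopt.load pattern (the file name and budget are illustrative):

import os
import skopt
from skopt import gp_minimize
from skopt.callbacks import CheckpointSaver
from skopt.space import Real

checkpoint_path = 'checkpoint.pkl'
checkpoint_saver = CheckpointSaver(checkpoint_path, compress=3)
space = [Real(-5.0, 5.0, name='x')]
evals = 20

def objective(params):
    return (params[0] - 2.0) ** 2  # toy loss

if os.path.exists(checkpoint_path):
    # Resume: replay the already-evaluated points and spend only the remaining budget.
    previous_run = skopt.load(checkpoint_path)
    results = gp_minimize(objective, space,
                          n_calls=evals - len(previous_run.func_vals),
                          x0=previous_run.x_iters, y0=list(previous_run.func_vals),
                          n_random_starts=0, random_state=1,
                          callback=[checkpoint_saver])
else:
    results = gp_minimize(objective, space, n_calls=evals, random_state=1,
                          callback=[checkpoint_saver])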
Example #10
        fig_1, ax_1, fig_2, ax_2 = plotStats(stats, keys)
        plt.show()

    validAccs = stats[:, -1]
    length10percent = len(validAccs) // 10
    best10percent = np.sort(validAccs)[-length10percent:]
    # We want to maximise the MEAN validation accuracy,
    # i.e. minimise its negative
    return -np.mean(best10percent)


# In[14]:

inputKeepProbSpace = Real(0.5, 1.0, "uniform")
hiddenKeepProbSpace = Real(0.5, 1.0, "uniform")
hiddenDimSpace = Integer(20, 2000)
lamda2Space = Real(1e-3, 10, "log-uniform")
space = [inputKeepProbSpace, hiddenKeepProbSpace, hiddenDimSpace, lamda2Space]

# TARGET IS 58% as the original Deep Neural Net

# In[15]:

if jupyterNotebookEnabled:
    get_ipython().magic(u'%time')

# This might crash, so you may need to run it outside the notebook as a plain Python file (File -> Save as Python)
if not os.path.isfile(res_gp_save_filename):
    if os.path.isfile(statsCollectionFilename):
        os.remove(statsCollectionFilename)  #erase before executing
Example #11
def startExperiment(parameters):
    """
    Starts an experiment with the given parameters

    :param parameters: parameters of the experiment
    :type parameters: Dict
    """

    optimizationPath = str(
        os.path.join(parameters["path"], parameters["experimentId"]))
    json_file = str(
        os.path.join(optimizationPath, parameters["experimentId"] + ".json"))
    if os.path.isfile(json_file):
        Optimizer = importOptimizer()
        optimizer = Optimizer()
        optimizer.resume_optimization(json_file)
    else:
        # Import dataset class and initialize an instance with the chosen dataset
        dataset_class = importDataset()
        dataset = dataset_class()
        dataset_path = str(
            os.path.join(pathDataset, "preprocessed_datasets",
                         parameters["dataset"]))
        dataset.load_custom_dataset_from_folder(dataset_path)

        model_class = importModel(parameters["model"]["name"])
        model = model_class()

        model.hyperparameters.update(parameters["model"]["parameters"])
        model.partitioning(parameters["partitioning"])

        search_space = {}

        for key, value in parameters["optimization"]["search_spaces"].items():
            if "low" in value:
                if isinstance(value["low"], float) or isinstance(
                        value["high"], float):
                    search_space[key] = Real(low=value["low"],
                                             high=value["high"])
                else:
                    search_space[key] = Integer(low=value["low"],
                                                high=value["high"])
            else:
                search_space[key] = Categorical(value)

        metric_parameters = parameters["optimize_metrics"][0]["parameters"]
        for key in metric_parameters:
            if metric_parameters[key] == "use dataset texts":
                metric_parameters[key] = dataset.get_corpus()
            elif metric_parameters[key] == "use selected dataset":
                metric_parameters[key] = dataset
            elif os.path.isdir(str(metric_parameters[key])):
                metricDataset = dataset_class()
                metricDataset.load_custom_dataset_from_folder(
                    metric_parameters[key])
                metric_parameters[key] = metricDataset.get_corpus()

        metric_class = importMetric(parameters["optimize_metrics"][0]["name"])
        metric = metric_class(**metric_parameters)

        metrics_to_track = []
        for single_metric in parameters["track_metrics"]:
            metric_class = importMetric(single_metric["name"])
            single_metric_parameters = single_metric["parameters"]
            for key in single_metric_parameters:
                if single_metric_parameters[key] == "use dataset texts":
                    single_metric_parameters[key] = dataset.get_corpus()
                elif single_metric_parameters[key] == "use selected dataset":
                    single_metric_parameters[key] = dataset
            new_metric = metric_class(**single_metric_parameters)
            metrics_to_track.append(new_metric)

        vocabulary_path = str(
            os.path.join(parameters["path"], parameters["experimentId"],
                         "models"))

        Path(vocabulary_path).mkdir(parents=True, exist_ok=True)

        vocabulary_path = str(os.path.join(vocabulary_path, "vocabulary.json"))

        with open(vocabulary_path, "w") as file:
            json.dump(dict(corpora.Dictionary(dataset.get_corpus())), file)

        Optimizer = importOptimizer()
        optimizer = Optimizer()
        optimizer.optimize(
            model,
            dataset,
            metric,
            search_space,
            metrics_to_track,
            random_state=True,
            initial_point_generator="random",
            surrogate_model=parameters["optimization"]["surrogate_model"],
            model_runs=parameters["optimization"]["model_runs"],
            n_random_starts=parameters["optimization"]["n_random_starts"],
            acq_func=parameters["optimization"]["acquisition_function"],
            number_of_call=parameters["optimization"]["iterations"],
            save_models=True,
            save_name=parameters["experimentId"],
            save_path=optimizationPath)
Example #12
    filename = stats_coll_filename
    statsCollection = np.load(filename, allow_pickle=True)[()] if os.path.isfile(filename) else dict()
    statsCollection[(state_size, num_steps, learning_rate)] = stats
    np.save(filename, statsCollection)
    
    if plotting:
        fig_1, ax_1, fig_2, ax_2 = plotStats(stats, DynStats.keys)
        plt.show()
    
    # We want to minimize the number of epochs required to reach 23% accuracy
    return metric


# In[13]:

stateSizeSpace = Integer(15, 1000)
numStepSpace = Categorical(numLens)
learningRateSpace = Real(1e-6, 1e-1, prior="log-uniform")
space  = [stateSizeSpace, numStepSpace, learningRateSpace]


# In[14]:

if jupyterNotebookEnabled:
    get_ipython().magic(u'%time')

if not os.path.isfile(best_params_filename):
    if os.path.isfile(stats_coll_filename):
        os.remove(stats_coll_filename)
    
    res_gp = gp_minimize(
Example #13
# numerical pipeline
numeric_pipeline = Pipeline([('select_numeric', TypeSelector(dtype='number'))])

# processing pipeline
cat_num_featun = FeatureUnion([('categorical', categorical_pipeline),
                               ('numerical', numeric_pipeline)])

# combined pipeline
estimator_pipeline = Pipeline([('Features', feature_pipeline),
                               ('Categorical_Numeric', cat_num_featun),
                               ('Estimator', RandomForestClassifier())])

# search space
search_space = {
    "Estimator__n_estimators": Integer(35, 45),
    "Estimator__min_samples_split": Integer(95, 105),
}

# scorer
metric = make_scorer(score_func=log_loss,
                     greater_is_better=False,
                     needs_proba=True,
                     labels=train['Category'].unique())

# cv
kfold_cv = KFold(n_splits=10, shuffle=True, random_state=42)

# bayessearch cv
bayes_tuned_pipeline = BayesSearchCV(estimator=estimator_pipeline,
                                     search_spaces=search_space,