Example 1
    def search_best_config(
        self,
        n_trials: int = 10,
        logger: Optional[Logger] = None,
        random_seed: Optional[int] = None,
    ) -> Optional[MLPTrainingConfig]:
        self.best_trial_score = float("inf")
        self.best_config = None
        study = optuna.create_study(sampler=optuna.samplers.TPESampler(
            seed=random_seed))
        if logger is None:
            logger = get_default_logger()
        # Mean squared magnitude of the test embeddings = MSE of an all-zero prediction,
        # logged as the baseline the learned MLP should improve upon.
        mse_baseline = (self.embedding_test**2).mean(axis=1).mean()
        logger.info("MSE baseline is %f", mse_baseline)

        def objective(trial: optuna.Trial) -> float:
            config = self.search_config.suggest(trial)
            mlp_function = hk.transform(lambda x, training: (create_mlp(
                self.embedding_train.shape[1],
                config,
            ))(x, training))
            score, epoch = self._train_nn_with_trial(mlp_function,
                                                     config=config,
                                                     trial=trial)
            config.best_epoch = epoch
            if score < self.best_trial_score:
                self.best_trial_score = score
                self.best_config = config
            return score

        study.optimize(objective, n_trials=n_trials)
        return self.best_config
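The method above follows a standard Optuna recipe: a seeded `TPESampler` for reproducibility, an objective that both returns the score and records the best configuration seen so far, and `study.optimize` driving the trials. Below is a minimal, self-contained sketch of that recipe; the toy objective and its parameter names are hypothetical stand-ins for `MLPSearchConfig.suggest` and `_train_nn_with_trial`, not part of the library.

from typing import Any, Dict, Optional

import optuna

best_score = float("inf")
best_config: Optional[Dict[str, Any]] = None


def objective(trial: optuna.Trial) -> float:
    global best_score, best_config
    # Hypothetical stand-in for MLPSearchConfig.suggest(trial).
    config = {
        "n_units": trial.suggest_int("n_units", 32, 256),
        "learning_rate": trial.suggest_float("learning_rate", 1e-4, 1e-1, log=True),
    }
    # Hypothetical stand-in for the validation loss returned by _train_nn_with_trial.
    score = (config["learning_rate"] - 1e-2) ** 2 + 1.0 / config["n_units"]
    # Track the best configuration seen so far, as search_best_config does.
    if score < best_score:
        best_score = score
        best_config = config
    return score


study = optuna.create_study(sampler=optuna.samplers.TPESampler(seed=0))
study.optimize(objective, n_trials=10)
print(best_score, best_config)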
Example 2
    def __init__(
            self,
            data: InteractionMatrix,
            val_evaluator: Evaluator,
            logger: Optional[logging.Logger] = None,
            suggest_overwrite: List[Suggestion] = list(),
            fixed_params: Dict[str, Any] = dict(),
    ):

        if logger is None:
            logger = get_default_logger()

        self.logger = logger
        self._data = data
        self.val_evaluator = val_evaluator

        self.current_trial: int = 0
        self.best_trial_index: Optional[int] = None
        self.best_val = float("inf")
        self.best_params: Optional[Dict[str, Any]] = None
        self.learnt_config_best: Dict[
            str, Any] = dict()  # to store early-stopped epoch

        self.valid_results: List[Dict[str, float]] = []
        self.tried_configs: List[Dict[str, Any]] = []
        self.suggestions = overwrite_suggestions(self.default_tune_range,
                                                 suggest_overwrite,
                                                 fixed_params)
        self.fixed_params = fixed_params
Example 3
    def __init__(
        self,
        X_train: InteractionMatrix,
        profile_train: ProfileMatrix,
        evaluator: UserColdStartEvaluator,
        target_metric: str = "ndcg",
        suggest_overwrite: List[Suggestion] = list(),
        fixed_params: Dict[str, Any] = dict(),
        logger: Optional[logging.Logger] = None,
    ):
        if logger is None:
            logger = get_default_logger()

        self.logger = logger

        self.X_train = X_train
        self.profile_train = profile_train
        self.evaluator = evaluator
        self.target_metric = target_metric
        self.current_trial: int = 0
        self.best_trial_index: Optional[int] = None
        self.best_val = float("inf")
        self.best_params: Optional[Dict[str, Any]] = None
        self.learnt_config_best: Dict[str, Any] = dict()

        self.valid_results: List[Dict[str, float]] = []
        self.tried_configs: List[Dict[str, Any]] = []
        self.suggestions = overwrite_suggestions(self.default_tune_range,
                                                 suggest_overwrite,
                                                 fixed_params)
        self.fixed_params = fixed_params
Example 4
    def search_all(
        self,
        n_trials: int = 40,
        logger: Optional[Logger] = None,
        timeout: Optional[int] = None,
        reconstruction_search_config: Optional[MLPSearchConfig] = None,
        cf_suggest_overwrite: List[Suggestion] = [],
        cf_fixed_params: Dict[str, Any] = dict(),
        random_seed: Optional[int] = None,
    ) -> Tuple[CB2CFUserColdStartRecommender, Dict[str, Any],
               MLPTrainingConfig]:
        if logger is None:
            logger = get_default_logger()
        recommender, best_config_recommender = self.search_embedding(
            n_trials,
            logger,
            timeout=timeout,
            suggest_overwrite=cf_suggest_overwrite,
            fixed_params=cf_fixed_params,
            random_seed=random_seed,
        )

        logger.info("Start learning feature -> embedding map.")

        mlp, best_config_mlp = self.search_reconstruction(
            recommender,
            n_trials,
            logger=logger,
            config=reconstruction_search_config,
            random_seed=random_seed,
        )

        return (
            CB2CFUserColdStartRecommender(recommender, mlp),
            best_config_recommender,
            best_config_mlp,
        )
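For reference, a hypothetical call to `search_all`, based only on the signature and return type shown above; the `searcher` instance and its construction are assumed and not shown here.

# `searcher` is assumed to be an already-constructed instance of the class
# that defines search_all above.
cold_start_recommender, cf_best_params, mlp_best_config = searcher.search_all(
    n_trials=40,
    timeout=3600,
    random_seed=0,
)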
Example 5
    def __init__(
            self,
            data: InteractionMatrix,
            val_evaluator: Evaluator,
            logger: Optional[logging.Logger] = None,
            suggest_overwrite: List[Suggestion] = list(),
            fixed_params: Dict[str, Any] = dict(),
    ):

        if logger is None:
            logger = get_default_logger()

        self.logger = logger
        self._data = data
        self.val_evaluator = val_evaluator

        self.current_trial: int = 0
        self.best_val = float("inf")

        self.suggestions = overwrite_suggestions(self.default_tune_range,
                                                 suggest_overwrite,
                                                 fixed_params)
        self.fixed_params = fixed_params
Example 6
def autopilot(
    X: InteractionMatrix,
    evaluator: Evaluator,
    n_trials: int = 20,
    memory_budget: int = 4000,  # 4GB
    timeout_overall: Optional[int] = None,
    timeout_singlestep: Optional[int] = None,
    algorithms: List[str] = DEFAULT_SEARCHNAMES,
    random_seed: Optional[int] = None,
    logger: Optional[Logger] = None,
    callback: Optional[Callable[[int, pd.DataFrame], None]] = None,
    storage: Optional[RDBStorage] = None,
    study_name: Optional[str] = None,
    task_resource_provider: Type[TaskBackend] = MultiProcessingBackend,
) -> Tuple[Type[BaseRecommender], Dict[str, Any], pd.DataFrame]:
    r"""Given an interaction matrix and an evaluator, search for the best algorithm and its parameters
    while (roughly) respecting the time and memory constraints. You can specify how each search step is executed.

    Args:
        X:
            Input interaction matrix.
        evaluator:
            Evaluator to measure the performance of the recommenders.
        n_trials: The maximum number of trials. Defaults to 20.
        memory_budget:
            Optimizers will try to suggest parameters so that memory usage (in megabytes) does not exceed this value.
            An algorithm will not be searched if it inevitably violates this bound.
            Note that this value is a rough estimate and will not be strictly respected.
        timeout_overall:
            If set, the total execution time of the trials will not exceed this value (roughly).
        timeout_singlestep:
            If set, a single trial (a recommender and a set of its parameters) will not run for longer than this value (in seconds).
            Such a trial is considered to have produced a score value of 0,
            and optuna will avoid suggesting such values (if everything works fine).
            Defaults to `None`.
        algorithms:
            A list of algorithm names to be tried.
            Defaults to `["RP3beta", "IALS", "DenseSLIM", "AsymmetricCosineKNN", "SLIM"]`.
        random_seed:
            The random seed that controls the suggestion behavior.
            Defaults to `None`.
        logger:
            The logger to be used. If `None`, irspack's default logger will be used.
            Defaults to None.
        callback:
            If not `None`, this is called at the end of every single trial with the following arguments:

                1. The current trial's number.
                2. A `pd.DataFrame` that holds history of trial execution.

            Defaults to `None`.
        storage:
            An instance of `optuna.storages.RDBStorage`. Defaults to `None`.
        study_name:
            If the `storage` argument is given, the `study_name` argument must
            also be passed.
        task_resource_provider:
            Specifies how each search step is executed. Defaults to `MultiProcessingBackend`.
    Raises:
        ValueError:
            If `storage` is given but `study_name` is not specified.
        RuntimeError:
            If no recommender algorithms are available within the given memory budget.
        RuntimeError:
            If no trials have been completed within the given timeout.


    Returns:

        * The best algorithm's recommender class.
        * The best parameters.
        * The dataframe containing the history of trials.

    """
    if storage is not None and study_name is None:
        raise ValueError('"study_name" must be specified if "storage" is given.')
    RNS = np.random.RandomState(random_seed)
    suggest_overwrites: Dict[str, List[Suggestion]] = {}
    optimizer_names: List[str] = []
    for rec_name in algorithms:
        optimizer_class_name = rec_name + "Optimizer"
        optimizer_class = get_optimizer_class(optimizer_class_name)
        try:
            suggest_overwrites[
                optimizer_class_name
            ] = optimizer_class.tune_range_given_memory_budget(X, memory_budget)
            optimizer_names.append(optimizer_class_name)
        except LowMemoryError:
            continue

    if not optimizer_names:
        raise RuntimeError("No available algorithm with given memory.")

    if logger is None:
        logger = get_default_logger()

    logger.info("Trying the following algorithms: %s", optimizer_names)

    optional_db_path = Path(f".autopilot-{uuid1()}.db")
    storage_: RDBStorage
    if storage is None:
        storage_ = RDBStorage(
            url=f"sqlite:///{optional_db_path.name}",
        )
    else:
        storage_ = storage

    if study_name is None:
        study_name_ = f"autopilot-{uuid1()}"
    else:
        study_name_ = study_name
    start = time.time()
    study = optuna.create_study(
        storage=storage_, study_name=study_name_, load_if_exists=True
    )
    study_id = storage_.get_study_id_from_name(study_name_)

    for _ in range(n_trials):

        task_start = time.time()
        elapsed_at_start = task_start - start

        timeout_for_this_process: Optional[int] = None
        if timeout_overall is None:
            timeout_for_this_process = timeout_singlestep
        else:
            remaining_time = int(timeout_overall - elapsed_at_start)
            if remaining_time <= 0:
                break

            if timeout_singlestep is not None:
                timeout_for_this_process = min(remaining_time, timeout_singlestep)
        task = task_resource_provider(
            X,
            evaluator,
            optimizer_names,
            suggest_overwrites,
            storage_.url,
            study_name_,
            RNS.randint(0, np.iinfo(np.int32).max, dtype=np.int32),
            logger,
        )

        task.start()
        trial_number = task.receive_trial_number()
        task.join(timeout=timeout_for_this_process)

        if task.exit_code is None:
            task.terminate()
            try:
                logger.info(f"Trial {trial_number} timeout.")
                storage_.read_trials_from_remote_storage(study_id)
                trial_id = storage_.get_trial_id_from_study_id_trial_number(
                    study_id, trial_number
                )
                trial_this = storage_.get_trial(trial_id)
                intermediate_values = sorted(
                    list(trial_this.intermediate_values.items()),
                    key=_sort_intermediate,
                )

                if intermediate_values:
                    # Though terminated, it resulted in some values.
                    # Regard it as a COMPLETE trial.
                    storage_.set_trial_values(
                        trial_id,
                        [intermediate_values[0][1]],
                    )
                    storage_.set_trial_user_attr(
                        trial_id, "max_epoch", intermediate_values[0][0] + 1
                    )
                else:
                    # Penalize such a time-consuming trial
                    storage_.set_trial_values(trial_id, [0.0])
                storage_.set_trial_state(trial_id, TrialState.COMPLETE)
            except RuntimeError:  # pragma: no cover
                pass  # pragma: no cover

        if callback is not None:
            callback(trial_number, study_to_dataframe(study))

        now = time.time()
        elapsed = now - start
        if timeout_overall is not None:
            if elapsed > timeout_overall:
                break
    best_params_with_prefix = dict(
        **study.best_trial.params,
        **{
            key: val
            for key, val in study.best_trial.user_attrs.items()
            if is_valid_param_name(key)
        },
    )
    best_params = {
        re.sub(r"^([^\.]*\.)", "", key): value
        for key, value in best_params_with_prefix.items()
    }
    optimizer_name: str = best_params.pop("optimizer_name")
    result_df = study_to_dataframe(study)

    if storage is None:
        optional_db_path.unlink()
    recommender_class = get_optimizer_class(optimizer_name).recommender_class

    return (recommender_class, best_params, result_df)
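Finally, a hedged usage sketch of `autopilot`, based only on the signature, docstring, and return value shown above. `X_train` (an `InteractionMatrix`, i.e. a sparse user-item matrix) and `evaluator` (an `Evaluator` built from a validation split) are assumed to exist already; their construction is not shown.

# Hypothetical usage; X_train and evaluator are assumed to be prepared elsewhere.
recommender_class, best_params, trial_df = autopilot(
    X_train,
    evaluator,
    n_trials=20,
    memory_budget=4000,      # rough memory bound in MB, not strictly enforced
    timeout_overall=3600,    # stop the whole search after roughly an hour
    timeout_singlestep=600,  # terminate any single trial after 10 minutes
    random_seed=0,
    callback=lambda trial_number, history_df: print(
        "finished trial", trial_number, "with", len(history_df), "rows of history"
    ),
)

print(recommender_class.__name__)
print(best_params)
print(trial_df.head())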