Example #1
import logging
import os
from typing import List, Optional, Tuple

logger = logging.getLogger(__name__)


def _run_ablation_experiments(
    directories: List[Tuple[str, str]],
    best_replicates: Optional[int] = None,
    dry_run: bool = False,
    move_to_cpu: bool = True,
    discard_replicates: bool = False,
):
    """Run ablation experiments."""
    if dry_run:
        return

    from pykeen.hpo import hpo_pipeline_from_path

    for output_directory, rv_config_path in directories:
        hpo_pipeline_result = hpo_pipeline_from_path(rv_config_path)
        hpo_pipeline_result.save_to_directory(output_directory)

        if not best_replicates:
            continue

        best_pipeline_dir = os.path.join(output_directory, 'best_pipeline')
        os.makedirs(best_pipeline_dir, exist_ok=True)
        logger.info('Re-training best pipeline and saving artifacts in %s', best_pipeline_dir)
        hpo_pipeline_result.replicate_best_pipeline(
            replicates=best_replicates,
            move_to_cpu=move_to_cpu,
            save_replicates=not discard_replicates,
            directory=best_pipeline_dir,
        )
Example #2
import logging
import pathlib
from typing import Optional, Sequence, Tuple, Union

logger = logging.getLogger(__name__)


def _run_ablation_experiments(
    directories: Sequence[Tuple[Union[str, pathlib.Path], Union[str, pathlib.Path]]],
    best_replicates: Optional[int] = None,
    dry_run: bool = False,
    move_to_cpu: bool = True,
    discard_replicates: bool = False,
) -> None:
    """Run ablation experiments."""
    if dry_run:
        return

    from pykeen.hpo import hpo_pipeline_from_path

    for output_directory, rv_config_path in directories:
        if isinstance(output_directory, str):
            output_directory = pathlib.Path(output_directory).resolve()
        hpo_pipeline_result = hpo_pipeline_from_path(rv_config_path)
        hpo_pipeline_result.save_to_directory(output_directory)

        if not best_replicates:
            continue

        best_pipeline_dir = output_directory.joinpath('best_pipeline')
        best_pipeline_dir.mkdir(exist_ok=True, parents=True)
        logger.info('Re-training best pipeline and saving artifacts in %s', best_pipeline_dir)
        hpo_pipeline_result.replicate_best_pipeline(
            replicates=best_replicates,
            move_to_cpu=move_to_cpu,
            save_replicates=not discard_replicates,
            directory=best_pipeline_dir,
        )
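
The helper in the two examples above only consumes pairs of (output directory, HPO configuration path). A minimal, hypothetical invocation could look like the following sketch; the paths are placeholders and assume that the HPO configuration files were already generated:

# Hypothetical usage sketch: the paths below are placeholders, assuming the
# HPO configuration files were written out beforehand (e.g. by prepare_ablation).
pairs = [
    ('results/transe_nations', 'results/transe_nations/hpo_config.json'),
    ('results/complex_nations', 'results/complex_nations/hpo_config.json'),
]
_run_ablation_experiments(
    directories=pairs,
    best_replicates=3,         # re-train and evaluate the best pipeline three times
    move_to_cpu=True,          # move each replicate to the CPU after training
    discard_replicates=False,  # save the re-trained replicates to disk
)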
Example #3
import os
import sys
import time
from typing import Optional
from uuid import uuid4

import click


def ablation(
    path: str,
    directory: Optional[str],
    dry_run: bool,
    best_replicates: int,
    save_artifacts: bool,
    move_to_cpu: bool,
    discard_replicates: bool,
) -> None:
    """Generate a set of HPO configurations.

    A sample file can be run with ``pykeen experiment ablation tests/resources/hpo_complex_nations.json``.
    """
    from pykeen.ablation import prepare_ablation

    datetime = time.strftime('%Y-%m-%d-%H-%M')
    directory = os.path.join(directory, f'{datetime}_{uuid4()}')

    directories = prepare_ablation(
        path=path,
        directory=directory,
        save_artifacts=save_artifacts,
    )
    if dry_run:
        return sys.exit(0)

    from pykeen.hpo import hpo_pipeline_from_path

    for output_directory, rv_config_path in directories:
        hpo_pipeline_result = hpo_pipeline_from_path(rv_config_path)
        hpo_pipeline_result.save_to_directory(output_directory)

        if not best_replicates:
            continue

        best_pipeline_dir = os.path.join(output_directory, 'best_pipeline')
        os.makedirs(best_pipeline_dir, exist_ok=True)
        click.echo(
            f'Re-training best pipeline and saving artifacts in {best_pipeline_dir}'
        )
        hpo_pipeline_result.replicate_best_pipeline(
            replicates=best_replicates,
            move_to_cpu=move_to_cpu,
            save_replicates=not discard_replicates,
            directory=best_pipeline_dir,
        )
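
For reference, the preparation step of this command can also be driven directly from Python. The sketch below mirrors the prepare_ablation call above; the output directory is a placeholder:

# Sketch of the programmatic equivalent of the preparation step above; the
# output directory is a placeholder. prepare_ablation returns pairs of
# (output directory, HPO configuration path), as consumed by the loop above.
from pykeen.ablation import prepare_ablation

directories = prepare_ablation(
    path='tests/resources/hpo_complex_nations.json',
    directory='ablation_output',
    save_artifacts=True,
)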
Example #4
def optimize(path: str, directory: str):
    """Run a single HPO experiment."""
    from pykeen.hpo import hpo_pipeline_from_path
    hpo_pipeline_result = hpo_pipeline_from_path(path)
    hpo_pipeline_result.save_to_directory(directory)
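
A direct call mirroring optimize could look like this sketch; the configuration path and output directory are placeholders:

# Hypothetical paths: hpo_pipeline_from_path runs the HPO study described in
# the given configuration file, and save_to_directory persists its results.
from pykeen.hpo import hpo_pipeline_from_path

hpo_pipeline_result = hpo_pipeline_from_path('my_hpo_config.json')
hpo_pipeline_result.save_to_directory('hpo_output')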
Example #5
import logging
import os
import time
from typing import Any, List, Mapping, Optional, Union
from uuid import uuid4

from pykeen.ablation import prepare_ablation

# Nested keyword-argument mappings (type aliases; definitions assumed to mirror pykeen's).
Mapping2D = Mapping[str, Mapping[str, Any]]
Mapping3D = Mapping[str, Mapping[str, Mapping[str, Any]]]

logger = logging.getLogger(__name__)


def ablation_pipeline(
    datasets: Union[str, List[str]],
    models: Union[str, List[str]],
    losses: Union[str, List[str]],
    optimizers: Union[str, List[str]],
    training_loops: Union[str, List[str]],
    *,
    create_inverse_triples: Union[bool, List[bool]] = False,
    regularizers: Union[None, str, List[str]] = None,
    model_to_model_kwargs: Optional[Mapping2D] = None,
    model_to_model_kwargs_ranges: Optional[Mapping2D] = None,
    model_to_trainer_to_training_kwargs: Optional[Mapping3D] = None,
    model_to_trainer_to_training_kwargs_ranges: Optional[Mapping3D] = None,
    ablation_config: Optional[Mapping3D] = None,
    evaluator: Optional[str] = None,
    optuna_config: Optional[Mapping[str, Any]] = None,
    evaluator_kwargs: Optional[Mapping[str, Any]] = None,
    evaluation_kwargs: Optional[Mapping[str, Any]] = None,
    directory: Optional[str] = None,
    dry_run: bool = False,
    best_replicates: Optional[int] = None,
    save_artifacts: bool = True,
    move_to_cpu: bool = True,
    discard_replicates: bool = False,
) -> None:
    """Generate a set of HPO configurations.

    A sample file can be run with ``pykeen experiment ablation tests/resources/hpo_complex_nations.json``.

    :param datasets: A dataset name or list of dataset names
    :param models: A model name or list of model names
    :param losses: A loss function name or list of loss function names
    :param optimizers: An optimizer name or list of optimizer names
    :param training_loops: A training loop name or list of training loop names
    :param create_inverse_triples: Either a boolean for a single entry or a list of booleans
    :param regularizers: A regularizer name, list of regularizer names, or None if no regularizer is desired.
        Defaults to None.

    :param evaluator: The name of the evaluator to be used. Defaults to the rank-based evaluator.
    :param evaluator_kwargs: The keyword arguments passed to the evaluator (in the pipeline)
    :param evaluation_kwargs: The keyword arguments passed during evaluation (in the pipeline)

    :param model_to_model_kwargs: A mapping from model name to dictionaries of default keyword arguments for
        the instantiation of that model
    :param model_to_model_kwargs_ranges: A mapping from model name to dictionaries of keyword argument
        ranges for that model to be used in HPO.
    :param model_to_trainer_to_training_kwargs: A mapping from model name to a mapping of trainer name to a mapping
        of default keyword arguments for the instantiation of that trainer. This is useful because for some models,
        you might want to set the number of epochs differently.
    :param model_to_trainer_to_training_kwargs_ranges: A mapping from model name to a mapping of trainer name
        to a mapping of keyword arguments for that trainer to be used in HPO.
    :param ablation_config: Additional third-order and fourth-order ablation configuration for all other ablation
        keys to models to either kwargs or kwarg ranges

    :param optuna_config: Configuration passed to optuna for HPO over all ablation studies
    :param directory: The directory in which the experimental artifacts will be saved.
    :param dry_run: Whether to only create the configurations for the individual experiments without running them.
    :param best_replicates: How many times the final model should be re-trained and evaluated with the best
        hyper-parameters, which makes it possible to measure the variance in performance.
    :param save_artifacts: Whether each trained model sampled during HPO should be saved.
    :param move_to_cpu: Whether a replicate of the best model should be moved to the CPU.
        We recommend setting this flag to ``True`` to avoid unnecessary GPU usage.
    :param discard_replicates: Whether the best model should be discarded after training and evaluation.
    """
    datetime = time.strftime('%Y-%m-%d-%H-%M')
    directory = os.path.join(directory, f'{datetime}_{uuid4()}')

    directories = prepare_ablation(
        datasets=datasets,
        create_inverse_triples=create_inverse_triples,
        models=models,
        model_to_model_kwargs=model_to_model_kwargs,
        model_to_model_kwargs_ranges=model_to_model_kwargs_ranges,
        model_to_trainer_to_training_kwargs=model_to_trainer_to_training_kwargs,
        model_to_trainer_to_training_kwargs_ranges=model_to_trainer_to_training_kwargs_ranges,
        losses=losses,
        regularizers=regularizers,
        optimizers=optimizers,
        training_loops=training_loops,
        evaluator=evaluator,
        optuna_config=optuna_config,
        ablation_config=ablation_config,
        evaluator_kwargs=evaluator_kwargs,
        evaluation_kwargs=evaluation_kwargs,
        directory=directory,
        save_artifacts=save_artifacts,
    )
    if dry_run:
        return

    from pykeen.hpo import hpo_pipeline_from_path

    for output_directory, rv_config_path in directories:
        hpo_pipeline_result = hpo_pipeline_from_path(rv_config_path)
        hpo_pipeline_result.save_to_directory(output_directory)

        if not best_replicates:
            continue

        best_pipeline_dir = os.path.join(output_directory, 'best_pipeline')
        os.makedirs(best_pipeline_dir, exist_ok=True)
        logger.info('Re-training best pipeline and saving artifacts in %s',
                    best_pipeline_dir)
        hpo_pipeline_result.replicate_best_pipeline(
            replicates=best_replicates,
            move_to_cpu=move_to_cpu,
            save_replicates=not discard_replicates,
            directory=best_pipeline_dir,
        )
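
A minimal call of ablation_pipeline could look like the sketch below; the concrete dataset, model, loss, optimizer, training loop, and output directory are illustrative, and only keywords from the signature above are used:

# Illustrative values only; every keyword below appears in the signature above.
ablation_pipeline(
    datasets='Nations',
    models=['TransE', 'ComplEx'],
    losses='marginranking',
    optimizers='adam',
    training_loops='slcwa',
    directory='ablation_results',
    best_replicates=2,
)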