Example #1
def _create_run(uri, experiment_id, work_dir, entry_point):
    """
    Create a ``Run`` against the current MLflow tracking server, logging metadata (e.g. the URI,
    entry point, and parameters of the project) about the run. Return an ``ActiveRun`` that can be
    used to report additional data about the run (metrics/params) to the tracking server.
    """
    if _is_local_uri(uri):
        source_name = tracking.utils._get_git_url_if_present(_expand_uri(uri))
    else:
        source_name = _expand_uri(uri)
    source_version = _get_git_commit(work_dir)
    existing_run = fluent.active_run()
    if existing_run:
        parent_run_id = existing_run.info.run_id
    else:
        parent_run_id = None

    tags = {
        MLFLOW_USER: _get_user(),
        MLFLOW_SOURCE_NAME: source_name,
        MLFLOW_SOURCE_TYPE: SourceType.to_string(SourceType.PROJECT),
        MLFLOW_PROJECT_ENTRY_POINT: entry_point
    }
    if source_version is not None:
        tags[MLFLOW_GIT_COMMIT] = source_version
    if parent_run_id is not None:
        tags[MLFLOW_PARENT_RUN_ID] = parent_run_id

    active_run = tracking.MlflowClient().create_run(experiment_id=experiment_id, tags=tags)
    return active_run
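
A minimal sketch of the same run creation through MLflow's public client API, with illustrative values (the experiment ID "0" and all tag values here are assumptions, not derived from a real project):

from mlflow.entities import SourceType
from mlflow.tracking import MlflowClient
from mlflow.utils.mlflow_tags import (
    MLFLOW_PROJECT_ENTRY_POINT,
    MLFLOW_SOURCE_NAME,
    MLFLOW_SOURCE_TYPE,
    MLFLOW_USER,
)

# Stand-in values for what _create_run derives from the project itself.
tags = {
    MLFLOW_USER: "alice",
    MLFLOW_SOURCE_NAME: "https://github.com/org/project",
    MLFLOW_SOURCE_TYPE: SourceType.to_string(SourceType.PROJECT),
    MLFLOW_PROJECT_ENTRY_POINT: "main",
}
run = MlflowClient().create_run(experiment_id="0", tags=tags)
print(run.info.run_id)
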
Example #2
def _get_docker_tag_name(imagename, work_dir):
    """Returns an appropriate Docker tag for a project based on name and git hash."""
    imagename = imagename if imagename else "docker-project"
    # Optionally include the first 7 characters of the git SHA in the tag name, if available.
    git_commit = _get_git_commit(work_dir)
    version_string = ":" + git_commit[:7] if git_commit else ""
    return imagename + version_string
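
A standalone sketch of the same tagging rule, with a made-up commit SHA (docker_tag is a hypothetical helper written for illustration, not part of MLflow):

def docker_tag(image_name, git_commit):
    """Append the short git SHA to the image name when a commit is available."""
    image_name = image_name or "docker-project"
    return image_name + (":" + git_commit[:7] if git_commit else "")

assert docker_tag("my-image", "1a2b3c4d5e6f") == "my-image:1a2b3c4"
assert docker_tag(None, None) == "docker-project"
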
Example #3
def _create_run(uri, experiment_id, work_dir, version, entry_point,
                parameters):
    """
    Create a ``Run`` against the current MLflow tracking server, logging metadata (e.g. the URI,
    entry point, and parameters of the project) about the run. Return an ``ActiveRun`` that can be
    used to report additional data about the run (metrics/params) to the tracking server.
    """
    if _is_local_uri(uri):
        source_name = tracking._tracking_service.utils._get_git_url_if_present(
            _expand_uri(uri))
    else:
        source_name = _expand_uri(uri)
    source_version = _get_git_commit(work_dir)
    existing_run = fluent.active_run()
    if existing_run:
        parent_run_id = existing_run.info.run_id
    else:
        parent_run_id = None

    tags = {
        MLFLOW_USER: _get_user(),
        MLFLOW_SOURCE_NAME: source_name,
        MLFLOW_SOURCE_TYPE: SourceType.to_string(SourceType.PROJECT),
        MLFLOW_PROJECT_ENTRY_POINT: entry_point,
    }
    if source_version is not None:
        tags[MLFLOW_GIT_COMMIT] = source_version
    if parent_run_id is not None:
        tags[MLFLOW_PARENT_RUN_ID] = parent_run_id

    repo_url = _get_git_repo_url(work_dir)
    if repo_url is not None:
        tags[MLFLOW_GIT_REPO_URL] = repo_url
        tags[LEGACY_MLFLOW_GIT_REPO_URL] = repo_url

    # Add branch name tag if a branch is specified through --version
    if _is_valid_branch_name(work_dir, version):
        tags[MLFLOW_GIT_BRANCH] = version
        tags[LEGACY_MLFLOW_GIT_BRANCH_NAME] = version
    active_run = tracking.MlflowClient().create_run(
        experiment_id=experiment_id, tags=tags)

    project = _project_spec.load_project(work_dir)
    # Consolidate parameters for logging.
    # `storage_dir` is `None` since we want to log the actual path, not the downloaded local path
    entry_point_obj = project.get_entry_point(entry_point)
    final_params, extra_params = entry_point_obj.compute_parameters(
        parameters, storage_dir=None)
    params_list = [
        Param(key, value)
        for key, value in list(final_params.items()) + list(extra_params.items())
    ]
    tracking.MlflowClient().log_batch(active_run.info.run_id,
                                      params=params_list)
    return active_run
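
The parameter-logging step at the end can be reproduced with the public API alone; a minimal sketch (the experiment ID and parameter names are illustrative):

from mlflow.entities import Param
from mlflow.tracking import MlflowClient

client = MlflowClient()
run = client.create_run(experiment_id="0")
# log_batch writes all parameters in a single request rather than one call each.
params = [Param("alpha", "0.5"), Param("epochs", "10")]
client.log_batch(run.info.run_id, params=params)
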
Example #4
def _get_docker_image_uri(repository_uri, work_dir):
    """
    Returns an appropriate Docker image URI for a project based on the git hash of the specified
    working directory.

    :param repository_uri: The URI of the Docker repository with which to tag the image. The
                           repository URI is used as the prefix of the image URI.
    :param work_dir: Path to the working directory in which to search for a git commit hash
    """
    repository_uri = repository_uri if repository_uri else "docker-project"
    # Optionally include the first 7 characters of the git SHA in the tag name, if available.
    git_commit = _get_git_commit(work_dir)
    version_string = ":" + git_commit[:7] if git_commit else ""
    return repository_uri + version_string
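
A hedged usage sketch: building an image tagged with the project's short git SHA (the registry URI is illustrative, and _get_docker_image_uri refers to the function above):

import subprocess

image_uri = _get_docker_image_uri("registry.example.com/team/project", ".")
subprocess.run(["docker", "build", "-t", image_uri, "."], check=True)
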
Example #5
def main(cfg: DictConfig) -> None:
    # set up mlflow experiment id
    mlflow.set_tracking_uri(f"file://{to_absolute_path(cfg.path_to_mlflow)}")
    experiment = mlflow.get_experiment_by_name(cfg.experiment_name)

    if experiment is not None:
        run_kwargs = {'experiment_id': experiment.experiment_id}
        if cfg["pretrained"] is not None:  # initialise with pretrained run, otherwise create a new run
            run_kwargs['run_id'] = cfg["pretrained"]["run_id"]
    else:  # create new experiment
        experiment_id = mlflow.create_experiment(cfg.experiment_name)
        run_kwargs = {'experiment_id': experiment_id}

    # run the training with mlflow tracking
    with mlflow.start_run(**run_kwargs) as main_run:
        if cfg["pretrained"] is not None:
            mlflow.start_run(experiment_id=run_kwargs['experiment_id'],
                             nested=True)
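        # NB: the nested run is intentionally left open; mlflow.end_run() below
        # closes it, and exiting the `with` block then closes the parent run.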
        active_run = mlflow.active_run()
        run_id = active_run.info.run_id

        setup_gpu(cfg.gpu_cfg)
        training_cfg = OmegaConf.to_object(
            cfg.training_cfg)  # convert to python dictionary
        scaling_cfg = to_absolute_path(cfg.scaling_cfg)
        dataloader = DataLoader.DataLoader(training_cfg, scaling_cfg)
        setup = dataloader.config["SetupNN"]
        TauLosses.SetSFs(*setup["TauLossesSFs"])
        print("loss consts:", TauLosses.Le_sf, TauLosses.Lmu_sf,
              TauLosses.Ltau_sf, TauLosses.Ljet_sf)

        if setup["using_new_loss"]:
            tf.config.run_functions_eagerly(True)
        netConf_full = dataloader.get_net_config()

        if dataloader.input_type == "Adversarial":
            model = create_model(
                netConf_full,
                dataloader.model_name,
                loss=setup["loss"],
                use_newloss=setup["using_new_loss"],
                use_AdvDataset=True,
                adv_param=dataloader.adversarial_parameter,
                n_adv_tau=dataloader.adv_batch_size,
                adv_learning_rate=dataloader.adv_learning_rate)
        else:
            model = create_model(netConf_full,
                                 dataloader.model_name,
                                 loss=setup["loss"],
                                 use_newloss=setup["using_new_loss"])

        if cfg.pretrained is None:
            print(
                "Warning: no pretrained NN -> training will be started from scratch"
            )
            old_opt = None
        else:
            print("Warning: training will be started from pretrained model.")
            print(
                f"Model: run_id={cfg.pretrained.run_id}, experiment_id={cfg.pretrained.experiment_id}, model={cfg.pretrained.starting_model}"
            )

            path_to_pretrain = to_absolute_path(
                f'{cfg.path_to_mlflow}/{cfg.pretrained.experiment_id}/{cfg.pretrained.run_id}/artifacts/'
            )
            old_model = load_model(
                path_to_pretrain +
                f"/model_checkpoints/{cfg.pretrained.starting_model}",
                compile=False,
                custom_objects=None)
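            # Transfer weights layer-by-layer, matching layers by name.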
            for layer in model.layers:
                weights_found = False
                for old_layer in old_model.layers:
                    if layer.name == old_layer.name:
                        layer.set_weights(old_layer.get_weights())
                        weights_found = True
                        break
                if not weights_found:
                    print(f"Weights for layer '{layer.name}' not found.")
            old_opt = old_model.optimizer
            old_vars = [var.name for var in old_model.trainable_variables]

        compile_model(model, setup["optimizer_name"], setup["learning_rate"],
                      setup["metrics"], setup["schedule_decay"])
        fit_hist = run_training(model,
                                dataloader,
                                False,
                                cfg.log_suffix,
                                setup["using_new_loss"],
                                old_opt=old_opt)

        # log NN params
        for net_type in [
                'tau_net', 'comp_net', 'comp_merge_net', 'conv_2d_net',
                'dense_net'
        ]:
            mlflow.log_params({
                f'{net_type}_{k}': v
                for k, v in cfg.training_cfg.SetupNN[net_type].items()
            })
        mlflow.log_params({
            f'TauLossesSFs_{i}': v
            for i, v in enumerate(cfg.training_cfg.SetupNN.TauLossesSFs)
        })
        summary_path = to_absolute_path(
            f'{cfg.path_to_mlflow}/{run_kwargs["experiment_id"]}/{run_id}/artifacts/model_summary.txt'
        )
        marker = 'Trainable params: '
        with open(summary_path) as f:
            for line in f:
                if marker in line:
                    mlflow.log_param('n_train_params',
                                     int(line.split(marker)[-1].replace(',', '')))

        # log training related files
        mlflow.log_dict(training_cfg, 'input_cfg/training_cfg.yaml')
        mlflow.log_artifact(scaling_cfg, 'input_cfg')
        mlflow.log_artifact(to_absolute_path("Training_CNN.py"), 'input_cfg')
        mlflow.log_artifact(to_absolute_path("common.py"), 'input_cfg')

        # log hydra files
        mlflow.log_artifacts('.hydra', 'input_cfg/hydra')
        mlflow.log_artifact('Training_CNN.log', 'input_cfg/hydra')

        # log misc. info
        mlflow.log_param('run_id', run_id)
        mlflow.log_param('git_commit', _get_git_commit(to_absolute_path('.')))
        print(
            f'\nTraining has finished! Corresponding MLflow experiment name (ID): {cfg.experiment_name}({run_kwargs["experiment_id"]}), and run ID: {run_id}\n'
        )
        mlflow.end_run()

        # Temporary workaround to kill additional subprocesses that have not exited correctly
        try:
            current_process = psutil.Process()
            children = current_process.children(recursive=True)
            for child in children:
                child.kill()
        except Exception:
            pass
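
The get-or-create experiment logic at the top of this example is a common MLflow pattern; a minimal standalone sketch (the experiment name and logged parameter are illustrative):

import mlflow

def get_or_create_experiment_id(name):
    """Return the ID of an existing experiment, creating it if it does not exist."""
    experiment = mlflow.get_experiment_by_name(name)
    if experiment is not None:
        return experiment.experiment_id
    return mlflow.create_experiment(name)

with mlflow.start_run(experiment_id=get_or_create_experiment_id("demo")) as run:
    mlflow.log_param("example_param", 1)  # illustrative parameter
    print("run ID:", run.info.run_id)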