Ejemplo n.º 1
0
def _check_directory(directory: str, if_exists: str) -> str:
    """Decide what to do when ``directory`` is already present on disk.

    If nothing exists at ``directory`` the path is returned untouched.
    Otherwise ``if_exists`` selects the policy:

    * ``'error'``   -- raise ``ValueError``.
    * ``'replace'`` -- emit a warning, delete the MLflow run whose id is
      returned by ``_try_to_get_existing_mlflow_run_id`` (when it is not
      ``None``) and remove the directory tree.
    * ``'rename'``  -- emit a warning and return the first path of the form
      ``<directory>_<n>`` (``n`` = 1, 2, ...) that does not exist yet.

    Any other ``if_exists`` value leaves the existing directory alone and
    returns ``directory`` unchanged.

    :param directory: path that is about to be used as an output directory
    :param if_exists: one of ``'error'``, ``'replace'`` or ``'rename'``
    :return: the path that should actually be used
    """
    if not os.path.exists(directory):
        return directory

    if if_exists == 'error':
        raise ValueError('directory {} already exists.'.format(directory))

    if if_exists == 'replace':
        warnings.warn(
            'directory {} already exists. It will be replaced by the new result'
            .format(directory))

        stale_run_id = _try_to_get_existing_mlflow_run_id(directory)
        if stale_run_id is not None:
            requires_mlflow()
            import mlflow
            mlflow.delete_run(stale_run_id)

        # Errors raised while removing the tree are ignored on purpose.
        shutil.rmtree(directory, ignore_errors=True)
        return directory

    if if_exists == 'rename':
        suffix = 1
        candidate = directory + '_' + str(suffix)
        while os.path.exists(candidate):
            suffix += 1
            candidate = directory + '_' + str(suffix)

        directory = candidate
        warnings.warn(
            'directory is renamed to {} because the original directory already exists.'
            .format(directory))

    return directory
Ejemplo n.º 2
0
        def finalize():
            """Publish the last checkpoint and close (or discard) the MLflow run.

            Only the main process (``args.local_rank`` of -1 or 0) does the
            work, and only when ``args.n_epochs > 0``:

            * the last saved checkpoint is renamed to
              ``<args.logdir>/WEIGHTS_NAME``;
            * when ``args.use_mlflow`` is set, the weights are logged as an
              artifact under ``"training"``, the run is ended and
              ``args.logdir`` is removed;
            * if any of the above fails, the active MLflow run (if there is
              one) is deleted and ``args.logdir`` is removed.

            Other ranks wait on a distributed barrier that rank 0 releases
            once it is done.
            """
            is_main_process = args.local_rank in [-1, 0]

            if not is_main_process:
                # Non-main ranks block here until rank 0 reaches the matching
                # barrier at the end of its own finalization below.
                torch.distributed.barrier()

            if is_main_process and args.n_epochs > 0:
                try:
                    # On the main process: rename the last checkpoint
                    # (for easy re-loading with from_pretrained method)
                    os.rename(checkpoint_handler._saved[-1][1][-1], os.path.join(args.logdir, WEIGHTS_NAME))

                    if args.use_mlflow:
                        mlflow.log_artifact(args.logdir / WEIGHTS_NAME, "training")
                        logger.info("ending mlflow run")
                        logger.info(f"run_id: {mlflow.active_run().info.run_id}")
                        mlflow.end_run()

                        rmtree(args.logdir)

                # ``Exception`` instead of a bare ``except`` so that
                # KeyboardInterrupt / SystemExit are not swallowed by the cleanup.
                except Exception:
                    logger.info("No checkpoint to finalize the model. Deleting run")
                    # TODO: fix issue in mlflow trying to delete the experiment multiple times
                    # There may be no active run (none was started, or the
                    # failure happened after ``mlflow.end_run()``); only delete
                    # when one exists so the cleanup itself cannot raise
                    # AttributeError on ``None``.
                    active_run = mlflow.active_run()
                    if active_run is not None:
                        mlflow.delete_run(active_run.info.run_id)
                    rmtree(args.logdir)

                # NOTE(review): this barrier is only reached when
                # ``args.n_epochs > 0``; with ``n_epochs == 0`` rank 0 never
                # releases the ranks waiting above -- confirm this is intended.
                if args.local_rank == 0:
                    torch.distributed.barrier()
Ejemplo n.º 3
0
def ignore_and_delete_unfinished(df: DataFrame) -> DataFrame:
    """
     Delete the runs regarded as 'unfinished' and return only the remaining rows.

     A row counts as unfinished when its 'metrics.percentage_infected' value differs from 0.0.
     Each unfinished row's 'run_id' is passed to ``mlflow.delete_run``.

     :param df: (DataFrame) all existing experiments
     :return: (DataFrame) only finished experiments
     """
    unfinished = df['metrics.percentage_infected'] != 0.0

    stale_rows = df.loc[unfinished]
    n_stale = len(stale_rows)
    if n_stale:
        print('There are {} experiments to be deleted'.format(n_stale))
        for stale_id in stale_rows['run_id']:
            mlflow.delete_run(run_id=stale_id)

    return df.loc[~unfinished]
Ejemplo n.º 4
0
def delete_unfinished_experiments():
    """
    Delete every run of the configured experiment whose 'metrics.percentage_infected' is not 0.0.

    The experiment name and id are read from ``GlobalConfig()``; progress is shown with ``tqdm``.

    :return: None
    """
    print('Loading experiments ...')
    mlflow.set_experiment(GlobalConfig().experiment_name)
    all_runs = mlflow.search_runs(experiment_ids=GlobalConfig().experiment_id)
    print('... found {} experiments.'.format(len(all_runs)))

    # Runs whose metric is anything other than 0.0 are treated as not completed
    not_completed = all_runs['metrics.percentage_infected'] != 0.0
    unfinished_runs = all_runs[not_completed]

    # Remove each of them through the MLflow API
    print('There are {} experiments to be deleted'.format(len(unfinished_runs)))
    for stale_id in tqdm(unfinished_runs['run_id']):
        mlflow.delete_run(run_id=stale_id)
    def remove_old_models(
        self, experiment_name: str, max_n_models: int = 10, artifact_folder: str = None
    ):
        """Remove old models per experiment, keeping the most recent ones.

        Runs returned by ``self._find_models`` are ranked by ``end_time``
        (newest first); everything after the first ``max_n_models`` runs is
        deleted through ``mlflow.delete_run``.

        Note: This functionality is not incorporated in MLFlow natively
        See also: https://github.com/mlflow/mlflow/issues/2152

        Args:
            experiment_name: Experiment whose runs are looked up via
                ``self._find_models``.
            max_n_models: Number of most recent runs to keep. Must be >= 1.
            artifact_folder: When given, the directory
                ``<artifact_folder>/mlruns/<experiment_id>/<run_id>`` of each
                deleted run is also removed from disk (best effort).

        Raises:
            ValueError: If ``max_n_models`` is smaller than 1.
        """
        if max_n_models < 1:
            raise ValueError(
                f"Max models to keep should be at least 1! Received: {max_n_models}"
            )
        previous_runs = self._find_models(experiment_name=experiment_name)
        if len(previous_runs) > max_n_models:
            self.logger.debug(
                f"Going to delete old models. {len(previous_runs)} > {max_n_models}"
            )
            # Find run_ids of oldest runs.
            # Positional slicing (.iloc) is required here: after sorting, the
            # index labels are no longer in order, so a label-based .loc slice
            # would not reliably skip the ``max_n_models`` newest runs.
            runs_to_remove = previous_runs.sort_values(
                by="end_time", ascending=False
            ).iloc[max_n_models:, :]
            for _, run in runs_to_remove.iterrows():
                self.logger.debug(
                    f"Going to remove run {run.run_id}, from {run.end_time}."
                )
                mlflow.delete_run(run.run_id)
                self.logger.debug("Removed run")

                # mlflow.delete_run only marks it as deleted but does not delete it by itself
                if artifact_folder:  # Also try to remove artifact from disk.
                    artifact_filepath = (
                        f"{artifact_folder}/mlruns/{run.experiment_id}/{run.run_id}"
                    )
                    self.logger.debug(f"Removing artifact: {artifact_filepath}")
                    try:
                        shutil.rmtree(artifact_filepath)
                        self.logger.debug("Removed artifact")
                    except Exception as e:
                        # Disk cleanup is best effort; a failure is only logged.
                        self.logger.info(f"Failed removing artifacts: {e}")
Ejemplo n.º 6
0
if __name__ == "__main__":

    def print_run_infos(run_infos):
        """Print the run id and lifecycle stage of every entry in ``run_infos``."""
        for info in run_infos:
            line = "- run_id: {}, lifecycle_stage: {}".format(
                info.run_id, info.lifecycle_stage)
            print(line)

    warnings.filterwarnings("ignore")
    print(mlflow.__version__)

    # Create two runs, each logging one parameter and one metric
    with mlflow.start_run() as run1:
        mlflow.log_param("p", 0)
        mlflow.log_metric("click_rate", 1.55)

    with mlflow.start_run() as run2:
        mlflow.log_param("p", 0)
        mlflow.log_metric("click_rate", 2.50)

    # Delete the second (most recently created) run
    mlflow.delete_run(run2.info.run_id)

    # List the runs of experiment "0", first the active ones, then the deleted ones
    print("Active runs:")
    active_infos = mlflow.list_run_infos(
        "0", run_view_type=ViewType.ACTIVE_ONLY)
    print_run_infos(active_infos)

    print("Deleted runs:")
    deleted_infos = mlflow.list_run_infos(
        "0", run_view_type=ViewType.DELETED_ONLY)
    print_run_infos(deleted_infos)

    print("All runs:")
Ejemplo n.º 7
0
#
# Code snippet for https://mlflow.org/docs/latest/python_api/mlflow.html#delete_run
#
import warnings
import mlflow

if __name__ == "__main__":

    warnings.filterwarnings("ignore")
    print(mlflow.__version__)

    # Start a run, log a single parameter and let the context manager end it
    with mlflow.start_run() as run:
        mlflow.log_param("p", 0)

    run_id = run.info.run_id
    mlflow.delete_run(run_id)

    # The run is fetched again after deletion to show its lifecycle stage
    lifecycle_stage = mlflow.get_run(run_id).info.lifecycle_stage
    print("run_id: {}; lifecycle_stage: {}".format(run_id, lifecycle_stage))
Ejemplo n.º 8
0
 def delete_run(self, run_id):
     """Delete the run identified by ``run_id`` through ``mlflow.delete_run``.

     :param run_id: id of the run to delete
     :return: whatever ``mlflow.delete_run`` returns
     """
     outcome = mlflow.delete_run(run_id)
     return outcome