def test_duplicate_parameters(self):
    config_dict = self.load_config_dict(
        self.CONFIG_WITH_DUPLICATE_PARAMETERS_1)
    with self.assertRaises(ConfigError):
        configs = config.generate_configs(config_dict)

    config_dict = self.load_config_dict(
        self.CONFIG_WITH_DUPLICATE_PARAMETERS_2)
    with self.assertRaises(ConfigError):
        configs = config.generate_configs(config_dict)

    with self.assertRaises(ConfigError):
        configs = config.read_config(
            self.CONFIG_WITH_DUPLICATE_PARAMETERS_3)

    config_dict = self.load_config_dict(
        self.CONFIG_WITH_DUPLICATE_PARAMETERS_NESTED)
    with self.assertRaises(ConfigError):
        configs = config.generate_configs(config_dict)

    config_dict = self.load_config_dict(
        self.CONFIG_WITH_DUPLICATE_RDM_PARAMETERS_2)
    configs = config.generate_configs(config_dict)
    assert len(configs) == config_dict['random']['samples']
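# Illustrative sketch (not one of the fixture files used above): a config dict
# in which the same parameter is defined in both the `grid` and `random`
# blocks. Duplicate parameter definitions of this kind are what
# generate_configs() is expected to reject with a ConfigError. The actual
# fixture contents (CONFIG_WITH_DUPLICATE_PARAMETERS_*) live in the test
# resources and may differ from this sketch.
#
#     duplicate_parameter_config = {
#         'grid': {
#             'learning_rate': {'type': 'choice', 'options': [0.01, 0.1]},
#         },
#         'random': {
#             'samples': 3,
#             'learning_rate': {'type': 'uniform', 'min': 1e-3, 'max': 1e-1},
#         },
#     }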
def add_experiments(db_collection_name, config_file, force_duplicates,
                    no_hash=False, no_sanity_check=False,
                    no_code_checkpoint=False):
    """
    Add configurations from a config file into the database.

    Parameters
    ----------
    db_collection_name: the MongoDB collection name.
    config_file: path to the YAML configuration.
    force_duplicates: if True, disable duplicate detection.
    no_hash: if True, disable hashing of the configurations for duplicate detection.
        This is much slower, so use only if you have a good reason to.
    no_sanity_check: if True, do not check the config for missing/unused arguments.
    no_code_checkpoint: if True, do not upload the experiment source code files to the MongoDB.

    Returns
    -------
    None
    """
    seml_config, slurm_config, experiment_config = read_config(config_file)

    # Use current Anaconda environment if not specified
    if 'conda_environment' not in seml_config:
        if 'CONDA_DEFAULT_ENV' in os.environ:
            seml_config['conda_environment'] = os.environ['CONDA_DEFAULT_ENV']
        else:
            seml_config['conda_environment'] = None

    # Set Slurm config with default parameters as fall-back option
    if slurm_config is None:
        slurm_config = {'sbatch_options': {}}
    for k, v in SETTINGS.SLURM_DEFAULT['sbatch_options'].items():
        if k not in slurm_config['sbatch_options']:
            slurm_config['sbatch_options'][k] = v
    del SETTINGS.SLURM_DEFAULT['sbatch_options']
    for k, v in SETTINGS.SLURM_DEFAULT.items():
        if k not in slurm_config:
            slurm_config[k] = v

    slurm_config['sbatch_options'] = remove_prepended_dashes(
        slurm_config['sbatch_options'])
    configs = generate_configs(experiment_config)
    collection = get_collection(db_collection_name)

    batch_id = get_max_in_collection(collection, "batch_id")
    if batch_id is None:
        batch_id = 1
    else:
        batch_id = batch_id + 1

    if seml_config['use_uploaded_sources'] and not no_code_checkpoint:
        uploaded_files = upload_sources(seml_config, collection, batch_id)
    else:
        uploaded_files = None

    if not no_sanity_check:
        check_config(seml_config['executable'],
                     seml_config['conda_environment'], configs)

    path, commit, dirty = get_git_info(seml_config['executable'])
    git_info = None
    if path is not None:
        git_info = {'path': path, 'commit': commit, 'dirty': dirty}

    use_hash = not no_hash
    if use_hash:
        configs = [{**c, **{'config_hash': make_hash(c)}} for c in configs]

    if not force_duplicates:
        len_before = len(configs)

        # First, check for duplicates within the experiment configurations from the file.
        if not use_hash:
            # slow duplicate detection without hashes
            unique_configs = []
            for c in configs:
                if c not in unique_configs:
                    unique_configs.append(c)
            configs = unique_configs
        else:
            # fast duplicate detection using hashing.
            configs_dict = {c['config_hash']: c for c in configs}
            configs = [v for k, v in configs_dict.items()]

        len_after_deduplication = len(configs)
        # Now, check for duplicate configurations in the database.
        configs = filter_experiments(collection, configs)
        len_after = len(configs)
        if len_after_deduplication != len_before:
            logging.info(
                f"{len_before - len_after_deduplication} of {len_before} experiment{s_if(len_before)} were "
                f"duplicates. Adding only the {len_after_deduplication} unique configurations.")
        if len_after != len_after_deduplication:
            logging.info(
                f"{len_after_deduplication - len_after} of {len_after_deduplication} "
                f"experiment{s_if(len_after_deduplication)} were already found in the database. "
                f"They were not added again.")

    # Create an index on the config hash. If the index is already present, this simply does nothing.
    collection.create_index("config_hash")
    # Add the configurations to the database with STAGED status.
    if len(configs) > 0:
        add_configs(collection, seml_config, slurm_config, configs,
                    uploaded_files, git_info)
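# Usage sketch (illustrative; the collection and file names are placeholders):
# adding experiments programmatically instead of via the CLI, which dispatches
# to this function through main() below.
#
#     add_experiments(db_collection_name='my_experiments',
#                     config_file='example_config.yaml',
#                     force_duplicates=False)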
def main():
    parser = argparse.ArgumentParser(
        description="Manage experiments for the given configuration. "
                    "Each experiment is represented as a record in the database. "
                    "See examples/README.md for more details.",
        formatter_class=argparse.RawTextHelpFormatter)
    parser.add_argument(
        'db_collection_name', type=str, nargs='?', default=None,
        help="Name of the database collection for the experiment.")
    parser.add_argument('--verbose', '-v', action='store_true',
                        help='Display more log messages.')

    subparsers = parser.add_subparsers(title="Possible operations")

    parser_jupyter = subparsers.add_parser(
        "jupyter", help="Start a Jupyter slurm job.")
    parser_jupyter.add_argument(
        "-l", "--lab", action='store_true',
        help="Start a jupyter-lab instance instead of jupyter notebook.")
    parser_jupyter.add_argument(
        "-c", "--conda_env", type=str, default=None,
        help="Start the Jupyter instance in a Conda environment.")
    parser_jupyter.add_argument(
        '-sb', '--sbatch_options', type=json.loads,
        help="Dictionary (passed as a string, e.g. '{\"gres\": \"gpu:2\"}') to request two GPUs.")
    parser_jupyter.set_defaults(func=start_jupyter_job)

    parser_configure = subparsers.add_parser(
        "configure", help="Provide your MongoDB credentials.")
    parser_configure.set_defaults(func=mongodb_credentials_prompt)

    parser_queue = subparsers.add_parser(
        "queue", help="Queue the experiments as defined in the configuration.")
    parser_queue.add_argument(
        'config_file', type=str, nargs='?', default=None,
        help="Path to the YAML configuration file for the experiment.")
    parser_queue.add_argument(
        '-nh', '--no-hash', action='store_true',
        help="Do not use the hash of the config dictionary to filter out duplicates (by comparing all "
             "dictionary values individually). This is much slower, so use only if you have a good reason not to "
             "use the hash.")
    parser_queue.add_argument(
        '-nc', '--no-config-check', action='store_true',
        help="Do not check the config for missing/unused arguments. "
             "Use this if the check fails unexpectedly when using "
             "advanced Sacred features or to accelerate queueing.")
    parser_queue.add_argument(
        '-ncc', '--no-code-checkpoint', action='store_true',
        help="Do not upload the source code files to the MongoDB. "
             "When a queued experiment is started, it will use whatever is the current version of the code "
             "files (which might have been updated in the meantime or could fail when started).")
    parser_queue.add_argument(
        '-f', '--force-duplicates', action='store_true',
        help="Add experiments to the database even when experiments with identical configurations "
             "are already in the database.")
    parser_queue.set_defaults(func=queue_experiments)

    parser_start = subparsers.add_parser(
        "start",
        help="Fetch queued experiments from the database and run them (by default via Slurm).")
    parser_start.add_argument(
        '-l', '--local', action='store_true',
        help="Run the experiments locally (not via Slurm).")
    parser_start.add_argument(
        '-n', '--num-exps', type=int, default=-1,
        help="Only start the specified number of experiments.")
    parser_start.add_argument(
        '-u', '--unobserved', action='store_true',
        help="Run the experiments without Sacred observers (no changes to the database). "
             "This also disables output capturing by Sacred, facilitating the use of debuggers (pdb, ipdb).")
    parser_start.add_argument(
        '-pm', '--post-mortem', action='store_true',
        help="Activate post-mortem debugging with pdb.")
    parser_start.add_argument(
        '-d', '--debug', action='store_true',
        help="Run a single experiment locally without Sacred observers and with post-mortem debugging. "
             "This is equivalent to "
             "`--verbose --local --num-exps 1 --unobserved --post-mortem --output-to-console`.")
    parser_start.add_argument(
        '-dr', '--dry-run', action='store_true',
        help="Only show the associated commands instead of running the experiments.")
    parser_start.add_argument(
        '-id', '--sacred-id', type=int,
        help="Sacred ID (_id in the database collection) of the experiment to start.")
    parser_start.add_argument(
        '-b', '--batch-id', type=int,
        help="Batch ID (batch_id in the database collection) of the experiments to be started. "
             "Experiments that were queued together have the same batch_id.")
    parser_start.add_argument(
        '-f', '--filter-dict', type=json.loads,
        help="Dictionary (passed as a string, e.g. '{\"config.dataset\": \"cora_ml\"}') to filter the experiments by.")
    parser_start.add_argument(
        '-o', '--output-to-console', action='store_true',
        help="Print output to console instead of writing it to a log file. Only possible if experiment is run locally.")
    parser_start.set_defaults(func=start_experiments)

    parser_status = subparsers.add_parser(
        "status", help="Report status of experiments in the database collection.")
    parser_status.set_defaults(func=report_status)

    parser_cancel = subparsers.add_parser(
        "cancel",
        help="Cancel the Slurm job/job step corresponding to experiments, filtered by ID or state.")
    parser_cancel.add_argument(
        '-id', '--sacred-id', type=int,
        help="Sacred ID (_id in the database collection) of the experiment to cancel.")
    parser_cancel.add_argument(
        '-s', '--filter-states', type=str, nargs='*',
        default=['PENDING', 'RUNNING'],
        help="List of states to filter experiments by. Cancels all experiments if an empty list is passed. "
             "Default: Cancel all pending and running experiments.")
    parser_cancel.add_argument(
        '-b', '--batch-id', type=int,
        help="Batch ID (batch_id in the database collection) of the experiments to be cancelled. "
             "Experiments that were queued together have the same batch_id.")
    parser_cancel.add_argument(
        '-f', '--filter-dict', type=json.loads,
        help="Dictionary (passed as a string, e.g. '{\"config.dataset\": \"cora_ml\"}') "
             "to filter the experiments by.")
    parser_cancel.set_defaults(func=cancel_experiments)

    parser_delete = subparsers.add_parser(
        "delete",
        help="Delete experiments by ID or state (does not cancel Slurm jobs).")
    parser_delete.add_argument(
        '-id', '--sacred-id', type=int,
        help="Sacred ID (_id in the database collection) of the experiment to delete.")
    parser_delete.add_argument(
        '-s', '--filter-states', type=str, nargs='*',
        default=['QUEUED', 'FAILED', 'KILLED', 'INTERRUPTED'],
        help="List of states to filter experiments by. Deletes all experiments if an empty list is passed. "
             "Default: Delete all queued, failed, killed and interrupted experiments.")
    parser_delete.add_argument(
        '-b', '--batch-id', type=int,
        help="Batch ID (batch_id in the database collection) of the experiments to be deleted. "
             "Experiments that were queued together have the same batch_id.")
    parser_delete.add_argument(
        '-f', '--filter-dict', type=json.loads,
        help="Dictionary (passed as a string, e.g. '{\"config.dataset\": \"cora_ml\"}') "
             "to filter the experiments by.")
    parser_delete.set_defaults(func=delete_experiments)

    parser_reset = subparsers.add_parser(
        "reset",
        help="Reset the state of experiments (set to QUEUED and clean database entry) "
             "by ID or state (does not cancel Slurm jobs).")
    parser_reset.add_argument(
        '-id', '--sacred-id', type=int,
        help="Sacred ID (_id in the database collection) of the experiment to reset.")
    parser_reset.add_argument(
        '-s', '--filter-states', type=str, nargs='*',
        default=['FAILED', 'KILLED', 'INTERRUPTED'],
        help="List of states to filter experiments by. "
             "Resets all experiments if an empty list is passed. "
             "Default: Reset failed, killed and interrupted experiments.")
    parser_reset.add_argument(
        '-f', '--filter-dict', type=json.loads,
        help="Dictionary (passed as a string, e.g. '{\"config.dataset\": \"cora_ml\"}') "
             "to filter the experiments by.")
    parser_reset.add_argument(
        '-b', '--batch-id', type=int,
        help="Batch ID (batch_id in the database collection) of the experiments to be reset. "
             "Experiments that were queued together have the same batch_id.")
    parser_reset.set_defaults(func=reset_experiments)

    parser_detect = subparsers.add_parser(
        "detect-killed",
        help="Detect experiments where the corresponding Slurm jobs were killed externally.")
    parser_detect.set_defaults(func=detect_killed)

    parser_clean_db = subparsers.add_parser(
        "clean-db",
        help="Remove orphaned artifacts in the DB from runs which have been deleted.")
    parser_clean_db.add_argument(
        '-a', '--all-collections', action='store_true',
        help="Scan all collections for orphaned artifacts (not just the one provided in the config).")
    parser_clean_db.set_defaults(func=clean_unreferenced_artifacts)

    args = parser.parse_args()

    # Initialize logging
    if args.verbose:
        logging_level = logging.VERBOSE
    else:
        logging_level = logging.INFO
    hdlr = logging.StreamHandler(sys.stderr)
    hdlr.setFormatter(LoggingFormatter())
    logging.root.addHandler(hdlr)
    logging.root.setLevel(logging_level)

    if args.func == mongodb_credentials_prompt:  # launch SEML configure.
        del args.db_collection_name
    elif args.func == start_jupyter_job:
        del args.db_collection_name
    else:  # otherwise remove the flag as it is not used elsewhere.
        if not args.db_collection_name:
            parser.error("the following arguments are required: db_collection_name")
        else:
            if os.path.isfile(args.db_collection_name):
                logging.warning(
                    "Loading the collection name from a config file. This has been deprecated. "
                    "Please instead provide a database collection name in the command line.")
                seml_config, _, _ = read_config(args.db_collection_name)
                if args.func == queue_experiments:
                    args.config_file = args.db_collection_name
                args.db_collection_name = seml_config['db_collection']
            elif args.func == queue_experiments and not args.config_file:
                parser_queue.error("the following arguments are required: config_file")

    f = args.func
    del args.func
    del args.verbose
    if 'filter_states' in args:
        args.filter_states = [state.upper() for state in args.filter_states]
    f(**args.__dict__)
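# Illustrative CLI usage of the interface defined above (collection and file
# names are placeholders). The positional collection name precedes the
# subcommand:
#
#     seml my_collection queue example_config.yaml
#     seml my_collection start
#     seml my_collection status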
def build_configs_and_run(
        config_files: Sequence[str],
        executable: Optional[str] = None,
        kwargs: Dict[str, Any] = {}) -> Tuple[List[Dict[str, Any]], Callable]:
    """Return all (deduplicated) configs provided in `config_files` together with the experiment's `run` callable.

    You can pass the config via the `config_updates` argument (see the example below).

    Parameters
    ----------
    config_files : Sequence[str]
        Config (`.yaml`) files of the same experiment (all must refer to the same, optionally provided, executable).
    executable : str, optional
        Optionally the name of the executable, by default None.
    kwargs : Dict[str, Any], optional
        Overwrite/add certain configs (please make sure they are valid!), by default {}.

    Returns
    -------
    Tuple[List[Dict[str, Any]], Callable]
        Configs and the callable of type `sacred.Experiment#run` (pass the config via the `config_updates` argument).

    Raises
    ------
    ValueError
        If the configs contain multiple executables or the executable has no `sacred.Experiment` attribute.

    Examples
    --------
    >>> configs, run = build_configs_and_run(['a.yaml', 'b.yaml'])
    >>> results = []
    >>> for config in configs:
    ...     results.append(run(config_updates=config).result)
    """
    configs = []
    for config_file in config_files:
        seml_config, _, experiment_config = read_config(config_file)
        if executable is None:
            executable = seml_config['executable']
        elif executable != seml_config['executable']:
            raise ValueError(
                f'All configs must be for the same executable! '
                f'Found {executable} and {seml_config["executable"]}.')
        configs.extend(generate_configs(experiment_config))

    # Overwrite/add configs
    for key, value in kwargs.items():
        for config in configs:
            config[key] = value

    # Deduplicate configs via their JSON representation, keeping one representative per duplicate.
    deduplicate_index = {
        json.dumps(config, sort_keys=True): i
        for i, config in enumerate(configs)
    }
    configs = [configs[i] for i in deduplicate_index.values()]

    module = importlib.import_module(
        os.path.splitext(os.path.basename(executable))[0])

    run = None
    for attr in dir(module):
        if isinstance(getattr(module, attr), Experiment):
            run = getattr(module, attr).run

    if run is None:
        raise ValueError(
            f'Executable {executable} has no attribute of type `sacred.Experiment`!')

    return configs, run
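# Usage sketch (illustrative; the file name and config key are placeholders):
# overriding a value in every generated config via `kwargs` before running,
# e.g. to force a common random seed across all runs.
#
#     configs, run = build_configs_and_run(['experiment.yaml'], kwargs={'seed': 42})
#     results = [run(config_updates=config).result for config in configs]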
def main():
    parser = argparse.ArgumentParser(
        description="Manage experiments for the given configuration. "
                    "Each experiment is represented as a record in the database. "
                    "See examples/README.md for more details.",
        formatter_class=argparse.RawTextHelpFormatter, add_help=True)
    parser.add_argument(
        'db_collection_name', type=str, nargs='?', default=None,
        help="Name of the database collection for the experiment.")
    parser.add_argument('--verbose', '-v', action='store_true',
                        help='Display more log messages.')

    subparsers = parser.add_subparsers(title="Possible operations")

    parser_jupyter = subparsers.add_parser(
        "jupyter", help="Start a Jupyter slurm job.")
    parser_jupyter.add_argument(
        "-l", "--lab", action='store_true',
        help="Start a jupyter-lab instance instead of jupyter notebook.")
    parser_jupyter.add_argument(
        "-c", "--conda-env", type=str, default=None,
        help="Start the Jupyter instance in a Conda environment.")
    parser_jupyter.add_argument(
        '-sb', '--sbatch-options', type=json.loads,
        help="Dictionary (passed as a string, e.g. '{\"gres\": \"gpu:2\"}') to request two GPUs.")
    parser_jupyter.set_defaults(func=start_jupyter_job)

    parser_configure = subparsers.add_parser(
        "configure", help="Provide your MongoDB credentials.")
    parser_configure.set_defaults(func=mongodb_credentials_prompt)

    parser_add = subparsers.add_parser(
        "add", aliases=["queue"],
        help="Add the experiments to the database as defined in the configuration.")
    parser_add.add_argument(
        'config_file', type=str, nargs='?', default=None,
        help="Path to the YAML configuration file for the experiment.")
    parser_add.add_argument(
        '-nh', '--no-hash', action='store_true',
        help="Do not use the hash of the config dictionary to filter out duplicates (by comparing all "
             "dictionary values individually). This is much slower, so use only if you have a good reason not to "
             "use the hash.")
    parser_add.add_argument(
        '-nsc', '--no-sanity-check', action='store_true',
        help="Do not check the config for missing/unused arguments. "
             "Use this if the check fails unexpectedly when using "
             "advanced Sacred features or to accelerate adding.")
    parser_add.add_argument(
        '-ncc', '--no-code-checkpoint', action='store_true',
        help="Do not save the source code files in the MongoDB. "
             "When a staged experiment is started, it will instead use the current version of the code "
             "files (which might have been updated in the meantime or could fail when started).")
    parser_add.add_argument(
        '-f', '--force-duplicates', action='store_true',
        help="Add experiments to the database even when experiments with identical configurations "
             "are already in the database.")
    parser_add.set_defaults(func=add_experiments)

    parser_start = subparsers.add_parser(
        "start",
        help="Fetch staged experiments from the database and run them (by default via Slurm).")
    parser_start.add_argument(
        '-pc', '--print-command', action='store_true',
        help="Only show the associated commands instead of running the experiments.")
    parser_start.add_argument(
        '-d', '--debug', action='store_true',
        help="Run a single interactive experiment without Sacred observers and with post-mortem debugging. "
             "Implies `--verbose --num-exps 1 --post-mortem --output-to-console`.")
    parser_start.add_argument(
        '-ds', '--debug-server', action='store_true',
        help="Run the experiment with a debug server, to which you can remotely connect with e.g. VS Code. "
             "Implies `--debug`.")

    parser_start_local = parser_start.add_argument_group(
        "optional arguments for local jobs")
    parser_start_local.add_argument(
        '-l', '--local', action='store_true',
        help="Run the experiments locally (not via Slurm).")
    parser_start_local.add_argument(
        '-nw', '--no-worker', action='store_true',
        help="Do not launch a local worker after setting experiments' state to PENDING.")
    parser_start.set_defaults(func=start_experiments, set_to_pending=True)

    parser_launch_worker = subparsers.add_parser(
        "launch-worker", help="Launch a local worker that runs PENDING jobs.")
    parser_launch_worker.set_defaults(
        func=start_experiments, set_to_pending=False, no_worker=False,
        local=True, debug=False, debug_server=False, print_command=False)

    for subparser in [parser_start, parser_launch_worker]:
        subparser.add_argument(
            '-n', '--num-exps', type=int, default=0,
            help="Only start the specified number of experiments. 0: run all staged experiments.")
        subparser.add_argument(
            '-nf', '--no-file-output', action='store_true',
            help="Do not save the console output in a file.")

    for subparser in [parser_start_local, parser_launch_worker]:
        subparser.add_argument(
            '-ss', '--steal-slurm', action='store_true',
            help="Local jobs 'steal' from the Slurm queue, i.e. also execute experiments waiting for execution via "
                 "Slurm. Has no effect if --local is not active.")
        subparser.add_argument(
            '-wg', '--worker-gpus', type=str,
            help="The IDs of the GPUs used by the local worker. Will be directly passed to CUDA_VISIBLE_DEVICES. "
                 "Has no effect for Slurm jobs.")
        subparser.add_argument(
            '-wc', '--worker-cpus', type=int,
            help="The number of CPU cores used by the local worker. Will be directly passed to OMP_NUM_THREADS. "
                 "Has no effect for Slurm jobs.")
        subparser.add_argument(
            '-we', '--worker-environment-vars', type=json.loads,
            help="Further environment variables to be set for the local worker. Has no effect for Slurm jobs.")
        subparser.add_argument(
            '-pm', '--post-mortem', action='store_true',
            help="Activate post-mortem debugging with pdb.")
        subparser.add_argument(
            '-o', '--output-to-console', action='store_true',
            help="Print output to console.")

    parser_status = subparsers.add_parser(
        "status", help="Report status of experiments in the database collection.")
    parser_status.set_defaults(func=report_status)

    parser_cancel = subparsers.add_parser(
        "cancel",
        help="Cancel the Slurm job/job step corresponding to experiments, filtered by ID or state.")
    parser_cancel.add_argument(
        '-s', '--filter-states', type=str, nargs='*',
        default=[*States.PENDING, *States.RUNNING],
        help="List of states to filter experiments by. Cancels all experiments if an empty list is passed. "
             "Default: Cancel all pending and running experiments.")
    parser_cancel.set_defaults(func=cancel_experiments)

    parser_delete = subparsers.add_parser(
        "delete",
        help="Delete experiments by ID or state (does not cancel Slurm jobs).")
    parser_delete.add_argument(
        '-s', '--filter-states', type=str, nargs='*',
        default=[*States.STAGED, *States.FAILED, *States.KILLED, *States.INTERRUPTED],
        help="List of states to filter experiments by. Deletes all experiments if an empty list is passed. "
             "Default: Delete all staged, failed, killed and interrupted experiments.")
    parser_delete.set_defaults(func=delete_experiments)

    parser_reset = subparsers.add_parser(
        "reset",
        help="Reset the state of experiments by setting their state to staged and cleaning their database entry. "
             "Does not cancel Slurm jobs.")
    parser_reset.add_argument(
        '-s', '--filter-states', type=str, nargs='*',
        default=[*States.FAILED, *States.KILLED, *States.INTERRUPTED],
        help="List of states to filter experiments by. "
             "Resets all experiments if an empty list is passed. "
             "Default: Reset failed, killed and interrupted experiments.")
    parser_reset.set_defaults(func=reset_experiments)

    parser_detect = subparsers.add_parser(
        "detect-killed",
        help="Detect experiments where the corresponding Slurm jobs were killed externally.")
    parser_detect.set_defaults(func=detect_killed)

    parser_clean_db = subparsers.add_parser(
        "clean-db",
        help="Remove orphaned artifacts in the DB from runs which have been deleted.")
    parser_clean_db.add_argument(
        '-a', '--all-collections', action='store_true',
        help="Scan all collections for orphaned artifacts (not just the one provided in the config).")
    parser_clean_db.set_defaults(func=clean_unreferenced_artifacts)

    for subparser in [parser_start, parser_launch_worker, parser_cancel,
                      parser_delete, parser_reset]:
        subparser.add_argument(
            '-id', '--sacred-id', type=int,
            help="Sacred ID (_id in the database collection) of the experiment to operate on.")
        subparser.add_argument(
            '-b', '--batch-id', type=int,
            help="Batch ID (batch_id in the database collection) of the experiments to operate on. "
                 "Experiments that were staged together have the same batch_id.")
        subparser.add_argument(
            '-f', '--filter-dict', type=json.loads,
            help="Dictionary (passed as a string, e.g. '{\"config.dataset\": \"cora_ml\"}') to filter "
                 "the experiments by.")

    args = parser.parse_args()

    # Initialize logging
    if args.verbose:
        logging_level = logging.VERBOSE
    else:
        logging_level = logging.INFO
    hdlr = logging.StreamHandler(sys.stderr)
    hdlr.setFormatter(LoggingFormatter())
    logging.root.addHandler(hdlr)
    logging.root.setLevel(logging_level)

    if args.func == mongodb_credentials_prompt:  # launch SEML configure.
        del args.db_collection_name
    elif args.func == start_jupyter_job:
        del args.db_collection_name
    else:  # otherwise remove the flag as it is not used elsewhere.
        if not args.db_collection_name:
            parser.error("the following arguments are required: db_collection_name")
        else:
            if os.path.isfile(args.db_collection_name):
                logging.warning(
                    "Loading the collection name from a config file. This has been deprecated. "
                    "Please instead provide a database collection name in the command line.")
                seml_config, _, _ = read_config(args.db_collection_name)
                if args.func == add_experiments:
                    args.config_file = args.db_collection_name
                args.db_collection_name = seml_config['db_collection']
            elif args.func == add_experiments and not args.config_file:
                parser_add.error("the following arguments are required: config_file")

    f = args.func
    del args.func
    del args.verbose
    if 'filter_states' in args:
        args.filter_states = [state.upper() for state in args.filter_states]
    f(**args.__dict__)