def test_fails_not_dict(self):
    d1 = {'a': 3}
    d2 = ['not_dict']
    with self.assertRaises(ValueError):
        merged = utils.merge_dicts(d1, d2)
    with self.assertRaises(ValueError):
        merged = utils.merge_dicts(d2, d1)
def test_empty(self):
    d1 = {'a': 3}
    d2 = {}
    merged1 = utils.merge_dicts(d1, d2)
    merged2 = utils.merge_dicts(d2, d1)
    expected = {'a': 3}
    self.assertEqual(merged1, expected)
    self.assertEqual(merged2, expected)
def test_nested_non_dict_override(self):
    d1 = {'a': 3, 'b': {'c': {'d': 4}, 'e': 11}}
    d2 = {'b': {'c': ['not_dict']}}
    merged1 = utils.merge_dicts(d1, d2)
    expected1 = {'a': 3, 'b': {'c': ['not_dict'], 'e': 11}}
    merged2 = utils.merge_dicts(d2, d1)
    expected2 = {'a': 3, 'b': {'c': {'d': 4}, 'e': 11}}
    self.assertEqual(merged1, expected1)
    self.assertEqual(merged2, expected2)
        'project_root_dir'
    ],
    "VALID_SLURM_CONFIG_VALUES": [
        'experiments_per_job',
        'max_simultaneous_jobs',
        'sbatch_options_template',
        'sbatch_options'
    ],
    "LOGIN_NODE_NAMES": ["fs"],
    "OBSERVERS": {
        "NEPTUNE": {
            "AUTH_TOKEN": "YOUR_AUTH_TOKEN",
        },
        "SLACK": {
            "WEBHOOK": "YOUR_WEBHOOK",
        },
        "MATTERMOST": {
            "WEBHOOK": "YOUR_WEBHOOK",
            "DEFAULT_CHANNEL": "YOUR_DEFAULT_CHANNEL",
        }
    },
},
)

# Load user settings
if SETTINGS.USER_SETTINGS_PATH.exists():
    user_settings_source = imp.load_source('SETTINGS', str(SETTINGS.USER_SETTINGS_PATH))
    SETTINGS = munchify(merge_dicts(SETTINGS, user_settings_source.SETTINGS))

SETTINGS.SLURM_STATES.ACTIVE = (SETTINGS.SLURM_STATES.PENDING + SETTINGS.SLURM_STATES.RUNNING
                                + SETTINGS.SLURM_STATES.PAUSED)
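# A minimal sketch of what a user settings file at SETTINGS.USER_SETTINGS_PATH could look like.
# This file is not part of the excerpt above; the node names and webhook URL are placeholders.
# The module only needs to define a `SETTINGS` dict, which is merged into the defaults via
# merge_dicts, so partial overrides are sufficient.
SETTINGS = {
    "LOGIN_NODE_NAMES": ["login1", "login2"],  # assumed login node names, adapt to your cluster
    "OBSERVERS": {
        "SLACK": {
            "WEBHOOK": "https://hooks.slack.com/services/...",  # placeholder webhook URL
        },
    },
}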
def generate_configs(experiment_config):
    """Generate parameter configurations based on an input configuration.

    Input is a nested configuration where on each level there can be 'fixed', 'grid', and 'random' parameters.

    In essence, we take the Cartesian product of all the `grid` parameters and take random samples for the random
    parameters. The nested structure makes it possible to define different parameter spaces, e.g. for different
    datasets. Parameter definitions lower in the hierarchy overwrite parameters defined closer to the root.

    For each leaf configuration we take the maximum of all num_samples values on the path since we need to have the
    same number of samples for each random parameter.

    For each configuration of the `grid` parameters we then create `num_samples` configurations of the random
    parameters, i.e. leading to `num_samples * len(grid_configurations)` configurations.

    See Also `examples/example_config.yaml` and the example below.

    Parameters
    ----------
    experiment_config: dict
        Dictionary that specifies the "search space" of parameters that will be enumerated. Should be parsed from a
        YAML file.

    Returns
    -------
    all_configs: list of dicts
        Contains the individual combinations of the parameters.
    """

    reserved, next_level = unpack_config(experiment_config)
    reserved = standardize_config(reserved)
    level_stack = [('', next_level)]
    config_levels = [reserved]
    final_configs = []

    detect_duplicate_parameters(invert_config(reserved), None)

    while len(level_stack) > 0:
        current_sub_name, sub_vals = level_stack.pop(0)

        sub_config, sub_levels = unpack_config(sub_vals)
        sub_config = standardize_config(sub_config)
        config_above = config_levels.pop(0)

        inverted_sub_config = invert_config(sub_config)
        detect_duplicate_parameters(inverted_sub_config, current_sub_name)

        inverted_config_above = invert_config(config_above)
        redefined_parameters = set(inverted_sub_config.keys()).intersection(set(inverted_config_above.keys()))

        if len(redefined_parameters) > 0:
            logging.warning(f"Found redefined parameters in {current_sub_name}: {redefined_parameters}. "
                            f"Redefinitions of parameters override earlier ones.")
            config_above = copy.deepcopy(config_above)
            for p in redefined_parameters:
                sections = inverted_config_above[p]
                for s in sections:
                    del config_above[s][p]

        config = merge_dicts(config_above, sub_config)

        if len(sub_levels) == 0:
            final_configs.append((current_sub_name, config))

        for sub_name, sub_vals in sub_levels.items():
            new_sub_name = f'{current_sub_name}.{sub_name}' if current_sub_name != '' else sub_name
            level_stack.append((new_sub_name, sub_vals))
            config_levels.append(config)

    all_configs = []
    for subconfig_name, conf in final_configs:
        conf = standardize_config(conf)
        random_params = conf['random'] if 'random' in conf else {}
        fixed_params = flatten(conf['fixed']) if 'fixed' in conf else {}
        grid_params = conf['grid'] if 'grid' in conf else {}

        if len(random_params) > 0:
            num_samples = random_params['samples']
            root_seed = random_params.get('seed', None)
            random_sampled = sample_random_configs(flatten(random_params), seed=root_seed, samples=num_samples)

        grids = [generate_grid(v, parent_key=k) for k, v in grid_params.items()]
        grid_configs = dict([sub for item in grids for sub in item])
        grid_product = list(cartesian_product_dict(grid_configs))
        with_fixed = [{**d, **fixed_params} for d in grid_product]
        if len(random_params) > 0:
            with_random = [{**grid, **random} for grid in with_fixed for random in random_sampled]
        else:
            with_random = with_fixed
        all_configs.extend(with_random)

    # Cast NumPy integers to normal integers since PyMongo doesn't like them
    all_configs = [{k: int(v) if isinstance(v, np.integer) else v for k, v in config.items()}
                   for config in all_configs]

    all_configs = [unflatten(conf) for conf in all_configs]
    return all_configs
def test_basic(self):
    d1 = {'a': 3, 'b': 5}
    d2 = {'b': 99, 'c': 7}
    merged = utils.merge_dicts(d1, d2)
    expected = {'a': 3, 'b': 99, 'c': 7}
    self.assertEqual(merged, expected)
def test_nested(self):
    d1 = {'a': 3, 'b': {'c': 10, 'd': 9}}
    d2 = {'e': 7, 'b': {'c': 99, 'f': 11}}
    merged = utils.merge_dicts(d1, d2)
    expected = {'a': 3, 'b': {'c': 99, 'd': 9, 'f': 11}, 'e': 7}
    self.assertEqual(merged, expected)
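# A minimal sketch of the merge semantics the tests above assume. `utils.merge_dicts` itself is
# not shown in this excerpt; this hypothetical reference implementation only illustrates the
# behaviour being tested (second dict wins, nested dicts merged recursively, non-dicts rejected).
def _merge_dicts_sketch(dict1, dict2):
    if not isinstance(dict1, dict) or not isinstance(dict2, dict):
        raise ValueError("Both inputs must be dicts.")
    merged = dict(dict1)
    for key, value in dict2.items():
        if key in merged and isinstance(merged[key], dict) and isinstance(value, dict):
            merged[key] = _merge_dicts_sketch(merged[key], value)  # merge nested dicts recursively
        else:
            merged[key] = value  # values from dict2 override dict1, including non-dict overrides
    return merged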
def add_experiments(db_collection_name, config_file, force_duplicates,
                    no_hash=False, no_sanity_check=False, no_code_checkpoint=False):
    """
    Add configurations from a config file into the database.

    Parameters
    ----------
    db_collection_name: the MongoDB collection name.
    config_file: path to the YAML configuration.
    force_duplicates: if True, disable duplicate detection.
    no_hash: if True, disable hashing of the configurations for duplicate detection. This is much slower, so use
        only if you have a good reason to.
    no_sanity_check: if True, do not check the config for missing/unused arguments.
    no_code_checkpoint: if True, do not upload the experiment source code files to the MongoDB.

    Returns
    -------
    None
    """
    seml_config, slurm_config, experiment_config = read_config(config_file)

    # Use current Anaconda environment if not specified
    if 'conda_environment' not in seml_config:
        if 'CONDA_DEFAULT_ENV' in os.environ:
            seml_config['conda_environment'] = os.environ['CONDA_DEFAULT_ENV']
        else:
            seml_config['conda_environment'] = None

    # Set Slurm config with default parameters as fall-back option
    slurm_config = merge_dicts(SETTINGS.SLURM_DEFAULT, slurm_config)

    # Check for and use sbatch options template
    sbatch_options_template = slurm_config.get('sbatch_options_template', None)
    if sbatch_options_template is not None:
        if sbatch_options_template not in SETTINGS.SBATCH_OPTIONS_TEMPLATES:
            raise ConfigError(f"sbatch options template '{sbatch_options_template}' not found in settings.py.")
        for k, v in SETTINGS.SBATCH_OPTIONS_TEMPLATES[sbatch_options_template].items():
            if k not in slurm_config['sbatch_options']:
                slurm_config['sbatch_options'][k] = v

    slurm_config['sbatch_options'] = remove_prepended_dashes(slurm_config['sbatch_options'])
    configs = generate_configs(experiment_config)
    collection = get_collection(db_collection_name)

    batch_id = get_max_in_collection(collection, "batch_id")
    if batch_id is None:
        batch_id = 1
    else:
        batch_id = batch_id + 1

    if seml_config['use_uploaded_sources'] and not no_code_checkpoint:
        uploaded_files = upload_sources(seml_config, collection, batch_id)
    else:
        uploaded_files = None
    del seml_config['use_uploaded_sources']

    if not no_sanity_check:
        check_config(seml_config['executable'], seml_config['conda_environment'], configs)

    path, commit, dirty = get_git_info(seml_config['executable'])
    git_info = None
    if path is not None:
        git_info = {'path': path, 'commit': commit, 'dirty': dirty}

    use_hash = not no_hash
    if use_hash:
        configs = [{**c, **{'config_hash': make_hash(c)}} for c in configs]

    if not force_duplicates:
        len_before = len(configs)

        # First, check for duplicates within the experiment configurations from the file.
        if not use_hash:
            # Slow duplicate detection without hashes
            unique_configs = []
            for c in configs:
                if c not in unique_configs:
                    unique_configs.append(c)
            configs = unique_configs
        else:
            # Fast duplicate detection using hashing.
            configs_dict = {c['config_hash']: c for c in configs}
            configs = [v for k, v in configs_dict.items()]
        len_after_deduplication = len(configs)

        # Now, check for duplicate configurations in the database.
        configs = filter_experiments(collection, configs)
        len_after = len(configs)
        if len_after_deduplication != len_before:
            logging.info(f"{len_before - len_after_deduplication} of {len_before} experiment{s_if(len_before)} were "
                         f"duplicates. Adding only the {len_after_deduplication} unique configurations.")
        if len_after != len_after_deduplication:
            logging.info(f"{len_after_deduplication - len_after} of {len_after_deduplication} "
                         f"experiment{s_if(len_after_deduplication)} were already found in the database. "
                         f"They were not added again.")

    # Create an index on the config hash. If the index is already present, this simply does nothing.
    collection.create_index("config_hash")

    # Add the configurations to the database with STAGED status.
    if len(configs) > 0:
        add_configs(collection, seml_config, slurm_config, configs, uploaded_files, git_info)
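# Hedged usage sketch: staging experiments from a YAML config file into a MongoDB collection.
# The collection name and file path below are placeholders; in practice this function is typically
# reached through the command line (e.g. `seml my_experiments add examples/example_config.yaml`)
# rather than being called directly.
# add_experiments(db_collection_name='my_experiments',          # hypothetical collection name
#                 config_file='examples/example_config.yaml',   # placeholder config path
#                 force_duplicates=False)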