def run(_run, chime6, test_run=False):
    """Run the enhancement experiment, distributed over MPI workers.

    Args:
        _run: Sacred run object; its config is printed on the master.
        chime6: If True use the CHiME-6 enhancer, otherwise the default one.
        test_run: If truthy, forwarded as ``dataset_slice`` to only process
            a part of the sessions and print extra debug output.
    """
    # Only the master resolves the experiment directory; it is broadcast so
    # every worker writes into the same location.
    if dlp_mpi.IS_MASTER:
        print_config(_run)
        _dir = get_dir()
        print('Experiment dir:', _dir)
    else:
        _dir = None
    _dir = dlp_mpi.bcast(_dir, dlp_mpi.MASTER)

    enhancer = get_enhancer_chime6() if chime6 else get_enhancer()

    if test_run:
        print('Database', enhancer.db)

    session_ids = get_session_ids()

    if dlp_mpi.IS_MASTER:
        print('Enhancer:', enhancer)
        print(session_ids)

    enhancer.enhance_session(
        session_ids,
        _dir / 'audio',
        dataset_slice=test_run,
        audio_dir_exist_ok=True,
    )

    if dlp_mpi.IS_MASTER:
        print('Finished experiment dir:', _dir)
def rirs(
        database_path,
        datasets,
        sample_rate,
        filter_length,
):
    """Generate room impulse responses for all examples of a database.

    The master reads ``scenarios.json`` and creates the dataset folders;
    the scenario description is then broadcast and the examples are
    distributed over the MPI workers. Each worker writes one multi-channel
    wav file per source (``h_0.wav``, ``h_1.wav``, ...).

    Args:
        database_path: Root folder containing ``scenarios.json``.
        datasets: Names of the dataset folders to create.
        sample_rate: Sample rate of the generated RIRs.
        filter_length: Length of the RIR filters in samples.
    """
    database_path = Path(database_path)

    if dlp_mpi.IS_MASTER:
        scenario_json = database_path / "scenarios.json"
        with scenario_json.open() as f:
            database = json.load(f)
        for dataset in datasets:
            dataset_path = database_path / dataset
            dataset_path.mkdir(parents=True, exist_ok=True)
    else:
        database = None
    database = dlp_mpi.bcast(database)

    for dataset_name, dataset in database['datasets'].items():
        print(f'RANK={dlp_mpi.RANK}, SIZE={dlp_mpi.SIZE}:'
              f' Starting {dataset_name}.')

        for _example_id, example in dlp_mpi.split_managed(
                list(sorted(dataset.items())),
                progress_bar=True,
                is_indexable=True,
        ):
            h = generate_rir(
                room_dimensions=example['room_dimensions'],
                source_positions=example['source_position'],
                sensor_positions=example['sensor_position'],
                sound_decay_time=example['sound_decay_time'],
                sample_rate=sample_rate,
                filter_length=filter_length,
                sensor_orientations=None,
                sensor_directivity=None,
                sound_velocity=343,
            )
            assert not np.any(np.isnan(h)), (
                f"{np.sum(np.isnan(h))} values of {h.size} are NaN."
            )

            K, D, T = h.shape
            directory = database_path / dataset_name / _example_id
            directory.mkdir(parents=False, exist_ok=False)

            for k in range(K):
                # Although storing as np.float64 does not allow every reader
                # to access the files, it does not require normalization and
                # we are unsure how much precision is needed for RIRs.
                with soundfile.SoundFile(
                        str(directory / f"h_{k}.wav"),
                        subtype='DOUBLE',
                        samplerate=sample_rate,
                        mode='w',
                        channels=h.shape[1],
                ) as f:
                    f.write(h[k, :, :].T)

        dlp_mpi.barrier()
        print(f'RANK={dlp_mpi.RANK}, SIZE={dlp_mpi.SIZE}:'
              f' Finished {dataset_name}.')
def from_file(
        cls,
        config_path: Path,
        in_config_path: str = '',
        consider_mpi=False,
):
    """Instantiate the module from given config_file.

    Args:
        config_path: Path to a ``.json`` or ``.yaml`` config file.
        in_config_path: Dotted key path (e.g. ``'trainer.model'``) that
            selects a sub-config inside the loaded config. The empty
            string (default) selects the whole config.
        consider_mpi:
            If True and mpi is used, only read config_path and
            checkpoint_path once and broadcast the content with mpi.
            Reduces the io load.

    Returns:
        The instance created via ``cls.from_config``.

    Raises:
        ValueError: If the file suffix is neither ``.json`` nor ``.yaml``.
    """
    config_path = Path(config_path).expanduser().resolve()
    # Single assert with a readable message (original had a duplicate).
    assert config_path.is_file(), f'Expected {config_path} is file.'

    def load_config(config_path):
        # Select the parser based on the file extension.
        if config_path.suffix == '.json':
            import json
            with config_path.open() as fp:
                configurable_config = json.load(fp)
        elif config_path.suffix == '.yaml':
            import yaml
            with config_path.open() as fp:
                configurable_config = yaml.safe_load(fp)
        else:
            raise ValueError(config_path)
        return configurable_config

    if consider_mpi:
        import dlp_mpi
        if dlp_mpi.IS_MASTER:
            configurable_config = load_config(config_path=config_path)
        else:
            configurable_config = None
        configurable_config = dlp_mpi.bcast(configurable_config)
    else:
        configurable_config = load_config(config_path=config_path)

    # Bugfix: the original tested `config_path != ''`, which is always true
    # for a Path object, so the default `in_config_path=''` produced
    # `''.split('.') == ['']` and raised KeyError('').
    if in_config_path:
        for part in in_config_path.split('.'):
            configurable_config = configurable_config[part]

    return cls.from_config(configurable_config)
def load_checkpoint(
        self,
        checkpoint_path: (Path, str),
        in_checkpoint_path: str = 'model',
        map_location='cpu',
        consider_mpi=False,
) -> 'Module':
    """Update the module parameters from the given checkpoint.

    Args:
        checkpoint_path: Path to a checkpoint written with ``torch.save``.
        in_checkpoint_path: Dotted key path selecting the state dict
            inside the (deflattened) checkpoint. Empty string uses the
            checkpoint as-is.
        map_location: Forwarded to ``torch.load``.
        consider_mpi:
            If True and mpi is used, only read config_path and
            checkpoint_path once and broadcast the content with mpi.
            Reduces the io load.

    Returns:
        self

    Raises:
        ValueError: When a key of ``in_checkpoint_path`` is missing.
    """
    checkpoint_path = Path(checkpoint_path).expanduser().resolve()
    assert checkpoint_path.is_file(), checkpoint_path

    # Load weights
    if consider_mpi:
        import dlp_mpi
        # Master reads the raw bytes once; workers get them via broadcast,
        # then everyone deserializes from an in-memory buffer.
        if dlp_mpi.IS_MASTER:
            checkpoint_content = Path(checkpoint_path).read_bytes()
        else:
            checkpoint_content = None
        checkpoint_content = dlp_mpi.bcast(checkpoint_content)
        checkpoint = torch.load(
            io.BytesIO(checkpoint_content),
            map_location=map_location,
        )
    else:
        checkpoint = torch.load(checkpoint_path, map_location=map_location)

    if in_checkpoint_path:
        for part in in_checkpoint_path.split('.'):
            try:
                checkpoint = deflatten(checkpoint, maxdepth=1)
                checkpoint = checkpoint[part]
            except KeyError:
                raise ValueError(part, in_checkpoint_path, checkpoint)

    self.load_state_dict(checkpoint)
    return self
def write_wavs(dst_dir: Path, wsj0_root: Path, wsj1_root: Path, sample_rate):
    """Copy WSJ metadata files and convert NIST sphere files to wav.

    The master copies the transcription/index files (pl, ndx, ptx, dot, txt)
    and validates the expected counts; the wav conversion of the ``.wv1``
    files is distributed over the MPI workers.

    Args:
        dst_dir: Target directory; must not exist yet (no overwriting).
        wsj0_root: Root of the WSJ0 corpus.
        wsj1_root: Root of the WSJ1 corpus.
        sample_rate: Target sample rate for the written wav files.
    """
    wsj0_root = Path(wsj0_root).expanduser().resolve()
    wsj1_root = Path(wsj1_root).expanduser().resolve()
    dst_dir = Path(dst_dir).expanduser().resolve()
    assert wsj0_root.exists(), wsj0_root
    assert wsj1_root.exists(), wsj1_root

    assert not dst_dir == wsj0_root, (wsj0_root, dst_dir)
    assert not dst_dir == wsj1_root, (wsj1_root, dst_dir)

    # Expect, that the dst_dir does not exist to make sure to not overwrite.
    if dlp_mpi.IS_MASTER:
        dst_dir.mkdir(parents=True, exist_ok=False)

    if dlp_mpi.IS_MASTER:
        # Search for CD numbers, e.g. "13-34.1"
        # CD stands for compact disk.
        cds_0 = list(wsj0_root.rglob("*-*.*"))
        cds_1 = list(wsj1_root.rglob("*-*.*"))
        cds = set(cds_0 + cds_1)

        expected_number_of_files = {
            'pl': 3, 'ndx': 106, 'ptx': 3547, 'dot': 3585, 'txt': 256
        }
        number_of_written_files = dict()
        for suffix in expected_number_of_files.keys():
            files_0 = list(wsj0_root.rglob(f"*.{suffix}"))
            files_1 = list(wsj1_root.rglob(f"*.{suffix}"))
            files = set(files_0 + files_1)
            # Filter files that do not have a folder that matches "*-*.*".
            files = {
                file
                for file in files
                if any([fnmatch.fnmatch(part, "*-*.*")
                        for part in file.parts])
            }
            # the readme.txt file in the parent directory is not copied
            print(f"About to write ca. {len(files)} {suffix} files.")
            for cd in cds:
                cd_files = list(cd.rglob(f"*.{suffix}"))
                for file in cd_files:
                    target = dst_dir / file.relative_to(cd.parent)
                    target.parent.mkdir(parents=True, exist_ok=True)
                    if not target.is_file():
                        shutil.copy(file, target.parent)
            number_of_written_files[suffix] = len(
                list(dst_dir.rglob(f"*.{suffix}")))
            print(f"Writing {number_of_written_files[suffix]} {suffix} files.")
            print(
                f'Expected {expected_number_of_files[suffix]} {suffix} files.')

        for suffix in expected_number_of_files.keys():
            # Bugfix: the original interpolated the whole dict
            # (`{number_of_written_files}`) instead of the count for this
            # suffix, producing an unreadable warning message.
            message = (f'Expected that '
                       f'{expected_number_of_files[suffix]} '
                       f'files with the {suffix} are written. '
                       f'But only {number_of_written_files[suffix]} '
                       f'are written.')
            if (number_of_written_files[suffix]
                    != expected_number_of_files[suffix]):
                warnings.warn(message)

            if suffix == 'pl' and number_of_written_files[suffix] == 1:
                raise RuntimeError(
                    'Found only one pl file although we expected three. '
                    'A typical reason is having only WSJ0. '
                    'Please make sure you have WSJ0+1 = WSJ COMPLETE.')

    if dlp_mpi.IS_MASTER:
        # Ignore .wv2 files since they are not referenced in our database
        # anyway
        wsj_nist_files = [(cd, nist_file)
                          for cd in cds
                          for nist_file in cd.rglob("*.wv1")]
        print(f"About to write {len(wsj_nist_files)} wav files.")
    else:
        wsj_nist_files = None

    wsj_nist_files = dlp_mpi.bcast(wsj_nist_files)

    for nist_file_tuple in dlp_mpi.split_managed(wsj_nist_files):
        cd, nist_file = nist_file_tuple
        assert isinstance(nist_file, Path), nist_file
        signal = read_nist_wsj(nist_file, expected_sample_rate=16000)
        file = nist_file.with_suffix('.wav')
        target = dst_dir / file.relative_to(cd.parent)
        assert not target == nist_file, (nist_file, target)
        target.parent.mkdir(parents=True, exist_ok=True)
        signal = resample_with_sox(signal, rate_in=16000,
                                   rate_out=sample_rate)
        # normalization to mean 0:
        signal = signal - np.mean(signal)
        # normalization:
        # Correction, because the allowed values are in the range [-1, 1).
        # => "1" is not a vaild value
        correction = (2**15 - 1) / (2**15)
        signal = signal * (correction / np.amax(np.abs(signal)))
        with soundfile.SoundFile(
                str(target), samplerate=sample_rate, channels=1,
                subtype='FLOAT', mode='w',
        ) as f:
            f.write(signal.T)

    dlp_mpi.barrier()
    if dlp_mpi.IS_MASTER:
        created_files = list(set(list(dst_dir.rglob("*.wav"))))
        print(f"Written {len(created_files)} wav files.")
        assert len(wsj_nist_files) == len(created_files), (
            len(wsj_nist_files), len(created_files))
def write_wavs(dst_dir: Path, wsj0_root: Path, wsj1_root: Path, sample_rate):
    """Copy WSJ metadata files and convert NIST sphere files to wav.

    The master copies the text/index files; the ``.wv1`` to wav conversion
    is distributed over the MPI workers.

    Args:
        dst_dir: Target directory; must not exist yet (no overwriting).
        wsj0_root: Root of the WSJ0 corpus.
        wsj1_root: Root of the WSJ1 corpus.
        sample_rate: Target sample rate for the written wav files.
    """
    wsj0_root = Path(wsj0_root).expanduser().resolve()
    wsj1_root = Path(wsj1_root).expanduser().resolve()
    dst_dir = Path(dst_dir).expanduser().resolve()
    assert wsj0_root.exists(), wsj0_root
    assert wsj1_root.exists(), wsj1_root

    assert not dst_dir == wsj0_root, (wsj0_root, dst_dir)
    assert not dst_dir == wsj1_root, (wsj1_root, dst_dir)

    # Expect, that the dst_dir does not exist to make sure to not overwrite.
    if dlp_mpi.IS_MASTER:
        dst_dir.mkdir(parents=True, exist_ok=False)

    if dlp_mpi.IS_MASTER:
        # CD folders look like "13-34.1" (CD = compact disk).
        cds = set(
            list(wsj0_root.rglob("*-*.*")) + list(wsj1_root.rglob("*-*.*"))
        )

        for suffix in 'pl ndx ptx dot txt'.split():
            files = set(
                list(wsj0_root.rglob(f"*.{suffix}"))
                + list(wsj1_root.rglob(f"*.{suffix}"))
            )
            # the readme.txt file in the parent directory is not copied
            print(f"About to write ca. {len(files)} {suffix} files.")
            for cd in cds:
                for file in list(cd.rglob(f"*.{suffix}")):
                    target = dst_dir / file.relative_to(cd.parent)
                    target.parent.mkdir(parents=True, exist_ok=True)
                    if not target.is_file():
                        shutil.copy(file, target.parent)
            written_files = list(dst_dir.rglob(f"*.{suffix}"))
            # Count is informational only; duplicates across CDs may make
            # it differ from len(files).
            print(f"Writing {len(written_files)} {suffix} files.")

    if dlp_mpi.IS_MASTER:
        # Ignore .wv2 files since they are not referenced in our database
        # anyway
        wsj_nist_files = [(cd, nist_file)
                          for cd in cds
                          for nist_file in cd.rglob("*.wv1")]
        print(f"About to write {len(wsj_nist_files)} wav files.")
    else:
        wsj_nist_files = None

    wsj_nist_files = dlp_mpi.bcast(wsj_nist_files)

    for cd, nist_file in dlp_mpi.split_managed(wsj_nist_files):
        assert isinstance(nist_file, Path), nist_file
        signal = read_nist_wsj(nist_file, expected_sample_rate=16000)
        target = dst_dir / nist_file.with_suffix('.wav').relative_to(cd.parent)
        assert not target == nist_file, (nist_file, target)
        target.parent.mkdir(parents=True, exist_ok=True)
        signal = resample_with_sox(signal, rate_in=16000,
                                   rate_out=sample_rate)
        # normalization:
        # Correction, because the allowed values are in the range [-1, 1).
        # => "1" is not a vaild value
        correction = (2 ** 15 - 1) / (2 ** 15)
        signal = signal * (correction / np.amax(np.abs(signal)))
        with soundfile.SoundFile(
                str(target), samplerate=sample_rate, channels=1,
                subtype='FLOAT', mode='w',
        ) as f:
            f.write(signal.T)

    dlp_mpi.barrier()
    if dlp_mpi.IS_MASTER:
        created_files = list(set(list(dst_dir.rglob("*.wav"))))
        print(f"Written {len(created_files)} wav files.")
        assert len(wsj_nist_files) == len(created_files), (
            len(wsj_nist_files), len(created_files))
def get_new_folder(
        basedir,
        try_id=None,
        dry_run=False,
        mkdir=True,
        consider_mpi=False,
):
    """
    The core source code if copied from the FileStorageObserver in sacred.

    Get a sub folder from basedir with sacred style.
    Assume integer folder names and select as return folder the last
    folder integer plus one.

    Args:
        basedir:
        try_id: Suggestion for the folder name. Can be used as prefix.
            try_id=prefix with return a folder like: prefix, prefix_2, ...
        dry_run: Per default also creates the directory to be thread safe.
        mkdir: With mkdir this function is thread and process safe.
        consider_mpi: If True only the master selects a folder and syncs
            the folder with the slaves.

    Returns:

    """
    if consider_mpi:
        import dlp_mpi
        # Workers wait here for the master's folder; the master falls
        # through, picks the folder and broadcasts it further below.
        if not dlp_mpi.IS_MASTER:
            new_folder = dlp_mpi.bcast(None)
            return new_folder

    suggested_id = try_id
    basedir = Path(basedir).expanduser()

    for attempt in range(200):
        if suggested_id is None:
            # Plain running index: largest numeric folder name + 1.
            existing = [
                int(entry)
                for entry in os.listdir(str(basedir))
                if (basedir / entry).is_dir() and entry.isdigit()
            ]
            _id = max(existing + [0]) + 1
        elif (basedir / f'{suggested_id}').exists():
            # Prefix already taken: find the next free "<prefix>_<n>".
            existing = [
                int(re.sub(f'{suggested_id}_?', '', str(entry)))
                for entry in os.listdir(str(basedir))
                if (basedir / entry).is_dir()
                if fnmatch.fnmatch(entry, f'{suggested_id}_*')
                if re.sub(f'{suggested_id}_?', '', str(entry)).isdigit()
            ]
            _id = f'{suggested_id}_{max(existing + [1]) + 1}'
        else:
            _id = f'{suggested_id}'

        simu_dir = basedir / str(_id)

        try:
            if dry_run:
                print(f'dry_run: "os.mkdir({simu_dir})"')
            elif mkdir is False:
                pass
            elif mkdir is True:
                # mkdir raises FileExistsError on a race, which restarts
                # the search — this is what makes the function safe.
                simu_dir.mkdir()
            else:
                raise ValueError(mkdir)

            if consider_mpi:
                import dlp_mpi
                assert dlp_mpi.IS_MASTER, dlp_mpi.RANK
                simu_dir = dlp_mpi.bcast(simu_dir)

            return simu_dir
        except FileExistsError:
            # Catch race conditions
            if attempt > 100:
                # After some tries,
                # expect that something other went wrong
                raise
def get_new_folder(
        basedir,
        try_id=None,
        dry_run=False,
        consider_mpi=False,
        chdir=False,
        mkdir=True,
):
    """Select (and by default create) a new sub folder of ``basedir``.

    Folder names are running integers, optionally prefixed with ``try_id``
    (``prefix``, ``prefix_2``, ...).

    Args:
        basedir:
        try_id: Suggestion/prefix for the folder name.
        dry_run: Only print what would be created.
        consider_mpi: Only the master selects the folder; it is broadcast
            to the other workers.
        chdir: If True, change the working directory into the new folder.
        mkdir: Enables thread safety (create the folder to claim it).

    Returns:
        pathlib.Path of the new sub folder.
    """
    if consider_mpi:
        import dlp_mpi
        # Non-master ranks receive the folder chosen by the master below.
        if not dlp_mpi.IS_MASTER:
            new_folder = dlp_mpi.bcast(None)
            return new_folder

    suggested_id = try_id
    basedir = Path(basedir).expanduser().resolve()
    if Path('/net') in basedir.parents:
        # If nt filesystem, assert not in /net/home
        assert Path('/net/home') not in basedir.parents, basedir

    for attempt in range(200):
        if suggested_id is None:
            existing = [
                int(entry)
                for entry in os.listdir(str(basedir))
                if (basedir / entry).is_dir() and entry.isdigit()
            ]
            _id = max(existing + [0]) + 1
        elif (basedir / f'{suggested_id}').exists():
            existing = [
                int(re.sub(f'{suggested_id}_?', '', str(entry)))
                for entry in os.listdir(str(basedir))
                if (basedir / entry).is_dir()
                if fnmatch.fnmatch(entry, f'{suggested_id}_*')
                if re.sub(f'{suggested_id}_?', '', str(entry)).isdigit()
            ]
            _id = f'{suggested_id}_{max(existing + [1]) + 1}'
        else:
            _id = f'{suggested_id}'

        simu_dir = basedir / str(_id)

        try:
            if dry_run:
                print(f'dry_run: "os.mkdir({simu_dir})"')
            elif mkdir is False:
                pass
            elif mkdir is True:
                # FileExistsError on a race restarts the search loop.
                simu_dir.mkdir()
            else:
                raise ValueError(mkdir)

            if consider_mpi:
                import dlp_mpi
                assert dlp_mpi.IS_MASTER, dlp_mpi.RANK
                simu_dir = dlp_mpi.bcast(simu_dir)

            if chdir:
                os.chdir(simu_dir)

            return simu_dir
        except FileExistsError:
            # Catch race conditions
            if attempt > 100:
                # After some tries,
                # expect that something other went wrong
                raise
parser.add_argument("--json_path", default="data/sms_wsj.json",
                    help="Full path to sms_wsj.json")


def main(conf):
    """Launch the sacred experiment with the parsed configuration."""
    experiment.run(config_updates=dict(
        json_path=conf["main_args"]["json_path"],
        **conf["mm_config"]))


if __name__ == "__main__":
    if dlp_mpi.IS_MASTER:
        # Load conf.yml as a dict; each top-level key of the YAML file
        # becomes an argument group of the parser.
        with open("local/conf.yml") as f:
            def_conf = yaml.safe_load(f)
        parser = prepare_parser_from_dict(def_conf, parser=parser)

        # arg_dict is the parsed configuration as a hierarchical dict
        # (matching the YAML structure); plain_args is the flat argparse
        # namespace — kept for convenience, not used here.
        arg_dict, plain_args = parse_args_as_dict(parser,
                                                  return_plain_args=True)
    else:
        arg_dict = None
    # Only the master parses the CLI; workers receive the result.
    arg_dict = dlp_mpi.bcast(arg_dict, root=dlp_mpi.MASTER)

    main(arg_dict)
def get_new_subdir(
        basedir: [str, Path],
        *,
        id_naming: [str, callable] = 'index',
        mkdir: bool = True,
        prefix: str = None,
        suffix: str = None,
        consider_mpi: bool = False,
        dry_run: bool = False,
):
    """Determine a new non-existent sub directory.

    Features:
     - With mkdir: thread and process safe (the mkdir claims the name;
       a FileExistsError restarts the search).
     - Different ID naming conventions, default is a running index.
     - MPI aware: the folder is determined on one worker and broadcast.

    Args:
        basedir: The new subdir will be inside this directory.
        id_naming: 'index' (largest index + 1), 'time'
            (%Y-%m-%d-%H-%M-%S timestamp) or a callable producing a new
            name on each call.
        mkdir: Create the dir to make concurrent calls safe. Without it,
            two concurrent calls may get the same folder.
        prefix: Optional prefix for the id, e.g. '2' -> '{prefix}_2'.
        suffix: Optional suffix for the id, e.g. '2' -> '2_{suffix}'.
        consider_mpi: Only one mpi process searches for the folder and
            distributes the result.
        dry_run: Disable mkdir and print the folder name instead.

    Returns:
        pathlib.Path of the new subdir

    >>> get_new_subdir('/', dry_run=True)  # root folder usually contain no digits
    dry_run: "os.mkdir(/1)"
    PosixPath('/1')
    """
    if consider_mpi:
        import dlp_mpi
        # Non-master ranks receive the folder chosen by the master below.
        if not dlp_mpi.IS_MASTER:
            new_folder = dlp_mpi.bcast(None)
            return new_folder

    basedir = Path(basedir).expanduser().resolve()
    if not basedir.exists():
        if dry_run:
            print(f'dry_run: "os.makedirs({basedir})"')
            # ToDo: Make this working.
            #       Will fail when calling os.listdir
        else:
            basedir.mkdir(parents=True)

    if Path('/net') in basedir.parents:
        # If nt filesystem, assert not in /net/home
        assert Path('/net/home') not in basedir.parents, basedir

    prefix_ = f'{prefix}_' if prefix else ''
    _suffix = f'_{suffix}' if suffix else ''

    for attempt in range(200):
        if id_naming == 'index':
            if prefix is None and suffix is None:
                # Fast path: plain numeric folder names.
                numbers = [
                    int(entry)
                    for entry in os.listdir(str(basedir))
                    if (basedir / entry).is_dir() and entry.isdigit()
                ]
                _id = max(numbers + [0]) + 1
            else:
                def remove_pre_suf(d):
                    return _removesuffix(
                        _removeprefix(str(d), prefix_), _suffix)

                numbers = [
                    int(remove_pre_suf(entry))
                    for entry in os.listdir(str(basedir))
                    if (basedir / entry).is_dir()
                    if fnmatch.fnmatch(entry, f'{prefix_}*{_suffix}')
                    if remove_pre_suf(entry).isdigit()
                ]
                _id = f'{prefix_}{max(numbers + [0]) + 1}{_suffix}'
        elif id_naming == 'time':
            if attempt != 0:
                # Timestamps have 1 s resolution; wait before retrying.
                time.sleep(1)
            stamp = datetime.datetime.now().strftime('%Y-%m-%d-%H-%M-%S')
            _id = f'{prefix_}{stamp}{_suffix}'
        elif callable(id_naming):
            _id = id_naming()
        else:
            raise ValueError(id_naming)

        simu_dir = basedir / str(_id)

        try:
            if dry_run:
                print(f'dry_run: "os.mkdir({simu_dir})"')
            elif mkdir is False:
                pass
            elif mkdir is True:
                simu_dir.mkdir()
            else:
                raise ValueError(mkdir)

            if consider_mpi:
                import dlp_mpi
                assert dlp_mpi.IS_MASTER, dlp_mpi.RANK
                simu_dir = dlp_mpi.bcast(simu_dir)

            return simu_dir
        except FileExistsError:
            # Catch race conditions
            if attempt > 100:
                # After some tries,
                # expect that something other went wrong
                raise