def get_file(
        self,
        src_path: str,
        dst_path: str,
        version: str,
        *,
        ext: str = None,
) -> str:
    r"""Get file from backend.

    Args:
        src_path: path to file on backend
        dst_path: destination path to local file
        version: version string
        ext: file extension, if ``None`` uses characters after last dot

    Returns:
        full path to local file

    Raises:
        FileNotFoundError: if file does not exist on backend

    """
    # resolve the versioned path on the backend
    src_path = self.path(src_path, version, ext=ext)
    if not self._exists(src_path):
        raise FileNotFoundError(
            errno.ENOENT,
            os.strerror(errno.ENOENT),
            src_path,
        )
    dst_path = audeer.safe_path(dst_path)
    # ensure the local target folder exists before download
    audeer.mkdir(os.path.dirname(dst_path))
    self._get_file(src_path, dst_path)
    # BUGFIX: the docstring promises the full local path,
    # but the function previously returned None
    return dst_path
def test_mixed_cache():
    # Regression test: loading must not fail searching for other
    # versions when a database is stored across private and shared
    # cache and the private one is empty, see
    # https://github.com/audeering/audb/issues/101
    shared_load_args = dict(
        sampling_rate=8000,
        full_path=False,
        num_workers=pytest.NUM_WORKERS,
        verbose=False,
        only_metadata=True,
    )
    # Populate the shared cache first
    audb.load(
        DB_NAME,
        tables='files',
        cache_root=pytest.SHARED_CACHE_ROOT,
        **shared_load_args,
    )
    # Then load the same version into an emptied private cache
    # to force audb.cached() to return an empty dataframe
    clear_root(pytest.CACHE_ROOT)
    audeer.mkdir(pytest.CACHE_ROOT)
    audb.load(
        DB_NAME,
        tables='segments',
        **shared_load_args,
    )
def test_glob(tmpdir, files, pattern, folder, expected, backend):
    # Upload an empty file to the backend for every requested entry
    paths = []
    for file in files:
        src = os.path.join(tmpdir, file)
        audeer.mkdir(os.path.dirname(src))
        open(src, 'w').close()
        dst = backend.join(
            pytest.ID,
            'test_glob',
            file,
        )
        paths.append(backend.put_file(src, dst, '1.0.0'))
    # Translate the expected relative entries into backend paths
    expected = [
        backend.path(
            backend.join(
                pytest.ID,
                'test_glob',
                *entry.split(backend.sep),
            ),
            '1.0.0',
        )
        for entry in expected
    ]
    assert sorted(expected) == sorted(backend.glob(pattern, folder=folder))
def to_yaml(
        self,
        path_or_stream: typing.Union[str, typing.IO],
        *,
        include_version: bool = True,
):
    r"""Save object to YAML file.

    Args:
        path_or_stream: file path or stream
        include_version: add version to class name

    """
    if not isinstance(path_or_stream, str):
        # Stream case: dump directly, resolving resource paths
        # relative to the folder the stream lives in
        return yaml.dump(
            self.to_dict(
                include_version=include_version,
                root=os.path.dirname(path_or_stream.name),
            ),
            path_or_stream,
        )
    # Path case: create the parent folder,
    # then recurse with an open stream
    path = audeer.safe_path(path_or_stream)
    audeer.mkdir(os.path.dirname(path))
    with open(path, 'w') as fp:
        return self.to_yaml(fp, include_version=include_version)
def _put_file(
        self,
        src_path: str,
        dst_path: str,
):
    r"""Put file to backend by copying it into the target tree."""
    target_folder = os.path.dirname(dst_path)
    audeer.mkdir(target_folder)
    shutil.copy(src_path, dst_path)
def dependencies(
        name: str,
        *,
        version: str = None,
        cache_root: str = None,
) -> Dependencies:
    r"""Database dependencies.

    Looks up the cached dependency table for the requested
    database version and downloads it from the backend
    on a cache miss.

    Args:
        name: name of database
        version: version string, uses latest version if ``None``
        cache_root: cache folder where databases are stored.
            If not set :meth:`audb.default_cache_root` is used

    Returns:
        dependency object

    """
    if version is None:
        version = latest_version(name)

    # Candidate cache folders:
    # a user-provided root wins, otherwise probe shared then private
    cache_roots = [
        default_cache_root(True),  # check shared cache first
        default_cache_root(False),
    ] if cache_root is None else [cache_root]
    # Stop at the first candidate that already holds this version;
    # if none exists the loop falls through with the last candidate
    for cache_root in cache_roots:
        deps_root = audeer.safe_path(os.path.join(
            cache_root,
            name,
            version,
        ))
        if os.path.exists(deps_root):
            break
    audeer.mkdir(deps_root)
    deps_path = os.path.join(deps_root, define.CACHED_DEPENDENCIES_FILE)
    deps = Dependencies()
    if not os.path.exists(deps_path):
        # Cache miss: fetch the dependency archive from the backend,
        # extract it into a throw-away folder and persist the table
        # in the cache for the next call
        backend = lookup_backend(name, version)
        with tempfile.TemporaryDirectory() as tmp_root:
            archive = backend.join(name, define.DB)
            backend.get_archive(
                archive,
                tmp_root,
                version,
            )
            deps.load(os.path.join(tmp_root, define.DEPENDENCIES_FILE))
            deps.save(deps_path)
    else:
        deps.load(deps_path)

    return deps
def _copy_file(
        file: str,
        root_src: str,
        root_tmp: str,
        root_dst: str,
):
    r"""Copy file from source to destination via a temporary root."""
    source = os.path.join(root_src, file)
    staging = os.path.join(root_tmp, file)
    # make sure both target folder trees exist
    for root in (root_tmp, root_dst):
        audeer.mkdir(os.path.dirname(os.path.join(root, file)))
    shutil.copy(source, staging)
    _move_file(root_tmp, root_dst, file)
def fixture_publish_db():
    clear_root(DB_ROOT)
    clear_root(pytest.FILE_SYSTEM_HOST)

    # Build a minimal database with a filewise table
    db = audformat.testing.create_db(minimal=True)
    db.name = DB_NAME
    db['files'] = audformat.Table(audformat.filewise_index(list(DB_FILES)))
    db['files']['original'] = audformat.Column()
    db['files']['original'].set(list(DB_FILES))

    # Write a silent WAV file for every media entry
    for file, spec in DB_FILES.items():
        signal = np.zeros(
            (spec['channels'], spec['sampling_rate']),
            dtype=np.float32,
        )
        path = os.path.join(DB_ROOT, file)
        audeer.mkdir(os.path.dirname(path))
        audiofile.write(
            path,
            signal,
            spec['sampling_rate'],
            bit_depth=spec['bit_depth'],
        )

    # Add a segmented table covering the first file
    db['segments'] = audformat.Table(
        audformat.segmented_index(
            [list(DB_FILES)[0]] * 3,
            starts=['0s', '1s', '2s'],
            ends=['1s', '2s', '3s'],
        )
    )
    db.save(DB_ROOT)

    # Publish the database as version 1.0.0
    audb.publish(
        DB_ROOT,
        '1.0.0',
        pytest.PUBLISH_REPOSITORY,
        verbose=False,
    )

    yield

    clear_root(DB_ROOT)
    clear_root(pytest.FILE_SYSTEM_HOST)
def database_cache_folder(
        name: str,
        version: str,
        cache_root: str = None,
        flavor: Flavor = None,
) -> str:
    r"""Create and return database cache folder.

    Args:
        name: name of database
        version: version of database
        cache_root: path to cache folder
        flavor: flavor of database

    Returns:
        path to cache folder

    """
    # a user-provided root wins, otherwise probe shared then private
    candidates = (
        [cache_root]
        if cache_root is not None
        else [
            default_cache_root(True),  # check shared cache first
            default_cache_root(False),
        ]
    )
    for candidate in candidates:
        relative = (
            os.path.join(name, version)
            if flavor is None
            else flavor.path(name, version)
        )
        db_root = audeer.safe_path(os.path.join(candidate, relative))
        # stop at the first candidate that already exists;
        # otherwise the last candidate is kept and created below
        if os.path.exists(db_root):
            break
    audeer.mkdir(db_root)
    return db_root
def test_archive(tmpdir, files, name, folder, version, backend):
    # create empty local files for every entry
    file_list = [files] if isinstance(files, str) else files
    for file in file_list:
        local_path = os.path.join(tmpdir, file)
        audeer.mkdir(os.path.dirname(local_path))
        open(local_path, 'w').close()
    archive = backend.join(
        pytest.ID,
        'test_archive',
        name,
    )
    path_backend = backend.put_archive(tmpdir, files, archive, version)
    # a second upload of identical content is skipped
    assert backend.put_archive(tmpdir, files, archive, version) == path_backend
    assert backend.exists(archive + '.zip', version)
    assert backend.get_archive(archive, tmpdir, version) == file_list
def _get_media(
        media: typing.List[str],
        db_root: str,
        db_root_tmp: str,
        db_name: str,
        deps: Dependencies,
        backend: audbackend.Backend,
        num_workers: typing.Optional[int],
        verbose: bool,
):
    # Pre-create the whole folder tree so the parallel jobs
    # do not race inside os.makedirs while unpacking
    for file in media:
        for root in (db_root, db_root_tmp):
            audeer.mkdir(os.path.dirname(os.path.join(root, file)))

    # Unique (archive, version) pairs that must be downloaded
    archives = {
        (deps.archive(file), deps.version(file))
        for file in media
    }

    def job(archive: str, version: str):
        remote = backend.join(
            db_name,
            define.DEPEND_TYPE_NAMES[define.DependType.MEDIA],
            archive,
        )
        # extract into the temporary root, then move each file over
        for file in backend.get_archive(remote, db_root_tmp, version):
            _move_file(db_root_tmp, db_root, file)

    audeer.run_tasks(
        job,
        params=[([archive, version], {}) for archive, version in archives],
        num_workers=num_workers,
        progress_bar=verbose,
        task_description='Get media',
    )
def test_list_dir_names(tmpdir, dir_list):
    root = tmpdir.mkdir('folder')
    # create every requested sub-directory
    created = [
        audeer.mkdir(os.path.join(str(root), entry))
        for entry in dir_list
    ]
    for entry in created:
        assert os.path.isdir(entry)
    # listing must return the created folders sorted, as a list
    result = audeer.list_dir_names(os.path.join(str(root), '.'))
    assert result == sorted(created)
    assert type(result) is list
def test_filepath(tmpdir):
    root = os.path.join(tmpdir, 'test')
    new_root = os.path.join(tmpdir, 'some', 'where', 'else')

    # create a resource file the object will reference
    resource_path = os.path.join(root, 're', 'source.txt')
    audeer.mkdir(os.path.dirname(resource_path))
    open(resource_path, 'w').close()

    # serialize an object that stores the resource path
    yaml_path = os.path.join(root, 'yaml', 'object.yaml')
    ObjectWithFile(resource_path).to_yaml(yaml_path, include_version=False)

    # relocate everything and re-instantiate from the new location;
    # the stored path must resolve relative to the YAML file
    shutil.move(root, new_root)
    restored = audobject.Object.from_yaml(
        os.path.join(new_root, 'yaml', 'object.yaml')
    )
    assert isinstance(restored, ObjectWithFile)
    assert os.path.exists(restored.path)
def test_file(tmpdir, local_file, remote_file, version, ext, backend):
    # create an empty local file to upload
    local_file = os.path.join(tmpdir, local_file)
    audeer.mkdir(os.path.dirname(local_file))
    open(local_file, 'w').close()
    remote_file = backend.join(
        pytest.ID,
        'test_file',
        remote_file,
    )

    assert not backend.exists(remote_file, version, ext=ext)
    path_backend = backend.put_file(local_file, remote_file, version, ext=ext)
    # uploading identical content again is skipped
    assert backend.put_file(
        local_file,
        remote_file,
        version,
        ext=ext,
    ) == path_backend
    assert backend.exists(remote_file, version, ext=ext)

    # round-trip the file and verify its checksum
    backend.get_file(remote_file, local_file, version, ext=ext)
    assert os.path.exists(local_file)
    assert backend.checksum(remote_file, version, ext=ext) == md5(local_file)

    assert backend.remove_file(remote_file, version, ext=ext) == path_backend
    assert not backend.exists(remote_file, version, ext=ext)

    # backend path must end with the explicit or derived extension
    if ext is None:
        expected_ext = os.path.splitext(local_file)[1]
    else:
        expected_ext = '.' + ext
    assert path_backend.endswith(expected_ext)
def database_tmp_folder(
        cache_root: str,
) -> str:
    r"""Create and return temporary database cache folder.

    The temporary cache folder is created under ``cache_root + '~'``.

    Args:
        cache_root: path to cache folder

    Returns:
        path to temporary cache folder

    """
    return audeer.mkdir(cache_root + '~')
def test_safe_path_symlinks(tmpdir):
    filename = 'file.txt'
    linkname = 'link.txt'
    # create a file and a symlink pointing at it
    dir_tmp = tmpdir.mkdir('folder')
    dir_tmp.join(filename).write('')
    folder = audeer.mkdir(str(dir_tmp))
    target = os.path.join(folder, filename)
    link = os.path.join(folder, linkname)
    os.symlink(target, link)
    # safe_path must resolve the symlink like realpath does
    resolved = audeer.safe_path(link)
    expected = os.path.realpath(os.path.expanduser(link))
    # compare without drive letters so the test passes on Windows
    _, resolved_tail = os.path.splitdrive(resolved)
    _, expected_tail = os.path.splitdrive(expected)
    assert resolved_tail == expected_tail
    assert type(resolved_tail) is str
def test_mkdir(tmpdir):
    r"""Check audeer.mkdir for creation, idempotence, modes, and inputs.

    NOTE(review): the umask/chdir calls make this test order-dependent;
    keep the sections in sequence.
    """
    # New dir
    path = str(tmpdir.mkdir('folder1'))
    p = audeer.mkdir(path)
    assert os.path.isdir(p) is True
    assert p == path
    # Existing dir: calling mkdir again must be a no-op
    p = audeer.mkdir(path)
    assert os.path.isdir(p) is True
    assert p == path
    # Existing dir with content: content must survive
    dir_tmp = tmpdir.mkdir('folder2')
    f = dir_tmp.join('file.txt')
    f.write('')
    path = str(dir_tmp)
    p = audeer.mkdir(path)
    assert os.path.isdir(p) is True
    assert p == path
    # Relative path: mkdir resolves against the current working dir
    path = str(tmpdir.mkdir('folder3'))
    current_path = os.getcwd()
    os.chdir(path)
    p = audeer.mkdir('folder4')
    os.chdir(current_path)
    assert os.path.isdir(p) is True
    assert p == os.path.join(path, 'folder4')
    # Subdirectories: intermediate folders are created as well
    os.chdir(path)
    p = audeer.mkdir('folder5/folder6')
    os.chdir(current_path)
    assert os.path.isdir(p) is True
    assert p == os.path.join(path, 'folder5', 'folder6')
    # Path in bytes: result is returned as str
    path = str(tmpdir.mkdir('folder7'))
    path = bytes(path, 'utf8')
    p = audeer.mkdir(path)
    assert os.path.isdir(p) is True
    assert p == path.decode('utf8')
    # Empty dir: empty input is returned unchanged
    path = ''
    p = audeer.mkdir(path)
    assert p == path
    # Mode, see https://stackoverflow.com/a/705088
    # Default mode (umask cleared so the raw mode is observable)
    path = os.path.join(str(tmpdir.mkdir('folder8')), 'sub-folder')
    os.umask(0)
    p = audeer.mkdir(path)
    mode = stat.S_IMODE(os.stat(p).st_mode)
    expected_mode = int('777', 8)
    assert mode == expected_mode
    # Non-default modes
    # Under Windows, changing permissions does not work,
    # there we always expect 777
    path = os.path.join(str(tmpdir.mkdir('folder9')), 'sub-folder')
    os.umask(0)
    p = audeer.mkdir(path, mode=0o775)
    expected_mode = '775'
    if platform.system() == 'Windows':
        expected_mode = '777'
    mode = stat.S_IMODE(os.stat(p).st_mode)
    assert mode == int(expected_mode, 8)
    assert mode != int('755', 8)
    path = os.path.join(str(tmpdir.mkdir('folder10')), 'sub-folder')
    os.umask(0)
    p = audeer.mkdir(path, mode=0o755)
    expected_mode = '755'
    if platform.system() == 'Windows':
        expected_mode = '777'
    mode = stat.S_IMODE(os.stat(p).st_mode)
    assert mode == int(expected_mode, 8)
    assert mode != int('775', 8)
def test_archives(tmpdir):
    r"""Round-trip files through create_archive/extract_archive(s).

    Covers ZIP and TAR.GZ creation, batch extraction, unsupported
    formats, ``keep_archive=False``, and broken archive files.
    """
    # Create tmp files to put in archives
    # BUGFIX: the f-string placeholders below were garbled to the
    # literal '(unknown)'; restored to '{filename}' so each loop
    # iteration creates a distinct file/archive
    filenames = ['file1', 'file2']
    dir_tmp = tmpdir.mkdir('content')
    for filename in filenames:
        f = dir_tmp.join(f'{filename}.txt')
        f.write('')
    path = str(dir_tmp)
    src_path = audeer.mkdir(path)
    # Create destination folder
    dir_tmp = tmpdir.mkdir('destination')
    destination = str(dir_tmp)
    os.rmdir(destination)  # make sure destination does not exist
    # Create folder holding archive files
    archive_dir_tmp = tmpdir.mkdir('archives')
    archive_folder = audeer.mkdir(str(archive_dir_tmp))
    # Create one ZIP and one TAR.GZ archive per file
    zip_files = []
    tar_files = []
    for filename in filenames:
        src_file = f'{filename}.txt'
        zip_file = os.path.join(archive_folder, f'{filename}.zip')
        audeer.create_archive(src_path, src_file, zip_file)
        zip_files.append(zip_file)
        tar_file = os.path.join(archive_folder, f'{filename}.tar.gz')
        audeer.create_archive(src_path, src_file, tar_file)
        tar_files.append(tar_file)
    # Unsupported archive format must fail
    with pytest.raises(RuntimeError):
        audeer.create_archive(src_path, src_file, f'{filename}.7z')
    # Extract archives and verify the reported members
    members = audeer.extract_archives(zip_files, destination)
    for filename, member in zip(filenames, members):
        target_file = os.path.join(destination, f'{filename}.txt')
        assert os.path.exists(target_file)
        assert os.path.basename(target_file) == member
        os.remove(target_file)
    members = audeer.extract_archives(tar_files, destination)
    for filename, member in zip(filenames, members):
        target_file = os.path.join(destination, f'{filename}.txt')
        assert os.path.exists(target_file)
        assert os.path.basename(target_file) == member
        os.remove(target_file)
    # Extracting a plain file must fail
    with pytest.raises(RuntimeError):
        audeer.extract_archive(os.path.join(src_path, src_file), destination)
    # keep_archive=False removes only the extracted archive
    audeer.extract_archive(tar_files[0], destination, keep_archive=False)
    assert not os.path.exists(tar_files[0])
    assert os.path.exists(tar_files[1])
    # Create broken archives
    for ext in ['zip', 'tar.gz']:
        f = archive_dir_tmp.join(f'broken.{ext}')
        f.write('')
        archive_file = os.path.join(archive_folder, f'broken.{ext}')
        if ext == 'zip':
            with pytest.raises(RuntimeError):
                audeer.extract_archive(
                    archive_file,
                    destination,
                    keep_archive=False,
                )
        elif ext == 'tar.gz':
            with pytest.raises(RuntimeError):
                audeer.extract_archive(
                    archive_file,
                    destination,
                    keep_archive=False,
                )
        # File should still be there if extraction failed
        # BUGFIX: previously asserted the unrelated ``zip_file``;
        # the comment refers to the broken archive just extracted
        assert os.path.exists(archive_file)
def load_to(
        root: str,
        name: str,
        *,
        version: str = None,
        cache_root: str = None,
        num_workers: typing.Optional[int] = 1,
        verbose: bool = True,
) -> audformat.Database:
    r"""Load database to directory.

    Loads the original state of the database
    to a custom directory.
    No conversion or filtering will be applied.
    If the target folder already contains
    some version of the database,
    it will upgrade to the requested version.
    Unchanged files will be skipped.

    Args:
        root: target directory
        name: name of database
        version: version string, latest if ``None``
        cache_root: cache folder where databases are stored.
            If not set :meth:`audb.default_cache_root` is used.
            Only used to read the dependencies of the requested version
        num_workers: number of parallel jobs or 1 for sequential
            processing. If ``None`` will be set to the number of
            processors on the machine multiplied by 5
        verbose: show debug messages

    Returns:
        database object

    """
    if version is None:
        version = latest_version(name)

    db_root = audeer.safe_path(root)
    # all downloads are staged in ``db_root + '~'`` and only moved
    # into db_root once complete
    db_root_tmp = database_tmp_folder(db_root)

    # remove files with a wrong checksum
    # to ensure we load correct version
    update = os.path.exists(db_root) and os.listdir(db_root)
    audeer.mkdir(db_root)
    deps = dependencies(name, version=version, cache_root=cache_root)
    if update:
        for file in deps.files:
            full_file = os.path.join(db_root, file)
            if os.path.exists(full_file):
                checksum = audbackend.md5(full_file)
                if checksum != deps.checksum(file):
                    os.remove(full_file)

    # load database header without tables from backend
    db_header, backend = load_header(
        db_root_tmp,
        name,
        version,
        overwrite=True,
    )

    # get altered and new tables
    db_header.save(db_root_tmp, header_only=True)
    tables = _find_tables(db_header, db_root, deps, num_workers, verbose)
    _get_tables(tables, db_root, db_root_tmp, name, deps, backend,
                num_workers, verbose)

    # load database
    # move header to root and load database ...
    _move_file(db_root_tmp, db_root, define.HEADER_FILE)
    try:
        db = audformat.Database.load(
            db_root,
            num_workers=num_workers,
            verbose=verbose,
        )
    except (KeyboardInterrupt, Exception):  # pragma: no cover
        # make sure to remove header if user interrupts
        os.remove(os.path.join(db_root, define.HEADER_FILE))
        raise
    # afterwards remove header to avoid the database
    # can be loaded before download is complete
    os.remove(os.path.join(db_root, define.HEADER_FILE))

    # get altered and new media files
    media = _find_media(db, db_root, deps, num_workers, verbose)
    _get_media(media, db_root, db_root_tmp, name, deps, backend,
               num_workers, verbose)

    # save dependencies
    dep_path_tmp = os.path.join(db_root_tmp, define.DEPENDENCIES_FILE)
    deps.save(dep_path_tmp)
    _move_file(db_root_tmp, db_root, define.DEPENDENCIES_FILE)

    # save database and remove the temporal directory
    # to signal all files were correctly loaded
    _save_database(db, db_root, db_root_tmp, num_workers, verbose)
    try:
        _remove_empty_dirs(db_root_tmp)
    except OSError:  # pragma: no cover
        raise RuntimeError('Could not remove temporary directory, '
                           'probably there are some leftover files.'
                           'This should not happen.')

    return db
import os import shutil import audeer import audformat import audiofile as af import pandas as pd src_dir = 'src' build_dir = audeer.mkdir('build') # Prepare functions for getting information from file names def parse_names(names, from_i, to_i, is_number=False, mapping=None): for name in names: key = name[from_i:to_i] if is_number: key = int(key) yield mapping[key] if mapping else key # Gather metadata description = ( 'Berlin Database of Emotional Speech. ' 'A German database of emotional utterances ' 'spoken by actors ' 'recorded as a part of the DFG funded research project ' 'SE462/3-1 in 1997 and 1999. ' 'Recordings took place in the anechoic chamber ' 'of the Technical University Berlin, '
import audb import audeer import audformat name = 'emodb' previous_version = '1.1.0' version = '1.1.1' build_dir = '../build' repository = audb.Repository( name='data-public', host='https://audeering.jfrog.io/artifactory', backend='artifactory', ) build_dir = audeer.mkdir(build_dir) audb.load_to( build_dir, name, version=previous_version, num_workers=8, ) # Fix gender labels of speakers, see # https://github.com/audeering/emodb/issues/2 db = audformat.Database.load(build_dir) db.schemes['speaker'].labels = { 3: { 'gender': 'male', 'age': 31, 'language': 'deu'
def _get_media_from_backend( name: str, media: typing.Sequence[str], db_root: str, db_root_tmp: str, flavor: typing.Optional[Flavor], deps: Dependencies, backend: audbackend.Backend, num_workers: typing.Optional[int], verbose: bool, ): r"""Load media from backend.""" # figure out archives archives = set() archive_names = set() for file in media: archive_name = deps.archive(file) archive_version = deps.version(file) archives.add((archive_name, archive_version)) archive_names.add(archive_name) # collect all files that will be extracted, # if we have more files than archives if len(deps.files) > len(deps.archives): files = list() for file in deps.media: archive = deps.archive(file) if archive in archive_names: files.append(file) media = files # create folder tree to avoid race condition # in os.makedirs when files are unpacked # using multi-processing for file in media: audeer.mkdir(os.path.dirname(os.path.join(db_root, file))) audeer.mkdir(os.path.dirname(os.path.join(db_root_tmp, file))) def job(archive: str, version: str): archive = backend.join( name, define.DEPEND_TYPE_NAMES[define.DependType.MEDIA], archive, ) # extract and move all files that are stored in the archive, # even if only a single file from the archive was requested files = backend.get_archive(archive, db_root_tmp, version) for file in files: if flavor is not None: bit_depth = deps.bit_depth(file) channels = deps.channels(file) sampling_rate = deps.sampling_rate(file) src_path = os.path.join(db_root_tmp, file) file = flavor.destination(file) dst_path = os.path.join(db_root_tmp, file) flavor( src_path, dst_path, src_bit_depth=bit_depth, src_channels=channels, src_sampling_rate=sampling_rate, ) if src_path != dst_path: os.remove(src_path) _move_file(db_root_tmp, db_root, file) audeer.run_tasks( job, params=[([archive, version], {}) for archive, version in archives], num_workers=num_workers, progress_bar=verbose, task_description='Load media', )