def samples(file: str) -> int:
    """Number of samples in audio file.

    Files whose extension is listed in ``SNDFORMATS``
    are inspected directly with ``soundfile``.
    All other files are first converted to a temporary WAV file,
    which is inspected instead.

    Args:
        file: file name of input audio file

    Returns:
        number of samples in audio file

    Raises:
        RuntimeError: if ``file`` is broken or not a supported format

    """
    def samples_as_int(file):
        # Read the header only once and reuse it for both values
        info = soundfile.info(file)
        return int(info.duration * info.samplerate)

    file = audeer.safe_path(file)
    if file_extension(file) in SNDFORMATS:
        return samples_as_int(file)

    # Always convert to WAV for non SNDFORMATS
    with tempfile.TemporaryDirectory(prefix='audiofile') as tmpdir:
        tmpfile = os.path.join(tmpdir, 'tmp.wav')
        convert_to_wav(file, tmpfile)
        return samples_as_int(tmpfile)
def get_file(
        self,
        src_path: str,
        dst_path: str,
        version: str,
        *,
        ext: str = None,
) -> str:
    r"""Get file from backend.

    The backend path is resolved with ``self.path()``.
    The parent folder of ``dst_path`` is passed to ``audeer.mkdir``
    before the file is fetched.

    Args:
        src_path: path to file on backend
        dst_path: destination path to local file
        version: version string
        ext: file extension, if ``None`` uses characters after last dot

    Returns:
        full path to local file

    Raises:
        FileNotFoundError: if file does not exist on backend

    """
    src_path = self.path(src_path, version, ext=ext)
    if not self._exists(src_path):
        raise FileNotFoundError(
            errno.ENOENT,
            os.strerror(errno.ENOENT),
            src_path,
        )

    dst_path = audeer.safe_path(dst_path)
    audeer.mkdir(os.path.dirname(dst_path))
    self._get_file(src_path, dst_path)
    # The documented contract is to return the local path;
    # previously the function fell through and returned None
    return dst_path
def default_cache_root(
        shared=False,
) -> str:
    r"""Default cache folder.

    For ``shared=True`` the environment variable
    ``AUDB_SHARED_CACHE_ROOT`` is consulted first
    and ``audb.config.SHARED_CACHE_ROOT`` serves as fallback.
    For ``shared=False`` the same is done with
    ``AUDB_CACHE_ROOT`` and ``audb.config.CACHE_ROOT``.

    Args:
        shared: if ``True`` returns path to shared cache folder

    Returns:
        path normalized by :func:`audeer.safe_path`

    """
    variable = 'AUDB_SHARED_CACHE_ROOT' if shared else 'AUDB_CACHE_ROOT'
    root = os.environ.get(variable)
    if not root:
        # Unset or empty environment variable, use the configured value
        root = config.SHARED_CACHE_ROOT if shared else config.CACHE_ROOT
    return audeer.safe_path(root)
def to_yaml(
        self,
        path_or_stream: typing.Union[str, typing.IO],
        *,
        include_version: bool = True,
):
    r"""Save object to YAML file.

    A string is treated as a file path:
    its parent folder is passed to ``audeer.mkdir``,
    the file is opened for writing
    and the method calls itself with the open stream.
    Anything else is treated as a stream
    and receives the YAML dump of ``self.to_dict()``.

    Args:
        path_or_stream: file path or stream
        include_version: add version to class name

    """
    if not isinstance(path_or_stream, str):
        # NOTE(review): the stream has to provide a ``name`` attribute,
        # its folder is handed to ``to_dict()`` as ``root``
        stream_root = os.path.dirname(path_or_stream.name)
        content = self.to_dict(
            include_version=include_version,
            root=stream_root,
        )
        return yaml.dump(content, path_or_stream)

    target = audeer.safe_path(path_or_stream)
    audeer.mkdir(os.path.dirname(target))
    with open(target, 'w') as stream:
        return self.to_yaml(stream, include_version=include_version)
def sampling_rate(file: str) -> int:
    """Sampling rate of audio file.

    Files with an extension in ``SNDFORMATS`` are handled by ``soundfile``.
    For other files ``sox`` is asked first,
    and ``mediainfo`` is used if ``sox`` fails.

    Args:
        file: file name of input audio file

    Returns:
        sampling rate of audio file

    Raises:
        RuntimeError: if ``file`` is broken or not a supported format

    """
    file = audeer.safe_path(file)
    if file_extension(file) in SNDFORMATS:
        return soundfile.info(file).samplerate

    try:
        return int(sox.file_info.sample_rate(file))
    except sox.core.SoxiError:
        reported = run(f'mediainfo --Inform="Audio;%SamplingRate%" "{file}"')
        if not reported:
            # mediainfo had nothing to report either
            raise RuntimeError(broken_file_error(file))
        return int(reported)
def channels(file: str) -> int:
    """Number of channels in audio file.

    Files with an extension in ``SNDFORMATS`` are handled by ``soundfile``.
    For other files ``sox`` is asked first.
    If ``sox`` fails, two ``mediainfo`` queries are tried in turn.

    Args:
        file: file name of input audio file

    Returns:
        number of channels in audio file

    Raises:
        RuntimeError: if ``file`` is broken or not a supported format

    """
    file = audeer.safe_path(file)
    if file_extension(file) in SNDFORMATS:
        return soundfile.info(file).channels

    try:
        return int(sox.file_info.channels(file))
    except sox.core.SoxiError:
        # For MP4 stored and returned number of channels can be different
        query_original = (
            f'mediainfo --Inform="Audio;%Channel(s)_Original%" "{file}"'
        )
        query_default = f'mediainfo --Inform="Audio;%Channel(s)%" "{file}"'
        try:
            return int(run(query_original))
        except ValueError:
            try:
                return int(run(query_default))
            except ValueError:
                # Neither query returned something that parses as integer
                raise RuntimeError(broken_file_error(file))
def test_safe_path(path):
    """Compare ``audeer.safe_path`` with an ``os.path`` based expectation."""
    # Falsy input is expected to map to the empty string
    expected = os.path.abspath(os.path.expanduser(path)) if path else ''
    if isinstance(expected, bytes):
        expected = expected.decode('utf8')
    result = audeer.safe_path(path)
    assert result == expected
    assert type(result) is str
def md5(
        file: str,
        chunk_size: int = 8192,
) -> str:
    r"""Create MD5 checksum.

    The file is opened in binary mode
    and fed to the hasher in the chunks
    yielded by ``md5_read_chunk``.

    Args:
        file: path to file
        chunk_size: value forwarded to ``md5_read_chunk``

    Returns:
        hexadecimal MD5 digest

    """
    digest = hashlib.md5()
    with open(audeer.safe_path(file), 'rb') as stream:
        for block in md5_read_chunk(stream, chunk_size):
            digest.update(block)
    return digest.hexdigest()
def dependencies(
        name: str,
        *,
        version: str = None,
        cache_root: str = None,
) -> Dependencies:
    r"""Database dependencies.

    The dependency file is looked up in the cache first.
    If it is missing there,
    the database archive is fetched from the backend into a temporary folder,
    the dependencies are loaded from it
    and a copy is saved in the cache.

    Args:
        name: name of database
        version: version string
        cache_root: cache folder where databases are stored.
            If not set :meth:`audb.default_cache_root` is used

    Returns:
        dependency object

    """
    if version is None:
        version = latest_version(name)

    if cache_root is None:
        candidates = [
            default_cache_root(True),  # check shared cache first
            default_cache_root(False),
        ]
    else:
        candidates = [cache_root]

    # Stop at the first existing folder,
    # otherwise the last candidate is used
    for candidate in candidates:
        deps_root = audeer.safe_path(os.path.join(candidate, name, version))
        if os.path.exists(deps_root):
            break
    audeer.mkdir(deps_root)

    deps_path = os.path.join(deps_root, define.CACHED_DEPENDENCIES_FILE)
    deps = Dependencies()

    if os.path.exists(deps_path):
        deps.load(deps_path)
        return deps

    backend = lookup_backend(name, version)
    with tempfile.TemporaryDirectory() as tmp_root:
        archive = backend.join(name, define.DB)
        backend.get_archive(archive, tmp_root, version)
        deps.load(os.path.join(tmp_root, define.DEPENDENCIES_FILE))
        deps.save(deps_path)
    return deps
def duration(file: str, sloppy=False) -> float:
    """Duration in seconds of audio file.

    The default behavior (``sloppy=False``)
    ensures the duration in seconds
    matches the one in samples.
    To achieve this it first decodes files to WAV
    if needed, e.g. MP3 files.
    If you have different decoders
    on different machines,
    results might differ.

    The case ``sloppy=True`` returns the duration
    as reported in the header of the audio file.
    This is faster,
    but might still return different results
    on different machines
    as it depends on the installed software.
    If no duration information is provided in the header
    it will fall back to ``sloppy=False``.

    Args:
        file: file name of input audio file
        sloppy: if ``True`` report duration
            as stored in the header

    Returns:
        duration in seconds of audio file

    Raises:
        RuntimeError: if ``file`` is broken or not a supported format

    """
    file = audeer.safe_path(file)
    if file_extension(file) in SNDFORMATS:
        return soundfile.info(file).duration

    if sloppy:
        try:
            from_header = sox.file_info.duration(file)
            return 0.0 if from_header is None else from_header
        except sox.core.SoxiError:
            reported = run(f'mediainfo --Inform="Audio;%Duration%" "{file}"')
            if reported:
                # Convert to seconds, as mediainfo returns milliseconds
                return float(reported) / 1000

    # Exact duration, derived from number of samples and sampling rate
    return samples(file) / sampling_rate(file)
def __init__(
        self,
        feature_set: typing.Union[str, FeatureSet] = FeatureSet.ComParE_2016,
        feature_level: typing.Union[str, FeatureLevel] =
        FeatureLevel.Functionals,
        *,
        options: dict = None,
        loglevel: int = 2,
        logfile: str = None,
        sampling_rate: int = None,
        channels: typing.Union[int, typing.Sequence[int]] = 0,
        mixdown: bool = False,
        resample: bool = False,
        segment: audinterface.Segment = None,
        keep_nat: bool = False,
        num_workers: typing.Optional[int] = 1,
        multiprocessing: bool = False,
        verbose: bool = False,
):
    # Store the extractor specific settings on the instance first,
    # ``self._feature_names()`` is evaluated further down
    # when the parent constructor is called.
    # NOTE(review): presumably ``_feature_names()`` reads these attributes,
    # which would make the order of the statements matter - confirm
    self.feature_level = feature_level
    r"""Standard feature level or sink level in custom config file."""
    self.feature_set = feature_set
    r"""Standard feature set or path to custom config file"""
    # ``None`` is replaced by a new empty dictionary
    self.options = options or {}
    r"""Dictionary with options"""
    # Normalize the log file path only if one was given
    self.logfile = audeer.safe_path(logfile) if logfile else None
    r"""Log file"""
    self.loglevel = loglevel
    r"""Log level"""
    self.verbose = verbose

    self._check_deltas_available()

    # All remaining arguments are passed on unchanged to the parent class
    super().__init__(
        self._feature_names(),
        name='smile',
        params=None,
        process_func=self._extract,
        num_workers=num_workers,
        sampling_rate=sampling_rate,
        resample=resample,
        channels=channels,
        mixdown=mixdown,
        segment=segment,
        keep_nat=keep_nat,
        multiprocessing=multiprocessing,
        verbose=verbose,
    )
    # ``params`` is passed as ``None`` above
    # and assigned from ``to_dict()`` after the parent is initialized
    self.params = self.to_dict(flatten=True)

    self._check_deprecated()
def save(self, path: str):
    r"""Write dependencies to file.

    The table is written with ``to_csv()``
    if the path ends with ``csv``,
    and with ``to_pickle()``
    if it ends with ``pkl``.

    Args:
        path: path to file.
            File extension can be ``csv`` or ``pkl``.

    Raises:
        ValueError: if file extension is not ``csv`` or ``pkl``

    """
    path = audeer.safe_path(path)
    if path.endswith('csv'):
        self._df.to_csv(path)
    elif path.endswith('pkl'):
        self._df.to_pickle(path)
    else:
        # Fail loudly instead of silently writing nothing
        raise ValueError(
            f"File extension of 'path' has to be 'csv' or 'pkl', "
            f"but 'path' is '{path}'"
        )
def test_safe_path_symlinks(tmpdir):
    """Check ``audeer.safe_path`` on a symbolic link.

    The expectation is built with ``os.path.realpath``.
    Drive letters are removed from both paths before comparing.

    """
    filename = 'file.txt'
    linkname = 'link.txt'

    dir_tmp = tmpdir.mkdir('folder')
    dir_tmp.join(filename).write('')
    folder = audeer.mkdir(str(dir_tmp))

    target = os.path.join(folder, filename)
    link = os.path.join(folder, linkname)
    os.symlink(target, link)

    expected = os.path.realpath(os.path.expanduser(link))
    resolved = audeer.safe_path(link)
    _, resolved = os.path.splitdrive(resolved)
    _, expected = os.path.splitdrive(expected)

    assert resolved == expected
    assert type(resolved) is str
def put_archive(
        self,
        src_root: str,
        files: typing.Union[str, typing.Sequence[str]],
        dst_path: str,
        version: str,
) -> str:
    r"""Create archive and put on backend.

    The operation is silently skipped,
    if an archive with the same checksum
    already exists on the backend.

    Args:
        src_root: local root directory where files are located.
            Only folders and files below ``src_root``
            will be included into the archive
        files: relative path to file(s) from ``src_root``
        dst_path: path to archive on backend without extension,
            e.g. ``media/archive1``
        version: version string

    Returns:
        archive path on backend

    Raises:
        FileNotFoundError: if one or more files do not exist

    """
    src_root = audeer.safe_path(src_root)

    # A single file name is handled like a list with one entry
    if isinstance(files, str):
        files = [files]

    # Make sure every file exists before anything is archived
    for relative in files:
        candidate = os.path.join(src_root, relative)
        if os.path.exists(candidate):
            continue
        raise FileNotFoundError(
            errno.ENOENT,
            os.strerror(errno.ENOENT),
            candidate,
        )

    # The ZIP file is created in a temporary folder and uploaded from there
    with tempfile.TemporaryDirectory() as tmp:
        _, archive_name = self.split(dst_path)
        local_archive = os.path.join(tmp, f'{archive_name}-{version}.zip')
        audeer.create_archive(src_root, files, local_archive)
        return self.put_file(local_archive, dst_path + '.zip', version)
def load(self, path: str):
    r"""Read dependencies from file.

    Clears existing dependencies.
    This happens before ``path`` is validated,
    so the table is empty
    if one of the errors below is raised.

    Args:
        path: path to file.
            File extension can be ``csv`` or ``pkl``.

    Raises:
        ValueError: if file extension is not ``csv`` or ``pkl``
        FileNotFoundError: if ``path`` does not exists

    """
    self._df = pd.DataFrame(columns=define.DEPEND_FIELD_NAMES.values())

    path = audeer.safe_path(path)
    extension = audeer.file_extension(path)
    if extension not in ['csv', 'pkl']:
        raise ValueError(
            f"File extension of 'path' has to be 'csv' or 'pkl' "
            f"not '{extension}'"
        )
    if not os.path.exists(path):
        raise FileNotFoundError(
            errno.ENOENT,
            os.strerror(errno.ENOENT),
            path,
        )

    if extension == 'pkl':
        # NOTE(review): unpickling executes arbitrary code,
        # only load files from trusted sources
        self._df = pd.read_pickle(path)
        return

    # CSV: the first column holds the index and is read as string,
    # the data type of the dependency columns is taken from ``define``
    index = 0
    dtype_mapping = dict(
        zip(
            define.DEPEND_FIELD_NAMES.values(),
            define.DEPEND_FIELD_DTYPES.values(),
        )
    )
    dtype_mapping[index] = str
    self._df = pd.read_csv(
        path,
        index_col=index,
        na_filter=False,
        dtype=dtype_mapping,
    )
def test_list_file_names(tmpdir, files, path, filetype, file_list):
    """Compare ``audeer.list_file_names`` with the expected file list."""
    dir_tmp = tmpdir.mkdir('folder')
    dir_tmp.mkdir('subfolder')
    path = os.path.join(str(dir_tmp), path)

    # Create the files
    for name in files:
        dir_tmp.join(name).write('')

    if not os.path.isdir(path):
        # A path that is no folder is expected back as the only entry
        expected = [path]
    else:
        expected = [
            audeer.safe_path(os.path.join(path, name))
            for name in file_list
        ]

    listed = audeer.list_file_names(path, filetype=filetype)
    assert listed == expected
    assert type(listed) is list
def bit_depth(file: str) -> typing.Optional[int]:
    r"""Bit depth of audio file.

    For lossy audio files,
    ``None`` is returned as they have a varying bit depth.
    In the code this applies to every file
    whose extension is neither ``wav`` nor ``flac``.

    Args:
        file: file name of input audio file

    Returns:
        bit depth of audio file

    Raises:
        RuntimeError: if ``file`` is broken or not a supported format

    """
    # Bit depth per subtype reported by soundfile, grouped by file extension
    precision_mappings = {
        'wav': {
            'PCM_16': 16,
            'PCM_24': 24,
            'PCM_32': 32,
            'PCM_U8': 8,
            'FLOAT': 32,
            'DOUBLE': 64,
            'ULAW': 8,
            'ALAW': 8,
            'IMA_ADPCM': 4,
            'MS_ADPCM': 4,
            'GSM610': 16,  # not sure if this could be variable?
            'G721_32': 4,  # not sure if correct
        },
        'flac': {
            'PCM_16': 16,
            'PCM_24': 24,
            'PCM_32': 32,
            'PCM_S8': 8,
        },
    }

    file = audeer.safe_path(file)
    file_type = file_extension(file)
    if file_type not in precision_mappings:
        return None
    return precision_mappings[file_type][soundfile.info(file).subtype]
def read_audio(
        path: str,
        start: pd.Timedelta = None,
        end: pd.Timedelta = None,
        channel: int = None,
) -> typing.Tuple[np.ndarray, int]:  # pragma: no cover
    """Reads (segment of an) audio file.

    A missing or NaT ``start`` reads from the beginning,
    a missing or NaT ``end`` reads until the end of the file.

    Args:
        path: path to audio file
        start: read from this position
        end: read until this position
        channel: channel number

    Returns:
        signal: array with signal values in shape ``(channels, samples)``.
            If ``channel`` is given,
            the signal is indexed with ``signal[channel, :]``
        sampling_rate: sampling rate in Hz

    Raises:
        ValueError: if ``channel`` is outside of the available channels

    """
    has_start = not (start is None or pd.isna(start))
    has_end = not (end is None or pd.isna(end))
    offset = start.total_seconds() if has_start else 0
    duration = end.total_seconds() - offset if has_end else None

    # load raw audio
    signal, sampling_rate = af.read(
        audeer.safe_path(path),
        always_2d=True,
        offset=offset,
        duration=duration,
    )

    # select channel
    if channel is not None:
        if channel < 0 or channel >= signal.shape[0]:
            raise ValueError(f'We need 0<=channel<{signal.shape[0]}, '
                             f'but we have channel={channel}.')
        signal = signal[channel, :]

    return signal, sampling_rate
def __init__(
        self,
        feature_set: typing.Union[str, FeatureSet] = FeatureSet.ComParE_2016,
        feature_level: typing.Union[str, FeatureLevel] =
        FeatureLevel.Functionals,
        *,
        options: dict = None,
        loglevel: int = 2,
        logfile: str = None,
        num_channels: int = 1,
        keep_nat: bool = False,
        num_workers: typing.Optional[int] = 1,
        verbose: bool = False,
):
    # Store the extractor specific settings on the instance first,
    # ``self._feature_names()`` is evaluated further down
    # when the parent constructor is called.
    # NOTE(review): presumably ``_feature_names()`` reads these attributes,
    # which would make the order of the statements matter - confirm
    self.feature_level = feature_level
    r"""Standard feature level or sink level in custom config file."""
    self.feature_set = feature_set
    r"""Standard feature set or path to custom config file"""
    # ``None`` is replaced by a new empty dictionary
    self.options = options or {}
    r"""Dictionary with options"""
    # Normalize the log file path only if one was given
    self.logfile = audeer.safe_path(logfile) if logfile else None
    r"""Log file"""
    self.loglevel = loglevel
    r"""Log level"""
    self.verbose = verbose

    self._check_deltas_available()

    # ``multiprocessing`` is always enabled here,
    # it is not exposed as an argument of this constructor
    super().__init__(
        self._feature_names(),
        process_func=self._extract,
        num_workers=num_workers,
        num_channels=num_channels,
        keep_nat=keep_nat,
        multiprocessing=True,
        verbose=verbose,
    )

    # Internal state, empty until it is assigned elsewhere in the class
    self._y = None
    self._starts = None
    self._ends = None

    self._check_deprecated()
def database_cache_folder(
        name: str,
        version: str,
        cache_root: str = None,
        flavor: Flavor = None,
) -> str:
    r"""Create and return database cache folder.

    Without ``cache_root`` the shared cache is checked first,
    followed by the default cache.
    The first folder that already exists is returned.
    If none exists, the last candidate is created.

    Args:
        name: name of database
        version: version of database
        cache_root: path to cache folder
        flavor: flavor of database

    Returns:
        path to cache folder

    """
    if cache_root is not None:
        candidates = [cache_root]
    else:
        candidates = [
            default_cache_root(True),  # check shared cache first
            default_cache_root(False),
        ]

    for candidate in candidates:
        # Without flavor the folder is <root>/<name>/<version>,
        # otherwise the flavor provides the sub-folder
        if flavor is None:
            parts = (name, version)
        else:
            parts = (flavor.path(name, version),)
        db_root = audeer.safe_path(os.path.join(candidate, *parts))
        if os.path.exists(db_root):
            break

    audeer.mkdir(db_root)
    return db_root
def config_path(self) -> str:
    r"""Return file path of config file.

    A :class:`FeatureSet` member is mapped to a file
    inside ``self.default_config_root``,
    any other value is treated as path to a config file.

    Raises:
        FileNotFoundError: if the config file does not exist

    """
    if type(self.feature_set) is not FeatureSet:
        # Custom config file
        resolved = audeer.safe_path(self.feature_set)
    else:
        # Config file of a standard feature set
        file_name = self.feature_set.value + config.CONFIG_EXT
        resolved = os.path.join(self.default_config_root, file_name)

    if os.path.exists(resolved):
        return resolved
    raise FileNotFoundError(
        errno.ENOENT,
        os.strerror(errno.ENOENT),
        resolved,
    )
def test_download(
        tmpdir,
        url,
        destination,
        force_download,
        expected_path,
):
    """Download ``url`` into a temporary folder and check the local file."""
    cache = str(tmpdir.mkdir('audfactory'))
    target = audeer.safe_path(os.path.join(cache, destination))
    local_file = audfactory.download(
        url,
        target,
        chunk=4 * 1024,
        force_download=force_download,
        verbose=False,
    )
    assert os.path.exists(local_file)
    assert os.path.basename(local_file) == expected_path
def decode(self, value: str) -> str:
    r"""Decode file path.

    If object is read from a file,
    this will convert a relative file path
    to an absolute path by expanding it
    with the source directory.
    Without a root folder the value is returned unchanged.

    Args:
        value: relative file path

    Returns:
        expanded file path

    """
    if self.root is None:
        return value
    root = self.root
    return audeer.safe_path(os.path.join(root, value))
def read_audio(
        file: str,
        *,
        start: pd.Timedelta = None,
        end: pd.Timedelta = None,
        root: str = None,
) -> typing.Tuple[np.ndarray, int]:
    """Reads (segment of an) audio file.

    A relative ``file`` is joined with ``root`` if ``root`` is given.
    A missing or NaT ``start`` reads from the beginning,
    a missing or NaT ``end`` reads until the end of the file.

    Args:
        file: path to audio file
        start: read from this position
        end: read until this position
        root: root folder

    Returns:
        signal: array with signal values in shape ``(channels, samples)``
        sampling_rate: sampling rate in Hz

    """
    if root is not None and not os.path.isabs(file):
        file = os.path.join(root, file)

    has_start = not (start is None or pd.isna(start))
    has_end = not (end is None or pd.isna(end))
    offset = start.total_seconds() if has_start else 0
    duration = end.total_seconds() - offset if has_end else None

    return af.read(
        audeer.safe_path(file),
        always_2d=True,
        offset=offset,
        duration=duration,
    )
def checksum(path, type='md5') -> str:
    r"""Calculate checksum for local or remote file.

    A path starting with ``http`` is treated as URL
    and its checksum is read from the file information on Artifactory.
    Every other path is treated as a local file
    and its checksum is calculated.

    Args:
        path: local file path,
            or URL to file path on Artifactory
        type: checksum type to calculate,
            one of ``'md5'``, ``'sha1'``, ``'sha256'``

    Returns:
        checksum

    Raises:
        ValueError: if ``type`` is not a supported checksum type
        RuntimeError: if ``path`` does not exist

    Example:
        >>> checksum(
        ...     'https://audeering.jfrog.io/artifactory/'
        ...     'data-public/emodb/db/1.1.0/db-1.1.0.zip'
        ... )
        'f4cfdbc821a070e1163d225b72b241a7'

    """
    # Reject unknown types up front,
    # before they returned ``None`` without any hint
    if type not in ('md5', 'sha1', 'sha256'):
        raise ValueError(
            f"'type' has to be one of 'md5', 'sha1', 'sha256', "
            f"not '{type}'"
        )

    if path.startswith('http'):
        path = _path(path)
        if not path.exists():
            raise RuntimeError(f'File not found: {path}')
        stat = ArtifactoryPath.stat(path)
        if type == 'md5':
            return stat.md5
        if type == 'sha1':
            return stat.sha1
        return stat.sha256

    path = audeer.safe_path(path)
    if not os.path.exists(path):
        raise RuntimeError(f'File not found: {path}')
    if type == 'md5':
        return md5sum(path)
    if type == 'sha1':
        return sha1sum(path)
    return sha256sum(path)
def test_custom(config, level):
    """Extract features with a custom config from file and from array."""
    # create feature extractor
    extractor = opensmile.Smile(config, level)

    # extract from file
    from_file = extractor.process_file(pytest.WAV_FILE)

    # extract from array
    signal, rate = audiofile.read(pytest.WAV_FILE)
    from_array = extractor.process_signal(
        signal,
        rate,
        file=pytest.WAV_FILE,
    )

    # assertions
    assert extractor.config_name == audeer.basename_wo_ext(config)
    assert extractor.config_path == audeer.safe_path(config)
    assert extractor.num_features == len(extractor.feature_names)
    assert extractor.feature_names == from_file.columns.to_list()
    pd.testing.assert_frame_equal(from_file, from_array)
def test_checksum(tmpdir):
    """Check checksums of missing, remote and downloaded files."""
    # Missing local file
    with pytest.raises(RuntimeError, match=r'File not found:'):
        audfactory.checksum('file-not-found.txt')

    # Missing remote file
    with pytest.raises(RuntimeError, match=r'File not found:'):
        url = f'{SERVER}/{REPOSITORY}/file-not-found.txt'
        audfactory.checksum(url)

    # Remote file and its downloaded copy have to agree
    url = (
        f'{SERVER}/{REPOSITORY}/{GROUP_ID_URL}/'
        f'{NAME}/{VERSION}/{FILENAME}.zip'
    )
    cache = str(tmpdir.mkdir('audfactory'))
    destination = audeer.safe_path(cache)
    path = audfactory.download(url, destination)
    for checksum_type in ('md5', 'sha1', 'sha256'):
        remote = audfactory.checksum(url, type=checksum_type)
        local = audfactory.checksum(path, type=checksum_type)
        assert remote == local
def clear_root(root: str):
    """Remove folder ``root`` with all its content if it exists.

    Args:
        root: path to folder

    """
    root = audeer.safe_path(root)
    if not os.path.exists(root):
        return
    shutil.rmtree(root)
# Shared test resources, stored as attributes of the pytest module
pytest.ROOT = os.path.dirname(os.path.realpath(__file__))
pytest.WAV_FILE = os.path.join(pytest.ROOT, 'test.wav')
pytest.WAV_ARRAY, pytest.WAV_SR = af.read(pytest.WAV_FILE, always_2d=True)
pytest.FRAME_LIST_STARTS = pd.to_timedelta(['1.0s', '3.0s', '4.0s'])
pytest.FRAME_LIST_ENDS = pd.to_timedelta(['1.5s', '3.5s', '5.0s'])
pytest.CONFIG_FILE = os.path.join(pytest.ROOT, 'test.conf')

# Name of the platform specific folder that holds the SMILExtract binary,
# every platform besides Windows and macOS uses the Linux folder
platform = {'win32': 'win', 'darwin': 'osx'}.get(sys.platform, 'linux')
pytest.SMILEXTRACT = audeer.safe_path(
    os.path.join(
        pytest.ROOT,
        '..',
        'opensmile',
        'core',
        'bin',
        platform,
        'SMILExtract',
    )
)


@pytest.fixture(scope='session', autouse=True)
def fixture_clean_session():
    """Delete ``.coverage.*`` files before and after the test session."""
    def clean():
        pattern = os.path.join(pytest.ROOT, '..', '.coverage.*')
        for leftover in glob.glob(pattern):
            os.remove(leftover)

    clean()
    yield
    clean()
import glob
import os
import shutil

import pytest

import audb
import audeer


# Shared test configuration, stored as attributes of the pytest module

# All paths used by the tests live below a ``tmp`` folder
# located next to this file
pytest.ROOT = audeer.safe_path(
    os.path.join(
        os.path.dirname(os.path.realpath(__file__)),
        'tmp',
    )
)

pytest.BACKEND = 'file-system'
pytest.CACHE_ROOT = os.path.join(pytest.ROOT, 'cache')
pytest.FILE_SYSTEM_HOST = os.path.join(pytest.ROOT, 'repo')
# Identifier generated anew by ``audeer.uid()`` on every import
pytest.ID = audeer.uid()
pytest.NUM_WORKERS = 5
pytest.REPOSITORY_NAME = 'data-unittests-local'
# A single repository, hosted in the ``repo`` folder below ``pytest.ROOT``
pytest.REPOSITORIES = [
    audb.Repository(
        name=pytest.REPOSITORY_NAME,
        host=pytest.FILE_SYSTEM_HOST,
        backend=pytest.BACKEND,
    ),
]
# The first (and only) repository is the one used for publishing
pytest.PUBLISH_REPOSITORY = pytest.REPOSITORIES[0]
pytest.SHARED_CACHE_ROOT = os.path.join(pytest.ROOT, 'shared')