Beispiel #1
0
def formats(
    name: str,
    *,
    version: str = None,
    cache_root: str = None,
) -> typing.Set[str]:
    """Media formats.

    Args:
        name: name of database
        version: version of database
        cache_root: cache folder where databases are stored.
            If not set :meth:`audb.default_cache_root` is used

    Returns:
        format

    Example:
        >>> formats('emodb', version='1.1.1')
        {'wav'}

    """
    deps = dependencies(name, version=version, cache_root=cache_root)
    # dependency table as dataframe; keep media rows only
    table = deps()
    is_media = table.type == define.DependType.MEDIA
    return set(table[is_media].format)
Beispiel #2
0
def channels(
    name: str,
    *,
    version: str = None,
    cache_root: str = None,
) -> typing.Set[int]:
    """Media channels.

    Args:
        name: name of database
        version: version of database
        cache_root: cache folder where databases are stored.
            If not set :meth:`audb.default_cache_root` is used

    Returns:
        channel numbers

    Example:
        >>> channels('emodb', version='1.1.1')
        {1}

    """
    deps = dependencies(name, version=version, cache_root=cache_root)
    # dependency table as dataframe; keep media rows only
    table = deps()
    is_media = table.type == define.DependType.MEDIA
    return set(table[is_media].channels)
Beispiel #3
0
def duration(
    name: str,
    *,
    version: str = None,
    cache_root: str = None,
) -> pd.Timedelta:
    """Total media duration.

    Args:
        name: name of database
        version: version of database
        cache_root: cache folder where databases are stored.
            If not set :meth:`audb.default_cache_root` is used

    Returns:
        duration

    Example:
        >>> duration('emodb', version='1.1.1')
        Timedelta('0 days 00:24:47.092187500')

    """
    deps = dependencies(name, version=version, cache_root=cache_root)
    table = deps()
    # durations are stored in seconds per media file
    media_rows = table[table.type == define.DependType.MEDIA]
    total_seconds = media_rows.duration.sum()
    return pd.to_timedelta(total_seconds, unit='s')
Beispiel #4
0
def _cached_versions(
    name: str,
    version: str,
    flavor: Flavor,
    cache_root: typing.Optional[str],
) -> typing.Sequence[typing.Tuple[LooseVersion, str, Dependencies]]:
    r"""Find other cached versions of same flavor."""

    df = cached(cache_root=cache_root)
    # Without an explicit cache root
    # inspect the shared cache as well.
    # This fixes https://github.com/audeering/audb/issues/101
    if cache_root is None and os.path.exists(default_cache_root(shared=True)):
        df = pd.concat((df, cached(shared=True)))

    df = df[df.name == name]

    matches = []
    for flavor_root, row in df.iterrows():
        # skip entries of a different flavor
        # and the requested version itself
        if row['flavor_id'] != flavor.short_id:
            continue
        if row['version'] == version:
            continue
        deps = dependencies(
            name,
            version=row['version'],
            cache_root=cache_root,
        )
        # as it is more likely we find files
        # in newer versions, push them to front
        matches.insert(
            0,
            (LooseVersion(row['version']), flavor_root, deps),
        )

    return matches
Beispiel #5
0
def publish(
    db_root: str,
    version: str,
    repository: Repository,
    *,
    archives: typing.Mapping[str, str] = None,
    previous_version: typing.Optional[str] = 'latest',
    cache_root: str = None,
    num_workers: typing.Optional[int] = 1,
    verbose: bool = True,
) -> Dependencies:
    r"""Publish database.

    A database can have dependencies
    to files of an older version of itself.
    E.g. you might add a few new files to an existing database
    and publish as a new version.
    :func:`audb.publish` will upload then only the new files
    and store dependencies on the already published files.

    To allow for dependencies
    you first have to load the version of the database
    that the new version should depend on
    with :func:`audb.load_to` to ``db_root``.
    Afterwards you make your changes to that folder
    and run :func:`audb.publish`.
    :func:`audb.publish` will then check
    that the version of the files inside that folder
    match the version given by ``previous_version``.

    Setting ``previous_version=None`` allows you
    to start from scratch and upload all files
    even if older versions exist.
    In this case you don't call :func:`audb.load_to`
    before running :func:`audb.publish`.

    Args:
        db_root: root directory of database
        version: version string
        repository: name of repository
        archives: dictionary mapping files to archive names.
            Can be used to bundle files into archives.
            Archive name must not include an extension
        previous_version: specifies the version
            this publication should be based on.
            If ``'latest'``
            it will use automatically the latest published version
            or ``None``
            if no version was published.
            If ``None`` it assumes you start from scratch.
        cache_root: cache folder where databases are stored.
            If not set :meth:`audb.default_cache_root` is used.
            Only used to read the dependencies of the previous version
        num_workers: number of parallel jobs or 1 for sequential
            processing. If ``None`` will be set to the number of
            processors on the machine multiplied by 5
        verbose: show debug messages

    Returns:
        dependency object

    Raises:
        RuntimeError: if version already exists
        RuntimeError: if database tables reference non-existing files
        RuntimeError: if database in ``db_root`` depends on other version
            as indicated by ``previous_version``
        RuntimeError: if database is not portable,
            see :meth:`audformat.Database.is_portable`

    """
    # header only; full database is loaded later,
    # after the dependency checks passed
    db = audformat.Database.load(db_root, load_data=False)

    backend = audbackend.create(
        repository.backend,
        repository.host,
        repository.name,
    )

    remote_header = backend.join(db.name, define.HEADER_FILE)
    versions = backend.versions(remote_header)
    if version in versions:
        raise RuntimeError('A version '
                           f"'{version}' "
                           'already exists for database '
                           f"'{db.name}'.")
    if previous_version == 'latest':
        if len(versions) > 0:
            previous_version = versions[-1]
        else:
            previous_version = None

    # load database and dependencies
    deps_path = os.path.join(db_root, define.DEPENDENCIES_FILE)
    deps = Dependencies()
    if os.path.exists(deps_path):
        deps.load(deps_path)

    # check if database folder depends on the right version

    # dependencies shouldn't be there
    if previous_version is None and len(deps) > 0:
        raise RuntimeError(
            f"You did not set a dependency to a previous version, "
            f"but you have a '{define.DEPENDENCIES_FILE}' file present "
            f"in {db_root}.")

    # dependencies missing
    if previous_version is not None and len(deps) == 0:
        raise RuntimeError(
            f"You want to depend on '{previous_version}' "
            f"of {db.name}, "
            f"but you don't have a '{define.DEPENDENCIES_FILE}' file present "
            f"in {db_root}. "
            f"Did you forget to call "
            f"'audb.load_to({db_root}, {db.name}, "
            f"version='{previous_version}')'?")

    # dependencies do not match version
    if previous_version is not None and len(deps) > 0:
        with tempfile.TemporaryDirectory() as tmp_dir:
            previous_deps_path = os.path.join(
                tmp_dir,
                define.DEPENDENCIES_FILE,
            )
            previous_deps = dependencies(
                db.name,
                version=previous_version,
                cache_root=cache_root,
            )
            previous_deps.save(previous_deps_path)
            if audbackend.md5(deps_path) != audbackend.md5(previous_deps_path):
                raise RuntimeError(
                    f"You want to depend on '{previous_version}' "
                    f"of {db.name}, "
                    f"but the MD5 sum of your "
                    f"'{define.DEPENDENCIES_FILE}' file "
                    f"in {db_root} "
                    f"does not match the MD5 sum of the corresponding file "
                    f"for the requested version in the repository. "
                    f"Did you forget to call "
                    f"'audb.load_to({db_root}, {db.name}, "
                    f"version='{previous_version}') "
                    f"or modified the file manually?")

    # load database from folder
    db = audformat.Database.load(db_root)

    if not db.is_portable:
        raise RuntimeError("Some files in the tables have absolute paths "
                           "or use '.' or '..' to address a folder. "
                           "Please replace those paths by relative paths "
                           "and use folder names instead of dots.")

    # check all files referenced in a table exists
    missing_files = [
        f for f in db.files if not os.path.exists(os.path.join(db_root, f))
    ]
    if len(missing_files) > 0:
        number_of_presented_files = 20
        error_msg = (
            f'{len(missing_files)} files are referenced in tables '
            'that cannot be found. '
            f"Missing files are: '{missing_files[:number_of_presented_files]}")
        if len(missing_files) <= number_of_presented_files:
            error_msg += "'."
        else:
            error_msg += ", ...'."
        raise RuntimeError(error_msg)

    # make sure all tables are stored in CSV format
    for table_id, table in db.tables.items():
        table_path = os.path.join(db_root, f'db.{table_id}')
        table_ext = audformat.define.TableStorageFormat.CSV
        if not os.path.exists(table_path + f'.{table_ext}'):
            table.save(table_path, storage_format=table_ext)

    # check archives
    archives = archives or {}

    # publish tables
    tables = _find_tables(db, db_root, version, deps, verbose)
    _put_tables(tables, db_root, db.name, version, backend, num_workers,
                verbose)

    # publish media
    media = _find_media(db, db_root, version, deps, archives, verbose)
    _put_media(media, db_root, db.name, version, deps, backend, num_workers,
               verbose)

    # publish dependencies and header
    deps.save(deps_path)
    archive_file = backend.join(db.name, define.DB)
    backend.put_archive(
        db_root,
        define.DEPENDENCIES_FILE,
        archive_file,
        version,
    )
    try:
        local_header = os.path.join(db_root, define.HEADER_FILE)
        remote_header = db.name + '/' + define.HEADER_FILE
        backend.put_file(local_header, remote_header, version)
    except Exception:  # pragma: no cover
        # after the header is published
        # the new version becomes visible,
        # so if something goes wrong here
        # we better clean up
        if backend.exists(remote_header, version):
            backend.remove_file(remote_header, version)
        # re-raise so the caller does not see a failed publish as success
        raise

    return deps
Beispiel #6
0
def load_to(
    root: str,
    name: str,
    *,
    version: str = None,
    cache_root: str = None,
    num_workers: typing.Optional[int] = 1,
    verbose: bool = True,
) -> audformat.Database:
    r"""Load database to directory.

    Loads the original state of the database
    to a custom directory.
    No conversion or filtering will be applied.
    If the target folder already contains
    some version of the database,
    it will upgrade to the requested version.
    Unchanged files will be skipped.

    Args:
        root: target directory
        name: name of database
        version: version string, latest if ``None``
        cache_root: cache folder where databases are stored.
            If not set :meth:`audb.default_cache_root` is used.
            Only used to read the dependencies of the requested version
        num_workers: number of parallel jobs or 1 for sequential
            processing. If ``None`` will be set to the number of
            processors on the machine multiplied by 5
        verbose: show debug messages

    Returns:
        database object

    Raises:
        RuntimeError: if the temporary folder cannot be removed
            at the end of a successful load

    """
    if version is None:
        version = latest_version(name)

    db_root = audeer.safe_path(root)
    db_root_tmp = database_tmp_folder(db_root)

    # remove files with a wrong checksum
    # to ensure we load correct version
    update = os.path.exists(db_root) and os.listdir(db_root)
    audeer.mkdir(db_root)
    deps = dependencies(name, version=version, cache_root=cache_root)
    if update:
        for file in deps.files:
            full_file = os.path.join(db_root, file)
            if os.path.exists(full_file):
                checksum = audbackend.md5(full_file)
                if checksum != deps.checksum(file):
                    os.remove(full_file)

    # load database header without tables from backend

    db_header, backend = load_header(
        db_root_tmp,
        name,
        version,
        overwrite=True,
    )

    # get altered and new tables

    db_header.save(db_root_tmp, header_only=True)
    tables = _find_tables(db_header, db_root, deps, num_workers, verbose)
    _get_tables(tables, db_root, db_root_tmp, name, deps, backend, num_workers,
                verbose)

    # load database

    # move header to root and load database ...
    _move_file(db_root_tmp, db_root, define.HEADER_FILE)
    try:
        db = audformat.Database.load(
            db_root,
            num_workers=num_workers,
            verbose=verbose,
        )
    except (KeyboardInterrupt, Exception):  # pragma: no cover
        # make sure to remove header if user interrupts
        os.remove(os.path.join(db_root, define.HEADER_FILE))
        raise
    # afterwards remove header to avoid the database
    # can be loaded before download is complete
    os.remove(os.path.join(db_root, define.HEADER_FILE))

    # get altered and new media files

    media = _find_media(db, db_root, deps, num_workers, verbose)
    _get_media(media, db_root, db_root_tmp, name, deps, backend, num_workers,
               verbose)

    # save dependencies

    dep_path_tmp = os.path.join(db_root_tmp, define.DEPENDENCIES_FILE)
    deps.save(dep_path_tmp)
    _move_file(db_root_tmp, db_root, define.DEPENDENCIES_FILE)

    # save database and remove the temporal directory
    # to signal all files were correctly loaded

    _save_database(db, db_root, db_root_tmp, num_workers, verbose)
    try:
        _remove_empty_dirs(db_root_tmp)
    except OSError as error:  # pragma: no cover
        # NOTE: the original message was missing a space
        # between 'files.' and 'This'
        raise RuntimeError('Could not remove temporary directory, '
                           'probably there are some leftover files. '
                           'This should not happen.') from error

    return db
Beispiel #7
0
def load_media(
    name: str,
    media: typing.Union[str, typing.Sequence[str]],
    *,
    version: str = None,
    bit_depth: int = None,
    channels: typing.Union[int, typing.Sequence[int]] = None,
    format: str = None,
    mixdown: bool = False,
    sampling_rate: int = None,
    cache_root: str = None,
    num_workers: typing.Optional[int] = 1,
    verbose: bool = True,
) -> typing.List:
    r"""Load media file(s).

    If you are interested in media files
    and not the corresponding tables,
    you can use :func:`audb.load_media`
    to load them.
    This will not download any table files
    to your disk,
    but share the cache with :func:`audb.load`.

    Args:
        name: name of database
        media: load media files provided in the list
        version: version of database
        bit_depth: bit depth, one of ``16``, ``24``, ``32``
        channels: channel selection, see :func:`audresample.remix`.
            Note that media files with too few channels
            will be first upsampled by repeating the existing channels.
            E.g. ``channels=[0, 1]`` upsamples all mono files to stereo,
            and ``channels=[1]`` returns the second channel
            of all multi-channel files
            and all mono files.
        format: file format, one of ``'flac'``, ``'wav'``
        mixdown: apply mono mix-down
        sampling_rate: sampling rate in Hz, one of
            ``8000``, ``16000``, ``22500``, ``44100``, ``48000``
        cache_root: cache folder where databases are stored.
            If not set :meth:`audb.default_cache_root` is used
        num_workers: number of parallel jobs or 1 for sequential
            processing. If ``None`` will be set to the number of
            processors on the machine multiplied by 5
        verbose: show debug messages

    Returns:
        paths to media files

    Raises:
        ValueError: if a media file is requested
            that is not part of the database

    Example:
        >>> paths = load_media(
        ...     'emodb',
        ...     ['wav/03a01Fa.wav'],
        ...     version='1.1.1',
        ...     format='flac',
        ...     verbose=False,
        ... )
        >>> cache_root = audb.default_cache_root()
        >>> [p[len(cache_root):] for p in paths]
        ['/emodb/1.1.1/40bb2241/wav/03a01Fa.flac']

    """
    requested = audeer.to_list(media)
    if not requested:
        return []

    if version is None:
        version = latest_version(name)
    deps = dependencies(name, version=version, cache_root=cache_root)

    # fail early if an unknown media file was requested
    known_files = deps.media
    for requested_file in requested:
        if requested_file not in known_files:
            raise ValueError(
                f"Could not find '{requested_file}' in {name} {version}")

    cached_versions = None

    flavor = Flavor(
        bit_depth=bit_depth,
        channels=channels,
        format=format,
        mixdown=mixdown,
        sampling_rate=sampling_rate,
    )
    db_root = database_cache_folder(name, version, cache_root, flavor)
    db_root_tmp = database_tmp_folder(db_root)

    if verbose:  # pragma: no cover
        print(f'Get:   {name} v{version}')
        print(f'Cache: {db_root}')

    # Start with database header without tables
    db, backend = load_header(
        db_root,
        name,
        version,
        flavor=flavor,
        add_audb_meta=True,
    )

    # download only what the cache does not hold yet
    if not _database_is_complete(db):
        _load_media(
            requested,
            backend,
            db_root,
            db_root_tmp,
            name,
            version,
            cached_versions,
            deps,
            flavor,
            cache_root,
            num_workers,
            verbose,
        )

    # converted flavors carry the target extension
    if format is not None:
        requested = [
            audeer.replace_file_extension(f, format) for f in requested
        ]

    return [os.path.join(db_root, f) for f in requested]
Beispiel #8
0
def load(
    name: str,
    *,
    version: str = None,
    only_metadata: bool = False,
    bit_depth: int = None,
    channels: typing.Union[int, typing.Sequence[int]] = None,
    format: str = None,
    mixdown: bool = False,
    sampling_rate: int = None,
    tables: typing.Union[str, typing.Sequence[str]] = None,
    media: typing.Union[str, typing.Sequence[str]] = None,
    removed_media: bool = False,
    full_path: bool = True,
    cache_root: str = None,
    num_workers: typing.Optional[int] = 1,
    verbose: bool = True,
    **kwargs,
) -> audformat.Database:
    r"""Load database.

    Loads meta and media files of a database to the local cache and returns
    a :class:`audformat.Database` object.

    When working with data,
    we often make assumptions about the media files.
    For instance, we expect that audio files are
    have a specific sampling rate.
    By setting
    ``bit_depth``, ``channels``, ``format``, ``mixdown``, and ``sampling_rate``
    we can request a specific flavor of the database.
    In that case media files are automatically converted to the desired
    properties (see also :class:`audb.Flavor`).

    It is possible to filter meta and media files with the arguments
    ``tables`` and ``media``.
    Note that only media files with at least one reference are loaded.
    I.e. filtering meta files, may also remove media files.
    Likewise, references to missing media files will be removed, too.
    I.e. filtering media files, may also remove entries from the meta files.

    Args:
        name: name of database
        version: version string, latest if ``None``
        only_metadata: load only metadata
        bit_depth: bit depth, one of ``16``, ``24``, ``32``
        channels: channel selection, see :func:`audresample.remix`.
            Note that media files with too few channels
            will be first upsampled by repeating the existing channels.
            E.g. ``channels=[0, 1]`` upsamples all mono files to stereo,
            and ``channels=[1]`` returns the second channel
            of all multi-channel files
            and all mono files.
        format: file format, one of ``'flac'``, ``'wav'``
        mixdown: apply mono mix-down
        sampling_rate: sampling rate in Hz, one of
            ``8000``, ``16000``, ``22500``, ``44100``, ``48000``
        tables: include only tables matching the regular expression or
            provided in the list
        media: include only media matching the regular expression or
            provided in the list
        removed_media: keep rows that reference removed media
        full_path: replace relative with absolute file paths
        cache_root: cache folder where databases are stored.
            If not set :meth:`audb.default_cache_root` is used
        num_workers: number of parallel jobs or 1 for sequential
            processing. If ``None`` will be set to the number of
            processors on the machine multiplied by 5
        verbose: show debug messages

    Returns:
        database object

    """
    # resolve version and fetch the dependency table for it
    if version is None:
        version = latest_version(name)
    deps = dependencies(name, version=version, cache_root=cache_root)

    # backward compatibility to audb<1.0.0
    channels, mixdown, media = parse_deprecated_load_arguments(
        channels,
        mixdown,
        media,
        deps,
        kwargs,
    )

    # presumably resolved lazily by _load_tables/_load_media
    # to reuse files from other cached flavors — TODO confirm
    cached_versions = None

    flavor = Flavor(
        channels=channels,
        format=format,
        mixdown=mixdown,
        bit_depth=bit_depth,
        sampling_rate=sampling_rate,
    )
    db_root = database_cache_folder(name, version, cache_root, flavor)
    db_root_tmp = database_tmp_folder(db_root)

    if verbose:  # pragma: no cover
        print(f'Get:   {name} v{version}')
        print(f'Cache: {db_root}')

    # Start with database header without tables
    db, backend = load_header(
        db_root,
        name,
        version,
        flavor=flavor,
        add_audb_meta=True,
    )

    # a complete cache entry needs no further downloads
    db_is_complete = _database_is_complete(db)

    # filter tables
    requested_tables = _tables(deps, tables)

    # load missing tables
    if not db_is_complete:
        _load_tables(
            requested_tables,
            backend,
            db_root,
            db_root_tmp,
            db,
            version,
            cached_versions,
            deps,
            flavor,
            cache_root,
            num_workers,
            verbose,
        )

    # filter tables
    if tables is not None:
        db.pick_tables(requested_tables)

    # load tables
    for table in requested_tables:
        db[table].load(os.path.join(db_root, f'db.{table}'))

    # filter media
    requested_media = _media(db, media)

    # load missing media
    # (skipped entirely when only metadata was requested)
    if not db_is_complete and not only_metadata:
        _load_media(
            requested_media,
            backend,
            db_root,
            db_root_tmp,
            name,
            version,
            cached_versions,
            deps,
            flavor,
            cache_root,
            num_workers,
            verbose,
        )

    # filter media
    if media is not None or tables is not None:
        db.pick_files(requested_media)

    # drop rows pointing to media removed from the database,
    # unless the caller explicitly wants to keep them
    if not removed_media:
        _remove_media(db, deps, num_workers, verbose)

    # fix media extension in tables
    # (format conversion changed the file extensions on disk)
    if flavor.format is not None:
        _fix_media_ext(db.tables.values(), flavor.format, num_workers, verbose)

    # convert to full path
    if full_path:
        _full_path(db, db_root)

    # check if database is now complete
    if not db_is_complete:
        _database_check_complete(db, db_root, db_root_tmp, flavor, deps)

    # a gone tmp folder signals that loading finished cleanly
    if os.path.exists(db_root_tmp):
        shutil.rmtree(db_root_tmp)

    return db
Beispiel #9
0
def load_table(
    name: str,
    table: str,
    *,
    version: str = None,
    cache_root: str = None,
    num_workers: typing.Optional[int] = 1,
    verbose: bool = True,
) -> pd.DataFrame:
    r"""Load a database table.

    If you are interested in a single table
    from a database
    you can use :func:`audb.load_table`
    to directly load it.
    This will not download any media files
    to your disk,
    but share the cache with :func:`audb.load`.

    Args:
        name: name of database
        table: load table from database
        version: version string, latest if ``None``
        cache_root: cache folder where databases are stored.
            If not set :meth:`audb.default_cache_root` is used
        num_workers: number of parallel jobs or 1 for sequential
            processing. If ``None`` will be set to the number of
            processors on the machine multiplied by 5
        verbose: show debug messages

    Returns:
        database table

    Raises:
        ValueError: if a table is requested
            that is not part of the database

    Example:
        >>> df = load_table(
        ...     'emodb',
        ...     'emotion',
        ...     version='1.1.1',
        ...     verbose=False,
        ... )
        >>> df[:3]
                           emotion  emotion.confidence
        file
        wav/03a01Fa.wav  happiness                0.90
        wav/03a01Nc.wav    neutral                1.00
        wav/03a01Wa.wav      anger                0.95

    """
    if version is None:
        version = latest_version(name)
    deps = dependencies(name, version=version, cache_root=cache_root)

    if table not in deps.table_ids:
        raise ValueError(f"Could not find table '{table}' in {name} {version}")

    cached_versions = None

    db_root = database_cache_folder(name, version, cache_root)
    db_root_tmp = database_tmp_folder(db_root)

    if verbose:  # pragma: no cover
        print(f'Get:   {name} v{version}')
        print(f'Cache: {db_root}')

    # Start with database header without tables
    db, backend = load_header(
        db_root,
        name,
        version,
    )

    # Load table if neither CSV nor PKL representation is cached yet
    table_file = os.path.join(db_root, f'db.{table}')
    if not (os.path.exists(f'{table_file}.csv')
            or os.path.exists(f'{table_file}.pkl')):
        _load_tables(
            [table],
            backend,
            db_root,
            db_root_tmp,
            db,
            version,
            cached_versions,
            deps,
            Flavor(),
            cache_root,
            num_workers,
            verbose,
        )
    # use a dedicated name instead of rebinding the ``table`` parameter
    # (which is a str) to an ``audformat.Table`` object
    db_table = audformat.Table()
    db_table.load(table_file)

    return db_table._df