Example #1
def create_database(database_path, transcription_realigned_path, chime6):
    logging.basicConfig(format='%(levelname)s: %(message)s',
                        level=logging.INFO)
    datasets = dict()
    alias = dict()

    transcription_realigned_paths = Dispatcher({
        p.name: p
        for p in Path(transcription_realigned_path).glob('**/*.json')
    })

    kaldi_transcriptions = dict()

    if chime6:
        set_length = set_length_chime6
    else:
        set_length = set_length_chime5

    for dataset in set_length:
        out_dict = get_dataset(database_path, dataset,
                               transcription_realigned_paths,
                               kaldi_transcriptions, chime6)
        for session_id, v in out_dict.items():
            datasets[session_id] = v
        alias[dataset] = list(out_dict.keys())
    return {keys.DATASETS: datasets, 'alias': alias}
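
A minimal usage sketch, assuming the surrounding module provides `keys` and using placeholder paths:

# Hypothetical paths; replace them with the real CHiME-5/6 locations.
database = create_database(
    database_path='CHiME5',
    transcription_realigned_path='transcriptions_realigned',
    chime6=False,
)
# database[keys.DATASETS] maps session ids to datasets,
# database['alias'] maps dataset names to lists of session ids.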
Example #2
def write_keyed_text_file(text_file: Path, data_dict):
    """
    Often used to write e.g. Kaldi `text`, `wav.scp` or `spk2utt`.
    Sorting is enforced here to avoid subsequent calls to fix_data_dir.sh

    For some file names, a sanity check is performed to match the Kaldi
    file conventions.

    Args:
        text_file: Path to a file in the format: <utterance_id> <else>

    Returns:

    """
    text_file = Path(text_file)
    data = []
    for k, text in sorted(data_dict.items()):
        if isinstance(text, list):
            text = ' '.join(text)
        if text_file.name == 'utt2dur':
            try:
                text_number = float(text)
            except Exception as e:
                raise ValueError(
                    f'The text "{text}" for {k} that should be written to '
                    f'{text_file} does not represent a number.') from e
            else:
                assert 0. < text_number < 1000., f'Strange duration: {k}: {text_number} s'
        elif text_file.name == 'spk2gender':
            text = Dispatcher(
                male='m',
                female='f',
                m='m',
                f='f',
            )[text]

        data.append(f'{k} {text}')

    text_file.write_text('\n'.join(data))
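
A minimal usage sketch with made-up utterance ids and a placeholder path:

# Hypothetical example: write a Kaldi utt2dur file (durations in seconds).
write_keyed_text_file(
    Path('data/train/utt2dur'),
    {
        'P05_S02_0004060-0004382': '3.22',
        'P05_S02_0007011-0007297': '2.86',
    },
)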
Example #3
from collections import defaultdict

from pb_chime5.mapping import Dispatcher

dev_sess_ref_array_mapping = Dispatcher({
    'S02': ['U02', 'U03', 'U05'],
    'S09': ['U01', 'U04', 'U06']
})
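
# Dispatcher behaves like a plain dict, but raises more helpful KeyErrors.
# A minimal lookup sketch:
# dev_sess_ref_array_mapping['S02']  -> ['U02', 'U03', 'U05']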

#TODO: check if still relevant
# error_id_mapping = Dispatcher({
#     'nan_in_gcc_phat': [
#         'P27_S09_0217746-0218448',
#         'P25_S09_0218350-0218590',
#         'P27_S09_0218528-0218656',
#         'P25_S09_0218645-0218785',
#         'P28_S09_0228518-0228606',
#         'P25_S09_0228530-0228730',
#         'P26_S09_0228590-0228810',
#         'P28_S09_0228724-0228822',
#         'P25_S09_0228730-0229010',
#         'P27_S09_0228832-0229004',
#         'P26_S09_0236312-0236452',
#         'P26_S09_0236558-0236688',
#         'P26_S09_0274100-0274266',
#         'P25_S09_0274135-0274260',
#         'P26_S09_0286486-0286608',
#         'P25_S09_0286555-0286720',
#         'P25_S09_0439024-0439440',
#         'P28_S09_0439248-0439372',
#         'P25_S09_0468290-0468476',
Example #4
def dump_audio(
    obj,
    path,
    *,
    sample_rate=16000,
    dtype=np.int16,
    start=None,
    normalize=True,
    format=None,
):
    """
    If normalize is False and the dtype is float, the values of obj should
    be in the range [-1, 1).

    Params:
        obj: Shape (channels, samples) or (samples,)
        path:
        sample_rate:
        dtype:
        start:
        normalize:
        format:

    >>> from pb_chime5.utils.process_caller import run_process
    >>> from pb_chime5.io import load_audio
    >>> a = np.array([1, 2, -4, 4], dtype=np.int16)
    >>> import io, os
    >>> # file = io.BytesIO()
    >>> file = Path('tmp_audio_data.wav')
    >>> dump_audio(a, file, normalize=False)
    >>> load_audio(file) * 2**15
    array([ 1.,  2., -4.,  4.])
    >>> print(run_process(f'file {file}').stdout)
    tmp_audio_data.wav: RIFF (little-endian) data, WAVE audio, Microsoft PCM, 16 bit, mono 16000 Hz
    <BLANKLINE>
    >>> dump_audio(a, file, normalize=True)
    >>> load_audio(file)
    array([ 0.24996948,  0.49996948, -0.99996948,  0.99996948])
    >>> print(run_process(f'file {file}').stdout)
    tmp_audio_data.wav: RIFF (little-endian) data, WAVE audio, Microsoft PCM, 16 bit, mono 16000 Hz
    <BLANKLINE>

    >>> data = np.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]) / 32
    >>> data
    array([0.     , 0.03125, 0.0625 , 0.09375, 0.125  , 0.15625, 0.1875 ,
           0.21875, 0.25   , 0.28125])
    >>> dump_audio(data, file, normalize=False)
    >>> load_audio(file)
    array([0.     , 0.03125, 0.0625 , 0.09375, 0.125  , 0.15625, 0.1875 ,
           0.21875, 0.25   , 0.28125])
    >>> print(run_process(f'file {file}').stdout)
    tmp_audio_data.wav: RIFF (little-endian) data, WAVE audio, Microsoft PCM, 16 bit, mono 16000 Hz
    <BLANKLINE>
    >>> dump_audio(np.array([16, 24]) / 32, file, normalize=False, start=1)
    >>> load_audio(file)
    array([0.     , 0.5    , 0.75   , 0.09375, 0.125  , 0.15625, 0.1875 ,
           0.21875, 0.25   , 0.28125])
    >>> print(run_process(f'file {file}').stdout)
    tmp_audio_data.wav: RIFF (little-endian) data, WAVE audio, Microsoft PCM, 16 bit, mono 16000 Hz
    <BLANKLINE>
    >>> dump_audio(np.array([16, 24, 24, 24]) / 32, file, normalize=False, start=9)
    >>> load_audio(file)
    array([0.     , 0.5    , 0.75   , 0.09375, 0.125  , 0.15625, 0.1875 ,
           0.21875, 0.25   , 0.5    , 0.75   , 0.75   , 0.75   ])
    >>> load_audio(file).shape
    (13,)
    >>> dump_audio(np.array([16, 24, 24, 24]) / 32, file, normalize=False, start=20)
    >>> load_audio(file)
    array([0.     , 0.5    , 0.75   , 0.09375, 0.125  , 0.15625, 0.1875 ,
           0.21875, 0.25   , 0.5    , 0.75   , 0.75   , 0.75   , 0.     ,
           0.     , 0.     , 0.     , 0.     , 0.     , 0.     , 0.5    ,
           0.75   , 0.75   , 0.75   ])
    >>> load_audio(file).shape
    (24,)
    >>> print(run_process(f'file {file}').stdout)
    tmp_audio_data.wav: RIFF (little-endian) data, WAVE audio, Microsoft PCM, 16 bit, mono 16000 Hz
    <BLANKLINE>
    >>> os.remove('tmp_audio_data.wav')
    >>> dump_audio(np.array([16, 24, 24, 24]) / 32, file, normalize=False, start=20)
    >>> load_audio(file)
    array([0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ,
           0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.5 , 0.75,
           0.75, 0.75])
    >>> load_audio(file).shape
    (24,)
    >>> print(run_process(f'file {file}').stdout)
    tmp_audio_data.wav: RIFF (little-endian) data, WAVE audio, Microsoft PCM, 16 bit, mono 16000 Hz
    <BLANKLINE>

    >>> data = np.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]) / 32
    >>> data
    array([0.     , 0.03125, 0.0625 , 0.09375, 0.125  , 0.15625, 0.1875 ,
           0.21875, 0.25   , 0.28125])
    >>> dump_audio(data, file, normalize=False, dtype=None)
    >>> load_audio(file)
    array([0.     , 0.03125, 0.0625 , 0.09375, 0.125  , 0.15625, 0.1875 ,
           0.21875, 0.25   , 0.28125])
    >>> print(run_process(f'soxi {file}').stdout)
    <BLANKLINE>
    Input File     : 'tmp_audio_data.wav'
    Channels       : 1
    Sample Rate    : 16000
    Precision      : 53-bit
    Duration       : 00:00:00.00 = 10 samples ~ 0.046875 CDDA sectors
    File Size      : 160
    Bit Rate       : 2.05M
    Sample Encoding: 64-bit Floating Point PCM
    <BLANKLINE>
    <BLANKLINE>
    >>> dump_audio(data.astype(np.float32), file, normalize=False, dtype=None)
    >>> load_audio(file, dtype=None)
    array([0.     , 0.03125, 0.0625 , 0.09375, 0.125  , 0.15625, 0.1875 ,
           0.21875, 0.25   , 0.28125], dtype=float32)
    >>> print(run_process(f'soxi {file}').stdout)
    <BLANKLINE>
    Input File     : 'tmp_audio_data.wav'
    Channels       : 1
    Sample Rate    : 16000
    Precision      : 24-bit
    Duration       : 00:00:00.00 = 10 samples ~ 0.046875 CDDA sectors
    File Size      : 120
    Bit Rate       : 1.54M
    Sample Encoding: 32-bit Floating Point PCM
    <BLANKLINE>
    <BLANKLINE>

    """
    path = normalize_path(path, as_str=True)
    obj = np.asarray(obj)

    if normalize:
        if obj.dtype.kind not in ('f', 'i'):
            raise TypeError(
                'Only float and int are currently supported with normalize. '
                f'Got dtype {obj.dtype}')
        # Normalization can change the type (e.g. int to float).
        # When saving as float, normalize is a bad idea.
        # The normalization is adjusted for int16
        assert dtype == np.int16, (
            'Currently, normalize is only allowed for dtype == np.int16 '
            f'and not for dtype == {dtype}')
        # Correction, because the allowed values are in the range [-1, 1).
        # => "1" is not a valid value
        correction = (2**15 - 1) / (2**15)
        obj = obj * (correction / np.amax(np.abs(obj)))

    # ToDo: better exception when path is file descriptor
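    # If no start is given or the file does not exist yet, create a fresh
    # file ('w'); otherwise open the existing file for in-place update ('r+').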
    if start is None or not Path(path).exists():
        if obj.ndim == 1:
            channels = 1
        else:
            channels = obj.shape[0]

        sf_args = dict(
            mode='w',
            channels=channels,
            samplerate=sample_rate,
        )
    else:
        sf_args = dict(mode='r+')
    sf_args['format'] = format

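    # Map numpy dtypes to the corresponding soundfile subtype strings.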
    dtype_map = Dispatcher({
        np.int16: 'PCM_16',
        np.dtype('int16'): 'PCM_16',
        np.int32: 'PCM_32',
        np.dtype('int32'): 'PCM_32',
        np.float32: 'FLOAT',
        np.dtype('float32'): 'FLOAT',
        np.float64: 'DOUBLE',
        np.dtype('float64'): 'DOUBLE',
    })

    if dtype in [np.int16]:
        pass
    elif dtype in [np.float32, np.float64, np.int32]:
        sf_args['subtype'] = dtype_map[dtype]
    elif dtype is None:
        sf_args['subtype'] = dtype_map[obj.dtype]
    else:
        raise TypeError(dtype)


    with soundfile.SoundFile(path, **sf_args) as f:
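        # Seeking past the current end of an existing file and then writing
        # extends the file; the gap is zero-padded (see the start=20 doctests).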
        if start is not None:
            f.seek(start)
        f.write(obj.T)
    return
Example #5
def get_activity(
    iterator,
    *,
    perspective,
    garbage_class,
    dtype=bool,
    non_sil_alignment_fn=None,
    debug=False,
    use_ArrayIntervall=False,
):
    """

    perspective:
        Example:
            'global_worn' -- global perspective for worn ('P')
            'worn' -- return perspective for each speaker ('P01', ...)
            'array' -- return perspective for each array ('U01', ...)
    garbage_class: True, False, None, or a positive int
        True: the garbage (noise) class is always one
        False: the garbage (noise) class is always zero
        None: the number of classes is 4 and not 5
        positive int: that many all-one noise classes are added
    non_sil_alignment_fn: None or a function with the signature:
        value = non_sil_alignment_fn(ex, perspective_mic_array)
        where
            ex is one example in iterator
            perspective_mic_array is in ['U01', ..., 'P01', ..., 'P']
            value is a 1d array indicating if at a sample the source is active
                or not
    use_ArrayIntervall: ArrayIntervall is a special datatype to reduce
        memory usage

    returns:
        dict[session_id][mic_perspective][speaker_id] = array(dtype=bool)
        session_id e.g.: 'S02', ...
        mic_perspective e.g.: 'P', 'P05', 'U01', ...
        speaker_id e.g.: 'P05', ...

    >>> from pb_chime5.database.chime5 import Chime5
    >>> import textwrap
    >>> db = Chime5()
    >>> def display_activity(activity, indent=0):
    ...     indent_print = lambda x: print(textwrap.indent(str(x), ' '*indent))
    ...     if isinstance(activity, dict):
    ...         for i, (k, v) in enumerate(activity.items()):
    ...             if i == 0 or k in ['Noise']:
    ...                 indent_print(f'{k}:')
    ...                 display_activity(v, indent=indent+2)
    ...             else:
    ...                 indent_print(f'{k}: ...')
    ...     else:
    ...         indent_print(activity)
    >>> activity = get_activity(db.get_datasets('S02'), perspective='global_worn', garbage_class=True)
    >>> display_activity(activity)
    S02:
      P:
        P05:
          [False False False ... False False False]
        P06: ...
        P07: ...
        P08: ...
        Noise:
          [ True  True  True ...  True  True  True]
    >>> activity = get_activity(db.get_datasets('S02'), perspective='worn', garbage_class=False)
    >>> display_activity(activity)
    S02:
      P05:
        P05:
          [False False False ... False False False]
        P06: ...
        P07: ...
        P08: ...
        Noise:
          [False False False ... False False False]
      P06: ...
      P07: ...
      P08: ...
    >>> activity = get_activity(db.get_datasets('S02'), perspective='array', garbage_class=None)
    >>> display_activity(activity)
    S02:
      U01:
        P05:
          [False False False ... False False False]
        P06: ...
        P07: ...
        P08: ...
      U02: ...
      U03: ...
      U04: ...
      U05: ...
      U06: ...

    """

    dict_it_S = iterator.groupby(lambda ex: ex['session_id'])

    # Dispatcher is a dict with better KeyErrors
    all_activity = Dispatcher()
    for session_id, it_S in dict_it_S.items():

        if perspective == 'worn':
            perspective_tmp = mapping.session_to_speakers[session_id]
        elif perspective == 'global_worn':
            perspective_tmp = ['P']  # Always from target speaker
        elif perspective == 'array':
            # The mapping considers missing arrays
            perspective_tmp = mapping.session_to_arrays[session_id]
        else:
            perspective_tmp = perspective

            if not isinstance(perspective_tmp, (tuple, list)):
                perspective_tmp = [
                    perspective_tmp,
                ]

        speaker_ids = mapping.session_to_speakers[session_id]

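        # Choose the array factories: ArrayIntervall stores active intervals
        # instead of dense sample arrays, which reduces memory usage.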
        if use_ArrayIntervall:
            assert dtype == bool, dtype
            zeros = ArrayIntervall

            def ones(shape):
                arr = zeros(shape=shape)
                arr[:] = 1
                return arr
        else:
            import functools
            zeros = functools.partial(np.zeros, dtype=dtype)
            ones = functools.partial(np.ones, dtype=dtype)

        all_activity[session_id] = Dispatcher({
            p: Dispatcher({
                s: zeros(shape=[
                    mapping.session_array_to_num_samples[f'{session_id}_{p}']
                ])
                # s: ArrayIntervall(shape=[num_samples])
                for s in speaker_ids
            })
            for p in perspective_tmp
        })

        if garbage_class is True:
            for p in perspective_tmp:
                num_samples = mapping.session_array_to_num_samples[
                    f'{session_id}_{p}']
                all_activity[session_id][p]['Noise'] = ones(
                    shape=[num_samples], )
        elif garbage_class is False:
            for p in perspective_tmp:
                num_samples = mapping.session_array_to_num_samples[
                    f'{session_id}_{p}']
                all_activity[session_id][p]['Noise'] = zeros(
                    shape=[num_samples])
        elif garbage_class is None:
            pass
        elif isinstance(garbage_class, int) and garbage_class > 0:
            for noise_idx in range(garbage_class):
                for p in perspective_tmp:
                    num_samples = mapping.session_array_to_num_samples[
                        f'{session_id}_{p}']
                    all_activity[session_id][p][f'Noise{noise_idx}'] = ones(
                        shape=[num_samples])
        else:
            raise ValueError(garbage_class)

        missing_count = 0
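        # Mark, for every perspective, the sample range in which the target
        # speaker of each example is active.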
        for ex in it_S:
            for pers in perspective_tmp:
                if ex['transcription'] == '[redacted]':
                    continue

                target_speaker = ex['speaker_id']
                # example_id = ex['example_id']

                if pers == 'P':
                    perspective_mic_array = target_speaker
                else:
                    perspective_mic_array = pers

                if perspective_mic_array.startswith('P'):
                    start = ex['start']['worn'][perspective_mic_array]
                    end = ex['end']['worn'][perspective_mic_array]
                else:
                    if perspective_mic_array not in ex['audio_path'][
                            'observation']:
                        continue
                    start = ex['start']['observation'][perspective_mic_array]
                    end = ex['end']['observation'][perspective_mic_array]

                if non_sil_alignment_fn is None:
                    value = 1
                else:
                    value = non_sil_alignment_fn(ex, perspective_mic_array)
                    # `1` is the sentinel for a missing finetuned alignment.
                    if isinstance(value, int) and value == 1:
                        missing_count += 1

                if debug:
                    all_activity[session_id][pers][target_speaker][
                        start:end] += value
                else:
                    all_activity[session_id][pers][target_speaker][
                        start:end] = value
        if missing_count > len(it_S) // 2:
            raise RuntimeError(
                f'Something went wrong.\n'
                f'Expected {len(it_S) * len(perspective_tmp)} finetuned '
                f'annotations for session {session_id}, but '
                f'{missing_count} of them are missing.\n'
                f'When non_sil_alignment_fn is given, at least '
                f'{len(it_S) // 2} finetuned annotations are expected to be '
                f'available; otherwise something probably went wrong.')

        del it_S

    return all_activity
Example #6
def get_phone_alignment(
        ali_path,
        use_kaldi_id=False,
        unique_per_utt=True,
        channel_preference=None,
):
    """

    use_kaldi_id:
        Use a unique id per utterance or the kaldi id (i.e. array dependent)
    unique_per_utt:
        Return one per utterance. When multiple kaldi ids are available use
        channel_preference.
    channel_preference:
        None or list of channels.
        Example channel_preference = ['R', 'L']
         - assert that some alignment has a left channel and some alignment
           has a right channel (note: any, not all).
         - If an example has both a left and a right channel, select the
           right one.

    >>> # np.set_string_function(lambda a: f'array(shape={a.shape}, dtype={a.dtype})')
    >>> np.set_printoptions(threshold=50, edgeitems=30)
    >>> from IPython.lib.pretty import pprint
    >>> p = Path('/net/vol/jenkins/kaldi/2018-03-21_08-33-34_eba50e4420cfc536b68ca7144fac3cd29033adbb/egs/chime5/s5/exp/tri3_all_dev_worn_ali')
    >>> # p = ('~/net/vol/jenkins/kaldi/2018-03-21_08-33-34_eba50e4420cfc536b68ca7144fac3cd29033adbb/egs/chime5/s5/exp/tri3_all_dev_worn_ali', '~/net/vol/jenkins/kaldi/2018-03-21_08-33-34_eba50e4420cfc536b68ca7144fac3cd29033adbb/egs/chime5/s5/exp/tri3_all_dev_worn_ali')
    >>> alignment = get_phone_alignment(p)
    >>> pprint(alignment['P06_S02_0060700-0061058'])  # doctest: +ELLIPSIS
    array(['sil', 'sil', 'sil', 'sil', 'sil', 'sil', 'sil', 'sil', 'sil',
           'sil', 'sil', 'sil', 'sil', 'sil', 'sil', 'sil', 'sil', 'sil',
           'd_B', 'd_B', 'd_B', 'd_B', 'd_B', 'd_B', 'ih_I', 'ih_I', 'ih_I',
           'z_E', 'z_E', 'z_E', ..., 'ay_I', 'ay_I', 'ay_I', 't_E', 't_E',
           't_E', 't_E', 't_E', 't_E', 't_E', 't_E', 't_E', 't_E', 't_E',
           'sil', 'sil', 'sil', 'sil', 'sil', 'sil', 'sil', 'sil', 'sil',
           'sil', 'sil', 'sil', 'sil', 'sil', 'sil', 'sil'], dtype='<U4')
    >>> pprint(alignment['P25_S09_0121800-0122035'])  # doctest: +ELLIPSIS
    array(['sil', 'sil', 'sil', 'sil', 'sil', 'sil', 'sil', 'sil', 'sil',
           'sil', 'ay_B', 'ay_B', 'ay_B', 'ay_B', 'ay_B', 'm_E', 'm_E', 'm_E',
           'g_B', 'g_B', 'g_B', 'aa_I', 'aa_I', 'aa_I', 'aa_I', 'n_I', 'n_I',
           'n_I', 'ah_E', 'ah_E', ..., 'n_E', 'n_E', 'n_E', 'n_E', 'n_E',
           'n_E', 'n_E', 'n_E', 'n_E', 'n_E', 'n_E', 'n_E', 'sil', 'sil',
           'sil', 'sil', 'sil', 'sil', 'sil', 'sil', 'sil', 'sil', 'sil',
           'sil', 'sil', 'sil', 'sil', 'sil', 'sil', 'sil'], dtype='<U4')
    >>> non_sil_alignment = {k: v != 'sil' for k, v in alignment.items()}
    >>> pprint(dict(list(non_sil_alignment.items())[:3]))  # doctest: +ELLIPSIS
    {'P05_S02_0004060-0004382': array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
             True,  True,  True,  True,  True,  True,  True,  True,  True,
             True,  True,  True,  True,  True,  True,  True,  True,  True,
             True, False, False, ...,  True,  True,  True,  True,  True,  True,
             True,  True,  True,  True,  True, False, False, False, False,
            False, False, False, False, False, False, False, False, False,
            False, False, False, False, False, False]),
     'P05_S02_0007011-0007297': array([False, False, False, False, False,  True,  True,  True,  True,
             True,  True,  True,  True,  True,  True,  True,  True,  True,
             True,  True,  True,  True,  True,  True,  True,  True,  True,
             True,  True,  True, ..., False, False, False, False, False, False,
            False, False, False, False, False, False, False, False, False,
            False, False, False, False, False, False, False, False, False,
            False, False, False, False, False, False]),
     'P05_S02_0007437-0007908': array([False, False, False, False, False, False, False, False, False,
            False, False, False, False, False, False, False, False, False,
            False, False, False, False, False, False, False, False, False,
             True,  True,  True, ...,  True,  True,  True,  True,  True,  True,
             True,  True,  True,  True,  True,  True,  True,  True,  True,
             True, False, False, False, False, False, False, False, False,
            False, False, False, False, False, False])}

    # >>> p = '/net/vol/jenkins/kaldi/2018-03-21_08-33-34_eba50e4420cfc536b68ca7144fac3cd29033adbb/egs/chime5/s5/exp/tri3_cleaned_ali_train_worn_u100k_cleaned_sp'
    # >>> alignment = get_phone_alignment(p)
    # >>> pprint(dict(list(non_sil_alignment.items())[:3]))  # doctest: +ELLIPSIS
    # >>> print(len(alignment))

    >>> ali_path = (
    ...     '/net/vol/jensheit/kaldi/egs/chime5/inear_bss_cacgmm_v3/finetune_0/kaldi/exp/tri3_worn_bss_stereo_train_worn_bss_stereo_ali/',
    ...     '/net/vol/jensheit/kaldi/egs/chime5/inear_bss_cacgmm_v3/finetune_0/kaldi/exp/tri3_worn_bss_stereo_dev_worn_bss_stereo_ali/',
    ... )  # slow because of train
    >>> alignment = get_phone_alignment(ali_path, channel_preference=['R', 'L'])
    >>> pprint(alignment['P06_S02_0060700-0061058'])  # doctest: +ELLIPSIS
    array(['sil', 'sil', 'sil', 'sil', 'sil', 'sil', 'sil', 'sil', 'sil',
           'sil', 'sil', 'sil', 'sil', 'sil', 'sil', 'sil', 'sil', 'sil',
           'sil', 'sil', 'sil', 'd_B', 'd_B', 'd_B', 'd_B', 'ih_I', 'ih_I',
           'ih_I', 'z_E', 'z_E', ..., 't_E', 't_E', 't_E', 't_E', 't_E',
           't_E', 't_E', 't_E', 't_E', 't_E', 't_E', 'sil', 'sil', 'sil',
           'sil', 'sil', 'sil', 'sil', 'sil', 'sil', 'sil', 'sil', 'sil',
           'sil', 'sil', 'sil', 'sil', 'sil', 'sil', 'sil'], dtype='<U4')


    """
    import pb_chime5.kaldi

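    # When several alignment directories are given, load each one and merge
    # the results; the assert below guarantees that the ids do not overlap.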
    if isinstance(ali_path, (tuple, list)):
        alignments_list = [
            get_phone_alignment(
                ap,
                channel_preference=channel_preference,
                use_kaldi_id=use_kaldi_id,
            )
            for ap in ali_path
        ]
        total_len = sum(len(a) for a in alignments_list)
        alignments = {
            k: v
            for a in alignments_list
            for k, v in a.items()
        }
        assert len(alignments) == total_len
        return alignments

    ali_path = Path(ali_path).expanduser().resolve()

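    # phones.txt contains one "<phone> <id>" pair per line; build the
    # inverse mapping id -> phone.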
    tmp = [reversed(line.split()) for line in
           (ali_path / 'phones.txt').read_text().splitlines()]
    id2phone = {int(k): v for k, v in tmp}
    assert len(id2phone) == len(tmp)

    _alignments = pb_chime5.kaldi.alignment.import_alignment_data(
        ali_path,
        import_fn=pb_chime5.kaldi.alignment.import_phone_alignment_from_file,
        per_frame=True,
        model_name=ali_path / 'final.mdl'
    )

    alignments = _helper(
        _alignments,
        channel_preference=channel_preference,
        # id2phone=id2phone,
        unique_per_utt=unique_per_utt,
        use_kaldi_id=use_kaldi_id,
    )

    return Dispatcher(cy_alignment_id2phone(alignments, id2phone))
Example #7
def load_audio(
    path,
    *,
    frames=-1,
    start=0,
    stop=None,
    dtype=np.float64,
    fill_value=None,
    expected_sample_rate=None,
    unit='samples',
    return_sample_rate=False,
):
    """
    WIP: will deprecate audioread in the future

    Difference to soundfile.read:
     - Default: Return only the signal
     - With the argument "unit" the unit of frames, start and stop can be
       changed (a negative stop is currently unsupported for unit='seconds').
     - With given expected_sample_rate an assert is included (recommended)

    soundfile.read doc text and some examples:

    Provide audio data from a sound file as NumPy array.

    By default, the whole file is read from the beginning, but the
    position to start reading can be specified with `start` and the
    number of frames to read can be specified with `frames`.
    Alternatively, a range can be specified with `start` and `stop`.

    If there is less data left in the file than requested, the rest of
    the frames are filled with `fill_value`.
    If no `fill_value` is specified, a smaller array is returned.

    Parameters
    ----------
    file : str or int or file-like object
        The file to read from.  See :class:`SoundFile` for details.
    frames : int, optional
        The number of frames to read. If `frames` is negative, the whole
        rest of the file is read.  Not allowed if `stop` is given.
    start : int, optional
        Where to start reading.  A negative value counts from the end.
    stop : int, optional
        The index after the last frame to be read.  A negative value
        counts from the end.  Not allowed if `frames` is given.
    dtype : {'float64', 'float32', 'int32', 'int16'}, optional
        Data type of the returned array, by default ``'float64'``.
        Floating point audio data is typically in the range from
        ``-1.0`` to ``1.0``.  Integer data is in the range from
        ``-2**15`` to ``2**15-1`` for ``'int16'`` and from ``-2**31`` to
        ``2**31-1`` for ``'int32'``.

        .. note:: Reading int values from a float file will *not*
            scale the data to [-1.0, 1.0). If the file contains
            ``np.array([42.6], dtype='float32')``, you will read
            ``np.array([43], dtype='int32')`` for ``dtype='int32'``.

    Returns
    -------
    audiodata : numpy.ndarray or type(out)
        A two-dimensional (frames x channels) NumPy array is returned.
        If the sound file has only one channel, a one-dimensional array
        is returned.  Use ``always_2d=True`` to return a two-dimensional
        array anyway.

        If `out` was specified, it is returned.  If `out` has more
        frames than available in the file (or if `frames` is smaller
        than the length of `out`) and no `fill_value` is given, then
        only a part of `out` is overwritten and a view containing all
        valid frames is returned.

    Other Parameters
    ----------------
    always_2d : bool, optional
        By default, reading a mono sound file will return a
        one-dimensional array.  With ``always_2d=True``, audio data is
        always returned as a two-dimensional array, even if the audio
        file has only one channel.
    fill_value : float, optional
        If more frames are requested than available in the file, the
        rest of the output is filled with `fill_value`.  If
        `fill_value` is not specified, a smaller array is returned.
    out : numpy.ndarray or subclass, optional
        If `out` is specified, the data is written into the given array
        instead of creating a new array.  In this case, the arguments
        `dtype` and `always_2d` are silently ignored!  If `frames` is
        not given, it is obtained from the length of `out`.
    samplerate, channels, format, subtype, endian, closefd
        See :class:`SoundFile`.

    Examples
    --------
    >>> from pb_chime5.io import load_audio
    >>> path = '/net/db/timit/pcm/train/dr1/fcjf0/sa1.wav'
    >>> data = load_audio(path)
    >>> data.shape
    (46797,)

    Say you load audio examples from a very long audio, you can provide a
    start position and a duration in samples or seconds.

    >>> path = '/net/db/timit/pcm/train/dr1/fcjf0/sa1.wav'
    >>> signal = load_audio(path, start=0, frames=16_000)
    >>> signal.shape
    (16000,)
    >>> signal = load_audio(path, start=0, frames=1, unit='seconds')
    >>> signal.shape
    (16000,)

    If the audio file is too short, only the defined part is returned:

    >>> signal = load_audio(path, start=0, frames=160_000)
    >>> signal.shape
    (46797,)

    >>> path = '/net/db/tidigits/tidigits/test/man/ah/111a.wav'
    >>> load_audio(path)  #doctest: +ELLIPSIS
    Traceback (most recent call last):
    ...
    RuntimeError: /net/db/tidigits/tidigits/test/man/ah/111a.wav: NIST SPHERE file
    <BLANKLINE>

    """

    # soundfile does not support pathlib.Path.
    # ToDo: Is this still true?
    path = normalize_path(path, as_str=True)

    if unit == 'samples':
        pass
    elif unit == 'seconds':
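        # Interpret start, frames and stop in seconds and convert them to
        # sample indices using the file's sample rate.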
        if stop is not None:
            if stop < 0:
                raise NotImplementedError(unit, stop)
        with soundfile.SoundFile(path) as f:
            # total_samples = len(f)
            samplerate = f.samplerate
        start = int(np.round(start * samplerate))
        if frames > 0:
            frames = int(np.round(frames * samplerate))
        if stop is not None and stop > 0:
            stop = int(np.round(stop * samplerate))
    else:
        raise ValueError(unit)

    try:
        with soundfile.SoundFile(
                path,
                'r',
        ) as f:
            if dtype is None:
                from pb_chime5.mapping import Dispatcher
                mapping = Dispatcher({
                    'PCM_16': np.int16,
                    'FLOAT': np.float32,
                    'DOUBLE': np.float64,
                })
                dtype = mapping[f.subtype]

            frames = f._prepare_read(start=start, stop=stop, frames=frames)
            data = f.read(frames=frames, dtype=dtype, fill_value=fill_value)
            sample_rate = f.samplerate
        signal = data
    except RuntimeError as e:
        if isinstance(path, (Path, str)):
            if Path(path).suffix == '.wav':
                # Improve exception msg for NIST SPHERE files.
                from pb_chime5.utils.process_caller import run_process
                cp = run_process(f'file {path}')
                stdout = cp.stdout
                raise RuntimeError(f'{stdout}') from e
            else:
                raise RuntimeError(
                    f'Wrong suffix {Path(path).suffix} in {path}') from e
        raise

    if expected_sample_rate is not None:
        if expected_sample_rate != sample_rate:
            raise ValueError(
                f'Requested sampling rate is {expected_sample_rate} but the '
                f'audiofile has {sample_rate}')

    # When the signal is multichannel, soundfile returns (samples, channels).
    # At NT it is more common to have the shape (channels, samples)
    # => transpose
    signal = signal.T

    if return_sample_rate:
        return signal, sample_rate
    else:
        return signal