Example #1
def build_triphones_indexer(limit, corpus, same_speaker_data):
    """Build an index of triphones appearing in each utterance of a corpus.

    limit             : minimum number of utterances a triphone must
                        appear in to be indexed (int)
    corpus            : name of the corpus to index (str)
    same_speaker_data : whether every speaker utters the same list
                        of utterances (bool)

    Return a dict mapping each utterance's name (or rank, when
    `same_speaker_data` is True) to the (reduced) list of triphones
    it contains.
    """
    # Load the dependency functions associated with the corpus to index.
    load_phone_labels, get_utterances_list, speakers = import_from_string(
        module='ac2art.corpora.%s.raw._loaders' % corpus,
        elements=['load_phone_labels', 'get_utterances_list', 'SPEAKERS'])

    # Define an auxiliary function to read an utterance's triphones.
    def load_triphones(name):
        """Load the set of triphones contained in a given utterance."""
        labels = load_phone_labels(name)
        return {
            '_'.join([phone[1] for phone in labels[i:i + 3]])
            for i in range(len(labels) - 2)
        }

    # Gather an index of triphones contained in each utterance.
    # If utterances are identical for each speaker, read a unique
    # list and index it with ranks instead of names.
    if same_speaker_data:
        utterances = {
            i: load_triphones(name)
            for i, name in enumerate(get_utterances_list(speakers[0]))
        }
    # Otherwise, gather the utterances from each and every speaker.
    else:
        utterances = {
            name: load_triphones(name)
            for name in get_utterances_list()
        }
    # Gather the full set of triphones.
    all_triphones = {
        triphone
        for utt_triphones in utterances.values() for triphone in utt_triphones
    }
    # Select the triphones of interest.
    triphones = {
        triphone
        for triphone in all_triphones
        if sum(triphone in utt for utt in utterances.values()) >= limit
    }
    # Reduce the index to the retained triphones of each utterance.
    utterances_index = {
        utterance: [
            triphone for triphone in utt_triphones if triphone in triphones
        ]
        for utterance, utt_triphones in utterances.items()
    }
    return utterances_index
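A minimal sketch of the sliding-window logic used by `load_triphones`, run on hypothetical phone labels (the real ones come from the corpus-specific `load_phone_labels`):

# Hypothetical labels, as (end_time, phone) tuples.
labels = [(0.10, '#'), (0.25, 'h'), (0.40, 'e'), (0.55, 'l'), (0.70, 'o')]
# Same comprehension as in load_triphones: every window of three
# consecutive phones, joined with underscores.
triphones = {
    '_'.join(phone[1] for phone in labels[i:i + 3])
    for i in range(len(labels) - 2)
}
print(triphones)  # {'#_h_e', 'h_e_l', 'e_l_o'}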
Example #2
def prepare_abkhazia_corpus(corpus,
                            data_folder,
                            limit_phones=True,
                            mode='w',
                            id_length=None):
    """Build or complete a corpus's data/ folder for use with abkhazia.

    corpus       : name of the corpus whose data to prepare (str)
    data_folder  : path to the 'data/' folder to build or complete
    limit_phones : whether to map the corpus' phones to a restricted set
                   of IPA phones, thus aggregating some (bool, default True)
    mode         : file writing mode (either 'w' or 'a', default 'w')
    id_length    : optional fixed length of utterances' id used internally

    Note: the `mode` and `id_length` parameters may be used to pile up
          data from multiple corpora in a single data/ folder, thus
          having abkhazia treat them as one large corpus. In this case,
          please be careful about corpus-specific phone symbols overlap.
    """
    # Check arguments validity.
    check_type_validity(corpus, str, 'corpus')
    check_type_validity(data_folder, str, 'data_folder')
    check_type_validity(limit_phones, bool, 'limit_phones')
    if mode not in ('w', 'a'):
        raise ValueError("'mode' should be a str in {'a', 'w'}.")
    check_type_validity(id_length, (int, type(None)), 'id_length')
    # Make the output directories if needed.
    wav_folder = os.path.join(data_folder, 'wavs')
    for folder in (data_folder, wav_folder):
        if not os.path.isdir(folder):
            os.makedirs(folder)
    # Gather dependency functions.
    copy_wavs, get_transcription = import_from_string(
        'ac2art.corpora.%s.abkhazia._loaders' % corpus,
        ['copy_wavs', 'get_transcription'])
    # Copy wav files to the data folder and gather the utterances list.
    utt_files = copy_wavs(wav_folder)
    utt_ids = normalize_utterance_ids(utt_files, id_length)
    # Fill the segments.txt file.
    with open(os.path.join(data_folder, 'segments.txt'), mode) as abk_file:
        abk_file.write('\n'.join(name + ' ' + name.strip('_') + '.wav'
                                 for name in utt_ids) + '\n')
    # Build the utt2spk, spk2utt, phones, silences and variants txt files.
    make_utt2spk_files(data_folder, utt_ids, mode)
    make_phones_files(data_folder, limit_phones, mode)
    # Load the table mapping corpus-specific symbols to cross-corpus ones.
    symbols = pd.read_csv(CONSTANTS['symbols_file'], index_col=corpus)
    symbols = symbols['common' + '_reduced' * limit_phones].to_dict()
    make_text_files(data_folder, utt_ids, get_transcription, symbols, mode)
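A usage sketch of the piling-up behaviour described in the docstring's note; the corpus names are hypothetical:

# Build a data/ folder for a first corpus, then append a second one
# to it, so that abkhazia treats both as a single large corpus.
prepare_abkhazia_corpus('corpus_a', 'abkhazia/data', mode='w', id_length=12)
prepare_abkhazia_corpus('corpus_b', 'abkhazia/data', mode='a', id_length=12)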
Example #3
def build_features_extraction_functions(corpus, initial_sampling_rate,
                                        default_articulators,
                                        docstring_details):
    """Define and return a raw features extraction function for a corpus.

    corpus                : name of the corpus whose features to extract (str)
    initial_sampling_rate : initial sampling rate of the EMA data, in Hz (int)
    default_articulators  : default articulators to keep (list of str)
    docstring_details     : docstring complement for the returned functions

    Return a single function:
      - extract_utterances_data
    """
    # Long but explicit function name; pylint: disable=invalid-name
    # Define auxiliary functions through wrappers.
    control_arguments = build_arguments_checker(corpus, default_articulators)
    extract_data = build_extractor(corpus, initial_sampling_rate)
    # Import the get_utterances_list dependency function.
    get_utterances_list = import_from_string(
        'ac2art.corpora.%s.raw._loaders' % corpus, 'get_utterances_list')

    # Define a function extracting features from all utterances.
    def extract_utterances_data(audio_forms=None,
                                n_coeff=13,
                                articulators_list=None,
                                ema_sampling_rate=100,
                                audio_frames_time=25):
        """Extract acoustic and articulatory data of each {0} utterance.

        audio_forms       : optional list of representations of the audio data
                            to produce, among {{'lsf', 'lpc', 'mfcc'}}
                            (list of str, default None implying all of them)
        n_coeff           : number of static coefficients to compute for each
                            representation of the audio data, either as a
                            single int or a list of int (default 13)
                            Note: dynamic features will be added to those.
        articulators_list : optional list of raw EMA data columns to keep
                            (default None, implying twelve, detailed below)
        ema_sampling_rate : sample rate of the EMA data to use, in Hz
                            (int, default 100)
        audio_frames_time : duration of the audio frames used to compute
                            acoustic features, in milliseconds
                            (int, default 25)

        Data extraction includes the following:
          - optional resampling of the EMA data
          - framing of audio data to align acoustic and articulatory records
          - production of various acoustic features based on the audio data
          - trimming of silences at the beginning and end of each utterance

        Note: 'mfcc' audio form will produce MFCC coefficients enriched
              with pitch features, computed using abkhazia. Alternative
              computation using `data.commons.loaders.Wav.get_mfcc()` may
              be obtained using the 'mfcc_' keyword instead.

        The produced data is stored to the '{0}_processed_folder' set in the
        json configuration file, where a subfolder is built for each kind of
        features (ema, mfcc, etc.). Each utterance is stored as a '.npy' file.
        The file names include the utterance's name, extended with an indicator
        of the kind of features it contains.

        {1}
        """
        nonlocal corpus, control_arguments, extract_data, get_utterances_list
        # Check arguments, assign default values and build output folders.
        audio_forms, n_coeff, articulators_list = control_arguments(
            audio_forms, n_coeff, articulators_list, ema_sampling_rate,
            audio_frames_time)
        # Compute mfcc coefficients using abkhazia, if relevant.
        abkhazia_mfcc = 'mfcc' in audio_forms
        if abkhazia_mfcc:
            mfcc_ix = audio_forms.index('mfcc')
            wav_to_mfcc(corpus,
                        n_coeff=n_coeff[mfcc_ix],
                        pitch=True,
                        frame_time=audio_frames_time,
                        hop_time=(1000 / ema_sampling_rate))
            del audio_forms[mfcc_ix]
            del n_coeff[mfcc_ix]
        # Iterate over all corpus utterances.
        for utterance in get_utterances_list():
            extract_data(utterance, audio_forms, n_coeff, abkhazia_mfcc,
                         articulators_list, ema_sampling_rate,
                         audio_frames_time)
            end_time = time.asctime().split(' ')[-2]
            print('%s : Done with utterance %s.' % (end_time, utterance))
            sys.stdout.write('\033[F')
        # Record the list of articulators.
        path = os.path.join(CONSTANTS['%s_processed_folder' % corpus], 'ema',
                            'articulators')
        with open(path, 'w', encoding='utf-8') as file:
            file.write('\n'.join(articulators_list))

    # Adjust the function's docstring and return it.
    extract_utterances_data.__doc__ = extract_utterances_data.__doc__.format(
        corpus, docstring_details)
    return extract_utterances_data
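A hypothetical usage sketch of the wrapper (corpus name, sampling rate and articulator names are made up):

extract_utterances_data = build_features_extraction_functions(
    corpus='mycorpus', initial_sampling_rate=200,
    default_articulators=['tt_x', 'tt_y'], docstring_details='')
# Extract MFCC and LSF features, plus EMA data resampled at 100 Hz.
extract_utterances_data(audio_forms=['mfcc', 'lsf'], n_coeff=[13, 20],
                        ema_sampling_rate=100, audio_frames_time=25)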
Example #4
def build_extractor(corpus, initial_sampling_rate):
    """Define and return a function to extract features from an utterance.

    corpus                : name of the corpus from which to import features
    initial_sampling_rate : initial sampling rate of the EMA data, in Hz (int)
    """
    # Load the output path and dependency data loading functions.
    new_folder = CONSTANTS['%s_processed_folder' % corpus]
    load_ema, load_phone_labels, load_voicing, load_wav = import_from_string(
        module='ac2art.corpora.%s.raw._loaders' % corpus,
        elements=['load_ema', 'load_phone_labels', 'load_voicing', 'load_wav'])

    def get_boundaries(utterance, sampling_rate):
        """Return frames index to use so as to trim edge silences."""
        nonlocal load_phone_labels
        # Load phone labels and gather edge silences' timecodes.
        labels = load_phone_labels(utterance)
        start_time = labels[0][0] if labels[0][1] == '#' else 0
        end_time = labels[-2][0] if labels[-1][1] == '#' else labels[-1][0]
        # Compute and return associated frame indexes.
        start_frame = int(np.floor(start_time * sampling_rate))
        end_frame = int(np.ceil(end_time * sampling_rate))
        return start_frame, end_frame

    def extract_ema(utterance, sampling_rate, articulators):
        """Extract and return the EMA data associated with an utterance."""
        nonlocal initial_sampling_rate, load_ema, new_folder
        # Load EMA data and interpolate NaN values using cubic splines.
        ema, _ = load_ema(utterance, articulators)
        ema = np.concatenate([
            interpolate_missing_values(data_column).reshape(-1, 1)
            for data_column in np.transpose(ema)
        ], axis=1)
        # Optionally resample the EMA data.
        if sampling_rate != initial_sampling_rate:
            ratio = sampling_rate / initial_sampling_rate
            ema = scipy.signal.resample(ema, num=int(len(ema) * ratio))
        # Return the EMA data.
        return ema

    def extract_audio(utterance, audio_forms, n_coeff, sampling_rate,
                      frames_time):
        """Generate and return speech features for an utterance."""
        # Wrapped function; pylint: disable=too-many-arguments
        nonlocal corpus, load_wav, new_folder
        hop_time = (1000 / sampling_rate)
        wav = load_wav(utterance, frames_time, hop_time)
        return {
            name: wav.get(name.strip('_'), n_feat, static_only=False)
            for name, n_feat in zip(audio_forms, n_coeff)
        }

    def extract_data(utterance, audio_forms, n_coeff, abkhazia_mfcc,
                     articulators, sampling_rate, frames_time):
        """Extract acoustic and articulatory data of a given utterance."""
        # Wrapped function; pylint: disable=too-many-arguments
        nonlocal load_wav, new_folder
        nonlocal extract_audio, extract_ema, get_boundaries
        # Generate or load all kinds of features for the utterance.
        data = extract_audio(utterance, audio_forms, n_coeff, sampling_rate,
                             frames_time)
        data['ema'] = extract_ema(utterance, sampling_rate, articulators)
        data['voicing'] = load_voicing(utterance, sampling_rate)
        if abkhazia_mfcc:
            path = os.path.join(new_folder, 'mfcc', utterance + '.npy')
            data['mfcc'] = np.load(path)
        # Fit the edge silences trimming values.
        start_frame, end_frame = get_boundaries(utterance, sampling_rate)
        original_end = end_frame
        for name, array in data.items():
            length = len(array)
            if length < start_frame:
                raise ValueError(
                    "Utterance '%s': '%s' features are shorter than the"
                    "expected start trimming zone." % (utterance, name))
            if length < original_end:
                print("Utterance '%s': '%s' features are shorter than expected"
                      "(%s vs %s).\nAll features will be trimmed to fit." %
                      (utterance, name, length, original_end))
                if length < end_frame:
                    end_frame = length
        # Trim and save all features sets to disk.
        for name, array in data.items():
            path = os.path.join(new_folder, name,
                                utterance + '_' + name + '.npy')
            np.save(path, array[start_frame:end_frame])

    # Return the last-defined function.
    return extract_data
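A toy illustration of the edge-silence trimming arithmetic implemented in `get_boundaries`, with hypothetical (end_time, phone) labels in which '#' marks silence:

import numpy as np

labels = [(0.25, '#'), (0.75, 'a'), (1.5, 'b'), (1.8, '#')]
sampling_rate = 100
start_time = labels[0][0] if labels[0][1] == '#' else 0              # 0.25
end_time = labels[-2][0] if labels[-1][1] == '#' else labels[-1][0]  # 1.5
start_frame = int(np.floor(start_time * sampling_rate))              # 25
end_frame = int(np.ceil(end_time * sampling_rate))                   # 150
# Feature arrays are then trimmed to array[25:150].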
Example #5
def build_file_loaders(corpus):
    """Define and return functions to load single-file data or parameters."""
    # Load dependency constants and function.
    data_folder = CONSTANTS['%s_processed_folder' % corpus]
    get_utterances_list = import_from_string(
        'ac2art.corpora.%s.raw._loaders' % corpus, 'get_utterances_list'
    )
    # Define the four loading functions.

    def get_norm_parameters(file_type, speaker=None):
        """Return normalization parameters for a type of {0} features.

        file_type : type of features whose parameters to return (str)
        speaker   : optional speaker whose parameters to return (str)
                    (otherwise, corpus-wide parameters are returned)
        """
        nonlocal data_folder
        path = _get_normfile_path(data_folder, file_type, speaker)
        return np.load(path).tolist()

    def get_utterances(set_name=None):
        """Get the list of utterances from a given set.

        set_name : name of the set, e.g. 'train', 'validation' or 'test'
        """
        nonlocal data_folder, get_utterances_list
        if set_name is None:
            return get_utterances_list()
        path = os.path.join(data_folder, 'filesets', set_name + '.txt')
        with open(path) as file:
            utterances = [row.strip('\n') for row in file]
        return utterances

    def load_acoustic(name, audio_type='mfcc_stds', context_window=0,
                      zero_padding=True):
        """Load the acoustic data associated with an utterance from {0}.

        name           : name of the utterance whose data to load (str)
        audio_type     : name of the audio features to use, including
                         normalization indications (str, default 'mfcc_stds')
        context_window : half-size of the context window of frames to return
                         (default 0, returning single audio frames)
        zero_padding   : whether to zero-pad the data when building context
                         frames (bool, default True)
        """
        nonlocal data_folder
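        # Split e.g. 'mfcc_stds' into audio_type='mfcc' and
        # norm_type='stds', so that data is read from the
        # 'mfcc_norm_stds' folder; a bare 'mfcc' reads from 'mfcc'.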
        audio_type, norm_type = (audio_type + '_').split('_', 1)
        folder = (
            audio_type + '_norm_' + norm_type.strip('_')
            if norm_type else audio_type
        )
        path = os.path.join(data_folder, folder, name + '_%s.npy' % audio_type)
        acoustic = np.load(path)
        if context_window:
            acoustic = (
                build_context_windows(acoustic, context_window, zero_padding)
            )
        return acoustic

    def load_ema(name, norm_type='', use_dynamic=True, articulators=None):
        """Load the articulatory data associated with an utterance from {0}.

        name         : name of the utterance whose data to load (str)
        norm_type    : optional type of normalization to use (str)
        use_dynamic  : whether to return dynamic features (bool, default True)
        articulators : optional list of articulators to load
        """
        nonlocal corpus, data_folder, get_norm_parameters
        # Load the EMA data with proper normalization.
        ema_folder = (
            'ema' if norm_type in ('', 'mean', 'mean_byspeaker')
            else 'ema_norm_' + norm_type
        )
        ema = np.load(os.path.join(data_folder, ema_folder, name + '_ema.npy'))
        if norm_type.startswith('mean'):
            speaker = None if norm_type == 'mean' else name.split('_', 1)[0]
            ema -= get_norm_parameters('ema', speaker)['global_means']
        # Optionally select articulatory data to keep.
        add_voicing = True
        if isinstance(articulators, list):
            if 'voicing' in articulators:
                articulators = [e for e in articulators if e != 'voicing']
            else:
                add_voicing = False
            articulators_list = load_articulators_list(corpus, norm_type)
            invalid = [
                art for art in articulators if art not in articulators_list
            ]
            if invalid:
                raise KeyError(
                    'Invalid articulator(s): %s.\nValid articulators are %s.'
                    % (invalid, articulators_list)
                )
            cols_index = [articulators_list.index(key) for key in articulators]
            ema = ema[:, cols_index]
        # Optionally add dynamic features.
        if use_dynamic:
            ema = add_dynamic_features(ema)
        # Optionally add binary voicing data.
        if add_voicing:
            voicing = np.load(
                os.path.join(data_folder, 'voicing', name + '_voicing.npy')
            )
            if use_dynamic:
                n_static = ema.shape[1] // 3
                ema = np.concatenate(
                    [ema[:, :n_static], voicing, ema[:, n_static:]], axis=1
                )
            else:
                ema = np.concatenate([ema, voicing], axis=1)
        # Return the articulatory data.
        return ema

    # Adjust the functions' docstrings and return them.
    functions = (get_norm_parameters, get_utterances, load_acoustic, load_ema)
    for function in functions:
        function.__doc__ = function.__doc__.format(corpus)
    return functions
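A hypothetical usage sketch of the returned loaders:

get_norm_parameters, get_utterances, load_acoustic, load_ema = (
    build_file_loaders('mycorpus'))
train_set = get_utterances('train')
# Load normalized MFCC frames with a 5-frame context window on each side.
acoustic = load_acoustic(train_set[0], audio_type='mfcc_stds',
                         context_window=5)
# Load mean-centered EMA data enriched with dynamic features.
ema = load_ema(train_set[0], norm_type='mean', use_dynamic=True)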
Example #6
def split_corpus_prototype(pct_train, limit, seed, corpus, lowest_limit,
                           same_speaker_data):
    """Split the {0} corpus, ensuring good triphones coverage of the sets.

    pct_train : percentage of observations used as training data; the
                rest will be divided equally between the validation
                and test sets (float between 0 and 1, default .7)
    limit     : minimum number of utterances a triphone must appear in
                so as to be taken into account (int, default {1})
    seed      : optional random seed to use

    Produce three lists of utterances, composing train, validation
    and test filesets. The filesets are built so that each triphone
    present in at least `limit` utterances appears at least once in
    each fileset. The filesets' length will also be made to match
    the `pct_train` argument.

    To achieve this, the split is conducted in two steps.
    * First, utterances are iteratively drawn in random order, and
    added to the fileset to which they add the most not-yet-covered
    triphones. This mechanically results in three filesets correctly
    covering the set of triphones (if not, the algorithm is restarted).
    * Then, utterances are randomly removed from the fileset(s) which
    prove too large compared to the desired split, under the condition
    that their removal does not break the triphones-coverage property.
    These utterances are then randomly re-assigned to the filesets
    which are too small.

    Note: due to the structure of the {0} utterances, using a `limit`
          parameter under {1} will generally fail.
    {2}
    The produced filesets are stored to the filesets/ subfolder of the
    processed {0} folder, in txt files named 'train', 'validation'
    and 'test'.
    """
    # Check arguments' validity
    check_type_validity(pct_train, float, 'pct_train')
    check_type_validity(limit, int, 'limit')
    if not 0 < pct_train < 1:
        raise ValueError('Invalid `pct_train` value: %s.' % pct_train)
    if limit < 3:
        raise ValueError('Minimum `limit` value is 3.')
    elif limit < lowest_limit:
        print('Warning: using such a low `limit` value is likely to fail.')
    # Build the filesets.
    np.random.seed(seed)
    indexer = build_triphones_indexer(limit, corpus, same_speaker_data)
    filesets = build_initial_split(indexer)
    filesets = adjust_filesets(filesets, pct_train, indexer)
    # In case of identical speaker data, generalize the split to all speakers.
    if same_speaker_data:
        get_utterances_list, speakers = import_from_string(
            module='ac2art.corpora.%s.raw._loaders' % corpus,
            elements=['get_utterances_list', 'SPEAKERS'])
        utterances = {
            speaker: get_utterances_list(speaker)
            for speaker in speakers
        }
        filesets = [[
            utterances[speaker][i] for speaker in speakers for i in fileset
        ] for fileset in filesets]
    # Write the produced filesets to txt files.
    filesets_dict = dict(zip(('train', 'validation', 'test'), filesets))
    store_filesets(filesets_dict, corpus)
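A sketch of how this prototype might be specialized for a given corpus; the corpus name and limit values are hypothetical:

import functools

split_corpus = functools.partial(
    split_corpus_prototype, corpus='mycorpus', lowest_limit=10,
    same_speaker_data=False)
# Use 70% of utterances for training, with a fixed seed for replicability.
split_corpus(pct_train=.7, limit=10, seed=42)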
Example #7
def build_normalization_functions(corpus):
    """Define and return corpus-specific data normalization functions.

    Return two functions, in the following order:
      - compute_moments
      - normalize_files
    """
    # Gather dataset-specific dependencies.
    main_folder = CONSTANTS['%s_processed_folder' % corpus]
    get_utterances_list, speakers = import_from_string(
        'ac2art.corpora.%s.raw._loaders' % corpus,
        ['get_utterances_list', 'SPEAKERS'])

    # Wrap the normalization parameters computing function.
    def compute_moments(file_type, by_speaker=False, store=True):
        """Compute files moments."""
        nonlocal speakers
        # Optionally compute speaker-wise normalization parameters.
        if by_speaker:
            return {
                speaker: _compute_moments(file_type, speaker, store,
                                          main_folder, get_utterances_list)
                for speaker in speakers
            }
        # Otherwise, compute corpus-wide parameters.
        return _compute_moments(file_type, None, store, main_folder,
                                get_utterances_list)

    # Wrap the files normalization function.
    def normalize_files(file_type, norm_type, scope='corpus'):
        """Normalize pre-extracted {0} data of a given type.

        Normalization includes de-meaning and division by either
        standard-deviation or the difference between the extremum
        points (distribution spread). Those parameters may either
        be computed file-wise, speaker-wise or corpus-wide.

        file_type  : one of {{'ema', 'energy', 'lpc', 'lsf', 'mfcc'}}
        norm_type  : normalization divisor to use ('spread' or 'stds')
        scope      : scope of the normalization parameters to use
                     ('corpus' for corpus-wide (default), 'speaker'
                     for speaker-wise and 'file' for file-wise)

        Normalized utterances are stored as .npy files in a
        properly-named folder.
        """
        nonlocal compute_moments, get_utterances_list, main_folder, speakers
        if scope == 'corpus':
            _corpus_wide_normalize(file_type, norm_type, None, main_folder,
                                   get_utterances_list, compute_moments)
        elif scope == 'speaker':
            for speaker in speakers:
                _corpus_wide_normalize(file_type, norm_type, speaker,
                                       main_folder, get_utterances_list,
                                       compute_moments)
        elif scope == 'file':
            _file_wise_normalize(file_type, norm_type, main_folder,
                                 get_utterances_list)
        else:
            raise ValueError(
                "'scope' should be one of {'corpus', 'speaker', 'file'}.")

    # Adjust the functions' docstrings and return them.
    compute_moments.__doc__ = _compute_moments.__doc__.format(corpus)
    normalize_files.__doc__ = normalize_files.__doc__.format(corpus)
    return compute_moments, normalize_files
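A hypothetical usage sketch of the returned functions:

compute_moments, normalize_files = build_normalization_functions('mycorpus')
# Compute and store speaker-wise moments of the EMA data, then use
# them to normalize it by speaker-wise standard deviation.
compute_moments('ema', by_speaker=True, store=True)
normalize_files('ema', norm_type='stds', scope='speaker')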
Example #8
def build_h5features_extractor(corpus):
    """Define and return a function extracting features to h5 files.

    Return a single function:
      - extract_h5_features
    """
    # Load dependency path and functions.
    abx_folder = os.path.join(CONSTANTS['%s_processed_folder' % corpus], 'abx')
    load_acoustic, load_ema, get_utterances = import_from_string(
        'ac2art.corpora.%s.load._load' % corpus,
        ['load_acoustic', 'load_ema', 'get_utterances'])

    # Define features extraction functions.

    def _setup_features_loader(audio_features, ema_features, inverter,
                               dynamic_ema, articulators):
        """Build a function to load features associated with an utterance.

        See `extract_h5_features` documentation for arguments.
        """
        nonlocal load_acoustic, load_ema
        # Check that provided arguments make sense.
        if audio_features is None and ema_features is None:
            raise RuntimeError('No features were set to be included.')
        if inverter is not None:
            check_type_validity(
                inverter, (NeuralNetwork, type(None)), 'inverter')
            if audio_features is None:
                raise RuntimeError(
                    'No acoustic features specified to feed the inverter.')
            elif ema_features is not None:
                raise RuntimeError(
                    'Both ema features and an inverter were specified.')
        # Build the acoustic features loading function.
        if audio_features is not None:
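            # Infer the context window's half-size from the inverter's
            # input dimension: a dimension divisible by 11 is taken to
            # mean 11 frames of context, i.e. a window of 5 on each side.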
            window = (0 if inverter is None or inverter.input_shape[-1] % 11
                      else 5)
            load_audio = functools.partial(load_acoustic,
                                           audio_type=audio_features,
                                           context_window=window)
            # Optionally build and return an inverter-based features loader.
            if inverter is not None:

                def invert_features(utterance):
                    """Return the features inverted from an utterance."""
                    pred = inverter.predict(load_audio(utterance))
                    return pred

                return invert_features
            if ema_features is None:
                return load_audio
        # Build the articulatory features loading function.
        if ema_features is not None:
            load_articulatory = functools.partial(load_ema,
                                                  norm_type=ema_features,
                                                  use_dynamic=dynamic_ema,
                                                  articulators=articulators)
            if audio_features is None:
                return load_articulatory
        # When appropriate, build a global features loading function.
        def load_features(utterance):
            """Load the features associated with an utterance."""
            return np.concatenate(
                [load_audio(utterance),
                 load_articulatory(utterance)], axis=1)

        return load_features

    def extract_h5_features(audio_features=None,
                            ema_features=None,
                            inverter=None,
                            output_name='%s_features' % corpus,
                            articulators=None,
                            dynamic_ema=True,
                            sampling_rate=100):
        """Build an h5 file recording audio features associated with {0} data.

        audio_features : optional name of audio features to use, including
                         normalization indications
        ema_features   : optional name of ema features' normalization to use
                         (use '' for raw data and None for no EMA data)
        inverter       : optional acoustic-articulatory inverter whose
                         predictions to use, based on the audio features
        output_name    : base name of the output file (default '{0}_features')
        articulators   : optional list of articulators to keep among EMA data
        dynamic_ema    : whether to include dynamic articulatory features
                         (bool, default True)
        sampling_rate  : sampling rate of the frames, in Hz (int, default 100)
        """
        # Arguments serve modularity; pylint: disable=too-many-arguments
        nonlocal abx_folder, get_utterances, _setup_features_loader
        # Build the abx folder, if necessary.
        if not os.path.isdir(abx_folder):
            os.makedirs(abx_folder)
        # Check that the destination file does not exist.
        output_file = os.path.join(abx_folder, '%s.features' % output_name)
        if os.path.isfile(output_file):
            raise FileExistsError("File '%s' already exists." % output_file)
        # Set up the features loading function.
        load_features = _setup_features_loader(audio_features, ema_features,
                                               inverter, dynamic_ema,
                                               articulators)
        # Load the list of utterances and process them iteratively.
        utterances = get_utterances()
        with h5f.Writer(output_file) as writer:
            for i in range(0, len(utterances), 100):
                # Load or compute utterances list, features and time labels.
                items = utterances[i:i + 100]
                features = [load_features(item) for item in items]
                labels = [
                    np.arange(len(data)) / sampling_rate for data in features
                ]
                # Write the currently processed utterances' data to h5.
                writer.write(h5f.Data(items, labels, features, check=True),
                             groupname='features',
                             append=True)

    # Adjust the features extraction function's docstring and return it.
    extract_h5_features.__doc__ = extract_h5_features.__doc__.format(corpus)
    return extract_h5_features
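A hypothetical usage sketch: extract normalized acoustic and articulatory features of a corpus to a single h5features file.

extract_h5_features = build_h5features_extractor('mycorpus')
extract_h5_features(audio_features='mfcc_stds', ema_features='stds',
                    output_name='mycorpus_mfcc_ema')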
Example #9
def build_abxpy_callers(corpus):
    """Define and return corpus-specific functions to run ABXpy tasks.

    Return four functions, in that order:
      - abx_from_features
      - make_abx_task
      - make_itemfile
      - load_abx_scores
    """
    # pylint: disable=too-many-statements
    # Load dependency path and function.
    abx_folder = os.path.join(CONSTANTS['%s_processed_folder' % corpus], 'abx')
    get_utterances = import_from_string(
        'ac2art.corpora.%s.load._load' % corpus, 'get_utterances')
    load_phone_labels = import_from_string(
        'ac2art.corpora.%s.raw._loaders' % corpus, 'load_phone_labels')

    # Define the functions.

    def _phones_to_itemfile(utterance, symbols):
        """Build a dict of item file rows for a given utterance."""
        nonlocal load_phone_labels
        phones = load_phone_labels(utterance)
        times = [round(time - phones[0][0], 3) for time, _ in phones[:-1]]
        phones = [symbols[phone] for _, phone in phones]
        return {
            '#file': [utterance] * (len(times) - 1),
            'onset': times[:-1],
            'offset': times[1:],
            '#phone': phones[1:-1],
            'context': [
                phones[i - 1] + '_' + phones[i + 1]
                for i in range(1, len(times))
            ],
            'speaker': utterance.split('_')[0]
        }

    def get_task_name(fileset, limit_phones):
        """Return the base name of an ABX task file based on parameters."""
        nonlocal corpus
        fileset = '' if fileset is None else fileset + '_'
        reduced = 'reduced_' * limit_phones
        return corpus + '_' + fileset + reduced

    def make_itemfile(fileset=None, limit_phones=False):
        """Build a .item file for ABXpy recording {0} phone labels.

        fileset      : optional set name whose utterances to use (str)
        limit_phones : whether to aggregate some phonemes, using
                       the 'common_reduced' column of the symbols
                       file as mapping (bool, default False)
        """
        nonlocal abx_folder, corpus, get_utterances, _phones_to_itemfile
        print('Creating item file...')
        # Establish the item file's location.
        output_file = get_task_name(fileset, limit_phones) + 'phones.item'
        output_file = os.path.join(abx_folder, output_file)
        # Write the item file's header.
        columns = ['#file', 'onset', 'offset', '#phone', 'context', 'speaker']
        with open(output_file, mode='w') as itemfile:
            itemfile.write(' '.join(columns) + '\n')
        # Load the mapping from corpus-specific to cross-corpus phone symbols.
        # note: non-ipa cross-corpus symbols are used because ABXpy
        #       (python 2) does not support non-ascii characters
        symbols = pd.read_csv(
            CONSTANTS['symbols_file'],
            index_col=corpus)['common' + '_reduced' * limit_phones].to_dict()
        # Iteratively add utterances phone labels to the item file.
        for utterance in get_utterances(fileset):
            items = pd.DataFrame(_phones_to_itemfile(utterance, symbols))
            items[columns].to_csv(output_file,
                                  index=False,
                                  header=False,
                                  sep=' ',
                                  mode='a',
                                  encoding='utf-8')
        print('Done creating %s file.' % output_file)

    def make_abx_task(fileset=None, byspeaker=True, limit_phones=False):
        """Build a .abx ABXpy task file associated with {0} phones.

        fileset      : optional set name whose utterances to use (str)
        byspeaker    : whether to discriminate pairs from the same
                       speaker only (bool, default True)
        limit_phones : whether to aggregate some phonemes, using
                       the 'common_reduced' column of the symbols
                       file as mapping (bool, default False)
        """
        nonlocal abx_folder, corpus, make_itemfile
        print('Creating task file...')
        # Build the item file if necessary.
        task_name = get_task_name(fileset, limit_phones)
        item_file = os.path.join(abx_folder, task_name + 'phones.item')
        if not os.path.isfile(item_file):
            make_itemfile(fileset, limit_phones)
        else:
            print('Using found %s file.' % item_file)
        # Establish the task file's path and the ABXpy task's 'on' argument.
        output_file = os.path.join(
            abx_folder, task_name + ('byspk_' * byspeaker) + 'task.abx')
        within = 'context speaker' if byspeaker else 'context'
        # Run the ABXpy task module.
        abxpy_task(item_file, output_file, on='phone', by=within)
        print('Done creating %s file.' % output_file)

    def abx_from_features(features,
                          fileset=None,
                          byspeaker=True,
                          limit_phones=False,
                          n_jobs=1):
        """Run the ABXpy pipeline on a set of pre-extracted {0} features.

        features     : name of a h5 file of {0} features created with
                       the `extract_h5_features` function (str)
        fileset      : optional name of a fileset whose utterances'
                       features to use (str)
        byspeaker    : whether to discriminate pairs from the same
                       speaker only (bool, default True)
        limit_phones : whether to aggregate some phonemes, using
                       the 'ipa_reduced' column of the {0} symbols
                       file as mapping (bool, default False)
        n_jobs       : number of CPU cores to use (positive int, default 1)
        """
        nonlocal abx_folder, corpus, make_abx_task
        check_type_validity(features, str, 'features')
        check_type_validity(fileset, (str, type(None)), 'fileset')
        check_positive_int(n_jobs, 'n_jobs')
        # Declare the path to the task file.
        task_name = get_task_name(fileset, limit_phones)
        task_name += 'byspk_' * byspeaker
        task_file = os.path.join(abx_folder, task_name + 'task.abx')
        # Declare paths to the input features and output scores files.
        features_file = os.path.join(abx_folder, features + '.features')
        scores_file = features + '_' + task_name.split('_', 1)[1] + 'abx.csv'
        scores_file = os.path.join(abx_folder, scores_file)
        # Check that the features file exists.
        if not os.path.exists(features_file):
            raise FileNotFoundError("No such file: '%s'." % features_file)
        # Build the ABX task file if necessary.
        if not os.path.isfile(task_file):
            make_abx_task(fileset, byspeaker, limit_phones)
        else:
            print('Using found %s file.' % task_file)
        # Run the ABXpy pipeline.
        abxpy_pipeline(features_file, task_file, scores_file, n_jobs)
        # Replace phone symbols with IPA ones in the scores file.
        add_ipa_symbols(scores_file)

    def load_abx_scores(filename):
        """Load, aggregate and return some pre-computed abx scores."""
        nonlocal abx_folder, corpus
        # Load the ABX scores.
        path = os.path.join(abx_folder, filename + '_abx.csv')
        data = pd.read_csv(path)
        # Collapse the scores (i.e. forget about contexts and speakers).
        data['score'] *= data['n']
        data['phones'] = data.apply(
            lambda row: '_'.join(sorted([row['phone_1'], row['phone_2']])),
            axis=1)
        scores = data.groupby('phones')[['score', 'n']].sum()
        scores['score'] /= scores['n']
        # Return the properly-formatted scores.
        return scores

    # Adjust functions' docstrings and return them.
    functions = (abx_from_features, make_abx_task, make_itemfile,
                 load_abx_scores)
    for function in functions:
        function.__doc__ = function.__doc__.format(corpus)
    return functions
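A hypothetical end-to-end usage sketch of the returned callers, reusing the features file from the previous example:

abx_from_features, make_abx_task, make_itemfile, load_abx_scores = (
    build_abxpy_callers('mycorpus'))
# Run the full ABXpy pipeline on the test fileset, within-speaker.
abx_from_features('mycorpus_mfcc_ema', fileset='test', byspeaker=True)
# Load the aggregated discrimination scores as a pandas DataFrame.
scores = load_abx_scores('mycorpus_mfcc_ema_test_byspk')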