Beispiel #1
0
def find_longest_sequence(data_dir, speakers_list, basenames_list):
    """Return the frame count of the longest sequence among all speakers

    For every (speaker, basename) pair the corresponding `lf0` file is
    parsed and the size of its first dimension is taken as the number of
    frames of that sequence.

    # Arguments
        data_dir: directory of data to be used in the datatable.
        speakers_list: list of speakers to be used
        basenames_list: list of filenames to be used

    # Returns
        An integer with the number of frames of the longest sequence
        (0 when no file is visited)"""

    # Seed with 0 so that empty speaker/basename lists yield 0
    frame_counts = [0]

    for speaker in speakers_list:
        for basename in basenames_list:
            lf0_path = path_join(data_dir, 'vocoded_s2s', speaker,
                                 basename + '.lf0.dat')
            frame_counts.append(parse_file(1, lf0_path).shape[0])

    return max(frame_counts)
Beispiel #2
0
def find_longest_sequence(data_dir, speakers_list, basenames_list):
    """Find the number of speech frames from the longest sequence
    among all speakers

    # Arguments
        data_dir: directory of data to be used in the datatable.
                  This path must end in a '/'. If it does not, a warning
                  is printed and processing continues with the path as
                  given.
        speakers_list: list of speakers to be used
        basenames_list: list of filenames to be used

    # Returns
        An integer with the number of frames of the longest sequence
        (0 when no file is visited)"""

    # Check that data_dir ends in a '/'.
    # `endswith` is used instead of indexing the last character so that an
    # empty string only triggers the warning instead of an IndexError, and
    # the check is not silently removed when Python runs with -O (as an
    # `assert` would be).
    if not data_dir.endswith('/'):
        print("Please, make sure the data directory string ends with a '/'")

    longest_sequence = 0

    for speaker in speakers_list:
        for basename in basenames_list:
            params = parse_file(
                1, data_dir + 'vocoded_s2s/' + speaker + '/' + basename + '.' +
                'lf0' + '.dat')

            # The first dimension of the parsed array is the frame count
            longest_sequence = max(longest_sequence, params.shape[0])

    return longest_sequence
Beispiel #3
0
def seq2seq_build_file_table(source_dir, src_index, target_dir, trg_index,
                             basename, longest_seq):
    """Build a datatable from the vocoded parameters of a sequence
    from a source-target pair of speakers

    # Arguments
        source_dir: directory path to the source files
        src_index: index (0-9) of the source speaker in the speakers list
        target_dir: directory path to the target files
        trg_index: index (0-9) of the target speaker in the speakers list
        basename: name without extension of the file's params to be prepared
        longest_seq: number of frames of the longest sequence in the database

        All directory paths must end in '/'. If one does not, a warning is
        printed and processing continues with the path as given.

    # Returns
        A tuple `(source_params, source_mask, target_params, target_mask)`:

        - Zero-padded (by frames) source and target datatables
        - Source and target mask vectors indicating which frames are padded (0)
          and which of them are original from the data (1). The source mask
          has its zeros first, the target mask has its zeros last.

        The mask vectors are to be used in Keras' fit method

    # Raises
        AssertionError: if the `vf` and `lf0` parameters of a speaker have
            different shapes, or if the two masks end up with different
            shapes."""

    # Check that source_dir and target_dir end in a '/'.
    # `endswith` also copes with an empty string (indexing [-1] would raise
    # IndexError) and is not stripped under `python -O` like an assert.
    if not source_dir.endswith('/'):
        print(
            "Please make sure the source data directory string ends with a '/'"
        )

    if not target_dir.endswith('/'):
        print(
            "Please make sure the target data directory string ends with a '/'"
        )

    # Parse parameter files
    source_mcp = parse_file(40, source_dir + basename + '.' + 'mcp' + '.dat')

    source_f0 = parse_file(1, source_dir + basename + '.' + 'lf0' + '.dat')
    source_f0_i = parse_file(1, source_dir + basename + '.' + 'lf0' +
                             '.i.dat')  # Interpolated data

    source_vf = parse_file(1, source_dir + basename + '.' + 'vf' + '.dat')
    source_vf_i = parse_file(1, source_dir + basename + '.' + 'vf' +
                             '.i.dat')  # Use interpolated data

    target_mcp = parse_file(40, target_dir + basename + '.' + 'mcp' + '.dat')

    target_f0 = parse_file(1, target_dir + basename + '.' + 'lf0' + '.dat')
    target_f0_i = parse_file(1, target_dir + basename + '.' + 'lf0' +
                             '.i.dat')  # Use interpolated data

    target_vf = parse_file(1, target_dir + basename + '.' + 'vf' + '.dat')
    target_vf_i = parse_file(1, target_dir + basename + '.' + 'vf' +
                             '.i.dat')  # Use interpolated data

    # Build voiced/unvoiced flag arrays
    # The flags are:
    #   1 -> voiced
    #   0 -> unvoiced
    # The non-interpolated `vf` values are used here; the flag is the
    # complement of kronecker_delta applied to each frame.
    assert source_vf.shape == source_f0.shape
    source_voiced = np.empty(source_vf.shape)
    for index, vf in enumerate(source_vf):
        source_voiced[index] = 1 - kronecker_delta(vf)

    assert target_vf.shape == target_f0.shape
    target_voiced = np.empty(target_vf.shape)
    for index, vf in enumerate(target_vf):
        target_voiced[index] = 1 - kronecker_delta(vf)

    # Initialize End-Of-Sequence flag (only the last frame is flagged)
    src_eos_flag = np.zeros(source_vf.shape)
    src_eos_flag[-1, :] = 1

    trg_eos_flag = np.zeros(target_vf.shape)
    trg_eos_flag[-1, :] = 1

    # Initialize one-hot-encoded speaker indexes (10 classes, one row per
    # frame of the corresponding sequence)
    src_spk_index = to_categorical(src_index * np.ones(
        (source_vf.shape[0], ), dtype=int),
                                   nb_classes=10)
    trg_spk_index = to_categorical(trg_index * np.ones(
        (target_vf.shape[0], ), dtype=int),
                                   nb_classes=10)

    # Initialize padding masks, to be passed into keras' fit
    # Source mask: padding zeros come first, valid frames last
    source_mask = np.concatenate((np.zeros(
        (longest_seq - source_mcp.shape[0], 1)),
                                  np.ones((source_mcp.shape[0], 1))))

    # Target mask: valid frames come first, padding zeros last
    target_mask = np.concatenate((np.ones((target_mcp.shape[0], 1)),
                                  np.zeros(
                                      (longest_seq - target_mcp.shape[0], 1))))

    assert source_mask.shape == target_mask.shape

    # Concatenate zero-padded source and target params
    # Source rows carry both speaker encodings; target rows carry none.
    source_params = np.concatenate(
        (zero_pad_params(longest_seq, 'src', source_mcp),
         zero_pad_params(longest_seq, 'src', source_f0_i),
         zero_pad_params(longest_seq, 'src', source_vf_i),
         zero_pad_params(longest_seq, 'src', source_voiced),
         zero_pad_params(longest_seq, 'src', src_eos_flag),
         zero_pad_params(longest_seq, 'src', src_spk_index),
         zero_pad_params(longest_seq, 'src', trg_spk_index)),
        axis=1)

    target_params = np.concatenate(
        (zero_pad_params(longest_seq, 'trg', target_mcp),
         zero_pad_params(longest_seq, 'trg', target_f0_i),
         zero_pad_params(longest_seq, 'trg', target_vf_i),
         zero_pad_params(longest_seq, 'trg', target_voiced),
         zero_pad_params(longest_seq, 'trg', trg_eos_flag)),
        axis=1)

    return source_params, source_mask, target_params, target_mask
Beispiel #4
0
import numpy as np

from tfglib.construct_table import parse_file

# Define usage constant
# (the trailing space keeps '[filename]' and '[number of parameters]' apart
# once the two literals are concatenated)
usage = 'Usage: normalize.py [file_directory] [filename] ' + \
        '[number of parameters]'

# NOTE(review): `argv` is not imported in the visible part of this script —
# presumably `from sys import argv` precedes it; confirm.
if len(argv) == 1 or \
        (len(argv) == 2 and argv[1] in ('-h', '--help')):
    print(usage + "\n\nThe file directory must end in a '/'.")

elif len(argv) == 4:
    # Parse input file (directory and filename are concatenated as-is)
    data = parse_file(int(argv[3]), argv[1] + argv[2])

    # Normalize data: zero mean and unit standard deviation per column
    data_mean = np.mean(data, axis=0)
    data_std = np.std(data, axis=0)

    # NOTE(review): a constant column has a standard deviation of 0 and
    # will produce nan/inf values here — confirm whether that can happen.
    normalized_data = (data - data_mean) / data_std

    # Output normalized data to file
    # NOTE(review): the first column is left out of the output — confirm
    # this is intended.
    np.savetxt(argv[1] + argv[2] + '.norm',
               normalized_data[:, 1:normalized_data.shape[1]],
               fmt='%.18f',
               delimiter='\t')

else:
    # The usage lists three arguments, so the message asks for three
    exit('Please, input three arguments as indicated in the usage.\n\n' +
         usage)
Beispiel #5
0
# Created by albert aparicio on 14/11/16
# coding: utf-8

# This script takes a .frames.txt file from SPTK's dtw output and converts
# it to time frames (5 ms frames)
#
# This script expects the path to the .frames.txt file to be passed as an
# argument

import os
from sys import argv

import numpy as np

from tfglib.construct_table import parse_file

# Check that a file parameter has been passed.
# An explicit check is used because `assert` statements are removed when
# Python runs with -O, which would let the script fail later with an
# unrelated IndexError.
if len(argv) != 2:
    raise SystemExit(
        'Please pass the path to the .frames.txt file as the only argument.')

# Parse frames and convert them into time (each frame lasts 5 ms)
frames = 0.005 * parse_file(2, argv[1])

# Split only the file name at each '.', not the whole path: a dot in the
# directory part (e.g. './file.frames.txt') must not cut the output path.
frames_dir, frames_name = os.path.split(argv[1])
sp = frames_name.split('.')

# Output the resulting frames next to the input file
np.savetxt(os.path.join(frames_dir, sp[0] + '.frames.dtw'),
           frames,
           fmt='%.3f',
           delimiter='\t',
           header='REF\tTST\n----\t----')
Beispiel #6
0
    def seq2seq_build_file_table(self, source_dir, src_index, target_dir,
                                 trg_index, basename):
        """Build a datatable from the vocoded parameters of a sequence
        from a source-target pair of speakers

        The sequence length used for padding/splitting is read from
        `self.max_seq_length`. When `self.shortseq` is truthy the parameters
        are split into overlapping sub-sequences with `sliding_window`;
        otherwise the whole sequence is zero-padded to
        `self.max_seq_length` frames.

        # Arguments
            source_dir: directory path to the source files
            src_index: index (0-9) of the source speaker in the speakers list
            target_dir: directory path to the target files
            trg_index: index (0-9) of the target speaker in the speakers list
            basename: name without extension of the file's params to be
                prepared

        # Returns
            A tuple `(source_params, source_mask, source_seq_len,
            target_params, target_mask, target_seq_len)`:

            - Zero-padded source and target datatables
            - Source and target mask vectors indicating which frames are
              padded (0) and which of them are original from the data (1)
            - Source and target sequence lengths as returned by
              `sliding_window`, or `None` for both when `self.shortseq`
              is falsy

            The mask vectors are to be used in Keras' fit method

        # Raises
            AssertionError: if the `vf` and `lf0` parameters of a speaker
                have different shapes, or (full-sequence mode) if the two
                masks end up with different shapes."""

        # Parse parameter files
        # Each extension maps to the parameter length handed to parse_file
        settings_dict = {
            'source': {
                'dir': source_dir,
                'params': {
                    'mcp': 40,
                    'lf0': 1,
                    'lf0.i': 1,
                    'vf': 1,
                    'vf.i': 1
                }
            },
            'target': {
                'dir': target_dir,
                'params': {
                    'mcp': 40,
                    'lf0': 1,
                    'lf0.i': 1,
                    'vf': 1,
                    'vf.i': 1
                },
            }
        }

        params_dict = {}

        for src_trg_key, src_trg_dict in settings_dict.items():
            params_dict[src_trg_key] = {}

            for extension, param_len in src_trg_dict['params'].items():
                params_dict[src_trg_key][extension] = parse_file(
                    param_len,
                    path_join(src_trg_dict['dir'],
                              basename + '.' + extension + '.dat'))

        # Build voiced/unvoiced flag arrays
        # The flags are:
        #   1 -> voiced
        #   0 -> unvoiced
        # Source is processed before target, as both need the same steps.
        for origin in ('source', 'target'):
            vf_params = params_dict[origin]['vf']
            assert vf_params.shape == params_dict[origin]['lf0'].shape

            params_dict[origin]['uv'] = np.empty(vf_params.shape,
                                                 dtype=np.uint8)

            for index, vf in enumerate(vf_params):
                params_dict[origin]['uv'][index] = int(
                    1 - kronecker_delta(vf))

        if self.shortseq:

            split_params = {}
            # - Split parameter vectors into chunks of size
            #   self.max_seq_length, with a superposition of
            #   self.max_seq_length/2
            # - The last sub-sequence is padded with zeros
            # - Masks contain True for valid values, and False for padded
            #   values
            # Note that the 'uv' arrays built above are split as well.
            for origin, param_types in params_dict.items():
                split_params[origin] = {}

                for param_type, parameters in param_types.items():
                    split_params[origin][param_type] = {}

                    (
                        split_params[origin][param_type]['params'],
                        split_params[origin][param_type]['mask'],
                        split_params[origin][param_type]['seq_len']
                    ) = sliding_window(
                        parameters,
                        self.max_seq_length,
                        mode=origin,
                        # Hardcode padding mode at the left of the chunk
                        # parameters, self.max_seq_length, mode='source',
                        step=int(self.max_seq_length / 2))

            # Initialize an EOS flag vector for each sub-sequence
            # (index -1 along the second axis is flagged in every chunk)
            split_params['source']['eos'] = np.zeros(
                split_params['source']['vf']['params'].shape, dtype=np.uint8)
            split_params['source']['eos'][:, -1] = 1

            split_params['target']['eos'] = np.zeros(
                split_params['target']['vf']['params'].shape, dtype=np.uint8)
            split_params['target']['eos'][:, -1] = 1

            # Assign a speaker index to each sub-sequence.
            # The builtin `int` is used as dtype: the `np.int` alias was
            # deprecated in NumPy 1.20 and removed in 1.24.
            # Unlike the full-sequence branch below, the indexes are stored
            # as plain integers here, not one-hot encoded.
            split_params['source']['src_spk'] = src_index * np.ones(
                split_params['source']['eos'].shape, dtype=int)
            split_params['source']['trg_spk'] = trg_index * np.ones(
                split_params['source']['eos'].shape, dtype=int)

            source_params = np.concatenate((
                split_params['source']['mcp']['params'],
                split_params['source']['lf0.i']['params'],
                split_params['source']['vf.i']['params'],
                split_params['source']['uv']['params'],
                split_params['source']['eos'],
                split_params['source']['src_spk'],
                split_params['source']['trg_spk'],
            ),
                                           axis=2)

            # Mask and lengths are taken from the 'vf.i' split
            source_mask = split_params['source']['vf.i']['mask']

            source_seq_len = split_params['source']['vf.i']['seq_len']

            target_params = np.concatenate(
                (split_params['target']['mcp']['params'],
                 split_params['target']['lf0.i']['params'],
                 split_params['target']['vf.i']['params'],
                 split_params['target']['uv']['params'],
                 split_params['target']['eos']),
                axis=2)

            target_mask = split_params['target']['vf.i']['mask']

            target_seq_len = split_params['target']['vf.i']['seq_len']

        else:

            # Initialize End-Of-Sequence flag (only the last frame is set)
            src_eos_flag = np.zeros(params_dict['source']['vf'].shape)
            src_eos_flag[-1, :] = 1

            trg_eos_flag = np.zeros(params_dict['target']['vf'].shape)
            trg_eos_flag[-1, :] = 1

            # Initialize one-hot-encoded speaker indexes (10 classes)
            src_spk_index = to_categorical(
                src_index * np.ones(
                    (params_dict['source']['vf'].shape[0], ), dtype=int), 10)
            trg_spk_index = to_categorical(
                trg_index * np.ones(
                    (params_dict['target']['vf'].shape[0], ), dtype=int), 10)

            # Initialize padding masks, to be passed into keras' fit
            # Source mask: padding zeros first, valid frames last
            source_mask = np.concatenate((np.zeros(
                (self.max_seq_length - params_dict['source']['mcp'].shape[0],
                 1)), np.ones((params_dict['source']['mcp'].shape[0], 1))))

            # Target mask: valid frames first, padding zeros last
            target_mask = np.concatenate(
                (np.ones((params_dict['target']['mcp'].shape[0], 1)),
                 np.zeros((self.max_seq_length -
                           params_dict['target']['mcp'].shape[0], 1))))

            assert source_mask.shape == target_mask.shape

            # Concatenate zero-padded source and target params
            source_params = np.concatenate(
                (zero_pad_params(self.max_seq_length, 'src',
                                 params_dict['source']['mcp']),
                 zero_pad_params(self.max_seq_length, 'src',
                                 params_dict['source']['lf0.i']),
                 zero_pad_params(self.max_seq_length, 'src',
                                 params_dict['source']['vf.i']),
                 zero_pad_params(self.max_seq_length, 'src',
                                 params_dict['source']['uv']),
                 zero_pad_params(self.max_seq_length, 'src', src_eos_flag),
                 zero_pad_params(self.max_seq_length, 'src', src_spk_index),
                 zero_pad_params(self.max_seq_length, 'src', trg_spk_index)),
                axis=1)

            target_params = np.concatenate(
                (zero_pad_params(self.max_seq_length, 'trg',
                                 params_dict['target']['mcp']),
                 zero_pad_params(self.max_seq_length, 'trg',
                                 params_dict['target']['lf0.i']),
                 zero_pad_params(self.max_seq_length, 'trg',
                                 params_dict['target']['vf.i']),
                 zero_pad_params(self.max_seq_length, 'trg',
                                 params_dict['target']['uv']),
                 zero_pad_params(self.max_seq_length, 'trg', trg_eos_flag)),
                axis=1)

            # No per-chunk lengths exist when the sequence is not split
            source_seq_len = target_seq_len = None

        # TODO Return basename with sequence(s)
        return (source_params, source_mask, source_seq_len, target_params,
                target_mask, target_seq_len)
Beispiel #7
0
# Load basenames #
##################
# The context manager closes the list file as soon as it has been read
with open('data/test/basenames.list', 'r') as basenames_file:
    basenames_lines = basenames_file.readlines()

# Strip '\n' characters
basenames = [line.split('\n')[0] for line in basenames_lines]

###################
# Loop over files #
###################
for basename in basenames:
    ###################
    # Load parameters #
    ###################
    # All parameter files of a sequence share the same path prefix
    file_prefix = 'data/test/vocoded/SF1/' + basename

    mcp_params = parse_file(40, file_prefix + '.mcp.dat')
    lf0_params = parse_file(1, file_prefix + '.lf0.i.dat')
    mvf_params = parse_file(1, file_prefix + '.vf.i.dat')

    # Compute U/V flags: one flag per frame, derived from the interpolated
    # voicing-frequency values
    assert mvf_params.shape == lf0_params.shape
    uv_flags = np.empty(mvf_params.shape)
    for index in range(len(uv_flags)):
        uv_flags[index] = 1 - utils.kronecker_delta(mvf_params[index])

    # Prepare data for prediction: normalize with the source speaker
    # statistics, then reshape for the LSTM input
    normalized_mcp = (mcp_params - src_mcp_mean) / src_mcp_std
    mcp_params = utils.reshape_lstm(normalized_mcp, mcp_tsteps, mcp_data_dim)
def pretrain_save_data_parameters(
    data_dir,
    speakers_file='speakers.list',
    params_file='pretrain_params.h5',
):
    """Collect per-speaker statistics of the pretraining data and save them

    Every directory listed in the speakers file is walked recursively. For
    each basename found, the `.lf0_log`, `.cc` and `.i.fv` files are parsed
    and used to update the per-speaker maximum and minimum of each of the
    42 concatenated parameter columns, as well as the length of the longest
    sequence. The results are written to an HDF5 file in `data_dir`.

    # Arguments
        data_dir: directory holding the speakers file and one
            sub-directory per speaker
        speakers_file: name of the file (inside `data_dir`) listing one
            speaker directory per line
        params_file: name of the HDF5 file (inside `data_dir`) to be written

    # Returns
        A tuple `(longest_sequence, spk_max, spk_min, files_list)`:

        - longest_sequence: frame count of the longest sequence, scaled by
          1.7 and floored to an integer. The value stored in the HDF5 file
          is the unscaled one.
        - spk_max, spk_min: arrays with one row per speaker and 42 columns
        - files_list: extension-less paths of all processed files"""
    # Save processing start time
    start_time = time()

    print('Starting')

    longest_sequence = 0
    files_list = []

    # The number of speakers is the number of sub-directories of data_dir
    num_spk = len([entry for entry in scandir(data_dir) if entry.is_dir()])

    # NOTE(review): the maximum starts at 0, so a column whose values are
    # all negative would report 0 as its maximum — confirm this is intended.
    spk_max = np.zeros((num_spk, 42))
    spk_min = 1e+50 * np.ones((num_spk, 42))

    # Read the speakers list; the context manager closes the file handle
    with open(os.path.join(data_dir, speakers_file), 'r') as speakers_fh:
        speakers = speakers_fh.readlines()
    # Strip '\n' characters
    dirs = [line.split('\n')[0] for line in speakers]

    print("Processing speakers' data")
    for spk_index, a_dir in enumerate(dirs):
        for sub_root, _, sub_files in os.walk(os.path.join(data_dir, a_dir)):
            # Get basenames of files in directory. Several files share one
            # basename (one per extension), hence the de-duplicating set.
            basenames = list({
                os.path.join(sub_root, file_name.split('.')[0])
                for file_name in sub_files
            })

            files_list += basenames

            for basename in basenames:
                print('Processing ' + basename)

                lf0_params = parse_file(1, basename + '.lf0_log')

                # The first dimension is the number of frames
                longest_sequence = max(longest_sequence, lf0_params.shape[0])

                mcp_params = parse_file(40, basename + '.cc')

                mvf_params = parse_file(1, basename + '.i.fv')

                seq_params = np.concatenate(
                    (mcp_params, lf0_params, mvf_params), axis=1)

                # Compute maximum and minimum values
                spk_max[spk_index, :] = np.maximum(
                    spk_max[spk_index, :], np.ma.max(seq_params, axis=0))
                spk_min[spk_index, :] = np.minimum(
                    spk_min[spk_index, :], np.ma.min(seq_params, axis=0))

    print('Saving data to .h5 file')

    # The `with` block closes the file on exit; no explicit close is needed
    with File(os.path.join(data_dir, params_file), 'w') as f:
        # Save longest_sequence and the max and min values as attributes
        f.attrs.create('longest_sequence', longest_sequence, dtype=int)
        f.attrs.create('speakers_max', spk_max)
        f.attrs.create('speakers_min', spk_min)

        # TODO Support Python 2
        # sys.version_info -> Get running Python version
        dt = special_dtype(vlen=str)

        utf_list = [
            n.encode(encoding="utf-8", errors="ignore") for n in files_list
        ]
        f.create_dataset(name='files_list',
                         shape=(len(utf_list), 1),
                         data=utf_list,
                         dtype=dt)

    print('Elapsed time: ' + display_time(time() - start_time))
    # Only the returned value is scaled; the stored attribute is not
    longest_sequence = int(np.floor(longest_sequence * 1.7))

    return longest_sequence, spk_max, spk_min, files_list
def prepare_pretrain_slice(files_list,
                           params_path,
                           longest_sequence,
                           spk_max,
                           spk_min,
                           speakers_file='speakers.list',
                           dtw_prob_file='dtw_probabilities.h5',
                           basename_len=11,
                           shuffle_files=True,
                           replicate=True):
    """Endlessly generate pretraining data, one sequence at a time

    All parameter files are parsed once up front and kept in memory. The
    generator then loops forever over the files (optionally reshuffled
    before each pass), max-min normalizes each sequence with the statistics
    of its speaker and yields the padded (or frame-replicated) data.

    # Arguments
        files_list: extension-less paths of the files to be used
        params_path: directory holding the speakers file and the DTW
            probabilities file
        longest_sequence: number of frames every yielded sequence is
            padded to
        spk_max: per-speaker maximum values, indexed by speaker
        spk_min: per-speaker minimum values, indexed by speaker
        speakers_file: name of the file listing one speaker per line
        dtw_prob_file: name of the HDF5 file with the 'values' and
            'probabilities' datasets handed to `replicate_frames`
        basename_len: length of the file name at the end of each entry of
            `files_list`; the two characters located `basename_len + 3` to
            `basename_len + 1` positions from the end of the entry are
            taken as the speaker name
        shuffle_files: shuffle the file order before each pass
        replicate: if true, build the sequences with `replicate_frames`;
            otherwise zero-pad them at the end

    # Yields
        A tuple with, in this order: columns 0-43 of the source data,
        column 44 and column 45 of the source data (flattened), the
        feedback data (target data delayed by one frame, first frame
        zeroed), columns 0-41 of the target data, columns 42-43 of the
        target data, and the target mask.

    # Raises
        ValueError: if the speaker name extracted from a basename is not
            in the speakers list"""
    # Read the speakers list; the context manager closes the file handle
    with open(os.path.join(params_path, speakers_file), 'r') as speakers_fh:
        # Strip '\n' characters
        speakers = [line.split('\n')[0] for line in speakers_fh.readlines()]

    # The `with` block closes the file on exit; no explicit close is needed
    with File(os.path.join(params_path, dtw_prob_file), 'r') as f:
        # Save numbers and probabilities
        values = f['values'][:]
        probabilities = f['probabilities'][:]

    # Read all files to have them loaded in memory
    mcp_params = []
    lf0_params = []
    mvf_params = []
    uv_flags = []

    for basename in files_list:
        mcp_params.append(parse_file(40, basename + '.cc'))
        lf0_params.append(parse_file(1, basename + '.lf0_log'))
        mvf_params.append(parse_file(1, basename + '.i.fv'))
        uv_flags.append(parse_file(1, basename + '.lf0_log.uv_mask'))

    # Initialize file indexes
    indexes = np.arange(len(files_list))

    while True:
        if shuffle_files:
            # Shuffle file indexes before each epoch
            np.random.shuffle(indexes)

        # Iterate over shuffled files
        for file_index in indexes:
            # Compute speaker index
            basename = files_list[file_index]

            spk_index = speakers.index(
                str(basename[-1 * (basename_len + 3):-1 * (basename_len + 1)]))

            # Get max and min values for each speaker
            src_spk_max = spk_max[spk_index, :]
            src_spk_min = spk_min[spk_index, :]

            # Maxmin normalize
            src_normalized = (np.concatenate(
                (mcp_params[file_index], lf0_params[file_index],
                 mvf_params[file_index]),
                axis=1) - src_spk_min) / (src_spk_max - src_spk_min)

            # Repeat the speaker index once per frame (plain integers, the
            # index is not one-hot encoded here)
            spk_index_vector = np.repeat(spk_index,
                                         lf0_params[file_index].shape[0],
                                         axis=0)

            # Construct End-Of-Sequence flags
            eos_flags = np.zeros(lf0_params[file_index].shape[0])
            eos_flags[-1] = 1

            # Construct sequence "slice". The speaker index appears twice:
            # source and target speaker are the same during pretraining.
            seq_params = np.concatenate((
                src_normalized,
                uv_flags[file_index],
                np.reshape(eos_flags, (-1, 1)),
                np.reshape(spk_index_vector, (-1, 1)),
                np.reshape(spk_index_vector, (-1, 1)),
            ),
                                        axis=1)

            if replicate:
                # Replicate frames with dtw probabilities
                # TODO Change the function so it takes seq_params as separate args
                (src_res, trg_res, _,
                 trg_mask) = replicate_frames(seq_params, longest_sequence,
                                              values, probabilities)
            else:
                # Zero-pad at the end up to longest_sequence frames; the
                # target keeps only the first 44 columns
                src_res = np.concatenate(
                    (seq_params,
                     np.zeros((longest_sequence - seq_params.shape[0],
                               seq_params.shape[1]))))
                trg_res = np.concatenate(
                    (seq_params[:, 0:44],
                     np.zeros((longest_sequence - seq_params.shape[0], 44))))
                trg_mask = np.concatenate(
                    (np.ones((seq_params.shape[0], 1)),
                     np.zeros((longest_sequence - seq_params.shape[0], 1))))

            # Prepare feedback data: the target delayed by one frame, with
            # the wrapped-around first frame zeroed
            feedback_data = np.roll(trg_res, 1, axis=0)
            feedback_data[0, :] = 0

            # Return slice frames
            # print('Sliced ' + basename)
            yield (src_res[:, 0:44], src_res[:, 44:45].reshape(
                (-1)), src_res[:, 45:46].reshape((-1)), feedback_data,
                   trg_res[:, 0:42], trg_res[:, 42:44], trg_mask)
        # NOTE(review): this is a fragment of a larger loop; `src_spk`,
        # `trg_spk`, `src_spk_ind`, `trg_spk_ind`, `basenames`,
        # `trg_test_datatable`, `fpr`, `tpr` and `roc_auc` are defined
        # outside the visible code.
        # Create the per-speaker-pair containers, keyed later by basename
        fpr[src_spk][trg_spk] = dict()
        tpr[src_spk][trg_spk] = dict()
        roc_auc[src_spk][trg_spk] = dict()

        # for i in range(src_test_datatable.shape[0]):
        for i in range(len(basenames)):
            # TODO Consider plotting an averaged ROC for each spk combination
            print(src_spk + '->' + trg_spk + ' ' + basenames[i])
            # TODO figure out if this is necessary
            # fpr[src_spk][trg_spk][basenames[i]] = dict()
            # tpr[src_spk][trg_spk][basenames[i]] = dict()
            # roc_auc[src_spk][trg_spk][basenames[i]] = dict()

            # Load raw U/V flags
            raw_uv = parse_file(
                1, 'data/test/s2s_predicted/' + src_spk + '-' + trg_spk + '/' +
                basenames[i] + '.uv.dat')

            # Round U/V flags
            rounded_uv = np.round(raw_uv)

            # Compute ROC curve and the area under it
            # The reference values are column 42 of the selected row of the
            # target test datatable; the rounded predictions are the scores.
            # NOTE(review): the row offset uses the SUM of the speaker
            # indexes, so different speaker pairs with the same sum select
            # the same rows — confirm against how the datatable is laid out.
            (fpr[src_spk][trg_spk][basenames[i]],
             tpr[src_spk][trg_spk][basenames[i]], _) = roc_curve(
                 trg_test_datatable[i + (src_spk_ind + trg_spk_ind) *
                                    len(basenames), :, 42], rounded_uv)

            roc_auc[src_spk][trg_spk][basenames[i]] = auc(
                fpr[src_spk][trg_spk][basenames[i]],
                tpr[src_spk][trg_spk][basenames[i]])