Example #1
def main():

    for data_type in ['train', 'dev', 'test']:
        print('=' * 50)
        print(' ' * 20 + data_type + ' ' * 20)
        print('=' * 50)

        ########################################
        # inputs
        ########################################
        print('=> Processing input data...')
        if args.save_format in ['numpy', 'htk']:
            input_save_path = mkdir_join(args.feature_save_path,
                                         args.save_format)
            if isfile(join(input_save_path, data_type, 'complete.txt')):
                print('Already exists.')
            else:
                if args.tool == 'htk':
                    audio_paths = path.htk(data_type=data_type)
                else:
                    audio_paths = path.wav(data_type=data_type)

                if data_type != 'train':
                    is_training = False

                    # Load statistics over train dataset
                    global_mean_male = np.load(
                        join(input_save_path, 'train/global_mean_male.npy'))
                    global_std_male = np.load(
                        join(input_save_path, 'train/global_std_male.npy'))
                    global_mean_female = np.load(
                        join(input_save_path, 'train/global_mean_female.npy'))
                    global_std_female = np.load(
                        join(input_save_path, 'train/global_std_female.npy'))
                else:
                    is_training = True
                    global_mean_male, global_std_male, global_mean_female, global_std_female = None, None, None, None

                # Read htk or wav files, and save input data and frame num dict
                read_audio(audio_paths=audio_paths,
                           tool=args.tool,
                           config=CONFIG,
                           normalize=args.normalize,
                           is_training=is_training,
                           save_path=mkdir_join(input_save_path, data_type),
                           save_format=args.save_format,
                           global_mean_male=global_mean_male,
                           global_std_male=global_std_male,
                           global_mean_female=global_mean_female,
                           global_std_female=global_std_female)
                # NOTE: ex.) save_path:
                # timit/feature/save_format/data_type/*.npy

            # Make a confirmation file to prove that dataset was saved
            # correctly
            with open(join(input_save_path, data_type, 'complete.txt'),
                      'w') as f:
                f.write('')

        ########################################
        # labels (character)
        ########################################
        print('\n=> Processing transcripts (char)...')
        save_vocab_file = (data_type == 'train')
        is_test = (data_type == 'test')
        trans_dict = read_char(label_paths=path.trans(data_type=data_type),
                               vocab_file_save_path=mkdir_join(
                                   './config', 'vocab_files'),
                               save_vocab_file=save_vocab_file,
                               is_test=is_test)

        ########################################
        # dataset (character, csv)
        ########################################
        print('\n=> Saving dataset files (char)...')
        dataset_save_path = mkdir_join(args.dataset_save_path,
                                       args.save_format, data_type)
        df_char = pd.DataFrame(
            [], columns=['frame_num', 'input_path', 'transcript'])
        df_char_capital = pd.DataFrame(
            [], columns=['frame_num', 'input_path', 'transcript'])
        for utt_name, [char_indices,
                       char_indices_capital] in tqdm(trans_dict.items()):
            if args.save_format == 'numpy':
                speaker = utt_name.split('_')[0]
                input_utt_save_path = join(input_save_path, data_type, speaker,
                                           utt_name + '.npy')
                assert isfile(input_utt_save_path)
                input_utt = np.load(input_utt_save_path)
            elif args.save_format == 'htk':
                speaker = utt_name.split('_')[0]
                input_utt_save_path = join(input_save_path, data_type, speaker,
                                           utt_name + '.htk')
                assert isfile(input_utt_save_path)
                input_utt, _, _ = read(input_utt_save_path)
            elif args.save_format == 'wav':
                input_utt_save_path = path.utt2wav(utt_name)
                assert isfile(input_utt_save_path)
                input_utt = w2f_psf(input_utt_save_path,
                                    feature_type=CONFIG['feature_type'],
                                    feature_dim=CONFIG['channels'],
                                    use_energy=CONFIG['energy'],
                                    use_delta1=CONFIG['delta'],
                                    use_delta2=CONFIG['deltadelta'],
                                    window=CONFIG['window'],
                                    slide=CONFIG['slide'])
            else:
                raise ValueError(
                    'save_format must be "numpy", "htk", or "wav".')
            frame_num = input_utt.shape[0]
            del input_utt

            series_char = pd.Series(
                [frame_num, input_utt_save_path, char_indices],
                index=df_char.columns)
            series_char_capital = pd.Series(
                [frame_num, input_utt_save_path, char_indices_capital],
                index=df_char_capital.columns)

            df_char = df_char.append(series_char, ignore_index=True)
            df_char_capital = df_char_capital.append(series_char_capital,
                                                     ignore_index=True)

        df_char.to_csv(join(dataset_save_path, 'character.csv'))
        df_char_capital.to_csv(
            join(dataset_save_path, 'character_capital_divide.csv'))

        ########################################
        # labels (phone)
        ########################################
        print('\n=> Processing transcripts (phone)...')
        trans_dict = read_phone(label_paths=path.phone(data_type=data_type),
                                vocab_file_save_path=mkdir_join(
                                    './config', 'vocab_files'),
                                save_vocab_file=save_vocab_file,
                                is_test=is_test)

        ########################################
        # dataset (phone, csv)
        ########################################
        print('\n=> Saving dataset files (phone)...')
        df_phone61 = pd.DataFrame(
            [], columns=['frame_num', 'input_path', 'transcript'])
        df_phone48 = pd.DataFrame(
            [], columns=['frame_num', 'input_path', 'transcript'])
        df_phone39 = pd.DataFrame(
            [], columns=['frame_num', 'input_path', 'transcript'])
        for utt_name, [phone61_indices, phone48_indices,
                       phone39_indices] in tqdm(trans_dict.items()):
            if args.save_format == 'numpy':
                speaker = utt_name.split('_')[0]
                input_utt_save_path = join(input_save_path, data_type, speaker,
                                           utt_name + '.npy')
                assert isfile(input_utt_save_path)
                input_utt = np.load(input_utt_save_path)
            elif args.save_format == 'htk':
                speaker = utt_name.split('_')[0]
                input_utt_save_path = join(input_save_path, data_type, speaker,
                                           utt_name + '.htk')
                assert isfile(input_utt_save_path)
                input_utt, _, _ = read(input_utt_save_path)
            elif args.save_format == 'wav':
                input_utt_save_path = path.utt2wav(utt_name)
                assert isfile(input_utt_save_path)
                input_utt = w2f_psf(input_utt_save_path,
                                    feature_type=CONFIG['feature_type'],
                                    feature_dim=CONFIG['channels'],
                                    use_energy=CONFIG['energy'],
                                    use_delta1=CONFIG['delta'],
                                    use_delta2=CONFIG['deltadelta'],
                                    window=CONFIG['window'],
                                    slide=CONFIG['slide'])
            else:
                raise ValueError(
                    'save_format must be "numpy", "htk", or "wav".')
            frame_num = input_utt.shape[0]
            del input_utt

            series_phone61 = pd.Series(
                [frame_num, input_utt_save_path, phone61_indices],
                index=df_phone61.columns)
            series_phone48 = pd.Series(
                [frame_num, input_utt_save_path, phone48_indices],
                index=df_phone48.columns)
            series_phone39 = pd.Series(
                [frame_num, input_utt_save_path, phone39_indices],
                index=df_phone39.columns)

            df_phone61 = df_phone61.append(series_phone61, ignore_index=True)
            df_phone48 = df_phone48.append(series_phone48, ignore_index=True)
            df_phone39 = df_phone39.append(series_phone39, ignore_index=True)

        df_phone61.to_csv(join(dataset_save_path, 'phone61.csv'))
        df_phone48.to_csv(join(dataset_save_path, 'phone48.csv'))
        df_phone39.to_csv(join(dataset_save_path, 'phone39.csv'))
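
The loops above grow each DataFrame with DataFrame.append, which copies the whole frame on every call and was removed in pandas 2.0. A minimal sketch of the same CSV construction that collects plain dicts and builds the frame once; trans_dict and frame_num_dict here are toy stand-ins for the values computed above:

import pandas as pd

# Toy stand-ins for the label indices and per-utterance frame counts above.
trans_dict = {'fcjf0_sa1': '3 1 4', 'fcjf0_sa2': '1 5 9'}
frame_num_dict = {'fcjf0_sa1': 120, 'fcjf0_sa2': 95}

rows = [{'frame_num': frame_num_dict[utt],
         'input_path': utt + '.npy',
         'transcript': indices}
        for utt, indices in trans_dict.items()]

# One constructor call instead of repeated DataFrame.append
# (deprecated in pandas 1.4, removed in 2.0).
df_char = pd.DataFrame(rows, columns=['frame_num', 'input_path', 'transcript'])
df_char.to_csv('character.csv')
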
Example #2
def read_audio(audio_paths,
               tool,
               config,
               normalize,
               is_training,
               speaker_gender_dict,
               save_path=None,
               save_format=None,
               global_mean_male=None,
               global_mean_female=None,
               global_std_male=None,
               global_std_female=None,
               dtype=np.float32):
    """Read audio files.
    Args:
        audio_paths (list): paths to HTK or WAV files
        tool (string): the tool to extract features,
            htk or librosa or python_speech_features
        config (dict): a configuration for feature extraction
        normalize (string):
            no => no normalization is performed
            global => normalize input features by global mean & std over
                      the training set per gender
            speaker => normalize input features by mean & std per speaker
            utterance => normalize input features by mean & std per utterance
        is_training (bool): Set True when processing the training set
        speaker_gender_dict (dict): A dictionary of speakers' gender information
            key (string) => speaker
            value (string) => F or M
        save_path (string, optional): path to save npy files
        save_format (string, optional): numpy or htk
        global_mean_male (np.ndarray, optional): global mean of male over
            the training set
        global_std_male (np.ndarray, optional): global standard deviation
            of male over the training set
        global_mean_female (np.ndarray, optional): global mean of female
            over the training set
        global_std_female (np.ndarray, optional): global standard
            deviation of female over the training set
        dtype (optional): the type of data, default is np.float32
    Returns:
        global_mean_male (np.ndarray): global mean of male over the
            training set
        global_std_male (np.ndarray): global standard deviation of male
            over the training set
        global_mean_female (np.ndarray): global mean of female over the
            training set
        global_std_female (np.ndarray): global standard deviation of
            female over the training set
        frame_num_dict (dict):
            key => utterance name
            value => the number of frames
    """
    if not is_training:
        if global_mean_male is None or global_std_male is None:
            raise ValueError('Set mean & std computed in the training set.')
    if normalize not in ['global', 'speaker', 'utterance', 'no']:
        raise ValueError(
            'normalize must be "utterance" or "speaker" or "global" or "no".')
    if tool not in ['htk', 'python_speech_features', 'librosa']:
        raise TypeError('tool must be "htk" or "python_speech_features"' +
                        ' or "librosa".')

    audio_path_dict = {}
    audio_path_list_male, audio_path_list_female = [], []
    total_frame_num_male, total_frame_num_female = 0, 0
    total_frame_num_dict = {}
    speaker_mean_dict, speaker_std_dict = {}, {}

    # Loop 1: Divide all audio paths into speakers
    print('=====> Reading audio files...')
    for i, audio_path in enumerate(tqdm(audio_paths)):
        # ex.) audio_path: speaker-book-utt_index.***
        speaker, book, utt_index = basename(audio_path).split('.')[0].split(
            '-')
        if speaker not in audio_path_dict.keys():
            audio_path_dict[speaker] = []
        audio_path_dict[speaker].append(audio_path)

        if is_training:
            # Read each audio file
            if tool == 'htk':
                input_utt, sampPeriod, parmKind = read(audio_path)
            elif tool == 'python_speech_features':
                input_utt = w2f_psf(audio_path,
                                    feature_type=config['feature_type'],
                                    feature_dim=config['channels'],
                                    use_energy=config['energy'],
                                    use_delta1=config['delta'],
                                    use_delta2=config['deltadelta'],
                                    window=config['window'],
                                    slide=config['slide'])
            elif tool == 'librosa':
                input_utt = w2f_librosa(audio_path,
                                        feature_type=config['feature_type'],
                                        feature_dim=config['channels'],
                                        use_energy=config['energy'],
                                        use_delta1=config['delta'],
                                        use_delta2=config['deltadelta'],
                                        window=config['window'],
                                        slide=config['slide'])

            input_utt_sum = np.sum(input_utt, axis=0)

            if i == 0:
                # Initialize global statistics
                feature_dim = input_utt.shape[1]
                global_mean_male = np.zeros((feature_dim, ), dtype=dtype)
                global_mean_female = np.zeros((feature_dim, ), dtype=dtype)
                global_std_male = np.zeros((feature_dim, ), dtype=dtype)
                global_std_female = np.zeros((feature_dim, ), dtype=dtype)

            # For computing global mean
            if speaker_gender_dict[speaker] == 'M':
                audio_path_list_male.append(input_utt)
                global_mean_male += input_utt_sum
                total_frame_num_male += input_utt.shape[0]
            elif speaker_gender_dict[speaker] == 'F':
                audio_path_list_female.append(input_utt)
                global_mean_female += input_utt_sum
                total_frame_num_female += input_utt.shape[0]
            else:
                raise ValueError('gender must be M or F.')

            # For computing speaker mean
            if normalize == 'speaker':
                if speaker not in total_frame_num_dict.keys():
                    total_frame_num_dict[speaker] = 0
                    # Initialize speaker statistics
                    speaker_mean_dict[speaker] = np.zeros((feature_dim, ),
                                                          dtype=dtype)
                    speaker_std_dict[speaker] = np.zeros((feature_dim, ),
                                                         dtype=dtype)
                speaker_mean_dict[speaker] += input_utt_sum
                total_frame_num_dict[speaker] += input_utt.shape[0]

    # Loop 2: Computing global mean and stddev
    if is_training and normalize != 'no':
        print('=====> Computing global mean & stddev...')
        # Compute global mean per gender
        global_mean_male /= total_frame_num_male
        global_mean_female /= total_frame_num_female

        for speaker, audio_paths_speaker in tqdm(audio_path_dict.items()):
            if normalize == 'speaker':
                # Compute speaker mean
                speaker_mean_dict[speaker] /= total_frame_num_dict[speaker]

            for audio_path in audio_paths_speaker:
                speaker, book, utt_index = basename(audio_path).split(
                    '.')[0].split('-')

                # Read each audio file
                if tool == 'htk':
                    input_utt, sampPeriod, parmKind = read(audio_path)
                elif tool == 'python_speech_features':
                    input_utt = w2f_psf(audio_path,
                                        feature_type=config['feature_type'],
                                        feature_dim=config['channels'],
                                        use_energy=config['energy'],
                                        use_delta1=config['delta'],
                                        use_delta2=config['deltadelta'],
                                        window=config['window'],
                                        slide=config['slide'])
                elif tool == 'librosa':
                    input_utt = w2f_librosa(
                        audio_path,
                        feature_type=config['feature_type'],
                        feature_dim=config['channels'],
                        use_energy=config['energy'],
                        use_delta1=config['delta'],
                        use_delta2=config['deltadelta'],
                        window=config['window'],
                        slide=config['slide'])

                # For computing global stddev
                if speaker_gender_dict[speaker] == 'M':
                    global_std_male += np.sum(np.abs(input_utt -
                                                     global_mean_male)**2,
                                              axis=0)
                elif speaker_gender_dict[speaker] == 'F':
                    global_std_female += np.sum(np.abs(input_utt -
                                                       global_mean_female)**2,
                                                axis=0)
                else:
                    raise ValueError('gender must be M or F.')

                if normalize == 'speaker':
                    # For computing speaker stddev
                    speaker_std_dict[speaker] += np.sum(
                        np.abs(input_utt - speaker_mean_dict[speaker])**2,
                        axis=0)

            if normalize == 'speaker':
                # Compute speaker stddev
                speaker_std_dict[speaker] = np.sqrt(
                    speaker_std_dict[speaker] /
                    (total_frame_num_dict[speaker] - 1))

        # Compute global stddev per gender
        global_std_male = np.sqrt(global_std_male / (total_frame_num_male - 1))
        global_std_female = np.sqrt(global_std_female /
                                    (total_frame_num_female - 1))

        if save_path is not None:
            # Save global mean & std per gender
            np.save(join(save_path, 'global_mean_male.npy'), global_mean_male)
            np.save(join(save_path, 'global_mean_female.npy'),
                    global_mean_female)
            np.save(join(save_path, 'global_std_male.npy'), global_std_male)
            np.save(join(save_path, 'global_std_female.npy'),
                    global_std_female)

    # Loop 3: Normalization and Saving
    print('=====> Normalization...')
    frame_num_dict = {}
    for speaker, audio_paths_speaker in tqdm(audio_path_dict.items()):
        for audio_path in audio_paths_speaker:
            speaker, book, utt_index = basename(audio_path).split(
                '.')[0].split('-')

            # Read each audio file
            if tool == 'htk':
                input_utt, sampPeriod, parmKind = read(audio_path)
            elif tool == 'python_speech_features':
                input_utt = w2f_psf(audio_path,
                                    feature_type=config['feature_type'],
                                    feature_dim=config['channels'],
                                    use_energy=config['energy'],
                                    use_delta1=config['delta'],
                                    use_delta2=config['deltadelta'],
                                    window=config['window'],
                                    slide=config['slide'])
            elif tool == 'librosa':
                input_utt = w2f_librosa(audio_path,
                                        feature_type=config['feature_type'],
                                        feature_dim=config['channels'],
                                        use_energy=config['energy'],
                                        use_delta1=config['delta'],
                                        use_delta2=config['deltadelta'],
                                        window=config['window'],
                                        slide=config['slide'])

            if normalize == 'no':
                pass
            elif normalize == 'global' or not is_training:
                # Normalize by mean & std over the training set per gender
                if speaker_gender_dict[speaker] == 'M':
                    input_utt -= global_mean_male
                    input_utt /= global_std_male
                elif speaker_gender_dict[speaker] == 'F':
                    input_utt -= global_mean_female
                    input_utt /= global_std_female
                else:
                    raise ValueError('gender must be M or F.')
            elif normalize == 'speaker':
                # Normalize by mean & std per speaker
                input_utt -= speaker_mean_dict[speaker]
                input_utt /= speaker_std_dict[speaker]
            elif normalize == 'utterance':
                # Normalize by mean & std per utterance
                utt_mean = np.mean(input_utt, axis=0, dtype=dtype)
                utt_std = np.std(input_utt, axis=0, dtype=dtype)
                input_utt = (input_utt - utt_mean) / utt_std
            else:
                raise ValueError

            frame_num_dict[speaker + '_' + utt_index] = input_utt.shape[0]

            if save_path is not None:
                # Save input features
                input_name = basename(audio_path).split('.')[0]
                if save_format == 'numpy':
                    input_data_save_path = mkdir_join(save_path, speaker,
                                                      input_name + '.npy')
                    np.save(input_data_save_path, input_utt)
                elif save_format == 'htk':
                    write(input_utt,
                          htk_path=mkdir_join(save_path, speaker,
                                              input_name + '.htk'),
                          sampPeriod=sampPeriod,
                          parmKind=parmKind)
                else:
                    raise ValueError('save_format must be "numpy" or "htk".')

    if save_path is not None:
        # Save the frame number dictionary
        with open(join(save_path, 'frame_num.pickle'), 'wb') as f:
            pickle.dump(frame_num_dict, f)

    return (global_mean_male, global_mean_female, global_std_male,
            global_std_female, frame_num_dict)
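
read_audio above never materializes the whole training set: the first pass accumulates a per-gender running sum of frames for the mean, and a second pass accumulates squared deviations for the sample standard deviation. A self-contained sketch of that two-pass arithmetic on synthetic features:

import numpy as np

# Synthetic utterances standing in for the feature matrices read above.
rng = np.random.default_rng(0)
utterances = [rng.random((100, 40)).astype(np.float32) for _ in range(5)]

feature_dim = utterances[0].shape[1]
mean = np.zeros((feature_dim,), dtype=np.float32)
total_frame_num = 0
for utt in utterances:                      # pass 1: accumulate the frame sum
    mean += utt.sum(axis=0)
    total_frame_num += utt.shape[0]
mean /= total_frame_num

std = np.zeros((feature_dim,), dtype=np.float32)
for utt in utterances:                      # pass 2: accumulate squared deviations
    std += ((utt - mean) ** 2).sum(axis=0)
std = np.sqrt(std / (total_frame_num - 1))  # sample stddev, as in the code above

normalized = (utterances[0] - mean) / std   # 'global'-style normalization
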
Example #3
def main(data_size):

    for data_type in [
            'train', 'dev_clean', 'dev_other', 'test_clean', 'test_other'
    ]:
        print('=' * 50)
        print(' ' * 20 + data_type + ' (' + data_size + ')' + ' ' * 20)
        print('=' * 50)

        ########################################
        # inputs
        ########################################
        print('=> Processing input data...')
        if args.save_format in ['numpy', 'htk']:
            input_save_path = mkdir_join(args.feature_save_path,
                                         args.save_format, data_size)
            if isfile(join(input_save_path, data_type, 'complete.txt')):
                print('Already exists.')
            else:
                if data_type == 'train':
                    if args.tool == 'htk':
                        audio_paths = path.htk(data_type='train' + data_size)
                    else:
                        audio_paths = path.wav(data_type='train' + data_size)
                    is_training = True
                    global_mean_male, global_std_male, global_mean_female, global_std_female = None, None, None, None
                else:
                    if args.tool == 'htk':
                        audio_paths = path.htk(data_type=data_type)
                    else:
                        audio_paths = path.wav(data_type=data_type)
                    is_training = False

                    # Load statistics over train dataset
                    global_mean_male = np.load(
                        join(input_save_path, 'train/global_mean_male.npy'))
                    global_std_male = np.load(
                        join(input_save_path, 'train/global_std_male.npy'))
                    global_mean_female = np.load(
                        join(input_save_path, 'train/global_mean_female.npy'))
                    global_std_female = np.load(
                        join(input_save_path, 'train/global_std_female.npy'))

                read_audio(audio_paths=audio_paths,
                           tool=args.tool,
                           config=CONFIG,
                           normalize=args.normalize,
                           speaker_gender_dict=path.speaker_gender_dict,
                           is_training=is_training,
                           save_path=mkdir_join(input_save_path, data_type),
                           save_format=args.save_format,
                           global_mean_male=global_mean_male,
                           global_mean_female=global_mean_female,
                           global_std_male=global_std_male,
                           global_std_female=global_std_female)
                # NOTE: ex.) save_path:
                # librispeech/feature/save_format/data_size/data_type/speaker/*.npy

            # Make a confirmation file to prove that the dataset was saved
            # correctly
            with open(join(input_save_path, data_type, 'complete.txt'),
                      'w') as f:
                f.write('')

        ########################################
        # labels
        ########################################
        print('\n=> Processing transcripts...')
        if data_type == 'train':
            label_paths = path.trans(data_type='train' + data_size)
        else:
            label_paths = path.trans(data_type=data_type)
        save_vocab_file = (data_type == 'train')
        is_test = ('test' in data_type)

        speaker_dict = read_trans(label_paths=label_paths,
                                  data_size=data_size,
                                  vocab_file_save_path=mkdir_join(
                                      './config', 'vocab_files'),
                                  save_vocab_file=save_vocab_file,
                                  is_test=is_test,
                                  data_type=data_type)

        ########################################
        # dataset (csv)
        ########################################
        print('\n=> Saving dataset files...')
        dataset_save_path = mkdir_join(args.dataset_save_path,
                                       args.save_format, data_size, data_type)
        df_char = pd.DataFrame(
            [], columns=['frame_num', 'input_path', 'transcript'])
        df_char_capital = pd.DataFrame(
            [], columns=['frame_num', 'input_path', 'transcript'])
        df_word_freq1 = pd.DataFrame(
            [], columns=['frame_num', 'input_path', 'transcript'])
        df_word_freq5 = pd.DataFrame(
            [], columns=['frame_num', 'input_path', 'transcript'])
        df_word_freq10 = pd.DataFrame(
            [], columns=['frame_num', 'input_path', 'transcript'])
        df_word_freq15 = pd.DataFrame(
            [], columns=['frame_num', 'input_path', 'transcript'])

        utt_count = 0
        df_char_list, df_char_capital_list = [], []
        df_word_freq1_list, df_word_freq5_list = [], []
        df_word_freq10_list, df_word_freq15_list = [], []
        for speaker, utt_dict in tqdm(speaker_dict.items()):
            for utt_name, indices_list in utt_dict.items():
                if args.save_format == 'numpy':
                    input_utt_save_path = join(input_save_path, data_type,
                                               speaker, utt_name + '.npy')
                    assert isfile(input_utt_save_path)
                    input_utt = np.load(input_utt_save_path)
                elif args.save_format == 'htk':
                    input_utt_save_path = join(input_save_path, data_type,
                                               speaker, utt_name + '.htk')
                    assert isfile(input_utt_save_path)
                    input_utt, _, _ = read(input_utt_save_path)
                elif args.save_format == 'wav':
                    input_utt_save_path = path.utt2wav(utt_name)
                    assert isfile(input_utt_save_path)
                    input_utt = w2f_psf(input_utt_save_path,
                                        feature_type=CONFIG['feature_type'],
                                        feature_dim=CONFIG['channels'],
                                        use_energy=CONFIG['energy'],
                                        use_delta1=CONFIG['delta'],
                                        use_delta2=CONFIG['deltadelta'],
                                        window=CONFIG['window'],
                                        slide=CONFIG['slide'])
                else:
                    raise ValueError(
                        'save_format must be "numpy", "htk", or "wav".')
                frame_num = input_utt.shape[0]
                del input_utt

                (char_indices, char_indices_capital,
                 word_freq1_indices) = indices_list[:3]
                (word_freq5_indices, word_freq10_indices,
                 word_freq15_indices) = indices_list[3:6]

                series_char = pd.Series(
                    [frame_num, input_utt_save_path, char_indices],
                    index=df_char.columns)
                series_char_capital = pd.Series(
                    [frame_num, input_utt_save_path, char_indices_capital],
                    index=df_char_capital.columns)
                series_word_freq1 = pd.Series(
                    [frame_num, input_utt_save_path, word_freq1_indices],
                    index=df_word_freq1.columns)
                series_word_freq5 = pd.Series(
                    [frame_num, input_utt_save_path, word_freq5_indices],
                    index=df_word_freq5.columns)
                series_word_freq10 = pd.Series(
                    [frame_num, input_utt_save_path, word_freq10_indices],
                    index=df_word_freq10.columns)
                series_word_freq15 = pd.Series(
                    [frame_num, input_utt_save_path, word_freq15_indices],
                    index=df_word_freq15.columns)

                df_char = df_char.append(series_char, ignore_index=True)
                df_char_capital = df_char_capital.append(series_char_capital,
                                                         ignore_index=True)
                df_word_freq1 = df_word_freq1.append(series_word_freq1,
                                                     ignore_index=True)
                df_word_freq5 = df_word_freq5.append(series_word_freq5,
                                                     ignore_index=True)
                df_word_freq10 = df_word_freq10.append(series_word_freq10,
                                                       ignore_index=True)
                df_word_freq15 = df_word_freq15.append(series_word_freq15,
                                                       ignore_index=True)

                utt_count += 1

                # Reset
                if utt_count == 50000:
                    df_char_list.append(df_char)
                    df_char_capital_list.append(df_char_capital)
                    df_word_freq1_list.append(df_word_freq1)
                    df_word_freq5_list.append(df_word_freq5)
                    df_word_freq10_list.append(df_word_freq10)
                    df_word_freq15_list.append(df_word_freq15)

                    df_char = pd.DataFrame(
                        [], columns=['frame_num', 'input_path', 'transcript'])
                    df_char_capital = pd.DataFrame(
                        [], columns=['frame_num', 'input_path', 'transcript'])
                    df_word_freq1 = pd.DataFrame(
                        [], columns=['frame_num', 'input_path', 'transcript'])
                    df_word_freq5 = pd.DataFrame(
                        [], columns=['frame_num', 'input_path', 'transcript'])
                    df_word_freq10 = pd.DataFrame(
                        [], columns=['frame_num', 'input_path', 'transcript'])
                    df_word_freq15 = pd.DataFrame(
                        [], columns=['frame_num', 'input_path', 'transcript'])

                    utt_count = 0

        # Last dataframe
        df_char_list.append(df_char)
        df_char_capital_list.append(df_char_capital)
        df_word_freq1_list.append(df_word_freq1)
        df_word_freq5_list.append(df_word_freq5)
        df_word_freq10_list.append(df_word_freq10)
        df_word_freq15_list.append(df_word_freq15)

        # Concatenate all dataframes
        df_char = df_char_list[0]
        df_char_capital = df_char_capital_list[0]
        df_word_freq1 = df_word_freq1_list[0]
        df_word_freq5 = df_word_freq5_list[0]
        df_word_freq10 = df_word_freq10_list[0]
        df_word_freq15 = df_word_freq15_list[0]

        for df_i in df_char_list[1:]:
            df_char = pd.concat([df_char, df_i], axis=0)
        for df_i in df_char_capital_list[1:]:
            df_char_capital = pd.concat([df_char_capital, df_i], axis=0)
        for df_i in df_word_freq1_list[1:]:
            df_word_freq1 = pd.concat([df_word_freq1, df_i], axis=0)
        for df_i in df_word_freq5_list[1:]:
            df_word_freq5 = pd.concat([df_word_freq5, df_i], axis=0)
        for df_i in df_word_freq10_list[1:]:
            df_word_freq10 = pd.concat([df_word_freq10, df_i], axis=0)
        for df_i in df_word_freq15_list[1:]:
            df_word_freq15 = pd.concat([df_word_freq15, df_i], axis=0)

        df_char.to_csv(join(dataset_save_path, 'character.csv'))
        df_char_capital.to_csv(
            join(dataset_save_path, 'character_capital_divide.csv'))
        df_word_freq1.to_csv(join(dataset_save_path, 'word_freq1.csv'))
        df_word_freq5.to_csv(join(dataset_save_path, 'word_freq5.csv'))
        df_word_freq10.to_csv(join(dataset_save_path, 'word_freq10.csv'))
        df_word_freq15.to_csv(join(dataset_save_path, 'word_freq15.csv'))
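
The 50,000-utterance chunking above only works around the cost of DataFrame.append, and the chunks are then merged one pair at a time. pd.concat accepts the whole list in a single call, as in this sketch (the two toy frames stand in for df_char_list):

import pandas as pd

# Toy stand-ins for the chunk list accumulated above.
df_char_list = [
    pd.DataFrame({'frame_num': [120], 'input_path': ['a.npy'],
                  'transcript': ['1 2']}),
    pd.DataFrame({'frame_num': [95], 'input_path': ['b.npy'],
                  'transcript': ['3 4']}),
]

# One call replaces the per-chunk concatenation loops.
df_char = pd.concat(df_char_list, axis=0, ignore_index=True)
df_char.to_csv('character.csv')
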
Example #4
def read_audio(audio_paths,
               speaker_dict,
               tool,
               config,
               normalize,
               is_training,
               save_path=None,
               save_format='numpy',
               global_mean_male=None,
               global_mean_female=None,
               global_std_male=None,
               global_std_female=None,
               dtype=np.float32):
    """Read HTK or WAV files.
    Args:
        audio_paths (list): paths to HTK or WAV files
        speaker_dict (dict): dictionary of speakers
            key => speaker
            value => dictionary of utterance information of each speaker
                key => utterance index
                value => [start_frame, end_frame, trans_kana, trans_kanji]
        tool (string): the tool to extract features,
            htk or librosa or python_speech_features
        config (dict): a configuration for feature extraction
        normalize (string):
            no => no normalization is performed
            global => normalize input features by global mean & std over
                      the training set per gender
            speaker => normalize input features by mean & std per speaker
            utterance => normalize input features by mean & std per utterance
        is_training (bool): Set True when processing the training set
        save_path (string, optional): path to save npy files
        save_format (string, optional): numpy or htk
        global_mean_male (np.ndarray, optional): global mean of male over the
            training set
        global_std_male (np.ndarray, optional): global standard deviation of
            male over the training set
        global_mean_female (np.ndarray, optional): global mean of female over
            the training set
        global_std_female (np.ndarray, optional): global standard deviation of
            female over the training set
        dtype (optional): the type of data, default is np.float32
    Returns:
        global_mean_male (np.ndarray): global mean of male over the
            training set
        global_std_male (np.ndarray): global standard deviation of male
            over the training set
        global_mean_female (np.ndarray): global mean of female over the
            training set
        global_std_female (np.ndarray): global standard deviation of
            female over the training set
        frame_num_dict (dict):
            key => utterance name
            value => the number of frames
    """
    if not is_training:
        if global_mean_male is None or global_mean_female is None:
            raise ValueError('Set mean & std computed in the training set.')
    if normalize not in ['global', 'speaker', 'utterance', 'no']:
        raise ValueError(
            'normalize must be "utterance" or "speaker" or "global" or "no".')
    if tool not in ['htk', 'python_speech_features', 'librosa']:
        raise TypeError('tool must be "htk" or "python_speech_features"' +
                        ' or "librosa".')

    audio_path_list_male, audio_path_list_female = [], []
    total_frame_num_male, total_frame_num_female = 0, 0
    total_frame_num_dict = {}
    speaker_mean_dict = {}

    # NOTE: each lecture (talk) is treated as a distinct speaker

    # Loop 1: Computing global mean and statistics
    if is_training and normalize != 'no':
        print('=====> Reading audio files...')
        for i, audio_path in enumerate(tqdm(audio_paths)):
            speaker = basename(audio_path).split('.')[0]

            # Divide each audio file into utterances
            _, input_utt_sum, speaker_mean, _, total_frame_num_speaker = segment(
                audio_path,
                speaker,
                speaker_dict[speaker],
                is_training=True,
                sil_duration=0,
                tool=tool,
                config=config)

            if i == 0:
                # Initialize global statistics
                feature_dim = input_utt_sum.shape[0]
                global_mean_male = np.zeros((feature_dim, ), dtype=dtype)
                global_mean_female = np.zeros((feature_dim, ), dtype=dtype)
                global_std_male = np.zeros((feature_dim, ), dtype=dtype)
                global_std_female = np.zeros((feature_dim, ), dtype=dtype)

            # For computing global mean
            if speaker[3] == 'M':
                audio_path_list_male.append(audio_path)
                global_mean_male += input_utt_sum
                total_frame_num_male += total_frame_num_speaker
            elif speaker[3] == 'F':
                audio_path_list_female.append(audio_path)
                global_mean_female += input_utt_sum
                total_frame_num_female += total_frame_num_speaker
            else:
                raise ValueError

            # For computing speaker stddev
            if normalize == 'speaker':
                speaker_mean_dict[speaker] = speaker_mean
                total_frame_num_dict[speaker] = total_frame_num_speaker
                # NOTE: speaker mean is already computed

        print('=====> Computing global mean & stddev...')
        # Compute global mean per gender
        global_mean_male /= total_frame_num_male
        global_mean_female /= total_frame_num_female

        for audio_path in tqdm(audio_paths):
            speaker = basename(audio_path).split('.')[0]

            # Divide each audio into utterances
            input_data_dict_speaker, _, _, _, _ = segment(
                audio_path,
                speaker,
                speaker_dict[speaker],
                is_training=True,
                sil_duration=0,
                tool=tool,
                config=config)

            # For computing global stddev
            if speaker[3] == 'M':
                for input_utt in input_data_dict_speaker.values():
                    global_std_male += np.sum(np.abs(input_utt -
                                                     global_mean_male)**2,
                                              axis=0)
            elif speaker[3] == 'F':
                for input_utt in input_data_dict_speaker.values():
                    global_std_female += np.sum(np.abs(input_utt -
                                                       global_mean_female)**2,
                                                axis=0)
            else:
                raise ValueError

        # Compute global stddev per gender
        global_std_male = np.sqrt(global_std_male / (total_frame_num_male - 1))
        global_std_female = np.sqrt(global_std_female /
                                    (total_frame_num_female - 1))

        if save_path is not None:
            # Save global mean & std per gender
            np.save(join(save_path, 'global_mean_male.npy'), global_mean_male)
            np.save(join(save_path, 'global_mean_female.npy'),
                    global_mean_female)
            np.save(join(save_path, 'global_std_male.npy'), global_std_male)
            np.save(join(save_path, 'global_std_female.npy'),
                    global_std_female)

    # Loop 2: Normalization and Saving
    print('=====> Normalization...')
    frame_num_dict = {}
    sampPeriod, parmKind = None, None
    for audio_path in tqdm(audio_paths):
        speaker = basename(audio_path).split('.')[0]

        if normalize == 'speaker' and is_training:
            speaker_mean = speaker_mean_dict[speaker]
        else:
            speaker_mean = None

        # Divide each audio into utterances
        input_data_dict_speaker, _, speaker_mean, speaker_std, _ = segment(
            audio_path,
            speaker,
            speaker_dict[speaker],
            is_training=is_training,
            sil_duration=0,
            tool=tool,
            config=config,
            mean=speaker_mean)  # for computing speaker stddev
        # NOTE: input_data_dict_speaker has not been normalized yet

        for utt_index, input_utt in input_data_dict_speaker.items():

            if normalize == 'no':
                pass
            elif normalize == 'global' or not is_training:
                # Normalize by mean & std over the training set per gender
                if speaker[3] == 'M':
                    input_utt -= global_mean_male
                    input_utt /= global_std_male
                elif speaker[3] == 'F':
                    input_utt -= global_mean_female
                    input_utt /= global_std_female
                else:
                    raise ValueError
            elif normalize == 'speaker':
                # Normalize by mean & std per speaker
                input_utt = (input_utt - speaker_mean) / speaker_std
            elif normalize == 'utterance':
                # Normalize by mean & std per utterance
                utt_mean = np.mean(input_utt, axis=0, dtype=dtype)
                utt_std = np.std(input_utt, axis=0, dtype=dtype)
                input_utt = (input_utt - utt_mean) / utt_std
            else:
                raise ValueError

            frame_num_dict[speaker + '_' + utt_index] = input_utt.shape[0]

            if save_path is not None:
                # Save input features
                if save_format == 'numpy':
                    input_data_save_path = mkdir_join(
                        save_path, speaker, speaker + '_' + utt_index + '.npy')
                    np.save(input_data_save_path, input_utt)
                elif save_format == 'htk':
                    if sampPeriod is None:
                        _, sampPeriod, parmKind = read(audio_path)
                    write(input_utt,
                          htk_path=mkdir_join(
                              save_path, speaker,
                              speaker + '_' + utt_index + '.htk'),
                          sampPeriod=sampPeriod,
                          parmKind=parmKind)
                else:
                    raise ValueError('save_format must be "numpy" or "htk".')

    if save_path is not None:
        # Save the frame number dictionary
        with open(join(save_path, 'frame_num.pickle'), 'wb') as f:
            pickle.dump(frame_num_dict, f)

    return (global_mean_male, global_mean_female, global_std_male,
            global_std_female, frame_num_dict)
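
In this CSJ-style example the gender is read from the fourth character of the talk ID (speaker[3]) and selects which global statistics are applied. A small sketch of that dispatch; the identity statistics and the ID 'A01M0001' are made up for illustration:

import numpy as np


def normalize_by_gender(input_utt, speaker, stats):
    """Apply gender-specific global mean & std; stats maps 'M'/'F' to (mean, std)."""
    gender = speaker[3]  # CSJ-style talk ID assumed, e.g. 'A01M0001'
    if gender not in stats:
        raise ValueError('gender must be M or F.')
    mean, std = stats[gender]
    return (input_utt - mean) / std


# Toy usage with identity statistics (mean 0, std 1).
dim = 40
stats = {'M': (np.zeros(dim, dtype=np.float32), np.ones(dim, dtype=np.float32)),
         'F': (np.zeros(dim, dtype=np.float32), np.ones(dim, dtype=np.float32))}
utt = np.random.rand(200, dim).astype(np.float32)
normalized = normalize_by_gender(utt, 'A01M0001', stats)
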
Example #5
def read_audio(audio_paths,
               tool,
               config,
               normalize,
               is_training,
               save_path=None,
               save_format=None,
               global_mean_male=None,
               global_std_male=None,
               global_mean_female=None,
               global_std_female=None,
               dtype=np.float32):
    """Read audio files.
    Args:
        audio_paths (list): paths to audio files
        tool (string): the tool to extract features,
            htk or librosa or python_speech_features
        config (dict): a configuration for feature extraction
        normalize (string):
            no => no normalization is performed
            global => normalize input features by global mean & std over
                      the training set per gender
            speaker => normalize input features by mean & std per speaker
            utterance => normalize input features by mean & std per utterance
        is_training (bool): Set True when processing the training set
        save_path (string, optional): path to save npy files
        save_format (string, optional): numpy or htk
        global_mean_male (np.ndarray, optional): global mean of male over
            the training set
        global_std_male (np.ndarray, optional): global standard deviation
            of male over the training set
        global_mean_female (np.ndarray, optional): global mean of female
            over the training set
        global_std_female (np.ndarray, optional): global standard
            deviation of female over the training set
        dtype (optional): the type of data, default is np.float32
    Returns:
        global_mean_male (np.ndarray): global mean of male over the
            training set
        global_std_male (np.ndarray): global standard deviation of male
            over the training set
        global_mean_female (np.ndarray): global mean of female over the
            training set
        global_std_female (np.ndarray): global standard deviation of
            female over the training set
        frame_num_dict (dict):
            key => utterance name
            value => the number of frames
    """
    if not is_training:
        if global_mean_male is None or global_std_male is None:
            raise ValueError(
                'Set global mean & std computed over the training set.')
    if normalize not in ['global', 'speaker', 'utterance', 'no']:
        raise ValueError(
            'normalize must be "utterance" or "speaker" or "global" or "no".')

    # Read each audio file
    print('=====> Reading audio files...')
    audio_paths_male, audio_paths_female = [], []
    input_data_list_male, input_data_list_female = [], []
    total_frame_num_male, total_frame_num_female = 0, 0
    total_frame_num_dict = {}
    speaker_mean_dict, speaker_std_dict = {}, {}
    for audio_path in tqdm(audio_paths):
        speaker = audio_path.split('/')[-2]
        gender = speaker[0]  # f (female) or m (male)
        utt_index = basename(audio_path).split('.')[0]

        if tool == 'htk':
            input_utt, sampPeriod, parmKind = read(audio_path)
            # NOTE: audio_path is a htk file path in this case
        elif tool == 'python_speech_features':
            input_utt = w2f_psf(audio_path,
                                feature_type=config['feature_type'],
                                feature_dim=config['channels'],
                                use_energy=config['energy'],
                                use_delta1=config['delta'],
                                use_delta2=config['deltadelta'],
                                window=config['window'],
                                slide=config['slide'])
        elif tool == 'librosa':
            input_utt = w2f_librosa(audio_path,
                                    feature_type=config['feature_type'],
                                    feature_dim=config['channels'],
                                    use_energy=config['energy'],
                                    use_delta1=config['delta'],
                                    use_delta2=config['deltadelta'],
                                    window=config['window'],
                                    slide=config['slide'])

        # for debug
        # print(input_utt.shape)

        if gender == 'm':
            input_data_list_male.append(input_utt)
            audio_paths_male.append(audio_path)
        elif gender == 'f':
            input_data_list_female.append(input_utt)
            audio_paths_female.append(audio_path)
        else:
            raise ValueError('gender must be m or f.')

        if is_training:
            speaker = audio_path.split('/')[-2]
            gender = speaker[0]
            frame_num_utt, feat_dim = input_utt.shape

            if gender == 'm':
                total_frame_num_male += frame_num_utt
            elif gender == 'f':
                total_frame_num_female += frame_num_utt
            else:
                raise ValueError('gender must be m or f.')

            if normalize == 'speaker':
                # Initialization
                if speaker not in total_frame_num_dict.keys():
                    total_frame_num_dict[speaker] = 0
                    speaker_mean_dict[speaker] = np.zeros((feat_dim, ),
                                                          dtype=dtype)
                    speaker_std_dict[speaker] = np.zeros((feat_dim, ),
                                                         dtype=dtype)

                total_frame_num_dict[speaker] += frame_num_utt
                speaker_mean_dict[speaker] += np.sum(input_utt, axis=0)
    # NOTE: Load all data in advance because TIMIT is a small dataset.

    if is_training and normalize != 'no':
        # Compute speaker mean
        if normalize == 'speaker':
            for speaker in speaker_mean_dict.keys():
                speaker_mean_dict[speaker] /= total_frame_num_dict[speaker]

        # Compute global mean & std per gender
        print('=====> Computing global mean & std over the training set...')
        frame_offset = 0
        feat_dim = input_data_list_male[0].shape[1]
        train_data_male = np.empty((total_frame_num_male, feat_dim))
        train_data_female = np.empty((total_frame_num_female, feat_dim))
        # male
        for input_utt, audio_path in zip(tqdm(input_data_list_male),
                                         audio_paths_male):
            speaker = audio_path.split('/')[-2]
            frame_num_utt = input_utt.shape[0]
            train_data_male[frame_offset:frame_offset +
                            frame_num_utt] = input_utt
            frame_offset += frame_num_utt

            if normalize == 'speaker':
                speaker_std_dict[speaker] += np.sum(
                    np.abs(input_utt - speaker_mean_dict[speaker])**2, axis=0)
        # female
        frame_offset = 0
        for input_utt, audio_path in zip(tqdm(input_data_list_female),
                                         audio_paths_female):
            speaker = audio_path.split('/')[-2]
            frame_num_utt = input_utt.shape[0]
            train_data_female[frame_offset:frame_offset +
                              frame_num_utt] = input_utt
            frame_offset += frame_num_utt

            if normalize == 'speaker':
                speaker_std_dict[speaker] += np.sum(
                    np.abs(input_utt - speaker_mean_dict[speaker])**2, axis=0)

        # Compute speaker std
        if normalize == 'speaker':
            for speaker in speaker_std_dict.keys():
                speaker_std_dict[speaker] = np.sqrt(
                    speaker_std_dict[speaker] /
                    (total_frame_num_dict[speaker] - 1))

        global_mean_male = np.mean(train_data_male, axis=0)
        global_std_male = np.std(train_data_male, axis=0)
        global_mean_female = np.mean(train_data_female, axis=0)
        global_std_female = np.std(train_data_female, axis=0)

        if save_path is not None:
            # Save global mean & std
            np.save(join(save_path, 'global_mean_male.npy'), global_mean_male)
            np.save(join(save_path, 'global_std_male.npy'), global_std_male)
            np.save(join(save_path, 'global_mean_female.npy'),
                    global_mean_female)
            np.save(join(save_path, 'global_std_female.npy'),
                    global_std_female)

    # Save input features as npy files
    print('=====> Normalization...')
    frame_num_dict = {}
    for input_utt, audio_path in zip(
            tqdm(input_data_list_male + input_data_list_female),
            audio_paths_male + audio_paths_female):
        speaker = audio_path.split('/')[-2]
        utt_index = basename(audio_path).split('.')[0]
        gender = speaker[0]

        if normalize == 'no':
            pass
        elif normalize == 'global' or not is_training:
            # Normalize by global mean & std over the training set
            if gender == 'm':
                input_utt -= global_mean_male
                input_utt /= global_std_male
            elif gender == 'f':
                input_utt -= global_mean_female
                input_utt /= global_std_female
            else:
                raise ValueError('gender must be m or f.')
        elif normalize == 'speaker':
            # Normalize by mean & std per speaker
            input_utt -= speaker_mean_dict[speaker]
            input_utt /= speaker_std_dict[speaker]
        elif normalize == 'utterance':
            # Normalize by mean & std per utterance
            utt_mean = np.mean(input_utt, axis=0, dtype=dtype)
            utt_std = np.std(input_utt, axis=0, dtype=dtype)
            input_utt = (input_utt - utt_mean) / utt_std
        else:
            raise ValueError

        frame_num_dict[speaker + '_' + utt_index] = input_utt.shape[0]

        if save_path is not None:
            # Save input features
            if save_format == 'numpy':
                np.save(
                    mkdir_join(save_path, speaker,
                               speaker + '_' + utt_index + '.npy'), input_utt)
            elif save_format == 'htk':
                write(input_utt,
                      htk_path=mkdir_join(save_path, speaker,
                                          speaker + '_' + utt_index + '.htk'),
                      sampPeriod=sampPeriod,
                      parmKind=parmKind)
            else:
                raise ValueError('save_format must be "numpy" or "htk".')

    if save_path is not None:
        # Save the frame number dictionary
        with open(join(save_path, 'frame_num.pickle'), 'wb') as f:
            pickle.dump(frame_num_dict, f)

    return (global_mean_male, global_std_male, global_mean_female,
            global_std_female, frame_num_dict)
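
For reference, the per-speaker statistics used above are accumulated in two passes: frame sums are divided by the total frame count to get the mean, and the sum of squared deviations from that mean is divided by (N - 1) to get the standard deviation. A minimal, self-contained sketch of that arithmetic (hypothetical arrays, not part of the original script), checked against NumPy's ddof=1 estimator:

import numpy as np

# Hypothetical utterances from one speaker: (num_frames, feature_dim)
utterances = [np.random.randn(120, 40), np.random.randn(95, 40)]
total_frames = sum(utt.shape[0] for utt in utterances)

# Pass 1: mean over all frames of the speaker
mean = sum(utt.sum(axis=0) for utt in utterances) / total_frames

# Pass 2: unbiased standard deviation (divide by N - 1, as in the script)
sq_dev = sum(((utt - mean) ** 2).sum(axis=0) for utt in utterances)
std = np.sqrt(sq_dev / (total_frames - 1))

# Matches NumPy's estimator on the concatenated frames
all_frames = np.concatenate(utterances, axis=0)
assert np.allclose(std, np.std(all_frames, axis=0, ddof=1))
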
Example No. 6
def main(data_size):

    speaker_dict_dict = {}  # dict of speaker_dict
    for data_type in ['train', 'eval1', 'eval2', 'eval3']:
        print('=' * 50)
        print(' ' * 20 + data_type + ' (' + data_size + ')' + ' ' * 20)
        print('=' * 50)

        ########################################
        # labels
        ########################################
        if data_type == 'train':
            label_paths = path.trans(data_type='train_' + data_size)
        else:
            label_paths = path.trans(data_type=data_type)
        save_vocab_file = True if data_type == 'train' else False
        is_test = True if 'eval' in data_type else False

        print('=> Processing transcripts...')
        speaker_dict_dict[data_type] = read_sdb(
            label_paths=label_paths,
            data_size=data_size,
            vocab_file_save_path=mkdir_join('./config', 'vocab_files'),
            save_vocab_file=save_vocab_file,
            is_test=is_test,
            data_type=data_type)

        ########################################
        # inputs
        ########################################
        print('\n=> Processing input data...')
        input_save_path = mkdir_join(args.feature_save_path, args.save_format,
                                     data_size)
        if isfile(join(input_save_path, data_type, 'complete.txt')):
            print('Already exists.')
        else:
            if args.save_format == 'wav':
                ########################################
                # Split WAV files per utterance
                ########################################
                if data_type == 'train':
                    wav_paths = path.wav(corpus='train' + data_size)
                else:
                    wav_paths = path.wav(corpus=data_type)

                split_wav(wav_paths=wav_paths,
                          speaker_dict=speaker_dict_dict[data_type],
                          save_path=mkdir_join(input_save_path, data_type))
                # NOTE: ex.) save_path:
                # csj/feature/save_format/data_size/data_type/speaker/utt_name.npy

            elif args.save_format in ['numpy', 'htk']:
                if data_type == 'train':
                    if args.tool == 'htk':
                        audio_paths = path.htk(data_type='train_' + data_size)
                    else:
                        audio_paths = path.wav(data_type='train_' + data_size)
                    is_training = True
                    global_mean_male, global_std_male, global_mean_female, global_std_female = None, None, None, None
                else:
                    if args.tool == 'htk':
                        audio_paths = path.htk(data_type=data_type)
                    else:
                        audio_paths = path.wav(data_type=data_type)
                    is_training = False

                    # Load statistics over train dataset
                    global_mean_male = np.load(
                        join(input_save_path, 'train/global_mean_male.npy'))
                    global_std_male = np.load(
                        join(input_save_path, 'train/global_std_male.npy'))
                    global_mean_female = np.load(
                        join(input_save_path, 'train/global_mean_female.npy'))
                    global_std_female = np.load(
                        join(input_save_path, 'train/global_std_female.npy'))

                read_audio(audio_paths=audio_paths,
                           speaker_dict=speaker_dict_dict[data_type],
                           tool=args.tool,
                           config=CONFIG,
                           normalize=args.normalize,
                           is_training=is_training,
                           save_path=mkdir_join(input_save_path, data_type),
                           save_format=args.save_format,
                           global_mean_male=global_mean_male,
                           global_std_male=global_std_male,
                           global_mean_female=global_mean_female,
                           global_std_female=global_std_female)
                # NOTE: ex.) save_path:
                # csj/feature/save_format/data_size/data_type/speaker/*.npy

            # Make a confirmation file to prove that dataset was saved
            # correctly
            with open(join(input_save_path, data_type, 'complete.txt'),
                      'w') as f:
                f.write('')

    ########################################
    # dataset (csv)
    ########################################
    print('\n=> Saving dataset files...')
    for data_type in ['train', 'eval1', 'eval2', 'eval3']:
        dataset_save_path = mkdir_join(args.dataset_save_path,
                                       args.save_format, data_size, data_type)

        print('---------- %s ----------' % data_type)
        df_kanji = pd.DataFrame(
            [], columns=['frame_num', 'input_path', 'transcript'])
        df_kana = pd.DataFrame(
            [], columns=['frame_num', 'input_path', 'transcript'])
        df_phone = pd.DataFrame(
            [], columns=['frame_num', 'input_path', 'transcript'])

        utt_count = 0
        df_kanji_list, df_kana_list, df_phone_list = [], [], []
        for speaker, utt_dict in tqdm(speaker_dict_dict[data_type].items()):
            for utt_index, utt_info in utt_dict.items():
                trans_kanji, trans_kana, trans_phone = utt_info[2:]
                if args.save_format == 'numpy':
                    input_utt_save_path = join(
                        input_save_path, data_type, speaker,
                        speaker + '_' + utt_index + '.npy')
                    assert isfile(input_utt_save_path)
                    input_utt = np.load(input_utt_save_path)
                elif args.save_format == 'htk':
                    input_utt_save_path = join(
                        input_save_path, data_type, speaker,
                        speaker + '_' + utt_index + '.htk')
                    assert isfile(input_utt_save_path)
                    input_utt, _, _ = read(input_utt_save_path)
                elif args.save_format == 'wav':
                    input_utt_save_path = path.utt2wav(utt_index)
                    assert isfile(input_utt_save_path)
                    input_utt = w2f_psf(input_utt_save_path,
                                        feature_type=CONFIG['feature_type'],
                                        feature_dim=CONFIG['channels'],
                                        use_energy=CONFIG['energy'],
                                        use_delta1=CONFIG['delta'],
                                        use_delta2=CONFIG['deltadelta'],
                                        window=CONFIG['window'],
                                        slide=CONFIG['slide'])
                else:
                    raise ValueError('save_format is numpy or htk or wav.')
                frame_num = input_utt.shape[0]
                del input_utt

                series_kanji = pd.Series(
                    [frame_num, input_utt_save_path, trans_kanji],
                    index=df_kanji.columns)
                series_kana = pd.Series(
                    [frame_num, input_utt_save_path, trans_kana],
                    index=df_kana.columns)
                series_phone = pd.Series(
                    [frame_num, input_utt_save_path, trans_phone],
                    index=df_phone.columns)

                df_kanji = df_kanji.append(series_kanji, ignore_index=True)
                df_kana = df_kana.append(series_kana, ignore_index=True)
                df_phone = df_phone.append(series_phone, ignore_index=True)

                utt_count += 1

                # Reset
                if utt_count == 50000:
                    df_kanji_list.append(df_kanji)
                    df_kana_list.append(df_kana)
                    df_phone_list.append(df_phone)
                    df_kanji = pd.DataFrame(
                        [], columns=['frame_num', 'input_path', 'transcript'])
                    df_kana = pd.DataFrame(
                        [], columns=['frame_num', 'input_path', 'transcript'])
                    df_phone = pd.DataFrame(
                        [], columns=['frame_num', 'input_path', 'transcript'])
                    utt_count = 0

        # After all speakers: flush the last (possibly partial) chunk
        df_kanji_list.append(df_kanji)
        df_kana_list.append(df_kana)
        df_phone_list.append(df_phone)

        # Concatenate all chunks into one dataframe per label type
        df_kanji = df_kanji_list[0]
        df_kana = df_kana_list[0]
        df_phone = df_phone_list[0]
        for df_i in df_kanji_list[1:]:
            df_kanji = pd.concat([df_kanji, df_i], axis=0)
        for df_i in df_kana_list[1:]:
            df_kana = pd.concat([df_kana, df_i], axis=0)
        for df_i in df_phone_list[1:]:
            df_phone = pd.concat([df_phone, df_i], axis=0)

        df_kanji.to_csv(join(dataset_save_path, 'dataset_kanji.csv'))
        df_kana.to_csv(join(dataset_save_path, 'dataset_kana.csv'))
        df_phone.to_csv(join(dataset_save_path, 'dataset_phone.csv'))

        # Use the first 4000 utterances as the dev set
        if data_type == 'train':
            df_kanji[:4000].to_csv(
                mkdir_join(args.dataset_save_path, args.save_format, data_size,
                           'dev', 'dataset_kanji.csv'))
            df_kana[:4000].to_csv(
                mkdir_join(args.dataset_save_path, args.save_format, data_size,
                           'dev', 'dataset_kana.csv'))
            df_phone[:4000].to_csv(
                mkdir_join(args.dataset_save_path, args.save_format, data_size,
                           'dev', 'dataset_phone.csv'))
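
One caveat about the dataset loop above: pd.DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0, so on current pandas the same row-batching pattern is usually written with plain dicts and pd.concat. A minimal sketch (not part of the original script) using the column names of the CSV files above and hypothetical rows:

import pandas as pd

columns = ['frame_num', 'input_path', 'transcript']
rows, chunks = [], []

# Hypothetical rows; in the script these come from speaker_dict_dict
for frame_num, input_path, transcript in [
        (120, 'speaker1_utt001.npy', 'transcript one'),
        (95, 'speaker1_utt002.npy', 'transcript two')]:
    rows.append({'frame_num': frame_num,
                 'input_path': input_path,
                 'transcript': transcript})
    if len(rows) == 50000:  # flush a chunk, as the script does every 50000 utterances
        chunks.append(pd.DataFrame(rows, columns=columns))
        rows = []

chunks.append(pd.DataFrame(rows, columns=columns))  # last (possibly partial) chunk
df = pd.concat(chunks, axis=0, ignore_index=True)
df.to_csv('dataset_kanji.csv')  # hypothetical output path
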
Example No. 7
def read_audio(audio_paths,
               speaker_dict,
               tool,
               config,
               normalize,
               is_training,
               save_path=None,
               save_format=None,
               global_mean=None,
               global_std=None,
               dtype=np.float32):
    """Read HTK or WAV files.
    Args:
        audio_paths (list): paths to HTK or WAV files
        speaker_dict (dict): A dictionary of speakers' gender information
            key (string) => speaker
            value (dict) => dictionary of utterance information of each speaker
                key (string) => utterance index
                value (list) => [start_frame, end_frame, transcript]
        tool (string): the tool to extract features,
            htk or librosa or python_speech_features
        config (dict): a configuration for feature extraction
        normalize (string):
            no => no normalization is performed
            global => normalize input features by global mean & std over
                      the training set
            speaker => normalize input features by mean & std per speaker
            utterance => normalize input features by mean & std per utterance
        is_training (bool): training or not
        save_path (string): path to save npy files
        save_format (string, optional): numpy or htk
        global_mean (np.ndarray, optional): global mean over the training set
        global_std (np.ndarray, optional): global standard deviation over the
            training set
        dtype (optional): the type of data, default is np.float32
    Returns:
        global_mean (np.ndarray): global mean over the training set
        global_std (np.ndarray): global standard deviation over the
            training set
        frame_num_dict (dict):
            key => utterance name
            value => the number of frames
    """
    if not is_training:
        if global_mean is None or global_std is None:
            raise ValueError('Pass the global mean & std computed over the '
                             'training set.')
    if normalize not in ['global', 'speaker', 'utterance', 'no']:
        raise ValueError(
            'normalize must be "utterance" or "speaker" or "global" or "no".')

    total_frame_num = 0
    total_frame_num_dict = {}
    speaker_mean_dict = {}

    # Loop 1: Compute the global mean (and per-speaker statistics)
    if is_training and normalize != 'no':
        print('=====> Reading audio files...')
        for i, audio_path in enumerate(tqdm(audio_paths)):
            speaker = basename(audio_path).split('.')[0]

            # Fix speaker name
            speaker = speaker.replace('sw0', 'sw')
            # ex.) sw04771-A => sw4771-A (LDC97S62)
            speaker = speaker.replace('sw_', 'sw')
            # ex.) sw_4771-A => sw4771-A (eval2000, swbd)
            speaker = speaker.replace('en_', 'en')
            # ex.) en_4156-A => en4156-A (eval2000, ch)

            # Divide each audio file into utterances
            _, input_utt_sum, speaker_mean, _, total_frame_num_speaker = segment(
                audio_path,
                speaker,
                speaker_dict[speaker],
                is_training=True,
                sil_duration=0,
                tool=tool,
                config=config)

            if i == 0:
                # Initialize global statistics
                feature_dim = input_utt_sum.shape[0]
                global_mean = np.zeros((feature_dim, ), dtype=dtype)
                global_std = np.zeros((feature_dim, ), dtype=dtype)

            global_mean += input_utt_sum
            total_frame_num += total_frame_num_speaker

            # For computing speaker stddev
            if normalize == 'speaker':
                speaker_mean_dict[speaker] = speaker_mean
                total_frame_num_dict[speaker] = total_frame_num_speaker
                # NOTE: speaker mean is already computed

        print('=====> Computing global mean & stddev...')
        # Compute global mean
        global_mean /= total_frame_num

        for audio_path in tqdm(audio_paths):
            speaker = basename(audio_path).split('.')[0]

            # Normalize speaker name
            speaker = speaker.replace('sw0', 'sw')
            speaker = speaker.replace('sw_', 'sw')
            speaker = speaker.replace('en_', 'en')

            # Divide each audio into utterances
            input_data_dict_speaker, _, _, _, _ = segment(
                audio_path,
                speaker,
                speaker_dict[speaker],
                is_training=True,
                sil_duration=0,
                tool=tool,
                config=config)

            # For computing global stddev
            for input_utt in input_data_dict_speaker.values():
                global_std += np.sum(np.abs(input_utt - global_mean)**2,
                                     axis=0)

        # Compute global stddev
        global_std = np.sqrt(global_std / (total_frame_num - 1))

        if save_path is not None:
            # Save global mean & std
            np.save(join(save_path, 'global_mean.npy'), global_mean)
            np.save(join(save_path, 'global_std.npy'), global_std)

    # Loop 2: Normalization and Saving
    print('=====> Normalization...')
    frame_num_dict = {}
    sampPeriod, parmKind = None, None
    for audio_path in tqdm(audio_paths):
        speaker = basename(audio_path).split('.')[0]

        # Normalize speaker name
        speaker = speaker.replace('sw0', 'sw')
        speaker = speaker.replace('sw_', 'sw')
        speaker = speaker.replace('en_', 'en')

        if normalize == 'speaker' and is_training:
            speaker_mean = speaker_mean_dict[speaker]
        else:
            speaker_mean = None

        # Divide each audio into utterances
        input_data_dict_speaker, _, speaker_mean, speaker_std, _ = segment(
            audio_path,
            speaker,
            speaker_dict[speaker],
            is_training=is_training,
            sil_duration=0,
            tool=tool,
            config=config,
            mean=speaker_mean)  # for computing the speaker stddev
        # NOTE: input_data_dict_speaker has not been normalized yet

        for utt_index, input_utt in input_data_dict_speaker.items():

            if normalize == 'no':
                pass
            elif normalize == 'global' or not is_training:
                # Normalize by mean & std over the training set
                input_utt -= global_mean
                input_utt /= global_std
            elif normalize == 'speaker':
                # Normalize by mean & std per speaker
                input_utt = (input_utt - speaker_mean) / speaker_std
            elif normalize == 'utterance':
                # Normalize by mean & std per utterance
                utt_mean = np.mean(input_utt, axis=0, dtype=dtype)
                utt_std = np.std(input_utt, axis=0, dtype=dtype)
                input_utt = (input_utt - utt_mean) / utt_std
            else:
                raise ValueError(
                    'normalize must be "utterance", "speaker", "global", or "no".')

            frame_num_dict[speaker + '_' + utt_index] = input_utt.shape[0]

            if save_path is not None:
                # Save input features
                if save_format == 'numpy':
                    input_data_save_path = mkdir_join(
                        save_path, speaker, speaker + '_' + utt_index + '.npy')
                    np.save(input_data_save_path, input_utt)
                elif save_format == 'htk':
                    if sampPeriod is None:
                        _, sampPeriod, parmKind = read(audio_path)
                    write(input_utt,
                          htk_path=mkdir_join(
                              save_path, speaker,
                              speaker + '_' + utt_index + '.htk'),
                          sampPeriod=sampPeriod,
                          parmKind=parmKind)
                else:
                    raise ValueError('save_format is numpy or htk.')

    if save_path is not None:
        # Save the frame number dictionary
        with open(join(save_path, 'frame_num.pickle'), 'wb') as f:
            pickle.dump(frame_num_dict, f)

    return global_mean, global_std, frame_num_dict
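
A usage sketch for the function above (hypothetical paths and values, not part of the original script), assuming the helper modules used above (segment, HTK read/write) are importable, a speaker_dict of the shape described in the docstring, and a feature-extraction config with the keys used elsewhere in this document:

# Hypothetical inputs; the real ones come from the corpus-specific path and label readers.
audio_paths = ['/data/swbd/htk/sw4771-A.htk']
speaker_dict = {
    'sw4771-A': {
        '0001': [0, 250, 'hello'],           # [start_frame, end_frame, transcript]
        '0002': [300, 520, 'how are you'],
    },
}
config = {'feature_type': 'fbank', 'channels': 40, 'energy': False,
          'delta': True, 'deltadelta': True, 'window': 0.025, 'slide': 0.01}

global_mean, global_std, frame_num_dict = read_audio(
    audio_paths=audio_paths,
    speaker_dict=speaker_dict,
    tool='htk',
    config=config,
    normalize='global',
    is_training=True,
    save_path='/data/swbd/feature/htk/train',
    save_format='numpy')
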