Example #1
    def check(self, normalize, tool):
        # Assumes test-module fixtures (label_paths, htk_paths, wav_paths,
        # CONFIG) and helpers (read_sdb, read_audio) defined elsewhere.

        print('==================================================')
        print('  normalize: %s' % normalize)
        print('  tool: %s' % tool)
        print('==================================================')

        audio_paths = htk_paths if tool == 'htk' else wav_paths

        print('---------- train ----------')
        speaker_dict = read_sdb(label_paths=label_paths['train'],
                                data_size='subset',
                                vocab_file_save_path='../config/mapping_files',
                                is_training=True,
                                save_vocab_file=True)
        # Compute per-gender global mean/std over the training set; these
        # statistics are reused below to normalize the dev/eval sets
        global_mean_male, global_mean_female, global_std_male, global_std_female = read_audio(
            audio_paths=audio_paths['train'],
            speaker_dict=speaker_dict,
            tool=tool,
            config=CONFIG,
            normalize=normalize,
            is_training=True)

        for data_type in ['dev', 'eval1', 'eval2', 'eval3']:
            print('---------- %s ----------' % data_type)
            speaker_dict = read_sdb(
                label_paths=label_paths[data_type],
                data_size='subset',
                vocab_file_save_path='../config/mapping_files',
                is_test=True)
            read_audio(audio_paths=audio_paths[data_type],
                       speaker_dict=speaker_dict,
                       tool=tool,
                       config=CONFIG,
                       normalize=normalize,
                       is_training=False,
                       global_mean_male=global_mean_male,
                       global_mean_female=global_mean_female,
                       global_std_male=global_std_male,
                       global_std_female=global_std_female)
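
A minimal sketch of a harness that drives check() over the full (normalize, tool) grid. Everything beyond the snippet above is an assumption: the class name, and the candidate values for normalize and tool, which stand in for whatever the pipeline actually supports.

import unittest
from itertools import product


class TestFeatureExtraction(unittest.TestCase):
    # Hypothetical harness: check() from the example above is assumed to be
    # defined on this class along with its fixtures.

    def test_all_settings(self):
        # Placeholder candidate values, not the project's real options
        for normalize, tool in product(['global', 'speaker', 'utterance'],
                                       ['htk', 'wav']):
            with self.subTest(normalize=normalize, tool=tool):
                self.check(normalize=normalize, tool=tool)


if __name__ == '__main__':
    unittest.main()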
Example #2
def main(data_size):
    # Assumes module-level imports (numpy as np, pandas as pd, pickle, tqdm,
    # os.path.join/isfile) and project helpers (args, path, CONFIG, read_sdb,
    # read_audio, split_wav, add_element, mkdir_join) defined elsewhere.

    speaker_dict_dict = {}  # speaker_dict per data_type
    for data_type in ['train', 'eval1', 'eval2', 'eval3']:
        print('=' * 50)
        print(' ' * 20 + data_type + ' (' + data_size + ')' + ' ' * 20)
        print('=' * 50)

        ########################################
        # labels
        ########################################
        if data_type == 'train':
            label_paths = path.trans(data_type='train_' + data_size)
        else:
            label_paths = path.trans(data_type=data_type)
        save_vocab_file = data_type == 'train'
        is_test = 'eval' in data_type

        print('=> Processing transcripts...')
        speaker_dict_dict[data_type] = read_sdb(
            label_paths=label_paths,
            data_size=data_size,
            vocab_file_save_path=mkdir_join('./config', 'vocab_files'),
            save_vocab_file=save_vocab_file,
            is_test=is_test,
            data_type=data_type)

        ########################################
        # inputs
        ########################################
        print('\n=> Processing input data...')
        input_save_path = mkdir_join(args.feature_save_path, args.save_format,
                                     data_size)
        if isfile(join(input_save_path, data_type, 'complete.txt')):
            print('Already exists.')
        else:
            if args.save_format == 'wav':
                ########################################
                # Split WAV files per utterance
                ########################################
                if data_type == 'train':
                    wav_paths = path.wav(corpus='train_' + data_size)
                else:
                    wav_paths = path.wav(corpus=data_type)

                split_wav(wav_paths=wav_paths,
                          speaker_dict=speaker_dict_dict[data_type],
                          save_path=mkdir_join(input_save_path, data_type))
                # e.g. save_path:
                # csj/feature/save_format/data_size/data_type/speaker/utt_name.npy

            elif args.save_format in ['numpy', 'htk']:
                if data_type == 'train':
                    if args.tool == 'htk':
                        audio_paths = path.htk(data_type='train_' + data_size)
                    else:
                        audio_paths = path.wav(data_type='train_' + data_size)
                    is_training = True
                    global_mean_male, global_std_male = None, None
                    global_mean_female, global_std_female = None, None
                else:
                    if args.tool == 'htk':
                        audio_paths = path.htk(data_type=data_type)
                    else:
                        audio_paths = path.wav(data_type=data_type)
                    is_training = False

                    # Load statistics over train dataset
                    global_mean_male = np.load(
                        join(input_save_path, 'train/global_mean_male.npy'))
                    global_std_male = np.load(
                        join(input_save_path, 'train/global_std_male.npy'))
                    global_mean_female = np.load(
                        join(input_save_path, 'train/global_mean_female.npy'))
                    global_std_female = np.load(
                        join(input_save_path, 'train/global_std_female.npy'))

                read_audio(audio_paths=audio_paths,
                           speaker_dict=speaker_dict_dict[data_type],
                           tool=args.tool,
                           config=CONFIG,
                           normalize=args.normalize,
                           is_training=is_training,
                           save_path=mkdir_join(input_save_path, data_type),
                           save_format=args.save_format,
                           global_mean_male=global_mean_male,
                           global_std_male=global_std_male,
                           global_mean_female=global_mean_female,
                           global_std_female=global_std_female)
                # e.g. save_path:
                # csj/feature/save_format/data_size/data_type/speaker/*.npy

            # Write an empty marker file to record that this split was
            # processed successfully
            with open(join(input_save_path, data_type, 'complete.txt'),
                      'w') as f:
                f.write('')

        ########################################
        # dataset (csv)
        ########################################
        print('\n=> Saving dataset files...')
        dataset_save_path = mkdir_join(args.dataset_save_path,
                                       args.save_format, data_size, data_type)

        df_columns = ['frame_num', 'input_path', 'transcript']
        df_kanji = pd.DataFrame([], columns=df_columns)
        df_kanji_divide = pd.DataFrame([], columns=df_columns)
        df_kana = pd.DataFrame([], columns=df_columns)
        df_kana_divide = pd.DataFrame([], columns=df_columns)
        df_phone = pd.DataFrame([], columns=df_columns)
        df_phone_divide = pd.DataFrame([], columns=df_columns)
        df_word_freq1 = pd.DataFrame([], columns=df_columns)
        df_word_freq5 = pd.DataFrame([], columns=df_columns)
        df_word_freq10 = pd.DataFrame([], columns=df_columns)
        df_word_freq15 = pd.DataFrame([], columns=df_columns)

        with open(join(input_save_path, data_type, 'frame_num.pickle'),
                  'rb') as f:
            frame_num_dict = pickle.load(f)

        utt_count = 0
        df_kanji_list, df_kanji_divide_list = [], []
        df_kana_list, df_kana_divide_list = [], []
        df_phone_list, df_phone_divide_list = [], []
        df_word_freq1_list, df_word_freq5_list = [], []
        df_word_freq10_list, df_word_freq15_list = [], []
        speaker_dict = speaker_dict_dict[data_type]
        for speaker, utt_dict in tqdm(speaker_dict.items()):
            for utt_index, utt_info in utt_dict.items():
                kanji_indices, kanji_divide_indices = utt_info[2:4]
                kana_indices, kana_divide_indices = utt_info[4:6]
                phone_indices, phone_divide_indices = utt_info[6:8]
                word_freq1_indices, word_freq5_indices = utt_info[8:10]
                word_freq10_indices, word_freq15_indices = utt_info[10:12]

                if args.save_format == 'numpy':
                    input_utt_save_path = join(
                        input_save_path, data_type, speaker,
                        speaker + '_' + utt_index + '.npy')
                elif args.save_format == 'htk':
                    input_utt_save_path = join(
                        input_save_path, data_type, speaker,
                        speaker + '_' + utt_index + '.htk')
                elif args.save_format == 'wav':
                    input_utt_save_path = path.utt2wav(utt_index)
                else:
                    raise ValueError(
                        "save_format must be one of: 'numpy', 'htk', 'wav'.")

                assert isfile(input_utt_save_path)
                frame_num = frame_num_dict[speaker + '_' + utt_index]

                df_kanji = add_element(
                    df_kanji, [frame_num, input_utt_save_path, kanji_indices])
                df_kanji_divide = add_element(
                    df_kanji_divide,
                    [frame_num, input_utt_save_path, kanji_divide_indices])
                df_kana = add_element(
                    df_kana, [frame_num, input_utt_save_path, kana_indices])
                df_kana_divide = add_element(
                    df_kana_divide,
                    [frame_num, input_utt_save_path, kana_divide_indices])
                df_phone = add_element(
                    df_phone, [frame_num, input_utt_save_path, phone_indices])
                df_phone_divide = add_element(
                    df_phone_divide,
                    [frame_num, input_utt_save_path, phone_divide_indices])
                df_word_freq1 = add_element(
                    df_word_freq1,
                    [frame_num, input_utt_save_path, word_freq1_indices])
                df_word_freq5 = add_element(
                    df_word_freq5,
                    [frame_num, input_utt_save_path, word_freq5_indices])
                df_word_freq10 = add_element(
                    df_word_freq10,
                    [frame_num, input_utt_save_path, word_freq10_indices])
                df_word_freq15 = add_element(
                    df_word_freq15,
                    [frame_num, input_utt_save_path, word_freq15_indices])
                utt_count += 1

                # Flush every 10,000 utterances: appending row-by-row slows
                # down as a DataFrame grows, so stash the chunk and restart
                if utt_count == 10000:
                    df_kanji_list.append(df_kanji)
                    df_kanji_divide_list.append(df_kanji_divide)
                    df_kana_list.append(df_kana)
                    df_kana_divide_list.append(df_kana_divide)
                    df_phone_list.append(df_phone)
                    df_phone_divide_list.append(df_phone_divide)
                    df_word_freq1_list.append(df_word_freq1)
                    df_word_freq5_list.append(df_word_freq5)
                    df_word_freq10_list.append(df_word_freq10)
                    df_word_freq15_list.append(df_word_freq15)

                    df_kanji = pd.DataFrame([], columns=df_columns)
                    df_kanji_divide = pd.DataFrame([], columns=df_columns)
                    df_kana = pd.DataFrame([], columns=df_columns)
                    df_kana_divide = pd.DataFrame([], columns=df_columns)
                    df_phone = pd.DataFrame([], columns=df_columns)
                    df_phone_divide = pd.DataFrame([], columns=df_columns)
                    df_word_freq1 = pd.DataFrame([], columns=df_columns)
                    df_word_freq5 = pd.DataFrame([], columns=df_columns)
                    df_word_freq10 = pd.DataFrame([], columns=df_columns)
                    df_word_freq15 = pd.DataFrame([], columns=df_columns)
                    utt_count = 0

        # Append the final partial chunk
        df_kanji_list.append(df_kanji)
        df_kanji_divide_list.append(df_kanji_divide)
        df_kana_list.append(df_kana)
        df_kana_divide_list.append(df_kana_divide)
        df_phone_list.append(df_phone)
        df_phone_divide_list.append(df_phone_divide)
        df_word_freq1_list.append(df_word_freq1)
        df_word_freq5_list.append(df_word_freq5)
        df_word_freq10_list.append(df_word_freq10)
        df_word_freq15_list.append(df_word_freq15)

        # Concatenate all chunks into single dataframes
        df_kanji = pd.concat(df_kanji_list, axis=0)
        df_kanji_divide = pd.concat(df_kanji_divide_list, axis=0)
        df_kana = pd.concat(df_kana_list, axis=0)
        df_kana_divide = pd.concat(df_kana_divide_list, axis=0)
        df_phone = pd.concat(df_phone_list, axis=0)
        df_phone_divide = pd.concat(df_phone_divide_list, axis=0)
        df_word_freq1 = pd.concat(df_word_freq1_list, axis=0)
        df_word_freq5 = pd.concat(df_word_freq5_list, axis=0)
        df_word_freq10 = pd.concat(df_word_freq10_list, axis=0)
        df_word_freq15 = pd.concat(df_word_freq15_list, axis=0)

        df_kanji.to_csv(join(dataset_save_path, 'kanji.csv'))
        df_kanji_divide.to_csv(join(dataset_save_path, 'kanji_divide.csv'))
        df_kana.to_csv(join(dataset_save_path, 'kana.csv'))
        df_kana_divide.to_csv(join(dataset_save_path, 'kana_divide.csv'))
        df_phone.to_csv(join(dataset_save_path, 'phone.csv'))
        df_phone_divide.to_csv(join(dataset_save_path, 'phone_divide.csv'))
        df_word_freq1.to_csv(join(dataset_save_path, 'word_freq1.csv'))
        df_word_freq5.to_csv(join(dataset_save_path, 'word_freq5.csv'))
        df_word_freq10.to_csv(join(dataset_save_path, 'word_freq10.csv'))
        df_word_freq15.to_csv(join(dataset_save_path, 'word_freq15.csv'))
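
Example #2 relies on an add_element() helper that is not shown. A plausible minimal reconstruction, inferred purely from the call sites above (the project's actual helper may differ):

import pandas as pd


def add_element(df, elem_list):
    # Append one [frame_num, input_path, transcript] row and return the new
    # frame; call sites rebind the result: df = add_element(df, [...])
    row = pd.Series(elem_list, index=df.columns)
    return pd.concat([df, row.to_frame().T], ignore_index=True)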
Example #3
def main(data_size):
    # Assumes module-level imports (numpy as np, pandas as pd, tqdm,
    # os.path.join/isfile) and project helpers (args, path, CONFIG, read_sdb,
    # read_audio, split_wav, mkdir_join, read, w2f_psf) defined elsewhere.

    speaker_dict_dict = {}  # speaker_dict per data_type
    for data_type in ['train', 'eval1', 'eval2', 'eval3']:
        print('=' * 50)
        print(' ' * 20 + data_type + ' (' + data_size + ')' + ' ' * 20)
        print('=' * 50)

        ########################################
        # labels
        ########################################
        if data_type == 'train':
            label_paths = path.trans(data_type='train_' + data_size)
        else:
            label_paths = path.trans(data_type=data_type)
        save_vocab_file = data_type == 'train'
        is_test = 'eval' in data_type

        print('=> Processing transcripts...')
        speaker_dict_dict[data_type] = read_sdb(
            label_paths=label_paths,
            data_size=data_size,
            vocab_file_save_path=mkdir_join('./config', 'vocab_files'),
            save_vocab_file=save_vocab_file,
            is_test=is_test,
            data_type=data_type)

        ########################################
        # inputs
        ########################################
        print('\n=> Processing input data...')
        input_save_path = mkdir_join(args.feature_save_path, args.save_format,
                                     data_size)
        if isfile(join(input_save_path, data_type, 'complete.txt')):
            print('Already exists.')
        else:
            if args.save_format == 'wav':
                ########################################
                # Split WAV files per utterance
                ########################################
                if data_type == 'train':
                    wav_paths = path.wav(corpus='train_' + data_size)
                else:
                    wav_paths = path.wav(corpus=data_type)

                split_wav(wav_paths=wav_paths,
                          speaker_dict=speaker_dict_dict[data_type],
                          save_path=mkdir_join(input_save_path, data_type))
                # e.g. save_path:
                # csj/feature/save_format/data_size/data_type/speaker/utt_name.npy

            elif args.save_format in ['numpy', 'htk']:
                if data_type == 'train':
                    if args.tool == 'htk':
                        audio_paths = path.htk(data_type='train_' + data_size)
                    else:
                        audio_paths = path.wav(data_type='train_' + data_size)
                    is_training = True
                    global_mean_male, global_std_male = None, None
                    global_mean_female, global_std_female = None, None
                else:
                    if args.tool == 'htk':
                        audio_paths = path.htk(data_type=data_type)
                    else:
                        audio_paths = path.wav(data_type=data_type)
                    is_training = False

                    # Load statistics over train dataset
                    global_mean_male = np.load(
                        join(input_save_path, 'train/global_mean_male.npy'))
                    global_std_male = np.load(
                        join(input_save_path, 'train/global_std_male.npy'))
                    global_mean_female = np.load(
                        join(input_save_path, 'train/global_mean_female.npy'))
                    global_std_female = np.load(
                        join(input_save_path, 'train/global_std_female.npy'))

                read_audio(audio_paths=audio_paths,
                           speaker_dict=speaker_dict_dict[data_type],
                           tool=args.tool,
                           config=CONFIG,
                           normalize=args.normalize,
                           is_training=is_training,
                           save_path=mkdir_join(input_save_path, data_type),
                           save_format=args.save_format,
                           global_mean_male=global_mean_male,
                           global_std_male=global_std_male,
                           global_mean_female=global_mean_female,
                           global_std_female=global_std_female)
                # e.g. save_path:
                # csj/feature/save_format/data_size/data_type/speaker/*.npy

            # Write an empty marker file to record that this split was
            # processed successfully
            with open(join(input_save_path, data_type, 'complete.txt'),
                      'w') as f:
                f.write('')

    ########################################
    # dataset (csv)
    ########################################
    print('\n=> Saving dataset files...')
    for data_type in ['train', 'eval1', 'eval2', 'eval3']:
        dataset_save_path = mkdir_join(args.dataset_save_path,
                                       args.save_format, data_size, data_type)

        print('---------- %s ----------' % data_type)
        df_kanji = pd.DataFrame(
            [], columns=['frame_num', 'input_path', 'transcript'])
        df_kana = pd.DataFrame(
            [], columns=['frame_num', 'input_path', 'transcript'])
        df_phone = pd.DataFrame(
            [], columns=['frame_num', 'input_path', 'transcript'])

        utt_count = 0
        df_kanji_list, df_kana_list, df_phone_list = [], [], []
        for speaker, utt_dict in tqdm(speaker_dict_dict[data_type].items()):
            for utt_index, utt_info in utt_dict.items():
                trans_kanji, trans_kana, trans_phone = utt_info[2:]
                if args.save_format == 'numpy':
                    input_utt_save_path = join(
                        input_save_path, data_type, speaker,
                        speaker + '_' + utt_index + '.npy')
                    assert isfile(input_utt_save_path)
                    input_utt = np.load(input_utt_save_path)
                elif args.save_format == 'htk':
                    input_utt_save_path = join(
                        input_save_path, data_type, speaker,
                        speaker + '_' + utt_index + '.htk')
                    assert isfile(input_utt_save_path)
                    input_utt, _, _ = read(input_utt_save_path)
                elif args.save_format == 'wav':
                    input_utt_save_path = path.utt2wav(utt_index)
                    assert isfile(input_utt_save_path)
                    input_utt = w2f_psf(input_utt_save_path,
                                        feature_type=CONFIG['feature_type'],
                                        feature_dim=CONFIG['channels'],
                                        use_energy=CONFIG['energy'],
                                        use_delta1=CONFIG['delta'],
                                        use_delta2=CONFIG['deltadelta'],
                                        window=CONFIG['window'],
                                        slide=CONFIG['slide'])
                else:
                    raise ValueError(
                        "save_format must be one of: 'numpy', 'htk', 'wav'.")
                frame_num = input_utt.shape[0]
                del input_utt

                series_kanji = pd.Series(
                    [frame_num, input_utt_save_path, trans_kanji],
                    index=df_kanji.columns)
                series_kana = pd.Series(
                    [frame_num, input_utt_save_path, trans_kana],
                    index=df_kana.columns)
                series_phone = pd.Series(
                    [frame_num, input_utt_save_path, trans_phone],
                    index=df_phone.columns)

                # DataFrame.append was removed in pandas 2.0; concatenate
                # one-row frames instead
                df_kanji = pd.concat([df_kanji, series_kanji.to_frame().T],
                                     ignore_index=True)
                df_kana = pd.concat([df_kana, series_kana.to_frame().T],
                                    ignore_index=True)
                df_phone = pd.concat([df_phone, series_phone.to_frame().T],
                                     ignore_index=True)

                utt_count += 1

                # Flush every 50,000 utterances: appending row-by-row slows
                # down as a DataFrame grows, so stash the chunk and restart
                if utt_count == 50000:
                    df_kanji_list.append(df_kanji)
                    df_kana_list.append(df_kana)
                    df_phone_list.append(df_phone)
                    df_kanji = pd.DataFrame(
                        [], columns=['frame_num', 'input_path', 'transcript'])
                    df_kana = pd.DataFrame(
                        [], columns=['frame_num', 'input_path', 'transcript'])
                    df_phone = pd.DataFrame(
                        [], columns=['frame_num', 'input_path', 'transcript'])
                    utt_count = 0

        # Append the final partial chunk
        df_kanji_list.append(df_kanji)
        df_kana_list.append(df_kana)
        df_phone_list.append(df_phone)

        # Concatenate all chunks into single dataframes
        df_kanji = pd.concat(df_kanji_list, axis=0)
        df_kana = pd.concat(df_kana_list, axis=0)
        df_phone = pd.concat(df_phone_list, axis=0)

        df_kanji.to_csv(join(dataset_save_path, 'dataset_kanji.csv'))
        df_kana.to_csv(join(dataset_save_path, 'dataset_kana.csv'))
        df_phone.to_csv(join(dataset_save_path, 'dataset_phone.csv'))

        # Use the first 4000 utterances as the dev set
        if data_type == 'train':
            df_kanji[:4000].to_csv(
                mkdir_join(args.dataset_save_path, args.save_format, data_size,
                           'dev', 'dataset_kanji.csv'))
            df_kana[:4000].to_csv(
                mkdir_join(args.dataset_save_path, args.save_format, data_size,
                           'dev', 'dataset_kana.csv'))
            df_phone[:4000].to_csv(
                mkdir_join(args.dataset_save_path, args.save_format, data_size,
                           'dev', 'dataset_phone.csv'))
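
All CSVs written by these examples share the frame_num / input_path / transcript layout, so downstream loading is uniform. A minimal consumption sketch; the path below is illustrative, following the <dataset_save_path>/<save_format>/<data_size>/<data_type> pattern used above:

import pandas as pd

# Illustrative path; substitute the actual save arguments
df = pd.read_csv('./dataset/numpy/subset/train/dataset_kanji.csv',
                 index_col=0)

# Sorting by frame count groups utterances of similar length, a common
# trick for building padded mini-batches efficiently
df = df.sort_values('frame_num')
for input_path, transcript in zip(df['input_path'], df['transcript']):
    pass  # load features from input_path and pair them with transcript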