Example #1
def main():

    for data_type in ['train', 'dev', 'test']:
        print('=' * 50)
        print(' ' * 20 + data_type)
        print('=' * 50)

        # Convert transcript to index
        print('=> Processing transcripts...')
        trans_dict = read_text(text_path=join(args.data_save_path, data_type,
                                              'text'),
                               vocab_save_path=mkdir_join(
                                   args.data_save_path, 'vocab'),
                               data_type=data_type,
                               phone_map_file_path=args.phone_map_file_path)

        # Make dataset file (.csv)
        print('=> Saving dataset files...')
        csv_save_path = mkdir_join(args.data_save_path, 'dataset', args.tool,
                                   data_type)

        df_columns = ['frame_num', 'input_path', 'transcript']
        df_phone61 = pd.DataFrame([], columns=df_columns)
        df_phone48 = pd.DataFrame([], columns=df_columns)
        df_phone39 = pd.DataFrame([], columns=df_columns)

        with open(
                join(args.data_save_path, 'feature', args.tool, data_type,
                     'frame_num.pickle'), 'rb') as f:
            frame_num_dict = pickle.load(f)

        for utt_idx, trans_list in tqdm(trans_dict.items()):
            feat_utt_save_path = join(args.data_save_path, 'feature',
                                      args.tool, data_type, utt_idx + '.npy')
            frame_num = frame_num_dict[utt_idx]

            if not isfile(feat_utt_save_path):
                raise ValueError('There is no file: %s' % feat_utt_save_path)

            phone61_indices, phone48_indices, phone39_indices = trans_list

            df_phone61 = add_element(
                df_phone61, [frame_num, feat_utt_save_path, phone61_indices])
            df_phone48 = add_element(
                df_phone48, [frame_num, feat_utt_save_path, phone48_indices])
            df_phone39 = add_element(
                df_phone39, [frame_num, feat_utt_save_path, phone39_indices])

        df_phone61.to_csv(join(csv_save_path, 'phone61.csv'))
        df_phone48.to_csv(join(csv_save_path, 'phone48.csv'))
        df_phone39.to_csv(join(csv_save_path, 'phone39.csv'))
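
Example #1 leans on two repository helpers that are not shown, mkdir_join and add_element. A minimal sketch of plausible implementations, assuming mkdir_join both joins path components and creates the missing directories (an assumption, not repository code):

import os
from os.path import join, splitext, dirname
import pandas as pd

def mkdir_join(path, *dir_names):
    # Hypothetical helper: join components and create intermediate
    # directories; if the last component looks like a file name, only
    # its parent directories are created.
    full_path = join(path, *dir_names)
    target = dirname(full_path) if splitext(full_path)[1] else full_path
    if target:
        os.makedirs(target, exist_ok=True)
    return full_path

def add_element(df, elem_list):
    # Hypothetical helper: append one row (aligned with df's columns).
    row = pd.DataFrame([elem_list], columns=df.columns)
    return pd.concat([df, row], ignore_index=True)
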
def plot(model, dataset, eval_batch_size, save_path=None):
    """
    Args:
        model: the model to evaluate
        dataset: An instance of a `Dataset` class
        eval_batch_size (int): the batch size when evaluating the model
        save_path (string): path to save figures of CTC posteriors
    """
    # Set batch size in the evaluation
    if eval_batch_size is not None:
        dataset.batch_size = eval_batch_size

    # Clean directory
    if save_path is not None and isdir(save_path):
        shutil.rmtree(save_path)
        mkdir(save_path)

    idx2word = Idx2word(dataset.vocab_file_path)
    idx2char = Idx2char(
        dataset.vocab_file_path,
        capital_divide=dataset.label_type_sub == 'character_capital_divide')

    for batch, is_new_epoch in dataset:

        # Get CTC probs
        probs = model.posteriors(batch['xs'], batch['x_lens'], temperature=1)
        probs_sub = model.posteriors(batch['xs'],
                                     batch['x_lens'],
                                     is_sub_task=True,
                                     temperature=1)
        # NOTE: probs: '[B, T, num_classes]'
        # NOTE: probs_sub: '[B, T, num_classes_sub]'

        # Decode
        best_hyps = model.decode(batch['xs'], batch['x_lens'], beam_width=1)
        best_hyps_sub = model.decode(batch['xs'],
                                     batch['x_lens'],
                                     beam_width=1,
                                     is_sub_task=True)

        # Visualize
        for b in range(len(batch['xs'])):

            # Convert from list of index to string
            str_hyp = idx2word(best_hyps[b])
            str_hyp_sub = idx2char(best_hyps_sub[b])

            speaker = batch['input_names'][b].split('_')[0]
            plot_hierarchical_ctc_probs(probs[b, :batch['x_lens'][b], :],
                                        probs_sub[b, :batch['x_lens'][b], :],
                                        frame_num=batch['x_lens'][b],
                                        num_stack=dataset.num_stack,
                                        str_hyp=str_hyp,
                                        str_hyp_sub=str_hyp_sub,
                                        save_path=mkdir_join(
                                            save_path, speaker,
                                            batch['input_names'][b] + '.png'))

        if is_new_epoch:
            break
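
The plot functions in these examples assume index-to-token mappers named Idx2word and Idx2char. A rough sketch of the interface they appear to expose, with the capital_divide handling omitted (inferred from usage, not repository code):

class Idx2word(object):
    def __init__(self, vocab_file_path, return_list=False):
        # One token per line in the vocabulary file; line number == index.
        with open(vocab_file_path) as f:
            self.idx2token = {i: line.strip() for i, line in enumerate(f)}
        self.return_list = return_list

    def __call__(self, indices):
        tokens = [self.idx2token[i] for i in indices]
        return tokens if self.return_list else '_'.join(tokens)
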
Example #3
def plot(model, dataset, beam_width,
         eval_batch_size=None, save_path=None):
    """Visualize attention weights of attetnion-based model.
    Args:
        model: model to evaluate
        dataset: An instance of a `Dataset` class
        beam_width (int): the size of beam
        eval_batch_size (int, optional): the batch size when evaluating the model
        save_path (string, optional): path to save attention weight plots
    """
    # Clean directory
    if save_path is not None and isdir(save_path):
        shutil.rmtree(save_path)
        mkdir(save_path)

    if 'char' in dataset.label_type:
        map_fn = Idx2char(dataset.vocab_file_path,
                          capital_divide=dataset.label_type == 'character_capital_divide',
                          return_list=True)
        max_decode_len = MAX_DECODE_LEN_CHAR
    else:
        map_fn = Idx2word(dataset.vocab_file_path, return_list=True)
        max_decode_len = MAX_DECODE_LEN_WORD

    for batch, is_new_epoch in dataset:

        # Decode
        best_hyps, aw, perm_idx = model.attention_weights(
            batch['xs'], batch['x_lens'],
            beam_width=beam_width,
            max_decode_len=max_decode_len)

        ys = batch['ys'][perm_idx]
        y_lens = batch['y_lens'][perm_idx]

        for b in range(len(batch['xs'])):
            ##############################
            # Reference
            ##############################
            if dataset.is_test:
                str_ref = ys[b][0]
                # NOTE: transcript is separated by space('_')
            else:
                # Convert from list of index to string
                str_ref = map_fn(ys[b][:y_lens[b]])

            token_list = map_fn(best_hyps[b])

            speaker = '_'.join(batch['input_names'][b].split('_')[:2])
            plot_attention_weights(
                aw[b, :len(token_list), :batch['x_lens'][b]],
                label_list=token_list,
                spectrogram=batch['xs'][b, :, :dataset.input_freq],
                str_ref=str_ref,
                save_path=mkdir_join(save_path, speaker,
                                     batch['input_names'][b] + '.png'),
                figsize=(20, 8))

        if is_new_epoch:
            break
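
Several decoding examples reference module-level length caps defined elsewhere. Plausible definitions, with values chosen purely for illustration (an assumption, not from the source):

MAX_DECODE_LEN_WORD = 100   # cap on the decoded word-sequence length
MAX_DECODE_LEN_CHAR = 600   # cap on the decoded character-sequence length
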
def do_plot(model, params, epoch, eval_batch_size):
    """Plot the multi-task CTC posteriors.
    Args:
        model: the model to restore
        params (dict): A dictionary of parameters
        epoch (int): the epoch to restore
        eval_batch_size (int): the size of mini-batch in evaluation
    """
    # Load dataset
    test_data = Dataset(data_type='test',
                        label_type_main=params['label_type_main'],
                        label_type_sub=params['label_type_sub'],
                        batch_size=eval_batch_size,
                        splice=params['splice'],
                        num_stack=params['num_stack'],
                        num_skip=params['num_skip'],
                        sort_utt=False,
                        progressbar=True)

    # Define placeholders
    model.create_placeholders()

    # Add to the graph each operation (including model definition)
    _, logits_main, logits_sub = model.compute_loss(
        model.inputs_pl_list[0], model.labels_pl_list[0],
        model.labels_sub_pl_list[0], model.inputs_seq_len_pl_list[0],
        model.keep_prob_pl_list[0])
    posteriors_op_main, posteriors_op_sub = model.posteriors(
        logits_main, logits_sub)

    # Create a saver for writing training checkpoints
    saver = tf.train.Saver()

    with tf.Session() as sess:
        ckpt = tf.train.get_checkpoint_state(model.save_path)

        # If a checkpoint exists
        if ckpt:
            # Use last saved model
            model_path = ckpt.model_checkpoint_path
            if epoch != -1:
                model_path = model_path.split('/')[:-1]
                model_path = '/'.join(model_path) + '/model.ckpt-' + str(epoch)
            saver.restore(sess, model_path)
            print("Model restored: " + model_path)
        else:
            raise ValueError('There are no checkpoints.')

        plot(session=sess,
             posteriors_op_main=posteriors_op_main,
             posteriors_op_sub=posteriors_op_sub,
             model=model,
             dataset=test_data,
             label_type_main=params['label_type_main'],
             label_type_sub=params['label_type_sub'],
             num_stack=params['num_stack'],
             save_path=mkdir_join(model.save_path, 'ctc_output'),
             show=False)
def plot(model, dataset, eval_batch_size=None, save_path=None,
         space_index=None):
    """
    Args:
        model: the model to evaluate
        dataset: An instance of a `Dataset` class
        eval_batch_size (int, optional): the batch size when evaluating the model
        save_path (string): path to save figures of CTC posteriors
        space_index (int, optional):
    """
    # Set batch size in the evaluation
    if eval_batch_size is not None:
        dataset.batch_size = eval_batch_size

    # Clean directory
    if save_path is not None and isdir(save_path):
        shutil.rmtree(save_path)
        mkdir(save_path)

    vocab_file_path = '../metrics/vocab_files/' + \
        dataset.label_type + '_' + dataset.data_size + '.txt'
    if dataset.label_type == 'character':
        map_fn = Idx2char(vocab_file_path)
    elif dataset.label_type == 'character_capital_divide':
        map_fn = Idx2char(vocab_file_path, capital_divide=True)
    else:
        map_fn = Idx2word(vocab_file_path)

    for batch, is_new_epoch in dataset:

        # Get CTC probs
        probs = model.posteriors(batch['xs'], batch['x_lens'], temperature=1)
        # NOTE: probs: '[B, T, num_classes]'

        # Decode
        best_hyps, _ = model.decode(batch['xs'], batch['x_lens'], beam_width=1)

        # Visualize
        for b in range(len(batch['xs'])):

            # Convert from list of index to string
            str_pred = map_fn(best_hyps[b])

            speaker, book = batch['input_names'][b].split('-')[:2]
            plot_ctc_probs(
                probs[b, :batch['x_lens'][b], :],
                frame_num=batch['x_lens'][b],
                num_stack=dataset.num_stack,
                space_index=space_index,
                str_pred=str_pred,
                save_path=mkdir_join(save_path, speaker, book, batch['input_names'][b] + '.png'))

        if is_new_epoch:
            break
Example #7
def plot(model, dataset, eval_batch_size, beam_width, beam_width_sub,
         length_penalty, save_path=None):
    """Visualize attention weights of Attetnion-based model.
    Args:
        model: model to evaluate
        dataset: An instance of a `Dataset` class
        eval_batch_size (int): the batch size when evaluating the model
        beam_width (int): the size of beam in the main task
        beam_width_sub (int): the size of beam in the sub task
        length_penalty (float): length penalty in beam search decoding
        save_path (string, optional): path to save attention weight plots
    """
    # Clean directory
    if save_path is not None and isdir(save_path):
        shutil.rmtree(save_path)
        mkdir(save_path)

    map_fn_main = Idx2word(dataset.vocab_file_path, return_list=True)
    map_fn_sub = Idx2char(dataset.vocab_file_path_sub, return_list=True)

    for batch, is_new_epoch in dataset:

        # Decode
        best_hyps, aw, perm_idx = model.decode(
            batch['xs'], batch['x_lens'],
            beam_width=beam_width,
            max_decode_len=MAX_DECODE_LEN_WORD)
        best_hyps_sub, aw_sub, _ = model.decode(
            batch['xs'], batch['x_lens'],
            beam_width=beam_width_sub,
            max_decode_len=MAX_DECODE_LEN_CHAR,
            task_index=1)

        for b in range(len(batch['xs'])):

            word_list = map_fn_main(best_hyps[b])
            char_list = map_fn_sub(best_hyps_sub[b])

            speaker = batch['input_names'][b].split('_')[0]

            plot_hierarchical_attention_weights(
                aw[b][:len(word_list), :batch['x_lens'][b]],
                aw_sub[b][:len(char_list), :batch['x_lens'][b]],
                label_list=word_list,
                label_list_sub=char_list,
                spectrogram=batch['xs'][b, :, :dataset.input_freq],
                save_path=mkdir_join(save_path, speaker,
                                     batch['input_names'][b] + '.png'),
                figsize=(40, 8)
            )

        if is_new_epoch:
            break
def main():

    args = parser.parse_args()

    # Load a config file (.yml)
    params = load_config(join(args.model_path, 'config.yml'), is_eval=True)

    # Load dataset
    test_data = Dataset(
        data_save_path=args.data_save_path,
        backend=params['backend'],
        input_freq=params['input_freq'],
        use_delta=params['use_delta'],
        use_double_delta=params['use_double_delta'],
        data_type='eval1',
        # data_type='eval2',
        # data_type='eval3',
        data_size=params['data_size'],
        label_type=params['label_type'],
        label_type_sub=params['label_type_sub'],
        batch_size=args.eval_batch_size,
        splice=params['splice'],
        num_stack=params['num_stack'],
        num_skip=params['num_skip'],
        sort_utt=False,
        reverse=False,
        tool=params['tool'])

    params['num_classes'] = test_data.num_classes
    params['num_classes_sub'] = test_data.num_classes_sub

    # Load model
    model = load(model_type=params['model_type'],
                 params=params,
                 backend=params['backend'])

    # Restore the saved parameters
    model.load_checkpoint(save_path=args.model_path, epoch=args.epoch)

    # GPU setting
    model.set_cuda(deterministic=False, benchmark=True)

    a2c_oracle = False

    # Visualize
    plot(model=model,
         dataset=test_data,
         eval_batch_size=args.eval_batch_size,
         beam_width=args.beam_width,
         beam_width_sub=args.beam_width_sub,
         length_penalty=args.length_penalty,
         a2c_oracle=a2c_oracle,
         save_path=mkdir_join(args.model_path, 'att_weights'))
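
The main() examples read a module-level argparse parser defined elsewhere. A sketch of what it might declare, with argument names inferred from usage and defaults that are purely hypothetical:

import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--model_path', type=str)           # trained model directory
parser.add_argument('--data_save_path', type=str)       # preprocessed data root
parser.add_argument('--epoch', type=int, default=-1)    # -1 == use the last checkpoint
parser.add_argument('--eval_batch_size', type=int, default=1)
parser.add_argument('--beam_width', type=int, default=1)
parser.add_argument('--beam_width_sub', type=int, default=1)
parser.add_argument('--length_penalty', type=float, default=0.0)
parser.add_argument('--max_decode_len', type=int, default=100)
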
def main():

    args = parser.parse_args()

    # Load a config file (.yml)
    params = load_config(join(args.model_path, 'config.yml'), is_eval=True)

    # Load dataset
    vocab_file_path = '../metrics/vocab_files/' + \
        params['label_type'] + '_' + params['data_size'] + '.txt'
    test_data = Dataset(
        backend=params['backend'],
        input_channel=params['input_channel'],
        use_delta=params['use_delta'],
        use_double_delta=params['use_double_delta'],
        data_type='test_clean',
        # data_type='test_other',
        data_size=params['data_size'],
        label_type=params['label_type'],
        vocab_file_path=vocab_file_path,
        batch_size=args.eval_batch_size,
        splice=params['splice'],
        num_stack=params['num_stack'],
        num_skip=params['num_skip'],
        sort_utt=True,
        reverse=True,
        save_format=params['save_format'])
    params['num_classes'] = test_data.num_classes

    # Load model
    model = load(model_type=params['model_type'],
                 params=params,
                 backend=params['backend'])

    # Restore the saved parameters
    model.load_checkpoint(save_path=args.model_path, epoch=args.epoch)

    # GPU setting
    model.set_cuda(deterministic=False, benchmark=True)

    # Visualize
    plot_attention(model=model,
                   dataset=test_data,
                   max_decode_len=args.max_decode_len,
                   eval_batch_size=args.eval_batch_size,
                   save_path=mkdir_join(args.model_path, 'att_weights'))
Example #10
def main():

    args = parser.parse_args()

    # Load a config file (.yml)
    params = load_config(join(args.model_path, 'config.yml'), is_eval=True)

    # Load dataset
    test_data = Dataset(
        data_save_path=args.data_save_path,
        backend=params['backend'],
        input_freq=params['input_freq'],
        use_delta=params['use_delta'],
        use_double_delta=params['use_double_delta'],
        data_type='eval2000_swbd',
        # data_type='eval2000_ch',
        data_size=params['data_size'],
        label_type=params['label_type'],
        batch_size=args.eval_batch_size,
        splice=params['splice'],
        num_stack=params['num_stack'],
        num_skip=params['num_skip'],
        sort_utt=True,
        reverse=True,
        tool=params['tool'])

    params['num_classes'] = test_data.num_classes

    # Load model
    model = load(model_type=params['model_type'],
                 params=params,
                 backend=params['backend'])

    # Restore the saved parameters
    model.load_checkpoint(save_path=args.model_path, epoch=args.epoch)

    # GPU setting
    model.set_cuda(deterministic=False, benchmark=True)

    # Visualize
    plot(model=model,
         dataset=test_data,
         eval_batch_size=args.eval_batch_size,
         save_path=mkdir_join(args.model_path, 'ctc_probs'))
def main():

    args = parser.parse_args()

    # Load a config file (.yml)
    params = load_config(join(args.model_path, 'config.yml'), is_eval=True)

    # Load dataset
    vocab_file_path = '../metrics/vocab_files/' + \
        params['label_type'] + '_' + params['data_size'] + '.txt'
    test_data = Dataset(
        backend=params['backend'],
        input_channel=params['input_channel'],
        use_delta=params['use_delta'],
        use_double_delta=params['use_double_delta'],
        data_type='test_clean',
        # data_type='test_other',
        data_size=params['data_size'],
        label_type=params['label_type'], vocab_file_path=vocab_file_path,
        batch_size=args.eval_batch_size, splice=params['splice'],
        num_stack=params['num_stack'], num_skip=params['num_skip'],
        sort_utt=True, reverse=False, save_format=params['save_format'])
    params['num_classes'] = test_data.num_classes

    # Load model
    model = load(model_type=params['model_type'],
                 params=params,
                 backend=params['backend'])

    # Restore the saved parameters
    model.load_checkpoint(save_path=args.model_path, epoch=args.epoch)

    # GPU setting
    model.set_cuda(deterministic=False, benchmark=True)

    space_index = 27 if params['label_type'] == 'character' else None
    # NOTE: index 0 is reserved for blank in warpctc_pytorch

    # Visualize
    plot(model=model,
         dataset=test_data,
         eval_batch_size=args.eval_batch_size,
         save_path=mkdir_join(args.model_path, 'ctc_probs'),
         space_index=space_index)
Example #12
def main(config_path, model_save_path, gpu_indices):

    # Load a config file (.yml)
    with open(config_path, "r") as f:
        config = yaml.load(f, Loader=yaml.SafeLoader)
        params = config['param']

    # Excluding the blank class
    params['num_classes'] = 28

    # Model setting
    model = StudentCTC(
        encoder_type=params['encoder_type'],
        input_size=params['input_size'] *
        params['num_stack'] * params['splice'],
        splice=params['splice'],
        num_stack=params['num_stack'],
        num_classes=params['num_classes'],
        parameter_init=params['weight_init'],
        clip_grad_norm=params['clip_grad_norm'],
        weight_decay=params['weight_decay'])

    # Set process name
    setproctitle(
        'tf_libri_' + model.name + '_' + params['train_data_size'] + '_' + params['label_type'])

    model.name += '_' + params['optimizer']
    model.name += '_lr' + str(params['learning_rate'])
    if params['dropout'] != 0:
        model.name += '_drop' + str(params['dropout'])
    if params['num_stack'] != 1:
        model.name += '_stack' + str(params['num_stack'])
    if params['weight_decay'] != 0:
        model.name += '_wd' + str(params['weight_decay'])
    if len(gpu_indices) >= 2:
        model.name += '_gpu' + str(len(gpu_indices))

    # Set save path
    model.save_path = mkdir_join(
        model_save_path, 'student_ctc', params['label_type'],
        params['train_data_size'], model.name)

    # Reset model directory
    model_index = 0
    new_model_path = model.save_path
    while True:
        if isfile(join(new_model_path, 'complete.txt')):
            # Training of the first model has finished
            model_index += 1
            new_model_path = model.save_path + '_' + str(model_index)
        elif isfile(join(new_model_path, 'config.yml')):
            # Training of the first model has not finished yet
            model_index += 1
            new_model_path = model.save_path + '_' + str(model_index)
        else:
            break
    model.save_path = mkdir(new_model_path)

    # Save config file
    shutil.copyfile(config_path, join(model.save_path, 'config.yml'))

    sys.stdout = open(join(model.save_path, 'train.log'), 'w')
    # TODO(hirofumi): change to logger
    do_train(model=model, params=params, gpu_indices=gpu_indices)
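
The "Reset model directory" loop recurring in these training mains can be read as a small helper: keep appending an index until a directory is found that contains neither a finished run (complete.txt) nor an in-progress one (config.yml). A sketch of the same logic, not repository code:

from os.path import isfile, join

def next_model_dir(base_path):
    index, path = 0, base_path
    while isfile(join(path, 'complete.txt')) or isfile(join(path, 'config.yml')):
        index += 1
        path = base_path + '_' + str(index)
    return path
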
def main(config_path, model_save_path):

    # Load a config file (.yml)
    with open(config_path, "r") as f:
        config = yaml.load(f, Loader=yaml.SafeLoader)
        params = config['param']

    # Excluding the <SOS> and <EOS> classes
    if params['label_type'] == 'phone61':
        params['num_classes'] = 61
    elif params['label_type'] == 'phone48':
        params['num_classes'] = 48
    elif params['label_type'] == 'phone39':
        params['num_classes'] = 39
    elif params['label_type'] == 'character':
        params['num_classes'] = 28
    elif params['label_type'] == 'character_capital_divide':
        params['num_classes'] = 72
    else:
        raise TypeError

    # Model setting
    model = JointCTCAttention(
        input_size=params['input_size'] * params['num_stack'],
        encoder_type=params['encoder_type'],
        encoder_num_units=params['encoder_num_units'],
        encoder_num_layers=params['encoder_num_layers'],
        encoder_num_proj=params['encoder_num_proj'],
        attention_type=params['attention_type'],
        attention_dim=params['attention_dim'],
        decoder_type=params['decoder_type'],
        decoder_num_units=params['decoder_num_units'],
        decoder_num_layers=params['decoder_num_layers'],
        embedding_dim=params['embedding_dim'],
        lambda_weight=params['lambda_weight'],
        num_classes=params['num_classes'],
        sos_index=params['num_classes'],
        eos_index=params['num_classes'] + 1,
        max_decode_length=params['max_decode_length'],
        lstm_impl='LSTMBlockCell',
        use_peephole=params['use_peephole'],
        parameter_init=params['weight_init'],
        clip_grad_norm=params['clip_grad_norm'],
        clip_activation_encoder=params['clip_activation_encoder'],
        clip_activation_decoder=params['clip_activation_decoder'],
        weight_decay=params['weight_decay'],
        time_major=True,
        sharpening_factor=params['sharpening_factor'],
        logits_temperature=params['logits_temperature'])

    # Set process name
    setproctitle('tf_timit_' + model.name + '_' +
                 params['label_type'] + '_' + params['attention_type'])

    model.name += '_en' + str(params['encoder_num_units'])
    model.name += '_' + str(params['encoder_num_layers'])
    model.name += '_att' + str(params['attention_dim'])
    model.name += '_de' + str(params['decoder_num_units'])
    model.name += '_' + str(params['decoder_num_layers'])
    model.name += '_' + params['optimizer']
    model.name += '_lr' + str(params['learning_rate'])
    model.name += '_' + params['attention_type']
    if params['dropout_encoder'] != 0:
        model.name += '_dropen' + str(params['dropout_encoder'])
    if params['dropout_decoder'] != 0:
        model.name += '_dropde' + str(params['dropout_decoder'])
    if params['dropout_embedding'] != 0:
        model.name += '_dropem' + str(params['dropout_embedding'])
    if params['num_stack'] != 1:
        model.name += '_stack' + str(params['num_stack'])
    if params['weight_decay'] != 0:
        model.name += 'wd' + str(params['weight_decay'])
    if params['sharpening_factor'] != 1:
        model.name += '_sharp' + str(params['sharpening_factor'])
    if params['logits_temperature'] != 1:
        model.name += '_temp' + str(params['logits_temperature'])
    model.name += '_lambda' + str(params['lambda_weight'])

    # Set save path
    model.save_path = mkdir_join(
        model_save_path, 'joint_ctc_attention', params['label_type'], model.name)

    # Reset model directory
    model_index = 0
    new_model_path = model.save_path
    while True:
        if isfile(join(new_model_path, 'complete.txt')):
            # Training of the first model has finished
            model_index += 1
            new_model_path = model.save_path + '_' + str(model_index)
        elif isfile(join(new_model_path, 'config.yml')):
            # Training of the first model has not finished yet
            model_index += 1
            new_model_path = model.save_path + '_' + str(model_index)
        else:
            break
    model.save_path = mkdir(new_model_path)

    # Save config file
    shutil.copyfile(config_path, join(model.save_path, 'config.yml'))

    sys.stdout = open(join(model.save_path, 'train.log'), 'w')
    # TODO(hirofumi): change to logger
    do_train(model=model, params=params)
def plot(model,
         dataset,
         beam_width,
         beam_width_sub,
         eval_batch_size=None,
         a2c_oracle=False,
         save_path=None):
    """Visualize attention weights of Attetnion-based model.
    Args:
        model: model to evaluate
        dataset: An instance of a `Dataset` class
        beam_width: (int): the size of beam i nteh main task
        beam_width_sub: (int): the size of beam in the sub task
        eval_batch_size (int, optional): the batch size when evaluating the model
        a2c_oracle (bool, optional):
        save_path (string, optional): path to save attention weights plotting
    """
    # Clean directory
    if save_path is not None and isdir(save_path):
        shutil.rmtree(save_path)
        mkdir(save_path)

    idx2word = Idx2word(dataset.vocab_file_path, return_list=True)
    idx2char = Idx2char(dataset.vocab_file_path_sub, return_list=True)

    for batch, is_new_epoch in dataset:
        batch_size = len(batch['xs'])

        if a2c_oracle:
            if dataset.is_test:
                max_label_num = 0
                for b in range(batch_size):
                    if max_label_num < len(list(batch['ys_sub'][b][0])):
                        max_label_num = len(list(batch['ys_sub'][b][0]))

                ys_sub = np.zeros((batch_size, max_label_num), dtype=np.int32)
                ys_sub -= 1  # pad with -1
                y_lens_sub = np.zeros((batch_size, ), dtype=np.int32)
                for b in range(batch_size):
                    # NOTE: char2idx is assumed to be defined at module
                    # scope (e.g., Char2idx(dataset.vocab_file_path_sub))
                    indices = char2idx(batch['ys_sub'][b][0])
                    ys_sub[b, :len(indices)] = indices
                    y_lens_sub[b] = len(indices)
                    # NOTE: transcript is separated by space('_')
            else:
                ys_sub = batch['ys_sub']
                y_lens_sub = batch['y_lens_sub']
        else:
            ys_sub = None
            y_lens_sub = None

        best_hyps, best_hyps_sub, aw, aw_sub, aw_dec = model.attention_weights(
            batch['xs'],
            batch['x_lens'],
            beam_width=beam_width,
            beam_width_sub=beam_width_sub,
            max_decode_len=MAX_DECODE_LEN_WORD,
            max_decode_len_sub=MAX_DECODE_LEN_CHAR,
            teacher_forcing=a2c_oracle,
            ys_sub=ys_sub,
            y_lens_sub=y_lens_sub)

        for b in range(len(batch['xs'])):
            word_list = idx2word(best_hyps[b])
            if 'word' in dataset.label_type_sub:
                char_list = idx2word(best_hyps_sub[b])
            else:
                char_list = idx2char(best_hyps_sub[b])

            # word to acoustic & character to acoustic
            plot_hierarchical_attention_weights(
                aw[b][:len(word_list), :batch['x_lens'][b]],
                aw_sub[b][:len(char_list), :batch['x_lens'][b]],
                label_list=word_list,
                label_list_sub=char_list,
                spectrogram=batch['xs'][b, :, :dataset.input_freq],
                save_path=mkdir_join(save_path,
                                     batch['input_names'][b] + '.png'),
                figsize=(40, 8))

            # word to character
            plot_word2char_attention_weights(
                aw_dec[b][:len(word_list), :len(char_list)],
                label_list=word_list,
                label_list_sub=char_list,
                save_path=mkdir_join(
                    save_path, batch['input_names'][b] + '_word2char.png'),
                figsize=(40, 8))

            # with open(join(save_path, speaker, batch['input_names'][b] + '.txt'), 'w') as f:
            #     f.write(batch['ys'][b][0])

        if is_new_epoch:
            break
def save(session, posteriors_op, model, dataset, data_type,
         save_prob=False, save_soft_targets=False,
         num_stack=1, save_path=None):
    """Save CTC posteriors per utterance and as shuffled frame blocks of soft targets."""

    # Initialize
    pbar = tqdm(total=len(dataset))
    total_num_frames = 0
    pool_input_frames = None
    pool_prob_frames = None
    num_frames_per_block = 1024 * 100
    frame_counter = 0
    block_counter = 0
    pool_counter = 0
    accumulated_total_num_frames = 0

    ########################################
    # Count total frame number
    ########################################
    # for data, is_new_epoch in dataset:
    #
    #     # Create feed dictionary for next mini batch
    #     inputs, _, inputs_seq_len, input_names = data
    #
    #     batch_size = inputs[0].shape[0]
    #     for i_batch in range(batch_size):
    #         total_num_frames += inputs_seq_len[0][i_batch]
    #
    #         pbar.update(1)
    #
    #     if is_new_epoch:
    #         print(total_num_frames)
    #         break

    ########################################
    # Save probabilities per utterance
    ########################################
    pbar = tqdm(total=len(dataset))
    for data, is_new_epoch in dataset:

        # Create feed dictionary for next mini batch
        inputs, _, inputs_seq_len, input_names = data
        feed_dict = {
            model.inputs_pl_list[0]: inputs[0],
            model.inputs_seq_len_pl_list[0]: inputs_seq_len[0],
            model.keep_prob_pl_list[0]: 1.0
        }

        batch_size, max_time = inputs[0].shape[:2]

        probs = session.run(posteriors_op, feed_dict=feed_dict)
        probs = probs.reshape(batch_size, max_time, model.num_classes)

        if pool_input_frames is None:
            # Initialize
            total_num_frames = TOTAL_NUM_FRAMES_DICT[data_type]

            pool_num_frames = total_num_frames // NUM_POOLS + 1
            pool_capacity = pool_num_frames

            pool_input_frames = np.zeros(
                (pool_num_frames, 120 * 2 * 5))
            # NOTE: input_size == 120 * 2 (num_stack == 2), splice == 5
            pool_prob_frames = np.zeros(
                (pool_num_frames, model.num_classes))

        for i_batch in range(batch_size):
            speaker = input_names[0][i_batch].split('-')[0]

            # Mask
            inputs_seq_len_i = inputs_seq_len[0][i_batch]
            inputs_i = inputs[0][i_batch][:inputs_seq_len_i]
            probs_i = probs[i_batch][:inputs_seq_len_i]

            # Save probabilities as npy file per utterance
            if save_prob:
                prob_save_path = mkdir_join(
                    save_path, 'probs_utt', speaker, input_names[0][i_batch] + '.npy')
                np.save(prob_save_path, probs_i)
                # NOTE: `[T, num_classes]`

            if dataset.splice == 1:
                # NOTE: teacher is expected to be BLSTM
                # Splicing
                inputs_i = do_splice(inputs_i.reshape(1, inputs_seq_len_i, -1),
                                     splice=5,
                                     batch_size=1,
                                     num_stack=dataset.num_stack)
                inputs_i = inputs_i.reshape(inputs_seq_len_i, -1)

            else:
                # NOTE: teacher is expected to be VGG (use features as they are)
                pass

            # Register
            if pool_capacity > inputs_seq_len_i:
                pool_input_frames[frame_counter:frame_counter +
                                  inputs_seq_len_i] = inputs_i
                pool_prob_frames[frame_counter: frame_counter +
                                 inputs_seq_len_i] = probs_i
                frame_counter += inputs_seq_len_i
                pool_capacity -= inputs_seq_len_i
            else:
                # Fill the pool to capacity
                pool_input_frames[frame_counter:frame_counter +
                                  pool_capacity] = inputs_i[:pool_capacity]
                pool_prob_frames[frame_counter:frame_counter +
                                 pool_capacity] = probs_i[:pool_capacity]

                ##################################################
                # Shuffle frames, divide into blocks, and save
                ##################################################
                num_blocks = pool_num_frames // num_frames_per_block
                data_indices = list(range(pool_num_frames))
                random.shuffle(data_indices)

                for i_block in range(num_blocks):
                    block_indices = data_indices[:num_frames_per_block]
                    data_indices = data_indices[num_frames_per_block:]

                    # Pick up block
                    block_inputs_frames = pool_input_frames[block_indices]
                    # NOTE: `[1024 * 100, input_size]`
                    block_probs_frames = pool_prob_frames[block_indices]
                    # NOTE:`[1024 * 100, num_classes]`

                    # Save block
                    if save_soft_targets:
                        print(' ==> Saving: block%d' % block_counter)
                        input_save_path = mkdir_join(
                            save_path, 'inputs', 'block' + str(block_counter) + '.npy')
                        label_save_path = mkdir_join(
                            save_path, 'labels', 'block' + str(block_counter) + '.npy')
                        np.save(input_save_path, block_inputs_frames)
                        np.save(label_save_path, block_probs_frames)

                    block_counter += 1
                    accumulated_total_num_frames += len(block_indices)

                pool_carry_over_num_frames = pool_num_frames - num_frames_per_block * num_blocks
                utt_carry_over_num_frames = inputs_seq_len_i - pool_capacity
                carry_over_num_frames = pool_carry_over_num_frames + utt_carry_over_num_frames

                pool_carry_over_input_frames = pool_input_frames[data_indices]
                pool_carry_over_prob_frames = pool_prob_frames[data_indices]

                # Initialize
                if pool_counter != NUM_POOLS - 1:
                    pool_num_frames = total_num_frames // NUM_POOLS + 1 + carry_over_num_frames
                else:
                    # last pool
                    pool_num_frames = total_num_frames - accumulated_total_num_frames

                pool_input_frames = np.zeros(
                    (pool_num_frames, 120 * 2 * 5))
                # NOTE: input_size == 120 * 2 (num_stack == 2), splice == 5
                pool_prob_frames = np.zeros(
                    (pool_num_frames, model.num_classes))
                frame_counter = 0
                pool_counter += 1

                # Register carry over frames
                pool_input_frames[:pool_carry_over_num_frames] = pool_carry_over_input_frames
                pool_prob_frames[:pool_carry_over_num_frames] = pool_carry_over_prob_frames
                frame_counter += pool_carry_over_num_frames
                pool_input_frames[frame_counter:frame_counter +
                                  utt_carry_over_num_frames] = inputs_i[-utt_carry_over_num_frames:]
                pool_prob_frames[frame_counter:frame_counter +
                                 utt_carry_over_num_frames] = probs_i[-utt_carry_over_num_frames:]
                frame_counter += utt_carry_over_num_frames

                pool_capacity = pool_num_frames - carry_over_num_frames
                print('=== next pool ===')

        pbar.update(batch_size)

        if is_new_epoch:
            ##################################################
            # Save last pool
            ##################################################
            # Pick up block
            block_inputs_frames = pool_input_frames[:frame_counter]
            # NOTE: `[1024 * 100, input_size]`
            block_probs_frames = pool_prob_frames[:frame_counter]
            # NOTE:`[1024 * 100, num_classes]`

            # Save the last block
            if save_soft_targets:
                print(' ==> Saving: block%d' % block_counter)
                np.save(mkdir_join(save_path, 'inputs', 'block' +
                                   str(block_counter) + '.npy'), block_inputs_frames)
                np.save(mkdir_join(save_path, 'labels', 'block' +
                                   str(block_counter) + '.npy'), block_probs_frames)

            break
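
The pool-and-block bookkeeping above is intricate; in isolation, the core idea is to accumulate frames into a large pool, shuffle it once, and emit fixed-size blocks. A stripped-down sketch (the block size matches num_frames_per_block above; everything else is an assumption):

import numpy as np

def emit_blocks(pool_inputs, pool_probs, frames_per_block=1024 * 100):
    # Shuffle frame indices once, then yield whole blocks; leftover frames
    # (fewer than frames_per_block) are carried over by the caller.
    indices = np.random.permutation(len(pool_inputs))
    for i in range(len(indices) // frames_per_block):
        block = indices[i * frames_per_block:(i + 1) * frames_per_block]
        yield pool_inputs[block], pool_probs[block]
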
Example #16
def main(config_path, model_save_path):

    # Load a config file (.yml)
    with open(config_path, "r") as f:
        config = yaml.load(f, Loader=yaml.SafeLoader)
        params = config['param']

    # Excluding the <SOS> and <EOS> classes
    if params['label_type'] == 'phone61':
        params['num_classes'] = 61
    elif params['label_type'] == 'phone48':
        params['num_classes'] = 48
    elif params['label_type'] == 'phone39':
        params['num_classes'] = 39
    elif params['label_type'] == 'character':
        params['num_classes'] = 28
    elif params['label_type'] == 'character_capital_divide':
        params['num_classes'] = 72
    else:
        raise TypeError

    # Model setting
    model = AttentionSeq2Seq(
        input_size=params['input_size'] * params['num_stack'],
        encoder_type=params['encoder_type'],
        encoder_num_units=params['encoder_num_units'],
        encoder_num_layers=params['encoder_num_layers'],
        encoder_num_proj=params['encoder_num_proj'],
        attention_type=params['attention_type'],
        attention_dim=params['attention_dim'],
        decoder_type=params['decoder_type'],
        decoder_num_units=params['decoder_num_units'],
        decoder_num_layers=params['decoder_num_layers'],
        embedding_dim=params['embedding_dim'],
        num_classes=params['num_classes'],
        sos_index=params['num_classes'],
        eos_index=params['num_classes'] + 1,
        max_decode_length=params['max_decode_length'],
        lstm_impl='LSTMBlockCell',
        use_peephole=params['use_peephole'],
        parameter_init=params['weight_init'],
        clip_grad_norm=params['clip_grad_norm'],
        clip_activation_encoder=params['clip_activation_encoder'],
        clip_activation_decoder=params['clip_activation_decoder'],
        weight_decay=params['weight_decay'],
        time_major=True,
        sharpening_factor=params['sharpening_factor'],
        logits_temperature=params['logits_temperature'],
        sigmoid_smoothing=params['sigmoid_smoothing'])

    # Set process name
    setproctitle('tf_timit_' + model.name + '_' + params['label_type'] + '_' +
                 params['attention_type'])

    model.name = 'en' + str(params['encoder_num_units'])
    model.name += '_' + str(params['encoder_num_layers'])
    model.name += '_att' + str(params['attention_dim'])
    model.name += '_de' + str(params['decoder_num_units'])
    model.name += '_' + str(params['decoder_num_layers'])
    model.name += '_' + params['optimizer']
    model.name += '_lr' + str(params['learning_rate'])
    model.name += '_' + params['attention_type']
    if params['dropout_encoder'] != 0:
        model.name += '_dropen' + str(params['dropout_encoder'])
    if params['dropout_decoder'] != 0:
        model.name += '_dropde' + str(params['dropout_decoder'])
    if params['dropout_embedding'] != 0:
        model.name += '_dropem' + str(params['dropout_embedding'])
    if params['num_stack'] != 1:
        model.name += '_stack' + str(params['num_stack'])
    if params['weight_decay'] != 0:
        model.name += 'wd' + str(params['weight_decay'])
    if params['sharpening_factor'] != 1:
        model.name += '_sharp' + str(params['sharpening_factor'])
    if params['logits_temperature'] != 1:
        model.name += '_temp' + str(params['logits_temperature'])
    if bool(params['sigmoid_smoothing']):
        model.name += '_smoothing'

    # Set save path
    model.save_path = mkdir_join(model_save_path, 'attention',
                                 params['label_type'], model.name)

    # Reset model directory
    model_index = 0
    new_model_path = model.save_path
    while True:
        if isfile(join(new_model_path, 'complete.txt')):
            # Training of the first model has finished
            model_index += 1
            new_model_path = model.save_path + '_' + str(model_index)
        elif isfile(join(new_model_path, 'config.yml')):
            # Training of the first model has not finished yet
            model_index += 1
            new_model_path = model.save_path + '_' + str(model_index)
        else:
            break
    model.save_path = mkdir(new_model_path)

    # Save config file
    shutil.copyfile(config_path, join(model.save_path, 'config.yml'))

    sys.stdout = open(join(model.save_path, 'train.log'), 'w')
    # TODO(hirofumi): change to logger
    do_train(model=model, params=params)
Example #17
def read_text(text_path, vocab_save_path, data_type, lexicon_path=None):
    """Read transcripts (.sdb) & save files (.npy).
    Args:
        text_path (string): path to a text file of kaldi
        vocab_save_path (string): path to save vocabulary files
        data_type (string): e.g., train, dev, or an evaluation set such as test_eval92
        lexicon_path (string, optional):
    Returns:
        trans_dict (dict): dictionary of transcripts
            key (string) => utterance index
            value (dict) => indices for each label type (word, char, char_capital)
    """
    # Make vocabulary files
    word_vocab_path = mkdir_join(vocab_save_path, 'word.txt')
    char_vocab_path = mkdir_join(vocab_save_path, 'character.txt')
    char_capital_vocab_path = mkdir_join(
        vocab_save_path, 'character_capital_divide.txt')

    trans_dict = {}
    char_set = set([])
    char_capital_set = set([])
    word_set = set([])
    word_dict = {}
    with open(text_path, 'r') as f:
        for line in f:
            line = line.strip()
            utt_idx = line.split(' ')[0]
            trans = ' '.join(line.split(' ')[1:]).lower()

            # text normalization
            trans = trans.replace('<noise>', NOISE)
            trans = trans.replace('.period', 'period')
            trans = trans.replace('\'single-quote', 'single-quote')
            trans = trans.replace('-hyphen', 'hyphen')
            trans = trans.replace('`', '\'')  # 47rc020w
            trans = re.sub(r'[(){}*,?!":;&/~]+', '', trans)
            trans = re.sub(r'<.*>', '', trans)
            trans = re.sub(r'[\s]+', ' ', trans)
            trans = trans.replace(' ', SPACE)

            if len(trans) == 0:
                continue
                # NOTE: utterances such as ~~
                # 46uc030b
                # 47hc0418

            trans_capital = ''
            for w in trans.split(SPACE):
                # Count word frequency
                if w not in word_dict.keys():
                    word_dict[w] = 1
                else:
                    word_dict[w] += 1

                word_set.add(w)
                char_set |= set(list(w))

                # Capital-divided
                if len(w) == 1:
                    char_capital_set.add(w)
                    trans_capital += w
                else:
                    # Replace the first character with the capital
                    # letter
                    w = w[0].upper() + w[1:]

                    # Check double-letters
                    for i in range(0, len(w) - 1, 1):
                        if w[i:i + 2] in DOUBLE_LETTERS:
                            char_capital_set.add(w[i:i + 2])
                        else:
                            char_capital_set.add(w[i])
                    trans_capital += w

            trans_dict[utt_idx] = [trans, trans_capital]

    # Save vocabulary files
    if 'train' in data_type:
        # word-level (threshold == 3)
        with codecs.open(word_vocab_path, 'w', 'utf-8') as f:
            word_list = sorted([w for w, freq in list(word_dict.items())
                                if freq >= 3]) + [OOV]
            for w in word_list:
                f.write('%s\n' % w)

        # character-level
        with open(char_vocab_path, 'w') as f:
            char_list = sorted(list(char_set)) + [SPACE]
            for c in char_list:
                f.write('%s\n' % c)

        # character-level (capital-divided)
        with open(char_capital_vocab_path, 'w') as f:
            char_capital_list = sorted(list(char_capital_set))
            for c in char_capital_list:
                f.write('%s\n' % c)

    # Compute OOV rate
    if 'train' not in data_type:
        with codecs.open(mkdir_join(vocab_save_path, 'oov', data_type + '.txt'), 'w', 'utf-8') as f:
            # word-level (threshold == 3)
            oov_rate = compute_oov_rate(word_dict, word_vocab_path)
            f.write('Word (freq3):\n')
            f.write('  OOV rate: %f %%\n' % oov_rate)

    # Convert to index
    print('=====> Convert to index...')
    word2idx = Word2idx(word_vocab_path)
    char2idx = Char2idx(char_vocab_path)
    char2idx_capital = Char2idx(char_capital_vocab_path, capital_divide=True)

    for utt_idx, [trans, trans_capital] in tqdm(trans_dict.items()):
        if data_type == 'test_eval92':
            trans_dict[utt_idx] = {"word": trans,
                                   "char": trans,
                                   "char_capital": trans}
            # NOTE: save as it is
        else:
            word_indices = word2idx(trans)
            char_indices = char2idx(trans)
            char_capital_indices = char2idx_capital(trans)

            word_indices = ' '.join(
                list(map(str, word_indices.tolist())))
            char_indices = ' '.join(
                list(map(str, char_indices.tolist())))
            char_capital_indices = ' '.join(
                list(map(str, char_capital_indices.tolist())))

            trans_dict[utt_idx] = {"word": word_indices,
                                   "char": char_indices,
                                   "char_capital": char_capital_indices}

    return trans_dict
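
Both read_text examples call compute_oov_rate, which is not shown. A sketch of the helper with behavior inferred from usage (vocabulary coverage as a percentage of word tokens):

import codecs

def compute_oov_rate(word_dict, vocab_file_path):
    # word_dict maps word -> frequency; the OOV rate is the share of
    # tokens whose word is absent from the saved vocabulary file.
    with codecs.open(vocab_file_path, 'r', 'utf-8') as f:
        vocab = set(line.strip() for line in f)
    num_tokens = sum(word_dict.values())
    num_oov = sum(freq for w, freq in word_dict.items() if w not in vocab)
    return num_oov * 100.0 / num_tokens
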
Example #18
def read_text(text_path,
              vocab_save_path,
              data_type,
              kana2phone_path,
              lexicon_path=None):
    """Read transcripts (.sdb) & save files (.npy).
    Args:
        text_path (string): path to a text file of kaldi
        vocab_save_path (string): path to save vocabulary files
        data_type (string): train or dev or eval1 or eval2 or eval3
        kana2phone_path (string): path to a kana-to-phone mapping file
        lexicon_path (string, optional):
    Returns:
        trans_dict (dict): dictionary of transcripts
            key (string) => utterance index
            value (dict)
                    key => label type
                    value => indices
    """
    # Make kana set
    kana_set = set([])
    with codecs.open(kana2phone_path, 'r', 'utf-8') as f:
        for line in f:
            line = line.strip()
            kana, phone_seq = line.split('+')
            kana_set.add(kana)

    # Make vocabulary files
    word_vocab_path = mkdir_join(vocab_save_path, 'word.txt')
    char_vocab_path = mkdir_join(vocab_save_path, 'character.txt')
    char_wb_vocab_path = mkdir_join(vocab_save_path, 'character_wb.txt')
    char_wb_left_vocab_path = mkdir_join(vocab_save_path,
                                         'character_wb_left.txt')
    char_wb_right_vocab_path = mkdir_join(vocab_save_path,
                                          'character_wb_right.txt')
    char_wb_both_vocab_path = mkdir_join(vocab_save_path,
                                         'character_wb_both.txt')
    char_wb_remove_vocab_path = mkdir_join(vocab_save_path,
                                           'character_wb_remove.txt')
    # phone_vocab_path = mkdir_join(vocab_save_path, 'phone' + '.txt')
    # phone_wb_vocab_path = mkdir_join(vocab_save_path, 'phone_wb' + '.txt')
    pos_vocab_path = mkdir_join(vocab_save_path, 'pos' + '.txt')

    trans_dict = {}
    char_set = set([])
    char_set_remove = set([])
    word_set = set([])
    pos_set = set([])
    word_dict = {}
    with codecs.open(text_path, 'r', 'utf-8') as f:
        for line in f:
            line = line.strip()
            utt_idx, trans_w_pos = line.split('  ')
            trans_w_pos = trans_w_pos.replace('<sp>', SHORT_PAUSE)
            trans = SPACE.join(
                [w.split('+')[0] for w in trans_w_pos.split(' ')])
            trans_pos = SPACE.join([
                w.split('+')[1].split('/')[0] if '+' in w else SHORT_PAUSE
                for w in trans_w_pos.split(' ')
            ])
            # NOTE: word and POS sequence are the same length

            ###################################
            # with filler and disfluency
            ###################################
            trans_left_list, trans_right_list, trans_both_list, trans_remove_list = [], [], [], []
            for w in trans_w_pos.split(' '):
                if '言いよどみ' in w:  # disfluency
                    w_left = SOD + w.split('+')[0]
                    w_right = w.split('+')[0] + EOD
                    w_both = SOD + w.split('+')[0] + EOD
                elif '感動詞' in w:  # filler (interjection)
                    w_left = SOF + w.split('+')[0]
                    w_right = w.split('+')[0] + EOF
                    w_both = SOF + w.split('+')[0] + EOF
                else:
                    w_left = w.split('+')[0]
                    w_right = w.split('+')[0]
                    w_both = w.split('+')[0]
                    if w != SHORT_PAUSE:
                        trans_remove_list.append(w.split('+')[0])
                trans_left_list.append(w_left)
                trans_right_list.append(w_right)
                trans_both_list.append(w_both)
            trans_left = SPACE.join(trans_left_list)
            trans_right = SPACE.join(trans_right_list)
            trans_both = SPACE.join(trans_both_list)
            trans_remove = SPACE.join(trans_remove_list)

            trans_dict[utt_idx] = [
                trans, trans_pos, trans_left, trans_right, trans_both,
                trans_remove
            ]

            for w in trans.split(SPACE):
                # Count word frequency
                if w not in word_dict.keys():
                    word_dict[w] = 1
                else:
                    word_dict[w] += 1
                word_set.add(w)
                char_set |= set(list(w))

            for w in trans_remove.split(SPACE):
                char_set_remove |= set(list(w))

            for pos in trans_pos.split(SPACE):
                pos_set.add(pos)

    # TODO: load lexicon

    # Save vocabulary files
    if data_type == 'train':
        # word-level (threshold == 3)
        with codecs.open(word_vocab_path, 'w', 'utf-8') as f:
            word_list = sorted(
                [w
                 for w, freq in list(word_dict.items()) if freq >= 3]) + [OOV]
            for w in word_list:
                f.write('%s\n' % w)

        # character-level (char, char_wb)
        char_list = sorted(list(char_set))
        with codecs.open(char_vocab_path, 'w', 'utf-8') as f:
            for c in char_list + [OOV]:
                f.write('%s\n' % c)
        with codecs.open(char_wb_vocab_path, 'w', 'utf-8') as f:
            for c in char_list + [SPACE, OOV]:
                f.write('%s\n' % c)

        # character-level (char_wb + left, right, both, remove)
        with codecs.open(char_wb_left_vocab_path, 'w', 'utf-8') as f:
            for c in char_list + [SPACE, OOV, SOF, SOD]:
                f.write('%s\n' % c)
        with codecs.open(char_wb_right_vocab_path, 'w', 'utf-8') as f:
            for c in char_list + [SPACE, OOV, EOF, EOD]:
                f.write('%s\n' % c)
        with codecs.open(char_wb_both_vocab_path, 'w', 'utf-8') as f:
            for c in char_list + [SPACE, OOV, SOF, EOF, SOD, EOD]:
                f.write('%s\n' % c)
        with codecs.open(char_wb_remove_vocab_path, 'w', 'utf-8') as f:
            char_list_remove = sorted(list(char_set_remove))
            for c in char_list_remove + [SPACE, OOV]:
                f.write('%s\n' % c)

        # phone-level (phone, phone_wb)
        # with codecs.open(phone_vocab_path, 'w', 'utf-8') as f, codecs.open(phone_wb_vocab_path, 'w', 'utf-8') as f_wb:
        #     phone_list = sorted(list(phone_set))
        #     for phone in phone_list:
        #         f.write('%s\n' % phone)
        #     for phone in phone_list + [SIL]:
        #         f_wb.write('%s\n' % phone)

        # pos-level
        with codecs.open(pos_vocab_path, 'w', 'utf-8') as f:
            pos_list = sorted(list(pos_set))
            for pos in pos_list:
                f.write('%s\n' % pos)

    # Compute OOV rate
    if data_type != 'train':
        with codecs.open(
                mkdir_join(vocab_save_path, 'oov', data_type + '.txt'), 'w',
                'utf-8') as f:
            # word-level (threshold == 3)
            oov_rate = compute_oov_rate(word_dict, word_vocab_path)
            f.write('Word (freq3):\n')
            f.write('  OOV rate: %f %%\n' % oov_rate)

    # Convert to index
    print('=====> Convert to index...')
    word2idx = Word2idx(word_vocab_path)
    char2idx = Char2idx(char_vocab_path)
    char2idx_wb = Char2idx(char_wb_vocab_path)
    char2idx_wb_left = Char2idx(char_wb_left_vocab_path)
    char2idx_wb_right = Char2idx(char_wb_right_vocab_path)
    char2idx_wb_both = Char2idx(char_wb_both_vocab_path)
    char2idx_wb_remove = Char2idx(char_wb_remove_vocab_path)
    # phone2idx = Phone2idx(phone_vocab_path)
    # phone2idx_wb = Phone2idx(phone_wb_vocab_path)
    pos2idx = Word2idx(pos_vocab_path)

    for utt_idx, [
            trans, trans_pos, trans_left, trans_right, trans_both, trans_remove
    ] in tqdm(trans_dict.items()):
        if 'eval' in data_type:
            trans_dict[utt_idx] = {
                "word": trans,
                "char": trans.replace(SPACE, ''),
                "char_wb": trans,
                "char_wb_left": trans,
                "char_wb_right": trans,
                "char_wb_both": trans,
                "char_wb_remove": trans_remove,
                "phone": None,
                # "phone": trans_phone,
                "phone_wb": None,
                # "phone_wb": trans_phone.replace(SIL, '').replace('  ', ' '),
                "pos": trans_pos,
            }
            # NOTE: save as it is
        else:
            word_indices = word2idx(trans)
            char_indices = char2idx(trans.replace(SPACE, ''))
            char_wb_indices = char2idx_wb(trans)
            char_wb_left_indices = char2idx_wb_left(trans_left)
            char_wb_right_indices = char2idx_wb_right(trans_right)
            char_wb_both_indices = char2idx_wb_both(trans_both)
            char_wb_remove_indices = char2idx_wb_remove(trans_remove)
            # phone_indices = phone2idx(
            #     trans_phone.replace(SIL, '').replace('  ', ' '))
            # phone_wb_indices = phone2idx_wb(trans_phone)
            pos_indices = pos2idx(trans_pos)

            word_indices = ' '.join(list(map(str, word_indices.tolist())))
            char_indices = ' '.join(list(map(str, char_indices.tolist())))
            char_wb_indices = ' '.join(list(map(str,
                                                char_wb_indices.tolist())))
            char_wb_left_indices = ' '.join(
                list(map(str, char_wb_left_indices.tolist())))
            char_wb_right_indices = ' '.join(
                list(map(str, char_wb_right_indices.tolist())))
            char_wb_both_indices = ' '.join(
                list(map(str, char_wb_both_indices.tolist())))
            char_wb_remove_indices = ' '.join(
                list(map(str, char_wb_remove_indices.tolist())))
            # phone_indices = ' '.join(
            #     list(map(str, phone_indices.tolist())))
            # phone_wb_indices = ' '.join(
            #     list(map(str, phone_wb_indices.tolist())))
            pos_indices = ' '.join(list(map(str, pos_indices.tolist())))

            trans_dict[utt_idx] = {
                "word": word_indices,
                "char": char_indices,
                "char_wb": char_wb_indices,
                "char_wb_left": char_wb_left_indices,
                "char_wb_right": char_wb_right_indices,
                "char_wb_both": char_wb_both_indices,
                "char_wb_remove": char_wb_remove_indices,
                # "phone": phone_indices,
                # "phone_wb": phone_wb_indices,
                "pos": pos_indices,
            }

    return trans_dict
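The helper compute_oov_rate is called above but not defined in this listing. A minimal sketch, assuming word_dict maps word -> frequency and the vocabulary file stores one token per line (as written by the code above):

# Hypothetical sketch of compute_oov_rate (not part of the original
# listing): percentage of word tokens that fall outside the vocabulary.
import codecs


def compute_oov_rate(word_dict, vocab_path):
    with codecs.open(vocab_path, 'r', 'utf-8') as f:
        vocab = set(line.strip() for line in f)
    oov_num = sum(freq for w, freq in word_dict.items() if w not in vocab)
    total_num = sum(word_dict.values())
    return oov_num * 100.0 / total_num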
def main():
    print('=> Processing input data...')
    for data_type in [
            'train_' + args.data_size, 'dev', 'eval1', 'eval2', 'eval3'
    ]:
        print('===> %s' % data_type)
        feature_save_path = mkdir_join(args.data_save_path, 'feature',
                                       args.tool, args.data_size,
                                       data_type.split('_')[0])

        utt_indices = []
        with codecs.open(join(args.data_save_path, data_type, 'text'), 'r',
                         'utf-8') as f:
            for line in f:
                line = line.strip()
                utt_indices.append(line.split('  ')[0])

        segment_dict = {}
        utt_num = 0
        with open(join(args.data_save_path, data_type, 'segments'), 'r') as f:
            for line in f:
                line = line.strip()
                utt_idx, speaker, start_time, end_time = line.split(' ')

                if speaker not in segment_dict.keys():
                    segment_dict[speaker] = OrderedDict()
                segment_dict[speaker][utt_idx] = [
                    int(float(start_time) * 100 + 0.5),
                    int(float(end_time) * 100 + 0.5)
                ]
                utt_num += 1
        assert len(utt_indices) == utt_num

        spk2audio = {}
        if args.tool == 'htk':
            with open(join(args.data_save_path, data_type, 'htk.scp'),
                      'r') as f:
                for line in f:
                    htk_path = line.strip()
                    speaker = basename(htk_path).split('.')[0]
                    spk2audio[speaker] = htk_path
        else:
            with open(join(args.data_save_path, data_type, 'wav.scp'),
                      'r') as f:
                for line in f:
                    line = line.strip()
                    speaker = line.split(' ')[0]
                    wav_path = line.split(' ')[2]
                    spk2audio[speaker] = wav_path

        if 'train' in data_type:
            global_mean_male, global_std_male = None, None
            global_mean_female, global_std_female = None, None
        else:
            # Load statistics over train dataset
            global_mean_male = np.load(
                join(args.data_save_path, 'feature', args.tool, args.data_size,
                     'train/global_mean_male.npy'))
            global_std_male = np.load(
                join(args.data_save_path, 'feature', args.tool, args.data_size,
                     'train/global_std_male.npy'))
            global_mean_female = np.load(
                join(args.data_save_path, 'feature', args.tool, args.data_size,
                     'train/global_mean_female.npy'))
            global_std_female = np.load(
                join(args.data_save_path, 'feature', args.tool, args.data_size,
                     'train/global_std_female.npy'))

        read_audio(data_type=data_type,
                   spk2audio=spk2audio,
                   segment_dict=segment_dict,
                   tool=args.tool,
                   config=CONFIG,
                   normalize=args.normalize,
                   save_path=feature_save_path,
                   global_mean_male=global_mean_male,
                   global_std_male=global_std_male,
                   global_mean_female=global_mean_female,
                   global_std_female=global_std_female)
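The segment boundaries above are converted from seconds to 10-ms frame indices with an explicit round-half-up, int(x * 100 + 0.5); a small illustration of why that form is used:

# Seconds -> 10-ms frame index with round-half-up, as in main() above.
# int(x + 0.5) avoids the half-to-even rounding of Python 3's round().
def sec_to_frame(t):
    return int(float(t) * 100 + 0.5)


assert sec_to_frame('1.23') == 123
assert sec_to_frame('4.567') == 457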
def do_plot(model, params, epoch, eval_batch_size):
    """Plot the CTC posteriors.
    Args:
        model: the model to restore
        params (dict): A dictionary of parameters
        epoch (int): the epoch to restore
        eval_batch_size (int): the size of mini-batch in evaluation
    """
    # Load dataset
    test_clean_data = Dataset(
        data_type='test_clean',
        train_data_size=params['train_data_size'],
        label_type=params['label_type'],
        batch_size=eval_batch_size, splice=params['splice'],
        num_stack=params['num_stack'], num_skip=params['num_skip'],
        shuffle=False)
    test_other_data = Dataset(
        data_type='test_other',
        train_data_size=params['train_data_size'],
        label_type=params['label_type'],
        batch_size=eval_batch_size, splice=params['splice'],
        num_stack=params['num_stack'], num_skip=params['num_skip'],
        shuffle=False)

    with tf.name_scope('tower_gpu0'):
        # Define placeholders
        model.create_placeholders()

        # Add to the graph each operation (including model definition)
        _, logits = model.compute_loss(
            model.inputs_pl_list[0],
            model.labels_pl_list[0],
            model.inputs_seq_len_pl_list[0],
            model.keep_prob_input_pl_list[0],
            model.keep_prob_hidden_pl_list[0],
            model.keep_prob_output_pl_list[0],
            softmax_temperature=params['softmax_temperature'])
        posteriors_op = model.posteriors(logits, blank_prior=1)

    # Create a saver for writing training checkpoints
    saver = tf.train.Saver()

    with tf.Session() as sess:
        ckpt = tf.train.get_checkpoint_state(model.save_path)

        # If a checkpoint exists
        if ckpt:
            # Use last saved model
            model_path = ckpt.model_checkpoint_path
            if epoch != -1:
                model_path = model_path.split('/')[:-1]
                model_path = '/'.join(model_path) + '/model.ckpt-' + str(epoch)
            saver.restore(sess, model_path)
            print("Model restored: " + model_path)
        else:
            raise ValueError('There are no checkpoints.')

        # Visualize
        posterior_test(session=sess,
                       posteriors_op=posteriors_op,
                       model=model,
                       dataset=test_clean_data,
                       label_type=params['label_type'],
                       num_stack=params['num_stack'],
                       #    save_path=None)
                       save_path=mkdir_join(model.save_path, 'ctc_output', 'test-clean'))

        posterior_test(session=sess,
                       posteriors_op=posteriors_op,
                       model=model,
                       dataset=test_other_data,
                       label_type=params['label_type'],
                       num_stack=params['num_stack'],
                       #    save_path=None)
                       save_path=mkdir_join(model.save_path, 'ctc_output', 'test-other'))
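posterior_test is imported from elsewhere in the repository and not shown here. A minimal sketch (an assumption, not the repository's implementation) of plotting one CTC posterior matrix of shape [T, num_classes] with matplotlib:

# Hypothetical sketch: plot one CTC posterior matrix and save it.
import matplotlib
matplotlib.use('Agg')  # render without a display
import matplotlib.pyplot as plt
import numpy as np


def plot_posteriors(probs, save_path):
    # probs: np.ndarray of shape [T, num_classes]
    times = np.arange(probs.shape[0])
    plt.figure(figsize=(10, 4))
    for c in range(probs.shape[1]):
        plt.plot(times, probs[:, c])
    plt.xlabel('Time [frame]')
    plt.ylabel('Posterior')
    plt.ylim([0, 1.05])
    plt.savefig(save_path)
    plt.close()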
def main(config_path, model_save_path, gpu_indices):

    # Load a config file (.yml)
    with open(config_path, "r") as f:
        config = yaml.safe_load(f)
        params = config['param']

    # Number of classes, excluding the blank label
    if params['label_type'] == 'kana':
        params['num_classes'] = 146
    elif params['label_type'] == 'kana_divide':
        params['num_classes'] = 147
    elif params['label_type'] == 'kanji':
        if params['train_data_size'] == 'train_subset':
            params['num_classes'] = 2981
        elif params['train_data_size'] == 'train_fullset':
            params['num_classes'] = 3385
    elif params['label_type'] == 'kanji_divide':
        if params['train_data_size'] == 'train_subset':
            params['num_classes'] = 2982
        elif params['train_data_size'] == 'train_fullset':
            params['num_classes'] = 3386
    else:
        raise TypeError

    # Model setting
    model = CTC(encoder_type=params['encoder_type'],
                input_size=params['input_size'],
                splice=params['splice'],
                num_stack=params['num_stack'],
                num_units=params['num_units'],
                num_layers=params['num_layers'],
                num_classes=params['num_classes'],
                lstm_impl=params['lstm_impl'],
                use_peephole=params['use_peephole'],
                parameter_init=params['weight_init'],
                clip_grad_norm=params['clip_grad_norm'],
                clip_activation=params['clip_activation'],
                num_proj=params['num_proj'],
                weight_decay=params['weight_decay'])

    # Set process name
    setproctitle(
        'tf_csj_' + model.name + '_' + params['train_data_size'] + '_' + params['label_type'])

    model.name += '_' + str(params['num_units'])
    model.name += '_' + str(params['num_layers'])
    model.name += '_' + params['optimizer']
    model.name += '_lr' + str(params['learning_rate'])
    if params['num_proj'] != 0:
        model.name += '_proj' + str(params['num_proj'])
    if params['dropout'] != 0:
        model.name += '_drop' + str(params['dropout'])
    if params['num_stack'] != 1:
        model.name += '_stack' + str(params['num_stack'])
    if params['weight_decay'] != 0:
        model.name += '_wd' + str(params['weight_decay'])
    if params['bottleneck_dim'] != 0:
        model.name += '_bottle' + str(params['bottleneck_dim'])
    if len(gpu_indices) >= 2:
        model.name += '_gpu' + str(len(gpu_indices))

    # Set save path
    model.save_path = mkdir_join(
        model_save_path, 'ctc', params['label_type'],
        params['train_data_size'], model.name)

    # Reset model directory
    model_index = 0
    new_model_path = model.save_path
    while True:
        if isfile(join(new_model_path, 'complete.txt')):
            # Training of this model has already finished
            model_index += 1
            new_model_path = model.save_path + '_' + str(model_index)
        elif isfile(join(new_model_path, 'config.yml')):
            # Training of this model has not finished yet
            model_index += 1
            new_model_path = model.save_path + '_' + str(model_index)
        else:
            break
    model.save_path = mkdir(new_model_path)

    # Save config file
    shutil.copyfile(config_path, join(model.save_path, 'config.yml'))

    sys.stdout = open(join(model.save_path, 'train.log'), 'w')
    # TODO(hirofumi): change to logger
    do_train(model=model, params=params, gpu_indices=gpu_indices)
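The 'Reset model directory' loop above reappears verbatim in the other main() functions of this listing; a sketch of how it could be factored into one helper:

# Sketch of a reusable directory-versioning helper: append _1, _2, ...
# until a directory holds neither complete.txt (training finished) nor
# config.yml (training started).
from os.path import isfile, join


def next_model_path(base_path):
    model_index = 0
    new_model_path = base_path
    while (isfile(join(new_model_path, 'complete.txt'))
           or isfile(join(new_model_path, 'config.yml'))):
        model_index += 1
        new_model_path = base_path + '_' + str(model_index)
    return new_model_path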
def do_plot(model, params, epoch, eval_batch_size):
    """Decode the Attention outputs.
    Args:
        model: the model to restore
        params (dict): A dictionary of parameters
        epoch (int): the epoch to restore
        eval_batch_size (int): the size of mini-batch in evaluation
    """
    map_file_path = '../metrics/mapping_files/' + \
        params['label_type'] + '.txt'

    # Load dataset
    test_data = Dataset(
        data_type='test', label_type=params['label_type'],
        batch_size=eval_batch_size, map_file_path=map_file_path,
        splice=params['splice'],
        num_stack=params['num_stack'], num_skip=params['num_skip'],
        shuffle=False, progressbar=True)

    # Define placeholders
    model.create_placeholders()

    # Add to the graph each operation (including model definition)
    _, _, decoder_outputs_train, decoder_outputs_infer = model.compute_loss(
        model.inputs_pl_list[0],
        model.labels_pl_list[0],
        model.inputs_seq_len_pl_list[0],
        model.labels_seq_len_pl_list[0],
        model.keep_prob_encoder_pl_list[0],
        model.keep_prob_decoder_pl_list[0],
        model.keep_prob_embedding_pl_list[0])
    _, decode_op_infer = model.decode(
        decoder_outputs_train,
        decoder_outputs_infer)
    attention_weights_op = decoder_outputs_infer.attention_weights

    # Create a saver for writing training checkpoints
    saver = tf.train.Saver()

    with tf.Session() as sess:
        ckpt = tf.train.get_checkpoint_state(model.save_path)

        # If a checkpoint exists
        if ckpt:
            model_path = ckpt.model_checkpoint_path
            if epoch != -1:
                model_path = model_path.split('/')[:-1]
                model_path = '/'.join(model_path) + '/model.ckpt-' + str(epoch)
            saver.restore(sess, model_path)
            print("Model restored: " + model_path)
        else:
            raise ValueError('There are no checkpoints.')

        # Visualize
        plot(session=sess,
             decode_op=decode_op_infer,
             attention_weights_op=attention_weights_op,
             model=model,
             dataset=test_data,
             label_type=params['label_type'],
             is_test=True,
             save_path=mkdir_join(model.save_path, 'attention_weights'),
             #  save_path=None,
             show=False)
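The epoch-specific checkpoint path above is rebuilt by splitting the path on '/'; an equivalent, platform-independent sketch (assuming the 'model.ckpt-<step>' naming used above):

# Sketch: rebuild the checkpoint path for a given epoch without manual
# '/'-splitting.
from os.path import dirname, join


def checkpoint_for_epoch(ckpt_path, epoch):
    if epoch == -1:
        return ckpt_path  # keep the most recent checkpoint
    return join(dirname(ckpt_path), 'model.ckpt-' + str(epoch))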
def main(config_path, model_save_path):

    # Load a config file (.yml)
    with open(config_path, "r") as f:
        config = yaml.safe_load(f)
        params = config['param']

    # Number of classes, excluding the blank label
    if params['label_type_main'] == 'character':
        params['num_classes_main'] = 28
    elif params['label_type_main'] == 'character_capital_divide':
        params['num_classes_main'] = 72
    else:
        raise TypeError
    if params['label_type_sub'] == 'phone61':
        params['num_classes_sub'] = 61
    elif params['label_type_sub'] == 'phone48':
        params['num_classes_sub'] = 48
    elif params['label_type_sub'] == 'phone39':
        params['num_classes_sub'] = 39
    else:
        raise TypeError

    # Model setting
    model = MultitaskCTC(encoder_type=params['encoder_type'],
                         input_size=params['input_size'],
                         splice=params['splice'],
                         num_stack=params['num_stack'],
                         num_units=params['num_units'],
                         num_layers_main=params['num_layers_main'],
                         num_layers_sub=params['num_layers_sub'],
                         num_classes_main=params['num_classes_main'],
                         num_classes_sub=params['num_classes_sub'],
                         main_task_weight=params['main_task_weight'],
                         lstm_impl=params['lstm_impl'],
                         use_peephole=params['use_peephole'],
                         parameter_init=params['weight_init'],
                         clip_grad_norm=params['clip_grad_norm'],
                         clip_activation=params['clip_activation'],
                         num_proj=params['num_proj'],
                         weight_decay=params['weight_decay'])

    # Set process name
    setproctitle('tf_timit_' + model.name + '_' +
                 params['label_type_main'] + '_' + params['label_type_sub'])

    model.name += '_' + str(params['num_units'])
    model.name += '_main' + str(params['num_layers_main'])
    model.name += '_sub' + str(params['num_layers_sub'])
    model.name += '_' + params['optimizer']
    model.name += '_lr' + str(params['learning_rate'])
    if params['num_proj'] != 0:
        model.name += '_proj' + str(params['num_proj'])
    if params['dropout'] != 0:
        model.name += '_drop' + str(params['dropout'])
    if params['num_stack'] != 1:
        model.name += '_stack' + str(params['num_stack'])
    if params['weight_decay'] != 0:
        model.name += '_wd' + str(params['weight_decay'])
    model.name += '_main' + str(params['main_task_weight'])

    # Set save path
    model.save_path = mkdir_join(
        model_save_path, 'ctc', 'char_' + params['label_type_sub'], model.name)

    # Reset model directory
    model_index = 0
    new_model_path = model.save_path
    while True:
        if isfile(join(new_model_path, 'complete.txt')):
            # Training of this model has already finished
            model_index += 1
            new_model_path = model.save_path + '_' + str(model_index)
        elif isfile(join(new_model_path, 'config.yml')):
            # Training of this model has not finished yet
            model_index += 1
            new_model_path = model.save_path + '_' + str(model_index)
        else:
            break
    model.save_path = mkdir(new_model_path)

    # Save config file
    shutil.copyfile(config_path, join(model.save_path, 'config.yml'))

    sys.stdout = open(join(model.save_path, 'train.log'), 'w')
    # TODO(hirofumi): change to logger
    do_train(model=model, params=params)
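The if/elif ladders above map label types to class counts (excluding the CTC blank). The same mapping as dict lookups, a sketch that raises KeyError instead of TypeError for unknown label types:

# Sketch: the class-count mappings from main() above as dicts.
NUM_CLASSES_MAIN = {'character': 28, 'character_capital_divide': 72}
NUM_CLASSES_SUB = {'phone61': 61, 'phone48': 48, 'phone39': 39}
# params['num_classes_main'] = NUM_CLASSES_MAIN[params['label_type_main']]
# params['num_classes_sub'] = NUM_CLASSES_SUB[params['label_type_sub']]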
Example #24
def read_text(text_path, vocab_save_path, data_type, lexicon_path=None):
    """Read transcripts from a kaldi text file & save vocabulary files.
    Args:
        text_path (string): path to a text file of kaldi
        vocab_save_path (string): path to save vocabulary files
        data_type (string): train or dev or eval2000_swbd or eval2000_ch
        lexicon_path (string, optional): path to a lexicon file
    Returns:
        trans_dict (dict): the dictionary of transcripts
            key (string) => utterance index
            value (dict) => the dictionary of transcripts of each label type
                key (string) => label type (word1, word5, word10, word15,
                    word20, char, char_capital, char_left, char_right,
                    char_both, char_remove)
                value (string) => space-joined label indices (or the raw
                    transcript for the eval sets)
    """
    # Make vocabulary files
    word1_vocab_path = mkdir_join(vocab_save_path, 'word1.txt')
    word5_vocab_path = mkdir_join(vocab_save_path, 'word5.txt')
    word10_vocab_path = mkdir_join(vocab_save_path, 'word10.txt')
    word15_vocab_path = mkdir_join(vocab_save_path, 'word15.txt')
    word20_vocab_path = mkdir_join(vocab_save_path, 'word20.txt')
    char_vocab_path = mkdir_join(vocab_save_path, 'character.txt')
    char_capital_vocab_path = mkdir_join(vocab_save_path,
                                         'character_capital_divide.txt')
    char_left_vocab_path = mkdir_join(vocab_save_path, 'character_left.txt')
    char_right_vocab_path = mkdir_join(vocab_save_path, 'character_right.txt')
    char_both_vocab_path = mkdir_join(vocab_save_path, 'character_both.txt')
    char_remove_vocab_path = mkdir_join(vocab_save_path,
                                        'character_remove.txt')

    # TODO: implemented up to here
    raise ValueError  # NOTE: development stub; execution stops here

    trans_dict = {}
    char_set = set([])
    char_capital_set = set([])
    word_set = set([])
    word_dict = {}
    with open(text_path, 'r') as f:
        for line in f:
            line = line.strip()
            utt_idx = line.split(' ')[0]
            trans = ' '.join(line.split(' ')[1:]).lower()

            if data_type == 'eval2000_swbd' and utt_idx[:2] == 'en':
                continue
            if data_type == 'eval2000_ch' and utt_idx[:2] == 'sw':
                continue

            # text normalization
            trans = trans.replace('[laughter]', LAUGHTER)
            trans = trans.replace('[noise]', NOISE)
            trans = trans.replace('[vocalized-noise]', VOCALIZED_NOISE)

            if 'eval' in data_type:
                trans = trans.replace('<b_aside>', '')
                trans = trans.replace('<e_aside>', '')
                trans = re.sub(r'[()]+', '', trans)

                # Remove consecutive spaces
                trans = re.sub(r'[\s]+', ' ', trans)

                # Remove leading and trailing spaces
                trans = trans.strip()

            ###################################
            # with filler and disfluency
            ###################################
            trans_left_list, trans_right_list, trans_both_list, trans_remove_list = [], [], [], []
            for w in trans.split(' '):
                # NOTE: the Japanese POS tags below are carried over from
                # the CSJ recipe and never match the lowercased English
                # Switchboard transcripts
                if '言いよどみ' in w:  # disfluency
                    w_left = SOD + w.split('+')[0]
                    w_right = w.split('+')[0] + EOD
                    w_both = SOD + w.split('+')[0] + EOD
                elif '感動詞' in w:  # filler (interjection)
                    w_left = SOF + w.split('+')[0]
                    w_right = w.split('+')[0] + EOF
                    w_both = SOF + w.split('+')[0] + EOF
                else:
                    w_left = w.split('+')[0]
                    w_right = w.split('+')[0]
                    w_both = w.split('+')[0]
                    if w != SHORT_PAUSE:
                        trans_remove_list.append(w.split('+')[0])
                trans_left_list.append(w_left)
                trans_right_list.append(w_right)
                trans_both_list.append(w_both)
            trans_left = SPACE.join(trans_left_list)
            trans_right = SPACE.join(trans_right_list)
            trans_both = SPACE.join(trans_both_list)
            trans_remove = SPACE.join(trans_remove_list)

            trans = trans.replace(' ', SPACE)

            trans_capital = ''
            for word in trans.split(SPACE):
                # Count word frequency
                if word not in word_dict.keys():
                    word_dict[word] = 1
                else:
                    word_dict[word] += 1

                word_set.add(word)
                char_set |= set(list(word))

                # Capital-divided
                if len(word) == 1:
                    char_capital_set.add(word)
                    trans_capital += word
                else:
                    # Replace the first character with the capital
                    # letter
                    word = word[0].upper() + word[1:]

                    # Check double-letters
                    # NOTE: the final character of a word is only
                    # collected when it closes a double letter
                    for i in range(0, len(word) - 1, 1):
                        if word[i:i + 2] in DOUBLE_LETTERS:
                            char_capital_set.add(word[i:i + 2])
                        else:
                            char_capital_set.add(word[i])
                    trans_capital += word

            trans_dict[utt_idx] = [
                trans, trans_capital, trans_left, trans_right, trans_both,
                trans_remove
            ]

    # Reserve some indices
    # (assumption: these capital letters encode special markers such as
    # LAUGHTER / NOISE / VOCALIZED_NOISE, so they are removed from the
    # plain character set)
    char_set.discard('L')
    char_set.discard('A')
    char_set.discard('N')
    char_set.discard('Z')
    char_set.discard('V')

    # Save vocabulary files
    if data_type == 'train':
        # word-level (threshold == 1)
        with open(word1_vocab_path, 'w') as f:
            word_list = sorted(list(word_set)) + [OOV]
            for w in word_list:
                f.write('%s\n' % w)
            # NOTE: OOV index is reserved for the dev set

        # word-level (threshold == 5)
        with open(word5_vocab_path, 'w') as f:
            word_list = sorted(
                [w
                 for w, freq in list(word_dict.items()) if freq >= 5]) + [OOV]
            for w in word_list:
                f.write('%s\n' % w)

        # word-level (threshold == 10)
        with open(word10_vocab_path, 'w') as f:
            word_list = sorted([
                w for w, freq in list(word_dict.items()) if freq >= 10
            ]) + [OOV]
            for w in word_list:
                f.write('%s\n' % w)

        # word-level (threshold == 15)
        with open(word15_vocab_path, 'w') as f:
            word_list = sorted([
                w for w, freq in list(word_dict.items()) if freq >= 15
            ]) + [OOV]
            for w in word_list:
                f.write('%s\n' % w)

        # word-level (threshold == 20)
        with open(word20_vocab_path, 'w') as f:
            word_list = sorted([
                w for w, freq in list(word_dict.items()) if freq >= 20
            ]) + [OOV]
            for w in word_list:
                f.write('%s\n' % w)

        # character-level
        with open(char_vocab_path, 'w') as f:
            char_list = sorted(list(char_set)) + \
                [SPACE, LAUGHTER, NOISE, VOCALIZED_NOISE]
            for c in char_list:
                f.write('%s\n' % c)

        # character-level (capital-divided)
        with open(char_capital_vocab_path, 'w') as f:
            char_capital_list = sorted(list(char_capital_set)) + \
                [LAUGHTER, NOISE, VOCALIZED_NOISE]
            for c in char_capital_list:
                f.write('%s\n' % c)

        # character-level (left, right, both, remove) -- not written yet
        # with open(char_left_vocab_path, 'w') as f:
        #     char_left_list = sorted(list(char_set)) + \
        #         [SPACE, LAUGHTER, NOISE, VOCALIZED_NOISE]
        #     for c in char_left_list:
        #         f.write('%s\n' % c)
        raise ValueError  # NOTE: development stub; left/right/both/remove vocab files are never written

    # Compute OOV rate
    if data_type != 'train':
        with open(mkdir_join(vocab_save_path, 'oov', data_type + '.txt'),
                  'w') as f:

            # word-level (threshold == 1)
            oov_rate = compute_oov_rate(word_dict, word1_vocab_path)
            f.write('Word (freq1):\n')
            f.write('  OOV rate: %f %%\n' % oov_rate)

            # word-level (threshold == 5)
            oov_rate = compute_oov_rate(word_dict, word5_vocab_path)
            f.write('Word (freq5):\n')
            f.write('  OOV rate: %f %%\n' % oov_rate)

            # word-level (threshold == 10)
            oov_rate = compute_oov_rate(word_dict, word10_vocab_path)
            f.write('Word (freq10):\n')
            f.write('  OOV rate: %f %%\n' % oov_rate)

            # word-level (threshold == 15)
            oov_rate = compute_oov_rate(word_dict, word15_vocab_path)
            f.write('Word (freq15):\n')
            f.write('  OOV rate: %f %%\n' % oov_rate)

            # word-level (threshold == 20)
            oov_rate = compute_oov_rate(word_dict, word20_vocab_path)
            f.write('Word (freq20):\n')
            f.write('  OOV rate: %f %%\n' % oov_rate)

    # Convert to index
    print('=====> Convert to index...')
    word2idx_freq1 = Word2idx(word1_vocab_path)
    word2idx_freq5 = Word2idx(word5_vocab_path)
    word2idx_freq10 = Word2idx(word10_vocab_path)
    word2idx_freq15 = Word2idx(word15_vocab_path)
    word2idx_freq20 = Word2idx(word20_vocab_path)
    char2idx = Char2idx(char_vocab_path)
    char2idx_capital = Char2idx(char_capital_vocab_path, capital_divide=True)
    char2idx_left = Char2idx(char_left_vocab_path)
    char2idx_right = Char2idx(char_right_vocab_path)
    char2idx_both = Char2idx(char_both_vocab_path)
    char2idx_remove = Char2idx(char_remove_vocab_path)

    for utt_idx, [trans, trans_capital, trans_left, trans_right, trans_both,
                  trans_remove] in tqdm(trans_dict.items()):
        if 'eval' in data_type:
            trans_dict[utt_idx] = {
                "word1": trans,
                "word5": trans,
                "word10": trans,
                "word15": trans,
                "word20": trans,
                "char": trans,
                "char_capital": trans,
                "char_left": trans,
                "char_right": trans,
                "char_both": trans,
                "char_remove": trans_remove,
            }
            # NOTE: save as it is
        else:
            word1_indices = word2idx_freq1(trans)
            word5_indices = word2idx_freq5(trans)
            word10_indices = word2idx_freq10(trans)
            word15_indices = word2idx_freq15(trans)
            word20_indices = word2idx_freq20(trans)
            char_indices = char2idx(trans)
            char_capital_indices = char2idx_capital(trans)
            char_left_indices = char2idx_left(trans_left)
            char_right_indices = char2idx_right(trans_right)
            char_both_indices = char2idx_both(trans_both)
            char_remove_indices = char2idx_remove(trans_remove)

            word1_indices = ' '.join(list(map(str, word1_indices.tolist())))
            word5_indices = ' '.join(list(map(str, word5_indices.tolist())))
            word10_indices = ' '.join(list(map(str, word10_indices.tolist())))
            word15_indices = ' '.join(list(map(str, word15_indices.tolist())))
            word20_indices = ' '.join(list(map(str, word20_indices.tolist())))
            char_indices = ' '.join(list(map(str, char_indices.tolist())))
            char_capital_indices = ' '.join(
                list(map(str, char_capital_indices.tolist())))
            char_left_indices = ' '.join(
                list(map(str, char_left_indices.tolist())))
            char_right_indices = ' '.join(
                list(map(str, char_right_indices.tolist())))
            char_both_indices = ' '.join(
                list(map(str, char_both_indices.tolist())))
            char_remove_indices = ' '.join(
                list(map(str, char_remove_indices.tolist())))

            trans_dict[utt_idx] = {
                "word1": word1_indices,
                "word5": word5_indices,
                "word10": word10_indices,
                "word15": word15_indices,
                "word20": word20_indices,
                "char": char_indices,
                "char_capital": char_capital_indices,
                "char_left": char_left_indices,
                "char_right": char_right_indices,
                "char_both": char_both_indices,
                "char_remove": char_remove_indices,
            }

    return trans_dict
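The word-frequency counting above can also be expressed with collections.Counter; a sketch (illustrative names, not the original code) of building one frequency-thresholded vocabulary:

# Sketch: frequency-thresholded vocabulary via collections.Counter.
from collections import Counter


def build_vocab(transcripts, min_freq, oov_token='OOV'):
    # transcripts: iterable of token lists (e.g. trans.split(SPACE))
    counter = Counter()
    for words in transcripts:
        counter.update(words)
    return sorted(w for w, f in counter.items() if f >= min_freq) + [oov_token]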
def main(config_path, model_save_path):

    # Load a config file (.yml)
    with open(config_path, "r") as f:
        config = yaml.safe_load(f)
        params = config['param']

    # Input size per feature type
    if params['feature'] == 'fbank':
        input_size = 123
    elif params['feature'] == 'is13':
        input_size = 141
    else:
        raise TypeError

    # Number of classes, excluding the blank label
    if params['label_type'] in ['original', 'phone3']:
        params['num_classes'] = 3
    elif params['label_type'] == 'phone4':
        params['num_classes'] = 4
    elif params['label_type'] == 'phone43':
        params['num_classes'] = 43
    else:
        raise TypeError

    # Model setting
    model = CTC(encoder_type=params['encoder_type'],
                input_size=input_size * params['num_stack'],
                splice=params['splice'],
                num_units=params['num_units'],
                num_layers=params['num_layers'],
                num_classes=params['num_classes'],
                lstm_impl=params['lstm_impl'],
                use_peephole=params['use_peephole'],
                parameter_init=params['weight_init'],
                clip_grad_norm=params['clip_grad_norm'],
                clip_activation=params['clip_activation'],
                num_proj=params['num_proj'],
                weight_decay=params['weight_decay'])

    # Set process name
    setproctitle('tf_svc_' + model.name + '_' + params['label_type'])

    model.name += '_' + str(params['num_units'])
    model.name += '_' + str(params['num_layers'])
    model.name += '_' + params['optimizer']
    model.name += '_lr' + str(params['learning_rate'])
    if params['num_proj'] != 0:
        model.name += '_proj' + str(params['num_proj'])
    if params['dropout'] != 0:
        model.name += '_drop' + str(params['dropout'])
    if params['num_stack'] != 1:
        model.name += '_stack' + str(params['num_stack'])
    if params['weight_decay'] != 0:
        model.name += '_wd' + str(params['weight_decay'])

    # Set save path
    model.save_path = mkdir_join(
        model_save_path, 'ctc', params['label_type'], model.name)

    # Reset model directory
    model_index = 0
    new_model_path = model.save_path
    while True:
        if isfile(join(new_model_path, 'complete.txt')):
            # Training of this model has already finished
            model_index += 1
            new_model_path = model.save_path + '_' + str(model_index)
        elif isfile(join(new_model_path, 'config.yml')):
            # Training of this model has not finished yet
            model_index += 1
            new_model_path = model.save_path + '_' + str(model_index)
        else:
            break
    model.save_path = mkdir(new_model_path)

    # Save config file
    shutil.copyfile(config_path, join(model.save_path, 'config.yml'))

    sys.stdout = open(join(model.save_path, 'train.log'), 'w')
    # TODO(hirofumi): change to logger
    do_train(model=model, params=params)
Example #26
def main():

    # for data_type in ['train', 'dev', 'eval2000_swbd', 'eval2000_ch']:
    for data_type in ['eval2000_swbd', 'eval2000_ch']:
        print('=' * 50)
        print(' ' * 20 + data_type)
        print('=' * 50)

        if 'eval' in data_type:
            data_type_tmp = 'eval2000'
        else:
            data_type_tmp = data_type

        # Convert transcript to index
        print('=> Processing transcripts...')
        trans_dict = read_text(text_path=join(args.data_save_path,
                                              data_type_tmp, 'text'),
                               vocab_save_path=mkdir_join(
                                   args.data_save_path, 'vocab'),
                               data_type=data_type,
                               lexicon_path=None)

        # Make dataset file (.csv)
        print('=> Saving dataset files...')
        csv_save_path = mkdir_join(args.data_save_path, 'dataset', args.tool,
                                   data_type)

        df_columns = ['frame_num', 'input_path', 'transcript']
        df_word1 = pd.DataFrame([], columns=df_columns)
        df_word5 = pd.DataFrame([], columns=df_columns)
        df_word10 = pd.DataFrame([], columns=df_columns)
        df_word15 = pd.DataFrame([], columns=df_columns)
        df_word20 = pd.DataFrame([], columns=df_columns)
        df_char = pd.DataFrame([], columns=df_columns)
        df_char_capital = pd.DataFrame([], columns=df_columns)
        df_char_left = pd.DataFrame([], columns=df_columns)
        df_char_right = pd.DataFrame([], columns=df_columns)
        df_char_both = pd.DataFrame([], columns=df_columns)
        df_char_remove = pd.DataFrame([], columns=df_columns)

        with open(
                join(args.data_save_path, 'feature', args.tool, data_type,
                     'frame_num.pickle'), 'rb') as f:
            frame_num_dict = pickle.load(f)

        utt_count = 0
        df_word1_list, df_word5_list, df_word10_list, df_word15_list, df_word20_list = [], [], [], [], []
        df_char_list, df_char_capital_list = [], []
        df_char_left_list, df_char_right_list, df_char_both_list, df_char_remove_list = [], [], [], []
        for utt_idx, trans in tqdm(trans_dict.items()):
            speaker = '_'.join(utt_idx.split('_')[:2])
            feat_utt_save_path = join(args.data_save_path, 'feature',
                                      args.tool, data_type, speaker,
                                      utt_idx + '.npy')
            frame_num = frame_num_dict[utt_idx]

            if not isfile(feat_utt_save_path):
                raise ValueError('There is no file: %s' % feat_utt_save_path)

            df_word1 = add_element(
                df_word1, [frame_num, feat_utt_save_path, trans['word1']])
            df_word5 = add_element(
                df_word5, [frame_num, feat_utt_save_path, trans['word5']])
            df_word10 = add_element(
                df_word10, [frame_num, feat_utt_save_path, trans['word10']])
            df_word15 = add_element(
                df_word15, [frame_num, feat_utt_save_path, trans['word15']])
            df_word20 = add_element(
                df_word20, [frame_num, feat_utt_save_path, trans['word20']])
            df_char = add_element(
                df_char, [frame_num, feat_utt_save_path, trans['char']])
            df_char_capital = add_element(
                df_char_capital,
                [frame_num, feat_utt_save_path, trans['char_capital']])
            df_char_left = add_element(
                df_char_left,
                [frame_num, feat_utt_save_path, trans['char_left']])
            df_char_right = add_element(
                df_char_right,
                [frame_num, feat_utt_save_path, trans['char_right']])
            df_char_both = add_element(
                df_char_both,
                [frame_num, feat_utt_save_path, trans['char_both']])
            df_char_remove = add_element(
                df_char_remove,
                [frame_num, feat_utt_save_path, trans['char_remove']])
            utt_count += 1

            # Reset
            if utt_count == 10000:
                df_word1_list.append(df_word1)
                df_word5_list.append(df_word5)
                df_word10_list.append(df_word10)
                df_word15_list.append(df_word15)
                df_word20_list.append(df_word20)
                df_char_list.append(df_char)
                df_char_capital_list.append(df_char_capital)
                df_char_left_list.append(df_char_left)
                df_char_right_list.append(df_char_right)
                df_char_both_list.append(df_char_both)
                df_char_remove_list.append(df_char_remove)

                df_word1 = pd.DataFrame([], columns=df_columns)
                df_word5 = pd.DataFrame([], columns=df_columns)
                df_word10 = pd.DataFrame([], columns=df_columns)
                df_word15 = pd.DataFrame([], columns=df_columns)
                df_word20 = pd.DataFrame([], columns=df_columns)
                df_char = pd.DataFrame([], columns=df_columns)
                df_char_capital = pd.DataFrame([], columns=df_columns)
                df_char_left = pd.DataFrame([], columns=df_columns)
                df_char_right = pd.DataFrame([], columns=df_columns)
                df_char_both = pd.DataFrame([], columns=df_columns)
                df_char_remove = pd.DataFrame([], columns=df_columns)

                utt_count = 0

        # Last dataframe
        df_word1_list.append(df_word1)
        df_word5_list.append(df_word5)
        df_word10_list.append(df_word10)
        df_word15_list.append(df_word15)
        df_word20_list.append(df_word20)
        df_char_list.append(df_char)
        df_char_capital_list.append(df_char_capital)
        df_char_left_list.append(df_char_left)
        df_char_right_list.append(df_char_right)
        df_char_both_list.append(df_char_both)
        df_char_remove_list.append(df_char_remove)

        # Concatenate all dataframes
        df_word1 = df_word1_list[0]
        df_word5 = df_word5_list[0]
        df_word10 = df_word10_list[0]
        df_word15 = df_word15_list[0]
        df_word20 = df_word20_list[0]
        df_char = df_char_list[0]
        df_char_capital = df_char_capital_list[0]
        df_char_left = df_char_left_list[0]
        df_char_right = df_char_right_list[0]
        df_char_both = df_char_both_list[0]
        df_char_remove = df_char_remove_list[0]

        for i in df_word1_list[1:]:
            df_word1 = pd.concat([df_word1, i], axis=0)
        for i in df_word5_list[1:]:
            df_word5 = pd.concat([df_word5, i], axis=0)
        for i in df_word10_list[1:]:
            df_word10 = pd.concat([df_word10, i], axis=0)
        for i in df_word15_list[1:]:
            df_word15 = pd.concat([df_word15, i], axis=0)
        for i in df_word20_list[1:]:
            df_word20 = pd.concat([df_word20, i], axis=0)
        for i in df_char_list[1:]:
            df_char = pd.concat([df_char, i], axis=0)
        for i in df_char_capital_list[1:]:
            df_char_capital = pd.concat([df_char_capital, i], axis=0)
        for i in df_char_left_list[1:]:
            df_char_left = pd.concat([df_char_left, i], axis=0)
        for i in df_char_right_list[1:]:
            df_char_right = pd.concat([df_char_right, i], axis=0)
        for i in df_char_both_list[1:]:
            df_char_both = pd.concat([df_char_both, i], axis=0)
        for i in df_char_remove_list[1:]:
            df_char_remove = pd.concat([df_char_remove, i], axis=0)

        df_word1.to_csv(join(csv_save_path, 'word1.csv'), encoding='utf-8')
        df_word5.to_csv(join(csv_save_path, 'word5.csv'), encoding='utf-8')
        df_word10.to_csv(join(csv_save_path, 'word10.csv'), encoding='utf-8')
        df_word15.to_csv(join(csv_save_path, 'word15.csv'), encoding='utf-8')
        df_word20.to_csv(join(csv_save_path, 'word20.csv'), encoding='utf-8')
        df_char.to_csv(join(csv_save_path, 'character.csv'), encoding='utf-8')
        df_char_capital.to_csv(join(csv_save_path,
                                    'character_capital_divide.csv'),
                               encoding='utf-8')
        df_char_left.to_csv(join(csv_save_path, 'char_left.csv'),
                            encoding='utf-8')
        df_char_right.to_csv(join(csv_save_path, 'char_right.csv'),
                             encoding='utf-8')
        df_char_both.to_csv(join(csv_save_path, 'char_both.csv'),
                            encoding='utf-8')
        df_char_remove.to_csv(join(csv_save_path, 'char_remove.csv'),
                              encoding='utf-8')
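add_element is used throughout this listing but defined elsewhere in the repository. A minimal sketch consistent with its usage here (appending one row to a DataFrame with columns ['frame_num', 'input_path', 'transcript']):

# Hypothetical sketch of add_element; pd.concat keeps it working on
# current pandas, where DataFrame.append has been removed.
import pandas as pd


def add_element(df, elem_list):
    row = pd.DataFrame([elem_list], columns=df.columns)
    return pd.concat([df, row], ignore_index=True)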
def main(config_path, model_save_path):

    # Load a config file (.yml)
    with open(config_path, "r") as f:
        config = yaml.safe_load(f)
        params = config['param']

    params['sos_index'] = 0
    params['eos_index'] = 1
    if params['label_type'] == 'phone61':
        params['att_num_classes'] = 63
        params['ctc_num_classes'] = 61
    elif params['label_type'] == 'phone48':
        params['att_num_classes'] = 50
        params['ctc_num_classes'] = 48
    elif params['label_type'] == 'phone39':
        params['att_num_classes'] = 41
        params['ctc_num_classes'] = 39
    elif params['label_type'] == 'character':
        params['att_num_classes'] = 30
        params['ctc_num_classes'] = 28
    else:
        raise TypeError

    # Model setting
    # AttentionModel = load(model_type=config['model_name'])
    model = JointCTCAttention(
        input_size=params['input_size'],
        encoder_num_unit=params['encoder_num_unit'],
        encoder_num_layer=params['encoder_num_layer'],
        attention_dim=params['attention_dim'],
        attention_type=params['attention_type'],
        decoder_num_unit=params['decoder_num_unit'],
        decoder_num_layer=params['decoder_num_layer'],
        embedding_dim=params['embedding_dim'],
        att_num_classes=params['att_num_classes'],
        ctc_num_classes=params['ctc_num_classes'],
        att_task_weight=params['att_task_weight'],
        sos_index=params['sos_index'],
        eos_index=params['eos_index'],
        max_decode_length=params['max_decode_length'],
        # attention_smoothing=params['attention_smoothing'],
        attention_weights_tempareture=params['attention_weights_tempareture'],
        logits_tempareture=params['logits_tempareture'],
        # NOTE: 'tempareture' (sic) follows the existing parameter names
        parameter_init=params['weight_init'],
        clip_grad=params['clip_grad'],
        clip_activation_encoder=params['clip_activation_encoder'],
        clip_activation_decoder=params['clip_activation_decoder'],
        weight_decay=params['weight_decay'])

    # Set process name
    setproctitle('timit_' + model.name + '_' + params['label_type'])

    model.name = params['model']
    model.name += '_encoder' + str(params['encoder_num_unit'])
    model.name += '_' + str(params['encoder_num_layer'])
    model.name += '_attdim' + str(params['attention_dim'])
    model.name += '_decoder' + str(params['decoder_num_unit'])
    model.name += '_' + str(params['decoder_num_layer'])
    model.name += '_' + params['optimizer']
    model.name += '_lr' + str(params['learning_rate'])
    model.name += '_' + params['attention_type']
    # if bool(params['attention_smoothing']):
    #     model.name += '_smoothing'
    if params['attention_weights_tempareture'] != 1:
        model.name += '_sharpening' + \
            str(params['attention_weights_tempareture'])
    if params['weight_decay'] != 0:
        model.name += '_weightdecay' + str(params['weight_decay'])

    # Set save path
    model.save_path = mkdir_join(model_save_path, 'attention',
                                 params['label_type'], model.name)

    # Reset model directory
    model_index = 0
    new_model_path = model.save_path
    while True:
        if isfile(join(new_model_path, 'complete.txt')):
            # Training of this model has already finished
            model_index += 1
            new_model_path = model.save_path + '_' + str(model_index)
        elif isfile(join(new_model_path, 'config.yml')):
            # Training of this model has not finished yet
            # tf.gfile.DeleteRecursively(new_model_path)
            # tf.gfile.MakeDirs(new_model_path)
            # break
            model_index += 1
            new_model_path = model.save_path + '_' + str(model_index)
        else:
            break
    model.save_path = mkdir(new_model_path)

    # Save config file
    shutil.copyfile(config_path, join(model.save_path, 'config.yml'))

    # sys.stdout = open(join(model.save_path, 'train.log'), 'w')
    # TODO(hirofumi): change to logger
    do_train(model=model, params=params)
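mkdir_join appears throughout the listing without a definition. Since it is called both with directory components and with file names (e.g. 'word1.txt') as the last component, the real helper presumably creates only the directory part; a hypothetical sketch:

# Hypothetical sketch of mkdir_join (not the repository's definition):
# join path components and create the directory part if missing; a
# trailing component with an extension is treated as a file name.
import os
from os.path import dirname, isdir, join, splitext


def mkdir_join(path, *names):
    full = join(path, *names)
    dir_part = full if splitext(full)[1] == '' else dirname(full)
    if dir_part and not isdir(dir_part):
        os.makedirs(dir_part)
    return full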
Example #28
def main():

    for data_type in ['train', 'dev', 'eval1', 'eval2', 'eval3']:
        # TODO: data_size
        print('=' * 50)
        print(' ' * 20 + data_type)
        print('=' * 50)

        # Convert transcript to index
        print('=> Processing transcripts...')
        trans_dict = read_text(
            text_path=join(args.data_save_path, data_type, 'text'),
            vocab_save_path=mkdir_join(args.data_save_path, 'vocab'),
            data_type=data_type,
            kana2phone_path='./local/csj_make_trans/kana2phone',
            lexicon_path=None)

        # Make dataset file (.csv)
        print('=> Saving dataset files...')
        csv_save_path = mkdir_join(args.data_save_path, 'dataset', args.tool,
                                   data_type)

        df_columns = ['frame_num', 'input_path', 'transcript']
        df_word = pd.DataFrame([], columns=df_columns)
        df_char = pd.DataFrame([], columns=df_columns)
        df_char_wb = pd.DataFrame([], columns=df_columns)
        df_char_wb_left = pd.DataFrame([], columns=df_columns)
        df_char_wb_right = pd.DataFrame([], columns=df_columns)
        df_char_wb_both = pd.DataFrame([], columns=df_columns)
        df_char_wb_remove = pd.DataFrame([], columns=df_columns)
        # df_phone = pd.DataFrame([], columns=df_columns)
        # df_phone_wb = pd.DataFrame([], columns=df_columns)
        df_pos = pd.DataFrame([], columns=df_columns)

        with open(
                join(args.data_save_path, 'feature', args.tool, data_type,
                     'frame_num.pickle'), 'rb') as f:
            frame_num_dict = pickle.load(f)

        utt_count = 0
        df_word_list = []
        df_char_list, df_char_wb_list = [], []
        df_char_wb_left_list, df_char_wb_right_list = [], []
        df_char_wb_both_list, df_char_wb_remove_list = [], []
        # df_phone_list, df_phone_wb_list = [], []
        df_pos_list = []
        for utt_idx, trans in tqdm(trans_dict.items()):
            speaker = utt_idx.split('_')[0]
            feat_utt_save_path = join(args.data_save_path, 'feature',
                                      args.tool, data_type, speaker,
                                      utt_idx + '.npy')
            frame_num = frame_num_dict[utt_idx]

            if not isfile(feat_utt_save_path):
                raise ValueError('There is no file: %s' % feat_utt_save_path)

            df_word = add_element(
                df_word, [frame_num, feat_utt_save_path, trans['word']])
            df_char = add_element(
                df_char, [frame_num, feat_utt_save_path, trans['char']])
            df_char_wb = add_element(
                df_char_wb, [frame_num, feat_utt_save_path, trans['char_wb']])
            df_char_wb_left = add_element(
                df_char_wb_left,
                [frame_num, feat_utt_save_path, trans['char_wb_left']])
            df_char_wb_right = add_element(
                df_char_wb_right,
                [frame_num, feat_utt_save_path, trans['char_wb_right']])
            df_char_wb_both = add_element(
                df_char_wb_both,
                [frame_num, feat_utt_save_path, trans['char_wb_both']])
            df_char_wb_remove = add_element(
                df_char_wb_remove,
                [frame_num, feat_utt_save_path, trans['char_wb_remove']])
            # df_phone = add_element(
            #     df_phone, [frame_num, feat_utt_save_path, phone_indices])
            # df_phone_wb = add_element(
            #     df_phone_wb, [frame_num, feat_utt_save_path, phone_wb_indices])
            df_pos = add_element(df_pos,
                                 [frame_num, feat_utt_save_path, trans['pos']])
            utt_count += 1

            # Reset
            if utt_count == 10000:
                df_word_list.append(df_word)
                df_char_list.append(df_char)
                df_char_wb_list.append(df_char_wb)
                df_char_wb_left_list.append(df_char_wb_left)
                df_char_wb_right_list.append(df_char_wb_right)
                df_char_wb_both_list.append(df_char_wb_both)
                df_char_wb_remove_list.append(df_char_wb_remove)
                # df_phone_list.append(df_phone)
                # df_phone_wb_list.append(df_phone_wb)
                df_pos_list.append(df_pos)

                df_word = pd.DataFrame([], columns=df_columns)
                df_char = pd.DataFrame([], columns=df_columns)
                df_char_wb = pd.DataFrame([], columns=df_columns)
                df_char_wb_left = pd.DataFrame([], columns=df_columns)
                df_char_wb_right = pd.DataFrame([], columns=df_columns)
                df_char_wb_both = pd.DataFrame([], columns=df_columns)
                df_char_wb_remove = pd.DataFrame([], columns=df_columns)
                # df_phone = pd.DataFrame([], columns=df_columns)
                # df_phone_wb = pd.DataFrame([], columns=df_columns)
                df_pos = pd.DataFrame([], columns=df_columns)
                utt_count = 0

        # Last dataframe
        df_word_list.append(df_word)
        df_char_list.append(df_char)
        df_char_wb_list.append(df_char_wb)
        df_char_wb_left_list.append(df_char_wb_left)
        df_char_wb_right_list.append(df_char_wb_right)
        df_char_wb_both_list.append(df_char_wb_both)
        df_char_wb_remove_list.append(df_char_wb_remove)
        # df_phone_list.append(df_phone)
        # df_phone_wb_list.append(df_phone_wb)
        df_pos_list.append(df_pos)

        # Concatenate all dataframes
        df_word = df_word_list[0]
        df_char = df_char_list[0]
        df_char_wb = df_char_wb_list[0]
        df_char_wb_left = df_char_wb_left_list[0]
        df_char_wb_right = df_char_wb_right_list[0]
        df_char_wb_both = df_char_wb_both_list[0]
        df_char_wb_remove = df_char_wb_remove_list[0]
        # df_phone = df_phone_list[0]
        # df_phone_wb = df_phone_wb_list[0]
        df_pos = df_pos_list[0]

        for i in df_word_list[1:]:
            df_word = pd.concat([df_word, i], axis=0)
        for i in df_char_list[1:]:
            df_char = pd.concat([df_char, i], axis=0)
        for i in df_char_wb_list[1:]:
            df_char_wb = pd.concat([df_char_wb, i], axis=0)
        for i in df_char_wb_left_list[1:]:
            df_char_wb_left = pd.concat([df_char_wb_left, i], axis=0)
        for i in df_char_wb_right_list[1:]:
            df_char_wb_right = pd.concat([df_char_wb_right, i], axis=0)
        for i in df_char_wb_both_list[1:]:
            df_char_wb_both = pd.concat([df_char_wb_both, i], axis=0)
        for i in df_char_wb_remove_list[1:]:
            df_char_wb_remove = pd.concat([df_char_wb_remove, i], axis=0)
        # for i in df_phone_list[1:]:
        #     df_phone = pd.concat([df_phone, i], axis=0)
        # for i in df_phone_wb_list[1:]:
        #     df_phone_wb = pd.concat([df_phone_wb, i], axis=0)
        for i in df_pos_list[1:]:
            df_pos = pd.concat([df_pos, i], axis=0)

        df_word.to_csv(join(csv_save_path, 'word.csv'), encoding='utf-8')
        df_char.to_csv(join(csv_save_path, 'character.csv'), encoding='utf-8')
        df_char_wb.to_csv(join(csv_save_path, 'character_wb.csv'),
                          encoding='utf-8')
        df_char_wb_left.to_csv(join(csv_save_path, 'character_wb_left.csv'),
                               encoding='utf-8')
        df_char_wb_right.to_csv(join(csv_save_path, 'character_wb_right.csv'),
                                encoding='utf-8')
        df_char_wb_both.to_csv(join(csv_save_path, 'character_wb_both.csv'),
                               encoding='utf-8')
        df_char_wb_remove.to_csv(join(csv_save_path,
                                      'character_wb_remove.csv'),
                                 encoding='utf-8')
        # df_phone.to_csv(join(csv_save_path, 'phone.csv'), encoding='utf-8')
        # df_phone_wb.to_csv(join(csv_save_path, 'phone_wb.csv'), encoding='utf-8')
        df_pos.to_csv(join(csv_save_path, 'pos.csv'), encoding='utf-8')
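The 10000-utterance chunking above works around the cost of appending rows to a DataFrame one at a time (each append copies the frame). An alternative sketch is to accumulate plain rows and build each DataFrame once:

# Sketch: collect plain rows, then build the DataFrame in one call.
import pandas as pd


def rows_to_df(rows):
    # rows: list of [frame_num, input_path, transcript]
    return pd.DataFrame(rows,
                        columns=['frame_num', 'input_path', 'transcript'])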
def main(config_path, model_save_path):

    # Read a config file (.yml)
    with open(config_path, "r") as f:
        config = yaml.safe_load(f)
        params = config['param']

    # Number of classes, excluding the blank label
    if params['label_type_main'] == 'kanji':
        params['num_classes_main'] = 3386
    elif params['label_type_main'] == 'kana':
        params['num_classes_main'] = 147
    else:
        raise TypeError

    if params['label_type_sub'] == 'kana':
        params['num_classes_sub'] = 147
    elif params['label_type_sub'] == 'phone':
        params['num_classes_sub'] = 38
    else:
        raise TypeError

    # Model setting
    model = load(model_type=params['model'])
    model = model(batch_size=params['batch_size'],
                  input_size=params['input_size'],
                  splice=params['splice'],
                  num_stack=params['num_stack'],
                  num_units=params['num_units'],
                  num_layer_main=params['num_layer_main'],
                  num_layer_sub=params['num_layer_sub'],
                  #    bottleneck_dim=params['bottleneck_dim'],
                  num_classes_main=params['num_classes_main'],
                  num_classes_sub=params['num_classes_sub'],
                  main_task_weight=params['main_task_weight'],
                  parameter_init=params['weight_init'],
                  clip_grad_norm=params['clip_grad_norm'],
                  clip_activation=params['clip_activation'],
                  num_proj=params['num_proj'],
                  weight_decay=params['weight_decay'])

    model.model_name = params['model']
    model.model_name += '_' + str(params['num_units'])
    model.model_name += '_main' + str(params['num_layer_main'])
    model.model_name += '_sub' + str(params['num_layer_sub'])
    model.model_name += '_' + params['optimizer']
    model.model_name += '_lr' + str(params['learning_rate'])
    if params['bottleneck_dim'] != 0:
        model.model_name += '_bottleneck' + str(params['bottleneck_dim'])
    if params['num_proj'] != 0:
        model.model_name += '_proj' + str(params['num_proj'])
    if params['num_stack'] != 1:
        model.model_name += '_stack' + str(params['num_stack'])
    if params['weight_decay'] != 0:
        model.model_name += '_weightdecay' + str(params['weight_decay'])
    model.model_name += '_taskweight' + str(params['main_task_weight'])
    if params['train_data_size'] == 'large':
        model.model_name += '_large'

    # Set save path
    model.save_path = mkdir(model_save_path)
    model.save_path = mkdir_join(model.save_path, 'ctc')
    model.save_path = mkdir_join(
        model.save_path,
        params['label_type_main'] + '_' + params['label_type_sub'])
    model.save_path = mkdir_join(model.save_path, model.model_name)

    # Reset model directory
    if not isfile(join(model.save_path, 'complete.txt')):
        tf.gfile.DeleteRecursively(model.save_path)
        tf.gfile.MakeDirs(model.save_path)
    else:
        raise ValueError('complete.txt exists: training has already been finished.')

    # Set process name
    setproctitle('csj_multictc_' + params['label_type_main'] + '_' +
                 params['label_type_sub'] + '_' + params['train_data_size'])

    # Save config file
    shutil.copyfile(config_path, join(model.save_path, 'config.yml'))

    sys.stdout = open(join(model.save_path, 'train.log'), 'w')
    do_train(model=model, params=params)
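main() above expects a YAML file with a top-level param block containing every key it reads. A minimal sketch of such a config, parsed with yaml.safe_load; the values here are placeholders, not the values used by the project:

import yaml

config_str = """
param:
  model: multitask_blstm_ctc
  label_type_main: kanji
  label_type_sub: kana
  batch_size: 32
  input_size: 40
  splice: 1
  num_stack: 2
  num_units: 320
  num_layer_main: 5
  num_layer_sub: 3
  main_task_weight: 0.8
  optimizer: adam
  learning_rate: 0.001
  bottleneck_dim: 0
  num_proj: 0
  weight_init: 0.1
  clip_grad_norm: 5.0
  clip_activation: 50
  weight_decay: 0.0
  train_data_size: large
"""
params = yaml.safe_load(config_str)['param']
assert params['label_type_main'] in ['kanji', 'kana']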
Example #30
0
def main():

    for data_size in ['train_si84', 'train_si284']:
        print('=' * 100)
        print(' ' * 40 + data_size)
        print('=' * 100)
        for data_type in [data_size, 'test_dev93', 'test_eval92']:
            print('=' * 50)
            print(' ' * 20 + data_type)
            print('=' * 50)

            # Convert transcript to index
            print('=> Processing transcripts...')
            trans_dict = read_text(
                text_path=join(args.data_save_path, data_type, 'text'),
                vocab_save_path=mkdir_join(
                    args.data_save_path, 'vocab', data_size),
                data_type=data_type,
                lexicon_path=None)

            # Make dataset file (.csv)
            print('=> Saving dataset files...')
            csv_save_path = mkdir_join(
                args.data_save_path, 'dataset', args.tool, data_size, data_type)

            df_columns = ['frame_num', 'input_path', 'transcript']
            df_word = pd.DataFrame([], columns=df_columns)
            df_char = pd.DataFrame([], columns=df_columns)
            df_char_capital = pd.DataFrame([], columns=df_columns)

            with open(
                    join(args.data_save_path, 'feature', args.tool, data_size,
                         data_type, 'frame_num.pickle'), 'rb') as f:
                frame_num_dict = pickle.load(f)

            utt_count = 0
            df_word_list = []
            df_char_list, df_char_capital_list = [], []
            for utt_idx, trans in tqdm(trans_dict.items()):
                speaker = utt_idx[:3]
                feat_utt_save_path = join(
                    args.data_save_path, 'feature', args.tool, data_size, data_type,
                    speaker, utt_idx + '.npy')
                frame_num = frame_num_dict[utt_idx]

                if not isfile(feat_utt_save_path):
                    raise ValueError('There is no file: %s' %
                                     feat_utt_save_path)

                df_word = add_element(
                    df_word, [frame_num, feat_utt_save_path, trans['word']])
                df_char = add_element(
                    df_char, [frame_num, feat_utt_save_path, trans['char']])
                df_char_capital = add_element(
                    df_char_capital, [frame_num, feat_utt_save_path, trans['char_capital']])
                utt_count += 1

                # Reset
                if utt_count == 10000:
                    df_word_list.append(df_word)
                    df_char_list.append(df_char)
                    df_char_capital_list.append(df_char_capital)

                    df_word = pd.DataFrame([], columns=df_columns)
                    df_char = pd.DataFrame([], columns=df_columns)
                    df_char_capital = pd.DataFrame([], columns=df_columns)

                    utt_count = 0

            # Last dataframe
            df_word_list.append(df_word)
            df_char_list.append(df_char)
            df_char_capital_list.append(df_char_capital)

            # Concatenate all dataframes
            df_word = pd.concat(df_word_list, axis=0)
            df_char = pd.concat(df_char_list, axis=0)
            df_char_capital = pd.concat(df_char_capital_list, axis=0)

            df_word.to_csv(
                join(csv_save_path, 'word.csv'), encoding='utf-8')
            df_char.to_csv(
                join(csv_save_path, 'character.csv'), encoding='utf-8')
            df_char_capital.to_csv(
                join(csv_save_path, 'character_capital_divide.csv'), encoding='utf-8')
def main():
    print('=> Processing input data...')
    for data_size in ['train_si84', 'train_si284']:
        for data_type in [data_size, 'test_dev93', 'test_eval92']:

            print('===> %s' % data_type)
            feature_save_path = mkdir_join(
                args.data_save_path, 'feature', args.tool, data_size, data_type)

            utt_indices = []
            with open(join(args.data_save_path, data_type, 'text'), 'r') as f:
                for line in f:
                    line = line.strip()
                    utt_idx = line.split(' ')[0]
                    utt_indices.append(utt_idx)

            audio_paths = []
            if args.tool == 'htk':
                with open(join(args.data_save_path, data_type, 'htk.scp'), 'r') as f:
                    for line in f:
                        htk_path = line.strip()
                        audio_paths.append(htk_path)
            else:
                with open(join(args.data_save_path, data_type, 'wav.scp'), 'r') as f:
                    for line in f:
                        line = line.strip()
                        wav_path = line.split(' ')[4]
                        audio_paths.append(wav_path)

            spk2gender = {}
            with open(join(args.data_save_path, data_type, 'spk2gender'), 'r') as f:
                for line in f:
                    line = line.strip()
                    speaker, gender = line.split(' ')
                    spk2gender[speaker] = gender

            if 'train' in data_type:
                global_mean_male, global_std_male = None, None
                global_mean_female, global_std_female = None, None
            else:
                # Load statistics over train dataset
                global_mean_male = np.load(
                    join(args.data_save_path, 'feature', args.tool, data_size, data_size, 'global_mean_male.npy'))
                global_std_male = np.load(
                    join(args.data_save_path, 'feature', args.tool, data_size, data_size, 'global_std_male.npy'))
                global_mean_female = np.load(
                    join(args.data_save_path, 'feature', args.tool, data_size, data_size, 'global_mean_female.npy'))
                global_std_female = np.load(
                    join(args.data_save_path, 'feature', args.tool, data_size, data_size, 'global_std_female.npy'))

            read_audio(data_type=data_type,
                       audio_paths=audio_paths,
                       spk2gender=spk2gender,
                       tool=args.tool,
                       config=CONFIG,
                       normalize=args.normalize,
                       save_path=feature_save_path,
                       global_mean_male=global_mean_male,
                       global_std_male=global_std_male,
                       global_mean_female=global_mean_female,
                       global_std_female=global_std_female)
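For the dev/test splits, the script reloads the per-gender mean/stddev computed on the training set, and read_audio applies them per utterance. The arithmetic itself, shown on stand-in arrays (the real code loads the four .npy files referenced above):

import numpy as np

feat_dim = 40
feat_utt = np.random.randn(200, feat_dim).astype(np.float32)  # [T, feat_dim]

# Stand-ins for global_mean_male.npy / global_std_male.npy
global_mean_male = np.zeros(feat_dim, dtype=np.float32)
global_std_male = np.ones(feat_dim, dtype=np.float32)

gender = 'M'  # from the spk2gender file
if gender == 'M':
    feat_utt = (feat_utt - global_mean_male) / global_std_male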
Example #32
0
def main():

    args = parser.parse_args()

    ##################################################
    # DATASET
    ##################################################
    if args.model_save_path is not None:
        # Load a config file (.yml)
        params = load_config(args.config_path)
    # NOTE: Retrain the saved model from the last checkpoint
    elif args.saved_model_path is not None:
        params = load_config(os.path.join(args.saved_model_path, 'config.yml'))
    else:
        raise ValueError("Set model_save_path or saved_model_path.")

    # Load dataset
    train_data = Dataset(data_save_path=args.data_save_path,
                         backend=params['backend'],
                         input_channel=params['input_channel'],
                         use_delta=params['use_delta'],
                         use_double_delta=params['use_double_delta'],
                         data_type='train',
                         data_size=params['data_size'],
                         label_type=params['label_type'],
                         batch_size=params['batch_size'],
                         max_epoch=params['num_epoch'],
                         splice=params['splice'],
                         num_stack=params['num_stack'],
                         num_skip=params['num_skip'],
                         sort_utt=True,
                         sort_stop_epoch=params['sort_stop_epoch'],
                         tool=params['tool'],
                         num_enque=None,
                         dynamic_batching=params['dynamic_batching'])
    dev_clean_data = Dataset(data_save_path=args.data_save_path,
                             backend=params['backend'],
                             input_channel=params['input_channel'],
                             use_delta=params['use_delta'],
                             use_double_delta=params['use_double_delta'],
                             data_type='dev_clean',
                             data_size=params['data_size'],
                             label_type=params['label_type'],
                             batch_size=params['batch_size'],
                             splice=params['splice'],
                             num_stack=params['num_stack'],
                             num_skip=params['num_skip'],
                             shuffle=True,
                             tool=params['tool'])
    dev_other_data = Dataset(data_save_path=args.data_save_path,
                             backend=params['backend'],
                             input_channel=params['input_channel'],
                             use_delta=params['use_delta'],
                             use_double_delta=params['use_double_delta'],
                             data_type='dev_other',
                             data_size=params['data_size'],
                             label_type=params['label_type'],
                             batch_size=params['batch_size'],
                             splice=params['splice'],
                             num_stack=params['num_stack'],
                             num_skip=params['num_skip'],
                             shuffle=True,
                             tool=params['tool'])
    test_clean_data = Dataset(data_save_path=args.data_save_path,
                              backend=params['backend'],
                              input_channel=params['input_channel'],
                              use_delta=params['use_delta'],
                              use_double_delta=params['use_double_delta'],
                              data_type='test_clean',
                              data_size=params['data_size'],
                              label_type=params['label_type'],
                              batch_size=params['batch_size'],
                              splice=params['splice'],
                              num_stack=params['num_stack'],
                              num_skip=params['num_skip'],
                              tool=params['tool'])
    test_other_data = Dataset(data_save_path=args.data_save_path,
                              backend=params['backend'],
                              input_channel=params['input_channel'],
                              use_delta=params['use_delta'],
                              use_double_delta=params['use_double_delta'],
                              data_type='test_other',
                              data_size=params['data_size'],
                              label_type=params['label_type'],
                              batch_size=params['batch_size'],
                              splice=params['splice'],
                              num_stack=params['num_stack'],
                              num_skip=params['num_skip'],
                              tool=params['tool'])

    params['num_classes'] = train_data.num_classes

    ##################################################
    # MODEL
    ##################################################
    # Model setting
    model = load(model_type=params['model_type'],
                 params=params,
                 backend=params['backend'])

    if args.model_save_path is not None:

        # Set save path
        save_path = mkdir_join(args.model_save_path, params['backend'],
                               params['model_type'], params['label_type'],
                               params['data_size'], model.name)
        model.set_save_path(save_path)

        # Save config file
        save_config(config_path=args.config_path, save_path=model.save_path)

        # Setting for logging
        logger = set_logger(model.save_path)

        if os.path.isdir(params['char_init']):
            # NOTE: Start training from the pre-trained character model
            model.load_checkpoint(save_path=params['char_init'],
                                  epoch=-1,
                                  load_pretrained_model=True)

        # Count total parameters
        for name in sorted(list(model.num_params_dict.keys())):
            num_params = model.num_params_dict[name]
            logger.info("%s %d" % (name, num_params))
        logger.info("Total %.3f M parameters" %
                    (model.total_parameters / 1000000))

        # Define optimizer
        model.set_optimizer(optimizer=params['optimizer'],
                            learning_rate_init=float(params['learning_rate']),
                            weight_decay=float(params['weight_decay']),
                            clip_grad_norm=params['clip_grad_norm'],
                            lr_schedule=False,
                            factor=params['decay_rate'],
                            patience_epoch=params['decay_patient_epoch'])

        epoch, step = 1, 0
        learning_rate = float(params['learning_rate'])
        metric_dev_best = 1

    # NOTE: Retrain the saved model from the last checkpoint
    elif args.saved_model_path is not None:

        # Set save path
        model.save_path = args.saved_model_path

        # Setting for logging
        logger = set_logger(model.save_path, restart=True)

        # Define optimizer
        model.set_optimizer(
            optimizer=params['optimizer'],
            learning_rate_init=float(params['learning_rate']),  # on-the-fly
            weight_decay=float(params['weight_decay']),
            clip_grad_norm=params['clip_grad_norm'],
            lr_schedule=False,
            factor=params['decay_rate'],
            patience_epoch=params['decay_patient_epoch'])

        # Restore the last saved model
        epoch, step, learning_rate, metric_dev_best = model.load_checkpoint(
            save_path=args.saved_model_path, epoch=-1, restart=True)

    else:
        raise ValueError("Set model_save_path or saved_model_path.")

    train_data.epoch = epoch - 1

    # GPU setting
    model.set_cuda(deterministic=False, benchmark=True)

    logger.info('PID: %s' % os.getpid())
    logger.info('HOSTNAME: %s' % os.uname()[1])

    # Set process name
    setproctitle('libri_' + params['backend'] + '_' + params['model_type'] +
                 '_' + params['label_type'] + '_' + params['data_size'])

    ##################################################
    # TRAINING LOOP
    ##################################################
    # Define learning rate controller
    lr_controller = Controller(
        learning_rate_init=learning_rate,
        backend=params['backend'],
        decay_start_epoch=params['decay_start_epoch'],
        decay_rate=params['decay_rate'],
        decay_patient_epoch=params['decay_patient_epoch'],
        lower_better=True)

    # Setting for tensorboard
    if params['backend'] == 'pytorch':
        tf_writer = SummaryWriter(model.save_path)

    # Train model
    csv_steps, csv_loss_train, csv_loss_dev = [], [], []
    start_time_train = time.time()
    start_time_epoch = time.time()
    start_time_step = time.time()
    not_improved_epoch = 0
    best_model = model
    loss_train_mean = 0.
    pbar_epoch = tqdm(total=len(train_data))
    while True:
        # Compute loss in the training set (including parameter update)
        batch_train, is_new_epoch = train_data.next()
        model, loss_train_val = train_step(model,
                                           batch_train,
                                           params['clip_grad_norm'],
                                           backend=params['backend'])
        loss_train_mean += loss_train_val

        pbar_epoch.update(len(batch_train['xs']))

        if (step + 1) % params['print_step'] == 0:

            # Compute loss in the dev set
            batch_dev = dev_clean_data.next()[0]
            loss_dev = model(batch_dev['xs'],
                             batch_dev['ys'],
                             batch_dev['x_lens'],
                             batch_dev['y_lens'],
                             is_eval=True)

            loss_train_mean /= params['print_step']
            csv_steps.append(step)
            csv_loss_train.append(loss_train_mean)
            csv_loss_dev.append(loss_dev)

            # Logging by tensorboard
            if params['backend'] == 'pytorch':
                tf_writer.add_scalar('train/loss', loss_train_mean, step + 1)
                tf_writer.add_scalar('dev/loss', loss_dev, step + 1)
                for name, param in model.named_parameters():
                    name = name.replace('.', '/')
                    tf_writer.add_histogram(name,
                                            param.data.cpu().numpy(), step + 1)
                    tf_writer.add_histogram(name + '/grad',
                                            param.grad.data.cpu().numpy(),
                                            step + 1)

            duration_step = time.time() - start_time_step
            logger.info(
                "...Step:%d(epoch:%.3f) loss:%.3f(%.3f)/lr:%.5f/batch:%d/x_lens:%d (%.3f min)"
                % (step + 1, train_data.epoch_detail, loss_train_mean,
                   loss_dev, learning_rate, train_data.current_batch_size,
                   max(batch_train['x_lens']) * params['num_stack'],
                   duration_step / 60))
            start_time_step = time.time()
            loss_train_mean = 0.
        step += 1

        # Save checkpoint and evaluate model per epoch
        if is_new_epoch:
            duration_epoch = time.time() - start_time_epoch
            logger.info('===== EPOCH:%d (%.3f min) =====' %
                        (epoch, duration_epoch / 60))

            # Save figure of loss
            plot_loss(csv_loss_train,
                      csv_loss_dev,
                      csv_steps,
                      save_path=model.save_path)

            if epoch < params['eval_start_epoch']:
                # Save the model
                model.save_checkpoint(model.save_path, epoch, step,
                                      learning_rate, metric_dev_best)
            else:
                start_time_eval = time.time()
                # dev
                if 'word' in params['label_type']:
                    metric_dev_epoch, _ = do_eval_wer(
                        models=[model],
                        dataset=dev_clean_data,
                        beam_width=1,
                        max_decode_len=MAX_DECODE_LEN_WORD,
                        eval_batch_size=1)
                    logger.info('  WER (dev-clean): %.3f %%' %
                                (metric_dev_epoch * 100))
                else:
                    metric_dev_epoch, wer_dev_clean_epoch, _ = do_eval_cer(
                        models=[model],
                        dataset=dev_clean_data,
                        beam_width=1,
                        max_decode_len=MAX_DECODE_LEN_CHAR,
                        eval_batch_size=1)
                    logger.info('  CER / WER (dev-clean): %.3f %% / %.3f %%' %
                                ((metric_dev_epoch * 100),
                                 (wer_dev_clean_epoch * 100)))

                if metric_dev_epoch < metric_dev_best:
                    metric_dev_best = metric_dev_epoch
                    not_improved_epoch = 0
                    best_model = copy.deepcopy(model)
                    logger.info('||||| Best Score |||||')

                    # Save the model
                    model.save_checkpoint(model.save_path, epoch, step,
                                          learning_rate, metric_dev_best)

                    # dev-other & test
                    if 'word' in params['label_type']:
                        metric_dev_other_epoch, _ = do_eval_wer(
                            models=[model],
                            dataset=dev_other_data,
                            beam_width=1,
                            max_decode_len=MAX_DECODE_LEN_WORD,
                            eval_batch_size=1)
                        logger.info('  WER (dev-other): %.3f %%' %
                                    (metric_dev_other_epoch * 100))

                        wer_test_clean, _ = do_eval_wer(
                            models=[model],
                            dataset=test_clean_data,
                            beam_width=1,
                            max_decode_len=MAX_DECODE_LEN_WORD,
                            eval_batch_size=1)
                        logger.info('  WER (test-clean): %.3f %%' %
                                    (wer_test_clean * 100))

                        wer_test_other, _ = do_eval_wer(
                            models=[model],
                            dataset=test_other_data,
                            beam_width=1,
                            max_decode_len=MAX_DECODE_LEN_WORD,
                            eval_batch_size=1)
                        logger.info('  WER (test-other): %.3f %%' %
                                    (wer_test_other * 100))

                        logger.info(
                            '  WER (test-mean): %.3f %%' %
                            ((wer_test_clean + wer_test_other) * 100 / 2))
                    else:
                        metric_dev_other_epoch, wer_dev_other_epoch, _ = do_eval_cer(
                            models=[model],
                            dataset=dev_other_data,
                            beam_width=1,
                            max_decode_len=MAX_DECODE_LEN_CHAR,
                            eval_batch_size=1)
                        logger.info(
                            '  CER / WER (dev-other): %.3f %% / %.3f %%' %
                            ((metric_dev_other_epoch * 100),
                             (wer_dev_other_epoch * 100)))

                        cer_test_clean, wer_test_clean, _ = do_eval_cer(
                            models=[model],
                            dataset=test_clean_data,
                            beam_width=1,
                            max_decode_len=MAX_DECODE_LEN_CHAR,
                            eval_batch_size=1)
                        logger.info(
                            '  CER / WER (test-clean): %.3f %% / %.3f %%' %
                            ((cer_test_clean * 100), (wer_test_clean * 100)))

                        cer_test_other, wer_test_other, _ = do_eval_cer(
                            models=[model],
                            dataset=test_other_data,
                            beam_width=1,
                            max_decode_len=MAX_DECODE_LEN_CHAR,
                            eval_batch_size=1)
                        logger.info(
                            '  CER / WER (test-other): %.3f %% / %.3f %%' %
                            ((cer_test_other * 100), (wer_test_other * 100)))

                        logger.info(
                            '  CER / WER (test-mean): %.3f %% / %.3f %%' %
                            (((cer_test_clean + cer_test_other) * 100 / 2),
                             ((wer_test_clean + wer_test_other) * 100 / 2)))

                else:
                    not_improved_epoch += 1

                duration_eval = time.time() - start_time_eval
                logger.info('Evaluation time: %.3f min' % (duration_eval / 60))

                # Early stopping
                if not_improved_epoch == params['not_improved_patient_epoch']:
                    break

                # Update learning rate
                model.optimizer, learning_rate = lr_controller.decay_lr(
                    optimizer=model.optimizer,
                    learning_rate=learning_rate,
                    epoch=epoch,
                    value=metric_dev_epoch)

                if epoch == params['convert_to_sgd_epoch']:
                    # Convert to fine-tuning stage
                    model.set_optimizer(
                        'sgd',
                        learning_rate_init=learning_rate,
                        weight_decay=float(params['weight_decay']),
                        clip_grad_norm=params['clip_grad_norm'],
                        lr_schedule=False,
                        factor=params['decay_rate'],
                        patience_epoch=params['decay_patient_epoch'])
                    logger.info('========== Convert to SGD ==========')

                    # Inject Gaussian noise to all parameters
                    if float(params['weight_noise_std']) > 0:
                        model.weight_noise_injection = True

            pbar_epoch = tqdm(total=len(train_data))
            print('========== EPOCH:%d (%.3f min) ==========' %
                  (epoch, duration_epoch / 60))

            if epoch == params['num_epoch']:
                break

            start_time_step = time.time()
            start_time_epoch = time.time()
            epoch += 1

    # TODO: evaluate the best model by beam search here

    duration_train = time.time() - start_time_train
    logger.info('Total time: %.3f hour' % (duration_train / 3600))

    if params['backend'] == 'pytorch':
        tf_writer.close()

    # Training was finished correctly
    with open(os.path.join(model.save_path, 'COMPLETE'), 'w') as f:
        f.write('')
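The loop above couples early stopping (not_improved_epoch) with a Controller that lowers the learning rate once the dev metric plateaus. A self-contained sketch of that decay-on-plateau policy; the project's Controller also honors decay_start_epoch and the backend, which this simplification omits:

def decay_on_plateau(lr, metric, state, factor=0.5, patience=1):
    """Decay lr by `factor` after `patience` consecutive epochs without
    improvement on a lower-is-better metric. `state` = (best, bad_epochs)."""
    best, bad_epochs = state
    if metric < best:
        return lr, (metric, 0)          # improved: keep lr, reset counter
    bad_epochs += 1
    if bad_epochs > patience:
        return lr * factor, (best, 0)   # plateaued: decay and reset
    return lr, (best, bad_epochs)

lr, state = 1e-3, (float('inf'), 0)
for dev_wer in [0.30, 0.25, 0.26, 0.27, 0.24]:
    lr, state = decay_on_plateau(lr, dev_wer, state)
    print(lr, state)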
def do_save(model, params, epoch, eval_batch_size, temperature):
    """Save the CTC outputs.
    Args:
        model: the model to restore
        params (dict): A dictionary of parameters
        epoch (int): the epoch to restore
        eval_batch_size (int): the size of mini-batch in evaluation
        temperature (int): softmax temperature applied to the logits
    """
    print('=' * 30)
    print('  frame stack %d' % int(params['num_stack']))
    print('  splice %d' % int(params['splice']))
    print('  temperature (training): %d' % temperature)
    print('=' * 30)

    # Load dataset
    train_data = Dataset(
        data_type='train', train_data_size=params['train_data_size'],
        label_type=params['label_type'],
        batch_size=(params['batch_size']
                    if eval_batch_size == -1 else eval_batch_size),
        max_epoch=3, splice=params['splice'],
        num_stack=params['num_stack'], num_skip=params['num_skip'],
        shuffle=True, num_gpu=1)
    dev_clean_data = Dataset(
        data_type='dev_clean', train_data_size=params['train_data_size'],
        label_type=params['label_type'],
        batch_size=(params['batch_size']
                    if eval_batch_size == -1 else eval_batch_size),
        max_epoch=3, splice=params['splice'],
        num_stack=params['num_stack'], num_skip=params['num_skip'],
        shuffle=True, num_gpu=1)
    dev_other_data = Dataset(
        data_type='dev_other', train_data_size=params['train_data_size'],
        label_type=params['label_type'],
        batch_size=(params['batch_size']
                    if eval_batch_size == -1 else eval_batch_size),
        max_epoch=3, splice=params['splice'],
        num_stack=params['num_stack'], num_skip=params['num_skip'],
        shuffle=True, num_gpu=1)
    test_clean_data = Dataset(
        data_type='test_clean', train_data_size=params['train_data_size'],
        label_type=params['label_type'],
        batch_size=(params['batch_size']
                    if eval_batch_size == -1 else eval_batch_size),
        max_epoch=3, splice=params['splice'],
        num_stack=params['num_stack'], num_skip=params['num_skip'],
        shuffle=True, num_gpu=1)
    test_other_data = Dataset(
        data_type='test_other', train_data_size=params['train_data_size'],
        label_type=params['label_type'],
        batch_size=(params['batch_size']
                    if eval_batch_size == -1 else eval_batch_size),
        max_epoch=3, splice=params['splice'],
        num_stack=params['num_stack'], num_skip=params['num_skip'],
        shuffle=True, num_gpu=1)

    with tf.name_scope('tower_gpu0'):
        # Define placeholders
        model.create_placeholders()

        # Add to the graph each operation (including model definition)
        _, logits = model.compute_loss(
            model.inputs_pl_list[0],
            model.labels_pl_list[0],
            model.inputs_seq_len_pl_list[0],
            model.keep_prob_pl_list[0])
        logits /= temperature
        posteriors_op = model.posteriors(logits, blank_prior=1)

    # Create a saver for writing training checkpoints
    saver = tf.train.Saver()

    with tf.Session() as sess:
        ckpt = tf.train.get_checkpoint_state(model.save_path)

        # If a checkpoint exists
        if ckpt:
            model_path = ckpt.model_checkpoint_path
            if epoch != -1:
                model_path = model_path.split('/')[:-1]
                model_path = '/'.join(model_path) + '/model.ckpt-' + str(epoch)
            saver.restore(sess, model_path)
            print("Model restored: " + model_path)
        else:
            raise ValueError('There are no checkpoints.')

        #########################
        # Save soft targets
        #########################
        # train100h
        # save(session=sess,
        #      posteriors_op=posteriors_op,
        #      model=model,
        #      dataset=train_data,
        #      data_type='train',
        #      num_stack=params['num_stack'],
        #      save_prob=False,
        #      save_soft_targets=True,
        #      save_path=mkdir_join(model.save_path, 'temp' + str(temperature), 'train'))

        # dev
        # save(session=sess,
        #      posteriors_op=posteriors_op,
        #      model=model,
        #      dataset=dev_clean_data,
        #      data_type='dev_clean',
        #      num_stack=params['num_stack'],
        #      save_prob=False,
        #      save_soft_targets=True,
        #      save_path=mkdir_join(model.save_path, 'temp' + str(temperature), 'dev_clean'))
        # save(session=sess,
        #      posteriors_op=posteriors_op,
        #      model=model,
        #      dataset=dev_other_data,
        #      data_type='dev_other',
        #      num_stack=params['num_stack'],
        #      save_prob=False,
        #      save_soft_targets=True,
        #      save_path=mkdir_join(model.save_path, 'temp' + str(temperature), 'dev_other'))

        # test
        save(session=sess,
             posteriors_op=posteriors_op,
             model=model,
             dataset=test_clean_data,
             data_type='test_clean',
             num_stack=params['num_stack'],
             save_prob=True,
             save_soft_targets=False,
             save_path=mkdir_join(model.save_path, 'temp' + str(temperature), 'test_clean'))
        save(session=sess,
             posteriors_op=posteriors_op,
             model=model,
             dataset=test_other_data,
             data_type='test_other',
             num_stack=params['num_stack'],
             save_prob=True,
             save_soft_targets=False,
             save_path=mkdir_join(model.save_path, 'temp' + str(temperature), 'test_other'))
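The `logits /= temperature` step above is the standard temperature scaling used when producing soft targets for distillation: dividing the logits by T > 1 flattens the posterior the softmax produces. The transformation in isolation, as a numpy sketch:

import numpy as np

def softmax_with_temperature(logits, T=1.0):
    z = logits / T
    z -= z.max(axis=-1, keepdims=True)      # for numerical stability
    e = np.exp(z)
    return e / e.sum(axis=-1, keepdims=True)

logits = np.array([2.0, 1.0, 0.1])
print(softmax_with_temperature(logits, T=1.0))  # peaked distribution
print(softmax_with_temperature(logits, T=5.0))  # flatter soft targets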
def read_audio(data_type,
               spk2audio,
               segment_dict,
               tool,
               config,
               normalize,
               save_path,
               global_mean_male=None,
               global_std_male=None,
               global_mean_female=None,
               global_std_female=None,
               dtype=np.float32):
    """Read HTK or WAV files.
    Args:
        data_type (string): the dataset split to process
        spk2audio (dict):
            key (string) => speaker name
            value (string) => path to the audio file of the speaker
        segment_dict (dict):
            key (string) => speaker name
            value (dict) => dictionary of utterance segments of the speaker
        tool (string): the tool to extract features,
            htk or librosa or python_speech_features
        config (dict): a configuration for feature extraction
        normalize (string):
            no => normalization will not be conducted
            global => normalize input features by global mean & stddev over
                      the training set per gender
            speaker => normalize input features by mean & stddev per speaker
            utterance => normalize input features by mean & stddev per
                         utterance
        save_path (string): path to save npy files
        global_mean_male (np.ndarray, optional): global mean of male over the
            training set
        global_std_male (np.ndarray, optional): global standard deviation of
            male over the training set
        global_mean_female (np.ndarray, optional): global mean of female over
            the training set
        global_std_female (np.ndarray, optional): global standard deviation of
            female over the training set
        dtype (optional): the type of data, default is np.float32
    """
    is_training = 'train' in data_type

    if not is_training:
        if global_mean_male is None or global_mean_female is None:
            raise ValueError('Set mean & stddev computed in the training set.')
    if normalize not in ['global', 'speaker', 'utterance', 'no']:
        raise ValueError(
            'normalize must be "utterance" or "speaker" or "global" or "no".')
    if tool not in ['htk', 'python_speech_features', 'librosa']:
        raise TypeError(
            'tool must be "htk" or "python_speech_features" or "librosa".')

    audio_paths_male, audio_paths_female = [], []
    total_frame_num_male, total_frame_num_female = 0, 0
    total_frame_num_dict = {}
    speaker_mean_dict = {}

    # NOTE: assume that speakers are different between sessions

    # Loop 1: Computing global mean and statistics
    if is_training and normalize != 'no':
        print('=====> Reading audio files...')
        for i, speaker in enumerate(tqdm(segment_dict.keys())):
            audio_path = spk2audio[speaker]

            # Divide each audio file into utterances
            _, feat_utt_sum, speaker_mean, _, total_frame_num_speaker = segment(
                audio_path,
                speaker,
                segment_dict[speaker],  # dict of utterances
                is_training=True,
                sil_duration=0,
                tool=tool,
                config=config)

            if i == 0:
                # Initialize global statistics
                feat_dim = feat_utt_sum.shape[0]
                global_mean_male = np.zeros((feat_dim, ), dtype=dtype)
                global_mean_female = np.zeros((feat_dim, ), dtype=dtype)
                global_std_male = np.zeros((feat_dim, ), dtype=dtype)
                global_std_female = np.zeros((feat_dim, ), dtype=dtype)

            # For computing global mean
            if speaker[3] == 'M':
                audio_paths_male.append(audio_path)
                global_mean_male += feat_utt_sum
                total_frame_num_male += total_frame_num_speaker
            elif speaker[3] == 'F':
                audio_paths_female.append(audio_path)
                global_mean_female += feat_utt_sum
                total_frame_num_female += total_frame_num_speaker
            else:
                raise ValueError('gender must be M or F.')

            # For computing speaker mean & stddev
            if normalize == 'speaker':
                speaker_mean_dict[speaker] = speaker_mean
                total_frame_num_dict[speaker] = total_frame_num_speaker
                # NOTE: speaker mean is already computed

        print('=====> Computing global mean & stddev...')
        # Compute global mean per gender
        global_mean_male /= total_frame_num_male
        global_mean_female /= total_frame_num_female

        for speaker in tqdm(segment_dict.keys()):
            audio_path = spk2audio[speaker]

            # Divide each audio into utterances
            feat_dict_speaker, _, _, _, _ = segment(audio_path,
                                                    speaker,
                                                    segment_dict[speaker],
                                                    is_training=True,
                                                    sil_duration=0,
                                                    tool=tool,
                                                    config=config)

            # For computing global stddev
            if speaker[3] == 'M':
                for feat_utt in feat_dict_speaker.values():
                    global_std_male += np.sum(np.abs(feat_utt -
                                                     global_mean_male)**2,
                                              axis=0)
            elif speaker[3] == 'F':
                for feat_utt in feat_dict_speaker.values():
                    global_std_female += np.sum(np.abs(feat_utt -
                                                       global_mean_female)**2,
                                                axis=0)
            else:
                raise ValueError('gender must be M or F.')

        # Compute global stddev per gender
        global_std_male = np.sqrt(global_std_male / (total_frame_num_male - 1))
        global_std_female = np.sqrt(global_std_female /
                                    (total_frame_num_female - 1))

        # Save global mean & stddev per gender
        np.save(join(save_path, 'global_mean_male.npy'), global_mean_male)
        np.save(join(save_path, 'global_mean_female.npy'), global_mean_female)
        np.save(join(save_path, 'global_std_male.npy'), global_std_male)
        np.save(join(save_path, 'global_std_female.npy'), global_std_female)

    # Loop 2: Normalization and Saving
    print('=====> Normalization...')
    frame_num_dict = {}
    # sampPeriod, parmKind = None, None
    for speaker in tqdm(segment_dict.keys()):
        audio_path = spk2audio[speaker]

        if normalize == 'speaker' and is_training:
            speaker_mean = speaker_mean_dict[speaker]
        else:
            speaker_mean = None

        # Divide each audio into utterances
        feat_dict_speaker, _, speaker_mean, speaker_std, _ = segment(
            audio_path,
            speaker,
            segment_dict[speaker],
            is_training=is_training,
            sil_duration=0,
            tool=tool,
            config=config,
            mean=speaker_mean)  # for computing speaker stddev
        # NOTE: feat_dict_speaker has not been normalized yet

        for utt_idx, feat_utt in feat_dict_speaker.items():
            if normalize == 'no':
                pass
            elif normalize == 'global' or not is_training:
                # Normalize by mean & stddev over the training set per gender
                if speaker[3] == 'M':
                    feat_utt -= global_mean_male
                    feat_utt /= global_std_male
                elif speaker[3] == 'F':
                    feat_utt -= global_mean_female
                    feat_utt /= global_std_female
                else:
                    raise ValueError('gender must be M or F.')
            elif normalize == 'speaker':
                # Normalize by mean & stddev per speaker
                feat_utt = (feat_utt - speaker_mean) / speaker_std
            elif normalize == 'utterance':
                # Normalize by mean & stddev per utterance
                utt_mean = np.mean(feat_utt, axis=0, dtype=dtype)
                utt_std = np.std(feat_utt, axis=0, dtype=dtype)
                feat_utt = (feat_utt - utt_mean) / utt_std

            frame_num_dict[utt_idx] = feat_utt.shape[0]

            # Save input features
            np.save(mkdir_join(save_path, speaker, utt_idx + '.npy'), feat_utt)

    # Save the frame number dictionary
    with open(join(save_path, 'frame_num.pickle'), 'wb') as f:
        pickle.dump(frame_num_dict, f)
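read_audio computes the global statistics in two passes: pass one sums features to obtain the mean, pass two sums squared deviations to obtain the (N - 1)-normalized stddev. The same arithmetic on toy data, checked against a direct computation (a sketch, not the project code):

import numpy as np

rng = np.random.RandomState(0)
utts = [rng.randn(rng.randint(50, 100), 40) for _ in range(8)]

# Pass 1: global mean from per-utterance feature sums
total_frames = sum(u.shape[0] for u in utts)
mean = sum(u.sum(axis=0) for u in utts) / total_frames

# Pass 2: global stddev from squared deviations around the mean
sq_dev = sum(((u - mean) ** 2).sum(axis=0) for u in utts)
std = np.sqrt(sq_dev / (total_frames - 1))

all_frames = np.concatenate(utts, axis=0)
assert np.allclose(mean, all_frames.mean(axis=0))
assert np.allclose(std, all_frames.std(axis=0, ddof=1))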
def main():

    args = parser.parse_args()

    # Load a config file (.yml)
    params = load_config(join(args.model_path, 'config.yml'), is_eval=True)

    # Load dataset
    dataset = Dataset(
        data_save_path=args.data_save_path,
        backend=params['backend'],
        input_freq=params['input_freq'],
        use_delta=params['use_delta'],
        use_double_delta=params['use_double_delta'],
        data_type='eval1',
        # data_type='eval2',
        # data_type='eval3',
        data_size=params['data_size'],
        label_type=params['label_type'],
        label_type_sub=params['label_type_sub'],
        batch_size=args.eval_batch_size,
        splice=params['splice'],
        num_stack=params['num_stack'],
        num_skip=params['num_skip'],
        sort_utt=False,
        reverse=False,
        tool=params['tool'])

    params['num_classes'] = dataset.num_classes
    params['num_classes_sub'] = dataset.num_classes_sub

    # Load model
    model = load(model_type=params['model_type'],
                 params=params,
                 backend=params['backend'])

    # Restore the saved parameters
    model.load_checkpoint(save_path=args.model_path, epoch=args.epoch)

    # GPU setting
    model.set_cuda(deterministic=False, benchmark=True)

    save_path = mkdir_join(args.model_path, 'att_weights')

    ######################################################################

    # Clean directory
    if save_path is not None and isdir(save_path):
        shutil.rmtree(save_path)
        mkdir(save_path)

    for batch, is_new_epoch in dataset:
        # Decode
        best_hyps, aw, perm_idx = model.decode(
            batch['xs'],
            batch['x_lens'],
            beam_width=args.beam_width,
            max_decode_len=MAX_DECODE_LEN_WORD,
            min_decode_len=MIN_DECODE_LEN_WORD,
            length_penalty=args.length_penalty,
            coverage_penalty=args.coverage_penalty)
        best_hyps_sub, aw_sub, _ = model.decode(
            batch['xs'],
            batch['x_lens'],
            beam_width=args.beam_width_sub,
            max_decode_len=MAX_DECODE_LEN_CHAR,
            min_decode_len=MIN_DECODE_LEN_CHAR,
            length_penalty=args.length_penalty,
            coverage_penalty=args.coverage_penalty,
            task_index=1)

        for b in range(len(batch['xs'])):

            word_list = dataset.idx2word(best_hyps[b], return_list=True)
            char_list = dataset.idx2char(best_hyps_sub[b], return_list=True)

            speaker = batch['input_names'][b].split('_')[0]

            plot_hierarchical_attention_weights(
                aw[b][:len(word_list), :batch['x_lens'][b]],
                aw_sub[b][:len(char_list), :batch['x_lens'][b]],
                label_list=word_list,
                label_list_sub=char_list,
                spectrogram=batch['xs'][b, :, :dataset.input_freq],
                save_path=mkdir_join(save_path, speaker,
                                     batch['input_names'][b] + '.png'),
                figsize=(40, 8))

        if is_new_epoch:
            break
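plot_hierarchical_attention_weights is a project-local helper; absent it, a single trimmed attention matrix such as aw[b][:len(word_list), :x_len] can be rendered with plain matplotlib along the same lines. A sketch on random data standing in for one utterance's weights:

import numpy as np
import matplotlib
matplotlib.use('Agg')  # render to file; no display required
import matplotlib.pyplot as plt

aw_b = np.random.rand(12, 80)  # [output_len, input_len] for one utterance
word_list = ['w%d' % i for i in range(12)]

fig, ax = plt.subplots(figsize=(10, 4))
ax.imshow(aw_b, aspect='auto', origin='lower', interpolation='nearest')
ax.set_yticks(range(len(word_list)))
ax.set_yticklabels(word_list)
ax.set_xlabel('Input frames (after stacking)')
ax.set_ylabel('Output tokens')
fig.savefig('att_weights_example.png')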
def main():

    args = parser.parse_args()

    # Load a config file (.yml)
    params = load_config(join(args.model_path, 'config.yml'), is_eval=True)

    # Load dataset
    dataset = Dataset(data_save_path=args.data_save_path,
                      backend=params['backend'],
                      input_freq=params['input_freq'],
                      use_delta=params['use_delta'],
                      use_double_delta=params['use_double_delta'],
                      data_type='test',
                      label_type=params['label_type'],
                      batch_size=args.eval_batch_size,
                      splice=params['splice'],
                      num_stack=params['num_stack'],
                      num_skip=params['num_skip'],
                      sort_utt=True,
                      reverse=True,
                      tool=params['tool'])

    params['num_classes'] = dataset.num_classes

    # Load model
    model = load(model_type=params['model_type'],
                 params=params,
                 backend=params['backend'])

    # Restore the saved parameters
    model.load_checkpoint(save_path=args.model_path, epoch=args.epoch)

    # GPU setting
    model.set_cuda(deterministic=False, benchmark=True)

    save_path = mkdir_join(args.model_path, 'ctc_probs')

    ######################################################################

    # Clean directory
    if save_path is not None and isdir(save_path):
        shutil.rmtree(save_path)
        mkdir(save_path)

    for batch, is_new_epoch in dataset:
        # Get CTC probs
        probs, x_lens, _ = model.posteriors(batch['xs'],
                                            batch['x_lens'],
                                            temperature=1)
        # NOTE: probs: '[B, T, num_classes]'

        # Visualize
        for b in range(len(batch['xs'])):
            plot_ctc_probs(probs[b, :x_lens[b], :],
                           frame_num=x_lens[b],
                           num_stack=dataset.num_stack,
                           spectrogram=batch['xs'][b, :, :40],
                           save_path=join(save_path,
                                          batch['input_names'][b] + '.png'),
                           figsize=(14, 7))

        if is_new_epoch:
            break
def do_save(model, params, epoch, eval_batch_size):
    """Save the CTC outputs.
    Args:
        model: the model to restore
        params (dict): A dictionary of parameters
        epoch (int): the epoch to restore
        eval_batch_size (int): the size of mini-batch in evaluation
    """
    # Load dataset
    train_data = Dataset(
        data_type='train', train_data_size=params['train_data_size'],
        label_type=params['label_type'],
        batch_size=eval_batch_size,
        splice=params['splice'],
        num_stack=params['num_stack'], num_skip=params['num_skip'],
        sort_utt=True)

    with tf.name_scope('tower_gpu0'):
        # Define placeholders
        model.create_placeholders()

        # Add to the graph each operation (including model definition)
        _, logits = model.compute_loss(
            model.inputs_pl_list[0],
            model.labels_pl_list[0],
            model.inputs_seq_len_pl_list[0],
            model.keep_prob_input_pl_list[0],
            model.keep_prob_hidden_pl_list[0],
            model.keep_prob_output_pl_list[0],
            softmax_temperature=params['softmax_temperature'])
        posteriors_op = model.posteriors(logits, blank_prior=1)

    # Create a saver for writing training checkpoints
    saver = tf.train.Saver()

    with tf.Session() as sess:
        ckpt = tf.train.get_checkpoint_state(model.save_path)

        # If a checkpoint exists
        if ckpt:
            # Use last saved model
            model_path = ckpt.model_checkpoint_path
            if epoch != -1:
                model_path = model_path.split('/')[:-1]
                model_path = '/'.join(model_path) + '/model.ckpt-' + str(epoch)
            saver.restore(sess, model_path)
            print("Model restored: " + model_path)
        else:
            raise ValueError('There are no checkpoints.')

        for data, is_new_epoch in train_data:

            # Create feed dictionary for next mini batch
            inputs, _, inputs_seq_len, input_names = data
            feed_dict = {
                model.inputs_pl_list[0]: inputs[0],
                model.inputs_seq_len_pl_list[0]: inputs_seq_len[0],
                model.keep_prob_input_pl_list[0]: 1.0,
                model.keep_prob_hidden_pl_list[0]: 1.0,
                model.keep_prob_output_pl_list[0]: 1.0
            }

            batch_size, max_frame_num = inputs[0].shape[:2]
            posteriors = sess.run(posteriors_op, feed_dict=feed_dict)
            posteriors = posteriors.reshape(-1,
                                            max_frame_num, model.num_classes)

            for i_batch in range(batch_size):
                prob = posteriors[i_batch][:int(inputs_seq_len[0][i_batch]), :]

                # Save as a npy file
                np.save(mkdir_join(model.save_path, 'probs',
                                   input_names[0][i_batch]), prob)

            if is_new_epoch:
                break
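do_save above writes one [T, num_classes] posterior matrix per utterance under probs/. Reading such files back is a plain np.load; a round-trip sketch with a temporary directory (the file name is illustrative):

import os
import tempfile
import numpy as np

save_dir = tempfile.mkdtemp()
prob = np.random.rand(120, 30).astype(np.float32)  # [T, num_classes]

np.save(os.path.join(save_dir, 'utt001.npy'), prob)
restored = np.load(os.path.join(save_dir, 'utt001.npy'))
assert restored.shape == (120, 30) and restored.dtype == np.float32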
def main(config_path, model_save_path, gpu_indices):

    # Load a config file (.yml)
    with open(config_path, "r") as f:
        config = yaml.safe_load(f)
        params = config['param']

    # Number of classes (excluding <SOS> and <EOS>)
    if params['label_type'] == 'kana':
        params['num_classes'] = 146
    elif params['label_type'] == 'kana_divide':
        params['num_classes'] = 147
    elif params['label_type'] == 'kanji':
        if params['train_data_size'] == 'train_subset':
            params['num_classes'] = 2981
        elif params['train_data_size'] == 'train_fullset':
            params['num_classes'] = 3385
    elif params['label_type'] == 'kanji_divide':
        if params['train_data_size'] == 'train_subset':
            params['num_classes'] = 2982
        elif params['train_data_size'] == 'train_fullset':
            params['num_classes'] = 3386
    else:
        raise TypeError

    # Model setting
    model = AttentionSeq2Seq(
        input_size=params['input_size'] * params['num_stack'],
        encoder_type=params['encoder_type'],
        encoder_num_units=params['encoder_num_units'],
        encoder_num_layers=params['encoder_num_layers'],
        encoder_num_proj=params['encoder_num_proj'],
        attention_type=params['attention_type'],
        attention_dim=params['attention_dim'],
        decoder_type=params['decoder_type'],
        decoder_num_units=params['decoder_num_units'],
        decoder_num_layers=params['decoder_num_layers'],
        embedding_dim=params['embedding_dim'],
        num_classes=params['num_classes'],
        sos_index=params['num_classes'],
        eos_index=params['num_classes'] + 1,
        max_decode_length=params['max_decode_length'],
        lstm_impl='LSTMBlockCell',
        use_peephole=params['use_peephole'],
        parameter_init=params['weight_init'],
        clip_grad_norm=params['clip_grad_norm'],
        clip_activation_encoder=params['clip_activation_encoder'],
        clip_activation_decoder=params['clip_activation_decoder'],
        weight_decay=params['weight_decay'],
        time_major=True,
        sharpening_factor=params['sharpening_factor'],
        logits_temperature=params['logits_temperature'],
        sigmoid_smoothing=params['sigmoid_smoothing'])

    # Set process name
    setproctitle('tf_csj_' + model.name + '_' +
                 params['train_data_size'] + '_' + params['label_type'] + '_' +
                 params['attention_type'])

    model.name = 'en' + str(params['encoder_num_units'])
    model.name += '_' + str(params['encoder_num_layers'])
    model.name += '_att' + str(params['attention_dim'])
    model.name += '_de' + str(params['decoder_num_units'])
    model.name += '_' + str(params['decoder_num_layers'])
    model.name += '_' + params['optimizer']
    model.name += '_lr' + str(params['learning_rate'])
    model.name += '_' + params['attention_type']
    if params['dropout_encoder'] != 0:
        model.name += '_dropen' + str(params['dropout_encoder'])
    if params['dropout_decoder'] != 0:
        model.name += '_dropde' + str(params['dropout_decoder'])
    if params['dropout_embedding'] != 0:
        model.name += '_dropem' + str(params['dropout_embedding'])
    if params['num_stack'] != 1:
        model.name += '_stack' + str(params['num_stack'])
    if params['weight_decay'] != 0:
        model.name += 'wd' + str(params['weight_decay'])
    if params['sharpening_factor'] != 1:
        model.name += '_sharp' + str(params['sharpening_factor'])
    if params['logits_temperature'] != 1:
        model.name += '_temp' + str(params['logits_temperature'])
    if bool(params['sigmoid_smoothing']):
        model.name += '_smoothing'
    if len(gpu_indices) >= 2:
        model.name += '_gpu' + str(len(gpu_indices))

    # Set save path
    model.save_path = mkdir_join(
        model_save_path, 'attention', params['label_type'],
        params['train_data_size'], model.name)

    # Reset model directory
    model_index = 0
    new_model_path = model.save_path
    while True:
        if isfile(join(new_model_path, 'complete.txt')):
            # Training of this model has already been finished
            model_index += 1
            new_model_path = model.save_path + '_' + str(model_index)
        elif isfile(join(new_model_path, 'config.yml')):
            # Training of this model has not been finished yet
            model_index += 1
            new_model_path = model.save_path + '_' + str(model_index)
        else:
            break
    model.save_path = mkdir(new_model_path)

    # Save config file
    shutil.copyfile(config_path, join(model.save_path, 'config.yml'))

    sys.stdout = open(join(model.save_path, 'train.log'), 'w')
    # TODO(hirofumi): change to logger
    do_train(model=model, params=params, gpu_indices=gpu_indices)
def main(config_path, model_save_path):

    # Load a config file (.yml)
    with open(config_path, "r") as f:
        config = yaml.safe_load(f)
        params = config['param']

    # Number of classes (excluding a blank class)
    if params['label_type_main'] == 'character':
        params['num_classes_main'] = 28
    elif params['label_type_main'] == 'character_capital_divide':
        params['num_classes_main'] = 72
    else:
        raise TypeError

    if params['label_type_sub'] == 'phone61':
        params['num_classes_sub'] = 61
    elif params['label_type_sub'] == 'phone48':
        params['num_classes_sub'] = 48
    elif params['label_type_sub'] == 'phone39':
        params['num_classes_sub'] = 39
    else:
        raise TypeError

    # Model setting
    model = Multitask_CTC(encoder_type=params['encoder_type'],
                          input_size=params['input_size'] *
                          params['num_stack'],
                          num_units=params['num_units'],
                          num_layers_main=params['num_layers_main'],
                          num_layers_sub=params['num_layers_sub'],
                          num_classes_main=params['num_classes_main'],
                          num_classes_sub=params['num_classes_sub'],
                          main_task_weight=params['main_task_weight'],
                          lstm_impl=params['lstm_impl'],
                          use_peephole=params['use_peephole'],
                          parameter_init=params['weight_init'],
                          clip_grad=params['clip_grad'],
                          clip_activation=params['clip_activation'],
                          num_proj=params['num_proj'],
                          weight_decay=params['weight_decay'])

    # Set process name
    setproctitle('timit_' + model.name + '_' + params['label_type_main'] +
                 '_' + params['label_type_sub'])

    model.name += '_' + str(params['num_units'])
    model.name += '_main' + str(params['num_layers_main'])
    model.name += '_sub' + str(params['num_layers_sub'])
    model.name += '_' + params['optimizer']
    model.name += '_lr' + str(params['learning_rate'])
    if params['num_proj'] != 0:
        model.name += '_proj' + str(params['num_proj'])
    if params['dropout_input'] != 1:
        model.name += '_dropi' + str(params['dropout_input'])
    if params['dropout_hidden'] != 1:
        model.name += '_droph' + str(params['dropout_hidden'])
    if params['num_stack'] != 1:
        model.name += '_stack' + str(params['num_stack'])
    if params['weight_decay'] != 0:
        model.name += '_wd' + str(params['weight_decay'])
    model.name += '_main' + str(params['main_task_weight'])

    # Set save path
    model.save_path = mkdir_join(model_save_path, 'ctc',
                                 'char_' + params['label_type_sub'],
                                 model.name)

    # Reset model directory
    model_index = 0
    new_model_path = model.save_path
    while True:
        if isfile(join(new_model_path, 'complete.txt')):
            # Training of this model has already finished
            model_index += 1
            new_model_path = model.save_path + '_' + str(model_index)
        elif isfile(join(new_model_path, 'config.yml')):
            # Training of this model was started but has not finished yet
            model_index += 1
            new_model_path = model.save_path + '_' + str(model_index)
        else:
            break
    model.save_path = mkdir(new_model_path)

    # Save config file
    shutil.copyfile(config_path, join(model.save_path, 'config.yml'))

    sys.stdout = open(join(model.save_path, 'train.log'), 'w')
    # TODO(hirofumi): change to logger
    do_train(model=model, params=params)
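main() above only consumes config['param']; the key names in the sketch below are taken from the params[...] reads in the script, while every value (including 'blstm' for encoder_type) is an illustrative guess, and do_train presumably reads further keys (batch size, number of epochs) that are not shown here:

import yaml

params = {
    'label_type_main': 'character',   # or 'character_capital_divide'
    'label_type_sub': 'phone61',      # or 'phone48' / 'phone39'
    'encoder_type': 'blstm',          # assumed value
    'input_size': 40,
    'num_stack': 3,
    'num_units': 256,
    'num_layers_main': 5,
    'num_layers_sub': 3,
    'main_task_weight': 0.8,
    'lstm_impl': 'LSTMBlockCell',
    'use_peephole': True,
    'weight_init': 0.1,
    'clip_grad': 5.0,
    'clip_activation': 50.0,
    'num_proj': 0,
    'weight_decay': 0.0,
    'optimizer': 'adam',
    'learning_rate': 1e-3,
    'dropout_input': 1.0,
    'dropout_hidden': 0.8,
}
with open('config.yml', 'w') as f:
    yaml.safe_dump({'param': params}, f)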
Example #40
def main(config_path, gpu_indices):

    # Load a config file (.yml)
    with open(config_path, "r") as f:
        # yaml.load() without an explicit Loader is deprecated (removed in
        # PyYAML 6); safe_load is sufficient for a plain config file
        config = yaml.safe_load(f)
        corpus = config['corpus']
        feature = config['feature']
        param = config['param']

    if corpus['label_type'] == 'phone61':
        output_size = 61
    elif corpus['label_type'] == 'phone48':
        output_size = 48
    elif corpus['label_type'] == 'phone39':
        output_size = 39
    elif corpus['label_type'] == 'character':
        output_size = 30
    else:
        raise ValueError('Unknown label_type: %s' % corpus['label_type'])

    # Model setting
    CTCModel = load(model_type=config['model_name'])
    network = CTCModel(batch_size=param['batch_size'],
                       input_size=feature['input_size'] * feature['num_stack'],
                       num_unit=param['num_unit'],
                       num_layer=param['num_layer'],
                       output_size=output_size,
                       parameter_init=param['weight_init'],
                       clip_grad=param['clip_grad'],
                       clip_activation=param['clip_activation'],
                       dropout_ratio_input=param['dropout_input'],
                       dropout_ratio_hidden=param['dropout_hidden'],
                       num_proj=param['num_proj'],
                       weight_decay=param['weight_decay'])

    network.model_name = config['model_name'].upper()
    network.model_name += '_' + str(param['num_unit'])
    network.model_name += '_' + str(param['num_layer'])
    network.model_name += '_' + param['optimizer']
    network.model_name += '_lr' + str(param['learning_rate'])
    if param['num_proj'] != 0:
        network.model_name += '_proj' + str(param['num_proj'])
    if feature['num_stack'] != 1:
        network.model_name += '_stack' + str(feature['num_stack'])
    if param['weight_decay'] != 0:
        network.model_name += '_weightdecay' + str(param['weight_decay'])
    network.model_name += '_' + str(len(gpu_indices)) + 'gpu'

    # Set save path
    network.model_dir = mkdir('/n/sd8/inaguma/result/timit/ctc/')
    network.model_dir = mkdir_join(network.model_dir, corpus['label_type'])
    network.model_dir = mkdir_join(network.model_dir, network.model_name)

    # Reset model directory: unlike the scripts above, this one overwrites an
    # unfinished run instead of versioning it, and refuses to touch a
    # completed one
    if not isfile(join(network.model_dir, 'complete.txt')):
        tf.gfile.DeleteRecursively(network.model_dir)
        tf.gfile.MakeDirs(network.model_dir)
    else:
        raise ValueError(
            'A completed model already exists: %s' % network.model_dir)

    # Set process name
    setproctitle('multigpu_ctc_timit_' + corpus['label_type'])

    # Save config file
    shutil.copyfile(config_path, join(network.model_dir, 'config.yml'))

    sys.stdout = open(join(network.model_dir, 'train.log'), 'w')
    print(network.model_name)
    do_train(network=network,
             optimizer=param['optimizer'],
             learning_rate=param['learning_rate'],
             batch_size=param['batch_size'],
             epoch_num=param['num_epoch'],
             label_type=corpus['label_type'],
             num_stack=feature['num_stack'],
             num_skip=feature['num_skip'],
             gpu_indices=gpu_indices)
    sys.stdout = sys.__stdout__
def read_audio(data_type, audio_paths, spk2gender, tool, config, normalize,
               save_path, global_mean_male=None, global_std_male=None,
               global_mean_female=None, global_std_female=None,
               dtype=np.float32):
    """Read HTK or WAV files.
    Args:
        data_type (string): train_si84 or train_si284 or test_dev93 or test_eval92
        audio_paths (list): paths to audio files
        spk2gender (dict):
            key => speaker
            value => gender
        tool (string): the tool to extract features,
            htk or librosa or python_speech_features
        config (dict): a configuration for feature extraction
        normalize (string):
            no => no normalization is performed
            global => normalize input features by global mean & stddev over
                      the training set per gender
            speaker => normalize input features by mean & stddev per speaker
            utterance => normalize input features by mean & stddev per
                         utterance
        save_path (string): path to save npy files
        global_mean_male (np.ndarray, optional): global mean of male over the
            training set
        global_std_male (np.ndarray, optional): global standard deviation of
            male over the training set
        global_mean_female (np.ndarray, optional): global mean of female over
            the training set
        global_std_female (np.ndarray, optional): global standard deviation of
            female over the training set
        dtype (np.dtype, optional): data type of the saved features
            (default: np.float32)
    """
    if 'train' not in data_type:
        if global_mean_male is None or global_mean_female is None:
            raise ValueError(
                'For non-training sets, pass the global mean & stddev '
                'computed over the training set.')
    if normalize not in ['global', 'speaker', 'utterance', 'no']:
        raise ValueError(
            'normalize must be "utterance" or "speaker" or "global" or "no".')
    if tool not in ['htk', 'python_speech_features', 'librosa']:
        raise ValueError(
            'tool must be "htk" or "python_speech_features" or "librosa".')

    audio_paths_male, audio_paths_female = [], []
    total_frame_num_male, total_frame_num_female = 0, 0
    total_frame_num_dict = {}
    speaker_mean_dict, speaker_std_dict = {}, {}

    # Loop 1: Compute global (per-gender) and per-speaker statistics
    if 'train' in data_type and normalize != 'no':
        print('=====> Reading audio files...')
        for i, audio_path in enumerate(tqdm(audio_paths)):
            speaker = audio_path.split('/')[-2]
            utt_idx = basename(audio_path).split('.')[0]
            gender = spk2gender[speaker]

            if tool == 'htk':
                feat_utt, sampPeriod, parmKind = read(audio_path)
            elif tool == 'python_speech_features':
                feat_utt = w2f_psf(audio_path,
                                   feature_type=config['feature_type'],
                                   feature_dim=config['channels'],
                                   use_energy=config['energy'],
                                   use_delta1=config['delta'],
                                   use_delta2=config['deltadelta'],
                                   window=config['window'],
                                   slide=config['slide'])
            elif tool == 'librosa':
                feat_utt = w2f_librosa(audio_path,
                                       feature_type=config['feature_type'],
                                       feature_dim=config['channels'],
                                       use_energy=config['energy'],
                                       use_delta1=config['delta'],
                                       use_delta2=config['deltadelta'],
                                       window=config['window'],
                                       slide=config['slide'])

            frame_num, feat_dim = feat_utt.shape
            feat_utt_sum = np.sum(feat_utt, axis=0)

            if i == 0:
                # Initialize global statistics
                global_mean_male = np.zeros((feat_dim,), dtype=dtype)
                global_mean_female = np.zeros((feat_dim,), dtype=dtype)
                global_std_male = np.zeros((feat_dim,), dtype=dtype)
                global_std_female = np.zeros((feat_dim,), dtype=dtype)

            # For computing global mean
            if gender == 'm':
                audio_paths_male.append(audio_path)
                global_mean_male += feat_utt_sum
                total_frame_num_male += frame_num
            elif gender == 'f':
                audio_paths_female.append(audio_path)
                global_mean_female += feat_utt_sum
                total_frame_num_female += frame_num
            else:
                raise ValueError('gender must be "m" or "f".')

            # For computing speaker mean & stddev
            if normalize == 'speaker':
                # Initialize speaker statistics
                if speaker not in total_frame_num_dict.keys():
                    total_frame_num_dict[speaker] = 0
                    speaker_mean_dict[speaker] = np.zeros(
                        (feat_dim,), dtype=dtype)
                    speaker_std_dict[speaker] = np.zeros(
                        (feat_dim,), dtype=dtype)
                total_frame_num_dict[speaker] += frame_num
                speaker_mean_dict[speaker] += feat_utt_sum

        print('=====> Computing global mean & stddev...')
        # Compute global mean per gender
        global_mean_male /= total_frame_num_male
        global_mean_female /= total_frame_num_female

        # Compute speaker mean
        if normalize == 'speaker':
            for speaker in speaker_mean_dict.keys():
                speaker_mean_dict[speaker] /= total_frame_num_dict[speaker]

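        # Second pass over the training set: accumulate squared deviations
        # from the means computed above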
        for audio_path in tqdm(audio_paths):
            speaker = audio_path.split('/')[-2]
            utt_idx = basename(audio_path).split('.')[0]
            gender = spk2gender[speaker]

            if tool == 'htk':
                feat_utt, sampPeriod, parmKind = read(audio_path)
            elif tool == 'python_speech_features':
                feat_utt = w2f_psf(audio_path,
                                   feature_type=config['feature_type'],
                                   feature_dim=config['channels'],
                                   use_energy=config['energy'],
                                   use_delta1=config['delta'],
                                   use_delta2=config['deltadelta'],
                                   window=config['window'],
                                   slide=config['slide'])
            elif tool == 'librosa':
                feat_utt = w2f_librosa(audio_path,
                                       feature_type=config['feature_type'],
                                       feature_dim=config['channels'],
                                       use_energy=config['energy'],
                                       use_delta1=config['delta'],
                                       use_delta2=config['deltadelta'],
                                       window=config['window'],
                                       slide=config['slide'])

            # For computing global stddev
            if gender == 'm':
                global_std_male += np.sum(
                    np.abs(feat_utt - global_mean_male) ** 2, axis=0)
            elif gender == 'f':
                global_std_female += np.sum(
                    np.abs(feat_utt - global_mean_female) ** 2, axis=0)
            else:
                raise ValueError('gender must be "m" or "f".')

            # For computing speaker stddev
            if normalize == 'speaker':
                speaker_std_dict[speaker] += np.sum(
                    np.abs(feat_utt - speaker_mean_dict[speaker]) ** 2, axis=0)

        # Compute speaker stddev
        if normalize == 'speaker':
            for speaker in speaker_std_dict.keys():
                speaker_std_dict[speaker] = np.sqrt(
                    speaker_std_dict[speaker] / (total_frame_num_dict[speaker] - 1))

        # Compute global stddev per gender
        global_std_male = np.sqrt(
            global_std_male / (total_frame_num_male - 1))
        global_std_female = np.sqrt(
            global_std_female / (total_frame_num_female - 1))

        # Save global mean & stddev per gender
        np.save(join(save_path, 'global_mean_male.npy'), global_mean_male)
        np.save(join(save_path, 'global_mean_female.npy'),
                global_mean_female)
        np.save(join(save_path, 'global_std_male.npy'), global_std_male)
        np.save(join(save_path, 'global_std_female.npy'), global_std_female)

    # Loop 2: Normalize each utterance and save it
    print('=====> Normalization...')
    frame_num_dict = {}
    for audio_path in tqdm(audio_paths):
        speaker = audio_path.split('/')[-2]
        utt_idx = basename(audio_path).split('.')[0]
        gender = spk2gender[speaker]

        if tool == 'htk':
            feat_utt, sampPeriod, parmKind = read(audio_path)
        elif tool == 'python_speech_features':
            feat_utt = w2f_psf(audio_path,
                               feature_type=config['feature_type'],
                               feature_dim=config['channels'],
                               use_energy=config['energy'],
                               use_delta1=config['delta'],
                               use_delta2=config['deltadelta'],
                               window=config['window'],
                               slide=config['slide'])
        elif tool == 'librosa':
            feat_utt = w2f_librosa(audio_path,
                                   feature_type=config['feature_type'],
                                   feature_dim=config['channels'],
                                   use_energy=config['energy'],
                                   use_delta1=config['delta'],
                                   use_delta2=config['deltadelta'],
                                   window=config['window'],
                                   slide=config['slide'])

        if normalize == 'no':
            pass
        elif normalize == 'global' or 'train' not in data_type:
            # Normalize by global mean & stddev per gender; non-training sets
            # always use the statistics computed over the training set
            if gender == 'm':
                feat_utt -= global_mean_male
                feat_utt /= global_std_male
            elif gender == 'f':
                feat_utt -= global_mean_female
                feat_utt /= global_std_female
            else:
                raise ValueError('gender must be "m" or "f".')
        elif normalize == 'speaker':
            # Normalize by mean & stddev per speaker
            feat_utt = (
                feat_utt - speaker_mean_dict[speaker]) / speaker_std_dict[speaker]
        elif normalize == 'utterance':
            # Normalize by mean & stddev per utterance
            utt_mean = np.mean(feat_utt, axis=0, dtype=dtype)
            utt_std = np.std(feat_utt, axis=0, dtype=dtype)
            feat_utt = (feat_utt - utt_mean) / utt_std

        frame_num_dict[utt_idx] = feat_utt.shape[0]

        # Save input features
        np.save(mkdir_join(save_path, speaker, utt_idx + '.npy'), feat_utt)

    # Save the frame number dictionary
    with open(join(save_path, 'frame_num.pickle'), 'wb') as f:
        pickle.dump(frame_num_dict, f)
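A hypothetical invocation for a held-out set, reloading the per-gender statistics saved by the training pass; all paths, the spk2gender entry, and the empty config (not consulted when tool='htk') are illustrative:

import numpy as np
from os.path import join

train_feat_path = '/path/to/feature/train_si284'
stats = {name: np.load(join(train_feat_path, name + '.npy'))
         for name in ['global_mean_male', 'global_std_male',
                      'global_mean_female', 'global_std_female']}

read_audio(data_type='test_eval92',
           audio_paths=['/path/to/wsj/441/441c0401.htk'],
           spk2gender={'441': 'f'},
           tool='htk',
           config={},  # not consulted for tool='htk'
           normalize='global',
           save_path='/path/to/feature/test_eval92',
           global_mean_male=stats['global_mean_male'],
           global_std_male=stats['global_std_male'],
           global_mean_female=stats['global_mean_female'],
           global_std_female=stats['global_std_female'])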