Example #1
def test():
    args = parser.parse_args()
    try:
        with open(args.conf, 'r') as f:
            conf = yaml.safe_load(f)
    except (IOError, yaml.YAMLError):
        print("Config file does not exist or is not valid YAML!")
        sys.exit(1)

    opts = Config()
    for k, v in conf.items():
        setattr(opts, k, v)
        print('{:50}:{}'.format(k, v))

    use_cuda = opts.use_gpu
    device = torch.device('cuda:0') if use_cuda else torch.device('cpu')

    model_path = os.path.join(opts.checkpoint_dir, opts.exp_name,
                              'ctc_best_model.pkl')
    package = torch.load(model_path)

    rnn_param = package["rnn_param"]
    add_cnn = package["add_cnn"]
    cnn_param = package["cnn_param"]
    num_class = package["num_class"]
    feature_type = package['epoch']['feature_type']
    n_feats = package['epoch']['n_feats']
    drop_out = package['_drop_out']
    mel = opts.mel

    beam_width = opts.beam_width
    lm_alpha = opts.lm_alpha
    decoder_type = opts.decode_type
    vocab_file = opts.vocab_file

    vocab = Vocab(vocab_file)
    test_dataset = SpeechDataset(vocab, opts.test_scp_path, opts.test_lab_path,
                                 opts)
    test_loader = SpeechDataLoader(test_dataset,
                                   batch_size=opts.batch_size,
                                   shuffle=False,
                                   num_workers=opts.num_workers,
                                   pin_memory=False)

    model = CTC_Model(rnn_param=rnn_param,
                      add_cnn=add_cnn,
                      cnn_param=cnn_param,
                      num_class=num_class,
                      drop_out=drop_out)
    model.to(device)
    model.load_state_dict(package['state_dict'])
    model.eval()

    if decoder_type == 'Greedy':
        decoder = GreedyDecoder(vocab.index2word, space_idx=-1, blank_index=0)
    else:
        decoder = BeamDecoder(vocab.index2word,
                              beam_width=beam_width,
                              blank_index=0,
                              space_idx=-1,
                              lm_path=opts.lm_path,
                              lm_alpha=opts.lm_alpha)

    total_wer = 0
    total_cer = 0
    start = time.time()
    with torch.no_grad():
        for data in test_loader:
            inputs, input_sizes, targets, target_sizes, utt_list = data
            inputs = inputs.to(device)
            #input_sizes = input_sizes.to(device)
            #target = target.to(device)
            #target_sizes = target_sizes.to(device)

            probs = model(inputs)

            max_length = probs.size(0)
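            # input_sizes holds each utterance's length as a fraction of the padded
            # length (probs is assumed to come back as (seq_len, batch, num_classes)),
            # so scale it back to frame counts for the decoder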
            input_sizes = (input_sizes * max_length).long()

            probs = probs.cpu()
            decoded = decoder.decode(probs, input_sizes.numpy().tolist())
            targets, target_sizes = targets.numpy(), target_sizes.numpy()
            labels = []
            for i in range(len(targets)):
                label = [
                    vocab.index2word[num]
                    for num in targets[i][:target_sizes[i]]
                ]
                labels.append(' '.join(label))

            for x in range(len(targets)):
                print("origin : " + labels[x])
                print("decoded: " + decoded[x])
            cer = 0
            wer = 0
            for x in range(len(labels)):
                wer += decoder.wer(decoded[x], labels[x])
                decoder.num_word += len(labels[x].split())

            total_wer += wer
    print("total_error:", total_wer)
    print("total_phoneme:", decoder.num_word)
    PER = (float(total_wer) / decoder.num_word) * 100
    print("Phoneme error rate on test set: %.4f" % PER)
    end = time.time()
    time_used = (end - start) / 60.0
    print("time used for decode %d sentences: %.4f minutes." %
          (len(test_dataset), time_used))
Example #2
        predictions.append(beam_decode(tokenized_test[i:i + 1]))

    for idx, result in enumerate(predictions):
        if result == '':
            print(idx)

    return predictions


def submit_proc(sentence):
    sentence = sentence.lstrip(' ,!。')
    sentence = sentence.replace(' ', '')
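    # fall back to a default phrase ('随时联系', roughly "contact me anytime")
    # when the model produced an empty prediction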
    if sentence == '':
        sentence = '随时联系'
    return sentence


if __name__ == '__main__':
    params = get_params()
    vocab, reverse_vocab = Vocab()
    embedding_matrix = load_embedding_matrix()
    params['mode'] = 'test'
    predictions = test(params, embedding_matrix, vocab)

    test_df = pd.read_csv(test_data_file)
    test_df['Prediction'] = predictions
    test_df = test_df[['QID', 'Prediction']]
    test_df['Prediction'] = test_df['Prediction'].apply(submit_proc)
    test_df.to_csv(os.path.join(root, 'data', 'result_pgn_beam_search.csv'),
                   index=False,
                   sep=',')
Example #3
def main(conf):
    opts = Config()
    for k, v in conf.items():
        setattr(opts, k, v)
        print('{:50}:{}'.format(k, v))

    device = torch.device('cuda:0') if opts.use_gpu else torch.device('cpu')
    torch.manual_seed(opts.seed)
    np.random.seed(opts.seed)
    if opts.use_gpu:
        torch.cuda.manual_seed(opts.seed)

    #Data Loader
    vocab = Vocab(opts.vocab_file)
    train_dataset = SpeechDataset(vocab, opts.train_scp_path,
                                  opts.train_lab_path, opts.train_trans_path,
                                  opts, True)
    dev_dataset = SpeechDataset(vocab, opts.valid_scp_path,
                                opts.valid_lab_path, opts.valid_trans_path,
                                opts)
    train_loader = SpeechDataLoader(train_dataset,
                                    batch_size=opts.batch_size,
                                    shuffle=opts.shuffle_train,
                                    num_workers=opts.num_workers)
    dev_loader = SpeechDataLoader(dev_dataset,
                                  batch_size=opts.batch_size,
                                  shuffle=False,
                                  num_workers=opts.num_workers)

    #Define Model
    rnn_type = supported_rnn[opts.rnn_type]
    rnn_param = {
        "rnn_input_size": opts.rnn_input_size,
        "rnn_hidden_size": opts.rnn_hidden_size,
        "rnn_layers": opts.rnn_layers,
        "rnn_type": rnn_type,
        "bidirectional": opts.bidirectional,
        "batch_norm": opts.batch_norm
    }

    num_class = vocab.n_words
    opts.output_class_dim = vocab.n_words
    drop_out = opts.drop_out
    add_cnn = opts.add_cnn

    cnn_param = {}
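    # channel/kernel_size/stride/padding/pooling are assumed to be list literals stored
    # as strings in the YAML config; eval() turns them back into Python lists
    # (ast.literal_eval would be the safer choice for untrusted configs)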
    channel = eval(opts.channel)
    kernel_size = eval(opts.kernel_size)
    stride = eval(opts.stride)
    padding = eval(opts.padding)
    pooling = eval(opts.pooling)
    activation_function = supported_activate[opts.activation_function]
    cnn_param['batch_norm'] = opts.batch_norm
    cnn_param['activate_function'] = activation_function
    cnn_param["layer"] = []
    for layer in range(opts.layers):
        layer_param = [
            channel[layer], kernel_size[layer], stride[layer], padding[layer]
        ]
        if pooling is not None:
            layer_param.append(pooling[layer])
        else:
            layer_param.append(None)
        cnn_param["layer"].append(layer_param)

    model = CTC_Model(add_cnn=add_cnn,
                      cnn_param=cnn_param,
                      rnn_param=rnn_param,
                      num_class=num_class,
                      drop_out=drop_out)
    model = model.to(device)
    num_params = 0
    for name, param in model.named_parameters():
        num_params += param.numel()
    print("Number of parameters %d" % num_params)
    for idx, m in enumerate(model.children()):
        print(idx, m)

    #Training
    init_lr = opts.init_lr
    num_epoches = opts.num_epoches
    end_adjust_acc = opts.end_adjust_acc
    decay = opts.lr_decay
    weight_decay = opts.weight_decay
    batch_size = opts.batch_size

    params = {
        'num_epoches': num_epoches,
        'end_adjust_acc': end_adjust_acc,
        'mel': opts.mel,
        'seed': opts.seed,
        'decay': decay,
        'learning_rate': init_lr,
        'weight_decay': weight_decay,
        'batch_size': batch_size,
        'feature_type': opts.feature_type,
        'n_feats': opts.feature_dim
    }
    print(params)

    loss_fn = nn.CTCLoss(reduction='sum')
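    # nn.CTCLoss expects log-probabilities of shape (T, N, C); reduction='sum' adds up
    # the per-utterance losses over the batch instead of averaging them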
    optimizer = torch.optim.Adam(model.parameters(),
                                 lr=init_lr,
                                 weight_decay=weight_decay)

    #visualization for training
    from visdom import Visdom
    viz = Visdom()
    if add_cnn:
        title = opts.feature_type + str(opts.feature_dim) + ' CNN_LSTM_CTC'
    else:
        title = opts.feature_type + str(opts.feature_dim) + ' LSTM_CTC'

    viz_opts = [
        dict(title=title + " Loss", ylabel='Loss', xlabel='Epoch'),
        dict(title=title + " Loss on Dev", ylabel='DEV Loss', xlabel='Epoch'),
        dict(title=title + ' CER on DEV', ylabel='DEV CER', xlabel='Epoch')
    ]
    viz_window = [None, None, None]

    count = 0
    learning_rate = init_lr
    loss_best = 1000
    loss_best_true = 1000
    adjust_rate_flag = False
    stop_train = False
    adjust_time = 0
    adjust_rate_count = 0  # avoids a NameError if the first dev loss lands near loss_best
    acc_best = 0
    start_time = time.time()
    loss_results = []
    dev_loss_results = []
    dev_cer_results = []
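    # Training loop with a dev-loss-driven schedule: if the dev loss fails to improve by
    # more than end_adjust_acc for 10 evaluations, the learning rate is decayed, the best
    # weights so far are reloaded, and one adjustment is counted; training stops after 8
    # adjustments or num_epoches epochs, whichever comes first.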

    while not stop_train:
        if count >= num_epoches:
            break
        count += 1

        if adjust_rate_flag:
            learning_rate *= decay
            adjust_rate_flag = False
            for param in optimizer.param_groups:
                param['lr'] *= decay

        print("Start training epoch: %d, learning_rate: %.5f" %
              (count, learning_rate))

        train_acc, loss = run_epoch(count,
                                    model,
                                    train_loader,
                                    loss_fn,
                                    device,
                                    optimizer=optimizer,
                                    print_every=opts.verbose_step,
                                    is_training=True)
        loss_results.append(loss)
        acc, dev_loss = run_epoch(count,
                                  model,
                                  dev_loader,
                                  loss_fn,
                                  device,
                                  optimizer=None,
                                  print_every=opts.verbose_step,
                                  is_training=False)
        print("loss on dev set is %.4f" % dev_loss)
        dev_loss_results.append(dev_loss)
        dev_cer_results.append(acc)

        #adjust learning rate by dev_loss
        if dev_loss < (loss_best - end_adjust_acc):
            loss_best = dev_loss
            loss_best_true = dev_loss
            adjust_rate_count = 0
            model_state = copy.deepcopy(model.state_dict())
            op_state = copy.deepcopy(optimizer.state_dict())
        elif (dev_loss < loss_best + end_adjust_acc):
            adjust_rate_count += 1
            if dev_loss < loss_best and dev_loss < loss_best_true:
                loss_best_true = dev_loss
                model_state = copy.deepcopy(model.state_dict())
                op_state = copy.deepcopy(optimizer.state_dict())
        else:
            adjust_rate_count = 10

        if acc > acc_best:
            acc_best = acc
            best_model_state = copy.deepcopy(model.state_dict())
            best_op_state = copy.deepcopy(optimizer.state_dict())

        print("adjust_rate_count:" + str(adjust_rate_count))
        print('adjust_time:' + str(adjust_time))

        if adjust_rate_count == 10:
            adjust_rate_flag = True
            adjust_time += 1
            adjust_rate_count = 0
            if loss_best > loss_best_true:
                loss_best = loss_best_true
            model.load_state_dict(model_state)
            optimizer.load_state_dict(op_state)

        if adjust_time == 8:
            stop_train = True

        time_used = (time.time() - start_time) / 60
        print("epoch %d done, cv acc is: %.4f, time_used: %.4f minutes" %
              (count, acc, time_used))
        print('loss_best:', loss_best)
        #x_axis = range(count)
        #y_axis = [loss_results[0:count], dev_loss_results[0:count], dev_cer_results[0:count]]
        #for x in range(len(viz_window)):
        #    if viz_window[x] is None:
        #        viz_window[x] = viz.line(X = np.array(x_axis), Y = np.array(y_axis[x]), opts = viz_opts[x],)
        #    else:
        #        viz.line(X = np.array(x_axis), Y = np.array(y_axis[x]), win = viz_window[x], update = 'replace',)

    print("End training, best dev loss is: %.4f, acc is: %.4f" %
          (loss_best, acc_best))
    model.load_state_dict(best_model_state)
    optimizer.load_state_dict(best_op_state)
    save_dir = os.path.join(opts.checkpoint_dir, opts.exp_name)
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)
    best_path = os.path.join(save_dir, 'ctc_best_model.pkl')
    params['epoch'] = count

    torch.save(
        CTC_Model.save_package(model,
                               optimizer=optimizer,
                               epoch=params,
                               loss_results=loss_results,
                               dev_loss_results=dev_loss_results,
                               dev_cer_results=dev_cer_results), best_path)
Example #4
            print('Saving checkpoint for epoch {} at {}'.format(
                epoch + 1, ckpt_save_path))
        print('Time taken for 1 epoch {} sec\n'.format(time.time() - start))


def train(params, embedding_matrix, vocab):
    print('Building the model ...')
    model = PGN(params, embedding_matrix, vocab)

    print('Creating the checkpoint manager ...')
    checkpoint = tf.train.Checkpoint(model=model)
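    # the manager below rotates checkpoints on disk, keeping only the five most recent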
    checkpoint_manager = tf.train.CheckpointManager(
        checkpoint,
        'drive/NLP1/data/checkpoints/training_pgn_checkpoints',
        max_to_keep=5)

    checkpoint.restore(checkpoint_manager.latest_checkpoint)
    if checkpoint_manager.latest_checkpoint:
        print('Restored from {}'.format(checkpoint_manager.latest_checkpoint))
    else:
        print('Initializing from scratch ...')

    print('Start the training process ...')
    train_model(model, params, vocab, checkpoint_manager)


if __name__ == '__main__':
    params = get_params()
    vocab = Vocab()
    embedding_matrix = load_embedding_matrix()
    train(params, embedding_matrix, vocab)
Example #5
def test():
    args = parser.parse_args()
    try:
        with open(args.conf, 'r') as f:
            conf = yaml.safe_load(f)
    except (IOError, yaml.YAMLError):
        print("Config file does not exist or is not valid YAML!")
        sys.exit(1)

    opts = Config()
    for k, v in conf.items():
        setattr(opts, k, v)
        print('{:50}:{}'.format(k, v))

    use_cuda = opts.use_gpu
    # use_cuda = False
    separator = opts.separator if opts.separator else " "
    device = torch.device('cuda') if use_cuda else torch.device('cpu')

    model_path = os.path.join(opts.checkpoint_dir, opts.exp_name,
                              'ctc_best_model.pkl')
    package = torch.load(model_path, map_location=device)

    rnn_param = package["rnn_param"]
    add_cnn = package["add_cnn"]
    cnn_param = package["cnn_param"]
    feature_type = package['epoch']['feature_type']
    n_feats = package['epoch']['n_feats']
    drop_out = package['_drop_out']
    mel = opts.mel

    beam_width = opts.beam_width
    lm_alpha = opts.lm_alpha
    decoder_type = opts.decode_type
    vocab_file = opts.data_file + "/units"
    if opts.universal:
        vocab_file = opts.data_file + "/all_units"
    keywords = []
    with open(opts.keyword_path, 'r') as f:
        for kw in f.readlines():
            kw = kw.rstrip("\n")
            keywords.append(kw)

    pos_probs = {}
    neg_probs = {}
    for kw in keywords:
        pos_probs[kw] = []
        neg_probs[kw] = []

    vocab = Vocab(vocab_file)
    num_class = vocab.n_words
    test_dataset = SpeechDataset(None, opts.test_scp_path,
                                 opts.test_kws_lab_path, opts)
    test_loader = SpeechDataLoader(test_dataset,
                                   batch_size=opts.batch_size,
                                   shuffle=False,
                                   num_workers=opts.num_workers,
                                   pin_memory=False)

    model = CTC_Model(rnn_param=rnn_param,
                      add_cnn=add_cnn,
                      cnn_param=cnn_param,
                      num_class=num_class,
                      drop_out=drop_out)
    model.to(device)

    language = opts.data_file.split("/")[1]
    language_dict = {}
    with open(opts.language_order) as f:
        for idx, line in enumerate(f.readlines()):
            line = line.strip()
            language_dict[line] = idx
    language_id = language_dict[language]

    if opts.from_multi:
        print("Load from multi")
        state_dict = package['state_dict']
        pretrained_dict = {
            k: v
            for k, v in state_dict.items() if k in model.state_dict().keys()
        }
        prefix = "fc_list." + str(language_id)
        language_softmax_dict = {
            k: v
            for k, v in state_dict.items() if k.startswith(prefix)
        }
        for k, v in language_softmax_dict.items():
            new_key = k.replace(prefix, "fc")
            pretrained_dict[new_key] = v

        model.load_state_dict(pretrained_dict)
    else:
        model.load_state_dict(package['state_dict'])

    model.eval()
    if opts.language_one_hot:
        # add size of one-hot label
        lid = torch.zeros(len(language_dict.items()))
        lid[language_id] = 1
    # Decoding setup: keywords, blank_index, beam_width
    #   inputs : probs, length
    #   outputs: probability for each keyword

    decoder = BeamDecoder(vocab.index2word,
                          beam_width=beam_width,
                          blank_index=0,
                          space_idx=-1,
                          lm_path=opts.lm_path,
                          lm_alpha=opts.lm_alpha)

    utt_idx = 0
    start = time.time()
    with torch.no_grad():
        for data in test_loader:
            inputs, input_sizes, targets, target_sizes, utt_list = data
            if opts.language_one_hot:
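                # tile the language one-hot vector over batch and time, then append it
                # to every frame's feature vector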
                B, T, _ = inputs.shape
                xx = lid.repeat(B, T, 1)
                inputs = torch.cat((inputs, xx), dim=-1)

            inputs = inputs.to(device)
            #input_sizes = input_sizes.to(device)
            #target = target.to(device)
            #target_sizes = target_sizes.to(device)

            probs = model(inputs)

            max_length = probs.size(0)
            input_sizes = (input_sizes * max_length).long()

            probs = probs.cpu()

            if decoder_type == "soft":
                decoded = decoder.decode(probs,
                                         input_sizes.numpy().tolist(),
                                         n_best=True)
                prob_mat = soft_kwd(decoded, keywords)
            else:

                decoded = decoder.decode(probs, input_sizes.numpy().tolist())
                # output existence for each keyword
                prob_mat = exist_kwd(decoded, keywords)
            # target is a 0-1 matrix
            targets, target_sizes = targets.numpy(), target_sizes.numpy()

            for i in range(len(decoded)):
                for j, kw in enumerate(keywords):
                    if targets[i, j] == 1:
                        pos_probs[kw].append(prob_mat[i, j])
                    else:
                        neg_probs[kw].append(prob_mat[i, j])
            utt_idx += len(decoded)
            print("Processed {}/{} utterances.".format(utt_idx,
                                                       len(test_dataset)))

    expdir = os.path.join(opts.checkpoint_dir, opts.exp_name)
    print("Output to {}".format(expdir))
    threshold = 0.5
    FPs = {}
    TPs = {}
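    # with the fixed 0.5 threshold, scores from utterances that contain the keyword
    # count as true positives when above it; scores from the rest count as false positives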
    for item in pos_probs.items():
        kw = item[0]
        probs = item[1]
        probs = np.array(probs)
        TPs[kw] = len(probs[probs >= threshold])
        with open(expdir + "/" + kw + ".pos", 'w') as f:
            for prob in probs:
                f.write(str(prob) + "\n")

    for item in neg_probs.items():
        kw = item[0]
        probs = item[1]
        probs = np.array(probs)
        FPs[kw] = len(probs[probs >= threshold])
        with open(expdir + "/" + kw + ".neg", 'w') as f:
            for prob in probs:
                f.write(str(prob) + "\n")

    for kw in keywords:
        recall = TPs[kw] / (len(pos_probs[kw]) + 1e-8)
        precision = TPs[kw] / (TPs[kw] + FPs[kw] + 1e-8)
        print("For keyword {} of threshold {}: Precision {}, Recall {}, F1 {}".
              format(kw, str(threshold), recall, precision,
                     2 * (precision * recall) / (precision + recall + 1e-8)))

    print("kws decode method: {}".format(decoder_type))
    end = time.time()
    time_used = (end - start) / 60.0
    print("time used for decode %d sentences: %.4f minutes." %
          (len(test_dataset), time_used))
Example #6
def test():
    args = parser.parse_args()
    try:
        with open(args.conf, 'r') as f:
            conf = yaml.safe_load(f)
    except (IOError, yaml.YAMLError):
        print("Config file does not exist or is not valid YAML!")
        sys.exit(1)

    opts = Config()
    for k, v in conf.items():
        setattr(opts, k, v)
        # print('{:50}:{}'.format(k, v))

    use_cuda = opts.use_gpu
    device = torch.device('cuda') if use_cuda else torch.device('cpu')

    model_path = os.path.join(opts.checkpoint_dir, opts.exp_name,
                              'ctc_best_model.pkl')
    package = torch.load(model_path)

    rnn_param = package["rnn_param"]
    add_cnn = package["add_cnn"]
    cnn_param = package["cnn_param"]
    num_class = package["num_class"]
    feature_type = package['epoch']['feature_type']
    n_feats = package['epoch']['n_feats']
    drop_out = package['_drop_out']
    mel = opts.mel

    beam_width = opts.beam_width
    lm_alpha = opts.lm_alpha
    decoder_type = opts.decode_type
    vocab_file = opts.vocab_file

    vocab = Vocab(vocab_file)
    test_dataset = SpeechDataset(vocab, opts.pred_scp_path, opts.pred_lab_path,
                                 opts)
    test_loader = SpeechDataLoader(test_dataset,
                                   batch_size=opts.batch_size,
                                   shuffle=False,
                                   num_workers=opts.num_workers,
                                   pin_memory=False)

    model = CTC_Model(rnn_param=rnn_param,
                      add_cnn=add_cnn,
                      cnn_param=cnn_param,
                      num_class=num_class,
                      drop_out=drop_out)
    model.to(device)
    model.load_state_dict(package['state_dict'])
    model.eval()

    if decoder_type == 'Greedy':
        decoder = GreedyDecoder(vocab.index2word, space_idx=-1, blank_index=0)
    else:
        decoder = BeamDecoder(vocab.index2word,
                              beam_width=beam_width,
                              blank_index=0,
                              space_idx=-1,
                              lm_path=opts.lm_path,
                              lm_alpha=opts.lm_alpha)

    # total_wer = 0
    # total_cer = 0
    # start = time.time()
    with torch.no_grad():
        for data in test_loader:
            inputs, input_sizes, targets, target_sizes, utt_list = data
            # os.system("cp ../TIMIT/predict/{}.txt ../output/words.txt".format(utt_list[0]))
            inputs = inputs.to(device)
            # input_sizes = input_sizes.to(device)
            # target = target.to(device)
            # target_sizes = target_sizes.to(device)

            probs = model(inputs)

            max_length = probs.size(0)
            input_sizes = (input_sizes * max_length).long()

            probs = probs.cpu()
            decoded = decoder.decode(probs, input_sizes.numpy().tolist())

            targets, target_sizes = targets.numpy(), target_sizes.numpy()
            labels = []
            for i in range(len(targets)):
                label = [
                    vocab.index2word[num]
                    for num in targets[i][:target_sizes[i]]
                ]
                labels.append(' '.join(label))

            for x in range(len(targets)):
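                # the transcript files are opened in append mode, so repeated runs keep
                # adding lines to the same ../output/*.txt files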
                with open("../output/original.txt", "a") as writer:
                    writer.write(utt_list[x] + " " + labels[x] + "\n")
                with open("../output/predicted.txt", "a") as writer:
                    writer.write(utt_list[x] + " " + decoded[x] + "\n")
Example #7
def test():
    args = parser.parse_args()
    try:
        with open(args.conf, 'r') as f:
            conf = yaml.safe_load(f)
    except (IOError, yaml.YAMLError):
        print("Config file does not exist or is not valid YAML!")
        sys.exit(1)

    opts = Config()
    for k, v in conf.items():
        setattr(opts, k, v)
        print('{:50}:{}'.format(k, v))

    use_cuda = opts.use_gpu
    separator = opts.separator if opts.separator else " "
    device = torch.device('cuda') if use_cuda else torch.device('cpu')

    model_path = os.path.join(opts.checkpoint_dir, opts.exp_name,
                              'ctc_best_model.pkl')
    package = torch.load(model_path)

    rnn_param = package["rnn_param"]
    add_cnn = package["add_cnn"]
    cnn_param = package["cnn_param"]
    feature_type = package['epoch']['feature_type']
    n_feats = package['epoch']['n_feats']
    drop_out = package['_drop_out']
    mel = opts.mel

    beam_width = opts.beam_width
    lm_alpha = opts.lm_alpha
    decoder_type = opts.decode_type
    vocab_file = opts.data_file + "/units"
    if opts.universal:
        vocab_file = opts.data_file + "/all_units"
    vocab = Vocab(vocab_file)
    num_class = vocab.n_words
    test_dataset = SpeechDataset(vocab, opts.test_scp_path, opts.test_lab_path,
                                 opts)
    test_loader = SpeechDataLoader(test_dataset,
                                   batch_size=opts.batch_size,
                                   shuffle=False,
                                   num_workers=opts.num_workers,
                                   pin_memory=False)

    model = CTC_Model(rnn_param=rnn_param,
                      add_cnn=add_cnn,
                      cnn_param=cnn_param,
                      num_class=num_class,
                      drop_out=drop_out)
    model.to(device)

    language = opts.data_file.split("/")[1]
    language_dict = {}
    with open(opts.language_order) as f:
        for idx, line in enumerate(f.readlines()):
            line = line.strip()
            language_dict[line] = idx
    language_id = language_dict[language]

    if opts.from_multi:
        print("Load from multi")
        state_dict = package['state_dict']
        pretrained_dict = {
            k: v
            for k, v in state_dict.items() if k in model.state_dict().keys()
        }
        prefix = "fc_list." + str(language_id)
        language_softmax_dict = {
            k: v
            for k, v in state_dict.items() if k.startswith(prefix)
        }
        for k, v in language_softmax_dict.items():
            new_key = k.replace(prefix, "fc")
            pretrained_dict[new_key] = v

        model.load_state_dict(pretrained_dict)
    else:
        model.load_state_dict(package['state_dict'])

    model.eval()

    if opts.language_one_hot:
        # add size of one-hot label
        lid = torch.zeros(len(language_dict.items()))
        lid[language_id] = 1

    if decoder_type == 'Greedy':
        decoder = GreedyDecoder(vocab.index2word, space_idx=-1, blank_index=0)
    else:
        decoder = BeamDecoder(vocab.index2word,
                              beam_width=beam_width,
                              blank_index=0,
                              space_idx=-1,
                              lm_path=opts.lm_path,
                              lm_alpha=opts.lm_alpha)

    total_wer = 0
    total_cer = 0
    start = time.time()
    with torch.no_grad():
        for data in test_loader:
            inputs, input_sizes, targets, target_sizes, utt_list = data

            if opts.language_one_hot:
                B, T, _ = inputs.shape
                xx = lid.repeat(B, T, 1)
                inputs = torch.cat((inputs, xx), dim=-1)
            inputs = inputs.to(device)
            #input_sizes = input_sizes.to(device)
            #target = target.to(device)
            #target_sizes = target_sizes.to(device)

            probs = model(inputs)

            max_length = probs.size(0)
            input_sizes = (input_sizes * max_length).long()

            probs = probs.cpu()
            decoded = decoder.decode(probs, input_sizes.numpy().tolist())

            targets, target_sizes = targets.numpy(), target_sizes.numpy()
            labels = []
            for i in range(len(targets)):
                label = [
                    vocab.index2word[num]
                    for num in targets[i][:target_sizes[i]]
                ]
                labels.append(' '.join(label))

            for x in range(len(targets)):
                print("origin : " + labels[x])
                print("decoded: " + decoded[x])
            cer = 0
            wer = 0
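            # decoder.cer/decoder.wer are assumed to return raw edit distances; the
            # rates below come from dividing the accumulated distances by the
            # accumulated character/word counts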
            for x in range(len(labels)):
                cer += decoder.cer(decoded[x], labels[x])
                wer += decoder.wer(decoded[x], labels[x], separator)
                decoder.num_word += len(labels[x].split(separator))
                decoder.num_char += len(labels[x])
            total_cer += cer
            total_wer += wer
    CER = (float(total_cer) / decoder.num_char) * 100
    WER = (float(total_wer) / decoder.num_word) * 100
    print("Character error rate on test set: %.4f" % CER)
    print("Word error rate on test set: %.4f" % WER)
    end = time.time()
    time_used = (end - start) / 60.0
    print("time used for decode %d sentences: %.4f minutes." %
          (len(test_dataset), time_used))
Example #8
def main(conf):
    opts = Config()
    for k, v in conf.items():
        setattr(opts, k, v)
        print('{:50}:{}'.format(k, v))

    device = torch.device(
        'cuda' if opts.use_gpu and torch.cuda.is_available() else 'cpu')
    torch.manual_seed(opts.seed)
    np.random.seed(opts.seed)
    if opts.use_gpu:
        torch.cuda.manual_seed(opts.seed)
    datasets = os.listdir(opts.data_file)
    for idx, dataset in enumerate(datasets):
        datasets[idx] = opts.data_file + "/" + dataset
    train_scp = "/train/feats.scp"
    train_lab = "/train/lab.txt"
    valid_scp = "/dev/feats.scp"
    valid_lab = "/dev/lab.txt"
    vocab_f = "/units"
    if opts.universal:
        vocab_f = "/all_units"
    semi = False
    semi_loader = None

    #Data Loader
    vocab = [Vocab(dataset + vocab_f) for dataset in datasets]

    train_dataset = [
        SpeechDataset(voc, dataset + train_scp, dataset + train_lab, opts)
        for dataset, voc in zip(datasets, vocab)
    ]
    dev_dataset = [
        SpeechDataset(voc, dataset + valid_scp, dataset + valid_lab, opts)
        for dataset, voc in zip(datasets, vocab)
    ]
    train_loader = [
        SpeechDataLoader(dataset,
                         batch_size=opts.batch_size,
                         shuffle=opts.shuffle_train,
                         num_workers=opts.num_workers)
        for dataset in train_dataset
    ]
    dev_loader = [
        SpeechDataLoader(dataset,
                         batch_size=opts.batch_size,
                         shuffle=False,
                         num_workers=opts.num_workers)
        for dataset in dev_dataset
    ]
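    # train_loader/dev_loader are lists with one loader per language/dataset;
    # run_epoch is expected to iterate over all of them within an epoch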

    if opts.semi:
        semi = True
        semi_scp = "/train/feats_nolabel.scp"
        semi_train_dataset = [
            UnlabelSpeechDataset(dataset + semi_scp, opts)
            for dataset in datasets
        ]
        semi_loader = [
            UnlabelSpeechDataLoader(dataset,
                                    batch_size=opts.batch_size,
                                    shuffle=opts.shuffle_train,
                                    num_workers=opts.num_workers)
            for dataset in semi_train_dataset
        ]

    if opts.language_one_hot:
        # add size of one-hot label
        opts.rnn_input_size = opts.rnn_input_size + len(train_dataset)
    #Define Model
    rnn_type = supported_rnn[opts.rnn_type]
    rnn_param = {
        "rnn_input_size": opts.rnn_input_size,
        "rnn_hidden_size": opts.rnn_hidden_size,
        "rnn_layers": opts.rnn_layers,
        "rnn_type": rnn_type,
        "bidirectional": opts.bidirectional,
        "batch_norm": opts.batch_norm
    }

    num_class = [voc.n_words for voc in vocab]
    # opts.output_class_dim = vocab.n_words
    drop_out = opts.drop_out
    add_cnn = opts.add_cnn

    cnn_param = {}
    channel = eval(opts.channel)
    kernel_size = eval(opts.kernel_size)
    stride = eval(opts.stride)
    padding = eval(opts.padding)
    pooling = eval(opts.pooling)
    activation_function = supported_activate[opts.activation_function]
    cnn_param['batch_norm'] = opts.batch_norm
    cnn_param['activate_function'] = activation_function
    cnn_param["layer"] = []
    for layer in range(opts.layers):
        layer_param = [
            channel[layer], kernel_size[layer], stride[layer], padding[layer]
        ]
        if pooling is not None:
            layer_param.append(pooling[layer])
        else:
            layer_param.append(None)
        cnn_param["layer"].append(layer_param)

    # Domain Adversarial Training / MinMax Entropy flags
    dat = opts.dat_lambda != 0
    mme = opts.mme_lambda != 0
    model = Multi_CTC_Model(add_cnn=add_cnn,
                            cnn_param=cnn_param,
                            rnn_param=rnn_param,
                            num_class=num_class,
                            drop_out=drop_out,
                            dat=opts.dat_lambda,
                            mme=opts.mme_lambda,
                            universal=opts.universal)
    model = model.to(device)
    num_params = 0
    for name, param in model.named_parameters():
        num_params += param.numel()
    print("Number of parameters %d" % num_params)
    for idx, m in enumerate(model.children()):
        print(idx, m)

    if opts.resume != '':
        print("Load ckp from {}".format(opts.resume))
        package = torch.load(opts.resume)
        state_dict = package['state_dict']
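        # keep only weights whose names also exist in the current model, so layers
        # that changed (e.g. per-language output heads) are silently skipped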
        pretrained_dict = {
            k: v
            for k, v in state_dict.items() if k in model.state_dict().keys()
        }
        model_dict = model.state_dict()
        model_dict.update(pretrained_dict)
        model.load_state_dict(model_dict)

    #Training
    init_lr = opts.init_lr
    num_epoches = opts.num_epoches
    end_adjust_acc = opts.end_adjust_acc
    decay = opts.lr_decay
    weight_decay = opts.weight_decay
    batch_size = opts.batch_size

    params = {
        'num_epoches': num_epoches,
        'end_adjust_acc': end_adjust_acc,
        'mel': opts.mel,
        'seed': opts.seed,
        'decay': decay,
        'learning_rate': init_lr,
        'weight_decay': weight_decay,
        'batch_size': batch_size,
        'feature_type': opts.feature_type,
        'n_feats': opts.feature_dim
    }
    print(params)

    loss_fn = nn.CTCLoss(reduction='sum', zero_infinity=True)
    optimizer = torch.optim.Adam(model.parameters(),
                                 lr=init_lr,
                                 weight_decay=weight_decay)

    #visualization for training

    count = 0
    learning_rate = init_lr
    loss_best = 1e6
    loss_best_true = 1e6
    adjust_rate_flag = False
    stop_train = False
    adjust_time = 0
    adjust_rate_count = 0  # avoids a NameError if the first dev loss lands near loss_best
    acc_best = 0
    start_time = time.time()
    loss_results = []
    dev_loss_results = []
    dev_cer_results = []

    advT = AT(opts)
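    # AT(opts) presumably wraps the adversarial-training schedule; its step() below is
    # assumed to advance a per-epoch weighting (not shown in this snippet)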

    while not stop_train:
        if count >= num_epoches:
            break
        count += 1
        advT.step()

        if adjust_rate_flag:
            learning_rate *= decay
            adjust_rate_flag = False
            for param in optimizer.param_groups:
                param['lr'] *= decay

        print("Start training epoch: %d, learning_rate: %.5f" %
              (count, learning_rate))

        train_acc, loss = run_epoch(count,
                                    model,
                                    train_loader,
                                    loss_fn,
                                    device,
                                    opts,
                                    semi_loader,
                                    optimizer=optimizer,
                                    print_every=opts.verbose_step,
                                    is_training=True,
                                    advT=advT)
        loss_results.append(loss)
        acc, dev_loss = run_epoch(count,
                                  model,
                                  dev_loader,
                                  loss_fn,
                                  device,
                                  opts,
                                  optimizer=None,
                                  print_every=opts.verbose_step,
                                  is_training=False,
                                  advT=None)
        print("loss on dev set is %.4f" % dev_loss)
        dev_loss_results.append(dev_loss)
        dev_cer_results.append(acc)

        #adjust learning rate by dev_loss
        if dev_loss < (loss_best - end_adjust_acc):
            loss_best = dev_loss
            loss_best_true = dev_loss
            adjust_rate_count = 0
            model_state = copy.deepcopy(model.state_dict())
            op_state = copy.deepcopy(optimizer.state_dict())
        elif (dev_loss < loss_best + end_adjust_acc):
            adjust_rate_count += 1
            if dev_loss < loss_best and dev_loss < loss_best_true:
                loss_best_true = dev_loss
                model_state = copy.deepcopy(model.state_dict())
                op_state = copy.deepcopy(optimizer.state_dict())
        else:
            adjust_rate_count = 10

        if acc > acc_best:
            acc_best = acc
            best_model_state = copy.deepcopy(model.state_dict())
            best_op_state = copy.deepcopy(optimizer.state_dict())

        print("adjust_rate_count:" + str(adjust_rate_count))
        print('adjust_time:' + str(adjust_time))

        if adjust_rate_count == 10:
            adjust_rate_flag = True
            adjust_time += 1
            adjust_rate_count = 0
            if loss_best > loss_best_true:
                loss_best = loss_best_true
            model.load_state_dict(model_state)
            optimizer.load_state_dict(op_state)

        if adjust_time == 8:
            stop_train = True

        time_used = (time.time() - start_time) / 60
        print("epoch %d done, cv acc is: %.4f, time_used: %.4f minutes" %
              (count, acc, time_used))

        x_axis = range(count)
        y_axis = [
            loss_results[0:count], dev_loss_results[0:count],
            dev_cer_results[0:count]
        ]

    print("End training, best dev loss is: %.4f, acc is: %.4f" %
          (loss_best, acc_best))
    model.load_state_dict(best_model_state)
    optimizer.load_state_dict(best_op_state)
    save_dir = os.path.join(opts.checkpoint_dir, opts.exp_name)
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)
    best_path = os.path.join(save_dir, 'ctc_best_model.pkl')
    params['epoch'] = count

    torch.save(
        CTC_Model.save_package(model,
                               optimizer=optimizer,
                               epoch=params,
                               loss_results=loss_results,
                               dev_loss_results=dev_loss_results,
                               dev_cer_results=dev_cer_results), best_path)