def get_lm():
    """Build a small character-level LSTM language model.

    Returns:
        lm_pytorch.ClassifierWithState: Wrapper around a 1-layer, 4-unit
            ``lm_pytorch.RNNLM`` of type ``"lstm"`` over a 7-symbol
            vocabulary.
    """
    vocabulary = ["<blank>", "<space>", "a", "b", "c", "d", "<eos>"]
    predictor = lm_pytorch.RNNLM(len(vocabulary), 1, 4, typ="lstm")
    return lm_pytorch.ClassifierWithState(predictor)
def test_recognition_results_with_lm(etype, dtype, m_str, text_idx1):
    """Check LM-fused decoding output against fixed reference strings.

    For each CTC weight (0.0, 0.5, 1.0) an ``E2E`` model and an RNNLM are
    built with constant weights, a random feature matrix is decoded, and the
    best hypothesis is compared with the expected text.

    Args:
        etype: Encoder type forwarded to ``make_arg``.
        dtype: Not used inside this function.
        m_str (str): Module path to import; a path containing ``"pytorch"``
            selects the PyTorch LM and initializer, anything else the
            chainer ones.
        text_idx1 (int): Row index into the table of expected texts.
    """
    const = 1e-4
    numpy.random.seed(1)

    # Expected outputs: one row per parametrization (text_idx1), one column
    # per ctc_weight index. The same two rows alternate four times.
    row_iu = ["o", "iuiuiuiuiuiuiuiuo", "iuiuiuiuiuiuiuiuo"]
    row_ie = ["o", "o", "ieieieieieieieieo"]
    seq_true_texts = [row_iu, row_ie] * 4

    # ctc_weight: 0.0 (attention), 0.5 (hybrid CTC/attention), 1.0 (CTC)
    for text_idx2, ctc_weight in enumerate([0.0, 0.5, 1.0]):
        expected_text = seq_true_texts[text_idx1][text_idx2]

        args = make_arg(
            etype=etype, rnnlm="dummy", ctc_weight=ctc_weight, lm_weight=0.3
        )
        backend = importlib.import_module(m_str)
        model = backend.E2E(40, 5, args)

        # Pick the LM implementation and weight initializer per backend.
        if "pytorch" in m_str:
            lm_backend, init_const = lm_pytorch, init_torch_weight_const
        else:
            lm_backend, init_const = lm_chainer, init_chainer_weight_const
        rnnlm = lm_backend.ClassifierWithState(
            lm_backend.RNNLM(len(args.char_list), 2, 10)
        )
        init_const(model, const)
        init_const(rnnlm, const)

        # Features are drawn after model construction, as in the seeded
        # sequence the expected texts were produced with.
        in_data = numpy.random.randn(100, 40).astype(numpy.float32)
        nbest_hyps = model.recognize(in_data, args, args.char_list, rnnlm)

        # Drop the first symbol of the best hypothesis before mapping to text.
        best_ids = nbest_hyps[0]["yseq"][1:]
        decoded = "".join(args.char_list[int(idx)] for idx in best_ids)
        decoded = decoded.replace("<space>", " ")

        assert decoded == expected_text
def test_batch_beam_search(etype, dtype, m_str):
    """Check that batch beam search matches single-utterance beam search.

    Only PyTorch modules are exercised; for any other module the model is
    still constructed but the comparison is skipped.

    Args:
        etype: Encoder type forwarded to ``make_arg``.
        dtype: Not used inside this function.
        m_str (str): Module path to import.
    """
    const = 1e-4
    numpy.random.seed(1)

    # ctc_weight: 0.0 (attention), 0.5 (hybrid CTC/attention), 1.0 (CTC)
    for ctc_weight in [0.0, 0.5]:
        args = make_arg(etype=etype, rnnlm="dummy", ctc_weight=ctc_weight, lm_weight=0.3)
        backend = importlib.import_module(m_str)
        model = backend.E2E(40, 5, args)

        if "pytorch" not in m_str:
            # chainer module
            continue

        rnnlm = lm_pytorch.ClassifierWithState(
            lm_pytorch.RNNLM(len(args.char_list), 2, 10))
        init_torch_weight_const(model, const)
        init_torch_weight_const(rnnlm, const)

        in_data = numpy.random.randn(100, 40).astype(numpy.float32)

        # First pass decodes without an LM, second pass with the RNNLM.
        for lm_args in ((), (rnnlm,)):
            single = model.recognize(in_data, args, args.char_list, *lm_args)
            batched = model.recognize_batch([in_data], args, args.char_list, *lm_args)
            assert single[0]['yseq'] == batched[0][0]['yseq']

        if ctc_weight > 0.0:
            # Repeat the comparison with a CTC window margin set.
            args.ctc_window_margin = 40
            single = model.recognize(in_data, args, args.char_list, rnnlm)
            batched = model.recognize_batch([in_data], args, args.char_list, rnnlm)
            assert single[0]['yseq'] == batched[0][0]['yseq']
def get_wordlm():
    """Build a small look-ahead word language model.

    A 1-layer, 8-unit ``lm_pytorch.RNNLM`` over a 7-entry word list is
    wrapped in ``extlm_pytorch.LookAheadWordLM`` together with the word and
    character index dictionaries.

    Returns:
        lm_pytorch.ClassifierWithState: Wrapper around the look-ahead LM.
    """
    chars = ["<blank>", "<space>", "a", "b", "c", "d", "<eos>"]
    words = ["<blank>", "<unk>", "ab", "id", "ac", "bd", "<eos>"]
    char_dict = {symbol: index for index, symbol in enumerate(chars)}
    word_dict = {symbol: index for index, symbol in enumerate(words)}

    base_lm = lm_pytorch.ClassifierWithState(lm_pytorch.RNNLM(len(words), 1, 8))
    lookahead = extlm_pytorch.LookAheadWordLM(base_lm.predictor, word_dict, char_dict)
    return lm_pytorch.ClassifierWithState(lookahead)
def test_lm():
    """Check that the chainer and PyTorch RNNLMs agree after weight transfer.

    Weights are passed through ``transfer_lm`` and both predictors are run on
    the same random token batch; the returned states and outputs must match
    within a relative tolerance of 1e-5.
    """
    n_vocab, n_layers, n_units = 3, 2, 2
    batchsize = 5

    for typ in ["lstm"]:  # TODO(anyone) gru
        rnnlm_ch = lm_chainer.ClassifierWithState(
            lm_chainer.RNNLM(n_vocab, n_layers, n_units, typ=typ))
        rnnlm_th = lm_pytorch.ClassifierWithState(
            lm_pytorch.RNNLM(n_vocab, n_layers, n_units, typ=typ))
        transfer_lm(rnnlm_ch.predictor, rnnlm_th.predictor)

        # test prediction equality
        tokens = numpy.random.randint(n_vocab, size=batchsize)
        x = torch.from_numpy(tokens).long()
        with torch.no_grad(), chainer.no_backprop_mode(), chainer.using_config('train', False):
            rnnlm_th.predictor.eval()
            state_th, y_th = rnnlm_th.predictor(None, x)
            state_ch, y_ch = rnnlm_ch.predictor(None, x.data.numpy())

            for key in state_ch:
                for layer in range(len(state_th[key])):
                    got = state_th[key][layer].data.numpy()
                    want = state_ch[key][layer].data
                    print(key, layer)
                    print(got)
                    print(want)
                    numpy.testing.assert_allclose(got, want, 1e-5)

            numpy.testing.assert_allclose(y_th.data.numpy(), y_ch.data, 1e-5)
def test_batch_beam_search(etype, dtype, m_str):
    """Check that batch beam search matches single-utterance beam search.

    Covers decoding without an LM, with a character RNNLM, with a CTC window
    margin, and with a look-ahead word LM. Only PyTorch modules are
    exercised; for any other module the model is still constructed but the
    comparisons are skipped.

    Args:
        etype: Encoder type forwarded to ``make_arg``.
        dtype: Not used inside this function.
        m_str (str): Module path to import.
    """

    def assert_same_best(model, args, char_list, feats, *lm_args):
        # Best hypothesis of single decoding must equal that of batch decoding.
        single = model.recognize(feats, args, char_list, *lm_args)
        batched = model.recognize_batch([feats], args, char_list, *lm_args)
        assert single[0]['yseq'] == batched[0][0]['yseq']

    numpy.random.seed(1)

    # ctc_weight: 0.0 (attention), 0.5 (hybrid CTC/attention), 1.0 (CTC)
    for ctc_weight in [0.0, 0.5, 1.0]:
        args = make_arg(etype=etype, rnnlm="dummy", ctc_weight=ctc_weight, lm_weight=0.3)
        backend = importlib.import_module(m_str)
        model = backend.E2E(40, 5, args)

        if "pytorch" not in m_str:
            # chainer module
            continue

        torch.manual_seed(1)
        rnnlm = lm_pytorch.ClassifierWithState(
            lm_pytorch.RNNLM(len(args.char_list), 2, 10))
        init_torch_weight_random(model, (-0.1, 0.1))
        init_torch_weight_random(rnnlm, (-0.1, 0.1))
        model.eval()
        rnnlm.eval()

        in_data = numpy.random.randn(100, 40).astype(numpy.float32)

        # First pass decodes without an LM, second pass with the RNNLM.
        for lm_args in ((), (rnnlm,)):
            assert_same_best(model, args, args.char_list, in_data, *lm_args)

        if ctc_weight > 0.0:
            args.ctc_window_margin = 40
            assert_same_best(model, args, args.char_list, in_data, rnnlm)

        # Test word LM in batch decoding
        if "pytorch" in m_str:
            rand_range = (-0.01, 0.01)
            torch.manual_seed(1)
            # The extended char list is derived from the previous args
            # before args is rebuilt below.
            char_list = ['<blank>', '<space>'] + args.char_list + ['<eos>']
            args = make_arg(etype=etype, rnnlm="dummy", ctc_weight=ctc_weight,
                            ctc_window_margin=40, lm_weight=0.3, beam_size=5)
            backend = importlib.import_module(m_str)
            model = backend.E2E(40, len(char_list), args)

            char_dict = {symbol: index for index, symbol in enumerate(char_list)}
            word_dict = {symbol: index for index, symbol in enumerate(args.word_list)}
            word_rnnlm = lm_pytorch.ClassifierWithState(
                lm_pytorch.RNNLM(len(args.word_list), 2, 10))
            rnnlm = lm_pytorch.ClassifierWithState(
                extlm_pytorch.LookAheadWordLM(word_rnnlm.predictor, word_dict, char_dict))
            init_torch_weight_random(model, rand_range)
            init_torch_weight_random(rnnlm, rand_range)
            model.eval()
            rnnlm.eval()

            assert_same_best(model, args, char_list, in_data, rnnlm)
def recog(args):
    """Decode with the given args.

    Loads the trained ASR model (plus optional character and word RNNLMs),
    decodes every utterance listed in ``args.recog_json`` and writes the
    results as JSON to ``args.result_label``.

    With ``args.batchsize == 0`` utterances are decoded one by one, optionally
    through the window or segment streaming recognizers selected by
    ``args.streaming_mode``; otherwise they are decoded in batches of
    ``args.batchsize``.

    Args:
        args (namespace): The program arguments.

    Raises:
        ValueError: If the RNNLM config names a non-default ``model_module``.

    """
    set_deterministic_pytorch(args)
    model, train_args = load_trained_model(args.model)
    assert isinstance(model, ASRInterface)
    model.recog_args = args

    # read rnnlm
    if args.rnnlm:
        rnnlm_args = get_model_conf(args.rnnlm, args.rnnlm_conf)
        if getattr(rnnlm_args, "model_module", "default") != "default":
            raise ValueError(
                "use '--api v2' option to decode with non-default language model"
            )
        rnnlm = lm_pytorch.ClassifierWithState(
            lm_pytorch.RNNLM(len(train_args.char_list), rnnlm_args.layer, rnnlm_args.unit))
        torch_load(args.rnnlm, rnnlm)
        rnnlm.eval()
    else:
        rnnlm = None

    if args.word_rnnlm:
        rnnlm_args = get_model_conf(args.word_rnnlm, args.word_rnnlm_conf)
        word_dict = rnnlm_args.char_list_dict
        char_dict = {x: i for i, x in enumerate(train_args.char_list)}
        word_rnnlm = lm_pytorch.ClassifierWithState(
            lm_pytorch.RNNLM(len(word_dict), rnnlm_args.layer, rnnlm_args.unit))
        torch_load(args.word_rnnlm, word_rnnlm)
        word_rnnlm.eval()

        # Combine with the character LM when both are given, otherwise use
        # the word LM alone through the look-ahead wrapper.
        if rnnlm is not None:
            rnnlm = lm_pytorch.ClassifierWithState(
                extlm_pytorch.MultiLevelLM(word_rnnlm.predictor,
                                           rnnlm.predictor, word_dict, char_dict))
        else:
            rnnlm = lm_pytorch.ClassifierWithState(
                extlm_pytorch.LookAheadWordLM(word_rnnlm.predictor,
                                              word_dict, char_dict))

    # gpu
    if args.ngpu == 1:
        gpu_id = list(range(args.ngpu))
        logging.info('gpu id: ' + str(gpu_id))
        model.cuda()
        if rnnlm:
            rnnlm.cuda()

    # read json data
    with open(args.recog_json, 'rb') as f:
        js = json.load(f)['utts']
    new_js = {}

    load_inputs_and_targets = LoadInputsAndTargets(
        mode='asr', load_output=False, sort_in_input_length=False,
        preprocess_conf=train_args.preprocess_conf
        if args.preprocess_conf is None else args.preprocess_conf,
        preprocess_args={'train': False})

    if args.batchsize == 0:
        with torch.no_grad():
            for idx, name in enumerate(js.keys(), 1):
                # The name is passed as a logging argument, not concatenated
                # into the format string, so a '%' in an utterance id cannot
                # break the record formatting.
                logging.info('(%d/%d) decoding %s', idx, len(js), name)
                batch = [(name, js[name])]
                feat = load_inputs_and_targets(batch)[0][0]
                if args.streaming_mode == 'window':
                    logging.info(
                        'Using streaming recognizer with window size %d frames',
                        args.streaming_window)
                    se2e = WindowStreamingE2E(e2e=model, recog_args=args, rnnlm=rnnlm)
                    for i in range(0, feat.shape[0], args.streaming_window):
                        logging.info('Feeding frames %d - %d', i, i + args.streaming_window)
                        se2e.accept_input(feat[i:i + args.streaming_window])
                    logging.info('Running offline attention decoder')
                    se2e.decode_with_attention_offline()
                    logging.info('Offline attention decoder finished')
                    nbest_hyps = se2e.retrieve_recognition()
                elif args.streaming_mode == 'segment':
                    logging.info(
                        'Using streaming recognizer with threshold value %d',
                        args.streaming_min_blank_dur)
                    # Accumulators: one entry per requested n-best rank.
                    nbest_hyps = [{'yseq': [], 'score': 0.0} for _ in range(args.nbest)]
                    se2e = SegmentStreamingE2E(e2e=model, recog_args=args, rnnlm=rnnlm)
                    # Feed the features in chunks of the product of the
                    # model's subsample factors.
                    r = np.prod(model.subsample)
                    for i in range(0, feat.shape[0], r):
                        hyps = se2e.accept_input(feat[i:i + r])
                        if hyps is not None:
                            text = ''.join([
                                train_args.char_list[int(x)]
                                for x in hyps[0]['yseq'][1:-1] if int(x) != -1
                            ])
                            text = text.replace(
                                '\u2581', ' ').strip()  # for SentencePiece
                            text = text.replace(model.space, ' ')
                            text = text.replace(model.blank, '')
                            logging.info(text)
                            # Append this segment's hypotheses to the totals.
                            for n in range(args.nbest):
                                nbest_hyps[n]['yseq'].extend(hyps[n]['yseq'])
                                nbest_hyps[n]['score'] += hyps[n]['score']
                else:
                    nbest_hyps = model.recognize(feat, args, train_args.char_list, rnnlm)
                new_js[name] = add_results_to_json(js[name], nbest_hyps, train_args.char_list)

    else:
        def grouper(n, iterable, fillvalue=None):
            # Yield tuples of n items, padding the last one with fillvalue.
            kargs = [iter(iterable)] * n
            return zip_longest(*kargs, fillvalue=fillvalue)

        # sort data if batchsize > 1
        keys = list(js.keys())
        if args.batchsize > 1:
            feat_lens = [js[key]['input'][0]['shape'][0] for key in keys]
            # Longest utterances first.
            sorted_index = sorted(range(len(feat_lens)), key=lambda i: -feat_lens[i])
            keys = [keys[i] for i in sorted_index]

        with torch.no_grad():
            for names in grouper(args.batchsize, keys, None):
                # Drop the padding entries of the last group.
                names = [name for name in names if name]
                batch = [(name, js[name]) for name in names]
                feats = load_inputs_and_targets(batch)[0]
                nbest_hyps = model.recognize_batch(feats, args, train_args.char_list, rnnlm=rnnlm)

                for i, nbest_hyp in enumerate(nbest_hyps):
                    name = names[i]
                    new_js[name] = add_results_to_json(js[name], nbest_hyp, train_args.char_list)

    with open(args.result_label, 'wb') as f:
        f.write(
            json.dumps({
                'utts': new_js
            }, indent=4, ensure_ascii=False, sort_keys=True).encode('utf_8'))
def train(args):
    """Train with the given args.

    Builds (or loads) the ASR model, writes its configuration to
    ``args.outdir/model.json``, sets up the optimizer, data iterators and
    trainer extensions, and runs the training loop.

    Args:
        args (namespace): The program arguments.

    Raises:
        NotImplementedError: If ``args.opt`` is not ``adadelta``, ``adam``
            or ``noam``.
        ImportError: If an apex opt level is requested through
            ``args.train_dtype`` but apex is not installed.

    """
    set_deterministic_pytorch(args)

    # check cuda availability
    if not torch.cuda.is_available():
        logging.warning('cuda is not available')

    # get input and output dimension info
    with open(args.valid_json, 'rb') as f:
        valid_json = json.load(f)['utts']
    utts = list(valid_json.keys())
    idim = int(valid_json[utts[0]]['input'][0]['shape'][-1])
    odim = int(valid_json[utts[0]]['output'][0]['shape'][-1])
    logging.info('#input dims : ' + str(idim))
    logging.info('#output dims: ' + str(odim))

    # specify attention, CTC, hybrid mode
    if args.mtlalpha == 1.0:
        mtl_mode = 'ctc'
        logging.info('Pure CTC mode')
    elif args.mtlalpha == 0.0:
        mtl_mode = 'att'
        logging.info('Pure attention mode')
    else:
        mtl_mode = 'mtl'
        logging.info('Multitask learning mode')

    # Initialize from pre-trained encoder/decoder modules, from a full
    # pre-trained ASR model, or from scratch.
    if args.enc_init is not None or args.dec_init is not None:
        model = load_trained_modules(idim, odim, args)
    elif args.asr_init is not None:
        model, _ = load_trained_model(args.asr_init)
    else:
        model_class = dynamic_import(args.model_module)
        model = model_class(idim, odim, args)
    assert isinstance(model, ASRInterface)

    subsampling_factor = model.subsample[0]

    if args.rnnlm is not None:
        rnnlm_args = get_model_conf(args.rnnlm, args.rnnlm_conf)
        rnnlm = lm_pytorch.ClassifierWithState(
            lm_pytorch.RNNLM(len(args.char_list), rnnlm_args.layer, rnnlm_args.unit))
        # Use the project loader, which takes (path, model). The previous
        # torch.load(path, rnnlm) call passed the module as map_location and
        # discarded the result, so the weights never reached rnnlm.
        torch_load(args.rnnlm, rnnlm)
        model.rnnlm = rnnlm

    # write model config
    os.makedirs(args.outdir, exist_ok=True)
    model_conf = args.outdir + '/model.json'
    with open(model_conf, 'wb') as f:
        logging.info('writing a model config file to ' + model_conf)
        f.write(
            json.dumps((idim, odim, vars(args)),
                       indent=4,
                       ensure_ascii=False,
                       sort_keys=True).encode('utf_8'))
    for key in sorted(vars(args).keys()):
        logging.info('ARGS: ' + key + ': ' + str(vars(args)[key]))

    reporter = model.reporter

    # check the use of multi-gpu
    if args.ngpu > 1:
        if args.batch_size != 0:
            logging.info('batch size is automatically increased (%d -> %d)' %
                         (args.batch_size, args.batch_size * args.ngpu))
            args.batch_size *= args.ngpu

    # set torch device
    device = torch.device("cuda" if args.ngpu > 0 else "cpu")
    if args.train_dtype in ("float16", "float32", "float64"):
        dtype = getattr(torch, args.train_dtype)
    else:
        # Any other train_dtype value (e.g. an apex opt level) uses float32.
        dtype = torch.float32
    logging.info(device)
    logging.info(dtype)
    model = model.to(device=device, dtype=dtype)

    # Setup an optimizer
    if args.opt == 'adadelta':
        optimizer = torch.optim.Adadelta(model.parameters(),
                                         rho=0.95,
                                         eps=args.eps,
                                         weight_decay=args.weight_decay)
    elif args.opt == 'adam':
        optimizer = torch.optim.Adam(model.parameters(),
                                     weight_decay=args.weight_decay)
    elif args.opt == 'noam':
        from espnet.nets.pytorch_backend.rnn.optimizer import get_std_opt
        optimizer = get_std_opt(model, args.adim,
                                args.transformer_warmup_steps,
                                args.transformer_lr)
    else:
        raise NotImplementedError("unknown optimizer: " + args.opt)

    # setup apex.amp
    if args.train_dtype in ("O0", "O1", "O2", "O3"):
        try:
            from apex import amp
        except ImportError as e:
            logging.error(
                f"You need to install apex for --train-dtype {args.train_dtype}. "
                "See https://github.com/NVIDIA/apex#linux")
            raise e
        if args.opt == 'noam':
            # The noam optimizer object holds the torch optimizer in
            # its .optimizer attribute.
            model, optimizer.optimizer = amp.initialize(
                model, optimizer.optimizer, opt_level=args.train_dtype)
        else:
            model, optimizer = amp.initialize(model,
                                              optimizer,
                                              opt_level=args.train_dtype)
        use_apex = True
    else:
        use_apex = False

    # FIXME: TOO DIRTY HACK
    setattr(optimizer, "target", reporter)
    setattr(optimizer, "serialize", lambda s: reporter.serialize(s))

    # Setup a converter
    converter = CustomConverter(subsampling_factor=subsampling_factor,
                                dtype=dtype)

    # read json data
    with open(args.train_json, 'rb') as f:
        train_json = json.load(f)['utts']
    with open(args.valid_json, 'rb') as f:
        valid_json = json.load(f)['utts']

    use_sortagrad = args.sortagrad == -1 or args.sortagrad > 0
    # make minibatch list (variable length)
    train = make_batchset(train_json,
                          args.batch_size,
                          args.maxlen_in,
                          args.maxlen_out,
                          args.minibatches,
                          min_batch_size=args.ngpu if args.ngpu > 1 else 1,
                          shortest_first=use_sortagrad,
                          count=args.batch_count,
                          batch_bins=args.batch_bins,
                          batch_frames_in=args.batch_frames_in,
                          batch_frames_out=args.batch_frames_out,
                          batch_frames_inout=args.batch_frames_inout,
                          iaxis=0,
                          oaxis=0)
    valid = make_batchset(valid_json,
                          args.batch_size,
                          args.maxlen_in,
                          args.maxlen_out,
                          args.minibatches,
                          min_batch_size=args.ngpu if args.ngpu > 1 else 1,
                          count=args.batch_count,
                          batch_bins=args.batch_bins,
                          batch_frames_in=args.batch_frames_in,
                          batch_frames_out=args.batch_frames_out,
                          batch_frames_inout=args.batch_frames_inout,
                          iaxis=0,
                          oaxis=0)

    load_tr = LoadInputsAndTargets(
        mode='asr',
        load_output=True,
        preprocess_conf=args.preprocess_conf,
        preprocess_args={'train': True}  # Switch the mode of preprocessing
    )
    load_cv = LoadInputsAndTargets(
        mode='asr',
        load_output=True,
        preprocess_conf=args.preprocess_conf,
        preprocess_args={'train': False}  # Switch the mode of preprocessing
    )
    # hack to make batchsize argument as 1
    # actual batchsize is included in a list
    if args.n_iter_processes > 0:
        train_iter = ToggleableShufflingMultiprocessIterator(
            TransformDataset(train, load_tr),
            batch_size=1,
            n_processes=args.n_iter_processes,
            n_prefetch=8,
            maxtasksperchild=20,
            shuffle=not use_sortagrad)
        valid_iter = ToggleableShufflingMultiprocessIterator(
            TransformDataset(valid, load_cv),
            batch_size=1,
            repeat=False,
            shuffle=False,
            n_processes=args.n_iter_processes,
            n_prefetch=8,
            maxtasksperchild=20)
    else:
        train_iter = ToggleableShufflingSerialIterator(
            TransformDataset(train, load_tr),
            batch_size=1,
            shuffle=not use_sortagrad)
        valid_iter = ToggleableShufflingSerialIterator(TransformDataset(
            valid, load_cv),
            batch_size=1,
            repeat=False,
            shuffle=False)

    # Set up a trainer
    updater = CustomUpdater(model,
                            args.grad_clip,
                            train_iter,
                            optimizer,
                            converter,
                            device,
                            args.ngpu,
                            args.grad_noise,
                            args.accum_grad,
                            use_apex=use_apex)
    trainer = training.Trainer(updater, (args.epochs, 'epoch'),
                               out=args.outdir)

    if use_sortagrad:
        # sortagrad == -1 sets the trigger to the total number of epochs.
        trainer.extend(
            ShufflingEnabler([train_iter]),
            trigger=(args.sortagrad if args.sortagrad != -1 else args.epochs,
                     'epoch'))

    # Resume from a snapshot
    if args.resume:
        logging.info('resumed from %s' % args.resume)
        torch_resume(args.resume, trainer)

    # Evaluate the model with the test dataset for each epoch
    trainer.extend(
        CustomEvaluator(model, valid_iter, reporter, converter, device,
                        args.ngpu))

    # Save attention weight each epoch
    if args.num_save_attention > 0 and args.mtlalpha != 1.0:
        data = sorted(list(valid_json.items())[:args.num_save_attention],
                      key=lambda x: int(x[1]['input'][0]['shape'][1]),
                      reverse=True)
        if hasattr(model, "module"):
            att_vis_fn = model.module.calculate_all_attentions
            plot_class = model.module.attention_plot_class
        else:
            att_vis_fn = model.calculate_all_attentions
            plot_class = model.attention_plot_class
        att_reporter = plot_class(att_vis_fn,
                                  data,
                                  args.outdir + "/att_ws",
                                  converter=converter,
                                  transform=load_cv,
                                  device=device)
        trainer.extend(att_reporter, trigger=(1, 'epoch'))
    else:
        att_reporter = None

    # Make a plot for training and validation values
    trainer.extend(
        extensions.PlotReport([
            'main/loss', 'validation/main/loss', 'main/loss_ctc',
            'validation/main/loss_ctc', 'main/loss_att',
            'validation/main/loss_att'
        ],
            'epoch',
            file_name='loss.png'))
    trainer.extend(
        extensions.PlotReport(['main/acc', 'validation/main/acc'],
                              'epoch',
                              file_name='acc.png'))
    trainer.extend(
        extensions.PlotReport(['main/cer_ctc', 'validation/main/cer_ctc'],
                              'epoch',
                              file_name='cer.png'))

    # Save best models
    trainer.extend(
        snapshot_object(model, 'model.loss.best'),
        trigger=training.triggers.MinValueTrigger('validation/main/loss'))
    if mtl_mode != 'ctc':
        trainer.extend(
            snapshot_object(model, 'model.acc.best'),
            trigger=training.triggers.MaxValueTrigger('validation/main/acc'))

    # save snapshot which contains model and optimizer states
    trainer.extend(torch_snapshot(), trigger=(1, 'epoch'))

    # epsilon decay in the optimizer
    if args.opt == 'adadelta':
        if args.criterion == 'acc' and mtl_mode != 'ctc':
            trainer.extend(restore_snapshot(model,
                                            args.outdir + '/model.acc.best',
                                            load_fn=torch_load),
                           trigger=CompareValueTrigger(
                               'validation/main/acc', lambda best_value,
                               current_value: best_value > current_value))
            trainer.extend(adadelta_eps_decay(args.eps_decay),
                           trigger=CompareValueTrigger(
                               'validation/main/acc', lambda best_value,
                               current_value: best_value > current_value))
        elif args.criterion == 'loss':
            trainer.extend(restore_snapshot(model,
                                            args.outdir + '/model.loss.best',
                                            load_fn=torch_load),
                           trigger=CompareValueTrigger(
                               'validation/main/loss', lambda best_value,
                               current_value: best_value < current_value))
            trainer.extend(adadelta_eps_decay(args.eps_decay),
                           trigger=CompareValueTrigger(
                               'validation/main/loss', lambda best_value,
                               current_value: best_value < current_value))

    # Write a log of evaluation statistics for each epoch
    trainer.extend(
        extensions.LogReport(trigger=(args.report_interval_iters,
                                      'iteration')))
    report_keys = [
        'epoch', 'iteration', 'main/loss', 'main/loss_ctc', 'main/loss_att',
        'validation/main/loss', 'validation/main/loss_ctc',
        'validation/main/loss_att', 'main/acc', 'validation/main/acc',
        'main/cer_ctc', 'validation/main/cer_ctc', 'elapsed_time'
    ]
    if args.opt == 'adadelta':
        trainer.extend(extensions.observe_value(
            'eps', lambda trainer: trainer.updater.get_optimizer('main').
            param_groups[0]["eps"]),
            trigger=(args.report_interval_iters, 'iteration'))
        report_keys.append('eps')
    if args.report_cer:
        report_keys.append('validation/main/cer')
    if args.report_wer:
        report_keys.append('validation/main/wer')
    trainer.extend(extensions.PrintReport(report_keys),
                   trigger=(args.report_interval_iters, 'iteration'))

    trainer.extend(
        extensions.ProgressBar(update_interval=args.report_interval_iters))
    set_early_stop(trainer, args)

    if args.tensorboard_dir is not None and args.tensorboard_dir != "":
        trainer.extend(TensorboardLogger(SummaryWriter(args.tensorboard_dir),
                                         att_reporter),
                       trigger=(args.report_interval_iters, "iteration"))
    # Run the training
    trainer.run()
    check_early_stop(trainer, args.epochs)
def recog(args):
    """Decode with the given args.

    Loads the trained ASR model (plus optional character and word RNNLMs),
    decodes every utterance listed in ``args.recog_json`` either one by one
    (``args.batchsize == 0``) or in batches, and writes the results as JSON
    to ``args.result_label``.

    Args:
        args (namespace): The program arguments.

    Raises:
        ValueError: If the RNNLM config names a non-default ``model_module``.

    """
    set_deterministic_pytorch(args)
    model, train_args = load_trained_model(args.model)
    assert isinstance(model, ASRInterface)
    model.recog_args = args

    # read rnnlm
    if args.rnnlm:
        rnnlm_args = get_model_conf(args.rnnlm, args.rnnlm_conf)
        if getattr(rnnlm_args, "model_module", "default") != "default":
            raise ValueError(
                "use '--api v2' option to decode with non-default language model"
            )
        rnnlm = lm_pytorch.ClassifierWithState(
            lm_pytorch.RNNLM(
                len(train_args.char_list),
                rnnlm_args.layer,
                rnnlm_args.unit,
                getattr(rnnlm_args, "embed_unit", None),  # for backward compatibility
            )
        )
        torch_load(args.rnnlm, rnnlm)
        rnnlm.eval()
    else:
        rnnlm = None

    if args.word_rnnlm:
        rnnlm_args = get_model_conf(args.word_rnnlm, args.word_rnnlm_conf)
        word_dict = rnnlm_args.char_list_dict
        char_dict = {x: i for i, x in enumerate(train_args.char_list)}
        word_rnnlm = lm_pytorch.ClassifierWithState(
            lm_pytorch.RNNLM(len(word_dict), rnnlm_args.layer, rnnlm_args.unit)
        )
        torch_load(args.word_rnnlm, word_rnnlm)
        word_rnnlm.eval()

        # Combine with the character LM when both are given, otherwise use
        # the word LM alone through the look-ahead wrapper.
        if rnnlm is not None:
            rnnlm = lm_pytorch.ClassifierWithState(
                extlm_pytorch.MultiLevelLM(
                    word_rnnlm.predictor, rnnlm.predictor, word_dict, char_dict
                )
            )
        else:
            rnnlm = lm_pytorch.ClassifierWithState(
                extlm_pytorch.LookAheadWordLM(
                    word_rnnlm.predictor, word_dict, char_dict
                )
            )

    # gpu
    if args.ngpu == 1:
        gpu_id = list(range(args.ngpu))
        logging.info("gpu id: " + str(gpu_id))
        model.cuda()
        if rnnlm:
            rnnlm.cuda()

    # read json data
    with open(args.recog_json, "rb") as f:
        js = json.load(f)["utts"]
    new_js = {}

    load_inputs_and_targets = LoadInputsAndTargets(
        mode="asr",
        load_output=False,
        sort_in_input_length=False,
        preprocess_conf=train_args.preprocess_conf
        if args.preprocess_conf is None
        else args.preprocess_conf,
        preprocess_args={"train": False},
    )

    if args.batchsize == 0:
        with torch.no_grad():
            for idx, name in enumerate(js.keys(), 1):
                # The name is passed as a logging argument, not concatenated
                # into the format string, so a '%' in an utterance id cannot
                # break the record formatting.
                logging.info("(%d/%d) decoding %s", idx, len(js), name)
                batch = [(name, js[name])]
                feat = load_inputs_and_targets(batch)[0][0]
                nbest_hyps = model.recognize(feat, args, train_args.char_list, rnnlm)
                new_js[name] = add_results_to_json(
                    js[name], nbest_hyps, train_args.char_list
                )

    else:

        def grouper(n, iterable, fillvalue=None):
            # Yield tuples of n items, padding the last one with fillvalue.
            kargs = [iter(iterable)] * n
            return zip_longest(*kargs, fillvalue=fillvalue)

        # sort data if batchsize > 1
        keys = list(js.keys())
        if args.batchsize > 1:
            feat_lens = [js[key]["input"][0]["shape"][0] for key in keys]
            # Longest utterances first.
            sorted_index = sorted(range(len(feat_lens)), key=lambda i: -feat_lens[i])
            keys = [keys[i] for i in sorted_index]

        with torch.no_grad():
            for names in grouper(args.batchsize, keys, None):
                # Drop the padding entries of the last group.
                names = [name for name in names if name]
                batch = [(name, js[name]) for name in names]
                feats = load_inputs_and_targets(batch)[0]
                nbest_hyps = model.recognize_batch(
                    feats, args, train_args.char_list, rnnlm=rnnlm
                )

                for i, name in enumerate(names):
                    # Gather element i of every entry in nbest_hyps for
                    # this utterance.
                    nbest_hyp = [hyp[i] for hyp in nbest_hyps]
                    new_js[name] = add_results_to_json(
                        js[name], nbest_hyp, train_args.char_list
                    )

    with open(args.result_label, "wb") as f:
        f.write(
            json.dumps(
                {"utts": new_js}, indent=4, ensure_ascii=False, sort_keys=True
            ).encode("utf_8")
        )
def train(args):
    """Train with the given args.

    Builds the ASR model, writes its configuration to
    ``args.outdir/model.json``, sets up the optimizer, data loaders and
    trainer extensions, and runs the training loop.

    Args:
        args (namespace): The program arguments.

    Raises:
        NotImplementedError: If ``args.opt`` is not ``adadelta``, ``adam``
            or ``noam``.
        ImportError: If an apex opt level is requested through
            ``args.train_dtype`` but apex is not installed.

    """
    set_deterministic_pytorch(args)

    # check cuda availability
    if not torch.cuda.is_available():
        logging.warning("cuda is not available")

    # get input and output dimension info
    with open(args.valid_json, "rb") as f:
        valid_json = json.load(f)["utts"]
    utts = list(valid_json.keys())
    idim = int(valid_json[utts[0]]["input"][0]["shape"][-1])
    odim = int(valid_json[utts[0]]["output"][0]["shape"][-1])
    logging.info("#input dims : " + str(idim))
    logging.info("#output dims: " + str(odim))

    # specify attention, CTC, hybrid mode
    if args.mtlalpha == 1.0:
        mtl_mode = "ctc"
        logging.info("Pure CTC mode")
    elif args.mtlalpha == 0.0:
        mtl_mode = "att"
        logging.info("Pure attention mode")
    else:
        mtl_mode = "mtl"
        logging.info("Multitask learning mode")

    # specify model architecture
    model_class = dynamic_import(args.model_module)
    model = model_class(idim, odim, args)
    assert isinstance(model, ASRInterface)
    subsampling_factor = model.subsample[0]

    if args.rnnlm is not None:
        rnnlm_args = get_model_conf(args.rnnlm, args.rnnlm_conf)
        rnnlm = lm_pytorch.ClassifierWithState(
            lm_pytorch.RNNLM(
                len(args.char_list),
                rnnlm_args.layer,
                rnnlm_args.unit,
                getattr(rnnlm_args, "embed_unit", None),  # for backward compatibility
            )
        )
        # Use the project loader, which takes (path, model). The previous
        # torch.load(path, rnnlm) call passed the module as map_location and
        # discarded the result, so the weights never reached rnnlm.
        torch_load(args.rnnlm, rnnlm)
        model.rnnlm = rnnlm

    # write model config
    os.makedirs(args.outdir, exist_ok=True)
    model_conf = args.outdir + "/model.json"
    with open(model_conf, "wb") as f:
        logging.info("writing a model config file to " + model_conf)
        f.write(
            json.dumps(
                (idim, odim, vars(args)), indent=4, ensure_ascii=False, sort_keys=True
            ).encode("utf_8")
        )
    for key in sorted(vars(args).keys()):
        logging.info("ARGS: " + key + ": " + str(vars(args)[key]))

    reporter = model.reporter

    # check the use of multi-gpu
    if args.ngpu > 1:
        if args.batch_size != 0:
            logging.warning(
                "batch size is automatically increased (%d -> %d)"
                % (args.batch_size, args.batch_size * args.ngpu)
            )
            args.batch_size *= args.ngpu

    # set torch device
    device = torch.device("cuda" if args.ngpu > 0 else "cpu")
    if args.train_dtype in ("float16", "float32", "float64"):
        dtype = getattr(torch, args.train_dtype)
    else:
        # Any other train_dtype value (e.g. an apex opt level) uses float32.
        dtype = torch.float32
    model = model.to(device=device, dtype=dtype)

    # Count the parameters once and reuse the totals in the log message.
    n_params = sum(p.numel() for p in model.parameters())
    n_trained = sum(p.numel() for p in model.parameters() if p.requires_grad)
    logging.warning(
        "num. model params: {:,} (num. trained: {:,} ({:.1f}%))".format(
            n_params,
            n_trained,
            n_trained * 100.0 / n_params,
        )
    )

    # Setup an optimizer
    if args.opt == "adadelta":
        optimizer = torch.optim.Adadelta(
            model.parameters(), rho=0.95, eps=args.eps, weight_decay=args.weight_decay
        )
    elif args.opt == "adam":
        optimizer = torch.optim.Adam(model.parameters(), weight_decay=args.weight_decay)
    elif args.opt == "noam":
        from espnet.nets.pytorch_backend.transformer.optimizer import get_std_opt

        optimizer = get_std_opt(
            model.parameters(),
            args.adim,
            args.transformer_warmup_steps,
            args.transformer_lr,
        )
    else:
        raise NotImplementedError("unknown optimizer: " + args.opt)

    # setup apex.amp
    if args.train_dtype in ("O0", "O1", "O2", "O3"):
        try:
            from apex import amp
        except ImportError as e:
            logging.error(
                f"You need to install apex for --train-dtype {args.train_dtype}. "
                "See https://github.com/NVIDIA/apex#linux"
            )
            raise e
        if args.opt == "noam":
            # The noam optimizer object holds the torch optimizer in
            # its .optimizer attribute.
            model, optimizer.optimizer = amp.initialize(
                model, optimizer.optimizer, opt_level=args.train_dtype
            )
        else:
            model, optimizer = amp.initialize(
                model, optimizer, opt_level=args.train_dtype
            )
        use_apex = True
    else:
        use_apex = False

    # FIXME: TOO DIRTY HACK
    setattr(optimizer, "target", reporter)
    setattr(optimizer, "serialize", lambda s: reporter.serialize(s))

    # Setup a converter
    converter = CustomConverter(
        subsampling_factor=subsampling_factor, dtype=dtype, num_spkrs=args.num_spkrs
    )

    # read json data
    with open(args.train_json, "rb") as f:
        train_json = json.load(f)["utts"]
    with open(args.valid_json, "rb") as f:
        valid_json = json.load(f)["utts"]

    use_sortagrad = args.sortagrad == -1 or args.sortagrad > 0
    # make minibatch list (variable length)
    train = make_batchset(
        train_json,
        args.batch_size,
        args.maxlen_in,
        args.maxlen_out,
        args.minibatches,
        min_batch_size=args.ngpu if args.ngpu > 1 else 1,
        shortest_first=use_sortagrad,
        count=args.batch_count,
        batch_bins=args.batch_bins,
        batch_frames_in=args.batch_frames_in,
        batch_frames_out=args.batch_frames_out,
        batch_frames_inout=args.batch_frames_inout,
        iaxis=0,
        oaxis=-1,
    )
    valid = make_batchset(
        valid_json,
        args.batch_size,
        args.maxlen_in,
        args.maxlen_out,
        args.minibatches,
        min_batch_size=args.ngpu if args.ngpu > 1 else 1,
        count=args.batch_count,
        batch_bins=args.batch_bins,
        batch_frames_in=args.batch_frames_in,
        batch_frames_out=args.batch_frames_out,
        batch_frames_inout=args.batch_frames_inout,
        iaxis=0,
        oaxis=-1,
    )

    load_tr = LoadInputsAndTargets(
        mode="asr",
        load_output=True,
        preprocess_conf=args.preprocess_conf,
        preprocess_args={"train": True},  # Switch the mode of preprocessing
    )
    load_cv = LoadInputsAndTargets(
        mode="asr",
        load_output=True,
        preprocess_conf=args.preprocess_conf,
        preprocess_args={"train": False},  # Switch the mode of preprocessing
    )
    # hack to make batchsize argument as 1
    # actual batchsize is included in a list
    # default collate function converts numpy array to pytorch tensor
    # we used an empty collate function instead which returns list
    train_iter = {
        "main": ChainerDataLoader(
            dataset=TransformDataset(train, lambda data: converter([load_tr(data)])),
            batch_size=1,
            num_workers=args.n_iter_processes,
            shuffle=True,
            collate_fn=lambda x: x[0],
        )
    }
    valid_iter = {
        "main": ChainerDataLoader(
            dataset=TransformDataset(valid, lambda data: converter([load_cv(data)])),
            batch_size=1,
            shuffle=False,
            collate_fn=lambda x: x[0],
            num_workers=args.n_iter_processes,
        )
    }

    # Set up a trainer
    updater = CustomUpdater(
        model,
        args.grad_clip,
        train_iter,
        optimizer,
        device,
        args.ngpu,
        args.grad_noise,
        args.accum_grad,
        use_apex=use_apex,
    )
    trainer = training.Trainer(updater, (args.epochs, "epoch"), out=args.outdir)

    if use_sortagrad:
        # NOTE(review): train_iter is a dict here and its loader is already
        # built with shuffle=True — confirm ShufflingEnabler handles this.
        trainer.extend(
            ShufflingEnabler([train_iter]),
            trigger=(args.sortagrad if args.sortagrad != -1 else args.epochs, "epoch"),
        )

    # Resume from a snapshot
    if args.resume:
        logging.info("resumed from %s" % args.resume)
        torch_resume(args.resume, trainer)

    # Evaluate the model with the test dataset for each epoch
    trainer.extend(CustomEvaluator(model, valid_iter, reporter, device, args.ngpu))

    # Save attention weight each epoch
    if args.num_save_attention > 0 and args.mtlalpha != 1.0:
        data = sorted(
            list(valid_json.items())[: args.num_save_attention],
            key=lambda x: int(x[1]["input"][0]["shape"][1]),
            reverse=True,
        )
        if hasattr(model, "module"):
            att_vis_fn = model.module.calculate_all_attentions
            plot_class = model.module.attention_plot_class
        else:
            att_vis_fn = model.calculate_all_attentions
            plot_class = model.attention_plot_class
        att_reporter = plot_class(
            att_vis_fn,
            data,
            args.outdir + "/att_ws",
            converter=converter,
            transform=load_cv,
            device=device,
        )
        trainer.extend(att_reporter, trigger=(1, "epoch"))
    else:
        att_reporter = None

    # Make a plot for training and validation values
    trainer.extend(
        extensions.PlotReport(
            [
                "main/loss",
                "validation/main/loss",
                "main/loss_ctc",
                "validation/main/loss_ctc",
                "main/loss_att",
                "validation/main/loss_att",
            ],
            "epoch",
            file_name="loss.png",
        )
    )
    trainer.extend(
        extensions.PlotReport(
            ["main/acc", "validation/main/acc"], "epoch", file_name="acc.png"
        )
    )
    trainer.extend(
        extensions.PlotReport(
            ["main/cer_ctc", "validation/main/cer_ctc"], "epoch", file_name="cer.png"
        )
    )

    # Save best models
    trainer.extend(
        snapshot_object(model, "model.loss.best"),
        trigger=training.triggers.MinValueTrigger("validation/main/loss"),
    )
    if mtl_mode != "ctc":
        trainer.extend(
            snapshot_object(model, "model.acc.best"),
            trigger=training.triggers.MaxValueTrigger("validation/main/acc"),
        )

    # save snapshot which contains model and optimizer states
    trainer.extend(torch_snapshot(), trigger=(1, "epoch"))

    # epsilon decay in the optimizer
    if args.opt == "adadelta":
        if args.criterion == "acc" and mtl_mode != "ctc":
            trainer.extend(
                restore_snapshot(
                    model, args.outdir + "/model.acc.best", load_fn=torch_load
                ),
                trigger=CompareValueTrigger(
                    "validation/main/acc",
                    lambda best_value, current_value: best_value > current_value,
                ),
            )
            trainer.extend(
                adadelta_eps_decay(args.eps_decay),
                trigger=CompareValueTrigger(
                    "validation/main/acc",
                    lambda best_value, current_value: best_value > current_value,
                ),
            )
        elif args.criterion == "loss":
            trainer.extend(
                restore_snapshot(
                    model, args.outdir + "/model.loss.best", load_fn=torch_load
                ),
                trigger=CompareValueTrigger(
                    "validation/main/loss",
                    lambda best_value, current_value: best_value < current_value,
                ),
            )
            trainer.extend(
                adadelta_eps_decay(args.eps_decay),
                trigger=CompareValueTrigger(
                    "validation/main/loss",
                    lambda best_value, current_value: best_value < current_value,
                ),
            )

    # Write a log of evaluation statistics for each epoch
    trainer.extend(
        extensions.LogReport(trigger=(args.report_interval_iters, "iteration"))
    )
    report_keys = [
        "epoch",
        "iteration",
        "main/loss",
        "main/loss_ctc",
        "main/loss_att",
        "validation/main/loss",
        "validation/main/loss_ctc",
        "validation/main/loss_att",
        "main/acc",
        "validation/main/acc",
        "main/cer_ctc",
        "validation/main/cer_ctc",
        "elapsed_time",
    ]
    if args.opt == "adadelta":
        trainer.extend(
            extensions.observe_value(
                "eps",
                lambda trainer: trainer.updater.get_optimizer("main").param_groups[0][
                    "eps"
                ],
            ),
            trigger=(args.report_interval_iters, "iteration"),
        )
        report_keys.append("eps")
    if args.report_cer:
        report_keys.append("validation/main/cer")
    if args.report_wer:
        report_keys.append("validation/main/wer")
    trainer.extend(
        extensions.PrintReport(report_keys),
        trigger=(args.report_interval_iters, "iteration"),
    )

    trainer.extend(extensions.ProgressBar(update_interval=args.report_interval_iters))
    set_early_stop(trainer, args)

    if args.tensorboard_dir is not None and args.tensorboard_dir != "":
        from torch.utils.tensorboard import SummaryWriter

        trainer.extend(
            TensorboardLogger(SummaryWriter(args.tensorboard_dir), att_reporter),
            trigger=(args.report_interval_iters, "iteration"),
        )
    # Run the training
    trainer.run()
    check_early_stop(trainer, args.epochs)
def trans(args):
    """Decode every utterance listed in ``args.trans_json``.

    Loads the trained model from ``args.model`` (and, when ``args.rnnlm`` is
    set, an RNN language model), decodes each utterance with
    ``model.translate`` or ``model.translate_batch`` and writes the collected
    hypotheses to ``args.result_label`` as UTF-8 encoded JSON.

    Args:
        args (namespace): The program arguments.

    Raises:
        ValueError: If the language model config declares a non-default
            ``model_module``.

    """
    set_deterministic_pytorch(args)
    model, train_args = load_trained_model(args.model)
    assert isinstance(model, STInterface)
    model.trans_args = args

    # read rnnlm
    if args.rnnlm:
        rnnlm_args = get_model_conf(args.rnnlm, args.rnnlm_conf)
        if getattr(rnnlm_args, "model_module", "default") != "default":
            raise ValueError("use '--api v2' option to decode with non-default language model")
        rnnlm = lm_pytorch.ClassifierWithState(
            lm_pytorch.RNNLM(
                len(train_args.char_list),
                rnnlm_args.layer,
                rnnlm_args.unit,
                # Same construction as the other decoding/training entry
                # points: configs without ``embed_unit`` fall back to None.
                getattr(rnnlm_args, "embed_unit", None),  # for backward compatibility
            ))
        torch_load(args.rnnlm, rnnlm)
        rnnlm.eval()
    else:
        rnnlm = None

    # gpu
    if args.ngpu == 1:
        gpu_id = list(range(args.ngpu))
        logging.info('gpu id: ' + str(gpu_id))
        model.cuda()
        if rnnlm:
            rnnlm.cuda()

    # read json data
    with open(args.trans_json, 'rb') as f:
        js = json.load(f)['utts']
    new_js = {}

    # A preprocess config given on the command line overrides the training one.
    load_inputs_and_targets = LoadInputsAndTargets(
        mode='asr',
        load_output=False,
        sort_in_input_length=False,
        preprocess_conf=train_args.preprocess_conf
        if args.preprocess_conf is None else args.preprocess_conf,
        preprocess_args={'train': False})

    if args.batchsize == 0:
        num_utts = len(js)
        with torch.no_grad():
            for idx, name in enumerate(js, 1):
                # The utterance id is passed as an argument, never spliced
                # into the format string, so a '%' in the id cannot break
                # the logging call.
                logging.info('(%d/%d) decoding %s', idx, num_utts, name)
                batch = [(name, js[name])]
                input_feat = load_inputs_and_targets(batch)
                feat = input_feat[0][0]
                visual_feat = input_feat[1][0]
                nbest_hyps = model.translate(feat, visual_feat, args,
                                             train_args.char_list, rnnlm)
                new_js[name] = add_results_to_json(js[name], nbest_hyps,
                                                   train_args.char_list)
    else:
        def grouper(n, iterable, fillvalue=None):
            """Yield tuples of ``n`` items, padding the last with ``fillvalue``."""
            kargs = [iter(iterable)] * n
            return zip_longest(*kargs, fillvalue=fillvalue)

        # sort data if batchsize > 1 (longest input first)
        keys = list(js.keys())
        if args.batchsize > 1:
            feat_lens = [js[key]['input'][0]['shape'][0] for key in keys]
            sorted_index = sorted(range(len(feat_lens)), key=lambda i: -feat_lens[i])
            keys = [keys[i] for i in sorted_index]

        with torch.no_grad():
            for names in grouper(args.batchsize, keys, None):
                # drop the padding added by grouper to the last group
                names = [name for name in names if name]
                batch = [(name, js[name]) for name in names]
                input_feat = load_inputs_and_targets(batch)
                # NOTE(review): same [0][0] / [1][0] indexing as the
                # single-utterance path; confirm the loader returns the whole
                # batch at that position.
                feats = input_feat[0][0]
                visual_feat = input_feat[1][0]
                nbest_hyps = model.translate_batch(feats, visual_feat, args,
                                                   train_args.char_list,
                                                   rnnlm=rnnlm)
                for i, nbest_hyp in enumerate(nbest_hyps):
                    name = names[i]
                    new_js[name] = add_results_to_json(js[name], nbest_hyp,
                                                       train_args.char_list)

    with open(args.result_label, 'wb') as f:
        f.write(json.dumps({'utts': new_js}, indent=4, ensure_ascii=False,
                           sort_keys=True).encode('utf_8'))
def recog(args):
    """Decode every utterance listed in ``args.recog_json``.

    Loads the trained model from ``args.model`` (and, when ``args.rnnlm`` is
    set, an RNN language model), decodes the utterances one at a time and
    writes the results to ``args.result_label`` as UTF-8 encoded JSON.  With
    ``args.prefix_decode`` set, ``model.prefix_recognize`` is used instead of
    ``model.recognize``.

    Args:
        args (namespace): The program arguments.

    Raises:
        ValueError: If the language model config declares a non-default
            ``model_module``.

    """
    set_deterministic_pytorch(args)
    model, train_args = load_trained_model(args.model)
    model.recog_args = args

    # read rnnlm
    if args.rnnlm:
        rnnlm_args = get_model_conf(args.rnnlm, args.rnnlm_conf)
        if getattr(rnnlm_args, "model_module", "default") != "default":
            raise ValueError(
                "use '--api v2' option to decode with non-default language model"
            )
        rnnlm = lm_pytorch.ClassifierWithState(
            lm_pytorch.RNNLM(
                len(train_args.char_list),
                rnnlm_args.layer,
                rnnlm_args.unit,
                getattr(rnnlm_args, "embed_unit", None),  # for backward compatibility
            ))
        torch_load(args.rnnlm, rnnlm)
        rnnlm.eval()
    else:
        rnnlm = None

    # gpu
    if args.ngpu == 1:
        gpu_id = list(range(args.ngpu))
        logging.info("gpu id: " + str(gpu_id))
        model.cuda()
        if rnnlm:
            rnnlm.cuda()

    # read json data
    with open(args.recog_json, "rb") as f:
        js = json.load(f)["utts"]
    new_js = {}

    # A preprocess config given on the command line overrides the training one.
    load_inputs_and_targets = LoadInputsAndTargets(
        mode="asr",
        load_output=False,
        sort_in_input_length=False,
        preprocess_conf=train_args.preprocess_conf
        if args.preprocess_conf is None
        else args.preprocess_conf,
        preprocess_args={"train": False},
    )

    num_utts = len(js)
    with torch.no_grad():
        for idx, name in enumerate(js, 1):
            # The utterance id is passed as an argument, never spliced into
            # the format string, so a '%' in the id cannot break the call.
            logging.info("(%d/%d) decoding %s", idx, num_utts, name)
            batch = [(name, js[name])]
            feat = load_inputs_and_targets(batch)
            feat = feat[0][0]
            if args.prefix_decode:
                best, ids, score = model.prefix_recognize(
                    feat, args, train_args, train_args.char_list, rnnlm)
                new_js[name] = add_single_results(js[name], best, ids, score)
            else:
                nbest_hyps = model.recognize(feat, args, train_args.char_list,
                                             rnnlm)
                new_js[name] = add_results_to_json(js[name], nbest_hyps,
                                                   train_args.char_list)

    with open(args.result_label, "wb") as f:
        f.write(
            json.dumps({"utts": new_js},
                       indent=4,
                       ensure_ascii=False,
                       sort_keys=True).encode("utf_8"))
def _decode_segment_streaming(model, args, char_list, rnnlm, feat):
    """Decode one utterance with the segment-wise streaming recognizer.

    Feeds ``feat`` to a ``SegmentStreamingE2E`` in chunks of
    ``np.prod(model.subsample)`` frames.  Whenever the recognizer returns
    hypotheses, the best one is logged as text and all ``args.nbest``
    hypotheses are appended to the running result.

    Args:
        model: The trained E2E model.
        args (namespace): The decoding arguments (``nbest`` is read here).
        char_list (list): Token list used to render the logged text.
        rnnlm: Optional language model passed to the streaming recognizer.
        feat: Input features of a single utterance, indexed by frame.

    Returns:
        list: ``args.nbest`` dicts with accumulated ``"yseq"`` and ``"score"``.

    """
    nbest_hyps = [{"yseq": [], "score": 0.0} for _ in range(args.nbest)]
    se2e = SegmentStreamingE2E(e2e=model, recog_args=args, rnnlm=rnnlm)
    r = np.prod(model.subsample)
    for i in range(0, feat.shape[0], r):
        hyps = se2e.accept_input(feat[i:i + r])
        if hyps is None:
            continue
        # Skip the first and last ids of the best hypothesis and any -1 ids.
        text = "".join(
            char_list[int(x)] for x in hyps[0]["yseq"][1:-1] if int(x) != -1)
        text = text.replace("\u2581", " ").strip()  # for SentencePiece
        text = text.replace(model.space, " ")
        text = text.replace(model.blank, "")
        logging.info(text)
        for n in range(args.nbest):
            nbest_hyps[n]["yseq"].extend(hyps[n]["yseq"])
            nbest_hyps[n]["score"] += hyps[n]["score"]
    return nbest_hyps


def recog(args):
    """Decode every utterance listed in ``args.recog_json``.

    Loads the trained model from ``args.model`` plus optional character- and
    word-level RNN language models, decodes the utterances (one by one when
    ``args.batchsize == 0``, otherwise in groups of ``args.batchsize``) and
    writes the hypotheses to ``args.result_label`` as UTF-8 encoded JSON.
    With a single encoder, ``args.streaming_mode`` selects the ``"window"``
    or ``"segment"`` streaming recognizers.

    Args:
        args (namespace): The program arguments.

    Raises:
        NotImplementedError: For streaming with a transformer model, window
            streaming in batch mode, or segment streaming with a batch size
            above one.
        ValueError: If the language model config declares a non-default
            ``model_module``.

    """
    set_deterministic_pytorch(args)
    model, train_args = load_trained_model(args.model)
    assert isinstance(model, ASRInterface)
    model.recog_args = args

    if args.streaming_mode and "transformer" in train_args.model_module:
        raise NotImplementedError(
            "streaming mode for transformer is not implemented")

    # read rnnlm
    if args.rnnlm:
        rnnlm_args = get_model_conf(args.rnnlm, args.rnnlm_conf)
        if getattr(rnnlm_args, "model_module", "default") != "default":
            raise ValueError(
                "use '--api v2' option to decode with non-default language model"
            )
        rnnlm = lm_pytorch.ClassifierWithState(
            lm_pytorch.RNNLM(
                len(train_args.char_list),
                rnnlm_args.layer,
                rnnlm_args.unit,
                getattr(rnnlm_args, "embed_unit", None),  # for backward compatibility
            ))
        torch_load(args.rnnlm, rnnlm)
        rnnlm.eval()
    else:
        rnnlm = None

    if args.word_rnnlm:
        rnnlm_args = get_model_conf(args.word_rnnlm, args.word_rnnlm_conf)
        word_dict = rnnlm_args.char_list_dict
        char_dict = {x: i for i, x in enumerate(train_args.char_list)}
        word_rnnlm = lm_pytorch.ClassifierWithState(
            lm_pytorch.RNNLM(
                len(word_dict),
                rnnlm_args.layer,
                rnnlm_args.unit,
                getattr(rnnlm_args, "embed_unit", None),  # for backward compatibility
            ))
        torch_load(args.word_rnnlm, word_rnnlm)
        word_rnnlm.eval()

        # Combine with the character LM when both are given; otherwise use
        # the word LM alone through the look-ahead wrapper.
        if rnnlm is not None:
            rnnlm = lm_pytorch.ClassifierWithState(
                extlm_pytorch.MultiLevelLM(word_rnnlm.predictor,
                                           rnnlm.predictor, word_dict,
                                           char_dict))
        else:
            rnnlm = lm_pytorch.ClassifierWithState(
                extlm_pytorch.LookAheadWordLM(word_rnnlm.predictor, word_dict,
                                              char_dict))

    # gpu
    if args.ngpu == 1:
        gpu_id = list(range(args.ngpu))
        logging.info("gpu id: " + str(gpu_id))
        model.cuda()
        if rnnlm:
            rnnlm.cuda()

    # read json data
    with open(args.recog_json, "rb") as f:
        js = json.load(f)["utts"]
    new_js = {}

    # A preprocess config given on the command line overrides the training one.
    load_inputs_and_targets = LoadInputsAndTargets(
        mode="asr",
        load_output=False,
        sort_in_input_length=False,
        preprocess_conf=train_args.preprocess_conf
        if args.preprocess_conf is None
        else args.preprocess_conf,
        preprocess_args={"train": False},
    )

    if args.batchsize == 0:
        num_utts = len(js)
        with torch.no_grad():
            for idx, name in enumerate(js, 1):
                # The utterance id is passed as an argument, never spliced
                # into the format string, so a '%' in the id cannot break
                # the logging call.
                logging.info("(%d/%d) decoding %s", idx, num_utts, name)
                batch = [(name, js[name])]
                feat = load_inputs_and_targets(batch)
                feat = (feat[0][0] if args.num_encs == 1 else
                        [feat[enc][0] for enc in range(model.num_encs)])
                if args.streaming_mode == "window" and args.num_encs == 1:
                    logging.info(
                        "Using streaming recognizer with window size %d frames",
                        args.streaming_window,
                    )
                    se2e = WindowStreamingE2E(e2e=model,
                                              recog_args=args,
                                              rnnlm=rnnlm)
                    for i in range(0, feat.shape[0], args.streaming_window):
                        logging.info("Feeding frames %d - %d", i,
                                     i + args.streaming_window)
                        se2e.accept_input(feat[i:i + args.streaming_window])
                    logging.info("Running offline attention decoder")
                    se2e.decode_with_attention_offline()
                    logging.info("Offline attention decoder finished")
                    nbest_hyps = se2e.retrieve_recognition()
                elif args.streaming_mode == "segment" and args.num_encs == 1:
                    logging.info(
                        "Using streaming recognizer with threshold value %d",
                        args.streaming_min_blank_dur,
                    )
                    nbest_hyps = _decode_segment_streaming(
                        model, args, train_args.char_list, rnnlm, feat)
                else:
                    nbest_hyps = model.recognize(feat, args,
                                                 train_args.char_list, rnnlm)
                new_js[name] = add_results_to_json(js[name], nbest_hyps,
                                                   train_args.char_list)

    else:

        def grouper(n, iterable, fillvalue=None):
            """Yield tuples of ``n`` items, padding the last with ``fillvalue``."""
            kargs = [iter(iterable)] * n
            return zip_longest(*kargs, fillvalue=fillvalue)

        # sort data if batchsize > 1 (longest input first)
        keys = list(js.keys())
        if args.batchsize > 1:
            feat_lens = [js[key]["input"][0]["shape"][0] for key in keys]
            sorted_index = sorted(range(len(feat_lens)),
                                  key=lambda i: -feat_lens[i])
            keys = [keys[i] for i in sorted_index]

        with torch.no_grad():
            for names in grouper(args.batchsize, keys, None):
                # drop the padding added by grouper to the last group
                names = [name for name in names if name]
                batch = [(name, js[name]) for name in names]
                feats = (load_inputs_and_targets(batch)[0]
                         if args.num_encs == 1 else
                         load_inputs_and_targets(batch))
                if args.streaming_mode == "window" and args.num_encs == 1:
                    raise NotImplementedError
                elif args.streaming_mode == "segment" and args.num_encs == 1:
                    if args.batchsize > 1:
                        raise NotImplementedError
                    # Wrap the single utterance's result so it has the same
                    # one-entry-per-utterance layout the loop below expects.
                    nbest_hyps = [
                        _decode_segment_streaming(model, args,
                                                  train_args.char_list, rnnlm,
                                                  feats[0])
                    ]
                else:
                    nbest_hyps = model.recognize_batch(feats,
                                                       args,
                                                       train_args.char_list,
                                                       rnnlm=rnnlm)

                for i, nbest_hyp in enumerate(nbest_hyps):
                    name = names[i]
                    new_js[name] = add_results_to_json(js[name], nbest_hyp,
                                                       train_args.char_list)

    with open(args.result_label, "wb") as f:
        f.write(
            json.dumps({"utts": new_js},
                       indent=4,
                       ensure_ascii=False,
                       sort_keys=True).encode("utf_8"))
def train(args):
    """Train with the given args.

    Builds (or loads pre-trained modules for) an ``STInterface`` model, writes
    its configuration to ``<args.outdir>/model.json``, creates the optimizer,
    the training/validation iterators and a trainer with its reporting,
    snapshot and decay extensions, then runs the training loop.

    ``args`` is modified in place: ``one_to_many`` and ``langs_dict`` are set,
    and ``batch_size`` is multiplied by ``ngpu`` when more than one GPU is
    used and the batch size is non-zero.

    Args:
        args (namespace): The program arguments.

    Raises:
        NotImplementedError: If ``args.opt`` is not ``adadelta``, ``adam`` or
            ``noam``.
        ImportError: If an apex opt level is requested via
            ``args.train_dtype`` but apex cannot be imported.

    """
    set_deterministic_pytorch(args)

    # check cuda availability
    if not torch.cuda.is_available():
        logging.warning('cuda is not available')

    # get paths to data
    # lang_pairs is a comma-separated list of "<src>-<tgt>" strings; the source
    # language is taken from the first (sorted) pair only.
    lang_pairs = sorted(args.lang_pairs.split(','))
    args.one_to_many = True if len(lang_pairs) > 1 else False
    tgt_langs = sorted([p.split('-')[-1] for p in lang_pairs])
    src_lang = lang_pairs[0].split('-')[0]
    if args.one_to_many:
        # With several pairs, train_json / valid_json are directories and
        # every *.json file inside them is used, in sorted order.
        train_jpaths = [
            os.path.join(args.train_json, fname)
            for fname in sorted(os.listdir(args.train_json))
            if fname.endswith('.json')
        ]
        valid_jpaths = [
            os.path.join(args.valid_json, fname)
            for fname in sorted(os.listdir(args.valid_json))
            if fname.endswith('.json')
        ]
        all_langs = list(
            sorted(set([l for p in lang_pairs for l in p.split('-')])))
        # Map a "<2xx>" tag for every language to an integer id.
        args.langs_dict = {}
        offset = 2  # for <blank> and <unk>
        for i, lang in enumerate(all_langs):
            args.langs_dict[f'<2{lang}>'] = offset + i
        logging.info(f'| train_jpaths: {train_jpaths}')
        logging.info(f'| valid_jpaths: {valid_jpaths}')
        logging.info(f'| lang_pairs : {lang_pairs}')
        logging.info(f'| langs_dict : {args.langs_dict}')
    else:
        train_jpaths = [args.train_json]
        valid_jpaths = [args.valid_json]
        args.langs_dict = None

    # get input and output dimension info
    # idim must agree across all validation files; odim is the maximum found.
    idim = 0
    odim = 0
    for i, jpath in enumerate(valid_jpaths):
        with open(jpath, 'rb') as f:
            valid_json = json.load(f)['utts']
        utts = list(valid_json.keys())
        idim_tmp = int(valid_json[utts[0]]['input'][0]['shape'][-1])
        odim_tmp = int(valid_json[utts[0]]['output'][0]['shape'][-1])
        # NOTE(review): lang_pairs[i] assumes the sorted JSON files line up
        # one-to-one with the sorted language pairs - confirm.
        logging.info('| pair {}: idim={}, odim={}'.format(
            lang_pairs[i], idim_tmp, odim_tmp))
        if idim == 0:
            idim = idim_tmp
        else:
            assert idim == idim_tmp
        if odim < odim_tmp:
            odim = odim_tmp
    logging.info('#input dims : ' + str(idim))
    logging.info('#output dims: ' + str(odim))

    # Initialize with pre-trained ASR encoder and MT decoder
    if args.enc_init is not None or args.dec_init is not None:
        logging.info('Loading pretrained ASR encoder and/or MT decoder ...')
        model = load_trained_modules(idim, odim, args, interface=STInterface)
        logging.info(f'*** Model *** \n {model}')
    else:
        model_class = dynamic_import(args.model_module)
        model = model_class(idim, odim, args)
        logging.info(f'*** Model *** \n {model}')
    assert isinstance(model, STInterface)
    logging.info(
        f'| Number of model parameters: {sum(p.numel() for p in model.parameters() if p.requires_grad)}'
    )

    subsampling_factor = model.subsample[0]
    logging.info(f'subsampling_factor={subsampling_factor}')

    # Optionally attach a pre-trained RNN language model to the model.
    if args.rnnlm is not None:
        rnnlm_args = get_model_conf(args.rnnlm, args.rnnlm_conf)
        rnnlm = lm_pytorch.ClassifierWithState(
            lm_pytorch.RNNLM(
                len(args.char_list),
                rnnlm_args.layer,
                rnnlm_args.unit,
                getattr(rnnlm_args, "embed_unit", None),  # for backward compatibility
            ))
        torch_load(args.rnnlm, rnnlm)
        model.rnnlm = rnnlm

    # write model config
    if not os.path.exists(args.outdir):
        os.makedirs(args.outdir)
    model_conf = args.outdir + '/model.json'
    with open(model_conf, 'wb') as f:
        logging.info('writing a model config file to ' + model_conf)
        f.write(
            json.dumps((idim, odim, vars(args)),
                       indent=4,
                       ensure_ascii=False,
                       sort_keys=True).encode('utf_8'))
    for key in sorted(vars(args).keys()):
        logging.info('ARGS: ' + key + ': ' + str(vars(args)[key]))

    reporter = model.reporter

    # check the use of multi-gpu
    if args.ngpu > 1:
        if args.batch_size != 0:
            logging.warning(
                'batch size is automatically increased (%d -> %d)' %
                (args.batch_size, args.batch_size * args.ngpu))
            args.batch_size *= args.ngpu

    # set torch device
    device = torch.device("cuda" if args.ngpu > 0 else "cpu")
    # Any train_dtype other than the three float names (e.g. an apex opt
    # level) keeps the model in float32 here.
    if args.train_dtype in ("float16", "float32", "float64"):
        dtype = getattr(torch, args.train_dtype)
    else:
        dtype = torch.float32
    model = model.to(device=device, dtype=dtype)

    # Setup an optimizer
    if args.opt == 'adadelta':
        optimizer = torch.optim.Adadelta(model.parameters(),
                                         rho=0.95,
                                         eps=args.eps,
                                         weight_decay=args.weight_decay)
    elif args.opt == 'adam':
        optimizer = torch.optim.Adam(model.parameters(),
                                     lr=args.lr,
                                     weight_decay=args.weight_decay)
    elif args.opt == 'noam':
        from espnet.nets.pytorch_backend.transformer.optimizer import get_std_opt
        optimizer = get_std_opt(model, args.adim,
                                args.transformer_warmup_steps,
                                args.transformer_lr)
    else:
        raise NotImplementedError("unknown optimizer: " + args.opt)

    # setup apex.amp
    if args.train_dtype in ("O0", "O1", "O2", "O3"):
        try:
            from apex import amp
        except ImportError as e:
            logging.error(
                f"You need to install apex for --train-dtype {args.train_dtype}. "
                "See https://github.com/NVIDIA/apex#linux")
            raise e
        # The noam optimizer is a wrapper: amp is applied to the torch
        # optimizer it holds in ``.optimizer``.
        if args.opt == 'noam':
            model, optimizer.optimizer = amp.initialize(
                model, optimizer.optimizer, opt_level=args.train_dtype)
        else:
            model, optimizer = amp.initialize(model,
                                              optimizer,
                                              opt_level=args.train_dtype)
        use_apex = True
    else:
        use_apex = False

    # FIXME: TOO DIRTY HACK
    # Expose the reporter as ``optimizer.target`` and forward ``serialize`` to
    # it (presumably so the chainer-style trainer can treat the torch
    # optimizer like a chainer one - confirm).
    setattr(optimizer, "target", reporter)
    setattr(optimizer, "serialize", lambda s: reporter.serialize(s))

    use_sortagrad = args.sortagrad == -1 or args.sortagrad > 0
    logging.info(f'use_sortagrad: {use_sortagrad}')

    # read json data
    num_langs = len(tgt_langs)
    train_all_pairs = [None] * num_langs
    valid_all_pairs = [None] * num_langs
    # With several target languages each per-pair minibatch gets an equal
    # share of the batch size, because the per-pair minibatches are merged
    # below.
    batch_size = args.batch_size // num_langs if num_langs > 1 else args.batch_size
    for i, jpath in enumerate(train_jpaths):
        with open(jpath, 'rb') as f:
            train_json = json.load(f)['utts']
        train_all_pairs[i] = make_batchset(
            train_json,
            batch_size,
            args.maxlen_in,
            args.maxlen_out,
            args.minibatches,
            min_batch_size=1,
            shortest_first=use_sortagrad,
            count=args.batch_count,
            batch_bins=args.batch_bins,
            batch_frames_in=args.batch_frames_in,
            batch_frames_out=args.batch_frames_out,
            batch_frames_inout=args.batch_frames_inout)
    for i, jpath in enumerate(valid_jpaths):
        with open(jpath, 'rb') as f:
            valid_json = json.load(f)['utts']
        valid_all_pairs[i] = make_batchset(
            valid_json,
            batch_size,
            args.maxlen_in,
            args.maxlen_out,
            args.minibatches,
            min_batch_size=1,
            count=args.batch_count,
            batch_bins=args.batch_bins,
            batch_frames_in=args.batch_frames_in,
            batch_frames_out=args.batch_frames_out,
            batch_frames_inout=args.batch_frames_inout)

    # NOTE: the locals ``train`` and ``valid`` below hold the final minibatch
    # lists; ``train`` shadows this function's own name inside its body.
    if num_langs > 1:
        # Merge the i-th minibatch of every language pair into one minibatch.
        # Shorter per-pair lists are cycled, so the merged list is as long as
        # the longest per-pair list.
        cycle_train = [cycle(x) for x in train_all_pairs]
        cycle_valid = [cycle(x) for x in valid_all_pairs]
        num_batches_train = max(len(i) for i in train_all_pairs)
        num_batches_valid = max(len(i) for i in valid_all_pairs)
        train = [None] * num_batches_train
        valid = [None] * num_batches_valid
        for i, s in enumerate(zip(*cycle_train)):
            x = []
            for y in s:
                x.extend(y)
            train[i] = x
            # the cycled iterators never end, so stop explicitly
            if i >= num_batches_train - 1:
                break
        for i, s in enumerate(zip(*cycle_valid)):
            x = []
            for y in s:
                x.extend(y)
            valid[i] = x
            if i >= num_batches_valid - 1:
                break
    else:
        train = train_all_pairs[0]
        valid = valid_all_pairs[0]

    # Loaders for training and validation differ only in the 'train' flag
    # passed to the preprocessing.
    load_tr = LoadInputsAndTargets(mode='asr',
                                   load_output=True,
                                   preprocess_conf=args.preprocess_conf,
                                   preprocess_args={'train': True},
                                   langs_dict=args.langs_dict,
                                   src_lang=src_lang)
    load_cv = LoadInputsAndTargets(mode='asr',
                                   load_output=True,
                                   preprocess_conf=args.preprocess_conf,
                                   preprocess_args={'train': False},
                                   langs_dict=args.langs_dict,
                                   src_lang=src_lang)

    # Setup a converter
    converter = CustomConverter(subsampling_factor=subsampling_factor,
                                dtype=dtype,
                                asr_task=args.asr_weight > 0)

    # hack to make batchsize argument as 1
    # actual batch size is included in a list
    # default collate function converts numpy array to pytorch tensor
    # we used an empty collate function instead which returns list
    # A negative n_iter_processes means "use every CPU"; a positive value is
    # capped at the CPU count; zero is passed through unchanged.
    n_iter_processes = args.n_iter_processes
    if n_iter_processes < 0:
        n_iter_processes = multiprocessing.cpu_count()
    elif n_iter_processes > 0:
        n_iter_processes = min(n_iter_processes, multiprocessing.cpu_count())
    print(f'n_iter_processes = {n_iter_processes}')

    train_iter = {
        'main':
        ChainerDataLoader(dataset=TransformDataset(
            train, lambda data: converter([load_tr(data)])),
                          batch_size=1,
                          num_workers=n_iter_processes,
                          shuffle=not use_sortagrad,
                          collate_fn=lambda x: x[0],
                          pin_memory=False)
    }
    valid_iter = {
        'main':
        ChainerDataLoader(dataset=TransformDataset(
            valid, lambda data: converter([load_cv(data)])),
                          batch_size=1,
                          shuffle=False,
                          collate_fn=lambda x: x[0],
                          num_workers=n_iter_processes,
                          pin_memory=False)
    }

    # Set up a trainer
    updater = CustomUpdater(model,
                            args.grad_clip,
                            train_iter,
                            optimizer,
                            device,
                            args.ngpu,
                            args.grad_noise,
                            args.accum_grad,
                            use_apex=use_apex)
    # The trainer is given a TimeLimitTrigger built from args where a fixed
    # (args.epochs, 'epoch') tuple was used previously.
    time_limit_trigger = TimeLimitTrigger(args)
    trainer = training.Trainer(updater, time_limit_trigger, out=args.outdir)
    logging.info(f'updater: {updater}')
    logging.info(f'trainer: {trainer}')

    if use_sortagrad:
        logging.info(f'use_sortagrad ...')
        # sortagrad == -1 postpones the ShufflingEnabler until args.epochs
        trainer.extend(
            ShufflingEnabler([train_iter]),
            trigger=(args.sortagrad if args.sortagrad != -1 else args.epochs, 'epoch'))

    # Evaluate the model with the test dataset for each epoch
    # (or every save_interval_iters iterations when that is positive)
    if args.save_interval_iters > 0:
        trainer.extend(CustomEvaluator(model, valid_iter, reporter, device, args.ngpu),
                       trigger=(args.save_interval_iters, 'iteration'))
    else:
        trainer.extend(
            CustomEvaluator(model, valid_iter, reporter, device, args.ngpu))

    # Save attention weight each epoch
    if args.num_save_attention > 0:
        # NOTE(review): valid_json is whatever the last iteration of the
        # validation loading loop left behind, so with several language pairs
        # only the last file's utterances are plotted - confirm intended.
        data = sorted(list(valid_json.items())[:args.num_save_attention],
                      key=lambda x: int(x[1]['input'][0]['shape'][1]),
                      reverse=True)
        if hasattr(model, "module"):
            att_vis_fn = model.module.calculate_all_attentions
            plot_class = model.module.attention_plot_class
        else:
            att_vis_fn = model.calculate_all_attentions
            plot_class = model.attention_plot_class
        att_reporter = plot_class(att_vis_fn,
                                  data,
                                  args.outdir + "/att_ws",
                                  converter=converter,
                                  transform=load_cv,
                                  device=device)
        trainer.extend(att_reporter, trigger=(1, 'epoch'))
    else:
        att_reporter = None

    # Make a plot for training and validation values
    trainer.extend(
        extensions.PlotReport([
            'main/loss', 'validation/main/loss', 'main/loss_asr',
            'validation/main/loss_asr', 'main/loss_st',
            'validation/main/loss_st'
        ],
                              'epoch',
                              file_name='loss.png'))
    trainer.extend(
        extensions.PlotReport([
            'main/acc', 'validation/main/acc', 'main/acc_asr',
            'validation/main/acc_asr'
        ],
                              'epoch',
                              file_name='acc.png'))
    trainer.extend(
        extensions.PlotReport(['main/bleu', 'validation/main/bleu'],
                              'epoch',
                              file_name='bleu.png'))

    # Save best models
    # (checked every report_interval_iters iterations when that is positive,
    # otherwise with the triggers' default interval)
    if args.report_interval_iters > 0:
        trainer.extend(snapshot_object(model, 'model.loss.best'),
                       trigger=MinValueTrigger(
                           'validation/main/loss',
                           trigger=(args.report_interval_iters, 'iteration'),
                           best_value=None))
        trainer.extend(snapshot_object(model, 'model.acc.best'),
                       trigger=MaxValueTrigger(
                           'validation/main/acc',
                           trigger=(args.report_interval_iters, 'iteration'),
                           best_value=None))
    else:
        trainer.extend(snapshot_object(model, 'model.loss.best'),
                       trigger=MinValueTrigger('validation/main/loss', best_value=None))
        trainer.extend(snapshot_object(model, 'model.acc.best'),
                       trigger=MaxValueTrigger('validation/main/acc', best_value=None))

    # save snapshot which contains model and optimizer states
    if args.save_interval_iters > 0:
        trainer.extend(
            torch_snapshot(filename='snapshot.iter.{.updater.iteration}'),
            trigger=(args.save_interval_iters, 'iteration'))
    else:
        trainer.extend(torch_snapshot(), trigger=(1, 'epoch'))

    # epsilon decay in the optimizer
    # When the watched validation value gets worse than the best seen so far,
    # the best model is restored and eps (adadelta) or lr (adam) is decayed.
    if args.opt == 'adadelta':
        if args.criterion == 'acc':
            trainer.extend(restore_snapshot(model,
                                            args.outdir + '/model.acc.best',
                                            load_fn=torch_load),
                           trigger=CompareValueTrigger(
                               'validation/main/acc', lambda best_value,
                               current_value: best_value > current_value))
            trainer.extend(adadelta_eps_decay(args.eps_decay),
                           trigger=CompareValueTrigger(
                               'validation/main/acc', lambda best_value,
                               current_value: best_value > current_value))
        elif args.criterion == 'loss':
            trainer.extend(restore_snapshot(model,
                                            args.outdir + '/model.loss.best',
                                            load_fn=torch_load),
                           trigger=CompareValueTrigger(
                               'validation/main/loss', lambda best_value,
                               current_value: best_value < current_value))
            trainer.extend(adadelta_eps_decay(args.eps_decay),
                           trigger=CompareValueTrigger(
                               'validation/main/loss', lambda best_value,
                               current_value: best_value < current_value))
    elif args.opt == 'adam':
        if args.criterion == 'acc':
            trainer.extend(restore_snapshot(model,
                                            args.outdir + '/model.acc.best',
                                            load_fn=torch_load),
                           trigger=CompareValueTrigger(
                               'validation/main/acc', lambda best_value,
                               current_value: best_value > current_value))
            trainer.extend(adam_lr_decay(args.lr_decay),
                           trigger=CompareValueTrigger(
                               'validation/main/acc', lambda best_value,
                               current_value: best_value > current_value))
        elif args.criterion == 'loss':
            trainer.extend(restore_snapshot(model,
                                            args.outdir + '/model.loss.best',
                                            load_fn=torch_load),
                           trigger=CompareValueTrigger(
                               'validation/main/loss', lambda best_value,
                               current_value: best_value < current_value))
            trainer.extend(adam_lr_decay(args.lr_decay),
                           trigger=CompareValueTrigger(
                               'validation/main/loss', lambda best_value,
                               current_value: best_value < current_value))

    # Write a log of evaluation statistics for each epoch
    trainer.extend(
        extensions.LogReport(trigger=(args.report_interval_iters, 'iteration')))
    report_keys = [
        'epoch', 'iteration', 'main/loss', 'main/loss_st', 'main/loss_asr',
        'validation/main/loss', 'validation/main/loss_st',
        'validation/main/loss_asr', 'main/acc', 'validation/main/acc'
    ]
    if args.asr_weight > 0:
        report_keys.append('main/acc_asr')
        report_keys.append('validation/main/acc_asr')
    report_keys += ['elapsed_time']
    # Also report the optimizer hyper-parameter of the first parameter group.
    if args.opt == 'adadelta':
        trainer.extend(extensions.observe_value(
            'eps', lambda trainer: trainer.updater.get_optimizer('main').
            param_groups[0]["eps"]),
                       trigger=(args.report_interval_iters, 'iteration'))
        report_keys.append('eps')
    elif args.opt in ['adam', 'noam']:
        trainer.extend(extensions.observe_value(
            'lr', lambda trainer: trainer.updater.get_optimizer('main').
            param_groups[0]["lr"]),
                       trigger=(args.report_interval_iters, 'iteration'))
        report_keys.append('lr')
    if args.asr_weight > 0:
        if args.mtlalpha > 0:
            report_keys.append('main/cer_ctc')
            report_keys.append('validation/main/cer_ctc')
        if args.mtlalpha < 1:
            if args.report_cer:
                report_keys.append('validation/main/cer')
            if args.report_wer:
                report_keys.append('validation/main/wer')
    if args.report_bleu:
        report_keys.append('validation/main/bleu')
    trainer.extend(extensions.PrintReport(report_keys),
                   trigger=(args.report_interval_iters, 'iteration'))

    trainer.extend(
        extensions.ProgressBar(update_interval=args.report_interval_iters))
    set_early_stop(trainer, args)

    if args.tensorboard_dir is not None and args.tensorboard_dir != "":
        trainer.extend(TensorboardLogger(SummaryWriter(args.tensorboard_dir), att_reporter),
                       trigger=(args.report_interval_iters, "iteration"))

    # Resume from a snapshot
    # (done after all extensions are registered)
    if args.resume:
        logging.info('resumed from %s' % args.resume)
        torch_resume(args.resume, trainer)

    # Run the training
    trainer.run()
    check_early_stop(trainer, args.epochs)
def save_alignment(args):
    """Compute and save one alignment matrix per utterance in ``args.json``.

    Loads the trained model from ``args.model`` (plus the optional character-
    and word-level RNN language models), runs ``model.calculate_alignments``
    on groups of ``args.batchsize`` utterances and, for every utterance,
    stores the exponentiated, transposed alignment as ``<name>.npy`` and as a
    ``<name>.png`` image under ``args.outdir``.

    Args:
        args (namespace): The program arguments.

    Raises:
        ValueError: If the language model config declares a non-default
            ``model_module``.

    """
    set_deterministic_pytorch(args)
    model, train_args = load_trained_model(args.model)
    assert isinstance(model, ASRInterface)
    model.recog_args = args

    # character-level language model (optional)
    rnnlm = None
    if args.rnnlm:
        rnnlm_args = get_model_conf(args.rnnlm, args.rnnlm_conf)
        if getattr(rnnlm_args, "model_module", "default") != "default":
            raise ValueError("use '--api v2' option to decode with non-default language model")
        char_predictor = lm_pytorch.RNNLM(
            len(train_args.char_list), rnnlm_args.layer, rnnlm_args.unit)
        rnnlm = lm_pytorch.ClassifierWithState(char_predictor)
        torch_load(args.rnnlm, rnnlm)
        rnnlm.eval()

    # word-level language model (optional), combined with the character one
    if args.word_rnnlm:
        rnnlm_args = get_model_conf(args.word_rnnlm, args.word_rnnlm_conf)
        word_dict = rnnlm_args.char_list_dict
        char_dict = {token: idx for idx, token in enumerate(train_args.char_list)}
        word_predictor = lm_pytorch.RNNLM(
            len(word_dict), rnnlm_args.layer, rnnlm_args.unit)
        word_rnnlm = lm_pytorch.ClassifierWithState(word_predictor)
        torch_load(args.word_rnnlm, word_rnnlm)
        word_rnnlm.eval()

        if rnnlm is None:
            combined = extlm_pytorch.LookAheadWordLM(
                word_rnnlm.predictor, word_dict, char_dict)
        else:
            combined = extlm_pytorch.MultiLevelLM(
                word_rnnlm.predictor, rnnlm.predictor, word_dict, char_dict)
        rnnlm = lm_pytorch.ClassifierWithState(combined)

    # move everything to the target device, keeping the model's own dtype
    device = torch.device("cuda" if args.ngpu > 0 else "cpu")
    dtype = next(model.parameters()).dtype
    model = model.to(device=device)
    if rnnlm:
        rnnlm = rnnlm.to(device=device)

    # utterance table
    with open(args.json, 'rb') as json_file:
        js = json.load(json_file)['utts']

    if args.preprocess_conf is None:
        preprocess_conf = train_args.preprocess_conf
    else:
        preprocess_conf = args.preprocess_conf
    load_inputs_and_targets = LoadInputsAndTargets(
        mode='asr',
        load_output=True,
        sort_in_input_length=False,
        preprocess_conf=preprocess_conf,
        preprocess_args={'train': False})

    # longest input first when real batches are formed (stable sort)
    keys = list(js.keys())
    if args.batchsize > 1:
        keys = sorted(keys, key=lambda key: -js[key]['input'][0]['shape'][0])

    def grouper(n, iterable, fillvalue=None):
        """Yield tuples of ``n`` items, padding the last with ``fillvalue``."""
        shared_iter = iter(iterable)
        return zip_longest(*([shared_iter] * n), fillvalue=fillvalue)

    # batch converter for single- or multi-encoder models
    converter = (
        CustomConverter(subsampling_factor=model.subsample[0], dtype=dtype)
        if args.num_encs == 1 else
        CustomConverterMulEnc([sub[0] for sub in model.subsample_list],
                              dtype=dtype))

    import matplotlib.pyplot as plt

    outdir = args.outdir
    if not os.path.exists(outdir):
        os.makedirs(outdir)

    with torch.no_grad():
        for group in grouper(args.batchsize, keys, None):
            # drop the padding added by grouper to the last group
            names = [name for name in group if name]
            batch = [(name, js[name]) for name in names]
            model_inputs = converter([load_inputs_and_targets(batch)], device)
            alignments = model.calculate_alignments(*model_inputs)
            for i, raw in enumerate(alignments):
                utt_name = names[i]
                alignment = np.transpose(np.exp(raw.astype(np.float32)))

                np.save("%s/%s.npy" % (outdir, utt_name), alignment)

                plt.imshow(alignment, aspect="auto")
                plt.xlabel("Input Index")
                plt.ylabel("Label Index")
                plt.tight_layout()
                plt.savefig("%s/%s.png" % (outdir, utt_name))
                plt.close()
def _load_or_create_split(json_path, utts, ratio):
    """Return the (labeled, unlabeled) utterance dicts for ``json_path``.

    The split files are named by replacing ``.json`` in ``json_path`` with
    ``_labeled_<P>.json`` and ``_unlabeled_<100-P>.json`` where
    ``P = int(ratio * 100)``. If the labeled file already exists both files
    are loaded; otherwise the first ``int(len(utts) * ratio)`` items of
    ``utts`` become the labeled set, the rest the unlabeled set, and both
    are written to disk.
    """
    percent = int(ratio * 100)
    labeled_path = json_path.replace('.json', '_labeled_{}.json'.format(percent))
    unlabeled_path = json_path.replace(
        '.json', '_unlabeled_{}.json'.format(100 - percent))

    if os.path.exists(labeled_path):
        with open(labeled_path, 'rb') as f:
            labeled = json.load(f)['utts']
        with open(unlabeled_path, 'rb') as f:
            unlabeled = json.load(f)['utts']
        return labeled, unlabeled

    # split json for each task
    split_point = int(len(utts) * ratio)
    items = list(utts.items())
    labeled = dict(items[:split_point])
    unlabeled = dict(items[split_point:])
    for path, subset in ((labeled_path, labeled), (unlabeled_path, unlabeled)):
        with codecs.open(path, 'w+', encoding='utf8') as f:
            json.dump({'utts': subset}, f, indent=4, sort_keys=True,
                      ensure_ascii=False, separators=(',', ': '))
    return labeled, unlabeled


def _extend_decay_on_plateau(trainer, model, args, make_decay, decay_rate, acc_key):
    """Attach restore-best-snapshot and decay extensions for ``args.criterion``.

    For criterion "acc" the extensions fire when ``acc_key`` drops below its
    best value; for criterion "loss" when "validation/main/loss" rises above
    its best value. Any other criterion attaches nothing.
    """
    if args.criterion == "acc":
        best_model = args.outdir + "/model.acc.best"
        key = acc_key

        def got_worse(best_value, current_value):
            return best_value > current_value
    elif args.criterion == "loss":
        best_model = args.outdir + "/model.loss.best"
        key = "validation/main/loss"

        def got_worse(best_value, current_value):
            return best_value < current_value
    else:
        return

    # each extension gets its own trigger instance
    trainer.extend(
        restore_snapshot(model, best_model, load_fn=torch_load),
        trigger=CompareValueTrigger(key, got_worse),
    )
    trainer.extend(
        make_decay(decay_rate),
        trigger=CompareValueTrigger(key, got_worse),
    )


def train(args):
    """Train with the given args.

    Args:
        args (namespace): The program arguments.

    Raises:
        NotImplementedError: For multi-GPU with multiple encoders, or an
            unknown optimizer name.

    """
    set_deterministic_pytorch(args)
    if args.num_encs > 1:
        args = format_mulenc_args(args)

    # check cuda availability
    if not torch.cuda.is_available():
        logging.warning("cuda is not available")

    # get input and output dimension info
    with open(args.valid_json, "rb") as f:
        valid_json = json.load(f)["utts"]
    utts = list(valid_json.keys())
    idim_list = [
        int(valid_json[utts[0]]["input"][i]["shape"][-1])
        for i in range(args.num_encs)
    ]
    odim = int(valid_json[utts[0]]["output"][0]["shape"][-1])
    for i in range(args.num_encs):
        logging.info("stream{}: input dims : {}".format(i + 1, idim_list[i]))
    logging.info("#output dims: " + str(odim))

    # specify semi-supervised method
    assert 0.0 <= args.mixup_alpha <= 1.0, "mixup-alpha should be [0.0, 1.0]"
    if args.mixup_alpha == 0.0:
        logging.info("Pure Mean-Teacher mode")
    else:
        logging.info("Interpolation Consistency Training mode")

    if (args.enc_init is not None or args.dec_init is not None) and args.num_encs == 1:
        model = load_trained_modules(idim_list[0], odim, args)
    else:
        model_class = dynamic_import(args.model_module)
        model = model_class(idim_list[0] if args.num_encs == 1 else idim_list,
                            odim, args)
    assert isinstance(model, ASRInterface)

    if args.rnnlm is not None:
        rnnlm_args = get_model_conf(args.rnnlm, args.rnnlm_conf)
        rnnlm = lm_pytorch.ClassifierWithState(
            lm_pytorch.RNNLM(len(args.char_list), rnnlm_args.layer,
                             rnnlm_args.unit))
        torch_load(args.rnnlm, rnnlm)
        model.rnnlm = rnnlm

    # write model config
    # exist_ok avoids the check-then-create race between concurrent jobs
    os.makedirs(args.outdir, exist_ok=True)
    model_conf = args.outdir + "/model.json"
    with open(model_conf, "wb") as f:
        logging.info("writing a model config file to " + model_conf)
        f.write(
            json.dumps(
                (idim_list[0] if args.num_encs == 1 else idim_list, odim,
                 vars(args)),
                indent=4,
                ensure_ascii=False,
                sort_keys=True,
            ).encode("utf_8"))
    for key in sorted(vars(args).keys()):
        logging.info("ARGS: " + key + ": " + str(vars(args)[key]))

    reporter = model.reporter

    # check the use of multi-gpu
    if args.ngpu > 1:
        if args.batch_size != 0:
            logging.warning(
                "batch size is automatically increased (%d -> %d)" %
                (args.batch_size, args.batch_size * args.ngpu))
            args.batch_size *= args.ngpu
        if args.num_encs > 1:
            # TODO(ruizhili): implement data parallel for multi-encoder setup.
            raise NotImplementedError(
                "Data parallel is not supported for multi-encoder setup.")

    # set torch device
    device = torch.device("cuda" if args.ngpu > 0 else "cpu")
    if args.train_dtype in ("float16", "float32", "float64"):
        dtype = getattr(torch, args.train_dtype)
    else:
        dtype = torch.float32
    model = model.to(device=device, dtype=dtype)

    # Setup an optimizer
    # NOTE(review): only model.enc parameters are optimized -- confirm the
    # remaining parameters are meant to be updated elsewhere.
    if args.opt == "adadelta":
        optimizer = torch.optim.Adadelta(model.enc.parameters(),
                                         rho=0.95,
                                         eps=args.eps,
                                         weight_decay=args.weight_decay)
    elif args.opt == "adam":
        optimizer = torch.optim.Adam(model.enc.parameters(),
                                     weight_decay=args.weight_decay)
    elif args.opt == "noam":
        from espnet.nets.pytorch_backend.transformer.optimizer import get_std_opt

        optimizer = get_std_opt(model.enc, args.adim,
                                args.transformer_warmup_steps,
                                args.transformer_lr)
    elif args.opt == "rmsprop":
        optimizer = torch.optim.RMSprop(model.enc.parameters(),
                                        lr=0.0008,
                                        alpha=0.95)
    elif args.opt == "sgd":
        optimizer = torch.optim.SGD(model.enc.parameters(),
                                    lr=0.5,
                                    momentum=0.9,
                                    nesterov=True)
    else:
        raise NotImplementedError("unknown optimizer: " + args.opt)

    use_apex = False

    # FIXME: TOO DIRTY HACK
    setattr(optimizer, "target", reporter)
    setattr(optimizer, "serialize", lambda s: reporter.serialize(s))

    # Setup a converter
    if args.num_encs == 1:
        converter = CustomConverter(subsampling_factor=model.subsample[0],
                                    dtype=dtype)
    else:
        converter = CustomConverterMulEnc(
            [i[0] for i in model.subsample_list], dtype=dtype)

    # read json data
    assert 0.0 < args.utt_using_ratio < 1.0, "utt-using-ratio should not be 0 or 1"
    # valid_json was already loaded above for the dimension info and has not
    # been modified since, so it is reused here instead of re-reading the file.
    if args.train_json is not None:
        with open(args.train_json, 'rb') as f:
            train_json = json.load(f)['utts']
        train_labeled_json, train_unlabeled_json = _load_or_create_split(
            args.train_json, train_json, args.utt_using_ratio)
    else:
        with open(args.label_train_json, 'rb') as f:
            train_labeled_json = json.load(f)['utts']
        with open(args.unlabel_train_json, 'rb') as f:
            train_unlabeled_json = json.load(f)['utts']

    valid_labeled_json, valid_unlabeled_json = _load_or_create_split(
        args.valid_json, valid_json, args.utt_using_ratio)

    use_sortagrad = args.sortagrad == -1 or args.sortagrad > 0

    # make minibatch list (variable length)
    batchset_kwargs = dict(
        min_batch_size=args.ngpu if args.ngpu > 1 else 1,
        count=args.batch_count,
        batch_bins=args.batch_bins,
        batch_frames_in=args.batch_frames_in,
        batch_frames_out=args.batch_frames_out,
        batch_frames_inout=args.batch_frames_inout,
        iaxis=0,
        oaxis=0,
    )
    train = make_batchset(train_labeled_json, args.batch_size, args.maxlen_in,
                          args.maxlen_out, args.minibatches,
                          shortest_first=use_sortagrad, **batchset_kwargs)
    valid = make_batchset(valid_labeled_json, args.batch_size, args.maxlen_in,
                          args.maxlen_out, args.minibatches, **batchset_kwargs)
    ul_train = make_batchset(train_unlabeled_json, args.batch_size,
                             args.maxlen_in, args.maxlen_out, args.minibatches,
                             **batchset_kwargs)
    ul_valid = make_batchset(valid_unlabeled_json, args.batch_size,
                             args.maxlen_in, args.maxlen_out, args.minibatches,
                             **batchset_kwargs)

    load_tr = LoadInputsAndTargets(
        mode='asr',
        load_output=True,
        preprocess_conf=args.preprocess_conf,
        preprocess_args={'train': True}  # Switch the mode of preprocessing
    )
    load_cv = LoadInputsAndTargets(
        mode='asr',
        load_output=True,
        preprocess_conf=args.preprocess_conf,
        preprocess_args={'train': False}  # Switch the mode of preprocessing
    )
    load_ul_tr = LoadInputsAndTargets(
        mode='asr',
        load_output=True,
        preprocess_conf=args.preprocess_conf,
        preprocess_args={'train': True}  # Switch the mode of preprocessing
    )
    load_ul_cv = LoadInputsAndTargets(
        mode='asr',
        load_output=True,
        preprocess_conf=args.preprocess_conf,
        preprocess_args={'train': False}  # Switch the mode of preprocessing
    )

    # hack to make batchsize argument as 1
    # actual batchsize is included in a list
    # default collate function converts numpy array to pytorch tensor
    # we used an empty collate function instead which returns list
    train_iter = ChainerDataLoader(
        dataset=TransformDataset(train,
                                 lambda data: converter([load_tr(data)])),
        batch_size=1,
        num_workers=args.n_iter_processes,
        shuffle=not use_sortagrad,
        collate_fn=lambda x: x[0],
    )
    valid_iter = ChainerDataLoader(
        dataset=TransformDataset(valid,
                                 lambda data: converter([load_cv(data)])),
        batch_size=1,
        shuffle=False,
        collate_fn=lambda x: x[0],
        num_workers=args.n_iter_processes,
    )

    # Add split data iteration for training
    ul_train_iter = ChainerDataLoader(
        dataset=TransformDataset(ul_train,
                                 lambda data: converter([load_ul_tr(data)])),
        batch_size=1,
        shuffle=True,
        collate_fn=lambda x: x[0],
        num_workers=args.n_iter_processes,
    )
    ul_valid_iter = ChainerDataLoader(
        dataset=TransformDataset(ul_valid,
                                 lambda data: converter([load_ul_cv(data)])),
        batch_size=1,
        shuffle=False,
        collate_fn=lambda x: x[0],
        num_workers=args.n_iter_processes,
    )

    # Set up ICT related arguments
    ICT_args = {
        "consistency_rampup_starts": args.consistency_rampup_starts,
        "consistency_rampup_ends": args.consistency_rampup_ends,
        "cosine_rampdown_starts": args.cosine_rampdown_starts,
        "cosine_rampdown_ends": args.cosine_rampdown_ends,
        "ema_pre_decay": args.ema_pre_decay,
        "ema_post_decay": args.ema_post_decay
    }

    # Set up a trainer
    updater = CustomUpdater(
        model,
        args.grad_clip,
        {
            "main": train_iter,
            "sub": ul_train_iter
        },
        optimizer,
        device,
        args.ngpu,
        ICT_args,
        args.grad_noise,
        args.accum_grad,
        use_apex=use_apex,
    )
    trainer = training.Trainer(updater, (args.epochs, 'epoch'), out=args.outdir)

    if use_sortagrad:
        trainer.extend(
            ShufflingEnabler([train_iter]),
            trigger=(args.sortagrad if args.sortagrad != -1 else args.epochs,
                     "epoch"),
        )

    # Resume from a snapshot
    if args.resume:
        logging.info("resumed from %s" % args.resume)
        torch_resume(args.resume, trainer)

    # Evaluate the model with the test dataset for each epoch
    # TODO: custom evaluator
    evaluator = CustomEvaluator(model, {
        "main": valid_iter,
        "sub": ul_valid_iter
    }, reporter, device, args.ngpu)
    if args.save_interval_iters > 0:
        trainer.extend(evaluator,
                       trigger=(args.save_interval_iters, "iteration"))
    else:
        trainer.extend(evaluator)

    # Make a plot for training and validation values
    trainer.extend(
        extensions.PlotReport(
            [
                "main/loss",
                "validation/main/loss",
                "main/loss_ce",
                "validation/main/loss_ce",
                "main/loss_mse",
                "validation/main/loss_mse",
            ],
            "epoch",
            file_name="loss.png",
        ))
    acc_plot_keys = ["main/teacher_acc", "validation/main/teacher_acc"]
    if args.show_student_model_acc:
        acc_plot_keys += ["main/student_acc", "validation/main/student_acc"]
    trainer.extend(
        extensions.PlotReport(acc_plot_keys, "epoch", file_name="acc.png"))

    # Save best models
    trainer.extend(
        snapshot_object(model, "model.loss.best"),
        trigger=training.triggers.MinValueTrigger("validation/main/loss"),
    )
    trainer.extend(
        snapshot_object(model, "model.acc.best"),
        trigger=training.triggers.MaxValueTrigger(
            "validation/main/teacher_acc"),
    )

    # save snapshot which contains model and optimizer states
    if args.save_interval_iters > 0:
        trainer.extend(
            torch_snapshot(filename="snapshot.iter.{.updater.iteration}"),
            trigger=(args.save_interval_iters, "iteration"),
        )
    else:
        trainer.extend(torch_snapshot(), trigger=(1, "epoch"))

    # `args.opt == "rmsprop" or "sgd"` is always truthy, so the optimizer is
    # matched with a membership test instead.
    uses_lr_decay = args.opt in ("rmsprop", "sgd")

    # epsilon decay in the optimizer
    if args.opt == "adadelta":
        # NOTE(review): the acc criterion watches student_acc here while
        # model.acc.best is selected by teacher_acc -- confirm this is intended.
        _extend_decay_on_plateau(trainer, model, args, adadelta_eps_decay,
                                 args.eps_decay, "validation/main/student_acc")
    # lr decay in rmsprop / sgd
    elif uses_lr_decay:
        _extend_decay_on_plateau(trainer, model, args, rmsprop_lr_decay,
                                 args.lr_decay, "validation/main/teacher_acc")

    # Write a log of evaluation statistics for each epoch
    trainer.extend(
        extensions.LogReport(trigger=(args.report_interval_iters,
                                      "iteration")))
    report_keys = [
        "epoch",
        "iteration",
        "main/loss",
        "main/loss_ce",
        "main/loss_mse",
        "validation/main/loss",
        "validation/main/loss_ce",
        "validation/main/loss_mse",
        "main/teacher_acc",
        "validation/main/teacher_acc",
        "elapsed_time",
    ]
    # Extend conditionally: `base + extra if cond else []` would bind as
    # `(base + extra) if cond else []` and drop every key when cond is false.
    if args.show_student_model_acc:
        report_keys += ["main/student_acc", "validation/main/student_acc"]

    if args.opt == "adadelta":
        trainer.extend(
            extensions.observe_value(
                "eps",
                lambda trainer: trainer.updater.get_optimizer("main").
                param_groups[0]["eps"],
            ),
            trigger=(args.report_interval_iters, "iteration"),
        )
        report_keys.append("eps")
    if uses_lr_decay:
        trainer.extend(
            extensions.observe_value(
                "lr",
                lambda trainer: trainer.updater.get_optimizer("main").
                param_groups[0]["lr"],
            ),
            trigger=(args.report_interval_iters, "iteration"),
        )
        report_keys.append("lr")
    trainer.extend(
        extensions.PrintReport(report_keys),
        trigger=(args.report_interval_iters, "iteration"),
    )

    trainer.extend(
        extensions.ProgressBar(update_interval=args.report_interval_iters))
    set_early_stop(trainer, args)

    if args.tensorboard_dir is not None and args.tensorboard_dir != "":
        trainer.extend(
            TensorboardLogger(SummaryWriter(args.tensorboard_dir), None),
            trigger=(args.report_interval_iters, "iteration"),
        )
    # Run the training
    trainer.run()
    check_early_stop(trainer, args.epochs)
def _st_result_with_asr(utt, nbest_hyps, nbest_hyps_asr, char_list):
    """Build the translation result for ``utt`` and append its first ASR output."""
    result = add_results_to_json(utt, nbest_hyps, char_list)
    asr_result = add_results_to_json(utt, nbest_hyps_asr, char_list,
                                     output_idx=1)
    result['output'].append(asr_result['output'][0])
    return result


def trans(args):
    """Decode with the given args.

    Results for every utterance in ``args.trans_json`` are collected and
    written as JSON to ``args.result_label``.

    Args:
        args (namespace): The program arguments.

    Raises:
        ValueError: If the configured RNNLM uses a non-default model module.
        NotImplementedError: If, with ``batchsize == 0``, the beam search
            type is unknown or none of the recog/trans modes is selected.

    """
    set_deterministic_pytorch(args)
    model, train_args = load_trained_model(args.model)
    assert isinstance(model, STInterface)
    # args.ctc_weight = 0.0
    model.trans_args = args

    # read rnnlm
    if args.rnnlm:
        rnnlm_args = get_model_conf(args.rnnlm, args.rnnlm_conf)
        if getattr(rnnlm_args, "model_module", "default") != "default":
            raise ValueError(
                "use '--api v2' option to decode with non-default language model"
            )
        rnnlm = lm_pytorch.ClassifierWithState(
            lm_pytorch.RNNLM(len(train_args.char_list), rnnlm_args.layer,
                             rnnlm_args.unit))
        torch_load(args.rnnlm, rnnlm)
        rnnlm.eval()
    else:
        rnnlm = None

    # gpu
    # Only the single-GPU case moves the models to CUDA; for any other ngpu
    # value nothing is moved here.
    if args.ngpu == 1:
        gpu_id = list(range(args.ngpu))
        logging.info('gpu id: ' + str(gpu_id))
        model.cuda()
        if rnnlm:
            rnnlm.cuda()

    # read json data
    with open(args.trans_json, 'rb') as f:
        js = json.load(f)['utts']
    new_js = {}

    load_inputs_and_targets = LoadInputsAndTargets(
        mode='asr',
        load_output=False,
        sort_in_input_length=False,
        preprocess_conf=train_args.preprocess_conf
        if args.preprocess_conf is None else args.preprocess_conf,
        preprocess_args={'train': False})

    # Change to evaluation mode
    model.eval()

    char_list = train_args.char_list

    if args.batchsize == 0:
        with torch.no_grad():
            total = len(js)
            for idx, name in enumerate(js.keys(), 1):
                # pass the name as an argument so a '%' in an utterance id
                # cannot be interpreted as a format directive
                logging.info('| (%d/%d) decoding %s', idx, total, name)
                batch = [(name, js[name])]
                feat = load_inputs_and_targets(batch)[0][0]

                if args.recog_and_trans:  # for cross
                    logging.info(
                        '***** Recognize and Translate simultaneously for cross decoders ******'
                    )
                    if args.beam_search_type in ('sum', 'sum-mono'):
                        logging.info('=== Beam search by sum of scores ===')
                        nbest_hyps = model.recognize_and_translate_sum(
                            feat,
                            args,
                            char_list,
                            rnnlm,
                            decode_asr_weight=args.decode_asr_weight,
                            score_is_prob=args.score_is_prob,
                            ratio_diverse_st=args.ratio_diverse_st,
                            ratio_diverse_asr=args.ratio_diverse_asr,
                            debug=args.debug)
                        # the two variants differ only in how results are
                        # written: 'sum' uses the ST+ASR writer
                        if args.beam_search_type == 'sum':
                            new_js[name] = add_results_to_json_st_asr(
                                js[name], nbest_hyps, char_list)
                        else:
                            new_js[name] = add_results_to_json(
                                js[name], nbest_hyps, char_list)
                    elif args.beam_search_type == 'separate':
                        logging.info(
                            '=== Beam search using beam_cross hypothesis ===')
                        nbest_hyps, nbest_hyps_asr = model.recognize_and_translate_separate(
                            feat, args, char_list, rnnlm)
                        new_js[name] = _st_result_with_asr(
                            js[name], nbest_hyps, nbest_hyps_asr, char_list)
                    else:
                        raise NotImplementedError
                elif args.recog and args.trans:
                    logging.info(
                        '***** Recognize and Translate separately ******')
                    nbest_hyps_asr = model.recognize(feat, args, char_list,
                                                     rnnlm)
                    nbest_hyps = model.translate(feat, args, char_list, rnnlm)
                    new_js[name] = _st_result_with_asr(
                        js[name], nbest_hyps, nbest_hyps_asr, char_list)
                elif args.recog:
                    logging.info('***** Recognize ONLY ******')
                    nbest_hyps_asr = model.recognize(feat, args, char_list,
                                                     rnnlm)
                    new_js[name] = add_results_to_json(js[name],
                                                       nbest_hyps_asr,
                                                       char_list)
                elif args.trans:
                    logging.info('***** Translate ONLY ******')
                    nbest_hyps = model.translate(feat, args, char_list, rnnlm)
                    new_js[name] = add_results_to_json(js[name], nbest_hyps,
                                                       char_list)
                else:
                    raise NotImplementedError
    else:

        def grouper(n, iterable, fillvalue=None):
            # yield fixed-size groups of n items, padding the last with fillvalue
            kargs = [iter(iterable)] * n
            return zip_longest(*kargs, fillvalue=fillvalue)

        # sort data (longest input first) if batchsize > 1
        keys = list(js.keys())
        if args.batchsize > 1:
            feat_lens = [js[key]['input'][0]['shape'][0] for key in keys]
            sorted_index = sorted(range(len(feat_lens)),
                                  key=lambda i: -feat_lens[i])
            keys = [keys[i] for i in sorted_index]

        with torch.no_grad():
            for names in grouper(args.batchsize, keys, None):
                # drop the padding entries added by grouper for the last group
                names = [name for name in names if name]
                batch = [(name, js[name]) for name in names]
                feats = load_inputs_and_targets(batch)[0]
                nbest_hyps = model.translate_batch(feats,
                                                   args,
                                                   char_list,
                                                   rnnlm=rnnlm)

                for i, nbest_hyp in enumerate(nbest_hyps):
                    name = names[i]
                    new_js[name] = add_results_to_json(js[name], nbest_hyp,
                                                       char_list)

    with open(args.result_label, 'wb') as f:
        f.write(
            json.dumps({
                'utts': new_js
            }, indent=4, ensure_ascii=False,
                       sort_keys=True).encode('utf_8'))