def main():
    """Evaluate a trained language model on every set in ``args.recog_sets``.

    Configuration is parsed from ``sys.argv``. The LM checkpoint
    ``args.recog_model[0]`` is built and loaded once (while processing the
    first set) and reused for all remaining sets. The perplexity of each set
    is printed to stdout, and the average over all sets is logged at the end.
    """
    # Load configuration
    args, _, dir_name = parse_args_eval(sys.argv[1:])

    # Setting for logging: drop a stale log so that each run starts fresh
    log_path = os.path.join(args.recog_dir, 'decode.log')
    if os.path.isfile(log_path):
        os.remove(log_path)
    set_logger(log_path, stdout=args.recog_stdout)

    ppl_avg = 0
    for i, s in enumerate(args.recog_sets):
        # Load dataset
        dataset = Dataset(corpus=args.corpus,
                          tsv_path=s,
                          dict_path=os.path.join(dir_name, 'dict.txt'),
                          wp_model=os.path.join(dir_name, 'wp.model'),
                          unit=args.unit,
                          batch_size=args.recog_batch_size,
                          bptt=args.bptt,
                          backward=args.backward,
                          serialize=args.serialize,
                          is_test=True)

        if i == 0:
            # Load the LM (only once; the same model is reused for every set)
            model = build_lm(args)
            load_checkpoint(args.recog_model[0], model)
            # The epoch is taken from the trailing '-<epoch>' of the checkpoint name
            epoch = int(args.recog_model[0].split('-')[-1])
            # NOTE: model averaging is not helpful for LM

            logger.info('epoch: %d' % epoch)
            logger.info('batch size: %d' % args.recog_batch_size)
            logger.info('BPTT: %d' % (args.bptt))
            logger.info('cache size: %d' % (args.recog_n_caches))
            logger.info('cache theta: %.3f' % (args.recog_cache_theta))
            logger.info('cache lambda: %.3f' % (args.recog_cache_lambda))
            logger.info('model average (Transformer): %d' % (args.recog_n_average))
            model.cache_theta = args.recog_cache_theta
            model.cache_lambda = args.recog_cache_lambda

            # GPU setting
            if args.recog_n_gpus > 0:
                model.cuda()

        start_time = time.time()

        ppl, _ = eval_ppl([model], dataset, batch_size=1, bptt=args.bptt,
                          n_caches=args.recog_n_caches, progressbar=True)
        ppl_avg += ppl
        print('PPL (%s): %.2f' % (dataset.set, ppl))
        # Fixed the misspelled label ('Elasped' -> 'Elapsed')
        logger.info('Elapsed time: %.2f [sec]:' % (time.time() - start_time))

    logger.info('PPL (avg.): %.2f\n' % (ppl_avg / len(args.recog_sets)))
def load_lm(lm_path, mem_len=0):
    """Build a language model from the `conf.yml` next to `lm_path` and load its weights.

    Args:
        lm_path (str): path to the LM checkpoint; `conf.yml` is read from the
            same directory
        mem_len (int): value stored as `recog_mem_len` in the LM arguments
    Returns:
        lm: the model returned by `build_lm`, with the checkpoint loaded and
            its `backward` attribute copied from the configuration

    """
    conf_path = os.path.join(os.path.dirname(lm_path), 'conf.yml')

    # Turn the configuration mapping into an attribute-style namespace
    lm_args = argparse.Namespace()
    for name, value in load_config(conf_path).items():
        setattr(lm_args, name, value)
    lm_args.recog_mem_len = mem_len

    lm = build_lm(lm_args)
    load_checkpoint(lm_path, lm)
    lm.backward = lm_args.backward
    return lm
def main():
    """Plot the cache attention weights of a trained LM for each evaluation set.

    The training configuration (`conf.yml` beside the checkpoint) overrides
    every parsed option whose name does not contain 'recog'. For every set the
    model is stepped one token at a time, and each time the token counter
    reaches `args.recog_n_caches` a figure of the most recent cache attention
    weights is written under `<recog_dir>/cache`.
    """
    args = parse()

    # Load a conf file
    dir_name = os.path.dirname(args.recog_model[0])
    conf = load_config(os.path.join(dir_name, 'conf.yml'))

    # Overwrite conf (decoding options containing 'recog' are left untouched)
    for key, value in conf.items():
        if 'recog' in key:
            continue
        setattr(args, key, value)

    # Setting for logging
    log_path = os.path.join(args.recog_dir, 'plot.log')
    if os.path.isfile(log_path):
        os.remove(log_path)
    set_logger(log_path, stdout=args.recog_stdout)

    for set_idx, tsv_path in enumerate(args.recog_sets):
        # Load dataset
        dataset = Dataset(
            corpus=args.corpus,
            tsv_path=tsv_path,
            dict_path=os.path.join(dir_name, 'dict.txt'),
            wp_model=os.path.join(dir_name, 'wp.model'),
            unit=args.unit,
            batch_size=args.recog_batch_size,
            bptt=args.bptt,
            backward=args.backward,
            serialize=args.serialize,
            is_test=True,
        )

        if set_idx == 0:
            # Load the LM
            checkpoint = args.recog_model[0]
            model = build_lm(args, dir_name)
            topk_list = load_checkpoint(model, checkpoint)
            epoch = int(checkpoint.split('-')[-1])

            # Model averaging for Transformer
            if conf['lm_type'] == 'transformer':
                model = average_checkpoints(model, args.recog_model[0],
                                            n_average=args.recog_n_average,
                                            topk_list=topk_list)

            logger.info('epoch: %d' % (epoch - 1))
            logger.info('batch size: %d' % args.recog_batch_size)
            logger.info('BPTT: %d' % (args.bptt))
            logger.info('cache size: %d' % (args.recog_n_caches))
            logger.info('cache theta: %.3f' % (args.recog_cache_theta))
            logger.info('cache lambda: %.3f' % (args.recog_cache_lambda))
            model.cache_theta = args.recog_cache_theta
            model.cache_lambda = args.recog_cache_lambda

            # GPU setting
            model.cuda()

        assert args.recog_n_caches > 0
        save_path = mkdir_join(args.recog_dir, 'cache')

        # Clean directory
        if save_path is not None and os.path.isdir(save_path):
            shutil.rmtree(save_path)
            os.mkdir(save_path)

        hidden = None
        fig_count = 0
        token_count = 0
        n_tokens = args.recog_n_caches
        while True:
            ys, is_new_epoch = dataset.next()

            for t in range(ys.shape[1] - 1):
                # Feed the current token together with its successor
                _, hidden = model(ys[:, t:t + 2], hidden,
                                  is_eval=True,
                                  n_caches=args.recog_n_caches)[:2]

                if len(model.cache_attn) == 0:
                    continue

                # Only plot once `n_tokens` steps have been counted
                if token_count != n_tokens:
                    token_count += 1
                    continue

                tokens_keys = dataset.idx2token[0](
                    model.cache_ids[:args.recog_n_caches], return_list=True)
                tokens_query = dataset.idx2token[0](
                    model.cache_ids[-n_tokens:], return_list=True)

                # Slide attention matrix
                n_keys = len(tokens_keys)
                n_queries = len(tokens_query)
                cache_probs = np.zeros((n_keys, n_queries))  # `[n_keys, n_queries]`
                mask = np.zeros((n_keys, n_queries))
                for q, aw in enumerate(model.cache_attn[-n_tokens:]):
                    # Number of leading key rows filled for the q-th query column
                    n_valid = n_keys - n_queries + q + 1
                    cache_probs[:n_valid, q] = aw[0, -n_valid:]
                    mask[n_valid:, q] = 1

                plot_cache_weights(
                    cache_probs,
                    keys=tokens_keys,
                    queries=tokens_query,
                    save_path=mkdir_join(save_path, str(fig_count) + '.png'),
                    figsize=(40, 16),
                    mask=mask)
                token_count = 0
                fig_count += 1

            if is_new_epoch:
                break
def main():
    """Evaluate a trained ASR model on every set in ``args.recog_sets``.

    Configuration is parsed from ``sys.argv``. While processing the first set
    the ASR model is built (optionally averaged over checkpoints), additional
    ensemble members are loaded from ``args.recog_model[1:]``, and external
    LMs are attached to the model unless ``args.lm_fusion`` is set. Each set
    is then scored with the metric selected by ``args.recog_metric``
    ('edit_distance', 'ppl'/'loss', 'accuracy' or 'bleu') and the averages
    over all sets are reported at the end.

    Raises:
        ValueError: if ``args.recog_unit`` is not supported for 'edit_distance'
        NotImplementedError: if ``args.recog_metric`` is unknown
    """
    # Load configuration
    args, dir_name = parse_args_eval(sys.argv[1:])

    # Setting for logging: drop a stale log so that each run starts fresh
    log_path = os.path.join(args.recog_dir, 'decode.log')
    if os.path.isfile(log_path):
        os.remove(log_path)
    set_logger(log_path, stdout=args.recog_stdout)

    wer_avg, cer_avg, per_avg = 0, 0, 0
    ppl_avg, loss_avg = 0, 0
    acc_avg = 0
    bleu_avg = 0
    for i, s in enumerate(args.recog_sets):
        # Load dataloader
        dataloader = build_dataloader(
            args=args,
            tsv_path=s,
            batch_size=1,
            is_test=True,
            first_n_utterances=args.recog_first_n_utt,
            longform_max_n_frames=args.recog_longform_max_n_frames)

        if i == 0:
            # Load ASR model (only once; reused for every set)
            model = Speech2Text(args, dir_name)
            # Epoch from the trailing '-<epoch>' of the checkpoint name,
            # truncated to one decimal place
            epoch = int(float(args.recog_model[0].split('-')[-1]) * 10) / 10
            if args.recog_n_average > 1:
                # Model averaging for Transformer
                model = average_checkpoints(
                    model,
                    args.recog_model[0],
                    n_average=args.recog_n_average)
            else:
                load_checkpoint(args.recog_model[0], model)

            # Ensemble (different models)
            ensemble_models = [model]
            if len(args.recog_model) > 1:
                for recog_model_e in args.recog_model[1:]:
                    conf_e = load_config(
                        os.path.join(os.path.dirname(recog_model_e), 'conf.yml'))
                    args_e = copy.deepcopy(args)
                    # Keep decoding ('recog') options; take the rest from the member's conf
                    for k, v in conf_e.items():
                        if 'recog' not in k:
                            setattr(args_e, k, v)
                    model_e = Speech2Text(args_e)
                    load_checkpoint(recog_model_e, model_e)
                    if args.recog_n_gpus >= 1:
                        model_e.cuda()
                    ensemble_models += [model_e]

            # Load LM for shallow fusion
            if not args.lm_fusion:
                # first path
                if args.recog_lm is not None and args.recog_lm_weight > 0:
                    conf_lm = load_config(
                        os.path.join(os.path.dirname(args.recog_lm), 'conf.yml'))
                    args_lm = argparse.Namespace()
                    for k, v in conf_lm.items():
                        setattr(args_lm, k, v)
                    args_lm.recog_mem_len = args.recog_mem_len
                    lm = build_lm(args_lm,
                                  wordlm=args.recog_wordlm,
                                  lm_dict_path=os.path.join(
                                      os.path.dirname(args.recog_lm), 'dict.txt'),
                                  asr_dict_path=os.path.join(dir_name, 'dict.txt'))
                    load_checkpoint(args.recog_lm, lm)
                    if args_lm.backward:
                        model.lm_bwd = lm
                    else:
                        model.lm_fwd = lm

                # second path (forward)
                if args.recog_lm_second is not None and args.recog_lm_second_weight > 0:
                    conf_lm_second = load_config(
                        os.path.join(os.path.dirname(args.recog_lm_second), 'conf.yml'))
                    args_lm_second = argparse.Namespace()
                    for k, v in conf_lm_second.items():
                        setattr(args_lm_second, k, v)
                    args_lm_second.recog_mem_len = args.recog_mem_len
                    lm_second = build_lm(args_lm_second)
                    load_checkpoint(args.recog_lm_second, lm_second)
                    model.lm_second = lm_second

                # second path (backward)
                if args.recog_lm_bwd is not None and args.recog_lm_bwd_weight > 0:
                    conf_lm = load_config(
                        os.path.join(os.path.dirname(args.recog_lm_bwd), 'conf.yml'))
                    args_lm_bwd = argparse.Namespace()
                    for k, v in conf_lm.items():
                        setattr(args_lm_bwd, k, v)
                    args_lm_bwd.recog_mem_len = args.recog_mem_len
                    lm_bwd = build_lm(args_lm_bwd)
                    load_checkpoint(args.recog_lm_bwd, lm_bwd)
                    model.lm_bwd = lm_bwd

            # Fall back to the training unit when no decoding unit was given
            if not args.recog_unit:
                args.recog_unit = args.unit

            logger.info('recog unit: %s' % args.recog_unit)
            logger.info('recog metric: %s' % args.recog_metric)
            logger.info('recog oracle: %s' % args.recog_oracle)
            logger.info('epoch: %d' % epoch)
            logger.info('batch size: %d' % args.recog_batch_size)
            logger.info('beam width: %d' % args.recog_beam_width)
            logger.info('min length ratio: %.3f' % args.recog_min_len_ratio)
            logger.info('max length ratio: %.3f' % args.recog_max_len_ratio)
            logger.info('length penalty: %.3f' % args.recog_length_penalty)
            logger.info('length norm: %s' % args.recog_length_norm)
            logger.info('coverage penalty: %.3f' % args.recog_coverage_penalty)
            logger.info('coverage threshold: %.3f' % args.recog_coverage_threshold)
            logger.info('CTC weight: %.3f' % args.recog_ctc_weight)
            # Fixed the misspelled label ('fist' -> 'first')
            logger.info('first LM path: %s' % args.recog_lm)
            logger.info('second LM path: %s' % args.recog_lm_second)
            logger.info('backward LM path: %s' % args.recog_lm_bwd)
            logger.info('LM weight (first-pass): %.3f' % args.recog_lm_weight)
            logger.info('LM weight (second-pass): %.3f' % args.recog_lm_second_weight)
            logger.info('LM weight (backward): %.3f' % args.recog_lm_bwd_weight)
            logger.info('GNMT: %s' % args.recog_gnmt_decoding)
            logger.info('forward-backward attention: %s' % args.recog_fwd_bwd_attention)
            logger.info('resolving UNK: %s' % args.recog_resolving_unk)
            logger.info('ensemble: %d' % (len(ensemble_models)))
            logger.info('ASR decoder state carry over: %s' % (args.recog_asr_state_carry_over))
            logger.info('LM state carry over: %s' % (args.recog_lm_state_carry_over))
            logger.info('model average (Transformer): %d' % (args.recog_n_average))

            # GPU setting
            if args.recog_n_gpus >= 1:
                model.cudnn_setting(deterministic=True, benchmark=False)
                model.cuda()

        start_time = time.time()

        if args.recog_metric == 'edit_distance':
            if args.recog_unit in ['word', 'word_char']:
                wer, cer, _ = eval_word(ensemble_models, dataloader, args,
                                        epoch=epoch - 1,
                                        recog_dir=args.recog_dir,
                                        progressbar=True,
                                        fine_grained=True,
                                        oracle=True)
                wer_avg += wer
                cer_avg += cer
            elif args.recog_unit == 'wp':
                wer, cer = eval_wordpiece(ensemble_models, dataloader, args,
                                          epoch=epoch - 1,
                                          recog_dir=args.recog_dir,
                                          streaming=args.recog_streaming,
                                          progressbar=True,
                                          fine_grained=True,
                                          oracle=True)
                wer_avg += wer
                cer_avg += cer
            elif 'char' in args.recog_unit:
                wer, cer = eval_char(ensemble_models, dataloader, args,
                                     epoch=epoch - 1,
                                     recog_dir=args.recog_dir,
                                     progressbar=True,
                                     task_idx=0,
                                     fine_grained=True,
                                     oracle=True)
                wer_avg += wer
                cer_avg += cer
            elif 'phone' in args.recog_unit:
                per = eval_phone(ensemble_models, dataloader, args,
                                 epoch=epoch - 1,
                                 recog_dir=args.recog_dir,
                                 progressbar=True,
                                 fine_grained=True,
                                 oracle=True)
                per_avg += per
            else:
                raise ValueError(args.recog_unit)

        elif args.recog_metric in ['ppl', 'loss']:
            ppl, loss = eval_ppl(ensemble_models, dataloader, progressbar=True)
            ppl_avg += ppl
            loss_avg += loss
        elif args.recog_metric == 'accuracy':
            acc_avg += eval_accuracy(ensemble_models, dataloader, progressbar=True)
        elif args.recog_metric == 'bleu':
            bleu = eval_wordpiece_bleu(ensemble_models, dataloader, args,
                                       epoch=epoch - 1,
                                       recog_dir=args.recog_dir,
                                       streaming=args.recog_streaming,
                                       progressbar=True,
                                       fine_grained=True,
                                       oracle=True)
            bleu_avg += bleu
        else:
            raise NotImplementedError(args.recog_metric)
        elapsed_time = time.time() - start_time
        logger.info('Elapsed time: %.3f [sec]' % elapsed_time)
        logger.info('RTF: %.3f' % (elapsed_time / (dataloader.n_frames * 0.01)))

    # Report averages over all evaluation sets
    n_sets = len(args.recog_sets)
    if args.recog_metric == 'edit_distance':
        if 'phone' in args.recog_unit:
            logger.info('PER (avg.): %.2f %%\n' % (per_avg / n_sets))
        else:
            logger.info('WER / CER (avg.): %.2f / %.2f %%\n' %
                        (wer_avg / n_sets, cer_avg / n_sets))
    elif args.recog_metric in ['ppl', 'loss']:
        logger.info('PPL (avg.): %.2f\n' % (ppl_avg / n_sets))
        print('PPL (avg.): %.3f' % (ppl_avg / n_sets))
        logger.info('Loss (avg.): %.2f\n' % (loss_avg / n_sets))
        print('Loss (avg.): %.3f' % (loss_avg / n_sets))
    elif args.recog_metric == 'accuracy':
        logger.info('Accuracy (avg.): %.2f\n' % (acc_avg / n_sets))
        print('Accuracy (avg.): %.3f' % (acc_avg / n_sets))
    elif args.recog_metric == 'bleu':
        # Use the accumulated sum; the previous code divided only the last
        # set's BLEU by the number of sets, under-reporting the average.
        logger.info('BLEU (avg.): %.2f\n' % (bleu_avg / n_sets))
        print('BLEU (avg.): %.3f' % (bleu_avg / n_sets))
def main(args):
    """Train an ASR model and return the directory where it was saved.

    Args:
        args: training configuration. It is accessed both by attribute and
            through `.get()` / `.items()`, so it must support both styles.
            When `args.resume` is set, values from the `conf.yml` next to that
            checkpoint overwrite `args` (except 'resume' and 'local_rank').
    Returns:
        args.save_path: the directory used for checkpoints, logs and
            configuration files

    """
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    # Independent copies used later to build the initialization/teacher models
    # from their own configuration files
    args_init = copy.deepcopy(args)
    args_teacher = copy.deepcopy(args)

    # Load a conf file
    if args.resume:
        conf = load_config(
            os.path.join(os.path.dirname(args.resume), 'conf.yml'))
        for k, v in conf.items():
            if k not in ['resume', 'local_rank']:
                setattr(args, k, v)

    args = compute_subsampling_factor(args)
    # The epoch to resume from is the trailing '-<epoch>' of the checkpoint name
    resume_epoch = int(args.resume.split('-')[-1]) if args.resume else 0

    # Load dataset
    train_set = build_dataloader(args=args,
                                 tsv_path=args.train_set,
                                 tsv_path_sub1=args.train_set_sub1,
                                 tsv_path_sub2=args.train_set_sub2,
                                 batch_size=args.batch_size,
                                 batch_size_type=args.batch_size_type,
                                 max_n_frames=args.max_n_frames,
                                 resume_epoch=resume_epoch,
                                 sort_by=args.sort_by,
                                 short2long=args.sort_short2long,
                                 sort_stop_epoch=args.sort_stop_epoch,
                                 num_workers=args.workers,
                                 pin_memory=args.pin_memory,
                                 distributed=args.distributed,
                                 word_alignment_dir=args.train_word_alignment,
                                 ctc_alignment_dir=args.train_ctc_alignment)
    # Transducer decoders are validated one sequence at a time
    dev_set = build_dataloader(
        args=args,
        tsv_path=args.dev_set,
        tsv_path_sub1=args.dev_set_sub1,
        tsv_path_sub2=args.dev_set_sub2,
        batch_size=1 if 'transducer' in args.dec_type else args.batch_size,
        batch_size_type='seq' if 'transducer' in args.dec_type else args.batch_size_type,
        max_n_frames=1600,
        word_alignment_dir=args.dev_word_alignment,
        ctc_alignment_dir=args.dev_ctc_alignment)
    eval_sets = [
        build_dataloader(args=args, tsv_path=s, batch_size=1, is_test=True)
        for s in args.eval_sets
    ]

    # Vocabulary sizes and input dimension come from the training set
    args.vocab = train_set.vocab
    args.vocab_sub1 = train_set.vocab_sub1
    args.vocab_sub2 = train_set.vocab_sub2
    args.input_dim = train_set.input_dim

    # Set save path
    if args.resume:
        args.save_path = os.path.dirname(args.resume)
        dir_name = os.path.basename(args.save_path)
    else:
        dir_name = set_asr_model_name(args)
        if args.mbr_training:
            assert args.asr_init
            args.save_path = mkdir_join(os.path.dirname(args.asr_init),
                                        dir_name)
        else:
            args.save_path = mkdir_join(
                args.model_save_dir,
                '_'.join(os.path.basename(args.train_set).split('.')[:-1]),
                dir_name)
        # NOTE(review): non-zero ranks wait one second before resolving the
        # save path, presumably so that rank 0 creates it first -- confirm
        if args.local_rank > 0:
            time.sleep(1)
        args.save_path = set_save_path(args.save_path)  # avoid overwriting

    # Set logger
    set_logger(os.path.join(args.save_path, 'train.log'), args.stdout,
               args.local_rank)

    # Load a LM conf file for LM fusion & LM initialization
    if not args.resume and args.external_lm:
        lm_conf = load_config(
            os.path.join(os.path.dirname(args.external_lm), 'conf.yml'))
        args.lm_conf = argparse.Namespace()
        for k, v in lm_conf.items():
            setattr(args.lm_conf, k, v)
        # The external LM must share the ASR output unit and vocabulary size
        assert args.unit == args.lm_conf.unit
        assert args.vocab == args.lm_conf.vocab

    # Model setting
    model = Speech2Text(args, args.save_path, train_set.idx2token[0])

    if not args.resume:
        # Save nlsyms, dictionary, and wp_model
        if args.nlsyms:
            shutil.copy(args.nlsyms,
                        os.path.join(args.save_path, 'nlsyms.txt'))
        for sub in ['', '_sub1', '_sub2']:
            if args.get('dict' + sub):
                shutil.copy(
                    args.get('dict' + sub),
                    os.path.join(args.save_path, 'dict' + sub + '.txt'))
            if args.get('unit' + sub) == 'wp':
                shutil.copy(
                    args.get('wp_model' + sub),
                    os.path.join(args.save_path, 'wp' + sub + '.model'))

        # Log every configuration entry in alphabetical order
        for k, v in sorted(args.items(), key=lambda x: x[0]):
            logger.info('%s: %s' % (k, str(v)))

        # Count total parameters
        for n in sorted(list(model.num_params_dict.keys())):
            n_params = model.num_params_dict[n]
            logger.info("%s %d" % (n, n_params))
        logger.info("Total %.2f M parameters" %
                    (model.total_parameters / 1000000))
        logger.info('torch version: %s' % str(torch.__version__))
        logger.info(model)

        # Initialize with pre-trained model's parameters
        if args.asr_init:
            # Load ASR model (full model)
            conf_init = load_config(
                os.path.join(os.path.dirname(args.asr_init), 'conf.yml'))
            for k, v in conf_init.items():
                setattr(args_init, k, v)
            model_init = Speech2Text(args_init)
            load_checkpoint(args.asr_init, model_init)

            # Overwrite parameters
            # Only parameters whose name and size both match are copied
            param_dict = dict(model_init.named_parameters())
            for n, p in model.named_parameters():
                if n in param_dict.keys() and p.size() == param_dict[n].size():
                    if args.asr_init_enc_only and 'enc' not in n:
                        continue
                    p.data = param_dict[n].data
                    logger.info('Overwrite %s' % n)

    # Set optimizer
    # When resuming past the SGD conversion epoch, start directly with SGD
    optimizer = set_optimizer(
        model,
        'sgd' if resume_epoch > args.convert_to_sgd_epoch else args.optimizer,
        args.lr, args.weight_decay)

    # Wrap optimizer by learning rate scheduler
    is_transformer = 'former' in args.enc_type or 'former' in args.dec_type or 'former' in args.dec_type_sub1
    scheduler = LRScheduler(
        optimizer,
        args.lr,
        decay_type=args.lr_decay_type,
        decay_start_epoch=args.lr_decay_start_epoch,
        decay_rate=args.lr_decay_rate,
        decay_patient_n_epochs=args.lr_decay_patient_n_epochs,
        early_stop_patient_n_epochs=args.early_stop_patient_n_epochs,
        lower_better=args.metric not in ['accuracy', 'bleu'],
        warmup_start_lr=args.warmup_start_lr,
        warmup_n_steps=args.warmup_n_steps,
        peak_lr=0.05 / (args.get('transformer_enc_d_model', 0)**0.5) if 'conformer' in args.enc_type else 1e6,
        model_size=args.get('transformer_enc_d_model',
                            args.get('transformer_dec_d_model', 0)),
        factor=args.lr_factor,
        noam=args.optimizer == 'noam',
        save_checkpoints_topk=10 if is_transformer else 1)

    if args.resume:
        # Restore the last saved model
        load_checkpoint(args.resume, model, scheduler)

        # Resume between convert_to_sgd_epoch -1 and convert_to_sgd_epoch
        if resume_epoch == args.convert_to_sgd_epoch:
            scheduler.convert_to_sgd(model,
                                     args.lr,
                                     args.weight_decay,
                                     decay_type='always',
                                     decay_rate=0.5)

    # Load teacher ASR model
    teacher = None
    if args.teacher:
        assert os.path.isfile(args.teacher), 'There is no checkpoint.'
        conf_teacher = load_config(
            os.path.join(os.path.dirname(args.teacher), 'conf.yml'))
        for k, v in conf_teacher.items():
            setattr(args_teacher, k, v)
        # Setting for knowledge distillation
        args_teacher.ss_prob = 0
        args.lsm_prob = 0
        teacher = Speech2Text(args_teacher)
        load_checkpoint(args.teacher, teacher)

    # Load teacher LM
    teacher_lm = None
    if args.teacher_lm:
        assert os.path.isfile(args.teacher_lm), 'There is no checkpoint.'
        conf_lm = load_config(
            os.path.join(os.path.dirname(args.teacher_lm), 'conf.yml'))
        args_lm = argparse.Namespace()
        for k, v in conf_lm.items():
            setattr(args_lm, k, v)
        teacher_lm = build_lm(args_lm)
        load_checkpoint(args.teacher_lm, teacher_lm)

    # GPU setting
    args.use_apex = args.train_dtype in ["O0", "O1", "O2", "O3"]
    amp, scaler = None, None
    if args.n_gpus >= 1:
        model.cudnn_setting(
            deterministic=((not is_transformer) and (not args.cudnn_benchmark)) or args.cudnn_deterministic,
            benchmark=(not is_transformer) and args.cudnn_benchmark)

        # Mixed precision training setting
        if args.use_apex:
            # torch >= 1.6.0 uses the built-in GradScaler; older versions use apex
            if LooseVersion(torch.__version__) >= LooseVersion("1.6.0"):
                scaler = torch.cuda.amp.GradScaler()
            else:
                from apex import amp
                model, scheduler.optimizer = amp.initialize(
                    model, scheduler.optimizer, opt_level=args.train_dtype)
                from neural_sp.models.seq2seq.decoders.ctc import CTC
                amp.register_float_function(CTC, "loss_fn")
                # NOTE: see https://github.com/espnet/espnet/pull/1779
                amp.init()
                if args.resume:
                    load_checkpoint(args.resume, amp=amp)

        # Each local process gets an equal, contiguous share of the visible GPUs
        n = torch.cuda.device_count() // args.local_world_size
        device_ids = list(range(args.local_rank * n, (args.local_rank + 1) * n))

        torch.cuda.set_device(device_ids[0])
        model.cuda(device_ids[0])
        scheduler.cuda(device_ids[0])
        if args.distributed:
            model = DDP(model, device_ids=device_ids)
        else:
            model = CustomDataParallel(model,
                                       device_ids=list(range(args.n_gpus)))

        if teacher is not None:
            teacher.cuda()
        if teacher_lm is not None:
            teacher_lm.cuda()
    else:
        # The code below accesses `model.module` in every mode, so the CPU
        # path wraps the model as well
        model = CPUWrapperASR(model)

    # Set process name
    logger.info('PID: %s' % os.getpid())
    # NOTE(review): os.uname()[1] is the node (host) name, although the label
    # says USERNAME -- confirm which one is intended
    logger.info('USERNAME: %s' % os.uname()[1])
    logger.info('#GPU: %d' % torch.cuda.device_count())
    setproctitle(args.job_name if args.job_name else dir_name)

    # Set reporter
    reporter = Reporter(args, model, args.local_rank)
    args.wandb_id = reporter.wandb_id
    if args.resume:
        n_steps = scheduler.n_steps * max(
            1, args.accum_grad_n_steps // args.local_world_size)
        reporter.resume(n_steps, resume_epoch)

    # Save conf file as a yaml file
    if args.local_rank == 0:
        save_config(args, os.path.join(args.save_path, 'conf.yml'))
        if args.external_lm:
            save_config(args.lm_conf,
                        os.path.join(args.save_path, 'conf_lm.yml'))
        # NOTE: save after reporter for wandb ID

    # Define tasks
    if args.mtl_per_batch:
        # NOTE: from easier to harder tasks
        # (each later task is prepended, so it comes earlier in the list)
        tasks = []
        if args.total_weight - args.bwd_weight - args.ctc_weight - args.sub1_weight - args.sub2_weight > 0:
            tasks += ['ys']
        if args.bwd_weight > 0:
            tasks = ['ys.bwd'] + tasks
        if args.ctc_weight > 0:
            tasks = ['ys.ctc'] + tasks
        if args.mbr_ce_weight > 0:
            tasks = ['ys.mbr'] + tasks
        for sub in ['sub1', 'sub2']:
            if args.get('train_set_' + sub) is not None:
                if args.get(sub + '_weight', 0) - args.get(
                        'ctc_weight_' + sub, 0) > 0:
                    tasks = ['ys_' + sub] + tasks
                if args.get('ctc_weight_' + sub, 0) > 0:
                    tasks = ['ys_' + sub + '.ctc'] + tasks
    else:
        tasks = ['all']

    # Triggers whose start epoch has already been reached (e.g. when resuming)
    if args.get('ss_start_epoch', 0) <= resume_epoch:
        model.module.trigger_scheduled_sampling()
    if args.get('mocha_quantity_loss_start_epoch', 0) <= resume_epoch:
        model.module.trigger_quantity_loss()

    start_time_train = time.time()
    for ep in range(resume_epoch, args.n_epochs):
        train_one_epoch(model, train_set, dev_set, eval_sets, scheduler,
                        reporter, logger, args, amp, scaler, tasks, teacher,
                        teacher_lm)

        # Save checkpoint and validate model per epoch
        if reporter.n_epochs + 1 < args.eval_start_epoch:
            # Before `eval_start_epoch`: no validation, just checkpoint
            scheduler.epoch()  # lr decay
            reporter.epoch()  # plot

            # Save model
            if args.local_rank == 0:
                scheduler.save_checkpoint(model,
                                          args.save_path,
                                          amp=amp,
                                          remove_old=(not is_transformer) and args.remove_old_checkpoints)
        else:
            start_time_eval = time.time()
            # dev
            metric_dev = validate([model.module], dev_set, args,
                                  reporter.n_epochs + 1, logger)
            scheduler.epoch(metric_dev)  # lr decay
            reporter.epoch(metric_dev, name=args.metric)  # plot
            reporter.add_scalar('dev/' + args.metric, metric_dev)

            if scheduler.is_topk or is_transformer:
                # Save model
                if args.local_rank == 0:
                    scheduler.save_checkpoint(model,
                                              args.save_path,
                                              amp=amp,
                                              remove_old=(not is_transformer) and args.remove_old_checkpoints)

                # test
                if scheduler.is_topk:
                    for eval_set in eval_sets:
                        validate([model.module], eval_set, args,
                                 reporter.n_epochs, logger)

            logger.info('Evaluation time: %.2f min' %
                        ((time.time() - start_time_eval) / 60))

            # Early stopping
            if scheduler.is_early_stop:
                break

            # Convert to fine-tuning stage
            if reporter.n_epochs == args.convert_to_sgd_epoch:
                scheduler.convert_to_sgd(model,
                                         args.lr,
                                         args.weight_decay,
                                         decay_type='always',
                                         decay_rate=0.5)

        if reporter.n_epochs >= args.n_epochs:
            break
        # Triggers that start at the beginning of the next epoch
        if args.get('ss_start_epoch', 0) == (ep + 1):
            model.module.trigger_scheduled_sampling()
        if args.get('mocha_quantity_loss_start_epoch', 0) == (ep + 1):
            model.module.trigger_quantity_loss()

    logger.info('Total time: %.2f hour' %
                ((time.time() - start_time_train) / 3600))
    reporter.close()

    return args.save_path
def main():
    """Decode each evaluation set and plot the attention weights of the best hypothesis.

    Configuration is parsed from `sys.argv`. The ASR model, the optional
    ensemble members and the first-path LM are loaded while processing the
    first set. For every utterance a figure is written to
    `<recog_dir>/att_weights/<speaker>/<utt_id>.png`, and the reference and
    hypothesis are logged.
    """
    # Load configuration
    args, dir_name = parse_args_eval(sys.argv[1:])

    # Setting for logging
    log_path = os.path.join(args.recog_dir, 'plot.log')
    if os.path.isfile(log_path):
        os.remove(log_path)
    set_logger(log_path, stdout=args.recog_stdout)

    for set_idx, tsv_path in enumerate(args.recog_sets):
        # Load dataloader
        dataloader = build_dataloader(
            args=args,
            tsv_path=tsv_path,
            batch_size=1,
            is_test=True,
            first_n_utterances=args.recog_first_n_utt,
            longform_max_n_frames=args.recog_longform_max_n_frames,
        )

        if set_idx == 0:
            # Load ASR model
            model = Speech2Text(args, dir_name)
            epoch = int(float(args.recog_model[0].split('-')[-1]) * 10) / 10
            if args.recog_n_average > 1:
                # Model averaging for Transformer
                model = average_checkpoints(model, args.recog_model[0],
                                            n_average=args.recog_n_average)
            else:
                load_checkpoint(args.recog_model[0], model)

            # Ensemble (different models)
            ensemble_models = [model]
            if len(args.recog_model) > 1:
                for member_path in args.recog_model[1:]:
                    member_conf = load_config(
                        os.path.join(os.path.dirname(member_path), 'conf.yml'))
                    member_args = copy.deepcopy(args)
                    for key, value in member_conf.items():
                        if 'recog' in key:
                            continue
                        setattr(member_args, key, value)
                    member = Speech2Text(member_args)
                    load_checkpoint(member_path, member)
                    if args.recog_n_gpus >= 1:
                        member.cuda()
                    ensemble_models += [member]

            # Load LM for shallow fusion
            if not args.lm_fusion:
                # first path
                if args.recog_lm is not None and args.recog_lm_weight > 0:
                    lm_conf = load_config(
                        os.path.join(os.path.dirname(args.recog_lm), 'conf.yml'))
                    lm_args = argparse.Namespace()
                    for key, value in lm_conf.items():
                        setattr(lm_args, key, value)
                    lm = build_lm(lm_args)
                    load_checkpoint(args.recog_lm, lm)
                    if lm_args.backward:
                        model.lm_bwd = lm
                    else:
                        model.lm_fwd = lm
                # NOTE: only support for first path

            if not args.recog_unit:
                args.recog_unit = args.unit

            logger.info('recog unit: %s' % args.recog_unit)
            logger.info('recog oracle: %s' % args.recog_oracle)
            logger.info('epoch: %d' % epoch)
            logger.info('batch size: %d' % args.recog_batch_size)
            logger.info('beam width: %d' % args.recog_beam_width)
            logger.info('min length ratio: %.3f' % args.recog_min_len_ratio)
            logger.info('max length ratio: %.3f' % args.recog_max_len_ratio)
            logger.info('length penalty: %.3f' % args.recog_length_penalty)
            logger.info('length norm: %s' % args.recog_length_norm)
            logger.info('coverage penalty: %.3f' % args.recog_coverage_penalty)
            logger.info('coverage threshold: %.3f' % args.recog_coverage_threshold)
            logger.info('CTC weight: %.3f' % args.recog_ctc_weight)
            logger.info('fist LM path: %s' % args.recog_lm)
            logger.info('LM weight: %.3f' % args.recog_lm_weight)
            logger.info('GNMT: %s' % args.recog_gnmt_decoding)
            logger.info('forward-backward attention: %s' % args.recog_fwd_bwd_attention)
            logger.info('resolving UNK: %s' % args.recog_resolving_unk)
            logger.info('ensemble: %d' % (len(ensemble_models)))
            logger.info('ASR decoder state carry over: %s' % (args.recog_asr_state_carry_over))
            logger.info('LM state carry over: %s' % (args.recog_lm_state_carry_over))
            logger.info('model average (Transformer): %d' % (args.recog_n_average))

            # GPU setting
            if args.recog_n_gpus >= 1:
                model.cudnn_setting(deterministic=True, benchmark=False)
                model.cuda()

        save_path = mkdir_join(args.recog_dir, 'att_weights')

        # Clean directory
        if save_path is not None and os.path.isdir(save_path):
            shutil.rmtree(save_path)
            os.mkdir(save_path)

        for batch in dataloader:
            # Slicing a one-element list already yields [], i.e. no extra members
            extra_models = ensemble_models[1:]
            if dataloader.corpus == 'swbd':
                speakers = batch['sessions']
            else:
                speakers = batch['speakers']
            nbest_hyps_id, aws = model.decode(
                batch['xs'], args, dataloader.idx2token[0],
                exclude_eos=False,
                refs_id=batch['ys'],
                ensemble_models=extra_models,
                speakers=speakers)
            best_hyps_id = [nbest[0] for nbest in nbest_hyps_id]

            # Get CTC probs
            ctc_probs, topk_ids = None, None
            if args.ctc_weight > 0:
                ctc_probs, topk_ids, xlens = model.get_ctc_probs(
                    batch['xs'], task='ys', temperature=1,
                    topk=min(100, model.vocab))
                # NOTE: ctc_probs: '[B, T, topk]'
            ctc_probs_sub1, topk_ids_sub1 = None, None
            if args.ctc_weight_sub1 > 0:
                ctc_probs_sub1, topk_ids_sub1, xlens_sub1 = model.get_ctc_probs(
                    batch['xs'], task='ys_sub1', temperature=1,
                    topk=min(100, model.vocab_sub1))

            if model.bwd_weight > 0.5:
                # Reverse the order
                best_hyps_id = [ids[::-1] for ids in best_hyps_id]
                aws = [[aw[0][:, ::-1]] for aw in aws]

            for utt_idx in range(len(batch['xs'])):
                tokens = dataloader.idx2token[0](best_hyps_id[utt_idx], return_list=True)
                spk = batch['speakers'][utt_idx]

                # Arguments are prepared in the same order in which the plot
                # call receives them
                attention = aws[utt_idx][0][:, :len(tokens)]
                if args.input_type == 'speech':
                    spectrogram = batch['xs'][utt_idx][:, :dataloader.input_dim]
                else:
                    spectrogram = None
                reference = batch['text'][utt_idx].lower()
                fig_path = mkdir_join(save_path, spk, batch['utt_ids'][utt_idx] + '.png')
                utt_ctc_probs = ctc_probs[utt_idx, :xlens[utt_idx]] if ctc_probs is not None else None
                utt_topk_ids = topk_ids[utt_idx] if topk_ids is not None else None
                utt_ctc_probs_sub1 = (ctc_probs_sub1[utt_idx, :xlens_sub1[utt_idx]]
                                      if ctc_probs_sub1 is not None else None)
                utt_topk_ids_sub1 = topk_ids_sub1[utt_idx] if topk_ids_sub1 is not None else None

                plot_attention_weights(
                    attention,
                    tokens,
                    spectrogram=spectrogram,
                    factor=args.subsample_factor,
                    ref=reference,
                    save_path=fig_path,
                    figsize=(20, 8),
                    ctc_probs=utt_ctc_probs,
                    ctc_topk_ids=utt_topk_ids,
                    ctc_probs_sub1=utt_ctc_probs_sub1,
                    ctc_topk_ids_sub1=utt_topk_ids_sub1)

                # Backward decoders emit tokens in reverse; restore the order
                hyp = ' '.join(tokens[::-1]) if model.bwd_weight > 0.5 else ' '.join(tokens)
                logger.info('utt-id: %s' % batch['utt_ids'][utt_idx])
                logger.info('Ref: %s' % batch['text'][utt_idx].lower())
                logger.info('Hyp: %s' % hyp)
                logger.info('-' * 50)
def main():
    """Train an ASR model and return the directory the run was saved to.

    The function parses the command line, optionally restores the
    configuration stored next to ``args.resume``, builds the train/dev/eval
    datasets, the model, the optimizer wrapped in ``LRScheduler`` and the
    optional teacher models, then runs the epoch loop with periodic dev-set
    logging, per-epoch evaluation and checkpointing.

    Returns:
        The ``save_path`` directory used for checkpoints and logs.

    NOTE(review): the block nesting below was recovered from a
    whitespace-collapsed listing -- confirm it against version control.
    """
    args = parse_args_train(sys.argv[1:])
    # Independent copies that later receive the init/teacher model configs
    args_init = copy.deepcopy(args)
    args_teacher = copy.deepcopy(args)

    # Load a conf file
    if args.resume:
        conf = load_config(os.path.join(os.path.dirname(args.resume), 'conf.yml'))
        for k, v in conf.items():
            if k != 'resume':
                setattr(args, k, v)
    recog_params = vars(args)

    args = compute_susampling_factor(args)

    # Load dataset
    batch_size = args.batch_size * args.n_gpus if args.n_gpus >= 1 else args.batch_size
    train_set = Dataset(corpus=args.corpus,
                        tsv_path=args.train_set,
                        tsv_path_sub1=args.train_set_sub1,
                        tsv_path_sub2=args.train_set_sub2,
                        dict_path=args.dict,
                        dict_path_sub1=args.dict_sub1,
                        dict_path_sub2=args.dict_sub2,
                        nlsyms=args.nlsyms,
                        unit=args.unit,
                        unit_sub1=args.unit_sub1,
                        unit_sub2=args.unit_sub2,
                        wp_model=args.wp_model,
                        wp_model_sub1=args.wp_model_sub1,
                        wp_model_sub2=args.wp_model_sub2,
                        batch_size=batch_size,
                        n_epochs=args.n_epochs,
                        min_n_frames=args.min_n_frames,
                        max_n_frames=args.max_n_frames,
                        shuffle_bucket=args.shuffle_bucket,
                        sort_by='input',
                        short2long=args.sort_short2long,
                        sort_stop_epoch=args.sort_stop_epoch,
                        dynamic_batching=args.dynamic_batching,
                        ctc=args.ctc_weight > 0,
                        ctc_sub1=args.ctc_weight_sub1 > 0,
                        ctc_sub2=args.ctc_weight_sub2 > 0,
                        subsample_factor=args.subsample_factor,
                        subsample_factor_sub1=args.subsample_factor_sub1,
                        subsample_factor_sub2=args.subsample_factor_sub2,
                        discourse_aware=args.discourse_aware)
    dev_set = Dataset(corpus=args.corpus,
                      tsv_path=args.dev_set,
                      tsv_path_sub1=args.dev_set_sub1,
                      tsv_path_sub2=args.dev_set_sub2,
                      dict_path=args.dict,
                      dict_path_sub1=args.dict_sub1,
                      dict_path_sub2=args.dict_sub2,
                      nlsyms=args.nlsyms,
                      unit=args.unit,
                      unit_sub1=args.unit_sub1,
                      unit_sub2=args.unit_sub2,
                      wp_model=args.wp_model,
                      wp_model_sub1=args.wp_model_sub1,
                      wp_model_sub2=args.wp_model_sub2,
                      batch_size=batch_size,
                      min_n_frames=args.min_n_frames,
                      max_n_frames=args.max_n_frames,
                      ctc=args.ctc_weight > 0,
                      ctc_sub1=args.ctc_weight_sub1 > 0,
                      ctc_sub2=args.ctc_weight_sub2 > 0,
                      subsample_factor=args.subsample_factor,
                      subsample_factor_sub1=args.subsample_factor_sub1,
                      subsample_factor_sub2=args.subsample_factor_sub2)
    eval_sets = [Dataset(corpus=args.corpus,
                         tsv_path=s,
                         dict_path=args.dict,
                         nlsyms=args.nlsyms,
                         unit=args.unit,
                         wp_model=args.wp_model,
                         batch_size=1,
                         is_test=True) for s in args.eval_sets]

    # Vocabulary sizes and input dimension are taken from the training set
    args.vocab = train_set.vocab
    args.vocab_sub1 = train_set.vocab_sub1
    args.vocab_sub2 = train_set.vocab_sub2
    args.input_dim = train_set.input_dim

    # Set save path
    if args.resume:
        save_path = os.path.dirname(args.resume)
        dir_name = os.path.basename(save_path)
    else:
        dir_name = set_asr_model_name(args)
        if args.mbr_training:
            assert args.asr_init
            save_path = mkdir_join(os.path.dirname(args.asr_init), dir_name)
        else:
            save_path = mkdir_join(args.model_save_dir, '_'.join(
                os.path.basename(args.train_set).split('.')[:-1]), dir_name)
        save_path = set_save_path(save_path)  # avoid overwriting

    # Set logger
    set_logger(os.path.join(save_path, 'train.log'), stdout=args.stdout)

    # Load a LM conf file for LM fusion & LM initialization
    if not args.resume and args.external_lm:
        lm_conf = load_config(os.path.join(os.path.dirname(args.external_lm), 'conf.yml'))
        args.lm_conf = argparse.Namespace()
        for k, v in lm_conf.items():
            setattr(args.lm_conf, k, v)
        # The external LM must share the ASR output unit and vocabulary size
        assert args.unit == args.lm_conf.unit
        assert args.vocab == args.lm_conf.vocab

    # Model setting
    model = Speech2Text(args, save_path, train_set.idx2token[0])

    if not args.resume:
        # Save the conf file as a yaml file
        save_config(vars(args), os.path.join(save_path, 'conf.yml'))
        if args.external_lm:
            save_config(args.lm_conf, os.path.join(save_path, 'conf_lm.yml'))

        # Save the nlsyms, dictionary, and wp_model
        if args.nlsyms:
            shutil.copy(args.nlsyms, os.path.join(save_path, 'nlsyms.txt'))
        for sub in ['', '_sub1', '_sub2']:
            if getattr(args, 'dict' + sub):
                shutil.copy(getattr(args, 'dict' + sub),
                            os.path.join(save_path, 'dict' + sub + '.txt'))
            if getattr(args, 'unit' + sub) == 'wp':
                shutil.copy(getattr(args, 'wp_model' + sub),
                            os.path.join(save_path, 'wp' + sub + '.model'))

        for k, v in sorted(vars(args).items(), key=lambda x: x[0]):
            logger.info('%s: %s' % (k, str(v)))

        # Count total parameters
        for n in sorted(list(model.num_params_dict.keys())):
            n_params = model.num_params_dict[n]
            logger.info("%s %d" % (n, n_params))
        logger.info("Total %.2f M parameters" % (model.total_parameters / 1000000))
        logger.info(model)

        # Initialize with pre-trained model's parameters
        if args.asr_init:
            # Load the ASR model (full model)
            conf_init = load_config(os.path.join(os.path.dirname(args.asr_init), 'conf.yml'))
            for k, v in conf_init.items():
                setattr(args_init, k, v)
            model_init = Speech2Text(args_init)
            load_checkpoint(args.asr_init, model_init)

            # Overwrite parameters whose name and shape match the pre-trained model
            param_dict = dict(model_init.named_parameters())
            for n, p in model.named_parameters():
                if n in param_dict.keys() and p.size() == param_dict[n].size():
                    if args.asr_init_enc_only and 'enc' not in n:
                        continue
                    p.data = param_dict[n].data
                    logger.info('Overwrite %s' % n)

    # Set optimizer
    resume_epoch = 0
    if args.resume:
        # The checkpoint file name ends with "-<epoch>"
        resume_epoch = int(args.resume.split('-')[-1])
        optimizer = set_optimizer(model, 'sgd' if resume_epoch > args.convert_to_sgd_epoch else args.optimizer,
                                  args.lr, args.weight_decay)
    else:
        optimizer = set_optimizer(model, args.optimizer, args.lr, args.weight_decay)

    # Wrap optimizer by learning rate scheduler
    is_transformer = 'former' in args.enc_type or 'former' in args.dec_type
    optimizer = LRScheduler(optimizer, args.lr,
                            decay_type=args.lr_decay_type,
                            decay_start_epoch=args.lr_decay_start_epoch,
                            decay_rate=args.lr_decay_rate,
                            decay_patient_n_epochs=args.lr_decay_patient_n_epochs,
                            early_stop_patient_n_epochs=args.early_stop_patient_n_epochs,
                            lower_better=args.metric not in ['accuracy', 'bleu'],
                            warmup_start_lr=args.warmup_start_lr,
                            warmup_n_steps=args.warmup_n_steps,
                            model_size=getattr(args, 'transformer_d_model', 0),
                            factor=args.lr_factor,
                            noam=args.optimizer == 'noam',
                            save_checkpoints_topk=10 if is_transformer else 1)

    if args.resume:
        # Restore the last saved model
        load_checkpoint(args.resume, model, optimizer)

        # Resume between convert_to_sgd_epoch -1 and convert_to_sgd_epoch
        if resume_epoch == args.convert_to_sgd_epoch:
            optimizer.convert_to_sgd(model, args.lr, args.weight_decay,
                                     decay_type='always', decay_rate=0.5)

    # Load the teacher ASR model
    teacher = None
    if args.teacher:
        assert os.path.isfile(args.teacher), 'There is no checkpoint.'
        conf_teacher = load_config(os.path.join(os.path.dirname(args.teacher), 'conf.yml'))
        for k, v in conf_teacher.items():
            setattr(args_teacher, k, v)
        # Setting for knowledge distillation
        args_teacher.ss_prob = 0
        args.lsm_prob = 0
        teacher = Speech2Text(args_teacher)
        load_checkpoint(args.teacher, teacher)

    # Load the teacher LM
    teacher_lm = None
    if args.teacher_lm:
        assert os.path.isfile(args.teacher_lm), 'There is no checkpoint.'
        conf_lm = load_config(os.path.join(os.path.dirname(args.teacher_lm), 'conf.yml'))
        args_lm = argparse.Namespace()
        for k, v in conf_lm.items():
            setattr(args_lm, k, v)
        teacher_lm = build_lm(args_lm)
        load_checkpoint(args.teacher_lm, teacher_lm)

    # GPU setting
    use_apex = args.train_dtype in ["O0", "O1", "O2", "O3"]
    amp = None
    if args.n_gpus >= 1:
        model.cudnn_setting(deterministic=not (is_transformer or args.cudnn_benchmark),
                            benchmark=not is_transformer and args.cudnn_benchmark)
        model.cuda()

        # Mix precision training setting
        if use_apex:
            from apex import amp
            model, optimizer.optimizer = amp.initialize(model, optimizer.optimizer,
                                                        opt_level=args.train_dtype)
            from neural_sp.models.seq2seq.decoders.ctc import CTC
            amp.register_float_function(CTC, "loss_fn")
            # NOTE: see https://github.com/espnet/espnet/pull/1779
            amp.init()
            if args.resume:
                load_checkpoint(args.resume, amp=amp)
        model = CustomDataParallel(model, device_ids=list(range(0, args.n_gpus)))

        if teacher is not None:
            teacher.cuda()
        if teacher_lm is not None:
            teacher_lm.cuda()
    else:
        model = CPUWrapperASR(model)

    # Set process name
    logger.info('PID: %s' % os.getpid())
    logger.info('USERNAME: %s' % os.uname()[1])
    logger.info('#GPU: %d' % torch.cuda.device_count())
    setproctitle(args.job_name if args.job_name else dir_name)

    # Set reporter
    reporter = Reporter(save_path)

    if args.mtl_per_batch:
        # NOTE: from easier to harder tasks
        tasks = []
        if 1 - args.bwd_weight - args.ctc_weight - args.sub1_weight - args.sub2_weight > 0:
            tasks += ['ys']
        if args.bwd_weight > 0:
            tasks = ['ys.bwd'] + tasks
        if args.ctc_weight > 0:
            tasks = ['ys.ctc'] + tasks
        if args.mbr_ce_weight > 0:
            tasks = ['ys.mbr'] + tasks
        for sub in ['sub1', 'sub2']:
            if getattr(args, 'train_set_' + sub):
                if getattr(args, sub + '_weight') - getattr(args, 'ctc_weight_' + sub) > 0:
                    tasks = ['ys_' + sub] + tasks
                if getattr(args, 'ctc_weight_' + sub) > 0:
                    tasks = ['ys_' + sub + '.ctc'] + tasks
    else:
        tasks = ['all']

    start_time_train = time.time()
    start_time_epoch = time.time()
    start_time_step = time.time()
    accum_n_steps = 0
    n_steps = optimizer.n_steps * args.accum_grad_n_steps
    epoch_detail_prev = 0
    # Stays None when the epoch range below is empty (e.g. resuming a run that
    # already reached n_epochs), so the final close() can be skipped safely.
    pbar_epoch = None
    for ep in range(resume_epoch, args.n_epochs):
        pbar_epoch = tqdm(total=len(train_set))
        session_prev = None

        for batch_train, is_new_epoch in train_set:
            # Compute loss in the training set
            if args.discourse_aware and batch_train['sessions'][0] != session_prev:
                model.module.reset_session()
            session_prev = batch_train['sessions'][0]
            accum_n_steps += 1

            # Change mini-batch depending on task
            if accum_n_steps == 1:
                loss_train = 0  # moving average over gradient accumulation
            for task in tasks:
                loss, observation = model(batch_train, task,
                                          teacher=teacher, teacher_lm=teacher_lm)
                reporter.add(observation)
                if use_apex:
                    with amp.scale_loss(loss, optimizer.optimizer) as scaled_loss:
                        scaled_loss.backward()
                else:
                    loss.backward()
                loss.detach()  # Truncate the graph
                # NOTE(review): with several tasks, accum_n_steps may already have
                # been reset to 0 by the previous task in this loop, which would
                # make this division fail -- confirm nesting/intent.
                loss_train = (loss_train * (accum_n_steps - 1) + loss.item()) / accum_n_steps
                if accum_n_steps >= args.accum_grad_n_steps or is_new_epoch:
                    if args.clip_grad_norm > 0:
                        total_norm = torch.nn.utils.clip_grad_norm_(
                            model.module.parameters(), args.clip_grad_norm)
                        reporter.add_tensorboard_scalar('total_norm', total_norm)
                    optimizer.step()
                    optimizer.zero_grad()
                    accum_n_steps = 0
                    # NOTE: parameters are forcibly updated at the end of every epoch
                del loss

            pbar_epoch.update(len(batch_train['utt_ids']))
            reporter.add_tensorboard_scalar('learning_rate', optimizer.lr)
            # NOTE: loss/acc/ppl are already added in the model
            reporter.step()
            n_steps += 1
            # NOTE: n_steps is different from the step counter in Noam Optimizer

            if n_steps % args.print_step == 0:
                # Compute loss in the dev set
                batch_dev = iter(dev_set).next(batch_size=1 if 'transducer' in args.dec_type else None)[0]
                # Change mini-batch depending on task
                for task in tasks:
                    loss, observation = model(batch_dev, task, is_eval=True)
                    reporter.add(observation, is_eval=True)
                    loss_dev = loss.item()
                    del loss
                reporter.step(is_eval=True)

                duration_step = time.time() - start_time_step
                if args.input_type == 'speech':
                    xlen = max(len(x) for x in batch_train['xs'])
                    ylen = max(len(y) for y in batch_train['ys'])
                elif args.input_type == 'text':
                    xlen = max(len(x) for x in batch_train['ys'])
                    ylen = max(len(y) for y in batch_train['ys_sub1'])
                logger.info("step:%d(ep:%.2f) loss:%.3f(%.3f)/lr:%.7f/bs:%d/xlen:%d/ylen:%d (%.2f min)" %
                            (n_steps, optimizer.n_epochs + train_set.epoch_detail,
                             loss_train, loss_dev,
                             optimizer.lr, len(batch_train['utt_ids']),
                             xlen, ylen, duration_step / 60))
                start_time_step = time.time()

            # Save figures of loss and accuracy
            if n_steps % (args.print_step * 10) == 0:
                reporter.snapshot()
                model.module.plot_attention()
                model.module.plot_ctc()

            # Evaluate model every 0.1 epoch during MBR training
            if args.mbr_training:
                if int(train_set.epoch_detail * 10) != int(epoch_detail_prev * 10):
                    # dev
                    evaluate([model.module], dev_set, recog_params, args,
                             int(train_set.epoch_detail * 10) / 10, logger)

                    # Save the model
                    optimizer.save_checkpoint(
                        model, save_path, remove_old=False, amp=amp,
                        epoch_detail=train_set.epoch_detail)
                epoch_detail_prev = train_set.epoch_detail

            if is_new_epoch:
                break

        # Save checkpoint and evaluate model per epoch
        duration_epoch = time.time() - start_time_epoch
        logger.info('========== EPOCH:%d (%.2f min) ==========' %
                    (optimizer.n_epochs + 1, duration_epoch / 60))

        if optimizer.n_epochs + 1 < args.eval_start_epoch:
            optimizer.epoch()  # lr decay
            reporter.epoch()  # plot

            # Save the model
            optimizer.save_checkpoint(
                model, save_path,
                remove_old=not is_transformer and args.remove_old_checkpoints,
                amp=amp)
        else:
            start_time_eval = time.time()
            # dev
            metric_dev = evaluate([model.module], dev_set, recog_params, args,
                                  optimizer.n_epochs + 1, logger)
            optimizer.epoch(metric_dev)  # lr decay
            reporter.epoch(metric_dev, name=args.metric)  # plot

            if optimizer.is_topk or is_transformer:
                # Save the model
                optimizer.save_checkpoint(
                    model, save_path,
                    remove_old=not is_transformer and args.remove_old_checkpoints,
                    amp=amp)

                # test
                if optimizer.is_topk:
                    for eval_set in eval_sets:
                        evaluate([model.module], eval_set, recog_params, args,
                                 optimizer.n_epochs, logger)

            duration_eval = time.time() - start_time_eval
            logger.info('Evaluation time: %.2f min' % (duration_eval / 60))

            # Early stopping
            if optimizer.is_early_stop:
                break

            # Convert to fine-tuning stage
            if optimizer.n_epochs == args.convert_to_sgd_epoch:
                optimizer.convert_to_sgd(model, args.lr, args.weight_decay,
                                         decay_type='always', decay_rate=0.5)

        if optimizer.n_epochs >= args.n_epochs:
            break
        # if args.ss_prob > 0:
        #     model.module.scheduled_sampling_trigger()

        start_time_step = time.time()
        start_time_epoch = time.time()

    duration_train = time.time() - start_time_train
    logger.info('Total time: %.2f hour' % (duration_train / 3600))

    reporter.tf_writer.close()
    # No progress bar exists when no epoch was run
    if pbar_epoch is not None:
        pbar_epoch.close()

    return save_path
def main():
    """Decode each set in ``args.recog_sets`` and plot attention weights.

    The training configuration is read from ``conf.yml`` next to the first
    checkpoint in ``args.recog_model``.  For every utterance one PNG is written
    below ``<recog_dir>/att_weights/<speaker>/`` and the reference/hypothesis
    pair is logged to ``plot.log``.

    NOTE(review): block nesting was recovered from a whitespace-collapsed
    listing -- confirm it against version control.
    """
    args = parse()

    # The training-time configuration lives next to the first checkpoint
    model_dir = os.path.dirname(args.recog_model[0])
    conf = load_config(os.path.join(model_dir, 'conf.yml'))

    # Copy every non-decoding option of the training config onto args
    for key, value in conf.items():
        if 'recog' in key:
            continue
        setattr(args, key, value)
    recog_params = vars(args)

    # Start from a fresh log file
    log_path = os.path.join(args.recog_dir, 'plot.log')
    if os.path.isfile(log_path):
        os.remove(log_path)
    set_logger(log_path, stdout=args.recog_stdout)

    for set_idx, tsv_path in enumerate(args.recog_sets):
        # The sub-task dictionary is optional; False is passed when it is absent
        dict_sub1 = os.path.join(model_dir, 'dict_sub1.txt')
        if not os.path.isfile(dict_sub1):
            dict_sub1 = False
        dataset = Dataset(
            corpus=args.corpus,
            tsv_path=tsv_path,
            dict_path=os.path.join(model_dir, 'dict.txt'),
            dict_path_sub1=dict_sub1,
            nlsyms=args.nlsyms,
            wp_model=os.path.join(model_dir, 'wp.model'),
            unit=args.unit,
            unit_sub1=args.unit_sub1,
            batch_size=args.recog_batch_size,
            is_test=True)

        # Models are built only once, while handling the first set
        if set_idx == 0:
            model = Speech2Text(args, model_dir)
            topk_list = load_checkpoint(model, args.recog_model[0])
            # The checkpoint file name ends with "-<epoch>"
            epoch = int(args.recog_model[0].split('-')[-1])

            # Checkpoint averaging is applied to Transformer models only
            if 'transformer' in conf['enc_type'] and conf['dec_type'] == 'transformer':
                model = average_checkpoints(
                    model, args.recog_model[0],
                    n_average=args.recog_n_average,
                    topk_list=topk_list)

            # Any further checkpoints become additional ensemble members
            ensemble_models = [model]
            for ckpt_path in args.recog_model[1:]:
                member_conf = load_config(
                    os.path.join(os.path.dirname(ckpt_path), 'conf.yml'))
                member_args = copy.deepcopy(args)
                for key, value in member_conf.items():
                    if 'recog' not in key:
                        setattr(member_args, key, value)
                member = Speech2Text(member_args)
                load_checkpoint(member, ckpt_path)
                if args.recog_n_gpus > 0:
                    member.cuda()
                ensemble_models.append(member)

            # External LM for shallow fusion (first pass only)
            use_external_lm = args.recog_lm is not None and args.recog_lm_weight > 0
            if not args.lm_fusion and use_external_lm:
                lm_conf = load_config(
                    os.path.join(os.path.dirname(args.recog_lm), 'conf.yml'))
                args_lm = argparse.Namespace()
                for key, value in lm_conf.items():
                    setattr(args_lm, key, value)
                lm = build_lm(args_lm)
                load_checkpoint(lm, args.recog_lm)
                # Attach the LM in the direction it was trained in
                if args_lm.backward:
                    model.lm_bwd = lm
                else:
                    model.lm_fwd = lm

            # Fall back to the training unit when no decoding unit is given
            if not args.recog_unit:
                args.recog_unit = args.unit

            # Record the decoding configuration
            logger.info('recog unit: %s' % args.recog_unit)
            logger.info('recog oracle: %s' % args.recog_oracle)
            logger.info('epoch: %d' % epoch)
            logger.info('batch size: %d' % args.recog_batch_size)
            logger.info('beam width: %d' % args.recog_beam_width)
            logger.info('min length ratio: %.3f' % args.recog_min_len_ratio)
            logger.info('max length ratio: %.3f' % args.recog_max_len_ratio)
            logger.info('length penalty: %.3f' % args.recog_length_penalty)
            logger.info('length norm: %s' % args.recog_length_norm)
            logger.info('coverage penalty: %.3f' % args.recog_coverage_penalty)
            logger.info('coverage threshold: %.3f' % args.recog_coverage_threshold)
            logger.info('CTC weight: %.3f' % args.recog_ctc_weight)
            logger.info('fist LM path: %s' % args.recog_lm)
            logger.info('LM weight: %.3f' % args.recog_lm_weight)
            logger.info('GNMT: %s' % args.recog_gnmt_decoding)
            logger.info('forward-backward attention: %s' % args.recog_fwd_bwd_attention)
            logger.info('resolving UNK: %s' % args.recog_resolving_unk)
            logger.info('ensemble: %d' % len(ensemble_models))
            logger.info('ASR decoder state carry over: %s' % args.recog_asr_state_carry_over)
            logger.info('LM state carry over: %s' % args.recog_lm_state_carry_over)
            logger.info('model average (Transformer): %d' % args.recog_n_average)

            # Move the main model to GPU when one was requested
            if args.recog_n_gpus > 0:
                model.cuda()

        save_path = mkdir_join(args.recog_dir, 'att_weights')

        # Empty the output directory before plotting this set
        if save_path is not None and os.path.isdir(save_path):
            shutil.rmtree(save_path)
            os.mkdir(save_path)

        # Hypotheses are stored reversed when the backward decoder dominates
        is_new_epoch = False
        while not is_new_epoch:
            batch, is_new_epoch = dataset.next(recog_params['recog_batch_size'])
            if dataset.corpus == 'swbd':
                speakers = batch['sessions']
            else:
                speakers = batch['speakers']
            best_hyps_id, aws = model.decode(
                batch['xs'], recog_params, dataset.idx2token[0],
                exclude_eos=False,
                refs_id=batch['ys'],
                ensemble_models=ensemble_models[1:],
                speakers=speakers)

            # CTC posteriors are plotted only for models trained with a CTC loss
            ctc_probs, topk_ids = None, None
            if args.ctc_weight > 0:
                ctc_probs, topk_ids, xlens = model.get_ctc_probs(
                    batch['xs'], temperature=1, topk=min(100, model.vocab))
                # NOTE: ctc_probs: '[B, T, topk]'

            if model.bwd_weight > 0.5:
                # Reverse the order
                best_hyps_id = [ids[::-1] for ids in best_hyps_id]
                aws = [weights[:, ::-1] for weights in aws]

            for idx in range(len(batch['xs'])):
                tokens = dataset.idx2token[0](best_hyps_id[idx], return_list=True)
                speaker = batch['speakers'][idx]
                utt_id = batch['utt_ids'][idx]

                spectrogram = None
                if args.input_type == 'speech':
                    spectrogram = batch['xs'][idx][:, :dataset.input_dim]

                plot_attention_weights(
                    aws[idx][:, :len(tokens)], tokens,
                    spectrogram=spectrogram,
                    ref=batch['text'][idx].lower(),
                    save_path=mkdir_join(save_path, speaker, utt_id + '.png'),
                    figsize=(20, 8),
                    ctc_probs=ctc_probs[idx, :xlens[idx]] if ctc_probs is not None else None,
                    ctc_topk_ids=topk_ids[idx] if topk_ids is not None else None)

                hyp = ' '.join(tokens[::-1] if model.bwd_weight > 0.5 else tokens)
                logger.info('utt-id: %s' % batch['utt_ids'][idx])
                logger.info('Ref: %s' % batch['text'][idx].lower())
                logger.info('Hyp: %s' % hyp)
                logger.info('-' * 50)
def main():
    """Evaluate an ASR model on every set in ``args.recog_sets``.

    The training configuration is read from ``conf.yml`` next to the first
    checkpoint in ``args.recog_model``.  Depending on ``args.recog_metric`` the
    sets are scored by edit distance (WER/CER/PER) or by perplexity/loss, and
    the averages over all sets are written to ``decode.log``.

    NOTE(review): block nesting was recovered from a whitespace-collapsed
    listing -- confirm it against version control.
    """
    args = parse()

    # The training-time configuration lives next to the first checkpoint
    model_dir = os.path.dirname(args.recog_model[0])
    conf = load_config(os.path.join(model_dir, 'conf.yml'))

    # Copy every non-decoding option of the training config onto args
    for key, value in conf.items():
        if 'recog' in key:
            continue
        setattr(args, key, value)
    recog_params = vars(args)

    # Start from a fresh log file
    log_path = os.path.join(args.recog_dir, 'decode.log')
    if os.path.isfile(log_path):
        os.remove(log_path)
    logger = set_logger(log_path, key='decoding', stdout=args.recog_stdout)

    skip_thought = 'skip' in args.enc_type

    # Running sums over the evaluation sets; divided by the set count at the end
    wer_avg = cer_avg = per_avg = 0
    ppl_avg = loss_avg = 0
    for set_idx, tsv_path in enumerate(args.recog_sets):
        # Sub-task dictionaries are optional; False is passed when one is absent
        dict_sub1 = os.path.join(model_dir, 'dict_sub1.txt')
        if not os.path.isfile(dict_sub1):
            dict_sub1 = False
        dict_sub2 = os.path.join(model_dir, 'dict_sub2.txt')
        if not os.path.isfile(dict_sub2):
            dict_sub2 = False
        dataset = Dataset(
            corpus=args.corpus,
            tsv_path=tsv_path,
            dict_path=os.path.join(model_dir, 'dict.txt'),
            dict_path_sub1=dict_sub1,
            dict_path_sub2=dict_sub2,
            nlsyms=os.path.join(model_dir, 'nlsyms.txt'),
            wp_model=os.path.join(model_dir, 'wp.model'),
            wp_model_sub1=os.path.join(model_dir, 'wp_sub1.model'),
            wp_model_sub2=os.path.join(model_dir, 'wp_sub2.model'),
            unit=args.unit,
            unit_sub1=args.unit_sub1,
            unit_sub2=args.unit_sub2,
            batch_size=args.recog_batch_size,
            skip_thought=skip_thought,
            is_test=True)

        # Models are built only once, while handling the first set
        if set_idx == 0:
            model_cls = SkipThought if skip_thought else Speech2Text
            model = model_cls(args, model_dir)
            model = load_checkpoint(model, args.recog_model[0])[0]
            # The checkpoint file name ends with "-<epoch>"
            epoch = int(args.recog_model[0].split('-')[-1])

            # Any further checkpoints become additional ensemble members
            ensemble_models = [model]
            for ckpt_path in args.recog_model[1:]:
                member_conf = load_config(
                    os.path.join(os.path.dirname(ckpt_path), 'conf.yml'))
                member_args = copy.deepcopy(args)
                for key, value in member_conf.items():
                    if 'recog' not in key:
                        setattr(member_args, key, value)
                member = Speech2Text(member_args)
                member = load_checkpoint(member, ckpt_path)[0]
                member.cuda()
                ensemble_models.append(member)

            # External LMs for shallow fusion
            if not args.lm_fusion:
                if args.recog_lm is not None and args.recog_lm_weight > 0:
                    lm_dir = os.path.dirname(args.recog_lm)
                    lm_conf = load_config(os.path.join(lm_dir, 'conf.yml'))
                    args_lm = argparse.Namespace()
                    for key, value in lm_conf.items():
                        setattr(args_lm, key, value)
                    lm = build_lm(
                        args_lm,
                        wordlm=args.recog_wordlm,
                        lm_dict_path=os.path.join(os.path.dirname(args.recog_lm), 'dict.txt'),
                        asr_dict_path=os.path.join(model_dir, 'dict.txt'))
                    lm = load_checkpoint_lm(lm, args.recog_lm)[0]
                    # Attach the LM in the direction it was trained in
                    if args_lm.backward:
                        model.lm_bwd = lm
                    else:
                        model.lm_fwd = lm

                # A dedicated backward LM is loaded only when a decoding mode uses it
                needs_bwd_lm = args.recog_fwd_bwd_attention or args.recog_reverse_lm_rescoring
                if args.recog_lm_bwd is not None and args.recog_lm_weight > 0 and needs_bwd_lm:
                    lm_conf = load_config(
                        os.path.join(os.path.dirname(args.recog_lm_bwd), 'conf.yml'))
                    args_lm_bwd = argparse.Namespace()
                    for key, value in lm_conf.items():
                        setattr(args_lm_bwd, key, value)
                    lm_bwd = build_lm(args_lm_bwd)
                    lm_bwd = load_checkpoint(lm_bwd, args.recog_lm_bwd)[0]
                    model.lm_bwd = lm_bwd

            # Fall back to the training unit when no decoding unit is given
            if not args.recog_unit:
                args.recog_unit = args.unit

            # Record the decoding configuration
            logger.info('recog unit: %s' % args.recog_unit)
            logger.info('recog metric: %s' % args.recog_metric)
            logger.info('recog oracle: %s' % args.recog_oracle)
            logger.info('epoch: %d' % (epoch - 1))
            logger.info('batch size: %d' % args.recog_batch_size)
            logger.info('beam width: %d' % args.recog_beam_width)
            logger.info('min length ratio: %.3f' % args.recog_min_len_ratio)
            logger.info('max length ratio: %.3f' % args.recog_max_len_ratio)
            logger.info('length penalty: %.3f' % args.recog_length_penalty)
            logger.info('coverage penalty: %.3f' % args.recog_coverage_penalty)
            logger.info('coverage threshold: %.3f' % args.recog_coverage_threshold)
            logger.info('CTC weight: %.3f' % args.recog_ctc_weight)
            logger.info('LM path: %s' % args.recog_lm)
            logger.info('LM path (bwd): %s' % args.recog_lm_bwd)
            logger.info('LM weight: %.3f' % args.recog_lm_weight)
            logger.info('GNMT: %s' % args.recog_gnmt_decoding)
            logger.info('forward-backward attention: %s' % args.recog_fwd_bwd_attention)
            logger.info('reverse LM rescoring: %s' % args.recog_reverse_lm_rescoring)
            logger.info('resolving UNK: %s' % args.recog_resolving_unk)
            logger.info('ensemble: %d' % len(ensemble_models))
            logger.info('ASR decoder state carry over: %s' % args.recog_asr_state_carry_over)
            logger.info('LM state carry over: %s' % args.recog_lm_state_carry_over)
            logger.info('cache size: %d' % args.recog_n_caches)
            logger.info('cache type: %s' % args.recog_cache_type)
            logger.info('cache theta (speech): %.3f' % args.recog_cache_theta_speech)
            logger.info('cache lambda (speech): %.3f' % args.recog_cache_lambda_speech)
            logger.info('cache theta (lm): %.3f' % args.recog_cache_theta_lm)
            logger.info('cache lambda (lm): %.3f' % args.recog_cache_lambda_lm)

            # GPU setting
            model.cuda()

        start_time = time.time()

        if args.recog_metric == 'edit_distance':
            # Keyword arguments shared by all edit-distance evaluators
            common = dict(epoch=epoch - 1, recog_dir=args.recog_dir, progressbar=True)
            unit = args.recog_unit
            if unit in ['word', 'word_char']:
                wer, cer, _ = eval_word(ensemble_models, dataset, recog_params, **common)
                wer_avg += wer
                cer_avg += cer
            elif unit == 'wp':
                wer, cer = eval_wordpiece(ensemble_models, dataset, recog_params, **common)
                wer_avg += wer
                cer_avg += cer
            elif 'char' in unit:
                wer, cer = eval_char(ensemble_models, dataset, recog_params,
                                     task_idx=0, **common)
                wer_avg += wer
                cer_avg += cer
            elif 'phone' in unit:
                per = eval_phone(ensemble_models, dataset, recog_params, **common)
                per_avg += per
            else:
                raise ValueError(args.recog_unit)
        elif args.recog_metric in ['ppl', 'loss']:
            ppl, loss = eval_ppl(ensemble_models, dataset, progressbar=True)
            ppl_avg += ppl
            loss_avg += loss
        else:
            # 'acc', 'bleu' and any unknown metric are not supported
            raise NotImplementedError
        logger.info('Elasped time: %.2f [sec]:' % (time.time() - start_time))

    # Report the averages over all evaluation sets
    n_sets = len(args.recog_sets)
    if args.recog_metric == 'edit_distance':
        if 'phone' in args.recog_unit:
            logger.info('PER (avg.): %.2f %%\n' % (per_avg / n_sets))
        else:
            logger.info('WER / CER (avg.): %.2f / %.2f %%\n' %
                        (wer_avg / n_sets, cer_avg / n_sets))
    elif args.recog_metric in ['ppl', 'loss']:
        ppl_mean = ppl_avg / n_sets
        logger.info('PPL (avg.): %.2f\n' % ppl_mean)
        print('PPL (avg.): %.2f' % ppl_mean)
        loss_mean = loss_avg / n_sets
        logger.info('Loss (avg.): %.2f\n' % loss_mean)
        print('Loss (avg.): %.2f' % loss_mean)
def main():
    """Train a language model and return the directory the run was saved to.

    The function parses the command line, optionally restores the
    configuration stored next to ``args.resume``, builds the train/dev/eval
    datasets, the model and the optimizer wrapped in ``LRScheduler``, then
    trains until early stopping or ``args.n_epochs`` is reached, evaluating
    perplexity and saving checkpoints once per epoch.

    Returns:
        The ``save_path`` directory used for checkpoints and logs.

    NOTE(review): block nesting was recovered from a whitespace-collapsed
    listing -- confirm it against version control.
    """
    args = parse()

    # Load a conf file
    if args.resume:
        conf = load_config(
            os.path.join(os.path.dirname(args.resume), 'conf.yml'))
        for k, v in conf.items():
            if k != 'resume':
                setattr(args, k, v)

    # Set save path
    if args.resume:
        save_path = os.path.dirname(args.resume)
        dir_name = os.path.basename(save_path)
    else:
        dir_name = set_lm_name(args)
        save_path = mkdir_join(
            args.model_save_dir,
            '_'.join(os.path.basename(args.train_set).split('.')[:-1]),
            dir_name)
        save_path = set_save_path(save_path)  # avoid overwriting

    # Set logger
    logger = set_logger(os.path.join(save_path, 'train.log'),
                        key='training', stdout=args.stdout)

    # Load dataset
    train_set = Dataset(corpus=args.corpus,
                        tsv_path=args.train_set,
                        dict_path=args.dict,
                        nlsyms=args.nlsyms,
                        unit=args.unit,
                        wp_model=args.wp_model,
                        batch_size=args.batch_size * args.n_gpus,
                        n_epochs=args.n_epochs,
                        min_n_tokens=args.min_n_tokens,
                        bptt=args.bptt,
                        backward=args.backward,
                        serialize=args.serialize)
    dev_set = Dataset(corpus=args.corpus,
                      tsv_path=args.dev_set,
                      dict_path=args.dict,
                      nlsyms=args.nlsyms,
                      unit=args.unit,
                      wp_model=args.wp_model,
                      batch_size=args.batch_size * args.n_gpus,
                      bptt=args.bptt,
                      backward=args.backward,
                      serialize=args.serialize)
    eval_sets = []
    for s in args.eval_sets:
        eval_sets += [
            Dataset(corpus=args.corpus,
                    tsv_path=s,
                    dict_path=args.dict,
                    nlsyms=args.nlsyms,
                    unit=args.unit,
                    wp_model=args.wp_model,
                    batch_size=1,
                    bptt=args.bptt,
                    backward=args.backward,
                    serialize=args.serialize)
        ]

    # The vocabulary size is taken from the training set
    args.vocab = train_set.vocab

    # Model setting
    model = build_lm(args, save_path)

    if args.resume:
        # Set optimizer (the checkpoint file name ends with "-<epoch>")
        epoch = int(args.resume.split('-')[-1])
        optimizer = set_optimizer(
            model,
            'sgd' if epoch > conf['convert_to_sgd_epoch'] else conf['optimizer'],
            conf['lr'], conf['weight_decay'])

        # Wrap optimizer by learning rate scheduler
        optimizer = LRScheduler(
            optimizer, conf['lr'],
            decay_type=conf['lr_decay_type'],
            decay_start_epoch=conf['lr_decay_start_epoch'],
            decay_rate=conf['lr_decay_rate'],
            decay_patient_n_epochs=conf['lr_decay_patient_n_epochs'],
            early_stop_patient_n_epochs=conf['early_stop_patient_n_epochs'],
            warmup_start_lr=conf['warmup_start_lr'],
            warmup_n_steps=conf['warmup_n_steps'],
            model_size=conf['d_model'],
            factor=conf['lr_factor'],
            noam=conf['lm_type'] == 'transformer')

        # Restore the last saved model
        model, optimizer = load_checkpoint(model, args.resume, optimizer, resume=True)

        # Resume between convert_to_sgd_epoch -1 and convert_to_sgd_epoch
        if epoch == conf['convert_to_sgd_epoch']:
            # Carry the epoch/step counters over to the fresh SGD scheduler
            n_epochs = optimizer.n_epochs
            n_steps = optimizer.n_steps
            optimizer = set_optimizer(model, 'sgd', args.lr, conf['weight_decay'])
            optimizer = LRScheduler(optimizer, args.lr,
                                    decay_type='always',
                                    decay_start_epoch=0,
                                    decay_rate=0.5)
            optimizer._epoch = n_epochs
            optimizer._step = n_steps
            logger.info('========== Convert to SGD ==========')
    else:
        # Save the conf file as a yaml file
        save_config(vars(args), os.path.join(save_path, 'conf.yml'))

        # Save the nlsyms, dictionary, and wp_model
        if args.nlsyms:
            shutil.copy(args.nlsyms, os.path.join(save_path, 'nlsyms.txt'))
        shutil.copy(args.dict, os.path.join(save_path, 'dict.txt'))
        if args.unit == 'wp':
            shutil.copy(args.wp_model, os.path.join(save_path, 'wp.model'))

        for k, v in sorted(vars(args).items(), key=lambda x: x[0]):
            logger.info('%s: %s' % (k, str(v)))

        # Count total parameters
        for n in sorted(list(model.num_params_dict.keys())):
            n_params = model.num_params_dict[n]
            logger.info("%s %d" % (n, n_params))
        logger.info("Total %.2f M parameters" % (model.total_parameters / 1000000))
        logger.info(model)

        # Set optimizer
        optimizer = set_optimizer(model, args.optimizer, args.lr, args.weight_decay)

        # Wrap optimizer by learning rate scheduler
        optimizer = LRScheduler(
            optimizer, args.lr,
            decay_type=args.lr_decay_type,
            decay_start_epoch=args.lr_decay_start_epoch,
            decay_rate=args.lr_decay_rate,
            decay_patient_n_epochs=args.lr_decay_patient_n_epochs,
            early_stop_patient_n_epochs=args.early_stop_patient_n_epochs,
            warmup_start_lr=args.warmup_start_lr,
            warmup_n_steps=args.warmup_n_steps,
            model_size=args.d_model,
            factor=args.lr_factor,
            noam=args.lm_type == 'transformer')

    # GPU setting
    if args.n_gpus >= 1:
        torch.backends.cudnn.benchmark = True
        model = CustomDataParallel(model, device_ids=list(range(0, args.n_gpus)))
        model.cuda()

    # Set process name
    logger.info('PID: %s' % os.getpid())
    logger.info('USERNAME: %s' % os.uname()[1])
    setproctitle(args.job_name if args.job_name else dir_name)

    # Set reporter
    reporter = Reporter(save_path)

    hidden = None
    start_time_train = time.time()
    start_time_epoch = time.time()
    start_time_step = time.time()
    pbar_epoch = tqdm(total=len(train_set))
    accum_n_tokens = 0
    while True:
        # Compute loss in the training set
        ys_train, is_new_epoch = train_set.next()
        accum_n_tokens += sum([len(y) for y in ys_train])
        # Gradients are deliberately NOT cleared here: they are zeroed right
        # after every optimizer.step() below, so clearing them before each
        # forward pass would throw away the gradients accumulated while
        # accum_n_tokens is still below args.accum_grad_n_tokens.
        loss, hidden, reporter = model(ys_train, hidden, reporter)
        loss.backward()
        loss.detach()  # Truncate the graph
        if args.accum_grad_n_tokens == 0 or accum_n_tokens >= args.accum_grad_n_tokens:
            if args.clip_grad_norm > 0:
                total_norm = torch.nn.utils.clip_grad_norm_(
                    model.module.parameters(), args.clip_grad_norm)
                reporter.add_tensorboard_scalar('total_norm', total_norm)
            optimizer.step()
            optimizer.zero_grad()
            accum_n_tokens = 0
        loss_train = loss.item()
        del loss
        hidden = model.module.repackage_state(hidden)
        reporter.add_tensorboard_scalar('learning_rate', optimizer.lr)
        # NOTE: loss/acc/ppl are already added in the model
        reporter.step()

        if optimizer.n_steps % args.print_step == 0:
            # Compute loss in the dev set
            ys_dev = dev_set.next()[0]
            loss, _, reporter = model(ys_dev, None, reporter, is_eval=True)
            loss_dev = loss.item()
            del loss
            reporter.step(is_eval=True)

            duration_step = time.time() - start_time_step
            logger.info(
                "step:%d(ep:%.2f) loss:%.3f(%.3f)/ppl:%.3f(%.3f)/lr:%.5f/bs:%d (%.2f min)"
                % (optimizer.n_steps, optimizer.n_epochs + train_set.epoch_detail,
                   loss_train, loss_dev, np.exp(loss_train), np.exp(loss_dev),
                   optimizer.lr, ys_train.shape[0], duration_step / 60))
            start_time_step = time.time()
        pbar_epoch.update(ys_train.shape[0] * (ys_train.shape[1] - 1))

        # Save figures of loss and accuracy
        if optimizer.n_steps % (args.print_step * 10) == 0:
            reporter.snapshot()
            if args.lm_type == 'transformer':
                model.module.plot_attention()

        # Save checkpoint and evaluate model per epoch
        if is_new_epoch:
            duration_epoch = time.time() - start_time_epoch
            logger.info('========== EPOCH:%d (%.2f min) ==========' %
                        (optimizer.n_epochs + 1, duration_epoch / 60))

            if optimizer.n_epochs + 1 < args.eval_start_epoch:
                optimizer.epoch()  # lr decay
                reporter.epoch()  # plot

                # Save the model
                save_checkpoint(
                    model, save_path, optimizer, optimizer.n_epochs,
                    remove_old_checkpoints=args.lm_type != 'transformer')
            else:
                start_time_eval = time.time()
                # dev
                ppl_dev, _ = eval_ppl([model.module], dev_set,
                                      batch_size=1, bptt=args.bptt)
                logger.info('PPL (%s, epoch:%d): %.2f' %
                            (dev_set.set, optimizer.n_epochs, ppl_dev))
                optimizer.epoch(ppl_dev)  # lr decay
                reporter.epoch(ppl_dev, name='perplexity')  # plot

                if optimizer.is_best:
                    # Save the model
                    save_checkpoint(
                        model, save_path, optimizer, optimizer.n_epochs,
                        remove_old_checkpoints=args.lm_type != 'transformer')

                    # test
                    ppl_test_avg = 0.
                    for eval_set in eval_sets:
                        ppl_test, _ = eval_ppl([model.module], eval_set,
                                               batch_size=1, bptt=args.bptt)
                        logger.info(
                            'PPL (%s, epoch:%d): %.2f' %
                            (eval_set.set, optimizer.n_epochs, ppl_test))
                        ppl_test_avg += ppl_test
                    if len(eval_sets) > 0:
                        logger.info('PPL (avg., epoch:%d): %.2f' %
                                    (optimizer.n_epochs, ppl_test_avg / len(eval_sets)))

                duration_eval = time.time() - start_time_eval
                logger.info('Evaluation time: %.2f min' % (duration_eval / 60))

                # Early stopping
                if optimizer.is_early_stop:
                    break

                # Convert to fine-tuning stage
                if optimizer.n_epochs == args.convert_to_sgd_epoch:
                    # Carry the epoch/step counters over to the fresh SGD scheduler
                    n_epochs = optimizer.n_epochs
                    n_steps = optimizer.n_steps
                    optimizer = set_optimizer(model, 'sgd', args.lr, args.weight_decay)
                    optimizer = LRScheduler(optimizer, args.lr,
                                            decay_type='always',
                                            decay_start_epoch=0,
                                            decay_rate=0.5)
                    optimizer._epoch = n_epochs
                    optimizer._step = n_steps
                    logger.info('========== Convert to SGD ==========')

            # Fresh progress bar for the next epoch
            pbar_epoch = tqdm(total=len(train_set))

            if optimizer.n_epochs == args.n_epochs:
                break

            start_time_step = time.time()
            start_time_epoch = time.time()

    duration_train = time.time() - start_time_train
    logger.info('Total time: %.2f hour' % (duration_train / 3600))

    reporter.tf_writer.close()
    pbar_epoch.close()

    return save_path
def main():
    """Train a language model and return the directory it was saved to.

    The function parses the command-line arguments, optionally restores the
    configuration and checkpoint given by ``args.resume``, builds the
    train/dev/eval datasets, the model, the optimizer and its learning-rate
    scheduler, and then runs the epoch loop.  After every epoch the model is
    either just checkpointed (before ``args.eval_start_epoch``) or evaluated
    on the dev and eval sets first.

    Returns:
        ``args.save_path``, the directory holding checkpoints and logs.

    """
    args = parse_args_train(sys.argv[1:])

    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    # Load a conf file
    if args.resume:
        conf = load_config(
            os.path.join(os.path.dirname(args.resume), 'conf.yml'))
        for k, v in conf.items():
            # Keep the resume path given on the command line
            if k != 'resume':
                setattr(args, k, v)

    # for multi-GPUs
    if args.n_gpus > 1:
        # The mini-batch grows with the number of GPUs, so the number of
        # gradient accumulation steps shrinks accordingly (at least 1).
        batch_size = args.batch_size * args.n_gpus
        accum_grad_n_steps = max(1, args.accum_grad_n_steps // args.n_gpus)
    else:
        batch_size = args.batch_size
        accum_grad_n_steps = args.accum_grad_n_steps

    # Load dataset
    train_set = Dataset(corpus=args.corpus,
                        tsv_path=args.train_set,
                        dict_path=args.dict,
                        nlsyms=args.nlsyms,
                        unit=args.unit,
                        wp_model=args.wp_model,
                        batch_size=batch_size,
                        n_epochs=args.n_epochs,
                        min_n_tokens=args.min_n_tokens,
                        bptt=args.bptt,
                        shuffle=args.shuffle,
                        backward=args.backward,
                        serialize=args.serialize)
    dev_set = Dataset(corpus=args.corpus,
                      tsv_path=args.dev_set,
                      dict_path=args.dict,
                      nlsyms=args.nlsyms,
                      unit=args.unit,
                      wp_model=args.wp_model,
                      batch_size=batch_size,
                      bptt=args.bptt,
                      backward=args.backward,
                      serialize=args.serialize)
    eval_sets = [
        Dataset(corpus=args.corpus,
                tsv_path=s,
                dict_path=args.dict,
                nlsyms=args.nlsyms,
                unit=args.unit,
                wp_model=args.wp_model,
                batch_size=1,
                bptt=args.bptt,
                backward=args.backward,
                serialize=args.serialize)
        for s in args.eval_sets
    ]

    args.vocab = train_set.vocab

    # Set save path
    if args.resume:
        args.save_path = os.path.dirname(args.resume)
        dir_name = os.path.basename(args.save_path)
    else:
        dir_name = set_lm_name(args)
        args.save_path = mkdir_join(
            args.model_save_dir,
            '_'.join(os.path.basename(args.train_set).split('.')[:-1]),
            dir_name)
        args.save_path = set_save_path(args.save_path)  # avoid overwriting

    # Set logger
    set_logger(os.path.join(args.save_path, 'train.log'), stdout=args.stdout)

    # Model setting
    model = build_lm(args, args.save_path)

    if not args.resume:
        # Save nlsyms, dictionary, and wp_model
        if args.nlsyms:
            shutil.copy(args.nlsyms,
                        os.path.join(args.save_path, 'nlsyms.txt'))
        shutil.copy(args.dict, os.path.join(args.save_path, 'dict.txt'))
        if args.unit == 'wp':
            shutil.copy(args.wp_model,
                        os.path.join(args.save_path, 'wp.model'))

        for k, v in sorted(args.items(), key=lambda x: x[0]):
            logger.info('%s: %s' % (k, str(v)))

    # Count total parameters
    for n in sorted(list(model.num_params_dict.keys())):
        n_params = model.num_params_dict[n]
        logger.info("%s %d" % (n, n_params))
    logger.info("Total %.2f M parameters" % (model.total_parameters / 1000000))
    logger.info('torch version: %s' % str(torch.__version__))
    logger.info(model)

    # Set optimizer
    # The resumed epoch is the suffix after the last '-' of the checkpoint path
    resume_epoch = int(args.resume.split('-')[-1]) if args.resume else 0
    optimizer = set_optimizer(
        model,
        'sgd' if resume_epoch > args.convert_to_sgd_epoch else args.optimizer,
        args.lr, args.weight_decay)

    # Wrap optimizer by learning rate scheduler
    is_transformer = args.lm_type in ['transformer', 'transformer_xl']
    scheduler = LRScheduler(
        optimizer, args.lr,
        decay_type=args.lr_decay_type,
        decay_start_epoch=args.lr_decay_start_epoch,
        decay_rate=args.lr_decay_rate,
        decay_patient_n_epochs=args.lr_decay_patient_n_epochs,
        early_stop_patient_n_epochs=args.early_stop_patient_n_epochs,
        warmup_start_lr=args.warmup_start_lr,
        warmup_n_steps=args.warmup_n_steps,
        model_size=args.get('transformer_d_model', 0),
        factor=args.lr_factor,
        noam=args.optimizer == 'noam',
        save_checkpoints_topk=10 if is_transformer else 1)

    if args.resume:
        # Restore the last saved model
        load_checkpoint(args.resume, model, scheduler)

        # Resume between convert_to_sgd_epoch -1 and convert_to_sgd_epoch
        if resume_epoch == args.convert_to_sgd_epoch:
            scheduler.convert_to_sgd(model, args.lr, args.weight_decay,
                                     decay_type='always', decay_rate=0.5)

    # GPU setting
    args.use_apex = args.train_dtype in ["O0", "O1", "O2", "O3"]
    amp, scaler = None, None
    if args.n_gpus >= 1:
        model.cudnn_setting(
            deterministic=not (is_transformer or args.cudnn_benchmark),
            benchmark=not is_transformer and args.cudnn_benchmark)

        # Move the parameters to the GPU *before* the mixed precision setup:
        # apex's amp.initialize() requires the model parameters to already be
        # located on a CUDA device.
        model.cuda()

        # Mixed precision training setting
        if args.use_apex:
            if LooseVersion(torch.__version__) >= LooseVersion("1.6.0"):
                # Native AMP only needs a gradient scaler
                scaler = torch.cuda.amp.GradScaler()
            else:
                from apex import amp
                model, scheduler.optimizer = amp.initialize(
                    model, scheduler.optimizer, opt_level=args.train_dtype)
                amp.init()
                if args.resume:
                    load_checkpoint(args.resume, amp=amp)
        model = CustomDataParallel(model,
                                   device_ids=list(range(0, args.n_gpus)))
    else:
        model = CPUWrapperLM(model)

    # Set process name
    logger.info('PID: %s' % os.getpid())
    logger.info('USERNAME: %s' % os.uname()[1])
    logger.info('#GPU: %d' % torch.cuda.device_count())
    setproctitle(args.job_name if args.job_name else dir_name)

    # Set reporter
    reporter = Reporter(args, model)
    if args.resume:
        n_steps = scheduler.n_steps * accum_grad_n_steps
        reporter.resume(n_steps, resume_epoch)

    # Save conf file as a yaml file
    if not args.resume:
        save_config(args, os.path.join(args.save_path, 'conf.yml'))
        # NOTE: save after reporter for wandb ID

    hidden = None
    start_time_train = time.time()
    for ep in range(resume_epoch, args.n_epochs):
        for ys_train, is_new_epoch in train_set:
            hidden = train(model, train_set, dev_set, scheduler, reporter,
                           logger, args, accum_grad_n_steps, amp, scaler,
                           hidden)

        # Save checkpoint and validate model per epoch
        if reporter.n_epochs + 1 < args.eval_start_epoch:
            scheduler.epoch()  # lr decay
            reporter.epoch()  # plot

            # Save model
            scheduler.save_checkpoint(
                model, args.save_path,
                remove_old=not is_transformer and args.remove_old_checkpoints,
                amp=amp)
        else:
            start_time_eval = time.time()
            # dev
            model.module.reset_length(args.bptt)
            ppl_dev, _ = eval_ppl([model.module], dev_set,
                                  batch_size=1, bptt=args.bptt)
            model.module.reset_length(args.bptt)
            scheduler.epoch(ppl_dev)  # lr decay
            reporter.epoch(ppl_dev, name='perplexity')  # plot
            reporter.add_scalar('dev/perplexity', ppl_dev)
            logger.info('PPL (%s, ep:%d): %.2f' %
                        (dev_set.set, reporter.n_epochs, ppl_dev))

            if scheduler.is_topk or is_transformer:
                # Save model
                scheduler.save_checkpoint(
                    model, args.save_path,
                    remove_old=not is_transformer and args.remove_old_checkpoints,
                    amp=amp)

                # test
                ppl_test_avg = 0.
                for eval_set in eval_sets:
                    model.module.reset_length(args.bptt)
                    ppl_test, _ = eval_ppl([model.module], eval_set,
                                           batch_size=1, bptt=args.bptt)
                    model.module.reset_length(args.bptt)
                    logger.info('PPL (%s, ep:%d): %.2f' %
                                (eval_set.set, reporter.n_epochs, ppl_test))
                    ppl_test_avg += ppl_test
                if len(eval_sets) > 0:
                    logger.info(
                        'PPL (avg., ep:%d): %.2f' %
                        (reporter.n_epochs, ppl_test_avg / len(eval_sets)))

            logger.info('Evaluation time: %.2f min' %
                        ((time.time() - start_time_eval) / 60))

            # Early stopping
            if scheduler.is_early_stop:
                break

            # Convert to fine-tuning stage
            if reporter.n_epochs == args.convert_to_sgd_epoch:
                scheduler.convert_to_sgd(model, args.lr, args.weight_decay,
                                         decay_type='always', decay_rate=0.5)

        if reporter.n_epochs >= args.n_epochs:
            break

    logger.info('Total time: %.2f hour' %
                ((time.time() - start_time_train) / 3600))
    reporter.close()

    return args.save_path
def main():
    """Train a language model with Horovod data parallelism.

    Every worker builds the datasets and the model; rank 0 additionally owns
    the logger, the saved configuration and the checkpoints.  Model
    parameters and optimizer state are broadcast from rank 0 whenever a new
    optimizer is created.

    Returns:
        ``save_path``, the directory used for checkpoints and logs.

    """
    args = parse()

    hvd.init()
    torch.cuda.set_device(hvd.local_rank())
    hvd_rank = hvd.rank()

    # Load a conf file
    if args.resume:
        conf = load_config(os.path.join(os.path.dirname(args.resume), 'conf.yml'))
        for k, v in conf.items():
            if k != 'resume':
                setattr(args, k, v)

    # Load dataset
    train_set = Dataset(corpus=args.corpus,
                        tsv_path=args.train_set,
                        dict_path=args.dict,
                        nlsyms=args.nlsyms,
                        unit=args.unit,
                        wp_model=args.wp_model,
                        batch_size=args.batch_size,
                        n_epochs=args.n_epochs,
                        min_n_tokens=args.min_n_tokens,
                        bptt=args.bptt,
                        n_customers=hvd.size(),
                        backward=args.backward,
                        serialize=args.serialize)
    dev_set = Dataset(corpus=args.corpus,
                      tsv_path=args.dev_set,
                      dict_path=args.dict,
                      nlsyms=args.nlsyms,
                      unit=args.unit,
                      wp_model=args.wp_model,
                      batch_size=args.batch_size,
                      bptt=args.bptt,
                      n_customers=hvd.size(),
                      backward=args.backward,
                      serialize=args.serialize)
    eval_set = Dataset(corpus=args.corpus,
                       tsv_path=args.eval_set,
                       dict_path=args.dict,
                       nlsyms=args.nlsyms,
                       unit=args.unit,
                       wp_model=args.wp_model,
                       batch_size=args.batch_size,
                       bptt=args.bptt,
                       n_customers=hvd.size(),
                       backward=args.backward,
                       serialize=args.serialize)

    args.vocab = train_set.vocab

    train_loader = ChunkDataloader(train_set,
                                   batch_size=1,
                                   num_workers=1,
                                   distributed=True,
                                   shuffle=False)
    eval_loader = ChunkDataloader(eval_set,
                                  batch_size=1,
                                  num_workers=1,
                                  distributed=True)

    # Set save path
    if args.resume:
        save_path = os.path.dirname(args.resume)
        dir_name = os.path.basename(save_path)
    else:
        dir_name = set_lm_name(args)
        save_path = mkdir_join(args.model_save_dir, '_'.join(
            os.path.basename(args.train_set).split('.')[:-1]), dir_name)
        if hvd.rank() == 0:
            save_path = set_save_path(save_path)  # avoid overwriting

    # Set logger
    # NOTE: `logger` is only bound on rank 0, so every use below is guarded.
    if hvd_rank == 0:
        logger = set_logger(os.path.join(save_path, 'train.log'),
                            key='training', stdout=args.stdout)
        # Set process name
        logger.info('PID: %s' % os.getpid())
        logger.info('USERNAME: %s' % os.uname()[1])
        logger.info('NUMBER_DEVICES: %s' % hvd.size())
    setproctitle(args.job_name if args.job_name else dir_name)

    # Model setting
    model = build_lm(args, save_path)

    # GPU setting
    if args.n_gpus >= 1:
        torch.backends.cudnn.benchmark = True
        model.cuda()

    if args.resume:
        # Set optimizer
        epoch = int(args.resume.split('-')[-1])
        optimizer = set_optimizer(
            model,
            'sgd' if epoch > conf['convert_to_sgd_epoch'] else conf['optimizer'],
            conf['lr'], conf['weight_decay'])

        # Restore the last saved model
        if hvd_rank == 0:
            model, optimizer = load_checkpoint(model, args.resume, optimizer,
                                               resume=True)

        # broadcast
        optimizer = hvd.DistributedOptimizer(
            optimizer, named_parameters=model.named_parameters())
        hvd.broadcast_parameters(model.state_dict(), root_rank=0)
        hvd.broadcast_optimizer_state(optimizer, root_rank=0)

        # Wrap optimizer by learning rate scheduler
        optimizer = LRScheduler(
            optimizer, conf['lr'],
            decay_type=conf['lr_decay_type'],
            decay_start_epoch=conf['lr_decay_start_epoch'],
            decay_rate=conf['lr_decay_rate'],
            decay_patient_n_epochs=conf['lr_decay_patient_n_epochs'],
            early_stop_patient_n_epochs=conf['early_stop_patient_n_epochs'],
            warmup_start_lr=conf['warmup_start_lr'],
            warmup_n_steps=conf['warmup_n_steps'],
            model_size=conf['d_model'],
            factor=conf['lr_factor'],
            noam=conf['lm_type'] == 'transformer')

        # Resume between convert_to_sgd_epoch -1 and convert_to_sgd_epoch
        if epoch == conf['convert_to_sgd_epoch']:
            n_epochs = optimizer.n_epochs
            n_steps = optimizer.n_steps
            optimizer = set_optimizer(model, 'sgd', args.lr, conf['weight_decay'])

            # broadcast
            # The raw optimizer is wrapped by Horovod first and by the
            # scheduler last, exactly as in the in-loop conversion below, so
            # that `optimizer` is still an LRScheduler afterwards.
            optimizer = hvd.DistributedOptimizer(
                optimizer, named_parameters=model.named_parameters())
            hvd.broadcast_parameters(model.state_dict(), root_rank=0)
            hvd.broadcast_optimizer_state(optimizer, root_rank=0)

            optimizer = LRScheduler(optimizer, args.lr,
                                    decay_type='always',
                                    decay_start_epoch=0,
                                    decay_rate=0.5)
            optimizer._epoch = n_epochs
            optimizer._step = n_steps
            if hvd_rank == 0:
                logger.info('========== Convert to SGD ==========')
    else:
        # Save the conf file as a yaml file
        if hvd_rank == 0:
            save_config(vars(args), os.path.join(save_path, 'conf.yml'))

            # Save the nlsyms, dictionary, and wp_model
            if args.nlsyms:
                shutil.copy(args.nlsyms, os.path.join(save_path, 'nlsyms.txt'))
            shutil.copy(args.dict, os.path.join(save_path, 'dict.txt'))
            if args.unit == 'wp':
                shutil.copy(args.wp_model, os.path.join(save_path, 'wp.model'))

            for k, v in sorted(vars(args).items(), key=lambda x: x[0]):
                logger.info('%s: %s' % (k, str(v)))

        # Count total parameters
        for n in sorted(list(model.num_params_dict.keys())):
            n_params = model.num_params_dict[n]
            if hvd.rank() == 0:
                logger.info("%s %d" % (n, n_params))
        if hvd_rank == 0:
            logger.info("Total %.2f M parameters" %
                        (model.total_parameters / 1000000))
            logger.info(model)

        # Set optimizer
        hvd.broadcast_parameters(model.state_dict(), root_rank=0)
        optimizer = set_optimizer(model, args.optimizer, args.lr,
                                  args.weight_decay)
        optimizer = hvd.DistributedOptimizer(
            optimizer, named_parameters=model.named_parameters())
        hvd.broadcast_optimizer_state(optimizer, root_rank=0)

        # Wrap optimizer by learning rate scheduler
        optimizer = LRScheduler(
            optimizer, args.lr,
            decay_type=args.lr_decay_type,
            decay_start_epoch=args.lr_decay_start_epoch,
            decay_rate=args.lr_decay_rate,
            decay_patient_n_epochs=args.lr_decay_patient_n_epochs,
            early_stop_patient_n_epochs=args.early_stop_patient_n_epochs,
            warmup_start_lr=args.warmup_start_lr,
            warmup_n_steps=args.warmup_n_steps,
            model_size=args.d_model,
            factor=args.lr_factor,
            noam=args.lm_type == 'transformer')

    # Set reporter
    reporter = Reporter(save_path)

    hidden = None
    start_time_train = time.time()
    start_time_epoch = time.time()
    start_time_step = time.time()
    data_size = len(train_set)
    accum_n_tokens = 0
    verbose = 1 if hvd_rank == 0 else 0  # progress bar on rank 0 only
    while True:
        model.train()
        with tqdm(total=data_size/hvd.size(),
                  desc='Train Epoch #{}'.format(optimizer.n_epochs + 1),
                  disable=not verbose) as pbar_epoch:
            # Compute loss in the training set
            for _, ys_train in enumerate(train_loader):
                accum_n_tokens += sum([len(y) for y in ys_train])
                optimizer.zero_grad()
                loss, hidden, reporter = model(ys_train, hidden, reporter)
                loss.backward()
                loss.detach()  # Trancate the graph
                if args.accum_grad_n_tokens == 0 or accum_n_tokens >= args.accum_grad_n_tokens:
                    if args.clip_grad_norm > 0:
                        total_norm = torch.nn.utils.clip_grad_norm_(
                            model.parameters(), args.clip_grad_norm)
                        #reporter.add_tensorboard_scalar('total_norm', total_norm)
                    optimizer.step()
                    optimizer.zero_grad()
                    accum_n_tokens = 0
                loss_train = loss.item()
                del loss
                hidden = model.repackage_state(hidden)

                if optimizer.n_steps % args.print_step == 0:
                    model.eval()
                    # Compute loss in the dev set
                    ys_dev = dev_set.next()[0]
                    loss, _, reporter = model(ys_dev, None, reporter, is_eval=True)
                    loss_dev = loss.item()
                    del loss

                    duration_step = time.time() - start_time_step
                    if hvd_rank == 0:
                        logger.info("step:%d(ep:%.2f) loss:%.3f(%.3f)/ppl:%.3f(%.3f)/lr:%.5f/bs:%d (%.2f min)" %
                                    (optimizer.n_steps,
                                     optimizer.n_steps/data_size*hvd.size(),
                                     loss_train, loss_dev,
                                     np.exp(loss_train), np.exp(loss_dev),
                                     optimizer.lr, ys_train.shape[0],
                                     duration_step / 60))
                    start_time_step = time.time()
                pbar_epoch.update(1)

        # Save checkpoint and evaluate model per epoch
        duration_epoch = time.time() - start_time_epoch
        if hvd_rank == 0:
            logger.info('========== EPOCH:%d (%.2f min) ==========' %
                        (optimizer.n_epochs + 1, duration_epoch / 60))

        if optimizer.n_epochs + 1 < args.eval_start_epoch:
            # The epoch counter must advance on EVERY worker; otherwise the
            # ranks disagree on n_epochs and leave the loop (or convert to
            # SGD) at different iterations.  Only the save is rank-0 only.
            optimizer.epoch()
            # Save the model
            if hvd_rank == 0:
                save_checkpoint(model, save_path, optimizer,
                                optimizer.n_epochs,
                                remove_old_checkpoints=args.lm_type != 'transformer')
        else:
            start_time_eval = time.time()
            # dev
            model.eval()
            ppl_dev, _ = eval_ppl_parallel([model], eval_loader,
                                           optimizer.n_epochs,
                                           batch_size=args.batch_size)
            ppl_dev = hvd.allreduce(
                np2tensor(np.array([ppl_dev], dtype=float), hvd.local_rank()))
            if hvd_rank == 0:
                logger.info('PPL : %.2f' % ppl_dev)
            optimizer.epoch(ppl_dev)

            if optimizer.is_best and hvd.rank() == 0:
                # Save the model
                save_checkpoint(model, save_path, optimizer,
                                optimizer.n_epochs,
                                remove_old_checkpoints=args.lm_type != 'transformer')

            duration_eval = time.time() - start_time_eval
            if hvd_rank == 0:
                logger.info('Evaluation time: %.2f min' % (duration_eval / 60))

            # Early stopping
            if optimizer.is_early_stop:
                break

            # Convert to fine-tuning stage
            if optimizer.n_epochs == args.convert_to_sgd_epoch:
                n_epochs = optimizer.n_epochs
                n_steps = optimizer.n_steps
                optimizer = set_optimizer(model, 'sgd', args.lr,
                                          args.weight_decay)
                optimizer = hvd.DistributedOptimizer(
                    optimizer, named_parameters=model.named_parameters())
                hvd.broadcast_parameters(model.state_dict(), root_rank=0)
                hvd.broadcast_optimizer_state(optimizer, root_rank=0)
                optimizer = LRScheduler(optimizer, args.lr,
                                        decay_type='always',
                                        decay_start_epoch=0,
                                        decay_rate=0.5)
                optimizer._epoch = n_epochs
                optimizer._step = n_steps
                if hvd_rank == 0:
                    logger.info('========== Convert to SGD ==========')

        if optimizer.n_epochs == args.n_epochs:
            break

        start_time_step = time.time()
        start_time_epoch = time.time()

    duration_train = time.time() - start_time_train
    if hvd_rank == 0:
        logger.info('Total time: %.2f hour' % (duration_train / 3600))

    reporter.tf_writer.close()

    return save_path
def main():
    """Decode every set in ``args.recog_sets`` and plot the attention weights.

    The training configuration stored next to the first checkpoint in
    ``args.recog_model`` overrides every non-``recog`` option.  The ASR model
    (plus optional ensemble members and language models for shallow fusion)
    is built once, for the first set.  For each utterance an attention plot
    is written under ``<recog_dir>/att_weights`` and, when
    ``args.recog_n_caches > 0`` and the decoder returns cache histories, a
    cache plot under ``<recog_dir>/cache``.  Reference and hypothesis are
    written to the log.
    """
    args = parse()

    # Load a conf file
    dir_name = os.path.dirname(args.recog_model[0])
    conf = load_config(os.path.join(dir_name, 'conf.yml'))

    # Overwrite conf
    # (options whose name contains 'recog' keep their command-line value)
    for k, v in conf.items():
        if 'recog' not in k:
            setattr(args, k, v)
    recog_params = vars(args)

    # Setting for logging
    if os.path.isfile(os.path.join(args.recog_dir, 'plot.log')):
        os.remove(os.path.join(args.recog_dir, 'plot.log'))
    logger = set_logger(os.path.join(args.recog_dir, 'plot.log'),
                        key='decoding', stdout=args.recog_stdout)

    for i, s in enumerate(args.recog_sets):
        # Load dataset
        dataset = Dataset(corpus=args.corpus,
                          tsv_path=s,
                          dict_path=os.path.join(dir_name, 'dict.txt'),
                          dict_path_sub1=os.path.join(dir_name, 'dict_sub1.txt') if os.path.isfile(
                              os.path.join(dir_name, 'dict_sub1.txt')) else False,
                          nlsyms=args.nlsyms,
                          wp_model=os.path.join(dir_name, 'wp.model'),
                          unit=args.unit,
                          unit_sub1=args.unit_sub1,
                          batch_size=args.recog_batch_size,
                          is_test=True)

        # Models are built only once and reused for the remaining sets
        if i == 0:
            # Load the ASR model
            model = Speech2Text(args, dir_name)
            model = load_checkpoint(model, args.recog_model[0])[0]
            # The epoch is the suffix after the last '-' of the checkpoint path
            epoch = int(args.recog_model[0].split('-')[-1])

            # ensemble (different models)
            ensemble_models = [model]
            if len(args.recog_model) > 1:
                for recog_model_e in args.recog_model[1:]:
                    # Each member is configured from its own conf.yml
                    conf_e = load_config(os.path.join(os.path.dirname(recog_model_e), 'conf.yml'))
                    args_e = copy.deepcopy(args)
                    for k, v in conf_e.items():
                        if 'recog' not in k:
                            setattr(args_e, k, v)
                    model_e = Speech2Text(args_e)
                    model_e = load_checkpoint(model_e, recog_model_e)[0]
                    model_e.cuda()
                    ensemble_models += [model_e]

            # Load the LM for shallow fusion
            if not args.lm_fusion:
                if args.recog_lm is not None and args.recog_lm_weight > 0:
                    conf_lm = load_config(os.path.join(os.path.dirname(args.recog_lm), 'conf.yml'))
                    args_lm = argparse.Namespace()
                    for k, v in conf_lm.items():
                        setattr(args_lm, k, v)
                    lm = build_lm(args_lm)
                    lm = load_checkpoint(lm, args.recog_lm)[0]
                    # Attach the LM according to the direction it was trained in
                    if args_lm.backward:
                        model.lm_bwd = lm
                    else:
                        model.lm_fwd = lm

                if args.recog_lm_bwd is not None and args.recog_lm_weight > 0 \
                        and (args.recog_fwd_bwd_attention or args.recog_reverse_lm_rescoring):
                    conf_lm = load_config(os.path.join(os.path.dirname(args.recog_lm_bwd), 'conf.yml'))
                    args_lm_bwd = argparse.Namespace()
                    for k, v in conf_lm.items():
                        setattr(args_lm_bwd, k, v)
                    lm_bwd = build_lm(args_lm_bwd)
                    lm_bwd = load_checkpoint(lm_bwd, args.recog_lm_bwd)[0]
                    model.lm_bwd = lm_bwd

            # Fall back to the training unit when no decoding unit is given
            if not args.recog_unit:
                args.recog_unit = args.unit

            logger.info('recog unit: %s' % args.recog_unit)
            logger.info('recog oracle: %s' % args.recog_oracle)
            logger.info('epoch: %d' % (epoch - 1))
            logger.info('batch size: %d' % args.recog_batch_size)
            logger.info('beam width: %d' % args.recog_beam_width)
            logger.info('min length ratio: %.3f' % args.recog_min_len_ratio)
            logger.info('max length ratio: %.3f' % args.recog_max_len_ratio)
            logger.info('length penalty: %.3f' % args.recog_length_penalty)
            logger.info('coverage penalty: %.3f' % args.recog_coverage_penalty)
            logger.info('coverage threshold: %.3f' % args.recog_coverage_threshold)
            logger.info('CTC weight: %.3f' % args.recog_ctc_weight)
            logger.info('LM path: %s' % args.recog_lm)
            logger.info('LM path (bwd): %s' % args.recog_lm_bwd)
            logger.info('LM weight: %.3f' % args.recog_lm_weight)
            logger.info('GNMT: %s' % args.recog_gnmt_decoding)
            logger.info('forward-backward attention: %s' % args.recog_fwd_bwd_attention)
            logger.info('reverse LM rescoring: %s' % args.recog_reverse_lm_rescoring)
            logger.info('resolving UNK: %s' % args.recog_resolving_unk)
            logger.info('ensemble: %d' % (len(ensemble_models)))
            logger.info('ASR decoder state carry over: %s' % (args.recog_asr_state_carry_over))
            logger.info('LM state carry over: %s' % (args.recog_lm_state_carry_over))
            logger.info('cache size: %d' % (args.recog_n_caches))
            logger.info('cache type: %s' % (args.recog_cache_type))
            logger.info('cache theta (speech): %.3f' % (args.recog_cache_theta_speech))
            logger.info('cache lambda (speech): %.3f' % (args.recog_cache_lambda_speech))
            logger.info('cache theta (lm): %.3f' % (args.recog_cache_theta_lm))
            logger.info('cache lambda (lm): %.3f' % (args.recog_cache_lambda_lm))

            # GPU setting
            model.cuda()

        save_path = mkdir_join(args.recog_dir, 'att_weights')
        if args.recog_n_caches > 0:
            save_path_cache = mkdir_join(args.recog_dir, 'cache')

        # Clean directory
        # NOTE(review): this runs for every set, so plots written for an
        # earlier set are removed here -- confirm this is intended.
        if save_path is not None and os.path.isdir(save_path):
            shutil.rmtree(save_path)
            os.mkdir(save_path)
            if args.recog_n_caches > 0:
                shutil.rmtree(save_path_cache)
                os.mkdir(save_path_cache)

        while True:
            batch, is_new_epoch = dataset.next(recog_params['recog_batch_size'])
            best_hyps_id, aws, (cache_attn_hist, cache_id_hist) = model.decode(
                batch['xs'], recog_params, dataset.idx2token[0],
                exclude_eos=False,
                refs_id=batch['ys'],
                ensemble_models=ensemble_models[1:] if len(ensemble_models) > 1 else [],
                speakers=batch['sessions'] if dataset.corpus == 'swbd' else batch['speakers'])

            if model.bwd_weight > 0.5:
                # Reverse the order
                best_hyps_id = [hyp[::-1] for hyp in best_hyps_id]
                aws = [aw[::-1] for aw in aws]

            for b in range(len(batch['xs'])):
                tokens = dataset.idx2token[0](best_hyps_id[b], return_list=True)
                spk = batch['speakers'][b]

                # One figure per utterance, grouped by speaker
                plot_attention_weights(
                    aws[b][:len(tokens)], tokens,
                    spectrogram=batch['xs'][b][:, :dataset.input_dim] if args.input_type == 'speech' else None,
                    save_path=mkdir_join(save_path, spk, batch['utt_ids'][b] + '.png'),
                    figsize=(20, 8))

                if args.recog_n_caches > 0 and cache_id_hist is not None and cache_attn_hist is not None:
                    n_keys, n_queries = cache_attn_hist[0].shape
                    # mask = np.ones((n_keys, n_queries))
                    # for i in range(n_queries):
                    #     mask[:n_keys - i, -(i + 1)] = 0
                    mask = np.zeros((n_keys, n_queries))

                    plot_cache_weights(
                        cache_attn_hist[0],
                        keys=dataset.idx2token[0](cache_id_hist[-1], return_list=True),  # fifo
                        # keys=dataset.idx2token[0](cache_id_hist, return_list=True),  # dict
                        queries=tokens,
                        save_path=mkdir_join(save_path_cache, spk, batch['utt_ids'][b] + '.png'),
                        figsize=(40, 16),
                        mask=mask)

                # The token list is reversed once more for logging when the
                # backward direction dominates.
                if model.bwd_weight > 0.5:
                    hyp = ' '.join(tokens[::-1])
                else:
                    hyp = ' '.join(tokens)
                logger.info('utt-id: %s' % batch['utt_ids'][b])
                logger.info('Ref: %s' % batch['text'][b].lower())
                logger.info('Hyp: %s' % hyp)
                logger.info('-' * 50)

            # dataset.next() flags the last batch of the set
            if is_new_epoch:
                break
def main():
    """Train a language model and return the directory it was saved to.

    The function parses the command-line arguments, optionally restores the
    configuration and checkpoint given by ``args.resume``, builds the
    datasets, the model and the scheduler-wrapped optimizer, and then runs a
    step-based training loop.  The dev loss is reported every
    ``args.print_step`` steps; at the end of every epoch the model is
    checkpointed and, from ``args.eval_start_epoch`` on, evaluated first.

    Returns:
        ``save_path``, the directory holding checkpoints and logs.

    """
    args = parse_args_train(sys.argv[1:])

    # Load a conf file
    if args.resume:
        conf = load_config(
            os.path.join(os.path.dirname(args.resume), 'conf.yml'))
        for k, v in conf.items():
            # Keep the resume path given on the command line
            if k != 'resume':
                setattr(args, k, v)

    # Load dataset
    batch_size = args.batch_size * args.n_gpus if args.n_gpus >= 1 else args.batch_size
    train_set = Dataset(corpus=args.corpus,
                        tsv_path=args.train_set,
                        dict_path=args.dict,
                        nlsyms=args.nlsyms,
                        unit=args.unit,
                        wp_model=args.wp_model,
                        batch_size=batch_size,
                        n_epochs=args.n_epochs,
                        min_n_tokens=args.min_n_tokens,
                        bptt=args.bptt,
                        shuffle=args.shuffle,
                        backward=args.backward,
                        serialize=args.serialize)
    dev_set = Dataset(corpus=args.corpus,
                      tsv_path=args.dev_set,
                      dict_path=args.dict,
                      nlsyms=args.nlsyms,
                      unit=args.unit,
                      wp_model=args.wp_model,
                      batch_size=batch_size,
                      bptt=args.bptt,
                      backward=args.backward,
                      serialize=args.serialize)
    eval_sets = [
        Dataset(corpus=args.corpus,
                tsv_path=s,
                dict_path=args.dict,
                nlsyms=args.nlsyms,
                unit=args.unit,
                wp_model=args.wp_model,
                batch_size=1,
                bptt=args.bptt,
                backward=args.backward,
                serialize=args.serialize)
        for s in args.eval_sets
    ]

    args.vocab = train_set.vocab

    # Set save path
    if args.resume:
        save_path = os.path.dirname(args.resume)
        dir_name = os.path.basename(save_path)
    else:
        dir_name = set_lm_name(args)
        save_path = mkdir_join(
            args.model_save_dir,
            '_'.join(os.path.basename(args.train_set).split('.')[:-1]),
            dir_name)
        save_path = set_save_path(save_path)  # avoid overwriting

    # Set logger
    set_logger(os.path.join(save_path, 'train.log'), stdout=args.stdout)

    # Model setting
    model = build_lm(args, save_path)

    if not args.resume:
        # Save the conf file as a yaml file
        save_config(vars(args), os.path.join(save_path, 'conf.yml'))

        # Save the nlsyms, dictionary, and wp_model
        if args.nlsyms:
            shutil.copy(args.nlsyms, os.path.join(save_path, 'nlsyms.txt'))
        shutil.copy(args.dict, os.path.join(save_path, 'dict.txt'))
        if args.unit == 'wp':
            shutil.copy(args.wp_model, os.path.join(save_path, 'wp.model'))

        for k, v in sorted(vars(args).items(), key=lambda x: x[0]):
            logger.info('%s: %s' % (k, str(v)))

    # Count total parameters
    for n in sorted(list(model.num_params_dict.keys())):
        n_params = model.num_params_dict[n]
        logger.info("%s %d" % (n, n_params))
    logger.info("Total %.2f M parameters" % (model.total_parameters / 1000000))
    logger.info(model)

    # Set optimizer
    # The resumed epoch is the suffix after the last '-' of the checkpoint
    # path.  It has to be stored in `resume_epoch` because the SGD-conversion
    # check after loading the checkpoint compares against that variable.
    resume_epoch = int(args.resume.split('-')[-1]) if args.resume else 0
    optimizer = set_optimizer(
        model,
        'sgd' if resume_epoch > args.convert_to_sgd_epoch else args.optimizer,
        args.lr, args.weight_decay)

    # Wrap optimizer by learning rate scheduler
    is_transformer = args.lm_type in ['transformer', 'transformer_xl']
    optimizer = LRScheduler(
        optimizer, args.lr,
        decay_type=args.lr_decay_type,
        decay_start_epoch=args.lr_decay_start_epoch,
        decay_rate=args.lr_decay_rate,
        decay_patient_n_epochs=args.lr_decay_patient_n_epochs,
        early_stop_patient_n_epochs=args.early_stop_patient_n_epochs,
        warmup_start_lr=args.warmup_start_lr,
        warmup_n_steps=args.warmup_n_steps,
        model_size=getattr(args, 'transformer_d_model', 0),
        factor=args.lr_factor,
        noam=is_transformer,
        save_checkpoints_topk=1)

    if args.resume:
        # Restore the last saved model
        load_checkpoint(args.resume, model, optimizer)

        # Resume between convert_to_sgd_epoch -1 and convert_to_sgd_epoch
        if resume_epoch == args.convert_to_sgd_epoch:
            optimizer.convert_to_sgd(model, args.lr, args.weight_decay,
                                     decay_type='always', decay_rate=0.5)

    # GPU setting
    use_apex = args.train_dtype in ["O0", "O1", "O2", "O3"]
    amp = None
    if args.n_gpus >= 1:
        model.cudnn_setting(
            deterministic=not (is_transformer or args.cudnn_benchmark),
            benchmark=args.cudnn_benchmark)
        model.cuda()

        # Mix precision training setting
        if use_apex:
            from apex import amp
            model, optimizer.optimizer = amp.initialize(
                model, optimizer.optimizer, opt_level=args.train_dtype)
            amp.init()
            if args.resume:
                load_checkpoint(args.resume, amp=amp)
        model = CustomDataParallel(model,
                                   device_ids=list(range(0, args.n_gpus)))
    else:
        model = CPUWrapperLM(model)

    # Set process name
    logger.info('PID: %s' % os.getpid())
    logger.info('USERNAME: %s' % os.uname()[1])
    logger.info('#GPU: %d' % torch.cuda.device_count())
    setproctitle(args.job_name if args.job_name else dir_name)

    # Set reporter
    reporter = Reporter(save_path)

    hidden = None
    start_time_train = time.time()
    start_time_epoch = time.time()
    start_time_step = time.time()
    pbar_epoch = tqdm(total=len(train_set))
    accum_n_steps = 0
    n_steps = optimizer.n_steps * args.accum_grad_n_steps
    while True:
        # Compute loss in the training set
        ys_train, is_new_epoch = train_set.next()
        accum_n_steps += 1

        loss, hidden, observation = model(ys_train, hidden)
        reporter.add(observation)
        if use_apex:
            with amp.scale_loss(loss, optimizer.optimizer) as scaled_loss:
                scaled_loss.backward()
        else:
            loss.backward()
        loss.detach()  # Trancate the graph
        # Parameters are updated once enough mini-batches were accumulated
        if args.accum_grad_n_steps == 1 or accum_n_steps >= args.accum_grad_n_steps:
            if args.clip_grad_norm > 0:
                total_norm = torch.nn.utils.clip_grad_norm_(
                    model.module.parameters(), args.clip_grad_norm)
                reporter.add_tensorboard_scalar('total_norm', total_norm)
            optimizer.step()
            optimizer.zero_grad()
            accum_n_steps = 0
        loss_train = loss.item()
        del loss
        hidden = model.module.repackage_state(hidden)
        reporter.add_tensorboard_scalar('learning_rate', optimizer.lr)
        # NOTE: loss/acc/ppl are already added in the model
        reporter.step()
        pbar_epoch.update(ys_train.shape[0] * (ys_train.shape[1] - 1))
        n_steps += 1
        # NOTE: n_steps is different from the step counter in Noam Optimizer

        if n_steps % args.print_step == 0:
            # Compute loss in the dev set
            ys_dev = dev_set.next(bptt=args.bptt)[0]
            loss, _, observation = model(ys_dev, None, is_eval=True)
            reporter.add(observation, is_eval=True)
            loss_dev = loss.item()
            del loss
            reporter.step(is_eval=True)

            duration_step = time.time() - start_time_step
            logger.info(
                "step:%d(ep:%.2f) loss:%.3f(%.3f)/lr:%.5f/bs:%d (%.2f min)" %
                (n_steps, optimizer.n_epochs + train_set.epoch_detail,
                 loss_train, loss_dev,
                 optimizer.lr, ys_train.shape[0], duration_step / 60))
            start_time_step = time.time()

        # Save fugures of loss and accuracy
        if n_steps % (args.print_step * 10) == 0:
            reporter.snapshot()
            model.module.plot_attention()

        # Save checkpoint and evaluate model per epoch
        if is_new_epoch:
            duration_epoch = time.time() - start_time_epoch
            logger.info('========== EPOCH:%d (%.2f min) ==========' %
                        (optimizer.n_epochs + 1, duration_epoch / 60))

            if optimizer.n_epochs + 1 < args.eval_start_epoch:
                optimizer.epoch()  # lr decay
                reporter.epoch()  # plot

                # Save the model
                optimizer.save_checkpoint(model, save_path,
                                          remove_old=not is_transformer,
                                          amp=amp)
            else:
                start_time_eval = time.time()
                # dev
                model.module.reset_length(args.bptt)
                ppl_dev, _ = eval_ppl([model.module], dev_set,
                                      batch_size=1, bptt=args.bptt)
                model.module.reset_length(args.bptt)
                optimizer.epoch(ppl_dev)  # lr decay
                reporter.epoch(ppl_dev, name='perplexity')  # plot
                logger.info('PPL (%s, ep:%d): %.2f' %
                            (dev_set.set, optimizer.n_epochs, ppl_dev))

                if optimizer.is_topk or is_transformer:
                    # Save the model
                    optimizer.save_checkpoint(model, save_path,
                                              remove_old=not is_transformer,
                                              amp=amp)

                    # test
                    ppl_test_avg = 0.
                    for eval_set in eval_sets:
                        model.module.reset_length(args.bptt)
                        ppl_test, _ = eval_ppl([model.module], eval_set,
                                               batch_size=1, bptt=args.bptt)
                        model.module.reset_length(args.bptt)
                        logger.info(
                            'PPL (%s, ep:%d): %.2f' %
                            (eval_set.set, optimizer.n_epochs, ppl_test))
                        ppl_test_avg += ppl_test
                    if len(eval_sets) > 0:
                        logger.info('PPL (avg., ep:%d): %.2f' %
                                    (optimizer.n_epochs,
                                     ppl_test_avg / len(eval_sets)))

                duration_eval = time.time() - start_time_eval
                logger.info('Evaluation time: %.2f min' % (duration_eval / 60))

                # Early stopping
                if optimizer.is_early_stop:
                    break

                # Convert to fine-tuning stage
                if optimizer.n_epochs == args.convert_to_sgd_epoch:
                    optimizer.convert_to_sgd(model, args.lr, args.weight_decay,
                                             decay_type='always',
                                             decay_rate=0.5)

            # Fresh progress bar for the next epoch
            pbar_epoch = tqdm(total=len(train_set))

            if optimizer.n_epochs >= args.n_epochs:
                break

            start_time_step = time.time()
            start_time_epoch = time.time()

    duration_train = time.time() - start_time_train
    logger.info('Total time: %.2f hour' % (duration_train / 3600))

    reporter.tf_writer.close()
    pbar_epoch.close()

    return save_path
def main():
    """Evaluate a trained language model's perplexity on each recognition set.

    Reads the training-time ``conf.yml`` stored next to the first checkpoint in
    ``args.recog_model``, copies every non-``recog`` key onto ``args``, builds
    and loads the LM once, then reports the perplexity of every set in
    ``args.recog_sets`` followed by their average.
    """
    args = parse()

    # Load a conf file
    dir_name = os.path.dirname(args.recog_model[0])
    conf = load_config(os.path.join(dir_name, 'conf.yml'))

    # Overwrite conf (decoding-time options, whose names contain 'recog',
    # keep the values given on the command line)
    for k, v in conf.items():
        if 'recog' not in k:
            setattr(args, k, v)

    # Setting for logging: start from a fresh decode.log
    log_path = os.path.join(args.recog_dir, 'decode.log')
    if os.path.isfile(log_path):
        os.remove(log_path)
    set_logger(log_path, stdout=args.recog_stdout)

    ppl_avg = 0
    for i, s in enumerate(args.recog_sets):
        # Load dataset
        dataset = Dataset(corpus=args.corpus,
                          tsv_path=s,
                          dict_path=os.path.join(dir_name, 'dict.txt'),
                          wp_model=os.path.join(dir_name, 'wp.model'),
                          unit=args.unit,
                          batch_size=args.recog_batch_size,
                          bptt=args.bptt,
                          backward=args.backward,
                          serialize=args.serialize,
                          is_test=True)

        if i == 0:
            # Load the LM (only once; it is reused for the remaining sets)
            model = build_lm(args)
            load_checkpoint(model, args.recog_model[0])
            # The epoch is taken from the suffix after the last '-' in the
            # checkpoint path
            epoch = int(args.recog_model[0].split('-')[-1])

            # Model averaging for Transformer
            if conf['lm_type'] == 'transformer':
                model = average_checkpoints(model, args.recog_model[0], epoch,
                                            n_average=args.recog_n_average)

            logger.info('epoch: %d' % epoch)
            logger.info('batch size: %d' % args.recog_batch_size)
            # logger.info('recog unit: %s' % args.recog_unit)
            # logger.info('ensemble: %d' % (len(ensemble_models)))
            logger.info('BPTT: %d' % (args.bptt))
            logger.info('cache size: %d' % (args.recog_n_caches))
            logger.info('cache theta: %.3f' % (args.recog_cache_theta))
            logger.info('cache lambda: %.3f' % (args.recog_cache_lambda))
            logger.info('model average (Transformer): %d' % (args.recog_n_average))
            model.cache_theta = args.recog_cache_theta
            model.cache_lambda = args.recog_cache_lambda

            # GPU setting
            model.cuda()

        start_time = time.time()

        # TODO(hirofumi): ensemble
        ppl, _ = eval_ppl([model], dataset, batch_size=1, bptt=args.bptt,
                          n_caches=args.recog_n_caches, progressbar=True)
        ppl_avg += ppl
        print('PPL (%s): %.2f' % (dataset.set, ppl))
        logger.info('Elasped time: %.2f [sec]:' % (time.time() - start_time))

    # Guard the average: with no evaluation set the division below would raise
    # ZeroDivisionError after all the setup work has already been done.
    if args.recog_sets:
        logger.info('PPL (avg.): %.2f\n' % (ppl_avg / len(args.recog_sets)))
    else:
        logger.info('No evaluation set was given; skip the average PPL.')
def main():
    """Train an ASR model (``Speech2Text`` or ``SkipThought``) end to end.

    The function parses the command line, optionally restores the settings and
    state of a previous run (``args.resume``), builds the train/dev/eval
    datasets, the model, the optimizer wrapped in an ``LRScheduler`` and the
    optional teacher models, and then runs the training loop until early
    stopping triggers or ``args.n_epochs`` is reached.

    Returns:
        The directory in which checkpoints, configs and logs were saved.
    """
    args = parse()
    # Independent copies: the conf of the initialization / teacher model is
    # written onto them later without touching the main ``args``.
    args_init = copy.deepcopy(args)
    args_teacher = copy.deepcopy(args)

    # Load a conf file
    if args.resume:
        conf = load_config(os.path.join(os.path.dirname(args.resume), 'conf.yml'))
        for k, v in conf.items():
            if k != 'resume':
                setattr(args, k, v)
    # NOTE: vars() returns the live attribute dict of ``args``, so later
    # assignments to ``args`` are visible through ``recog_params`` as well.
    recog_params = vars(args)

    # Compute subsampling factor
    subsample_factor = 1
    subsample_factor_sub1 = 1
    subsample_factor_sub2 = 1
    subsample = [int(s) for s in args.subsample.split('_')]
    if args.conv_poolings and 'conv' in args.enc_type:
        for p in args.conv_poolings.split('_'):
            subsample_factor *= int(p.split(',')[0].replace('(', ''))
    else:
        subsample_factor = np.prod(subsample)
    if args.train_set_sub1:
        if args.conv_poolings and 'conv' in args.enc_type:
            subsample_factor_sub1 = subsample_factor * np.prod(
                subsample[:args.enc_n_layers_sub1 - 1])
        else:
            subsample_factor_sub1 = subsample_factor
    if args.train_set_sub2:
        if args.conv_poolings and 'conv' in args.enc_type:
            subsample_factor_sub2 = subsample_factor * np.prod(
                subsample[:args.enc_n_layers_sub2 - 1])
        else:
            subsample_factor_sub2 = subsample_factor

    # Set save path
    if args.resume:
        save_path = os.path.dirname(args.resume)
        dir_name = os.path.basename(save_path)
    else:
        dir_name = set_asr_model_name(args, subsample_factor)
        save_path = mkdir_join(
            args.model_save_dir,
            '_'.join(os.path.basename(args.train_set).split('.')[:-1]),
            dir_name)
        save_path = set_save_path(save_path)  # avoid overwriting

    # Set logger
    logger = set_logger(os.path.join(save_path, 'train.log'),
                        key='training', stdout=args.stdout)

    # for multi-GPUs
    if args.n_gpus > 1:
        logger.info("Batch size is automatically reduced from %d to %d" %
                    (args.batch_size, args.batch_size // 2))
        args.batch_size //= 2

    skip_thought = 'skip' in args.enc_type

    # Load dataset
    train_set = Dataset(corpus=args.corpus,
                        tsv_path=args.train_set,
                        tsv_path_sub1=args.train_set_sub1,
                        tsv_path_sub2=args.train_set_sub2,
                        dict_path=args.dict,
                        dict_path_sub1=args.dict_sub1,
                        dict_path_sub2=args.dict_sub2,
                        nlsyms=args.nlsyms,
                        unit=args.unit,
                        unit_sub1=args.unit_sub1,
                        unit_sub2=args.unit_sub2,
                        wp_model=args.wp_model,
                        wp_model_sub1=args.wp_model_sub1,
                        wp_model_sub2=args.wp_model_sub2,
                        batch_size=args.batch_size * args.n_gpus,
                        n_epochs=args.n_epochs,
                        min_n_frames=args.min_n_frames,
                        max_n_frames=args.max_n_frames,
                        sort_by='input',
                        short2long=True,
                        sort_stop_epoch=args.sort_stop_epoch,
                        dynamic_batching=args.dynamic_batching,
                        ctc=args.ctc_weight > 0,
                        ctc_sub1=args.ctc_weight_sub1 > 0,
                        ctc_sub2=args.ctc_weight_sub2 > 0,
                        subsample_factor=subsample_factor,
                        subsample_factor_sub1=subsample_factor_sub1,
                        subsample_factor_sub2=subsample_factor_sub2,
                        discourse_aware=args.discourse_aware,
                        skip_thought=skip_thought)
    dev_set = Dataset(corpus=args.corpus,
                      tsv_path=args.dev_set,
                      tsv_path_sub1=args.dev_set_sub1,
                      tsv_path_sub2=args.dev_set_sub2,
                      dict_path=args.dict,
                      dict_path_sub1=args.dict_sub1,
                      dict_path_sub2=args.dict_sub2,
                      nlsyms=args.nlsyms,
                      unit=args.unit,
                      unit_sub1=args.unit_sub1,
                      unit_sub2=args.unit_sub2,
                      wp_model=args.wp_model,
                      wp_model_sub1=args.wp_model_sub1,
                      wp_model_sub2=args.wp_model_sub2,
                      batch_size=args.batch_size * args.n_gpus,
                      min_n_frames=args.min_n_frames,
                      max_n_frames=args.max_n_frames,
                      ctc=args.ctc_weight > 0,
                      ctc_sub1=args.ctc_weight_sub1 > 0,
                      ctc_sub2=args.ctc_weight_sub2 > 0,
                      subsample_factor=subsample_factor,
                      subsample_factor_sub1=subsample_factor_sub1,
                      subsample_factor_sub2=subsample_factor_sub2,
                      discourse_aware=args.discourse_aware,
                      skip_thought=skip_thought)
    eval_sets = [
        Dataset(corpus=args.corpus,
                tsv_path=s,
                dict_path=args.dict,
                nlsyms=args.nlsyms,
                unit=args.unit,
                wp_model=args.wp_model,
                batch_size=1,
                discourse_aware=args.discourse_aware,
                skip_thought=skip_thought,
                is_test=True)
        for s in args.eval_sets
    ]

    args.vocab = train_set.vocab
    args.vocab_sub1 = train_set.vocab_sub1
    args.vocab_sub2 = train_set.vocab_sub2
    args.input_dim = train_set.input_dim

    # Load a LM conf file for LM fusion & LM initialization
    if not args.resume and (args.lm_fusion or args.lm_init):
        if args.lm_fusion:
            lm_conf = load_config(
                os.path.join(os.path.dirname(args.lm_fusion), 'conf.yml'))
        elif args.lm_init:
            lm_conf = load_config(
                os.path.join(os.path.dirname(args.lm_init), 'conf.yml'))
        args.lm_conf = argparse.Namespace()
        for k, v in lm_conf.items():
            setattr(args.lm_conf, k, v)
        # The LM must share the output unit and vocabulary with the ASR model
        assert args.unit == args.lm_conf.unit
        assert args.vocab == args.lm_conf.vocab

    # Model setting
    model = Speech2Text(args, save_path) if not skip_thought else SkipThought(
        args, save_path)

    if args.resume:
        # Set optimizer
        epoch = int(args.resume.split('-')[-1])
        optimizer = set_optimizer(
            model,
            'sgd' if epoch > conf['convert_to_sgd_epoch'] else conf['optimizer'],
            conf['lr'], conf['weight_decay'])

        # Wrap optimizer by learning rate scheduler
        noam = 'transformer' in conf['enc_type'] or conf[
            'dec_type'] == 'transformer'
        optimizer = LRScheduler(
            optimizer, conf['lr'],
            decay_type=conf['lr_decay_type'],
            decay_start_epoch=conf['lr_decay_start_epoch'],
            decay_rate=conf['lr_decay_rate'],
            decay_patient_n_epochs=conf['lr_decay_patient_n_epochs'],
            early_stop_patient_n_epochs=conf['early_stop_patient_n_epochs'],
            warmup_start_lr=conf['warmup_start_lr'],
            warmup_n_steps=conf['warmup_n_steps'],
            model_size=conf['d_model'],
            factor=conf['lr_factor'],
            noam=noam)

        # Restore the last saved model
        model, optimizer = load_checkpoint(model, args.resume, optimizer,
                                           resume=True)

        # Resume between convert_to_sgd_epoch -1 and convert_to_sgd_epoch
        if epoch == conf['convert_to_sgd_epoch']:
            optimizer.convert_to_sgd(model, 'sgd', args.lr,
                                     conf['weight_decay'],
                                     decay_type='always', decay_rate=0.5)
    else:
        # Save the conf file as a yaml file
        save_config(vars(args), os.path.join(save_path, 'conf.yml'))
        if args.lm_fusion:
            save_config(args.lm_conf, os.path.join(save_path, 'conf_lm.yml'))

        # Save the nlsyms, dictionary, and wp_model
        if args.nlsyms:
            shutil.copy(args.nlsyms, os.path.join(save_path, 'nlsyms.txt'))
        for sub in ['', '_sub1', '_sub2']:
            if getattr(args, 'dict' + sub):
                shutil.copy(getattr(args, 'dict' + sub),
                            os.path.join(save_path, 'dict' + sub + '.txt'))
            if getattr(args, 'unit' + sub) == 'wp':
                shutil.copy(getattr(args, 'wp_model' + sub),
                            os.path.join(save_path, 'wp' + sub + '.model'))

        for k, v in sorted(vars(args).items(), key=lambda x: x[0]):
            logger.info('%s: %s' % (k, str(v)))

        # Count total parameters
        for n in sorted(list(model.num_params_dict.keys())):
            n_params = model.num_params_dict[n]
            logger.info("%s %d" % (n, n_params))
        logger.info("Total %.2f M parameters" %
                    (model.total_parameters / 1000000))
        logger.info(model)

        # Initialize with pre-trained model's parameters
        if args.asr_init and os.path.isfile(args.asr_init):
            # Load the ASR model
            conf_init = load_config(
                os.path.join(os.path.dirname(args.asr_init), 'conf.yml'))
            for k, v in conf_init.items():
                setattr(args_init, k, v)
            model_init = Speech2Text(args_init)
            model_init = load_checkpoint(model_init, args.asr_init)[0]

            # Overwrite parameters
            # Copy only parameters whose name contains 'enc' when the depth or
            # output unit differs, or when the source model is CTC-only.
            only_enc = (args.enc_n_layers != args_init.enc_n_layers) or (
                args.unit != args_init.unit) or args_init.ctc_weight == 1
            param_dict = dict(model_init.named_parameters())
            for n, p in model.named_parameters():
                if n in param_dict.keys() and p.size() == param_dict[n].size():
                    if only_enc and 'enc' not in n:
                        continue
                    if args.lm_fusion_type == 'cache' and 'output' in n:
                        continue
                    p.data = param_dict[n].data
                    logger.info('Overwrite %s' % n)

        # Set optimizer
        optimizer = set_optimizer(model, args.optimizer, args.lr,
                                  args.weight_decay)

        # Wrap optimizer by learning rate scheduler
        noam = 'transformer' in args.enc_type or args.dec_type == 'transformer'
        optimizer = LRScheduler(
            optimizer, args.lr,
            decay_type=args.lr_decay_type,
            decay_start_epoch=args.lr_decay_start_epoch,
            decay_rate=args.lr_decay_rate,
            decay_patient_n_epochs=args.lr_decay_patient_n_epochs,
            early_stop_patient_n_epochs=args.early_stop_patient_n_epochs,
            warmup_start_lr=args.warmup_start_lr,
            warmup_n_steps=args.warmup_n_steps,
            model_size=args.d_model,
            factor=args.lr_factor,
            noam=noam)

    # Load the teacher ASR model
    teacher = None
    if args.teacher and os.path.isfile(args.teacher):
        conf_teacher = load_config(
            os.path.join(os.path.dirname(args.teacher), 'conf.yml'))
        for k, v in conf_teacher.items():
            setattr(args_teacher, k, v)
        # Setting for knowledge distillation
        args_teacher.ss_prob = 0
        args.lsm_prob = 0
        teacher = Speech2Text(args_teacher)
        teacher = load_checkpoint(teacher, args.teacher)[0]

    # Load the teacher LM
    teacher_lm = None
    if args.teacher_lm and os.path.isfile(args.teacher_lm):
        conf_lm = load_config(
            os.path.join(os.path.dirname(args.teacher_lm), 'conf.yml'))
        args_lm = argparse.Namespace()
        for k, v in conf_lm.items():
            setattr(args_lm, k, v)
        teacher_lm = build_lm(args_lm)
        teacher_lm = load_checkpoint(teacher_lm, args.teacher_lm)[0]

    # GPU setting
    if args.n_gpus >= 1:
        torch.backends.cudnn.benchmark = True
        model = CustomDataParallel(model,
                                   device_ids=list(range(0, args.n_gpus)))
        model.cuda()
        if teacher is not None:
            teacher.cuda()
        if teacher_lm is not None:
            teacher_lm.cuda()

    # Set process name
    logger.info('PID: %s' % os.getpid())
    # NOTE(review): os.uname()[1] is the node (host) name, not a user name;
    # the log label is kept as-is for compatibility with existing logs.
    logger.info('USERNAME: %s' % os.uname()[1])
    setproctitle(args.job_name if args.job_name else dir_name)

    # Set reporter
    reporter = Reporter(save_path)

    if args.mtl_per_batch:
        # NOTE: from easier to harder tasks
        tasks = []
        if 1 - args.bwd_weight - args.ctc_weight - args.sub1_weight - args.sub2_weight > 0:
            tasks += ['ys']
        if args.bwd_weight > 0:
            tasks = ['ys.bwd'] + tasks
        if args.ctc_weight > 0:
            tasks = ['ys.ctc'] + tasks
        for sub in ['sub1', 'sub2']:
            if getattr(args, 'train_set_' + sub):
                if getattr(args, sub + '_weight') - getattr(
                        args, 'ctc_weight_' + sub) > 0:
                    tasks = ['ys_' + sub] + tasks
                if getattr(args, 'ctc_weight_' + sub) > 0:
                    tasks = ['ys_' + sub + '.ctc'] + tasks
    else:
        tasks = ['all']

    start_time_train = time.time()
    start_time_epoch = time.time()
    start_time_step = time.time()
    pbar_epoch = tqdm(total=len(train_set))
    accum_n_tokens = 0
    while True:
        # Compute loss in the training set
        batch_train, is_new_epoch = train_set.next()
        accum_n_tokens += sum(len(y) for y in batch_train['ys'])

        # Change mini-batch depending on task
        for task in tasks:
            if skip_thought:
                loss, reporter = model(batch_train['ys'],
                                       ys_prev=batch_train['ys_prev'],
                                       ys_next=batch_train['ys_next'],
                                       reporter=reporter)
            else:
                loss, reporter = model(batch_train, reporter, task,
                                       teacher=teacher, teacher_lm=teacher_lm)
            loss.backward()
            # NOTE(review): the result of detach() is discarded, so this call
            # has no effect; kept to leave the training step untouched.
            loss.detach()  # Truncate the graph
            # Update only once enough target tokens were accumulated
            # (accum_grad_n_tokens == 0 means update on every batch)
            if args.accum_grad_n_tokens == 0 or accum_n_tokens >= args.accum_grad_n_tokens:
                if args.clip_grad_norm > 0:
                    total_norm = torch.nn.utils.clip_grad_norm_(
                        model.module.parameters(), args.clip_grad_norm)
                    reporter.add_tensorboard_scalar('total_norm', total_norm)
                optimizer.step()
                optimizer.zero_grad()
                accum_n_tokens = 0
            loss_train = loss.item()
            del loss
        reporter.add_tensorboard_scalar('learning_rate', optimizer.lr)
        # NOTE: loss/acc/ppl are already added in the model
        reporter.step()

        if optimizer.n_steps % args.print_step == 0:
            # Compute loss in the dev set
            batch_dev = dev_set.next()[0]
            # Change mini-batch depending on task
            for task in tasks:
                if skip_thought:
                    loss, reporter = model(batch_dev['ys'],
                                           ys_prev=batch_dev['ys_prev'],
                                           ys_next=batch_dev['ys_next'],
                                           reporter=reporter,
                                           is_eval=True)
                else:
                    loss, reporter = model(batch_dev, reporter, task,
                                           is_eval=True)
                loss_dev = loss.item()
                del loss
            # NOTE: this makes training slow
            # Compute WER/CER regardless of the output unit (greedy decoding)
            # best_hyps_id, _, _ = model.module.decode(
            #     batch_dev['xs'], recog_params, dev_set.idx2token[0], exclude_eos=True)
            # cer = 0.
            # ref_n_words, ref_n_chars = 0, 0
            # for b in range(len(batch_dev['xs'])):
            #     ref = batch_dev['text'][b]
            #     hyp = dev_set.idx2token[0](best_hyps_id[b])
            #     cer += editdistance.eval(hyp, ref)
            #     ref_n_words += len(ref.split())
            #     ref_n_chars += len(ref)
            # wer = cer / ref_n_words
            # cer /= ref_n_chars
            # reporter.add_tensorboard_scalar('dev/WER', wer)
            # reporter.add_tensorboard_scalar('dev/CER', cer)
            # logger.info('WER (dev)', wer)
            # logger.info('CER (dev)', cer)
            reporter.step(is_eval=True)

            duration_step = time.time() - start_time_step
            if args.input_type == 'speech':
                xlen = max(len(x) for x in batch_train['xs'])
                ylen = max(len(y) for y in batch_train['ys'])
            elif args.input_type == 'text':
                xlen = max(len(x) for x in batch_train['ys'])
                ylen = max(len(y) for y in batch_train['ys_sub1'])
            logger.info(
                "step:%d(ep:%.2f) loss:%.3f(%.3f)/lr:%.7f/bs:%d/xlen:%d/ylen:%d (%.2f min)"
                % (optimizer.n_steps,
                   optimizer.n_epochs + train_set.epoch_detail,
                   loss_train, loss_dev, optimizer.lr,
                   len(batch_train['utt_ids']), xlen, ylen,
                   duration_step / 60))
            start_time_step = time.time()
        pbar_epoch.update(len(batch_train['utt_ids']))

        # Save figures of loss and accuracy
        if optimizer.n_steps % (args.print_step * 10) == 0:
            reporter.snapshot()
            model.module.plot_attention()

        # Save checkpoint and evaluate model per epoch
        if is_new_epoch:
            duration_epoch = time.time() - start_time_epoch
            logger.info('========== EPOCH:%d (%.2f min) ==========' %
                        (optimizer.n_epochs + 1, duration_epoch / 60))

            if optimizer.n_epochs + 1 < args.eval_start_epoch:
                optimizer.epoch()  # lr decay
                reporter.epoch()  # plot

                # Save the model
                save_checkpoint(model, save_path, optimizer,
                                optimizer.n_epochs,
                                remove_old_checkpoints=not noam)
            else:
                start_time_eval = time.time()
                # dev
                metric_dev = eval_epoch([model.module], dev_set, recog_params,
                                        args, optimizer.n_epochs + 1, logger)
                optimizer.epoch(metric_dev)  # lr decay
                reporter.epoch(metric_dev)  # plot

                if optimizer.is_best:
                    # Save the model
                    save_checkpoint(model, save_path, optimizer,
                                    optimizer.n_epochs,
                                    remove_old_checkpoints=not noam)

                    # test
                    for eval_set in eval_sets:
                        eval_epoch([model.module], eval_set, recog_params,
                                   args, optimizer.n_epochs, logger)

                    # start scheduled sampling
                    if args.ss_prob > 0:
                        model.module.scheduled_sampling_trigger()

                duration_eval = time.time() - start_time_eval
                logger.info('Evaluation time: %.2f min' % (duration_eval / 60))

                # Early stopping
                if optimizer.is_early_stop:
                    break

                # Convert to fine-tuning stage
                if optimizer.n_epochs == args.convert_to_sgd_epoch:
                    optimizer.convert_to_sgd(model, 'sgd', args.lr,
                                             args.weight_decay,
                                             decay_type='always',
                                             decay_rate=0.5)

            # Close the finished bar before opening the next epoch's one so
            # that progress-bar instances do not pile up across epochs.
            pbar_epoch.close()
            pbar_epoch = tqdm(total=len(train_set))

            # ``>=`` (not ``==``) so that training also terminates when the
            # epoch counter is already past the limit, e.g. after resuming.
            if optimizer.n_epochs >= args.n_epochs:
                break

            start_time_step = time.time()
            start_time_epoch = time.time()

    duration_train = time.time() - start_time_train
    logger.info('Total time: %.2f hour' % (duration_train / 3600))

    reporter.tf_writer.close()
    pbar_epoch.close()

    return save_path
def main():
    """Plot the cache attention weights of a language model on each set.

    Loads the LM given by ``args.recog_model[0]``, feeds every recognition set
    through it one token at a time and, each time ``args.recog_n_caches`` new
    tokens have been consumed, saves a figure of the attention over the cache
    as ``<recog_dir>/cache/<fig_count>.png``.

    Raises:
        ValueError: if ``args.recog_n_caches`` is not positive.
    """
    # Load configuration
    args, _, dir_name = parse_args_eval(sys.argv[1:])

    # Validate once, up front, with an explicit error: an ``assert`` would be
    # stripped under ``python -O`` and the plotting below needs a cache.
    if not args.recog_n_caches > 0:
        raise ValueError('recog_n_caches must be positive to plot cache '
                         'weights, got %s' % args.recog_n_caches)

    # Setting for logging: start from a fresh plot.log
    log_path = os.path.join(args.recog_dir, 'plot.log')
    if os.path.isfile(log_path):
        os.remove(log_path)
    set_logger(log_path, stdout=args.recog_stdout)

    # Load the LM
    model = build_lm(args, dir_name)
    load_checkpoint(args.recog_model[0], model)
    # NOTE: model averaging is not helpful for LM

    logger.info('batch size: %d' % args.recog_batch_size)
    logger.info('BPTT: %d' % (args.bptt))
    logger.info('cache size: %d' % (args.recog_n_caches))
    logger.info('cache theta: %.3f' % (args.recog_cache_theta))
    logger.info('cache lambda: %.3f' % (args.recog_cache_lambda))
    model.cache_theta = args.recog_cache_theta
    model.cache_lambda = args.recog_cache_lambda

    # GPU setting
    if args.recog_n_gpus > 0:
        model.cuda()

    for s in args.recog_sets:
        # Load dataset
        dataset = Dataset(corpus=args.corpus,
                          tsv_path=s,
                          batch_size=args.recog_batch_size,
                          bptt=args.bptt,
                          backward=args.backward,
                          serialize=args.serialize,
                          is_test=True)

        save_path = mkdir_join(args.recog_dir, 'cache')

        # Clean directory
        # NOTE(review): the same directory is wiped for every set, so with
        # several recog_sets only the figures of the last one remain.
        if save_path is not None and os.path.isdir(save_path):
            shutil.rmtree(save_path)
            os.mkdir(save_path)

        hidden = None
        fig_count = 0
        token_count = 0
        n_tokens = args.recog_n_caches
        while True:
            ys, is_new_epoch = dataset.next()

            # Feed consecutive (input, target) token pairs one step at a time
            for t in range(ys.shape[1] - 1):
                # Only the hidden state is needed; the loss is not used here
                _, hidden = model(ys[:, t:t + 2], hidden, is_eval=True,
                                  n_caches=args.recog_n_caches)[:2]

                if len(model.cache_attn) > 0:
                    if token_count == n_tokens:
                        tokens_keys = dataset.idx2token[0](
                            model.cache_ids[:args.recog_n_caches],
                            return_list=True)
                        tokens_query = dataset.idx2token[0](
                            model.cache_ids[-n_tokens:], return_list=True)

                        # Slide attention matrix
                        n_keys = len(tokens_keys)
                        n_queries = len(tokens_query)
                        cache_probs = np.zeros(
                            (n_keys, n_queries))  # `[n_keys, n_queries]`
                        mask = np.zeros((n_keys, n_queries))
                        for i, aw in enumerate(model.cache_attn[-n_tokens:]):
                            # The i-th query column takes the last ``n_valid``
                            # weights of ``aw``; the remaining rows are masked.
                            n_valid = n_keys - n_queries + i + 1
                            cache_probs[:n_valid, i] = aw[0, -n_valid:]
                            mask[n_valid:, i] = 1

                        plot_cache_weights(
                            cache_probs,
                            keys=tokens_keys,
                            queries=tokens_query,
                            save_path=mkdir_join(save_path,
                                                 str(fig_count) + '.png'),
                            figsize=(40, 16),
                            mask=mask)
                        token_count = 0
                        fig_count += 1
                    else:
                        token_count += 1

            if is_new_epoch:
                break