def main():
    """Plot CTC probabilities for every utterance of each recognition set.

    Options come from ``sys.argv`` via ``parse_args_eval``. The model is
    built and its checkpoint loaded only while handling the first set; the
    same model is reused for the remaining sets. For each utterance a figure
    is written through ``plot_ctc_probs`` and the reference/hypothesis pair
    is logged to ``plot.log`` inside ``args.recog_dir``.
    """
    # Load configuration
    args, recog_params, dir_name = parse_args_eval(sys.argv[1:])

    # Setting for logging: drop any log left over from a previous run
    log_path = os.path.join(args.recog_dir, 'plot.log')
    if os.path.isfile(log_path):
        os.remove(log_path)
    set_logger(log_path, stdout=args.recog_stdout)

    for set_idx, tsv_path in enumerate(args.recog_sets):
        # The sub-unit dictionary is optional; False is passed when absent
        sub1_dict_path = os.path.join(dir_name, 'dict_sub1.txt')
        if not os.path.isfile(sub1_dict_path):
            sub1_dict_path = False

        # Load dataset
        dataset = Dataset(
            corpus=args.corpus,
            tsv_path=tsv_path,
            dict_path=os.path.join(dir_name, 'dict.txt'),
            dict_path_sub1=sub1_dict_path,
            nlsyms=args.nlsyms,
            wp_model=os.path.join(dir_name, 'wp.model'),
            unit=args.unit,
            unit_sub1=args.unit_sub1,
            batch_size=args.recog_batch_size,
            is_test=True)

        if set_idx == 0:
            # Load the ASR model (once, reused for all later sets)
            model = Speech2Text(args, dir_name)
            ckpt_path = args.recog_model[0]
            # The checkpoint name is expected to end in '-<epoch>'
            epoch = int(ckpt_path.split('-')[-1])
            if args.recog_n_average <= 1:
                load_checkpoint(ckpt_path, model)
            else:
                # Model averaging for Transformer
                model = average_checkpoints(
                    model, ckpt_path, n_average=args.recog_n_average)

            # Fall back to the training unit when none is given for decoding
            if not args.recog_unit:
                args.recog_unit = args.unit

            logger.info('recog unit: %s' % args.recog_unit)
            logger.info('epoch: %d' % epoch)
            logger.info('batch size: %d' % args.recog_batch_size)

            # GPU setting
            if args.recog_n_gpus >= 1:
                model.cudnn_setting(deterministic=True, benchmark=False)
                model.cuda()

        save_path = mkdir_join(args.recog_dir, 'ctc_probs')

        # Clean directory
        # NOTE(review): this runs for every recognition set, so figures of an
        # earlier set are removed as well -- confirm this is intended.
        if save_path is not None and os.path.isdir(save_path):
            shutil.rmtree(save_path)
            os.mkdir(save_path)

        # Keep pulling batches until the dataset reports a finished epoch
        is_new_epoch = False
        while not is_new_epoch:
            batch, is_new_epoch = dataset.next(
                recog_params['recog_batch_size'])
            xs = batch['xs']
            best_hyps_id, _ = model.decode(xs, recog_params)

            # Get CTC probs
            ctc_probs, topk_ids, xlens = model.get_ctc_probs(
                xs, temperature=1, topk=min(100, model.vocab))
            # NOTE: ctc_probs: '[B, T, topk]'

            for b in range(len(xs)):
                tokens = dataset.idx2token[0](
                    best_hyps_id[b], return_list=True)
                spk = batch['speakers'][b]
                utt_id = batch['utt_ids'][b]

                plot_ctc_probs(
                    ctc_probs[b, :xlens[b]],
                    topk_ids[b],
                    subsample_factor=args.subsample_factor,
                    spectrogram=xs[b][:, :dataset.input_dim],
                    save_path=mkdir_join(save_path, spk, utt_id + '.png'),
                    figsize=(20, 8))

                hyp = ' '.join(tokens)
                logger.info('utt-id: %s' % utt_id)
                logger.info('Ref: %s' % batch['text'][b].lower())
                logger.info('Hyp: %s' % hyp)
                logger.info('-' * 50)
def main():
    """Plot CTC probabilities for every utterance of each recognition set.

    Options come from ``sys.argv`` via ``parse_args_eval``. The model is
    built and its checkpoint loaded only while handling the first set; the
    same model is reused for the remaining sets. For each utterance a figure
    is written through ``plot_ctc_probs`` and the reference/hypothesis pair
    is logged to ``plot.log`` inside ``args.recog_dir``.
    """
    # Load configuration
    args, dir_name = parse_args_eval(sys.argv[1:])

    # Setting for logging: drop any log left over from a previous run
    log_path = os.path.join(args.recog_dir, 'plot.log')
    if os.path.isfile(log_path):
        os.remove(log_path)
    set_logger(log_path, stdout=args.recog_stdout)

    for i, s in enumerate(args.recog_sets):
        # Load dataloader
        dataloader = build_dataloader(
            args=args,
            tsv_path=s,
            batch_size=1,
            is_test=True,
            first_n_utterances=args.recog_first_n_utt,
            longform_max_n_frames=args.recog_longform_max_n_frames)

        if i == 0:
            # Load ASR model (once, reused for all later sets)
            model = Speech2Text(args, dir_name)
            # The checkpoint suffix may be fractional (e.g. '-25.5');
            # keep one decimal place, truncating the rest.
            epoch = int(float(args.recog_model[0].split('-')[-1]) * 10) / 10
            if args.recog_n_average > 1:
                # Model averaging for Transformer
                model = average_checkpoints(
                    model, args.recog_model[0],
                    n_average=args.recog_n_average)
            else:
                load_checkpoint(args.recog_model[0], model)

            # Fall back to the training unit when none is given for decoding
            if not args.recog_unit:
                args.recog_unit = args.unit

            logger.info('recog unit: %s' % args.recog_unit)
            # '%d' would silently drop the fractional epoch computed above
            logger.info('epoch: %.1f' % epoch)
            # NOTE(review): the dataloader above is built with batch_size=1,
            # while the value logged here is args.recog_batch_size -- confirm
            # which one is intended.
            logger.info('batch size: %d' % args.recog_batch_size)

            # GPU setting
            if args.recog_n_gpus >= 1:
                model.cudnn_setting(deterministic=True, benchmark=False)
                model.cuda()

        save_path = mkdir_join(args.recog_dir, 'ctc_probs')

        # Clean directory
        # NOTE(review): this runs for every recognition set, so figures of an
        # earlier set are removed as well -- confirm this is intended.
        if save_path is not None and os.path.isdir(save_path):
            shutil.rmtree(save_path)
            os.mkdir(save_path)

        for batch in dataloader:
            nbest_hyps_id, _ = model.decode(
                batch['xs'], args, dataloader.idx2token[0])
            # Keep only the first hypothesis of each n-best list
            best_hyps_id = [h[0] for h in nbest_hyps_id]

            # Get CTC probs
            ctc_probs, topk_ids, xlens = model.get_ctc_probs(
                batch['xs'], temperature=1, topk=min(100, model.vocab))
            # NOTE: ctc_probs: '[B, T, topk]'

            for b in range(len(batch['xs'])):
                tokens = dataloader.idx2token[0](
                    best_hyps_id[b], return_list=True)
                spk = batch['speakers'][b]

                plot_ctc_probs(
                    ctc_probs[b, :xlens[b]],
                    topk_ids[b],
                    factor=args.subsample_factor,
                    spectrogram=batch['xs'][b][:, :dataloader.input_dim],
                    save_path=mkdir_join(
                        save_path, spk, batch['utt_ids'][b] + '.png'),
                    figsize=(20, 8))

                hyp = ' '.join(tokens)
                logger.info('utt-id: %s' % batch['utt_ids'][b])
                logger.info('Ref: %s' % batch['text'][b].lower())
                logger.info('Hyp: %s' % hyp)
                logger.info('-' * 50)
def main():
    """Plot CTC probabilities for every utterance of each recognition set.

    Command-line options come from ``parse()``; every entry of the
    ``conf.yml`` stored next to the checkpoint whose key does not contain
    ``'recog'`` is copied onto ``args``. The model is built and its
    checkpoint loaded only while handling the first set. For each utterance
    a figure is written through ``plot_ctc_probs`` and the
    reference/hypothesis pair is logged to ``plot.log`` inside
    ``args.recog_dir``.
    """
    args = parse()

    # Load a conf file
    dir_name = os.path.dirname(args.recog_model[0])
    conf = load_config(os.path.join(dir_name, 'conf.yml'))

    # Overwrite conf (keys containing 'recog' keep their command-line value)
    for k, v in conf.items():
        if 'recog' not in k:
            setattr(args, k, v)
    # vars() returns the namespace's own dict, so later changes to args
    # (e.g. recog_unit below) are visible through recog_params as well.
    recog_params = vars(args)

    # Setting for logging: drop any log left over from a previous run
    log_path = os.path.join(args.recog_dir, 'plot.log')
    if os.path.isfile(log_path):
        os.remove(log_path)
    logger = set_logger(log_path, key='decoding', stdout=args.recog_stdout)

    # Total subsampling factor: product of the '_'-separated entries of
    # args.subsample and of every pooling factor greater than one in
    # args.conv_poolings. It reads only these two options, so it is computed
    # once instead of once per recognition set. The comprehension variable is
    # deliberately not named like the set loop variable below: under Python 2
    # comprehension variables leak into the enclosing scope and would have
    # replaced the set path with an int.
    subsample_factor = 1
    subsample = [int(factor) for factor in args.subsample.split('_')]
    if args.conv_poolings:
        for pooling in args.conv_poolings.split('_'):
            # Only the first number of each entry is used, e.g. '(2,2)' -> 2
            pool = int(pooling.split(',')[0].replace('(', ''))
            if pool > 1:
                subsample_factor *= pool
    subsample_factor *= np.prod(subsample)

    for i, s in enumerate(args.recog_sets):
        # The sub-unit dictionary is optional; False is passed when absent
        dict_path_sub1 = os.path.join(dir_name, 'dict_sub1.txt')
        if not os.path.isfile(dict_path_sub1):
            dict_path_sub1 = False

        # Load dataset
        dataset = Dataset(corpus=args.corpus,
                          tsv_path=s,
                          dict_path=os.path.join(dir_name, 'dict.txt'),
                          dict_path_sub1=dict_path_sub1,
                          nlsyms=args.nlsyms,
                          wp_model=os.path.join(dir_name, 'wp.model'),
                          unit=args.unit,
                          unit_sub1=args.unit_sub1,
                          batch_size=args.recog_batch_size,
                          is_test=True)

        if i == 0:
            # Load the ASR model (once, reused for all later sets)
            model = Speech2Text(args, dir_name)
            model = load_checkpoint(model, args.recog_model[0])[0]
            # The checkpoint name is expected to end in '-<epoch>'
            epoch = int(args.recog_model[0].split('-')[-1])

            # Fall back to the training unit when none is given for decoding
            if not args.recog_unit:
                args.recog_unit = args.unit

            logger.info('recog unit: %s' % args.recog_unit)
            logger.info('epoch: %d' % (epoch - 1))
            logger.info('batch size: %d' % args.recog_batch_size)

            # GPU setting
            model.cuda()

        save_path = mkdir_join(args.recog_dir, 'ctc_probs')

        # Clean directory
        # NOTE(review): this runs for every recognition set, so figures of an
        # earlier set are removed as well -- confirm this is intended.
        if save_path is not None and os.path.isdir(save_path):
            shutil.rmtree(save_path)
            os.mkdir(save_path)

        while True:
            batch, is_new_epoch = dataset.next(
                recog_params['recog_batch_size'])
            best_hyps_id, _, _ = model.decode(
                batch['xs'], recog_params, exclude_eos=False)

            # Get CTC probs
            ctc_probs, indices_topk, xlens = model.get_ctc_probs(
                batch['xs'], temperature=1, topk=min(100, model.vocab))
            # NOTE: ctc_probs: '[B, T, topk]'

            for b in range(len(batch['xs'])):
                tokens = dataset.idx2token[0](
                    best_hyps_id[b], return_list=True)
                spk = batch['speakers'][b]

                plot_ctc_probs(
                    ctc_probs[b, :xlens[b]],
                    indices_topk[b],
                    n_frames=xlens[b],
                    subsample_factor=subsample_factor,
                    spectrogram=batch['xs'][b][:, :dataset.input_dim],
                    save_path=mkdir_join(
                        save_path, spk, batch['utt_ids'][b] + '.png'),
                    figsize=(20, 8))

                hyp = ' '.join(tokens)
                logger.info('utt-id: %s' % batch['utt_ids'][b])
                logger.info('Ref: %s' % batch['text'][b].lower())
                logger.info('Hyp: %s' % hyp)
                logger.info('-' * 50)

            # The dataset signals when the last batch of the set was served
            if is_new_epoch:
                break