def main():
    """Train an ASR model from a YAML config, checkpointing every epoch.

    Parses the command line with ``get_args()``, builds the training and
    cross-validation datasets, initialises the model (optionally restoring a
    checkpoint or pretrained encoders) and then alternates
    ``executor.train`` / ``executor.cv`` for every epoch from the restored
    epoch up to ``configs['max_epoch']`` (default 100).

    Rank 0 additionally writes ``train.yaml``, ``init.zip``, ``init.pt``, one
    ``<epoch>.pt`` per epoch and a ``final.pt`` symlink into
    ``args.model_dir``, and logs per-epoch scalars to TensorBoard.

    Raises:
        ValueError: if ``configs['optim']`` is neither ``'adam'`` nor
            ``'sgd'``.
    """
    args = get_args()
    logging.basicConfig(level=logging.DEBUG,
                        format='%(asctime)s %(levelname)s %(message)s')
    os.environ['CUDA_VISIBLE_DEVICES'] = str(args.gpu)

    # Set random seed
    torch.manual_seed(777)
    print(args)
    # NOTE(review): yaml.FullLoader is not the hardened loader; switch to
    # yaml.safe_load if configs can ever come from an untrusted source.
    with open(args.config, 'r') as fin:
        configs = yaml.load(fin, Loader=yaml.FullLoader)
    if args.override_config:
        configs = override_config(configs, args.override_config)

    distributed = args.world_size > 1
    if distributed:
        logging.info('training on multiple gpus, this gpu {}'.format(args.gpu))
        dist.init_process_group(args.dist_backend,
                                init_method=args.init_method,
                                world_size=args.world_size,
                                rank=args.rank)

    symbol_table = read_symbol_table(args.symbol_table)

    # The CV set reuses the training dataset config with every augmentation
    # and the shuffling switched off.
    train_conf = configs['dataset_conf']
    cv_conf = copy.deepcopy(train_conf)
    for disabled_option in ('speed_perturb', 'spec_aug', 'shuffle',
                            'apply_alaw_codec', 'add_noise', 'add_babble',
                            'add_reverb', 'apply_codec', 'volume_perturb',
                            'pitch_shift'):
        cv_conf[disabled_option] = False
    non_lang_syms = read_non_lang_symbols(args.non_lang_syms)

    train_dataset = Dataset(args.data_type, args.train_data, symbol_table,
                            train_conf, args.bpe_model, non_lang_syms, True)
    cv_dataset = Dataset(args.data_type,
                         args.cv_data,
                         symbol_table,
                         cv_conf,
                         args.bpe_model,
                         non_lang_syms,
                         partition=False)

    # batch_size=None: the loaders are handed whatever the datasets yield,
    # without automatic batching by DataLoader.
    train_data_loader = DataLoader(train_dataset,
                                   batch_size=None,
                                   pin_memory=args.pin_memory,
                                   num_workers=args.num_workers,
                                   prefetch_factor=args.prefetch)
    cv_data_loader = DataLoader(cv_dataset,
                                batch_size=None,
                                pin_memory=args.pin_memory,
                                num_workers=args.num_workers,
                                prefetch_factor=args.prefetch)

    if 'fbank_conf' in configs['dataset_conf']:
        input_dim = configs['dataset_conf']['fbank_conf']['num_mel_bins']
    else:
        input_dim = configs['dataset_conf']['mfcc_conf']['num_mel_bins']
    vocab_size = len(symbol_table)

    model_dir = args.model_dir
    if args.rank == 0:
        # Create the output directory before anything is written into it
        # (train.yaml and init.zip below).
        os.makedirs(model_dir, exist_ok=True)

    # Save configs to model_dir/train.yaml for inference and export
    configs['input_dim'] = input_dim
    configs['output_dim'] = vocab_size
    configs['cmvn_file'] = args.cmvn
    configs['is_json_cmvn'] = True
    if args.rank == 0:
        saved_config_path = os.path.join(model_dir, 'train.yaml')
        with open(saved_config_path, 'w') as fout:
            fout.write(yaml.dump(configs))

    # Init asr model from configs
    model = init_asr_model(configs)
    if args.rank == 0:
        print(model)
    num_params = sum(p.numel() for p in model.parameters())
    print('the number of model params: {}'.format(num_params))

    # !!!IMPORTANT!!!
    # Try to export the model by script, if fails, we should refine
    # the code to satisfy the script export requirements
    if args.rank == 0:
        script_model = torch.jit.script(model)
        script_model.save(os.path.join(model_dir, 'init.zip'))
    executor = Executor()

    # If a checkpoint is specified, restore training progress from it
    if args.checkpoint is not None:
        infos = load_checkpoint(model, args.checkpoint)
    elif args.enc_init is not None:
        logging.info('load pretrained encoders: {}'.format(args.enc_init))
        infos = load_trained_modules(model, args)
    else:
        infos = {}
    start_epoch = infos.get('epoch', -1) + 1
    step = infos.get('step', -1)

    num_epochs = configs.get('max_epoch', 100)
    writer = None
    if args.rank == 0:
        exp_id = os.path.basename(model_dir)
        writer = SummaryWriter(os.path.join(args.tensorboard_dir, exp_id))

    if distributed:
        assert (torch.cuda.is_available())
        # cuda model is required for nn.parallel.DistributedDataParallel
        model.cuda()
        model = torch.nn.parallel.DistributedDataParallel(
            model, find_unused_parameters=True)
        device = torch.device("cuda")
        if args.fp16_grad_sync:
            from torch.distributed.algorithms.ddp_comm_hooks import (
                default as comm_hooks,
            )
            model.register_comm_hook(state=None,
                                     hook=comm_hooks.fp16_compress_hook)
    else:
        use_cuda = args.gpu >= 0 and torch.cuda.is_available()
        device = torch.device('cuda' if use_cuda else 'cpu')
        model = model.to(device)

    if configs['optim'] == 'adam':
        print('optimizer is adam')
        optimizer = optim.Adam(model.parameters(), **configs['optim_conf'])
    elif configs['optim'] == 'sgd':
        print('optimizer is sgd')
        optimizer = optim.SGD(model.parameters(), **configs['optim_conf'])
    else:
        # Fail here with a clear message instead of a NameError on
        # `optimizer` a few lines further down.
        raise ValueError('unknown optimizer: {}'.format(configs['optim']))
    scheduler = WarmupLR(optimizer, **configs['scheduler_conf'])

    final_epoch = None
    configs['rank'] = args.rank
    configs['is_distributed'] = distributed
    configs['use_amp'] = args.use_amp
    if start_epoch == 0 and args.rank == 0:
        save_model_path = os.path.join(model_dir, 'init.pt')
        save_checkpoint(model, save_model_path)

    # Start training loop
    executor.step = step
    scheduler.set_step(step)
    # used for pytorch amp mixed precision training
    scaler = None
    if args.use_amp:
        scaler = torch.cuda.amp.GradScaler()

    for epoch in range(start_epoch, num_epochs):
        train_dataset.set_epoch(epoch)
        configs['epoch'] = epoch
        lr = optimizer.param_groups[0]['lr']
        logging.info('Epoch {} TRAIN info lr {}'.format(epoch, lr))
        executor.train(model, optimizer, scheduler, train_data_loader, device,
                       writer, configs, scaler)
        total_loss, total_loss_att, total_loss_ctc, num_seen_utts = \
            executor.cv(model, cv_data_loader, device, configs)
        cv_loss = total_loss / num_seen_utts
        cv_loss_att = total_loss_att / num_seen_utts
        cv_loss_ctc = total_loss_ctc / num_seen_utts

        logging.info('Epoch {} CV info cv_loss {}'.format(epoch, cv_loss))
        if args.rank == 0:
            save_model_path = os.path.join(model_dir, '{}.pt'.format(epoch))
            save_checkpoint(
                model, save_model_path, {
                    'epoch': epoch,
                    'lr': lr,
                    'cv_loss': cv_loss,
                    'cv_loss_att': cv_loss_att,
                    'cv_loss_ctc': cv_loss_ctc,
                    'step': executor.step
                })
            writer.add_scalar('epoch/cv_loss', cv_loss, epoch)
            # Each curve gets its own loss; previously all three scalars
            # were written with the combined cv_loss.
            writer.add_scalar('epoch/cv_loss_att', cv_loss_att, epoch)
            writer.add_scalar('epoch/cv_loss_ctc', cv_loss_ctc, epoch)
            writer.add_scalar('epoch/lr', lr, epoch)
        final_epoch = epoch

    if final_epoch is not None and args.rank == 0:
        final_model_path = os.path.join(model_dir, 'final.pt')
        # A previous (or resumed) run may have left a final.pt behind;
        # os.symlink refuses to overwrite, so drop the stale link first.
        if os.path.lexists(final_model_path):
            os.remove(final_model_path)
        os.symlink('{}.pt'.format(final_epoch), final_model_path)
    if writer is not None:
        # Close the writer even when no epoch ran.
        writer.close()
def main():
    """Decode a test set with a trained ASR model and write the transcripts.

    Parses the command line with ``get_args()``, builds an un-augmented,
    un-filtered, statically batched test dataset, loads the checkpoint named
    by ``args.checkpoint`` and decodes every batch with the method selected
    by ``args.mode``.  One ``"<key> <text>"`` line per utterance is logged
    and written to ``args.result_file``.

    Exits with status 1 when ``ctc_prefix_beam_search`` or
    ``attention_rescoring`` is requested with ``batch_size > 1``.

    Raises:
        ValueError: if ``args.mode`` is not one of ``attention``,
            ``ctc_greedy_search``, ``ctc_prefix_beam_search`` or
            ``attention_rescoring``.
    """
    args = get_args()
    logging.basicConfig(level=logging.DEBUG,
                        format='%(asctime)s %(levelname)s %(message)s')
    os.environ['CUDA_VISIBLE_DEVICES'] = str(args.gpu)

    if args.mode in ['ctc_prefix_beam_search', 'attention_rescoring'
                     ] and args.batch_size > 1:
        logging.fatal(
            'decoding mode {} must be running with batch_size == 1'.format(
                args.mode))
        sys.exit(1)

    # NOTE(review): yaml.FullLoader is not the hardened loader; switch to
    # yaml.safe_load if configs can ever come from an untrusted source.
    with open(args.config, 'r') as fin:
        configs = yaml.load(fin, Loader=yaml.FullLoader)
    if args.override_config:
        configs = override_config(configs, args.override_config)

    symbol_table = read_symbol_table(args.dict)

    # Start from the training dataset config, then widen every filter bound
    # and switch off augmentation, shuffling and sorting for decoding.
    test_conf = copy.deepcopy(configs['dataset_conf'])
    filter_conf = test_conf['filter_conf']
    filter_conf['max_length'] = 102400
    filter_conf['min_length'] = 0
    filter_conf['token_max_length'] = 102400
    filter_conf['token_min_length'] = 0
    filter_conf['max_output_input_ratio'] = 102400
    filter_conf['min_output_input_ratio'] = 0
    test_conf['speed_perturb'] = False
    test_conf['spec_aug'] = False
    test_conf['shuffle'] = False
    test_conf['sort'] = False
    if 'fbank_conf' in test_conf:
        test_conf['fbank_conf']['dither'] = 0.0
    elif 'mfcc_conf' in test_conf:
        test_conf['mfcc_conf']['dither'] = 0.0
    test_conf['batch_conf']['batch_type'] = "static"
    test_conf['batch_conf']['batch_size'] = args.batch_size
    non_lang_syms = read_non_lang_symbols(args.non_lang_syms)

    test_dataset = Dataset(args.data_type,
                           args.test_data,
                           symbol_table,
                           test_conf,
                           args.bpe_model,
                           non_lang_syms,
                           partition=False)

    test_data_loader = DataLoader(test_dataset, batch_size=None, num_workers=0)

    # Init asr model from configs
    model = init_asr_model(configs)

    # Invert the symbol table (token -> id) into id -> token for decoding
    char_dict = {v: k for k, v in symbol_table.items()}
    eos = len(char_dict) - 1

    load_checkpoint(model, args.checkpoint)
    use_cuda = args.gpu >= 0 and torch.cuda.is_available()
    device = torch.device('cuda' if use_cuda else 'cpu')
    model = model.to(device)

    model.eval()
    with torch.no_grad(), open(args.result_file, 'w') as fout:
        for batch in test_data_loader:
            # The reference targets in the batch are not needed for decoding.
            keys, feats, _, feats_lengths, _ = batch
            feats = feats.to(device)
            feats_lengths = feats_lengths.to(device)
            if args.mode == 'attention':
                hyps, _ = model.recognize(
                    feats,
                    feats_lengths,
                    beam_size=args.beam_size,
                    decoding_chunk_size=args.decoding_chunk_size,
                    num_decoding_left_chunks=args.num_decoding_left_chunks,
                    simulate_streaming=args.simulate_streaming)
                hyps = [hyp.tolist() for hyp in hyps]
            elif args.mode == 'ctc_greedy_search':
                hyps, _ = model.ctc_greedy_search(
                    feats,
                    feats_lengths,
                    decoding_chunk_size=args.decoding_chunk_size,
                    num_decoding_left_chunks=args.num_decoding_left_chunks,
                    simulate_streaming=args.simulate_streaming)
            # ctc_prefix_beam_search and attention_rescoring return a single
            # hypothesis (List[int]); wrap it in a list so the output loop
            # below can treat every mode as List[List[int]].
            elif args.mode == 'ctc_prefix_beam_search':
                assert (feats.size(0) == 1)
                hyp, _ = model.ctc_prefix_beam_search(
                    feats,
                    feats_lengths,
                    args.beam_size,
                    decoding_chunk_size=args.decoding_chunk_size,
                    num_decoding_left_chunks=args.num_decoding_left_chunks,
                    simulate_streaming=args.simulate_streaming)
                hyps = [hyp]
            elif args.mode == 'attention_rescoring':
                assert (feats.size(0) == 1)
                hyp, _ = model.attention_rescoring(
                    feats,
                    feats_lengths,
                    args.beam_size,
                    decoding_chunk_size=args.decoding_chunk_size,
                    num_decoding_left_chunks=args.num_decoding_left_chunks,
                    ctc_weight=args.ctc_weight,
                    simulate_streaming=args.simulate_streaming,
                    reverse_weight=args.reverse_weight)
                hyps = [hyp]
            else:
                # Without this branch an unknown mode surfaced as a
                # NameError on `hyps` below.
                raise ValueError('unsupported decoding mode: {}'.format(
                    args.mode))

            for key, token_ids in zip(keys, hyps):
                # Emit tokens up to (not including) the first eos id.
                pieces = []
                for token_id in token_ids:
                    if token_id == eos:
                        break
                    pieces.append(char_dict[token_id])
                content = ''.join(pieces)
                logging.info('{} {}'.format(key, content))
                fout.write('{} {}\n'.format(key, content))
def main():
    """Decode a test set with ONNX Runtime encoder/decoder sessions.

    Parses the command line with ``get_args()``, builds an un-augmented,
    un-filtered, statically batched test dataset and runs the ONNX encoder
    on every batch.  Depending on ``args.mode`` the encoder's top-k CTC
    outputs are decoded greedily, with CTC prefix beam search, or with beam
    search followed by attention-decoder rescoring.  One ``"<key> <text>"``
    line per utterance is logged and written to ``args.result_file``.

    Raises:
        ValueError: if ``args.mode`` is not one of ``ctc_greedy_search``,
            ``ctc_prefix_beam_search`` or ``attention_rescoring``.
    """
    args = get_args()
    logging.basicConfig(level=logging.DEBUG,
                        format='%(asctime)s %(levelname)s %(message)s')
    os.environ['CUDA_VISIBLE_DEVICES'] = str(args.gpu)

    # NOTE(review): yaml.FullLoader is not the hardened loader; switch to
    # yaml.safe_load if configs can ever come from an untrusted source.
    with open(args.config, 'r') as fin:
        configs = yaml.load(fin, Loader=yaml.FullLoader)
    if args.override_config:
        configs = override_config(configs, args.override_config)

    symbol_table = read_symbol_table(args.dict)

    # Start from the training dataset config, then widen every filter bound
    # and switch off augmentation, shuffling and sorting for decoding.
    test_conf = copy.deepcopy(configs['dataset_conf'])
    filter_conf = test_conf['filter_conf']
    filter_conf['max_length'] = 102400
    filter_conf['min_length'] = 0
    filter_conf['token_max_length'] = 102400
    filter_conf['token_min_length'] = 0
    filter_conf['max_output_input_ratio'] = 102400
    filter_conf['min_output_input_ratio'] = 0
    test_conf['speed_perturb'] = False
    test_conf['spec_aug'] = False
    test_conf['shuffle'] = False
    test_conf['sort'] = False
    # Handle both feature front-ends, matching the PyTorch decoding script.
    if 'fbank_conf' in test_conf:
        test_conf['fbank_conf']['dither'] = 0.0
    elif 'mfcc_conf' in test_conf:
        test_conf['mfcc_conf']['dither'] = 0.0
    test_conf['batch_conf']['batch_type'] = "static"
    test_conf['batch_conf']['batch_size'] = args.batch_size

    test_dataset = Dataset(args.data_type,
                           args.test_data,
                           symbol_table,
                           test_conf,
                           args.bpe_model,
                           partition=False)

    test_data_loader = DataLoader(test_dataset, batch_size=None, num_workers=0)

    # Create the ONNX Runtime sessions, preferring CUDA when available
    use_cuda = args.gpu >= 0 and torch.cuda.is_available()
    if use_cuda:
        EP_list = ['CUDAExecutionProvider', 'CPUExecutionProvider']
    else:
        EP_list = ['CPUExecutionProvider']

    encoder_ort_session = rt.InferenceSession(args.encoder_onnx,
                                              providers=EP_list)
    decoder_ort_session = None
    if args.mode == "attention_rescoring":
        decoder_ort_session = rt.InferenceSession(args.decoder_onnx,
                                                  providers=EP_list)

    # Load dict: each line is "<token> <id>"
    vocabulary = []
    char_dict = {}
    with open(args.dict, 'r') as fin:
        for line in fin:
            arr = line.strip().split()
            assert len(arr) == 2
            char_dict[int(arr[1])] = arr[0]
            vocabulary.append(arr[0])
    # sos and eos share the last id of the dictionary
    eos = sos = len(char_dict) - 1

    with torch.no_grad(), open(args.result_file, 'w') as fout:
        for batch in test_data_loader:
            keys, feats, _, feats_lengths, _ = batch
            encoder_inputs = encoder_ort_session.get_inputs()
            ort_inputs = {
                encoder_inputs[0].name: feats.numpy(),
                encoder_inputs[1].name: feats_lengths.numpy()
            }
            ort_outs = encoder_ort_session.run(None, ort_inputs)
            encoder_out, encoder_out_lens, ctc_log_probs, \
                beam_log_probs, beam_log_probs_idx = ort_outs
            beam_size = beam_log_probs.shape[-1]
            batch_size = beam_log_probs.shape[0]
            num_processes = min(multiprocessing.cpu_count(), batch_size)

            if args.mode == 'ctc_greedy_search':
                # Keep the first beam entry of every frame.  This is done
                # unconditionally: the old `if beam_size != 1` guard left
                # log_probs_idx unbound when the beam size was 1.
                # NOTE(review): assumes beam_log_probs_idx is laid out as
                # (batch, time, beam) - confirm against the ONNX export.
                log_probs_idx = beam_log_probs_idx[:, :, 0]
                batch_sents = [
                    seq[0:encoder_out_lens[idx]].tolist()
                    for idx, seq in enumerate(log_probs_idx)
                ]
                hyps = map_batch(batch_sents, vocabulary, num_processes,
                                 True, 0)
            elif args.mode in ('ctc_prefix_beam_search',
                               "attention_rescoring"):
                batch_log_probs_seq_list = beam_log_probs.tolist()
                batch_log_probs_idx_list = beam_log_probs_idx.tolist()
                batch_len_list = encoder_out_lens.tolist()
                batch_log_probs_seq = []
                batch_log_probs_ids = []
                batch_start = []  # only effective in streaming deployment
                batch_root = TrieVector()
                # root_dict holds a reference to every PathTrie that is
                # appended to batch_root.
                root_dict = {}
                for utt_idx, num_sent in enumerate(batch_len_list):
                    # Trim each utterance to its valid encoder length
                    batch_log_probs_seq.append(
                        batch_log_probs_seq_list[utt_idx][0:num_sent])
                    batch_log_probs_ids.append(
                        batch_log_probs_idx_list[utt_idx][0:num_sent])
                    root_dict[utt_idx] = PathTrie()
                    batch_root.append(root_dict[utt_idx])
                    batch_start.append(True)
                score_hyps = ctc_beam_search_decoder_batch(
                    batch_log_probs_seq,
                    batch_log_probs_ids,
                    batch_root,
                    batch_start,
                    beam_size,
                    num_processes,
                    0, -2, 0.99999)

                if args.mode == 'ctc_prefix_beam_search':
                    # Take the first candidate's token sequence per utterance
                    best_ids = [cand_hyps[0][1] for cand_hyps in score_hyps]
                    hyps = map_batch(best_ids, vocabulary, num_processes,
                                     False, 0)
                else:
                    # attention_rescoring
                    ctc_score, all_hyps = [], []
                    max_len = 0
                    for cand_hyps in score_hyps:
                        # Pad to exactly beam_size candidates so the arrays
                        # below are rectangular.  list() keeps this working
                        # whether the decoder returns a list or a tuple.
                        cand_hyps = list(cand_hyps)
                        missing = beam_size - len(cand_hyps)
                        if missing > 0:
                            cand_hyps += missing * [(-float("INF"), (0, ))]
                        for cand_score, cand_ids in cand_hyps:
                            ctc_score.append(cand_score)
                            all_hyps.append(list(cand_ids))
                            # +1 leaves room for the sos token
                            max_len = max(max_len, len(cand_ids) + 1)
                    assert len(ctc_score) == beam_size * batch_size

                    pad_shape = (batch_size, beam_size, max_len)
                    hyps_pad_sos = np.ones(pad_shape,
                                           dtype=np.int64) * IGNORE_ID
                    r_hyps_pad_sos = np.ones(pad_shape,
                                             dtype=np.int64) * IGNORE_ID
                    hyps_lens_sos = np.ones((batch_size, beam_size),
                                            dtype=np.int32)
                    k = 0
                    for b in range(batch_size):
                        for j in range(beam_size):
                            cand = all_hyps[k]
                            cand_len = len(cand) + 1
                            # forward and reversed hypotheses, sos-prefixed
                            hyps_pad_sos[b][j][0:cand_len] = [sos] + cand
                            r_hyps_pad_sos[b][j][0:cand_len] = \
                                [sos] + cand[::-1]
                            hyps_lens_sos[b][j] = cand_len
                            k += 1

                    decoder_inputs = decoder_ort_session.get_inputs()
                    decoder_ort_inputs = {
                        decoder_inputs[0].name: encoder_out,
                        decoder_inputs[1].name: encoder_out_lens,
                        decoder_inputs[2].name: hyps_pad_sos,
                        decoder_inputs[3].name: hyps_lens_sos,
                        decoder_inputs[4].name: r_hyps_pad_sos
                    }
                    decoder_out, r_decoder_out = decoder_ort_session.run(
                        None, decoder_ort_inputs)

                    best_sents = []
                    k = 0
                    for d_o, r_d_o in zip(decoder_out, r_decoder_out):
                        # d_o & r_d_o: beam x T x V
                        cur_best_sent = []
                        cur_best_score = -float("inf")
                        for sent_d_o, sent_r_d_o in zip(d_o, r_d_o):
                            cand = all_hyps[k] + [eos]
                            r_cand = all_hyps[k][::-1] + [eos]
                            # Sum the decoder outputs at the hypothesis
                            # tokens, left-to-right and right-to-left.
                            score, r_score = 0, 0
                            for pos, (index, r_index) in enumerate(
                                    zip(cand, r_cand)):
                                score += sent_d_o[pos][index]
                                r_score += sent_r_d_o[pos][r_index]
                            if args.reverse_weight > 0:
                                score = score * (1 - args.reverse_weight) + \
                                    args.reverse_weight * r_score
                            score = score + args.ctc_weight * ctc_score[k]
                            if score > cur_best_score:
                                cur_best_sent = all_hyps[k]
                                cur_best_score = score
                            k += 1
                        best_sents.append(cur_best_sent)
                    hyps = map_batch(best_sents, vocabulary, num_processes)
            else:
                # Without this branch an unsupported mode surfaced as a
                # NameError on `hyps` below.
                raise ValueError(
                    'unsupported decoding mode for ONNX inference: {}'.format(
                        args.mode))

            for key, content in zip(keys, hyps):
                logging.info('{} {}'.format(key, content))
                fout.write('{} {}\n'.format(key, content))
char_dict = {} with open(args.symbol_table, mode='r') as fin: for line in fin: arr = line.strip().split() assert len(arr) == 2 char_dict[int(arr[1])] = arr[0] eos = len(char_dict) - 1 train_conf = configs['dataset_conf'] cv_conf = copy.deepcopy(train_conf) cv_conf['speed_perturb'] = False cv_conf['spec_aug'] = False cv_dataset = Dataset(args.data_type, args.input_data, symbol_table, cv_conf, None, partition=False) cv_data_loader = DataLoader(cv_dataset, batch_size=None, pin_memory=args.pin_memory, num_workers=args.num_workers, prefetch_factor=args.prefetch) print("Reading: ", args.keyword_unit_dict) word_id_dict, word_unit_dict = map_words2char(args.keyword_unit_dict) word_unit_list = list(word_unit_dict.keys()) print("word_unit_list has the size of %d" % (len(word_unit_list))) # Init asr model from configs
ali_conf['filter_conf']['token_min_length'] = 0 ali_conf['filter_conf']['max_output_input_ratio'] = 102400 ali_conf['filter_conf']['min_output_input_ratio'] = 0 ali_conf['speed_perturb'] = False ali_conf['spec_aug'] = False ali_conf['shuffle'] = False ali_conf['sort'] = False ali_conf['fbank_conf']['dither'] = 0.0 ali_conf['batch_conf']['batch_type'] = "static" ali_conf['batch_conf']['batch_size'] = args.batch_size non_lang_syms = read_non_lang_symbols(args.non_lang_syms) ali_dataset = Dataset(args.data_type, args.input_file, symbol_table, ali_conf, args.bpe_model, non_lang_syms, partition=False) ali_data_loader = DataLoader(ali_dataset, batch_size=None, num_workers=0) # Init asr model from configs model = init_asr_model(configs) load_checkpoint(model, args.checkpoint) use_cuda = args.gpu >= 0 and torch.cuda.is_available() device = torch.device('cuda' if use_cuda else 'cpu') model = model.to(device) model.eval()