def recognize(args):
    """Decode each utterance of a Kaldi-style json with a trained Seq2Seq model.

    Loads the dictionary and model, runs recognition on GPU for every
    utterance listed in ``args.recog_json``, and writes the n-best results
    to ``args.result_label`` as json.
    """
    # Load the dictionary exactly once (the original called process_dict twice).
    char_list, sos_id, eos_id = process_dict(args.dict)
    args.char_list = char_list

    model = Seq2Seq.load_model(args.model_path, args)
    print(model)
    model.eval()
    model.cuda()
    # Sanity check: dictionary must match the one the model was trained with.
    assert model.decoder.sos_id == sos_id and model.decoder.eos_id == eos_id

    # read json data
    with open(args.recog_json, 'rb') as f:
        js = json.load(f)['utts']

    # decode each utterance
    new_js = {}
    with torch.no_grad():
        for idx, name in enumerate(js.keys(), 1):
            print('(%d/%d) decoding %s' % (idx, len(js.keys()), name),
                  flush=True)
            input = kaldi_io.read_mat(js[name]['input'][0]['feat'])  # TxD
            input = torch.from_numpy(input).float()
            input_length = torch.tensor([input.size(0)], dtype=torch.int)
            input = input.cuda()
            input_length = input_length.cuda()
            nbest_hyps = model.recognize(input, input_length, char_list, args)
            new_js[name] = add_results_to_json(js[name], nbest_hyps, char_list)

    with open(args.result_label, 'wb') as f:
        f.write(
            json.dumps({
                'utts': new_js
            }, indent=4, sort_keys=True).encode('utf_8'))
def recognize(args):
    """Decode each utterance with a trained Transformer ASR model (OneFlow).

    Builds the encoder/decoder from ``args``, loads trained weights from
    ``args.model_path``, applies low-frame-rate (LFR) feature stacking, and
    writes the n-best hypotheses for every utterance to ``args.result_label``.
    """
    # Load the dictionary exactly once; sos/eos ids are reused for the decoder
    # and sanity-checked against the loaded model below (the original called
    # process_dict a second time after model construction).
    char_list, sos_id, eos_id = process_dict(args.dict)
    vocab_size = len(char_list)

    # model
    encoder = Encoder(
        args.d_input * args.LFR_m,
        args.n_layers_enc,
        args.n_head,
        args.d_k,
        args.d_v,
        args.d_model,
        args.d_inner,
        dropout=args.dropout,
        pe_maxlen=args.pe_maxlen,
    )
    decoder = Decoder(
        sos_id,
        eos_id,
        vocab_size,
        args.d_word_vec,
        args.n_layers_dec,
        args.n_head,
        args.d_k,
        args.d_v,
        args.d_model,
        args.d_inner,
        dropout=args.dropout,
        tgt_emb_prj_weight_sharing=args.tgt_emb_prj_weight_sharing,
        pe_maxlen=args.pe_maxlen,
    )
    model = Transformer(encoder, decoder)
    model.load_state_dict(flow.load(args.model_path))

    device = flow.device("cuda")
    model.eval()
    model.to(device)

    LFR_m = args.LFR_m
    LFR_n = args.LFR_n
    assert model.decoder.sos_id == sos_id and model.decoder.eos_id == eos_id

    # read json data
    with open(args.recog_json, "rb") as f:
        js = json.load(f)["utts"]

    # decode each utterance
    new_js = {}
    with flow.no_grad():
        for idx, name in enumerate(js.keys(), 1):
            print("(%d/%d) decoding %s" % (idx, len(js.keys()), name), flush=True)
            input = kaldi_io.read_mat(js[name]["input"][0]["feat"])
            input = build_LFR_features(input, LFR_m, LFR_n)
            input = flow.tensor(input).to(dtype=flow.float32)
            input_length = flow.tensor([input.size(0)], dtype=flow.int64)
            input = input.to(device)
            input_length = input_length.to(device)
            nbest_hyps = model.recognize(input, input_length, char_list, args)
            new_js[name] = add_results_to_json(js[name], nbest_hyps, char_list)

    with open(args.result_label, "wb") as f:
        f.write(json.dumps({"utts": new_js}, indent=4, sort_keys=True).encode("utf_8"))
def recognize(args):
    """Decode utterances with a trained Transformer, on GPU if available.

    Loads the model, LFR parameters, and dictionary; decodes every utterance
    of ``args.recog_json`` and writes n-best results to ``args.result_label``.
    """
    model, LFR_m, LFR_n = Transformer.load_model(args.model_path)
    print(model)
    model.eval()
    # Run on GPU when available, otherwise fall back to CPU.
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    # FIX: the model must live on the same device as the inputs; the original
    # moved only the tensors to `device`, leaving the model on CPU.
    model.to(device)
    char_list, sos_id, eos_id = process_dict(args.dict)
    # Sanity check: dictionary must match the one the model was trained with.
    assert model.decoder.sos_id == sos_id and model.decoder.eos_id == eos_id

    # read json data
    with open(args.recog_json, 'rb') as f:
        js = json.load(f)['utts']

    # decode each utterance
    new_js = {}
    with torch.no_grad():
        for idx, name in enumerate(js.keys(), 1):
            print('(%d/%d) decoding %s' % (idx, len(js.keys()), name),
                  flush=True)
            input = kaldi_io.read_mat(js[name]['input'][0]['feat'])  # TxD
            input = build_LFR_features(input, LFR_m, LFR_n)
            input = torch.from_numpy(input).float()
            input_length = torch.tensor([input.size(0)], dtype=torch.int)
            input = input.to(device)
            input_length = input_length.to(device)
            nbest_hyps = model.recognize(input, input_length, char_list, args)
            new_js[name] = add_results_to_json(js[name], nbest_hyps, char_list)

    with open(args.result_label, 'wb') as f:
        f.write(
            json.dumps({
                'utts': new_js
            }, indent=4, sort_keys=True).encode('utf_8'))
def main(args):
    """Construct data loaders, Seq2Seq model, and optimizer; run training.

    Unsupported ``args.optimizer`` values print a message and return early.
    """
    # data
    tr_dataset = AudioDataset(args.train_json, args.batch_size,
                              args.maxlen_in, args.maxlen_out)
    cv_dataset = AudioDataset(args.valid_json, args.batch_size,
                              args.maxlen_in, args.maxlen_out)
    tr_loader = AudioDataLoader(tr_dataset, batch_size=1,
                                num_workers=args.num_workers)
    cv_loader = AudioDataLoader(cv_dataset, batch_size=1,
                                num_workers=args.num_workers)
    # load dictionary and generate char_list, sos_id, eos_id
    char_list, sos_id, eos_id = process_dict(args.dict)
    vocab_size = len(char_list)
    data = {'tr_loader': tr_loader, 'cv_loader': cv_loader}
    # model
    encoder = Encoder(args.einput, args.ehidden, args.elayer,
                      dropout=args.edropout,
                      bidirectional=args.ebidirectional,
                      rnn_type=args.etype)
    decoder = Decoder(vocab_size, args.dembed, sos_id, eos_id,
                      args.dhidden, args.dlayer,
                      bidirectional_encoder=args.ebidirectional)
    model = Seq2Seq(encoder, decoder)
    print(model)
    model.cuda()
    # optimizer (fixes the original's 'optimizier' misspelling; also drops a
    # dead `ctc = 0` local that was never used)
    if args.optimizer == 'sgd':
        optimizer = torch.optim.SGD(model.parameters(), lr=args.lr,
                                    momentum=args.momentum,
                                    weight_decay=args.l2)
    elif args.optimizer == 'adam':
        optimizer = torch.optim.Adam(model.parameters(), lr=args.lr,
                                     weight_decay=args.l2)
    else:
        print("Not support optimizer")
        return
    # solver
    solver = Solver(data, model, optimizer, args)
    solver.train()
def __init__(self, traincfg):
    """Prepare recognition resources from a training configuration.

    Resolves the data paths relative to this file, wraps the trained
    encoder/decoder in a Transformer, loads the dictionary, and reads the
    recognition json into memory.
    """
    base = os.path.dirname(os.path.realpath(__file__))
    self.base_path = base
    self.recog_json = os.path.join(base, self.recog_json)
    self.dict_txt = os.path.join(base, self.dict_txt)

    # Construct the model from the already-trained encoder/decoder.
    self.model = Transformer(traincfg.encoder, traincfg.decoder)
    self.LFR_m = traincfg.LFR_m
    self.LFR_n = traincfg.LFR_n

    self.char_list, self.sos_id, self.eos_id = process_dict(self.dict_txt)
    # The dictionary must agree with the ids baked into the decoder.
    assert (self.model.decoder.sos_id == self.sos_id
            and self.model.decoder.eos_id == self.eos_id)

    # Read json data
    with open(self.recog_json, "rb") as f:
        self.js = json.load(f)['utts']
def recognize(args):
    """Decode utterances with a trained Transformer and write UTF-8 results.

    Reads ``args.recog_json`` as UTF-8 text, decodes every utterance on GPU,
    and writes the n-best hypotheses to ``args.result_label`` with
    ``ensure_ascii=False`` so non-ASCII tokens are stored verbatim.
    """
    model, LFR_m, LFR_n = Transformer.load_model(args.model_path, args)
    print(model)
    model.eval()
    model.cuda()
    char_list, sos_id, eos_id = process_dict(args.dict)
    # Sanity check: dictionary must match the one the model was trained with.
    assert model.decoder.sos_id == sos_id and model.decoder.eos_id == eos_id

    # read json data
    with open(args.recog_json, 'r', encoding='utf-8') as f:
        js = json.load(f)['utts']

    # decode each utterance
    new_js = {}
    with torch.no_grad():
        for idx, name in enumerate(js.keys(), 1):
            print('(%d/%d) decoding %s' % (idx, len(js.keys()), name),
                  flush=True)
            input = kaldi_io.read_mat(js[name]['input'][0]['feat'])  # TxD
            input = build_LFR_features(input, LFR_m, LFR_n)
            input = torch.from_numpy(input).float()
            input_length = torch.tensor([input.size(0)], dtype=torch.int)
            input = input.cuda()
            input_length = input_length.cuda()
            nbest_hyps = model.recognize(input, input_length, char_list, args)
            new_js[name] = add_results_to_json(js[name], nbest_hyps, char_list)

    with open(args.result_label, 'wb') as f:
        f.write(
            json.dumps({
                'utts': new_js
            }, indent=4, sort_keys=True, ensure_ascii=False).encode('utf_8'))
# Load training images, their textual descriptions, and per-class attributes,
# then wrap them in an iterator on the target device.
# NOTE(review): `device` and `input_dim` are module globals defined outside
# this chunk — confirm where they are set.
train_x, train_desc, train_att = getTrain_Img_Desc_Att(
    '../images/train', '../word_c10/train', '../att_per_classes.npy')
iter_ = data_iterator(train_x, train_desc, train_att, device)
#test visual features
val_x, val_pre_proto, val_pre_att_proto, val_x2label, val_proto2label = getTest_Img_Desc_Att_placeholder = getTest_Img_proto_labels(
    '../images/val', '../word_c10/val', '../att_per_classes.npy') if False else getTest_Img_proto_labels(
    '../images/val', '../word_c10/val', '../att_per_classes.npy')
#NOTE: Here Vocab starts from 1 where 1 being '<END>'
# Torch7 serialized vocab: keys are byte strings, hence the utf-8 decode below.
vocab = torchfile.load('../vocab_c10.t7', force_8bytes_long=True)
word2idx_vocab = {str(key, 'utf-8'): value for key, value in vocab.items()}
print('Vocabs Loaded: Total words in vocabulary including pad....{}'.format(
    len(word2idx_vocab)))
#Gives all words in glove's vocab to its glove embedding matrix (dictionary contains 0 as'<PAD>')
glove_dict = process_dict('../glove.6B.50d.txt', dim=input_dim)
print('Loading embedding layer....')
embed_layer = getEmbed_layer(word2idx=word2idx_vocab,
                             glove_dict=glove_dict,
                             dim=input_dim).to(device)
print('Loaded Embedding!!')
# NOTE(review): torch.FloatTensor(shape) allocates UNINITIALIZED memory; the
# trailing "# must initialize!" comment suggests an explicit init happens
# later in the file — confirm, otherwise these weights contain garbage.
w1 = Variable(torch.FloatTensor(512, 700).to(device), requires_grad=True)
b1 = Variable(torch.FloatTensor(700).to(device), requires_grad=True)
w2 = Variable(torch.FloatTensor(700, 1024).to(device), requires_grad=True)
b2 = Variable(torch.FloatTensor(1024).to(device), requires_grad=True)
w3 = Variable(torch.FloatTensor(312, 700).to(device), requires_grad=True)
b3 = Variable(torch.FloatTensor(700).to(device), requires_grad=True)
# must initialize!
def main(args):
    """Construct loaders, Transformer, DataParallel wrapper, and optimizer; train."""
    # data
    tr_dataset = AudioDataset(args.train_json, args.batch_size, args.maxlen_in,
                              args.maxlen_out, batch_frames=args.batch_frames)
    cv_dataset = AudioDataset(args.valid_json, args.batch_size, args.maxlen_in,
                              args.maxlen_out, batch_frames=args.batch_frames)
    tr_loader = AudioDataLoader(tr_dataset, batch_size=1,
                                num_workers=args.num_workers,
                                shuffle=args.shuffle,
                                LFR_m=args.LFR_m, LFR_n=args.LFR_n)
    cv_loader = AudioDataLoader(cv_dataset, batch_size=1,
                                num_workers=args.num_workers,
                                LFR_m=args.LFR_m, LFR_n=args.LFR_n)
    # load dictionary and generate char_list, sos_id, eos_id
    char_list, sos_id, eos_id = process_dict(args.dict)
    vocab_size = len(char_list)
    data = {'tr_loader': tr_loader, 'cv_loader': cv_loader}
    # model
    encoder = Encoder(args.d_input * args.LFR_m, args.n_layers_enc,
                      args.n_head, args.d_k, args.d_v,
                      args.d_model, args.d_inner,
                      dropout=args.dropout, pe_maxlen=args.pe_maxlen)
    decoder = Decoder(
        sos_id, eos_id, vocab_size,
        args.d_word_vec, args.n_layers_dec, args.n_head,
        args.d_k, args.d_v, args.d_model, args.d_inner,
        dropout=args.dropout,
        tgt_emb_prj_weight_sharing=args.tgt_emb_prj_weight_sharing,
        pe_maxlen=args.pe_maxlen)
    model = Transformer(encoder, decoder)
    print(model)
    model.cuda()
    # NOTE(review): device ids are hard-coded — assumes exactly 4 visible GPUs.
    model = torch.nn.DataParallel(model, device_ids=[0, 1, 2, 3])
    # optimizer (fixes the original's 'optimizier' misspelling)
    optimizer = TransformerOptimizer(
        torch.optim.Adam(model.parameters(), betas=(0.9, 0.98), eps=1e-09),
        args.k, args.d_model, args.warmup_steps)
    # solver
    solver = Solver(data, model, optimizer, args)
    solver.train()
# CLI: convert a decoding-result json into hyp/ref trn files for scoring.
parser.add_argument('json', type=str, help='json files')
parser.add_argument('dict', type=str, help='dict')
parser.add_argument('ref', type=str, help='ref')
parser.add_argument('hyp', type=str, help='hyp')
args = parser.parse_args()

# logging info
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s")

logging.info("reading %s", args.json)
with open(args.json, 'r') as f:
    j = json.load(f)

logging.info("reading %s", args.dict)
# process_dict returns the token list plus sos/eos ids; only char_list is
# used below to map token ids back to symbols.
char_list, sos_id, eos_id = process_dict(args.dict)
# with open(args.dict, 'r') as f:
#     dictionary = f.readlines()
# char_list = [unicode(entry.split(' ')[0], 'utf_8') for entry in dictionary]
# char_list.insert(0, '<blank>')
# char_list.append('<eos>')
# print([x.encode('utf-8') for x in char_list])

logging.info("writing hyp trn to %s", args.hyp)
logging.info("writing ref trn to %s", args.ref)
# NOTE(review): h and r are not closed in this chunk — presumably closed (or
# leaked) further down in the file; confirm, or wrap in `with` when editing.
h = open(args.hyp, 'w')
r = open(args.ref, 'w')
# Map each utterance's recognized token ids back to dictionary symbols.
# NOTE(review): the loop body continues beyond this chunk.
for x in j['utts']:
    seq = [char_list[int(i)] for i in j['utts'][x]
           ['output'][0]['rec_tokenid'].split()]
def main(args):
    """Construct loaders, a hybrid attention+CTC Seq2Seq model, and optimizer; train.

    Unsupported ``args.optimizer`` values print a message and return early.
    """
    # data
    tr_dataset = AudioDataset(args.train_json, args.batch_size,
                              args.maxlen_in, args.maxlen_out)
    cv_dataset = AudioDataset(args.valid_json, args.batch_size,
                              args.maxlen_in, args.maxlen_out)
    tr_loader = AudioDataLoader(tr_dataset, batch_size=1,
                                num_workers=args.num_workers,
                                LFR_m=args.LFR_m, LFR_n=args.LFR_n,
                                align_trun=args.align_trun)
    cv_loader = AudioDataLoader(cv_dataset, batch_size=1,
                                num_workers=args.num_workers,
                                LFR_m=args.LFR_m, LFR_n=args.LFR_n,
                                align_trun=args.align_trun)
    # load dictionary and generate char_list, sos_id, eos_id
    char_list, sos_id, eos_id = process_dict(args.dict)
    args.char_list = char_list
    vocab_size = len(char_list)
    data = {'tr_loader': tr_loader, 'cv_loader': cv_loader}
    # model
    encoder = Encoder(args.einput * args.LFR_m, args.ehidden, args.elayer,
                      vocab_size, dropout=args.edropout,
                      bidirectional=args.ebidirectional,
                      rnn_type=args.etype)
    decoder = Decoder(vocab_size, args.dembed, sos_id, eos_id,
                      args.dhidden, args.dlayer, args.offset, args.atype,
                      dropout=args.edropout,
                      bidirectional_encoder=args.ebidirectional)
    # A bidirectional encoder doubles the projected feature size fed to CTC.
    eprojs = args.ehidden * 2 if args.ebidirectional else args.ehidden
    ctc = CTC(odim=vocab_size, eprojs=eprojs, dropout_rate=args.edropout)
    model = Seq2Seq(encoder, decoder, ctc, args)
    print(model)
    model.cuda()
    # optimizer (fixes the original's 'optimizier' misspelling; also removes a
    # stray `ctc = 0` that pointlessly rebound the CTC module reference)
    if args.optimizer == 'sgd':
        optimizer = torch.optim.SGD(model.parameters(), lr=args.lr,
                                    momentum=args.momentum,
                                    weight_decay=args.l2)
    elif args.optimizer == 'adam':
        optimizer = torch.optim.Adam(model.parameters(), lr=args.lr,
                                     weight_decay=args.l2)
    else:
        print("Not support optimizer")
        return
    # solver
    solver = Solver(data, model, optimizer, args)
    solver.train()
def recognize(args):
    """Evaluate a trained Transformer on `args.count` random test utterances.

    Decodes features from the test AudioDataset, accumulates edit-distance
    statistics, prints summary metrics, and writes a per-example log file.
    """
    model, LFR_m, LFR_n = Transformer.load_model(args.model_path)
    print(model)
    model.eval()
    model.cuda()
    char_list, sos_id, eos_id = process_dict(args.dict)
    # Dictionary must match the one the model was trained with.
    assert model.decoder.sos_id == sos_id and model.decoder.eos_id == eos_id
    tr_dataset = AudioDataset('test', args.batch_size)
    path_list = tr_dataset.path_lst
    label_list = tr_dataset.han_lst
    num_data = tr_dataset.path_count
    # Random starting offset; examples are taken cyclically from here.
    ran_num = random.randint(0, num_data - 1)
    num = args.count
    words_num = 0
    word_error_num = 0
    seq_error = 0
    data = ''
    with torch.no_grad():
        for index in range(num):
            try:
                print('\nthe ', index + 1, 'th example.')
                data += 'the ' + str(index + 1) + 'th example.\n'
                # NOTE(review): the loop variable is rebound here to the
                # dataset index — intentional but easy to misread.
                index = (ran_num + index) % num_data
                standard_label = label_list[index]
                feature, label = get_fbank_and_hanzi_data(
                    index, args.feature_dim, char_list, path_list, label_list)
                # Skip over-long utterances.
                if len(feature) > 1600:
                    continue
                input = build_LFR_features(feature, args.LFR_m, args.LFR_n)
                input = torch.from_numpy(input).float()
                input_length = torch.tensor([input.size(0)], dtype=torch.int)
                # NOTE(review): only `input` is moved to GPU; input_length
                # stays on CPU — confirm model.recognize accepts that.
                input = input.cuda()
                nbest_hyps = model.recognize(input, input_length, char_list,
                                             args)
                # Strip <sos>/<eos> from the best hypothesis.
                pred_label = nbest_hyps[0]['yseq'][1:-1]
                pred_res = ''.join([char_list[index] for index in pred_label])
                # NOTE(review): prints token-id sequences, not text; the text
                # forms go into `data` below.
                print("stand:", label)
                print("pred :", pred_label)
                data += "stand:" + str(standard_label) + '\n'
                data += "pred :" + str(pred_res) + '\n'
                words_n = len(label)
                words_num += words_n
                # Edit distance, clamped to the reference length.
                word_distance = GetEditDistance(pred_label, label)
                if (word_distance <= words_n):
                    word_error_num += word_distance
                else:
                    word_error_num += words_n
                if pred_label != label:
                    seq_error += 1
            except ValueError:
                continue
    # NOTE(review): these formulas compute ACCURACY (1 - error rate), and the
    # labels look swapped: 'WER' is word accuracy, 'CER' is sentence accuracy.
    # Confirm the intended metric names before relying on this output.
    print('WER = ', (1 - word_error_num / words_num) * 100, '%')
    print('CER = ', (1 - seq_error / args.count) * 100, '%')
    data += 'WER = ' + str((1 - word_error_num / words_num) * 100) + '%'
    data += 'CER = ' + str((1 - seq_error / args.count) * 100) + '%'
    with open('../../model_log/pred/test_' + str(args.count) + '.txt',
              'w', encoding='utf-8') as f:
        f.writelines(data)
def main(args):
    """Construct datasets, loaders, Transformer, and optimizer; run training (OneFlow)."""
    # data
    tr_dataset = AudioDataset(
        args.train_json,
        args.batch_size,
        args.maxlen_in,
        args.maxlen_out,
        batch_frames=args.batch_frames,
    )
    cv_dataset = AudioDataset(
        args.valid_json,
        args.batch_size,
        args.maxlen_in,
        args.maxlen_out,
        batch_frames=args.batch_frames,
    )
    tr_loader = AudioDataLoader(
        tr_dataset,
        batch_size=1,
        num_workers=args.num_workers,
        shuffle=args.shuffle,
        LFR_m=args.LFR_m,
        LFR_n=args.LFR_n,
    )
    cv_loader = AudioDataLoader(
        cv_dataset,
        batch_size=1,
        num_workers=args.num_workers,
        LFR_m=args.LFR_m,
        LFR_n=args.LFR_n,
    )
    # load dictionary and generate char_list, sos_id, eos_id
    char_list, sos_id, eos_id = process_dict(args.dict)
    vocab_size = len(char_list)
    data = {"tr_loader": tr_loader, "cv_loader": cv_loader}
    # model
    encoder = Encoder(
        args.d_input * args.LFR_m,
        args.n_layers_enc,
        args.n_head,
        args.d_k,
        args.d_v,
        args.d_model,
        args.d_inner,
        dropout=args.dropout,
        pe_maxlen=args.pe_maxlen,
    )
    decoder = Decoder(
        sos_id,
        eos_id,
        vocab_size,
        args.d_word_vec,
        args.n_layers_dec,
        args.n_head,
        args.d_k,
        args.d_v,
        args.d_model,
        args.d_inner,
        dropout=args.dropout,
        tgt_emb_prj_weight_sharing=args.tgt_emb_prj_weight_sharing,
        pe_maxlen=args.pe_maxlen,
    )
    model = Transformer(encoder, decoder)
    device = flow.device("cuda")
    model.to(device)
    # optimizer (fixes the original's 'optimizier' misspelling)
    optimizer = TransformerOptimizer(
        flow.optim.Adam(model.parameters(), betas=(0.9, 0.98), eps=1e-09),
        args.k,
        args.d_model,
        args.warmup_steps,
        args.step_num,
    )
    # solver
    solver = Solver(data, model, optimizer, device, args)
    solver.train()
def __init__(self):
    """Resolve data paths, build datasets/loaders, model, and optimizer.

    All hyperparameters are read from class-level attributes (train_json,
    batch_size, d_model, ...); this constructor only wires them together
    and finishes by calling ``self._reset()``.
    """
    here = os.path.dirname(os.path.realpath(__file__))
    self.train_json = os.path.join(here, self.train_json)
    self.valid_json = os.path.join(here, self.valid_json)
    self.dict_txt = os.path.join(here, self.dict_txt)

    # Dictionary and derived vocabulary size.
    self.char_list, self.sos_id, self.eos_id = process_dict(self.dict_txt)
    self.vocab_size = len(self.char_list)

    # Datasets and loaders (LFR stacking handled by the loader).
    self.tr_dataset = AudioDataset(self.train_json, self.batch_size,
                                   self.maxlen_in, self.maxlen_out,
                                   batch_frames=self.batch_frames)
    self.cv_dataset = AudioDataset(self.valid_json, self.batch_size,
                                   self.maxlen_in, self.maxlen_out,
                                   batch_frames=self.batch_frames)
    self.tr_loader = AudioDataLoader(self.tr_dataset, batch_size=1,
                                     num_workers=self.num_workers,
                                     shuffle=self.shuffle,
                                     LFR_m=self.LFR_m, LFR_n=self.LFR_n)
    self.cv_loader = AudioDataLoader(self.cv_dataset, batch_size=1,
                                     num_workers=self.num_workers,
                                     LFR_m=self.LFR_m, LFR_n=self.LFR_n)
    self.data = {'tr_loader': self.tr_loader, 'cv_loader': self.cv_loader}

    # Model: Transformer encoder/decoder pair.
    self.encoder = Encoder(self.d_input * self.LFR_m, self.n_layers_enc,
                           self.n_head, self.d_k, self.d_v,
                           self.d_model, self.d_inner,
                           dropout=self.dropout, pe_maxlen=self.pe_maxlen)
    self.decoder = Decoder(
        self.sos_id, self.eos_id, self.vocab_size,
        self.d_word_vec, self.n_layers_dec, self.n_head,
        self.d_k, self.d_v, self.d_model, self.d_inner,
        dropout=self.dropout,
        tgt_emb_prj_weight_sharing=self.tgt_emb_prj_weight_sharing,
        pe_maxlen=self.pe_maxlen)

    # Per-epoch loss buffers (filled in during training).
    self.tr_loss = torch.Tensor(self.epochs)
    self.cv_loss = torch.Tensor(self.epochs)

    self.model = Transformer(self.encoder, self.decoder)
    self.optimizer = TransformerOptimizer(
        torch.optim.Adam(self.model.parameters(),
                         betas=(0.9, 0.98), eps=1e-09),
        self.k, self.d_model, self.warmup_steps)
    self._reset()