def predict_one(img_path, predict_func, idx):
    img = misc.imread(img_path, 'L')
    if img.shape[0] != cfg.input_height:
        if cfg.input_width is not None:
            img = cv2.resize(img, (cfg.input_width, cfg.input_height))
        else:
            scale = cfg.input_height / img.shape[0]
            img = cv2.resize(img, None, fx=scale, fy=scale)
    seqlen = img.shape[1]
    img = np.expand_dims(np.expand_dims(img, axis=2), axis=0)
    logits = predict_func([img, [seqlen]])[0][0]
    pytorch_logits = torch.from_numpy(np.transpose(logits))
    # copy the label list so repeated calls do not keep appending to
    # cfg.dictionary (which would shift blank_index on every call)
    labels = list(cfg.dictionary)
    labels.append('çİş')
    decoder = BeamCTCDecoder(''.join(labels),
                             space_index=labels.index(' '),
                             blank_index=len(labels) - 1,
                             lm_path='language_model/bigram.klm',
                             dict_path='language_model/a')
    strings, offsets, conf, char_probs = decoder.decode(pytorch_logits)
    result = strings[0][0]
    # if idx is None:
    #     logger.info(img_path)
    #     logger.info(result)
    # else:
    #     logger.info(str(idx) + ": " + img_path)
    #     logger.info(str(idx) + ": " + result)
    print(result)
    return result
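A minimal sketch of how predict_one might be driven over a directory of images; the glob pattern, sorting, and the predict_dir name are illustrative assumptions, not part of the snippet above.

import glob

def predict_dir(dir_path, predict_func):
    # run predict_one over every image in a directory, keyed by path
    results = {}
    for idx, img_path in enumerate(sorted(glob.glob(dir_path + '/*.png'))):
        results[img_path] = predict_one(img_path, predict_func, idx)
    return results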
def decode_dataset(logits, test_dataset, batch_size, lm_alpha, lm_beta, mesh_x, mesh_y, labels, grid_index):
    print("Beginning decode for {}, {}".format(lm_alpha, lm_beta))
    test_loader = AudioDataLoader(test_dataset, batch_size=batch_size, num_workers=0)
    target_decoder = GreedyDecoder(labels, blank_index=labels.index('_'))
    decoder = BeamCTCDecoder(labels, beam_width=args.beam_width, cutoff_top_n=args.cutoff_top_n,
                             blank_index=labels.index('_'), lm_path=args.lm_path,
                             alpha=lm_alpha, beta=lm_beta, num_processes=1)
    total_cer, total_wer = 0, 0
    for i, data in enumerate(test_loader):
        inputs, targets, input_percentages, target_sizes = data
        # unflatten targets
        split_targets = []
        offset = 0
        for size in target_sizes:
            split_targets.append(targets[offset:offset + size])
            offset += size
        out = torch.from_numpy(logits[i][0])
        sizes = torch.from_numpy(logits[i][1])
        decoded_output, _ = decoder.decode(out, sizes)
        target_strings = target_decoder.convert_to_strings(split_targets)
        wer, cer = 0, 0
        for x in range(len(target_strings)):
            transcript, reference = decoded_output[x][0], target_strings[x][0]
            wer_inst = decoder.wer(transcript, reference) / float(len(reference.split()))
            cer_inst = decoder.cer(transcript, reference) / float(len(reference))
            wer += wer_inst
            cer += cer_inst
        total_cer += cer
        total_wer += wer
    wer = total_wer / len(test_loader.dataset)
    cer = total_cer / len(test_loader.dataset)
    return [grid_index, mesh_x, mesh_y, lm_alpha, lm_beta, wer, cer]
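A worker with this signature is typically dispatched over a mesh of (alpha, beta) pairs to tune the language-model weights. A minimal sketch of such a driver, assuming a fork-based multiprocessing Pool and that the global args used inside decode_dataset are inherited by the workers; run_grid_search is an illustrative name, not from the snippet above.

import multiprocessing

def run_grid_search(logits, test_dataset, batch_size, labels, alphas, betas):
    # build one job per (alpha, beta) point on the mesh
    jobs = []
    grid_index = 0
    for x, alpha in enumerate(alphas):
        for y, beta in enumerate(betas):
            jobs.append((logits, test_dataset, batch_size, alpha, beta, x, y, labels, grid_index))
            grid_index += 1
    with multiprocessing.Pool(processes=4) as pool:
        results = pool.starmap(decode_dataset, jobs)
    # each row is [grid_index, mesh_x, mesh_y, alpha, beta, wer, cer];
    # pick the point with the lowest WER
    return min(results, key=lambda r: r[5])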
input_sizes = input_percentages.mul_(int(inputs.size(3))).int()
inputs = inputs.to(device)
# unflatten targets
split_targets = []
offset = 0
for size in target_sizes:
    split_targets.append(targets[offset:offset + size])
    offset += size

out, output_sizes = model(inputs, input_sizes)

if args.save_output:
    # add output to data array, and continue
    output_data.append((out.cpu().numpy(), output_sizes.numpy()))

decoded_output, _ = decoder.decode(out, output_sizes)
target_strings = target_decoder.convert_to_strings(split_targets)
for x in range(len(target_strings)):
    total += 1
    transcript, reference = decoded_output[x][0], target_strings[x][0]
    wer_inst = decoder.wer(transcript, reference)
    cer_inst = decoder.cer(transcript, reference)
    total_wer += wer_inst
    total_cer += cer_inst
    num_tokens += len(reference.split())
    num_chars += len(reference)
    if args.verbose:
        print("Filename: ", filenames[x])
        print("Ref:", reference.lower())
        print("Hyp: \"" + transcript.lower() + "\"")
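Accumulation loops like the one above usually end by normalizing the summed edit distances by the total token and character counts. A sketch of that summary step, assuming the total_wer/total_cer/num_tokens/num_chars accumulators from the snippet; the exact print format is an assumption.

wer = float(total_wer) / num_tokens
cer = float(total_cer) / num_chars
print('Test Summary \tAverage WER {wer:.3f}\tAverage CER {cer:.3f}'.format(
    wer=wer * 100, cer=cer * 100))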
if __name__ == '__main__':
    model = DeepSpeech.load_model(args.model_path, cuda=args.cuda)
    model.eval()

    labels = DeepSpeech.get_labels(model)
    audio_conf = DeepSpeech.get_audio_conf(model)

    if args.decoder == "beam":
        from decoder import BeamCTCDecoder
        decoder = BeamCTCDecoder(labels, lm_path=args.lm_path, alpha=args.alpha, beta=args.beta,
                                 cutoff_top_n=args.cutoff_top_n, cutoff_prob=args.cutoff_prob,
                                 beam_width=args.beam_width, num_processes=args.lm_workers)
    else:
        decoder = GreedyDecoder(labels, blank_index=labels.index('_'))

    parser = SpectrogramParser(audio_conf, normalize=True)

    spect = parser.parse_audio(args.audio_path).contiguous()
    spect = spect.view(1, 1, spect.size(0), spect.size(1))
    # volatile=True is the legacy (pre-PyTorch-0.4) way to disable autograd for inference
    out = model(Variable(spect, volatile=True))
    out = out.transpose(0, 1)  # TxNxH
    decoded_output, decoded_offsets = decoder.decode(out.data)
    print(json.dumps(decode_results(decoded_output, decoded_offsets)))
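decode_results is called here but not shown in these snippets. A hypothetical minimal version, assuming it only packages the best beam per utterance into a JSON-serializable dict; the field names are illustrative, not the repo's actual schema.

def decode_results(decoded_output, decoded_offsets):
    # hypothetical helper: take the top hypothesis for each utterance
    return {
        'output': [
            {'transcription': decoded_output[b][0]}
            for b in range(len(decoded_output))
        ]
    }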
if args.decoder == "beam": decoder = BeamCTCDecoder(labels, beam_width=args.beam_width, top_paths=1, space_index=labels.index(' '), blank_index=labels.index('_'), lm_path=args.lm_path, trie_path=args.trie_path, lm_alpha=args.lm_alpha, lm_beta1=args.lm_beta1, lm_beta2=args.lm_beta2) else: decoder = GreedyDecoder(labels, space_index=labels.index(' '), blank_index=labels.index('_')) parser = SpectrogramParser(audio_conf, normalize=True) t0 = time.time() spect = parser.parse_audio(args.audio_path).contiguous() spect = spect.view(1, 1, spect.size(0), spect.size(1)) out = model(Variable(spect, volatile=True)) out = out.transpose(0, 1) # TxNxH decoded_output = decoder.decode(out.data) t1 = time.time() print(decoded_output[0]) print("Decoded {0:.2f} seconds of audio in {1:.2f} seconds".format( spect.size(3) * audio_conf['window_stride'], t1 - t0), file=sys.stderr)
inputs = inputs.cuda()
target_batch = target_batch.cuda()
out = model(inputs, target_batch)
out = out.transpose(0, 1)  # TxNxH
seq_length = out.size(0)
sizes = input_percentages.mul_(int(seq_length)).int()
# print " seq_length ", seq_length, "sizes ", sizes

if decoder is None:
    # add output to data array, and continue
    output_data.append((out.data.cpu().numpy(), sizes.numpy()))
    continue

if args.decoder == "beam":
    decoded_output, offsets, char_probs, scores, seq_lens = decoder.decode(out.data, sizes)
else:
    decoded_output, _ = decoder.decode(out.data, sizes)
target_strings = target_decoder.convert_to_strings(split_targets)
wer, cer = 0, 0
for x in range(len(target_strings)):
    transcript, reference = decoded_output[x][0], target_strings[x][0]
    if args.save_path or args.eval:
        ref_file.write(reference.encode('utf-8') + "(" + audio_ids[x] + ")\n")
        trans_file.write(transcript.encode('utf-8') + "(" + audio_ids[x] + ")\n")
        cp += 1
    wer_inst = 0
    if args.eval == 'concept':
        # Concept error rate evaluation
        new_ref = convert_to_NE(reference)
        new_hyp = convert_to_NE(transcript)
        spect, input_sizes = transcribe(audio_path=path, use_half=args.half)
        if length < input_sizes.item():
            length = input_sizes.item()
        a_.append([spect, input_sizes, path])
    except Exception:
        # skip files that fail to load or parse; a bare except would also
        # swallow KeyboardInterrupt
        continue

# sort by length (longest first), then right-pad every spectrogram to the
# longest one so the batch can be stacked
a_ = sorted(a_, key=lambda x: x[1].item(), reverse=True)
input_sizes = torch.tensor([i[1] for i in a_])
batch_temp = [i[2] for i in a_]
spect = torch.stack([
    F.pad(input=i[0], pad=(0, length - i[0].shape[-1]), mode='constant', value=0)
    for i in a_
])
out, output_sizes = model(spect, input_sizes)
decoded_output, scores = decoder.decode(out, output_sizes)
for m in range(out.shape[0]):
    save_(batch_temp[m], decoded_output[m][0])
# print('done')
print("finished")
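The snippet above right-pads every spectrogram in the batch to the longest time dimension before stacking. A self-contained standalone version of that collate step, with made-up tensor sizes; pad_and_stack is an illustrative name.

import torch
import torch.nn.functional as F

def pad_and_stack(spects):
    # spects: list of (freq, time) tensors with varying time dimension
    max_len = max(s.shape[-1] for s in spects)
    # F.pad with a 2-tuple pads the last dimension: (left, right)
    return torch.stack([
        F.pad(s, (0, max_len - s.shape[-1]), mode='constant', value=0)
        for s in spects
    ])

batch = pad_and_stack([torch.randn(161, 120), torch.randn(161, 95)])
print(batch.shape)  # torch.Size([2, 161, 120])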
input_sizes = input_percentages.mul_(int(inputs.size(3))).int()
inputs = inputs.to(device)
# unflatten targets
split_targets = []
offset = 0
for size in target_sizes:
    split_targets.append(targets[offset:offset + size])
    offset += size

out, output_sizes = model(inputs, input_sizes)

if args.save_output:
    # add output to data array, and continue
    output_data.append((out.cpu().numpy(), output_sizes.numpy()))

decoded_output, _ = decoder.decode(out, output_sizes, args.rescore)
# decoded_output, _ = decoder.decode(out, output_sizes)
target_strings = target_decoder.convert_to_strings(split_targets)
target_strings_copy = target_decoder.convert_to_strings(split_targets)
for x in range(len(target_strings)):
    print("1st pass decoding :" + str(x) + "/" + str(len(target_strings)))
    transcript, reference = decoded_output[x][0], target_strings[x][0]
    # if args.auto_correct:
    greedy_output = "/speech/data_to_decode.txt"
    with open(greedy_output, 'a') as f:
        if transcript != "":
            f.write(decoded_output[x][0] + "\n")
offset = 0
for size in target_sizes:
    split_targets.append(targets[offset:offset + size])
    offset += size

if args.cuda:
    inputs = inputs.cuda()

out, output_sizes = model(inputs, input_sizes)

if decoder is None:
    # add output to data array, and continue
    output_data.append((out.numpy(), output_sizes.numpy()))
    continue

decoded_output, _ = decoder.decode(out.data, output_sizes.data)
target_strings = target_decoder.convert_to_strings(split_targets)
for x in range(len(target_strings)):
    transcript, reference = decoded_output[x][0], target_strings[x][0]
    wer_inst = decoder.wer(transcript, reference)
    cer_inst = decoder.cer(transcript, reference)
    total_wer += wer_inst
    total_cer += cer_inst
    num_tokens += len(reference.split())
    num_chars += len(reference)
    if args.verbose:
        print("Ref:", reference.lower())
        print("Hyp:", transcript.lower())
        print("WER:", float(wer_inst) / len(reference.split()),
              "CER:", float(cer_inst) / len(reference), "\n")

if decoder is not None:
class create_model(nn.Module):
    def __init__(self, args):
        super(create_model, self).__init__()
        self.args = args
        self.model = SpeechNet(args)
        self.model.to(args.device)
        self.criterion = nn.CTCLoss()
        self.decoder = BeamCTCDecoder(PHONEME_MAP, blank_index=0, beam_width=args.beam_width)
        self.state_names = ['loss', 'edit_dist', 'lr']

    def train_setup(self):
        self.lr = self.args.lr
        self.optimizer = torch.optim.Adam(self.model.parameters(), lr=self.args.lr,
                                          weight_decay=self.args.weight_decay)
        if self.args.use_step_schedule:
            self.scheduler = MultiStepLR(self.optimizer, milestones=self.args.decay_steps,
                                         gamma=self.args.lr_gamma)
        elif self.args.use_reduce_schedule:
            self.scheduler = ReduceLROnPlateau(self.optimizer, mode='min', factor=0.5, patience=1)
        else:
            self.scheduler = ParamScheduler(self.optimizer, scale_cos,
                                            self.args.num_epochs * self.args.loader_length)
        # self.model.apply(weights_init)
        self.model.train()

    def optimize_parameters(self, input, input_lens, target, target_lens):
        input, target = input.to(self.args.device), target.to(self.args.device)
        output, output_lens, self.loss = self.forward(input, input_lens, target, target_lens)
        self.optimizer.zero_grad()
        self.loss.backward()
        self.optimizer.step()
        self.edit_dist = self.get_edit_dist(output, output_lens, target, target_lens)
        del input, target, input_lens, target_lens, output, output_lens

    def update_learning_rate(self, dist=None):
        if self.args.use_reduce_schedule:
            self.scheduler.step(dist)
        else:
            self.scheduler.step()
        self.lr = self.optimizer.param_groups[0]['lr']

    def get_current_states(self):
        errors_ret = OrderedDict()
        for name in self.state_names:
            if isinstance(name, str):
                # float(...) works for both scalar tensor and float number
                errors_ret[name] = float(getattr(self, name))
        return errors_ret

    def get_edit_dist(self, output, output_lens, target, target_lens):
        output, target = output.cpu(), target.cpu()
        phoneme_preds = self.decoder.decode(output, output_lens)
        phonemes = self.decoder.convert_to_strings(target, target_lens)
        edit_dist = np.sum([self.decoder.Lev_dist(phoneme_pred, phoneme)
                            for (phoneme_pred, phoneme) in zip(phoneme_preds, phonemes)])
        return edit_dist

    def forward(self, input, input_lens, target=None, target_lens=None, is_training=True):
        output, output_lens = self.model(input, input_lens)
        if is_training:
            # The official documentation is your best friend: https://pytorch.org/docs/stable/nn.html#ctcloss
            # nn.CTCLoss takes 4 arguments to compute the loss:
            # [log_probs]: Prediction of your model at each time step. Shape: (seq_len, batch_size, vocab_size).
            #   Values must be log probabilities. Neither probabilities nor logits will work.
            #   Make sure the output of your network is log probabilities, by adding a nn.LogSoftmax after the last layer.
            # [targets]: The ground truth sequences. Shape: (batch_size, seq_len).
            #   Values are indices of phonemes. Again, remember that index 0 is reserved for "blank".
            # [input_lengths]: Lengths of sequences in log_probs. Shape: (batch_size,).
            #   This is not necessarily the same as lengths of input of the model.
            # [target_lengths]: Lengths of sequences in targets. Shape: (batch_size,).
            loss = self.criterion(output.permute(1, 0, 2), target, input_lens, target_lens)
            return output, output_lens, loss
        else:
            return output, output_lens

    def train(self):
        try:
            self.model.train()
        except AttributeError:
            print('train() cannot be implemented as model does not exist.')

    def eval(self):
        try:
            self.model.eval()
        except AttributeError:
            print('eval() cannot be implemented as model does not exist.')

    def load_model(self, model_path):
        self.model.load_state_dict(torch.load(model_path))

    def save_model(self, which_epoch):
        save_filename = '%s_net.pth' % which_epoch
        save_path = os.path.join(self.args.expr_dir, save_filename)
        if torch.cuda.is_available():
            try:
                # a DataParallel wrapper keeps the real model in .module
                torch.save(self.model.module.cpu().state_dict(), save_path)
            except AttributeError:
                torch.save(self.model.cpu().state_dict(), save_path)
        else:
            torch.save(self.model.cpu().state_dict(), save_path)
        self.model.to(self.args.device)
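A self-contained sketch of the nn.CTCLoss call described in the comment block inside forward above; the tensor sizes here are made up for illustration, and blank is index 0 as in the class.

import torch
import torch.nn as nn

T, N, C = 100, 4, 42  # time steps, batch size, vocab size (index 0 = blank)
# (seq_len, batch, vocab) log probabilities, e.g. via nn.LogSoftmax on the last layer
log_probs = nn.LogSoftmax(dim=-1)(torch.randn(T, N, C))
# (batch, seq_len) target indices; blank (0) never appears in targets
targets = torch.randint(1, C, (N, 30), dtype=torch.long)
input_lengths = torch.full((N,), T, dtype=torch.long)       # lengths of log_probs sequences
target_lengths = torch.randint(10, 30, (N,), dtype=torch.long)  # lengths of target sequences

ctc = nn.CTCLoss(blank=0)
loss = ctc(log_probs, targets, input_lengths, target_lengths)
print(loss.item())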
def decode_dataset(logits, test_dataset, batch_size, lm_alpha, lm_beta, mesh_x, mesh_y, index, labels, eval):
    print("Beginning decode for {}, {}".format(lm_alpha, lm_beta))
    test_loader = AudioDataLoader(test_dataset, batch_size=batch_size, num_workers=0)
    target_decoder = GreedyDecoder(labels, blank_index=labels.index('_'))
    decoder = BeamCTCDecoder(labels, beam_width=args.beam_width, cutoff_top_n=args.cutoff_top_n,
                             blank_index=labels.index('_'), lm_path=args.lm_path,
                             alpha=lm_alpha, beta=lm_beta, num_processes=1)
    model_name = re.sub('.json.pth.tar', '', os.path.basename(args.model_path))
    ref_file, trans_file = None, None
    if eval == 'concept':
        eval_dir = "%s/%s/%s" % (os.path.dirname(args.output_path), model_name, index)
        if not os.path.exists(eval_dir):
            os.makedirs(eval_dir)
        ref_file = open("%s/%s_reference.txt" % (
            eval_dir, re.sub('.csv', '', os.path.basename(args.test_manifest))), 'w')
        trans_file = open("%s/%s_transcription.txt" % (
            eval_dir, re.sub('.csv', '', os.path.basename(args.test_manifest))), 'w')
    total_cer, total_wer = 0, 0
    for i, data in enumerate(test_loader):
        inputs, targets, input_percentages, target_sizes, audio_ids = data
        # unflatten targets
        split_targets = []
        offset = 0
        for size in target_sizes:
            split_targets.append(targets[offset:offset + size])
            offset += size
        out = torch.from_numpy(logits[i][0])
        sizes = torch.from_numpy(logits[i][1])
        decoded_output, _, _, _, _ = decoder.decode(out, sizes)
        target_strings = target_decoder.convert_to_strings(split_targets)
        wer, cer = 0, 0
        for x in range(len(target_strings)):
            transcript, reference = decoded_output[x][0], target_strings[x][0]
            if eval == 'concept':
                ref_file.write(reference.encode('utf-8') + "(" + audio_ids[x] + ")\n")
                trans_file.write(transcript.encode('utf-8') + "(" + audio_ids[x] + ")\n")
            wer_inst = decoder.wer(transcript, reference) / float(len(reference.split()))
            cer_inst = decoder.cer(transcript, reference) / float(len(reference))
            wer += wer_inst
            cer += cer_inst
        total_cer += cer
        total_wer += wer
    # only close the files that were opened for concept evaluation;
    # the original closed them unconditionally, crashing when eval != 'concept'
    if ref_file is not None:
        ref_file.close()
        trans_file.close()
    wer = total_wer / len(test_loader.dataset)
    cer = total_cer / len(test_loader.dataset)
    if eval == 'concept':
        # Concept error rate evaluation
        cmd = "perl /lium/buster1/ghannay/deepSpeech2/deepspeech.pytorch/data/eval.sclit_cer.pl %s" % eval_dir
        print("cmd ", cmd)
        p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True)
        coner, error = p.communicate()
        print(" coner ", coner)
        return [mesh_x, mesh_y, lm_alpha, lm_beta, float(coner) / 100, cer]
    else:
        return [mesh_x, mesh_y, lm_alpha, lm_beta, wer, cer]
if args.cuda:
    model.cuda()
model.eval()

labels = DeepSpeech.get_labels(model)
audio_conf = DeepSpeech.get_audio_conf(model)

if args.decoder == "beam":
    from decoder import BeamCTCDecoder
    decoder = BeamCTCDecoder(labels, lm_path=args.lm_path, alpha=args.alpha, beta=args.beta,
                             cutoff_top_n=args.cutoff_top_n, cutoff_prob=args.cutoff_prob,
                             beam_width=args.beam_width, num_processes=args.lm_workers)
else:
    decoder = GreedyDecoder(labels, blank_index=labels.index('_'))

parser = SpectrogramParser(audio_conf, normalize=True)

spect = parser.parse_audio(args.audio_path).contiguous()
spect = spect.view(1, 1, spect.size(0), spect.size(1))
input_sizes = torch.IntTensor([spect.size(3)]).int()
out, output_sizes = model(spect, input_sizes)
out = out.transpose(0, 1)  # TxNxH
decoded_output, decoded_offsets = decoder.decode(out, output_sizes)
print(json.dumps(decode_results(model, decoded_output, decoded_offsets)))
if args.decoder == "beam":
    from decoder import BeamCTCDecoder
    decoder = BeamCTCDecoder(labels, beam_width=args.beam_width, top_paths=args.top_paths,
                             space_index=labels.index(' '), blank_index=labels.index('_'),
                             lm_path=args.lm_path, trie_path=args.trie_path,
                             lm_alpha=args.lm_alpha, lm_beta=args.lm_beta,
                             label_size=args.label_size, label_margin=args.label_margin)
else:
    decoder = GreedyDecoder(labels, space_index=labels.index(' '),
                            blank_index=labels.index('_'))

parser = SpectrogramParser(audio_conf, normalize=True)

spect = parser.parse_audio(args.audio_path).contiguous()
spect = spect.view(1, 1, spect.size(0), spect.size(1))
out = model(Variable(spect, volatile=True))
out = out.transpose(0, 1)  # TxNxH
decoded_output, decoded_offsets, confs, char_probs = decoder.decode(out.data)
for pi in range(args.top_paths):
    print(decoded_output[pi][0])
    if args.offsets:
        print(decoded_offsets[pi][0])
# unflatten targets
split_targets = []
offset = 0
for size in target_sizes:
    split_targets.append(targets[offset:offset + size])
    offset += size

if args.cuda:
    inputs = inputs.cuda()

out = model(inputs)
out = out.transpose(0, 1)  # TxNxH
seq_length = out.size(0)
sizes = input_percentages.mul_(int(seq_length)).int()

decoded_output = decoder.decode(out.data, sizes)
target_strings = decoder.process_strings(decoder.convert_to_strings(split_targets))
wer, cer = 0, 0
for x in range(len(target_strings)):
    wer += decoder.wer(decoded_output[x], target_strings[x]) / float(len(target_strings[x].split()))
    cer += decoder.cer(decoded_output[x], target_strings[x]) / float(len(target_strings[x]))
total_cer += cer
total_wer += wer

wer = total_wer / len(test_loader.dataset)
cer = total_cer / len(test_loader.dataset)

print('Test Summary \t'
# unflatten targets
split_targets = []
offset = 0
for size in target_sizes:
    split_targets.append(targets[offset:offset + size])
    offset += size

inputs = inputs.to(device)
out, output_sizes = model(inputs, input_sizes)

if decoder is None:
    # add output to data array, and continue
    output_data.append((out.numpy(), output_sizes.numpy()))
    continue

decoded_output, _ = decoder.decode(out.data, output_sizes.data)
target_strings = target_decoder.convert_to_strings(split_targets)
for x in range(len(target_strings)):
    transcript, reference = decoded_output[x][0], target_strings[x][0]
    wer_inst = decoder.wer(transcript, reference)
    cer_inst = decoder.cer(transcript, reference)
    total_wer += wer_inst
    total_cer += cer_inst
    num_tokens += len(reference.split())
    num_chars += len(reference)
    if args.verbose:
        print("Ref:", reference.lower())
        print("Hyp:", transcript.lower())
        print("WER:", float(wer_inst) / len(reference.split()),
              "CER:", float(cer_inst) / len(reference), "\n")
def decode_dataset(logits, test_dataset, batch_size, lm_alpha, lm_beta, mesh_x, mesh_y, labels):
    print("Beginning decode for {}, {}".format(lm_alpha, lm_beta))
    test_loader = FeatLoader(test_dataset, batch_size=batch_size, num_workers=0)
    target_decoder = GreedyDecoder(labels, blank_index=labels.index('_'))
    decoder = BeamCTCDecoder(labels, beam_width=args.beam_width, cutoff_top_n=args.cutoff_top_n,
                             blank_index=labels.index('_'), lm_path=args.lm_path,
                             alpha=lm_alpha, beta=lm_beta, num_processes=1)
    total_cer, total_wer = 0, 0
    # decoding_log = []
    for i, data in enumerate(test_loader):
        inputs, targets, input_percentages, target_sizes = data
        # unflatten targets
        split_targets = []
        offset = 0
        for size in target_sizes:
            split_targets.append(targets[offset:offset + size])
            offset += size
        out = torch.from_numpy(logits[i][0])
        sizes = torch.from_numpy(logits[i][1])
        decoded_output, _ = decoder.decode(out, sizes)
        target_strings = target_decoder.convert_to_strings(split_targets)
        wer, cer = 0, 0
        for x in range(len(target_strings)):
            transcript, reference = decoded_output[x][0], target_strings[x][0]
            wer_inst = decoder.wer(transcript, reference) / float(len(reference.split()))
            cer_inst = decoder.cer(transcript, reference) / float(len(reference))
            wer += wer_inst
            cer += cer_inst
            # ver1: write result to logFile -- can't do this because the
            # multiprocessing workers cannot share the file handle
            # logFile.write('decoding : ' + transcript)
            # logFile.write('reference : ' + reference)
            # logFile.write('WER = ' + str(wer_inst) + ', CER = ' + str(cer_inst))
            if random.uniform(0, 1) < float(args.detail_log_print_prob):
                print('decoding : ' + transcript)
                print('reference : ' + reference)
                print('WER = ' + str(wer_inst) + ', CER = ' + str(cer_inst))
                print(' ')
            # ver1 (alternative): collect results in a list
            # decoding_log_sample = []
            # decoding_log_sample.append(transcript)
            # decoding_log_sample.append(reference)
            # decoding_log.append(decoding_log_sample)
            # ver2: thread safe but does not write anything to file
            # logging.info('decoding : ' + transcript)
            # logging.info('reference : ' + reference)
            # logging.info('WER = ' + str(wer_inst) + ', CER = ' + str(cer_inst))
            # logging.info(' ')
            # ver3
            logger.error('decoding : ' + transcript)
            logger.error('reference : ' + reference)
            logger.error('WER = ' + str(wer_inst) + ', CER = ' + str(cer_inst))
            logger.error(' ')
        total_cer += cer
        total_wer += wer
    wer = total_wer / len(test_loader.dataset)
    cer = total_cer / len(test_loader.dataset)
    return [mesh_x, mesh_y, lm_alpha, lm_beta, wer, cer]
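The ver1/ver2/ver3 comments above document a struggle to log from multiprocessing workers. A minimal sketch of a per-process file logger that sidesteps the shared-handle problem; the function name, log directory, and format are assumptions, not from the snippet.

import logging
import os

def get_worker_logger(log_dir='logs'):
    # one log file per worker process, configured once per process
    os.makedirs(log_dir, exist_ok=True)
    logger = logging.getLogger('decode_worker_%d' % os.getpid())
    if not logger.handlers:
        handler = logging.FileHandler(os.path.join(log_dir, 'worker_%d.log' % os.getpid()))
        handler.setFormatter(logging.Formatter('%(asctime)s %(message)s'))
        logger.addHandler(handler)
        logger.setLevel(logging.INFO)
    return logger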