def predict_one(img_path, predict_func, idx):
    img = misc.imread(img_path, 'L')
    if img.shape[0] != cfg.input_height:
        if cfg.input_width is not None:
            img = cv2.resize(img, (cfg.input_width, cfg.input_height))
        else:
            scale = cfg.input_height / img.shape[0]
            img = cv2.resize(img, None, fx=scale, fy=scale)
    seqlen = img.shape[1]
    img = np.expand_dims(np.expand_dims(img, axis=2), axis=0)
    logits = predict_func([img, [seqlen]])[0][0]
    pytorch_logits = torch.from_numpy(np.transpose(logits))
    labels = cfg.dictionary
    labels.append('空')
    decoder = BeamCTCDecoder(''.join(labels),
                             space_index=labels.index(' '),
                             blank_index=len(labels) - 1,
                             lm_path='language_model/bigram.klm',
                             dict_path='language_model/a')
    strings, offsets, conf, char_probs = decoder.decode(pytorch_logits)
    # if idx == None:
    #     logger.info(img_path)
    #     logger.info(result)
    # else:
    #     logger.info(str(idx) + ": " + img_path)
    #     logger.info(str(idx) + ": " + result)
    result = strings[0][0]
    print(result)
    return result
def main():
    import argparse
    global model, spect_parser, decoder, args
    parser = argparse.ArgumentParser(description='DeepSpeech transcription server')
    parser.add_argument('--host', type=str, default='0.0.0.0', help='Host to be used by the server')
    parser.add_argument('--port', type=int, default=8888, help='Port to be used by the server')
    parser = add_inference_args(parser)
    parser = add_decoder_args(parser)
    args = parser.parse_args()
    logging.getLogger().setLevel(logging.DEBUG)
    logging.info('Setting up server...')
    torch.set_grad_enabled(False)
    model = DeepSpeech.load_model(args.model_path)
    if args.cuda:
        model.cuda()
    model.eval()
    labels = DeepSpeech.get_labels(model)
    audio_conf = DeepSpeech.get_audio_conf(model)
    if args.decoder == "beam":
        from decoder import BeamCTCDecoder
        decoder = BeamCTCDecoder(labels, lm_path=args.lm_path, alpha=args.alpha, beta=args.beta,
                                 cutoff_top_n=args.cutoff_top_n, cutoff_prob=args.cutoff_prob,
                                 beam_width=args.beam_width, num_processes=args.lm_workers)
    else:
        decoder = GreedyDecoder(labels, blank_index=labels.index('_'))
    spect_parser = SpectrogramParser(audio_conf, normalize=True)
    logging.info('Server initialised')
    app.run(host=args.host, port=args.port, debug=True, use_reloader=False)
def init(beam_width, blank_index, lm_path):
    global decoder
    decoder = BeamCTCDecoder(model.labels, lm_path=lm_path, beam_width=beam_width,
                             num_processes=args.lm_workers, blank_index=blank_index)
def model_setup(args=None):
    test_dataset = data.MASRDataset(args.test_index_path, args.labels_path, args.mode, config=args)
    dataloader = data.MASRDataLoader(test_dataset, batch_size=args.batch_size, num_workers=args.num_workers)
    model = GatedConv.load(args.pretrained_path)
    global decoder
    decoder = BeamCTCDecoder(
        dataloader.dataset.labels_str,
        alpha=0.8,
        beta=0.3,
        lm_path="/root/lm/zh_giga.no_cna_cmn.prune01244.klm",
        cutoff_top_n=40,
        cutoff_prob=1.0,
        beam_width=100,
        num_processes=args.num_workers,
        blank_index=0,
    )
    return model, dataloader
def decode_dataset(logits, test_dataset, batch_size, lm_alpha, lm_beta, mesh_x, mesh_y, labels, grid_index):
    print("Beginning decode for {}, {}".format(lm_alpha, lm_beta))
    test_loader = AudioDataLoader(test_dataset, batch_size=batch_size, num_workers=0)
    target_decoder = GreedyDecoder(labels, blank_index=labels.index('_'))
    decoder = BeamCTCDecoder(labels, beam_width=args.beam_width, cutoff_top_n=args.cutoff_top_n,
                             blank_index=labels.index('_'), lm_path=args.lm_path,
                             alpha=lm_alpha, beta=lm_beta, num_processes=1)
    total_cer, total_wer = 0, 0
    for i, data in enumerate(test_loader):
        inputs, targets, input_percentages, target_sizes = data
        # unflatten targets
        split_targets = []
        offset = 0
        for size in target_sizes:
            split_targets.append(targets[offset:offset + size])
            offset += size
        out = torch.from_numpy(logits[i][0])
        sizes = torch.from_numpy(logits[i][1])
        decoded_output, _ = decoder.decode(out, sizes)
        target_strings = target_decoder.convert_to_strings(split_targets)
        wer, cer = 0, 0
        for x in range(len(target_strings)):
            transcript, reference = decoded_output[x][0], target_strings[x][0]
            wer_inst = decoder.wer(transcript, reference) / float(len(reference.split()))
            cer_inst = decoder.cer(transcript, reference) / float(len(reference))
            wer += wer_inst
            cer += cer_inst
        total_cer += cer
        total_wer += wer
    wer = total_wer / len(test_loader.dataset)
    cer = total_cer / len(test_loader.dataset)
    return [grid_index, mesh_x, mesh_y, lm_alpha, lm_beta, wer, cer]
def main(model_path, confs):
    model, __ = MultiTask.load_model(model_path)
    if confs['cuda']:
        model = model.cuda()

    if not model._meta['use_transcripts_out']:  # only accent classification
        criterion = nn.CrossEntropyLoss()
    elif not model._meta['use_accents_out']:  # only text recognition
        criterion = CTCLoss()
    else:  # both tasks
        criterion = (CTCLoss(), nn.CrossEntropyLoss())

    # Results
    results = {}
    for manifest, lm in confs['testing_manifests']:
        eprint(f'\n### Testing {manifest.split("/")[-1]} for model {Path(model_path).stem.split("_")[0]}')

        # Decoder
        if model._meta['use_transcripts_out']:
            decoder = BeamCTCDecoder(confs['labels'], lm_path=lm,
                                     alpha=confs['decoder_alpha'], beta=confs['decoder_beta'],
                                     cutoff_top_n=confs['decoder_cutoff_top_n'],
                                     cutoff_prob=confs['decoder_cutoff_top_n'],
                                     beam_width=confs['decoder_beam_width'],
                                     num_processes=confs['num_workers'])
            target_decoder = GreedyDecoder(confs['labels'])
        else:
            decoder, target_decoder = None, None

        # Test
        results[manifest.split('/')[-1]] = result_for_manifest(model, criterion, manifest, decoder,
                                                               target_decoder, confs['batch_size'],
                                                               confs['num_workers'])

    if not PRINT_LATEX_TABLE:
        print(f'Model: {model_path.split("/")[-1]}')
        for name, res in results.items():
            print(f'\nResults for {name}:')
            print('; '.join([f'{k}: {v:.3f}' for k, v in res.items()]))
    else:
        print(' & '.join(['model'] + list([k[:-4] for k in results.keys()])))
        val_dict = {}
        for k in list(results.values())[0].keys():
            val_dict[k] = []
        for res in results.values():
            [val_dict[k].append(f'{v:.1f}') for k, v in res.items()]
        for val in val_dict.values():
            print(' & '.join([Path(model_path).stem.split('_')[0]] + val) + r' \\')
def main():
    import argparse
    global model, spect_parser, decoder, args, device, decompressor
    parser = argparse.ArgumentParser(description='DeepSpeech transcription server')
    parser.add_argument('--host', type=str, default='0.0.0.0', help='Host to be used by the server')
    parser.add_argument('--port', type=int, default=8888, help='Port to be used by the server')
    parser = add_inference_args(parser)
    parser = add_decoder_args(parser)
    args = parser.parse_args()
    logging.getLogger().setLevel(logging.DEBUG)
    logging.info('Setting up server...')
    torch.set_grad_enabled(False)
    device = torch.device("cuda" if args.cuda else "cpu")
    model = load_model(device, args.model_path, args.half)
    if args.decoder == "beam":
        from decoder import BeamCTCDecoder
        decoder = BeamCTCDecoder(model.labels, lm_path=args.lm_path, alpha=args.alpha, beta=args.beta,
                                 cutoff_top_n=args.cutoff_top_n, cutoff_prob=args.cutoff_prob,
                                 beam_width=args.beam_width, num_processes=args.lm_workers)
    else:
        decoder = GreedyDecoder(model.labels, blank_index=model.labels.index('_'))
    spect_parser = OnlineSpectrogramParser(model.audio_conf, normalize=True)
    logging.info('Server initialised')
    decompressor = LZString()
    server = WebsocketServer(host=args.host, port=args.port)
    server.set_fn_new_client(new_client)
    server.set_fn_client_left(client_left)
    server.set_fn_message_received(message_received)
    server.run_forever()
def main():
    import argparse
    global model, spect_parser, decoder, args, device
    parser = argparse.ArgumentParser(description="DeepSpeech transcription server")
    parser.add_argument("--host", type=str, default="0.0.0.0", help="Host to be used by the server")
    parser.add_argument("--port", type=int, default=8888, help="Port to be used by the server")
    parser = add_inference_args(parser)
    parser = add_decoder_args(parser)
    args = parser.parse_args()
    logging.getLogger().setLevel(logging.DEBUG)
    logging.info("Setting up server...")
    torch.set_grad_enabled(False)
    device = torch.device("cuda" if args.cuda else "cpu")
    model = load_model(device, args.model_path, args.half)
    if args.decoder == "beam":
        from decoder import BeamCTCDecoder
        decoder = BeamCTCDecoder(
            model.labels,
            lm_path=args.lm_path,
            alpha=args.alpha,
            beta=args.beta,
            cutoff_top_n=args.cutoff_top_n,
            cutoff_prob=args.cutoff_prob,
            beam_width=args.beam_width,
            num_processes=args.lm_workers,
        )
    else:
        decoder = GreedyDecoder(model.labels, blank_index=model.labels.index("_"))
    spect_parser = SpectrogramParser(model.audio_conf, normalize=True)
    logging.info("Server initialised")
    app.run(host=args.host, port=args.port, debug=True, use_reloader=False)
    spect = spect.to(device)
    input_sizes = torch.IntTensor([spect.size(3)]).int()
    out, output_sizes = model(spect, input_sizes)
    decoded_output, decoded_offsets = decoder.decode(out, output_sizes)
    return decoded_output, decoded_offsets


if __name__ == "__main__":
    args = get_args()
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = load_model(device, args.model_file, True)
    parser = SpectrogramParser(model.audio_conf, normalize=True)
    decoder = BeamCTCDecoder(model.labels, beam_width=args.beam_size,
                             num_processes=args.num_worker, blank_index=0)
    with open(args.manifest_file) as f:
        data = f.read().split('\n')[:-1]
    src = []
    lbl = []
    for line in tqdm(data):
        vp, tp = line.split(',')
        with open(tp) as f:
            text = f.read().strip()
        decoded_output, decoded_offsets = transcribe(vp, parser, model, decoder, device)
        for h in decoded_output[0]:
            src.append(h)
labels = DeepSpeech.get_labels(model)
audio_conf = DeepSpeech.get_audio_conf(model)
if args.decoder == "beam":
    scorer = None
    if args.lm_path is not None:
        scorer = KenLMScorer(labels, args.lm_path, args.trie_path)
        scorer.set_lm_weight(args.lm_alpha)
        scorer.set_word_weight(args.lm_beta1)
        scorer.set_valid_word_weight(args.lm_beta2)
    else:
        scorer = Scorer()
    decoder = BeamCTCDecoder(labels, scorer, beam_width=args.beam_width, top_paths=1,
                             space_index=labels.index(' '), blank_index=labels.index('_'))
else:
    decoder = GreedyDecoder(labels, space_index=labels.index(' '), blank_index=labels.index('_'))
parser = SpectrogramParser(audio_conf, normalize=True)
t0 = time.time()
spect = parser.parse_audio(args.audio_path).contiguous()
spect = spect.view(1, 1, spect.size(0), spect.size(1))
out = model(Variable(spect, volatile=True))
out = out.transpose(0, 1)  # TxNxH
decoded_output = decoder.decode(out.data)
model_name = re.sub('.json.pth.tar', '', os.path.basename(args.model_path))
corpus = re.sub('csv', '', os.path.basename(args.test_manifest))
if args.save_path:
    ref_file = open("%s/%s_reference.%s.txt" % (args.save_path, corpus, model_name), 'w')
    trans_file = open("%s/%s_transcription.%s.txt" % (args.save_path, corpus, model_name), 'w')
    print(ref_file)
labels = DeepSpeech.get_labels(model)
print("model_name ", model_name)
audio_conf = DeepSpeech.get_audio_conf(model)
if args.decoder == "beam":
    from decoder import BeamCTCDecoder
    print("alpha=args.alpha, beta=args.beta ", args.alpha, args.beta)
    decoder = BeamCTCDecoder(labels, lm_path=args.lm_path, alpha=args.alpha, beta=args.beta,
                             cutoff_top_n=args.cutoff_top_n, cutoff_prob=args.cutoff_prob,
                             beam_width=args.beam_width, num_processes=args.lm_workers,
                             blank_index=labels.index('_'))
elif args.decoder == "greedy":
    decoder = GreedyDecoder(labels, blank_index=labels.index('_'))
else:
    decoder = None
target_decoder = GreedyDecoder(labels, blank_index=labels.index('_'))
test_dataset = SpectrogramDataset(audio_conf=audio_conf, manifest_filepath=args.test_manifest,
                                  labels=labels, normalize=True)
test_loader = AudioDataLoader(test_dataset, batch_size=args.batch_size, num_workers=args.num_workers)
total_cer, total_wer = 0, 0
n_concept_ref = 0
n_concept_err = 0
output_data = []
total_err_c = 0
def save_(p_, data_):
    with open(f"{args.save}/{os.path.basename(p_).split('.')[0]}.txt", "w") as f:
        f.write(data_)


if __name__ == '__main__':
    if args.decoder == "beam":
        print("using beam decoder")
        from decoder import BeamCTCDecoder
        decoder = BeamCTCDecoder(model.labels, lm_path=args.lm_path, alpha=args.alpha, beta=args.beta,
                                 cutoff_top_n=args.cutoff_top_n, cutoff_prob=args.cutoff_prob,
                                 beam_width=args.beam_width, num_processes=args.workers,
                                 blank_index=model.labels.index('_'))
    else:
        decoder = GreedyDecoder(model.labels, blank_index=model.labels.index('_'))
    print(f'Reading files from {args.path}')
    directory_files = [i for i in glob.glob(args.path + '*.wav')]
    print(f"{len(directory_files)} files have been read")
    t_ = len(directory_files)
    trans = [
if not audio_conf:
    audio_conf = dict(sample_rate=args.sample_rate,
                      window_size=args.window_size,
                      window_stride=args.window_stride,
                      window=args.window,
                      n_mels=args.n_mels,
                      process_mel=args.process_mel)
if args.decoder == "beam":
    from decoder import BeamCTCDecoder
    decoder = BeamCTCDecoder(labels, lm_path=args.lm_path, alpha=args.alpha, beta=args.beta,
                             cutoff_top_n=args.cutoff_top_n, cutoff_prob=args.cutoff_prob,
                             beam_width=args.beam_width, num_processes=args.lm_workers)
elif args.decoder == "greedy":
    decoder = GreedyDecoder(labels, blank_index=labels.index('_'))
else:
    decoder = None
target_decoder = GreedyDecoder(labels, blank_index=labels.index('_'))
if args.preprocess == 'file':
    test_dataset = FeatDataset(manifest_filepath=args.test_manifest, labels=labels)
    test_loader = FeatLoader(test_dataset, batch_size=args.batch_size, num_workers=args.num_workers)
    return wer * 100, cer * 100, output_data


if __name__ == '__main__':
    args = parser.parse_args()
    torch.set_grad_enabled(False)
    device = torch.device("cuda" if args.cuda else "cpu")
    model = load_model(device, args.model_path, args.half)
    if args.decoder == "beam":
        from decoder import BeamCTCDecoder
        decoder = BeamCTCDecoder(model.labels, lm_path=args.lm_path, alpha=args.alpha, beta=args.beta,
                                 cutoff_top_n=args.cutoff_top_n, cutoff_prob=args.cutoff_prob,
                                 beam_width=args.beam_width, num_processes=args.lm_workers)
    elif args.decoder == "greedy":
        decoder = GreedyDecoder(model.labels, blank_index=model.labels.index('_'))
    else:
        decoder = None
    target_decoder = GreedyDecoder(model.labels, blank_index=model.labels.index('_'))
    test_dataset = SpectrogramDataset(audio_conf=model.audio_conf, manifest_filepath=args.test_manifest,
                                      labels=model.labels, normalize=True)
    test_loader = AudioDataLoader(test_dataset,
import warnings

from opts import add_decoder_args, add_inference_args
from utils import load_model
from decoder import BeamCTCDecoder

warnings.simplefilter('ignore')

parser = argparse.ArgumentParser()
parser.add_argument("fileAddr", help="The file for which the prediction needs to be made", type=str)
args = parser.parse_args()
prepath = os.getcwd()
device = torch.device("cpu")
half = False
model = load_model(device, prepath + "/public/models/deepspeech_final.pth", True).type(torch.FloatTensor)
decoder = BeamCTCDecoder(model.labels, lm_path=prepath + "/public/models/libri.binary",
                         alpha=0.47, beta=0.28, beam_width=2048, num_processes=12)
spect_parser = SpectrogramParser(model.audio_conf, normalize=True)


def transcribe(audio_path, spect_parser, model, decoder, device, use_half):
    spect = spect_parser.parse_audio(audio_path).contiguous()
    spect = spect.view(1, 1, spect.size(0), spect.size(1))
    spect = spect.to(device)
    if use_half:
        spect = spect.half()
    input_sizes = torch.IntTensor([spect.size(3)]).int()
    out, output_sizes = model(spect, input_sizes)
    with open("out.txt", "wb") as f:
        pickle.dump(out.cpu().detach().numpy(), f)
    decoded_output, decoded_offsets = decoder.decode(out, output_sizes)
    return decoded_output, decoded_offsets
if __name__ == '__main__':
    model = DeepSpeech.load_model(args.model_path, cuda=args.cuda)
    model.eval()
    labels = DeepSpeech.get_labels(model)
    audio_conf = DeepSpeech.get_audio_conf(model)
    if args.decoder == "beam":
        from decoder import BeamCTCDecoder
        decoder = BeamCTCDecoder(labels, beam_width=args.beam_width, top_paths=args.top_paths,
                                 space_index=labels.index(' '), blank_index=labels.index('_'),
                                 lm_path=args.lm_path, trie_path=args.trie_path,
                                 lm_alpha=args.lm_alpha, lm_beta=args.lm_beta,
                                 label_size=args.label_size, label_margin=args.label_margin)
    else:
        decoder = GreedyDecoder(labels, space_index=labels.index(' '), blank_index=labels.index('_'))
    parser = SpectrogramParser(audio_conf, normalize=True)
    spect = parser.parse_audio(args.audio_path).contiguous()
    spect = spect.view(1, 1, spect.size(0), spect.size(1))
    out = model(Variable(spect, volatile=True))
class create_model(nn.Module):
    def __init__(self, args):
        super(create_model, self).__init__()
        self.args = args
        self.model = SpeechNet(args)
        self.model.to(args.device)
        self.criterion = nn.CTCLoss()
        self.decoder = BeamCTCDecoder(PHONEME_MAP, blank_index=0, beam_width=args.beam_width)
        self.state_names = ['loss', 'edit_dist', 'lr']

    def train_setup(self):
        self.lr = self.args.lr
        self.optimizer = torch.optim.Adam(self.model.parameters(), lr=self.args.lr,
                                          weight_decay=self.args.weight_decay)
        if self.args.use_step_schedule:
            self.scheduler = MultiStepLR(self.optimizer, milestones=self.args.decay_steps,
                                         gamma=self.args.lr_gamma)
        elif self.args.use_reduce_schedule:
            self.scheduler = ReduceLROnPlateau(self.optimizer, mode='min', factor=0.5, patience=1)
        else:
            self.scheduler = ParamScheduler(self.optimizer, scale_cos,
                                            self.args.num_epochs * self.args.loader_length)
        # self.model.apply(weights_init)
        self.model.train()

    def optimize_parameters(self, input, input_lens, target, target_lens):
        input, target = input.to(self.args.device), target.to(self.args.device)
        output, output_lens, self.loss = self.forward(input, input_lens, target, target_lens)
        self.optimizer.zero_grad()
        self.loss.backward()
        self.optimizer.step()
        self.edit_dist = self.get_edit_dist(output, output_lens, target, target_lens)
        del input
        del target
        del input_lens
        del target_lens
        del output
        del output_lens

    def update_learning_rate(self, dist=None):
        if self.args.use_reduce_schedule:
            self.scheduler.step(dist)
        else:
            self.scheduler.step()
        self.lr = self.optimizer.param_groups[0]['lr']

    def get_current_states(self):
        errors_ret = OrderedDict()
        for name in self.state_names:
            if isinstance(name, str):
                # float(...) works for both scalar tensor and float number
                errors_ret[name] = float(getattr(self, name))
        return errors_ret

    def get_edit_dist(self, output, output_lens, target, target_lens):
        output, target = output.cpu(), target.cpu()
        phonome_preds = self.decoder.decode(output, output_lens)
        phonomes = self.decoder.convert_to_strings(target, target_lens)
        edit_dist = np.sum([self.decoder.Lev_dist(phonome_pred, phonome)
                            for (phonome_pred, phonome) in zip(phonome_preds, phonomes)])
        return edit_dist

    def forward(self, input, input_lens, target=None, target_lens=None, is_training=True):
        output, output_lens = self.model(input, input_lens)
        if is_training:
            # The official documentation is your best friend: https://pytorch.org/docs/stable/nn.html#ctcloss
            # nn.CTCLoss takes 4 arguments to compute the loss:
            # [log_probs]: Prediction of your model at each time step. Shape: (seq_len, batch_size, vocab_size)
            #     Values must be log probabilities. Neither probabilities nor logits will work.
            #     Make sure the output of your network is log probabilities, by adding a nn.LogSoftmax after the last layer.
            # [targets]: The ground truth sequences. Shape: (batch_size, seq_len)
            #     Values are indices of phonemes. Again, remember that index 0 is reserved for "blank".
            # [input_lengths]: Lengths of sequences in log_probs. Shape: (batch_size,).
            #     This is not necessarily the same as the lengths of the inputs to the model.
            # [target_lengths]: Lengths of sequences in targets. Shape: (batch_size,).
            loss = self.criterion(output.permute(1, 0, 2), target, input_lens, target_lens)
            return output, output_lens, loss
        else:
            return output, output_lens

    def train(self):
        try:
            self.model.train()
        except:
            print('train() cannot be implemented as model does not exist.')

    def eval(self):
        try:
            self.model.eval()
        except:
            print('eval() cannot be implemented as model does not exist.')

    def load_model(self, model_path):
        self.model.load_state_dict(torch.load(model_path))

    def save_model(self, which_epoch):
        save_filename = '%s_net.pth' % (which_epoch)
        save_path = os.path.join(self.args.expr_dir, save_filename)
        if torch.cuda.is_available():
            try:
                torch.save(self.model.module.cpu().state_dict(), save_path)
            except:
                torch.save(self.model.cpu().state_dict(), save_path)
        else:
            torch.save(self.model.cpu().state_dict(), save_path)
        self.model.to(self.args.device)
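# The comment block in forward() above describes the four inputs nn.CTCLoss expects. Below is a
# minimal, self-contained sketch of that call; the sizes and random tensors are illustrative
# assumptions, not values taken from this model.
import torch
import torch.nn as nn

T, N, C, S = 50, 4, 42, 12                       # time steps, batch size, vocab size (index 0 = blank), max target length
log_probs = torch.randn(T, N, C).log_softmax(2)  # (seq_len, batch_size, vocab_size) log probabilities
targets = torch.randint(1, C, (N, S))            # phoneme indices; 0 is reserved for the blank
input_lengths = torch.full((N,), T, dtype=torch.long)            # lengths of sequences in log_probs
target_lengths = torch.randint(5, S + 1, (N,), dtype=torch.long)  # lengths of sequences in targets

criterion = nn.CTCLoss(blank=0)
loss = criterion(log_probs, targets, input_lengths, target_lengths)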
def decode_dataset(logits, test_dataset, batch_size, lm_alpha, lm_beta, mesh_x, mesh_y, index, labels, eval):
    print("Beginning decode for {}, {}".format(lm_alpha, lm_beta))
    test_loader = AudioDataLoader(test_dataset, batch_size=batch_size, num_workers=0)
    target_decoder = GreedyDecoder(labels, blank_index=labels.index('_'))
    decoder = BeamCTCDecoder(labels, beam_width=args.beam_width, cutoff_top_n=args.cutoff_top_n,
                             blank_index=labels.index('_'), lm_path=args.lm_path,
                             alpha=lm_alpha, beta=lm_beta, num_processes=1)
    model_name = re.sub('.json.pth.tar', '', os.path.basename(args.model_path))
    ref_file = None
    if eval == 'concept':
        eval_dir = "%s/%s/%s" % (os.path.dirname(args.output_path), model_name, index)
        if not os.path.exists(eval_dir):
            os.makedirs(eval_dir)
        ref_file = open("%s/%s_reference.txt" % (eval_dir, re.sub('.csv', '', os.path.basename(args.test_manifest))), 'w')
        trans_file = open("%s/%s_transcription.txt" % (eval_dir, re.sub('.csv', '', os.path.basename(args.test_manifest))), 'w')
    total_cer, total_wer = 0, 0
    for i, data in enumerate(test_loader):
        inputs, targets, input_percentages, target_sizes, audio_ids = data
        # unflatten targets
        split_targets = []
        offset = 0
        for size in target_sizes:
            split_targets.append(targets[offset:offset + size])
            offset += size
        out = torch.from_numpy(logits[i][0])
        sizes = torch.from_numpy(logits[i][1])
        decoded_output, _, _, _, _ = decoder.decode(out, sizes)
        target_strings = target_decoder.convert_to_strings(split_targets)
        wer, cer = 0, 0
        for x in range(len(target_strings)):
            transcript, reference = decoded_output[x][0], target_strings[x][0]
            if eval == 'concept':
                ref_file.write(reference.encode('utf-8') + "(" + audio_ids[x] + ")\n")
                trans_file.write(transcript.encode('utf-8') + "(" + audio_ids[x] + ")\n")
            wer_inst = decoder.wer(transcript, reference) / float(len(reference.split()))
            cer_inst = decoder.cer(transcript, reference) / float(len(reference))
            wer += wer_inst
            cer += cer_inst
        total_cer += cer
        total_wer += wer
    if eval == 'concept':
        ref_file.close()
        trans_file.close()
    wer = total_wer / len(test_loader.dataset)
    cer = total_cer / len(test_loader.dataset)
    if eval == 'concept':
        # Concept error rate evaluation
        cmd = "perl /lium/buster1/ghannay/deepSpeech2/deepspeech.pytorch/data/eval.sclit_cer.pl %s" % (eval_dir)
        print("cmd ", cmd)
        p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True)
        coner, error = p.communicate()
        print(" coner ", coner)
        return [mesh_x, mesh_y, lm_alpha, lm_beta, float(coner) / 100, cer]
    else:
        return [mesh_x, mesh_y, lm_alpha, lm_beta, wer, cer]
                    help='Language model word bonus (IV words)')
args = parser.parse_args()

if __name__ == '__main__':
    model = DeepSpeech.load_model(args.model_path, cuda=args.cuda)
    model.eval()
    labels = DeepSpeech.get_labels(model)
    audio_conf = DeepSpeech.get_audio_conf(model)
    if args.decoder == "beam":
        decoder = BeamCTCDecoder(labels, beam_width=args.beam_width, top_paths=1,
                                 space_index=labels.index(' '), blank_index=labels.index('_'),
                                 lm_path=args.lm_path, trie_path=args.trie_path,
                                 lm_alpha=args.lm_alpha, lm_beta1=args.lm_beta1, lm_beta2=args.lm_beta2)
    else:
        decoder = GreedyDecoder(labels, space_index=labels.index(' '), blank_index=labels.index('_'))
    parser = SpectrogramParser(audio_conf, normalize=True)
    t0 = time.time()
    spect = parser.parse_audio(args.audio_path).contiguous()
    spect = spect.view(1, 1, spect.size(0), spect.size(1))
    out = model(Variable(spect, volatile=True))
def decode_dataset(logits, test_dataset, batch_size, lm_alpha, lm_beta, mesh_x, mesh_y, labels):
    print("Beginning decode for {}, {}".format(lm_alpha, lm_beta))
    test_loader = FeatLoader(test_dataset, batch_size=batch_size, num_workers=0)
    target_decoder = GreedyDecoder(labels, blank_index=labels.index('_'))
    decoder = BeamCTCDecoder(labels, beam_width=args.beam_width, cutoff_top_n=args.cutoff_top_n,
                             blank_index=labels.index('_'), lm_path=args.lm_path,
                             alpha=lm_alpha, beta=lm_beta, num_processes=1)
    total_cer, total_wer = 0, 0
    # decoding_log = []
    for i, data in enumerate(test_loader):
        inputs, targets, input_percentages, target_sizes = data
        # unflatten targets
        split_targets = []
        offset = 0
        for size in target_sizes:
            split_targets.append(targets[offset:offset + size])
            offset += size
        out = torch.from_numpy(logits[i][0])
        sizes = torch.from_numpy(logits[i][1])
        decoded_output, _ = decoder.decode(out, sizes)
        target_strings = target_decoder.convert_to_strings(split_targets)
        wer, cer = 0, 0
        for x in range(len(target_strings)):
            transcript, reference = decoded_output[x][0], target_strings[x][0]
            wer_inst = decoder.wer(transcript, reference) / float(len(reference.split()))
            cer_inst = decoder.cer(transcript, reference) / float(len(reference))
            wer += wer_inst
            cer += cer_inst
            # ver1: write result to logFile
            # can't do this because multiprocessing code cannot do this
            # logFile.write('decoding : ' + transcript)
            # logFile.write('reference : ' + reference)
            # logFile.write('WER = ' + str(wer_inst) + ', CER = ' + str(cer_inst))
            if random.uniform(0, 1) < float(args.detail_log_print_prob):
                print('decoding : ' + transcript)
                print('reference : ' + reference)
                print('WER = ' + str(wer_inst) + ', CER = ' + str(cer_inst))
                print(' ')
            # ver1
            # decoding_log_sample = []
            # decoding_log_sample.append(transcript)
            # decoding_log_sample.append(reference)
            # decoding_log.append(decoding_log_sample)
            # ver2: thread safe but does not write anything to file
            # logging.info('decoding : ' + transcript)
            # logging.info('reference : ' + reference)
            # logging.info('WER = ' + str(wer_inst) + ', CER = ' + str(cer_inst))
            # logging.info(' ')
            # ver3
            logger.error('decoding : ' + transcript)
            logger.error('reference : ' + reference)
            logger.error('WER = ' + str(wer_inst) + ', CER = ' + str(cer_inst))
            logger.error(' ')
        total_cer += cer
        total_wer += wer
    wer = total_wer / len(test_loader.dataset)
    cer = total_cer / len(test_loader.dataset)
    return [mesh_x, mesh_y, lm_alpha, lm_beta, wer, cer]
if __name__ == '__main__':
    model = DeepSpeech.load_model(args.model_path, cuda=args.cuda)
    model.eval()
    labels = DeepSpeech.get_labels(model)
    audio_conf = DeepSpeech.get_audio_conf(model)
    if args.decoder == "beam":
        from decoder import BeamCTCDecoder
        decoder = BeamCTCDecoder(labels, lm_path=args.lm_path, alpha=args.alpha, beta=args.beta,
                                 cutoff_top_n=args.cutoff_top_n, cutoff_prob=args.cutoff_prob,
                                 beam_width=args.beam_width, num_processes=args.lm_workers)
    else:
        decoder = GreedyDecoder(labels, blank_index=labels.index('_'))
    parser = SpectrogramParser(audio_conf, normalize=True)
    spect = parser.parse_audio(args.audio_path).contiguous()
    spect = spect.view(1, 1, spect.size(0), spect.size(1))
    out = model(Variable(spect, volatile=True))
    out = out.transpose(0, 1)  # TxNxH
    decoded_output, decoded_offsets = decoder.decode(out.data)
    print(json.dumps(decode_results(decoded_output, decoded_offsets)))
def init(beam_width, blank_index, lm_path):
    global decoder, ae_decoder
    decoder = BeamCTCDecoder(model.vocabulary, lm_path=lm_path, beam_width=beam_width,
                             num_processes=args.lm_workers, blank_index=blank_index)
    ae_decoder = GreedyDecoder(model.vocabulary)
if __name__ == '__main__':
    torch.set_grad_enabled(False)
    model = DeepSpeech.load_model(args.model_path)
    if args.cuda:
        model.cuda()
    model.eval()
    labels = DeepSpeech.get_labels(model)
    audio_conf = DeepSpeech.get_audio_conf(model)
    if args.decoder == "beam":
        from decoder import BeamCTCDecoder
        decoder = BeamCTCDecoder(labels, lm_path=args.lm_path, alpha=args.alpha, beta=args.beta,
                                 cutoff_top_n=args.cutoff_top_n, cutoff_prob=args.cutoff_prob,
                                 beam_width=args.beam_width, num_processes=args.lm_workers)
    elif args.decoder == "greedy":
        decoder = GreedyDecoder(labels, blank_index=labels.index('_'))
    else:
        decoder = None
    target_decoder = GreedyDecoder(labels, blank_index=labels.index('_'))
    test_dataset = SpectrogramDataset(audio_conf=audio_conf, manifest_filepath=args.test_manifest,
                                      labels=labels, normalize=True)
    test_loader = AudioDataLoader(test_dataset, batch_size=args.batch_size, num_workers=args.num_workers)
    total_cer, total_wer, num_tokens, num_chars = 0, 0, 0, 0
    output_data = []
    for i, data in tqdm(enumerate(test_loader), total=len(test_loader)):
        inputs, targets, input_percentages, target_sizes = data
        input_sizes = input_percentages.mul_(int(inputs.size(3))).int()
    out, output_sizes = model(spect, input_sizes)
    decoded_output, decoded_offsets = decoder.decode(out, output_sizes)
    return decoded_output, decoded_offsets


if __name__ == "__main__":
    args = get_args()
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = load_model(device, args.model_file, True)
    parser = SpectrogramParser(model.audio_conf, normalize=True)
    decoder = BeamCTCDecoder(model.labels, lm_path=args.lm_path, alpha=args.alpha, beta=args.beta,
                             beam_width=args.beam_size, num_processes=args.num_worker, blank_index=0)
    with open(args.manifest_file) as f:
        data = f.read().split('\n')[:-1]
    idx = []
    pred = []
    lbl = []
    for line in tqdm(data):
        vp, tp = line.split(',')
        with open(tp) as f:
            text = f.read().strip()
        decoded_output, decoded_offsets = transcribe(vp, parser, model,
args = parser.parse_args()

if __name__ == '__main__':
    model = DeepSpeech.load_model(args.model_path, cuda=args.cuda)
    model.eval()
    labels = DeepSpeech.get_labels(model)
    audio_conf = DeepSpeech.get_audio_conf(model)
    if args.decoder == "beam":
        from decoder import BeamCTCDecoder
        decoder = BeamCTCDecoder(labels, beam_width=args.beam_width, top_paths=1,
                                 space_index=labels.index(' '), blank_index=labels.index('_'),
                                 lm_path=args.lm_path, trie_path=args.trie_path,
                                 lm_alpha=args.lm_alpha, lm_beta1=args.lm_beta1, lm_beta2=args.lm_beta2)
    else:
        decoder = GreedyDecoder(labels, space_index=labels.index(' '), blank_index=labels.index('_'))
    test_dataset = SpectrogramDataset(audio_conf=audio_conf, manifest_filepath=args.test_manifest,
                                      labels=labels, normalize=True)
    test_loader = AudioDataLoader(test_dataset, batch_size=args.batch_size,
labels = DeepSpeech.get_labels(model)
audio_conf = DeepSpeech.get_audio_conf(model)
if args.decoder == "beam":
    scorer = None
    if args.lm_path is not None:
        scorer = KenLMScorer(labels, args.lm_path, args.trie_path)
        scorer.set_lm_weight(args.lm_alpha)
        scorer.set_word_weight(args.lm_beta1)
        scorer.set_valid_word_weight(args.lm_beta2)
    else:
        scorer = Scorer()
    decoder = BeamCTCDecoder(labels, scorer, beam_width=args.beam_width, top_paths=1,
                             space_index=labels.index(' '), blank_index=labels.index('_'))
else:
    decoder = GreedyDecoder(labels, space_index=labels.index(' '), blank_index=labels.index('_'))
test_dataset = SpectrogramDataset(audio_conf=audio_conf, manifest_filepath=args.test_manifest,
                                  labels=labels, normalize=True)
test_loader = AudioDataLoader(test_dataset, batch_size=args.batch_size, num_workers=args.num_workers)
total_cer, total_wer = 0, 0
torch.set_grad_enabled(False)
model = DeepSpeech.load_model(args.model_path)
device = torch.device("cuda" if args.cuda else "cpu")
model = model.to(device)
model.eval()
labels = DeepSpeech.get_labels(model)
audio_conf = DeepSpeech.get_audio_conf(model)
if args.decoder == "beam":
    from decoder import BeamCTCDecoder
    decoder = BeamCTCDecoder(labels, lm_path=args.lm_path, alpha=args.alpha, beta=args.beta,
                             cutoff_top_n=args.cutoff_top_n, cutoff_prob=args.cutoff_prob,
                             beam_width=args.beam_width, num_processes=args.lm_workers)
elif args.decoder == "greedy":
    decoder = GreedyDecoder(labels, blank_index=labels.index('_'))
else:
    decoder = None
target_decoder = GreedyDecoder(labels, blank_index=labels.index('_'))
test_dataset = SpectrogramDataset(audio_conf=audio_conf, manifest_filepath=args.test_manifest,
                                  labels=labels, normalize=True)
test_loader = AudioDataLoader(test_dataset, batch_size=args.batch_size, num_workers=args.num_workers)
def run_experiment(_exp_name, _epochs, _train_manifest, _test_manifest, _labels, _use_mfcc_in,
                   _use_ivectors_in, _use_embeddings_in, _use_transcripts_out, _use_accents_out,
                   _batch_size, _num_workers, _mfcc_size, _ivector_size, _embedding_size,
                   _rnn_type, _rnn_hidden_size, _nb_head_layers, _nb_speech_layers,
                   _nb_accents_layers, _bidirectional, _losses_mix, _learning_rate, _lm_path,
                   _decoder_alpha, _decoder_beta, _decoder_cutoff_top_n, _decoder_beam_width,
                   _cuda, _tensorboard_path, _saved_models_path, _bottleneck_size, _accent_loss):
    print(f'\n##### Running experiment {_exp_name} #####')

    # Tools to log values
    results_dict = {}
    results_dict['train_loss'] = []
    results_dict['train_loss_text'] = []
    results_dict['train_loss_accent'] = []
    results_dict['test_loss'] = []
    results_dict['test_loss_text'] = []
    results_dict['test_loss_accent'] = []
    results_dict['test_wer'] = []
    results_dict['test_accent_acc'] = []

    tb_path = Path(_tensorboard_path) / _exp_name
    makedirs(tb_path, exist_ok=True)
    tb_writer = SummaryWriter(tb_path)

    ### DATA LOADING
    # Training set
    train_dataset = MultiDataset(_train_manifest, _labels,
                                 use_mfcc_in=_use_mfcc_in,
                                 use_ivectors_in=_use_ivectors_in,
                                 use_embeddings_in=_use_embeddings_in,
                                 embedding_size=_embedding_size,
                                 use_transcripts_out=_use_transcripts_out,
                                 use_accents_out=_use_accents_out)
    train_loader = MultiDataLoader(train_dataset, batch_size=_batch_size,
                                   shuffle=True, num_workers=_num_workers)

    # Testing set
    test_dataset = MultiDataset(_test_manifest, _labels,
                                use_mfcc_in=_use_mfcc_in,
                                use_ivectors_in=_use_ivectors_in,
                                use_embeddings_in=_use_embeddings_in,
                                embedding_size=_embedding_size,
                                use_transcripts_out=_use_transcripts_out,
                                use_accents_out=_use_accents_out)
    test_loader = MultiDataLoader(test_dataset, batch_size=_batch_size,
                                  shuffle=True, num_workers=_num_workers)

    ### CREATE MODEL
    model = MultiTask(use_mfcc_in=_use_mfcc_in,
                      use_ivectors_in=_use_ivectors_in,
                      use_embeddings_in=_use_embeddings_in,
                      use_transcripts_out=_use_transcripts_out,
                      use_accents_out=_use_accents_out,
                      mfcc_size=_mfcc_size,
                      ivector_size=_ivector_size,
                      embedding_size=_embedding_size,
                      rnn_type=_rnn_type,
                      labels=_labels,
                      accents_dict=train_dataset.accent_dict,
                      rnn_hidden_size=_rnn_hidden_size,
                      nb_head_layers=_nb_head_layers,
                      nb_speech_layers=_nb_speech_layers,
                      nb_accents_layers=_nb_accents_layers,
                      bidirectional=_bidirectional,
                      bottleneck_size=_bottleneck_size,
                      DEBUG=False)
    if _cuda:
        model = model.cuda()
    print(model, '\n')
    print('Model parameters counts:', MultiTask.get_param_size(model), '\n')

    ### OPTIMIZER, CRITERION, DECODER
    # Optimizer
    optimizer = torch.optim.Adam(model.parameters(), lr=_learning_rate)

    # Criterion
    if _use_accents_out:
        if _accent_loss == 'focal':
            AccLoss = FocalLoss()
        elif _accent_loss == 'CE':
            AccLoss = nn.CrossEntropyLoss()
        else:
            raise ValueError(
                f'Loss {_accent_loss} for accent_loss is unknown. Please use either "focal" or "CE".')

    if not _use_transcripts_out:  # only accent classification
        criterion = AccLoss
    elif not _use_accents_out:  # only text recognition
        criterion = nn.CTCLoss()
    else:  # both tasks
        criterion = (nn.CTCLoss(), FocalLoss())

    # Decoder
    if _use_transcripts_out:
        decoder = BeamCTCDecoder(_labels, lm_path=_lm_path,
                                 alpha=_decoder_alpha, beta=_decoder_beta,
                                 cutoff_top_n=_decoder_cutoff_top_n,
                                 cutoff_prob=_decoder_cutoff_top_n,
                                 beam_width=_decoder_beam_width,
                                 num_processes=_num_workers)
        target_decoder = GreedyDecoder(_labels)
    else:
        decoder, target_decoder = None, None

    ### EPOCHS
    best_wer = math.inf
    best_acc = 0
    for epoch in range(1, _epochs + 1):
        ### TRAIN
        print(f'Epoch {epoch} training: {_exp_name}')
        train_results = train(model, train_loader, criterion, optimizer, losses_mix=_losses_mix)
        train_loss, train_loss_text, train_loss_accent = train_results
        results_dict['train_loss'].append(train_loss)
        results_dict['train_loss_text'].append(train_loss_text)
        results_dict['train_loss_accent'].append(train_loss_accent)
        print(f'Epoch {epoch} training loss: {train_loss}')

        ### TEST
        print(f'Epoch {epoch} testing')
        test_results = test(model, test_loader, criterion, decoder, target_decoder, losses_mix=_losses_mix)
        test_loss, test_loss_text, test_loss_accent, test_wer, test_accent_acc = test_results
        results_dict['test_loss'].append(test_loss)
        results_dict['test_loss_text'].append(test_loss_text)
        results_dict['test_loss_accent'].append(test_loss_accent)
        results_dict['test_wer'].append(test_wer)
        results_dict['test_accent_acc'].append(test_accent_acc)
        print(f'Epoch {epoch} testing loss: {test_loss}')

        # Add values to tensorboard
        for key, results in results_dict.items():
            tb_writer.add_scalar(key, results[-1], epoch)

        # Save model if it is best
        save_new = False
        if _use_transcripts_out:
            if test_wer < best_wer:
                save_new = True
                best_wer = test_wer
        else:
            if test_accent_acc > best_acc:
                save_new = True
                best_acc = test_accent_acc

        if save_new:
            MultiTask.serialize(model,
                                Path(_saved_models_path) / _exp_name,
                                save=True,
                                exp_name=_exp_name,
                                optimizer=optimizer,
                                epoch=epoch,
                                train_losses=results_dict['train_loss'],
                                test_losses=results_dict['test_loss'],
                                text_train_losses=results_dict['train_loss_text'],
                                text_test_losses=results_dict['test_loss_text'],
                                text_wers=results_dict['test_wer'],
                                accent_train_losses=results_dict['train_loss_accent'],
                                accent_test_losses=results_dict['test_loss_accent'],
                                accent_accuracies=results_dict['test_accent_acc'])

    del model
    gc.collect()
    torch.cuda.empty_cache()