def eval(model, dataloader):
    """Run one evaluation pass; return (mean CER, mean CTC loss).

    NOTE: shadows the builtin ``eval`` — name kept for backward compatibility.
    Restores model.train() before returning.
    """
    model.eval()
    decoder = GreedyDecoder(dataloader.dataset.labels_str)
    ctcloss = CTCLoss(size_average=True)
    cer = 0
    epoch_loss = 0
    print("decoding")
    with torch.no_grad():
        for i, (x, y, x_lens, y_lens) in tqdm(enumerate(dataloader)):
            x = x.to(device)
            outs, out_lens = model(x, x_lens)
            # CTCLoss expects (T, N, C); the model emits (N, C, T).
            loss = ctcloss(outs.transpose(0, 1).transpose(0, 2), y, out_lens, y_lens)
            epoch_loss += loss.item()
            outs = F.softmax(outs, 1)
            outs = outs.transpose(1, 2)
            # Un-flatten the concatenated target tensor into per-utterance refs.
            ys = []
            offset = 0
            for y_len in y_lens:
                ys.append(y[offset:offset + y_len])
                offset += y_len
            out_strings, out_offsets = decoder.decode(outs, out_lens)
            y_strings = decoder.convert_to_strings(ys)
            for pred, truth in zip(out_strings, y_strings):
                trans, ref = pred[0], truth[0]
                # FIX: an empty reference used to raise ZeroDivisionError
                # (the old commented-out debug print shows it was hit); skip it.
                if len(ref) == 0:
                    continue
                cer += decoder.cer(trans, ref) / float(len(ref))
    cer /= len(dataloader.dataset)
    # NOTE: relies on the loop having run at least once (i defined).
    epoch_loss /= i + 1
    model.train()
    return cer, epoch_loss
def eval(model, dataloader):
    """Run one greedy-decoding evaluation pass; return the mean CER.

    NOTE: shadows the builtin ``eval`` — name kept for backward compatibility.
    """
    model.eval()
    decoder = GreedyDecoder(dataloader.dataset.labels_str)
    cer = 0
    print("decoding")
    with torch.no_grad():
        for i, (x, y, x_lens, y_lens) in tqdm(enumerate(dataloader)):
            x = x.to(device)
            outs, out_lens = model(x, x_lens)
            outs = F.softmax(outs, 1)
            outs = outs.transpose(1, 2)
            # Un-flatten the concatenated target tensor into per-utterance refs.
            ys = []
            offset = 0
            for y_len in y_lens:
                ys.append(y[offset:offset + y_len])
                offset += y_len
            out_strings, out_offsets = decoder.decode(outs, out_lens)
            y_strings = decoder.convert_to_strings(ys)
            for pred, truth in zip(out_strings, y_strings):
                trans, ref = pred[0], truth[0]
                # FIX: guard against an empty reference (ZeroDivisionError).
                if len(ref) == 0:
                    continue
                cer += decoder.cer(trans, ref) / float(len(ref))
    cer /= len(dataloader.dataset)
    model.train()
    return cer
def eval(model, dataloader, device, save_output=None, lm_alpha=None, lm_beta=None):
    """Evaluate with the module-global (beam) decoder; return (CER, epoch_loss).

    :param save_output: optional path — when set, per-batch (probs, lens, refs)
        tuples are collected and saved with np.save.
    :param lm_alpha/lm_beta: when given, re-tune the shared decoder's LM weights.

    NOTE: the CTC loss computation was commented out upstream, so the returned
    epoch_loss is always 0.0 — kept for interface compatibility.
    """
    model.eval()
    ae_decoder = GreedyDecoder(dataloader.dataset.labels_str)
    global decoder
    if lm_alpha is not None or lm_beta is not None:
        # Re-tune the LM weights of the shared beam decoder in place.
        decoder._decoder.reset_params(lm_alpha, lm_beta)
    cer = 0
    epoch_loss = 0
    output_data = []
    print("decoding")
    with torch.no_grad():
        for i, (x, y, x_lens, y_lens) in tqdm(enumerate(dataloader)):
            x = x.to(device)
            outs, out_lens = model(x, x_lens)
            outs = F.softmax(outs, 1)
            outs = outs.transpose(1, 2)
            # Un-flatten the concatenated target tensor into per-utterance refs.
            ys = []
            offset = 0
            for y_len in y_lens:
                ys.append(y[offset:offset + y_len])
                offset += y_len
            out_strings, out_offsets = decoder.decode(outs, out_lens)
            y_strings = ae_decoder.convert_to_strings(ys)
            if save_output is not None:
                # add output to data array, and continue
                output_data.append((outs.cpu().numpy(), out_lens.numpy(), y_strings))
            for pred, truth in zip(out_strings, y_strings):
                trans, ref = pred[0], truth[0]
                # FIX: guard against an empty reference (ZeroDivisionError).
                if len(ref) == 0:
                    continue
                cer += decoder.cer(trans, ref) / float(len(ref))
    cer /= len(dataloader.dataset)
    epoch_loss /= i + 1
    print("cer:{}, epoch_loss:{}".format(cer, epoch_loss))
    if save_output is not None:
        np.save(save_output, output_data)
    return cer, epoch_loss
def __init__(self, model_path):
    """Load a trained DeepSpeech model and build decoding helpers.

    :param model_path: path to a serialized DeepSpeech checkpoint
    :raises FileNotFoundError: if model_path does not exist
    """
    # FIX: explicit exception instead of assert — asserts are stripped
    # under ``python -O``, silently skipping this validation.
    if not os.path.exists(model_path):
        raise FileNotFoundError("Cannot find model here {}".format(model_path))
    self.deep_speech_model = DeepSpeech.load_model(model_path)
    self.deep_speech_model.eval()  # inference mode
    labels = DeepSpeech.get_labels(self.deep_speech_model)
    self.audio_conf = DeepSpeech.get_audio_conf(self.deep_speech_model)
    self.decoder = GreedyDecoder(labels)
    self.parser = SpectrogramParser(self.audio_conf, normalize=True)
def __init__(self, config, data_loader=None):
    """Trainer state for a BEGAN-style GAN + ASR setup.

    Copies optimizer hyper-parameters from `config`, builds loss/metric
    trackers, constructs G/D/ASR via build_model(), optionally moves them to
    GPU, and restores a checkpoint when `config.load_path` is non-empty.

    NOTE(review): despite the default, `data_loader` must not be None —
    GreedyDecoder(data_loader.labels) dereferences it unconditionally.
    """
    self.config = config
    self.data_loader = data_loader  # needed for VAE
    # Optimizer hyper-parameters.
    self.lr = config.lr
    self.beta1 = config.beta1
    self.beta2 = config.beta2
    self.optimizer = config.optimizer
    self.batch_size = config.batch_size
    self.diffLoss = L1Loss_mask()  # custom module
    self.valmin_iter = 0
    self.model_dir = 'logs/' + str(config.expnum)
    # Checkpoint filenames, filled in when models are saved.
    self.savename_G = ''
    self.savename_D = ''
    self.savename_ASR = ''
    self.kt = 0  # used for Proportional Control Theory in BEGAN, initialized as 0
    self.lb = self.config.lambda_k
    self.gamma = self.config.gamma
    self.conv_measure = 0  # convergence measure
    # Running-average meters for losses and error rates (train / val).
    self.ctc_tr = AverageMeter()
    self.ctc_tr_local = AverageMeter()
    self.ctc_val = AverageMeter()
    self.adv_ny_tr = AverageMeter()
    self.adv_ny_val = AverageMeter()
    self.wer_tr = AverageMeter()
    self.wer_val = AverageMeter()
    self.cer_tr = AverageMeter()
    self.cer_val = AverageMeter()
    self.CTCLoss = CTCLoss()
    self.decoder = GreedyDecoder(data_loader.labels)
    # build_model() must run before the .cuda()/load_model() calls below.
    self.build_model()
    self.G.loss_stop = 100000
    #self.get_weight_statistic()
    if self.config.gpu >= 0:
        self.G.cuda()
        self.D.cuda()
        self.diffLoss.cuda()
        self.ASR.cuda()
    if len(self.config.load_path) > 0:
        self.load_model()
    if config.mode == 'train':
        # NOTE(review): this handle is never explicitly closed — presumably
        # released on process exit; confirm if rotation/flushing matters.
        self.logFile = open(self.model_dir + '/log.txt', 'w')
def main():
    """Entry point: parse CLI args, load the DeepSpeech model and decoder into
    module globals, then start the Flask transcription server (blocking)."""
    import argparse
    # Request handlers elsewhere in the module read these globals.
    global model, spect_parser, decoder, args
    parser = argparse.ArgumentParser(description='DeepSpeech transcription server')
    parser.add_argument('--host', type=str, default='0.0.0.0', help='Host to be used by the server')
    parser.add_argument('--port', type=int, default=8888, help='Port to be used by the server')
    parser = add_inference_args(parser)
    parser = add_decoder_args(parser)
    args = parser.parse_args()
    logging.getLogger().setLevel(logging.DEBUG)
    logging.info('Setting up server...')
    # Inference only — disable autograd globally.
    torch.set_grad_enabled(False)
    model = DeepSpeech.load_model(args.model_path)
    if args.cuda:
        model.cuda()
    model.eval()
    labels = DeepSpeech.get_labels(model)
    audio_conf = DeepSpeech.get_audio_conf(model)
    if args.decoder == "beam":
        # Imported lazily so the LM-backed decoder is only required when used.
        from decoder import BeamCTCDecoder
        decoder = BeamCTCDecoder(labels, lm_path=args.lm_path, alpha=args.alpha, beta=args.beta,
                                 cutoff_top_n=args.cutoff_top_n, cutoff_prob=args.cutoff_prob,
                                 beam_width=args.beam_width, num_processes=args.lm_workers)
    else:
        # Greedy decoding; '_' is used as the CTC blank symbol.
        decoder = GreedyDecoder(labels, blank_index=labels.index('_'))
    spect_parser = SpectrogramParser(audio_conf, normalize=True)
    logging.info('Server initialised')
    app.run(host=args.host, port=args.port, debug=True, use_reloader=False)
class SpeechRecognizer(object):
    """Wav2Letter-based speech recognizer built on the PuzzleLib backend.

    Reads all settings from an INI file; supports CPU or GPU execution and
    greedy or trie/LM beam decoding.
    """

    def __init__(self, config_path='config.ini'):
        """Build the Wav2Letter network and decoder from an INI config.

        :param config_path: path to the INI file with a [Wav2Letter] section
        :raises Exception: if config_path is None
        """
        if config_path is None:
            raise Exception('Path to config file is None')
        self.config = configparser.ConfigParser()
        self.config.read(config_path, encoding='UTF-8')
        # [1:-1] strips the surrounding quote/bracket characters stored in the INI value.
        self.labels = self.config['Wav2Letter']['labels'][1:-1]
        self.sample_rate = int(self.config['Wav2Letter']['sample_rate'])
        self.window_size = float(self.config['Wav2Letter']['window_size'])
        self.window_stride = float(self.config['Wav2Letter']['window_stride'])
        self.greedy = int(self.config['Wav2Letter']['greedy'])
        self.cpu = int(self.config['Wav2Letter']['cpu'])
        if self.cpu:
            # Must switch the PuzzleLib backend BEFORE importing model modules below.
            from PuzzleLib import Config
            Config.backend = Config.Backend.cpu
        from PuzzleLib.Models.Nets.WaveToLetter import loadW2L
        from PuzzleLib.Modules import MoveAxis
        # FFT size derived from the analysis window; the net takes 1 + nfft//2 spectrogram bins.
        nfft = int(self.sample_rate * self.window_size)
        self.w2l = loadW2L(modelpath=self.config['Wav2Letter']['model_path'],
                           inmaps=(1 + nfft // 2), nlabels=len(self.labels))
        # Move the time axis first so the decoder sees (T, ...) output.
        self.w2l.append(MoveAxis(src=2, dst=0))
        if not self.cpu:
            # GPU path runs in half precision.
            self.w2l.calcMode(np.float16)
        self.w2l.evalMode()
        if not self.greedy:
            from decoder import TrieDecoder
            lexicon = self.config['Wav2Letter']['lexicon']
            tokens = self.config['Wav2Letter']['tokens']
            lm_path = self.config['Wav2Letter']['lm_path']
            beam_threshold = float(self.config['Wav2Letter']['beam_threshold'])
            self.decoder = TrieDecoder(lexicon, tokens, lm_path, beam_threshold)
        else:
            self.decoder = GreedyDecoder(self.labels)

    def recognize(self, audio_path):
        """Transcribe one audio file and return the decoded text result."""
        preprocessed_audio = preprocess(audio_path, self.sample_rate,
                                        self.window_size, self.window_stride)
        if self.cpu:
            from PuzzleLib.CPU.CPUArray import CPUArray
            inputs = CPUArray.toDevice(np.array([preprocessed_audio]).astype(np.float32))
        else:
            from PuzzleLib.Backend import gpuarray
            # GPU path feeds half precision to match calcMode above.
            inputs = gpuarray.to_gpu(np.array([preprocessed_audio]).astype(np.float16))
        output = self.w2l(inputs).get()
        output = np.vstack(output).astype(np.float32)
        result = self.decoder.decode(output)
        if not self.cpu:
            # Release pooled GPU memory held by intermediate buffers.
            from PuzzleLib.Backend.gpuarray import memoryPool
            memoryPool.freeHeld()
        del inputs, output
        return result
def __init__(self, lr1, num_iterations1, lr2, num_iterations2, batch_size, l2penalty):
    """Store the two-phase optimization hyper-parameters and load the label set.

    Reads the label alphabet from ``labels.json`` in the working directory and
    builds a char -> index map plus a greedy decoder ('_' is the CTC blank).
    """
    self.lr1 = lr1
    self.num_iterations1 = num_iterations1
    self.lr2 = lr2
    self.num_iterations2 = num_iterations2
    self.batch_size = batch_size
    self.l2penalty = l2penalty
    with open('labels.json') as label_file:
        self.labels = str(''.join(json.load(label_file)))
    # IMPROVED: dict comprehension with enumerate replaces the
    # dict([... for i in range(len(...))]) construction — same mapping.
    self.labels_map = {ch: i for i, ch in enumerate(self.labels)}
    self.decoder = GreedyDecoder(self.labels, blank_index=self.labels.index('_'))
def acc(model):
    """Return (train_accuracy, test_accuracy) for the given model."""
    greedy = GreedyDecoder()

    def _accuracy(manifest_csv):
        # One loader per manifest; batch size fixed at 16 as before.
        loader = SpeechDataloader(SpeechDataset(manifest_csv), batch_size=16)
        return evaluate(model, loader, greedy)

    # Tuple elements evaluate left-to-right: train first, then test.
    return _accuracy('uf.csv'), _accuracy('test.csv')
def training_loop(model, kwargs, train_dataset, train_batch_loader, eval_dataset):
    """Train `model` with CTC loss, evaluating and checkpointing per epoch.

    :param kwargs: dict of hyper-parameters ('cuda', 'lr', 'momentum',
        'epochs', 'batch_size', 'epochs_per_save', 'model_dir').
    Timing of each phase is recorded via timing.EpochTimer and pushed to
    TensorBoard through the injected logging callbacks.
    """
    device = 'cuda:0' if torch.cuda.is_available() and kwargs['cuda'] else 'cpu'
    model.to(device)
    greedy_decoder = GreedyDecoder(model.labels)
    # reduction='none' keeps per-utterance losses; they are averaged manually below.
    criterion = nn.CTCLoss(blank=0, reduction='none')
    parameters = model.parameters()
    optimizer = torch.optim.SGD(parameters, lr=kwargs['lr'], momentum=kwargs['momentum'],
                                nesterov=True, weight_decay=1e-5)
    # Ratio between input frames and output frames (model's temporal downsampling).
    scaling_factor = model.get_scaling_factor()
    epochs = kwargs['epochs']
    print('Train dataset size:%d' % len(train_dataset))
    batch_count = math.ceil(len(train_dataset) / kwargs['batch_size'])
    for epoch in range(epochs):
        with timing.EpochTimer(epoch, _log_to_tensorboard) as et:
            model.train()
            total_loss = 0
            for idx, data in et.across_epoch('Data Loading time',
                                             tqdm.tqdm(enumerate(train_batch_loader), total=batch_count)):
                inputs, input_lengths, targets, target_lengths, file_paths, texts = data
                with et.timed_action('Model execution time'):
                    out = model(torch.FloatTensor(inputs).to(device))
                    # CTCLoss wants (T, N, C).
                    out = out.transpose(1, 0)
                # Output lengths shrink by the model's downsampling factor.
                output_lengths = [l // scaling_factor for l in input_lengths]
                with et.timed_action('Loss and BP time'):
                    loss = criterion(out, targets.to(device),
                                     torch.IntTensor(output_lengths), torch.IntTensor(target_lengths))
                    optimizer.zero_grad()
                    loss.mean().backward()
                    optimizer.step()
                total_loss += loss.mean().item()
            log_loss_to_tensorboard(epoch, total_loss / batch_count)
            evaluate(model, eval_dataset, greedy_decoder, epoch, kwargs)
        # Periodic checkpoint (skipping epoch 0).
        if epoch != 0 and epoch % kwargs['epochs_per_save'] == 0:
            save_epoch_model(model, epoch, kwargs['model_dir'])
    if kwargs['model_dir']:
        save_model(model, kwargs['model_dir'] + '/final.pth')
    print('Finished at %s' % time.asctime())
def an4_prepare(self):
    """Build AN4 train/val datasets, samplers, data loaders and a greedy decoder.

    Populates self.trainset, self.testset, self.train_sampler, self.trainloader,
    self.testloader and self.decoder. Uses a distributed sampler when running
    with more than one worker.
    """
    from audio_data.data_loader import AudioDataLoader, SpectrogramDataset, BucketingSampler, DistributedBucketingSampler
    from decoder import GreedyDecoder
    audio_conf = self.ext['audio_conf']
    train_manifest = os.path.join(self.data_dir, 'an4_train_manifest.csv')
    val_manifest = os.path.join(self.data_dir, 'an4_val_manifest.csv')
    # FIX: removed the dead `labels = self.ext['labels']` read — it was
    # unconditionally overwritten by the labels.json load below.
    with open('labels.json') as label_file:
        labels = str(''.join(json.load(label_file)))
    self.trainset = SpectrogramDataset(audio_conf=audio_conf, manifest_filepath=train_manifest,
                                       labels=labels, normalize=True, augment=True)
    self.testset = SpectrogramDataset(audio_conf=audio_conf, manifest_filepath=val_manifest,
                                      labels=labels, normalize=True, augment=False)
    if self.nworkers > 1:
        self.train_sampler = DistributedBucketingSampler(self.trainset, batch_size=self.batch_size,
                                                         num_replicas=self.nworkers, rank=self.rank)
    else:
        self.train_sampler = BucketingSampler(self.trainset, batch_size=self.batch_size)
    self.trainloader = AudioDataLoader(self.trainset, num_workers=4, batch_sampler=self.train_sampler)
    self.testloader = AudioDataLoader(self.testset, batch_size=self.batch_size, num_workers=4)
    self.decoder = GreedyDecoder(labels)
class SpeechTranscriber:
    """Thin inference wrapper around a trained DeepSpeech model."""

    def __init__(self, model_path):
        """Load the model and build the decoder and spectrogram parser.

        :param model_path: path to a serialized DeepSpeech checkpoint
        :raises FileNotFoundError: if model_path does not exist
        """
        # FIX: explicit exception instead of assert (stripped under -O).
        if not os.path.exists(model_path):
            raise FileNotFoundError("Cannot find model here {}".format(model_path))
        self.deep_speech_model = DeepSpeech.load_model(model_path)
        self.deep_speech_model.eval()  # inference mode
        labels = DeepSpeech.get_labels(self.deep_speech_model)
        self.audio_conf = DeepSpeech.get_audio_conf(self.deep_speech_model)
        self.decoder = GreedyDecoder(labels)
        self.parser = SpectrogramParser(self.audio_conf, normalize=True)

    def transcribe(self, audio_file):
        """Transcribe one audio file; returns the decoder's output.

        :param audio_file: path to the audio file to transcribe
        """
        spect = self.parser.parse_audio(audio_file).contiguous()
        spect = spect.view(1, 1, spect.size(0), spect.size(1))
        # FIX: Variable(..., volatile=True) is the removed pre-0.4 autograd
        # API; torch.no_grad() is the supported inference-mode equivalent.
        with torch.no_grad():
            out = self.deep_speech_model(spect)
        out = out.transpose(0, 1)  # TxNxH
        decoded_output = self.decoder.decode(out.data)
        return decoded_output
def decode_dataset(logits, test_dataset, batch_size, lm_alpha, lm_beta, mesh_x, mesh_y, labels, grid_index):
    """Beam-decode pre-computed logits for one (lm_alpha, lm_beta) grid point.

    Returns [grid_index, mesh_x, mesh_y, lm_alpha, lm_beta, avg_wer, avg_cer]
    with error rates averaged over the whole dataset.
    """
    print("Beginning decode for {}, {}".format(lm_alpha, lm_beta))
    test_loader = AudioDataLoader(test_dataset, batch_size=batch_size, num_workers=0)
    blank = labels.index('_')
    target_decoder = GreedyDecoder(labels, blank_index=blank)
    decoder = BeamCTCDecoder(labels, beam_width=args.beam_width,
                             cutoff_top_n=args.cutoff_top_n, blank_index=blank,
                             lm_path=args.lm_path, alpha=lm_alpha, beta=lm_beta,
                             num_processes=1)
    total_cer, total_wer = 0, 0
    for batch_idx, batch in enumerate(test_loader):
        inputs, targets, input_percentages, target_sizes = batch
        # Re-split the flattened target tensor into per-utterance references.
        refs, offset = [], 0
        for size in target_sizes:
            refs.append(targets[offset:offset + size])
            offset += size
        # Logits were computed ahead of time: (probs, lengths) per batch.
        out = torch.from_numpy(logits[batch_idx][0])
        sizes = torch.from_numpy(logits[batch_idx][1])
        decoded_output, _ = decoder.decode(out, sizes)
        target_strings = target_decoder.convert_to_strings(refs)
        batch_wer, batch_cer = 0, 0
        for k in range(len(target_strings)):
            transcript, reference = decoded_output[k][0], target_strings[k][0]
            batch_wer += decoder.wer(transcript, reference) / float(len(reference.split()))
            batch_cer += decoder.cer(transcript, reference) / float(len(reference))
        total_cer += batch_cer
        total_wer += batch_wer
    n_utts = len(test_loader.dataset)
    wer = total_wer / n_utts
    cer = total_cer / n_utts
    return [grid_index, mesh_x, mesh_y, lm_alpha, lm_beta, wer, cer]
def test_beam_is_not_greedy():
    '''
    Example from https://towardsdatascience.com/beam-search-decoding-in-ctc-trained-neural-networks-51889a3d85a7
    Shows that beam search can find a path that greedy decoding can not.
    '''
    vocab = ['_', 'A', 'B', ' ']
    probs = np.array([[0.8, 0.2, 0, 0],
                      [0.6, 0.4, 0, 0]])
    # Beam search sums the mass of every alignment that collapses to 'A'.
    beam_result = prefix_beam_search(probs, vocab, blank_index=0, return_weights=True)
    assert beam_result == ('A', 0.52)
    # Greedy takes the per-frame argmax: blank, blank -> empty transcript.
    greedy_decoder = GreedyDecoder(vocab, blank_index=0)
    greedy_result = greedy_decoder.decode(torch.FloatTensor(probs).unsqueeze(0), sizes=None)
    assert greedy_result == ['']
def get_decoder(decoder_type, lm_path, labels, beam_search_params):
    """Build the requested decoder; unknown types fall back to greedy.

    :param decoder_type: 'beam' or 'greedy' (anything else warns and uses greedy)
    """
    if decoder_type == 'beam':
        return PrefixBeamSearchLMDecoder(lm_path, labels, **beam_search_params)
    if decoder_type != 'greedy':
        print('Decoder type not recognized, defaulting to greedy')
    return GreedyDecoder(labels)
def main(model_path, confs):
    """Evaluate a MultiTask model on every manifest in confs['testing_manifests']
    and print the results (plain text or a LaTeX table row).

    The criterion depends on which heads the model has: accent classification
    only, transcription only, or both.
    """
    model, __ = MultiTask.load_model(model_path)
    if confs['cuda']:
        model = model.cuda()
    if not model._meta['use_transcripts_out']:  # only accent classification
        criterion = nn.CrossEntropyLoss()
    elif not model._meta['use_accents_out']:  # only text recognition
        criterion = CTCLoss()
    else:  # both tasks
        criterion = (CTCLoss(), nn.CrossEntropyLoss())
    # Results
    results = {}
    for manifest, lm in confs['testing_manifests']:
        eprint(f'\n### Testing {manifest.split("/")[-1]} for model {Path(model_path).stem.split("_")[0]}')
        # Decoder (only needed when the model emits transcripts).
        if model._meta['use_transcripts_out']:
            # NOTE(review): cutoff_prob is fed confs['decoder_cutoff_top_n'] —
            # looks like a copy-paste slip for 'decoder_cutoff_prob'; confirm
            # against the config schema before changing.
            decoder = BeamCTCDecoder(confs['labels'], lm_path=lm,
                                     alpha=confs['decoder_alpha'], beta=confs['decoder_beta'],
                                     cutoff_top_n=confs['decoder_cutoff_top_n'],
                                     cutoff_prob=confs['decoder_cutoff_top_n'],
                                     beam_width=confs['decoder_beam_width'],
                                     num_processes=confs['num_workers'])
            target_decoder = GreedyDecoder(confs['labels'])
        else:
            decoder, target_decoder = None, None
        # Test
        results[manifest.split('/')[-1]] = result_for_manifest(model, criterion, manifest,
                                                               decoder, target_decoder,
                                                               confs['batch_size'], confs['num_workers'])
    if not PRINT_LATEX_TABLE:
        print(f'Model: {model_path.split("/")[-1]}')
        for name, res in results.items():
            print(f'\nResults for {name}:')
            print('; '.join([f'{k}: {v:.3f}' for k, v in res.items()]))
    else:
        # Header row: model name plus each manifest name (extension stripped).
        print(' & '.join(['model'] + list([k[:-4] for k in results.keys()])))
        val_dict = {}
        for k in list(results.values())[0].keys():
            val_dict[k] = []
        for res in results.values():
            [val_dict[k].append(f'{v:.1f}') for k, v in res.items()]
        for val in val_dict.values():
            print(' & '.join([Path(model_path).stem.split('_')[0]] + val) + r' \\')
def __init__(self, extractor, needs=None, store=False, key=None, **extractor_args):
    """Feature wired to encode text with TextEncoder and decode greedily."""
    super(TextFeature, self).__init__(
        extractor,
        needs=needs,
        store=store,
        encoder=TextEncoder,
        decoder=GreedyDecoder(),
        key=key,
        **extractor_args)
def __init__(self, purge=True):
    """Set up face detection, the talk-prediction RNN, and the stream queues.

    :param purge: stored on the instance; semantics handled elsewhere.
    """
    self.facedetector = dlib.get_frontal_face_detector()
    self.facepredictor = dlib.shape_predictor(talkPredictor.FACE_MODEL_PATH)
    self.start_time = int(round(time.time() * 1000))  # epoch milliseconds
    self.log = pd.DataFrame(data=[], columns=['ts', 'key', 'value'])
    # NOTE(review): DataFrame.set_index is NOT in-place by default, so this
    # call discards its result and self.log keeps its default RangeIndex —
    # confirm whether `inplace=True` (or reassignment) was intended.
    self.log.set_index(['ts', 'key'])
    self.purge = purge
    self.procdevice = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    self.talkmodel = Model.load_model(talkPredictor.RNN_MODEL_PATH)  # , map_location='cpu'
    self.talkmodel.eval()  # inference mode
    self.talklabels = Model.get_labels(self.talkmodel)
    # '_' acts as the CTC blank symbol for greedy decoding.
    self.talkdecoder = GreedyDecoder(self.talklabels, blank_index=self.talklabels.index('_'))
    self.audio_conf = Model.get_audio_conf(self.talkmodel)
    self.samplerate = 16000  # Hz
    self.framerate = None  # unknown until the video stream is attached
    # Buffers for incoming frames/samples and outgoing predictions.
    self.video_queue = {}
    self.audio_queue = []
    self.pred_queue = []
def __init__(self, config_path='config.ini'):
    """Build a Wav2Letter network and decoder from an INI config.

    :param config_path: path to the INI file with a [Wav2Letter] section
    :raises Exception: if config_path is None
    """
    if config_path is None:
        raise Exception('Path to config file is None')
    self.config = configparser.ConfigParser()
    self.config.read(config_path, encoding='UTF-8')
    # [1:-1] strips the surrounding quote/bracket characters stored in the INI value.
    self.labels = self.config['Wav2Letter']['labels'][1:-1]
    self.sample_rate = int(self.config['Wav2Letter']['sample_rate'])
    self.window_size = float(self.config['Wav2Letter']['window_size'])
    self.window_stride = float(self.config['Wav2Letter']['window_stride'])
    self.greedy = int(self.config['Wav2Letter']['greedy'])
    self.cpu = int(self.config['Wav2Letter']['cpu'])
    if self.cpu:
        # Must switch the PuzzleLib backend BEFORE importing model modules below.
        from PuzzleLib import Config
        Config.backend = Config.Backend.cpu
    from PuzzleLib.Models.Nets.WaveToLetter import loadW2L
    from PuzzleLib.Modules import MoveAxis
    # FFT size from the analysis window; the net consumes 1 + nfft//2 bins.
    nfft = int(self.sample_rate * self.window_size)
    self.w2l = loadW2L(modelpath=self.config['Wav2Letter']['model_path'],
                       inmaps=(1 + nfft // 2), nlabels=len(self.labels))
    # Move the time axis first so the decoder sees (T, ...) output.
    self.w2l.append(MoveAxis(src=2, dst=0))
    if not self.cpu:
        # GPU path runs in half precision.
        self.w2l.calcMode(np.float16)
    self.w2l.evalMode()
    if not self.greedy:
        from decoder import TrieDecoder
        lexicon = self.config['Wav2Letter']['lexicon']
        tokens = self.config['Wav2Letter']['tokens']
        lm_path = self.config['Wav2Letter']['lm_path']
        beam_threshold = float(self.config['Wav2Letter']['beam_threshold'])
        self.decoder = TrieDecoder(lexicon, tokens, lm_path, beam_threshold)
    else:
        self.decoder = GreedyDecoder(self.labels)
def build():
    """Initialise the module-global model, decoder and spectrogram parser for a
    CPU-only transcription server."""
    global model, spect_parser, decoder, device
    logging.getLogger().setLevel(logging.DEBUG)
    logging.info('Setting up server...')
    torch.set_grad_enabled(False)  # inference only
    device = torch.device("cpu")
    # TODO(review): hardcoded checkpoint path — consider making it configurable.
    model = load_model(device, "/workspace/models/deepspeech_final.pth", False)
    # Greedy decoding; '_' is the CTC blank symbol.
    decoder = GreedyDecoder(model.labels, blank_index=model.labels.index('_'))
    spect_parser = SpectrogramParser(model.audio_conf, normalize=True)
    logging.info('Server initialised')
def decode_dataset(logits, test_dataset, batch_size, lm_alpha, lm_beta, mesh_x, mesh_y, labels, grid_index):
    """Score one (lm_alpha, lm_beta) grid point by beam-decoding cached logits.

    Returns [grid_index, mesh_x, mesh_y, lm_alpha, lm_beta, wer, cer] where the
    error rates are dataset-wide averages.
    """
    print("Beginning decode for {}, {}".format(lm_alpha, lm_beta))
    test_loader = AudioDataLoader(test_dataset, batch_size=batch_size, num_workers=0)
    blank_idx = labels.index('_')
    target_decoder = GreedyDecoder(labels, blank_index=blank_idx)
    decoder = BeamCTCDecoder(labels,
                             beam_width=args.beam_width,
                             cutoff_top_n=args.cutoff_top_n,
                             blank_index=blank_idx,
                             lm_path=args.lm_path,
                             alpha=lm_alpha, beta=lm_beta,
                             num_processes=1)
    total_cer, total_wer = 0, 0
    for batch_no, batch in enumerate(test_loader):
        inputs, targets, input_percentages, target_sizes = batch
        # Unflatten the concatenated target tensor into per-utterance pieces.
        pieces, cursor = [], 0
        for size in target_sizes:
            pieces.append(targets[cursor:cursor + size])
            cursor += size
        # Cached logits: (probabilities, sequence lengths) for this batch.
        out = torch.from_numpy(logits[batch_no][0])
        sizes = torch.from_numpy(logits[batch_no][1])
        decoded_output, _ = decoder.decode(out, sizes)
        target_strings = target_decoder.convert_to_strings(pieces)
        wer, cer = 0, 0
        for utt in range(len(target_strings)):
            transcript, reference = decoded_output[utt][0], target_strings[utt][0]
            wer += decoder.wer(transcript, reference) / float(len(reference.split()))
            cer += decoder.cer(transcript, reference) / float(len(reference))
        total_cer += cer
        total_wer += wer
    dataset_size = len(test_loader.dataset)
    return [grid_index, mesh_x, mesh_y, lm_alpha, lm_beta,
            total_wer / dataset_size, total_cer / dataset_size]
def main():
    """Entry point: parse CLI args, load model + decoder into module globals,
    then run the websocket transcription server forever (blocking)."""
    import argparse
    # Websocket callbacks elsewhere in the module read these globals.
    global model, spect_parser, decoder, args, device, decompressor
    parser = argparse.ArgumentParser(description='DeepSpeech transcription server')
    parser.add_argument('--host', type=str, default='0.0.0.0', help='Host to be used by the server')
    parser.add_argument('--port', type=int, default=8888, help='Port to be used by the server')
    parser = add_inference_args(parser)
    parser = add_decoder_args(parser)
    args = parser.parse_args()
    logging.getLogger().setLevel(logging.DEBUG)
    logging.info('Setting up server...')
    # Inference only — disable autograd globally.
    torch.set_grad_enabled(False)
    device = torch.device("cuda" if args.cuda else "cpu")
    model = load_model(device, args.model_path, args.half)
    if args.decoder == "beam":
        # Imported lazily so the LM-backed decoder is only required when used.
        from decoder import BeamCTCDecoder
        decoder = BeamCTCDecoder(model.labels, lm_path=args.lm_path, alpha=args.alpha, beta=args.beta,
                                 cutoff_top_n=args.cutoff_top_n, cutoff_prob=args.cutoff_prob,
                                 beam_width=args.beam_width, num_processes=args.lm_workers)
    else:
        # Greedy decoding; '_' is the CTC blank symbol.
        decoder = GreedyDecoder(model.labels, blank_index=model.labels.index('_'))
    spect_parser = OnlineSpectrogramParser(model.audio_conf, normalize=True)
    logging.info('Server initialised')
    # Clients send LZString-compressed payloads; decompressed in the handlers.
    decompressor = LZString()
    server = WebsocketServer(host=args.host, port=args.port)
    server.set_fn_new_client(new_client)
    server.set_fn_client_left(client_left)
    server.set_fn_message_received(message_received)
    server.run_forever()
def main():
    """Entry point: parse CLI args, load model + decoder into module globals,
    then start the Flask transcription server (blocking)."""
    import argparse
    # Request handlers elsewhere in the module read these globals.
    global model, spect_parser, decoder, args, device
    parser = argparse.ArgumentParser(description="DeepSpeech transcription server")
    parser.add_argument("--host", type=str, default="0.0.0.0", help="Host to be used by the server")
    parser.add_argument("--port", type=int, default=8888, help="Port to be used by the server")
    parser = add_inference_args(parser)
    parser = add_decoder_args(parser)
    args = parser.parse_args()
    logging.getLogger().setLevel(logging.DEBUG)
    logging.info("Setting up server...")
    # Inference only — disable autograd globally.
    torch.set_grad_enabled(False)
    device = torch.device("cuda" if args.cuda else "cpu")
    model = load_model(device, args.model_path, args.cuda)
    if args.decoder == "beam":
        # Imported lazily so the LM-backed decoder is only required when used.
        from decoder import BeamCTCDecoder
        decoder = BeamCTCDecoder(
            model.labels,
            lm_path=args.lm_path,
            alpha=args.alpha,
            beta=args.beta,
            cutoff_top_n=args.cutoff_top_n,
            cutoff_prob=args.cutoff_prob,
            beam_width=args.beam_width,
            num_processes=args.lm_workers,
        )
    else:
        # Greedy decoding; '_' is the CTC blank symbol.
        decoder = GreedyDecoder(model.labels, blank_index=model.labels.index("_"))
    spect_parser = SpectrogramParser(model.audio_conf, normalize=True)
    logging.info("Server initialised")
    app.run(host=args.host, port=args.port, debug=True, use_reloader=False)
def __init__(self, *args, **kwargs):
    """Test fixture: load the pretrained TED DeepSpeech model on CPU with a
    greedy decoder and a normalized spectrogram parser.

    FIX: removed the commented-out beam-decoder branch (dead code).
    """
    super(Tester, self).__init__(*args, **kwargs)
    device = 'cpu'
    # TODO(review): hardcoded absolute, user-specific path — make configurable.
    model_path = '/home/chris/git/deepspeech.pytorch/models/ted_pretrained_v2.pth'
    half = False
    model = load_model(device, model_path, half)
    # Greedy decoding; '_' is the CTC blank symbol.
    decoder = GreedyDecoder(model.labels, blank_index=model.labels.index('_'))
    self.half = half
    self.device = device
    self.decoder = decoder
    self.model = model
    self.spect_parser = SpectrogramParser(model.audio_conf, normalize=True)
# NOTE(review): fragment — this chunk starts and ends mid-statement (the
# enclosing training-script body lies outside this view); kept byte-identical.
noise_prob=args.noise_prob, noise_levels=(args.noise_min, args.noise_max)) rnn_type = args.rnn_type.lower() assert rnn_type in supported_rnns, "rnn_type should be either lstm, rnn or gru" model = DeepSpeech(rnn_hidden_size=args.hidden_size, nb_layers=args.hidden_layers, labels=labels, rnn_type=supported_rnns[rnn_type], audio_conf=audio_conf, bidirectional=args.bidirectional) parameters = model.parameters() optimizer = torch.optim.SGD(parameters, lr=args.lr, momentum=args.momentum, nesterov=True) criterion = CTCLoss() decoder = GreedyDecoder(labels) train_dataset = SpectrogramDataset(audio_conf=audio_conf, manifest_filepath=args.train_manifest, labels=labels, normalize=True, augment=args.augment) test_dataset = SpectrogramDataset(audio_conf=audio_conf, manifest_filepath=args.val_manifest, labels=labels, normalize=True, augment=False) if not args.distributed: train_sampler = BucketingSampler(train_dataset, batch_size=args.batch_size) else: train_sampler = DistributedBucketingSampler(train_dataset, batch_size=args.batch_size, num_replicas=args.world_size, rank=args.rank) train_loader = AudioDataLoader(train_dataset, num_workers=args.num_workers, batch_sampler=train_sampler) test_loader = AudioDataLoader(test_dataset, batch_size=args.batch_size, num_workers=args.num_workers) if (not args.no_shuffle and start_epoch != 0) or args.no_sorta_grad:
# NOTE(review): fragment — this chunk starts and ends mid-statement (the
# enclosing training-script body lies outside this view); kept byte-identical.
window_stride=args.window_stride, window=args.window, noise_dir=args.noise_dir, noise_prob=args.noise_prob, noise_levels=(args.noise_min, args.noise_max)) rnn_type = args.rnn_type.lower() assert rnn_type in supported_rnns, "rnn_type should be either lstm, rnn or gru" model = DeepSpeech(rnn_hidden_size=args.hidden_size, nb_layers=args.hidden_layers, labels=labels, rnn_type=supported_rnns[rnn_type], audio_conf=audio_conf, bidirectional=args.bidirectional) decoder = GreedyDecoder(labels) train_dataset = SpectrogramDataset(audio_conf=audio_conf, manifest_filepath=args.train_manifest, labels=labels, normalize=True, augment=args.augment) test_dataset = SpectrogramDataset(audio_conf=audio_conf, manifest_filepath=args.val_manifest, labels=labels, normalize=True, augment=False) if not args.distributed: train_sampler = BucketingSampler(train_dataset, batch_size=args.batch_size) else: train_sampler = DistributedBucketingSampler(
def main():
    """Train a TIMIT LSTM-CTC acoustic model driven by an INI config file.

    Reads all hyper-parameters from args.conf, trains with a dev-loss-driven
    learning-rate decay schedule, plots curves to a live Visdom server, and
    saves the best checkpoint (path written back into the config).
    NOTE: Python 2 era code (``long``, ``ConfigParser`` module, bare excepts).
    """
    args = parser.parse_args()
    cf = ConfigParser.ConfigParser()
    try:
        cf.read(args.conf)
    except:
        print("conf file not exists")
        sys.exit(1)
    USE_CUDA = cf.getboolean('Training', 'use_cuda')
    try:
        seed = long(cf.get('Training', 'seed'))
    except:
        # No seed stored yet: draw one and persist it back to the config.
        seed = torch.cuda.initial_seed()
        cf.set('Training', 'seed', seed)
        cf.write(open(args.conf, 'w'))
    torch.manual_seed(seed)
    if USE_CUDA:
        torch.cuda.manual_seed(seed)
    log_dir = cf.get('Data', 'log_dir')
    log_file = os.path.join(log_dir, cf.get('Data', 'log_file'))
    logger = init_logger(log_file)
    # Define Model
    rnn_input_size = cf.getint('Model', 'rnn_input_size')
    rnn_hidden_size = cf.getint('Model', 'rnn_hidden_size')
    rnn_layers = cf.getint('Model', 'rnn_layers')
    rnn_type = RNN[cf.get('Model', 'rnn_type')]
    bidirectional = cf.getboolean('Model', 'bidirectional')
    batch_norm = cf.getboolean('Model', 'batch_norm')
    rnn_param = {"rnn_input_size": rnn_input_size, "rnn_hidden_size": rnn_hidden_size,
                 "rnn_layers": rnn_layers, "rnn_type": rnn_type,
                 "bidirectional": bidirectional, "batch_norm": batch_norm}
    num_class = cf.getint('Model', 'num_class')
    drop_out = cf.getfloat('Model', 'drop_out')
    model = CTC_Model(rnn_param=rnn_param, num_class=num_class, drop_out=drop_out)
    print("Model Structure:")
    logger.info("Model Structure:")
    for idx, m in enumerate(model.children()):
        print(idx, m)
        logger.info(str(idx) + "->" + str(m))
    data_dir = cf.get('Data', 'data_dir')
    batch_size = cf.getint("Training", 'batch_size')
    # Data Loader
    train_dataset = SpeechDataset(data_dir, data_set='train')
    dev_dataset = SpeechDataset(data_dir, data_set="dev")
    train_loader = SpeechDataLoader(train_dataset, batch_size=batch_size, shuffle=True,
                                    num_workers=4, pin_memory=False)
    dev_loader = SpeechDataLoader(dev_dataset, batch_size=batch_size, shuffle=False,
                                  num_workers=4, pin_memory=False)
    # ensure the feats is equal to the rnn_input_Size
    assert train_dataset.n_feats == rnn_input_size
    # decoder for dev set
    decoder = GreedyDecoder(int2char, space_idx=len(int2char) - 1, blank_index=0)
    # Training
    init_lr = cf.getfloat('Training', 'init_lr')
    num_epoches = cf.getint('Training', 'num_epoches')
    end_adjust_acc = cf.getfloat('Training', 'end_adjust_acc')
    decay = cf.getfloat("Training", 'lr_decay')
    weight_decay = cf.getfloat("Training", 'weight_decay')
    params = {'num_epoches': num_epoches, 'end_adjust_acc': end_adjust_acc, 'seed': seed,
              'decay': decay, 'learning_rate': init_lr, 'weight_decay': weight_decay,
              'batch_size': batch_size, 'n_feats': train_dataset.n_feats}
    print(params)
    if USE_CUDA:
        model = model.cuda()
    loss_fn = CTCLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=init_lr, weight_decay=weight_decay)
    # visualization for training
    from visdom import Visdom
    viz = Visdom()
    title = 'TIMIT LSTM_CTC Acoustic Model'
    opts = [dict(title=title + " Loss", ylabel='Loss', xlabel='Epoch'),
            dict(title=title + " Loss on Dev", ylabel='DEV Loss', xlabel='Epoch'),
            dict(title=title + ' CER on DEV', ylabel='DEV CER', xlabel='Epoch')]
    viz_window = [None, None, None]
    count = 0
    learning_rate = init_lr
    loss_best = 1000
    loss_best_true = 1000
    adjust_rate_flag = False
    stop_train = False
    adjust_time = 0
    acc_best = 0
    start_time = time.time()
    loss_results = []
    dev_loss_results = []
    dev_cer_results = []
    while not stop_train:
        if count >= num_epoches:
            break
        count += 1
        if adjust_rate_flag:
            # Apply the pending learning-rate decay requested last epoch.
            learning_rate *= decay
            adjust_rate_flag = False
            for param in optimizer.param_groups:
                param['lr'] *= decay
        print("Start training epoch: %d, learning_rate: %.5f" % (count, learning_rate))
        logger.info("Start training epoch: %d, learning_rate: %.5f" % (count, learning_rate))
        loss = train(model, train_loader, loss_fn, optimizer, logger,
                     print_every=20, USE_CUDA=USE_CUDA)
        loss_results.append(loss)
        acc, dev_loss = dev(model, dev_loader, loss_fn, decoder, logger, USE_CUDA=USE_CUDA)
        print("loss on dev set is %.4f" % dev_loss)
        logger.info("loss on dev set is %.4f" % dev_loss)
        dev_loss_results.append(dev_loss)
        dev_cer_results.append(acc)
        # adjust learning rate by dev_loss
        # adjust_rate_count: dev loss is considered stable once it stays inside
        # the end_adjust_acc band for this many consecutive epochs.
        if dev_loss < (loss_best - end_adjust_acc):
            # Clear improvement: reset the stability counter, keep this model.
            loss_best = dev_loss
            loss_best_true = dev_loss
            adjust_rate_count = 0
            acc_best = acc
            best_model_state = copy.deepcopy(model.state_dict())
            best_op_state = copy.deepcopy(optimizer.state_dict())
        elif (dev_loss < loss_best + end_adjust_acc):
            # Within the stability band: count it, still track the true best.
            adjust_rate_count += 1
            if dev_loss < loss_best and dev_loss < loss_best_true:
                loss_best_true = dev_loss
                acc_best = acc
                best_model_state = copy.deepcopy(model.state_dict())
                best_op_state = copy.deepcopy(optimizer.state_dict())
        else:
            # Divergence: force a learning-rate adjustment immediately.
            adjust_rate_count = 10
        print("adjust_rate_count: %d" % adjust_rate_count)
        print('adjust_time: %d' % adjust_time)
        logger.info("adjust_rate_count: %d" % adjust_rate_count)
        logger.info('adjust_time: %d' % adjust_time)
        if adjust_rate_count == 10:
            # Schedule a decay for next epoch and roll back to the best model.
            adjust_rate_flag = True
            adjust_time += 1
            adjust_rate_count = 0
            if loss_best > loss_best_true:
                loss_best = loss_best_true
            model.load_state_dict(best_model_state)
            optimizer.load_state_dict(best_op_state)
        if adjust_time == 8:
            # Stop after eight decay rounds.
            stop_train = True
        time_used = (time.time() - start_time) / 60
        print("epoch %d done, dev acc is: %.4f, time_used: %.4f minutes" % (count, acc, time_used))
        logger.info("epoch %d done, dev acc is: %.4f, time_used: %.4f minutes" % (count, acc, time_used))
        # Refresh the three Visdom curves (train loss, dev loss, dev CER).
        x_axis = range(count)
        y_axis = [loss_results[0:count], dev_loss_results[0:count], dev_cer_results[0:count]]
        for x in range(len(viz_window)):
            if viz_window[x] is None:
                viz_window[x] = viz.line(X=np.array(x_axis), Y=np.array(y_axis[x]), opts=opts[x],)
            else:
                viz.line(X=np.array(x_axis), Y=np.array(y_axis[x]), win=viz_window[x], update='replace',)
    print("End training, best dev loss is: %.4f, acc is: %.4f" % (loss_best_true, acc_best))
    logger.info("End training, best dev loss acc is: %.4f, acc is: %.4f" % (loss_best_true, acc_best))
    # Restore the best weights and persist the checkpoint; the path is written
    # back into the config file for later inference runs.
    model.load_state_dict(best_model_state)
    optimizer.load_state_dict(best_op_state)
    best_path = os.path.join(log_dir, 'best_model' + '_dev' + str(acc_best) + '.pkl')
    cf.set('Model', 'model_file', best_path)
    cf.write(open(args.conf, 'w'))
    params['epoch'] = count
    torch.save(CTC_Model.save_package(model, optimizer=optimizer, epoch=params,
                                      loss_results=loss_results,
                                      dev_loss_results=dev_loss_results,
                                      dev_cer_results=dev_cer_results), best_path)
rnn_type = args.rnn_type.lower() assert rnn_type in supported_rnns, "rnn_type should be either lstm, rnn or gru" model = DeepSpeech(rnn_hidden_size=args.hidden_size, nb_layers=args.hidden_layers, labels=labels, rnn_type=supported_rnns[rnn_type], audio_conf=audio_conf, bidirectional=args.bidirectional) parameters = model.parameters() optimizer = torch.optim.SGD(parameters, lr=args.lr, momentum=args.momentum, nesterov=True) criterion = CTCLoss() decoder = GreedyDecoder(labels) train_dataset = SpectrogramDataset(audio_conf=audio_conf, manifest_filepath=args.train_manifest, labels=labels, normalize=True, augment=args.augment) test_dataset = SpectrogramDataset(audio_conf=audio_conf, manifest_filepath=args.val_manifest, labels=labels, normalize=True, augment=False) if not args.distributed: train_sampler = BucketingSampler(train_dataset, batch_size=args.batch_size) else: train_sampler = DistributedBucketingSampler(
class Trainer(object):
    """The most basic model: a single speech-enhancement network ``G``.

    ``G`` (a stacked BRNN) is trained with a masked L1 loss against clean
    features; it is periodically evaluated by passing its output through a
    frozen, pre-trained DeepSpeech ASR model and greedy-decoding the result
    to obtain WER/CER.  Checkpoints are rolling (previous file deleted), with
    a separate copy kept for the lowest validation WER seen so far.
    """

    def __init__(self, config, data_loader=None):
        """Store config, build G + frozen ASR, and open the train log.

        Args:
            config: experiment namespace (lr, betas, paths, gpu index, ...).
            data_loader: batch provider; also supplies the label set for the
                greedy decoder.  Needed for VAE.
        """
        self.config = config
        self.data_loader = data_loader  # needed for VAE
        self.lr = config.lr
        self.beta1 = config.beta1
        self.beta2 = config.beta2
        self.optimizer = config.optimizer
        self.batch_size = config.batch_size
        self.diffLoss = L1Loss_mask()  # custom module
        self.valmin_iter = 0  # iteration index of the current best-val model
        self.model_dir = 'logs/' + str(config.expnum)
        self.savename_G = ''  # path of the latest rolling checkpoint
        self.decoder = GreedyDecoder(data_loader.labels)
        self.kt = 0  # used for Proportional Control Theory in BEGAN, initialized as 0
        self.lb = 0.001
        self.conv_measure = 0  # convergence measure
        # Running averages for loss / WER / CER on train subset & validation.
        self.dce_tr = AverageMeter()
        self.dce_val = AverageMeter()
        self.wer_tr = AverageMeter()
        self.cer_tr = AverageMeter()
        self.wer_val = AverageMeter()
        self.cer_val = AverageMeter()
        self.build_model()
        self.G.loss_stop = 100000  # best validation WER so far (init: +inf-ish)
        #self.get_weight_statistic()
        if self.config.gpu >= 0:
            self.G.cuda()
            self.ASR.cuda()
        if len(self.config.load_path) > 0:
            self.load_model()
        if config.mode == 'train':
            self.logFile = open(self.model_dir + '/log.txt', 'w')

    def zero_grad_all(self):
        """Clear gradients of every trainable network (only G here)."""
        self.G.zero_grad()

    def build_model(self):
        """Instantiate the enhancement BRNN and load the frozen ASR model."""
        print('initialize enhancement model')
        self.G = stackedBRNN(I=self.config.nFeat,
                             H=self.config.rnn_size,
                             L=self.config.rnn_layers,
                             rnn_type=supported_rnns[self.config.rnn_type])
        print('load pre-trained ASR model')
        # map_location forces the checkpoint onto CPU; .cuda() happens later.
        package_ASR = torch.load(self.config.ASR_path,
                                 map_location=lambda storage, loc: storage)
        self.ASR = DeepSpeech.load_model_package(package_ASR)
        # Weight initialization is done inside the module

    def load_model(self):
        """Restore G from the newest (or config-selected) '_valmin' checkpoint.

        BUG FIX: the original read ``self.load_path``, an attribute that is
        never assigned anywhere in this class (``__init__`` only consults
        ``self.config.load_path``), so every call raised AttributeError.
        """
        load_path = self.config.load_path
        print("[*] Load models from {}...".format(load_path))
        postfix = '_valmin'
        paths = glob(os.path.join(load_path, 'G{}*.pth'.format(postfix)))
        paths.sort()
        if len(paths) == 0:
            print("[!] No checkpoint found in {}...".format(load_path))
            # NOTE(review): assert is stripped under -O; a raise would be safer.
            assert (0), 'checkpoint not avilable'
        # Checkpoint files are named 'G_valmin_<iter>.pth' — pull the iter out.
        idxes = [
            int(os.path.basename(path.split('.')[0].split('_')[-1]))
            for path in paths
        ]
        if self.config.start_iter < 0:
            self.config.start_iter = max(idxes)
        if (self.config.start_iter < 0):  # if still negative, raise error
            raise Exception(
                "start iter is still less than 0 --> probably try to load initial random model"
            )
        if self.config.gpu < 0:  # CPU
            map_location = lambda storage, loc: storage
        else:  # GPU
            map_location = None
        # Ver2
        print('Load models from ' + load_path + ', ITERATION = ' +
              str(self.config.start_iter))
        # load_path[:-1] strips the trailing path separator used by callers.
        self.G.load_state_dict(
            torch.load('{}/G{}_{}.pth'.format(load_path[:-1], postfix,
                                              self.config.start_iter),
                       map_location=map_location))
        print("[*] Model loaded")

    def train(self):
        """Run the main optimization loop with periodic eval + checkpointing."""
        # Setting ('iter' renamed to 'step' — the original shadowed builtins).
        optimizer_g = torch.optim.Adam(self.G.parameters(),
                                       lr=self.config.lr,
                                       betas=(self.beta1, self.beta2),
                                       amsgrad=True)
        for step in trange(self.config.start_iter, self.config.max_iter):
            # ---- one SGD step on a training batch ----
            data_list = self.data_loader.next(cl_ny='ny', type='train')
            inputs, cleans, mask = (_get_variable_nograd(data_list[0]),
                                    _get_variable_nograd(data_list[1]),
                                    _get_variable_nograd(data_list[2]))
            # forward
            outputs = self.G(inputs)
            dce, nElement = self.diffLoss(
                outputs, cleans, mask)  # already normalized inside function
            # backward
            self.zero_grad_all()
            dce.backward()
            optimizer_g.step()

            # ---- periodic console/file logging ----
            if (step + 1) % self.config.log_iter == 0:
                str_loss = "[{}/{}] (train) DCE: {:.7f}".format(
                    step, self.config.max_iter, dce.data[0])
                print(str_loss)
                self.logFile.write(str_loss + '\n')
                self.logFile.flush()

            # ---- periodic evaluation and checkpointing ----
            if (step + 1) % self.config.save_iter == 0:
                self.G.eval()
                # Measure performance on training subset
                self.dce_tr.reset()
                self.wer_tr.reset()
                self.cer_tr.reset()
                for _ in trange(0, len(self.data_loader.trsub_dl)):
                    data_list = self.data_loader.next(cl_ny='ny', type='trsub')
                    inputs, cleans, mask, targets, input_percentages, target_sizes = \
                        _get_variable_volatile(data_list[0]), \
                        _get_variable_volatile(data_list[1]), \
                        _get_variable_volatile(data_list[2]), \
                        data_list[3], data_list[4], data_list[5]
                    outputs = self.G(inputs)
                    dce, nElement = self.diffLoss(
                        outputs, cleans,
                        mask)  # already normalized inside function
                    self.dce_tr.update(dce.data[0], nElement)
                    # Greedy decoding through the frozen ASR model
                    wer, cer, nWord, nChar = self.greedy_decoding(
                        inputs, targets, input_percentages, target_sizes)
                    self.wer_tr.update(wer, nWord)
                    self.cer_tr.update(cer, nChar)
                str_loss = "[{}/{}] (training subset) DCE: {:.7f}".format(
                    step, self.config.max_iter, self.dce_tr.avg)
                print(str_loss)
                self.logFile.write(str_loss + '\n')
                str_loss = "[{}/{}] (training subset) WER: {:.7f}, CER: {:.7f}".format(
                    step, self.config.max_iter, self.wer_tr.avg * 100,
                    self.cer_tr.avg * 100)
                print(str_loss)
                self.logFile.write(str_loss + '\n')

                # Measure performance on validation data
                self.dce_val.reset()
                self.wer_val.reset()
                self.cer_val.reset()
                for _ in trange(0, len(self.data_loader.val_dl)):
                    data_list = self.data_loader.next(cl_ny='ny', type='val')
                    inputs, cleans, mask, targets, input_percentages, target_sizes = \
                        _get_variable_volatile(data_list[0]), \
                        _get_variable_volatile(data_list[1]), \
                        _get_variable_volatile(data_list[2]), \
                        data_list[3], data_list[4], data_list[5]
                    outputs = self.G(inputs)
                    dce, nElement = self.diffLoss(
                        outputs, cleans,
                        mask)  # already normalized inside function
                    self.dce_val.update(dce.data[0], nElement)
                    # Greedy decoding
                    wer, cer, nWord, nChar = self.greedy_decoding(
                        inputs, targets, input_percentages, target_sizes)
                    self.wer_val.update(wer, nWord)
                    self.cer_val.update(cer, nChar)
                str_loss = "[{}/{}] (validation) DCE: {:.7f}".format(
                    step, self.config.max_iter, self.dce_val.avg)
                print(str_loss)
                self.logFile.write(str_loss + '\n')
                str_loss = "[{}/{}] (validation) WER: {:.7f}, CER: {:.7f}".format(
                    step, self.config.max_iter, self.wer_val.avg * 100,
                    self.cer_val.avg * 100)
                print(str_loss)
                self.logFile.write(str_loss + '\n')

                self.G.train()  # end of validation
                self.logFile.flush()

                # Save model (rolling: the previous checkpoint is deleted)
                if (len(self.savename_G) > 0):  # do not remove here
                    if os.path.exists(self.savename_G):
                        os.remove(self.savename_G)  # remove previous model
                self.savename_G = '{}/G_{}.pth'.format(self.model_dir, step)
                torch.save(self.G.state_dict(), self.savename_G)

                # Keep a separate copy whenever validation WER hits a new low.
                if (self.G.loss_stop > self.wer_val.avg):
                    self.G.loss_stop = self.wer_val.avg
                    savename_G_valmin_prev = '{}/G_valmin_{}.pth'.format(
                        self.model_dir, self.valmin_iter)
                    if os.path.exists(savename_G_valmin_prev):
                        os.remove(
                            savename_G_valmin_prev)  # remove previous model
                    print('save model for this checkpoint')
                    savename_G_valmin = '{}/G_valmin_{}.pth'.format(
                        self.model_dir, step)
                    copyfile(self.savename_G, savename_G_valmin)
                    self.valmin_iter = step

    def greedy_decoding(self,
                        inputs,
                        targets,
                        input_percentages,
                        target_sizes,
                        transcript_prob=0.001):
        """Enhance ``inputs``, run the frozen ASR, and score the transcripts.

        Args:
            inputs: batch of (noisy) features fed to G.
            targets: flattened reference label sequence for the whole batch.
            input_percentages: per-utterance valid fraction of the padded
                time axis (mutated in place via ``mul_``).
            target_sizes: per-utterance lengths used to unflatten ``targets``.
            transcript_prob: probability of printing a sampled transcript.

        Returns:
            (wer, cer, total_word, total_char) where wer/cer are already
            normalized by the word/character counts returned alongside them.
        """
        # unflatten targets
        split_targets = []
        offset = 0
        for size in target_sizes:
            split_targets.append(targets[offset:offset + size])
            offset += size
        # step 1) Decoding to get wer & cer
        enhanced = self.G(inputs)
        prob = self.ASR(enhanced)
        prob = prob.transpose(0, 1)
        T = prob.size(0)
        sizes = input_percentages.mul_(int(T)).int()
        decoded_output, _ = self.decoder.decode(prob.data, sizes)
        target_strings = self.decoder.convert_to_strings(split_targets)
        we, ce, total_word, total_char = 0, 0, 0, 0
        for x in range(len(target_strings)):
            decoding, reference = decoded_output[x][0], target_strings[x][0]
            nChar = len(reference)
            nWord = len(reference.split())
            we_i = self.decoder.wer(decoding, reference)
            ce_i = self.decoder.cer(decoding, reference)
            we += we_i
            ce += ce_i
            total_word += nWord
            total_char += nChar
            # Occasionally print a sample transcript for eyeballing.
            if (random.uniform(0, 1) < transcript_prob):
                print('reference = ' + reference)
                print('decoding = ' + decoding)
                print('wer = ' + str(we_i / float(nWord)) + ', cer = ' +
                      str(ce_i / float(nChar)))
        wer = we / total_word
        # BUG FIX: CER must be normalized by the character count; the original
        # divided the character edit distance by total_word.
        cer = ce / total_char
        return wer, cer, total_word, total_char
def test():
    """Decode the evaluation set with the saved CTC model and report scores.

    Restores the checkpoint recorded in the config file, rebuilds the model,
    decodes the configured dataset with a greedy or beam decoder, prints each
    reference/hypothesis pair, and finally prints the aggregate character and
    word figures plus the total decoding time.
    """
    args = parser.parse_args()
    conf = ConfigParser.ConfigParser()
    conf.read(args.conf)
    use_cuda = conf.getboolean('Training', 'USE_CUDA')
    ckpt_path = conf.get('Model', 'model_file')
    data_dir = conf.get('Data', 'data_dir')
    beam_width = conf.getint('Decode', 'beam_width')

    # Restore the training package: weights plus architecture hyper-params.
    pkg = torch.load(ckpt_path)
    rnn_param = pkg["rnn_param"]
    num_class = pkg["num_class"]
    n_feats = pkg['epoch']['n_feats']
    drop_out = pkg['_drop_out']
    decoder_kind = conf.get('Decode', 'decoder_type')
    eval_set = conf.get('Decode', 'eval_dataset')

    test_dataset = SpeechDataset(data_dir, data_set=eval_set)
    model = CTC_Model(rnn_param=rnn_param,
                      num_class=num_class,
                      drop_out=drop_out)
    test_loader = SpeechDataLoader(test_dataset,
                                   batch_size=8,
                                   shuffle=False,
                                   num_workers=4,
                                   pin_memory=False)
    model.load_state_dict(pkg['state_dict'])
    model.eval()
    if use_cuda:
        model = model.cuda()

    if decoder_kind == 'Greedy':
        decoder = GreedyDecoder(int2char,
                                space_idx=len(int2char) - 1,
                                blank_index=0)
    else:
        decoder = BeamDecoder(int2char,
                              beam_width=beam_width,
                              blank_index=0,
                              space_idx=len(int2char) - 1)

    total_wer, total_cer = 0, 0
    start = time.time()
    for batch in test_loader:
        inputs, target, input_sizes, input_size_list, target_sizes = batch
        feats = Variable(inputs.transpose(0, 1),
                         volatile=True,
                         requires_grad=False)
        if use_cuda:
            feats = feats.cuda()
        packed = nn.utils.rnn.pack_padded_sequence(feats, input_size_list)
        probs = model(packed).data.cpu()

        decoded = decoder.decode(probs, input_size_list)
        targets = decoder._unflatten_targets(target, target_sizes)
        labels = decoder._process_strings(
            decoder._convert_to_strings(targets))

        # Print each pair and accumulate edit distances + reference sizes.
        batch_cer, batch_wer = 0, 0
        for hyp, ref in zip(decoded, labels):
            print("origin : " + ref)
            print("decoded: " + hyp)
            batch_cer += decoder.cer(hyp, ref)
            batch_wer += decoder.wer(hyp, ref)
            decoder.num_word += len(ref.split())
            decoder.num_char += len(ref)
        total_cer += batch_cer
        total_wer += batch_wer

    # NOTE(review): these formulas compute (1 - errors/total) * 100, i.e.
    # accuracy-style percentages, despite the "error rate" wording below.
    CER = (1 - float(total_cer) / decoder.num_char) * 100
    WER = (1 - float(total_wer) / decoder.num_word) * 100
    print("Character error rate on test set: %.4f" % CER)
    print("Word error rate on test set: %.4f" % WER)
    time_used = (time.time() - start) / 60.0
    print("time used for decode %d sentences: %.4f minutes." %
          (len(test_dataset), time_used))
model.eval() labels = DeepSpeech.get_labels(model) audio_conf = DeepSpeech.get_audio_conf(model) if args.decoder == "beam": from decoder import BeamCTCDecoder decoder = BeamCTCDecoder(labels, lm_path=args.lm_path, alpha=args.alpha, beta=args.beta, cutoff_top_n=args.cutoff_top_n, cutoff_prob=args.cutoff_prob, beam_width=args.beam_width, num_processes=args.lm_workers) elif args.decoder == "greedy": decoder = GreedyDecoder(labels, blank_index=labels.index('_')) else: decoder = None target_decoder = GreedyDecoder(labels, blank_index=labels.index('_')) test_dataset = SpectrogramDataset(audio_conf=audio_conf, manifest_filepath=args.test_manifest, labels=labels, normalize=True) test_loader = AudioDataLoader(test_dataset, batch_size=args.batch_size, num_workers=args.num_workers) total_cer, total_wer, num_tokens, num_chars = 0, 0, 0, 0 output_data = [] for i, (data) in tqdm(enumerate(test_loader), total=len(test_loader)): inputs, targets, input_percentages, target_sizes = data input_sizes = input_percentages.mul_(int(inputs.size(3))).int() # unflatten targets split_targets = [] offset = 0 for size in target_sizes: split_targets.append(targets[offset:offset + size])
device = torch.device("cuda" if args.cuda else "cpu") model = load_model(device, args.model_path, args.half) if args.decoder == "beam": from decoder import BeamCTCDecoder decoder = BeamCTCDecoder(model.labels, lm_path=args.lm_path, alpha=args.alpha, beta=args.beta, cutoff_top_n=args.cutoff_top_n, cutoff_prob=args.cutoff_prob, beam_width=args.beam_width, num_processes=args.lm_workers) elif args.decoder == "greedy": decoder = GreedyDecoder(model.labels, blank_index=model.labels.index('_')) else: decoder = None target_decoder = GreedyDecoder(model.labels, blank_index=model.labels.index('_')) test_dataset = SpectrogramDataset(audio_conf=model.audio_conf, manifest_filepath=args.test_manifest, labels=model.labels, normalize=True) test_loader = AudioDataLoader(test_dataset, batch_size=args.batch_size, num_workers=args.num_workers) wer, cer, output_data = evaluate(test_loader=test_loader, device=device, model=model, decoder=decoder,
labels = DeepSpeech.get_labels(model) audio_conf = DeepSpeech.get_audio_conf(model) if args.decoder == "beam": from decoder import BeamCTCDecoder decoder = BeamCTCDecoder(labels, lm_path=args.lm_path, alpha=args.alpha, beta=args.beta, cutoff_top_n=args.cutoff_top_n, cutoff_prob=args.cutoff_prob, beam_width=args.beam_width, num_processes=args.lm_workers) elif args.decoder == "greedy": decoder = GreedyDecoder(labels, blank_index=labels.index('_')) else: decoder = None target_decoder = GreedyDecoder(labels, blank_index=labels.index('_')) test_dataset = SpectrogramDataset(audio_conf=audio_conf, manifest_filepath=args.test_manifest, labels=labels, normalize=True) test_loader = AudioDataLoader(test_dataset, batch_size=args.batch_size, num_workers=args.num_workers) total_cer, total_wer, num_tokens, num_chars = 0, 0, 0, 0 output_data = [] for i, (data) in tqdm(enumerate(test_loader), total=len(test_loader)): inputs, targets, input_percentages, target_sizes = data input_sizes = input_percentages.mul_(int(inputs.size(3))).int()
def __init__(self, config, data_loader=None):
    """Store config, set up losses/meters/mel machinery, build G, open log.

    Args:
        config: experiment namespace (lr, betas, paths, STFT/mel sizes, ...).
        data_loader: batch provider; also supplies the label set for the
            greedy decoder.  Needed for VAE.
    """
    # Optional extra loss that penalizes weight variance (enabled by flag).
    if (config.w_minWvar > 0):
        config.minimize_W_var = True
        self.varLoss = var_mask()
    self.config = config
    self.data_loader = data_loader  # needed for VAE
    self.lr = config.lr
    self.beta1 = config.beta1
    self.beta2 = config.beta2
    self.optimizer = config.optimizer
    self.batch_size = config.batch_size
    self.diffLoss = L1Loss_mask()  # custom module
    # SNR is measured in the log domain when working on mel features.
    log_domain = False
    if (self.config.linear_to_mel):
        log_domain = True
    self.get_SNRout = get_SNRout(log_domain=log_domain)
    self.valmin_iter = 0  # iteration index of the current best-val model
    self.model_dir = 'models/' + str(config.expnum)
    self.log_dir = 'logs_only/' + str(config.expnum)
    self.savename_G = ''  # path of the latest rolling checkpoint
    self.decoder = GreedyDecoder(data_loader.labels)
    self.kt = 0  # used for Proportional Control Theory in BEGAN, initialized as 0
    self.lb = 0.001
    self.conv_measure = 0  # convergence measure
    # Running averages: masked-L1 loss, output SNR, and SNR improvement,
    # each tracked separately for the training subset and validation.
    self.dce_tr = AverageMeter()
    self.dce_val = AverageMeter()
    self.snrout_tr = AverageMeter()
    self.snrout_val = AverageMeter()
    self.snrimpv_tr = AverageMeter()
    self.snrimpv_val = AverageMeter()
    # Linear-magnitude <-> mel-filterbank conversion helpers (only when the
    # model operates on mel features).
    if (config.linear_to_mel):
        self.mel_basis = librosa.filters.mel(self.config.fs,
                                             self.config.nFFT,
                                             self.config.nMel)
        self.melF_to_linearFs = get_linearF_from_melF(self.mel_basis)
        self.STFT_to_LMFB = STFT_to_LMFB(self.mel_basis, window_change=False)
        self.mag2mfb = linearmag2mel(self.mel_basis)
        mel_basis_20ms = librosa.filters.mel(
            self.config.fs, 320, self.config.nMel
        )  # mel_basis will be used only for 20ms window spectrogram
        self.STFT_to_LMFB_20ms = STFT_to_LMFB(mel_basis_20ms,
                                              win_size=self.config.nFFT)
    # Number of positive-frequency bins of the nFFT-point STFT.
    self.F = int(self.config.nFFT / 2 + 1)
    # Order matters below: build G, move to GPU, then (optionally) restore.
    self.build_model()
    self.G.loss_stop = 100000  # best validation metric so far (init: +inf-ish)
    #self.get_weight_statistic()
    if self.config.gpu >= 0:
        self.G.cuda()
    if len(self.config.load_path) > 0:
        self.load_model()
    if config.mode == 'train':
        self.logFile = open(self.log_dir + '/log.txt', 'w')