def main():
    import argparse
    global model, spect_parser, decoder, args
    parser = argparse.ArgumentParser(description='DeepSpeech transcription server')
    parser.add_argument('--host', type=str, default='0.0.0.0', help='Host to be used by the server')
    parser.add_argument('--port', type=int, default=8888, help='Port to be used by the server')
    parser = add_inference_args(parser)
    parser = add_decoder_args(parser)
    args = parser.parse_args()
    logging.getLogger().setLevel(logging.DEBUG)

    logging.info('Setting up server...')
    torch.set_grad_enabled(False)
    model = DeepSpeech.load_model(args.model_path)
    if args.cuda:
        model.cuda()
    model.eval()

    labels = DeepSpeech.get_labels(model)
    audio_conf = DeepSpeech.get_audio_conf(model)

    if args.decoder == "beam":
        from decoder import BeamCTCDecoder
        decoder = BeamCTCDecoder(labels, lm_path=args.lm_path, alpha=args.alpha, beta=args.beta,
                                 cutoff_top_n=args.cutoff_top_n, cutoff_prob=args.cutoff_prob,
                                 beam_width=args.beam_width, num_processes=args.lm_workers)
    else:
        decoder = GreedyDecoder(labels, blank_index=labels.index('_'))
    spect_parser = SpectrogramParser(audio_conf, normalize=True)
    logging.info('Server initialised')
    app.run(host=args.host, port=args.port, debug=True, use_reloader=False)

def define_rnn(rnn_options, audio_conf):
    rnn = DeepSpeech(rnn_hidden_size=800,
                     nb_layers=5,
                     labels=rnn_options['labels'],
                     rnn_type=rnn_options['rnn_type'],
                     audio_conf=audio_conf,
                     bidirectional=True)
    parameters = rnn.parameters()
    return rnn, parameters

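# Hedged usage sketch for define_rnn above. The label string and audio_conf
# values are assumptions chosen to match the other snippets in this file, not
# values mandated by define_rnn itself.
rnn_options = {
    'labels': "_'ABCDEFGHIJKLMNOPQRSTUVWXYZ ",  # assumed English label set
    'rnn_type': supported_rnns['lstm'],         # assumes the supported_rnns mapping used elsewhere here
}
audio_conf = dict(sample_rate=16000, window_size=.02, window_stride=.01, window='hamming')
rnn, parameters = define_rnn(rnn_options, audio_conf)
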
def __init__(self, model_path):
    """
    :param model_path: path to a serialized DeepSpeech model
    """
    assert os.path.exists(model_path), "Cannot find model here {}".format(model_path)
    self.deep_speech_model = DeepSpeech.load_model(model_path)
    self.deep_speech_model.eval()
    labels = DeepSpeech.get_labels(self.deep_speech_model)
    self.audio_conf = DeepSpeech.get_audio_conf(self.deep_speech_model)
    self.decoder = GreedyDecoder(labels)
    self.parser = SpectrogramParser(self.audio_conf, normalize=True)

def decode_results(model, decoded_output, decoded_offsets):
    results = {
        "output": [],
        "_meta": {
            "acoustic_model": {
                "name": os.path.basename(args.model_path)
            },
            "language_model": {
                "name": os.path.basename(args.lm_path) if args.lm_path else None,
            },
            "decoder": {
                "lm": args.lm_path is not None,
                "alpha": args.alpha if args.lm_path is not None else None,
                "beta": args.beta if args.lm_path is not None else None,
                "type": args.decoder,
            }
        }
    }
    results['_meta']['acoustic_model'].update(DeepSpeech.get_meta(model))

    for b in range(len(decoded_output)):
        for pi in range(min(args.top_paths, len(decoded_output[b]))):
            result = {'transcription': decoded_output[b][pi]}
            if args.offsets:
                result['offsets'] = decoded_offsets[b][pi]
            results['output'].append(result)
    return results

def load_model(device, model_path, use_half):
    model = DeepSpeech.load_model(model_path)
    model.eval()
    model = model.to(device)
    if use_half:
        model = model.half()
    return model

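# Minimal sketch of how the loader above might be called; the checkpoint path
# is a placeholder, and half precision is tied to CUDA availability because
# fp16 inference is only useful on GPU.
if __name__ == '__main__':
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = load_model(device, "models/deepspeech_final.pth", use_half=torch.cuda.is_available())
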
def load_model(device, model_path, is_cuda):
    model = DeepSpeech.load_model(model_path)
    model.eval()
    model = model.to(device)
    if is_cuda and model.mixed_precision:
        model = convert_model_to_half(model)
    return model

def build_deepspeech_model():
    sample_rate = 16000
    window_size = .02
    window_stride = .01
    window = 'hamming'
    noise_dir = None
    noise_prob = 0.4
    noise_min = 0.0
    noise_max = 0.5
    audio_conf = dict(sample_rate=sample_rate,
                      window_size=window_size,
                      window_stride=window_stride,
                      window=window,
                      noise_dir=noise_dir,
                      noise_prob=noise_prob,
                      noise_levels=(noise_min, noise_max))
    hidden_size = 100
    hidden_layers = 5
    labels_path = 'labels.json'
    rnn_type = 'gru'
    bidirectional = True
    # labels_path was defined but unused in the original; load the label set and
    # pass it to the model, as the other snippets in this file do.
    with open(labels_path) as label_file:
        labels = str(''.join(json.load(label_file)))
    model = DeepSpeech(rnn_hidden_size=hidden_size,
                       nb_layers=hidden_layers,
                       labels=labels,
                       rnn_type=supported_rnns[rnn_type],
                       audio_conf=audio_conf,
                       bidirectional=bidirectional)
    return model

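# Hedged usage sketch: build the model above and report its size with
# DeepSpeech.get_param_size, as the training snippets below do. It assumes
# labels.json and supported_rnns are available, exactly as
# build_deepspeech_model already requires.
if __name__ == '__main__':
    model = build_deepspeech_model()
    print("Number of parameters: %d" % DeepSpeech.get_param_size(model))
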
def load_state(cls, state_path):
    print("Loading state from model %s" % state_path)
    state = torch.load(state_path, map_location=lambda storage, loc: storage)
    model = DeepSpeech.load_model_package(state)
    optim_state = state['optim_dict']
    amp_state = state['amp']
    epoch = int(state.get('epoch', 1)) - 1  # Index start at 0 for training
    training_step = state.get('iteration', None)
    if training_step is None:
        epoch += 1  # We saved model after epoch finished, start at the next epoch.
        training_step = 0
    else:
        training_step += 1
    avg_loss = int(state.get('avg_loss', 0))
    loss_results = state['loss_results']
    cer_results = state['cer_results']
    wer_results = state['wer_results']
    best_wer = state.get('best_wer')
    result_state = ResultState(loss_results=loss_results,
                               cer_results=cer_results,
                               wer_results=wer_results)
    return cls(optim_state=optim_state,
               amp_state=amp_state,
               model=model,
               result_state=result_state,
               best_wer=best_wer,
               avg_loss=avg_loss,
               epoch=epoch,
               training_step=training_step)

def get_model(params):
    if params.rnn_type == 'gru' and params.rnn_act_type != 'tanh':
        print("ERROR: GRU does not currently support activations other than tanh")
        sys.exit()
    if params.rnn_type == 'rnn' and params.rnn_act_type != 'relu':
        print("ERROR: We should be using ReLU RNNs")
        sys.exit()

    with open(params.labels_path) as label_file:
        labels = str(''.join(json.load(label_file)))

    audio_conf = dict(sample_rate=params.sample_rate,
                      window_size=params.window_size,
                      window_stride=params.window_stride,
                      window=params.window,
                      noise_dir=params.noise_dir,
                      noise_prob=params.noise_prob,
                      noise_levels=(params.noise_min, params.noise_max))

    rnn_type = params.rnn_type.lower()
    assert rnn_type in supported_rnns, "rnn_type should be either lstm, rnn or gru"
    model = DeepSpeech(rnn_hidden_size=params.hidden_size,
                       nb_layers=params.hidden_layers,
                       labels=labels,
                       rnn_type=supported_rnns[rnn_type],
                       audio_conf=audio_conf,
                       bidirectional=False,
                       rnn_activation=params.rnn_act_type,
                       bias=params.bias)
    return model

def decode_results(model, decoded_output, decoded_offsets):
    results = {
        "output": [],
        "_meta": {
            "acoustic_model": {
                "name": os.path.basename(args.model_path)
            },
            "language_model": {
                "name": os.path.basename(args.lm_path) if args.lm_path else None,
            },
            "decoder": {
                "lm": args.lm_path is not None,
                "alpha": args.alpha if args.lm_path is not None else None,
                "beta": args.beta if args.lm_path is not None else None,
                "type": args.decoder,
            }
        }
    }
    results['_meta']['acoustic_model'].update(DeepSpeech.get_meta(model))

    for b in range(len(decoded_output)):
        for pi in range(min(args.top_paths, len(decoded_output[b]))):
            result = {'transcription': decoded_output[b][pi]}
            if args.offsets:
                result['offsets'] = decoded_offsets[b][pi].tolist()
            results['output'].append(result)
    return results

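# Hedged usage sketch for decode_results: the transcription scripts in this
# file pair it with transcribe() (defined in a later snippet) and serialize the
# dictionary; json, args, spect_parser, model and decoder are assumed to be set
# up exactly as in the surrounding snippets.
decoded_output, decoded_offsets = transcribe(args.audio_path, spect_parser, model, decoder, cuda=args.cuda)
print(json.dumps(decode_results(model, decoded_output, decoded_offsets)))
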
def load_model(device, model_path, model_name, use_half):
    if model_name == 'DeepSpeech':
        model = DeepSpeech.load_model(model_path)
    elif model_name == 'DFCNN':
        model = DFCNN.load_model(model_path)
    else:
        # Without this branch, `model` would be unbound for unknown names.
        raise ValueError("Unsupported model name: {}".format(model_name))
    model.eval()
    model = model.to(device)
    if use_half:
        model = model.half()
    return model

def on_epoch_end(self, model, optimizer, epoch, loss_results, wer_results, cer_results):
    self.logger.debug("Saving checkpoint {}".format(epoch + 1))
    file_path = '%s/deepspeech_%d.pth' % (self.save_folder, epoch + 1)
    torch.save(DeepSpeech.serialize(model,
                                    optimizer=optimizer,
                                    epoch=epoch,
                                    loss_results=loss_results,
                                    wer_results=wer_results,
                                    cer_results=cer_results),
               file_path)

def on_batch_end(self, model, optimizer, epoch, batch_no, loss_results, wer_results,
                 cer_results, avg_loss):
    if batch_no > 0 and (batch_no + 1) % self.checkpoint_per_batch == 0:
        file_path = '%s/deepspeech_checkpoint_epoch_%d_iter_%d.pth' % (
            self.save_folder, epoch + 1, batch_no + 1)
        self.logger.debug("Saving checkpoint model to %s" % file_path)
        torch.save(DeepSpeech.serialize(model,
                                        optimizer=optimizer,
                                        epoch=epoch,
                                        iteration=batch_no,
                                        loss_results=loss_results,
                                        wer_results=wer_results,
                                        cer_results=cer_results,
                                        avg_loss=avg_loss),
                   file_path)

def __init__(self, package, input_size):
    super(M_Noise_Deepspeech, self).__init__()
    self.small_const = 1e-6
    self.T = input_size[3]
    self.K = input_size[2]
    self.m = torch.nn.Parameter(
        torch.Tensor(np.array([args.m] * self.T * self.K,
                              dtype=np.float32).reshape(self.K, self.T, 1)).cuda(),
        requires_grad=True)
    self.range1 = torch.Tensor(
        np.array(list(range(self.K)) * self.K * self.T).reshape(
            (self.K, self.T, self.K))).cuda()
    self.range2 = torch.Tensor(
        np.array(list(range(self.K)) * self.K * self.T).reshape(
            (self.K, self.T, self.K)).transpose()).cuda()
    self.relu = torch.nn.ReLU()
    self.deepspeech_net = DeepSpeech.load_model_package(package)

def load_model(device, model_path, use_half):
    # use load_model method from DeepSpeech class
    model = DeepSpeech.load_model(model_path)
    # set model to eval mode
    model.eval()
    # put model on device (GPU/CPU)
    model = model.to(device)
    # if half precision is requested, convert the model weights to fp16
    if use_half:
        model = model.half()
    # return the model
    return model

def __init__(self, package, input_size):
    super(M_Noise_Deepspeech, self).__init__()
    small_const = 1e-6
    self.T = input_size[3]
    self.K = input_size[2]
    self.m = torch.nn.Parameter(
        torch.Tensor(np.array([0.1] * self.T * self.K,
                              dtype=np.float32).reshape(self.K, self.T, 1)),
        requires_grad=True)
    self.m_tile = self.m.repeat([1, 1, self.K]).cuda()
    self.range1 = torch.Tensor(
        np.array(list(range(self.K)) * self.K * self.T).reshape(
            (self.K, self.T, self.K))).cuda()
    self.range2 = torch.Tensor(
        np.array(list(range(self.K)) * self.K * self.T).reshape(
            (self.K, self.T, self.K)).transpose()).cuda()
    self.relu = torch.nn.ReLU()
    out = self.relu(self.m_tile - torch.abs(self.range1 - self.range2)) / (
        torch.pow(self.m_tile, 2) + small_const)
    self.blar = torch.mul(out, (self.m_tile > 1).float()) + torch.mul(
        (self.m_tile < 1).float(), (self.range1 == self.range2).float())
    self.deepspeech_net = DeepSpeech.load_model_package(package)

def decode_results(model, decoded_output, decoded_offsets):
    results = {
        "output": [],
        "_meta": {
            "acoustic_model": {
                "name": os.path.basename(args.model_path)
            },
            "language_model": {
                "name": os.path.basename(args.lm_path) if args.lm_path else None,
            },
            "decoder": {
                "lm": args.lm_path is not None,
                "alpha": args.alpha if args.lm_path is not None else None,
                "beta": args.beta if args.lm_path is not None else None,
                "type": args.decoder,
            }
        }
    }
    results['_meta']['acoustic_model'].update(DeepSpeech.get_meta(model))

    # `transcript` replaces the original variable name `str`, which shadowed the builtin.
    transcript = ''
    print("len is : ", len(decoded_output))
    for b in range(len(decoded_output)):
        batch_transcript = ''
        for pi in range(min(args.top_paths, len(decoded_output[b]))):
            result = {'transcription': decoded_output[b][pi]}
            batch_transcript += decoded_output[b][pi]
            if args.offsets:
                result['offsets'] = decoded_offsets[b][pi]
            results['output'].append(result)
        transcript += batch_transcript
    transcript = removerepeat(transcript)
    transcript = transcript.lower()
    print(transcript)
    # Note: this variant returns the flattened transcript rather than `results`.
    return transcript

input_data = torch.randn(args.num_samples, 1, 161, args.seconds * 100)
input_data = input_data.to(device)
input_data = torch.chunk(input_data, int(len(input_data) / args.batch_size))

rnn_type = args.rnn_type.lower()
assert rnn_type in supported_rnns, "rnn_type should be either lstm, rnn or gru"

with open(args.labels_path) as label_file:
    labels = str(''.join(json.load(label_file)))

audio_conf = dict(sample_rate=args.sample_rate, window_size=args.window_size)
model = DeepSpeech(rnn_hidden_size=args.hidden_size,
                   nb_layers=args.hidden_layers,
                   audio_conf=audio_conf,
                   labels=labels,
                   rnn_type=supported_rnns[rnn_type],
                   mixed_precision=args.mixed_precision)
model = model.to(device)
if args.mixed_precision:
    model = convert_model_to_half(model)
print("Number of parameters: %d" % DeepSpeech.get_param_size(model))

parameters = model.parameters()
optimizer = torch.optim.SGD(parameters, lr=3e-4, momentum=0.9,
                            nesterov=True, weight_decay=1e-5)
if args.distributed:
    model = DistributedDataParallel(model)
if args.mixed_precision:
    optimizer = FP16_Optimizer(optimizer,
                               static_loss_scale=args.static_loss_scale,
                               dynamic_loss_scale=args.dynamic_loss_scale)

                    default=16000,
                    type=int,
                    help='Sample rate')
parser.add_argument('--window_size',
                    default=.02,
                    type=float,
                    help='Window size for spectrogram in seconds')
args = parser.parse_args()

input = torch.randn(args.batch_size, 1, 161, args.seconds * 100).cuda()
rnn_type = args.rnn_type.lower()
assert rnn_type in supported_rnns, "rnn_type should be either lstm, rnn or gru"
model = DeepSpeech(rnn_hidden_size=args.hidden_size,
                   nb_layers=args.hidden_layers,
                   num_classes=29,
                   rnn_type=supported_rnns[rnn_type],
                   sample_rate=args.sample_rate,
                   window_size=args.window_size)
parameters = model.parameters()
optimizer = torch.optim.SGD(parameters, lr=3e-4, momentum=0.9, nesterov=True)
model = torch.nn.DataParallel(model).cuda()
criterion = CTCLoss()
seconds = int(args.seconds)
batch_size = int(args.batch_size)


def iteration(input_data):
    target = torch.IntTensor(int(batch_size * ((seconds * 100) / 2))).fill_(1)  # targets, align half of the audio

save_folder = args.save_folder
os.makedirs(save_folder, exist_ok=True)  # Ensure save folder exists

loss_results, cer_results, wer_results = torch.Tensor(args.epochs), \
    torch.Tensor(args.epochs), torch.Tensor(args.epochs)
best_wer = None
if main_proc and args.visdom:
    visdom_logger = VisdomLogger(args.id, args.epochs)
if main_proc and args.tensorboard:
    tensorboard_logger = TensorBoardLogger(args.id, args.log_dir, args.log_params)

avg_loss, start_epoch, start_iter, optim_state = 0, 0, 0, None
if args.continue_from:  # Starting from previous model
    print("Loading checkpoint model %s" % args.continue_from)
    package = torch.load(args.continue_from, map_location=lambda storage, loc: storage)
    model = DeepSpeech.load_model_package(package)
    labels = model.labels
    audio_conf = model.audio_conf
    if not args.finetune:  # Don't want to restart training
        optim_state = package['optim_dict']
        start_epoch = int(package.get('epoch', 1)) - 1  # Index start at 0 for training
        start_iter = package.get('iteration', None)
        if start_iter is None:
            start_epoch += 1  # We saved model after epoch finished, start at the next epoch.
            start_iter = 0
        else:
            start_iter += 1
        avg_loss = int(package.get('avg_loss', 0))
        loss_results, cer_results, wer_results = package['loss_results'], \
            package['cer_results'], package['wer_results']
        best_wer = wer_results[start_epoch]

            cer += cer_inst
        total_cer += cer
        total_wer += wer
    wer = total_wer / len(test_loader.dataset)
    cer = total_cer / len(test_loader.dataset)
    return [grid_index, mesh_x, mesh_y, lm_alpha, lm_beta, wer, cer]


if __name__ == '__main__':
    if args.lm_path is None:
        print("error: LM must be provided for tuning")
        sys.exit(1)

    model = DeepSpeech.load_model(args.model_path)

    test_dataset = SpectrogramDataset(audio_conf=model.audio_conf,
                                      manifest_filepath=args.test_manifest,
                                      labels=model.labels,
                                      normalize=True)

    logits = np.load(args.logits)
    batch_size = logits[0][0].shape[0]

    results = []

    def result_callback(result):
        results.append(result)

    p = Pool(args.num_workers)

def attack2(self, init_delta, target, model_path):
    self.delta2 = torch.FloatTensor(init_delta).cuda()
    self.delta2.requires_grad = True
    self.rescale = torch.ones((self.batch_size, 1)).cuda()
    self.final_deltas = [None] * self.batch_size
    self.alpha = torch.ones((self.batch_size,)).cuda()

    model = DeepSpeech.load_model(model_path)
    model = model.cuda()

    self.optim21 = torch.optim.Adam([self.delta2], lr=2)
    self.optim22 = torch.optim.Adam([self.delta2], lr=self.lr2)
    criterion = CTCLoss()

    # per-utterance masking thresholds for the psychoacoustic loss
    th_batch = []
    psd_max_batch = []
    for ii in range(self.batch_size):
        th, _, psd_max = generate_th(self.original[ii].cpu().numpy(),
                                     fs=16000, window_size=2048)
        th_batch.append(th)
        psd_max_batch.append(psd_max)
    th_batch = torch.FloatTensor(np.array(th_batch)).cuda()
    psd_max_batch = torch.FloatTensor(np.array(psd_max_batch)).cuda()

    MAX = self.num_iterations2
    model.train()
    loss_th = [np.inf] * self.batch_size
    for i in range(MAX):
        apply_delta = torch.clamp(self.delta2, -2000, 2000) * self.rescale  # [batch_size, max_audio_len]
        new_input = apply_delta * self.mask + self.original  # [batch_size, max_audio_len]
        pass_in = torch.clamp(new_input, -2**15, 2**15 - 1)
        pass_in = torch.div(pass_in, 2**15)  # [batch_size, max_audio_len]
        logits, logits_sizes = get_logits(pass_in, self.lengths.int(), model)  # [batch_size, T, H]
        logits_ = logits.transpose(0, 1)

        # CTC loss against the target phrase
        loss2 = criterion(logits_, self.target_phrase, logits_sizes,
                          self.target_phrase_lengths).cuda()
        loss_value_2 = loss2.item()

        self.optim21.zero_grad()
        loss2.backward(retain_graph=True)
        self.delta2.grad = torch.sign(self.delta2.grad)
        self.optim21.step()

        # psychoacoustic (imperceptibility) loss
        loss1 = 0
        loss1_each = []
        for ii in range(self.batch_size):
            psd = psd_transform(apply_delta[ii], psd_max_batch[ii],
                                win_length=2048, win_step=512)
            loss1 += self.alpha[ii] * torch.mean(torch.relu(psd - th_batch[ii]))
            loss1_each.append(torch.mean(torch.relu(psd - th_batch[ii])).item())
        loss1 = loss1 / self.batch_size
        loss_value_1 = np.mean(loss1_each)

        self.optim22.zero_grad()
        loss1.backward()
        for ii in range(self.batch_size):
            self.delta2.grad[ii] = self.alpha[ii] * torch.sign(self.delta2.grad[ii])
        self.optim22.step()

        apply_delta_ = torch.clamp(self.delta2, -2000, 2000) * self.rescale
        print('loss: ', loss_value_1, loss_value_2)

        # learning-rate schedule for both optimizers
        if i + 1 == 2000:
            for g in self.optim21.param_groups:
                g['lr'] = 0.1
            for g in self.optim22.param_groups:
                g['lr'] = 0.1
        if i + 1 == 3200:
            for g in self.optim21.param_groups:
                g['lr'] = 0.01
            for g in self.optim22.param_groups:
                g['lr'] = 0.01

        # print out some debug information every 10 iterations
        if (i + 1) % 10 == 0:
            decode_out, _ = self.decoder.decode(logits, logits_sizes)
            print(i + 1, decode_out[0], [target[0]])
            for ii in range(self.batch_size):
                if ((i + 1) % 50 == 0 and decode_out[ii] == [target[ii].upper()]) or (
                        i == MAX - 1 and self.final_deltas[ii] is None):
                    self.alpha[ii] = 1.2 * self.alpha[ii]
                    if self.alpha[ii] > 1000:
                        self.alpha[ii] = 1000
                    # Adjust the best solution found so far
                    if loss1_each[ii] < loss_th[ii]:
                        loss_th[ii] = loss1_each[ii]
                        self.final_deltas[ii] = new_input[ii][0:self.lengths[ii].int()].cpu().detach().numpy()
                    print("up alpha=%f" % (self.alpha[ii]))
                if (i + 1) % 100 == 0 and decode_out[ii] != [target[ii].upper()]:
                    self.alpha[ii] = 0.6 * self.alpha[ii]
                    print("down alpha=%f" % (self.alpha[ii]))
    return self.final_deltas

def attack1(self, audios, lengths, max_audio_len, targets, model_path):
    self.max_audio_len = max_audio_len
    self.original = torch.FloatTensor(audios).cuda()
    self.lengths = torch.FloatTensor(lengths)

    # define some variables
    self.delta1 = torch.zeros((self.batch_size, self.max_audio_len)).cuda()
    self.delta1.requires_grad = True
    self.rescale = torch.ones((self.batch_size, 1)).cuda()
    self.mask = torch.FloatTensor(
        np.array([[1 if i < l else 0 for i in range(self.max_audio_len)]
                  for l in self.lengths])).cuda()
    self.final_deltas = [None] * self.batch_size

    self.target_phrase_lengths = torch.IntTensor(self.batch_size)
    self.target_phrase = []
    for x in range(self.batch_size):
        # `c` replaces the original inner loop variable `x`, which shadowed the outer index.
        phrase = list(
            filter(None, [self.labels_map.get(c) for c in list(targets[x].upper())]))
        self.target_phrase_lengths[x] = len(phrase)
        self.target_phrase.extend(phrase)
    self.target_phrase = torch.IntTensor(self.target_phrase)

    model = DeepSpeech.load_model(model_path)
    model = model.cuda()
    self.optim1 = torch.optim.Adam([self.delta1], lr=self.lr1)
    criterion = CTCLoss()
    MAX = self.num_iterations1
    model.train()

    for i in range(MAX):
        apply_delta = torch.clamp(self.delta1, -2000, 2000) * self.rescale  # [batch_size, max_audio_len]
        new_input = apply_delta * self.mask + self.original  # [batch_size, max_audio_len]
        pass_in = torch.clamp(new_input, -2**15, 2**15 - 1)
        pass_in = torch.div(pass_in, 2**15)  # [batch_size, max_audio_len]
        logits, logits_sizes = get_logits(pass_in, self.lengths.int(), model)  # [batch_size, T, H]
        logits_ = logits.transpose(0, 1)

        # loss
        if not np.isinf(self.l2penalty):
            loss = torch.mean((new_input - self.original) ** 2) + \
                self.l2penalty * criterion(logits_, self.target_phrase,
                                           logits_sizes, self.target_phrase_lengths).cuda()
        else:
            loss = criterion(logits_, self.target_phrase, logits_sizes,
                             self.target_phrase_lengths).cuda()
        loss_value = loss.item()

        # optimize
        self.optim1.zero_grad()
        loss.backward()
        # grad sign
        self.delta1.grad = torch.sign(self.delta1.grad)
        self.optim1.step()
        print('loss: ', loss_value)

        # print out some debug information every 10 iterations
        if (i + 1) % 10 == 0:
            decode_out, _ = self.decoder.decode(logits, logits_sizes)
            for ii in range(self.batch_size):
                if ((i + 1) % 10 == 0 and decode_out[ii] == [targets[ii].upper()]) or (
                        i == MAX - 1 and self.final_deltas[ii] is None):
                    bound_tmp = torch.max(torch.abs(self.delta1[ii])).item()
                    if self.rescale[ii][0] * 2000 > bound_tmp:
                        print("It's way over", bound_tmp / 2000.0)
                        self.rescale[ii][0] = bound_tmp / 2000.0
                    self.rescale[ii][0] *= .8
                    # Adjust the best solution found so far
                    self.final_deltas[ii] = new_input[ii].cpu().detach().numpy()
                    print("bound=%f" % (2000 * self.rescale[ii][0]))
    return self.final_deltas

else:
    input_data = torch.randn(args.num_samples, 1, 161, args.seconds * 100).cuda()
input_data = torch.chunk(input_data, int(len(input_data) / args.batch_size))

rnn_type = args.rnn_type.lower()
assert rnn_type in supported_rnns, "rnn_type should be either lstm, rnn or gru"

with open(args.labels_path) as label_file:
    labels = str(''.join(json.load(label_file)))

audio_conf = dict(sample_rate=args.sample_rate, window_size=args.window_size)
model = DeepSpeech(rnn_hidden_size=args.hidden_size,
                   nb_layers=args.hidden_layers,
                   audio_conf=audio_conf,
                   labels=labels,
                   rnn_type=supported_rnns[rnn_type])
print("Number of parameters: %d" % DeepSpeech.get_param_size(model))

parameters = model.parameters()
optimizer = torch.optim.SGD(parameters, lr=3e-4, momentum=0.9, nesterov=True)
model.cuda()
if args.distributed:
    model = torch.nn.parallel.DistributedDataParallel(model)
criterion = CTCLoss()
seconds = int(args.seconds)

def transcribe(audio_path, parser, model, decoder, cuda=False):
    spect = parser.parse_audio(audio_path).contiguous()
    spect = spect.view(1, 1, spect.size(0), spect.size(1))
    if cuda:
        spect = spect.cuda()
    input_sizes = torch.IntTensor([spect.size(3)]).int()
    out, output_sizes = model(spect, input_sizes)
    decoded_output, decoded_offsets = decoder.decode(out, output_sizes)
    return decoded_output, decoded_offsets


if __name__ == '__main__':
    torch.set_grad_enabled(False)
    model = DeepSpeech.load_model(args.model_path)
    if args.cuda:
        model.cuda()
    model.eval()

    labels = DeepSpeech.get_labels(model)
    audio_conf = DeepSpeech.get_audio_conf(model)

    if args.decoder == "beam":
        from decoder import BeamCTCDecoder
        decoder = BeamCTCDecoder(labels, lm_path=args.lm_path, alpha=args.alpha, beta=args.beta,
                                 cutoff_top_n=args.cutoff_top_n, cutoff_prob=args.cutoff_prob,
                                 beam_width=args.beam_width, num_processes=args.lm_workers)
    else:
        decoder = GreedyDecoder(labels, blank_index=labels.index('_'))
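    # Hedged continuation of the __main__ block above: build the audio parser
    # from the model's audio_conf and transcribe a single file. args.audio_path
    # is assumed to be defined on the argument parser, as in similar scripts here.
    spect_parser = SpectrogramParser(audio_conf, normalize=True)
    decoded_output, decoded_offsets = transcribe(args.audio_path, spect_parser, model, decoder, cuda=args.cuda)
    print(decoded_output[0][0])
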
def main():
    args = parser.parse_args()

    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    if params.rnn_type == 'gru' and params.rnn_act_type != 'tanh':
        print("ERROR: GRU does not currently support activations other than tanh")
        sys.exit()
    if params.rnn_type == 'rnn' and params.rnn_act_type != 'relu':
        print("ERROR: We should be using ReLU RNNs")
        sys.exit()

    print("=======================================================")
    for arg in vars(args):
        print("***%s = %s " % (arg.ljust(25), getattr(args, arg)))
    print("=======================================================")

    save_folder = args.save_folder
    loss_results, cer_results, wer_results = torch.Tensor(params.epochs), \
        torch.Tensor(params.epochs), torch.Tensor(params.epochs)
    best_wer = None
    try:
        os.makedirs(save_folder)
    except OSError as e:
        if e.errno == errno.EEXIST:
            print('Directory already exists.')
        else:
            raise
    criterion = CTCLoss()

    with open(params.labels_path) as label_file:
        labels = str(''.join(json.load(label_file)))

    audio_conf = dict(sample_rate=params.sample_rate,
                      window_size=params.window_size,
                      window_stride=params.window_stride,
                      window=params.window,
                      noise_dir=params.noise_dir,
                      noise_prob=params.noise_prob,
                      noise_levels=(params.noise_min, params.noise_max))

    train_dataset = SpectrogramDataset(audio_conf=audio_conf,
                                       manifest_filepath=params.train_manifest,
                                       labels=labels, normalize=True,
                                       augment=params.augment)
    test_dataset = SpectrogramDataset(audio_conf=audio_conf,
                                      manifest_filepath=params.val_manifest,
                                      labels=labels, normalize=True,
                                      augment=False)
    train_loader = AudioDataLoader(train_dataset, batch_size=params.batch_size, num_workers=1)
    test_loader = AudioDataLoader(test_dataset, batch_size=params.batch_size, num_workers=1)

    rnn_type = params.rnn_type.lower()
    assert rnn_type in supported_rnns, "rnn_type should be either lstm, rnn or gru"
    model = DeepSpeech(rnn_hidden_size=params.hidden_size,
                       nb_layers=params.hidden_layers,
                       labels=labels,
                       rnn_type=supported_rnns[rnn_type],
                       audio_conf=audio_conf,
                       bidirectional=True,
                       rnn_activation=params.rnn_act_type,
                       bias=params.bias)
    parameters = model.parameters()
    optimizer = torch.optim.SGD(parameters, lr=params.lr, momentum=params.momentum,
                                nesterov=True, weight_decay=params.l2)
    decoder = GreedyDecoder(labels)

    if args.continue_from:
        print("Loading checkpoint model %s" % args.continue_from)
        package = torch.load(args.continue_from)
        model.load_state_dict(package['state_dict'])
        optimizer.load_state_dict(package['optim_dict'])
        start_epoch = int(package.get('epoch', 1)) - 1  # Python index start at 0 for training
        start_iter = package.get('iteration', None)
        if start_iter is None:
            start_epoch += 1  # Assume that we saved a model after an epoch finished, so start at the next epoch.
            start_iter = 0
        else:
            start_iter += 1
        avg_loss = int(package.get('avg_loss', 0))
        if args.start_epoch != -1:
            start_epoch = args.start_epoch
        loss_results[:start_epoch], cer_results[:start_epoch], wer_results[:start_epoch] = \
            package['loss_results'][:start_epoch], \
            package['cer_results'][:start_epoch], \
            package['wer_results'][:start_epoch]
        print(loss_results)
        epoch = start_epoch
    else:
        avg_loss = 0
        start_epoch = 0
        start_iter = 0
    avg_training_loss = 0

    if params.cuda:
        model = torch.nn.DataParallel(model).cuda()

    print(model)
    print("Number of parameters: %d" % DeepSpeech.get_param_size(model))

    batch_time = AverageMeter()
    data_time = AverageMeter()
    losses = AverageMeter()
    ctc_time = AverageMeter()

    for epoch in range(start_epoch, params.epochs):
        model.train()
        end = time.time()
        for i, (data) in enumerate(train_loader, start=start_iter):
            if i == len(train_loader):
                break
            inputs, targets, input_percentages, target_sizes = data
            # measure data loading time
            data_time.update(time.time() - end)
            inputs = Variable(inputs, requires_grad=False)
            target_sizes = Variable(target_sizes, requires_grad=False)
            targets = Variable(targets, requires_grad=False)

            if params.cuda:
                inputs = inputs.cuda()

            out = model(inputs)
            out = out.transpose(0, 1)  # TxNxH

            seq_length = out.size(0)
            sizes = Variable(input_percentages.mul_(int(seq_length)).int(), requires_grad=False)

            ctc_start_time = time.time()
            loss = criterion(out, targets, sizes, target_sizes)
            ctc_time.update(time.time() - ctc_start_time)
            loss = loss / inputs.size(0)  # average the loss by minibatch

            loss_sum = loss.data.sum()
            inf = float("inf")
            if loss_sum == inf or loss_sum == -inf:
                print("WARNING: received an inf loss, setting loss value to 0")
                loss_value = 0
            else:
                loss_value = loss.data[0]

            avg_loss += loss_value
            losses.update(loss_value, inputs.size(0))

            # compute gradient
            optimizer.zero_grad()
            loss.backward()

            torch.nn.utils.clip_grad_norm(model.parameters(), params.max_norm)
            # SGD step
            optimizer.step()

            if params.cuda:
                torch.cuda.synchronize()

            # measure elapsed time
            batch_time.update(time.time() - end)
            end = time.time()
            print('Epoch: [{0}][{1}/{2}]\t'
                  'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                  'Data {data_time.val:.3f} ({data_time.avg:.3f})\t'
                  'CTC Time {ctc_time.val:.3f} ({ctc_time.avg:.3f})\t'
                  'Loss {loss.val:.4f} ({loss.avg:.4f})\t'.format(
                      (epoch + 1), (i + 1), len(train_loader),
                      batch_time=batch_time, data_time=data_time,
                      ctc_time=ctc_time, loss=losses))

            del loss
            del out

        avg_loss /= len(train_loader)
        print('Training Summary Epoch: [{0}]\t'
              'Average Loss {loss:.3f}\t'.format(epoch + 1, loss=avg_loss))

        start_iter = 0  # Reset start iteration for next epoch
        total_cer, total_wer = 0, 0
        model.eval()

        wer, cer = eval_model(model, test_loader, decoder)

        loss_results[epoch] = avg_loss
        wer_results[epoch] = wer
        cer_results[epoch] = cer
        print('Validation Summary Epoch: [{0}]\t'
              'Average WER {wer:.3f}\t'
              'Average CER {cer:.3f}\t'.format(epoch + 1, wer=wer, cer=cer))

        if args.checkpoint:
            file_path = '%s/deepspeech_%d.pth.tar' % (save_folder, epoch + 1)
            torch.save(DeepSpeech.serialize(model, optimizer=optimizer, epoch=epoch,
                                            loss_results=loss_results,
                                            wer_results=wer_results,
                                            cer_results=cer_results),
                       file_path)

        # anneal lr
        optim_state = optimizer.state_dict()
        optim_state['param_groups'][0]['lr'] = optim_state['param_groups'][0]['lr'] / params.learning_anneal
        optimizer.load_state_dict(optim_state)
        print('Learning rate annealed to: {lr:.6f}'.format(lr=optim_state['param_groups'][0]['lr']))

        if best_wer is None or best_wer > wer:
            print("Found better validated model, saving to %s" % args.model_path)
            torch.save(DeepSpeech.serialize(model, optimizer=optimizer, epoch=epoch,
                                            loss_results=loss_results,
                                            wer_results=wer_results,
                                            cer_results=cer_results),
                       args.model_path)
            best_wer = wer

        avg_loss = 0

        # If set to exit at a given accuracy, exit
        if params.exit_at_acc and (best_wer <= args.acc):
            break

    print("=======================================================")
    print("***Best WER = ", best_wer)
    for arg in vars(args):
        print("***%s = %s " % (arg.ljust(25), getattr(args, arg)))
    print("=======================================================")

                    default=.01,
                    type=float,
                    help='Window stride for spectrogram in seconds')
parser.add_argument('--window',
                    default='hamming',
                    help='Window type for spectrogram generation')
parser.add_argument('--cuda',
                    default=True,
                    type=bool,
                    help='Use cuda to train model')
args = parser.parse_args()

if __name__ == '__main__':
    package = torch.load(args.model_path)
    model = DeepSpeech(rnn_hidden_size=package['hidden_size'],
                       nb_layers=package['hidden_layers'],
                       num_classes=package['nout'])
    if args.cuda:
        model = torch.nn.DataParallel(model).cuda()
    model.load_state_dict(package['state_dict'])
    audio_conf = dict(sample_rate=args.sample_rate,
                      window_size=args.window_size,
                      window_stride=args.window_stride,
                      window=args.window)
    with open(args.labels_path) as label_file:
        labels = str(''.join(json.load(label_file)))
    decoder = ArgMaxDecoder(labels)
    parser = SpectrogramParser(audio_conf, normalize=True)
    spect = parser.parse_audio(args.audio_path).contiguous()
    spect = spect.view(1, 1, spect.size(0), spect.size(1))
    out = model(Variable(spect))

def convert(parser):
    args = parser.parse_args()

    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    if params.rnn_type == 'gru' and params.rnn_act_type != 'tanh':
        print("ERROR: GRU does not currently support activations other than tanh")
        sys.exit()
    if params.rnn_type == 'rnn' and params.rnn_act_type != 'relu':
        print("ERROR: We should be using ReLU RNNs")
        sys.exit()

    print("=======================================================")
    for arg in vars(args):
        print("***%s = %s " % (arg.ljust(25), getattr(args, arg)))
    print("=======================================================")

    save_folder = args.save_folder
    try:
        os.makedirs(save_folder)
    except OSError as e:
        if e.errno == errno.EEXIST:
            print('Directory already exists.')
        else:
            raise

    with open(params.labels_path) as label_file:
        labels = str(''.join(json.load(label_file)))

    audio_conf = dict(sample_rate=params.sample_rate,
                      window_size=params.window_size,
                      window_stride=params.window_stride,
                      window=params.window,
                      noise_dir=params.noise_dir,
                      noise_prob=params.noise_prob,
                      noise_levels=(params.noise_min, params.noise_max))

    val_batch_size = min(8, params.batch_size_val)
    print("Using bs={} for validation. Parameter found was {}".format(val_batch_size, params.batch_size_val))

    train_dataset = SpectrogramDataset(audio_conf=audio_conf,
                                       manifest_filepath=params.train_manifest,
                                       labels=labels, normalize=True,
                                       augment=params.augment)
    test_dataset = SpectrogramDataset(audio_conf=audio_conf,
                                      manifest_filepath=params.val_manifest,
                                      labels=labels, normalize=True,
                                      augment=False)
    # the original computed num_workers as (1 if params.cuda else 1), which is always 1
    train_loader = AudioDataLoader(train_dataset, batch_size=params.batch_size, num_workers=1)
    test_loader = AudioDataLoader(test_dataset, batch_size=val_batch_size, num_workers=1)

    rnn_type = params.rnn_type.lower()
    assert rnn_type in supported_rnns, "rnn_type should be either lstm, rnn or gru"
    model = DeepSpeech(rnn_hidden_size=params.hidden_size,
                       nb_layers=params.hidden_layers,
                       labels=labels,
                       rnn_type=supported_rnns[rnn_type],
                       audio_conf=audio_conf,
                       bidirectional=False,
                       rnn_activation=params.rnn_act_type,
                       bias=params.bias)
    parameters = model.parameters()

    if args.continue_from:
        print("Loading checkpoint model %s" % args.continue_from)
        package = torch.load(args.continue_from)
        model.load_state_dict(package['state_dict'])
        if params.cuda:
            model = model.cuda()

    if params.cuda:
        model = torch.nn.DataParallel(model).cuda()

    print(model)
    print("Number of parameters: %d" % DeepSpeech.get_param_size(model))

    ####################################################
    # Begin ONNX conversion
    ####################################################
    model.train(False)

    # Input to the model
    data = next(iter(train_loader))
    inputs, targets, input_percentages, target_sizes = data
    inputs = Variable(inputs, requires_grad=False)
    target_sizes = Variable(target_sizes, requires_grad=False)
    targets = Variable(targets, requires_grad=False)
    if params.cuda:
        inputs = inputs.cuda()
    x = inputs
    print(x.size())

    # Export the model
    onnx_file_path = osp.join(osp.dirname(args.continue_from),
                              osp.basename(args.continue_from).split('.')[0] + ".onnx")
    print("Saving new ONNX model to: {}".format(onnx_file_path))
    torch.onnx.export(model,            # model being run
                      inputs,           # model input (or a tuple for multiple inputs)
                      onnx_file_path,   # where to save the model (can be a file or file-like object)
                      export_params=True,  # store the trained parameter weights inside the model file
                      verbose=False)

with open(args.labels_path) as label_file:
    labels = json.load(label_file)

audio_conf = dict(sample_rate=args.sample_rate,
                  window_size=args.window_size,
                  window_stride=args.window_stride,
                  window=args.window,
                  noise_dir=args.noise_dir,
                  noise_prob=args.noise_prob,
                  noise_levels=(args.noise_min, args.noise_max))

rnn_type = args.rnn_type.lower()
assert rnn_type in supported_rnns, "rnn_type should be either lstm, rnn or gru"
model = DeepSpeech(rnn_hidden_size=args.hidden_size,
                   nb_layers=args.hidden_layers,
                   labels=labels,
                   rnn_type=supported_rnns[rnn_type],
                   audio_conf=audio_conf,
                   bidirectional=args.bidirectional)

state = TrainingState(model=model)
state.init_results_tracking(epochs=args.epochs)

# Data setup
evaluation_decoder = GreedyDecoder(model.labels)  # Decoder used for validation
train_dataset = SpectrogramDataset(audio_conf=model.audio_conf,
                                   manifest_filepath=args.train_manifest,
                                   labels=model.labels,
                                   normalize=True,
                                   speed_volume_perturb=args.speed_volume_perturb,
                                   spec_augment=args.spec_augment)
test_dataset = SpectrogramDataset(audio_conf=model.audio_conf,

viz = Visdom()
opts = dict(title=args.id,
            ylabel='',
            xlabel='Epoch',
            legend=['Loss', 'WER', 'CER'])
viz_window = None
epochs = torch.arange(1, args.epochs + 1)
if args.tensorboard and main_proc:
    os.makedirs(args.log_dir, exist_ok=True)
    from tensorboardX import SummaryWriter
    tensorboard_writer = SummaryWriter(args.log_dir)
os.makedirs(save_folder, exist_ok=True)

avg_loss, start_epoch, start_iter = 0, 0, 0
if args.continue_from:  # Starting from previous model
    print("Loading checkpoint model %s" % args.continue_from)
    package = torch.load(args.continue_from, map_location=lambda storage, loc: storage)
    model = DeepSpeech.load_model_package(package)
    labels = DeepSpeech.get_labels(model)
    audio_conf = DeepSpeech.get_audio_conf(model)
    parameters = model.parameters()
    optimizer = torch.optim.SGD(parameters, lr=args.lr, momentum=args.momentum, nesterov=True)
    if not args.finetune:  # Don't want to restart training
        if args.cuda:
            model.cuda()
        optimizer.load_state_dict(package['optim_dict'])
        start_epoch = int(package.get('epoch', 1)) - 1  # Index start at 0 for training
        start_iter = package.get('iteration', None)
        if start_iter is None:
            start_epoch += 1  # We saved model after epoch finished, start at the next epoch.
            start_iter = 0
        else:

loss_results, cer_results, wer_results = torch.Tensor(args.epochs), \
    torch.Tensor(args.epochs), torch.Tensor(args.epochs)
best_wer = None
if main_proc and args.visdom:
    visdom_logger = VisdomLogger(args.id, args.epochs)
if main_proc and args.tensorboard:
    tensorboard_logger = TensorBoardLogger(args.id, args.log_dir, args.log_params)

avg_loss, start_epoch, start_iter, optim_state = 0, 0, 0, None
if args.continue_from:  # Starting from previous model
    print("Loading checkpoint model %s" % args.continue_from)
    package = torch.load(args.continue_from, map_location=lambda storage, loc: storage)
    model = DeepSpeech.load_model_package(package)
    labels = model.labels
    audio_conf = model.audio_conf
    if not args.finetune:  # Don't want to restart training
        optim_state = package['optim_dict']
        start_epoch = int(package.get('epoch', 1)) - 1  # Index start at 0 for training
        start_iter = package.get('iteration', None)
        if start_iter is None:
            start_epoch += 1  # We saved model after epoch finished, start at the next epoch.
            start_iter = 0
        else:
            start_iter += 1
        avg_loss = int(package.get('avg_loss', 0))
        loss_results, cer_results, wer_results = package['loss_results'], \
            package['cer_results'], package['wer_results']

            legend=['Loss', 'WER', 'CER'])
viz_window = None
epochs = torch.arange(1, args.epochs + 1)
if args.tensorboard and main_proc:
    os.makedirs(args.log_dir, exist_ok=True)
    from tensorboardX import SummaryWriter
    tensorboard_writer = SummaryWriter(args.log_dir)
os.makedirs(save_folder, exist_ok=True)

avg_loss, start_epoch, start_iter = 0, 0, 0
if args.continue_from:  # Starting from previous model
    print("Loading checkpoint model %s" % args.continue_from)
    package = torch.load(args.continue_from, map_location=lambda storage, loc: storage)
    model = DeepSpeech.load_model_package(package)
    labels = DeepSpeech.get_labels(model)
    audio_conf = DeepSpeech.get_audio_conf(model)
    parameters = model.parameters()
    optimizer = torch.optim.SGD(parameters, lr=args.lr, momentum=args.momentum, nesterov=True)
    if not args.finetune:  # Don't want to restart training
        optimizer.load_state_dict(package['optim_dict'])
        start_epoch = int(package.get('epoch', 1)) - 1  # Index start at 0 for training
        start_iter = package.get('iteration', None)
        if start_iter is None:
            start_epoch += 1  # We saved model after epoch finished, start at the next epoch.
            start_iter = 0

def main():
    args = parser.parse_args()
    save_folder = args.save_folder

    loss_results, cer_results, wer_results = torch.Tensor(args.epochs), \
        torch.Tensor(args.epochs), torch.Tensor(args.epochs)

    if args.visdom:
        from visdom import Visdom
        viz = Visdom()

        opts = [
            dict(title='Loss', ylabel='Loss', xlabel='Epoch'),
            dict(title='WER', ylabel='WER', xlabel='Epoch'),
            dict(title='CER', ylabel='CER', xlabel='Epoch')
        ]
        viz_windows = [None, None, None]
        epochs = torch.arange(1, args.epochs + 1)
    if args.tensorboard:
        from logger import TensorBoardLogger
        try:
            os.makedirs(args.log_dir)
        except OSError as e:
            if e.errno == errno.EEXIST:
                print('Directory already exists.')
                for file in os.listdir(args.log_dir):
                    file_path = os.path.join(args.log_dir, file)
                    try:
                        if os.path.isfile(file_path):
                            os.unlink(file_path)
                    except Exception as e:
                        raise
            else:
                raise
        logger = TensorBoardLogger(args.log_dir)

    try:
        os.makedirs(save_folder)
    except OSError as e:
        if e.errno == errno.EEXIST:
            print('Directory already exists.')
        else:
            raise
    criterion = CTCLoss()

    with open(args.labels_path) as label_file:
        labels = str(''.join(json.load(label_file)))

    audio_conf = dict(sample_rate=args.sample_rate,
                      window_size=args.window_size,
                      window_stride=args.window_stride,
                      window=args.window,
                      noise_dir=args.noise_dir,
                      noise_prob=args.noise_prob,
                      noise_levels=(args.noise_min, args.noise_max))

    train_dataset = SpectrogramDataset(audio_conf=audio_conf,
                                       manifest_filepath=args.train_manifest,
                                       labels=labels, normalize=True,
                                       augment=args.augment)
    test_dataset = SpectrogramDataset(audio_conf=audio_conf,
                                      manifest_filepath=args.val_manifest,
                                      labels=labels, normalize=True,
                                      augment=False)
    train_loader = AudioDataLoader(train_dataset, batch_size=args.batch_size,
                                   num_workers=args.num_workers)
    test_loader = AudioDataLoader(test_dataset, batch_size=args.batch_size,
                                  num_workers=args.num_workers)

    rnn_type = args.rnn_type.lower()
    assert rnn_type in supported_rnns, "rnn_type should be either lstm, rnn or gru"
    model = DeepSpeech(rnn_hidden_size=args.hidden_size,
                       nb_layers=args.hidden_layers,
                       labels=labels,
                       rnn_type=supported_rnns[rnn_type],
                       audio_conf=audio_conf,
                       bidirectional=True)
    parameters = model.parameters()
    optimizer = torch.optim.SGD(parameters, lr=args.lr,
                                momentum=args.momentum, nesterov=True)
    decoder = GreedyDecoder(labels)

    if args.continue_from:
        print("Loading checkpoint model %s" % args.continue_from)
        package = torch.load(args.continue_from)
        model.load_state_dict(package['state_dict'])
        optimizer.load_state_dict(package['optim_dict'])
        start_epoch = int(package.get('epoch', 1)) - 1  # Python index start at 0 for training
        start_iter = package.get('iteration', None)
        if start_iter is None:
            start_epoch += 1  # Assume that we saved a model after an epoch finished, so start at the next epoch.
            start_iter = 0
        else:
            start_iter += 1
        avg_loss = int(package.get('avg_loss', 0))
        loss_results, cer_results, wer_results = package['loss_results'], \
            package['cer_results'], package['wer_results']
        if args.visdom and \
                package['loss_results'] is not None and start_epoch > 0:  # Add previous scores to visdom graph
            x_axis = epochs[0:start_epoch]
            y_axis = [
                loss_results[0:start_epoch],
                wer_results[0:start_epoch],
                cer_results[0:start_epoch]
            ]
            for x in range(len(viz_windows)):
                viz_windows[x] = viz.line(
                    X=x_axis,
                    Y=y_axis[x],
                    opts=opts[x],
                )
        if args.tensorboard and \
                package['loss_results'] is not None and start_epoch > 0:  # Previous scores to tensorboard logs
            for i in range(start_epoch):
                info = {
                    'Avg Train Loss': loss_results[i],
                    'Avg WER': wer_results[i],
                    'Avg CER': cer_results[i]
                }
                for tag, val in info.items():
                    logger.scalar_summary(tag, val, i + 1)
        if not args.no_bucketing:
            print("Using bucketing sampler for the following epochs")
            train_dataset = SpectrogramDatasetWithLength(audio_conf=audio_conf,
                                                         manifest_filepath=args.train_manifest,
                                                         labels=labels, normalize=True,
                                                         augment=args.augment)
            sampler = BucketingSampler(train_dataset)
            train_loader.sampler = sampler
    else:
        avg_loss = 0
        start_epoch = 0
        start_iter = 0

    if args.cuda:
        model = torch.nn.DataParallel(model).cuda()

    print(model)
    print("Number of parameters: %d" % DeepSpeech.get_param_size(model))

    batch_time = AverageMeter()
    data_time = AverageMeter()
    losses = AverageMeter()

    for epoch in range(start_epoch, args.epochs):
        model.train()
        end = time.time()
        for i, (data) in enumerate(train_loader, start=start_iter):
            if i == len(train_loader):
                break
            inputs, targets, input_percentages, target_sizes = data
            # measure data loading time
            data_time.update(time.time() - end)
            inputs = Variable(inputs, requires_grad=False)
            target_sizes = Variable(target_sizes, requires_grad=False)
            targets = Variable(targets, requires_grad=False)

            if args.cuda:
                inputs = inputs.cuda()

            out = model(inputs)
            out = out.transpose(0, 1)  # TxNxH

            seq_length = out.size(0)
            sizes = Variable(input_percentages.mul_(int(seq_length)).int(), requires_grad=False)

            loss = criterion(out, targets, sizes, target_sizes)
            loss = loss / inputs.size(0)  # average the loss by minibatch

            loss_sum = loss.data.sum()
            inf = float("inf")
            if loss_sum == inf or loss_sum == -inf:
                print("WARNING: received an inf loss, setting loss value to 0")
                loss_value = 0
            else:
                loss_value = loss.data[0]

            avg_loss += loss_value
            losses.update(loss_value, inputs.size(0))

            # compute gradient
            optimizer.zero_grad()
            loss.backward()

            torch.nn.utils.clip_grad_norm(model.parameters(), args.max_norm)
            # SGD step
            optimizer.step()

            if args.cuda:
                torch.cuda.synchronize()

            # measure elapsed time
            batch_time.update(time.time() - end)
            end = time.time()
            if not args.silent:
                print('Epoch: [{0}][{1}/{2}]\t'
                      'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                      'Data {data_time.val:.3f} ({data_time.avg:.3f})\t'
                      'Loss {loss.val:.4f} ({loss.avg:.4f})\t'.format(
                          (epoch + 1), (i + 1), len(train_loader),
                          batch_time=batch_time, data_time=data_time, loss=losses))
            if args.checkpoint_per_batch > 0 and i > 0 and (i + 1) % args.checkpoint_per_batch == 0:
                file_path = '%s/deepspeech_checkpoint_epoch_%d_iter_%d.pth.tar' % (
                    save_folder, epoch + 1, i + 1)
                print("Saving checkpoint model to %s" % file_path)
                torch.save(DeepSpeech.serialize(model, optimizer=optimizer, epoch=epoch,
                                                iteration=i,
                                                loss_results=loss_results,
                                                wer_results=wer_results,
                                                cer_results=cer_results,
                                                avg_loss=avg_loss),
                           file_path)
            del loss
            del out

        avg_loss /= len(train_loader)
        print('Training Summary Epoch: [{0}]\t'
              'Average Loss {loss:.3f}\t'.format(epoch + 1, loss=avg_loss))

        start_iter = 0  # Reset start iteration for next epoch
        total_cer, total_wer = 0, 0
        model.eval()
        for i, (data) in enumerate(test_loader):  # test
            inputs, targets, input_percentages, target_sizes = data

            inputs = Variable(inputs, volatile=True)

            # unflatten targets
            split_targets = []
            offset = 0
            for size in target_sizes:
                split_targets.append(targets[offset:offset + size])
                offset += size

            if args.cuda:
                inputs = inputs.cuda()

            out = model(inputs)
            out = out.transpose(0, 1)  # TxNxH
            seq_length = out.size(0)
            sizes = input_percentages.mul_(int(seq_length)).int()

            decoded_output = decoder.decode(out.data, sizes)
            target_strings = decoder.process_strings(decoder.convert_to_strings(split_targets))
            wer, cer = 0, 0
            for x in range(len(target_strings)):
                wer += decoder.wer(decoded_output[x], target_strings[x]) / float(len(target_strings[x].split()))
                cer += decoder.cer(decoded_output[x], target_strings[x]) / float(len(target_strings[x]))
            total_cer += cer
            total_wer += wer

            if args.cuda:
                torch.cuda.synchronize()
            del out
        wer = total_wer / len(test_loader.dataset)
        cer = total_cer / len(test_loader.dataset)
        wer *= 100
        cer *= 100
        loss_results[epoch] = avg_loss
        wer_results[epoch] = wer
        cer_results[epoch] = cer
        print('Validation Summary Epoch: [{0}]\t'
              'Average WER {wer:.3f}\t'
              'Average CER {cer:.3f}\t'.format(epoch + 1, wer=wer, cer=cer))

        if args.visdom:
            x_axis = epochs[0:epoch + 1]
            y_axis = [
                loss_results[0:epoch + 1],
                wer_results[0:epoch + 1],
                cer_results[0:epoch + 1]
            ]
            for x in range(len(viz_windows)):
                if viz_windows[x] is None:
                    viz_windows[x] = viz.line(
                        X=x_axis,
                        Y=y_axis[x],
                        opts=opts[x],
                    )
                else:
                    viz.line(
                        X=x_axis,
                        Y=y_axis[x],
                        win=viz_windows[x],
                        update='replace',
                    )
        if args.tensorboard:
            info = {'Avg Train Loss': avg_loss, 'Avg WER': wer, 'Avg CER': cer}
            for tag, val in info.items():
                logger.scalar_summary(tag, val, epoch + 1)
            if args.log_params:
                for tag, value in model.named_parameters():
                    tag = tag.replace('.', '/')
                    logger.histo_summary(tag, to_np(value), epoch + 1)
                    logger.histo_summary(tag + '/grad', to_np(value.grad), epoch + 1)
        if args.checkpoint:
            file_path = '%s/deepspeech_%d.pth.tar' % (save_folder, epoch + 1)
            torch.save(DeepSpeech.serialize(model, optimizer=optimizer, epoch=epoch,
                                            loss_results=loss_results,
                                            wer_results=wer_results,
                                            cer_results=cer_results),
                       file_path)

        # anneal lr
        optim_state = optimizer.state_dict()
        optim_state['param_groups'][0]['lr'] = optim_state['param_groups'][0]['lr'] / args.learning_anneal
        optimizer.load_state_dict(optim_state)
        print('Learning rate annealed to: {lr:.6f}'.format(lr=optim_state['param_groups'][0]['lr']))

        avg_loss = 0
        if not args.no_bucketing and epoch == 0:
            print("Switching to bucketing sampler for following epochs")
            train_dataset = SpectrogramDatasetWithLength(audio_conf=audio_conf,
                                                         manifest_filepath=args.train_manifest,
                                                         labels=labels, normalize=True,
                                                         augment=args.augment)
            sampler = BucketingSampler(train_dataset)
            train_loader.sampler = sampler

    torch.save(DeepSpeech.serialize(model, optimizer=optimizer), args.final_model_path)

parser.add_argument('--verbose',
                    action="store_true",
                    help="print out decoded output and error of each sample")
no_decoder_args = parser.add_argument_group(
    "No Decoder Options",
    "Configuration options for when no decoder is specified")
no_decoder_args.add_argument('--output-path',
                             default=None,
                             type=str,
                             help="Where to save raw acoustic output")
parser = add_decoder_args(parser)
args = parser.parse_args()

if __name__ == '__main__':
    torch.set_grad_enabled(False)
    model = DeepSpeech.load_model(args.model_path)
    device = torch.device("cuda" if args.cuda else "cpu")
    model = model.to(device)
    model.eval()

    labels = DeepSpeech.get_labels(model)
    audio_conf = DeepSpeech.get_audio_conf(model)

    if args.decoder == "beam":
        from decoder import BeamCTCDecoder
        decoder = BeamCTCDecoder(labels,
                                 lm_path=args.lm_path,
                                 alpha=args.alpha,
                                 beta=args.beta,
                                 cutoff_top_n=args.cutoff_top_n,

try:
    os.makedirs(save_folder)
except OSError as e:
    if e.errno == errno.EEXIST:
        print('Model Save directory already exists.')
    else:
        raise
criterion = CTCLoss()

avg_loss, start_epoch, start_iter = 0, 0, 0
if args.continue_from:  # Starting from previous model
    print("Loading checkpoint model %s" % args.continue_from)
    package = torch.load(args.continue_from, map_location=lambda storage, loc: storage)
    model_teacher = DeepSpeech.load_model_package(package)
    labels = DeepSpeech.get_labels(model_teacher)
    audio_conf = DeepSpeech.get_audio_conf(model_teacher)
    parameters_teacher = model_teacher.parameters()
    optimizer_teacher = torch.optim.SGD(parameters_teacher,
                                        lr=args.lr,
                                        momentum=args.momentum,
                                        nesterov=True)
    # load student model with pretrained model
    '''
    model_student = DeepSpeech.load_model_package(package)
    parameters_student = model_student.parameters()
    optimizer_student = torch.optim.SGD(parameters_student,
                                        lr=args.lr,
                                        momentum=args.momentum,
                                        nesterov=True)
    '''