def load_model(device, model_path, is_cuda):
    """Load a DeepSpeech checkpoint, switch it to eval mode and move it to ``device``.

    :param device: target passed to ``model.to``.
    :param model_path: checkpoint location handed to ``DeepSpeech.load_model``.
    :param is_cuda: when truthy and the loaded model reports
        ``mixed_precision``, the model is passed through
        ``convert_model_to_half`` before being returned.
    :return: the prepared model.
    """
    net = DeepSpeech.load_model(model_path)
    net.eval()
    net = net.to(device)
    # ``mixed_precision`` is only consulted when ``is_cuda`` is truthy
    # (short-circuit), exactly one attribute read at most.
    wants_half = is_cuda and net.mixed_precision
    if not wants_half:
        return net
    return convert_model_to_half(net)
def load_model(device, model_path, use_half):
    """Load a DeepSpeech checkpoint in eval mode on ``device``.

    :param device: target passed to ``model.to``.
    :param model_path: checkpoint location handed to ``DeepSpeech.load_model``.
    :param use_half: when truthy, the result of ``model.half()`` is returned
        instead of the model as moved to ``device``.
    :return: the prepared model.
    """
    net = DeepSpeech.load_model(model_path)
    net.eval()
    net = net.to(device)
    return net.half() if use_half else net
def main():
    """Parse command-line options, build the shared inference objects and run the server.

    Fills the module-level ``model``, ``spect_parser``, ``decoder`` and
    ``args`` globals, then hands control to ``app.run``.
    """
    import argparse
    global model, spect_parser, decoder, args

    cli = argparse.ArgumentParser(description='DeepSpeech transcription server')
    cli.add_argument('--host', type=str, default='0.0.0.0', help='Host to be used by the server')
    cli.add_argument('--port', type=int, default=8888, help='Port to be used by the server')
    # Inference options are registered first, decoder options second.
    cli = add_decoder_args(add_inference_args(cli))
    args = cli.parse_args()

    logging.getLogger().setLevel(logging.DEBUG)
    logging.info('Setting up server...')

    # Gradients are switched off globally before the model is loaded.
    torch.set_grad_enabled(False)
    model = DeepSpeech.load_model(args.model_path)
    if args.cuda:
        model.cuda()
    model.eval()

    vocabulary = DeepSpeech.get_labels(model)
    audio_settings = DeepSpeech.get_audio_conf(model)

    if args.decoder != "beam":
        # Anything other than "beam" falls back to the greedy decoder.
        decoder = GreedyDecoder(vocabulary, blank_index=vocabulary.index('_'))
    else:
        from decoder import BeamCTCDecoder
        decoder = BeamCTCDecoder(
            vocabulary,
            lm_path=args.lm_path,
            alpha=args.alpha,
            beta=args.beta,
            cutoff_top_n=args.cutoff_top_n,
            cutoff_prob=args.cutoff_prob,
            beam_width=args.beam_width,
            num_processes=args.lm_workers,
        )

    spect_parser = SpectrogramParser(audio_settings, normalize=True)
    logging.info('Server initialised')
    # NOTE(review): debug=True is combined with a default host of 0.0.0.0;
    # confirm this server is never exposed beyond a trusted network.
    app.run(host=args.host, port=args.port, debug=True, use_reloader=False)
def load_model(device, model_path, model_name, use_half):
    """Load a checkpoint of the requested architecture in eval mode on ``device``.

    :param device: target passed to ``model.to``.
    :param model_path: checkpoint location handed to the architecture's
        ``load_model`` class method.
    :param model_name: ``'DeepSpeech'`` or ``'DFCNN'``; selects which class
        loads the checkpoint.
    :param use_half: when truthy, ``model.half()`` is applied before returning.
    :return: the prepared model.
    :raises ValueError: if ``model_name`` is not one of the supported names.
        (Previously this case fell through and failed later with an
        ``UnboundLocalError`` on ``model``.)
    """
    if model_name == 'DeepSpeech':
        model = DeepSpeech.load_model(model_path)
    elif model_name == 'DFCNN':
        model = DFCNN.load_model(model_path)
    else:
        # Fail early with an actionable message instead of leaving ``model``
        # unbound for the statements below.
        raise ValueError(
            "Unsupported model_name {!r}; expected 'DeepSpeech' or 'DFCNN'".format(
                model_name))
    model.eval()
    model = model.to(device)
    if use_half:
        model = model.half()
    return model
def __init__(self, model_path):
    """
    Load a DeepSpeech checkpoint and build the helpers stored on the instance.

    Sets ``deep_speech_model`` (in eval mode), ``audio_conf``, ``decoder``
    (a ``GreedyDecoder`` over the model's labels) and ``parser``
    (a normalizing ``SpectrogramParser``).

    :param model_path: filesystem path of the checkpoint; must exist.
    """
    assert os.path.exists(model_path), \
        "Cannot find model here {}".format(model_path)

    net = DeepSpeech.load_model(model_path)
    self.deep_speech_model = net
    net.eval()

    vocabulary = DeepSpeech.get_labels(net)
    self.audio_conf = DeepSpeech.get_audio_conf(net)
    self.decoder = GreedyDecoder(vocabulary)
    self.parser = SpectrogramParser(self.audio_conf, normalize=True)
def load_model(device, model_path, use_half):
    """Return a DeepSpeech model loaded from ``model_path`` and placed on ``device``.

    The model is put in eval mode before it is moved. When ``use_half`` is
    truthy the model's ``half()`` result is returned instead.

    :param device: target passed to ``model.to`` (GPU/CPU).
    :param model_path: checkpoint location handed to ``DeepSpeech.load_model``.
    :param use_half: whether to apply ``half()`` to the model.
    :return: the prepared model.
    """
    loaded = DeepSpeech.load_model(model_path)
    loaded.eval()
    placed = loaded.to(device)
    if not use_half:
        return placed
    return placed.half()
parser.add_argument('--verbose', action="store_true", help="print out decoded output and error of each sample") no_decoder_args = parser.add_argument_group( "No Decoder Options", "Configuration options for when no decoder is " "specified") no_decoder_args.add_argument('--output-path', default=None, type=str, help="Where to save raw acoustic output") parser = add_decoder_args(parser) args = parser.parse_args() if __name__ == '__main__': torch.set_grad_enabled(False) model = DeepSpeech.load_model(args.model_path) device = torch.device("cuda" if args.cuda else "cpu") model = model.to(device) model.eval() labels = DeepSpeech.get_labels(model) audio_conf = DeepSpeech.get_audio_conf(model) if args.decoder == "beam": from decoder import BeamCTCDecoder decoder = BeamCTCDecoder(labels, lm_path=args.lm_path, alpha=args.alpha, beta=args.beta, cutoff_top_n=args.cutoff_top_n,
beam_args.add_argument('--lm_alpha', default=0.8, type=float, help='Language model weight') beam_args.add_argument('--lm_beta1', default=1, type=float, help='Language model word bonus (all words)') beam_args.add_argument('--lm_beta2', default=1, type=float, help='Language model word bonus (IV words)') args = parser.parse_args() if __name__ == '__main__': model = DeepSpeech.load_model(args.model_path, cuda=args.cuda) model.eval() labels = DeepSpeech.get_labels(model) audio_conf = DeepSpeech.get_audio_conf(model) if args.decoder == "beam": from decoder import BeamCTCDecoder decoder = BeamCTCDecoder(labels, beam_width=args.beam_width, top_paths=1, space_index=labels.index(' '), blank_index=labels.index('_'), lm_path=args.lm_path, trie_path=args.trie_path, lm_alpha=args.lm_alpha,
def attack2(self, init_delta, target, model_path):
    """Second attack stage: refine a perturbation with two alternating objectives.

    Each iteration takes one sign-gradient step on the CTC loss towards
    ``self.target_phrase`` (``self.optim21``) and one on an ``alpha``-weighted
    penalty ``mean(relu(psd - th))`` (``self.optim22``). ``th`` and
    ``psd_max`` come from ``generate_th`` on the original audio
    (presumably a masking threshold -- confirm against ``generate_th``).

    Relies on state prepared elsewhere on the instance: ``self.original``,
    ``self.mask``, ``self.lengths``, ``self.target_phrase``,
    ``self.target_phrase_lengths``, ``self.batch_size``, ``self.lr2``,
    ``self.num_iterations2`` and ``self.decoder``. All tensors are moved to
    CUDA with ``.cuda()``.

    NOTE(review): the source was collapsed onto single lines; the block
    nesting below is reconstructed from syntax -- confirm against the
    original file.

    :param init_delta: initial perturbation, wrapped in ``torch.FloatTensor``.
    :param target: per-example target strings; compared upper-cased with the
        decoder output.
    :param model_path: checkpoint path passed to ``DeepSpeech.load_model``.
    :return: ``self.final_deltas`` -- per example, the perturbed input
        (``new_input`` truncated to the example length) as a NumPy array, or
        ``None`` if never stored.
    """
    self.delta2 = torch.FloatTensor(init_delta).cuda()
    self.delta2.requires_grad = True
    self.rescale = torch.ones((self.batch_size, 1)).cuda()
    self.final_deltas = [None] * self.batch_size
    # One penalty weight per example, all starting at 1.
    self.alpha = torch.ones((self.batch_size, )).cuda() * 1
    #self.alpha = 1
    model = DeepSpeech.load_model(model_path)
    model = model.cuda()
    # Two Adam optimizers share the same parameter: optim21 is stepped after
    # the CTC loss, optim22 after the PSD penalty.
    self.optim21 = torch.optim.Adam([self.delta2], lr=2)
    self.optim22 = torch.optim.Adam([self.delta2], lr=self.lr2)
    criterion = CTCLoss()
    th_batch = []
    psd_max_batch = []
    # Collect th and psd_max once per original example (before the loop).
    for ii in range(self.batch_size):
        th, _, psd_max = generate_th(self.original[ii].cpu().numpy(),
                                     fs=16000,
                                     window_size=2048)
        th_batch.append(th)
        psd_max_batch.append(psd_max)
    th_batch = np.array(th_batch)
    psd_max_batch = np.array(psd_max_batch)
    th_batch = torch.FloatTensor(th_batch).cuda()
    psd_max_batch = torch.FloatTensor(psd_max_batch).cuda()
    MAX = self.num_iterations2
    # NOTE(review): the model is put in train mode although only delta2 is
    # handed to the optimizers -- presumably required for backward through
    # the network; confirm.
    model.train()
    # Lowest PSD penalty seen so far per example (used to keep the best result).
    loss_th = [np.inf] * self.batch_size
    for i in range(MAX):
        # print out some debug information every 10 iterations
        #print(self.delta)
        apply_delta = torch.clamp(
            self.delta2, -2000,
            2000) * self.rescale  #[batch_size * max_audio_len]
        new_input = apply_delta * self.mask + self.original  #[batch_size * max_audio_len]
        #pass_in = torch.clamp(new_input + self.noise, -2**15, 2**15-1) #[batch_szie * max_audio_len]
        # Clamp to [-2**15, 2**15 - 1], then divide by 2**15 before the model.
        pass_in = torch.clamp(new_input, -2**15, 2**15 - 1)
        pass_in = torch.div(pass_in, 2**15)  #[batch_szie * max_audio_len]
        logits, logits_sizes = get_logits(pass_in, self.lengths.int(),
                                          model)  #[batch_size * T * H]
        logits_ = logits.transpose(0, 1)
        # loss
        loss2 = criterion(logits_, self.target_phrase, logits_sizes,
                          self.target_phrase_lengths).cuda()
        loss_value_2 = loss2.item()
        self.optim21.zero_grad()
        # The graph is retained because loss1 below also backpropagates
        # through apply_delta.
        loss2.backward(retain_graph=True)
        # Only the sign of the gradient is handed to the optimizer.
        self.delta2.grad = torch.sign(self.delta2.grad)
        self.optim21.step()
        loss1 = 0
        loss1_each = []
        for ii in range(self.batch_size):
            psd = psd_transform(apply_delta[ii],
                                psd_max_batch[ii],
                                win_length=2048,
                                win_step=512)
            # Penalise only the part of psd that exceeds th.
            loss1 += self.alpha[ii] * torch.mean(
                torch.relu(psd - th_batch[ii]))
            loss1_each.append(
                torch.mean(torch.relu(psd - th_batch[ii])).item())
            #psd_num = psd.cpu().detach().numpy()
            #th_ = th_batch[ii].cpu().detach().numpy()
        loss1 = loss1 / self.batch_size
        loss_value_1 = np.mean(loss1_each)
        self.optim22.zero_grad()
        loss1.backward()
        # Replace each example's gradient by alpha * sign(gradient).
        for ii in range(self.batch_size):
            self.delta2.grad[ii] = self.alpha[ii] * torch.sign(
                self.delta2.grad[ii])
        #grad = np.sum(self.delta2.grad.cpu().numpy())
        #if grad != grad:
        #    print("NaN")
        self.optim22.step()
        # NOTE(review): apply_delta_ is assigned but not read anywhere in
        # this method.
        apply_delta_ = torch.clamp(self.delta2, -2000, 2000) * self.rescale
        print('loss: ', loss_value_1, loss_value_2)
        # Hard-coded learning-rate drops for both optimizers at iterations
        # 2000 and 3200.
        if i + 1 == 2000:
            param_groups = self.optim21.param_groups
            for g in param_groups:
                g['lr'] = 0.1
            param_groups = self.optim22.param_groups
            for g in param_groups:
                g['lr'] = 0.1
        if i + 1 == 3200:
            param_groups = self.optim21.param_groups
            for g in param_groups:
                g['lr'] = 0.01
            param_groups = self.optim22.param_groups
            for g in param_groups:
                g['lr'] = 0.01
        # decode_out is refreshed only every 10th iteration; the checks below
        # read it only on multiples of 50 / 100 (short-circuit).
        if (i + 1) % 10 == 0:
            decode_out, _ = self.decoder.decode(logits, logits_sizes)
            print(i + 1, decode_out[0], [target[0]])
        for ii in range(self.batch_size):
            # Success on a 50th iteration, or last iteration with nothing
            # stored yet: raise alpha (capped at 1000) and maybe store.
            if ((i + 1) % 50 == 0 and decode_out[ii] == [target[ii].upper()]) or (
                    i == MAX - 1 and self.final_deltas[ii] is None):
                self.alpha[ii] = 1.2 * self.alpha[ii]
                if self.alpha[ii] > 1000:
                    self.alpha[ii] = 1000
                # Adjust the best solution found so far
                if loss1_each[ii] < loss_th[ii]:
                    loss_th[ii] = loss1_each[ii]
                    # Stores the perturbed input computed at the top of this
                    # iteration, truncated to the example's length.
                    self.final_deltas[ii] = new_input[ii][
                        0:self.lengths[ii].int()].cpu().detach().numpy()
                print("up alpha=%f" % (self.alpha[ii]))
            # Failure on a 100th iteration: lower alpha.
            if ((i + 1) % 100 == 0 and decode_out[ii] != [target[ii].upper()]):
                self.alpha[ii] = 0.6 * self.alpha[ii]
                ''' if self.alpha <= 100: self.alpha = 100 else: # Adjust the best solution found so far print("down alpha=%f" % (self.alpha)) '''
                print("down alpha=%f" % (self.alpha[ii]))
    return self.final_deltas
def attack1(self, audios, lengths, max_audio_len, targets, model_path):
    """First attack stage: optimise a bounded perturbation towards the target text.

    Runs ``self.num_iterations1`` sign-gradient Adam steps on ``self.delta1``
    against a CTC loss (optionally plus a mean-squared distortion term, see
    ``self.l2penalty``). When an example decodes to its target, the
    per-example bound ``2000 * self.rescale`` is tightened and the current
    perturbed input is stored.

    Also initialises instance state used afterwards: ``self.original``,
    ``self.lengths``, ``self.mask``, ``self.rescale``, ``self.target_phrase``,
    ``self.target_phrase_lengths`` and ``self.final_deltas``. Tensors are
    moved to CUDA with ``.cuda()``.

    NOTE(review): the source was collapsed onto single lines; the block
    nesting below is reconstructed from syntax -- confirm against the
    original file.

    :param audios: batch of audio samples, wrapped in ``torch.FloatTensor``.
    :param lengths: per-example sample counts, used to build the mask.
    :param max_audio_len: second dimension of the perturbation and the mask.
    :param targets: per-example target strings (upper-cased before mapping
        through ``self.labels_map``).
    :param model_path: checkpoint path passed to ``DeepSpeech.load_model``.
    :return: ``self.final_deltas`` -- per example, the stored perturbed input
        (``new_input``) as a NumPy array, or ``None`` if never stored.
    """
    self.max_audio_len = max_audio_len
    self.original = torch.FloatTensor(audios).cuda()
    self.lengths = torch.FloatTensor(lengths)
    #define some variables
    self.delta1 = torch.zeros((self.batch_size, self.max_audio_len)).cuda()
    self.delta1.requires_grad = True
    self.rescale = torch.ones((self.batch_size, 1)).cuda()
    # 1 for positions below the example's length, 0 for the padding after it.
    self.mask = torch.FloatTensor(
        np.array([[1 if i < l else 0 for i in range(self.max_audio_len)]
                  for l in self.lengths])).cuda()
    self.final_deltas = [None] * self.batch_size
    self.target_phrase_lengths = torch.IntTensor(self.batch_size)
    self.target_phrase = []
    # Targets are concatenated into one flat label list with a separate
    # per-example length tensor.
    for x in range(self.batch_size):
        # NOTE(review): filter(None, ...) drops every falsy value, i.e.
        # characters missing from labels_map and also a label mapped to 0.
        phrase = list(
            filter(
                None,
                [self.labels_map.get(x) for x in list(targets[x].upper())]))
        self.target_phrase_lengths[x] = len(phrase)
        self.target_phrase.extend(phrase)
    self.target_phrase = torch.IntTensor(self.target_phrase)
    #print(self.target_phrase.size(), self.target_phrase_lengths)
    model = DeepSpeech.load_model(model_path)
    model = model.cuda()
    self.optim1 = torch.optim.Adam([self.delta1], lr=self.lr1)
    criterion = CTCLoss()
    MAX = self.num_iterations1
    # NOTE(review): the model is put in train mode although only delta1 is
    # handed to the optimizer -- presumably required for backward through
    # the network; confirm.
    model.train()
    #self.noise = torch.randn(self.delta1.shape).cuda() #[batch_szie * max_audio_len]
    for i in range(MAX):
        # print out some debug information every 10 iterations
        apply_delta = torch.clamp(
            self.delta1, -2000,
            2000) * self.rescale  #[batch_size * max_audio_len]
        new_input = apply_delta * self.mask + self.original  #[batch_size * max_audio_len]
        #pass_in = torch.clamp(new_input + self.noise, -2**15, 2**15-1) #[batch_szie * max_audio_len]
        # Clamp to [-2**15, 2**15 - 1], then divide by 2**15 before the model.
        pass_in = torch.clamp(new_input, -2**15, 2**15 - 1)
        pass_in = torch.div(pass_in, 2**15)  #[batch_szie * max_audio_len]
        logits, logits_sizes = get_logits(pass_in, self.lengths.int(),
                                          model)  #[batch_size * T * H]
        logits_ = logits.transpose(0, 1)
        # loss
        # A finite l2penalty weights the CTC loss and adds the mean squared
        # distortion; an infinite one means CTC loss only.
        if not np.isinf(self.l2penalty):
            loss = torch.mean(
                (new_input - self.original) **2) + self.l2penalty * criterion(
                    logits_, self.target_phrase, logits_sizes,
                    self.target_phrase_lengths).cuda()
        else:
            loss = criterion(logits_, self.target_phrase, logits_sizes,
                             self.target_phrase_lengths).cuda()
        loss_value = loss.item()
        # optimize
        self.optim1.zero_grad()
        loss.backward()
        # grad sign
        self.delta1.grad = torch.sign(self.delta1.grad)
        self.optim1.step()
        print('loss: ', loss_value)
        # decode_out is refreshed only every 10th iteration; the check below
        # reads it only on those iterations (short-circuit).
        if (i + 1) % 10 == 0:
            decode_out, _ = self.decoder.decode(logits, logits_sizes)
        #print(decode_out, targets)
        for ii in range(self.batch_size):
            # Success on a 10th iteration, or last iteration with nothing
            # stored yet.
            if ((i + 1) % 10 == 0 and decode_out[ii] == [targets[ii].upper()]) or (
                    i == MAX - 1 and self.final_deltas[ii] is None):
                bound_tmp = torch.max(torch.abs(self.delta1[ii])).item()
                # If the allowed bound exceeds the largest magnitude actually
                # used, snap it down to that magnitude first.
                if self.rescale[ii][0] * 2000 > bound_tmp:
                    print("It's way over", bound_tmp / 2000.0)
                    self.rescale[ii][0] = bound_tmp / 2000.0
                # Then shrink the bound by 20% for the following iterations.
                self.rescale[ii][0] *= .8
                # Adjust the best solution found so far
                self.final_deltas[ii] = new_input[ii].cpu().detach().numpy(
                )
                print("bound=%f" % (2000 * self.rescale[ii][0]))
    return self.final_deltas
def transcribe(audio_path, parser, model, decoder, cuda=False):
    """Run one audio file through the parser, the model and the decoder.

    The parsed tensor is reshaped to ``(1, 1, size(0), size(1))`` and the
    model is called with it together with an int tensor holding the size of
    its last dimension.

    :param audio_path: passed unchanged to ``parser.parse_audio``.
    :param parser: object exposing ``parse_audio``.
    :param model: callable returning ``(output, output_sizes)``.
    :param decoder: object exposing ``decode(output, output_sizes)``.
    :param cuda: when truthy, the input tensor is moved with ``.cuda()``.
    :return: ``(decoded_output, decoded_offsets)`` as produced by the decoder.
    """
    features = parser.parse_audio(audio_path).contiguous()
    dim0, dim1 = features.size(0), features.size(1)
    batch = features.view(1, 1, dim0, dim1)
    if cuda:
        batch = batch.cuda()
    sizes = torch.IntTensor([batch.size(3)]).int()
    network_out, network_out_sizes = model(batch, sizes)
    text, offsets = decoder.decode(network_out, network_out_sizes)
    return text, offsets


if __name__ == '__main__':
    # Script entry: gradients are switched off globally, then the model is
    # loaded, optionally moved to CUDA and put in eval mode.
    torch.set_grad_enabled(False)
    model = DeepSpeech.load_model(args.model_path)
    if args.cuda:
        model.cuda()
    model.eval()

    labels = DeepSpeech.get_labels(model)
    audio_conf = DeepSpeech.get_audio_conf(model)

    # "beam" selects the beam-search CTC decoder; anything else is greedy.
    if args.decoder == "beam":
        from decoder import BeamCTCDecoder

        decoder = BeamCTCDecoder(
            labels,
            lm_path=args.lm_path,
            alpha=args.alpha,
            beta=args.beta,
            cutoff_top_n=args.cutoff_top_n,
            cutoff_prob=args.cutoff_prob,
            beam_width=args.beam_width,
            num_processes=args.lm_workers,
        )
    else:
        decoder = GreedyDecoder(labels, blank_index=labels.index('_'))
# Hard-coded inference configuration followed by model and decoder setup.

# Checkpoint file inside ``model_path``; the separator is a literal backslash.
model_file = model_path + "\\librispeech_pretrained.pth"
# NOTE(review): "store_true" is a non-empty string (truthy), not a boolean --
# it looks copied from an argparse ``action``; confirm how ``cuda`` and
# ``verbose`` are consumed.
cuda = "store_true"
batch_size = 20
num_workers = 4
# Decoder selector; this name is rebound to the decoder object below.
decoder = "greedy"
verbose = "store_true"
top_paths = 1
# Settings forwarded to BeamCTCDecoder when the "beam" decoder is selected.
beam_width = 10
lm_path = None
alpha = 0.8
beta = 1
cutoff_top_n = 40
cutoff_prob = 1
lm_workers = 1
# Load the checkpoint and switch the model to eval mode.
model = DeepSpeech.load_model(model_file)
model.eval()
labels = DeepSpeech.get_labels(model)
audio_conf = DeepSpeech.get_audio_conf(model)
# Replace the selector string with the matching decoder instance
# (None for any unrecognised selector).
if decoder == "beam":
    from decoder import BeamCTCDecoder
    decoder = BeamCTCDecoder(labels,
                             lm_path=lm_path,
                             alpha=alpha,
                             beta=beta,
                             cutoff_top_n=cutoff_top_n,
                             cutoff_prob=cutoff_prob,
                             beam_width=beam_width,
                             num_processes=lm_workers)
elif decoder == "greedy":
    decoder = GreedyDecoder(labels, blank_index=labels.index('_'))
else:
    decoder = None
cer += cer_inst total_cer += cer total_wer += wer wer = total_wer / len(test_loader.dataset) cer = total_cer / len(test_loader.dataset) return [grid_index, mesh_x, mesh_y, lm_alpha, lm_beta, wer, cer] if __name__ == '__main__': if args.lm_path is None: print("error: LM must be provided for tuning") sys.exit(1) model = DeepSpeech.load_model(args.model_path, cuda=False) model.eval() labels = DeepSpeech.get_labels(model) audio_conf = DeepSpeech.get_audio_conf(model) test_dataset = SpectrogramDataset(audio_conf=audio_conf, manifest_filepath=args.test_manifest, labels=labels, normalize=True) logits = np.load(args.logits) batch_size = logits[0][0].shape[0] results = [] def result_callback(result): results.append(result)
args = parser.parse_args() os.environ["CUDA_VISIBLE_DEVICES"] = str(args.gpu_num) if __name__ == '__main__': # args.cuda = True # args.verbose = True # args.decoder = "beam" for a in range(args.start_epoch, args.end_epoch + 1): from jiwer import wer t0 = time.time() model_path = os.path.join(args.model_path, "deepspeech_{}.pth".format(str(a))) torch.set_grad_enabled(False) if not args.finetune: model = DeepSpeech.load_model(model_path) if args.cuda: model.cuda() model.eval() labels = DeepSpeech.get_labels(model) C_labels = labels.copy() with open(args.E2C) as label_file: E2C = json.load(label_file) with open(args.C2E) as label_file: C2E = json.load(label_file) for i, v in enumerate(C_labels): if v in E2C: C_labels[i] = E2C[v]
window_size=args.window_size, window_stride=args.window_stride, window=args.window, noise_dir=args.noise_dir, noise_prob=args.noise_prob, noise_levels=(args.noise_min, args.noise_max)) rnn_type = args.rnn_type.lower() assert rnn_type in supported_rnns, "rnn_type should be either lstm, rnn or gru" #model = DeepSpeech(rnn_hidden_size=args.hidden_size, # nb_layers=args.hidden_layers, # labels=labels, # rnn_type=supported_rnns[rnn_type], # audio_conf=audio_conf, # bidirectional=args.bidirectional) model = DeepSpeech.load_model('models/librispeech_pretrained.pth') parameters = model.parameters() #optimizer = torch.optim.SGD(parameters, lr=args.lr, # momentum=args.momentum, nesterov=True) decoder = GreedyDecoder(labels) train_dataset = SpectrogramDataset(audio_conf=audio_conf, manifest_filepath=args.train_manifest, labels=labels, normalize=True, augment=args.augment) test_dataset = SpectrogramDataset(audio_conf=audio_conf, manifest_filepath=args.val_manifest, labels=labels, normalize=True, augment=False)