def __init__(self, model_path):
    """Load a DeepSpeech checkpoint and set up greedy decoding for transcription.

    :param model_path: filesystem path to a serialized DeepSpeech model.
    :raises FileNotFoundError: if ``model_path`` does not exist.
    """
    # Explicit raise instead of `assert`: asserts are stripped under
    # `python -O`, which would let a missing model fail later inside
    # DeepSpeech.load_model with a much less helpful error.
    if not os.path.exists(model_path):
        raise FileNotFoundError("Cannot find model here {}".format(model_path))
    self.deep_speech_model = DeepSpeech.load_model(model_path)
    self.deep_speech_model.eval()  # inference mode (no dropout / BN updates)
    labels = DeepSpeech.get_labels(self.deep_speech_model)
    self.audio_conf = DeepSpeech.get_audio_conf(self.deep_speech_model)
    self.decoder = GreedyDecoder(labels)
    self.parser = SpectrogramParser(self.audio_conf, normalize=True)
def main():
    """Parse CLI flags, load the DeepSpeech model, and launch the HTTP server.

    Populates the module-level globals consumed by the request handlers:
    ``model``, ``spect_parser``, ``decoder`` and ``args``.
    """
    import argparse
    global model, spect_parser, decoder, args

    arg_parser = argparse.ArgumentParser(description='DeepSpeech transcription server')
    arg_parser.add_argument('--host', type=str, default='0.0.0.0',
                            help='Host to be used by the server')
    arg_parser.add_argument('--port', type=int, default=8888,
                            help='Port to be used by the server')
    arg_parser = add_decoder_args(add_inference_args(arg_parser))
    args = arg_parser.parse_args()

    logging.getLogger().setLevel(logging.DEBUG)
    logging.info('Setting up server...')

    torch.set_grad_enabled(False)  # inference only: skip autograd bookkeeping
    model = DeepSpeech.load_model(args.model_path)
    if args.cuda:
        model.cuda()
    model.eval()

    label_set = DeepSpeech.get_labels(model)
    conf = DeepSpeech.get_audio_conf(model)
    if args.decoder == "beam":
        # Beam decoding pulls in the (optional) ctcdecode-backed decoder.
        from decoder import BeamCTCDecoder
        decoder = BeamCTCDecoder(label_set, lm_path=args.lm_path, alpha=args.alpha,
                                 beta=args.beta, cutoff_top_n=args.cutoff_top_n,
                                 cutoff_prob=args.cutoff_prob,
                                 beam_width=args.beam_width,
                                 num_processes=args.lm_workers)
    else:
        decoder = GreedyDecoder(label_set, blank_index=label_set.index('_'))
    spect_parser = SpectrogramParser(conf, normalize=True)

    logging.info('Server initialised')
    # NOTE(review): debug=True enables the werkzeug interactive debugger; on a
    # host bound to 0.0.0.0 that is a remote-code-execution risk — confirm
    # this is intentional before deploying.
    app.run(host=args.host, port=args.port, debug=True, use_reloader=False)
class SpeechTranscriber:
    """Transcribe audio files with a pretrained DeepSpeech model (greedy decoding)."""

    def __init__(self, model_path):
        """Load the checkpoint and build the decoder/spectrogram pipeline.

        :param model_path: path to a serialized DeepSpeech checkpoint on disk.
        """
        assert os.path.exists(model_path), "Cannot find model here {}".format(
            model_path)
        self.deep_speech_model = DeepSpeech.load_model(model_path)
        self.deep_speech_model.eval()  # inference mode
        label_set = DeepSpeech.get_labels(self.deep_speech_model)
        self.audio_conf = DeepSpeech.get_audio_conf(self.deep_speech_model)
        self.decoder = GreedyDecoder(label_set)
        self.parser = SpectrogramParser(self.audio_conf, normalize=True)

    def transcribe(self, audio_file):
        """Run the model on one audio file and return the decoded output.

        :param audio_file: path of the audio file to transcribe.
        :return: whatever ``GreedyDecoder.decode`` produces for the file.
        """
        spectrogram = self.parser.parse_audio(audio_file).contiguous()
        # Single-item batch with one channel: (1, 1, size(0), size(1)).
        batch = spectrogram.view(1, 1, spectrogram.size(0), spectrogram.size(1))
        # volatile=True is the legacy (pre-0.4 PyTorch) way to disable autograd.
        out = self.deep_speech_model(Variable(batch, volatile=True))
        out = out.transpose(0, 1)  # TxNxH
        return self.decoder.decode(out.data)
def build(model_path="/workspace/models/deepspeech_final.pth"):
    """Initialise the module-level inference globals for the server.

    Loads the DeepSpeech model on CPU and prepares the greedy decoder and
    spectrogram parser used by the request handlers. Sets the globals
    ``model``, ``spect_parser``, ``decoder`` and ``device``.

    :param model_path: checkpoint to load. The default preserves the
        previously hard-coded container path, so existing callers of
        ``build()`` are unaffected.
    """
    global model, spect_parser, decoder, device
    logging.getLogger().setLevel(logging.DEBUG)
    logging.info('Setting up server...')
    torch.set_grad_enabled(False)  # inference only
    device = torch.device("cpu")
    # Third argument is the half-precision flag (cf. load_model(device, path, half)
    # elsewhere in this codebase); False keeps full precision on CPU.
    model = load_model(device, model_path, False)
    decoder = GreedyDecoder(model.labels, blank_index=model.labels.index('_'))
    spect_parser = SpectrogramParser(model.audio_conf, normalize=True)
    logging.info('Server initialised')
def __init__(self, *args, **kwargs):
    """Test fixture: load a pretrained DeepSpeech model on CPU with greedy decoding.

    Forwards ``*args``/``**kwargs`` to the parent test-case constructor and
    stores the model, decoder and spectrogram parser on ``self``.
    """
    super(Tester, self).__init__(*args, **kwargs)
    device = 'cpu'
    # TODO(review): path is hard-coded to one developer's machine; make it
    # configurable (env var or test config) before sharing this suite.
    model_path = '/home/chris/git/deepspeech.pytorch/models/ted_pretrained_v2.pth'
    half = False  # full precision on CPU
    model = load_model(device, model_path, half)
    # NOTE: a commented-out BeamCTCDecoder branch was removed here; this
    # fixture always uses greedy decoding (see git history for the beam code).
    decoder = GreedyDecoder(model.labels, blank_index=model.labels.index('_'))
    self.half = half
    self.device = device
    self.decoder = decoder
    self.model = model
    self.spect_parser = SpectrogramParser(model.audio_conf, normalize=True)
# NOTE(review): the first four statements below are the tail of a
# transcription helper whose `def` line falls outside this chunk; they are
# kept verbatim and cannot run standalone.
    spect = spect.to(device)  # move the spectrogram to the selected device
    # Batch of one; `[1]` is presumably the per-item length entry the model
    # expects — TODO confirm against the model's forward() signature.
    input_sizes = torch.IntTensor([1]).int()
    out, output_sizes = model(spect, input_sizes)
    return out_to_preds(out)


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='DeepSpeech transcription')
    parser = add_inference_args(parser)
    parser.add_argument('--audio-path', default='audio.wav',
                        help='Audio file to predict on')
    parser.add_argument('--offsets', dest='offsets', action='store_true',
                        help='Returns time offset information')
    parser = add_decoder_args(parser)
    args = parser.parse_args()
    device = torch.device("cuda" if args.cuda else "cpu")
    model = load_model(device, args.model_path, args.cuda)
    if args.decoder == "beam":
        from decoder import BeamCTCDecoder
        decoder = BeamCTCDecoder(model.labels, lm_path=args.lm_path, alpha=args.alpha, beta=args.beta,
                                 cutoff_top_n=args.cutoff_top_n, cutoff_prob=args.cutoff_prob,
                                 beam_width=args.beam_width, num_processes=args.lm_workers)
    else:
        decoder = GreedyDecoder(model.labels, blank_index=model.labels.index('_'))
    # NOTE(review): `parser` is rebound from the argparse parser to the
    # SpectrogramParser here — works, but the shadowing is easy to misread.
    parser = SpectrogramParser(model.audio_conf, normalize=True)
    decoded_output, decoded_offsets = transcribe(args.audio_path, parser, model, decoder, device)
    print(json.dumps(decode_results(model, decoded_output, decoded_offsets)))
# NOTE(review): the fragment below is the tail of a `parser.add_argument(...)`
# call whose opening falls outside this chunk.
                    default='hamming',
                    help='Window type for spectrogram generation')
# NOTE(review): `type=bool` is almost certainly a bug — argparse applies
# bool() to the raw string, and bool('False') is True, so any value passed on
# the command line enables CUDA. `action='store_true'` is the usual fix.
parser.add_argument('--cuda', default=True, type=bool,
                    help='Use cuda to train model')
args = parser.parse_args()

if __name__ == '__main__':
    # Checkpoint bundle: hyper-parameters plus trained weights.
    package = torch.load(args.model_path)
    model = DeepSpeech(rnn_hidden_size=package['hidden_size'],
                       nb_layers=package['hidden_layers'],
                       num_classes=package['nout'])
    if args.cuda:
        model = torch.nn.DataParallel(model).cuda()
    # NOTE(review): loading the state dict AFTER wrapping in DataParallel
    # relies on the saved keys carrying the 'module.' prefix — verify the
    # checkpoint was saved from a DataParallel model.
    model.load_state_dict(package['state_dict'])
    audio_conf = dict(sample_rate=args.sample_rate,
                      window_size=args.window_size,
                      window_stride=args.window_stride,
                      window=args.window)
    # Labels file is a JSON array of characters; join into one alphabet string.
    with open(args.labels_path) as label_file:
        labels = str(''.join(json.load(label_file)))
    decoder = ArgMaxDecoder(labels)
    # `parser` is rebound from the argparse parser to the SpectrogramParser.
    parser = SpectrogramParser(audio_conf, normalize=True)
    spect = parser.parse_audio(args.audio_path).contiguous()
    spect = spect.view(1, 1, spect.size(0), spect.size(1))
    out = model(Variable(spect))
    out = out.transpose(0, 1)  # TxNxH
    decoded_output = decoder.decode(out.data)
    print(decoded_output[0])
# Top-level script chunk: `args`, the imports, and the rest of the accept
# loop are defined outside this chunk.
model = DeepSpeech.load_model(args.model_path, cuda=args.cuda)
model.eval()  # inference mode
labels = DeepSpeech.get_labels(model)
audio_conf = DeepSpeech.get_audio_conf(model)
if args.decoder == "beam":
    from decoder import BeamCTCDecoder
    decoder = BeamCTCDecoder(labels, lm_path=args.lm_path, alpha=args.alpha, beta=args.beta,
                             cutoff_top_n=args.cutoff_top_n, cutoff_prob=args.cutoff_prob,
                             beam_width=args.beam_width, num_processes=args.lm_workers)
else:
    decoder = GreedyDecoder(labels, blank_index=labels.index('_'))
parser = SpectrogramParser(audio_conf, normalize=True)

###--- server setup
sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
server_address = ('', 10001)  # '' = bind on all interfaces, TCP port 10001
print('starting up on %s port %s' % server_address)
sock.bind(server_address)
sock.listen(1)  # at most one queued connection

file_name = 'data/recorded.wav'
bFileFound = 0
# NOTE(review): the body of this accept loop continues beyond this chunk;
# only the accept + logging portion is visible here.
while True:
    # Wait for a connection
    connection, client_address = sock.accept()
    print('connection from', client_address)
# Top-level transcription script: load model, decode one file, emit JSON.
device = torch.device("cuda" if args.cuda else "cpu")
model = DeepSpeech.load_model(args.model_path)
model = model.to(device)
model.eval()  # inference mode

labels = DeepSpeech.get_labels(model)
audio_conf = DeepSpeech.get_audio_conf(model)

if args.decoder == "beam":
    from decoder import BeamCTCDecoder
    decoder = BeamCTCDecoder(labels, lm_path=args.lm_path, alpha=args.alpha,
                             beta=args.beta, cutoff_top_n=args.cutoff_top_n,
                             cutoff_prob=args.cutoff_prob,
                             beam_width=args.beam_width,
                             num_processes=args.lm_workers)
else:
    decoder = GreedyDecoder(labels, blank_index=labels.index('_'))

# NOTE(review): augment=True at inference time looks unusual — confirm the
# parser's augmentation is intended for transcription, not only training.
parser = SpectrogramParser(audio_conf, cache_path=args.cache_dir,
                           normalize='max_frame', channel=args.channel,
                           augment=True)

decoded_output, decoded_offsets = transcribe(args.audio_path, parser, model,
                                             decoder, device)
output = decode_results(model, decoded_output, decoded_offsets)
# Attach provenance so the JSON result is self-describing.
output['input'] = {'channel': args.channel, 'source': args.audio_path}
output['model'] = {'model': args.model_path}
print(json.dumps(output, ensure_ascii=False))