Beispiel #1
0
    def __init__(self, model_path):
        """
        Load a DeepSpeech model and build the helpers needed to use it.

        The loaded model is switched to evaluation mode. A greedy decoder is
        built from the model's labels and a normalizing spectrogram parser
        from the model's audio configuration.

        :param model_path: filesystem path of the serialized model; it must
            exist, otherwise the assertion below fails
        """
        assert os.path.exists(model_path), \
            "Cannot find model here {}".format(model_path)

        network = DeepSpeech.load_model(model_path)
        self.deep_speech_model = network
        network.eval()

        vocabulary = DeepSpeech.get_labels(network)
        self.audio_conf = DeepSpeech.get_audio_conf(network)
        self.decoder = GreedyDecoder(vocabulary)
        self.parser = SpectrogramParser(self.audio_conf, normalize=True)
Beispiel #2
0
def main():
    """
    Parse command-line options, load the model and start the server.

    Fills the module-level ``model``, ``spect_parser``, ``decoder`` and
    ``args`` globals, then hands control to ``app.run``.

    NOTE(review): ``app.run`` is called with ``debug=True`` while the default
    host is ``0.0.0.0``. If ``app`` is a Flask application this exposes the
    interactive debugger on every interface -- confirm this is intended.
    """
    import argparse
    global model, spect_parser, decoder, args

    cli = argparse.ArgumentParser(description='DeepSpeech transcription server')
    cli.add_argument('--host', type=str, default='0.0.0.0', help='Host to be used by the server')
    cli.add_argument('--port', type=int, default=8888, help='Port to be used by the server')
    # The shared helpers return the parser they were given arguments for.
    cli = add_inference_args(cli)
    cli = add_decoder_args(cli)
    args = cli.parse_args()

    root_logger = logging.getLogger()
    root_logger.setLevel(logging.DEBUG)
    logging.info('Setting up server...')

    # Inference only: switch autograd off globally before loading the model.
    torch.set_grad_enabled(False)
    model = DeepSpeech.load_model(args.model_path)
    if args.cuda:
        model.cuda()
    model.eval()

    labels = DeepSpeech.get_labels(model)
    audio_conf = DeepSpeech.get_audio_conf(model)

    if args.decoder != "beam":
        # Any decoder choice other than "beam" falls back to greedy decoding.
        decoder = GreedyDecoder(labels, blank_index=labels.index('_'))
    else:
        # Imported lazily so the beam decoder is only needed when requested.
        from decoder import BeamCTCDecoder

        decoder = BeamCTCDecoder(
            labels,
            lm_path=args.lm_path,
            alpha=args.alpha,
            beta=args.beta,
            cutoff_top_n=args.cutoff_top_n,
            cutoff_prob=args.cutoff_prob,
            beam_width=args.beam_width,
            num_processes=args.lm_workers,
        )

    spect_parser = SpectrogramParser(audio_conf, normalize=True)
    logging.info('Server initialised')
    app.run(host=args.host, port=args.port, debug=True, use_reloader=False)
Beispiel #3
0
class SpeechTranscriber:
    """Bundle a loaded DeepSpeech model with a greedy decoder and a spectrogram parser."""

    def __init__(self, model_path):
        """
        Load the model stored at ``model_path`` and prepare the helpers.

        The model is put into evaluation mode; the decoder is built from the
        model's labels and the parser from the model's audio configuration.

        :param model_path: filesystem path of the serialized model; it must
            exist, otherwise the assertion below fails
        """
        assert os.path.exists(model_path), \
            "Cannot find model here {}".format(model_path)

        network = DeepSpeech.load_model(model_path)
        self.deep_speech_model = network
        network.eval()

        vocabulary = DeepSpeech.get_labels(network)
        self.audio_conf = DeepSpeech.get_audio_conf(network)
        self.decoder = GreedyDecoder(vocabulary)
        self.parser = SpectrogramParser(self.audio_conf, normalize=True)

    def transcribe(self, audio_file):
        """
        Run the model on a single audio file and decode the result.

        :param audio_file: value handed to the parser's ``parse_audio``
        :return: the result of ``self.decoder.decode`` on the model output
        """
        features = self.parser.parse_audio(audio_file).contiguous()
        # Add two leading singleton dimensions in front of the 2-D spectrogram.
        batch = features.view(1, 1, features.size(0), features.size(1))
        logits = self.deep_speech_model(Variable(batch, volatile=True))
        # Swap the first two axes before decoding (TxNxH).
        swapped = logits.transpose(0, 1)
        return self.decoder.decode(swapped.data)
Beispiel #4
0
def build(model_path="/workspace/models/deepspeech_final.pth"):
    """
    Load the model on the CPU and initialise the module-level inference state.

    Fills the ``model``, ``spect_parser``, ``decoder`` and ``device`` globals.
    Gradient tracking is disabled globally because the state is only used for
    inference.

    :param model_path: checkpoint to load; defaults to the path that used to
        be hard-coded here, so existing ``build()`` calls behave as before
    """
    global model, spect_parser, decoder, device

    logging.getLogger().setLevel(logging.DEBUG)

    logging.info('Setting up server...')
    torch.set_grad_enabled(False)
    device = torch.device("cpu")
    # Third positional argument is passed as False, exactly as before.
    model = load_model(device, model_path, False)

    # '_' is used as the blank symbol of the label set.
    decoder = GreedyDecoder(model.labels, blank_index=model.labels.index('_'))

    spect_parser = SpectrogramParser(model.audio_conf, normalize=True)
    logging.info('Server initialised')
    def __init__(self, *args, **kwargs):
        """
        Initialise the base class, then load a fixed model for CPU inference.

        Stores ``half``, ``device``, ``decoder``, ``model`` and
        ``spect_parser`` on the instance. Only greedy decoding is set up here.

        :param args: positional arguments forwarded to the base class
        :param kwargs: keyword arguments forwarded to the base class
        """
        super(Tester, self).__init__(*args, **kwargs)

        # Fixed configuration: CPU device, half flag off, local checkpoint.
        target_device = 'cpu'
        use_half = False
        checkpoint = '/home/chris/git/deepspeech.pytorch/models/ted_pretrained_v2.pth'
        network = load_model(target_device, checkpoint, use_half)

        # '_' is used as the blank symbol of the label set.
        vocabulary = network.labels
        greedy = GreedyDecoder(vocabulary, blank_index=vocabulary.index('_'))

        self.half = use_half
        self.device = target_device
        self.decoder = greedy
        self.model = network
        self.spect_parser = SpectrogramParser(network.audio_conf, normalize=True)
Beispiel #6
0
    spect = spect.to(device)
    input_sizes = torch.IntTensor([1]).int()
    out, output_sizes = model(spect, input_sizes)

    return out_to_preds(out)

if __name__ == '__main__':
    # Script entry point: transcribe one audio file and print the decoded
    # result as JSON. The module-level names bound here (parser, args, device,
    # model, decoder) are kept unchanged because functions elsewhere in the
    # file may read them as globals.
    parser = argparse.ArgumentParser(description='DeepSpeech transcription')
    parser = add_inference_args(parser)
    parser.add_argument(
        '--audio-path',
        default='audio.wav',
        help='Audio file to predict on',
    )
    parser.add_argument(
        '--offsets',
        dest='offsets',
        action='store_true',
        help='Returns time offset information',
    )
    parser = add_decoder_args(parser)
    args = parser.parse_args()

    # Select the device from the --cuda option.
    if args.cuda:
        device = torch.device("cuda")
    else:
        device = torch.device("cpu")
    model = load_model(device, args.model_path, args.cuda)

    if args.decoder != "beam":
        # Any decoder choice other than "beam" falls back to greedy decoding.
        decoder = GreedyDecoder(model.labels, blank_index=model.labels.index('_'))
    else:
        # Imported lazily so the beam decoder is only needed when requested.
        from decoder import BeamCTCDecoder

        decoder = BeamCTCDecoder(
            model.labels,
            lm_path=args.lm_path,
            alpha=args.alpha,
            beta=args.beta,
            cutoff_top_n=args.cutoff_top_n,
            cutoff_prob=args.cutoff_prob,
            beam_width=args.beam_width,
            num_processes=args.lm_workers,
        )

    # From here on `parser` is the spectrogram parser, not the argument parser.
    parser = SpectrogramParser(model.audio_conf, normalize=True)

    decoded_output, decoded_offsets = transcribe(args.audio_path, parser, model, decoder, device)
    print(json.dumps(decode_results(model, decoded_output, decoded_offsets)))
Beispiel #7
0
                    default='hamming',
                    help='Window type for spectrogram generation')
def _str_to_bool(value):
    """
    Convert a command-line string into a boolean.

    ``type=bool`` calls ``bool()`` on the raw string, which is True for every
    non-empty value, so ``--cuda False`` or ``--cuda 0`` still enabled CUDA.
    Here the usual "false" spellings (and the empty string) map to False;
    every other value stays True, as it was before.

    :param value: raw option text from the command line
    :return: False for a recognised false spelling, True otherwise
    """
    return value.lower() not in ('', 'false', 'f', 'no', 'n', '0', 'off')


parser.add_argument('--cuda',
                    default=True,
                    type=_str_to_bool,
                    help='Use cuda to train model')
args = parser.parse_args()

if __name__ == '__main__':
    # Script entry point: rebuild the network from a saved package, transcribe
    # one audio file and print the first element of the decoded output.
    package = torch.load(args.model_path)
    model = DeepSpeech(rnn_hidden_size=package['hidden_size'],
                       nb_layers=package['hidden_layers'],
                       num_classes=package['nout'])
    if args.cuda:
        model = torch.nn.DataParallel(model).cuda()
    model.load_state_dict(package['state_dict'])
    # Switch to evaluation mode before inference so that layers which behave
    # differently during training (e.g. dropout, batch normalisation) run in
    # their inference configuration. The other model-loading paths in this
    # file do the same.
    model.eval()
    audio_conf = dict(sample_rate=args.sample_rate,
                      window_size=args.window_size,
                      window_stride=args.window_stride,
                      window=args.window)
    with open(args.labels_path) as label_file:
        # The label file is JSON; its entries are joined into one string.
        labels = str(''.join(json.load(label_file)))
    decoder = ArgMaxDecoder(labels)
    parser = SpectrogramParser(audio_conf, normalize=True)
    spect = parser.parse_audio(args.audio_path).contiguous()
    # Add two leading singleton dimensions in front of the 2-D spectrogram.
    spect = spect.view(1, 1, spect.size(0), spect.size(1))
    out = model(Variable(spect))
    out = out.transpose(0, 1)  # TxNxH
    decoded_output = decoder.decode(out.data)
    print(decoded_output[0])
    model = DeepSpeech.load_model(args.model_path, cuda=args.cuda)
    model.eval()

    labels = DeepSpeech.get_labels(model)
    audio_conf = DeepSpeech.get_audio_conf(model)

    if args.decoder == "beam":
        from decoder import BeamCTCDecoder

        decoder = BeamCTCDecoder(labels, lm_path=args.lm_path, alpha=args.alpha, beta=args.beta,
                                 cutoff_top_n=args.cutoff_top_n, cutoff_prob=args.cutoff_prob,
                                 beam_width=args.beam_width, num_processes=args.lm_workers)
    else:
        decoder = GreedyDecoder(labels, blank_index=labels.index('_'))

    parser = SpectrogramParser(audio_conf, normalize=True)



    ###--- server setup
    sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    server_address = ('', 10001)
    print('starting up on %s port %s' % server_address)
    sock.bind(server_address)
    sock.listen(1)
    file_name = 'data/recorded.wav'
    bFileFound = 0
    while True:
        # Wait for a connection
        connection, client_address = sock.accept()
        print('connection from', client_address)
Beispiel #9
0
    model = DeepSpeech.load_model(args.model_path)
    device = torch.device("cuda" if args.cuda else "cpu")
    model = model.to(device)
    model.eval()

    labels = DeepSpeech.get_labels(model)
    audio_conf = DeepSpeech.get_audio_conf(model)

    if args.decoder == "beam":
        from decoder import BeamCTCDecoder

        decoder = BeamCTCDecoder(labels, lm_path=args.lm_path, alpha=args.alpha, beta=args.beta,
                                 cutoff_top_n=args.cutoff_top_n, cutoff_prob=args.cutoff_prob,
                                 beam_width=args.beam_width, num_processes=args.lm_workers)
    else:
        decoder = GreedyDecoder(labels, blank_index=labels.index('_'))

    parser = SpectrogramParser(audio_conf, cache_path=args.cache_dir, 
                               normalize='max_frame', channel=args.channel, augment=True)

    decoded_output, decoded_offsets = transcribe(args.audio_path, parser, model, decoder, device)
    output = decode_results(model, decoded_output, decoded_offsets)
    output['input'] = {
        'channel': args.channel,
        'source': args.audio_path}
    output['model'] = {
        'model': args.model_path,
    }

    print(json.dumps(output, ensure_ascii=False))