Beispiel #1
0
def main():
    """Parse CLI options, load the acoustic model, and launch the Flask server."""
    import argparse
    global model, spect_parser, decoder, args

    arg_parser = argparse.ArgumentParser(
        description='DeepSpeech transcription server')
    arg_parser.add_argument('--host',
                            type=str,
                            default='0.0.0.0',
                            help='Host to be used by the server')
    arg_parser.add_argument('--port',
                            type=int,
                            default=8888,
                            help='Port to be used by the server')
    arg_parser = add_inference_args(arg_parser)
    arg_parser = add_decoder_args(arg_parser)
    args = arg_parser.parse_args()
    logging.getLogger().setLevel(logging.DEBUG)

    logging.info('Setting up server...')
    # Inference only: disable autograd for the whole process.
    torch.set_grad_enabled(False)
    model = DeepSpeech.load_model(args.model_path)
    if args.cuda:
        model.cuda()
    model.eval()

    labels = DeepSpeech.get_labels(model)
    audio_conf = DeepSpeech.get_audio_conf(model)

    if args.decoder == "beam":
        from decoder import BeamCTCDecoder

        decoder = BeamCTCDecoder(labels,
                                 lm_path=args.lm_path,
                                 alpha=args.alpha,
                                 beta=args.beta,
                                 cutoff_top_n=args.cutoff_top_n,
                                 cutoff_prob=args.cutoff_prob,
                                 beam_width=args.beam_width,
                                 num_processes=args.lm_workers)
    else:
        # Greedy decoding; '_' appears to be the CTC blank symbol in the
        # label set — confirm against the label definition.
        decoder = GreedyDecoder(labels, blank_index=labels.index('_'))

    spect_parser = SpectrogramParser(audio_conf, normalize=True)
    logging.info('Server initialised')
    # NOTE(review): debug=True while binding 0.0.0.0 exposes the Werkzeug
    # debugger to the network — confirm this is not used in production.
    app.run(host=args.host, port=args.port, debug=True, use_reloader=False)
    def decode(self, probs, sizes=None, rescore=False):
        """
        Decodes probability output using ctcdecode package.
        Arguments:
            probs: Tensor of character probabilities, where probs[c,t]
                            is the probability of character c at time t
            sizes: Size of each sequence in the mini-batch
            rescore: when True, rescore the decoded sentences with the
                external language model named by the module-level
                ``args.rescore_lm``
        Returns:
            string: sequences of the model's best guess for the transcription
        """
        # ctcdecode's beam search runs on CPU tensors.
        probs = probs.cpu()
        out, scores, offsets, seq_lens = self._decoder.decode(probs, sizes)

        strings = self.convert_to_strings(out, seq_lens)
        offsets = self.convert_tensor(offsets, seq_lens)
        if rescore:
            from rescoring import rescore_sent

            # BUG FIX: the original rebuilt a full ArgumentParser and called
            # parse_args() here on EVERY rescoring call, re-reading sys.argv
            # at decode time (which would choke on the hosting process's own
            # arguments), and `argparse` was not even in scope in this method.
            # The command line is parsed once at startup into the module-level
            # `args` (via add_decoder_args); read the rescoring LM from it.
            # NOTE(review): assumes add_decoder_args defines --rescore-lm —
            # confirm in opts.py.
            strings = rescore_sent(strings, args.rescore_lm)
        return strings, offsets
def main():
    """Parse CLI options, load the model, and serve transcriptions over websockets."""
    import argparse
    global model, spect_parser, decoder, args, device, decompressor

    cli = argparse.ArgumentParser(
        description='DeepSpeech transcription server')
    cli.add_argument('--host', type=str, default='0.0.0.0',
                     help='Host to be used by the server')
    cli.add_argument('--port', type=int, default=8888,
                     help='Port to be used by the server')
    cli = add_inference_args(cli)
    cli = add_decoder_args(cli)
    args = cli.parse_args()
    logging.getLogger().setLevel(logging.DEBUG)

    logging.info('Setting up server...')
    torch.set_grad_enabled(False)  # inference only
    device = torch.device("cuda" if args.cuda else "cpu")
    model = load_model(device, args.model_path, args.half)

    if args.decoder == "beam":
        from decoder import BeamCTCDecoder

        decoder = BeamCTCDecoder(model.labels, lm_path=args.lm_path,
                                 alpha=args.alpha, beta=args.beta,
                                 cutoff_top_n=args.cutoff_top_n,
                                 cutoff_prob=args.cutoff_prob,
                                 beam_width=args.beam_width,
                                 num_processes=args.lm_workers)
    else:
        # Greedy decoding; '_' appears to be the CTC blank label.
        decoder = GreedyDecoder(model.labels,
                                blank_index=model.labels.index('_'))

    spect_parser = OnlineSpectrogramParser(model.audio_conf, normalize=True)
    logging.info('Server initialised')

    # Presumably clients send LZString-compressed payloads — verify against
    # the client code.
    decompressor = LZString()

    server = WebsocketServer(host=args.host, port=args.port)
    server.set_fn_new_client(new_client)
    server.set_fn_client_left(client_left)
    server.set_fn_message_received(message_received)
    server.run_forever()
Beispiel #4
0
def main():
    """Parse CLI options, load the model, and launch the Flask transcription server."""
    import argparse

    global model, spect_parser, decoder, args, device
    parser = argparse.ArgumentParser(
        description="DeepSpeech transcription server")
    parser.add_argument("--host",
                        type=str,
                        default="0.0.0.0",
                        help="Host to be used by the server")
    parser.add_argument("--port",
                        type=int,
                        default=8888,
                        help="Port to be used by the server")
    parser = add_inference_args(parser)
    parser = add_decoder_args(parser)
    args = parser.parse_args()
    logging.getLogger().setLevel(logging.DEBUG)

    logging.info("Setting up server...")
    torch.set_grad_enabled(False)  # inference only
    device = torch.device("cuda" if args.cuda else "cpu")
    # BUG FIX: load_model's third argument is the half-precision flag — the
    # other call sites in this file pass args.half. Passing args.cuda here
    # silently forced FP16 whenever CUDA was enabled, ignoring --half.
    # NOTE(review): assumes add_inference_args defines --half, as the
    # sibling servers rely on — confirm in opts.py.
    model = load_model(device, args.model_path, args.half)

    if args.decoder == "beam":
        from decoder import BeamCTCDecoder

        decoder = BeamCTCDecoder(
            model.labels,
            lm_path=args.lm_path,
            alpha=args.alpha,
            beta=args.beta,
            cutoff_top_n=args.cutoff_top_n,
            cutoff_prob=args.cutoff_prob,
            beam_width=args.beam_width,
            num_processes=args.lm_workers,
        )
    else:
        # Greedy decoding; '_' appears to be the CTC blank label.
        decoder = GreedyDecoder(model.labels,
                                blank_index=model.labels.index("_"))

    spect_parser = SpectrogramParser(model.audio_conf, normalize=True)
    logging.info("Server initialised")
    # NOTE(review): debug=True while binding 0.0.0.0 exposes the Werkzeug
    # debugger to the network — confirm this is not used in production.
    app.run(host=args.host, port=args.port, debug=True, use_reloader=False)
Beispiel #5
0
import argparse

import numpy as np
import torch
from tqdm import tqdm

from data.data_loader import SpectrogramDataset, AudioDataLoader
from decoder import GreedyDecoder
from model import DeepSpeech
from opts import add_decoder_args, add_inference_args

# Command-line interface for batch transcription over a test manifest.
parser = argparse.ArgumentParser(description='DeepSpeech transcription')
parser = add_inference_args(parser)
parser.add_argument('--test-manifest', metavar='DIR',
                    default='data/test_manifest.csv',
                    help='path to validation manifest csv')
parser.add_argument('--batch-size', type=int, default=20,
                    help='Batch size for training')
parser.add_argument('--num-workers', type=int, default=4,
                    help='Number of workers used in dataloading')
parser.add_argument('--verbose', action="store_true",
                    help="print out decoded output and error of each sample")
no_decoder_args = parser.add_argument_group(
    "No Decoder Options",
    "Configuration options for when no decoder is specified")
# Single-file transcription CLI setup.
# BUG FIX: `warnings` was used below without being imported anywhere above.
import warnings

from opts import add_decoder_args, add_inference_args

warnings.simplefilter('ignore')

from decoder import GreedyDecoder

import torch

from data.data_loader import SpectrogramParser
from model import DeepSpeech
import os.path
import json

parser = argparse.ArgumentParser(description='DeepSpeech transcription')
parser = add_inference_args(parser)
parser.add_argument('--audio-path', default='audio.wav',
                    help='Audio file to predict on')
parser.add_argument('--offsets', dest='offsets', action='store_true',
                    help='Returns time offset information')
parser = add_decoder_args(parser)
args = parser.parse_args()

def decode_results(model, decoded_output, decoded_offsets):
    results = {
        "output": [],
        "_meta": {
            "acoustic_model": {
                "name": os.path.basename(args.model_path)
            },
            "language_model": {
Beispiel #7
0
def transcribe(audio_path, spect_parser, model, decoder, device, use_half):
    """Run the model on a single audio file and return the decoded transcript.

    Arguments:
        audio_path: path to the audio file to transcribe
        spect_parser: object whose parse_audio() yields a 2-D spectrogram tensor
        model: acoustic model mapping (spect, sizes) -> (output, output_sizes)
        decoder: CTC decoder exposing decode(output, sizes)
        device: torch.device to run inference on
        use_half: when True, cast the input to FP16 before the forward pass
    Returns:
        (decoded_output, decoded_offsets) exactly as produced by the decoder
    """
    features = spect_parser.parse_audio(audio_path).contiguous()
    # Add batch and channel dimensions: (1, 1, freq, time).
    features = features.view(1, 1, features.size(0), features.size(1)).to(device)
    if use_half:
        features = features.half()
    # One sequence in the batch; its length is the time dimension.
    lengths = torch.IntTensor([features.size(3)]).int()
    output, output_sizes = model(features, lengths)
    return decoder.decode(output, output_sizes)


if __name__ == '__main__':
    arg_parser = argparse.ArgumentParser(
        description='DeepSpeech transcription')
    arg_parser = add_inference_args(arg_parser)
    arg_parser.add_argument('--audio-path',
                            default='audio.wav',
                            help='Audio file to predict on')
    arg_parser.add_argument('--offsets',
                            dest='offsets',
                            action='store_true',
                            help='Returns time offset information')
    arg_parser = add_decoder_args(arg_parser)
    args = arg_parser.parse_args()
    device = torch.device("cuda" if args.cuda else "cpu")
    model = load_model(device, args.model_path, args.half)

    if args.decoder == "beam":
        from decoder import BeamCTCDecoder