Esempio n. 1
0
class SpeechTranscriber:
    """Bundle a saved DeepSpeech model with the decoder and audio parser
    needed to turn an audio file into decoded text.
    """

    def __init__(self, model_path):
        """Load the model at ``model_path`` and build its helper objects.

        :param model_path: filesystem path of the saved DeepSpeech model.
        :raises AssertionError: if nothing exists at ``model_path``.
        """
        # Raised explicitly instead of via ``assert`` so the check is not
        # stripped when Python runs with -O. The exception type and message
        # are kept so existing callers observe no difference.
        if not os.path.exists(model_path):
            raise AssertionError(
                "Cannot find model here {}".format(model_path))
        self.deep_speech_model = DeepSpeech.load_model(model_path)
        # Inference only: put the model in evaluation mode.
        self.deep_speech_model.eval()
        # Labels and audio configuration are read back from the loaded model
        # so the decoder and parser match what the model was saved with.
        labels = DeepSpeech.get_labels(self.deep_speech_model)
        self.audio_conf = DeepSpeech.get_audio_conf(self.deep_speech_model)
        self.decoder = GreedyDecoder(labels)
        self.parser = SpectrogramParser(self.audio_conf, normalize=True)

    def transcribe(self, audio_file):
        """Run the model on one audio input and decode the result.

        :param audio_file: audio input passed to the parser's
            ``parse_audio`` (presumably a file path -- confirm).
        :return: the value returned by ``self.decoder.decode`` for the
            model output.
        """
        spect = self.parser.parse_audio(audio_file).contiguous()
        # Prepend two singleton dimensions: (1, 1, size(0), size(1)).
        spect = spect.view(1, 1, spect.size(0), spect.size(1))
        # NOTE(review): ``volatile=True`` is the pre-0.4 PyTorch way to
        # disable autograd for inference; newer releases ignore it with a
        # warning -- consider ``torch.no_grad()`` once the minimum supported
        # PyTorch version is confirmed.
        out = self.deep_speech_model(Variable(spect, volatile=True))
        out = out.transpose(0, 1)  # TxNxH
        decoded_output = self.decoder.decode(out.data)
        return decoded_output
Esempio n. 2
0
                    default='hamming',
                    help='Window type for spectrogram generation')
def _str2bool(value):
    """Convert a command-line string into a bool for argparse.

    ``type=bool`` treats every non-empty string -- including "False" and
    "0" -- as True, which made it impossible to switch CUDA off from the
    command line.

    :param value: raw argument string (a bool is passed through unchanged).
    :return: the parsed boolean.
    :raises ValueError: if the string is not a recognised boolean spelling;
        argparse reports this as an invalid argument value.
    """
    if isinstance(value, bool):
        return value
    lowered = value.strip().lower()
    if lowered in ('true', 't', 'yes', 'y', '1'):
        return True
    # The empty string stays False, as it was with ``type=bool``.
    if lowered in ('false', 'f', 'no', 'n', '0', ''):
        return False
    raise ValueError('expected a boolean value, got {!r}'.format(value))


parser.add_argument('--cuda',
                    default=True,
                    type=_str2bool,
                    help='Use cuda to train model')
args = parser.parse_args()

if __name__ == '__main__':
    # Script entry point: rebuild the model from a saved checkpoint,
    # transcribe the single audio file named on the command line and print
    # the first decoded result.
    package = torch.load(args.model_path)
    model = DeepSpeech(rnn_hidden_size=package['hidden_size'],
                       nb_layers=package['hidden_layers'],
                       num_classes=package['nout'])
    if args.cuda:
        # The model is wrapped before the weights are loaded, so the
        # state dict is loaded into the DataParallel wrapper.
        model = torch.nn.DataParallel(model).cuda()
    model.load_state_dict(package['state_dict'])
    # This script only performs inference: leave the default training mode
    # so mode-dependent layers (dropout, batch norm) behave as at eval time.
    model.eval()
    audio_conf = dict(sample_rate=args.sample_rate,
                      window_size=args.window_size,
                      window_stride=args.window_stride,
                      window=args.window)
    # The labels file holds a JSON sequence of strings that is joined into
    # one label string for the decoder.
    with open(args.labels_path) as label_file:
        labels = str(''.join(json.load(label_file)))
    decoder = ArgMaxDecoder(labels)
    # NOTE: this rebinds ``parser``, which until now named the argparse
    # parser; the arguments have already been parsed into ``args``.
    parser = SpectrogramParser(audio_conf, normalize=True)
    spect = parser.parse_audio(args.audio_path).contiguous()
    # Prepend two singleton dimensions: (1, 1, size(0), size(1)).
    spect = spect.view(1, 1, spect.size(0), spect.size(1))
    out = model(Variable(spect))
    out = out.transpose(0, 1)  # TxNxH
    decoded_output = decoder.decode(out.data)
    print(decoded_output[0])
            #     while recv_data != b'ok':
            #         recv_data = connection.recv(1024)
            #         print('ok waiting')
            #     print('ok received')
            # Receive the uploaded audio in 1 KiB chunks until recv() returns
            # an empty bytes object, and store it under file_name.
            recv_data = connection.recv(1024)
            # ``with`` guarantees the file is closed even when a later recv()
            # raises (e.g. the ConnectionResetError handled below); the
            # previous open()/close() pair leaked the handle in that case.
            with open(file_name, 'wb') as recv_file:
                while recv_data:
                    recv_file.write(recv_data)
                    recv_data = connection.recv(1024)

            print('download complete')

            start = time.time()
            # inference
            spect = parser.parse_audio(file_name).contiguous()
            parsing_time = time.time() - start

            # Prepend two singleton dimensions: (1, 1, size(0), size(1)).
            spect = spect.view(1, 1, spect.size(0), spect.size(1))
            out = model(spect)
            # Elapsed time since start, minus the parsing phase.
            inferring_time = time.time() - start - parsing_time

            decoded_output, decoded_offsets = decoder.decode(out.data)
            # Subtract BOTH earlier phases; subtracting only inferring_time
            # reported parsing + decoding time as the decoding time.
            decoding_time = time.time() - start - parsing_time - inferring_time

            print('time for parsing: %0.4f,\t inferring: %0.4f,\t decoding: %0.4f'%(parsing_time,inferring_time,decoding_time))
            print(json.dumps(decode_results(model, decoded_output, decoded_offsets)))
        except ConnectionResetError:#connection is broken by a client
            pass
        finally:
            # Clean up the connection