/
audio_detector.py
170 lines (138 loc) · 6.11 KB
/
audio_detector.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
#!/usr/bin/env python3
import argparse
import math
import numpy as np
import shutil
import sys
import state
import queue
import soundfile as sf
import time
import threading
usage_line = ' press <enter> to quit, +<enter> or -<enter> to change scaling '
from dictation.dictation_client import create_audio_stream, print_results
from dictation.service.dictation_settings import DictationSettings
from dictation.service.streaming_recognizer import StreamingRecognizer
from address_provider import AddressProvider
from os.path import join as opjoin
import action
import run_sarmata
class DictationArgs:
    """Settings object passed to the dictation client helpers.

    The class attributes below are the defaults; ``__init__`` overrides
    ``wave`` (when a path is given) and ``address`` on the instance.
    Elsewhere in this file an instance is passed to ``create_audio_stream``
    and ``DictationSettings``.
    """

    address = None  # IP address and port (address:port) of a service the client will connect to.
    interim_results = False  # If set - messages with temporal results will be shown.
    max_alternatives = 3  # Maximum number of recognition hypotheses to be returned.
    mic = False  # Use microphone as an audio source (instead of wave file).
    no_input_timeout = 5000  # MRCP v2 no input timeout [ms].
    recognition_timeout = 5000  # MRCP v2 recognition timeout [ms].
    session_id = None  # Session ID to be passed to the service. If not specified, the service will generate a default session ID itself.
    single_utterance = False  # If set - the recognizer will detect a single spoken utterance.
    speech_complete_timeout = 5000  # MRCP v2 speech complete timeout [ms].
    speech_incomplete_timeout = 6000  # MRCP v2 speech incomplete timeout [ms].
    time_offsets = False  # If set - the recognizer will return also word time offsets.
    wave = None  # Path to wave file with speech to be recognized. Should be mono, 8kHz or 16kHz.

    def __init__(self, wav_filepath=None):
        """Fill in the service address and, optionally, the wave file path.

        Args:
            wav_filepath: Path of the recording to recognise.  When falsy,
                ``wave`` keeps its class-level default of ``None``.
        """
        ap = AddressProvider()
        if wav_filepath:
            self.wave = opjoin(wav_filepath)
        # The address is looked up whether or not a wave file was given.
        self.address = ap.get("dictation")
def int_or_str(text):
    """Helper function for argument parsing.

    Return ``int(text)`` when ``text`` parses as an integer; otherwise return
    ``text`` unchanged.  Used as the ``type=`` converter of ``--device`` so a
    device can be given either as a numeric ID or as a name substring.
    """
    try:
        return int(text)
    except ValueError:
        return text
# Terminal width is used as the default for --columns; fall back to 80 when
# shutil does not provide get_terminal_size.
try:
    columns, _ = shutil.get_terminal_size()
except AttributeError:
    columns = 80

# This file has no module docstring, so ``__doc__`` is None and the help
# output carries no description.
parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument('-l', '--list-devices', action='store_true',
                    help='list audio devices and exit')
# NOTE: --block-duration, --columns, --gain and --range are accepted on the
# command line, but nothing in this file reads their values.
parser.add_argument('-b', '--block-duration', type=float,
                    metavar='DURATION', default=50,
                    help='block size (default %(default)s milliseconds)')
parser.add_argument('-c', '--columns', type=int, default=columns,
                    help='width of spectrogram')
parser.add_argument('-d', '--device', type=int_or_str,
                    help='input device (numeric ID or substring)')
parser.add_argument('-g', '--gain', type=float, default=10,
                    help='initial gain factor (default %(default)s)')
parser.add_argument('-r', '--range', type=float, nargs=2,
                    metavar=('LOW', 'HIGH'), default=[100, 2000],
                    help='frequency range (default %(default)s Hz)')
# Parsed at import time; ``args`` is a module-level name.
args = parser.parse_args()
# Raw int16 samples are divided by this value before their level is measured.
_SAMPLE_SCALE = 65535
# Sample rate used for both input streams and for the written wave file.
_SAMPLE_RATE = 44100
# Scaled RMS above which a monitored block starts a recording.
_START_LEVEL = 0.06
# Level at or below which a recorded block ends the recording.  The earlier
# test was ``int(level) < -15``; int() truncates toward zero, so for finite
# levels that is the same condition as ``level <= -16``.
_STOP_LEVEL_DB = -16.0
# The recorder reads blocks of _SAMPLE_RATE frames (one second each);
# recordings that stop before this many blocks go to the command recogniser.
_COMMAND_MAX_BLOCKS = 5
# Seconds between checks of the shared "recording" flag in the monitor loop.
_POLL_INTERVAL = 0.01


def _scaled_rms(block):
    """Return the root-mean-square of ``block`` after dividing by 65535."""
    return np.sqrt(np.mean((block / _SAMPLE_SCALE) ** 2))


def _level_db(block):
    """Return ``10 * log10`` of the scaled RMS of ``block``.

    An all-zero block yields ``-inf`` (no exception, no warning), so digital
    silence compares as "quiet" instead of raising during conversion to int.
    """
    with np.errstate(divide='ignore'):
        return 10 * np.log10(_scaled_rms(block))


def _drain(pending):
    """Discard every item currently waiting in the queue ``pending``."""
    while True:
        try:
            pending.get_nowait()
        except queue.Empty:
            return


try:
    import sounddevice as sd

    # --list-devices: print the available devices and stop.
    if args.list_devices:
        print(sd.query_devices())
        parser.exit(0)

    # --device selects the input; 'pulse' is used when the option is absent.
    input_device = 'pulse' if args.device is None else args.device

    q = queue.Queue()

    def callback(indata, frames, time, status):
        """Stream callback of the monitor: queue a copy of each input block."""
        if status:
            print(status, file=sys.stderr)
        # Copy before queueing: the consumer reads the block after this
        # callback has returned.
        q.put(indata.copy())

    # Last block read by the monitor loop; record() writes it first so the
    # block that triggered the recording is part of the file.
    audio_data = None
    sd.default.device = "hw:1,0"

    def record(filename):
        """Record the microphone to ``filename`` and run recognition on it.

        One-second blocks are appended to a new 16-bit mono wave file until a
        block's level drops to ``_STOP_LEVEL_DB`` or below; the shared state
        flag is then cleared.  Recordings that stopped before
        ``_COMMAND_MAX_BLOCKS`` blocks are passed to ``run_sarmata`` and
        ``action``; longer ones go through the dictation recogniser and the
        first transcript is typed with ``pyautogui``.

        Args:
            filename: Path of the wave file to create (mode ``'x'``, so it
                must not exist yet).
        """
        q2 = queue.Queue()

        def callback2(indata, frames, time, status):
            if status:
                print(status, file=sys.stderr)
            q2.put(indata.copy())

        counter = 0
        with sf.SoundFile(filename, mode='x', samplerate=_SAMPLE_RATE,
                          channels=1, subtype='PCM_16') as file, \
                sd.InputStream(device=input_device, channels=1,
                               callback=callback2, blocksize=_SAMPLE_RATE,
                               samplerate=_SAMPLE_RATE, dtype="int16"):
            file.write(audio_data)
            while True:
                counter += 1
                data2 = q2.get()
                file.write(data2)
                try:
                    if _level_db(data2) <= _STOP_LEVEL_DB:
                        print("stoprecording: " + filename)
                        one = state.state.getInstance()
                        one.state = False
                        break
                except Exception as exc:
                    # Keep recording, but report the problem instead of
                    # hiding it.
                    print(type(exc).__name__ + ': ' + str(exc),
                          file=sys.stderr)

        # The wave file is closed at this point, so the recognisers read a
        # complete file.
        if counter < _COMMAND_MAX_BLOCKS:
            result = run_sarmata.RunSarmata(filename)
            action.action(result)
        else:
            dictation_args = DictationArgs(filename)
            dictation_args.mic = True
            with create_audio_stream(dictation_args) as stream:
                settings = DictationSettings(dictation_args)
                recognizer = StreamingRecognizer(dictation_args.address,
                                                 settings)
                print('Recognizing...')
                results = recognizer.recognize(stream)
                print_results(results)
                if results:
                    # Imported lazily so it is only needed when text is typed.
                    import pyautogui
                    pyautogui.typewrite(str(results[0]['transcript']))

    with sd.InputStream(device=input_device, channels=1, callback=callback,
                        blocksize=16, samplerate=_SAMPLE_RATE, dtype="int16"):
        while True:
            data = q.get()
            audio_data = data
            try:
                if _scaled_rms(data) > _START_LEVEL:
                    filename = "records/" + str(time.time()) + ".wav"
                    one = state.state.getInstance()
                    # Raise the flag before the thread starts so the thread's
                    # reset can never be overwritten.
                    one.state = True
                    t = threading.Thread(target=record, args=(filename,))
                    t.start()
                    print("recording")
                    # Wait for the recorder to clear the flag; also stop
                    # waiting if the thread died without clearing it.
                    while one.state and t.is_alive():
                        time.sleep(_POLL_INTERVAL)
                    # Blocks queued while recording are stale; drop them so
                    # they cannot trigger another recording.
                    _drain(q)
            except Exception as exc:
                # Best effort: report and keep monitoring.  KeyboardInterrupt
                # is not an Exception and reaches the handler below.
                print(type(exc).__name__ + ': ' + str(exc), file=sys.stderr)
except KeyboardInterrupt:
    parser.exit('Interrupted by user')
except Exception as e:
    parser.exit(type(e).__name__ + ': ' + str(e))