/
voice_detect.py
executable file
·115 lines (97 loc) · 3.34 KB
/
voice_detect.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
#!/usr/bin/env python
import numpy as np
from asterisk.agi import *
from scipy.io import wavfile
import os
from scikits.audiolab import Format, Sndfile
from tempfile import mkstemp
from pydub import AudioSegment
from background_processing import detect_speech
# path-related variables
PATH = "/home/hieung1707/"
PREVOICE_DIR = 'python_code/voice_detection/prerecorded_voices/'
FILE_NAME = 'sample.mp3'
# audio-related constant
SAMPLE_RATE = 8000.0
CHUNK_TIME = 0.05
BYTES_PER_SAMPLE = 2
CHUNK = int(BYTES_PER_SAMPLE * SAMPLE_RATE * CHUNK_TIME)
# time-related variables
MIN_SPOKEN_TIME = 0.2
SILENT_TIME = 1.0
TIMEOUT = 7.0
FD = 3
windows = 0
loops = 0
data = np.array([])
has_spoken = False
def rms(samples): # Root-mean-square
int64_samples = np.array(samples).astype(np.int64)
return np.sqrt(np.mean(np.square(int64_samples)))
def record():
global data, has_spoken
# patience = 20
# n_silence = 0
agi.verbose('start recording')
# start_recording = True
# --- new variables --- #
cons_sil_time = 0.0
cons_speech_time = 0.0
total_time = 0.0
# --------------------- #
# --- old logic --- #
# while start_recording:
# raw_samples = file_descriptor.read(CHUNK)
# new_samples = np.fromstring(raw_samples, dtype=np.int16)
# data = np.append(data, new_samples)
# v.data = data
# ---------------- #
# --- new logic --- #
while cons_sil_time < SILENT_TIME:
raw_samples = file_descriptor.read(CHUNK)
new_samples = np.fromstring(raw_samples, dtype=np.int16)
# new_samples = np.fromstring(raw_samples, dtype=np.int16)
total_time += CHUNK_TIME
spoke = detect_speech(new_samples)
# agi.verbose(spoke)
if spoke:
cons_sil_time = 0.0
cons_speech_time += CHUNK_TIME
if cons_speech_time >= MIN_SPOKEN_TIME and not has_spoken:
agi.verbose('has spoken')
has_spoken = True
data = np.append(data[data.size - int(0.5*SAMPLE_RATE):] if data.size - 0.5*SAMPLE_RATE > 0 else data, new_samples)
else:
cons_speech_time = 0.0
if has_spoken:
cons_sil_time += CHUNK_TIME
data = np.append(data, new_samples)
if total_time >= TIMEOUT:
break
# ---------------- #
n_samples = len(data)
padding = np.zeros(int(0.3 * SAMPLE_RATE),np.int16)
data = np.append(padding, data)
data = np.append(data, padding)
n_secs = n_samples / SAMPLE_RATE
agi.verbose("collected " + str(n_samples) + " for " + str(n_secs))
def save_record():
global data
n_channels, fmt = 1, Format('flac', 'pcm16')
caller_id = agi.get_variable("CALLERID(num)")
file_name = 'TmpSpeechFile_' + caller_id + '.flac'
_, temp_sound_file = mkstemp(file_name)
flac_file = Sndfile(temp_sound_file, 'w', fmt, n_channels, SAMPLE_RATE)
flac_file.write_frames(data)
flac_audio = AudioSegment.from_file(temp_sound_file, "flac")
flac_audio.export(PATH + FILE_NAME, format="mp3")
if __name__ == "__main__":
agi = AGI()
fs, wav_file = wavfile.read(PATH + PREVOICE_DIR + "tongdai.wav")
TIMEOUT = wav_file.size / fs
file_descriptor = os.fdopen(FD, 'r')
record()
save_record()
# reduce_noise()
agi.set_variable("has_spoken", 1 if has_spoken else 0)
agi.verbose('finish')