def Playback( self ): # similar to Go, but uses data from Load instead of collecting new data self.Status = True chunkSize = 8192 windowSize = 3 p = pyaudio.PyAudio() audioStream = p.open(format=pyaudio.paInt16, channels=1, rate=self.fs, input=True, frames_per_buffer=chunkSize) numSamples = len(self.Recording) self.Formants = np.zeros((100, 5), dtype=np.float32) self.FormantTime = np.zeros(100, dtype=np.float32) self.Pitch = np.zeros(100, dtype=np.float32) self.PitchTime = np.zeros(100, dtype=np.float32) PitchCount = 0 FormantCount = 0 ax = self.RawPlot.figure.add_subplot(111) f0ax = self.FundamentalFrequenncyPlot.figure.add_subplot(111) f0ax.tick_params(axis='x', which='both', bottom=False, top=False, labelbottom=False) f0ax.set_position([0.35, 0.05, 0.6, 0.93]) tractAx = self.VocalTractPlot.figure.add_subplot(111) tractAx.tick_params(axis='x', which='both', bottom=False, top=False, labelbottom=False) tractAx.set_position([0.35, 0.05, 0.6, 0.93]) tractAx.set_ylabel('Vocal Tract Length (cm)') tractAx.set_ylim((0, 25)) tractAx.set_xlim((0, 0.8)) formantAx = self.FormantPlot.figure.add_subplot(111) maxPitchLag = 3 maxVocalLag = 3 ds_rate = 3 c = 34300 # speed of sound in cm/s Count = 0 t = 0 print('Beginning Playback') time = np.linspace(0, numSamples / self.fs, numSamples) try: start = ti.time() while t < numSamples - chunkSize and self.Status: t += chunkSize data = PyAudioTest.getChunk(chunkSize, audioStream, Random=0) data = self.Recording[t - chunkSize:t] data_ds = data[0:chunkSize:ds_rate] # use yin implementation data_hamming = data * np.hamming(chunkSize) df = yin.differenceFunction(data_hamming, chunkSize, self.fs / 75) cmndf = yin.cumulativeMeanNormalizedDifferenceFunction( df, len(df)) f0 = yin.getPitch(cmndf, self.fs / 500, self.fs / 75, harmo_th=0.35) if f0: # store ot pitch and time self.Pitch[PitchCount] = 1.0 * self.fs / f0 self.PitchTime[PitchCount] = 1.0 * ( t - chunkSize / 2) / self.fs PitchCount += 1 # add space if needed if PitchCount >= len(self.PitchTime): self.Pitch = np.concatenate( (self.Pitch, np.zeros(200, dtype=np.float32))) self.PitchTime = np.concatenate( (self.PitchTime, np.zeros(200, dtype=np.float32))) RecentPitches = [] pitchIDX = PitchCount - 1 while self.PitchTime[pitchIDX] >= 1.0 * ( t - chunkSize / 2) / self.fs - maxPitchLag and pitchIDX >= 0: RecentPitches.append(self.Pitch[pitchIDX]) pitchIDX -= 1 meanPitch = np.mean(RecentPitches) if len(RecentPitches) == 1: stdPitch = 25 else: stdPitch = np.std(RecentPitches) f0ax.bar([0], [2.0 * stdPitch], bottom=[meanPitch - stdPitch]) f0ax.set_ylabel('Fundamental Frequency (Hz)') f0ax.set_ylim((0, 500)) f0ax.set_xlim((0, 0.8)) self.FundamentalFrequenncyPlot.draw() # use my terrible gaussian estimation formant finder formantAx.clear() formantAx.hold(True) if f0: fBins, PSD = sp.signal.periodogram(data_ds, self.fs / ds_rate) PSD = 20 * np.log10(PSD) try: Formants = FormantFinder.findFormantsLPC( data_ds, self.fs / ds_rate) for f in range(len(Formants)): formantAx.plot([Formants[f], Formants[f]], [-100, 75], color='red') formantAx.plot(fBins, PSD) formantAx.set_title('Power Spectrum - Formants') formantAx.set_xlabel('Frequency (Hz)') formantAx.set_ylabel('Power (dB)') formantAx.set_ylim((-90, 90)) formantAx.set_xlim((0, 5000)) ''' formantAx.bar(range(len(Formants)), Formants) formantAx.set_xlabel('Formant number') formantAx.set_ylabel('Frequency (Hz)') formantAx.set_title('Formants Frequencies') formantAx.set_xlim((0, 4.8)) formantAx.set_ylim((0, 5000)) formantAx.set_xticks([0.4, 1.4, 2.4, 3.4, 4.4]) 
formantAx.set_xticklabels(['F1', 'F2', 'F3', 'F4', 'F5']) ''' self.FormantPlot.draw() formantAx.hold(False) if len(Formants) >= 5: self.Formants[FormantCount, 0:5] = Formants[0:5] else: self.Formants[FormantCount, 0:len(Formants)] = Formants self.FormantTime[FormantCount] = 1.0 * ( t - chunkSize / 2) / self.fs FormantCount += 1 # add space if needed if FormantCount >= len(self.FormantTime): self.Formants = np.concatenate( (self.Formants, np.zeros((200, 5), dtype=np.float32))) self.FormantTime = np.concatenate( (self.FormantTime, np.zeros(200, dtype=np.float32))) RecentTractLength = [] tractIDX = FormantCount - 1 while self.FormantTime[tractIDX] >= 1.0 * ( t - chunkSize / 2) / self.fs - maxVocalLag and tractIDX >= 0: RecentTractLength.append( FormantFinder.getVocalTractLength( self.Formants[tractIDX, :], c, method='lammert')) tractIDX -= 1 meanTractLength = np.median(RecentTractLength) if len(RecentTractLength) == 1: stdTractLength = 2 else: stdTractLength = np.std(RecentTractLength) #TractLength = FormantFinder.getVocalTractLength(Formants, c) tractAx.bar([0], [2 * stdTractLength], bottom=[meanTractLength - stdTractLength]) #tractAx.bar([0], [TractLength]) tractAx.set_ylabel('Vocal Tract Length (cm)') tractAx.set_ylim((0, 25)) tractAx.set_xlim((0, 0.8)) self.VocalTractPlot.draw() except (RuntimeError): Formants = np.zeros(3) else: fBins = np.linspace(0, self.fs / 2, 10) PSD = np.zeros(10) Count += 1 if t > windowSize * self.fs and Count % 3 == 0: ax.plot(time[t - windowSize * self.fs:t], self.Recording[t - windowSize * self.fs:t]) plt.xlim(t / self.fs - windowSize, t / self.fs + 1) ax.set_xlabel('Time (s)') ax.set_ylabel('amplitude') ax.set_title('Raw Waveform') self.RawPlot.draw() QtCore.QCoreApplication.processEvents() except (KeyboardInterrupt, SystemExit): self.FormantPlot.draw() self.RawPlot.draw() self.FundamentalFrequenncyPlot.draw() self.Pitch = self.Pitch[0:PitchCount] self.PitchTime = self.PitchTime[0:PitchCount] self.Formants = self.Formants[0:FormantCount, :] self.FormantTime = self.FormantTime[0:FormantCount] print('Recording Completed') print('recorded time is') print(1.0 * t / self.fs) print('elapsed time is:') print(ti.time() - start) return True self.Pitch = self.Pitch[0:PitchCount] self.PitchTime = self.PitchTime[0:PitchCount] self.Formants = self.Formants[0:FormantCount, :] self.FormantTime = self.FormantTime[0:FormantCount] print('Recording Completed') print('recorded time is') print(1.0 * t / self.fs) print('elapsed time is:') print(ti.time() - start)
def main():
    # CONSTANTS
    global min_peak_threshold
    min_peak_threshold = 3000
    global brightness
    brightness = 200
    global low_bright
    low_bright = 75
    global chill_threshold
    chill_threshold = .25
    global CHUNK
    CHUNK = 2**11
    global RATE
    RATE = 44100
    global basePath
    basePath = r"http://192.168.1.135/api/xREOsUlYetInkIHuxDldgzqJYLZySU6xDIaobRsx/"
    global lightArray
    lightArray = [6, 1, 3, 2, 4]

    # initialize audio stream
    p = pyaudio.PyAudio()
    stream = p.open(format=pyaudio.paInt16,
                    channels=1,
                    rate=RATE,
                    input=True,
                    frames_per_buffer=CHUNK)

    # initialize peak variables
    max_peak = 0
    max_peak_time = datetime.datetime.now()
    oldoldpeak = 0
    oldpeak = 0
    peak = 0
    lastTriggered = datetime.datetime.now()
    max_buffer_counter = 0

    while True:
        percent_string = ""
        data = np.fromstring(stream.read(CHUNK), dtype=np.int16)
        peak = np.average(np.abs(data)) * 2

        # max peak handling
        if (peak >= max_peak) or ((datetime.datetime.now() - max_peak_time) > datetime.timedelta(seconds=30)):
            if max_buffer_counter > 2:
                max_peak = peak
                max_peak_time = datetime.datetime.now()
                min_peak_threshold = max_peak / 10
                max_buffer_counter = 0
                print(" NEWMAX")
            else:
                max_buffer_counter += 1

        currentTrigger = datetime.datetime.now()

        # check for huge spike
        if (oldpeak + min_peak_threshold * 5) < peak or (oldoldpeak + min_peak_threshold * 5) < peak:
            if (currentTrigger - lastTriggered) > datetime.timedelta(seconds=chill_threshold):
                lastTriggered = currentTrigger
                pulseAll()
                percent_string = " 50%"
            else:
                print("Chill")
        # check for medium spike
        elif (oldpeak + min_peak_threshold * 3) < peak or (oldoldpeak + min_peak_threshold * 3) < peak:
            if (currentTrigger - lastTriggered) > datetime.timedelta(seconds=chill_threshold):
                lastTriggered = currentTrigger
                pulseEnds()
                percent_string = " 30%"
            else:
                print("Chill")
        # check for small spike
        elif (oldpeak + min_peak_threshold) < peak or (oldoldpeak + min_peak_threshold) < peak:
            if (currentTrigger - lastTriggered) > datetime.timedelta(seconds=chill_threshold):
                lastTriggered = currentTrigger
                pulseOne()
                percent_string = " 10%"
            else:
                print("Chill")
        # no spike
        else:
            percent_string = " 0%"

        # render waveform
        bars = "#" * int(200 * peak / 2**16)
        print("MAX:%05d Peak:%05d Delta:%s %s" % (max_peak, peak, percent_string, bars))
        oldoldpeak = oldpeak
        oldpeak = peak
        time.sleep(.1)

    stream.stop_stream()
    stream.close()
    p.terminate()
def Run(C, R, mic, Plot): CHUNK = 44100 # number of data points to read at a time 4096 CHUNK = C # 4096 byte # the number of frames RATE = 44100 # 176400 # time resolution for reading device (Hz) 44100 samples/second RATE = R # sampling rate i.e the number of frames per second serSignal = 'S' KnockSignal = 'K' Input_Device_Index = 2 Input_Device_Index = mic plot = Plot # Define the serial port ser_port = "COM8" # for window computer, int must be used COM1 = 0,COM2=1 ... baud_rate = 9600 count = 0 flag = False signal = False mlab = Matlab(executable=r"D:\MATLAB\bin\matlab.exe") mlab.start() p = pyaudio.PyAudio() # while True: # ser.write(serSignal.encode('utf-8')) # if ser.readline().decode('utf-8') != "Spray": # break stream = p.open(format=pyaudio.paInt16, channels=1, rate=RATE, input=True, input_device_index=None, frames_per_buffer=CHUNK) ser = serial.Serial(ser_port, baud_rate) print(ser.readline().decode("utf-8")) print("Input delay is %f" % stream.get_input_latency()) while (True): for i in range(int(3)): #only loop forA int(??) times #if(count>1): # sleep(1) if (count == 1): ser.write(KnockSignal.encode( "utf-8")) # encode is used for string.encode() sleep(.32) # **change here (0.1s per 5000samples) flag = True print("Must Knock Here") # The input device id "2" => built-in microphone # info = p.get_host_api_info_by_index(0) # numdevices = info.get('deviceCount') # for i in range(0, numdevices): # if (p.get_device_info_by_host_api_device_index(0, i).get('maxInputChannels')) > 0: # pass #print('Input Device id', i, '-', p.get_device_info_by_host_api_device_index(0, i).get('name')) # get the default device info #print(p.get_default_input_device_info()) # create a numpy array holding a single read of audio data #now = datetime.now() if flag == True: # if count ==1: # sleep(.5) np.set_printoptions(threshold=sys.maxsize) data = np.fromstring(stream.read(CHUNK), dtype=np.short) #print(stream) time = np.arange(0, CHUNK) #peak=np.average(np.abs(data))*21 #bars="#"*int(50*peak/2**16) #print("%04d %s"%(i,data)) #print("%s %s" % (data/32768,now )) #print("Input data is ", type(data)) # Test Matlab data 1 #res = mlab.run_func('jk.m', {'arg1': data}) #print("Output data is ", type(res['result'])) #data1 = res['result'] # The data in matlab is float64 (e.g for 64bit window) https://stackoverflow.com/questions/8855574/convert-ndarray-from-float64-to-integer #M_data1 = data1[0] / 32768 #print("jk.m is",res) # data1 = np.array(res['result'], dtype=np.float64).astype(np.int64) # print(type(data1)) #Write data to text file before matlab # with open("SignalTest1.txt", "wt") as file: # file.write("%s" % (str(M_data1).lstrip('[').rstrip(']'))) # file.flush() # file.close() # # file.writelines("%s %04d %s\n"%(now,i,data)) # # close the stream gracefully # max_val =np.amax(data) # print(max_val) # if max_val >30000: #data/32768 #print(M_data1) if count == 1: print("Write") with open("SignalTest.txt", "wt") as out_file: out_file.writelines( str(data)) #it can only write string if plot == True and count == 2: past = stream.get_time() np.set_printoptions(threshold=sys.maxsize) data = np.fromstring(stream.read(CHUNK), dtype=np.short) present = stream.get_time() delay = present - past print("The delay is %f" % delay) plt.title('AudioSample') plt.plot(time, data) plt.ylim(-40000, 40000) plt.ylabel('Amplitude') plt.xlabel('Sample Size') #plt.pause(.0000000000000000000000000000000000000000000000000000000001) #plt.clf() #print(stream.get_time()) dataprocess = mlab.run_func( 'final_judge.m', {"arg1": data}) # 
                # ,{'arg1':data}
                # print("The input data is ", M_data1)
                print(np.amax(data))
                print(dataprocess['result'])
                d1 = dataprocess['result']
                if d1 == 1:
                    ser.write(serSignal.encode("utf-8"))  # encode is used for string.encode()
                    # print(ser.write(serSignal.encode("utf-8")))
                    # print(ser.readline().decode("utf-8"))
                    # d1 = 2
                plt.show()
                flag = False
                count = 0
            count += 1
            # ser.reset_output_buffer()

    mlab.stop()
    out_file.close()
    stream.stop_stream()
    stream.close()
    p.terminate()
    sys.exit(0)
def play_rec(self, out_file_name, recode_second,
             device_name='ReSpeaker 4 Mic Array (UAC1.0)', CHUNK=1024,
             input_file_name='./test_out.wav', need_data=False,
             order_index=None, order_ch=None):
    # file_name = '../_exp/Speaker_Sound/up_tsp_1num.wav'
    wf = wave.open(out_file_name, 'rb')
    sampling = wf.getframerate()
    if order_index is not None:
        index = order_index
        channels = order_ch
    else:
        index, channels = self.get_index(device_name)

    p = pyaudio.PyAudio()
    stream1 = p.open(
        format=pyaudio.paInt16,
        channels=channels,
        rate=sampling,
        frames_per_buffer=CHUNK,
        input=True,
        input_device_index=index,
    )
    stream2 = p.open(format=pyaudio.paInt16,
                     channels=1,
                     rate=sampling,
                     frames_per_buffer=CHUNK,
                     output=True)

    if sampling * recode_second < wf.getnframes():
        print('Error: recording time is not long enough', wf.getnframes() / sampling)
        sys.exit()
    elif sampling * recode_second > wf.getnframes() * 2:
        print('Error: recording time is too long')
        sys.exit()
    else:
        out_data = wf.readframes(CHUNK)
        in_data = stream1.read(CHUNK)
        recoding_data = [in_data]
        for i in range(0, int(sampling / CHUNK * recode_second)):
            input_data = stream1.read(CHUNK)
            recoding_data.append(input_data)
            if out_data != b'':
                stream2.write(out_data)
                out_data = wf.readframes(CHUNK)
        recoded_data = b''.join(recoding_data)
        # print(type(recoded_data))
        self.wave_save(recoded_data,
                       channels=channels,
                       sampling=sampling,
                       wave_file=input_file_name)

    stream1.stop_stream()
    stream2.stop_stream()
    stream1.close()
    stream2.close()
    p.terminate()

    if need_data:
        # print('use data return data', np.frombuffer(np.array(recoding_data), dtype='int16').shape)
        recoded_input_data = np.array(np.frombuffer(np.array(recoding_data), dtype='int16'))\
            .reshape((channels, -1), order='F')
        return recoded_input_data, sampling
#2019h1030124h
import pyaudio
import wave

# output file name
name = "recordedAud.mp3"
chunk = 1024
# sample format
FORMAT = pyaudio.paInt16
channels = 1
# 44100 samples per second
sample_rate = 44100
record_seconds = 10

# initialize PyAudio object
obj = pyaudio.PyAudio()

# open stream object as input & output
stream = obj.open(format=FORMAT,
                  channels=channels,
                  rate=sample_rate,
                  input=True,
                  output=True,
                  frames_per_buffer=chunk)

frames = []
print("Recording audio...")
for i in range(int(44100 / chunk * record_seconds)):
    data = stream.read(chunk)
    frames.append(data)
print("Finished recording.")

stream.stop_stream()
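# The snippet above collects raw PCM chunks in `frames` but never writes them to
# disk, and the `name` it defines ends in .mp3 even though PyAudio delivers raw PCM.
# A minimal follow-up sketch, assuming the same variables (obj, frames, channels,
# sample_rate, FORMAT) and a hypothetical WAV output name, could look like this:
stream.close()
sample_width = obj.get_sample_size(FORMAT)
obj.terminate()

wf = wave.open("recordedAud.wav", "wb")  # assumed output name; raw PCM fits a WAV container
wf.setnchannels(channels)
wf.setsampwidth(sample_width)
wf.setframerate(sample_rate)
wf.writeframes(b"".join(frames))
wf.close()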
def recognize(self, args, userin, user_full_name, user_prefix): with noalsaerr(): p = pyaudio.PyAudio() # Create a PyAudio session # Create a stream stream = p.open( format=FORMAT, channels=CHANNELS, rate=RATE, input=True, output=True, frames_per_buffer=CHUNK) try: data = stream.read( CHUNK) # Get first data frame from the microphone # Loop over the frames of the audio / data chunks while data != '': rms = audioop.rms( data, 2) # Calculate Root Mean Square of current chunk if rms >= THRESHOLD: # If Root Mean Square value is greater than THRESHOLD constant self.decoder_pipeline.init_request( "recognize", "audio/x-raw, layout=(string)interleaved, rate=(int)16000, format=(string)S16LE, channels=(int)1" ) self.decoder_pipeline.process_data(data) silence_counter = 0 # Define silence counter # While silence counter value less than SILENCE_DETECTION constant while silence_counter < SILENCE_DETECTION: data = stream.read( CHUNK) # Read a new chunk from the stream if LISTENING: stream.write(data, CHUNK) self.decoder_pipeline.process_data(data) rms = audioop.rms( data, 2 ) # Calculate Root Mean Square of current chunk again if rms < THRESHOLD: # If Root Mean Square value is less than THRESHOLD constant silence_counter += 1 # Then increase silence counter else: # Else silence_counter = 0 # Assign zero value to silence counter stream.stop_stream() self.decoder_pipeline.end_request() while not self.finished: time.sleep(0.1) stream.start_stream() words = self.words words = [x for x in words if x != '<#s>'] com = ' '.join(words) her = VirtualAssistant(args, userin, user_full_name, user_prefix) t = Thread(target=her.command, args=(com,)) t.start() self.reset() data = stream.read(CHUNK) # Read a new chunk from the stream if LISTENING: stream.write(data, CHUNK) except KeyboardInterrupt: stream.stop_stream() stream.close() p.terminate() self.loop.quit() raise KeyboardInterrupt
def __init__(self):
    self.p = pyaudio.PyAudio()
    self.stream = self.p.open(
        format=pyaudio.paInt16,
        channels=1,
        rate=44100,
        input=True,
        frames_per_buffer=1024)
    self.bitlist = [0]
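# Hedged companion methods (assumed, not part of the original class): read_chunk
# and close are hypothetical names showing how the stream opened in __init__ might
# be consumed and released; numpy (np) is assumed to be imported alongside pyaudio.
def read_chunk(self):
    # read one 1024-frame buffer and convert the raw bytes to 16-bit samples
    data = self.stream.read(1024)
    return np.frombuffer(data, dtype=np.int16)

def close(self):
    # release the stream and the PyAudio session
    self.stream.stop_stream()
    self.stream.close()
    self.p.terminate()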
def start(self, detected_callback=play_audio_file, interrupt_check=lambda: False, sleep_time=0.03): """ Start the voice detector. For every `sleep_time` second it checks the audio buffer for triggering keywords. If detected, then call corresponding function in `detected_callback`, which can be a single function (single model) or a list of callback functions (multiple models). Every loop it also calls `interrupt_check` -- if it returns True, then breaks from the loop and return. :param detected_callback: a function or list of functions. The number of items must match the number of models in `decoder_model`. :param interrupt_check: a function that returns True if the main loop needs to stop. :param float sleep_time: how much time in second every loop waits. :return: None """ self.audio = pyaudio.PyAudio() self.stream_in = self.audio.open( input=True, output=False, format=self.audio.get_format_from_width( self.detector.BitsPerSample() / 8), channels=self.detector.NumChannels(), rate=self.detector.SampleRate(), frames_per_buffer=2048, stream_callback=audio_callback) if interrupt_check(): logger.debug("detect voice return") return tc = type(detected_callback) if tc is not list: detected_callback = [detected_callback] if len(detected_callback) == 1 and self.num_hotwords > 1: detected_callback *= self.num_hotwords assert self.num_hotwords == len(detected_callback), \ "Error: hotwords in your models (%d) do not match the number of " \ "callbacks (%d)" % (self.num_hotwords, len(detected_callback)) logger.debug("detecting...") while True: if interrupt_check(): logger.debug("detect voice break") break data = self.ring_buffer.get() if len(data) == 0: time.sleep(sleep_time) continue ans = self.detector.RunDetection(data) if ans == -1: logger.warning( "Error initializing streams or reading audio data") elif ans > 0: message = "Keyword " + str(ans) + " detected at time: " message += time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(time.time())) logger.info(message) callback = detected_callback[ans - 1] if callback is not None: callback() logger.debug("finished.")
def listen():
    # variables
    status = Status.WAIT
    # buffer of the most recent 16 bits, including the Hamming-code bits used for SYN detection
    recent_bin_data = np.zeros(16, dtype=np.int8)
    # # demodulated binary data
    # bin_data = np.empty(0).astype(np.int8)
    # binary data of the message body
    input_bin_data = np.empty(0).astype(np.int8)

    # initialize PyAudio
    p = pyaudio.PyAudio()
    stream = p.open(format=pyaudio.paInt16,
                    channels=1,
                    rate=common.SR,
                    frames_per_buffer=CHUNK,
                    input=True)

    # accept input from the microphone
    while stream.is_active():
        raw_data = stream.read(CHUNK)
        data = np.frombuffer(raw_data, dtype=np.int16) / np.iinfo(np.int16).max
        signal = demodulation(data)

        recent_bin_data = np.roll(recent_bin_data, -1)
        recent_bin_data[-1] = signal
        print("chunk", recent_bin_data)

        # waiting state
        if status == Status.WAIT:
            status_text.set("wait")
            # check whether the most recent data matches the SYN code
            if check_syn(recent_bin_data):
                # if it is the SYN code, switch to the input-accepting state
                status = Status.READY
        # input-accepting state
        elif status == Status.READY:
            status_text.set("ready")
            input_bin_data = np.r_[input_bin_data, signal]
            print("input bin data", input_bin_data)

            # if signal reception failed, revert the state
            if np.all(recent_bin_data == -1):
                button.configure(state=tk.NORMAL)
                status_text.set("error")
                break

            # check for the SYN code
            if len(input_bin_data) % 16 == 0 and check_syn(recent_bin_data):
                # the most recent bits are the SYN code, so exclude them from the input data
                input_bin_data = input_bin_data[:-16]
                # success handling
                correct_data = correct_hamming_code(input_bin_data)
                message = decode(correct_data)
                end(message)
                break

    # PyAudio cleanup
    stream.stop_stream()
    stream.close()
    p.terminate()
def __init__(self, language_code='en-US', last_contexts=None): """Initialize all params and load data""" """ Constants and params """ self.CHUNK = 4096 self.FORMAT = pyaudio.paInt16 self.CHANNELS = 1 self.RATE = 16000 self.USE_AUDIO_SERVER = rospy.get_param('/dialogflow_client/use_audio_server', False) self.PLAY_AUDIO = rospy.get_param('/dialogflow_client/play_audio', True) self.DEBUG = rospy.get_param('/dialogflow_client/debug', False) # Register Ctrl-C sigint signal.signal(signal.SIGINT, self._signal_handler) """ Dialogflow setup """ # Get hints/clues rp = rospkg.RosPack() file_dir = rp.get_path('dialogflow_ros') + '/config/context.yaml' with open(file_dir, 'r') as f: try: self.phrase_hints = load(f) except YAMLError: rospy.logwarn("DF_CLIENT: Unable to open phrase hints yaml file!") self.phrase_hints = [] # Dialogflow params project_id = rospy.get_param('/dialogflow_client/project_id', 'my-project-id') session_id = str(uuid4()) # Random self._language_code = language_code self.last_contexts = last_contexts if last_contexts else [] # DF Audio Setup audio_encoding = AudioEncoding.AUDIO_ENCODING_LINEAR_16 # Possibel models: video, phone_call, command_and_search, default self._audio_config = InputAudioConfig(audio_encoding=audio_encoding, language_code=self._language_code, sample_rate_hertz=self.RATE, phrase_hints=self.phrase_hints, model='command_and_search') self._output_audio_config = OutputAudioConfig( audio_encoding=OutputAudioEncoding.OUTPUT_AUDIO_ENCODING_LINEAR_16 ) # Create a session self._session_cli = dialogflow_v2beta1.SessionsClient() self._session = self._session_cli.session_path(project_id, session_id) rospy.logdebug("DF_CLIENT: Session Path: {}".format(self._session)) """ ROS Setup """ results_topic = rospy.get_param('/dialogflow_client/results_topic', '/dialogflow_client/results') requests_topic = rospy.get_param('/dialogflow_client/requests_topic', '/dialogflow_client/requests') text_req_topic = requests_topic + '/string_msg' text_event_topic = requests_topic + '/string_event' msg_req_topic = requests_topic + '/df_msg' event_req_topic = requests_topic + '/df_event' self._results_pub = rospy.Publisher(results_topic, DialogflowResult, queue_size=10) rospy.Subscriber(text_req_topic, String, self._text_request_cb) rospy.Subscriber(text_event_topic, String, self._text_event_cb) rospy.Subscriber(msg_req_topic, DialogflowRequest, self._msg_request_cb) rospy.Subscriber(event_req_topic, DialogflowEvent, self._event_request_cb) """ Audio setup """ # Mic stream input setup self.audio = pyaudio.PyAudio() self._server_name = rospy.get_param('/dialogflow_client/server_name', '127.0.0.1') self._port = rospy.get_param('/dialogflow_client/port', 4444) if self.PLAY_AUDIO: self._create_audio_output() rospy.logdebug("DF_CLIENT: Last Contexts: {}".format(self.last_contexts)) rospy.loginfo("DF_CLIENT: Ready!")
def __init__(self, API_KEY, URL, enviornment_id, collection_id, NLU_API_KEY, NLU_URL, ASSISTANT_API_KEY, ASSISTANT_URL, ASSISSTANT_ID, S2T_KEY, S2T_URL, SMMY_API_KEY): ''' Initialize a hindsight chatbot :param API_KEY: IBM Watson Discovery API Key :param URL: IBM Watson Discovery base url :param enviornment_id: IBM Enviornment id :param collection_id: IBM document collection id :return: ''' self.chat_states = {'add_mode': 1, 'ask_mode': 2} self.speech_mode_enabled = False self.intents = { 'show_notes': 1, 'summarize_notes': 2, 'sentiment_notes': 3 } self.state = self.chat_states['add_mode'] self.prompt = '>>> ' self.chatprompt = '\t~~~ ' self.state_prompt = 'Add a note: ' self.discovery = DiscoveryV1(version='2018-12-03', iam_apikey=API_KEY, url=URL) self.nlu = NaturalLanguageUnderstandingV1(version='2018-11-16', iam_apikey=NLU_API_KEY, url=NLU_URL) self.assistant = AssistantV2(version='2018-11-08', iam_apikey=ASSISTANT_API_KEY, url=ASSISTANT_URL) self.session_id = self.assistant.create_session( assistant_id=ASSISSTANT_ID).get_result()['session_id'] self.enviornment_id = enviornment_id self.collection_id = collection_id self.assistant_id = ASSISSTANT_ID self.ROOT_PATH = sys.path[0] self.METADATA_PATH = self.ROOT_PATH + '/notes_metadata' if not os.path.exists(self.METADATA_PATH): os.makedirs(self.METADATA_PATH) self.GLOBAL_ENTITIES = self.ROOT_PATH + '/notes_metadata/global_entities.p' if not os.path.exists(self.GLOBAL_ENTITIES): t = {'NULL': 0} pickle.dump(t, open(self.GLOBAL_ENTITIES, "wb")) self.GLOBAL_DOC_IDS = self.ROOT_PATH + '/notes_metadata/global_doc_ids.p' if not os.path.exists(self.GLOBAL_DOC_IDS): t = {'NULL': '/'} pickle.dump(t, open(self.GLOBAL_DOC_IDS, "wb")) self.NOTES_PATH = self.ROOT_PATH + '/notes.html' if not os.path.exists(self.NOTES_PATH): os.makedirs(self.NOTES_PATH) self.INTENT_LINES = [] if not os.path.exists(self.ROOT_PATH + '/intent_training_data.csv'): print('!!! ERROR: ./scripts/intent_training_data.csv required') quit() lines = open(self.ROOT_PATH + '/intent_training_data.csv').readlines() self.INTENT_LINES = [l.strip().split(',')[0] for l in lines] self.S2T_KEY = S2T_KEY self.S2T_URL = S2T_URL self.pyAudio = pyaudio.PyAudio() self.SMMY_API_KEY = SMMY_API_KEY
def recognize(self, args, userin, user_full_name, user_prefix): with noalsaerr(): p = pyaudio.PyAudio() # Create a PyAudio session # Create a stream stream = p.open( format=FORMAT, channels=CHANNELS, rate=RATE, input=True, output=True, frames_per_buffer=CHUNK) try: data = stream.read(CHUNK) # Get first data frame from the microphone # Loop over the frames of the audio / data chunks audio = None # print("START LISTENNING") while data != '': rms = audioop.rms(data, 2) # Calculate Root Mean Square of current chunk if rms >= THRESHOLD: # If Root Mean Square value is greater than THRESHOLD constant audio = data silence_counter = 0 # Define silence counter # While silence counter value less than SILENCE_DETECTION constant while silence_counter < SILENCE_DETECTION: data = stream.read(CHUNK) # Read a new chunk from the stream if LISTENING: stream.write(data, CHUNK) audio = audio + data rms = audioop.rms(data, 2) # Calculate Root Mean Square of current chunk again if rms < THRESHOLD: # If Root Mean Square value is less than THRESHOLD constant silence_counter += 1 # Then increase silence counter else: # Else silence_counter = 0 # Assign zero value to silence counter # print("Analyzing...") stream.stop_stream() audio_data = sr.AudioData(audio, RATE, p.get_sample_size(FORMAT)) try: com = self.recognizer.recognize_google(audio_data) print(com) her = VirtualAssistant(args, userin, user_full_name, user_prefix) t = Thread(target=her.command, args=(com,)) t.start() except sr.UnknownValueError: # print("Google Speech Recognition could not understand audio") pass except sr.RequestError as e: print("Could not request results from Google Speech Recognition service; {0}".format(e)) stream.start_stream() self.reset() data = stream.read(CHUNK) # Read a new chunk from the stream if LISTENING: stream.write(data, CHUNK) except KeyboardInterrupt: stream.stop_stream() stream.close() p.terminate() # self.loop.quit() raise KeyboardInterrupt
def recognize():
    # Voice Authentication
    FORMAT = pyaudio.paInt16
    CHANNELS = 2
    RATE = 44100
    CHUNK = 1024
    RECORD_SECONDS = 4
    FILENAME = "./test.wav"

    audio = pyaudio.PyAudio()

    # start Recording
    stream = audio.open(format=FORMAT,
                        channels=CHANNELS,
                        rate=RATE,
                        input=True,
                        frames_per_buffer=CHUNK)

    time.sleep(2.0)
    print("recording...")
    frames = []
    for i in range(0, int(RATE / CHUNK * RECORD_SECONDS)):
        data = stream.read(CHUNK)
        frames.append(data)
    print("finished recording")

    # stop Recording
    stream.stop_stream()
    stream.close()
    audio.terminate()

    # saving wav file
    waveFile = wave.open(FILENAME, 'wb')
    waveFile.setnchannels(CHANNELS)
    waveFile.setsampwidth(audio.get_sample_size(FORMAT))
    waveFile.setframerate(RATE)
    waveFile.writeframes(b''.join(frames))
    waveFile.close()

    modelpath = "./gmm_models/"
    gmm_files = [os.path.join(modelpath, fname) for fname in
                 os.listdir(modelpath) if fname.endswith('.gmm')]
    models = [pickle.load(open(fname, 'rb')) for fname in gmm_files]
    speakers = [fname.split("/")[-1].split(".gmm")[0] for fname in gmm_files]

    if len(models) == 0:
        print("No Users in the Database!")
        return

    # read test file
    sr, audio = read(FILENAME)

    # extract mfcc features
    vector = extract_features(audio, sr)
    log_likelihood = np.zeros(len(models))

    # checking with each model one by one
    for i in range(len(models)):
        gmm = models[i]
        scores = np.array(gmm.score(vector))
        log_likelihood[i] = scores.sum()

    pred = np.argmax(log_likelihood)
    identity = speakers[pred]

    # if the voice is not recognized, terminate the process
    if identity == 'unknown':
        print("Not Recognized! Try again...")
        return

    print("Recognized as - ", identity)

    # face recognition
    print("Keep your face in front of the camera")
    cap = cv2.VideoCapture(0)
    cap.set(3, 640)
    cap.set(4, 480)
    cascade = cv2.CascadeClassifier('./haarcascades/haarcascade_frontalface_default.xml')

    # loading the database
    database = pickle.load(open('face_database/embeddings.pickle', "rb"))

    time.sleep(1.0)
    start_time = time.time()

    while True:
        curr_time = time.time()
        _, frame = cap.read()
        frame = cv2.flip(frame, 1, 0)
        gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
        face = cascade.detectMultiScale(gray, 1.3, 5)
        name = 'unknown'

        if len(face) == 1:
            for (x, y, w, h) in face:
                roi = frame[y-10:y+h+10, x-10:x+w+10]
                fh, fw = roi.shape[:2]
                min_dist = 100

                # make sure the face is of the required height and width
                if fh < 20 and fw < 20:
                    continue

                # resizing image as required by the model
                img = cv2.resize(roi, (96, 96))

                # 128-d encodings from pre-trained model
                encoding = img_to_encoding(img)

                # loop over all the recorded encodings in database
                for knownName in database:
                    # find the similarity between the input encodings and recorded encodings in database using L2 norm
                    dist = np.linalg.norm(np.subtract(database[knownName], encoding))
                    # check if minimum distance or not
                    if dist < min_dist:
                        min_dist = dist
                        name = knownName

        # if min dist is less than the threshold value and face and voice matched, unlock the door
        if min_dist <= 0.4 and name == identity:
            print("Door Unlocked! Welcome " + str(name))
            break

        # open the cam for 3 seconds
        if curr_time - start_time >= 3:
            break

        cv2.waitKey(1)
        cv2.imshow('frame', frame)

    cap.release()
    cv2.destroyAllWindows()

    if len(face) == 0:
        print('There was no face found in the frame. Try again...')
    elif len(face) > 1:
        print("More than one faces found. Try again...")
    elif min_dist > 0.4 or name != identity:
        print("Not Recognized! Try again...")
port = 23333
addr = "239.192.0.233"
buf_size = 65536

pygame.init()
pygame.camera.init()
size = (128, 96)
cam = pygame.camera.Camera("/dev/video0", size)

NUM_SAMPLES = 2000
framerate = 8000
channels = 1
sampwidth = 2
sleep_time = 0.25

pin = pyaudio.PyAudio()
streamin = pin.open(format=pyaudio.paInt16,
                    channels=1,
                    rate=framerate,
                    input=True,
                    frames_per_buffer=NUM_SAMPLES)

pout = pyaudio.PyAudio()
streamout = pout.open(format=pyaudio.paInt16,
                      channels=1,
                      rate=framerate,
                      output=True)

TYPE = 3


def init():
def find_input_devices():
    pa = pyaudio.PyAudio()
    for i in range(pa.get_device_count()):
        devinfo = pa.get_device_info_by_index(i)
        print("Device %d: %s" % (i, devinfo["name"]))
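# Hedged usage sketch: the index printed by find_input_devices() is the value
# PyAudio expects as input_device_index when opening a stream. The device index
# (2) and the stream parameters below are placeholder assumptions, not values
# taken from the snippet above.
pa = pyaudio.PyAudio()
stream = pa.open(format=pyaudio.paInt16,
                 channels=1,
                 rate=44100,
                 input=True,
                 input_device_index=2,
                 frames_per_buffer=1024)
data = stream.read(1024)  # one buffer of raw 16-bit samples from the chosen device
stream.stop_stream()
stream.close()
pa.terminate()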
def __init__(self, freq):
    self.audio = pyaudio.PyAudio()
    self.freq = freq
    self.stream = audio.get_stream(self.audio, output=True)
    self.ping_buffer = make_buffer_from_bit_pattern(
        self.bitstream, self.freq, 0)
CHANNELS = 1
CHUNK_SIZE = int(FRAME_LEN * SAMPLE_RATE)

asr = FrameASR(model_definition={
                   'sample_rate': SAMPLE_RATE,
                   'AudioToMelSpectrogramPreprocessor': cfg.preprocessor.params,
                   'JasperEncoder': cfg.encoder.params,
                   'labels': cfg.decoder.params.vocabulary
               },
               frame_len=FRAME_LEN, frame_overlap=2, offset=4)
asr.reset()

p = pa.PyAudio()

# print('Available audio input devices:')
input_devices = []
for i in range(p.get_device_count()):
    dev = p.get_device_info_by_index(i)
    if dev.get('maxInputChannels'):
        input_devices.append(i)
        # print(i, dev.get('name'))

if len(input_devices):
    dev_idx = -2
    while dev_idx not in input_devices:
        # print('Please type input device ID:')
        dev_idx = 3  # 3 for virtual cable

empty_counter = 0
def main():
    if len(sys.argv) != 4:
        print("Error!!!!")
        exit()

    ip = sys.argv[1]        # Server's ip (argv: argument)
    port = sys.argv[2]      # Server's port
    identity = sys.argv[3].encode('ascii')
    connected = False

    context = zmq.Context()
    s = context.socket(zmq.DEALER)
    s.identity = identity
    s.connect("tcp://{}:{}".format(ip, port))  # the ip and port parameters are passed in via format
    print("Started client with id {}".format(identity))

    poller = zmq.Poller()
    poller.register(sys.stdin, zmq.POLLIN)
    poller.register(s, zmq.POLLIN)

    FORMAT = pyaudio.paInt16
    CHANNELS = 2
    RATE = 44100
    CHUNK = 1024
    RECORD_SECONDS = 0.3
    p = pyaudio.PyAudio()
    first = True
    global queue

    print("\n----Menu----")
    print("- 'bring' {user id} ......... Invite to a session (without braces)")
    print("- 'exit' ......... Quit the program")

    threads = []
    while True:
        socks = dict(poller.poll())

        if s in socks:
            op, *msg = s.recv_multipart()
            if op.decode() == "connect":
                connected = True
            elif op.decode() == "play":
                # RECEIVING FRAMES
                if msg[0] in queue:
                    queue[msg[0]].append(msg[1:])
                else:
                    queue[msg[0]] = []
                    queue[msg[0]].append(msg[1:])
                    threads.append(threading.Thread(target=play, args=(msg[0], )))
                    threads[-1].start()

        if sys.stdin.fileno() in socks:
            command = input()
            command = command.split()
            if command[0] == "bring":
                s.send_multipart([bytes(command[0], 'ascii'), bytes(command[1], 'ascii')])
                connected = True
            elif command[0] == "exit":
                s.send_multipart([bytes(command[0], 'ascii'), bytes("NA", 'ascii')])
                break
            else:
                print(' Operation not supported')

        if connected:
            if first:
                stream = p.open(format=FORMAT,
                                channels=CHANNELS,
                                rate=RATE,
                                input=True,
                                output=False,
                                frames_per_buffer=CHUNK)
                first = False
            frames = [bytes('send', 'ascii')]
            for i in range(0, int(RATE / CHUNK * RECORD_SECONDS)):
                frames.append(stream.read(CHUNK))
            # SENDING FRAMES
            s.send_multipart(frames)

    p.terminate()
    stream.stop_stream()
    stream.close()
def main(): mac_addr = open('/sys/class/net/wlan0/address').readline() # Connect to db con = MySQLdb.Connection(host=HOST, port=PORT, user=USER, passwd=PASSWORD, db=DB) c = con.cursor() c.execute( '''CREATE TABLE IF NOT EXISTS zeroData(temp FLOAT, pres FLOAT, hum FLOAT, gas FLOAT, lux INTEGER, db FLOAT, dt DATETIME)''' ) # Initialize db parser = argparse.ArgumentParser() parser.add_argument("db", help="zeroData") parser.add_argument("token", help="35d4aa441b94cdbae7404050edd3fad6") args = parser.parse_args() corlysis_params = { "db": args.db, "u": "token", "p": args.token, "precision": "ms" } # Initialize sensor bme = bme680.BME680(i2c_addr=0x77) bme.set_humidity_oversample(bme680.OS_2X) bme.set_pressure_oversample(bme680.OS_4X) bme.set_temperature_oversample(bme680.OS_8X) bme.set_filter(bme680.FILTER_SIZE_3) bme.set_gas_status(bme680.ENABLE_GAS_MEAS) # Initialize USB mic pyaud = pyaudio.PyAudio() stream = pyaud.open(format=pyaudio.paInt16, channels=1, rate=32000, input_device_index=2, input=True) payload = "" counter = 1 problem_counter = 0 now = time.strftime('%Y-%m-%d %H:%M:%S') print("Readings began " + now) print("Press ctrl+c to end readings and close connection.") animation = "|/-\\" aniCount = 0 # Main loop while (True): try: # Get time for corlysis and db unix_time_ms = int(time.time() * 1000) now = time.strftime('%Y-%m-%d %H:%M:%S') # Read from BME bme.get_sensor_data() tempCelcius = float("{0:.2f}".format(bme.data.temperature)) # Convert the above variable to fahrenheit temperature = float(tempCelcius * (9 / 5) + 32) pressure = float("{0:.2f}".format(bme.data.pressure)) humidity = float("{0:.2f}".format(bme.data.humidity)) gas = float("{0:.2f}".format(bme.data.gas_resistance)) # Read from lux sensor tsl = TSL2561(debug=True) luxVal = tsl.lux() # Read from USB mic rawsamps = stream.read(2048, exception_on_overflow=False) samps = numpy.fromstring(rawsamps, dtype=numpy.int16) deciVal = analyse.loudness(samps) + 65 line = "sensors_data temperature={},pressure={},humidity={},luxVal={},decib={} {}\n".format( temperature, pressure, humidity, luxVal, deciVal, unix_time_ms) payload += line if counter % SENDING_PERIOD == 0: try: # try to send data to cloud r = requests.post(URL, params=corlysis_params, data=payload) if r.status_code != 204: raise Exception("data not written") payload = "" except: problem_counter += 1 print('cannot write to InfluxDB') if problem_counter == MAX_LINES_HISTORY: problem_counter = 0 payload = "" counter += 1 # Print animation sys.stdout.write("\rCollecting data... " + animation[aniCount]) sys.stdout.flush() aniCount += 1 if (aniCount == 4): aniCount = 0 time_diff_ms = int(time.time() * 1000) - unix_time_ms # print(time_diff_ms) if time_diff_ms < READING_DATA_PERIOD_MS: time.sleep((READING_DATA_PERIOD_MS - time_diff_ms) / 1000.0) values = (mac_addr, temperature, pressure, humidity, gas, luxVal, deciVal, now) add_val = ("INSERT INTO data " "(mac, temp, pres, hum, gas, lux, db, dt)" "VALUES (%s, %s, %s, %s, %s, %s, %s, %s)") c.execute(add_val, values) con.commit() except KeyboardInterrupt: con.close() break except Exception as e: pass print(e)
__author__ = 'Victor'

import sys
import math
import wave
import struct
import curses
import pyaudio
import numpy as np
import matplotlib.pyplot as plt

standard = curses.initscr()
standard.nodelay(True)
curses.noecho()
curses.cbreak()

pythonAudioObject = pyaudio.PyAudio()

MODE = sys.argv[1]
FOLD = 1
SAMPLE_RATE = 44100
CHANNELS = 2
WIDTH = 2

try:
    IterationsN = int(sys.argv[3])
except (ValueError, IndexError):
    print('The third argument has to be a number.')
    sys.exit()


def main():
    standard.addstr('Noise-cancelling live')
def arduino_soundlight():
    chunk = 2**11       # Change if too fast/slow, never less than 2**11
    scale = 50          # Change if too dim/bright
    exponent = 5        # Change if too little/too much difference between loud and quiet sounds
    samplerate = 44100

    # CHANGE THIS TO CORRECT INPUT DEVICE
    # Enable stereo mixing in your sound card
    # to make your sound output an input
    # Use list_devices() to list all your input devices
    device = 3

    p = pyaudio.PyAudio()
    stream = p.open(format=pyaudio.paInt16,
                    channels=1,
                    rate=44100,
                    input=True,
                    frames_per_buffer=chunk,
                    input_device_index=device)
    print("Starting, use Ctrl+C to stop")
    try:
        # ser = serial.Serial(
        #     port='com3',
        #     timeout=1
        # )
        while True:
            data = stream.read(chunk)
            '''
            # Old RMS code, will only show the volume
            rms = audioop.rms(data, 2)
            #level = min(rms / (2.0 ** 11) * scale, 1.0)
            level = max(min(rms / scale, 1.0), 0.0)
            level = level**exponent
            level = int(level * 255)
            print level
            #ser.write(chr(level))
            '''
            # Do FFT
            levels = calculate_levels(data, chunk, samplerate)
            algo = []

            # Make it look better and send to serial
            for level in levels:
                level = max(min(level / scale, 1.0), 0.0)
                level = level**exponent
                level = int(level * 255)
                algo.append(level)
                #print '>' * level,
                #sys.stdout.flush()
                #ser.write(chr(level))

            '''print '>' * algo[0] + '\r'
            print '>' * algo[1] + '\r'
            print '>' * algo[2] + '\r'
            print '>' * algo[3] + '\r'
            print '>' * algo[4] + '\r'
            print '>' * algo[5] + '\r',
            sys.stdout.flush()'''

            #s = ser.read(6)

            if ((algo[2] + algo[3] + algo[0] + algo[1]) / 2) >= 5:
                para.setData(2)
            else:
                para.setData(0)

    except KeyboardInterrupt:
        pass
    finally:
        print("\nStopping")
        stream.close()
        p.terminate()
def worker(self):
    audio = pyaudio.PyAudio()

    print('\n*******************************************')
    print('RHAPSODY MODULE-I INPUT')
    print('*******************************************\n')
    print('\n===========================================')
    print('STARTED RECORDING')
    print('===========================================\n')

    for i in range(1, 4):
        print('\n===========================================')
        print(str(i) + '...')
        print('===========================================\n')
        sleep(1)

    stream = audio.open(format=self.FORMAT,
                        channels=self.CHANNELS,
                        rate=self.RATE,
                        input=True,
                        frames_per_buffer=self.CHUNK)
    f = []
    for i in range(0, int(self.RATE / self.CHUNK * self.RECORD_SECONDS)):
        data = stream.read(self.CHUNK)
        f.append(data)

    print('\n===========================================')
    print('DONE RECORDING')
    print('===========================================\n')

    stream.stop_stream()
    stream.close()
    audio.terminate()

    wf = wave.open(self.WAVE_OUTPUT_FILENAME, 'wb')
    wf.setnchannels(self.CHANNELS)
    wf.setsampwidth(audio.get_sample_size(self.FORMAT))
    wf.setframerate(self.RATE)
    wf.writeframes(b''.join(f))
    wf.close()

    # 1 - Loading File
    filename = self.WAVE_OUTPUT_FILENAME
    y, sr = librosa.load(filename)

    # 2 - Get Tempo == bpm
    tempo, beat_frames = librosa.beat.beat_track(y=y, sr=sr)
    print('\n===========================================')
    print('Estimated tempo: {:.2f} beats per minute'.format(tempo))
    print('===========================================\n')

    # generate csv files with beat times
    #CSV_FILENAME = self.WAVE_OUTPUT_FILENAME_NO_EXTENSION + ".csv"
    beat_times = librosa.frames_to_time(beat_frames, sr=sr)
    CSV_FILENAME = os.path.abspath(
        os.path.join(os.path.dirname(__file__), "Recordings", self.final + ".csv"))
    librosa.output.times_csv(CSV_FILENAME, beat_times)

    # WRITING A FILE WITH THE TEMPO
    #TEXT_FILENAME = self.WAVE_OUTPUT_FILENAME_NO_EXTENSION + ".txt"
    TEXT_FILENAME = os.path.abspath(
        os.path.join(os.path.dirname(__file__), "Recordings", self.final + ".txt"))
    bpm_value = open(TEXT_FILENAME, 'w')
    tempo_text = str(tempo) + '\n'
    bpm_value.write(tempo_text)

    # 3 - Get Notes
    hz = librosa.feature.chroma_cqt(y=y, sr=sr)

    ## GET STRONGEST OCTAVE
    strongestOctave = 0
    strongestOctave_sum = 0
    for octave in range(len(hz)):
        sum = 0
        for frame in hz[octave]:
            sum = sum + frame
        if sum > strongestOctave_sum:
            strongestOctave_sum = sum
            strongestOctave = octave

    ## GET HIGHEST HZ FOR EACH TIME FRAME
    strongestHz = []
    for i in range(len(hz[0])):
        strongestHz.append(0)
    notes = []
    for i in range(len(hz[0])):
        notes.append(0)

    for frame_i in range(len(hz[0])):
        strongest_temp = 0
        for octave_i in range(len(hz)):
            if hz[octave_i][frame_i] > strongest_temp:
                strongest_temp = hz[octave_i][frame_i]
                strongestHz[frame_i] = octave_i + 1
                notes[frame_i] = librosa.hz_to_note(hz[octave_i][frame_i])

    # C  C#  D  D#  E  F  F#  G  G#  A  A#  B
    # 1  2   3  4   5  6  7   8  9   10  11  12
    strongestHz_sum = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
    for note in strongestHz:
        strongestHz_sum[note - 1] = strongestHz_sum[note - 1] + 1
    for i in range(len(strongestHz_sum)):
        strongestHz_sum[i] = float(strongestHz_sum[i]) / len(strongestHz)

    noteSorted = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
    for num in range(len(noteSorted)):
        biggest = strongestHz_sum.index(max(strongestHz_sum))
        noteSorted[num] = biggest + 1
        strongestHz_sum[biggest] = strongestHz_sum[biggest] - 0.25

    for note in noteSorted:
        noteString = str(note) + '\n'
        bpm_value.write(noteString)
    bpm_value.close()

    print('\n===========================================')
    print('RECORDING ANALYSIS COMPLETED SUCCESSFULLY!!!')
    print('===========================================\n')
    self.finished.emit()
def run(self): """ Creates an input audio stream, initializes wake word detection (Porcupine) and speech to intent (Rhino) engines, and monitors the audio stream for occurrences of the wake word and then infers the intent from speech command that follows. """ porcupine = None rhino = None pa = None audio_stream = None wake_phrase_detected = False intent_extraction_is_finalized = False try: porcupine = Porcupine( library_path=self._porcupine_library_path, model_file_path=self._porcupine_model_file_path, keyword_file_paths=[self._porcupine_keyword_file_path], sensitivities=[self._porcupine_sensitivity]) rhino = Rhino( library_path=self._rhino_library_path, model_file_path=self._rhino_model_file_path, context_file_path=self._rhino_context_file_path) print() print('****************************** context ******************************') print(rhino.context_expressions) print('*********************************************************************') print() pa = pyaudio.PyAudio() audio_stream = pa.open( rate=porcupine.sample_rate, channels=1, format=pyaudio.paInt16, input=True, frames_per_buffer=porcupine.frame_length, input_device_index=self._input_device_index) # NOTE: This is true now and will be correct possibly forever. If it changes the logic below need to change. assert porcupine.frame_length == rhino.frame_length while True: pcm = audio_stream.read(porcupine.frame_length) pcm = struct.unpack_from("h" * porcupine.frame_length, pcm) if self._output_path is not None: self._recorded_frames.append(pcm) if not wake_phrase_detected: wake_phrase_detected = porcupine.process(pcm) if wake_phrase_detected: print('detected wake phrase') elif not intent_extraction_is_finalized: intent_extraction_is_finalized = rhino.process(pcm) else: if rhino.is_understood(): intent, slot_values = rhino.get_intent() print() print('intent: %s' % intent) print('---') for slot, value in slot_values.items(): print('%s: %s' % (slot, value)) print() else: print("didn't understand the command") rhino.reset() wake_phrase_detected = False intent_extraction_is_finalized = False except KeyboardInterrupt: print('stopping ...') finally: if porcupine is not None: porcupine.delete() if rhino is not None: rhino.delete() if audio_stream is not None: audio_stream.close() if pa is not None: pa.terminate() if self._output_path is not None and len(self._recorded_frames) > 0: recorded_audio = np.concatenate(self._recorded_frames, axis=0).astype(np.int16) soundfile.write(self._output_path, recorded_audio, samplerate=porcupine.sample_rate, subtype='PCM_16')
def play_the_recording(file_path):
    p = vlc.MediaPlayer(file_path)
    p.play()


if __name__ == '__main__':
    filename = 'sample.wav'

    while True:
        print()
        command = input('Enter q to stop. Otherwise, press any key: ')
        if command == 'q':
            break

        player = pyaudio.PyAudio()
        record_to_file(filename, player)
        player.terminate()

        # Speech-to-text: Uzbek language
        texts = speech_to_text(filename, 'uz')

        # Translation Uzbek -> English
        textEn = translate_text(texts[0], 'uz', 'en')

        # Translation English -> Uzbek
        textUz = translate_text(textEn, 'en', 'uz')

        # Text-to-Speech: Uzbek language is not supported. Using English instead.
        output_file = text_to_speech(textUz, 'en')
def localize(): global switch_beamforming global DO_BEAMFORM # Setup search space source_plane = OrientedSourcePlane(SOURCE_PLANE_NORMAL, SOURCE_PLANE_UP, SOURCE_PLANE_OFFSET) space = SearchSpace(MIC_LOC, CAMERA_LOC, [source_plane]) # Setup pyaudio instances pa = pyaudio.PyAudio() helper = AudioHelper(pa) localizer = KalmanTrackingLocalizer(mic_positions=mic_layout, search_space=space, mic_forward=MIC_FORWARD, mic_above=MIC_ABOVE, trans_mat=STATE_TRANSITION_MAT, state_cov=STATE_TRANSITION_MAT, emission_mat=EMISSION_MAT, emission_cov=EMISSION_COV, dft_len=FFT_LENGTH, sample_rate=SAMPLE_RATE, n_theta=N_THETA, n_phi=N_PHI) beamformer = BeamFormer(mic_layout, SAMPLE_RATE) # Setup STFT object stft = StftManager(dft_length=FFT_LENGTH, window_length=WINDOW_LENGTH, hop_length=HOP_LENGTH, use_window_fcn=True, n_channels=NUM_CHANNELS_IN, dtype=DATA_TYPE) # Setup devices in_device = helper.get_input_device_from_user() if PLAY_AUDIO: out_device = helper.get_output_device_from_user() else: out_device = helper.get_default_output_device_info() # Setup streams in_stream = pa.open(rate=SAMPLE_RATE, channels=NUM_CHANNELS_IN, format=SAMPLE_TYPE, frames_per_buffer=FRAMES_PER_BUF, input=True, input_device_index=int(in_device['index']), stream_callback=read_in_data) out_stream = pa.open(rate=SAMPLE_RATE, channels=NUM_CHANNELS_OUT, format=SAMPLE_TYPE, output=True, frames_per_buffer=FRAMES_PER_BUF, output_device_index=int(out_device['index']), stream_callback=write_out_data) # Start recording/playing back in_stream.start_stream() out_stream.start_stream() # Start thread to check for user quit quit_thread = threading.Thread(target=check_for_quit) quit_thread.start() # Setup directions and alignment matrices direcs = localizer.get_directions() align_mats = localizer.get_pos_align_mat() # Plotting setup if PLOT_POLAR: fig = plt.figure() ax = fig.add_subplot(111, projection='polar') ax.set_rlim(0, 1) plt.show(block=False) # Setup space for plotting in new coordinates spher_coords = localizer.get_spher_directions() theta = spher_coords[1, :] pol_plot, = plt.plot(theta, np.ones(theta.shape)) post_plot, = plt.plot(theta, np.ones(theta.shape), 'green') ax.set_ylim(0, 1) if DO_BEAMFORM: pol_beam_plot, = plt.plot(theta, np.ones(theta.shape), 'red') if PLOT_CARTES: fig = plt.figure() ax = fig.add_subplot(111, projection='3d') plt.show(block=False) x = localizer.to_spher_grid(direcs[0, :]) y = localizer.to_spher_grid(direcs[1, :]) z = localizer.to_spher_grid(direcs[2, :]) #scat = ax.scatter(x, y, z, s=100) if EXTERNAL_PLOT: fig = plt.figure() ax = fig.add_subplot(111) plt.show(block=False) count = 0 try: global done while in_stream.is_active() or out_stream.is_active(): data_available = in_buf.wait_for_read(WINDOW_LENGTH, TIMEOUT) if data_available: if switch_beamforming: DO_BEAMFORM = not DO_BEAMFORM switch_beamforming = False # Get data from the circular buffer data = in_buf.read_samples(WINDOW_LENGTH) # Perform an stft stft.performStft(data) # Process dfts from windowed segments of input dfts = stft.getDFTs() rffts = mat.to_all_real_matlab_format(dfts) d, energy = localizer.get_distribution_real( rffts[:, :, 0], 'gcc') # Use first hop post = localizer.get_distribution(rffts[:, :, 0]) ind = np.argmax(post) u = 1.5 * direcs[:, ind] # Direction of arrival #if energy < 500: # continue # Do beam forming if DO_BEAMFORM: align_mat = align_mats[:, :, ind] filtered = beamformer.filter_real(rffts, align_mat) mat.set_dfts_real(dfts, filtered, n_channels=2) # Take care of plotting if count % 1 == 0: if PLOT_CARTES: ax.cla() 
ax.grid(False) d = localizer.to_spher_grid( post / (np.max(post) + consts.EPS)) #d = localizer.to_spher_grid(d / (np.max(d) + consts.EPS)) ax.scatter(x, y, z, c=d, s=40) #ax.plot_surface(x, y, z, rstride=1, cstride=1, facecolor=plt.cm.gist_heat(d)) ax.plot([0, u[0]], [0, u[1]], [0, u[2]], c='black', linewidth=3) if DO_BEAMFORM: if np.max(np.abs(response)) > 1: response /= np.max(np.abs(response)) X = response * x Y = response * y Z = response * z ax.plot_surface(X, Y, Z, rstride=1, cstride=1, color='white') ax.set_xlim(-1, 1) ax.set_ylim(-1, 1) ax.set_zlim(0, 1) #ax.view_init(90, -90) fig.canvas.draw() if PLOT_2D: # Get unconditional distribution dist = localizer.to_spher_grid(d) dist -= np.min(dist) dist /= (np.sum(dist) + consts.EPS) sample_mat[:, :-1] = sample_mat[:, 1:] sample_mat[:, -1] = dist # Get kalman estimate maxind = np.argmax(post) estimate_mat[:-1] = estimate_mat[1:] estimate_mat[-1] = maxind plot_2d.set_array(sample_mat) state_est_plot.set_ydata(estimate_mat) plt.draw() count += 1 # Get the istft of the processed data if PLAY_AUDIO or RECORD_AUDIO: new_data = stft.performIStft() new_data = out_buf.reduce_channels(new_data, NUM_CHANNELS_IN, NUM_CHANNELS_OUT) # Write out the new, altered data if PLAY_AUDIO: if out_buf.get_available_write() >= WINDOW_LENGTH: out_buf.write_samples(new_data) if RECORD_AUDIO: if record_buf.get_available_write() >= WINDOW_LENGTH: record_buf.write_samples(new_data) except KeyboardInterrupt: print "Program interrupted" done = True print "Cleaning up" in_stream.stop_stream() in_stream.close() out_stream.stop_stream() out_stream.close() pa.terminate() # Take care of output file if RECORD_AUDIO: print "Writing output file" make_wav() print "Done"
def UploadSignal(self): self.ui.GVOriginal.clear() filePaths = QtWidgets.QFileDialog.getOpenFileNames( self, 'Open File', "~/Desktop/sigViews", '*.wav') for filePath in filePaths: for self.f in filePath: if self.f == '*': break p = pyaudio.PyAudio() self.waveFile = wave.open(self.f, 'rb') # wav1 = wave.open(f,'rb') # self.ywav1=wav1.readframes(-1) # self.ywav1 =np.fromstring(self.ywav1,'Int16') # fs=wav1.getframerate() # self.xwav1=np.linspace(0,len(self.ywav1)/fs,num=len(self.ywav1)) # print("length of signal") # print(len(self.xwav1)) # self.ui.GVOriginal.plot(self.xwav1,self.ywav1, pen='b') self.format = p.get_format_from_width( self.waveFile.getsampwidth()) channel = self.waveFile.getnchannels() self.rate = self.waveFile.getframerate() self.frame = self.waveFile.getnframes() self.stream = p.open( format=self.format, # DATA needed for streaming channels=channel, rate=self.rate, output=True) #durationF = self.frame / float(self.rate) self.data_int = self.waveFile.readframes(self.frame) self.data_plot = np.fromstring(self.data_int, 'Int16') self.data_plot.shape = -1, 2 self.data_plot = self.data_plot.T # Y-axis self.ywav1 = self.data_plot print('original data', self.ywav1) self.time = np.arange(0, self.frame) * (1.0 / self.rate ) #X-axis self.xwav1 = self.time #fft_frame = np.fft.rfft(current_frame) self.ywav1min = np.nanmin(self.ywav1[1]) self.ywav1max = np.nanmax(self.ywav1[1]) self.ui.GVOriginal.setXRange(self.xwav1[0], self.xwav1[-1]) self.ui.GVOriginal.plotItem.getViewBox().setLimits( xMin=self.xwav1[0], xMax=self.xwav1[-1], yMin=self.ywav1min - self.ywav1min * 0.1, yMax=self.ywav1max + self.ywav1max * 0.1) self.ui.GVOriginal.plot(self.xwav1, self.ywav1[1], pen='b') #===============================================Fourier Transform===============================================# self.fs_rate, self.spf = wavfile.read(self.f) # print("araay",self.spf.shape) print("Frequency sampling", self.fs_rate) l_audio = len(self.spf.shape) print("Channels", l_audio) if l_audio == 2: self.spf = self.spf.mean( axis=1) # To make it a mono signal, 1 channel only N = self.spf.shape[0] # Give number of rows print("complete Sampling N", N) secs = N / float(self.fs_rate) print("secs", secs) Ts = 1.0 / self.fs_rate # sampling interval in time print("Timestep between Ts", Ts) t = scipy.arange(0, secs, Ts) self.FFT = abs(scipy.fft(self.spf)) self.freqs = scipy.fftpack.fftfreq( self.spf.size, t[1] - t[0] ) # Return the Discrete Fourier Transform sample frequencies. t[1]-t[0] is the sample spacing FFT_side = self.FFT[range( N // 2 )] # one side FFT range, remove the negative part (starts from zero) self.FFT_sideArr = np.array(FFT_side) self.bands = np.array_split(self.FFT_sideArr, 10) # self.bands=np.array_split(self.FFT,20) print('lehgth of band', int(len(self.bands[1]))) self.BandSize = int(len(self.FFT_sideArr) / 10) self.phase = np.angle(scipy.fft( self.spf)) #phase, we will use it later freqs_side = self.freqs[range(N // 2)] self.fft_freqs_side = np.array(freqs_side) self.ui.GVFourier.plot(self.freqs, self.FFT, pen='g') QtCore.QCoreApplication.processEvents()
print('Press Ctrl-C to quit.')

last_touched = cap.touched()
record_state = True
is_recording = [
    False, False, False, False, False, False,
    False, False, False, False, False, False
]
is_playing = [
    False, False, False, False, False, False,
    False, False, False, False, False, False
]
my_thread_record = None
my_thread_play = None
is_pygame_init = False
my_thread_init = None

audio = pyaudio.PyAudio()  # create pyaudio instantiation


def loop_record2(x):
    form_1 = pyaudio.paInt16    # 16-bit resolution
    chans = 1                   # 1 channel
    samp_rate = 44100           # 44.1kHz sampling rate
    chunk = 4096                # 2^12 samples for buffer
    record_secs = 120           # seconds to record
    dev_index = 2               # device index found by p.get_device_info_by_index(ii)
    file_array = [
        'test0.wav', 'test1.wav', 'test2.wav', 'test3.wav', 'test4.wav',
        'test5.wav', 'test6.wav', 'test7.wav', 'test8.wav', 'test9.wav',
        'test10.wav'
    ]
    wav_output_filename = file_array[x]  # name of .wav file
def speakerRecog():
    # Recording Phase
    CHUNK = 1024
    FORMAT = pyaudio.paInt16
    CHANNELS = 2
    RATE = 44100
    RECORD_SECONDS = 5
    WAVE_OUTPUT_FILENAME = "./SR/samples/test.wav"

    p = pyaudio.PyAudio()
    stream = p.open(format=FORMAT,
                    channels=CHANNELS,
                    rate=RATE,
                    input=True,
                    frames_per_buffer=CHUNK)

    print("* recording")
    frames = []
    for i in range(0, int(RATE / CHUNK * RECORD_SECONDS)):
        data = stream.read(CHUNK)
        frames.append(data)
    print("* done recording")

    stream.stop_stream()
    stream.close()
    p.terminate()

    wf = wave.open(WAVE_OUTPUT_FILENAME, 'wb')
    wf.setnchannels(CHANNELS)
    wf.setsampwidth(p.get_sample_size(FORMAT))
    wf.setframerate(RATE)
    wf.writeframes(b''.join(frames))
    wf.close()

    # Now the recording is stored in test.wav
    # We will now test the recording with the gmm models
    source = "./SR/samples/"
    modelpath = "./SR/gmm_models/"
    test_file = "./SR/testing_sample_list.txt"
    file_paths = open(test_file, 'r')

    gmm_files = [
        os.path.join(modelpath, fname) for fname in os.listdir(modelpath)
        if fname.endswith(".gmm")
    ]
    print(gmm_files)

    # Load the Gaussian gender Models
    models = [cPickle.load(open(fname, 'rb')) for fname in gmm_files]
    speakers = [fname.split("/")[-1].split(".gmm")[0] for fname in gmm_files]

    # Read the test directory and get the list of test audio files
    for path in file_paths:
        path = path.strip()
        print(path)
        sr, audio = read(source + path)
        vector = extract_features(audio, sr)
        log_likelihood = np.zeros(len(models))

        # checking with each model one by one
        for i in range(len(models)):
            gmm = models[i]
            scores = np.array(gmm.score(vector))
            log_likelihood[i] = scores.sum()

        winner = np.argmax(log_likelihood)
        print("\tdetected as - ", speakers[winner])
        return speakers[winner]

    time.sleep(1.0)
max_fps = 60
width_of_col = 1
scale = 1
skip_under = 0
file_path = "./File0161.wav"

# Init
pygame.init()
screen = pygame.display.set_mode(size)
pygame.display.set_caption("AV")
done = False
clock = pygame.time.Clock()

wf = wave.open(file_path, "rb")
p = pyaudio.PyAudio()
stream = p.open(format=p.get_format_from_width(wf.getsampwidth()),
                channels=wf.getnchannels(),
                rate=wf.getframerate(),
                output=True,
                input=True)

while not done:
    # --- Main event loop
    for event in pygame.event.get():
        if event.type == pygame.QUIT:
            done = True
    temp = wf.readframes(chunk)
    if len(temp) < 4096:
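# The visualizer loop above stops mid-iteration after reading a chunk. The sketch
# below shows one way an iteration of such a loop could finish: play the chunk and
# draw one bar per FFT bin of it. The drawing layout and the magnitude scaling are
# assumptions, not taken from the original script.
import numpy as np
import pygame

def draw_spectrum(screen, chunk_bytes, width_of_col=1, scale=1, skip_under=0):
    samples = np.frombuffer(chunk_bytes, dtype=np.int16)
    spectrum = np.abs(np.fft.rfft(samples))          # magnitude spectrum of the chunk
    height = screen.get_height()
    screen.fill((0, 0, 0))
    for i, mag in enumerate(spectrum[: screen.get_width() // width_of_col]):
        h = int(min(height, mag * scale / 10000))    # crude scaling for display
        if h > skip_under:
            pygame.draw.rect(screen, (0, 255, 0),
                             (i * width_of_col, height - h, width_of_col, h))
    pygame.display.flip()

# Inside the while loop one might then call:
#     stream.write(temp)  # play the chunk
#     draw_spectrum(screen, temp, width_of_col, scale, skip_under)
#     clock.tick(max_fps)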
def GoRun(self):
    # main button callback for collecting new data
    self.Status = True
    self.fs = 44100  # sample rate, defaults to 44100
    iters = 1000  # (mostly) deprecated
    chunkSize = 8192  # number of samples to read in at once
    windowSize = 3  # number of seconds to plot at once
    numSamples = iters * chunkSize

    # set up an audio stream
    p = pyaudio.PyAudio()
    audioStream = p.open(format=pyaudio.paInt16,
                         channels=1,
                         rate=self.fs,
                         input=True,
                         frames_per_buffer=chunkSize)

    # empty out the recording
    self.Recording = np.zeros(numSamples, dtype=np.int16)
    self.Formants = np.zeros((100, 5), dtype=np.float32)
    self.FormantTime = np.zeros(100, dtype=np.float32)
    self.Pitch = np.zeros(100, dtype=np.float32)
    self.PitchTime = np.zeros(100, dtype=np.float32)
    FormantCount = 0
    PitchCount = 0

    # set up our axes
    ax = self.RawPlot.figure.add_subplot(111)
    f0ax = self.FundamentalFrequenncyPlot.figure.add_subplot(111)
    f0ax.tick_params(axis='x', which='both', bottom=False, top=False,
                     labelbottom=False)
    f0ax.set_position([0.35, 0.05, 0.6, 0.93])
    formantAx = self.FormantPlot.figure.add_subplot(111)
    tractAx = self.VocalTractPlot.figure.add_subplot(111)
    tractAx.tick_params(axis='x', which='both', bottom=False, top=False,
                        labelbottom=False)
    tractAx.set_position([0.35, 0.05, 0.6, 0.93])
    tractAx.set_ylabel('Vocal Tract Length (cm)')
    tractAx.set_ylim((0, 25))
    tractAx.set_xlim((0, 0.8))

    c = 34300  # speed of sound in cm/s
    maxPitchLag = 3
    maxVocalLag = 3
    ds_rate = 3

    # set up time vector
    print('Beginning New Recording')
    time = np.linspace(0, numSamples / self.fs, numSamples)
    i = 0
    try:  # using try/except to enable keyboard interrupt
        start = ti.time()
        while self.Status:  # keep going forever, or until keyboard interrupt
            t = (i + 1) * chunkSize
            if t > len(self.Recording):  # add space to the recording if necessary
                extraSpace = np.zeros(numSamples, dtype=np.int16)
                self.Recording = np.concatenate([self.Recording, extraSpace],
                                                axis=None)
                time = np.linspace(0, len(self.Recording) / self.fs,
                                   len(self.Recording))

            # pull a chunk from our audio stream
            data = PyAudioTest.getChunk(chunkSize, audioStream, Random=0)
            data_ds = data[0:chunkSize:ds_rate]  # downsampled data
            # it is generally a good idea to lowpass filter before downsampling,
            # but to save computational time this is skipped here;
            # our data is ~mostly~ band-limited, so this shouldn't be a huge problem

            # add chunk to our recording
            self.Recording[i * chunkSize:(i + 1) * chunkSize] = data

            # get f0 and update f0 plot
            # use my hack method for getting f0
            #clipData = PyAudioTest.centerClip(data)
            #acf = PyAudioTest.autocorr(clipData)
            #f0 = PyAudioTest.getF0(acf, self.fs)

            # use yin implementation instead
            # yin's original implementation called for filtering,
            # which we have not yet implemented for computational reasons
            data_hamming = data * np.hamming(chunkSize)
            df = yin.differenceFunction(data_hamming, chunkSize, self.fs / 75)
            cmndf = yin.cumulativeMeanNormalizedDifferenceFunction(df, len(df))
            f0 = yin.getPitch(cmndf, self.fs / 500, self.fs / 75, harmo_th=0.35)

            if f0:  # if f0 is detected, update our graph
                # store our pitch and time
                self.Pitch[PitchCount] = 1.0 * self.fs / f0
                self.PitchTime[PitchCount] = 1.0 * (t - chunkSize / 2) / self.fs
                PitchCount += 1
                # add space if needed
                if PitchCount >= len(self.PitchTime):
                    self.Pitch = np.concatenate(
                        (self.Pitch, np.zeros(200, dtype=np.float32)))
                    self.PitchTime = np.concatenate(
                        (self.PitchTime, np.zeros(200, dtype=np.float32)))

                # get pitches from the last 3 seconds
                RecentPitches = []
                pitchIDX = PitchCount - 1
                while self.PitchTime[pitchIDX] >= 1.0 * (t - chunkSize / 2) / self.fs - maxPitchLag and pitchIDX >= 0:
                    RecentPitches.append(self.Pitch[pitchIDX])
                    pitchIDX -= 1

                # get mean and std
                meanPitch = np.mean(RecentPitches)
                if len(RecentPitches) == 1:
                    stdPitch = 25
                else:
                    stdPitch = np.std(RecentPitches)

                # plot
                f0ax.bar([0], [2.0 * stdPitch], bottom=[meanPitch - stdPitch])
                f0ax.set_ylabel('Fundamental Frequency (Hz)')
                f0ax.set_ylim((0, 500))
                f0ax.set_xlim((0, 0.8))
                self.FundamentalFrequenncyPlot.draw()

            formantAx.clear()
            formantAx.hold(True)
            if f0:  # if f0 is detected, search for formants
                # make the PSD
                fBins, PSD = sp.signal.periodogram(data_ds, self.fs / ds_rate)
                PSD = 20 * np.log10(PSD)  # convert to dB
                try:
                    # look for formants using the LPC method
                    Formants = FormantFinder.findFormantsLPC(data_ds,
                                                             self.fs / ds_rate)
                    for f in range(len(Formants)):
                        # plot the formants as vertical lines
                        formantAx.plot([Formants[f], Formants[f]], [-100, 75],
                                       color='red')
                    formantAx.plot(fBins, PSD)
                    formantAx.set_title('Power Spectrum - Formants')
                    formantAx.set_xlabel('Frequency (Hz)')
                    formantAx.set_ylabel('Power (dB)')
                    formantAx.set_ylim((-90, 90))
                    formantAx.set_xlim((0, 5000))
                    '''
                    formantAx.bar(range(len(Formants)), Formants)
                    formantAx.set_xlabel('Formant number')
                    formantAx.set_ylabel('Frequency (Hz)')
                    formantAx.set_title('Formants Frequencies')
                    formantAx.set_xlim((0, 4.8))
                    formantAx.set_ylim((0, 5000))
                    formantAx.set_xticks([0.4, 1.4, 2.4, 3.4, 4.4])
                    formantAx.set_xticklabels(['F1', 'F2', 'F3', 'F4', 'F5'])
                    '''
                    self.FormantPlot.draw()
                    formantAx.hold(False)

                    # store formants
                    if len(Formants) >= 5:
                        self.Formants[FormantCount, 0:5] = Formants[0:5]
                    else:
                        self.Formants[FormantCount, 0:len(Formants)] = Formants
                    self.FormantTime[FormantCount] = 1.0 * (t - chunkSize / 2) / self.fs
                    FormantCount += 1
                    # add space if needed
                    if FormantCount >= len(self.FormantTime):
                        self.Formants = np.concatenate(
                            (self.Formants, np.zeros((200, 5), dtype=np.float32)))
                        self.FormantTime = np.concatenate(
                            (self.FormantTime, np.zeros(200, dtype=np.float32)))

                    # collect recent vocal tract length estimates
                    RecentTractLength = []
                    tractIDX = FormantCount - 1
                    while self.FormantTime[tractIDX] >= 1.0 * (t - chunkSize / 2) / self.fs - maxVocalLag and tractIDX >= 0:
                        RecentTractLength.append(
                            FormantFinder.getVocalTractLength(
                                self.Formants[tractIDX, :], c, method='lammert'))
                        tractIDX -= 1

                    # get mean, std
                    meanTractLength = np.median(RecentTractLength)
                    if len(RecentTractLength) == 1:
                        stdTractLength = 2
                    else:
                        stdTractLength = np.std(RecentTractLength)

                    # plot bar
                    tractAx.bar([0], [2 * stdTractLength],
                                bottom=[meanTractLength - stdTractLength])
                    tractAx.set_ylabel('Vocal Tract Length (cm)')
                    tractAx.set_ylim((0, 25))
                    tractAx.set_xlim((0, 0.8))
                    self.VocalTractPlot.draw()
                except RuntimeError:
                    # formant detection can throw errors sometimes
                    Formants = np.zeros(3)
            else:  # if no f0, basically do nothing
                fBins = np.linspace(0, self.fs / 2, 10)
                PSD = np.zeros(10)

            # update our raw data plot, but only every third chunk,
            # because it is time consuming
            if t > windowSize * self.fs and i % 3 == 0:
                ax.plot(time[t - windowSize * self.fs:t],
                        self.Recording[t - windowSize * self.fs:t])
                ax.set_title('Raw Waveform')
                ax.set_xlabel('Time (s)')
                ax.set_ylabel('amplitude')
                self.RawPlot.draw()

            i += 1
            # check for incoming button clicks, i.e. the stop button
            QtCore.QCoreApplication.processEvents()

    except (KeyboardInterrupt, SystemExit):
        # in case of a keyboard interrupt or system exit, clean house
        self.FormantPlot.draw()
        self.RawPlot.draw()
        self.FundamentalFrequenncyPlot.draw()
        self.Pitch = self.Pitch[0:PitchCount]
        self.PitchTime = self.PitchTime[0:PitchCount]
        self.Formants = self.Formants[0:FormantCount, :]
        self.FormantTime = self.FormantTime[0:FormantCount]
        print('Recording Completed')
        self.Recording = self.Recording[0:t]
        print('recorded time is')
        print(1.0 * t / self.fs)
        print('elapsed time is:')
        print(ti.time() - start)
        return True

    self.Pitch = self.Pitch[0:PitchCount]
    self.PitchTime = self.PitchTime[0:PitchCount]
    self.Formants = self.Formants[0:FormantCount, :]
    self.FormantTime = self.FormantTime[0:FormantCount]
    print('Recording Completed')
    self.Recording = self.Recording[0:t]
    print('recorded time is')
    print(1.0 * t / self.fs)
    print('elapsed time is:')
    print(ti.time() - start)
    return True
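# GoRun estimates vocal tract length through FormantFinder.getVocalTractLength,
# which is not shown in this file. For reference, the sketch below implements the
# classic quarter-wavelength (closed-tube) estimate; it is an illustration of the
# general idea, not the 'lammert' method selected above, and the function name is
# made up here.
import numpy as np

def vocal_tract_length_quarter_wave(formants_hz, c=34300.0):
    """Estimate vocal tract length (cm) from formant frequencies (Hz).

    Models the tract as a uniform tube closed at the glottis, whose resonances
    fall at F_k = (2k - 1) * c / (4 * L); solve for L at each formant and average.
    """
    formants = np.asarray([f for f in formants_hz if f > 0], dtype=float)
    k = np.arange(1, len(formants) + 1)
    lengths = (2 * k - 1) * c / (4.0 * formants)
    return float(np.mean(lengths))

# Example: typical adult formants near 500, 1500, 2500 Hz give roughly 17 cm.
# print(vocal_tract_length_quarter_wave([500.0, 1500.0, 2500.0]))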