def audio_features_generator_with_buffer(input_filename, speech_features_params,
                                         target_sr, int_values, chunk_duration):
    """Yield speech features computed over a sliding three-chunk audio buffer.

    The newest chunk is written into the last third of a rolling buffer so
    each feature computation sees up to two chunks of left context.

    Args:
        input_filename: Path to the audio file to read.
        speech_features_params: Parameters forwarded to get_speech_features.
        target_sr: Sample rate the audio is resampled to.
        int_values: Forwarded to get_audio_chunk_from_soundfile (integer PCM
            decoding flag — confirm semantics against that helper).
        chunk_duration: Duration of each chunk in seconds.

    Yields:
        (audio_features, start, end) — features for the current buffer,
        whether this is the first chunk, and whether it is the last.
    """
    sf = soundfile.SoundFile(input_filename, 'rb')
    try:
        chunk_size = int(chunk_duration * sf.samplerate)
        start = True
        end = False
        # Rolling buffer: 3 chunks wide, new audio lands in the last third.
        audio_signal = np.zeros(shape=3 * chunk_size, dtype=np.float32)
        while not end:
            audio_signal[-chunk_size:], end = get_audio_chunk_from_soundfile(
                sf, chunk_size, int_values)
            audio_segment = AudioSegment(audio_signal, sf.samplerate, target_sr)
            audio_features, features_length = get_speech_features(
                audio_segment.samples, target_sr, speech_features_params)
            yield audio_features, start, end
            start = False
            # Shift the buffer left by one chunk to make room for the next read.
            audio_signal[:-chunk_size] = audio_signal[chunk_size:]
    finally:
        # BUG FIX: the original closed the file only after a complete run.
        # try/finally also closes it when the consumer abandons the generator
        # (GeneratorExit) or an exception escapes mid-stream.
        sf.close()
def generate_audio_signal(self):
    """Record microphone audio between two Enter key presses.

    A background thread watches stdin: the first Enter switches the state to
    "capture" (recording starts), the second to "release" (recording stops).

    Returns:
        A 1-D numpy array of captured samples, or an empty list if nothing
        was recorded (matches the original function's empty-case return).
    """
    # chunk_size = int(self.chunk_duration*self.target_sr)
    chunk_size = int(0.2 * self.target_sr)  # fixed 200 ms read size
    self.recording_state = "init"

    def keyboard_listener():
        # First Enter starts capture, second Enter stops it.
        input("Press Enter to start and end recording...")
        self.recording_state = "capture"
        print("Recording...")
        input("")
        self.recording_state = "release"

    listener = threading.Thread(target=keyboard_listener)
    listener.start()

    chunks = []
    stream = None
    while self.recording_state != "release":
        try:
            if self.recording_state == "capture":
                if stream is None:
                    # Lazily open the input stream on first captured chunk.
                    stream = self.p.open(
                        format=pa.paInt16,
                        channels=1,
                        rate=self.target_sr,
                        input=True,
                        input_device_index=self.input_device_id,
                        frames_per_buffer=chunk_size)
                # Read one chunk of raw PCM from the microphone.
                raw = stream.read(chunk_size)
                samples = np.frombuffer(raw, dtype=np.int16)
                segment = AudioSegment(samples, self.target_sr, self.target_sr)
                chunks.append(segment.samples)
        except Exception as e:
            print(e)
            break

    # BUG FIX: the original called stream.close() unconditionally, which
    # raised NameError when recording was never started (stream undefined).
    if stream is not None:
        stream.close()
    self.p.terminate()
    # Single concatenate replaces the original per-chunk concatenation
    # (which was O(n^2) in total copied samples); empty case returns []
    # exactly as before.
    return np.concatenate(chunks) if chunks else []
def audio_generator_from_file(input_filename, target_sr, int_values, chunk_duration):
    """Yield fixed-duration, resampled audio chunks from a file.

    Args:
        input_filename: Path to the audio file to read.
        target_sr: Sample rate the audio is resampled to.
        int_values: Forwarded to get_audio_chunk_from_soundfile (integer PCM
            decoding flag — confirm semantics against that helper).
        chunk_duration: Duration of each chunk in seconds.

    Yields:
        (samples, target_sr, start, end) — the resampled chunk, its sample
        rate, whether this is the first chunk, and whether it is the last.
    """
    sf = soundfile.SoundFile(input_filename, 'rb')
    try:
        chunk_size = int(chunk_duration * sf.samplerate)
        start = True
        end = False
        while not end:
            audio_signal, end = get_audio_chunk_from_soundfile(
                sf, chunk_size, int_values)
            audio_segment = AudioSegment(audio_signal, sf.samplerate, target_sr)
            yield audio_segment.samples, target_sr, start, end
            start = False
    finally:
        # BUG FIX: the original closed the file only after full iteration.
        # try/finally guarantees the handle is released even if the consumer
        # abandons the generator or an exception is raised mid-stream.
        sf.close()
) #parse_transcript(transcript_text, labels_map, blank_index)) # convert to vocab indices # Read the audio files # Group requests in batches audio_idx = 0 last_request = False predictions = [] while not last_request: batch_audio_samples = [] batch_filenames = [] for idx in range(FLAGS.batch_size): filename = filenames[audio_idx] print("Reading audio file: ", filename) audio = AudioSegment.from_file(filename, offset=0, duration=FLAGS.fixed_size).samples if FLAGS.fixed_size: audio = np.resize(audio, FLAGS.fixed_size) audio_idx = (audio_idx + 1) % len(filenames) if audio_idx == 0: last_request = True batch_audio_samples.append(audio) batch_filenames.append(filename) predictions += speech_client.recognize(batch_audio_samples, batch_filenames) if transcripts: