def audiosegment_google_speech(audio, filename, sample_rate, lang='ms'):
    """Transcribe one audio segment with Google Speech API and archive it.

    Writes ``audio`` to a temporary WAV at ``filename``, runs Google speech
    recognition on it, and — when any text was recognized — saves the text
    and a 16 kHz resampled copy of the audio under ``output-text/`` /
    ``output-wav/``.  The temporary WAV is always removed at the end.

    Parameters
    ----------
    audio : object with an ``.array`` attribute (int samples)
        Segment to transcribe (presumably a malaya_speech frame — TODO confirm).
    filename : str
        Temp WAV path; also reused as the basename under ``output-wav/``.
    sample_rate : int
        Sample rate of ``audio.array``.
    lang : str
        Language code passed to ``recognize_google`` (default Malay).

    Returns
    -------
    bool
        ``False`` if ``output-wav/<filename>`` already exists (already
        processed), ``True`` otherwise.
    """
    # Skip segments that were already processed in a previous run.
    if os.path.exists('output-wav/' + filename):
        return False
    sf.write(filename, audio.array, sample_rate)
    try:
        with sr.AudioFile(filename) as source:
            a = r.record(source)
        text = r.recognize_google(a, language=lang)
    except Exception:
        # Best-effort transcription: treat any recognition failure
        # (UnknownValueError, RequestError, ...) as "no text".  Was a bare
        # `except:`, which would also swallow KeyboardInterrupt/SystemExit.
        text = ''
    if text:
        # NOTE(review): '(unknown)' looks like a redacted/templated name from
        # the original script — every segment writes to the same file; verify
        # against the caller.  (f-prefix dropped: the string has no fields.)
        text_filename = 'output-text/(unknown).txt'
        with open(text_filename, 'w') as fopen:
            fopen.write(text)
        # Archive a float 16 kHz copy of the audio next to the transcript.
        a = malaya_speech.resample(
            malaya_speech.astype.int_to_float(audio.array), sample_rate, 16000)
        sf.write('output-wav/' + filename, a, 16000)
    # Always drop the temporary WAV, transcribed or not.
    os.remove(filename)
    return True
def split(file, max_duration=10.0):
    """Denoise an MP3 and split it into VAD-bounded segments.

    Loads ``file`` as mono, runs it through the ``p_noise`` pipeline, then
    uses WebRTC VAD decisions computed on a 16 kHz copy to cut the
    original-rate audio into chunks of at most ``max_duration`` seconds.

    Returns a tuple ``(results, audio, frame_rate)`` where ``results`` is a
    list of sample arrays (one per split) at the file's native frame rate.
    """
    print(file)
    # Force mono; frame_rate stays whatever the MP3 was encoded at.
    audio = AudioSegment.from_mp3(file).set_channels(1)
    y = np.array(audio.get_array_of_samples())
    y = malaya_speech.astype.int_to_float(y)
    # p_noise presumably denoises and returns named stages; 'concatenate'
    # looks like the recombined signal — TODO confirm against the pipeline.
    y = p_noise(y)['concatenate']
    y_int = malaya_speech.astype.float_to_int(y)
    # 16 kHz int copy just for the VAD (WebRTC VAD wants 8/16/32 kHz).
    y_ = malaya_speech.resample(y_int, audio.frame_rate, 16000).astype(int)
    # 30 ms frames at the native rate (these are what get returned) ...
    frames = generator.frames(y, 30, audio.frame_rate)
    # ... and matching 30 ms frames at 16 kHz for the VAD decisions.
    frames_ = generator.frames(y_, 30, 16000, append_ending_trail=False)
    # Pair native-rate frame `no` with the VAD verdict of its 16 kHz twin.
    # NOTE(review): relies on both frame lists lining up index-for-index;
    # `append_ending_trail=False` on the 16 kHz side presumably keeps
    # len(frames_) <= len(frames) — verify.
    frames_webrtc = [(frames[no], vad(frame)) for no, frame in enumerate(frames_)]
    splitted = split_vad_duration(
        frames_webrtc,
        max_duration=max_duration,
        negative_threshold=0.1,
    )
    results = [s.array for s in splitted]
    return results, audio, audio.frame_rate
def parallel(f):
    """Load a WAV, take a random 1000-sample crop, and return it paired
    with a copy downsampled by the module-level ``reduction_factor``.

    Returns ``(low_rate_audio, original_audio)``.
    """
    original = read_wav(f)[0]
    original = random_sampling(original, length=1000)
    reduced = malaya_speech.resample(original, sr, sr // reduction_factor)
    return reduced, original
def downsample(y, sr, down_sr):
    """Round-trip ``y`` through ``down_sr``: resample down, then back to
    ``sr``.  Simulates the quality loss of a lower sample rate while
    keeping the original rate."""
    return malaya_speech.resample(
        malaya_speech.resample(y, sr, down_sr), down_sr, sr)
# NOTE(review): the next statement is the *tail* of a pipeline expression
# whose opening (the assignment and the call taking `frame_duration_ms=30`)
# lies before this chunk — do not edit in isolation.
frame_duration_ms=30).batching(20).foreach_map(
    model_v2.predict).flatten())

from glob import glob

# Process every MP3 in the working directory.  The bare `mp3s` expression
# below suggests this was exported from a notebook cell.
mp3s = glob('*.mp3')
mp3s

for file in mp3s:
    print(file)
    # NOTE(review): the matching `except` for this `try` is past the end of
    # this chunk — the loop body is incomplete from this view.
    try:
        audio = AudioSegment.from_mp3(file)
        sample_rate = audio.frame_rate
        samples = np.array(audio.get_array_of_samples())
        samples = malaya_speech.astype.int_to_float(samples)
        # 16 kHz copy fed to the VAD pipeline `p`; native-rate frames are
        # what actually get written out.
        samples_16k = malaya_speech.resample(samples, sample_rate, 16000)
        frames_16k = list(
            malaya_speech.utils.generator.frames(samples_16k, 30, 16000))
        frames = list(
            malaya_speech.utils.generator.frames(samples, 30, sample_rate))
        # `p` is the pipeline whose definition starts above this chunk.
        result = p.emit(samples_16k)
        # Pair each native-rate frame with the model's per-frame VAD score;
        # assumes result['flatten'] lines up index-for-index with `frames`
        # — TODO confirm.
        frames_deep_v2_batch = [(frame, result['flatten'][no])
                                for no, frame in enumerate(frames)]
        results = malaya_speech.split.split_vad(frames_deep_v2_batch,
                                                n=5,
                                                negative_threshold=0.1)
        for no in tqdm(range(len(results))):
            result = results[no]
            # NOTE(review): every split overwrites the same 'test.wav' —
            # looks like debugging leftovers; verify intent.
            sf.write('test.wav', result.array, sample_rate)