def audio2phoneme(audio_file):
    """Decode a WAV file and return its recognized segments with timings.

    The clip duration is computed from the WAV header, the file is then
    streamed through a decoder built from the module-level ``config``, and
    every decoder segment's frame range is rescaled to seconds.

    Args:
        audio_file: Path to a WAV file readable by the ``wave`` module.

    Returns:
        A list of ``(word, start_seconds, end_seconds)`` tuples, one per
        decoder segment. Times are measured relative to the start frame of
        the first segment. The list is empty when the decoder reports no
        frames.
    """
    wave_read = wave.open(audio_file, 'rb')
    try:
        length = wave_read.getnframes() / wave_read.getframerate()
    finally:
        # Release the file handle even if the header queries fail.
        wave_read.close()

    # Decode streaming data.
    # NOTE(review): the file is read raw, so the WAV header bytes are fed to
    # the decoder along with the samples -- confirm this is acceptable.
    decoder = Decoder(config)
    buf = bytearray(1024)
    with open(audio_file, 'rb') as f:
        decoder.start_utt()
        while True:
            n_read = f.readinto(buf)
            if not n_read:
                break
            # The buffer is reused, so after a short (final) read its tail
            # still holds bytes from the previous chunk; pass only what was
            # actually read.
            decoder.process_raw(buf[:n_read], False, False)
        decoder.end_utt()

    nframes = decoder.n_frames()
    if not nframes:
        # Nothing was decoded; avoid dividing by zero below.
        return []

    phonemes = []
    offset = None
    for seg in decoder.seg():
        if offset is None:
            # Shift all timings so the first segment starts at zero.
            offset = seg.start_frame
        start_frame = seg.start_frame - offset
        end_frame = seg.end_frame - offset
        phonemes.append((
            seg.word,
            start_frame / nframes * length,
            end_frame / nframes * length,
        ))
    return phonemes
def process_file(self, audiofile):
    """Decode an audio file and return the recognized text.

    The file is read in 1024-byte chunks and fed to a decoder created from
    ``self.config``. The hypothesis, its scores, every non-silence segment
    and the average acoustic score of those segments are printed.

    Args:
        audiofile: Path of the audio file to decode.

    Returns:
        The hypothesis string, or ``None`` when the decoder produced no
        hypothesis.
    """
    with open(audiofile, 'rb') as stream:
        decoder = Decoder(self.config)
        decoder.start_utt()
        while True:
            buf = stream.read(1024)
            if not buf:
                break
            decoder.process_raw(buf, False, False)
        decoder.end_utt()

    hyp = decoder.hyp()
    # Single-argument print calls behave the same under Python 2 and 3.
    print("Hyp: %s" % (hyp,))
    if hyp is None:
        return None

    print("Hyp Score %s" % ((hyp.prob, hyp.best_score),))
    average_score = 0
    seg_count = 0
    for seg in decoder.seg():
        if seg.word != "<sil>":
            seg_count += 1
            average_score += seg.ascore
            print((seg.word, seg.ascore, seg.lscore))
    print("hyp: %s" % (hyp.hypstr,))
    if seg_count:
        # Skip the average when every segment was silence; dividing by a
        # zero count would raise. float() keeps the result a true average
        # regardless of interpreter version.
        print(average_score / float(seg_count))
    return hyp.hypstr
def audio2phoneme(audio_file):
    """Decode a WAV file and return its recognized segments with timings.

    The clip duration is computed from the WAV header, the file is then
    streamed through a decoder built from the module-level ``config``, and
    every decoder segment's frame range is rescaled to seconds.

    Args:
        audio_file: Path to a WAV file readable by the ``wave`` module.

    Returns:
        A list of ``(word, start_seconds, end_seconds)`` tuples, one per
        decoder segment. Times are measured relative to the start frame of
        the first segment. The list is empty when the decoder reports no
        frames.
    """
    wave_read = wave.open(audio_file, 'rb')
    try:
        length = wave_read.getnframes() / wave_read.getframerate()
    finally:
        # Release the file handle even if the header queries fail.
        wave_read.close()

    # Decode streaming data.
    # NOTE(review): the file is read raw, so the WAV header bytes are fed to
    # the decoder along with the samples -- confirm this is acceptable.
    decoder = Decoder(config)
    buf = bytearray(1024)
    with open(audio_file, 'rb') as f:
        decoder.start_utt()
        while True:
            n_read = f.readinto(buf)
            if not n_read:
                break
            # The buffer is reused, so after a short (final) read its tail
            # still holds bytes from the previous chunk; pass only what was
            # actually read.
            decoder.process_raw(buf[:n_read], False, False)
        decoder.end_utt()

    nframes = decoder.n_frames()
    if not nframes:
        # Nothing was decoded; avoid dividing by zero below.
        return []

    phonemes = []
    offset = None
    for seg in decoder.seg():
        if offset is None:
            # Shift all timings so the first segment starts at zero.
            offset = seg.start_frame
        start_frame = seg.start_frame - offset
        end_frame = seg.end_frame - offset
        phonemes.append((
            seg.word,
            start_frame / nframes * length,
            end_frame / nframes * length,
        ))
    return phonemes
class PocketSphinxASREngine(ASREngine):
    """ASR engine backed by PocketSphinx.

    See https://pypi.org/project/pocketsphinx/
    """

    # Segment labels that are filtered out of the transcript.
    _SPECIAL_TOKENS = ('<sil>', '<s>', '</s>')

    def __init__(self):
        """Build a decoder configured with the bundled en-us model files."""
        # https://github.com/cmusphinx/pocketsphinx-python/blob/master/example.py
        model_path = get_model_path()
        config = Decoder.default_config()
        # os.devnull instead of a hard-coded '/dev/null' so the log sink
        # also resolves on non-POSIX platforms.
        config.set_string('-logfn', os.devnull)
        config.set_string('-hmm', os.path.join(model_path, 'en-us'))
        config.set_string('-lm', os.path.join(model_path, 'en-us.lm.bin'))
        config.set_string('-dict', os.path.join(model_path, 'cmudict-en-us.dict'))

        self._decoder = Decoder(config)

    def transcribe(self, path):
        """Transcribe the audio file at ``path`` and return the text.

        Args:
            path: Audio file to read with ``soundfile.read``.

        Returns:
            The recognized words joined by single spaces, with the special
            tokens removed and non-alphabetic characters stripped from each
            word.

        Raises:
            ValueError: If the file's sample rate is not 16000 Hz.
        """
        pcm, sample_rate = soundfile.read(path)
        # Explicit check rather than ``assert`` so it still runs under -O.
        if sample_rate != 16000:
            raise ValueError(
                'expected a 16000 Hz sample rate, got %s' % (sample_rate,))

        # Assumes soundfile.read yields float samples in [-1.0, 1.0] --
        # scaling by the int16 maximum relies on that; TODO confirm.
        pcm = (np.iinfo(np.int16).max * pcm).astype(np.int16).tobytes()

        self._decoder.start_utt()
        try:
            self._decoder.process_raw(pcm, no_search=False, full_utt=True)
        finally:
            # The decoder is reused across calls, so always close the
            # utterance; otherwise a failure here would leave it open for
            # the next transcribe() call.
            self._decoder.end_utt()

        words = []
        for seg in self._decoder.seg():
            word = seg.word
            # Remove special tokens.
            if word in self._SPECIAL_TOKENS:
                continue
            # Keep alphabetic characters only.
            words.append(''.join(ch for ch in word if ch.isalpha()))

        return ' '.join(words)

    def __str__(self):
        """Return the engine's display name."""
        return 'PocketSphinx'
rate=16000, input=True, frames_per_buffer=1024) stream.start_stream() print('start...') while True: buf = stream.read(1024) if buf: decoder.process_raw(buf, False, False) else: break hypothesis = decoder.hyp() if hypothesis: print('\nhypothesis: %s, score: %d' % (hypothesis.hypstr, hypothesis.best_score)) print([(seg.word, seg.prob, seg.start_frame, seg.end_frame) for seg in decoder.seg()]) print("Detected keyword, restarting search") os.system('mpg123 ' + os.path.join(script_dir, 'hi.mp3')) print('restart...') decoder.end_utt() decoder.start_utt() print('ok') # break stream.close()
stream = open(sys.argv[1], "rb") else: p = pyaudio.PyAudio() stream = p.open(format=pyaudio.paInt16, channels=1, rate=16000, input=True, frames_per_buffer=1024) stream.start_stream() print('start...') while True: buf = stream.read(1024) if buf: decoder.process_raw(buf, False, False) else: break hypothesis = decoder.hyp() if hypothesis: print('\nhypothesis: %s, score: %d' % (hypothesis.hypstr, hypothesis.best_score)) print ([(seg.word, seg.prob, seg.start_frame, seg.end_frame) for seg in decoder.seg()]) print ("Detected keyword, restarting search") os.system('mpg123 ' + os.path.join(script_dir, 'hi.mp3')) print('restart...') decoder.end_utt() decoder.start_utt() print('ok') # break stream.close()