class TestRawDecoder(TestCase):
    """Exercise the default Pocketsphinx decoder against the bundled raw audio."""

    def __init__(self, *args, **kwargs):
        # Decode once up front so every test method reuses the same result.
        self.ps = Pocketsphinx()
        self.ps.decode()
        super(TestRawDecoder, self).__init__(*args, **kwargs)

    def test_raw_decoder_lookup_word(self):
        # A known word maps to its phoneme string; an unknown word yields None.
        self.assertEqual(self.ps.lookup_word('hello'), 'HH AH L OW')
        self.assertIsNone(self.ps.lookup_word('abcdf'))

    def test_raw_decoder_hypothesis(self):
        # Best hypothesis text plus its score and posterior confidence.
        self.assertEqual(self.ps.hypothesis(), 'go forward ten meters')
        self.assertEqual(self.ps.score(), -7066)
        self.assertEqual(self.ps.confidence(), 0.04042641466841839)

    def test_raw_decoder_segments(self):
        expected = ['<s>', '<sil>', 'go', 'forward', 'ten', 'meters', '</s>']
        self.assertEqual(self.ps.segments(), expected)

    def test_raw_decoder_best_hypothesis(self):
        # Ten-best list of (hypothesis, score) pairs, best first.
        expected = [
            ('go forward ten meters', -28034),
            ('go for word ten meters', -28570),
            ('go forward and majors', -28670),
            ('go forward and meters', -28681),
            ('go forward and readers', -28685),
            ('go forward ten readers', -28688),
            ('go forward ten leaders', -28695),
            ('go forward can meters', -28695),
            ('go forward and leaders', -28706),
            ('go for work ten meters', -28722),
        ]
        self.assertEqual(self.ps.best(), expected)
class TestRawDecoder(TestCase):
    """Checks word lookup, hypothesis, segmentation and n-best output of a
    default-configured Pocketsphinx decoder run on the bundled raw audio."""

    def __init__(self, *args, **kwargs):
        self.ps = Pocketsphinx()
        self.ps.decode()  # run the decode once; tests only read its results
        super(TestRawDecoder, self).__init__(*args, **kwargs)

    def test_raw_decoder_lookup_word(self):
        self.assertEqual('HH AH L OW', self.ps.lookup_word('hello'))
        self.assertEqual(None, self.ps.lookup_word('abcdf'))

    def test_raw_decoder_hypothesis(self):
        self.assertEqual('go forward ten meters', self.ps.hypothesis())
        self.assertEqual(-7066, self.ps.score())
        self.assertEqual(0.04042641466841839, self.ps.confidence())

    def test_raw_decoder_segments(self):
        # Segments include the sentence markers and leading silence.
        self.assertEqual(self.ps.segments(),
                         '<s> <sil> go forward ten meters </s>'.split())

    def test_raw_decoder_best_hypothesis(self):
        texts = ['go forward ten meters',
                 'go for word ten meters',
                 'go forward and majors',
                 'go forward and meters',
                 'go forward and readers',
                 'go forward ten readers',
                 'go forward ten leaders',
                 'go forward can meters',
                 'go forward and leaders',
                 'go for work ten meters']
        scores = [-28034, -28570, -28670, -28681, -28685,
                  -28688, -28695, -28695, -28706, -28722]
        self.assertEqual(self.ps.best(), list(zip(texts, scores)))
class SpeechProcessor:
    """Spanish speech recognition (Pocketsphinx + JSGF grammar) combined with
    IBM Watson text-to-speech synthesis.

    Recognition uses the CIEMPIESS Spanish models and then switches the
    decoder to a finite-state grammar built from ``grammar``.
    """

    def __init__(self,
                 hmm='data/spanish/CIEMPIESS_Spanish_Models_581h/Models/modelo',
                 lm='data/spanish/CIEMPIESS_Spanish_Models_581h/Models/leng.lm.bin',
                 # NOTE(review): 'dict' shadows the builtin, but renaming it
                 # would break callers passing it by keyword, so it is kept.
                 dict='data/spanish/CIEMPIESS_Spanish_Models_581h/Models/dicc.dic',
                 grammar='data/gramatica-tp2.gram',
                 dataPath='tmp/'):
        self.data_path = dataPath
        config = {
            'hmm': hmm,
            'lm': lm,
            'dict': dict,
        }
        self.ps = Pocketsphinx(**config)

        # Switch the decoder from the n-gram LM to the JSGF grammar search.
        jsgf = Jsgf(grammar)
        rule = jsgf.get_rule('tp2.grammar')
        fsg = jsgf.build_fsg(rule, self.ps.get_logmath(), 7.5)
        self.ps.set_fsg('tp2', fsg)
        self.ps.set_search('tp2')

        # Speech synthesis (IBM Watson).
        # SECURITY: the IAM API key should come from the environment or a
        # secrets store, not be hard-coded in source. Flagged for rotation.
        self.tts_authenticator = IAMAuthenticator('cq9_4YcCXxClw2AfgUhbokFktZ-xSRT4kcHS2akcZ05J')
        self.tts = TextToSpeechV1(authenticator=self.tts_authenticator)
        self.tts.set_service_url('https://stream.watsonplatform.net/text-to-speech/api')

    def sintetizar(self, outFileName, msg):
        """Synthesize ``msg`` to the WAV file ``outFileName``.

        Does nothing when ``msg`` is empty, so no empty audio file is created.
        """
        if not msg:  # idiomatic truthiness check instead of len(msg) > 0
            return
        with open(outFileName, 'wb') as audio_file:
            audio_file.write(
                self.tts.synthesize(
                    msg,
                    voice='es-LA_SofiaV3Voice',
                    accept='audio/wav'
                ).get_result().content)

    def reconocer(self, inFileName='audio.wav'):
        """Decode ``inFileName`` (relative to ``data_path``).

        Returns a tuple ``(segments, best)`` where ``best`` is the 3-best
        list of (hypothesis, score) pairs.
        """
        print(self.data_path)
        self.ps.decode(
            audio_file=os.path.join(self.data_path, inFileName),
            buffer_size=2048,
            no_search=False,
            full_utt=False
        )
        return self.ps.segments(), self.ps.best(count=3)
class TestPhoneme(TestCase):
    """Phoneme-level decoding using the all-phone English model only
    (word LM and dictionary disabled)."""

    def __init__(self, *args, **kwargs):
        # lm/dic are disabled so the decoder searches over phonemes alone.
        self.ps = Pocketsphinx(
            lm=False,
            dic=False,
            allphone='deps/pocketsphinx/model/en-us/en-us-phone.lm.bin',
            lw=2.0,
            pip=0.3,
            beam=1e-200,
            pbeam=1e-20,
            mmap=False,
        )
        self.ps.decode()
        super(TestPhoneme, self).__init__(*args, **kwargs)

    def test_phoneme_hypothesis(self):
        self.assertEqual(
            'SIL G OW F AO R W ER D T AE N M IY IH ZH ER Z S V SIL',
            self.ps.hypothesis())

    def test_phoneme_best_phonemes(self):
        # Segments are the same phonemes as the hypothesis, as a list.
        expected = 'SIL G OW F AO R W ER D T AE N M IY IH ZH ER Z S V SIL'.split()
        self.assertEqual(self.ps.segments(), expected)
class TestPhoneme(TestCase):
    """All-phone decoding test: word LM and dictionary are switched off and
    only the phoneme LM drives the search."""

    def __init__(self, *args, **kwargs):
        decoder_args = dict(
            lm=False,
            dic=False,
            allphone='deps/pocketsphinx/model/en-us/en-us-phone.lm.bin',
            lw=2.0,
            pip=0.3,
            beam=1e-200,
            pbeam=1e-20,
            mmap=False,
        )
        self.ps = Pocketsphinx(**decoder_args)
        self.ps.decode()
        super(TestPhoneme, self).__init__(*args, **kwargs)

    def test_phoneme_hypothesis(self):
        self.assertEqual(self.ps.hypothesis(),
                         'SIL G OW F AO R D T AE N NG IY ZH ER S SIL')

    def test_phoneme_best_phonemes(self):
        # Same phoneme sequence as the hypothesis, but tokenized into a list.
        self.assertEqual(self.ps.segments(),
                         'SIL G OW F AO R D T AE N NG IY ZH ER S SIL'.split())
data_path = get_data_path()

# Stock US-English acoustic model, language model and dictionary.
# NOTE(review): model_path is assumed to be defined earlier in the file.
config = dict(
    hmm=os.path.join(model_path, 'en-us'),
    lm=os.path.join(model_path, 'en-us.lm.bin'),
    dict=os.path.join(model_path, 'cmudict-en-us.dict'),
)

ps = Pocketsphinx(**config)
ps.decode(
    audio_file=os.path.join(data_path, 'output.raw'),
    buffer_size=2048,
    no_search=False,
    full_utt=False,
)

# Plain word sequence, e.g.
# ['<s>', '<sil>', 'go', 'forward', 'ten', 'meters', '</s>']
print(ps.segments())

# One (word, prob, start_frame, end_frame) tuple per segment, e.g.
# ('<s>', 0, 0, 24), ('<sil>', -3778, 25, 45), ('go', -27, 46, 63),
# ('forward', -38, 64, 116), ('ten', -14105, 117, 152),
# ('meters', -2152, 153, 211), ('</s>', 0, 212, 260)
print('Detailed segments:', *ps.segments(detailed=True), sep='\n')

print(ps.hypothesis())   # e.g. go forward ten meters
print(ps.probability())  # e.g. -32079
print(ps.score())        # e.g. -7066
print(ps.confidence())   # e.g. 0.04042641466841839
model_path = get_model_path()
data_path = get_data_path()

# Stock acoustic model combined with a custom LM/dictionary pair.
config = {
    'hmm': os.path.join(model_path, 'en-us'),
    'lm': r'C:\Users\BZT\Desktop\speech_segment\5446.lm',
    'dict': r'C:\Users\BZT\Desktop\speech_segment\5446.dic',
}
ps = Pocketsphinx(**config)

ps.decode(
    audio_file=
    r'C:\Users\BZT\Desktop\speech_segment\speech_segment\Ses01F_impro01_M013.wav',
    buffer_size=2048,
    no_search=False,
    full_utt=False)

# Dump one tab-separated line per segment:
# word <TAB> prob <TAB> start_frame <TAB> end_frame
for segment in ps.segments(detailed=True):
    for field in segment:
        print(field, end='\t')
    print()

print(ps.hypothesis())  # e.g. 'go forward ten meters'
class Sphinx(Thread):
    """Background thread that loads a French Pocketsphinx model and turns
    audio files into text annotated with silence-duration tags.

    The model is loaded in run(); callers must wait for ``self.ready``
    before using to_text().
    """

    def __init__(self):
        Thread.__init__(self)
        # Flipped to True once run() has finished loading the model.
        self.ready = False

    def run(self):
        print_important("Info! Thread sphinx started.")
        self.config = {
            'verbose': True,
            'hmm': os.path.join('s2m', 'core', 'sphinx', 'fr'),
            'lm': os.path.join('s2m', 'core', 'sphinx', 'fr.lm.dmp'),
            'dict': os.path.join('s2m', 'core', 'sphinx', 's2m.dict'),
            'jsgf': os.path.join('s2m', 'core', 'sphinx', 's2m.jsgf'),
        }
        self.pocketsphinx = Pocketsphinx(**self.config)
        self.ready = True

    def get_silence(self, duration):
        """Map a silence duration (in units of the average word duration)
        to one of five discrete silence tags."""
        if duration < 0.25:
            return '[veryshortsil]'
        elif duration < 0.5:
            return '[shortsil]'
        elif duration < 1.5:
            return '[sil]'
        elif duration < 3.:
            return '[longsil]'
        else:
            return '[verylongsil]'

    def get_segment_string(self, segments):
        """Render decoder segments as words interleaved with silence tags.

        Silence lengths are normalized by the average spoken-word duration
        before being mapped through get_silence(). Returns '' when no real
        words were decoded.
        """
        segment_list = []
        last_silence = 0
        spoken_duration = 0
        word_count = 0
        for segment in segments:
            if segment.word in ['<s>', '</s>']:
                continue  # sentence markers carry no content
            elif segment.word == '<sil>':
                # Accumulate consecutive silence frames into one gap.
                last_silence += segment.end_frame - segment.start_frame
            else:
                if last_silence > 0:
                    segment_list.append(last_silence)
                    last_silence = 0
                spoken_duration += segment.end_frame - segment.start_frame
                segment_list.append(segment.word)
                word_count += 1
        if word_count == 0:
            return ''
        avg_word_duration = spoken_duration / word_count
        # Ints in segment_list are silence gaps; everything else is a word.
        return ' '.join((self.get_silence(s / avg_word_duration)
                         if isinstance(s, int)
                         else nobrackets(s))
                        for s in segment_list)

    def to_text(self, filename, erase=False):
        """Decode ``filename``; return (segment_string, nbest_alternatives).

        Raises EnvironmentError if the model has not finished loading.
        When ``erase`` is true the audio file is deleted afterwards.
        """
        if not self.ready:
            raise EnvironmentError('Initialization of sphinx not finished.')
        FILLER_WORDS = ['<s>', '<sil>', '</s>']
        try:
            self.pocketsphinx.decode(filename)
        except Exception as e:
            # NOTE(review): deliberately best-effort — on decode failure we
            # fall through and report whatever state the decoder holds.
            print("An error was raised by sphinx while decoding file '%r', parsing aborted" % filename)
        text = " ".join(
            [s for s in self.pocketsphinx.segments() if s not in FILLER_WORDS])
        text = nobrackets(text)
        segment_string = self.get_segment_string(self.pocketsphinx.seg())
        nbest = [nobrackets(w[0]) for w in self.pocketsphinx.best(count=10)[1:]]
        if erase:
            # BUGFIX: was os.remove(loc) — 'loc' is undefined (NameError);
            # the file to delete is the decoded input file.
            os.remove(filename)
        return segment_string, nbest
data_path = get_data_path()

# Consistency fix: the original mixed Python-2 print statements with
# print() calls; print() with a single argument is valid in both 2 and 3.
print("config set")
# NOTE(review): model_path is assumed to be defined earlier in the file.
config = {
    'hmm': os.path.join(model_path, 'en-us'),
    'lm': os.path.join(model_path, 'en-us.lm.bin'),
    'dict': os.path.join(model_path, 'cmudict-en-us.dict')
}
ps = Pocketsphinx(**config)

print("decoding")  # BUGFIX: debug message typo, was "decoging"
ps.decode(audio_file=sys.argv[1],
          buffer_size=2048,
          no_search=False,
          full_utt=False)
print("decoded")

# Plain word sequence, e.g.
# ['<s>', '<sil>', 'go', 'forward', 'ten', 'meters', '</s>']
print(ps.segments())

print("detailed segments")  # cleaned up unprofessional debug wording
# One (word, prob, start_frame, end_frame) tuple per segment, e.g.
# ('<s>', 0, 0, 24), ('<sil>', -3778, 25, 45), ('go', -27, 46, 63),
# ('forward', -38, 64, 116), ('ten', -14105, 117, 152),
# ('meters', -2152, 153, 211), ('</s>', 0, 212, 260)
print(ps.segments(detailed=True))

#print(ps.hypothesis())    # => go forward ten meters
#print(ps.probability())   # => -32079
buffer_size=2048, no_search=False, full_utt=False ) #print(ps.segments()) #save the detailed segments of the words, #which will contain details word, probablity, start_time and end_time #print('Detailed segments:', *ps.segments(detailed=True), sep='\n') # with open('output_segments_obama_farewell_speech.txt', 'a') as f: # print(*ps.segments(detailed=True), sep='\n', file=f) with open(filename_output_segments, 'a') as f: print(*ps.segments(detailed=True), sep='\n', file=f) #convert from audio to text and save text = ps.hypothesis() file1 = open(filename_sphinx,"w")#write mode file1.write(text) file1.close() #load into dataframe # For the above saved file, modify manually by removing '(',')',' and then save as modified fie #df = pd.read_csv('output_segments_donaldTrump_modified.txt', sep=",", header=None) df = pd.read_csv(filename_output_segments_mod, sep=",", header=None) df.columns = ["word", "prob","startTime", "endTime"] df.head()
#instantiate PyAudio p = pyaudio.PyAudio() #open stream stream = p.open(format = 8, channels = 1, rate = 16000, output = True) #read data data = f.read(chunk*2) frames = [] phones = {} phoneme_pred = ps.segments(detailed=True) fps = 100 for idx,s in enumerate(phoneme_pred): if (idx < len(phoneme_pred)-1): phones[phoneme_pred[idx+1][2]/fps] = phoneme_pred[idx][0] else: phones[phoneme_pred[idx][3]/fps] = phoneme_pred[idx][0] mouth_list = [] pre_mouth_shape = 'M' #lip sync style intensity variation if (opt.level == 0): intensity = 0.2 elif (opt.level == 1):