Exemple #1
0
def audio2phoneme(audio_file):
    """Decode a WAV file and return the decoder's segments with timings.

    The recording's duration is taken from the WAV header, the audio is
    streamed through a ``Decoder`` built from the module-level ``config``,
    and every segment yielded by ``decoder.seg()`` is converted from
    decoder frame indices to seconds.

    Args:
        audio_file: Path of the WAV file to decode.

    Returns:
        A list of ``(word, start, end)`` tuples. ``start`` and ``end`` are
        in seconds and are measured relative to the start frame of the
        first segment. The list is empty when the decoder reports zero
        frames.
    """
    frames_per_chunk = 1024

    wave_read = wave.open(audio_file, 'rb')
    try:
        length = wave_read.getnframes() / wave_read.getframerate()

        # Decode streaming data.
        decoder = Decoder(config)

        # Read the samples through the WAV reader instead of the raw file:
        # this keeps the RIFF header out of the audio passed to the decoder
        # and hands over exactly the bytes that were read, so a short final
        # chunk never carries leftovers from the previous one.
        decoder.start_utt()
        while True:
            chunk = wave_read.readframes(frames_per_chunk)
            if not chunk:
                break
            decoder.process_raw(chunk, False, False)
        decoder.end_utt()
    finally:
        # Release the file even when decoding fails part-way.
        wave_read.close()

    nframes = decoder.n_frames()
    if not nframes:
        # Without any decoder frames there is no timeline to scale against.
        return []

    phonemes = []
    offset = None
    for seg in decoder.seg():
        # The first segment defines time zero for all reported timings.
        if offset is None:
            offset = seg.start_frame
        start_frame = seg.start_frame - offset
        end_frame = seg.end_frame - offset
        # Scale decoder frame indices onto the recording's duration.
        phonemes.append((seg.word, start_frame / nframes * length,
                         end_frame / nframes * length))

    return phonemes
Exemple #2
0
    def process_file(self, audiofile):
        """
        Decode an audio file and return the recognized text.

        The file is read in 1024-byte chunks and passed to a new
        ``Decoder`` created from ``self.config``. The hypothesis, its
        scores and the per-segment scores are printed as diagnostics.

        :param audiofile: path of the audio file to decode
        :return: the hypothesis string, or ``None`` when the decoder
            produced no hypothesis
        """
        # A distinct name for the handle keeps the path argument intact.
        with open(audiofile, 'rb') as stream:
            decoder = Decoder(self.config)
            decoder.start_utt()

            while True:
                buf = stream.read(1024)
                if not buf:
                    break
                decoder.process_raw(buf, False, False)
            decoder.end_utt()

        # Single-argument print calls give the same output whether print
        # is a statement or a function.
        hyp = decoder.hyp()
        print("Hyp: %s" % (hyp,))

        if hyp is None:
            return None

        print("Hyp Score %s" % ((hyp.prob, hyp.best_score),))
        average_score = 0
        seg_count = 0
        for seg in decoder.seg():
            # Silence segments are left out of the average.
            if seg.word != "<sil>":
                seg_count += 1
                average_score += seg.ascore
                print(seg.word, seg.ascore, seg.lscore)

        print("hyp: %s" % (hyp.hypstr,))
        # An utterance made only of silence has nothing to average;
        # skip the line rather than divide by zero.
        if seg_count:
            print(average_score / seg_count)
        return hyp.hypstr
Exemple #3
0
def audio2phoneme(audio_file):
    """Run the decoder over a WAV file and time-stamp its segments.

    The duration comes from the WAV header. The samples are streamed into
    a ``Decoder`` created from the module-level ``config``, and each
    segment from ``decoder.seg()`` is mapped from decoder frame indices
    to seconds.

    Args:
        audio_file: Path of the WAV file to decode.

    Returns:
        A list of ``(word, start, end)`` tuples with ``start`` and ``end``
        in seconds, counted from the start frame of the first segment.
        Empty when the decoder reports zero frames.
    """
    frames_per_chunk = 1024

    wave_read = wave.open(audio_file, 'rb')
    try:
        length = wave_read.getnframes()/wave_read.getframerate()

        # Decode streaming data.
        decoder = Decoder(config)

        # Pull samples from the WAV reader rather than the raw file so the
        # decoder sees neither the RIFF header nor stale bytes left in a
        # reused buffer after a short final read.
        decoder.start_utt()
        chunk = wave_read.readframes(frames_per_chunk)
        while chunk:
            decoder.process_raw(chunk, False, False)
            chunk = wave_read.readframes(frames_per_chunk)
        decoder.end_utt()
    finally:
        # Close the reader on both the success and the failure path.
        wave_read.close()

    nframes = decoder.n_frames()
    if not nframes:
        # No decoder frames means there is nothing to scale timings by.
        return []

    phonemes = []
    offset = None
    for seg in decoder.seg():
        # Timings are reported relative to the first segment's start.
        if offset is None:
            offset = seg.start_frame
        start_frame = seg.start_frame - offset
        end_frame = seg.end_frame - offset
        # Convert decoder frame indices to seconds of the recording.
        phonemes.append((
            seg.word, start_frame/nframes*length, end_frame/nframes*length))

    return phonemes
Exemple #4
0
class PocketSphinxASREngine(ASREngine):
    """ASR engine that transcribes audio files with PocketSphinx.

    See https://pypi.org/project/pocketsphinx/
    """

    # Sample rate, in Hz, that transcribe() accepts.
    _SAMPLE_RATE = 16000

    # Segment labels that mark silence or utterance boundaries, not words.
    _SPECIAL_TOKENS = frozenset(('<sil>', '<s>', '</s>'))

    def __init__(self):
        """Create a decoder from the en-us files under get_model_path()."""
        # https://github.com/cmusphinx/pocketsphinx-python/blob/master/example.py
        model_path = get_model_path()

        config = Decoder.default_config()
        # os.devnull rather than a literal '/dev/null' so the log is also
        # discarded on platforms where that path does not exist.
        config.set_string('-logfn', os.devnull)
        config.set_string('-hmm', os.path.join(model_path, 'en-us'))
        config.set_string('-lm', os.path.join(model_path, 'en-us.lm.bin'))
        config.set_string('-dict',
                          os.path.join(model_path, 'cmudict-en-us.dict'))

        self._decoder = Decoder(config)

    def transcribe(self, path):
        """Transcribe the audio file at ``path``.

        Args:
            path: Path of an audio file readable by ``soundfile.read``.

        Returns:
            The recognized words joined by single spaces.

        Raises:
            ValueError: If the file's sample rate is not 16000 Hz.
        """
        pcm, sample_rate = soundfile.read(path)
        # An explicit check instead of ``assert`` so it still runs when
        # Python is started with -O.
        if sample_rate != self._SAMPLE_RATE:
            raise ValueError(
                'expected a sample rate of %d Hz, got %s'
                % (self._SAMPLE_RATE, sample_rate))
        # Scale to 16-bit integers and serialize for the decoder.
        # NOTE(review): assumes soundfile.read returned floats in
        # [-1, 1] -- confirm.
        pcm = (np.iinfo(np.int16).max * pcm).astype(np.int16).tobytes()

        self._decoder.start_utt()
        self._decoder.process_raw(pcm, no_search=False, full_utt=True)
        self._decoder.end_utt()

        words = []
        for seg in self._decoder.seg():
            # Remove special tokens.
            if seg.word in self._SPECIAL_TOKENS:
                continue

            # Keep letters only (presumably to strip markers such as a
            # trailing "(2)" -- confirm against the dictionary format).
            word = ''.join(ch for ch in seg.word if ch.isalpha())

            # A label with no letters would add a stray space; skip it.
            if word:
                words.append(word)

        return ' '.join(words)

    def __str__(self):
        """Return the display name of this engine."""
        return 'PocketSphinx'
Exemple #5
0
                    rate=16000,
                    input=True,
                    frames_per_buffer=1024)
    stream.start_stream()

print('start...')

# Feed the decoder 1024 units at a time and report every hypothesis it
# produces. The try/finally guarantees the stream is closed even when the
# loop is left through an exception (e.g. Ctrl-C while reading).
try:
    while True:
        buf = stream.read(1024)
        if not buf:
            # An empty read means the input is exhausted.
            break
        decoder.process_raw(buf, False, False)

        hypothesis = decoder.hyp()
        if not hypothesis:
            continue

        print('\nhypothesis: %s, score: %d' %
              (hypothesis.hypstr, hypothesis.best_score))
        print([(seg.word, seg.prob, seg.start_frame, seg.end_frame)
               for seg in decoder.seg()])
        print("Detected keyword, restarting search")
        # NOTE(review): the path is concatenated into a shell command
        # unquoted; a script_dir containing spaces would break playback.
        os.system('mpg123 ' + os.path.join(script_dir, 'hi.mp3'))

        # End the current utterance and start a new one so the search
        # begins again from scratch.
        print('restart...')
        decoder.end_utt()
        decoder.start_utt()
        print('ok')
finally:
    stream.close()
    stream = open(sys.argv[1], "rb")
else:
    p = pyaudio.PyAudio()
    stream = p.open(format=pyaudio.paInt16, channels=1, rate=16000, input=True, frames_per_buffer=1024)
    stream.start_stream()


print('start...')

# Pass the input to the decoder in reads of 1024 and print each hypothesis
# as soon as one is available. The stream is closed in a finally clause so
# it is released even if the loop is interrupted by an exception.
try:
    while True:
        buf = stream.read(1024)
        if not buf:
            # Nothing more to read.
            break
        decoder.process_raw(buf, False, False)

        hypothesis = decoder.hyp()
        if hypothesis:
            print('\nhypothesis: %s, score: %d' % (hypothesis.hypstr, hypothesis.best_score))
            print([(seg.word, seg.prob, seg.start_frame, seg.end_frame) for seg in decoder.seg()])
            print("Detected keyword, restarting search")
            # NOTE(review): script_dir is spliced into a shell command
            # without quoting; paths with spaces will not play.
            os.system('mpg123 ' + os.path.join(script_dir, 'hi.mp3'))

            # Close the current utterance and open a fresh one to restart
            # the search.
            print('restart...')
            decoder.end_utt()
            decoder.start_utt()
            print('ok')
finally:
    stream.close()