def start():
    speech = Pygsr()
    # duration in seconds
    speech.record(4)

    output = "log.csv"
    input = "audio.flac"
    if input[-5:] != '.flac':
        input += '*.flac'

    files = glob.glob(input)
    for flac in files:
        print "opening %s:" % flac
        valid_result = False
        tries = 0
        with open(flac, 'rb') as f:
            result = GoogleSpeechAPI(f)
            print "Audio is %.03f seconds long" % result.length
            f.seek(0)
            with Timer() as t:
                result.start()
            print "Result took %.03f sec" % t.interval
            print result.result
            out = ''.join(result.result)
            split3 = ""
            try:
                split1 = [x.strip() for x in out.split(':')]
                split2 = split1[3]
                split3 = [x.strip() for x in split2.split(',')]
                # [1:-1] removes the double quotes from the string
                print split3[0][1:-1]
                return split3[0][1:-1]
            except Exception, e:
                print "Exception: %s" % e
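# start() above relies on a Timer context manager that is not shown in the
# snippet. A minimal sketch of such a helper, assuming the common
# wall-clock recipe rather than the original implementation:
import time


class Timer(object):
    def __enter__(self):
        self.start = time.time()
        return self

    def __exit__(self, *args):
        # expose the elapsed time as `interval`, as read via t.interval above
        self.interval = time.time() - self.start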
def witAI():
    # load pygsr, a Python Google speech recognition wrapper
    speech = Pygsr()
    raw_input("Ready ?")
    speech.record(5)  # duration in seconds
    try:
        # select the language
        phrase, complete_response = speech.speech_to_text('en_US')
    except:
        phrase = "Tell me a joke !"
    print phrase
    # if not ascii, curl crashes
    phrase = phrase.strip().replace(" ", "%20").encode('ascii')
    # Wit.ai curl URL
    curl_url = "https://api.wit.ai/message?q=%s" % phrase
    # auth headers
    curl_header = ["Authorization: Bearer YW3P2YITCYYXGHVLMIE7R7G7BBJODBG4"]
    # debug
    print curl_url
    answer = get_curl(curl_url, curl_header)
    print answer
    result = json.loads(answer)  # parse answer
    return result
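# witAI() calls a get_curl() helper that is not defined in the snippet.
# A minimal sketch of what it could look like using pycurl; the body below
# is an assumption inferred from the call site, not the original code:
import pycurl
from io import BytesIO


def get_curl(url, headers):
    # perform a GET request and return the response body as a string
    buf = BytesIO()
    curl = pycurl.Curl()
    curl.setopt(pycurl.URL, url)
    curl.setopt(pycurl.HTTPHEADER, headers)
    curl.setopt(pycurl.WRITEFUNCTION, buf.write)
    curl.perform()
    curl.close()
    return buf.getvalue()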
def srprocess(self, threadname):
    speech = Pygsr()
    speech.record(3)
    try:
        phrase, complete_response = speech.speech_to_text('ar_AE')
    except:
        phrase = ''
    global do_flage
    do_flage = False
    self.save_text(phrase + ' ')
class HearingModule:
    def __init__(self):
        self.speech = Pygsr()

    def listen(self, t):
        self.speech.record(t)
        phrase, complete_response = self.speech.speech_to_text('en_EN')
        return phrase

    def connected(self):
        if self.speech:
            return True
        return False
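# A minimal usage sketch for HearingModule; the five-second duration is
# illustrative and error handling is left out for brevity:
hearing = HearingModule()
if hearing.connected():
    phrase = hearing.listen(5)
    print phrase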
class Recorder:
    def __init__(self):
        global logger
        logger = logging.getLogger(__name__)
        self.speech = Pygsr()

    def record_command(self):
        '''Records audio and sends it to google to translate to text.'''
        self.speech.record(settings.RECORD_LENGTH)
        result = self.speech.speech_to_text()
        line = None
        if result:
            line = result[0].lower()
        if not line:
            logger.warn('No command recorded.')
            return None
        logger.info(line)
        command = Command(line)
        logger.info(command)
        return command
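# A minimal usage sketch for Recorder; settings.RECORD_LENGTH and the
# Command class come from the surrounding project and are assumed to be
# importable here:
import logging

logging.basicConfig(level=logging.INFO)
recorder = Recorder()
command = recorder.record_command()
if command is not None:
    print command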
def listen(self):
    speech = Pygsr()
    speech.record(2)
    phrase, complete_response = speech.speech_to_text('en_IN')
    print phrase
def _voice_input(self, duration):
    speech = Pygsr()
    speech.record(duration)
    phrase, complete_response = speech.speech_to_text(self.lang)
    return phrase
from pygsr import Pygsr
import sys
import os


def set_proc_name(newname):
    from ctypes import cdll, byref, create_string_buffer
    libc = cdll.LoadLibrary('libc.so.6')
    buff = create_string_buffer(len(newname) + 1)
    buff.value = newname
    # 15 == PR_SET_NAME from <linux/prctl.h>
    libc.prctl(15, byref(buff), 0, 0, 0)


def get_proc_name():
    from ctypes import cdll, byref, create_string_buffer
    libc = cdll.LoadLibrary('libc.so.6')
    buff = create_string_buffer(128)
    # 16 == PR_GET_NAME from <linux/prctl.h>
    libc.prctl(16, byref(buff), 0, 0, 0)
    return buff.value


set_proc_name('Mwave_gspeech')
speech = Pygsr()
speech.record(2)  # duration in seconds
phrase, complete_response = speech.speech_to_text('en-US')  # select the language
os.remove('audio')
os.remove('audio.flac')
print phrase
from pygsr import Pygsr

speech = Pygsr()
# duration in seconds (3)
speech.record(3)
# select the language and obtain the result
phrase, complete_response = speech.speech_to_text('es_ES')
print phrase
    f.write(opener.open(request).read())
    f.close()
    Popen(['mplayer', 'data.mp3', '-really-quiet']).wait()
    # os.system('mplayer -ao alsa -noconsolecontrols data.mp3')


if __name__ == '__main__':
    x = Record()
    x.setup()
    recorded = False
    response = None
    while response != 'exit':
        x.read()
        rms = audioop.rms(x.data, 2)
        print rms
        if rms > x.threshold:
            speech = Pygsr()
            speech.record(5)
            phrase, complete_response = speech.speech_to_text('en_EN')
            response = x.custom(phrase)
            if response == False:
                response = x.Wolfram(phrase)
            if response == False:
                response = x.cleverbot(phrase)
            print('PHRASEPHRASEPHRASE')
            print(phrase)
            print(response)
            x.speak(response)
            recorded = True
            rms = 0
            x.setup()
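# The loop above starts recording when the RMS of a chunk exceeds
# x.threshold. A hedged sketch of one way to pick that threshold by
# sampling ambient noise first; `stream` stands in for any object with a
# read(chunk) method (e.g. a PyAudio stream) and is an assumption:
import audioop


def calibrate_threshold(stream, chunk=1024, samples=50, margin=1.5):
    # measure ambient RMS over a number of chunks and return a value a
    # little above the loudest reading
    readings = [audioop.rms(stream.read(chunk), 2) for _ in range(samples)]
    return int(max(readings) * margin)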
#!/usr/bin/python
from pygsr import Pygsr

speech = Pygsr()
speech.record(3)
phrase, complete_response = speech.speech_to_text('de_DE')
print phrase
from pygsr import Pygsr

speech = Pygsr()
speech.record(3)  # duration in seconds (3)
phrase, complete_response = speech.speech_to_text('es_ES')  # select the language
print phrase
def __init__(self):
    Pygsr.__init__(self)
    self.active = False
    self.count_silence = 0
    return
from pygsr import Pygsr

speech = Pygsr()
# duration in seconds
speech.record(3)
# select the language
(phrase, complete_response) = speech.speech_to_text('en_US')
print(phrase)
#!/usr/bin/env python
# coding: utf-8
from pygsr import Pygsr

speech = Pygsr()
speech.record(3)  # duration in seconds (3)
response = speech.speech_to_text('es_ES')  # select the language
print response
def __init__(self):
    global logger
    logger = logging.getLogger(__name__)
    self.speech = Pygsr()
def __init__(self):
    self.speech = Pygsr()
# Imports inferred from the usage below; `lib` is a project-local module
# providing overflow, overflow_hist, smooth and pairwise.
import datetime
import logging
import wave
from os import path

import numpy as np
import matplotlib.pyplot as plt

from pygsr import Pygsr
import lib


def main(wavFileName):
    ####################################################################
    # wavFileName = "/Users/toine/Documents/speech_recognition/sound/sample/test.wav"
    wavFile = wave.open(wavFileName)
    (nchannels, sampwidth, framerate, nframes, comptype, compname) = wavFile.getparams()
    frames = wavFile.readframes(-1)
    npFrames = np.fromstring(frames, "Int16")

    ####################################################################
    # compute the spectrogram
    # make sure the FFT size is not too big, for good accuracy
    nFft = 64
    nOverlap = 32
    fftWindow = nFft - nOverlap
    specgramFramerate = framerate / (fftWindow)

    # TODO: check if this is needed
    # pad the input for a perfect FFT match
    # npFrames = np.r_[npFrames, np.zeros(nFft - nframes % nFft)]

    # spectrogram, returns (Pxx, freqs, bins, im)
    #   bins are the time points the spectrogram is calculated over
    #   freqs is an array of frequencies
    #   Pxx is an array of shape (len(times), len(freqs)) of power
    #   im is an AxesImage instance
    (Pxx, freqs, bins, im) = plt.specgram(npFrames, Fs=framerate, NFFT=nFft, noverlap=nOverlap)
    # plt.show()
    plt.clf()

    ####################################################################
    # extract the voice frequencies
    # voice frequency range is roughly 300Hz to 3500Hz; build a mask
    # vector over these frequencies: voiceArray is 0, except 1 inside
    # the voice frequency range
    f300Ind = lib.overflow(freqs, 300)
    f3500Ind = lib.overflow(freqs, 3500)
    voiceArray = np.zeros(len(freqs))
    voiceArray[f300Ind:f3500Ind] = 1

    # dot product of the specgram with the mask
    voiceFreq = np.transpose(np.dot(np.transpose(Pxx), voiceArray))

    ####################################################################
    # compute the interesting minimums based on local minimums and a threshold
    # TODO: consider using the mlab/numpy function
    histData = plt.hist(voiceFreq, bins=100, range=(min(voiceFreq), np.mean(voiceFreq)))
    # plt.show()
    plt.clf()

    overflowPercent = 0.7
    overflowIndex = lib.overflow_hist(histData[0], overflowPercent)
    overflowValue = histData[1][overflowIndex]

    # smooth the curve to find the minimums
    voiceFreqSmooth = lib.smooth(voiceFreq, 128)
    minimums = np.r_[True, voiceFreqSmooth[1:] < voiceFreqSmooth[:-1]] & \
               np.r_[voiceFreqSmooth[:-1] < voiceFreqSmooth[1:], True]

    # TODO: change name
    # create the array of cutting points: local minimums under the histogram threshold
    cutPoints = np.where(minimums & (voiceFreqSmooth < overflowValue))[0]

    ####################################################################
    # filter the minimums by roughly selecting one every few seconds
    #   on npFrames, 5 sec = framerate * 5
    #   on voiceFreq, framerate -> framerate/32
    avgSec = 3
    cutPointsNSec = [0]
    for pt in cutPoints:
        # convert to the npFrames framerate by multiplying with fftWindow
        pt *= fftWindow
        if (pt - cutPointsNSec[-1]) > (framerate * avgSec):  # subtract the last value
            cutPointsNSec.append(pt)

    ####################################################################
    # create the cuts as additional files
    cutPointsNSecInSec = [(x / framerate) for x in cutPointsNSec]

    timestamp = []
    timestampNFrames = []
    for item1, item2 in lib.pairwise(cutPointsNSec, fillvalue=0):
        timestamp.append((item1, item2))
        timestampNFrames.append(item2 - item1)

    # generate the extension to the filename, e.g. filename.X_Y.wav for a cut from seconds X to Y
    addExtension = []
    timestampInSec = []
    for item1, item2 in lib.pairwise(cutPointsNSecInSec, fillvalue="end"):
        tmp = str(item1) + "_" + str(item2)
        timestampInSec.append((item1, item2))
        addExtension.append(tmp)

    logger = logging.getLogger(__name__)
    logger.debug("%s %s %s", timestamp, timestampNFrames, addExtension)
    logger.debug("%s %s %s", len(timestamp), len(timestampNFrames), len(addExtension))

    # test on 1 file first
    # for (cutExt, cutTime, cutFrame) in zip(timestamp, timestampNFrames, addExtension):
    totalRes = []
    TESTINDEX = 6
    # TODO: take care of the last index, when cutPointNSecInSec is "end"
    for TESTINDEX in range(len(timestamp) - 1):
        # TODO: make a lib function out of that
        splitName = path.basename(wavFileName).split(".")
        filename = path.dirname(wavFileName) + "/" + splitName[0] + "." + addExtension[TESTINDEX] + "." + splitName[1]

        wavChunk = wave.open(filename, "w")
        wavChunk.setparams((nchannels, sampwidth, framerate, timestampNFrames[TESTINDEX], comptype, compname))
        wavChunk.writeframes(npFrames[timestamp[TESTINDEX][0]:timestamp[TESTINDEX][1]].tostring())
        wavChunk.close()

        pygsr = Pygsr(filename)
        pygsr.convert()
        res = pygsr.speech_to_text("en", indx=TESTINDEX)
        totalRes.append(res)

        logger.debug("%s %s %s", TESTINDEX, addExtension[TESTINDEX], timestamp[TESTINDEX])
        h1 = str(datetime.timedelta(seconds=timestampInSec[TESTINDEX][0])) + ",200"
        h2 = str(datetime.timedelta(seconds=timestampInSec[TESTINDEX][1] - 1)) + ",800"
        logger.info("%s", TESTINDEX)
        logger.info("%s --> %s", h1, h2)
        logger.info("%s", res)
        logger.info("")

    # logger.debug("this should not appear in the srt file")
    logger.debug("%s", totalRes)
    return 1
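# main() leans on a project-local `lib` module (overflow, overflow_hist,
# smooth, pairwise) whose source is not shown. Hedged sketches of two of
# those helpers, inferred from how they are called above rather than from
# the original implementation:
from itertools import tee, izip_longest

import numpy as np


def pairwise(iterable, fillvalue=None):
    # (a, b, c) -> (a, b), (b, c), (c, fillvalue)
    first, second = tee(iterable)
    next(second, None)
    return izip_longest(first, second, fillvalue=fillvalue)


def overflow(array, value):
    # index of the first element of a sorted array that reaches `value`
    return int(np.searchsorted(array, value))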