def input_usr_utt_nblist(self): """Reads an N-best list of utterances from the input. """ self.init_readline() nblist = UtteranceNBList() i = 1 while i < 100: l = raw_input("User %d: " % i) try: l = l.decode('utf8') except: # if we use ipdb, it already gives us UTF-8-encoded input :-( pass if l.startswith("."): print break try: prob, da = self.parse_input_utt(l) except TextHubException as e: print e continue nblist.add(prob, da) i += 1 nblist.merge() nblist.scale() nblist.add_other() self.write_readline() return nblist
def test_parse_with_mutliple_date_rel(self):
    """A duplicated 'ZITRA' token must still produce one date_rel item."""
    hyp_list = UtteranceNBList()
    hyp_list.add(0.1, Utterance("CHTEL BYCH ZITRA ZITRA JET"))
    confnet = self.slu.parse(hyp_list)
    expected = DialogueActItem(dai="inform(date_rel=tomorrow)")
    self.assert_(expected in confnet)
def input_usr_utt_nblist(self): """Reads an N-best list of utterances from the input. """ self.init_readline() nblist = UtteranceNBList() i = 1 while i < 100: l = raw_input("User %d: " % i) l = l.decode('utf8') if self.f_output_script: self.f_output_script.write(l + '\n') if l.startswith("."): print break try: prob, da = self.parse_input_utt(l) except TextHubException as e: print e continue nblist.add(prob, da) i += 1 nblist.merge() nblist.scale() nblist.add_other() self.write_readline() return nblist
def hyp_out(self):
    """ This defines asynchronous interface for speech recognition.

    Returns recognizers hypotheses about the input speech audio.
    """
    started_at = time.time()

    # Finish decoding and pull the word lattice out of the decoder.
    self.decoder.prune_final()
    utt_lik, lat = self.decoder.get_lattice()  # returns acceptor (py)fst.LogVectorFst
    self.decoder.reset(keep_buffer_data=False)

    # Turn the lattice into an n-best list of utterances.
    hypotheses = lattice_to_nbest(lat, self.n_best)
    nblist = UtteranceNBList()
    for neg_logprob, word_ids in hypotheses:
        text = u' '.join([self.wst[wid] for wid in word_ids])
        if self.cfg['ASR']['Kaldi']['debug']:
            self.syslog.debug(text)
        nblist.add(exp(-neg_logprob), Utterance(text))

    if not hypotheses:
        nblist.add(1.0, Utterance('Empty hypothesis: Kaldi __FAIL__'))

    nblist.merge()

    if self.cfg['ASR']['Kaldi']['debug']:
        self.syslog.info('utterance "likelihood" is %f' % utt_lik)
        self.syslog.debug('hyp_out: get_lattice+nbest in %s secs' % str(time.time() - started_at))

    return nblist
def test_parse_with_mutliple_date_rel(self):
    """Repeating 'ZITRA' must not prevent recognizing date_rel=tomorrow."""
    nb = UtteranceNBList()
    nb.add(0.1, Utterance("CHTEL BYCH ZITRA ZITRA JET"))
    parsed = self.slu.parse(nb)
    self.assert_(DialogueActItem(dai="inform(date_rel=tomorrow)") in parsed)
def input_usr_utt_nblist(self): """Reads an N-best list of utterances from the input. """ self.init_readline() nblist = UtteranceNBList() i = 1 while i < 100: l = raw_input("User %d: " % i) try: l = l.decode('utf8') except: # if we use ipdb, it already gives us UTF-8-encoded input :-( pass if l.startswith("."): print break try: prob, da = self.parse_input_utt(l) except TextHubException as e: print e continue nblist.add(prob, da) i += 1 nblist.merge() nblist.scale() nblist.add_other() self.write_readline() return nblist
def input_usr_utt_nblist(self): """Reads an N-best list of utterances from the input. """ self.init_readline() nblist = UtteranceNBList() i = 1 while i < 100: l = raw_input("User %d: " % i) l = l.decode('utf8') if self.f_output_script: self.f_output_script.write(l + '\n') if l.startswith("."): print break try: prob, da = self.parse_input_utt(l) except TextHubException as e: print e continue nblist.add(prob, da) i += 1 nblist.merge() nblist.scale() nblist.add_other() self.write_readline() return nblist
def test_parse_meta(self):
    """Smoke-tests the SLU on one canonical utterance per 'meta' dialogue
    act (greetings, confirmations, task selection, transfer counts,
    alternative selection, ...).

    Each utterance is wrapped in a single-hypothesis n-best list and the
    expected dialogue act item must appear in the parsed output.
    """
    # (utterance, expected dialogue-act item) pairs.
    utterances_to_understand = [
        (u"ahoj", "hello()", ),
        (u"sbohem čau", "bye()", ),
        (u"jiné", "reqalts()", ),
        (u"začneme znovu", "restart()", ),
        (u"zopakuj", "repeat()", ),
        (u"promiň", "apology()", ),
        (u"co se zeptat", "help()", ),
        (u"haló", "canthearyou()", ),
        (u"nerozuměl jsem", "notunderstood()", ),
        (u"ano jo", "affirm()", ),
        (u"ne ano nechci", "negate()", ),
        (u"děkuji", "thankyou()", ),
        (u"dobře", "ack()", ),
        (u"chci jet", "inform(task=find_connection)", ),
        (u"jak bude", "inform(task=weather)", ),
        (u"nástupiště", "inform(task=find_platform)", ),
        (u"z jaké jede", "request(from_stop)", ),
        (u"kam to jede", "request(to_stop)", ),
        (u"kdy to jede", "request(departure_time)", ),
        (u"za jak dlouho", "request(departure_time_rel)", ),
        (u"kdy tam budem", "request(arrival_time)", ),
        (u"za jak dlouho tam přijedu", "request(arrival_time_rel)", ),
        (u"jak dlouho bude trvat cesta", "request(duration)", ),
        (u"kolik je hodin", "request(current_time)", ),
        (u"jak dlouho trvá přestup", "request(time_transfers)", ),
        (u"kolik přestupů", "request(num_transfers)", ),
        (u"nechci přestup bez jet přímo", "inform(num_transfers=0)", ),
        (u"jeden přestup", "inform(num_transfers=1)", ),
        (u"dva přestupy", "inform(num_transfers=2)", ),
        (u"tři přestupy", "inform(num_transfers=3)", ),
        (u"čtyři přestupy", "inform(num_transfers=4)", ),
        (u"libovolně přestupů", "inform(num_transfers=dontcare)", ),
        (u"jet přímo", "inform(num_transfers=0)", ),
        (u"alternativa libovolný", "inform(alternative=dontcare)", ),
        (u"alternativa první", "inform(alternative=1)", ),
        (u"alternativa druhá", "inform(alternative=2)", ),
        (u"alternativa třetí", "inform(alternative=3)", ),
        (u"alternativa čtvrtá", "inform(alternative=4)", ),
        (u"alternativa páté", "inform(alternative=5)", ),
        (u"předchozí spoj", "inform(alternative=prev)", ),
        (u"nechci předchozí spoj", "deny(alternative=prev)", ),
        (u"poslední spoj", "inform(alternative=last)", ),
        (u"nechci poslední spoj", "deny(alternative=last)", ),
        (u"další spoj", "inform(alternative=next)", ),
        (u"další", "inform(alternative=next)", ),
        (u"předchozí", "inform(alternative=prev)", ),
        (u"jako ve dne", "inform(ampm=pm)", ),
    ]

    for utt, res in utterances_to_understand:
        # Wrap the utterance in a trivial one-hypothesis n-best list.
        asr_hyp = UtteranceNBList()
        asr_hyp.add(0.79, Utterance(utt))
        cn = self.slu.parse(asr_hyp)
        self.assertIn(DialogueActItem(dai=res), cn)
def get_response(self, utt_text):
    """Processes one user utterance and returns the system response.

    NOTE(review): the middle of this method was corrupted in the source
    (a redaction artifact, ``'User: '******'utt_nbl': utt_nblist})``,
    replaced several statements).  The n-best-list construction below is
    a reconstruction of the obviously intended flow -- confirm against
    version-control history before relying on it.

    Returns:
        The dialogue manager's response, or None on any failure.
    """
    try:
        self.cfg['Logging']['system_logger'].info('User: ' + utt_text)
        # Wrap the 1-best text into a trivial n-best list for the SLU.
        utt_nblist = UtteranceNBList()
        utt_nblist.add(1.0, Utterance(utt_text))
        self.process_slu({'utt_nbl': utt_nblist})
        return self.process_dm()
    except Exception:
        self.cfg['Logging']['system_logger'].exception(
            'Uncaught exception in WTHUB process.')
        return None
def recognize(self, wav):
    """ Produces hypotheses for the input audio data.

    Remember that GoogleASR works only with complete wave files.

    Returns an n-best list of hypotheses.

    Bug fixed: on an HTTP/URL error the old code assigned a Python list
    to ``json_hypotheses``; ``json.loads`` then raised and the bare
    ``except:`` silently produced an EMPTY n-best list, so the intended
    '__google__ __asr__ __exception__' fallback hypothesis never
    appeared.  The fallback is now added directly.
    """
    # making a file temp for manipulation
    handle, flac_file_name = mkstemp('TmpSpeechFile.flac')

    try:
        # convert wav to flac
        audio.save_flac(self.cfg, flac_file_name, wav)
        json_hypotheses = self.get_asr_hypotheses(flac_file_name)
    except (urllib2.HTTPError, urllib2.URLError) as e:
        self.syslog.exception('GoogleASR HTTP/URL error: %s' % unicode(e))
        json_hypotheses = None
    finally:
        os.close(handle)
        remove(flac_file_name)

    nblist = UtteranceNBList()
    try:
        if json_hypotheses is None:
            # Network failure: emit the explicit fallback hypothesis.
            nblist.add(1.0, Utterance('__google__ __asr__ __exception__'))
        else:
            hyp = json.loads(json_hypotheses)
            if len(hyp['result']) > 0:
                hypotheses = hyp['result'][0]['alternative']
                n = len(hypotheses)
                for i, h in enumerate(hypotheses):
                    if i == 0:
                        nblist.add(h['confidence'], Utterance(h['transcript']))
                        conf1 = h['confidence']
                    else:
                        # guess the confX score: spread the remaining mass
                        # over the lower-ranked hypotheses.
                        nblist.add((1.0 - conf1) * (n - i) / (n - 1.0) / (n - 0.0) * 2.0,
                                   Utterance(h['transcript']))
    except Exception:
        # Narrowed from a bare ``except:``; log instead of hiding the error.
        self.syslog.exception('GoogleASR: cannot parse the response')
        nblist = UtteranceNBList()

    nblist.merge()
    nblist.add_other()

    return nblist
def hyp_out(self):
    """ This defines asynchronous interface for speech recognition.

    Returns:
        ASR hypothesis about the input speech audio.
    """
    started_at = time.time()

    # Finish decoding and pull the word lattice out of the decoder.
    self.decoder.prune_final()
    utt_lik, lat = self.decoder.get_lattice()  # returns acceptor (py)fst.LogVectorFst
    self.decoder.reset(keep_buffer_data=False)

    # Optionally recalibrate the lattice scores.
    if self.calibration_table:
        lat = lattice_calibration(lat, self.calibration_table)

    self.last_lattice = lat

    # Turn the lattice into an n-best list of utterances.
    hypotheses = lattice_to_nbest(lat, self.n_best)
    nblist = UtteranceNBList()
    for neg_logprob, word_ids in hypotheses:
        text = u' '.join([self.wst[wid] for wid in word_ids])
        if self.cfg['ASR']['Kaldi']['debug']:
            self.syslog.debug(text)
        nblist.add(exp(-neg_logprob), Utterance(text))

    if not hypotheses:
        nblist.add(1.0, Utterance('Empty hypothesis: Kaldi __FAIL__'))

    nblist.merge()

    if self.cfg['ASR']['Kaldi']['debug']:
        self.syslog.info('utterance "likelihood" is %f' % utt_lik)
        self.syslog.debug('hyp_out: get_lattice+nbest in %s secs' % str(time.time() - started_at))

    return nblist
def test_parse_X(self):
    """Trains a tiny DAINNClassifier on four toy examples and checks that
    parsing an n-best list assigns non-zero probability to the two
    dialogue act items the training data supports.
    """
    from alex.components.slu.dainnclassifier import DAINNClassifier

    # Fix the RNG so NN weight initialisation is reproducible.
    np.random.seed(0)

    cldb = CategoryLabelDatabase()

    # Minimal in-memory category-label database (stands in for a db module).
    class db:
        database = {
            "task": {
                "find_connection": ["najít spojení", "najít spoj", "zjistit spojení",
                                    "zjistit spoj", "hledám spojení", 'spojení', 'spoj', ],
                "find_platform": ["najít nástupiště", "zjistit nástupiště", ],
                'weather': ['pocasi', 'jak bude', ],
            },
            "number": {
                "1": ["jednu"]
            },
            "time": {
                "now": ["nyní", "teď", "teďka", "hned", "nejbližší",
                        "v tuto chvíli", "co nejdřív"],
            },
        }

    cldb.load(db_mod=db)
    preprocessing = SLUPreprocessing(cldb)
    clf = DAINNClassifier(cldb, preprocessing, features_size=4)

    # Train a simple classifier.
    das = {
        '1': DialogueAct('inform(task=weather)'),
        '2': DialogueAct('inform(time=now)'),
        '3': DialogueAct('inform(task=weather)'),
        '4': DialogueAct('inform(task=connection)'),
    }
    utterances = {
        '1': Utterance('pocasi pocasi pocasi pocasi pocasi'),
        '2': Utterance('hned ted nyni hned ted nyni'),
        '3': Utterance('jak bude jak bude jak bude jak bude'),
        '4': Utterance('kdy a odkat mi to jede'),
    }
    clf.extract_classifiers(das, utterances, verbose=False)
    # Thresholds of 0 keep every extracted classifier/feature (toy data).
    clf.prune_classifiers(min_classifier_count=0)
    clf.gen_classifiers_data(min_pos_feature_count=0,
                             min_neg_feature_count=0,
                             verbose2=False)
    clf.train(inverse_regularisation=1e1, verbose=False)

    # Parse some sentences.
    utterance_list = UtteranceNBList()
    utterance_list.add(0.7, Utterance('pocasi'))
    utterance_list.add(0.7, Utterance('jak bude pocasi'))
    utterance_list.add(0.2, Utterance('hned'))
    utterance_list.add(0.2, Utterance('hned'))

    da_confnet = clf.parse_X(utterance_list, verbose=False)
    self.assertTrue(da_confnet.get_prob(DialogueActItem(dai='inform(task=weather)')) != 0.0)
    self.assertTrue(da_confnet.get_prob(DialogueActItem(dai='inform(time=now)')) != 0.0)
def test_conversion_of_confnet_into_nblist(self): A1, A2, A3 = 0.90, 0.05, 0.05 B1, B2, B3 = 0.50, 0.35, 0.15 C1, C2, C3 = 0.60, 0.30, 0.10 correct_nblist = UtteranceNBList() correct_nblist.add(A1*B1*C1, Utterance("A1 B1 C1")) correct_nblist.add(A1*B2*C1, Utterance("A1 B2 C1")) correct_nblist.add(A1*B1*C2, Utterance("A1 B1 C2")) correct_nblist.add(A1*B2*C2, Utterance("A1 B2 C2")) correct_nblist.add(A1*B3*C1, Utterance("A1 B3 C1")) correct_nblist.add(A1*B1*C3, Utterance("A1 B1 C3")) correct_nblist.add(A1*B3*C2, Utterance("A1 B3 C2")) correct_nblist.add(A1*B2*C3, Utterance("A1 B2 C3")) correct_nblist.merge() correct_nblist.add_other() confnet = UtteranceConfusionNetwork() confnet.add([[A1, 'A1'], [A2, 'A2'], [A3, 'A3'],]) confnet.add([[B1, 'B1'], [B2, 'B2'], [B3, 'B3'],]) confnet.add([[C1, 'C1'], [C2, 'C2'], [C3, 'C3'],]) confnet.merge().sort() gen_nblist = confnet.get_utterance_nblist(10) s = [] s.append("") s.append("Confusion network:") s.append(unicode(confnet)) s.append("") s.append("Generated nblist:") s.append(unicode(gen_nblist)) s.append("") s.append("Correct nblist:") s.append(unicode(correct_nblist)) s.append("") print '\n'.join(s) self.assertEqual(unicode(gen_nblist), unicode(correct_nblist))
def process_call_log(fn):
    """Extracts SLU training data from one transcribed call-log XML file.

    For each user turn it collects the normalised transcription, the
    1-best ASR output (re-recognised from the wav unless ``--asr-log`` is
    given), the serialised ASR n-best list, and the HDC SLU parse of the
    transcription.  Turns with missing/extra <rec>, <asr> or
    <asr_transcription> elements are skipped or recovered where possible.

    Returns:
        (asr, nbl, sem, trn, trn_hdc_sem, fcount, tcount) -- lists of
        (wav_key, value) pairs plus the file and transcription counters.
    """
    name = multiprocessing.current_process().name
    asr = []
    nbl = []
    sem = []
    trn = []
    trn_hdc_sem = []
    fcount = 0
    tcount = 0

    f_dir = os.path.dirname(fn)

    print "Process name:", name
    print "File #", fcount
    fcount += 1
    print "Processing:", fn

    doc = xml.dom.minidom.parse(fn)
    turns = doc.getElementsByTagName("turn")

    for i, turn in enumerate(turns):
        # Only user turns carry training data.
        if turn.getAttribute('speaker') != 'user':
            continue

        recs = turn.getElementsByTagName("rec")
        trans = turn.getElementsByTagName("asr_transcription")
        asrs = turn.getElementsByTagName("asr")

        if len(recs) != 1:
            print "Skipping a turn {turn} in file: {fn} - recs: {recs}".format(
                turn=i, fn=fn, recs=len(recs))
            continue

        # The ASR output sometimes lands in the FOLLOWING turn; recover it.
        if len(asrs) == 0 and (i + 1) < len(turns):
            next_asrs = turns[i + 1].getElementsByTagName("asr")
            if len(next_asrs) != 2:
                print "Skipping a turn {turn} in file: {fn} - asrs: {asrs} - next_asrs: {next_asrs}".format(
                    turn=i, fn=fn, asrs=len(asrs), next_asrs=len(next_asrs))
                continue
            print "Recovered from missing ASR output by using a delayed ASR output from the following turn of turn {turn}. File: {fn} - next_asrs: {asrs}".format(
                turn=i, fn=fn, asrs=len(next_asrs))
            hyps = next_asrs[0].getElementsByTagName("hypothesis")
        elif len(asrs) == 1:
            hyps = asrs[0].getElementsByTagName("hypothesis")
        elif len(asrs) == 2:
            print "Recovered from EXTRA ASR outputs by using a the last ASR output from the turn. File: {fn} - asrs: {asrs}".format(
                fn=fn, asrs=len(asrs))
            hyps = asrs[-1].getElementsByTagName("hypothesis")
        else:
            print "Skipping a turn {turn} in file {fn} - asrs: {asrs}".format(
                turn=i, fn=fn, asrs=len(asrs))
            continue

        if len(trans) == 0:
            print "Skipping a turn in {fn} - trans: {trans}".format(
                fn=fn, trans=len(trans))
            continue

        wav_key = recs[0].getAttribute('fname')
        wav_path = os.path.join(f_dir, wav_key)

        # FIXME: Check whether the last transcription is really the best! FJ
        t = various.get_text_from_xml_node(trans[-1])
        t = normalise_text(t)

        # Either re-recognise the audio or reuse the logged ASR output.
        if '--asr-log' not in sys.argv:
            asr_rec_nbl = asr_rec.rec_wav_file(wav_path)
            a = unicode(asr_rec_nbl.get_best())
        else:
            a = various.get_text_from_xml_node(hyps[0])
            a = normalise_semi_words(a)

        if exclude_slu(t) or 'DOM Element:' in a:
            print "Skipping transcription:", unicode(t)
            print "Skipping ASR output:   ", unicode(a)
            continue

        # The silence does not have a label in the language model.
        t = t.replace('_SIL_', '')

        trn.append((wav_key, t))

        print
        print "Transcritpiton #", tcount
        tcount += 1
        print "Parsing transcription:", unicode(t)
        print "                  ASR:", unicode(a)

        # HDC SLU on transcription
        s = slu.parse_1_best({'utt': Utterance(t)}).get_best_da()
        trn_hdc_sem.append((wav_key, s))

        # 1 best ASR
        asr.append((wav_key, a))

        # N best ASR
        n = UtteranceNBList()
        if '--asr-log' not in sys.argv:
            n = asr_rec_nbl
            print 'ASR RECOGNITION NBLIST\n', unicode(n)
        else:
            for h in hyps:
                txt = various.get_text_from_xml_node(h)
                txt = normalise_semi_words(txt)
                n.add(abs(float(h.getAttribute('p'))), Utterance(txt))

        n.merge()
        n.normalise()

        nbl.append((wav_key, n.serialise()))

        # there is no manual semantics in the transcriptions yet
        sem.append((wav_key, None))

    return asr, nbl, sem, trn, trn_hdc_sem, fcount, tcount
def get_results(self, timeout=0.6): """" Waits for the complete recognition results from the Julius ASR server. Timeout specifies how long it will wait for the end of message. """ msg = "" # Get results from the server. time_slept = 0.0 while time_slept < timeout: msg_part = self.read_server_message(self.msg_timeout) if not msg_part: # Wait and check whether there is a message. time.sleep(self.cfg['Hub']['main_loop_sleep_time']) time_slept += self.cfg['Hub']['main_loop_sleep_time'] if self.debug >= 2: print "gr.time_slept:", time_slept continue msg += msg_part + '\n' if self.debug: print msg if '<CONFNET>' in msg: break else: raise JuliusASRTimeoutException( "Timeout when waiting for the Julius server results.") # Process the results. """ Typical result returned by the Julius ASR. <STARTPROC/> <INPUT STATUS="LISTEN" TIME="1343896296"/> <INPUT STATUS="STARTREC" TIME="1343896311"/> <STARTRECOG/> <INPUT STATUS="ENDREC" TIME="1343896312"/> <ENDRECOG/> <INPUTPARAM FRAMES="164" MSEC="1640"/> <RECOGOUT> <SHYPO RANK="1" SCORE="-7250.111328"> <WHYPO WORD="" CLASSID="<s>" PHONE="sil" CM="0.887"/> <WHYPO WORD="I'M" CLASSID="I'M" PHONE="ah m" CM="0.705"/> <WHYPO WORD="LOOKING" CLASSID="LOOKING" PHONE="l uh k ih ng" CM="0.992"/> <WHYPO WORD="FOR" CLASSID="FOR" PHONE="f er" CM="0.757"/> <WHYPO WORD="A" CLASSID="A" PHONE="ah" CM="0.672"/> <WHYPO WORD="PUB" CLASSID="PUB" PHONE="p ah b" CM="0.409"/> <WHYPO WORD="" CLASSID="</s>" PHONE="sil" CM="1.000"/> </SHYPO> </RECOGOUT> <GRAPHOUT NODENUM="43" ARCNUM="70"> <NODE GID="0" WORD="" CLASSID="<s>" PHONE="sil" BEGIN="0" END="2"/> <NODE GID="1" WORD="" CLASSID="<s>" PHONE="sil" BEGIN="0" END="3"/> <NODE GID="2" WORD="" CLASSID="<s>" PHONE="sil" BEGIN="0" END="4"/> <NODE GID="3" WORD="I" CLASSID="I" PHONE="ay" BEGIN="3" END="5"/> <NODE GID="4" WORD="NO" CLASSID="NO" PHONE="n ow" BEGIN="3" END="7"/> <NODE GID="5" WORD="I" CLASSID="I" PHONE="ay" BEGIN="4" END="6"/> <NODE GID="6" WORD="UH" CLASSID="UH" PHONE="ah" BEGIN="4" END="6"/> <NODE 
GID="7" WORD="I'M" CLASSID="I'M" PHONE="ay m" BEGIN="4" END="27"/> ... <NODE GID="38" WORD="PUB" CLASSID="PUB" PHONE="p ah b" BEGIN="79" END="104"/> <NODE GID="39" WORD="AH" CLASSID="AH" PHONE="aa" BEGIN="81" END="110"/> <NODE GID="40" WORD="LOT" CLASSID="LOT" PHONE="l aa t" BEGIN="81" END="110"/> <NODE GID="41" WORD="" CLASSID="</s>" PHONE="sil" BEGIN="105" END="163"/> <NODE GID="42" WORD="" CLASSID="</s>" PHONE="sil" BEGIN="111" END="163"/> <ARC FROM="0" TO="4"/> <ARC FROM="0" TO="3"/> <ARC FROM="1" TO="7"/> <ARC FROM="1" TO="5"/> <ARC FROM="1" TO="6"/> ... <ARC FROM="38" TO="41"/> <ARC FROM="39" TO="42"/> <ARC FROM="40" TO="42"/> </GRAPHOUT> <CONFNET> <WORD> <ALTERNATIVE PROB="1.000"></ALTERNATIVE> </WORD> <WORD> <ALTERNATIVE PROB="0.950">I</ALTERNATIVE> <ALTERNATIVE PROB="0.020">HI</ALTERNATIVE> <ALTERNATIVE PROB="0.013">NO</ALTERNATIVE> <ALTERNATIVE PROB="0.010"></ALTERNATIVE> <ALTERNATIVE PROB="0.006">UH</ALTERNATIVE> </WORD> <WORD> <ALTERNATIVE PROB="0.945">AM</ALTERNATIVE> <ALTERNATIVE PROB="0.055">I'M</ALTERNATIVE> </WORD> <WORD> <ALTERNATIVE PROB="1.000">LOOKING</ALTERNATIVE> </WORD> <WORD> <ALTERNATIVE PROB="1.000">FOR</ALTERNATIVE> </WORD> <WORD> <ALTERNATIVE PROB="1.000">A</ALTERNATIVE> </WORD> <WORD> <ALTERNATIVE PROB="0.963">PUB</ALTERNATIVE> <ALTERNATIVE PROB="0.016">AH</ALTERNATIVE> <ALTERNATIVE PROB="0.012">BAR</ALTERNATIVE> <ALTERNATIVE PROB="0.008">LOT</ALTERNATIVE> </WORD> <WORD> <ALTERNATIVE PROB="1.000"></ALTERNATIVE> </WORD> </CONFNET> <INPUT STATUS="LISTEN" TIME="1343896312"/> """ msg = "<RESULTS>" + msg + "</RESULTS>" msg = msg.replace("<s>", "<s>").replace("</s>", "</s>") nblist = UtteranceNBList() doc = xml.dom.minidom.parseString(msg) recogout = doc.getElementsByTagName("RECOGOUT") for el in recogout: shypo = el.getElementsByTagName("SHYPO") for el in shypo: whypo = el.getElementsByTagName("WHYPO") utterance = "" cm = 1.0 for el in whypo: word = el.getAttribute("WORD") utterance += " " + word if word: cm *= float(el.getAttribute("CM")) 
nblist.add(cm, Utterance(utterance)) nblist.merge() nblist.add_other() cn = UtteranceConfusionNetwork() confnet = doc.getElementsByTagName("CONFNET") for el in confnet: word = el.getElementsByTagName("WORD") for el in word: alternative = el.getElementsByTagName("ALTERNATIVE") word_list = [] for el in alternative: prob = float(el.getAttribute("PROB")) text = get_text_from_xml_node(el) word_list.append([prob, text]) # Filter out empty hypotheses. if len(word_list) == 0: continue if len(word_list) == 1 and len(word_list[0][1]) == 0: continue # Add the word into the confusion network. cn.add(word_list) cn.merge() cn.normalise() cn.prune() cn.normalise() cn.sort() return nblist, cn
def get_results(self, timeout=0.6): """" Waits for the complete recognition results from the Julius ASR server. Timeout specifies how long it will wait for the end of message. """ msg = "" # Get results from the server. time_slept = 0.0 while time_slept < timeout: msg_part = self.read_server_message(self.msg_timeout) if not msg_part: # Wait and check whether there is a message. time.sleep(self.cfg['Hub']['main_loop_sleep_time']) time_slept += self.cfg['Hub']['main_loop_sleep_time'] if self.debug >= 2: print "gr.time_slept:", time_slept continue msg += msg_part + '\n' if self.debug: print msg if '<CONFNET>' in msg: break else: raise JuliusASRTimeoutException( "Timeout when waiting for the Julius server results.") # Process the results. """ Typical result returned by the Julius ASR. <STARTPROC/> <INPUT STATUS="LISTEN" TIME="1343896296"/> <INPUT STATUS="STARTREC" TIME="1343896311"/> <STARTRECOG/> <INPUT STATUS="ENDREC" TIME="1343896312"/> <ENDRECOG/> <INPUTPARAM FRAMES="164" MSEC="1640"/> <RECOGOUT> <SHYPO RANK="1" SCORE="-7250.111328"> <WHYPO WORD="" CLASSID="<s>" PHONE="sil" CM="0.887"/> <WHYPO WORD="I'M" CLASSID="I'M" PHONE="ah m" CM="0.705"/> <WHYPO WORD="LOOKING" CLASSID="LOOKING" PHONE="l uh k ih ng" CM="0.992"/> <WHYPO WORD="FOR" CLASSID="FOR" PHONE="f er" CM="0.757"/> <WHYPO WORD="A" CLASSID="A" PHONE="ah" CM="0.672"/> <WHYPO WORD="PUB" CLASSID="PUB" PHONE="p ah b" CM="0.409"/> <WHYPO WORD="" CLASSID="</s>" PHONE="sil" CM="1.000"/> </SHYPO> </RECOGOUT> <GRAPHOUT NODENUM="43" ARCNUM="70"> <NODE GID="0" WORD="" CLASSID="<s>" PHONE="sil" BEGIN="0" END="2"/> <NODE GID="1" WORD="" CLASSID="<s>" PHONE="sil" BEGIN="0" END="3"/> <NODE GID="2" WORD="" CLASSID="<s>" PHONE="sil" BEGIN="0" END="4"/> <NODE GID="3" WORD="I" CLASSID="I" PHONE="ay" BEGIN="3" END="5"/> <NODE GID="4" WORD="NO" CLASSID="NO" PHONE="n ow" BEGIN="3" END="7"/> <NODE GID="5" WORD="I" CLASSID="I" PHONE="ay" BEGIN="4" END="6"/> <NODE GID="6" WORD="UH" CLASSID="UH" PHONE="ah" BEGIN="4" END="6"/> <NODE 
GID="7" WORD="I'M" CLASSID="I'M" PHONE="ay m" BEGIN="4" END="27"/> ... <NODE GID="38" WORD="PUB" CLASSID="PUB" PHONE="p ah b" BEGIN="79" END="104"/> <NODE GID="39" WORD="AH" CLASSID="AH" PHONE="aa" BEGIN="81" END="110"/> <NODE GID="40" WORD="LOT" CLASSID="LOT" PHONE="l aa t" BEGIN="81" END="110"/> <NODE GID="41" WORD="" CLASSID="</s>" PHONE="sil" BEGIN="105" END="163"/> <NODE GID="42" WORD="" CLASSID="</s>" PHONE="sil" BEGIN="111" END="163"/> <ARC FROM="0" TO="4"/> <ARC FROM="0" TO="3"/> <ARC FROM="1" TO="7"/> <ARC FROM="1" TO="5"/> <ARC FROM="1" TO="6"/> ... <ARC FROM="38" TO="41"/> <ARC FROM="39" TO="42"/> <ARC FROM="40" TO="42"/> </GRAPHOUT> <CONFNET> <WORD> <ALTERNATIVE PROB="1.000"></ALTERNATIVE> </WORD> <WORD> <ALTERNATIVE PROB="0.950">I</ALTERNATIVE> <ALTERNATIVE PROB="0.020">HI</ALTERNATIVE> <ALTERNATIVE PROB="0.013">NO</ALTERNATIVE> <ALTERNATIVE PROB="0.010"></ALTERNATIVE> <ALTERNATIVE PROB="0.006">UH</ALTERNATIVE> </WORD> <WORD> <ALTERNATIVE PROB="0.945">AM</ALTERNATIVE> <ALTERNATIVE PROB="0.055">I'M</ALTERNATIVE> </WORD> <WORD> <ALTERNATIVE PROB="1.000">LOOKING</ALTERNATIVE> </WORD> <WORD> <ALTERNATIVE PROB="1.000">FOR</ALTERNATIVE> </WORD> <WORD> <ALTERNATIVE PROB="1.000">A</ALTERNATIVE> </WORD> <WORD> <ALTERNATIVE PROB="0.963">PUB</ALTERNATIVE> <ALTERNATIVE PROB="0.016">AH</ALTERNATIVE> <ALTERNATIVE PROB="0.012">BAR</ALTERNATIVE> <ALTERNATIVE PROB="0.008">LOT</ALTERNATIVE> </WORD> <WORD> <ALTERNATIVE PROB="1.000"></ALTERNATIVE> </WORD> </CONFNET> <INPUT STATUS="LISTEN" TIME="1343896312"/> """ msg = "<RESULTS>" + msg + "</RESULTS>" msg = msg.replace("<s>", "<s>").replace("</s>", "</s>") nblist = UtteranceNBList() doc = xml.dom.minidom.parseString(msg) recogout = doc.getElementsByTagName("RECOGOUT") for el in recogout: shypo = el.getElementsByTagName("SHYPO") for el in shypo: whypo = el.getElementsByTagName("WHYPO") utterance = "" cm = 1.0 for el in whypo: word = el.getAttribute("WORD") utterance += " " + word if word: cm *= float(el.getAttribute("CM")) 
nblist.add(cm, Utterance(utterance)) nblist.merge() nblist.add_other() cn = UtteranceConfusionNetwork() confnet = doc.getElementsByTagName("CONFNET") for el in confnet: word = el.getElementsByTagName("WORD") for el in word: alternative = el.getElementsByTagName("ALTERNATIVE") word_list = [] for el in alternative: prob = float(el.getAttribute("PROB")) text = get_text_from_xml_node(el) word_list.append([prob, text]) # Filter out empty hypotheses. if len(word_list) == 0: continue if len(word_list) == 1 and len(word_list[0][1]) == 0: continue # Add the word into the confusion network. cn.add(word_list) cn.merge() cn.normalise() cn.prune() cn.normalise() cn.sort() return nblist, cn
def main():
    """Generates SLU train/dev/test data from transcribed call logs.

    Walks ``indomain_data`` for asr_transcribed.xml files, extracts per-turn
    transcriptions, ASR outputs and HDC SLU parses, deduplicates the
    transcriptions, and writes the aligned 80/10/10 splits to disk.

    Flags read from sys.argv: ``--asr-log`` (reuse logged ASR instead of
    re-recognising), ``--uniq`` (only produce the uniq.* files),
    ``--fast`` (skip SLU parsing of n-best lists).
    """
    cldb = CategoryLabelDatabase('../data/database.py')
    preprocessing = PTIENSLUPreprocessing(cldb)
    slu = PTIENHDCSLU(preprocessing, cfg={'SLU': {PTIENHDCSLU: {'utt2da': as_project_path("applications/PublicTransportInfoEN/data/utt2da_dict.txt")}}})
    cfg = Config.load_configs(['../kaldi.cfg',], use_default=True)
    asr_rec = asr_factory(cfg)

    # Output file names for the uniq / all / train / dev / test sets.
    fn_uniq_trn = 'uniq.trn'
    fn_uniq_trn_hdc_sem = 'uniq.trn.hdc.sem'
    fn_uniq_trn_sem = 'uniq.trn.sem'

    fn_all_sem = 'all.sem'
    fn_all_trn = 'all.trn'
    fn_all_trn_hdc_sem = 'all.trn.hdc.sem'
    fn_all_asr = 'all.asr'
    fn_all_asr_hdc_sem = 'all.asr.hdc.sem'
    fn_all_nbl = 'all.nbl'
    fn_all_nbl_hdc_sem = 'all.nbl.hdc.sem'

    fn_train_sem = 'train.sem'
    fn_train_trn = 'train.trn'
    fn_train_trn_hdc_sem = 'train.trn.hdc.sem'
    fn_train_asr = 'train.asr'
    fn_train_asr_hdc_sem = 'train.asr.hdc.sem'
    fn_train_nbl = 'train.nbl'
    fn_train_nbl_hdc_sem = 'train.nbl.hdc.sem'

    fn_dev_sem = 'dev.sem'
    fn_dev_trn = 'dev.trn'
    fn_dev_trn_hdc_sem = 'dev.trn.hdc.sem'
    fn_dev_asr = 'dev.asr'
    fn_dev_asr_hdc_sem = 'dev.asr.hdc.sem'
    fn_dev_nbl = 'dev.nbl'
    fn_dev_nbl_hdc_sem = 'dev.nbl.hdc.sem'

    fn_test_sem = 'test.sem'
    fn_test_trn = 'test.trn'
    fn_test_trn_hdc_sem = 'test.trn.hdc.sem'
    fn_test_asr = 'test.asr'
    fn_test_asr_hdc_sem = 'test.asr.hdc.sem'
    fn_test_nbl = 'test.nbl'
    fn_test_nbl_hdc_sem = 'test.nbl.hdc.sem'

    indomain_data_dir = "indomain_data"

    print "Generating the SLU train and test data"
    print "-"*120
    ###############################################################################################

    # Collect transcription files up to five directory levels deep.
    files = []
    files.append(glob.glob(os.path.join(indomain_data_dir, 'asr_transcribed.xml')))
    files.append(glob.glob(os.path.join(indomain_data_dir, '*', 'asr_transcribed.xml')))
    files.append(glob.glob(os.path.join(indomain_data_dir, '*', '*', 'asr_transcribed.xml')))
    files.append(glob.glob(os.path.join(indomain_data_dir, '*', '*', '*', 'asr_transcribed.xml')))
    files.append(glob.glob(os.path.join(indomain_data_dir, '*', '*', '*', '*', 'asr_transcribed.xml')))
    files.append(glob.glob(os.path.join(indomain_data_dir, '*', '*', '*', '*', '*', 'asr_transcribed.xml')))
    files = various.flatten(files)

    sem = []
    trn = []
    trn_hdc_sem = []
    asr = []
    asr_hdc_sem = []
    nbl = []
    nbl_hdc_sem = []

    for fn in files[:100000]:
        f_dir = os.path.dirname(fn)

        print "Processing:", fn
        doc = xml.dom.minidom.parse(fn)
        turns = doc.getElementsByTagName("turn")

        for i, turn in enumerate(turns):
            # Only user turns carry training data.
            if turn.getAttribute('speaker') != 'user':
                continue

            recs = turn.getElementsByTagName("rec")
            trans = turn.getElementsByTagName("asr_transcription")
            asrs = turn.getElementsByTagName("asr")

            if len(recs) != 1:
                print "Skipping a turn {turn} in file: {fn} - recs: {recs}".format(turn=i, fn=fn, recs=len(recs))
                continue

            # The ASR output sometimes lands in the FOLLOWING turn; recover it.
            if len(asrs) == 0 and (i + 1) < len(turns):
                next_asrs = turns[i+1].getElementsByTagName("asr")
                if len(next_asrs) != 2:
                    print "Skipping a turn {turn} in file: {fn} - asrs: {asrs} - next_asrs: {next_asrs}".format(turn=i, fn=fn, asrs=len(asrs), next_asrs=len(next_asrs))
                    continue
                print "Recovered from missing ASR output by using a delayed ASR output from the following turn of turn {turn}. File: {fn} - next_asrs: {asrs}".format(turn=i, fn=fn, asrs=len(next_asrs))
                hyps = next_asrs[0].getElementsByTagName("hypothesis")
            elif len(asrs) == 1:
                hyps = asrs[0].getElementsByTagName("hypothesis")
            elif len(asrs) == 2:
                print "Recovered from EXTRA ASR outputs by using a the last ASR output from the turn. File: {fn} - asrs: {asrs}".format(fn=fn, asrs=len(asrs))
                hyps = asrs[-1].getElementsByTagName("hypothesis")
            else:
                print "Skipping a turn {turn} in file {fn} - asrs: {asrs}".format(turn=i, fn=fn, asrs=len(asrs))
                continue

            if len(trans) == 0:
                print "Skipping a turn in {fn} - trans: {trans}".format(fn=fn, trans=len(trans))
                continue

            wav_key = recs[0].getAttribute('fname')
            wav_path = os.path.join(f_dir, wav_key)

            # FIXME: Check whether the last transcription is really the best! FJ
            t = various.get_text_from_xml_node(trans[-1])
            t = normalise_text(t)

            # Either re-recognise the audio or reuse the logged ASR output.
            if '--asr-log' not in sys.argv:
                asr_rec_nbl = asr_rec.rec_wav_file(wav_path)
                a = unicode(asr_rec_nbl.get_best())
            else:
                a = various.get_text_from_xml_node(hyps[0])
                a = normalise_semi_words(a)

            if exclude_slu(t) or 'DOM Element:' in a:
                print "Skipping transcription:", unicode(t)
                print "Skipping ASR output:   ", unicode(a)
                continue

            # The silence does not have a label in the language model.
            t = t.replace('_SIL_', '')

            trn.append((wav_key, t))

            print "Parsing transcription:", unicode(t)
            print "                  ASR:", unicode(a)

            # HDC SLU on transcription
            s = slu.parse_1_best({'utt': Utterance(t)}).get_best_da()
            trn_hdc_sem.append((wav_key, s))

            if '--uniq' not in sys.argv:
                # HDC SLU on 1 best ASR
                if '--asr-log' not in sys.argv:
                    a = unicode(asr_rec_nbl.get_best())
                else:
                    a = various.get_text_from_xml_node(hyps[0])
                    a = normalise_semi_words(a)

                asr.append((wav_key, a))

                s = slu.parse_1_best({'utt': Utterance(a)}).get_best_da()
                asr_hdc_sem.append((wav_key, s))

                # HDC SLU on N best ASR
                n = UtteranceNBList()
                if '--asr-log' not in sys.argv:
                    n = asr_rec_nbl
                    print 'ASR RECOGNITION NBLIST\n', unicode(n)
                else:
                    for h in hyps:
                        txt = various.get_text_from_xml_node(h)
                        txt = normalise_semi_words(txt)
                        n.add(abs(float(h.getAttribute('p'))), Utterance(txt))

                n.merge()
                n.normalise()

                nbl.append((wav_key, n.serialise()))

                if '--fast' not in sys.argv:
                    s = slu.parse_nblist({'utt_nbl': n}).get_best_da()
                    nbl_hdc_sem.append((wav_key, s))

            # there is no manual semantics in the transcriptions yet
            sem.append((wav_key, None))

    # Keep only the first occurrence of each unique transcription.
    uniq_trn = {}
    uniq_trn_hdc_sem = {}
    uniq_trn_sem = {}
    trn_set = set()

    sem = dict(trn_hdc_sem)
    for k, v in trn:
        if not v in trn_set:
            trn_set.add(v)
            uniq_trn[k] = v
            uniq_trn_hdc_sem[k] = sem[k]
            uniq_trn_sem[k] = v + " <=> " + unicode(sem[k])

    save_wavaskey(fn_uniq_trn, uniq_trn)
    save_wavaskey(fn_uniq_trn_hdc_sem, uniq_trn_hdc_sem, trans = lambda da: '&'.join(sorted(unicode(da).split('&'))))
    save_wavaskey(fn_uniq_trn_sem, uniq_trn_sem)

    # all
    save_wavaskey(fn_all_trn, dict(trn))
    save_wavaskey(fn_all_trn_hdc_sem, dict(trn_hdc_sem), trans = lambda da: '&'.join(sorted(unicode(da).split('&'))))

    if '--uniq' not in sys.argv:
        save_wavaskey(fn_all_asr, dict(asr))
        save_wavaskey(fn_all_asr_hdc_sem, dict(asr_hdc_sem), trans = lambda da: '&'.join(sorted(unicode(da).split('&'))))

        save_wavaskey(fn_all_nbl, dict(nbl))
        save_wavaskey(fn_all_nbl_hdc_sem, dict(nbl_hdc_sem), trans = lambda da: '&'.join(sorted(unicode(da).split('&'))))

        # Shuffle every parallel list with the same seed so rows stay aligned.
        seed_value = 10

        random.seed(seed_value)
        random.shuffle(trn)
        random.seed(seed_value)
        random.shuffle(trn_hdc_sem)
        random.seed(seed_value)
        random.shuffle(asr)
        random.seed(seed_value)
        random.shuffle(asr_hdc_sem)
        random.seed(seed_value)
        random.shuffle(nbl)
        random.seed(seed_value)
        random.shuffle(nbl_hdc_sem)

        # trn -- 80/10/10 split
        train_trn = trn[:int(0.8*len(trn))]
        dev_trn = trn[int(0.8*len(trn)):int(0.9*len(trn))]
        test_trn = trn[int(0.9*len(trn)):]

        save_wavaskey(fn_train_trn, dict(train_trn))
        save_wavaskey(fn_dev_trn, dict(dev_trn))
        save_wavaskey(fn_test_trn, dict(test_trn))

        # trn_hdc_sem
        train_trn_hdc_sem = trn_hdc_sem[:int(0.8*len(trn_hdc_sem))]
        dev_trn_hdc_sem = trn_hdc_sem[int(0.8*len(trn_hdc_sem)):int(0.9*len(trn_hdc_sem))]
        test_trn_hdc_sem = trn_hdc_sem[int(0.9*len(trn_hdc_sem)):]

        save_wavaskey(fn_train_trn_hdc_sem, dict(train_trn_hdc_sem), trans = lambda da: '&'.join(sorted(unicode(da).split('&'))))
        save_wavaskey(fn_dev_trn_hdc_sem, dict(dev_trn_hdc_sem), trans = lambda da: '&'.join(sorted(unicode(da).split('&'))))
        save_wavaskey(fn_test_trn_hdc_sem, dict(test_trn_hdc_sem), trans = lambda da: '&'.join(sorted(unicode(da).split('&'))))

        # asr
        train_asr = asr[:int(0.8*len(asr))]
        dev_asr = asr[int(0.8*len(asr)):int(0.9*len(asr))]
        test_asr = asr[int(0.9*len(asr)):]

        save_wavaskey(fn_train_asr, dict(train_asr))
        save_wavaskey(fn_dev_asr, dict(dev_asr))
        save_wavaskey(fn_test_asr, dict(test_asr))

        # asr_hdc_sem
        train_asr_hdc_sem = asr_hdc_sem[:int(0.8*len(asr_hdc_sem))]
        dev_asr_hdc_sem = asr_hdc_sem[int(0.8*len(asr_hdc_sem)):int(0.9*len(asr_hdc_sem))]
        test_asr_hdc_sem = asr_hdc_sem[int(0.9*len(asr_hdc_sem)):]

        save_wavaskey(fn_train_asr_hdc_sem, dict(train_asr_hdc_sem), trans = lambda da: '&'.join(sorted(unicode(da).split('&'))))
        save_wavaskey(fn_dev_asr_hdc_sem, dict(dev_asr_hdc_sem), trans = lambda da: '&'.join(sorted(unicode(da).split('&'))))
        save_wavaskey(fn_test_asr_hdc_sem, dict(test_asr_hdc_sem), trans = lambda da: '&'.join(sorted(unicode(da).split('&'))))

        # n-best lists
        train_nbl = nbl[:int(0.8*len(nbl))]
        dev_nbl = nbl[int(0.8*len(nbl)):int(0.9*len(nbl))]
        test_nbl = nbl[int(0.9*len(nbl)):]

        save_wavaskey(fn_train_nbl, dict(train_nbl))
        save_wavaskey(fn_dev_nbl, dict(dev_nbl))
        save_wavaskey(fn_test_nbl, dict(test_nbl))

        # nbl_hdc_sem
        train_nbl_hdc_sem = nbl_hdc_sem[:int(0.8*len(nbl_hdc_sem))]
        dev_nbl_hdc_sem = nbl_hdc_sem[int(0.8*len(nbl_hdc_sem)):int(0.9*len(nbl_hdc_sem))]
        test_nbl_hdc_sem = nbl_hdc_sem[int(0.9*len(nbl_hdc_sem)):]

        save_wavaskey(fn_train_nbl_hdc_sem, dict(train_nbl_hdc_sem), trans = lambda da: '&'.join(sorted(unicode(da).split('&'))))
        save_wavaskey(fn_dev_nbl_hdc_sem, dict(dev_nbl_hdc_sem), trans = lambda da: '&'.join(sorted(unicode(da).split('&'))))
        save_wavaskey(fn_test_nbl_hdc_sem, dict(test_nbl_hdc_sem), trans = lambda da: '&'.join(sorted(unicode(da).split('&'))))
def test_parse_meta(self):
    """Check that meta / dialogue-control utterances parse to the expected DA items.

    Each case pairs a Czech utterance with the dialogue act item the SLU
    is expected to produce.  The utterance is wrapped in a one-hypothesis
    N-best list and run through the parser; the expected item must appear
    in the resulting confusion network.
    """
    cases = [
        (u"ahoj", "hello()"),
        (u"sbohem čau", "bye()"),
        (u"jiné", "reqalts()"),
        (u"začneme znovu", "restart()"),
        (u"zopakuj", "repeat()"),
        (u"promiň", "apology()"),
        (u"co se zeptat", "help()"),
        (u"haló", "canthearyou()"),
        (u"nerozuměl jsem", "notunderstood()"),
        (u"ano jo", "affirm()"),
        (u"ne ano nechci", "negate()"),
        (u"děkuji", "thankyou()"),
        (u"dobře", "ack()"),
        (u"chci jet", "inform(task=find_connection)"),
        (u"jak bude", "inform(task=weather)"),
        (u"nástupiště", "inform(task=find_platform)"),
        (u"z jaké jede", "request(from_stop)"),
        (u"kam to jede", "request(to_stop)"),
        (u"kdy to jede", "request(departure_time)"),
        (u"za jak dlouho", "request(departure_time_rel)"),
        (u"kdy tam budem", "request(arrival_time)"),
        (u"za jak dlouho tam přijedu", "request(arrival_time_rel)"),
        (u"jak dlouho bude trvat cesta", "request(duration)"),
        (u"kolik je hodin", "request(current_time)"),
        (u"jak dlouho trvá přestup", "request(time_transfers)"),
        (u"kolik přestupů", "request(num_transfers)"),
        (u"nechci přestup bez jet přímo", "inform(num_transfers=0)"),
        (u"jeden přestup", "inform(num_transfers=1)"),
        (u"dva přestupy", "inform(num_transfers=2)"),
        (u"tři přestupy", "inform(num_transfers=3)"),
        (u"čtyři přestupy", "inform(num_transfers=4)"),
        (u"libovolně přestupů", "inform(num_transfers=dontcare)"),
        (u"jet přímo", "inform(num_transfers=0)"),
        (u"alternativa libovolný", "inform(alternative=dontcare)"),
        (u"alternativa první", "inform(alternative=1)"),
        (u"alternativa druhá", "inform(alternative=2)"),
        (u"alternativa třetí", "inform(alternative=3)"),
        (u"alternativa čtvrtá", "inform(alternative=4)"),
        (u"alternativa páté", "inform(alternative=5)"),
        (u"předchozí spoj", "inform(alternative=prev)"),
        (u"nechci předchozí spoj", "deny(alternative=prev)"),
        (u"poslední spoj", "inform(alternative=last)"),
        (u"nechci poslední spoj", "deny(alternative=last)"),
        (u"další spoj", "inform(alternative=next)"),
        (u"další", "inform(alternative=next)"),
        (u"předchozí", "inform(alternative=prev)"),
        (u"jako ve dne", "inform(ampm=pm)"),
    ]

    for utt_text, expected_dai in cases:
        nblist = UtteranceNBList()
        nblist.add(0.79, Utterance(utt_text))
        conf_net = self.slu.parse(nblist)
        self.assertIn(DialogueActItem(dai=expected_dai), conf_net)
def process_call_log(fn): name = multiprocessing.current_process().name asr = [] nbl = [] sem = [] trn = [] trn_hdc_sem = [] fcount = 0 tcount = 0 f_dir = os.path.dirname(fn) print "Process name:", name print "File #", fcount fcount += 1 print "Processing:", fn doc = xml.dom.minidom.parse(fn) turns = doc.getElementsByTagName("turn") for i, turn in enumerate(turns): if turn.getAttribute('speaker') != 'user': continue recs = turn.getElementsByTagName("rec") trans = turn.getElementsByTagName("asr_transcription") asrs = turn.getElementsByTagName("asr") if len(recs) != 1: print "Skipping a turn {turn} in file: {fn} - recs: {recs}".format(turn=i, fn=fn, recs=len(recs)) continue if len(asrs) == 0 and (i + 1) < len(turns): next_asrs = turns[i + 1].getElementsByTagName("asr") if len(next_asrs) != 2: print "Skipping a turn {turn} in file: {fn} - asrs: {asrs} - next_asrs: {next_asrs}".format(turn=i, fn=fn, asrs=len( asrs), next_asrs=len( next_asrs)) continue print "Recovered from missing ASR output by using a delayed ASR output from the following turn of turn {turn}. File: {fn} - next_asrs: {asrs}".format( turn=i, fn=fn, asrs=len(next_asrs)) hyps = next_asrs[0].getElementsByTagName("hypothesis") elif len(asrs) == 1: hyps = asrs[0].getElementsByTagName("hypothesis") elif len(asrs) == 2: print "Recovered from EXTRA ASR outputs by using a the last ASR output from the turn. File: {fn} - asrs: {asrs}".format( fn=fn, asrs=len(asrs)) hyps = asrs[-1].getElementsByTagName("hypothesis") else: print "Skipping a turn {turn} in file {fn} - asrs: {asrs}".format(turn=i, fn=fn, asrs=len(asrs)) continue if len(trans) == 0: print "Skipping a turn in {fn} - trans: {trans}".format(fn=fn, trans=len(trans)) continue wav_key = recs[0].getAttribute('fname') wav_path = os.path.join(f_dir, wav_key) # FIXME: Check whether the last transcription is really the best! 
FJ t = various.get_text_from_xml_node(trans[-1]) t = normalise_text(t) if '--asr-log' not in sys.argv: asr_rec_nbl = asr_rec.rec_wav_file(wav_path) a = unicode(asr_rec_nbl.get_best()) else: a = various.get_text_from_xml_node(hyps[0]) a = normalise_semi_words(a) if exclude_slu(t) or 'DOM Element:' in a: print "Skipping transcription:", unicode(t) print "Skipping ASR output: ", unicode(a) continue # The silence does not have a label in the language model. t = t.replace('_SIL_', '') trn.append((wav_key, t)) print print "Transcritpiton #", tcount tcount += 1 print "Parsing transcription:", unicode(t) print " ASR:", unicode(a) # HDC SLU on transcription s = slu.parse_1_best({'utt': Utterance(t)}).get_best_da() trn_hdc_sem.append((wav_key, s)) # 1 best ASR asr.append((wav_key, a)) # N best ASR n = UtteranceNBList() if '--asr-log' not in sys.argv: n = asr_rec_nbl print 'ASR RECOGNITION NBLIST\n', unicode(n) else: for h in hyps: txt = various.get_text_from_xml_node(h) txt = normalise_semi_words(txt) n.add(abs(float(h.getAttribute('p'))), Utterance(txt)) n.merge() n.normalise() nbl.append((wav_key, n.serialise())) # there is no manual semantics in the transcriptions yet sem.append((wav_key, None)) return asr, nbl, sem, trn, trn_hdc_sem, fcount, tcount