def phrase_pos(self, words, sub_utt=None):
    """Returns the position of the given phrase in the given utterance, or
    -1 if not found.

    :param words: the phrase to search for; either a list of words or a
        whitespace-separated string (which is split into words)
    :param sub_utt: optional (start, end) pair restricting the search to a
        slice of the utterance; None searches the whole utterance
    :rtype: int
    """
    # Restrict the search to the requested sub-span of the utterance
    # (the slice of the underlying word sequence is re-wrapped as an
    # Utterance so that .find() is available on it).
    utt = Utterance(self._utterance[sub_utt[0]:sub_utt[1]]) if sub_utt else self._utterance
    # Normalise a string phrase into a list of words.
    words = words if not isinstance(words, basestring) else words.strip().split()
    return utt.find(words)
class TestUtterance(unittest.TestCase):
    """Tests correct working of the Utterance class."""

    def setUp(self):
        # Two sample utterances with repeated sub-sequences -- good for
        # exercising phrase search and n-gram extraction.
        self.barbara = Utterance('b a r b a r a')
        self.ararat = Utterance('a r a r a t')

    def test_index(self):
        """Utterance.index returns the start of the LAST occurrence
        (as witnessed by the expected values below)."""
        # (phrase, expected position) pairs.
        test_pairs = ((['b', 'a', 'r'], 0),
                      (['b', 'a', 'r', 'a'], 3),
                      (['a', 'r', 'a'], 4))
        for phrase, idx in test_pairs:
            self.assertEqual(self.barbara.index(phrase), idx)
        # A phrase that does not occur raises ValueError.
        self.assertRaises(ValueError, self.barbara.index, ['r', 'a', 'b'])
        self.assertEqual(self.ararat.index(['a', 'r', 'a', 't']), 2)

    def test_ngram_iterator(self):
        """Checks iter_ngrams both with and without sentence boundaries,
        including n equal to and greater than the utterance length."""
        # Normal use case.
        trigrams = [['b', 'a', 'r'],
                    ['a', 'r', 'b'],
                    ['r', 'b', 'a'],
                    ['b', 'a', 'r'],
                    ['a', 'r', 'a'],
                    ]
        trigrams_with_boundaries = [
            [SENTENCE_START, 'b', 'a'],
            ['b', 'a', 'r'],
            ['a', 'r', 'b'],
            ['r', 'b', 'a'],
            ['b', 'a', 'r'],
            ['a', 'r', 'a'],
            ['r', 'a', SENTENCE_END],
        ]
        act_trigrams = list(self.barbara.iter_ngrams(3))
        act_trigrams_with_boundaries = list(
            self.barbara.iter_ngrams(3, with_boundaries=True))
        self.assertItemsEqual(trigrams, act_trigrams)
        self.assertItemsEqual(trigrams_with_boundaries,
                              act_trigrams_with_boundaries)

        # Corner cases: 'barbara' has 7 words, so 7-grams yield exactly one
        # n-gram and 8-grams yield none unless boundaries pad the utterance.
        self.assertItemsEqual(list(self.barbara.iter_ngrams(7)),
                              [['b', 'a', 'r', 'b', 'a', 'r', 'a']])
        self.assertItemsEqual(list(self.barbara.iter_ngrams(8)), [])
        self.assertItemsEqual(
            list(self.barbara.iter_ngrams(8, with_boundaries=True)),
            [[SENTENCE_START, 'b', 'a', 'r', 'b', 'a', 'r', 'a'],
             ['b', 'a', 'r', 'b', 'a', 'r', 'a', SENTENCE_END]])
        self.assertItemsEqual(
            list(self.barbara.iter_ngrams(9, with_boundaries=True)),
            [[SENTENCE_START, 'b', 'a', 'r', 'b', 'a', 'r', 'a',
              SENTENCE_END]])
        self.assertItemsEqual(
            list(self.barbara.iter_ngrams(10, with_boundaries=True)), [])
def test_parse_X(self):
    """Trains a tiny DAINNClassifier on four utterances and checks that
    parse_X assigns non-zero probability to the dialogue act items that
    occurred in the training data."""
    from alex.components.slu.dainnclassifier import DAINNClassifier

    # Fix the seed so the network initialisation (and thus the test) is
    # deterministic.
    np.random.seed(0)

    cldb = CategoryLabelDatabase()

    # A minimal inline category-label database for the test.
    class db:
        database = {
            "task": {
                "find_connection": ["najít spojení", "najít spoj",
                                    "zjistit spojení", "zjistit spoj",
                                    "hledám spojení", 'spojení', 'spoj',
                                    ],
                "find_platform": ["najít nástupiště", "zjistit nástupiště", ],
                'weather': ['pocasi', 'jak bude', ],
            },
            "number": {
                "1": ["jednu"]
            },
            "time": {
                "now": ["nyní", "teď", "teďka", "hned", "nejbližší",
                        "v tuto chvíli", "co nejdřív"],
            },
        }

    cldb.load(db_mod=db)
    preprocessing = SLUPreprocessing(cldb)
    clf = DAINNClassifier(cldb, preprocessing, features_size=4)

    # Train a simple classifier.
    das = {
        '1': DialogueAct('inform(task=weather)'),
        '2': DialogueAct('inform(time=now)'),
        '3': DialogueAct('inform(task=weather)'),
        '4': DialogueAct('inform(task=connection)'),
    }
    utterances = {
        '1': Utterance('pocasi pocasi pocasi pocasi pocasi'),
        '2': Utterance('hned ted nyni hned ted nyni'),
        '3': Utterance('jak bude jak bude jak bude jak bude'),
        '4': Utterance('kdy a odkat mi to jede'),
    }
    clf.extract_classifiers(das, utterances, verbose=False)
    clf.prune_classifiers(min_classifier_count=0)
    clf.gen_classifiers_data(min_pos_feature_count=0,
                             min_neg_feature_count=0,
                            verbose2=False)
    clf.train(inverse_regularisation=1e1, verbose=False)

    # Parse some sentences.
    utterance_list = UtteranceNBList()
    utterance_list.add(0.7, Utterance('pocasi'))
    utterance_list.add(0.7, Utterance('jak bude pocasi'))
    utterance_list.add(0.2, Utterance('hned'))
    utterance_list.add(0.2, Utterance('hned'))
    da_confnet = clf.parse_X(utterance_list, verbose=False)

    # Acts seen in training must receive non-zero probability mass.
    self.assertTrue(da_confnet.get_prob(DialogueActItem(dai='inform(task=weather)')) != 0.0)
    self.assertTrue(da_confnet.get_prob(DialogueActItem(dai='inform(time=now)')) != 0.0)
def recognize(self, wav):
    """Produces hypotheses for the input audio data.

    Remember that GoogleASR works only with complete wave files.

    :param wav: complete audio data of one utterance (wave format)
    :return: an UtteranceNBList with the recognition hypotheses
    """
    # Make a temporary file for the flac-converted audio.
    handle, flac_file_name = mkstemp('TmpSpeechFile.flac')

    try:
        # Convert wav to flac and query the Google ASR service.
        audio.save_flac(self.cfg, flac_file_name, wav)
        json_hypotheses = self.get_asr_hypotheses(flac_file_name)
    except (urllib2.HTTPError, urllib2.URLError) as e:
        self.syslog.exception('GoogleASR HTTP/URL error: %s' % unicode(e))
        # BUG FIX: the original assigned a Python list here, which
        # json.loads() below could not parse, so the fallback hypothesis was
        # silently discarded by a bare except.  Fabricate a parseable JSON
        # response instead so the exception marker reaches the n-best list.
        json_hypotheses = json.dumps(
            {'status': 0,
             'hypotheses': [{'confidence': 1.0,
                             'utterance': '__google__ __asr__ __exception__'}]})
    finally:
        os.close(handle)
        remove(flac_file_name)

    nblist = UtteranceNBList()
    try:
        hyp = json.loads(json_hypotheses)
        # print "###", hyp

        if hyp['status'] == 0:
            n = len(hyp['hypotheses'])
            for i, h in enumerate(hyp['hypotheses']):
                if i == 0:
                    nblist.add(h['confidence'], Utterance(h['utterance']))
                    conf1 = hyp['hypotheses'][0]['confidence']
                else:
                    # Google only scores the first hypothesis; spread the
                    # remaining mass (1 - conf1) over the rest, decreasing
                    # with rank.
                    nblist.add((1.0-conf1)*(n-i)/(n-1.0)/(n-0.0)*2.0,
                               Utterance(h['utterance']))
        elif hyp['status'] == 5:
            # Status 5: nothing recognised.
            nblist.add(1.0, Utterance('_other_'))
    except (ValueError, TypeError, KeyError):
        # Malformed response -- fall back to an empty list; add_other()
        # below still produces a well-formed n-best list.
        nblist = UtteranceNBList()

    nblist.merge()
    nblist.add_other()

    return nblist
def test_parse_with_mutliple_date_rel(self):
    """A repeated 'ZITRA' (tomorrow) token must still produce the
    inform(date_rel=tomorrow) item in the parsed confusion network."""
    hypotheses = UtteranceNBList()
    hypotheses.add(0.1, Utterance("CHTEL BYCH ZITRA ZITRA JET"))
    confnet = self.slu.parse(hypotheses)
    expected_dai = DialogueActItem(dai="inform(date_rel=tomorrow)")
    self.assert_(expected_dai in confnet)
def parse_input_utt(self, l):
    """Converts a text including a dialogue act and its probability into
    a dialogue act instance and float probability.

    The input text must have the following form:
        [prob] this is a text input

    :param l: the raw input line
    :return: a (probability, Utterance) pair; the probability defaults
        to 1.0 when the line does not start with a float
    :raises TextHubException: when the text cannot be parsed as an utterance
    """
    ri = l.find(" ")

    prob = 1.0
    if ri != -1:
        utt = l[ri + 1:]

        try:
            prob = float(l[:ri])
        except ValueError:
            # BUG FIX: narrowed from a bare except.  The first token is not
            # a float, therefore assume that all the input is the utterance.
            utt = l
    else:
        utt = l

    utt = utt.strip()
    if utt == "":
        # An empty input stands for silence.
        utt = "_silence_"

    try:
        utt = Utterance(utt)
    except UtteranceException:
        raise TextHubException("Invalid utterance: %s" % utt)

    return prob, utt
def process_file(file_path):
    """Parses every user utterance of a tab-separated dialogue file with the
    Czech HDC SLU and prints one tab-separated line per utterance to stdout:
    utterance, abstracted utterance, best DA before abstraction, best DA
    after abstraction.

    :param file_path: path to a UTF-8 file with lines "person\\tda\\tutt"
    """
    cldb = CategoryLabelDatabase(as_project_path('applications/PublicTransportInfoCS/data/database.py'))
    preprocessing = PTICSSLUPreprocessing(cldb)
    hdc_slu = PTICSHDCSLU(preprocessing, cfg = {'SLU': {PTICSHDCSLU: {'utt2da': as_project_path('applications/PublicTransportInfoCS/data/utt2da_dict.txt')}}})
    # Wrap stdout so unicode output is UTF-8 encoded.
    stdout = codecs.getwriter('UTF-8')(sys.stdout)

    with open(file_path, 'r') as fh:
        for line in codecs.getreader('UTF-8')(fh):
            line = line.strip("\r\n")

            # skip empty lines (dialogue boundaries)
            if not line:
                continue

            person, da, utt = line.split("\t")
            # skip system utterances, use just user utterances
            if 'SYSTEM' in person:
                continue

            # reparse utterance using transcription (commas become spaces)
            utt = re.sub(r',', r' ', utt)
            utt = Utterance(utt)
            sem = hdc_slu.parse({'utt': utt})

            # get abstracted utterance text
            abutt = hdc_slu.abstract_utterance(utt)
            abutt_str = get_abutt_str(utt, abutt)

            # get abstracted DA -- the unabstracted string is taken first
            # because abstract_da() presumably modifies best_da in place
            # (TODO confirm against abstract_da's definition).
            best_da = sem.get_best_da()
            best_da_str = unicode(best_da)
            abstract_da(best_da)

            print >> stdout, unicode(utt) + "\t" + abutt_str + "\t" + best_da_str + "\t" + unicode(best_da)
def test_parse_street_from_street_to_streets(self):
    """'X street to Y court at Z court' yields from/to/to2 street slots."""
    cn = self.slu.parse_1_best(
        {'utt': Utterance("third street to beacon court at beamer court")})
    expected = ['inform(from_street="3 St")',
                'inform(to_street="Beacon Ct")',
                'inform(to_street2="Beamer Ct")']
    for position, dai_str in enumerate(expected):
        self.assertEquals(dai_str, str(cn[position][1]))
    self.assertEquals(len(expected), len(cn))
def test_parse_street_at_streets(self):
    """'corner of X and Y' yields a from_street/from_street2 pair."""
    cn = self.slu.parse_1_best(
        {'utt': Utterance(
            "i am at the corner of third street and beacon court")})
    expected = ['inform(from_street="3 St")',
                'inform(from_street2="Beacon Ct")']
    for position, dai_str in enumerate(expected):
        self.assertEquals(dai_str, str(cn[position][1]))
    self.assertEquals(len(expected), len(cn))
def test_parse_borough_from_to(self):
    """'from <borough> to <borough>' yields borough slots plus the task."""
    cn = self.slu.parse_1_best(
        {'utt': Utterance("i want to go from manhattan to brooklyn")})
    expected = ['inform(from_borough="Manhattan")',
                'inform(to_borough="Brooklyn")',
                'inform(task="find_connection")']
    for position, dai_str in enumerate(expected):
        self.assertEquals(dai_str, str(cn[position][1]))
    self.assertEquals(len(expected), len(cn))
def test_parse_from_to_city(self):
    """'from <city> to <city>' yields city slots plus the task."""
    cn = self.slu.parse_1_best(
        {'utt': Utterance("i want to go from New York to Baltimore")})
    expected = ['inform(from_city="New York")',
                'inform(to_city="Baltimore")',
                'inform(task="find_connection")']
    for position, dai_str in enumerate(expected):
        self.assertEquals(dai_str, str(cn[position][1]))
    self.assertEquals(len(expected), len(cn))
def phrase_pos(utterance, words):
    """Returns the position of the given phrase in the given utterance, or
    -1 if not found.

    :param utterance: an Utterance, or a list of words to be wrapped in one
    :param words: the phrase; either a list of words or a string to split
    :rtype: int
    """
    # Normalise both arguments up front, then delegate to Utterance.find.
    if isinstance(utterance, list):
        utterance = Utterance(' '.join(utterance))
    if isinstance(words, basestring):
        words = words.strip().split()
    return utterance.find(words)
def test_parse_form_street_to_stop(self):
    """A street origin combined with a stop destination parses to the
    expected from_street/to_stop slots plus the task."""
    cn = self.slu.parse_1_best(
        {'utt': Utterance(
            "i would like to go from cypress avenue to the lincoln center")})
    expected = ['inform(from_street="Cypress Ave")',
                'inform(to_stop="Lincoln Center")',
                'inform(task="find_connection")']
    for position, dai_str in enumerate(expected):
        self.assertEquals(dai_str, str(cn[position][1]))
    self.assertEquals(len(expected), len(cn))
def test_parse_from_street_street_to_street(self):
    """A two-street origin corner and a single destination street parse to
    from_street/from_street2/to_street plus the task."""
    cn = self.slu.parse_1_best(
        {'utt': Utterance(
            "i want to go from 7th avenue and 42nd street to broadway")})
    expected = ['inform(from_street="7 Ave")',
                'inform(from_street2="42 St")',
                'inform(to_street="Broadway")',
                'inform(task="find_connection")']
    for position, dai_str in enumerate(expected):
        self.assertEquals(dai_str, str(cn[position][1]))
    self.assertEquals(len(expected), len(cn))
def hyp_out(self):
    """ This defines asynchronous interface for speech recognition.

    Returns: ASR hypothesis about the input speech audio as an
    UtteranceNBList built from the n best lattice paths.
    """
    start = time.time()

    # Get hypothesis
    self.decoder.prune_final()
    utt_lik, lat = self.decoder.get_lattice( )  # returns acceptor (py)fst.LogVectorFst
    # Drop buffered audio so the decoder is ready for the next utterance.
    self.decoder.reset(keep_buffer_data=False)

    if self.calibration_table:
        lat = lattice_calibration(lat, self.calibration_table)

    self.last_lattice = lat

    # Convert lattice to nblist
    nbest = lattice_to_nbest(lat, self.n_best)
    nblist = UtteranceNBList()
    for w, word_ids in nbest:
        # Map word ids back to word strings via the word symbol table.
        words = u' '.join([self.wst[i] for i in word_ids])

        if self.cfg['ASR']['Kaldi']['debug']:
            self.syslog.debug(words)

        # w appears to be a negative log score (converted via exp(-w)) --
        # TODO confirm against lattice_to_nbest's contract.
        p = exp(-w)
        nblist.add(p, Utterance(words))

    # Log
    if len(nbest) == 0:
        nblist.add(1.0, Utterance('Empty hypothesis: Kaldi __FAIL__'))

    nblist.merge()

    if self.cfg['ASR']['Kaldi']['debug']:
        self.syslog.info('utterance "likelihood" is %f' % utt_lik)
        self.syslog.debug('hyp_out: get_lattice+nbest in %s secs' % str(time.time() - start))

    return nblist
def test_parse_street_from_streets_to_streets(self):
    """Both the origin and the destination corner contribute two street
    slots each."""
    phrase = ("from third street and twenty third avenue "
              "to beacon court and beamer court")
    cn = self.slu.parse_1_best({'utt': Utterance(phrase)})
    expected = ['inform(from_street="3 St")',
                'inform(from_street2="23 Ave")',
                'inform(to_street="Beacon Ct")',
                'inform(to_street2="Beamer Ct")']
    for position, dai_str in enumerate(expected):
        self.assertEquals(dai_str, str(cn[position][1]))
    self.assertEquals(len(expected), len(cn))
def __init__(self, utterance, abutterance_lenghts=None):
    """
    :param utterance: utterance to search words in; a plain list of words
        is wrapped into an Utterance
    :type utterance: Utterance
    :param abutterance_lenghts: numbers of utterance words that correspond
        to each abutterance word. I.e.: an element is 1 if respective
        abutterance word is unabstracted utterance words
    :type abutterance_lenghts: list[int]
    """
    # BUG FIX: the original tested isinstance(self, list), which is never
    # true for an instance under construction, so a list argument was never
    # wrapped into an Utterance.
    self._utterance = utterance if not isinstance(utterance, list) \
        else Utterance(' '.join(utterance))
    # Map each utterance-word index to the index of its abutterance word;
    # without lengths this is the identity mapping.
    self.utt2abutt_idxs = range(len(utterance)) if not abutterance_lenghts else \
        list(chain.from_iterable([idx]*abutterance_lenghts[idx]
                                 for idx in range(len(abutterance_lenghts))))
    # Indices of utterance words consumed by matched phrases (filled in by
    # the phrase-matching methods).
    self._alignment = set()
def get_response(self, utt_text):
    """Processes one user utterance given as text and returns the dialogue
    manager's response.

    :param utt_text: the user's utterance as text
    :return: the result of self.process_dm(), or None on an uncaught error
    """
    try:
        # NOTE(review): the original source was corrupted here -- a
        # scrubbing artifact replaced part of the code with '******',
        # leaving only "...info('User: '******'utt_nbl': utt_nblist})".
        # The reconstruction below (log the input, wrap it into an n-best
        # list, hand it to the SLU, then run the DM) follows the
        # surrounding hub code's conventions; confirm against project
        # history before relying on it.
        self.cfg['Logging']['system_logger'].info('User: ' + utt_text)
        utt_nblist = UtteranceNBList()
        utt_nblist.add(1.0, Utterance(utt_text))
        self.slu.parse({'utt_nbl': utt_nblist})
        return self.process_dm()
    except Exception:
        self.cfg['Logging']['system_logger'].exception(
            'Uncaught exception in WTHUB process.')
        return None
def ending_phrases_in(self, phrases):
    """Returns True if the utterance ends with one of the phrases.

    :param phrases: a list of phrases to search for
    :rtype: bool
    """
    # BUG FIX: the original tested isinstance(self, list) -- never true for
    # an instance -- so a list-typed _utterance was never wrapped.
    utterance = self._utterance if not isinstance(self._utterance, list) \
        else Utterance(' '.join(self._utterance))
    for phrase in phrases:
        pos = self.phrase_pos(phrase)
        phrase_len = len(phrase.split())
        # BUG FIX: replaced 'is'/'is not' int comparisons (which only work
        # by accident via CPython's small-int cache) with value comparisons.
        if pos != -1 and pos + phrase_len == len(utterance):
            # Record which utterance words the matched phrase covers.
            self._alignment.update(range(pos, pos + phrase_len))
            return True
    return False
def main():
    """Reads "wav_key => transcription" lines from a file (first CLI
    argument, default 'uniq.trn'), parses each transcription with the Czech
    HDC SLU and saves "utterance <=> best DA" annotations to
    'uniq.trn.sem.tmp'."""
    cldb = CategoryLabelDatabase('../data/database.py')
    preprocessing = PTICSSLUPreprocessing(cldb)
    slu = PTICSHDCSLU(
        preprocessing,
        cfg={
            'SLU': {
                PTICSHDCSLU: {
                    'utt2da': as_project_path(
                        "applications/PublicTransportInfoCS/data/utt2da_dict.txt"
                    )
                }
            }
        })

    # Which annotation columns to produce.
    output_utterance = True
    output_abstraction = False
    output_da = True

    fn_uniq_trn_sem = 'uniq.trn.sem.tmp'
    if len(sys.argv) < 2:
        fn_uniq_trn = 'uniq.trn'
    else:
        fn_uniq_trn = sys.argv[1]

    print "Processing input from file", fn_uniq_trn
    uniq_trn = codecs.open(fn_uniq_trn, "r", encoding='utf8')
    uniq_trn_sem = {}
    for line in uniq_trn:
        wav_key, utterance = line.split(" => ", 2)
        annotation = []
        if output_utterance:
            annotation += [utterance.rstrip()]
        if output_abstraction:
            norm_utterance = slu.preprocessing.normalise_utterance(utterance)
            abutterance, _ = slu.abstract_utterance(norm_utterance)
            annotation += [abutterance]
        if output_da:
            da = slu.parse_1_best({'utt': Utterance(utterance)}).get_best_da()
            annotation += [unicode(da)]
        uniq_trn_sem[wav_key] = " <=> ".join(annotation)

    print "Saving output to file", fn_uniq_trn_sem
    save_wavaskey(fn_uniq_trn_sem, uniq_trn_sem)
def ending_phrases_in(utterance, phrases):
    """Returns True if the utterance ends with one of the phrases.

    :param utterance: the utterance to search in (an Utterance, or a list
        of words which is wrapped into one)
    :param phrases: a list of phrases to search for
    :rtype: bool
    """
    if isinstance(utterance, list):
        utterance = Utterance(' '.join(utterance))
    utterance_len = len(utterance)
    for phrase in phrases:
        phr_pos = phrase_pos(utterance, phrase)
        # BUG FIX: replaced 'is not -1' / 'is utterance_len' with value
        # comparisons -- identity tests on ints only work by accident via
        # CPython's small-integer cache.
        if phr_pos != -1 and phr_pos + len(phrase.split()) == utterance_len:
            return True
    return False
def test_conversion_of_confnet_into_nblist(self):
    """Builds a 3-slot confusion network and checks that its n-best list
    matches a hand-constructed list of the eight most probable paths."""
    # Per-slot alternative probabilities.
    A1, A2, A3 = 0.90, 0.05, 0.05
    B1, B2, B3 = 0.50, 0.35, 0.15
    C1, C2, C3 = 0.60, 0.30, 0.10

    # The expected n-best list: path probabilities are the products of the
    # per-slot probabilities.
    correct_nblist = UtteranceNBList()
    correct_nblist.add(A1*B1*C1, Utterance("A1 B1 C1"))
    correct_nblist.add(A1*B2*C1, Utterance("A1 B2 C1"))
    correct_nblist.add(A1*B1*C2, Utterance("A1 B1 C2"))
    correct_nblist.add(A1*B2*C2, Utterance("A1 B2 C2"))
    correct_nblist.add(A1*B3*C1, Utterance("A1 B3 C1"))
    correct_nblist.add(A1*B1*C3, Utterance("A1 B1 C3"))
    correct_nblist.add(A1*B3*C2, Utterance("A1 B3 C2"))
    correct_nblist.add(A1*B2*C3, Utterance("A1 B2 C3"))
    correct_nblist.merge()
    correct_nblist.add_other()

    # The same distribution as a confusion network, one slot per add().
    confnet = UtteranceConfusionNetwork()
    confnet.add([[A1, 'A1'], [A2, 'A2'], [A3, 'A3'],])
    confnet.add([[B1, 'B1'], [B2, 'B2'], [B3, 'B3'],])
    confnet.add([[C1, 'C1'], [C2, 'C2'], [C3, 'C3'],])
    confnet.merge().sort()

    gen_nblist = confnet.get_utterance_nblist(10)

    # Diagnostic dump for a failing run.
    s = []
    s.append("")
    s.append("Confusion network:")
    s.append(unicode(confnet))
    s.append("")
    s.append("Generated nblist:")
    s.append(unicode(gen_nblist))
    s.append("")
    s.append("Correct nblist:")
    s.append(unicode(correct_nblist))
    s.append("")
    print '\n'.join(s)

    self.assertEqual(unicode(gen_nblist), unicode(correct_nblist))
def main():
    """Reads "wav_key => transcription" lines from a file (first CLI
    argument, default 'uniq.trn'), parses each with the Czech HDC SLU and
    saves annotations -- optionally with inline DAI alignment markers -- to
    '<input>.sem.tmp'."""
    cldb = CategoryLabelDatabase('../data/database.py')
    preprocessing = PTICSSLUPreprocessing(cldb)
    slu = PTICSHDCSLU(
        preprocessing,
        cfg={
            'SLU': {
                PTICSHDCSLU: {
                    'utt2da': as_project_path(
                        "applications/PublicTransportInfoCS/data/utt2da_dict.txt"
                    )
                }
            }
        })

    # Which annotation columns to produce.
    output_alignment = False
    output_utterance = True
    output_abstraction = False
    output_da = True

    if len(sys.argv) < 2:
        fn_uniq_trn = 'uniq.trn'
    else:
        fn_uniq_trn = sys.argv[1]
    fn_uniq_trn_sem = fn_uniq_trn + '.sem.tmp'

    print "Processing input from file", fn_uniq_trn
    uniq_trn = codecs.open(fn_uniq_trn, "r", encoding='utf8')
    uniq_trn_sem = {}
    for line in uniq_trn:
        wav_key, utterance = line.split(" => ", 2)
        annotation = []
        if output_alignment:
            norm_utterance = slu.preprocessing.normalise_utterance(
                Utterance(utterance))
            abutterance, _, _ = slu.abstract_utterance(norm_utterance)
            abutterance = slu.handle_false_abstractions(abutterance)
            da = slu.parse_1_best({'utt': Utterance(utterance)}).get_best_da()

            # Sort DAIs by the highest utterance index they align to so
            # the inline "[dai - alignment]" markers come out in order;
            # unaligned DAIs go last.
            max_alignment_idx = lambda _dai: max(
                _dai.alignment) if _dai.alignment else len(abutterance)
            for i, dai in enumerate(sorted(da, key=max_alignment_idx)):
                if not dai.alignment:
                    print "Empty alignment:", unicode(abutterance), ";", dai

                if not dai.alignment or dai.alignment == {-1}:
                    dai_alignment_idx = len(abutterance)
                else:
                    # '+ i' presumably compensates for markers already
                    # inserted before this one -- TODO confirm.
                    dai_alignment_idx = max(dai.alignment) + i + 1
                abutterance.insert(
                    dai_alignment_idx,
                    "[{} - {}]".format(
                        unicode(dai),
                        list(dai.alignment if dai.alignment else [])))
            annotation += [unicode(abutterance)]
        else:
            if output_utterance:
                annotation += [utterance.rstrip()]
            if output_abstraction:
                norm_utterance = slu.preprocessing.normalise_utterance(
                    Utterance(utterance))
                abutterance, _ = slu.abstract_utterance(norm_utterance)
                annotation += [abutterance]
            if output_da:
                da = slu.parse_1_best({
                    'utt': Utterance(utterance)
                }).get_best_da()
                annotation += [unicode(da)]

        uniq_trn_sem[wav_key] = " <=> ".join(annotation)

    print "Saving output to file", fn_uniq_trn_sem
    save_wavaskey(fn_uniq_trn_sem, uniq_trn_sem)
def process_call_log(fn):
    """Extracts SLU training data from one call-log XML file: for every
    user turn it collects the transcription, the HDC-SLU semantics of the
    transcription, the 1-best ASR output and the ASR n-best list.

    :param fn: path to an asr_transcribed.xml call log
    :return: tuple (asr, nbl, sem, trn, trn_hdc_sem, fcount, tcount); the
        first five are lists of (wav_key, value) pairs
    """
    name = multiprocessing.current_process().name
    asr = []
    nbl = []
    sem = []
    trn = []
    trn_hdc_sem = []
    fcount = 0
    tcount = 0

    f_dir = os.path.dirname(fn)

    print "Process name:", name
    print "File #", fcount
    fcount += 1
    print "Processing:", fn
    doc = xml.dom.minidom.parse(fn)
    turns = doc.getElementsByTagName("turn")

    for i, turn in enumerate(turns):
        # Only user turns carry the data of interest.
        if turn.getAttribute('speaker') != 'user':
            continue

        recs = turn.getElementsByTagName("rec")
        trans = turn.getElementsByTagName("asr_transcription")
        asrs = turn.getElementsByTagName("asr")

        # Exactly one recording per turn is expected.
        if len(recs) != 1:
            print "Skipping a turn {turn} in file: {fn} - recs: {recs}".format(
                turn=i, fn=fn, recs=len(recs))
            continue

        if len(asrs) == 0 and (i + 1) < len(turns):
            # The ASR output is sometimes delayed into the following turn;
            # try to recover it from there.
            next_asrs = turns[i + 1].getElementsByTagName("asr")
            if len(next_asrs) != 2:
                print "Skipping a turn {turn} in file: {fn} - asrs: {asrs} - next_asrs: {next_asrs}".format(
                    turn=i, fn=fn, asrs=len(asrs), next_asrs=len(next_asrs))
                continue
            print "Recovered from missing ASR output by using a delayed ASR output from the following turn of turn {turn}. File: {fn} - next_asrs: {asrs}".format(
                turn=i, fn=fn, asrs=len(next_asrs))
            hyps = next_asrs[0].getElementsByTagName("hypothesis")
        elif len(asrs) == 1:
            hyps = asrs[0].getElementsByTagName("hypothesis")
        elif len(asrs) == 2:
            print "Recovered from EXTRA ASR outputs by using a the last ASR output from the turn. File: {fn} - asrs: {asrs}".format(
                fn=fn, asrs=len(asrs))
            hyps = asrs[-1].getElementsByTagName("hypothesis")
        else:
            print "Skipping a turn {turn} in file {fn} - asrs: {asrs}".format(
                turn=i, fn=fn, asrs=len(asrs))
            continue

        if len(trans) == 0:
            print "Skipping a turn in {fn} - trans: {trans}".format(
                fn=fn, trans=len(trans))
            continue

        wav_key = recs[0].getAttribute('fname')
        wav_path = os.path.join(f_dir, wav_key)

        # FIXME: Check whether the last transcription is really the best! FJ
        t = various.get_text_from_xml_node(trans[-1])
        t = normalise_text(t)

        if '--asr-log' not in sys.argv:
            # Re-recognise the audio instead of reading the logged output.
            asr_rec_nbl = asr_rec.rec_wav_file(wav_path)
            a = unicode(asr_rec_nbl.get_best())
        else:
            a = various.get_text_from_xml_node(hyps[0])
            a = normalise_semi_words(a)

        if exclude_slu(t) or 'DOM Element:' in a:
            print "Skipping transcription:", unicode(t)
            print "Skipping ASR output:   ", unicode(a)
            continue

        # The silence does not have a label in the language model.
        t = t.replace('_SIL_', '')
        trn.append((wav_key, t))

        print
        print "Transcritpiton #", tcount
        tcount += 1
        print "Parsing transcription:", unicode(t)
        print " ASR:", unicode(a)

        # HDC SLU on transcription
        s = slu.parse_1_best({'utt': Utterance(t)}).get_best_da()
        trn_hdc_sem.append((wav_key, s))

        # 1 best ASR
        asr.append((wav_key, a))

        # N best ASR
        n = UtteranceNBList()
        if '--asr-log' not in sys.argv:
            n = asr_rec_nbl
            print 'ASR RECOGNITION NBLIST\n', unicode(n)
        else:
            for h in hyps:
                txt = various.get_text_from_xml_node(h)
                txt = normalise_semi_words(txt)
                n.add(abs(float(h.getAttribute('p'))), Utterance(txt))

        n.merge()
        n.normalise()

        nbl.append((wav_key, n.serialise()))

        # there is no manual semantics in the transcriptions yet
        sem.append((wav_key, None))

    return asr, nbl, sem, trn, trn_hdc_sem, fcount, tcount
from alex.components.slu.base import CategoryLabelDatabase

"""
Serves to quickly test HDC SLU with a single utterance supplied as argument
"""

# Take the utterance from the command line; fall back to a sample sentence.
if len(sys.argv) < 2:
    print "No utterance entered as argument. Processing sample utterance instead..."
    utterance = u"CHTĚL BYCH JET ZE ZASTÁVKY ANDĚL DO ZASTÁVKY MALOSTRANSKÉ NÁMĚSTÍ"
else:
    utterance = sys.argv[1].decode('utf-8')
    # Strip the argument so downstream code does not re-read it.
    sys.argv = sys.argv[:1]

# Build the Czech HDC SLU pipeline.
cldb = CategoryLabelDatabase('../data/database.py')
preprocessing = PTICSSLUPreprocessing(cldb)
slu = PTICSHDCSLU(
    preprocessing,
    cfg={
        'SLU': {
            PTICSHDCSLU: {
                'utt2da': as_project_path(
                    "applications/PublicTransportInfoCS/data/utt2da_dict.txt")
            }
        }
    })

# Parse the single utterance and print the best dialogue act.
da = slu.parse_1_best({
    'utt': Utterance(utterance)
}, verbose=True).get_best_da()

print "Resulting dialogue act: \n", unicode(da)
def setUp(self):
    """Creates the two fixture utterances used by the tests."""
    self.barbara, self.ararat = (Utterance('b a r b a r a'),
                                 Utterance('a r a r a t'))
def main():
    """Generates the English SLU train/dev/test data: walks call logs under
    'indomain_data', extracts transcriptions and ASR outputs, parses them
    with the HDC SLU, deduplicates, and saves 80/10/10 splits via
    save_wavaskey."""
    cldb = CategoryLabelDatabase('../data/database.py')
    preprocessing = PTIENSLUPreprocessing(cldb)
    slu = PTIENHDCSLU(preprocessing, cfg={'SLU': {PTIENHDCSLU: {'utt2da': as_project_path("applications/PublicTransportInfoEN/data/utt2da_dict.txt")}}})
    cfg = Config.load_configs(['../kaldi.cfg',], use_default=True)
    asr_rec = asr_factory(cfg)

    # Output file names: full set, unique transcriptions, and the splits.
    fn_uniq_trn = 'uniq.trn'
    fn_uniq_trn_hdc_sem = 'uniq.trn.hdc.sem'
    fn_uniq_trn_sem = 'uniq.trn.sem'

    fn_all_sem = 'all.sem'
    fn_all_trn = 'all.trn'
    fn_all_trn_hdc_sem = 'all.trn.hdc.sem'
    fn_all_asr = 'all.asr'
    fn_all_asr_hdc_sem = 'all.asr.hdc.sem'
    fn_all_nbl = 'all.nbl'
    fn_all_nbl_hdc_sem = 'all.nbl.hdc.sem'

    fn_train_sem = 'train.sem'
    fn_train_trn = 'train.trn'
    fn_train_trn_hdc_sem = 'train.trn.hdc.sem'
    fn_train_asr = 'train.asr'
    fn_train_asr_hdc_sem = 'train.asr.hdc.sem'
    fn_train_nbl = 'train.nbl'
    fn_train_nbl_hdc_sem = 'train.nbl.hdc.sem'

    fn_dev_sem = 'dev.sem'
    fn_dev_trn = 'dev.trn'
    fn_dev_trn_hdc_sem = 'dev.trn.hdc.sem'
    fn_dev_asr = 'dev.asr'
    fn_dev_asr_hdc_sem = 'dev.asr.hdc.sem'
    fn_dev_nbl = 'dev.nbl'
    fn_dev_nbl_hdc_sem = 'dev.nbl.hdc.sem'

    fn_test_sem = 'test.sem'
    fn_test_trn = 'test.trn'
    fn_test_trn_hdc_sem = 'test.trn.hdc.sem'
    fn_test_asr = 'test.asr'
    fn_test_asr_hdc_sem = 'test.asr.hdc.sem'
    fn_test_nbl = 'test.nbl'
    fn_test_nbl_hdc_sem = 'test.nbl.hdc.sem'

    indomain_data_dir = "indomain_data"

    print "Generating the SLU train and test data"
    print "-"*120
    ###############################################################################################

    # Collect call logs up to five directory levels deep.
    files = []
    files.append(glob.glob(os.path.join(indomain_data_dir, 'asr_transcribed.xml')))
    files.append(glob.glob(os.path.join(indomain_data_dir, '*', 'asr_transcribed.xml')))
    files.append(glob.glob(os.path.join(indomain_data_dir, '*', '*', 'asr_transcribed.xml')))
    files.append(glob.glob(os.path.join(indomain_data_dir, '*', '*', '*', 'asr_transcribed.xml')))
    files.append(glob.glob(os.path.join(indomain_data_dir, '*', '*', '*', '*', 'asr_transcribed.xml')))
    files.append(glob.glob(os.path.join(indomain_data_dir, '*', '*', '*', '*', '*', 'asr_transcribed.xml')))
    files = various.flatten(files)

    sem = []
    trn = []
    trn_hdc_sem = []
    asr = []
    asr_hdc_sem = []
    nbl = []
    nbl_hdc_sem = []

    for fn in files[:100000]:
        f_dir = os.path.dirname(fn)

        print "Processing:", fn
        doc = xml.dom.minidom.parse(fn)
        turns = doc.getElementsByTagName("turn")

        for i, turn in enumerate(turns):
            # Only user turns carry the data of interest.
            if turn.getAttribute('speaker') != 'user':
                continue

            recs = turn.getElementsByTagName("rec")
            trans = turn.getElementsByTagName("asr_transcription")
            asrs = turn.getElementsByTagName("asr")

            if len(recs) != 1:
                print "Skipping a turn {turn} in file: {fn} - recs: {recs}".format(turn=i,fn=fn, recs=len(recs))
                continue

            if len(asrs) == 0 and (i + 1) < len(turns):
                # The ASR output is sometimes delayed into the next turn.
                next_asrs = turns[i+1].getElementsByTagName("asr")
                if len(next_asrs) != 2:
                    print "Skipping a turn {turn} in file: {fn} - asrs: {asrs} - next_asrs: {next_asrs}".format(turn=i, fn=fn, asrs=len(asrs), next_asrs=len(next_asrs))
                    continue
                print "Recovered from missing ASR output by using a delayed ASR output from the following turn of turn {turn}. File: {fn} - next_asrs: {asrs}".format(turn=i, fn=fn, asrs=len(next_asrs))
                hyps = next_asrs[0].getElementsByTagName("hypothesis")
            elif len(asrs) == 1:
                hyps = asrs[0].getElementsByTagName("hypothesis")
            elif len(asrs) == 2:
                print "Recovered from EXTRA ASR outputs by using a the last ASR output from the turn. File: {fn} - asrs: {asrs}".format(fn=fn, asrs=len(asrs))
                hyps = asrs[-1].getElementsByTagName("hypothesis")
            else:
                print "Skipping a turn {turn} in file {fn} - asrs: {asrs}".format(turn=i,fn=fn, asrs=len(asrs))
                continue

            if len(trans) == 0:
                print "Skipping a turn in {fn} - trans: {trans}".format(fn=fn, trans=len(trans))
                continue

            wav_key = recs[0].getAttribute('fname')
            wav_path = os.path.join(f_dir, wav_key)

            # FIXME: Check whether the last transcription is really the best! FJ
            t = various.get_text_from_xml_node(trans[-1])
            t = normalise_text(t)

            if '--asr-log' not in sys.argv:
                # Re-recognise the audio instead of using the logged output.
                asr_rec_nbl = asr_rec.rec_wav_file(wav_path)
                a = unicode(asr_rec_nbl.get_best())
            else:
                a = various.get_text_from_xml_node(hyps[0])
                a = normalise_semi_words(a)

            if exclude_slu(t) or 'DOM Element:' in a:
                print "Skipping transcription:", unicode(t)
                print "Skipping ASR output:   ", unicode(a)
                continue

            # The silence does not have a label in the language model.
            t = t.replace('_SIL_','')
            trn.append((wav_key, t))

            print "Parsing transcription:", unicode(t)
            print " ASR:", unicode(a)

            # HDC SLU on transcription
            s = slu.parse_1_best({'utt':Utterance(t)}).get_best_da()
            trn_hdc_sem.append((wav_key, s))

            if '--uniq' not in sys.argv:
                # HDC SLU on 1 best ASR
                if '--asr-log' not in sys.argv:
                    a = unicode(asr_rec_nbl.get_best())
                else:
                    a = various.get_text_from_xml_node(hyps[0])
                    a = normalise_semi_words(a)

                asr.append((wav_key, a))

                s = slu.parse_1_best({'utt':Utterance(a)}).get_best_da()
                asr_hdc_sem.append((wav_key, s))

                # HDC SLU on N best ASR
                n = UtteranceNBList()
                if '--asr-log' not in sys.argv:
                    n = asr_rec_nbl
                    print 'ASR RECOGNITION NBLIST\n',unicode(n)
                else:
                    for h in hyps:
                        txt = various.get_text_from_xml_node(h)
                        txt = normalise_semi_words(txt)
                        n.add(abs(float(h.getAttribute('p'))),Utterance(txt))

                n.merge()
                n.normalise()

                nbl.append((wav_key, n.serialise()))

                if '--fast' not in sys.argv:
                    s = slu.parse_nblist({'utt_nbl':n}).get_best_da()
                    nbl_hdc_sem.append((wav_key, s))

            # there is no manual semantics in the transcriptions yet
            sem.append((wav_key, None))

    # Deduplicate by transcription text: the first wav_key seen wins.
    uniq_trn = {}
    uniq_trn_hdc_sem = {}
    uniq_trn_sem = {}
    trn_set = set()

    # NOTE(review): 'sem' is rebound here from the list built above to a
    # dict keyed by wav_key -- the None entries collected in the loop are
    # discarded.
    sem = dict(trn_hdc_sem)
    for k, v in trn:
        if not v in trn_set:
            trn_set.add(v)
            uniq_trn[k] = v
            uniq_trn_hdc_sem[k] = sem[k]
            uniq_trn_sem[k] = v + " <=> " + unicode(sem[k])

    save_wavaskey(fn_uniq_trn, uniq_trn)
    save_wavaskey(fn_uniq_trn_hdc_sem, uniq_trn_hdc_sem, trans = lambda da: '&'.join(sorted(unicode(da).split('&'))))
    save_wavaskey(fn_uniq_trn_sem, uniq_trn_sem)

    # all
    save_wavaskey(fn_all_trn, dict(trn))
    save_wavaskey(fn_all_trn_hdc_sem, dict(trn_hdc_sem), trans = lambda da: '&'.join(sorted(unicode(da).split('&'))))

    if '--uniq' not in sys.argv:
        save_wavaskey(fn_all_asr, dict(asr))
        save_wavaskey(fn_all_asr_hdc_sem, dict(asr_hdc_sem), trans = lambda da: '&'.join(sorted(unicode(da).split('&'))))

        save_wavaskey(fn_all_nbl, dict(nbl))
        save_wavaskey(fn_all_nbl_hdc_sem, dict(nbl_hdc_sem), trans = lambda da: '&'.join(sorted(unicode(da).split('&'))))

        # Shuffle all parallel lists with the same seed so corresponding
        # entries stay aligned across the splits.
        seed_value = 10

        random.seed(seed_value)
        random.shuffle(trn)
        random.seed(seed_value)
        random.shuffle(trn_hdc_sem)
        random.seed(seed_value)
        random.shuffle(asr)
        random.seed(seed_value)
        random.shuffle(asr_hdc_sem)
        random.seed(seed_value)
        random.shuffle(nbl)
        random.seed(seed_value)
        random.shuffle(nbl_hdc_sem)

        # trn -- 80 % train, 10 % dev, 10 % test
        train_trn = trn[:int(0.8*len(trn))]
        dev_trn = trn[int(0.8*len(trn)):int(0.9*len(trn))]
        test_trn = trn[int(0.9*len(trn)):]

        save_wavaskey(fn_train_trn, dict(train_trn))
        save_wavaskey(fn_dev_trn, dict(dev_trn))
        save_wavaskey(fn_test_trn, dict(test_trn))

        # trn_hdc_sem
        train_trn_hdc_sem = trn_hdc_sem[:int(0.8*len(trn_hdc_sem))]
        dev_trn_hdc_sem = trn_hdc_sem[int(0.8*len(trn_hdc_sem)):int(0.9*len(trn_hdc_sem))]
        test_trn_hdc_sem = trn_hdc_sem[int(0.9*len(trn_hdc_sem)):]

        save_wavaskey(fn_train_trn_hdc_sem, dict(train_trn_hdc_sem), trans = lambda da: '&'.join(sorted(unicode(da).split('&'))))
        save_wavaskey(fn_dev_trn_hdc_sem, dict(dev_trn_hdc_sem), trans = lambda da: '&'.join(sorted(unicode(da).split('&'))))
        save_wavaskey(fn_test_trn_hdc_sem, dict(test_trn_hdc_sem), trans = lambda da: '&'.join(sorted(unicode(da).split('&'))))

        # asr
        train_asr = asr[:int(0.8*len(asr))]
        dev_asr = asr[int(0.8*len(asr)):int(0.9*len(asr))]
        test_asr = asr[int(0.9*len(asr)):]

        save_wavaskey(fn_train_asr, dict(train_asr))
        save_wavaskey(fn_dev_asr, dict(dev_asr))
        save_wavaskey(fn_test_asr, dict(test_asr))

        # asr_hdc_sem
        train_asr_hdc_sem = asr_hdc_sem[:int(0.8*len(asr_hdc_sem))]
        dev_asr_hdc_sem = asr_hdc_sem[int(0.8*len(asr_hdc_sem)):int(0.9*len(asr_hdc_sem))]
        test_asr_hdc_sem = asr_hdc_sem[int(0.9*len(asr_hdc_sem)):]

        save_wavaskey(fn_train_asr_hdc_sem, dict(train_asr_hdc_sem), trans = lambda da: '&'.join(sorted(unicode(da).split('&'))))
        save_wavaskey(fn_dev_asr_hdc_sem, dict(dev_asr_hdc_sem), trans = lambda da: '&'.join(sorted(unicode(da).split('&'))))
        save_wavaskey(fn_test_asr_hdc_sem, dict(test_asr_hdc_sem), trans = lambda da: '&'.join(sorted(unicode(da).split('&'))))

        # n-best lists
        train_nbl = nbl[:int(0.8*len(nbl))]
        dev_nbl = nbl[int(0.8*len(nbl)):int(0.9*len(nbl))]
        test_nbl = nbl[int(0.9*len(nbl)):]

        save_wavaskey(fn_train_nbl, dict(train_nbl))
        save_wavaskey(fn_dev_nbl, dict(dev_nbl))
        save_wavaskey(fn_test_nbl, dict(test_nbl))

        # nbl_hdc_sem
        train_nbl_hdc_sem = nbl_hdc_sem[:int(0.8*len(nbl_hdc_sem))]
        dev_nbl_hdc_sem = nbl_hdc_sem[int(0.8*len(nbl_hdc_sem)):int(0.9*len(nbl_hdc_sem))]
        test_nbl_hdc_sem = nbl_hdc_sem[int(0.9*len(nbl_hdc_sem)):]

        save_wavaskey(fn_train_nbl_hdc_sem, dict(train_nbl_hdc_sem), trans = lambda da: '&'.join(sorted(unicode(da).split('&'))))
        save_wavaskey(fn_dev_nbl_hdc_sem, dict(dev_nbl_hdc_sem), trans = lambda da: '&'.join(sorted(unicode(da).split('&'))))
        save_wavaskey(fn_test_nbl_hdc_sem, dict(test_nbl_hdc_sem), trans = lambda da: '&'.join(sorted(unicode(da).split('&'))))
def get_results(self, timeout=0.6): """" Waits for the complete recognition results from the Julius ASR server. Timeout specifies how long it will wait for the end of message. """ msg = "" # Get results from the server. time_slept = 0.0 while time_slept < timeout: msg_part = self.read_server_message(self.msg_timeout) if not msg_part: # Wait and check whether there is a message. time.sleep(self.cfg['Hub']['main_loop_sleep_time']) time_slept += self.cfg['Hub']['main_loop_sleep_time'] if self.debug >= 2: print "gr.time_slept:", time_slept continue msg += msg_part + '\n' if self.debug: print msg if '<CONFNET>' in msg: break else: raise JuliusASRTimeoutException( "Timeout when waiting for the Julius server results.") # Process the results. """ Typical result returned by the Julius ASR. <STARTPROC/> <INPUT STATUS="LISTEN" TIME="1343896296"/> <INPUT STATUS="STARTREC" TIME="1343896311"/> <STARTRECOG/> <INPUT STATUS="ENDREC" TIME="1343896312"/> <ENDRECOG/> <INPUTPARAM FRAMES="164" MSEC="1640"/> <RECOGOUT> <SHYPO RANK="1" SCORE="-7250.111328"> <WHYPO WORD="" CLASSID="<s>" PHONE="sil" CM="0.887"/> <WHYPO WORD="I'M" CLASSID="I'M" PHONE="ah m" CM="0.705"/> <WHYPO WORD="LOOKING" CLASSID="LOOKING" PHONE="l uh k ih ng" CM="0.992"/> <WHYPO WORD="FOR" CLASSID="FOR" PHONE="f er" CM="0.757"/> <WHYPO WORD="A" CLASSID="A" PHONE="ah" CM="0.672"/> <WHYPO WORD="PUB" CLASSID="PUB" PHONE="p ah b" CM="0.409"/> <WHYPO WORD="" CLASSID="</s>" PHONE="sil" CM="1.000"/> </SHYPO> </RECOGOUT> <GRAPHOUT NODENUM="43" ARCNUM="70"> <NODE GID="0" WORD="" CLASSID="<s>" PHONE="sil" BEGIN="0" END="2"/> <NODE GID="1" WORD="" CLASSID="<s>" PHONE="sil" BEGIN="0" END="3"/> <NODE GID="2" WORD="" CLASSID="<s>" PHONE="sil" BEGIN="0" END="4"/> <NODE GID="3" WORD="I" CLASSID="I" PHONE="ay" BEGIN="3" END="5"/> <NODE GID="4" WORD="NO" CLASSID="NO" PHONE="n ow" BEGIN="3" END="7"/> <NODE GID="5" WORD="I" CLASSID="I" PHONE="ay" BEGIN="4" END="6"/> <NODE GID="6" WORD="UH" CLASSID="UH" PHONE="ah" BEGIN="4" END="6"/> <NODE 
GID="7" WORD="I'M" CLASSID="I'M" PHONE="ay m" BEGIN="4" END="27"/> ... <NODE GID="38" WORD="PUB" CLASSID="PUB" PHONE="p ah b" BEGIN="79" END="104"/> <NODE GID="39" WORD="AH" CLASSID="AH" PHONE="aa" BEGIN="81" END="110"/> <NODE GID="40" WORD="LOT" CLASSID="LOT" PHONE="l aa t" BEGIN="81" END="110"/> <NODE GID="41" WORD="" CLASSID="</s>" PHONE="sil" BEGIN="105" END="163"/> <NODE GID="42" WORD="" CLASSID="</s>" PHONE="sil" BEGIN="111" END="163"/> <ARC FROM="0" TO="4"/> <ARC FROM="0" TO="3"/> <ARC FROM="1" TO="7"/> <ARC FROM="1" TO="5"/> <ARC FROM="1" TO="6"/> ... <ARC FROM="38" TO="41"/> <ARC FROM="39" TO="42"/> <ARC FROM="40" TO="42"/> </GRAPHOUT> <CONFNET> <WORD> <ALTERNATIVE PROB="1.000"></ALTERNATIVE> </WORD> <WORD> <ALTERNATIVE PROB="0.950">I</ALTERNATIVE> <ALTERNATIVE PROB="0.020">HI</ALTERNATIVE> <ALTERNATIVE PROB="0.013">NO</ALTERNATIVE> <ALTERNATIVE PROB="0.010"></ALTERNATIVE> <ALTERNATIVE PROB="0.006">UH</ALTERNATIVE> </WORD> <WORD> <ALTERNATIVE PROB="0.945">AM</ALTERNATIVE> <ALTERNATIVE PROB="0.055">I'M</ALTERNATIVE> </WORD> <WORD> <ALTERNATIVE PROB="1.000">LOOKING</ALTERNATIVE> </WORD> <WORD> <ALTERNATIVE PROB="1.000">FOR</ALTERNATIVE> </WORD> <WORD> <ALTERNATIVE PROB="1.000">A</ALTERNATIVE> </WORD> <WORD> <ALTERNATIVE PROB="0.963">PUB</ALTERNATIVE> <ALTERNATIVE PROB="0.016">AH</ALTERNATIVE> <ALTERNATIVE PROB="0.012">BAR</ALTERNATIVE> <ALTERNATIVE PROB="0.008">LOT</ALTERNATIVE> </WORD> <WORD> <ALTERNATIVE PROB="1.000"></ALTERNATIVE> </WORD> </CONFNET> <INPUT STATUS="LISTEN" TIME="1343896312"/> """ msg = "<RESULTS>" + msg + "</RESULTS>" msg = msg.replace("<s>", "<s>").replace("</s>", "</s>") nblist = UtteranceNBList() doc = xml.dom.minidom.parseString(msg) recogout = doc.getElementsByTagName("RECOGOUT") for el in recogout: shypo = el.getElementsByTagName("SHYPO") for el in shypo: whypo = el.getElementsByTagName("WHYPO") utterance = "" cm = 1.0 for el in whypo: word = el.getAttribute("WORD") utterance += " " + word if word: cm *= float(el.getAttribute("CM")) 
nblist.add(cm, Utterance(utterance)) nblist.merge() nblist.add_other() cn = UtteranceConfusionNetwork() confnet = doc.getElementsByTagName("CONFNET") for el in confnet: word = el.getElementsByTagName("WORD") for el in word: alternative = el.getElementsByTagName("ALTERNATIVE") word_list = [] for el in alternative: prob = float(el.getAttribute("PROB")) text = get_text_from_xml_node(el) word_list.append([prob, text]) # Filter out empty hypotheses. if len(word_list) == 0: continue if len(word_list) == 1 and len(word_list[0][1]) == 0: continue # Add the word into the confusion network. cn.add(word_list) cn.merge() cn.normalise() cn.prune() cn.normalise() cn.sort() return nblist, cn
cfg = Config.load_configs(args.configs) ######################################################################### ######################################################################### term_width = getTerminalSize()[1] or 120 cfg['Logging']['system_logger'].info("Text Hub\n" + "=" * (term_width - 4)) cfg['Logging']['system_logger'].session_start("localhost") cfg['Logging']['system_logger'].session_system_log('config = ' + unicode(cfg)) cfg['Logging']['session_logger'].session_start( cfg['Logging']['system_logger'].get_session_dir_name()) cfg['Logging']['session_logger'].config('config = ' + unicode(cfg)) cfg['Logging']['session_logger'].header(cfg['Logging']["system_name"], cfg['Logging']["version"]) cfg['Logging']['session_logger'].input_source("text") thub = TextHub(cfg, args.tts_preprocessing) if args.scripts: for script in args.scripts: with open(script) as f_in: for ln in f_in: thub.process_dm() ln = ln.decode('utf8').strip() print "SCRIPT: %s" % ln thub.process_utterance_hyp({'utt': Utterance(ln)}) thub.run()
def test_parse_meta(self):
    """Parses a battery of one-best utterances and checks that each one
    yields the expected (meta) dialogue act item."""
    cases = (
        (u"ahoj", "hello()"),
        (u"sbohem čau", "bye()"),
        (u"jiné", "reqalts()"),
        (u"začneme znovu", "restart()"),
        (u"zopakuj", "repeat()"),
        (u"promiň", "apology()"),
        (u"co se zeptat", "help()"),
        (u"haló", "canthearyou()"),
        (u"nerozuměl jsem", "notunderstood()"),
        (u"ano jo", "affirm()"),
        (u"ne ano nechci", "negate()"),
        (u"děkuji", "thankyou()"),
        (u"dobře", "ack()"),
        (u"chci jet", "inform(task=find_connection)"),
        (u"jak bude", "inform(task=weather)"),
        (u"nástupiště", "inform(task=find_platform)"),
        (u"z jaké jede", "request(from_stop)"),
        (u"kam to jede", "request(to_stop)"),
        (u"kdy to jede", "request(departure_time)"),
        (u"za jak dlouho", "request(departure_time_rel)"),
        (u"kdy tam budem", "request(arrival_time)"),
        (u"za jak dlouho tam přijedu", "request(arrival_time_rel)"),
        (u"jak dlouho bude trvat cesta", "request(duration)"),
        (u"kolik je hodin", "request(current_time)"),
        (u"jak dlouho trvá přestup", "request(time_transfers)"),
        (u"kolik přestupů", "request(num_transfers)"),
        (u"nechci přestup bez jet přímo", "inform(num_transfers=0)"),
        (u"jeden přestup", "inform(num_transfers=1)"),
        (u"dva přestupy", "inform(num_transfers=2)"),
        (u"tři přestupy", "inform(num_transfers=3)"),
        (u"čtyři přestupy", "inform(num_transfers=4)"),
        (u"libovolně přestupů", "inform(num_transfers=dontcare)"),
        (u"jet přímo", "inform(num_transfers=0)"),
        (u"alternativa libovolný", "inform(alternative=dontcare)"),
        (u"alternativa první", "inform(alternative=1)"),
        (u"alternativa druhá", "inform(alternative=2)"),
        (u"alternativa třetí", "inform(alternative=3)"),
        (u"alternativa čtvrtá", "inform(alternative=4)"),
        (u"alternativa páté", "inform(alternative=5)"),
        (u"předchozí spoj", "inform(alternative=prev)"),
        (u"nechci předchozí spoj", "deny(alternative=prev)"),
        (u"poslední spoj", "inform(alternative=last)"),
        (u"nechci poslední spoj", "deny(alternative=last)"),
        (u"další spoj", "inform(alternative=next)"),
        (u"další", "inform(alternative=next)"),
        (u"předchozí", "inform(alternative=prev)"),
        (u"jako ve dne", "inform(ampm=pm)"),
    )
    for text, expected_dai in cases:
        # Wrap the utterance in a one-item n-best list, as the SLU expects.
        one_best = UtteranceNBList()
        one_best.add(0.79, Utterance(text))
        parsed = self.slu.parse(one_best)
        self.assertIn(DialogueActItem(dai=expected_dai), parsed)