def hdc_slu(fn_input, constructor, fn_output): """ Use for transcription a HDC SLU model. :param fn_model: :param fn_input: :param constructor: :param fn_reference: :return: """ print "="*120 print "HDC SLU: ", fn_input, fn_output print "-"*120 from alex.components.slu.base import CategoryLabelDatabase from alex.applications.PublicTransportInfoCS.preprocessing import PTICSSLUPreprocessing from alex.applications.PublicTransportInfoCS.hdc_slu import PTICSHDCSLU from alex.corpustools.wavaskey import load_wavaskey, save_wavaskey from alex.corpustools.semscore import score cldb = CategoryLabelDatabase('../data/database.py') preprocessing = PTICSSLUPreprocessing(cldb) hdc_slu = PTICSHDCSLU(preprocessing, cfg = {'SLU': {PTICSHDCSLU: {'utt2da': as_project_path("applications/PublicTransportInfoCS/data/utt2da_dict.txt")}}}) test_utterances = load_wavaskey(fn_input, constructor, limit=1000000) parsed_das = {} for utt_key, utt in sorted(test_utterances.iteritems()): if isinstance(utt, Utterance): obs = {'utt': utt} elif isinstance(utt, UtteranceNBList): obs = {'utt_nbl': utt} else: raise BaseException('Unsupported observation type') print '-' * 120 print "Observation:" print utt_key, " ==> " print unicode(utt) da_confnet = hdc_slu.parse(obs, verbose=False) print "Conf net:" print unicode(da_confnet) da_confnet.prune() dah = da_confnet.get_best_da_hyp() print "1 best: " print unicode(dah) parsed_das[utt_key] = dah.da if 'CL_' in str(dah.da): print '*' * 120 print utt print dah.da hdc_slu.parse(obs, verbose=True) save_wavaskey(fn_output, parsed_das, trans = lambda da: '&'.join(sorted(unicode(da).split('&'))))
def process_file(file_path):
    """Parse every user turn of a tab-separated dialogue transcript with the
    HDC SLU and print, per turn: utterance, abstracted utterance, best DA,
    and abstracted best DA (UTF-8, tab-separated, one line per turn).

    :param file_path: path to the transcript; each non-empty line is
        ``speaker<TAB>da<TAB>utterance``, empty lines separate dialogues
    """
    database = CategoryLabelDatabase(
        as_project_path('applications/PublicTransportInfoCS/data/database.py'))
    slu_preproc = PTICSSLUPreprocessing(database)
    slu = PTICSHDCSLU(
        slu_preproc,
        cfg={'SLU': {PTICSHDCSLU: {'utt2da': as_project_path('applications/PublicTransportInfoCS/data/utt2da_dict.txt')}}})

    out = codecs.getwriter('UTF-8')(sys.stdout)

    with open(file_path, 'r') as in_file:
        for raw_line in codecs.getreader('UTF-8')(in_file):
            raw_line = raw_line.strip("\r\n")

            # Empty lines mark dialogue boundaries — nothing to parse.
            if not raw_line:
                continue

            speaker, da, text = raw_line.split("\t")

            # Only user turns are of interest; drop system turns.
            if 'SYSTEM' in speaker:
                continue

            # Re-parse from the transcription, with commas turned into spaces.
            utterance = Utterance(re.sub(r',', r' ', text))
            parse = slu.parse({'utt': utterance})

            # Abstracted form of the utterance text.
            abstracted = slu.abstract_utterance(utterance)
            abstracted_str = get_abutt_str(utterance, abstracted)

            # Best dialogue act, kept both verbatim and abstracted (in place).
            top_da = parse.get_best_da()
            top_da_str = unicode(top_da)
            abstract_da(top_da)

            print >> out, unicode(utterance) + "\t" + abstracted_str + "\t" + top_da_str + "\t" + unicode(top_da)
def hdc_slu_test(fn_input, constructor, fn_reference): """ Tests the HDC SLU. :param fn_model: :param fn_input: :param constructor: :param fn_reference: :return: """ print "=" * 120 print "Testing HDC SLU: ", fn_input, fn_reference print "-" * 120 from alex.components.slu.base import CategoryLabelDatabase from alex.applications.PublicTransportInfoCS.preprocessing import PTICSSLUPreprocessing from alex.applications.PublicTransportInfoCS.hdc_slu import PTICSHDCSLU from alex.corpustools.wavaskey import load_wavaskey, save_wavaskey from alex.corpustools.semscore import score cldb = CategoryLabelDatabase('../data/database.py') preprocessing = PTICSSLUPreprocessing(cldb) hdc_slu = PTICSHDCSLU( preprocessing, cfg={ 'SLU': { PTICSHDCSLU: { 'utt2da': as_project_path( "applications/PublicTransportInfoCS/data/utt2da_dict.txt" ) } } }) test_utterances = load_wavaskey(fn_input, constructor, limit=100000) parsed_das = {} for utt_key, utt in sorted(test_utterances.iteritems()): if isinstance(utt, Utterance): obs = {'utt': utt} elif isinstance(utt, UtteranceNBList): obs = {'utt_nbl': utt} else: raise BaseException('Unsupported observation type') print '-' * 120 print "Observation:" print utt_key, " ==> " print unicode(utt) da_confnet = hdc_slu.parse(obs, verbose=False) print "Conf net:" print unicode(da_confnet) da_confnet.prune() dah = da_confnet.get_best_da_hyp() print "1 best: " print unicode(dah) parsed_das[utt_key] = dah.da if 'CL_' in str(dah.da): print '*' * 120 print utt print dah.da hdc_slu.parse(obs, verbose=True) fn_sem = os.path.basename(fn_input) + '.hdc.sem.out' save_wavaskey(fn_sem, parsed_das, trans=lambda da: '&'.join(sorted(unicode(da).split('&')))) f = codecs.open(os.path.basename(fn_sem) + '.score', 'w+', encoding='UTF-8') score(fn_reference, fn_sem, True, True, f) f.close()
def hdc_slu_test(fn_input, constructor, fn_reference): """ Tests a SLU DAILogRegClassifier model. :param fn_model: :param fn_input: :param constructor: :param fn_reference: :return: """ print "="*120 print "Testing HDC SLU: ", fn_input, fn_reference print "-"*120 from alex.components.slu.base import CategoryLabelDatabase from alex.applications.PublicTransportInfoCS.preprocessing import PTICSSLUPreprocessing from alex.applications.PublicTransportInfoCS.hdc_slu import PTICSHDCSLU from alex.corpustools.wavaskey import load_wavaskey, save_wavaskey from alex.corpustools.semscore import score cldb = CategoryLabelDatabase('../data/database.py') preprocessing = PTICSSLUPreprocessing(cldb) hdc_slu = PTICSHDCSLU(preprocessing) test_utterances = load_wavaskey(fn_input, constructor, limit=100000) parsed_das = {} for utt_key, utt in sorted(test_utterances.iteritems()): if isinstance(utt, Utterance): obs = {'utt': utt} elif isinstance(utt, UtteranceNBList): obs = {'utt_nbl': utt} else: raise BaseException('Unsupported observation type') print '-' * 120 print "Observation:" print utt_key, " ==> " print unicode(utt) da_confnet = hdc_slu.parse(obs, verbose=False) print "Conf net:" print unicode(da_confnet) da_confnet.prune() dah = da_confnet.get_best_da_hyp() print "1 best: " print unicode(dah) parsed_das[utt_key] = dah.da if 'CL_' in str(dah.da): print '*' * 120 print utt print dah.da hdc_slu.parse(obs, verbose=True) fn_sem = os.path.basename(fn_input)+'.hdc.sem.out' save_wavaskey(fn_sem, parsed_das, trans = lambda da: '&'.join(sorted(unicode(da).split('&')))) f = codecs.open(os.path.basename(fn_sem)+'.score', 'w+', encoding='UTF-8') score(fn_reference, fn_sem, True, True, f) f.close()