def process_file(file_path):
    """Re-parse the user utterances in ``file_path`` and print them.

    The file is decoded as UTF-8 and read line by line.  Each non-empty
    line is split on tabs into exactly three fields (speaker, dialogue
    act, utterance); a line with a different number of fields raises
    ``ValueError`` from the tuple unpacking.  Empty lines and lines whose
    speaker field contains ``'SYSTEM'`` are skipped.

    For every remaining line, one row of four tab-separated columns is
    written to a UTF-8 writer wrapped around ``sys.stdout``: the utterance
    (commas replaced by spaces), its abstracted form, the best dialogue
    act as parsed, and the best dialogue act after ``abstract_da`` has
    been called on it.
    """
    database = CategoryLabelDatabase(
        as_project_path('applications/PublicTransportInfoEN/data/database.py'))
    preprocessing = PTIENSLUPreprocessing(database)
    hdc_slu = PTIENHDCSLU(
        preprocessing,
        cfg={'SLU': {PTIENHDCSLU: {'utt2da': as_project_path(
            'applications/PublicTransportInfoEN/data/utt2da_dict.txt')}}})

    stdout = codecs.getwriter('UTF-8')(sys.stdout)
    decode_utf8 = codecs.getreader('UTF-8')

    with open(file_path, 'r') as fh:
        for raw_line in decode_utf8(fh):
            record = raw_line.strip("\r\n")

            # Empty lines mark dialogue boundaries; nothing to parse.
            if not record:
                continue

            # The dialogue act stored in the file is not used; the
            # utterance is re-parsed below instead.
            speaker, _file_da, text = record.split("\t")

            # Only user utterances are of interest.
            if 'SYSTEM' in speaker:
                continue

            # Re-parse the transcription, with commas turned into spaces.
            utt = Utterance(text.replace(',', ' '))
            sem = hdc_slu.parse({'utt': utt})

            # Abstracted utterance text.
            abutt_str = get_abutt_str(utt, hdc_slu.abstract_utterance(utt))

            # Capture the DA text before abstraction.  abstract_da's return
            # value is discarded and best_da is printed afterwards, so it
            # presumably modifies best_da in place -- TODO confirm.
            best_da = sem.get_best_da()
            best_da_str = unicode(best_da)
            abstract_da(best_da)

            columns = [unicode(utt), abutt_str, best_da_str, unicode(best_da)]
            print >> stdout, "\t".join(columns)
def process_file(file_path):
    """Print re-parsed, abstracted user utterances from a dialogue file.

    ``file_path`` is opened and decoded as UTF-8.  Every non-empty line
    must consist of three tab-separated fields -- speaker, dialogue act and
    utterance -- otherwise the unpacking of the split raises
    ``ValueError``.  Lines whose speaker field contains ``'SYSTEM'`` are
    ignored, as are empty lines.

    Each user utterance has its commas replaced by spaces and is parsed
    with the hand-crafted SLU.  The output row, written through a UTF-8
    writer on ``sys.stdout``, has four tab-separated columns: utterance,
    abstracted utterance, best dialogue act, and the best dialogue act
    after ``abstract_da`` has been applied.
    """
    db_path = as_project_path(
        'applications/PublicTransportInfoEN/data/database.py')
    cldb = CategoryLabelDatabase(db_path)
    preprocessing = PTIENSLUPreprocessing(cldb)

    slu_cfg = {
        'SLU': {
            PTIENHDCSLU: {
                'utt2da': as_project_path(
                    'applications/PublicTransportInfoEN/data/utt2da_dict.txt'
                ),
            },
        },
    }
    hdc_slu = PTIENHDCSLU(preprocessing, cfg=slu_cfg)

    stdout = codecs.getwriter('UTF-8')(sys.stdout)

    with open(file_path, 'r') as fh:
        utf8_lines = codecs.getreader('UTF-8')(fh)
        for line in utf8_lines:
            line = line.strip("\r\n")

            # An empty line is a dialogue boundary and carries no data.
            if line:
                person, da, transcription = line.split("\t")

                # System turns are skipped; only user turns are re-parsed.
                if 'SYSTEM' not in person:
                    # Build the utterance from the transcription with
                    # commas replaced by spaces.
                    cleaned = re.sub(r',', r' ', transcription)
                    utt = Utterance(cleaned)
                    sem = hdc_slu.parse({'utt': utt})

                    # Abstracted form of the utterance text.
                    abutt = hdc_slu.abstract_utterance(utt)
                    abutt_str = get_abutt_str(utt, abutt)

                    # Keep the DA's text from before abstract_da runs; the
                    # call's result is unused, so it presumably changes
                    # best_da in place -- verify against abstract_da.
                    best_da = sem.get_best_da()
                    best_da_str = unicode(best_da)
                    abstract_da(best_da)

                    row = unicode(utt) + "\t" + abutt_str
                    row = row + "\t" + best_da_str
                    row = row + "\t" + unicode(best_da)
                    print >> stdout, row