def main(): cldb = CategoryLabelDatabase('../data/database.py') preprocessing = PTICSSLUPreprocessing(cldb) slu = PTICSHDCSLU(preprocessing, cfg={'SLU': {PTICSHDCSLU: {'utt2da': as_project_path("applications/PublicTransportInfoCS/data/utt2da_dict.txt")}}}) output_alignment = False output_utterance = True output_abstraction = False output_da = True if len(sys.argv) < 2: fn_uniq_trn = 'uniq.trn' else: fn_uniq_trn = sys.argv[1] fn_uniq_trn_sem = fn_uniq_trn + '.sem.tmp' print "Processing input from file", fn_uniq_trn uniq_trn = codecs.open(fn_uniq_trn, "r", encoding='utf8') uniq_trn_sem = {} for line in uniq_trn: wav_key, utterance = line.split(" => ", 2) annotation = [] if output_alignment: norm_utterance = slu.preprocessing.normalise_utterance(Utterance(utterance)) abutterance, _, _ = slu.abstract_utterance(norm_utterance) abutterance = slu.handle_false_abstractions(abutterance) da = slu.parse_1_best({'utt': Utterance(utterance)}).get_best_da() max_alignment_idx = lambda _dai: max(_dai.alignment) if _dai.alignment else len(abutterance) for i, dai in enumerate(sorted(da, key=max_alignment_idx)): if not dai.alignment: print "Empty alignment:", unicode(abutterance), ";", dai if not dai.alignment or dai.alignment == {-1}: dai_alignment_idx = len(abutterance) else: dai_alignment_idx = max(dai.alignment) + i + 1 abutterance.insert(dai_alignment_idx, "[{} - {}]".format(unicode(dai), list(dai.alignment if dai.alignment else []))) annotation += [unicode(abutterance)] else: if output_utterance: annotation += [utterance.rstrip()] if output_abstraction: norm_utterance = slu.preprocessing.normalise_utterance(Utterance(utterance)) abutterance, _ = slu.abstract_utterance(norm_utterance) annotation += [abutterance] if output_da: da = slu.parse_1_best({'utt': Utterance(utterance)}).get_best_da() annotation += [unicode(da)] uniq_trn_sem[wav_key] = " <=> ".join(annotation) print "Saving output to file", fn_uniq_trn_sem save_wavaskey(fn_uniq_trn_sem, uniq_trn_sem)
def process_file(file_path):
    """Re-parse the user turns of a tab-separated dialogue file with the HDC SLU.

    Every non-empty line of *file_path* is split into three tab-separated
    fields (speaker, dialogue act, utterance).  Lines whose speaker field
    contains ``SYSTEM`` are ignored.  For each remaining line one
    tab-separated record is printed to a UTF-8 encoded stdout: the utterance,
    the string produced by ``get_abutt_str``, and the text of the best
    dialogue act captured before and after the call to ``abstract_da``.
    """
    database = CategoryLabelDatabase(
        as_project_path('applications/PublicTransportInfoCS/data/database.py'))
    slu_preprocessing = PTICSSLUPreprocessing(database)
    slu_config = {
        'SLU': {
            PTICSHDCSLU: {
                'utt2da': as_project_path(
                    'applications/PublicTransportInfoCS/data/utt2da_dict.txt'),
            },
        },
    }
    parser = PTICSHDCSLU(slu_preprocessing, cfg=slu_config)
    out = codecs.getwriter('UTF-8')(sys.stdout)

    with open(file_path, 'r') as raw:
        decoded = codecs.getreader('UTF-8')(raw)
        for record in decoded:
            record = record.strip("\r\n")

            # empty lines mark dialogue boundaries -- nothing to parse
            if not record:
                continue

            speaker, _, text = record.split("\t")

            # only user utterances are of interest, system turns are dropped
            if 'SYSTEM' in speaker:
                continue

            # parse the transcription again, with commas turned into spaces
            utt = Utterance(re.sub(r',', r' ', text))
            sem = parser.parse({'utt': utt})

            # textual form of the abstracted utterance
            abstracted = parser.abstract_utterance(utt)
            abutt_str = get_abutt_str(utt, abstracted)

            # DA text before and after abstract_da() is applied
            best_da = sem.get_best_da()
            da_before = unicode(best_da)
            abstract_da(best_da)

            fields = [unicode(utt), abutt_str, da_before, unicode(best_da)]
            print >> out, "\t".join(fields)
def main(): cldb = CategoryLabelDatabase('../data/database.py') preprocessing = PTICSSLUPreprocessing(cldb) slu = PTICSHDCSLU( preprocessing, cfg={ 'SLU': { PTICSHDCSLU: { 'utt2da': as_project_path( "applications/PublicTransportInfoCS/data/utt2da_dict.txt" ) } } }) output_utterance = True output_abstraction = False output_da = True fn_uniq_trn_sem = 'uniq.trn.sem.tmp' if len(sys.argv) < 2: fn_uniq_trn = 'uniq.trn' else: fn_uniq_trn = sys.argv[1] print "Processing input from file", fn_uniq_trn uniq_trn = codecs.open(fn_uniq_trn, "r", encoding='utf8') uniq_trn_sem = {} for line in uniq_trn: wav_key, utterance = line.split(" => ", 2) annotation = [] if output_utterance: annotation += [utterance.rstrip()] if output_abstraction: norm_utterance = slu.preprocessing.normalise_utterance(utterance) abutterance, _ = slu.abstract_utterance(norm_utterance) annotation += [abutterance] if output_da: da = slu.parse_1_best({'utt': Utterance(utterance)}).get_best_da() annotation += [unicode(da)] uniq_trn_sem[wav_key] = " <=> ".join(annotation) print "Saving output to file", fn_uniq_trn_sem save_wavaskey(fn_uniq_trn_sem, uniq_trn_sem)
def main(): import autopath cldb = CategoryLabelDatabase('../data/database.py') preprocessing = PTICSSLUPreprocessing(cldb) slu = PTICSHDCSLU(preprocessing, cfg = {'SLU': {PTICSHDCSLU: {'utt2da': as_project_path("applications/PublicTransportInfoCS/data/utt2da_dict.txt")}}}) output_utterance = True output_abstraction = False output_da = True fn_uniq_trn_sem = 'uniq.trn.sem.tmp' if len(sys.argv) < 2: fn_uniq_trn = 'uniq.trn' else: fn_uniq_trn = sys.argv[1] print "Processing input from file", fn_uniq_trn uniq_trn = codecs.open(fn_uniq_trn, "r", encoding='utf8') uniq_trn_sem = {} for line in uniq_trn: wav_key, utterance = line.split(" => ", 2) annotation = [] if output_utterance: annotation += [utterance.rstrip()] if output_abstraction: norm_utterance = slu.preprocessing.normalise_utterance(utterance) abutterance, _ = slu.abstract_utterance(norm_utterance) annotation += [abutterance] if output_da: da = slu.parse_1_best({'utt': Utterance(utterance)}).get_best_da() annotation += [unicode(da)] uniq_trn_sem[wav_key] = " <=> ".join(annotation) print "Saving output to file", fn_uniq_trn_sem save_wavaskey(fn_uniq_trn_sem, uniq_trn_sem)
def main(): cldb = CategoryLabelDatabase('../data/database.py') preprocessing = PTICSSLUPreprocessing(cldb) slu = PTICSHDCSLU( preprocessing, cfg={ 'SLU': { PTICSHDCSLU: { 'utt2da': as_project_path( "applications/PublicTransportInfoCS/data/utt2da_dict.txt" ) } } }) output_alignment = False output_utterance = True output_abstraction = False output_da = True if len(sys.argv) < 2: fn_uniq_trn = 'uniq.trn' else: fn_uniq_trn = sys.argv[1] fn_uniq_trn_sem = fn_uniq_trn + '.sem.tmp' print "Processing input from file", fn_uniq_trn uniq_trn = codecs.open(fn_uniq_trn, "r", encoding='utf8') uniq_trn_sem = {} for line in uniq_trn: wav_key, utterance = line.split(" => ", 2) annotation = [] if output_alignment: norm_utterance = slu.preprocessing.normalise_utterance( Utterance(utterance)) abutterance, _, _ = slu.abstract_utterance(norm_utterance) abutterance = slu.handle_false_abstractions(abutterance) da = slu.parse_1_best({'utt': Utterance(utterance)}).get_best_da() max_alignment_idx = lambda _dai: max( _dai.alignment) if _dai.alignment else len(abutterance) for i, dai in enumerate(sorted(da, key=max_alignment_idx)): if not dai.alignment: print "Empty alignment:", unicode(abutterance), ";", dai if not dai.alignment or dai.alignment == {-1}: dai_alignment_idx = len(abutterance) else: dai_alignment_idx = max(dai.alignment) + i + 1 abutterance.insert( dai_alignment_idx, "[{} - {}]".format( unicode(dai), list(dai.alignment if dai.alignment else []))) annotation += [unicode(abutterance)] else: if output_utterance: annotation += [utterance.rstrip()] if output_abstraction: norm_utterance = slu.preprocessing.normalise_utterance( Utterance(utterance)) abutterance, _ = slu.abstract_utterance(norm_utterance) annotation += [abutterance] if output_da: da = slu.parse_1_best({ 'utt': Utterance(utterance) }).get_best_da() annotation += [unicode(da)] uniq_trn_sem[wav_key] = " <=> ".join(annotation) print "Saving output to file", fn_uniq_trn_sem save_wavaskey(fn_uniq_trn_sem, uniq_trn_sem)
utterance = u"CHTĚL BYCH JET ZE ZASTÁVKY ANDĚL DO ZASTÁVKY MALOSTRANSKÉ NÁMĚSTÍ" else: utterance = sys.argv[1].decode("utf-8") sys.argv = sys.argv[:1] cldb = CategoryLabelDatabase("../data/database.py") preprocessing = PTICSSLUPreprocessing(cldb) slu = PTICSHDCSLU( preprocessing, cfg={ "SLU": {PTICSHDCSLU: {"utt2da": as_project_path("applications/PublicTransportInfoCS/data/utt2da_dict.txt")}} }, ) norm_utterance = slu.preprocessing.normalise_utterance(Utterance(utterance)) abutterance, _, _ = slu.abstract_utterance(norm_utterance) da = slu.parse_1_best({"utt": Utterance(utterance)}, verbose=True).get_best_da() print "Abstracted utterance:", unicode(abutterance) print "Dialogue act:", unicode(da) max_alignment_idx = lambda _dai: max(_dai.alignment) if _dai.alignment else len(abutterance) for i, dai in enumerate(sorted(da, key=max_alignment_idx)): if not dai.alignment: print "Empty alignment:", unicode(abutterance), ";", dai if not dai.alignment or dai.alignment == -1: dai_alignment_idx = len(abutterance) else: dai_alignment_idx = max(dai.alignment) + i + 1 abutterance.insert(dai_alignment_idx, "[{} - {}]".format(dai, dai.alignment))
slu = PTICSHDCSLU( preprocessing, cfg={ 'SLU': { PTICSHDCSLU: { 'utt2da': as_project_path( "applications/PublicTransportInfoCS/data/utt2da_dict.txt" ) } } }) norm_utterance = slu.preprocessing.normalise_utterance( Utterance(utterance)) abutterance, _, _ = slu.abstract_utterance(norm_utterance) da = slu.parse_1_best({ 'utt': Utterance(utterance) }, verbose=True).get_best_da() print "Abstracted utterance:", unicode(abutterance) print "Dialogue act:", unicode(da) max_alignment_idx = lambda _dai: max( _dai.alignment) if _dai.alignment else len(abutterance) for i, dai in enumerate(sorted(da, key=max_alignment_idx)): if not dai.alignment: print "Empty alignment:", unicode(abutterance), ";", dai if not dai.alignment or dai.alignment == -1: dai_alignment_idx = len(abutterance) else: