예제 #1
0
파일: reparse_en.py 프로젝트: cifkao/alex
def process_file(file_path):

    cldb = CategoryLabelDatabase(as_project_path('applications/PublicTransportInfoEN/data/database.py'))
    preprocessing = PTIENSLUPreprocessing(cldb)
    hdc_slu = PTIENHDCSLU(preprocessing, cfg={'SLU': {PTIENHDCSLU: {'utt2da': as_project_path('applications/PublicTransportInfoEN/data/utt2da_dict.txt')}}})
    stdout = codecs.getwriter('UTF-8')(sys.stdout)

    with open(file_path, 'r') as fh:
        for line in codecs.getreader('UTF-8')(fh):
            line = line.strip("\r\n")

            # skip empty lines (dialogue boundaries)
            if not line:
                continue

            person, da, utt = line.split("\t")
            # skip system utterances, use just user utterances
            if 'SYSTEM' in person:
                continue

            # reparse utterance using transcription
            utt = re.sub(r',', r' ', utt)
            utt = Utterance(utt)
            sem = hdc_slu.parse({'utt': utt})

            # get abstracted utterance text
            abutt = hdc_slu.abstract_utterance(utt)
            abutt_str = get_abutt_str(utt, abutt)

            # get abstracted DA
            best_da = sem.get_best_da()
            best_da_str = unicode(best_da)
            abstract_da(best_da)

            print >> stdout, unicode(utt) + "\t" + abutt_str + "\t" + best_da_str + "\t" + unicode(best_da)
예제 #2
0
def process_file(file_path):

    cldb = CategoryLabelDatabase(
        as_project_path('applications/PublicTransportInfoEN/data/database.py'))
    preprocessing = PTIENSLUPreprocessing(cldb)
    hdc_slu = PTIENHDCSLU(
        preprocessing,
        cfg={
            'SLU': {
                PTIENHDCSLU: {
                    'utt2da':
                    as_project_path(
                        'applications/PublicTransportInfoEN/data/utt2da_dict.txt'
                    )
                }
            }
        })
    stdout = codecs.getwriter('UTF-8')(sys.stdout)

    with open(file_path, 'r') as fh:
        for line in codecs.getreader('UTF-8')(fh):
            line = line.strip("\r\n")

            # skip empty lines (dialogue boundaries)
            if not line:
                continue

            person, da, utt = line.split("\t")
            # skip system utterances, use just user utterances
            if 'SYSTEM' in person:
                continue

            # reparse utterance using transcription
            utt = re.sub(r',', r' ', utt)
            utt = Utterance(utt)
            sem = hdc_slu.parse({'utt': utt})

            # get abstracted utterance text
            abutt = hdc_slu.abstract_utterance(utt)
            abutt_str = get_abutt_str(utt, abutt)

            # get abstracted DA
            best_da = sem.get_best_da()
            best_da_str = unicode(best_da)
            abstract_da(best_da)

            print >> stdout, unicode(
                utt) + "\t" + abutt_str + "\t" + best_da_str + "\t" + unicode(
                    best_da)