Example #1
def score(fn_reftext, fn_testtext, outfile=sys.stdout):
    reftext = load_wavaskey(fn_reftext, Utterance)
    testtext = load_wavaskey(fn_testtext, Utterance)

    corr, sub, dels, ins, wer, nwords = score_file(reftext, testtext)

    m = """
    Please note that the scoring is implicitly ignoring all non-speech events.
    
    Ref: {r}
    Tst: {t}
    |==============================================================================================|
    |            | # Sentences  |  # Words  |   Corr   |   Sub    |   Del    |   Ins    |   Err    |
    |----------------------------------------------------------------------------------------------|
    | Sum/Avg    |{num_sents:^14}|{num_words:^11.0f}|{corr:^10.2f}|{sub:^10.2f}|{dels:^10.2f}|{ins:^10.2f}|{wer:^10.2f}|
    |==============================================================================================|
    """.format(
        r=fn_reftext,
        t=fn_testtext,
        num_sents=len(reftext),
        num_words=nwords,
        corr=corr,
        sub=sub,
        dels=dels,
        ins=ins,
        wer=wer,
    )

    outfile.write(m)
    outfile.write("\n")
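A minimal usage sketch for the score function above; the file names are hypothetical, both files are assumed to be in the wavaskey format, and Utterance, load_wavaskey and score_file are assumed to be imported as in the Alex corpustools:

import codecs

# Hypothetical files: 'all.trn' holds reference transcriptions and
# 'asr.out' holds ASR hypotheses, both keyed by wav file name.
with codecs.open('asr.score', 'w', encoding='UTF-8') as f:
    score('all.trn', 'asr.out', outfile=f)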
Example #2
def score(fn_reftext, fn_testtext, outfile=sys.stdout):
    reftext = load_wavaskey(fn_reftext, Utterance)
    testtext = load_wavaskey(fn_testtext, Utterance)

    corr, sub, dels, ins, wer, nwords = score_file(reftext, testtext)

    m = """
    Please note that the scoring is implicitly ignoring all non-speech events.
    
    Ref: {r}
    Tst: {t}
    |==============================================================================================|
    |            | # Sentences  |  # Words  |   Corr   |   Sub    |   Del    |   Ins    |   Err    |
    |----------------------------------------------------------------------------------------------|
    | Sum/Avg    |{num_sents:^14}|{num_words:^11.0f}|{corr:^10.2f}|{sub:^10.2f}|{dels:^10.2f}|{ins:^10.2f}|{wer:^10.2f}|
    |==============================================================================================|
    """.format(r=fn_reftext,
               t=fn_testtext,
               num_sents=len(reftext),
               num_words=nwords,
               corr=corr,
               sub=sub,
               dels=dels,
               ins=ins,
               wer=wer)

    outfile.write(m)
    outfile.write("\n")
Example #3
def hdc_slu(fn_input, constructor, fn_output):
    """
    Parses utterances with the handcrafted (HDC) SLU model and saves the best
    dialogue acts.

    :param fn_input: wavaskey file with the input utterances
    :param constructor: class used to parse each input line (e.g. Utterance)
    :param fn_output: wavaskey file to which the parsed dialogue acts are saved
    """
    print "="*120
    print "HDC SLU: ", fn_input, fn_output
    print "-"*120

    from alex.components.slu.base import CategoryLabelDatabase
    from alex.applications.PublicTransportInfoCS.preprocessing import PTICSSLUPreprocessing
    from alex.applications.PublicTransportInfoCS.hdc_slu import PTICSHDCSLU
    from alex.corpustools.wavaskey import load_wavaskey, save_wavaskey
    from alex.corpustools.semscore import score

    cldb = CategoryLabelDatabase('../data/database.py')
    preprocessing = PTICSSLUPreprocessing(cldb)
    hdc_slu = PTICSHDCSLU(preprocessing, cfg={'SLU': {PTICSHDCSLU: {
        'utt2da': as_project_path("applications/PublicTransportInfoCS/data/utt2da_dict.txt")}}})

    test_utterances = load_wavaskey(fn_input, constructor, limit=1000000)

    parsed_das = {}
    for utt_key, utt in sorted(test_utterances.iteritems()):
        if isinstance(utt, Utterance):
            obs = {'utt': utt}
        elif isinstance(utt, UtteranceNBList):
            obs = {'utt_nbl': utt}
        else:
            raise ValueError('Unsupported observation type')

        print '-' * 120
        print "Observation:"
        print utt_key, " ==> "
        print unicode(utt)

        da_confnet = hdc_slu.parse(obs, verbose=False)

        print "Conf net:"
        print unicode(da_confnet)

        da_confnet.prune()
        dah = da_confnet.get_best_da_hyp()

        print "1 best: "
        print unicode(dah)

        parsed_das[utt_key] = dah.da

        if 'CL_' in str(dah.da):
            print '*' * 120
            print utt
            print dah.da
            hdc_slu.parse(obs, verbose=True)

    save_wavaskey(fn_output, parsed_das, trans=lambda da: '&'.join(sorted(unicode(da).split('&'))))
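A hedged invocation sketch for hdc_slu; the file names are made up and Utterance is assumed to be imported from the Alex utterance module:

# Parse every utterance in a hypothetical 'all.trn' with the handcrafted SLU
# and save the best dialogue acts to 'all.trn.hdc.sem.out'.
hdc_slu('all.trn', Utterance, 'all.trn.hdc.sem.out')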
Example #4
def main():
    files = []
    for i in range(1, len(sys.argv)):

        k = load_wavaskey(sys.argv[i], unicode)
        print sys.argv[i], len(k)
        files.append(k)

    keys = set()
    keys.update(set(files[0].keys()))
    ukeys = set()
    for f in files:
        keys = keys.intersection(set(f.keys()))
        ukeys = ukeys.union(set(f.keys()))

    print len(keys), len(ukeys), len(ukeys - keys)

    for f in files:
        rk = set(f.keys()) - keys
        for k in rk:
            if k in f:
                del f[k]

    for i in range(1, len(sys.argv)):
        save_wavaskey(sys.argv[i] + '.pruned', files[i - 1])
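The script keeps only the keys shared by all input files; below is a self-contained toy illustration of that pruning logic (the data is made up):

# Two wavaskey-style dicts with one common key.
a = {'x.wav': u'ahoj', 'y.wav': u'dobry den'}
b = {'x.wav': u'inform(task="find")'}

# Intersect the key sets and drop everything outside the intersection.
common = set(a.keys()) & set(b.keys())
for d in (a, b):
    for k in set(d.keys()) - common:
        del d[k]

print a.keys(), b.keys()  # both reduced to ['x.wav']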
Example #5
def main():
    import autopath

    files = []
    for i in range(1, len(sys.argv)):

        k = load_wavaskey(sys.argv[i], unicode)
        print sys.argv[i], len(k)
        files.append(k)

    keys = set()
    keys.update(set(files[0].keys()))
    ukeys = set()
    for f in files:
        keys = keys.intersection(set(f.keys()))
        ukeys = ukeys.union(set(f.keys()))

    print len(keys), len(ukeys), len(ukeys - keys)

    for f in files:
        rk = set(f.keys()) - keys
        for k in rk:
            if k in f:
                del f[k]

    for i in range(1, len(sys.argv)):
        save_wavaskey(sys.argv[i] + '.pruned', files[i - 1])
Example #6
def decode_with_reference(reference, outdir, cfg):
    """
    Launch the decoding

    Args:
        reference(str): Path to file with references in Alex reference format.
        outdir(str): Path to directory where to save log files.
        cfg(dict): Alex configuration file
    """
    asr = asr_factory(cfg)
    trn_dict = load_wavaskey(reference, Utterance)
    declen_dict, fwlen_dict, wavlen_dict, dec_dict = {}, {}, {}, {}

    for wav_path, reference in trn_dict.iteritems():
        best, dec_dur, fw_dur, wav_dur = decode_info(asr, cfg, wav_path,
                                                     reference)
        dec_dict[wav_path] = best
        wavlen_dict[wav_path] = wav_dur
        declen_dict[wav_path] = dec_dur
        fwlen_dict[wav_path] = fw_dur

        compute_rt_factor(outdir, trn_dict, dec_dict, wavlen_dict, declen_dict,
                          fwlen_dict)

    compute_save_stat(outdir, trn_dict, dec_dict, wavlen_dict, declen_dict,
                      fwlen_dict)
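A sketch of how this function might be driven, assuming the Config loader from alex.utils.config is available; the config path and output directory are hypothetical:

from alex.utils.config import Config

# Load an ASR configuration (the path is made up) and decode every wav
# listed in the reference file.
cfg = Config.load_configs(['../kaldi.cfg'])
decode_with_reference('all.trn', 'decoded', cfg)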
Example #7
def train(fn_model,
          fn_transcription,
          constructor,
          fn_annotation,
          fn_bs_transcription,
          fn_bs_annotation,
          min_pos_feature_count,
          min_neg_feature_count,
          min_classifier_count,
          limit=100000):
    """
    Trains a SLU DAILogRegClassifier model.

    :param fn_model:
    :param fn_transcription:
    :param constructor:
    :param fn_annotation:
    :param limit:
    :return:
    """
    bs_utterances = load_wavaskey(fn_bs_transcription, Utterance, limit=limit)
    increase_weight(bs_utterances, min_pos_feature_count + 10)
    bs_das = load_wavaskey(fn_bs_annotation, DialogueAct, limit=limit)
    increase_weight(bs_das, min_pos_feature_count + 10)

    utterances = load_wavaskey(fn_transcription, constructor, limit=limit)
    das = load_wavaskey(fn_annotation, DialogueAct, limit=limit)

    utterances.update(bs_utterances)
    das.update(bs_das)

    cldb = CategoryLabelDatabase('../../data/database.py')
    preprocessing = PTICSSLUPreprocessing(cldb)
    slu = DAILogRegClassifier(cldb, preprocessing, features_size=4)

    slu.extract_classifiers(das, utterances, verbose=True)
    slu.prune_classifiers(min_classifier_count=min_classifier_count)
    slu.print_classifiers()
    slu.gen_classifiers_data(min_pos_feature_count=min_pos_feature_count,
                             min_neg_feature_count=min_neg_feature_count,
                             verbose2=True)

    slu.train(inverse_regularisation=1e1, verbose=True)

    slu.save_model(fn_model)
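A hypothetical call of the trainer above; every file name and threshold below is made up, and Utterance is assumed to be imported:

# Train on 'train.trn'/'train.sem' plus weighted bootstrap data and save
# the model to 'dailogreg.trn.model' (all names are hypothetical).
train('dailogreg.trn.model',
      'train.trn', Utterance, 'train.sem',
      'bootstrap.trn', 'bootstrap.sem',
      min_pos_feature_count=2,
      min_neg_feature_count=2,
      min_classifier_count=2)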
Example #8
def train(fn_model,
          fn_transcription, constructor, fn_annotation,
          fn_bs_transcription, fn_bs_annotation,
          min_pos_feature_count,
          min_neg_feature_count,
          min_classifier_count,
          limit=100000):
    """
    Trains a SLU DAILogRegClassifier model.

    :param fn_model:
    :param fn_transcription:
    :param constructor:
    :param fn_annotation:
    :param limit:
    :return:
    """
    bs_utterances = load_wavaskey(fn_bs_transcription, Utterance, limit=limit)
    increase_weight(bs_utterances, min_pos_feature_count + 10)
    bs_das = load_wavaskey(fn_bs_annotation, DialogueAct, limit=limit)
    increase_weight(bs_das, min_pos_feature_count + 10)

    utterances = load_wavaskey(fn_transcription, constructor, limit=limit)
    das = load_wavaskey(fn_annotation, DialogueAct, limit=limit)

    utterances.update(bs_utterances)
    das.update(bs_das)

    cldb = CategoryLabelDatabase('../../data/database.py')
    preprocessing = PTICSSLUPreprocessing(cldb)
    slu = DAILogRegClassifier(cldb, preprocessing, features_size=4)

    slu.extract_classifiers(das, utterances, verbose=True)
    slu.prune_classifiers(min_classifier_count=min_classifier_count)
    slu.print_classifiers()
    slu.gen_classifiers_data(min_pos_feature_count=min_pos_feature_count,
                             min_neg_feature_count=min_neg_feature_count,
                             verbose2=True)

    slu.train(inverse_regularisation=1e1, verbose=True)

    slu.save_model(fn_model)
Example #9
def decode_with_reference(reference, outdir, cfg):
    asr = asr_factory(cfg)
    trn_dict = load_wavaskey(reference, Utterance)
    declen_dict, fwlen_dict, wavlen_dict, dec_dict = {}, {}, {}, {}

    for wav_path, reference in trn_dict.iteritems():
        best, dec_dur, fw_dur, wav_dur = decode_info(asr, cfg, wav_path, reference)
        dec_dict[wav_path] = best
        wavlen_dict[wav_path] = wav_dur
        declen_dict[wav_path] = dec_dur
        fwlen_dict[wav_path] = fw_dur

        compute_rt_factor(outdir, trn_dict, dec_dict, wavlen_dict, declen_dict, fwlen_dict)

    compute_save_stat(outdir, trn_dict, dec_dict, wavlen_dict, declen_dict, fwlen_dict)
Example #10
def main():
	utterances = load_wavaskey("all.trn", unicode, limit=100000)

	keys = list(utterances.keys())
	random.seed()
	random.shuffle(keys)

	for k in keys:
	    if '_' in utterances[k]:
	        continue

	    url = 'www.google.cz/#q='+urllib.quote_plus(utterances[k].lower().encode('utf8'))

	    browser = subprocess.Popen(['opera', '-nosession', '-nomail', '-noraise', '-geometry', '500x100+0+0', url])
	    time.sleep(random.randint(10, 200))

	    os.system('kill -9 {pid}'.format(pid=browser.pid))
Example #11
def decode_with_reference(reference, outdir, num_workers):
    """
    Launch the decoding

    Args:
        reference(str): Path to file with references in Alex reference format.
        outdir(str): Path to directory where to save log files.
        num_workers(int): Number of parallel decoding processes.
    """
    trn_dict = load_wavaskey(reference, Utterance)
    declen_dict, fwlen_dict, wavlen_dict, dec_dict = {}, {}, {}, {}

    params = [(outdir, wav_path, reference)
              for wav_path, reference in trn_dict.items()]
    random.shuffle(params)

    if num_workers > 1:
        p_decode_wavs = multiprocessing.Pool(num_workers)
        decoded_wavs = p_decode_wavs.map(decode_info, params, 100)
    else:
        decoded_wavs = []

        for p in params:
            decoded_wavs.append(decode_info(p))

    for best, dec_dur, fw_dur, wav_dur, wav_path in decoded_wavs:
        dec_dict[wav_path] = best
        wavlen_dict[wav_path] = wav_dur
        declen_dict[wav_path] = dec_dur
        fwlen_dict[wav_path] = fw_dur

    # compute_rt_factor(outdir, trn_dict, dec_dict, wavlen_dict, declen_dict, fwlen_dict)

    # for wav_path, reference in sorted(trn_dict.items()):
    #     best, dec_dur, fw_dur, wav_dur, wav_path = decode_info(asr, cfg, outdir, wav_path, reference)
    #     dec_dict[wav_path] = best
    #     wavlen_dict[wav_path] = wav_dur
    #     declen_dict[wav_path] = dec_dur
    #     fwlen_dict[wav_path] = fw_dur
    #
    #     compute_rt_factor(outdir, trn_dict, dec_dict, wavlen_dict, declen_dict, fwlen_dict)

    compute_save_stat(outdir, trn_dict, dec_dict, wavlen_dict, declen_dict,
                      fwlen_dict)
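The worker fan-out above is the standard multiprocessing.Pool.map pattern with an explicit chunksize; here is a minimal self-contained sketch of the same idea, with a toy worker and made-up data:

import multiprocessing

def work(item):
    # Stand-in for decode_info: takes one params tuple, returns a result.
    outdir, wav_path, reference = item
    return wav_path.upper()

if __name__ == '__main__':
    params = [('out', 'a.wav', u'ahoj'), ('out', 'b.wav', u'dobry den')]
    pool = multiprocessing.Pool(2)
    print pool.map(work, params, 100)  # 100 is the chunksize, as above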
Example #12
def decode_with_reference(reference, outdir, num_workers):
    """
    Launch the decoding

    Args:
        reference(str): Path to file with references in Alex reference format.
        outdir(str): Path to directory where to save log files.
        num_workers(int): Number of parallel decoding processes.
    """
    trn_dict = load_wavaskey(reference, Utterance)
    declen_dict, fwlen_dict, wavlen_dict, dec_dict = {}, {}, {}, {}

    params = [(outdir, wav_path, reference) for wav_path, reference in trn_dict.items()]
    random.shuffle(params)

    if num_workers > 1:
        p_decode_wavs = multiprocessing.Pool(num_workers)
        decoded_wavs = p_decode_wavs.map(decode_info, params, 100)
    else:
        decoded_wavs = []

        for p in params:
            decoded_wavs.append(decode_info(p))

    for best, dec_dur, fw_dur, wav_dur, wav_path in decoded_wavs:
        dec_dict[wav_path] = best
        wavlen_dict[wav_path] = wav_dur
        declen_dict[wav_path] = dec_dur
        fwlen_dict[wav_path] = fw_dur

    # compute_rt_factor(outdir, trn_dict, dec_dict, wavlen_dict, declen_dict, fwlen_dict)

    # for wav_path, reference in sorted(trn_dict.items()):
    #     best, dec_dur, fw_dur, wav_dur, wav_path = decode_info(asr, cfg, outdir, wav_path, reference)
    #     dec_dict[wav_path] = best
    #     wavlen_dict[wav_path] = wav_dur
    #     declen_dict[wav_path] = dec_dur
    #     fwlen_dict[wav_path] = fw_dur
    #
    #     compute_rt_factor(outdir, trn_dict, dec_dict, wavlen_dict, declen_dict, fwlen_dict)

    compute_save_stat(outdir, trn_dict, dec_dict, wavlen_dict, declen_dict, fwlen_dict)
Example #13
def main():
    utterances = load_wavaskey("all.trn", unicode, limit=100000)

    keys = list(utterances.keys())
    random.seed()
    random.shuffle(keys)

    for k in keys:
        if '_' in utterances[k]:
            continue

        url = 'www.google.cz/#q=' + urllib.quote_plus(
            utterances[k].lower().encode('utf8'))

        browser = subprocess.Popen([
            'opera', '-nosession', '-nomail', '-noraise', '-geometry',
            '500x100+0+0', url
        ])
        time.sleep(random.randint(10, 200))

        os.system('kill -9 {pid}'.format(pid=browser.pid))
Example #14
def load_das(das_fname, limit=None, encoding='UTF-8'):
    """
    Loads a dictionary of DAs from a given file.

    The file is assumed to contain lines of the following form:

        [[:space:]..]<key>[[:space:]..]=>[[:space:]..]<DA>[[:space:]..]

    or just (without keys):

        [[:space:]..]<DA>[[:space:]..]

    Arguments:
        das_fname -- path to the file to read the DAs from
        limit -- limit on the number of DAs to read
        encoding -- the file encoding

    Returns a dictionary with DAs (instances of DialogueAct) as values.

    """
    return load_wavaskey(das_fname, DialogueAct, limit, encoding)
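A hypothetical input file in the keyed format described by the docstring, and how it would be loaded (the contents below are made up):

# Contents of a hypothetical 'das.txt':
#   0000.wav => inform(task="find")&inform(from_stop="Andel")
#   0001.wav => request(departure_time)
das = load_das('das.txt', limit=10)
for key, da in sorted(das.iteritems()):
    print key, "==>", unicode(da)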
Example #15
def load_das(das_fname, limit=None, encoding='UTF-8'):
    """
    Loads a dictionary of DAs from a given file.

    The file is assumed to contain lines of the following form:

        [[:space:]..]<key>[[:space:]..]=>[[:space:]..]<DA>[[:space:]..]

    or just (without keys):

        [[:space:]..]<DA>[[:space:]..]

    Arguments:
        das_fname -- path to the file to read the DAs from
        limit -- limit on the number of DAs to read
        encoding -- the file encoding

    Returns a dictionary with DAs (instances of DialogueAct) as values.

    """
    return load_wavaskey(das_fname, DialogueAct, limit, encoding)
Example #16
def decode_with_reference(reference, outdir, cfg):
    """
    Launch the decoding

    Args:
        reference(str): Path to file with references in Alex reference format.
        outdir(str): Path to directory where to save log files.
        cfg(dict): Alex configuration file
    """
    asr = asr_factory(cfg)
    trn_dict = load_wavaskey(reference, Utterance)
    declen_dict, fwlen_dict, wavlen_dict, dec_dict = {}, {}, {}, {}

    for wav_path, reference in sorted(trn_dict.items()):
        best, dec_dur, fw_dur, wav_dur = decode_info(asr, cfg, outdir, wav_path, reference)
        dec_dict[wav_path] = best
        wavlen_dict[wav_path] = wav_dur
        declen_dict[wav_path] = dec_dur
        fwlen_dict[wav_path] = fw_dur

        compute_rt_factor(outdir, trn_dict, dec_dict, wavlen_dict, declen_dict, fwlen_dict)

    compute_save_stat(outdir, trn_dict, dec_dict, wavlen_dict, declen_dict, fwlen_dict)
Example #17
    cal_list = [x[1] for x in pri_cal_list]
    s = time.time()
    for wpm in wp_2_match:
        i, f = find_approx(wpm[0])
    e = time.time()
    print "size {size} elapsed {time}".format(size=len(wp_2_match), time=e - s)

    print "=" * 120
    print "The calibration table: insert it in the config"
    print "-" * 120
    print repr(cal_list)

if __name__ == '__main__':
    reference = 'decoded_kaldi/all_trn.txt'
    trn_dict = load_wavaskey(reference, Utterance)
    trn_dict = basename_dict(trn_dict)

    fst_dir = 'decoded_kaldi'
    fst_fns = sorted(glob.glob(os.path.join(fst_dir, '*.fst')))

    words = load_words('models/words.txt')

    wp_2_match = []
    for i, fn in enumerate(fst_fns):
        print '=' * 120
        print i, fn

        ref = trn_dict[os.path.basename(fn).replace('fst', 'wav')]
        print unicode(ref)
        print '-' * 120
Example #18
def hdc_slu_test(fn_input, constructor, fn_reference):
    """
    Tests the HDC SLU.

    :param fn_model:
    :param fn_input:
    :param constructor:
    :param fn_reference:
    :return:
    """
    print "=" * 120
    print "Testing HDC SLU: ", fn_input, fn_reference
    print "-" * 120

    from alex.components.slu.base import CategoryLabelDatabase
    from alex.applications.PublicTransportInfoCS.preprocessing import PTICSSLUPreprocessing
    from alex.applications.PublicTransportInfoCS.hdc_slu import PTICSHDCSLU
    from alex.corpustools.wavaskey import load_wavaskey, save_wavaskey
    from alex.corpustools.semscore import score

    cldb = CategoryLabelDatabase('../data/database.py')
    preprocessing = PTICSSLUPreprocessing(cldb)
    hdc_slu = PTICSHDCSLU(
        preprocessing,
        cfg={
            'SLU': {
                PTICSHDCSLU: {
                    'utt2da':
                    as_project_path(
                        "applications/PublicTransportInfoCS/data/utt2da_dict.txt"
                    )
                }
            }
        })

    test_utterances = load_wavaskey(fn_input, constructor, limit=100000)

    parsed_das = {}
    for utt_key, utt in sorted(test_utterances.iteritems()):
        if isinstance(utt, Utterance):
            obs = {'utt': utt}
        elif isinstance(utt, UtteranceNBList):
            obs = {'utt_nbl': utt}
        else:
            raise ValueError('Unsupported observation type')

        print '-' * 120
        print "Observation:"
        print utt_key, " ==> "
        print unicode(utt)

        da_confnet = hdc_slu.parse(obs, verbose=False)

        print "Conf net:"
        print unicode(da_confnet)

        da_confnet.prune()
        dah = da_confnet.get_best_da_hyp()

        print "1 best: "
        print unicode(dah)

        parsed_das[utt_key] = dah.da

        if 'CL_' in str(dah.da):
            print '*' * 120
            print utt
            print dah.da
            hdc_slu.parse(obs, verbose=True)

    fn_sem = os.path.basename(fn_input) + '.hdc.sem.out'

    save_wavaskey(fn_sem,
                  parsed_das,
                  trans=lambda da: '&'.join(sorted(unicode(da).split('&'))))

    f = codecs.open(os.path.basename(fn_sem) + '.score',
                    'w+',
                    encoding='UTF-8')
    score(fn_reference, fn_sem, True, True, f)
    f.close()
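A hedged invocation sketch; the file names are made up and Utterance is assumed to be imported:

# Parse a hypothetical 'all.trn' with the HDC SLU and score the result
# against the reference dialogue acts in 'all.sem'.
hdc_slu_test('all.trn', Utterance, 'all.sem')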
Example #19
def load_das(das_fname, limit=None, encoding='UTF-8'):
    return load_wavaskey(das_fname, CUEDDialogueAct, limit, encoding)
Example #20
def trained_slu_test(fn_model, fn_input, constructor, fn_reference):
    """
    Tests a SLU DAILogRegClassifier model.

    :param fn_model:
    :param fn_input:
    :param constructor:
    :param fn_reference:
    :return:
    """
    print "="*120
    print "Testing: ", fn_model, fn_input, fn_reference
    print "-"*120

    from alex.applications.PublicTransportInfoCS.preprocessing import PTICSSLUPreprocessing
    from alex.components.slu.base import CategoryLabelDatabase
    from alex.components.slu.dailrclassifier import DAILogRegClassifier
    from alex.corpustools.wavaskey import load_wavaskey, save_wavaskey
    from alex.corpustools.semscore import score

    cldb = CategoryLabelDatabase('../data/database.py')
    preprocessing = PTICSSLUPreprocessing(cldb)
    slu = DAILogRegClassifier(cldb, preprocessing)

    slu.load_model(fn_model)

    test_utterances = load_wavaskey(fn_input, constructor, limit=100000)

    parsed_das = {}
    for utt_key, utt in sorted(test_utterances.iteritems()):
        if isinstance(utt, Utterance):
            obs = {'utt': utt}
        elif isinstance(utt, UtteranceNBList):
            obs = {'utt_nbl': utt}
        else:
            raise ValueError('Unsupported observation type')

        print '-' * 120
        print "Observation:"
        print utt_key, " ==> "
        print unicode(utt)

        da_confnet = slu.parse(obs, verbose=False)

        print "Conf net:"
        print unicode(da_confnet)

        da_confnet.prune()
        dah = da_confnet.get_best_da_hyp()

        print "1 best: "
        print unicode(dah)

        parsed_das[utt_key] = dah.da

        if 'CL_' in str(dah.da):
            print '*' * 120
            print utt
            print dah.da
            slu.parse(obs, verbose=True)

    if 'trn' in fn_model:
        fn_sem = os.path.basename(fn_input) + '.model.trn.sem.out'
    elif 'asr' in fn_model:
        fn_sem = os.path.basename(fn_input) + '.model.asr.sem.out'
    elif 'nbl' in fn_model:
        fn_sem = os.path.basename(fn_input) + '.model.nbl.sem.out'
    else:
        fn_sem = os.path.basename(fn_input) + '.XXX.sem.out'

    save_wavaskey(fn_sem, parsed_das, trans=lambda da: '&'.join(sorted(unicode(da).split('&'))))

    f = codecs.open(os.path.basename(fn_sem) + '.score', 'w+', encoding='UTF-8')
    score(fn_reference, fn_sem, True, True, f)
    f.close()
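A hypothetical call of the tester above; the model and file names are made up:

# Evaluate a trained model on transcriptions and write a *.score report.
trained_slu_test('dailogreg.trn.model', 'all.trn', Utterance, 'all.sem')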
Example #21
    cal_list = [x[1] for x in pri_cal_list]
    s = time.time()
    for wpm in wp_2_match:
        i, f = find_approx(wpm[0])
    e = time.time()
    print "size {size} elapsed {time}".format(size=len(wp_2_match), time=e - s)

    print "=" * 120
    print "The calibration table: insert it in the config"
    print "-" * 120
    print repr(cal_list)


if __name__ == '__main__':
    reference = 'decoded_kaldi/all_trn.txt'
    trn_dict = load_wavaskey(reference, Utterance)
    trn_dict = basename_dict(trn_dict)

    fst_dir = 'decoded_kaldi'
    fst_fns = sorted(glob.glob(os.path.join(fst_dir, '*.fst')))

    words = load_words('models/words.txt')

    wp_2_match = []
    for i, fn in enumerate(fst_fns):
        print '=' * 120
        print i, fn

        ref = trn_dict[os.path.basename(fn).replace('fst', 'wav')]
        print unicode(ref)
        print '-' * 120
Example #22
def load_das(das_fname, limit=None, encoding='UTF-8'):
    return load_wavaskey(das_fname, CUEDDialogueAct, limit, encoding)
Example #23
def hdc_slu_test(fn_input, constructor, fn_reference):
    """
    Tests a SLU DAILogRegClassifier model.

    :param fn_model:
    :param fn_input:
    :param constructor:
    :param fn_reference:
    :return:
    """
    print "="*120
    print "Testing HDC SLU: ", fn_input, fn_reference
    print "-"*120

    from alex.components.slu.base import CategoryLabelDatabase
    from alex.applications.PublicTransportInfoCS.preprocessing import PTICSSLUPreprocessing
    from alex.applications.PublicTransportInfoCS.hdc_slu import PTICSHDCSLU
    from alex.corpustools.wavaskey import load_wavaskey, save_wavaskey
    from alex.corpustools.semscore import score

    cldb = CategoryLabelDatabase('../data/database.py')
    preprocessing = PTICSSLUPreprocessing(cldb)
    hdc_slu = PTICSHDCSLU(preprocessing)

    test_utterances = load_wavaskey(fn_input, constructor, limit=100000)

    parsed_das = {}
    for utt_key, utt in sorted(test_utterances.iteritems()):
        if isinstance(utt, Utterance):
            obs = {'utt': utt}
        elif isinstance(utt, UtteranceNBList):
            obs = {'utt_nbl': utt}
        else:
            raise ValueError('Unsupported observation type')

        print '-' * 120
        print "Observation:"
        print utt_key, " ==> "
        print unicode(utt)

        da_confnet = hdc_slu.parse(obs, verbose=False)

        print "Conf net:"
        print unicode(da_confnet)

        da_confnet.prune()
        dah = da_confnet.get_best_da_hyp()

        print "1 best: "
        print unicode(dah)

        parsed_das[utt_key] = dah.da

        if 'CL_' in str(dah.da):
            print '*' * 120
            print utt
            print dah.da
            hdc_slu.parse(obs, verbose=True)

    fn_sem = os.path.basename(fn_input) + '.hdc.sem.out'

    save_wavaskey(fn_sem, parsed_das, trans=lambda da: '&'.join(sorted(unicode(da).split('&'))))

    f = codecs.open(os.path.basename(fn_sem) + '.score', 'w+', encoding='UTF-8')
    score(fn_reference, fn_sem, True, True, f)
    f.close()