Ejemplo n.º 1
0
def evaulate_trf(model, vocab, read_model, tsize, fres):
    """Rescore the task's n-best list with a stored model and log WER/PPL.

    Args:
        model: object exposing ``use(config)``, ``ppl(...)`` and a
            ``workdir`` string attribute (presumably a ``trf.model`` --
            confirm against the caller).
        vocab: vocabulary path, passed to ``-vocab`` and to ``model.ppl``.
        read_model: model path prefix; ``.model`` and ``.lmscore`` are
            appended to it.
        tsize: task size; ``'<tsize>/'`` is the task directory handed to
            ``data`` and ``int(tsize)`` prefixes the result name.
        fres: result table that receives the ``LL-wsj``/``PPL-wsj`` values
            and the WER under the name ``'<int(tsize)>:<model basename>'``.
    """
    size_prefix = '{}:'.format(int(tsize))
    result_key = size_prefix + os.path.split(read_model)[-1]
    task_dir = '{}/'.format(tsize)

    # Rescoring: the tool writes its LM scores to '<read_model>.lmscore'.
    rescore_args = [
        ' -vocab {} '.format(vocab),
        ' -read {}.model '.format(read_model),
        ' -nbest {} '.format(data(task_dir)[3]),
        ' -lmscore {0}.lmscore'.format(read_model),
    ]
    model.use(''.join(rescore_args))

    # WER tuning.  Entries 3..6 of data(task_dir) are unpacked; the last one
    # is ignored because the freshly written score file is used instead.
    nbest_path, templ_path, acscore_path, _ = data(task_dir)[3:7]
    lmscore_path = read_model + '.lmscore'
    lm_scores = wb.LoadScore(lmscore_path)
    ac_scores = wb.LoadScore(acscore_path)
    scale_grid = np.linspace(0.1, 0.9, 9)
    wer, lmscale, acscale = wb.TuneWER(nbest_path, templ_path,
                                       lm_scores, ac_scores, scale_grid)
    print('wer={:.4f} lmscale={:.2f} acscale={:.2f}'.format(
        wer, lmscale, acscale))

    # Perplexity on the transcript, after wb.file_rmlabel has written a
    # '.rmlabel' copy of it into the model's working directory.
    plain_templ = model.workdir + os.path.split(templ_path)[-1] + '.rmlabel'
    wb.file_rmlabel(templ_path, plain_templ)
    ppl_value = model.ppl(vocab, read_model + '.model', plain_templ)
    ll_value = -wb.PPL2LL(ppl_value, plain_templ)

    # Store the results.
    fres.Add(result_key, ['LL-wsj', 'PPL-wsj'], [ll_value, ppl_value])
    fres.AddWER(result_key, wer)
Ejemplo n.º 2
0
def main():
    """Train, test, rescore and score n-gram LMs for task sizes 1, 2 and 4.

    Stages are selected on the command line with ``-train``, ``-test``,
    ``-rescore`` and ``-wer``; ``-all`` enables every stage.  Results are
    collected in ``result.txt`` through ``wb.FRes``.
    """
    print(sys.argv)
    if len(sys.argv) == 1:
        print('\"python run_ngram.py -train\" train \n',
              '\"python run_ngram.py -rescore\" rescore nbest\n',
              '\"python run_ngram.py -wer\" compute WER')

    def stage_on(flag):
        # A stage runs when its own flag or the catch-all '-all' is present.
        return flag in sys.argv or '-all' in sys.argv

    bindir = '../../tools/srilm/'
    fres = wb.FRes('result.txt')  # the result file
    order_reg = [5]

    for tsize in [1, 2, 4]:
        tskdir = '{}/'.format(tsize)
        workdir = tskdir + 'ngramlm/'
        model = ngram.model(bindir, workdir)

        for order in order_reg:
            lm_path = workdir + '{}gram.lm'.format(order)
            row_name = '{}:KN{}'.format(tsize, order)
            # Score file sits next to the model: '.lm' swapped for '.lmscore'.
            score_path = lm_path[0:-3] + '.lmscore'

            print(lm_path)

            if stage_on('-train'):
                # Data preparation is done once, before the first order.
                if order == order_reg[0]:
                    model.prepare(
                        data(tskdir)[0],
                        data(tskdir)[1],
                        data(tskdir)[2])
                model.train(order, lm_path)

            if stage_on('-test'):
                # Perplexity on the first three entries of data(tskdir).
                ppl_values = [model.ppl(lm_path, order, data(tskdir)[k])
                              for k in range(3)]
                fres.AddPPL(row_name, ppl_values, data(tskdir)[0:3])

            if stage_on('-rescore'):
                model.rescore(lm_path, order, data(tskdir)[3], score_path)

            if stage_on('-wer'):
                nbest, templ = data(tskdir)[3:5]
                lm_scores = wb.LoadScore(score_path)
                ac_scores = wb.LoadScore(data(tskdir)[5])

                scale_grid = np.linspace(0.1, 0.9, 9)
                wer, lmscale, acscale = wb.TuneWER(nbest, templ, lm_scores,
                                                   ac_scores, scale_grid)
                print('wer={} lmscale={} acscale={}'.format(
                    wer, lmscale, acscale))
                fres.AddWER(row_name, wer)

                # Perplexity on the transcript after wb.file_rmlabel has
                # written a '.txt' copy into the working directory.
                plain_templ = workdir + os.path.split(templ)[-1] + '.txt'
                wb.file_rmlabel(templ, plain_templ)
                ppl_templ = model.ppl(lm_path, order, plain_templ)
                ll_templ = -wb.PPL2LL(ppl_templ, plain_templ)
                fres.Add(row_name, ['LL-wsj', 'PPL-wsj'],
                         [ll_templ, ppl_templ])
Ejemplo n.º 3
0
def wer_all(lmpaths, lmtypes, outlog):
    """Tune the WER of every requested LM (or two-LM mix) and log the result.

    Args:
        lmpaths: mapping from LM name to the path of its score file.  A path
            may contain the placeholder ``<run>``, which is replaced by
            ``run0`` .. ``run9`` so that ten runs are evaluated.
        lmtypes: iterable of LM specs: a single LM name, or names joined by
            ``+`` for an interpolated combination.
            NOTE(review): only the first two names of a ``+`` spec are
            combined; further names are ignored -- confirm this is intended.
        outlog: path of the result file handed to ``wb.FRes``; the table is
            cleaned before any row is added.

    Raises:
        KeyError: if a name in ``lmtypes`` is missing from ``lmpaths``.
    """
    fres = wb.FRes(outlog, True)
    fres.Clean()

    [read_nbest, read_trans, read_acscore] = data()[3:6]
    lmscale_vec = np.linspace(0.1, 0.9, 9)
    # Interpolation weights to try; currently the single value 0.5.
    weight_vec = np.linspace(0.5, 0.5, 1)

    # 'lmtype' rather than 'type', so the builtin is not shadowed.
    for lmtype in lmtypes:
        names = lmtype.split('+')
        exist_multiple_run = any('<run>' in lmpaths[lm] for lm in names)

        if exist_multiple_run:
            run_vec = range(0, 10)
            run_name = lmtype + ':<run>'
        else:
            run_vec = [0]
            run_name = lmtype

        for run in run_vec:
            run_str = 'run{}'.format(run)
            name = run_name.replace('<run>', run_str)
            # [wer, lmscale, acscale]; 100 is the starting "worst" WER.
            opt_wer_vec = [100, 1.0, 1.0]
            opt_weight = 1.0

            if len(names) == 1:
                lmscore = wb.LoadScore(
                    lmpaths[names[0]].replace('<run>', run_str))
                opt_wer_vec = wb.TuneWER(read_nbest, read_trans, lmscore,
                                         read_acscore, lmscale_vec)
                opt_weight = 1.0
            else:
                lmscore1 = np.array(
                    wb.LoadScore(lmpaths[names[0]].replace('<run>', run_str)))
                lmscore2 = np.array(
                    wb.LoadScore(lmpaths[names[1]].replace('<run>', run_str)))

                # Keep the interpolation weight that gives the lowest WER.
                for w in weight_vec:
                    lmscore = w * lmscore1 + (1 - w) * lmscore2
                    [wer, lmscale,
                     acscale] = wb.TuneWER(read_nbest, read_trans, lmscore,
                                           read_acscore, lmscale_vec)
                    if wer < opt_wer_vec[0]:
                        opt_wer_vec = [wer, lmscale, acscale]
                        opt_weight = w

            # list(...) keeps the concatenation valid whatever sequence type
            # wb.TuneWER returns: a tuple would raise, an array would add
            # the weight element-wise.
            fres.Add(name, ['wer', 'lmscale', 'acscale', 'weight'],
                     list(opt_wer_vec) + [opt_weight])
Ejemplo n.º 4
0
def wer_all(workdir, nbestdir, lmpaths, lmtypes):
    """Compute the WER of each LM type on each n-best task over LM scales.

    For every task directory ``nbestlist_{dt05,et05}_{real,simu}`` under
    ``nbestdir`` the function writes ``<workdir><task>/wer.txt`` with one
    line per (lmtype, lmscale) pair, and a ``<lmtype>.lmscore`` file holding
    the LM score that was used.

    Args:
        workdir: output directory prefix (concatenated directly with the
            task name, so it is expected to end with a path separator).
        nbestdir: input directory prefix containing the task directories.
        lmpaths: mapping from LM name to score-file path; the placeholder
            ``<tsk>`` in a path is replaced by the task name.
        lmtypes: iterable of LM specs: one LM name, or two names joined by
            ``+`` (averaged with equal weights).

    Raises:
        ValueError: if an LM spec has more than two ``+``-separated names.
            Previously such a spec silently reused the scores of the
            preceding spec, or failed with an unbound local on the first.
        KeyError: if a name in ``lmtypes`` is missing from ``lmpaths``.
    """
    wb.mkdir(workdir)
    tasks = ['nbestlist_{}_{}'.format(a, b)
             for a in ['dt05', 'et05'] for b in ['real', 'simu']]

    # calculate the wer for each task, each lmscale, each combination
    for tsk in tasks:
        print(tsk)
        wb.mkdir(workdir + tsk)

        # The context manager closes wer.txt even if scoring raises.
        with open(workdir + tsk + '/wer.txt', 'wt') as fwer:
            read_nbest_txt = nbestdir + tsk + '/words_text'
            read_transcript = nbestdir + tsk + '/text'
            read_acscore = nbestdir + tsk + '/acwt'
            read_gfscore = nbestdir + tsk + '/lmwt.nolm'

            # remove the <UNK> in nbest
            read_nbest_rmunk = workdir + tsk + '/words_text_rmunk'
            nbest_rmUNK(read_nbest_txt, read_nbest_rmunk)
            # load score
            acscore = np.array(wb.LoadScore(read_acscore))
            gfscore = np.array(wb.LoadScore(read_gfscore))
            # load label
            score_label = wb.LoadLabel(read_acscore)

            for lmtype in lmtypes:
                names = lmtype.split('+')
                if len(names) == 1:
                    lmscore = np.array(
                        wb.LoadScore(lmpaths[names[0]].replace('<tsk>', tsk)))
                elif len(names) == 2:
                    s1 = wb.LoadScore(lmpaths[names[0]].replace('<tsk>', tsk))
                    s2 = wb.LoadScore(lmpaths[names[1]].replace('<tsk>', tsk))
                    lmscore = 0.5 * np.array(s1) + 0.5 * np.array(s2)
                else:
                    raise ValueError(
                        'lmtype {!r} combines {} models; only 1 or 2 are '
                        'supported'.format(lmtype, len(names)))

                # write lmscore
                wb.WriteScore(workdir + tsk + '/' + lmtype + '.lmscore',
                              lmscore, score_label)

                for lmscale in np.linspace(9, 15, 7):
                    # The best-hypothesis file is temporary: it is compared
                    # with the transcript and then deleted.
                    write_best = workdir + tsk + \
                        '/{}_lmscale={}.best'.format(lmtype, lmscale)
                    total = acscore + lmscale * (lmscore + gfscore)
                    wb.GetBest(read_nbest_rmunk, total.tolist(), write_best)
                    [err, num, wer] = wb.CmpWER(write_best, read_transcript)
                    os.remove(write_best)
                    s = '{} wer={:.2f} err={} num={} lmscale={}'.format(
                        lmtype, wer, err, num, lmscale)
                    print('  ' + s)
                    fwer.write(s + '\n')
                    # Flush per line so partial results survive a crash.
                    fwer.flush()
Ejemplo n.º 5
0
def main():
    """Train, rescore and score n-gram LMs of order 2..5 for a single task.

    Stages are selected on the command line with ``-train``, ``-rescore``
    and ``-wer``.  All paths are made absolute from the current working
    directory; results go to ``models_ppl.txt`` there.
    """
    print(sys.argv)
    if len(sys.argv) == 1:
        print('\"python run_ngram.py -train\" train \n',
              '\"python run_ngram.py -rescore\" rescore nbest\n',
              '\"python run_ngram.py -wer\" compute WER')

    root = os.getcwd() + '/'
    bindir = root + '../../tools/srilm/'
    workdir = root + 'ngramlm/'
    wb.mkdir(workdir)

    corpus = [root + rel for rel in data()]
    result_file = root + 'models_ppl.txt'  # the result file
    model = ngram.model(bindir, workdir)
    order_reg = [2, 3, 4, 5]

    do_train = '-train' in sys.argv
    do_rescore = '-rescore' in sys.argv
    do_wer = '-wer' in sys.argv

    for order in order_reg:
        lm_path = workdir + '{}gram.lm'.format(order)
        # Score file sits next to the model: '.lm' swapped for '.lmscore'.
        score_path = lm_path[0:-3] + '.lmscore'
        print(lm_path)

        if do_train:
            # Data preparation is done once, before the first order.
            if order == order_reg[0]:
                model.prepare(corpus[0], corpus[1], corpus[2])
            model.train(order, lm_path, result_file)

        if do_rescore:
            model.rescore(lm_path, order, corpus[3], score_path)

        if do_wer:
            nbest, templ = corpus[3:5]
            lm_scores = wb.LoadScore(score_path)
            ac_scores = wb.LoadScore(corpus[5])

            scale_grid = np.linspace(0.1, 0.9, 9)
            wer, lmscale, acscale = wb.TuneWER(nbest, templ, lm_scores,
                                               ac_scores, scale_grid)
            print('wer={} lmscale={} acscale={}'.format(wer, lmscale, acscale))

            row_name = 'KN{}'.format(order)
            fres = wb.FRes(result_file)
            fres.AddWER(row_name, wer)

            # Perplexity on the transcript after wb.file_rmlabel has written
            # a '.txt' copy into the working directory.
            plain_templ = workdir + os.path.split(templ)[-1] + '.txt'
            wb.file_rmlabel(templ, plain_templ)
            ppl_templ = model.ppl(lm_path, order, plain_templ)
            ll_templ = -wb.PPL2LL(ppl_templ, plain_templ)
            fres.Add(row_name, ['LL-wsj', 'PPL-wsj'], [ll_templ, ppl_templ])
Ejemplo n.º 6
0
def main():
    """Drive the TRF language-model experiment over ten independent runs.

    Command-line flags select the stages; each stage is executed once per
    run (``run0`` .. ``run9``) unless noted:

    * ``-res``: once, before the run loop; adds a ``runavg`` row holding
      ``mean+std`` over the runs for every column of the result table.
    * ``-train``: prepare the data and train the model.
    * ``-plot``: plot the training log against the ``KN5`` row.
    * ``-rescore``: write LM scores for the n-best list.
    * ``-wer``: tune the WER and record LL / PPL / WER in the result table.
    * ``-stat``: rescore every intermediate model of the run, log its WER
      to ``inte_model_wer.log`` and plot WER against the iteration number.
    * ``-ais``: re-estimate normalisation constants with AIS, then rescore.
    * ``-all``: enables ``-train``, ``-rescore`` and ``-wer``.
    """
    if len(sys.argv) == 1:
        print('\"python run.py -train\" train LSTM\n',
              '\"python run.py -rescore\" rescore nbest\n',
              '\"python run.py -wer\" compute WER'
              )

    run_times = range(0, 10)   # for multiple run

    bindir = '../../tools/trf/bin/'
    workdir = 'trflm/'
    fres = wb.FRes('models_ppl.txt')
    model = trf.model(bindir, workdir)

    class_num = 200
    train = workdir + 'train.id'
    valid = workdir + 'valid.id'
    test = workdir + 'test.id'
    vocab = workdir + 'vocab_c{}.list'.format(class_num)
    order = 4
    feat = 'g4_w_c_ws_cs_wsh_csh_tied.fs'
    #feat = 'g4_w_c_ws_cs_cpw.fs'
    maxlen = 0
    tmax = 20000
    t0 = 2000
    minibatch = 100
    gamma_lambda = '3000,0'
    gamma_zeta = '0,0.6'
    reg = 4e-5
    thread = 8

    if '-res' in sys.argv:
        fres.Read()
        # Column 0 is skipped; every other column gets a "mean+std" entry.
        for i in range(1, len(fres.head)):
            value = []
            for runnum in run_times:
                write_name = 'trf_c{}_{}.run{}'.format(class_num, feat[0:-3], runnum)
                line = fres.Get(write_name)
                value.append(line[i])
            fres.Add('trf_c{}_{}.runavg'.format(class_num, feat[0:-3]), [fres.head[i]],
                     ['{:.2f}+{:.2f}'.format(np.mean(value), np.std(value))])

    for runnum in run_times:
        # feat[0:-3] drops the '.fs' extension of the feature file name.
        write_model = workdir + 'trf_c{}_{}.run{}'.format(class_num, feat[0:-3], runnum)

        if '-train' in sys.argv or '-all' in sys.argv:
            config = '-vocab {} -train {} -valid {} -test {} '.format(vocab, train, valid, test)
            config += ' -order {} -feat {} '.format(order, feat)
            config += ' -len {} '.format(maxlen)
            config += ' -write {0}.model -log {0}.log '.format(write_model)
            config += ' -t0 {} -iter {}'.format(t0, tmax)
            config += ' -gamma-lambda {} -gamma-zeta {}'.format(gamma_lambda, gamma_zeta)
            config += ' -L2 {} '.format(reg)
            config += ' -mini-batch {} '.format(minibatch)
            config += ' -thread {} '.format(thread)
            config += ' -print-per-iter 10 '
            config += ' -write-at-iter [{}:1000:{}]'.format(tmax-5000, tmax)  # output the intermediate models
            model.prepare(data()[0], data()[1], data()[2], class_num)
            model.train(config)
        if '-plot' in sys.argv:
            baseline = fres.Get('KN5')
            trf.PlotLog([write_model], [baseline])
        if '-rescore' in sys.argv or '-all' in sys.argv:
            config = ' -vocab {} '.format(vocab)
            config += ' -read {}.model '.format(write_model)
            config += ' -nbest {} '.format(data()[3])
            config += ' -lmscore {0}.lmscore -lmscore-test-id {0}.test-id '.format(write_model)
            model.use(config)
        if '-wer' in sys.argv or '-all' in sys.argv:
            # The fourth unpacked entry is ignored: the score file written
            # by the rescoring stage is used instead.
            [read_nbest, read_templ, read_acscore, _] = data()[3:7]
            read_lmscore = write_model + '.lmscore'

            [wer, lmscale, acscale] = wb.TuneWER(read_nbest, read_templ,
                                                 wb.LoadScore(read_lmscore),
                                                 wb.LoadScore(read_acscore), np.linspace(0.1, 0.9, 9))
            print('wer={:.4f} lmscale={:.2f} acscale={:.2f}'.format(wer, lmscale, acscale))

            # calculate the ppl on wsj test
            write_templ_id = workdir + os.path.split(read_templ)[1] + '.id'
            v = trf.ReadVocab(vocab)
            trf.NbestToID(read_templ, write_templ_id, v)
            config = ' -vocab {} '.format(vocab)
            config += ' -read {}.model '.format(write_model)
            config += ' -test {} '.format(write_templ_id)
            LL_templ = model.use(config)
            PPL_templ = wb.LL2PPL(-LL_templ, write_templ_id)
            LL = model.get_last_value(write_model + '.log')

            # output the result
            name = os.path.split(write_model)[1]
            fres.AddLL(name, LL, data()[0:3])
            fres.Add(name, ['LL-wsj', 'PPL-wsj'], [LL_templ, PPL_templ])
            fres.AddWER(name, wer)
        if '-stat' in sys.argv:
            # calculate the mean and var of wers of the intermediate models
            inte_wer = []
            inte_model = []

            # find model: files named '<run model name>...n<iter>.model'
            model_dir, model_base = os.path.split(write_model)
            for file_name in os.listdir(model_dir):
                file_path = model_dir + os.sep + file_name
                if not os.path.isfile(file_path):
                    continue
                parts = file_path.split('.')
                # startswith('n') also copes with an empty path segment,
                # where indexing [0] would raise IndexError.
                if file_name.startswith(model_base) and \
                        parts[-1] == 'model' and \
                        parts[-2].startswith('n'):
                    inte_model.append(file_path)

            # compute wer; the context manager closes the log even when
            # rescoring or WER tuning raises.
            with open(workdir + 'inte_model_wer.log', 'wt') as flog:
                for file_path in sorted(inte_model):
                    print(file_path)
                    # Iteration number: the digits after 'n' in 'n<iter>'.
                    t = int(file_path.split('.')[-2][1:])

                    # lmscore
                    write_lmscore = os.path.splitext(file_path)[0] + '.lmscore'
                    config = ' -vocab {} '.format(vocab)
                    config += ' -read {} '.format(file_path)
                    config += ' -nbest {} '.format(data()[3])
                    config += ' -lmscore {0} '.format(write_lmscore)
                    model.use(config, False)
                    # wer
                    [wer, lmscale, acscale] = wb.TuneWER(data()[3], data()[4],
                                                         wb.LoadScore(write_lmscore),
                                                         wb.LoadScore(data()[5]), np.linspace(0.1, 0.9, 9))
                    print('t={} wer={}'.format(t, wer))
                    flog.write('{} \t wer={}\n'.format(file_path, wer))
                    inte_wer.append([t, wer])

            # plot wer
            inte_wer = sorted(inte_wer, key=lambda d: d[0])
            t_list = [i[0] for i in inte_wer]
            wer_list = [i[1] for i in inte_wer]
            # Statistics cover at most the last 20 intermediate models.
            wer_mean = np.mean(wer_list[-20:])
            wer_std = np.std(wer_list[-20:])
            print('wer_mean={}  wer_std={}'.format(wer_mean, wer_std))

            plt.figure()
            plt.plot(t_list, wer_list)
            plt.xlabel('t')
            plt.ylabel('wer')
            plt.show()
        if '-ais' in sys.argv:
            [read_nbest, read_templ, read_acscore] = data()[3:6]
            write_templ_id = workdir + os.path.split(read_templ)[1] + '.id'
            v = trf.ReadVocab(vocab)
            trf.NbestToID(read_templ, write_templ_id, v)

            # run AIS to calculate the normalization constants of models;
            # skipped when the AIS model file already exists
            ais_chain = 10
            ais_inter = 10000
            ais_model = '{}.ais{}_{}.model'.format(write_model, ais_chain, ais_inter)
            if not os.path.exists(ais_model):
                config = ' -vocab {0} -read {1}.model -write {2}'.format(vocab, write_model, ais_model)
                config += ' -norm-method AIS -AIS-chain {} -AIS-inter {} -thread {} '.format(ais_chain, ais_inter, thread)
                config += ' -norm-len-max {} '.format(trf.FileMaxLen(read_nbest)-1)  # just compute the needed length
                model.use(config)

            # rescore and compute wer
            write_lmscore = os.path.splitext(ais_model)[0] + '.lmscore'
            config = ' -vocab {} -read {}'.format(vocab, ais_model)
            config += ' -nbest {} -test {} '.format(read_nbest, write_templ_id)  # calculate the ppl of test set
            config += ' -lmscore {} '.format(write_lmscore)
            LL_templ = model.use(config, False)
            PPL_templ = wb.LL2PPL(-LL_templ, write_templ_id)
            # NOTE(review): score *paths* are passed here, whereas the
            # '-wer' stage passes loaded scores -- confirm that wb.TuneWER
            # accepts both.
            [wer, lmscale, acscale] = wb.TuneWER(read_nbest, read_templ,
                                                 write_lmscore, read_acscore, np.linspace(0.1, 0.9, 9))
            # calculate the LL of train/valid/test
            LL = [0]*3
            id_data = [train, valid, test]  # are id files
            for i in range(3):
                config = ' -vocab {} -read {} -test {} '.format(vocab, ais_model, id_data[i])
                LL[i] = model.use(config, False)

            # write to res file
            name = os.path.split(write_model)[1]+":AIS{}-{}".format(ais_chain, ais_inter)
            fres.AddLL(name, LL, id_data)
            fres.AddWER(name, wer)
            fres.Add(name, ['LL-wsj', 'PPL-wsj'], [LL_templ, PPL_templ])
Ejemplo n.º 7
0
def main():
    """Run the TRF language-model pipeline for each configured task size.

    Command-line flags select the stages: ``-class`` (data preparation
    only), ``-train``, ``-plot``, ``-rescore`` and ``-wer``.  ``-all``
    enables ``-train``, ``-rescore`` and ``-wer``.  Results are collected
    in ``result.txt`` under the name ``'<tsize>:<model basename>'``.
    """
    if len(sys.argv) == 1:
        print('\"python run.py -train\" train LSTM\n',
              '\"python run.py -rescore\" rescore nbest\n',
              '\"python run.py -wer\" compute WER')

    def stage_on(flag):
        # A stage runs when its own flag or the catch-all '-all' is present.
        return flag in sys.argv or '-all' in sys.argv

    for tsize in [1]:
        bindir = '../../tools/trf/bin/'
        tskdir = '{}/'.format(tsize)
        workdir = tskdir + 'trflm/'

        fres = wb.FRes('result.txt')
        model = trf.model(bindir, workdir)

        # Experiment settings.
        n_class = 200
        train_id = workdir + 'train.id'
        valid_id = workdir + 'valid.id'
        test_id = workdir + 'test.id'
        vocab = workdir + 'vocab_c{}.list'.format(n_class)
        order = 4
        feat = 'g4_w_c_ws_cs_wsh_csh_tied.fs'
        max_len = 100
        n_iter = 50000
        t0 = 2000
        batch = 100
        gamma_lambda = '1000,0'
        gamma_zeta = '0,0.6'
        l2_reg = 1e-6
        n_thread = 8

        # feat[0:-3] drops the '.fs' extension of the feature file name.
        model_prefix = workdir + 'trf_c{}_{}'.format(n_class, feat[0:-3])
        row_name = '{}:{}'.format(tsize, os.path.split(model_prefix)[1])

        if '-class' in sys.argv:
            # just cluster for each tsks.
            model.prepare(
                data(tskdir)[0],
                data(tskdir)[1],
                data(tskdir)[2], n_class)

        if stage_on('-train'):
            train_config = ''.join([
                '-vocab {} -train {} -valid {} -test {} '.format(
                    vocab, train_id, valid_id, test_id),
                ' -order {} -feat {} '.format(order, feat),
                ' -len {} '.format(max_len),
                ' -write {0}.model -log {0}.log '.format(model_prefix),
                ' -t0 {} -iter {}'.format(t0, n_iter),
                ' -gamma-lambda {} -gamma-zeta {}'.format(
                    gamma_lambda, gamma_zeta),
                ' -L2 {} '.format(l2_reg),
                ' -mini-batch {} '.format(batch),
                ' -thread {} '.format(n_thread),
                ' -print-per-iter 10 -write-at-iter 10000:10000:{}'.format(
                    n_iter),
            ])
            model.prepare(
                data(tskdir)[0],
                data(tskdir)[1],
                data(tskdir)[2], n_class)
            model.train(train_config)
            # Record the last value found in the training log.
            log_ll = model.get_last_value(model_prefix + '.log')
            fres.AddLL(row_name, log_ll, data(tskdir)[0:3])

        if '-plot' in sys.argv:
            baseline = fres.Get('{}:KN5'.format(tsize))
            trf.PlotLog([model_prefix], [baseline])

        if stage_on('-rescore'):
            rescore_config = ''.join([
                ' -vocab {} '.format(vocab),
                ' -read {}.model '.format(model_prefix),
                ' -nbest {} '.format(data(tskdir)[3]),
                ' -lmscore {0}.lmscore -lmscore-test-id {0}.test-id '.format(
                    model_prefix),
            ])
            model.use(rescore_config)

        if stage_on('-wer'):
            # The fourth unpacked entry is ignored: the score file written
            # by the rescoring stage is used instead.
            nbest_path, templ_path, acscore_path, _ = data(tskdir)[3:7]
            lmscore_path = model_prefix + '.lmscore'

            lm_scores = wb.LoadScore(lmscore_path)
            ac_scores = wb.LoadScore(acscore_path)
            scale_grid = np.linspace(0.1, 0.9, 9)
            wer, lmscale, acscale = wb.TuneWER(nbest_path, templ_path,
                                               lm_scores, ac_scores,
                                               scale_grid)
            print('wer={:.4f} lmscale={:.2f} acscale={:.2f}'.format(
                wer, lmscale, acscale))

            # calculate the ppl on wsj test
            templ_id_path = workdir + os.path.split(templ_path)[1] + '.id'
            vocab_table = trf.ReadVocab(vocab)
            trf.NbestToID(templ_path, templ_id_path, vocab_table)
            test_config = ''.join([
                ' -vocab {} '.format(vocab),
                ' -read {}.model '.format(model_prefix),
                ' -test {} '.format(templ_id_path),
            ])
            ll_templ = model.use(test_config)
            ppl_templ = wb.LL2PPL(-ll_templ, templ_id_path)

            # output the result
            fres.Add(row_name, ['LL-wsj', 'PPL-wsj'], [ll_templ, ppl_templ])
            fres.AddWER(row_name, wer)