Beispiel #1
0
    def train(self,
              order,
              write_model,
              output_res='',
              discount='-kndiscount',
              cutoff=()):
        """Train an n-gram model with ``ngram-count`` and optionally record its PPL.

        Two ``ngram-count`` commands are run through ``os.system``: the first
        writes the counts of ``<workdir>train.no`` to a ``.count`` file, the
        second reads that file back and writes the language model.

        Args:
            order: n-gram order passed as ``-order``.
            write_model: path of the language model file to write (``-lm``).
            output_res: path of a result file; when non-empty the perplexity
                on ``train.no``/``valid.no``/``test.no`` in the work directory
                is computed and stored there under the name ``KN<order>``.
            discount: discounting option string appended to the second
                command.
            cutoff: sequence of values; element ``i`` (0-based) is passed to
                the counting command as ``-gt<i+1>min``.  The default is an
                immutable empty tuple so the default object can never be
                shared and mutated between calls.
        """
        write_count = self.workdir + os.path.split(write_model)[1] + '.count'

        # Pass 1: write the n-gram counts, applying the requested cutoffs.
        cmd = self.bindir + 'ngram-count '
        cmd += ' -text {0}train.no -vocab {0}vocab'.format(self.workdir)
        cmd += ' -order {} -write {} '.format(order, write_count)
        for n, min_count in enumerate(cutoff, start=1):
            cmd += ' -gt{}min {}'.format(n, min_count)
        os.system(cmd)

        # Pass 2: estimate the model from the count file.  All -gtNmin values
        # are forced to 0 here (presumably so the cutoffs already applied
        # while counting are not applied a second time -- TODO confirm).
        cmd = self.bindir + 'ngram-count '
        cmd += ' -vocab {}vocab'.format(self.workdir)
        cmd += ' -read {}'.format(write_count)
        cmd += ' -order {} -lm {} '.format(order, write_model)
        cmd += discount + ' -interpolate -gt1min 0 -gt2min 0 -gt3min 0 -gt4min 0 -gt5min 0'
        os.system(cmd)

        # get ppl
        if output_res != '':
            testno = [
                self.workdir + i + '.no' for i in ['train', 'valid', 'test']
            ]
            PPL = []
            for text_file in testno:
                cmd = self.bindir + 'ngram -order {} -lm {} -ppl {}'.format(
                    order, write_model, text_file)
                res = os.popen(cmd).read()
                # The perplexity is the token that follows 'ppl=' in the
                # command output.
                PPL.append(float(res[res.find('ppl='):].split()[1]))
            res_file = wb.FRes(output_res)
            res_file.AddPPL('KN{}'.format(order), PPL, testno)
Beispiel #2
0
def main():
    """Run the n-gram experiments for every training-set size.

    Stages are chosen with command-line flags (``-train``, ``-test``,
    ``-rescore``, ``-wer``); ``-all`` switches every stage on.  Results are
    stored in ``result.txt`` through ``wb.FRes``.
    """
    print(sys.argv)
    if len(sys.argv) == 1:
        print('\"python run_ngram.py -train\" train \n',
              '\"python run_ngram.py -rescore\" rescore nbest\n',
              '\"python run_ngram.py -wer\" compute WER')

    def requested(flag):
        # A stage runs when its own flag or the catch-all '-all' is present.
        return flag in sys.argv or '-all' in sys.argv

    srilm_bin = '../../tools/srilm/'
    results = wb.FRes('result.txt')
    orders = [5]

    for tsize in [1, 2, 4]:
        task_dir = '{}/'.format(tsize)
        work_dir = task_dir + 'ngramlm/'
        lm = ngram.model(srilm_bin, work_dir)

        for position, order in enumerate(orders):
            lm_path = work_dir + '{}gram.lm'.format(order)
            row_name = '{}:KN{}'.format(tsize, order)
            # '<n>gram.lm' -> '<n>gram.lmscore'
            score_path = lm_path[:-3] + '.lmscore'

            print(lm_path)

            if requested('-train'):
                # The data only has to be prepared once per work directory,
                # i.e. for the first order in the list.
                if position == 0:
                    lm.prepare(
                        data(task_dir)[0],
                        data(task_dir)[1],
                        data(task_dir)[2])
                lm.train(order, lm_path)

            if requested('-test'):
                ppl_values = [
                    lm.ppl(lm_path, order, data(task_dir)[k])
                    for k in range(3)
                ]
                results.AddPPL(row_name, ppl_values, data(task_dir)[0:3])

            if requested('-rescore'):
                lm.rescore(lm_path, order, data(task_dir)[3], score_path)

            if requested('-wer'):
                nbest, templ = data(task_dir)[3:5]
                lm_scores = wb.LoadScore(score_path)
                ac_scores = wb.LoadScore(data(task_dir)[5])

                wer, lmscale, acscale = wb.TuneWER(
                    nbest, templ, lm_scores, ac_scores,
                    np.linspace(0.1, 0.9, 9))
                print('wer={} lmscale={} acscale={}'.format(
                    wer, lmscale, acscale))
                results.AddWER(row_name, wer)

                # PPL / LL on the transcript with its labels removed.
                plain_trans = work_dir + os.path.split(templ)[-1] + '.txt'
                wb.file_rmlabel(templ, plain_trans)
                trans_ppl = lm.ppl(lm_path, order, plain_trans)
                trans_ll = -wb.PPL2LL(trans_ppl, plain_trans)
                results.Add(row_name, ['LL-wsj', 'PPL-wsj'],
                            [trans_ll, trans_ppl])
Beispiel #3
0
def wer_all(lmpaths, lmtypes, outlog):
    """Tune the WER for each entry of ``lmtypes`` and record it in ``outlog``.

    Args:
        lmpaths: mapping from a model name to the path of its score file; a
            path may contain the placeholder ``<run>``.
        lmtypes: model names to evaluate; ``'a+b'`` combines the scores of
            models ``a`` and ``b``.
        outlog: path of the result file, which is cleaned before use.
    """
    results = wb.FRes(outlog, True)
    results.Clean()

    read_nbest, read_trans, read_acscore = data()[3:6]
    lmscale_vec = np.linspace(0.1, 0.9, 9)
    weight_vec = np.linspace(0.5, 0.5, 1)

    for lmtype in lmtypes:
        parts = lmtype.split('+')

        # Paths holding '<run>' stand for ten separate runs (run0..run9).
        if any('<run>' in lmpaths[part] for part in parts):
            run_vec = range(0, 10)
            run_name = lmtype + ':<run>'
        else:
            run_vec = [0]
            run_name = lmtype

        for run in run_vec:
            run_str = 'run{}'.format(run)
            name = run_name.replace('<run>', run_str)
            best = [100, 1.0, 1.0]
            best_weight = 1.0

            if len(parts) == 1:
                lmscore = wb.LoadScore(
                    lmpaths[parts[0]].replace('<run>', run_str))
                best = wb.TuneWER(read_nbest, read_trans, lmscore,
                                  read_acscore, lmscale_vec)
                best_weight = 1.0
            else:
                first = np.array(
                    wb.LoadScore(lmpaths[parts[0]].replace('<run>', run_str)))
                second = np.array(
                    wb.LoadScore(lmpaths[parts[1]].replace('<run>', run_str)))

                # Keep the interpolation weight that gives the lowest WER.
                for w in weight_vec:
                    lmscore = w * first + (1 - w) * second
                    wer, lmscale, acscale = wb.TuneWER(
                        read_nbest, read_trans, lmscore, read_acscore,
                        lmscale_vec)
                    if wer < best[0]:
                        best = [wer, lmscale, acscale]
                        best_weight = w

            results.Add(name, ['wer', 'lmscale', 'acscale', 'weight'],
                        best + [best_weight])
Beispiel #4
0
def main():
    """Train, rescore and evaluate n-gram models of order 2 to 5.

    The stages are selected with the command-line flags ``-train``,
    ``-rescore`` and ``-wer``; results go to ``models_ppl.txt`` in the
    current working directory.
    """
    print(sys.argv)
    if len(sys.argv) == 1:
        print('\"python run_ngram.py -train\" train \n',
              '\"python run_ngram.py -rescore\" rescore nbest\n',
              '\"python run_ngram.py -wer\" compute WER')

    root = os.getcwd() + '/'
    srilm_bin = root + '../../tools/srilm/'
    work_dir = root + 'ngramlm/'
    wb.mkdir(work_dir)

    # Every data path is made absolute by prefixing the working directory.
    paths = [root + rel for rel in data()]
    result_path = root + 'models_ppl.txt'
    lm = ngram.model(srilm_bin, work_dir)
    orders = [2, 3, 4, 5]

    do_train = '-train' in sys.argv
    do_rescore = '-rescore' in sys.argv
    do_wer = '-wer' in sys.argv

    for position, order in enumerate(orders):
        lm_path = work_dir + '{}gram.lm'.format(order)
        # '<n>gram.lm' -> '<n>gram.lmscore'
        score_path = lm_path[:-3] + '.lmscore'
        print(lm_path)

        if do_train:
            # Prepare the data once, before the first model is trained.
            if position == 0:
                lm.prepare(paths[0], paths[1], paths[2])
            lm.train(order, lm_path, result_path)

        if do_rescore:
            lm.rescore(lm_path, order, paths[3], score_path)

        if do_wer:
            nbest, templ = paths[3:5]
            lm_scores = wb.LoadScore(score_path)
            ac_scores = wb.LoadScore(paths[5])

            wer, lmscale, acscale = wb.TuneWER(
                nbest, templ, lm_scores, ac_scores,
                np.linspace(0.1, 0.9, 9))
            print('wer={} lmscale={} acscale={}'.format(wer, lmscale, acscale))
            results = wb.FRes(result_path)
            row_name = 'KN{}'.format(order)
            results.AddWER(row_name, wer)

            # PPL / LL on the transcript with its labels removed.
            plain_trans = work_dir + os.path.split(templ)[-1] + '.txt'
            wb.file_rmlabel(templ, plain_trans)
            trans_ppl = lm.ppl(lm_path, order, plain_trans)
            trans_ll = -wb.PPL2LL(trans_ppl, plain_trans)
            results.Add(row_name, ['LL-wsj', 'PPL-wsj'],
                        [trans_ll, trans_ppl])
def main():
    """Train and evaluate TRF models over several independent runs.

    The work is selected with command-line flags:

    * ``-res``     average every result column over the runs
    * ``-train``   train one model per run
    * ``-plot``    plot the training log against the ``KN5`` result row
    * ``-rescore`` rescore the n-best list
    * ``-wer``     tune the WER and record LL / PPL / WER
    * ``-stat``    WER of the intermediate models written during training
    * ``-ais``     re-estimate the normalization constants with AIS
    * ``-all``     shorthand for ``-train``, ``-rescore`` and ``-wer``

    Results are stored in ``models_ppl.txt`` through ``wb.FRes``.
    """
    if len(sys.argv) == 1:
        print('\"python run.py -train\" train LSTM\n',
              '\"python run.py -rescore\" rescore nbest\n',
              '\"python run.py -wer\" compute WER'
              )

    run_times = range(0, 10)   # for multiple run

    bindir = '../../tools/trf/bin/'
    workdir = 'trflm/'
    fres = wb.FRes('models_ppl.txt')
    model = trf.model(bindir, workdir)

    class_num = 200
    train = workdir + 'train.id'
    valid = workdir + 'valid.id'
    test = workdir + 'test.id'
    vocab = workdir + 'vocab_c{}.list'.format(class_num)
    order = 4
    feat = 'g4_w_c_ws_cs_wsh_csh_tied.fs'
    #feat = 'g4_w_c_ws_cs_cpw.fs'
    maxlen = 0
    tmax = 20000
    t0 = 2000
    minibatch = 100
    gamma_lambda = '3000,0'
    gamma_zeta = '0,0.6'
    reg = 4e-5
    thread = 8

    if '-res' in sys.argv:
        # Column 0 of each row is skipped; every other column is averaged
        # over the runs and stored as 'mean+std' in the '.runavg' row.
        fres.Read()
        for i in range(1,len(fres.head)):
            value = []
            for runnum in run_times:
                write_name = 'trf_c{}_{}.run{}'.format(class_num, feat[0:-3], runnum)
                line = fres.Get(write_name)
                value.append(line[i])
            fres.Add('trf_c{}_{}.runavg'.format(class_num, feat[0:-3]), [fres.head[i]],
                     ['{:.2f}+{:.2f}'.format(np.mean(value), np.std(value))] )

    for runnum in run_times:
        # feat[0:-3] drops the '.fs' extension of the feature file name.
        write_model = workdir + 'trf_c{}_{}.run{}'.format(class_num, feat[0:-3], runnum)

        if '-train' in sys.argv or '-all' in sys.argv:
            config = '-vocab {} -train {} -valid {} -test {} '.format(vocab, train, valid, test)
            config += ' -order {} -feat {} '.format(order, feat)
            config += ' -len {} '.format(maxlen)
            config += ' -write {0}.model -log {0}.log '.format(write_model)
            config += ' -t0 {} -iter {}'.format(t0, tmax)
            config += ' -gamma-lambda {} -gamma-zeta {}'.format(gamma_lambda, gamma_zeta)
            config += ' -L2 {} '.format(reg)
            config += ' -mini-batch {} '.format(minibatch)
            config += ' -thread {} '.format(thread)
            config += ' -print-per-iter 10 '
            config += ' -write-at-iter [{}:1000:{}]'.format(tmax-5000, tmax)  # output the intermediate models
            model.prepare(data()[0], data()[1], data()[2], class_num)
            model.train(config)
        if '-plot' in sys.argv:
            baseline = fres.Get('KN5')
            trf.PlotLog([write_model], [baseline])
        if '-rescore' in sys.argv or '-all' in sys.argv:
            config = ' -vocab {} '.format(vocab)
            config += ' -read {}.model '.format(write_model)
            config += ' -nbest {} '.format(data()[3])
            config += ' -lmscore {0}.lmscore -lmscore-test-id {0}.test-id '.format(write_model)
            model.use(config)
        if '-wer' in sys.argv or '-all' in sys.argv:
            # The LM score file is the one written by the rescoring step
            # above, so only three entries of data() are needed here.
            read_nbest, read_templ, read_acscore = data()[3:6]
            read_lmscore = write_model + '.lmscore'

            [wer, lmscale, acscale] = wb.TuneWER(read_nbest, read_templ,
                                                 wb.LoadScore(read_lmscore),
                                                 wb.LoadScore(read_acscore), np.linspace(0.1,0.9,9))
            print('wer={:.4f} lmscale={:.2f} acscale={:.2f}'.format(wer, lmscale, acscale))

            # calculate the ppl on wsj test
            write_templ_id = workdir + os.path.split(read_templ)[1] + '.id'
            v = trf.ReadVocab(vocab)
            trf.NbestToID(read_templ, write_templ_id, v)
            config = ' -vocab {} '.format(vocab)
            config += ' -read {}.model '.format(write_model)
            config += ' -test {} '.format(write_templ_id)
            LL_templ = model.use(config)
            PPL_templ = wb.LL2PPL(-LL_templ, write_templ_id)
            LL = model.get_last_value(write_model + '.log')

            # output the result
            name = os.path.split(write_model)[1]
            fres.AddLL(name, LL, data()[0:3])
            fres.Add(name, ['LL-wsj', 'PPL-wsj'], [LL_templ, PPL_templ])
            fres.AddWER(name, wer)
        if '-stat' in sys.argv:
            # calculate the mean and var of wers of the intermediate models
            inte_wer = []
            inte_model = []

            # find model: files named '<write_model>...n<t>.model'
            for file_name in os.listdir(os.path.split(write_model)[0]):
                file_path = os.path.split(write_model)[0] + os.sep + file_name
                if not os.path.isfile(file_path):
                    continue
                if file_name.find(os.path.split(write_model)[1]) == 0 and \
                    file_path.split('.')[-1] == 'model' and \
                    file_path.split('.')[-2][0] == 'n':
                    inte_model.append(file_path)

            # compute wer; the context manager closes the log even when a
            # rescoring step raises.
            with open(workdir + 'inte_model_wer.log', 'wt') as flog:
                for file_path in sorted(inte_model):
                    print(file_path)
                    # the second-to-last dotted field is 'n<t>'
                    t = int(file_path.split('.')[-2][1:])

                    # lmscore
                    write_lmscore = os.path.splitext(file_path)[0] + '.lmscore'
                    config = ' -vocab {} '.format(vocab)
                    config += ' -read {} '.format(file_path)
                    config += ' -nbest {} '.format(data()[3])
                    config += ' -lmscore {0} '.format(write_lmscore)
                    model.use(config, False)
                    # wer
                    [wer, lmscale, acscale] = wb.TuneWER(data()[3], data()[4],
                                                         wb.LoadScore(write_lmscore),
                                                         wb.LoadScore(data()[5]), np.linspace(0.1, 0.9, 9))
                    print('t={} wer={}'.format(t, wer))
                    flog.write('{} \t wer={}\n'.format(file_path, wer))
                    inte_wer.append([t, wer])

            # plot wer; the statistics cover the last 20 intermediate models
            inte_wer = sorted(inte_wer, key=lambda d: d[0])
            t_list = [i[0] for i in inte_wer]
            wer_list = [i[1] for i in inte_wer]
            wer_mean = np.mean(wer_list[-20:])
            wer_std = np.std(wer_list[-20:])
            print('wer_mean={}  wer_std={}'.format(wer_mean, wer_std))

            plt.figure()
            plt.plot(t_list, wer_list)
            plt.xlabel('t')
            plt.ylabel('wer')
            plt.show()
        if '-ais' in sys.argv:
            [read_nbest, read_templ, read_acscore] = data()[3:6]
            write_templ_id = workdir + os.path.split(read_templ)[1] + '.id'
            v = trf.ReadVocab(vocab)
            trf.NbestToID(read_templ, write_templ_id, v)

            # run AIS to calculate the normalization constants of the model;
            # skipped when the AIS model file already exists
            ais_chain = 10
            ais_inter = 10000
            ais_model = '{}.ais{}_{}.model'.format(write_model, ais_chain, ais_inter)
            if not os.path.exists(ais_model):
                config = ' -vocab {0} -read {1}.model -write {2}'.format(vocab, write_model, ais_model)
                config += ' -norm-method AIS -AIS-chain {} -AIS-inter {} -thread {} '.format(ais_chain, ais_inter, thread)
                config += ' -norm-len-max {} '.format(trf.FileMaxLen(read_nbest)-1)  # just compute the needed length
                model.use(config)

            # rescore and compute wer
            write_lmscore = os.path.splitext(ais_model)[0] + '.lmscore'
            config = ' -vocab {} -read {}'.format(vocab, ais_model)
            config += ' -nbest {} -test {} '.format(read_nbest, write_templ_id)  # calculate the ppl of test set
            config += ' -lmscore {} '.format(write_lmscore)
            LL_templ = model.use(config, False)
            PPL_templ = wb.LL2PPL(-LL_templ, write_templ_id)
            [wer, lmscale, acscale] = wb.TuneWER(read_nbest, read_templ,
                                                 write_lmscore, read_acscore, np.linspace(0.1, 0.9, 9))
            # calculate the LL of train/valid/test
            LL = [0]*3
            id_data = [train, valid, test]  # are id files
            for i in range(3):
                config = ' -vocab {} -read {} -test {} '.format(vocab, ais_model, id_data[i])
                LL[i] = model.use(config, False)

            # write to res file
            name = os.path.split(write_model)[1]+":AIS{}-{}".format(ais_chain, ais_inter)
            fres.AddLL(name, LL, id_data)
            fres.AddWER(name, wer)
            fres.Add(name, ['LL-wsj', 'PPL-wsj'], [LL_templ, PPL_templ])
Beispiel #6
0
def main():
    """Evaluate AIS-based normalization-constant estimates of trained TRF models.

    Each command-line flag selects one independent experiment:

    * ``-wer``  average the stored result columns over ``run_times``
    * ``-ais``  run AIS on every model, rescore it and store LL / PPL / WER
    * ``-cmp``  compare the variance of the AIS weights with the variance
      of logz over repeated AIS runs
    * ``-cmp2`` plot logz of SAMS against the AIS estimates
    * ``-cmp3`` write a model whose logz is the mean of ten AIS runs and
      evaluate it
    * ``-wer3`` evaluate AIS models before and after a linear fit of logz
    * ``-wer2`` evaluate AIS models whose logz is replaced by the mean AIS
      weight per length

    Results are stored in ``models_ppl.txt`` through ``wb.FRes``.
    """
    if len(sys.argv) == 1:
        print('\"python run.py -train\" train LSTM\n',
              '\"python run.py -rescore\" rescore nbest\n',
              '\"python run.py -wer\" compute WER')

    run_times = range(0, 1)  # for multiple run

    bindir = '../../tools/trf/bin/'
    workdir = 'trflm/'
    fres = wb.FRes('models_ppl.txt')
    model = trf.model(bindir, workdir)

    class_num = 200
    train = workdir + 'train.id'
    valid = workdir + 'valid.id'
    test = workdir + 'test.id'
    vocab = workdir + 'vocab_c{}.list'.format(class_num)
    thread = 18

    # AIS settings used by the '-wer', '-ais' and '-cmp2' branches
    ais_chain = 10
    ais_inter = 200000

    if '-wer' in sys.argv:
        # average every result column of the AIS rows over the runs in run_times
        res_list = []
        for runnum in run_times:
            name = 'trf_c200_g4_w_c_ws_cs_wsh_csh_tied.run{}.ais{}_{}'.format(
                runnum, ais_chain, ais_inter)
            # column 0 of the row is skipped
            res = fres.Get(name)[1:]
            # the first run decides how many columns are collected
            if run_times.index(runnum) == 0:
                res_list = [[] for i in range(len(res))]
            for i in range(len(res)):
                res_list[i].append(res[i])
        name = 'trf_c200_g4_w_c_ws_cs_wsh_csh_tied.runavg.ais{}_{}'.format(
            ais_chain, ais_inter)
        head = fres.GetHead()[1:]
        # store 'mean+std' of each column under the '.runavg' row
        for i in range(len(head)):
            mean = np.mean(res_list[i])
            std = np.std(res_list[i])
            fres.Add(name, [head[i]], ['{:.2f}+{:.2f}'.format(mean, std)])

    if '-ais' in sys.argv:
        for runnum in run_times:
            write_model = workdir + 'trf_c200_g4_w_c_ws_cs_wsh_csh_tied.run{}'.format(
                runnum)

            [read_nbest, read_templ, read_acscore] = data()[3:6]
            # id version of the transcript, written into the work directory
            write_templ_id = workdir + os.path.split(read_templ)[1] + '.id'
            v = trf.ReadVocab(vocab)
            trf.NbestToID(read_templ, write_templ_id, v)

            # run AIS to calculate the normalization constants of the model;
            # skipped when the AIS model file already exists
            ais_model = '{}.ais{}_{}'.format(write_model, ais_chain, ais_inter)
            if not os.path.exists(ais_model + '.model'):
                config = ' -vocab {0} -read {1}.model -write {2}.model -log {2}.log'.format(
                    vocab, write_model, ais_model)
                config += ' -norm-method AIS -AIS-chain {} -AIS-inter {} -thread {} '.format(
                    ais_chain, ais_inter, thread)
                config += ' -norm-len-max {} '.format(
                    trf.FileMaxLen(read_nbest) -
                    1)  # just compute the needed length
                model.use(config)

            # rescore and compute wer
            write_lmscore = ais_model + '.lmscore'
            config = ' -vocab {} -read {}.model'.format(vocab, ais_model)
            config += ' -nbest {} -test {} '.format(
                read_nbest, write_templ_id)  # calculate the ppl of test set
            config += ' -lmscore {} '.format(write_lmscore)
            LL_templ = model.use(config, False)
            PPL_templ = wb.LL2PPL(-LL_templ, write_templ_id)
            # NOTE(review): the score arguments are file paths here, not
            # loaded scores -- presumably wb.TuneWER accepts both; confirm.
            [wer, lmscale, acscale] = wb.TuneWER(read_nbest, read_templ,
                                                 write_lmscore, read_acscore,
                                                 np.linspace(0.1, 0.9, 9))
            # calculate the LL of train/valid/test
            LL = [0] * 3
            id_data = [train, valid, test]  # are id files
            for i in range(3):
                config = ' -vocab {} -read {}.model -test {} '.format(
                    vocab, ais_model, id_data[i])
                LL[i] = model.use(config, False)

            # write to res file
            name = os.path.split(ais_model)[-1]
            fres.AddLL(name, LL, id_data)
            fres.AddWER(name, wer)
            fres.Add(name, ['LL-wsj', 'PPL-wsj'], [LL_templ, PPL_templ])

    if '-cmp' in sys.argv:
        # compare the variance of exp(logz) with the variance of AIS weight
        # Load the logz of 10 independent runs
        # NOTE(review): the file names below hard-code 'ais10_20000' and do
        # not follow ais_chain / ais_inter set above.
        multi_run = 10
        logzs = []
        for i in range(multi_run):
            logz = trf.LoadLogz(
                workdir +
                'trf_c200_g4_w_c_ws_cs_wsh_csh_tied.run0.ais10_20000.run{}.model'
                .format(i))
            # only the first 33 entries are used
            logzs.append(logz[0:33])
        # transposed so that each row holds one logz entry across the runs
        mat_logzs = np.matrix(logzs).T

        # Load the weight of each length
        logws = []
        with open(workdir +
                  'trf_c200_g4_w_c_ws_cs_wsh_csh_tied.run0.ais10_20000.log'
                  ) as f:
            for line in f:
                # keep the numbers that follow 'logw=' on the line
                idx = line.find('logw=')
                if idx != -1:
                    a = line[idx:].split()[1:]
                    logws.append([float(i) for i in a])
        mat_logws = np.matrix(logws)

        # mat_var is defined elsewhere in the file
        w_var = mat_var(mat_logws)
        z_var = mat_var(mat_logzs)

        for i in range(len(w_var)):
            rate = np.exp(w_var[i] - z_var[i])
            print('len={} w_var={} z_var={} rate={}'.format(
                i + 1, w_var[i], z_var[i], rate))
    if '-cmp2' in sys.argv:
        # compare the logz of AIS and the SAMS
        write_model = workdir + 'trf_c200_g4_w_c_ws_cs_wsh_csh_tied.run0'
        logz_sams = trf.LoadLogz(write_model + '.model')
        logz_ais = trf.LoadLogz('{}.ais{}_{}.model'.format(
            write_model, ais_chain, ais_inter))
        plt.figure()
        plt.plot(logz_sams[0:33], 'r-', label='sams')
        # one dashed green curve per repeated 'ais10_20000' run
        logz_ais10 = []
        for n in range(10):
            logz_ais10.append(
                trf.LoadLogz('{}.ais10_20000.run{}.model'.format(
                    write_model, n)))
            plt.plot(logz_ais10[-1][0:33], 'g--')
        # element-wise mean of the ten runs over the first 33 entries
        logz_ais_m = [0] * 33
        for i in range(33):
            for n in range(10):
                logz_ais_m[i] += logz_ais10[n][i]
            logz_ais_m[i] /= 10
        plt.plot(logz_ais_m[0:33], 'r--')
        plt.plot(logz_ais[0:33], 'b--', label='ais 10-200K')
        #plt.legend()
        plt.show()

    if '-cmp3' in sys.argv:
        trf_model = workdir + 'trf_c200_g4_w_c_ws_cs_wsh_csh_tied.run0'
        # revise the logz of the trf model to the mean of results of 10 (10-20k) runs
        logz_sams = trf.LoadLogz(trf_model + '.model')
        logz_ais10 = []
        for n in range(10):
            logz_ais10.append(
                trf.LoadLogz('{}.ais10_20000.run{}.model'.format(trf_model,
                                                                 n)))
        # element-wise mean of the ten runs over the first 33 entries
        logz_ais_m = [0] * 33
        for i in range(33):
            for n in range(10):
                logz_ais_m[i] += logz_ais10[n][i]
            logz_ais_m[i] /= 10

        print(logz_ais_m)
        ais_model = workdir + 'trf_c200_g4_w_c_ws_cs_wsh_csh_tied.run0.ais10_20000.runavg.model'
        print('write -> ' + ais_model)
        # revise_logz is defined elsewhere in the file
        revise_logz(trf_model + '.model', ais_model, logz_ais_m)

        # compute WER
        print('computer WER')
        wer = model.wer(vocab, ais_model, data()[3], data()[4], data()[5])
        print('WER={}'.format(wer))

        # compute PPL
        # NOTE(review): the '-wer3' branch unpacks this call's result as
        # [ppl, LL]; here the whole return value is printed.
        print('computer PPL')
        ppl = model.ppl(vocab, ais_model, data()[4], True)
        print('PPL={}'.format(ppl))

        # plot the logzs
        plt.figure()
        for n in range(10):
            plt.plot(logz_ais10[n][0:33], 'g-')
        plt.plot(logz_ais_m[0:33], 'r', label='ais10-20K-mean')
        plt.plot(logz_sams[0:33], 'b', label='sams')
        plt.legend()
        plt.show()

    if '-wer3' in sys.argv:
        # smooth zeta
        # per-run results of the plain AIS model and of its smoothed version
        wer_ais = []
        wer_smooth = []
        ppl_ais = []
        ppl_smooth = []
        ll_ais = []
        ll_smooth = []
        for n in range(10):
            ais_name = workdir + 'trf_c200_g4_w_c_ws_cs_wsh_csh_tied.run{}.ais10_20000'.format(
                n)
            print(ais_name)

            # fit a straight line (degree-1 polynomial) to logz over the
            # positions 1..33 and write a '.smooth.model' with the fitted values
            logz_ais = trf.LoadLogz(ais_name + '.model')[0:33]
            z = np.polyfit(np.linspace(1, 33, 33), logz_ais, 1)
            logz_ais_smooth = z[0] * np.linspace(1, 33, 33) + z[1]
            revise_logz(ais_name + '.model', ais_name + '.smooth.model',
                        logz_ais_smooth.tolist())
            print(logz_ais)
            print(logz_ais_smooth.tolist())

            # the comparison plot is drawn for the first run only
            if n == 0:
                logz_sams = trf.LoadLogz(
                    workdir +
                    'trf_c200_g4_w_c_ws_cs_wsh_csh_tied.run{}.model'.format(n))
                # load_ais_weight is defined elsewhere in the file
                logw = load_ais_weight(ais_name + '.log')
                plt.figure()
                # black dots: the values of logw[i] drawn at x = i + 1
                for i in range(len(logw)):
                    plt.plot((i + 1) * np.ones(len(logw[i])), logw[i], 'k.')
                plt.plot(np.linspace(1, 33, 33),
                         logz_ais,
                         'r-',
                         label='standard AIS')
                plt.plot(np.linspace(1, 33, 33),
                         logz_ais_smooth,
                         'g-',
                         label='smoothed AIS')
                plt.plot(np.linspace(1, 33, 33),
                         logz_sams[0:33],
                         'b-',
                         label='SAMS')
                plt.legend()
                plt.xlim(1, 33)
                plt.xlabel('length')
                plt.ylabel('logZ')
                plt.show()

            # evaluate the plain AIS model
            wer = model.wer(vocab, ais_name + '.model',
                            data()[3],
                            data()[4],
                            data()[5])
            [ppl, LL] = model.ppl(vocab, ais_name + '.model', data()[4], True)
            wer_ais.append(wer)
            ppl_ais.append(ppl)
            ll_ais.append(LL)

            fres.Add(
                os.path.split(ais_name)[-1], ['WER', 'LL-wsj', 'PPL-wsj'],
                [wer, LL, ppl])

            # evaluate the smoothed model
            wer = model.wer(vocab, ais_name + '.smooth.model',
                            data()[3],
                            data()[4],
                            data()[5])
            [ppl, LL] = model.ppl(vocab, ais_name + '.smooth.model',
                                  data()[4], True)
            wer_smooth.append(wer)
            ppl_smooth.append(ppl)
            ll_smooth.append(LL)

            fres.Add(
                os.path.split(ais_name)[-1] + '.smooth',
                ['WER', 'LL-wsj', 'PPL-wsj'], [wer, LL, ppl])

        # 'mean+std' over the ten runs, plain AIS models
        for label, d in zip(['WER', 'LL-wsj', 'PPL-wsj'],
                            [wer_ais, ll_ais, ppl_ais]):
            cur_mean = np.mean(d)
            cur_std = np.std(d)
            fres.Add('trf_c200_g4_w_c_ws_cs_wsh_csh_tied.runavg.ais10_20000',
                     [label], ['{:.2f}+{:.2f}'.format(cur_mean, cur_std)])

        # 'mean+std' over the ten runs, smoothed models
        for label, d in zip(['WER', 'LL-wsj', 'PPL-wsj'],
                            [wer_smooth, ll_smooth, ppl_smooth]):
            cur_mean = np.mean(d)
            cur_std = np.std(d)
            fres.Add(
                'trf_c200_g4_w_c_ws_cs_wsh_csh_tied.runavg.ais10_20000.smooth',
                [label], ['{:.2f}+{:.2f}'.format(cur_mean, cur_std)])

    if '-wer2' in sys.argv:
        # perform adjust-AIS and evaluate the WER and PPL

        results = []
        for n in range(10):
            ais_name = workdir + 'trf_c200_g4_w_c_ws_cs_wsh_csh_tied.run{}.ais10_20000'.format(
                n)
            print(ais_name)
            # replace logz by the mean of each entry of the AIS weights and
            # write the result as '.adjust.model'
            logw = load_ais_weight(ais_name + '.log')
            logz = [np.mean(a) for a in logw]
            revise_logz(ais_name + '.model', ais_name + '.adjust.model', logz)
            print('  wer')
            wer = model.wer(vocab, ais_name + '.adjust.model',
                            data()[3],
                            data()[4],
                            data()[5])
            print('  ppl')
            [ppl, LL] = model.ppl(vocab, ais_name + '.adjust.model',
                                  data()[4], True)
            fres.Add(
                os.path.split(ais_name)[-1] + '.ad',
                ['WER', 'LL-wsj', 'PPL-wsj'], [wer, LL, ppl])
            results.append([wer, LL, ppl])

        # mean and std of WER, LL and PPL over the ten runs
        res_mean = []
        res_std = []
        for i in range(3):
            a = [b[i] for b in results]
            res_mean.append(np.mean(a))
            res_std.append(np.std(a))
        fres.Add('trf_c200_g4_w_c_ws_cs_wsh_csh_tied.runavg.ais10_20000.ad', [
            'WER', 'LL-wsj', 'PPL-wsj'
        ], ['{:.2f}+{:.2f}'.format(res_mean[i], res_std[i]) for i in range(3)])
def main():
    """Repeat the AIS estimation on the ``run0`` TRF model and collect results.

    ``-ais`` performs one AIS estimation per entry of ``run_times``, then
    rescores and stores LL / PPL / WER in ``models_ppl.txt``; ``-wer``
    averages the stored result columns over those runs.
    """
    if len(sys.argv) == 1:
        print('\"python run.py -train\" train LSTM\n',
              '\"python run.py -rescore\" rescore nbest\n',
              '\"python run.py -wer\" compute WER')

    run_times = range(3, 10)  # indices of the repeated AIS runs

    bindir = '../../tools/trf/bin/'
    workdir = 'trflm/'
    fres = wb.FRes('models_ppl.txt')
    model = trf.model(bindir, workdir)

    class_num = 200
    id_data = [workdir + 'train.id', workdir + 'valid.id',
               workdir + 'test.id']  # are id files
    vocab = workdir + 'vocab_c{}.list'.format(class_num)
    thread = 8

    ais_chain = 10
    ais_inter = 20000

    if '-wer' in sys.argv:
        columns = []
        for position, runnum in enumerate(run_times):
            row_name = 'trf_c200_g4_w_c_ws_cs_wsh_csh_tied.run0.ais{}_{}.run{}'.format(
                ais_chain, ais_inter, runnum)
            # column 0 of the row is skipped
            values = fres.Get(row_name)[1:]
            # the first run decides how many columns are collected
            if position == 0:
                columns = [[] for _ in values]
            for col, value in enumerate(values):
                columns[col].append(value)

        avg_name = 'trf_c200_g4_w_c_ws_cs_wsh_csh_tied.run0.ais{}_{}.avg'.format(
            ais_chain, ais_inter)
        for col, label in enumerate(fres.GetHead()[1:]):
            mean = np.mean(columns[col])
            std = np.std(columns[col])
            fres.Add(avg_name, [label], ['{:.2f}+{:.2f}'.format(mean, std)])

    if '-ais' in sys.argv:
        base_model = workdir + 'trf_c200_g4_w_c_ws_cs_wsh_csh_tied.run0'
        for runnum in run_times:
            read_nbest, read_templ, read_acscore = data()[3:6]
            # id version of the transcript, written into the work directory
            templ_id = workdir + os.path.split(read_templ)[1] + '.id'
            trf.NbestToID(read_templ, templ_id, trf.ReadVocab(vocab))

            # run AIS to get the normalization constants, unless the model
            # of this run already exists
            ais_model = '{}.ais{}_{}.run{}.model'.format(
                base_model, ais_chain, ais_inter, runnum)
            if not os.path.exists(ais_model):
                ais_args = [
                    ' -vocab {0} -read {1}.model -write {2}'.format(
                        vocab, base_model, ais_model),
                    ' -norm-method AIS -AIS-chain {} -AIS-inter {} -thread {} '.format(
                        ais_chain, ais_inter, thread),
                    # just compute the needed length
                    ' -norm-len-max {} '.format(
                        trf.FileMaxLen(read_nbest) - 1),
                ]
                model.use(''.join(ais_args))

            # rescore the n-best list and compute LL / PPL / WER
            lmscore_file = os.path.splitext(ais_model)[0] + '.lmscore'
            rescore_args = [
                ' -vocab {} -read {}'.format(vocab, ais_model),
                # -test computes the LL of the transcript as well
                ' -nbest {} -test {} '.format(read_nbest, templ_id),
                ' -lmscore {} '.format(lmscore_file),
            ]
            LL_templ = model.use(''.join(rescore_args), False)
            PPL_templ = wb.LL2PPL(-LL_templ, templ_id)
            wer, _lmscale, _acscale = wb.TuneWER(
                read_nbest, read_templ, lmscore_file, read_acscore,
                np.linspace(0.1, 0.9, 9))

            # LL of train/valid/test
            LL = [
                model.use(
                    ' -vocab {} -read {} -test {} '.format(
                        vocab, ais_model, id_file), False)
                for id_file in id_data
            ]

            # write to res file; the row name is the model file name
            # without its extension
            name = os.path.splitext(os.path.split(ais_model)[-1])[0]
            fres.AddLL(name, LL, id_data)
            fres.AddWER(name, wer)
            fres.Add(name, ['LL-wsj', 'PPL-wsj'], [LL_templ, PPL_templ])
Beispiel #8
0
def main():
    """Run the TRF language-model stages selected by command-line flags.

    Each flag found in ``sys.argv`` enables one stage; ``-all`` enables
    ``-train``, ``-rescore`` and ``-wer`` together:

    * ``-class``: only call ``model.prepare`` on the task data.
    * ``-train``: prepare the data, train, then record the value returned by
      ``model.get_last_value`` for the training log with ``fres.AddLL``.
    * ``-plot``: plot the training log against the stored ``KN5`` result.
    * ``-rescore``: score the task's n-best list, writing
      ``<write_model>.lmscore``.
    * ``-wer``: tune LM/acoustic scales with ``wb.TuneWER``, print the WER,
      and record the LL/PPL measured on the transcript.

    ``data(tskdir)`` is indexed as: ``[0:3]`` corpora handed to
    ``model.prepare``, ``[3]`` n-best list, ``[4]`` transcript, ``[5]``
    acoustic scores.  It is called once per stage rather than once per
    element; this assumes it returns the same list on every call for a
    given task -- TODO confirm against its definition.
    """
    # NOTE(review): the usage text says "train LSTM" although this script
    # drives ``trf.model`` -- presumably copied from an LSTM script; confirm
    # before rewording.  Execution continues after printing, but with no
    # flag on the command line none of the stages below is selected.
    if len(sys.argv) == 1:
        print('\"python run.py -train\" train LSTM\n',
              '\"python run.py -rescore\" rescore nbest\n',
              '\"python run.py -wer\" compute WER')

    for tsize in [1]:
        bindir = '../../tools/trf/bin/'
        tskdir = '{}/'.format(tsize)
        workdir = tskdir + 'trflm/'

        fres = wb.FRes('result.txt')
        model = trf.model(bindir, workdir)

        # Model / training hyper-parameters.
        class_num = 200
        train = workdir + 'train.id'
        valid = workdir + 'valid.id'
        test = workdir + 'test.id'
        vocab = workdir + 'vocab_c{}.list'.format(class_num)
        order = 4
        feat = 'g4_w_c_ws_cs_wsh_csh_tied.fs'
        # alternative feature file:
        #feat = 'g4_w_c_ws_cs_cpw.fs'
        maxlen = 100
        tmax = 50000
        t0 = 2000
        minibatch = 100
        gamma_lambda = '1000,0'
        gamma_zeta = '0,0.6'
        reg = 1e-6
        thread = 8

        # Model path prefix (feature file name without its '.fs' suffix) and
        # the key under which results are stored in the result file.
        write_model = workdir + 'trf_c{}_{}'.format(class_num, feat[0:-3])
        write_name = '{}:{}'.format(tsize, os.path.split(write_model)[1])

        if '-class' in sys.argv:
            # Only run the preparation (clustering) step for this task.
            corpus = data(tskdir)
            model.prepare(corpus[0], corpus[1], corpus[2], class_num)
        if '-train' in sys.argv or '-all' in sys.argv:
            config = '-vocab {} -train {} -valid {} -test {} '.format(
                vocab, train, valid, test)
            config += ' -order {} -feat {} '.format(order, feat)
            config += ' -len {} '.format(maxlen)
            config += ' -write {0}.model -log {0}.log '.format(write_model)
            config += ' -t0 {} -iter {}'.format(t0, tmax)
            config += ' -gamma-lambda {} -gamma-zeta {}'.format(
                gamma_lambda, gamma_zeta)
            config += ' -L2 {} '.format(reg)
            config += ' -mini-batch {} '.format(minibatch)
            config += ' -thread {} '.format(thread)
            config += ' -print-per-iter 10 -write-at-iter 10000:10000:{}'.format(
                tmax)
            # One lookup of the task data serves both the preparation step
            # and the result record below.
            corpus = data(tskdir)
            model.prepare(corpus[0], corpus[1], corpus[2], class_num)
            model.train(config)
            # Record what the training log reports at its end.
            LL = model.get_last_value(write_model + '.log')
            fres.AddLL(write_name, LL, corpus[0:3])
        if '-plot' in sys.argv:
            baseline = fres.Get('{}:KN5'.format(tsize))
            trf.PlotLog([write_model], [baseline])
        if '-rescore' in sys.argv or '-all' in sys.argv:
            config = ' -vocab {} '.format(vocab)
            config += ' -read {}.model '.format(write_model)
            config += ' -nbest {} '.format(data(tskdir)[3])
            config += ' -lmscore {0}.lmscore -lmscore-test-id {0}.test-id '.format(
                write_model)
            model.use(config)
        if '-wer' in sys.argv or '-all' in sys.argv:
            # Only the n-best list, transcript and acoustic scores come from
            # the task data; the LM scores are the ones the -rescore stage
            # writes next to the model.
            read_nbest, read_templ, read_acscore = data(tskdir)[3:6]
            read_lmscore = write_model + '.lmscore'

            wer, lmscale, acscale = wb.TuneWER(read_nbest, read_templ,
                                               wb.LoadScore(read_lmscore),
                                               wb.LoadScore(read_acscore),
                                               np.linspace(0.1, 0.9, 9))
            print('wer={:.4f} lmscale={:.2f} acscale={:.2f}'.format(
                wer, lmscale, acscale))

            # Convert the transcript to ids and evaluate the model on it.
            write_templ_id = workdir + os.path.split(read_templ)[1] + '.id'
            v = trf.ReadVocab(vocab)
            trf.NbestToID(read_templ, write_templ_id, v)
            config = ' -vocab {} '.format(vocab)
            config += ' -read {}.model '.format(write_model)
            config += ' -test {} '.format(write_templ_id)
            LL_templ = model.use(config)
            PPL_templ = wb.LL2PPL(-LL_templ, write_templ_id)

            # Store the results.
            fres.Add(write_name, ['LL-wsj', 'PPL-wsj'], [LL_templ, PPL_templ])
            fres.AddWER(write_name, wer)
Beispiel #9
0
    trans = root + 'transcript.txt'
    ac = root + '1000best.acscore'
    lm = root + '1000best.lmscore'
    return data_verfy([nbest, trans, ac, lm])


if __name__ == '__main__':
    print(sys.argv)
    if len(sys.argv) == 1:
        print(' \"python run_rnn.py -train\"   train rnn\n '
              ' \"python run_rnn.py -test\"    get the ppl\n '
              ' \"python run_rnn.py -rescore\" rescore nbest\n'
              ' \"python run_rnn.py -wer\"         compute WER'
              )
    absdir = os.getcwd() + '/'
    fres = wb.FRes('result.txt')

    for tsize in [1, 2, 4]:
        bindir = '../../tools/rnn/rnnlm-0.3e/'
        tskdir = absdir + '{}/'.format(tsize)
        workdir = tskdir + 'rnnlm/'
        model = rnn.model(bindir, workdir)

        hidden = 250
        cnum = 1
        bptt = 4
        write_model = workdir + 'h{}_c{}_bptt{}.rnn'.format(hidden, cnum, bptt)
        write_name = '{}:RNN:'.format(tsize) + os.path.split(write_model)[1][0:-4]

        if '-train' in sys.argv or '-all' in sys.argv:
            if not os.path.exists(write_model):
Beispiel #10
0
def main():
    """Drive TRF training, n-best rescoring and WER evaluation.

    Stages are chosen by flags in ``sys.argv``: ``-train``, ``-rescore`` and
    ``-wer`` (each also enabled by ``-all``), plus ``-plot`` which is only
    run when given explicitly.  Training continues from the model whose
    name is ``write_model`` without its trailing ``_2``.  Rescoring and WER
    computation iterate over every entry of ``nbest_type_list`` and, for
    rescoring, over the dt05/et05 x real/simu n-best sets.
    """
    if len(sys.argv) == 1:
        print('"python run.py -train" train LSTM\n',
              '"python run.py -rescore" rescore nbest\n',
              '"python run.py -wer" compute WER')

    def stage_enabled(flag):
        # A stage runs when its own flag or the catch-all '-all' is present.
        return flag in sys.argv or '-all' in sys.argv

    bindir = '../../tools/trf/bin/'
    workdir = 'trflm/'
    fres = wb.FRes('models_ppl.txt')
    model = trf.model(bindir, workdir)

    nbest_root = 'data/nbest/'
    nbest_type_list = ['nbest_mvdr_single_heq_multi']

    # Hyper-parameters and file locations.
    n_classes = 200
    train_ids = workdir + 'train.id'
    valid_ids = workdir + 'valid.id'
    test_ids = workdir + 'test.id'
    vocab = workdir + 'vocab_c{}.list'.format(n_classes)
    order = 4
    feat = 'g4_w_c_ws_cs_wsh_csh_tied.fs'
    # alternative feature file: 'g4_w_c_ws_cs_wsh_csh.fs'
    maxlen = 0
    tmax = 20000
    t0 = 0
    minibatch = 100
    gamma_lambda = '3000,0'
    gamma_zeta = '0,0.6'
    reg = 1e-6
    thread = 8

    # Path prefix of the model written by this run, and its bare file name.
    write_model = workdir + 'trf_c{}_{}_2'.format(n_classes, feat[:-3])
    model_name = os.path.basename(write_model)

    if stage_enabled('-train'):
        train_args = ''.join([
            '-vocab {} -train {} -valid {} -test {} '.format(
                vocab, train_ids, valid_ids, test_ids),
            # start from the model named without the trailing '_2'
            ' -read {}.model'.format(write_model[:-2]),
            ' -order {} -feat {} '.format(order, feat),
            ' -len {} '.format(maxlen),
            ' -write {0}.model -log {0}.log '.format(write_model),
            ' -t0 {} -iter {}'.format(t0, tmax),
            ' -gamma-lambda {} -gamma-zeta {}'.format(gamma_lambda,
                                                      gamma_zeta),
            ' -L2 {} '.format(reg),
            ' -mini-batch {} '.format(minibatch),
            ' -thread {} '.format(thread),
            ' -print-per-iter 10 ',
            # output the intermediate models
            ' -write-at-iter [{}:10000:{}]'.format(tmax - 30000, tmax),
        ])
        model.prepare('data/train', 'data/valid', 'data/valid', n_classes)
        model.train(train_args)

    if '-plot' in sys.argv:
        trf.PlotLog([write_model], [fres.Get('KN5')])

    if stage_enabled('-rescore'):
        for nbest_type in nbest_type_list:
            nbest_dir = '{}{}/'.format(nbest_root, nbest_type)
            for subset in ['dt05', 'et05']:
                for condition in ['real', 'simu']:
                    tsk = 'nbestlist_{}_{}'.format(subset, condition)
                    write_dir = '{}{}/{}/'.format(workdir, nbest_type, tsk)
                    wb.mkdir(write_dir)
                    print('{} : {}'.format(nbest_type, tsk))
                    print('  write -> {}'.format(write_dir))

                    score_prefix = write_dir + model_name
                    filled_nbest = score_prefix + '.nbest'
                    # fill the empty lines
                    process_nbest(nbest_dir + tsk + '/words_text',
                                  filled_nbest)

                    rescore_args = ''.join([
                        ' -vocab {} '.format(vocab),
                        ' -read {}.model '.format(write_model),
                        ' -nbest {} '.format(filled_nbest),
                        ' -lmscore {0}.lmscore -lmscore-test-id {0}.test-id '
                        .format(score_prefix),
                    ])
                    model.use(rescore_args)

    if stage_enabled('-wer'):
        for nbest_type in nbest_type_list:
            nbest_dir = '{}{}/'.format(nbest_root, nbest_type)
            # Score-file templates per LM; '<tsk>' is left as a literal
            # placeholder in each path.
            lmpaths = {
                'KN5': nbest_dir + '<tsk>/lmwt.lmonly',
                'RNN': nbest_dir + '<tsk>/lmwt.rnn',
                'LSTM': 'lstm/{}/<tsk>/lmwt.lstm'.format(nbest_type),
                'TRF': '{}{}/<tsk>/{}.lmscore'.format(workdir, nbest_type,
                                                      model_name),
            }
            # Other combinations used previously: 'LSTM', 'RNN+KN5',
            # 'LSTM+KN5', 'LSTM+TRF'.
            lmtypes = ['TRF', 'RNN', 'KN5', 'RNN+TRF']
            wer_workdir = 'wer/' + nbest_type + '/'
            print('wer_workdir = ' + wer_workdir)
            wer.wer_all(wer_workdir, nbest_dir, lmpaths, lmtypes)
            tuned = wer.wer_tune(wer_workdir)
            wer.wer_print(wer_workdir, tuned)
Beispiel #11
0
    lmpaths = {
        'KN5': 'ngramlm/5gram.lmscore',
        'RNN': 'rnnlm/h250_c1_bptt5.run0.lmscore',
        'LSTM': 'lstmlm/h250_dropout0_epoch10.run0.lmscore',
        'TRF': 'trflm/trf_c200_g4_w_c_ws_cs_wsh_csh_tied.<run>.lmscore'
    }
    lmtypes = [
        'KN5', 'RNN', 'LSTM', 'TRF', 'RNN+KN5', 'RNN+TRF', 'LSTM+KN5',
        'LSTM+TRF'
    ]
    outlog = 'wer.log'

    if not os.path.exists(outlog):
        wer_all(lmpaths, lmtypes, outlog)

    fres = wb.FRes(outlog, True)

    lmwers = dict()
    with open(outlog, 'rt') as f:
        f.readline()
        for a in [line.split() for line in f]:
            if a[0].find('[all]') != -1:
                break
            type = a[0].split(':')[0]
            wer_vec = lmwers.setdefault(type, [])
            wer_vec.append(float(a[1]))

    for type in lmtypes:
        wer_vec = lmwers[type]
        wer_mean = np.mean(wer_vec)
        wer_std = np.std(wer_vec)
Beispiel #12
0
            a = line.lower().split()
            f2.write(' '.join(a[skip:]) + '\n')


if __name__ == '__main__':
    print(sys.argv)
    if len(sys.argv) == 1:
        print('\"python run.py -train\" train rnn\n',
              '\"python run.py -rescore\" rescore nbest\n',
              '\"python run.py -wer\" compute WER')

    run_times = range(0, 10)  # revise this to control the run times

    bindir = '../../tools/rnn/rnnlm-0.3e/'
    workdir = 'rnnlm/'
    fres = wb.FRes('models_ppl.txt')
    model = rnn.model(bindir, workdir)

    hidden = 250
    cnum = 1
    bptt = 5

    for runnum in run_times:
        write_model = workdir + 'h{}_c{}_bptt{}.run{}.rnn'.format(
            hidden, cnum, bptt, runnum)  # the write model name
        write_name = 'RNN:h{}c{}bptt{}:run{}'.format(
            hidden, cnum, bptt, runnum)  # the name in res file.

        if '-train' in sys.argv or '-all' in sys.argv:
            if not os.path.exists(write_model):
                config = ' -rnnlm {} '.format(write_model)