Esempio n. 1
0
def main():
    """Build train/valid/test text and a unigram vocabulary for several task sizes.

    For each size in (1, 2, 4) a directory ``<size>/data`` is created and
    filled with the training text, a 20000-entry-capped vocabulary count file
    and vocabulary-filtered train/valid/test files.  The size is forwarded
    unchanged to ``GetTrainTxt``; its exact meaning is defined there.
    """
    corpus_root = 'data/1-billion/'
    heldout_prefix = corpus_root + 'heldout-monolingual.tokenized.shuffled/news.en.heldout-'
    train_src_dir = corpus_root + 'training-monolingual.tokenized.shuffled/'
    valid_src = heldout_prefix + '00000-of-00050'
    test_src = heldout_prefix + '00001-of-00050'

    for size in (1, 2, 4):
        print('tsk = {}'.format(size))
        task_root = '{}/'.format(size)
        data_prefix = task_root + 'data/'
        wb.mkdir(task_root)
        wb.mkdir(task_root + 'data')

        full_train = data_prefix + 'train.txt.all'

        # Collect the raw training text for this task size.
        GetTrainTxt(train_src_dir, full_train, size)

        # Count the vocabulary, then truncate it.
        vocab = dict()
        GetVocab(full_train, vocab)
        CutVocab(vocab, 20000 - 1)  # leave a space of <unk>
        WriteVocab(data_prefix + 'train.unigram', vocab)

        # Filter every split against the truncated vocabulary.
        for src, dst in ((full_train, data_prefix + 'train.txt'),
                         (valid_src, data_prefix + 'valid.txt'),
                         (test_src, data_prefix + 'test.txt')):
            CutTxt(src, dst, vocab)
Esempio n. 2
0
def main():
    """Train 2- to 5-gram language models when ``-train`` is on the command line.

    Prints the arguments (and a usage hint when none are given), creates the
    ``ngramlm/`` work directory under the current directory, and for every
    order prints the model path.  With ``-train`` the data are prepared once,
    before the first order, and each model is trained with results going to
    ``models_ppl.txt``.
    """
    print(sys.argv)
    if len(sys.argv) == 1:
        print('\"python run_ngram.py -train\" train \n',
              '\"python run_ngram.py -rescore\" rescore nbest\n',
              '\"python run_ngram.py -wer\" compute WER')

    here = os.getcwd() + '/'
    srilm_dir = here + '../../tools/srilm/'
    work = here + 'ngramlm/'
    wb.mkdir(work)

    corpus = [here + rel for rel in data()]
    ppl_report = here + 'models_ppl.txt'  # the result file
    lm = ngram.model(srilm_dir, work)

    for position, order in enumerate([2, 3, 4, 5]):
        lm_path = work + '{}gram.lm'.format(order)
        print(lm_path)

        if '-train' not in sys.argv:
            continue
        if position == 0:
            # The data only need preparing once, before the first order.
            lm.prepare(corpus[0], corpus[1], corpus[2])
        lm.train(order, lm_path, ppl_report)
Esempio n. 3
0
def rescore_all(workdir, nbestdir, config):
    """Rescore the n-best list of every task with the LSTM model.

    Iterates over the four tasks ``nbestlist_{dt05,et05}_{real,simu}``.  For
    each one the n-best text ``<nbestdir><task>/words_text`` is rescored and
    the LM scores are written to
    ``<workdir><nbest-set>/<task>/lmwt.lstm``, where ``<nbest-set>`` is the
    second-to-last ``/``-separated component of ``nbestdir``.

    Args:
        workdir: Output prefix, also passed to ``lstm.rescore``.  It is
            concatenated directly, so it presumably ends with ``/``.
        nbestdir: Input prefix holding one sub-directory per task.
        config: Configuration forwarded unchanged to ``lstm.rescore``.
    """
    for dataset in ['dt05', 'et05']:
        for condition in ['real', 'simu']:
            task = 'nbestlist_{}_{}'.format(dataset, condition)
            print('process ' + task)

            source_txt = nbestdir + task + '/words_text'
            target_dir = workdir + nbestdir.split('/')[-2] + '/' + task + '/'
            wb.mkdir(target_dir)

            lstm.rescore(workdir, source_txt, target_dir + 'lmwt.lstm', config)
Esempio n. 4
0
def wer_all(workdir, nbestdir, lmpaths, lmtypes):
    """Compute the WER of every task for each LM type and each LM scale.

    For each of the four tasks ``nbestlist_{dt05,et05}_{real,simu}`` the
    acoustic scores (``acwt``) and the ``lmwt.nolm`` scores are loaded, the
    ``<UNK>``-stripped n-best list is written to the work directory, and for
    every LM type and every LM scale in 9..15 the best hypotheses are selected
    with ``acscore + lmscale * (lmscore + gfscore)`` and compared against the
    transcript.  Results are printed and appended to ``<task>/wer.txt``.

    Args:
        workdir: Output prefix.  It is concatenated directly with task names,
            so it presumably ends with ``/``.
        nbestdir: Input prefix holding one sub-directory per task.
        lmpaths: Mapping from LM name to score-file path; every ``<tsk>`` in a
            path is replaced by the task name.
        lmtypes: Iterable of LM type strings, either a single LM name or two
            names joined by ``+`` (equal-weight average of both scores).

    Raises:
        ValueError: If an LM type has more than two ``+``-separated parts.
            Previously this left ``lmscore`` unset or silently reused the
            scores of the preceding LM type.
    """
    wb.mkdir(workdir)
    tasks = ['nbestlist_{}_{}'.format(a, b)
             for a in ['dt05', 'et05'] for b in ['real', 'simu']]

    # calculate the wer for each task, each lmscale, each combination
    for tsk in tasks:
        print(tsk)
        wb.mkdir(workdir + tsk)

        # The context manager closes the report even if a later step raises.
        with open(workdir + tsk + '/wer.txt', 'wt') as fwer:
            read_nbest_txt = nbestdir + tsk + '/words_text'
            read_transcript = nbestdir + tsk + '/text'
            read_acscore = nbestdir + tsk + '/acwt'
            read_gfscore = nbestdir + tsk + '/lmwt.nolm'

            # remove the <UNK> in nbest
            read_nbest_rmunk = workdir + tsk + '/words_text_rmunk'
            nbest_rmUNK(read_nbest_txt, read_nbest_rmunk)
            # load score
            acscore = np.array(wb.LoadScore(read_acscore))
            gfscore = np.array(wb.LoadScore(read_gfscore))
            # load label
            score_label = wb.LoadLabel(read_acscore)

            for lmtype in lmtypes:
                parts = lmtype.split('+')
                if len(parts) == 1:
                    lmscore = np.array(
                        wb.LoadScore(lmpaths[parts[0]].replace('<tsk>', tsk)))
                elif len(parts) == 2:
                    s1 = wb.LoadScore(lmpaths[parts[0]].replace('<tsk>', tsk))
                    s2 = wb.LoadScore(lmpaths[parts[1]].replace('<tsk>', tsk))
                    lmscore = 0.5 * np.array(s1) + 0.5 * np.array(s2)
                else:
                    raise ValueError(
                        'unsupported lmtype {!r}: expected "A" or "A+B"'.format(lmtype))

                # write lmscore
                wb.WriteScore(workdir + tsk + '/' + lmtype + '.lmscore',
                              lmscore, score_label)

                for lmscale in np.linspace(9, 15, 7):
                    write_best = workdir + tsk + '/{}_lmscale={}.best'.format(lmtype, lmscale)
                    combined = (acscore + lmscale * (lmscore + gfscore)).tolist()
                    wb.GetBest(read_nbest_rmunk, combined, write_best)
                    [err, num, wer] = wb.CmpWER(write_best, read_transcript)
                    # The best-hypothesis file is only a temporary for CmpWER.
                    os.remove(write_best)
                    s = '{} wer={:.2f} err={} num={} lmscale={}'.format(
                        lmtype, wer, err, num, lmscale)
                    print('  ' + s)
                    fwer.write(s + '\n')
                    fwer.flush()
Esempio n. 5
0
def main():
    """Train, rescore with and evaluate 2- to 5-gram models, driven by flags.

    Flags read from ``sys.argv``:
        -train    prepare the data once, then train the model of each order.
        -rescore  rescore the n-best list and write ``<order>gram.lmscore``.
        -wer      tune the WER over interpolation scales 0.1..0.9, then record
                  the WER, log-likelihood and perplexity in ``models_ppl.txt``.

    All paths are taken relative to the current working directory; the entries
    of ``data()`` are used positionally (0-2 for preparation, 3 n-best list,
    4 transcript template, 5 acoustic scores).
    """
    print(sys.argv)
    if len(sys.argv) == 1:
        print('\"python run_ngram.py -train\" train \n',
              '\"python run_ngram.py -rescore\" rescore nbest\n',
              '\"python run_ngram.py -wer\" compute WER')

    here = os.getcwd() + '/'
    srilm_dir = here + '../../tools/srilm/'
    work = here + 'ngramlm/'
    wb.mkdir(work)

    corpus = [here + rel for rel in data()]
    ppl_report = here + 'models_ppl.txt'  # the result file
    lm = ngram.model(srilm_dir, work)

    for position, order in enumerate([2, 3, 4, 5]):
        lm_path = work + '{}gram.lm'.format(order)
        print(lm_path)

        # Same stem as the model file, with '.lm' swapped for '.lmscore'.
        score_path = lm_path[0:-3] + '.lmscore'
        label = 'KN{}'.format(order)

        if '-train' in sys.argv:
            if position == 0:
                # The data only need preparing once, before the first order.
                lm.prepare(corpus[0], corpus[1], corpus[2])
            lm.train(order, lm_path, here + 'models_ppl.txt')

        if '-rescore' in sys.argv:
            lm.rescore(lm_path, order, corpus[3], score_path)

        if '-wer' not in sys.argv:
            continue

        nbest, templ = corpus[3:5]
        lm_scores = wb.LoadScore(score_path)
        ac_scores = wb.LoadScore(corpus[5])

        best_wer, lmscale, acscale = wb.TuneWER(nbest, templ, lm_scores,
                                                ac_scores,
                                                np.linspace(0.1, 0.9, 9))
        print('wer={} lmscale={} acscale={}'.format(best_wer, lmscale, acscale))
        report = wb.FRes(ppl_report)
        report.AddWER(label, best_wer)

        # Perplexity and log-likelihood on the label-stripped transcript.
        plain_transcript = work + os.path.split(templ)[-1] + '.txt'
        wb.file_rmlabel(templ, plain_transcript)
        perplexity = lm.ppl(lm_path, order, plain_transcript)
        loglik = -wb.PPL2LL(perplexity, plain_transcript)
        report.Add(label, ['LL-wsj', 'PPL-wsj'], [loglik, perplexity])
def main(config, tap=True):
    """Rebuild the distribution tree named by ``config['DIST']``.

    The tree is wiped and recreated, then filled with openvpn.exe and its
    manifest, the LZO and OpenSSL DLLs and the MSVC CRT.  When ``tap`` is
    true, the per-architecture TAP driver files and tapinstall.exe are copied
    into ``amd64`` and ``i386`` sub-directories as well.

    Args:
        config: Mapping providing 'DIST', 'LZO_DIR', 'OPENSSL_DIR' and
            'MSVC_CRT'.
        tap: Whether to include the TAP driver and tapinstall.exe.
    """
    dist = config['DIST']
    assert dist
    dist = home_fn(dist)
    bin_dir, i386_dir, amd64_dir = [
        os.path.join(dist, sub) for sub in ('bin', 'i386', 'amd64')
    ]

    # build dist and subdirectories
    rm_rf(dist)
    for new_dir in (dist, bin_dir):
        mkdir(new_dir)
    if tap:
        mkdir(i386_dir)
        mkdir(amd64_dir)

    # copy openvpn.exe and manifest
    for rel in ('openvpn.exe', 'openvpn.exe.manifest'):
        cp(home_fn(rel), bin_dir)

    # copy DLL dependencies
    cp(home_fn(config['LZO_DIR'] + '/bin/lzo2.dll'), bin_dir)
    for dll in ('libeay32.dll', 'ssleay32.dll'):
        cp(home_fn(config['OPENSSL_DIR'] + '/bin/' + dll), bin_dir)

    # copy MSVC CRT
    cp_a(home_fn(config['MSVC_CRT']), bin_dir)

    if not tap:
        return

    # copy TAP drivers: only files directly inside each architecture directory
    driver_exts = ('.inf', '.cat', '.sys')
    for arch, target in (('amd64', amd64_dir), ('i386', i386_dir)):
        src_dir = home_fn(os.path.join('tap-win32', arch))
        top_level = next(os.walk(src_dir), None)
        if top_level is None:
            continue
        for fname in top_level[2]:
            if os.path.splitext(fname)[1] in driver_exts:
                cp(os.path.join(src_dir, fname), target)

    # copy tapinstall: the containing directory name selects the architecture
    arch_targets = {'amd64': amd64_dir, 'i386': i386_dir}
    for dirpath, _subdirs, filenames in os.walk(home_fn('tapinstall')):
        for fname in filenames:
            if fname != 'tapinstall.exe':
                continue
            arch = os.path.basename(dirpath)
            if arch in arch_targets:
                cp(os.path.join(dirpath, fname), arch_targets[arch])
Esempio n. 7
0
def main(config, tap=True):
    """Create a fresh distribution directory and copy the build outputs into it.

    ``config['DIST']`` names the directory; it is removed and recreated with a
    ``bin`` sub-directory holding openvpn.exe, its manifest, the LZO/OpenSSL
    DLLs and the MSVC CRT.  With ``tap`` set, ``i386`` and ``amd64``
    sub-directories receive the TAP driver files and tapinstall.exe.

    Args:
        config: Mapping providing 'DIST', 'LZO_DIR', 'OPENSSL_DIR' and
            'MSVC_CRT'.
        tap: Whether to include the TAP driver and tapinstall.exe.
    """
    dist = config['DIST']
    assert dist
    dist = home_fn(dist)
    out_bin = os.path.join(dist, 'bin')
    out_i386 = os.path.join(dist, 'i386')
    out_amd64 = os.path.join(dist, 'amd64')

    # build dist and subdirectories
    rm_rf(dist)
    mkdir(dist)
    mkdir(out_bin)
    if tap:
        mkdir(out_i386)
        mkdir(out_amd64)

    # copy openvpn.exe and manifest
    cp(home_fn('openvpn.exe'), out_bin)
    cp(home_fn('openvpn.exe.manifest'), out_bin)

    # copy DLL dependencies
    lzo_dll = home_fn(config['LZO_DIR'] + '/bin/lzo2.dll')
    cp(lzo_dll, out_bin)
    libeay_dll = home_fn(config['OPENSSL_DIR'] + '/bin/libeay32.dll')
    cp(libeay_dll, out_bin)
    ssleay_dll = home_fn(config['OPENSSL_DIR'] + '/bin/ssleay32.dll')
    cp(ssleay_dll, out_bin)

    # copy MSVC CRT
    cp_a(home_fn(config['MSVC_CRT']), out_bin)

    if not tap:
        return

    # copy TAP drivers
    for arch, target in (('amd64', out_amd64), ('i386', out_i386)):
        driver_root = home_fn(os.path.join('tap-win32', arch))
        for _dirpath, _subdirs, names in os.walk(driver_root):
            for name in names:
                _stem, suffix = os.path.splitext(name)
                if suffix not in ('.inf', '.cat', '.sys'):
                    continue
                cp(os.path.join(driver_root, name), target)
            # Only the top-level directory is of interest.
            break

    # copy tapinstall
    by_arch = {'amd64': out_amd64, 'i386': out_i386}
    for dirpath, _subdirs, names in os.walk(home_fn('tapinstall')):
        arch = os.path.basename(dirpath)
        for name in names:
            if name == 'tapinstall.exe' and arch in by_arch:
                cp(os.path.join(dirpath, name), by_arch[arch])
Esempio n. 8
0
 def __init__(self, bindir, workdir):
     """Store the work and binary directories and create the work directory.

     Both paths are passed through ``wb.folder`` before being stored
     (presumably a path normalization -- confirm in ``wb``); the directory is
     created from the ``workdir`` argument as given.
     """
     for attr, path in (('workdir', workdir), ('bindir', bindir)):
         setattr(self, attr, wb.folder(path))
     wb.mkdir(workdir)
Esempio n. 9
0
def main(config, tap=True):
    """Assemble the OpenVPN Windows distribution directory.

    The directory named by ``config['DIST']`` is wiped and rebuilt with
    ``bin``, ``i386``, ``amd64`` and ``samples`` sub-directories.  Executables,
    DLL dependencies, sample configuration files and the MSVC CRT are copied
    in, manifests are embedded with mt.exe, and finally the TAP driver files
    and tapinstall.exe are copied per architecture.

    Args:
        config: Mapping providing 'DIST', 'OPENVPN_GUI_DIR', 'OPENVPN_GUI',
            'LZO_DIR', 'OPENSSL_DIR', 'PKCS11_HELPER_DIR', 'MSVC_CRT' and,
            when ``tap`` is false, 'TAP_PREBUILT'.
        tap: True when the TAP driver and tapinstall.exe were built locally;
            otherwise the prebuilt ones under ``config['TAP_PREBUILT']`` are
            used.

    Exits the process with status 1 when neither a built nor a prebuilt TAP
    driver is available.
    """
    dist = config['DIST']
    assert dist
    dist = home_fn(dist)
    # Local names avoid shadowing the builtins bin(), dir() and type().
    bin_dir = os.path.join(dist, 'bin')
    i386 = os.path.join(dist, 'i386')
    amd64 = os.path.join(dist, 'amd64')
    samples = os.path.join(dist, 'samples')

    # build dist and subdirectories
    rm_rf(dist)
    mkdir(dist)
    mkdir(bin_dir)
    mkdir(i386)
    mkdir(amd64)
    mkdir(samples)

    # copy openvpn.exe, openvpnserv.exe and their manifests
    cp(home_fn('openvpn.exe'), bin_dir)
    cp(home_fn('openvpn.exe.manifest'), bin_dir)
    cp(home_fn('service-win32/openvpnserv.exe'), bin_dir)
    cp(home_fn('service-win32/openvpnserv.exe.manifest'), bin_dir)

    # copy openvpn-gui
    cp(home_fn(config['OPENVPN_GUI_DIR'] + "/" + config['OPENVPN_GUI']), bin_dir)

    # copy DLL dependencies
    cp(home_fn(config['LZO_DIR'] + '/bin/lzo2.dll'), bin_dir)
    cp(home_fn(config['LZO_DIR'] + '/bin/lzo2.dll.manifest'), bin_dir)
    cp(home_fn(config['OPENSSL_DIR'] + '/bin/libeay32.dll'), bin_dir)
    cp(home_fn(config['OPENSSL_DIR'] + '/bin/ssleay32.dll'), bin_dir)
    cp(home_fn(config['PKCS11_HELPER_DIR'] + '/lib/libpkcs11-helper-1.dll'), bin_dir)
    cp(home_fn(config['PKCS11_HELPER_DIR'] + '/lib/libpkcs11-helper-1.dll.manifest'), bin_dir)

    # copy OpenSSL utilities (openssl.exe)
    cp(home_fn(config['OPENSSL_DIR'] + '/bin/openssl.exe'), bin_dir)

    # copy sample config files; renaming is necessary due to openvpn.nsi script
    cp(home_fn('install-win32/sample.ovpn'), samples)
    cp(home_fn('sample-config-files/client.conf'), samples)
    cp(home_fn('sample-config-files/server.conf'), samples)
    rename(os.path.join(samples, 'client.conf'), os.path.join(samples, 'client.ovpn'))
    rename(os.path.join(samples, 'server.conf'), os.path.join(samples, 'server.ovpn'))

    # embed manifests to executables and DLLs
    for f in ["openvpn.exe", "openvpnserv.exe", "lzo2.dll", "libpkcs11-helper-1.dll"]:

        outputresource = os.path.join(bin_dir, f)
        manifest = outputresource + ".manifest"

        # EXEs and DLLs require slightly different treatment:
        # the last field of -outputresource is "1" for EXEs and "2" for DLLs.
        if f.endswith(".exe"):
            resource_id = "1"
        elif f.endswith(".dll"):
            resource_id = "2"
        else:
            # Single-argument print() behaves the same on Python 2 and 3.
            print("ERROR: Could not embed manifest to " + outputresource + ", bailing out.")
            sys.exit(1)

        # Embed the manifest
        run_in_vs_shell('mt.exe -manifest %s -outputresource:%s;%s' % (manifest, outputresource, resource_id))

    # copy MSVC CRT
    cp_a(home_fn(config['MSVC_CRT']), bin_dir)

    # TAP-driver and tapinstall.exe were built, so copy those over
    if tap:
        drv_dir = 'tap-win32'
        ti_dir = 'tapinstall'

    # we're using prebuilt TAP-driver and tapinstall.exe
    elif 'TAP_PREBUILT' in config:
        drv_dir = config['TAP_PREBUILT']
        ti_dir = config['TAP_PREBUILT']

    else:
        print("ERROR: Could not find prebuilt TAP-drivers or tapinstall.exe. Please check win/settings.in")
        sys.exit(1)

    # copy TAP drivers (only files directly inside each architecture directory)
    for dir_name, dest in (('amd64', amd64), ('i386', i386)):
        src_dir = home_fn(os.path.join(drv_dir, dir_name))
        for dirpath, dirnames, filenames in os.walk(src_dir):
            for f in filenames:
                root, ext = os.path.splitext(f)
                if ext in ('.inf', '.cat', '.sys'):
                    cp(os.path.join(src_dir, f), dest)
            break

    # Copy tapinstall.exe (usually known as devcon.exe)
    dest = {'amd64': amd64, 'i386': i386}
    for dirpath, dirnames, filenames in os.walk(home_fn(ti_dir)):
        for f in filenames:
            if f in ('devcon.exe', 'tapinstall.exe'):
                dir_name = os.path.basename(dirpath)
                # Check the architecture first: indexing dest with an
                # unrelated directory name would raise KeyError.
                if dir_name in dest:
                    src = os.path.join(dirpath, f)
                    dst = os.path.join(dest[dir_name], 'tapinstall.exe')
                    cp(src, dst, dest_is_dir=False)
Esempio n. 10
0
 def __init__(self, workdir):
     """Store the work directory, start with no network, create the directory.

     The stored path is the result of ``wb.folder(workdir)`` (presumably a
     path normalization -- confirm in ``wb``); the directory is created from
     the ``workdir`` argument as given.
     """
     stored_dir = wb.folder(workdir)
     self.workdir = stored_dir
     self.net = None
     wb.mkdir(workdir)
Esempio n. 11
0
        write_lmscore = outdir + 'lmwt.lstm'
        lstm.rescore(workdir, nbest_txt, write_lmscore, config)


if __name__ == '__main__':
    print(sys.argv)
    if len(sys.argv) == 1:
        print(
        ' \"python run.py -train\" train LSTM\n \"python run.py -rescore\" rescore nbest\n \"python run.py -wer\" compute WER')

    absdir = os.getcwd() + '/'
    train = absdir + 'data/train'
    valid = absdir + 'data/valid'
    nbestdir = absdir + 'data/nbest/nbest_mvdr_single_heq_multi/'
    workdir = absdir + 'lstmlm/'
    wb.mkdir(workdir)
    os.chdir('../../tools/lstm/')

    config = '-hidden 500 -epoch 10 -dropout 0 -gpu 2'

    if '-train' in sys.argv:
        lstm.train(workdir, train, valid, valid, config)
    if '-test' in sys.argv:
        lstm.ppl(workdir, train, config)
        lstm.ppl(workdir, valid, config)
    if '-rescore' in sys.argv:
        rescore_all(workdir, nbestdir, config)
    if '-wer' in sys.argv:
        lmpaths = {'KN5': nbestdir + '<tsk>/lmwt.lmonly',
                   'RNN': nbestdir + '<tsk>/lmwt.rnn',
                   'LSTM': workdir + nbestdir.split('/')[-2] + '/<tsk>/lmwt.lstm',
Esempio n. 12
0
def main():
    """Run the TRF language-model experiment stages selected on the command line.

    Flags read from ``sys.argv`` (``-all`` enables train, rescore and wer):
        -train    prepare the data and train the TRF model.
        -plot     plot the training log against the 'KN5' baseline result.
        -rescore  rescore the n-best list of every task with the model.
        -wer      compute, tune and print the WER for several LM types.
    """
    if len(sys.argv) == 1:
        print('\"python run.py -train\" train LSTM\n',
              '\"python run.py -rescore\" rescore nbest\n',
              '\"python run.py -wer\" compute WER')

    def selected(flag):
        # A stage runs when its own flag or '-all' is present.
        return flag in sys.argv or '-all' in sys.argv

    bindir = '../../tools/trf/bin/'
    workdir = 'trflm/'
    fres = wb.FRes('models_ppl.txt')
    model = trf.model(bindir, workdir)

    nbest_root = 'data/nbest/'
    nbest_sets = ['nbest_mvdr_single_heq_multi']

    # Model and training hyper-parameters.
    class_num = 200
    train_ids = workdir + 'train.id'
    valid_ids = workdir + 'valid.id'
    test_ids = workdir + 'test.id'
    vocab = workdir + 'vocab_c{}.list'.format(class_num)
    order = 4
    feat = 'g4_w_c_ws_cs_wsh_csh_tied.fs'
    # alternative feature file: 'g4_w_c_ws_cs_wsh_csh.fs'
    maxlen = 0
    tmax = 20000
    t0 = 0
    minibatch = 100
    gamma_lambda = '3000,0'
    gamma_zeta = '0,0.6'
    reg = 1e-6
    thread = 8

    write_model = workdir + 'trf_c{}_{}_2'.format(class_num, feat[0:-3])
    model_name = os.path.basename(write_model)

    if selected('-train'):
        config = ''.join([
            '-vocab {} -train {} -valid {} -test {} '.format(
                vocab, train_ids, valid_ids, test_ids),
            # Reads the model whose name lacks the trailing two characters.
            ' -read {}.model'.format(write_model[0:-2]),
            ' -order {} -feat {} '.format(order, feat),
            ' -len {} '.format(maxlen),
            ' -write {0}.model -log {0}.log '.format(write_model),
            ' -t0 {} -iter {}'.format(t0, tmax),
            ' -gamma-lambda {} -gamma-zeta {}'.format(gamma_lambda, gamma_zeta),
            ' -L2 {} '.format(reg),
            ' -mini-batch {} '.format(minibatch),
            ' -thread {} '.format(thread),
            ' -print-per-iter 10 ',
            # output the intermediate models
            ' -write-at-iter [{}:10000:{}]'.format(tmax - 30000, tmax),
        ])
        model.prepare('data/train', 'data/valid', 'data/valid', class_num)
        model.train(config)

    if '-plot' in sys.argv:
        baseline = fres.Get('KN5')
        trf.PlotLog([write_model], [baseline])

    if selected('-rescore'):
        for nbest_type in nbest_sets:
            nbest_dir = nbest_root + nbest_type + '/'
            for dataset in ['dt05', 'et05']:
                for condition in ['real', 'simu']:
                    tsk = 'nbestlist_{}_{}'.format(dataset, condition)
                    write_dir = workdir + nbest_type + '/' + tsk + '/'
                    wb.mkdir(write_dir)
                    print('{} : {}'.format(nbest_type, tsk))
                    print('  write -> {}'.format(write_dir))
                    score_stem = write_dir + model_name
                    # fill the empty lines
                    process_nbest(nbest_dir + tsk + '/words_text',
                                  score_stem + '.nbest')

                    config = ''.join([
                        ' -vocab {} '.format(vocab),
                        ' -read {}.model '.format(write_model),
                        ' -nbest {} '.format(score_stem + '.nbest'),
                        ' -lmscore {0}.lmscore -lmscore-test-id {0}.test-id '.format(
                            score_stem),
                    ])
                    model.use(config)

    if selected('-wer'):
        for nbest_type in nbest_sets:
            nbest_dir = nbest_root + nbest_type + '/'
            # '<tsk>' is a placeholder inside each score path.
            lmpaths = {
                'KN5': nbest_dir + '<tsk>/lmwt.lmonly',
                'RNN': nbest_dir + '<tsk>/lmwt.rnn',
                'LSTM': 'lstm/' + nbest_type + '/<tsk>/lmwt.lstm',
                'TRF': workdir + nbest_type + '/<tsk>/' + model_name + '.lmscore',
            }
            lmtypes = ['TRF', 'RNN', 'KN5', 'RNN+TRF']
            wer_workdir = 'wer/' + nbest_type + '/'
            print('wer_workdir = ' + wer_workdir)
            wer.wer_all(wer_workdir, nbest_dir, lmpaths, lmtypes)
            tuned = wer.wer_tune(wer_workdir)
            wer.wer_print(wer_workdir, tuned)