def main(_):
    """Build a ``blocklm.LM`` from the small config, train it, and save it.

    The data set is loaded through ``reader``, the vocabulary and id files
    are dumped into a freshly created working directory, and stdout is
    redirected to ``lstm.log`` inside that directory before training starts.

    Args:
        _: Ignored positional argument.
    """
    corpus_data = reader.Data().load_raw_data(
        reader.ptb_raw_dir(),
        add_beg_token=None,
        add_end_token='</s>',
        add_unknwon_token='<unk>',
    )
    # Constructed as in the original script; not referenced again below.
    nbest = reader.NBest(*reader.wsj0_nbest())
    nbest_ids = corpus_data.load_data(reader.wsj0_nbest()[0], is_nbest=True)

    # Swap in medium_config(...) / large_config(...) for the bigger setups.
    config = small_config(corpus_data)

    # Working directory is recreated, then all later prints go to the log.
    work_dir = './lstm/' + create_name(config)
    wb.mkdir(work_dir, is_recreate=True)
    sys.stdout = wb.std_log(os.path.join(work_dir, 'lstm.log'))
    print(work_dir)
    wb.pprint_dict(config.__dict__)

    # Dump the vocabulary and the id sequences next to the log.
    corpus_data.write_vocab(work_dir + '/vocab.txt')
    for split_idx, suffix in enumerate(('/train.id', '/valid.id', '/test.id')):
        corpus_data.write_data(corpus_data.datas[split_idx], work_dir + suffix)
    corpus_data.write_data(nbest_ids, work_dir + '/nbest.id')

    checkpoint_path = os.path.join(work_dir, 'model.ckpt')

    with tf.Graph().as_default():
        # lm = lstmlm.FastLM(config, device_list=['/gpu:0', '/gpu:0'])
        lm = blocklm.LM(config, device='/gpu:0')
        param_count_op = tf.add_n(
            [tf.size(var) for var in tf.trainable_variables()])

        for var in lm.train_net.variables:
            print(var.name)

        saver = tf.train.Saver()

        # used to write ppl on valid/test set
        summ_bank = blocklm.layers.SummaryScalarBank(['ppl_valid', 'ppl_test'])
        summ_var = blocklm.layers.SummaryVariables()

        supervisor = tf.train.Supervisor(
            logdir=os.path.join(work_dir, 'logs'),
            summary_op=None,
            global_step=lm.global_step(),
        )
        # write the graph to logs
        supervisor.summary_writer.add_graph(tf.get_default_graph())

        session_config = tf.ConfigProto(allow_soft_placement=True,
                                        log_device_placement=False)
        session_config.gpu_options.allow_growth = True

        with supervisor.managed_session(config=session_config) as session:
            print('param_num={:,}'.format(session.run(param_count_op)))

            train_set, valid_set, test_set = (corpus_data.datas[0],
                                              corpus_data.datas[1],
                                              corpus_data.datas[2])
            lm.train(supervisor, session, train_set, valid_set, test_set)

            saver.save(session, checkpoint_path)
Esempio n. 2
0
def main():
    """Drive the n-gram LM pipeline according to command-line flags.

    Recognised flags (read from ``sys.argv``): ``-train``, ``-test``,
    ``-rescore``, ``-wer``; ``-all`` enables every stage. When no flag is
    given a short usage text is printed and the loop below then runs with
    every stage disabled.
    """
    print(sys.argv)
    if len(sys.argv) == 1:
        print('\"python run_ngram.py -train\" train \n',
              '\"python run_ngram.py -rescore\" rescore nbest\n',
              '\"python run_ngram.py -wer\" compute WER'
              )

    def stage_enabled(flag):
        # A stage runs when its own flag or the catch-all '-all' is present.
        return flag in sys.argv or '-all' in sys.argv

    # Location of the external tools handed to ngram.model.
    bindir = 'd:\\wangbin\\tools' if wb.is_window() else '../../tools/srilm'

    fres = wb.FRes('result.txt')  # the result file
    datadir = corpus.word_raw_dir()
    nbestdir = reader.wsj0_nbest()
    workdir = 'ngramlm/'
    model = ngram.model(bindir, workdir)

    for position, order in enumerate([5]):
        lm_path = os.path.join(workdir, '{}gram.lm'.format(order))
        result_tag = 'KN{}'.format(order)
        score_path = lm_path + '.lmscore'

        print(lm_path)

        if stage_enabled('-train'):
            # Data preparation is only needed once, before the first order.
            if position == 0:
                model.prepare(*datadir)
            model.train(order, lm_path)

        if stage_enabled('-test'):
            # One perplexity per entry of datadir[0:3], in that order.
            ppl_values = [model.ppl(lm_path, order, datadir[i])
                          for i in range(3)]
            fres.AddPPL(result_tag, ppl_values, datadir[0:3])

        if stage_enabled('-rescore'):
            model.rescore(lm_path, order, nbestdir[3], score_path)

        if stage_enabled('-wer'):
            nbest = reader.NBest(*nbestdir)
            nbest.lmscore = wb.LoadScore(score_path)

            wer = nbest.wer()
            print('wer={} lmscale={} acscale={}'.format(
                wer, nbest.lmscale, nbest.acscale))
            fres.AddWER(result_tag, wer)

            # Also score the text written by nbest.get_trans_txt.
            trans_txt = workdir + 'nbest_transcripts.txt'
            nbest.get_trans_txt(trans_txt)
            ppl_trans = model.ppl(lm_path, order, trans_txt)
            ll_trans = -wb.PPL2LL(ppl_trans, trans_txt)
            fres.Add(result_tag, ['LL-wsj', 'PPL-wsj'], [ll_trans, ppl_trans])
Esempio n. 3
0
import tensorflow as tf
import sys
import os
import numpy as np
import time

from model import reader
from model import trfnnbase as trf
from model import wblib as wb

# [data]
# Module-level data set shared by the definitions below. It is loaded once at
# import time with explicit begin ('<s>') and end ('</s>') tokens.
data = reader.Data().load_raw_data(reader.ptb_raw_dir(),
                                   add_beg_token='<s>',
                                   add_end_token='</s>')
data.build_char_vocab(add_beg_end_tokens=True)  # build char vocabulary
# N-best object built from the values returned by reader.wsj0_nbest().
nbest = reader.NBest(*reader.wsj0_nbest())
# The n-best source held in nbest.nbest, loaded through the same Data object.
nbest_list = data.load_data(nbest.nbest, is_nbest=True)


def create_name(config):
    """Return the run name ``trf_<config_trf>_maxlen<max_len>`` for *config*.

    ``config.config_trf`` is converted with ``str()``; ``config.max_len`` is
    inserted with ``str.format``.
    """
    trf_part = str(config.config_trf)
    maxlen_part = '_maxlen{}'.format(config.max_len)
    return ''.join(('trf_', trf_part, maxlen_part))


def get_config():
    """Create a ``trf.Config`` of type ``'rnn_char'`` for the module-level data.

    NOTE(review): no ``return`` statement is visible here; the snippet appears
    to be cut off after the ``max_epoch`` assignment - confirm against the
    full script before relying on the return value.
    """
    # Config is built from the module-level ``data`` object.
    config = trf.Config(data, 'rnn_char')
    config.jump_width = 2
    config.chain_num = 10
    config.batch_size = 100
    # Learning-rate schedule objects for the two parameter groups named by
    # the attributes (lr_cnn / lr_zeta).
    config.lr_cnn = trf.trfbase.LearningRateTime(beta=1.0, tc=1e4)
    config.lr_zeta = trf.trfbase.LearningRateTime(1.0, 0.2)
    config.max_epoch = 1000
Esempio n. 4
0
def main(_):
    """Configure and train a ``trfnce.TRF`` model with an 'rnn' structure.

    Loads the data set and the n-best list, fills in a ``trfnce.Config``,
    recreates the output directory ``trf_nce/<name>``, redirects stdout to
    ``trf.log`` in it, writes vocabulary and id files, and finally runs
    ``TRF.train`` inside a supervisor-managed session.

    Args:
        _: Ignored positional argument.
    """
    corpus_data = reader.Data().load_raw_data(
        reader.ptb_raw_dir(),
        add_beg_token='<s>',
        add_end_token='</s>',
        add_unknwon_token='<unk>',
    )
    nbest = reader.NBest(*reader.wsj0_nbest())
    nbest_list = corpus_data.load_data(nbest.nbest, is_nbest=True)
    print('nbest list info=', wb.TxtInfo(nbest.nbest))

    config = trfnce.Config(corpus_data)
    # -- network structure
    config.structure_type = 'rnn'
    config.embedding_dim = 200
    config.rnn_hidden_layers = 2
    config.rnn_hidden_size = 200
    # -- training / noise settings
    config.batch_size = 20
    config.noise_factor = 100
    config.noise_sampler = 2
    config.init_weight = 0.1
    config.lr_param = trfbase.LearningRateTime(1e-3)
    config.max_epoch = 100
    # config.dropout = 0.75
    # config.init_zeta = config.get_initial_logz(20)
    config.update_zeta = False
    config.write_dbg = False
    config.pprint()

    # Output directory is recreated; subsequent prints land in trf.log.
    logdir = 'trf_nce/' + create_name(config)
    wb.mkdir(logdir, is_recreate=True)
    sys.stdout = wb.std_log(os.path.join(logdir, 'trf.log'))
    print(logdir)

    # Only the validation and test splits (datas[1], datas[2]) are written.
    corpus_data.write_vocab(logdir + '/vocab.txt')
    for split_idx, suffix in ((1, '/valid.id'), (2, '/test.id')):
        corpus_data.write_data(corpus_data.datas[split_idx], logdir + suffix)
    corpus_data.write_data(nbest_list, logdir + '/nbest.id')

    # wb.rmdir(logdirs)
    with tf.Graph().as_default():
        model = trfnce.TRF(config, corpus_data, logdir=logdir, device='/gpu:0')
        # noise_lstm = lstmlm.LM(run_lstmlm_withBegToken.small_config(data), device='/gpu:1')
        # m.lstm = noise_lstm

        supervisor = tf.train.Supervisor(
            logdir=os.path.join(logdir, 'logs'),
            global_step=model.train_net.global_step,
        )
        # write the graph to logs
        supervisor.summary_writer.add_graph(tf.get_default_graph())

        session_config = tf.ConfigProto(allow_soft_placement=True,
                                        log_device_placement=False)
        session_config.gpu_options.allow_growth = True

        with supervisor.managed_session(config=session_config) as session:
            model.set_session(session)

            # print('load lstmlm for noise generator')
            # noise_lstm.restore(session,
            #                    './lstm/' + run_lstmlm_withBegToken.create_name(noise_lstm.config) + '/model.ckpt')

            model.train(supervisor,
                        session,
                        print_per_epoch=0.1,
                        nbest=nbest,
                        nbest_list=nbest_list)
def main(_):
    """Configure and train a character-level ``trf.TRF`` model.

    Loads the data located by ``corpus.char_raw_dir()``, prints the baseline
    value of ``nbest.wer()``, fills in a ``trf.trfbase.Config``, optionally
    prepares embeddings via ``data.word2vec``, writes vocabulary / id files
    (including a character-level version of the n-best list) and then runs
    ``TRF.train`` inside a supervisor-managed session.

    Args:
        _: Ignored positional argument.
    """
    corpus_data = reader.Data().load_raw_data(
        corpus.char_raw_dir(),
        add_beg_token='<s>',
        add_end_token='</s>',
        add_unknwon_token=None,
        max_length=1000,
    )
    nbest = reader.NBest(*reader.wsj0_nbest())
    print(nbest.wer())

    config = trf.trfbase.Config(corpus_data)
    # -- embedding / CNN structure
    config.embedding_dim = 12
    config.cnn_filters = [(i, 12) for i in range(1, 11)]
    config.cnn_layers = 3
    config.cnn_hidden = 12
    config.cnn_shared_over_layers = False
    config.cnn_residual = True
    config.cnn_skip_connection = True
    # -- training schedule
    config.max_epoch = 1000
    config.sample_sub = 100
    config.jump_width = 10
    config.init_weight = 0.1
    config.opt_method = 'adam'
    config.lr_cnn = trf.trfbase.LearningRateTime(1, 1.5, tc=1e4)
    config.lr_zeta = trf.trfbase.LearningRateTime(1.0, 0.2)
    config.load_embedding_path = './embedding/ptb_{}x{}.emb'.format(
        config.vocab_size, config.embedding_dim)
    # -- auxiliary settings
    config.auxiliary_hidden = 12
    config.auxiliary_lr = 1.0

    # The run name doubles as the output directory.
    logdir = create_name(config)
    wb.mkdir(logdir, is_recreate=True)
    sys.stdout = wb.std_log(logdir + '/trf.log')
    print(logdir)
    config.pprint()

    # prepare embedding
    # Precedence made explicit: (linux AND path set) OR (feature file AND
    # positive cluster count); short-circuit order matches the original.
    linux_with_path = (wb.is_linux()
                       and config.load_embedding_path is not None)
    if linux_with_path or (config.feat_type_file and config.feat_cluster > 0):
        fvectors = config.load_embedding_path
        if fvectors is None:
            fvectors = './embedding/ptb_{}x{}.emb'.format(
                config.vocab_size, config.embedding_dim)
        corpus_data.word2vec(fvectors,
                             dim=config.embedding_dim,
                             cnum=config.feat_cluster)
    else:
        config.load_embedding_path = None

    # write data
    corpus_data.write_vocab(logdir + '/vocab.txt')
    for split_idx, suffix in enumerate(('/train.id', '/valid.id', '/test.id')):
        corpus_data.write_data(corpus_data.datas[split_idx], logdir + suffix)

    # Convert the word-level n-best text to characters, then load that file.
    nbest_char_txt = logdir + '/nbest.char.txt'
    corpus.word_text_to_char_text(reader.wsj0_nbest()[0],
                                  nbest_char_txt,
                                  is_nbest=True)
    nbest_list = corpus_data.load_data(nbest_char_txt, is_nbest=False)
    corpus_data.write_data(nbest_list, logdir + '/nbest.id')

    with tf.Graph().as_default():
        model = trf.TRF(config,
                        corpus_data,
                        logdir=logdir,
                        device='/gpu:2',
                        simulater_device='/gpu:1')

        supervisor = tf.train.Supervisor(logdir=logdir + '/logs',
                                         summary_op=None,
                                         global_step=model._global_step)
        # sv.summary_writer.add_graph(tf.get_default_graph())  # write the graph to logs

        session_config = tf.ConfigProto(allow_soft_placement=True,
                                        log_device_placement=False)
        session_config.gpu_options.allow_growth = True

        with supervisor.managed_session(config=session_config) as session:

            # s = ['it was not black monday', 'we did n\'t even get a chance']
            # eval_list = data.load_data([[data.beg_token_str] + w.split() + [data.end_token_str] for w in s])
            # print(eval_list)

            # import sampling as sp
            # x_batch = [x for x in sp.SeqIter(3, config.vocab_size,
            #                                  beg_token=config.beg_token,
            #                                  end_token=config.end_token)]
            # logprobs = m.get_log_probs(x_batch, False)
            # logz = sp.log_sum(logprobs)
            # print(logprobs)
            # print(logz)

            model.train(session,
                        supervisor,
                        print_per_epoch=0.1,
                        nbest=nbest,
                        nbest_list=nbest_list)
Esempio n. 6
0
import os
import numpy as np
import time

from model import reader
from model import trfcnn as trf
from model import wblib as wb
from model import lstmlm
import run_lstmlm_withBegToken

# [data]
# Module-level data set shared by the definitions below. It is loaded once at
# import time with explicit begin ('<s>') and end ('</s>') tokens.
data = reader.Data().load_raw_data(reader.ptb_raw_dir(),
                                   add_beg_token='<s>',
                                   add_end_token='</s>')
# data.cut_train_to_length(20)
# N-best object built from the values returned by reader.wsj0_nbest().
nbest = reader.NBest(*reader.wsj0_nbest())
# The first element returned by reader.wsj0_nbest(), loaded as an n-best list
# through the same Data object.
nbest_list = data.load_data(reader.wsj0_nbest()[0], is_nbest=True)


class Ops(trf.trfjsa.Operation):
    def __init__(self, trf_model):
        super().__init__(trf_model)
        self.wer_next_epoch = 0
        self.wer_per_epoch = 1.0
        self.write_models = wb.mkdir(os.path.join(self.m.logdir, 'wer_models'))

    def run(self, step, epoch):
        super().run(step, epoch)

        if epoch >= self.wer_next_epoch:
            self.wer_next_epoch = (int(epoch / self.wer_per_epoch) +