Example #1
# imports assumed from the full script (this excerpt omits them);
# small_config and create_name are helpers defined elsewhere in it
import os
import sys
import tensorflow as tf

from model import blocklm
from model import reader
from model import wblib as wb


def main(_):
    data = reader.Data().load_raw_data(reader.ptb_raw_dir(),
                                       add_beg_token=None,
                                       add_end_token='</s>',
                                       add_unknwon_token='<unk>')  # sic: 'unknwon' is the reader API's own spelling
    nbest = reader.NBest(*reader.wsj0_nbest())
    nbest_list = data.load_data(reader.wsj0_nbest()[0], is_nbest=True)

    config = small_config(data)
    # config = medium_config(data)
    # config = large_config(data)

    work_dir = './lstm/' + create_name(config)
    wb.mkdir(work_dir, is_recreate=True)
    sys.stdout = wb.std_log(os.path.join(work_dir, 'lstm.log'))
    print(work_dir)
    wb.pprint_dict(config.__dict__)

    data.write_vocab(work_dir + '/vocab.txt')
    data.write_data(data.datas[0], work_dir + '/train.id')
    data.write_data(data.datas[1], work_dir + '/valid.id')
    data.write_data(data.datas[2], work_dir + '/test.id')
    data.write_data(nbest_list, work_dir + '/nbest.id')

    write_model = os.path.join(work_dir, 'model.ckpt')

    with tf.Graph().as_default():
        # lm = lstmlm.FastLM(config, device_list=['/gpu:0', '/gpu:0'])
        lm = blocklm.LM(config, device='/gpu:0')
        param_num = tf.add_n([tf.size(v) for v in tf.trainable_variables()])

        for v in lm.train_net.variables:
            print(v.name)

        save = tf.train.Saver()

        # used to write ppl on valid/test set
        summ_bank = blocklm.layers.SummaryScalarBank(['ppl_valid', 'ppl_test'])
        summ_var = blocklm.layers.SummaryVariables()

        sv = tf.train.Supervisor(logdir=os.path.join(work_dir, 'logs'),
                                 summary_op=None,
                                 global_step=lm.global_step())
        sv.summary_writer.add_graph(
            tf.get_default_graph())  # write the graph to logs
        session_config = tf.ConfigProto(allow_soft_placement=True,
                                        log_device_placement=False)
        session_config.gpu_options.allow_growth = True
        with sv.managed_session(config=session_config) as session:

            print('param_num={:,}'.format(session.run(param_num)))

            lm.train(sv, session, data.datas[0], data.datas[1], data.datas[2])

            save.save(session, write_model)
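
This script (and the other main(_) functions below) follows the TF 1.x tf.app.run convention; the entry point, not shown in these excerpts, is typically:

if __name__ == '__main__':
    # tf.app.run() parses command-line flags and calls the module-level main(argv);
    # main accepts and ignores that argument, hence the `_` parameter.
    tf.app.run()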
Example #2
# imports assumed from the full script (this excerpt omits them);
# bindir (path to the n-gram toolkit binaries) is defined elsewhere in it
import sys
import time

import task
from model import ngramlm  # assumed location, following the other examples' imports
from model import reader
from model import wblib as wb


def main():
    nbest_cmp = task.NBestComputer()
    # note: the valid set is passed twice; no separate test set is loaded here
    data = reader.Data().load_raw_data([task.train, task.valid, task.valid],
                                       add_beg_token='<s>',
                                       add_end_token='</s>')

    config = ngramlm.Config(data)
    config.res_file = 'results.txt'

    order_reg = [4, 5, 6]
    for order in order_reg:
        config.order = order
        config.cutoff = [0] * order

        workdir = wb.mkdir('ngramlm/' + str(config), is_recreate=False)
        sys.stdout = wb.std_log(workdir + '/ngram.log')
        print(workdir)

        m = ngramlm.Model(config, data, bindir, workdir)

        # train
        print('training...')
        m.train()

        # rescore
        print('rescoring...')
        time_beg = time.time()
        for nbest in nbest_cmp.nbests:
            nbest.lmscore = m.rescore(nbest.get_nbest_list(data))
            # print(len(nbest.lmscore))
        print('rescore time={:.2f}m'.format((time.time() - time_beg) / 60))
        nbest_cmp.write_lmscore(workdir + '/model')

        # tune lm-scale
        print('computing wer...')
        nbest_cmp.cmp_wer()
        nbest_cmp.write_to_res(config.res_file, str(config))
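
nbest_cmp.cmp_wer() re-ranks each n-best list under a tuned LM scale before scoring WER. The sketch below shows the usual scale sweep with hypothetical field names (acscore, lmscore, wer_of); it is not this codebase's API:

def tune_lm_scale(nbest, scales=range(1, 21)):
    # sweep the LM scale, re-rank each utterance's hypotheses by
    # acoustic + scale * LM score, and keep the scale with lowest WER
    best_scale, best_wer = None, None
    for s in scales:
        total = [ac + s * lm for ac, lm in zip(nbest.acscore, nbest.lmscore)]
        wer = nbest.wer_of(total)  # hypothetical: WER after re-ranking by `total`
        if best_wer is None or wer < best_wer:
            best_scale, best_wer = s, wer
    return best_scale, best_wer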
Example #3
import tensorflow as tf
import sys
import os
import numpy as np
import time

from model import reader
from model import trfnnbase as trf
from model import wblib as wb

# [data]
data = reader.Data().load_raw_data(reader.ptb_raw_dir(),
                                   add_beg_token='<s>',
                                   add_end_token='</s>')
data.build_char_vocab(add_beg_end_tokens=True)  # build the character vocabulary
nbest = reader.NBest(*reader.wsj0_nbest())
nbest_list = data.load_data(nbest.nbest, is_nbest=True)


def create_name(config):
    return 'trf_' + str(config.config_trf) + '_maxlen{}'.format(config.max_len)


def get_config():
    config = trf.Config(data, 'rnn_char')
    config.jump_width = 2
    config.chain_num = 10
    config.batch_size = 100
    config.lr_cnn = trf.trfbase.LearningRateTime(beta=1.0, tc=1e4)
    config.lr_zeta = trf.trfbase.LearningRateTime(1.0, 0.2)
    config.max_epoch = 1000
    return config  # assumed: the original return is not shown in this excerpt
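
A quick usage sketch of the two helpers above (only names from the excerpt; the printed value depends on the config):

config = get_config()
print(create_name(config))  # 'trf_' + str(config.config_trf) + '_maxlen<max_len>'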
Example #4
# imports assumed from the full script (this excerpt omits them)
import numpy as np
import tensorflow as tf

from model import blocklm
from model import lstmlm
from model import wblib as wb


def main(_):
    # [data]
    data = reader.Data().load_raw_data(reader.ptb_raw_dir(), add_beg_token=None, add_end_token='</s>')

    config = blocklm.Config()
    config.vocab_size = data.get_vocab_size()
    config.block_size = 5
    config.hidden_layers = 1

    m = lstmlm.LM(config)

    m2 = blocklm.LM(config)
    for v in m2.train_net.variables:
        print(v.name)

    wb.pprint_dict(config.__dict__)

    recoder = wb.clock()

    sv = tf.train.Supervisor()
    session_config = tf.ConfigProto(allow_soft_placement=True, log_device_placement=False)
    session_config.gpu_options.allow_growth = True
    with sv.managed_session(config=session_config) as session:
        batch = m.sample_net.config.batch_size
        length = 1000

        initial_seqs = np.random.choice(config.vocab_size, size=(batch, 1))

        with recoder.recode('sample_block'):
            final_seqs = m2.simulate(session, initial_seqs, length, True)
            # for i in range(length // config.block_size):
            #     append_seqs, _ = m2.sample_net.run_predict(session, initial_seqs[:, -1:], m2.sample_net.draw)

        with recoder.recode('sample_lstm'):
            final_seqs = m.simulate(session, initial_seqs, length, True)

            # for i in range(length):
            #     append_seqs, _ = m.sample_net.run_predict(session, initial_seqs[:, -1:], m.sample_net.draw)

        # with recoder.recode('sample'):
        #     m.sample_net.set_zero_state(session)
        #     for i in range(length):
        #         append_seqs, _ = m.sample_net.run_predict(session, initial_seqs[:, -1:], m.sample_net.draw)
        #         initial_seqs = np.concatenate([initial_seqs, append_seqs], axis=-1)
        #     # print(initial_seqs)
        #
        # with recoder.recode('probs'):
        #     m.sample_net.set_zero_state(session)
        #     for i in range(length):
        #         probs = m.sample_net.run_predict(session, initial_seqs[:, i:i+1], [m.sample_net.softmax.probs])
        #     # print(initial_seqs)
        #
        # with recoder.recode('condition'):
        #     m.sample_net.set_zero_state(session)
        #     for i in range(length):
        #         m.sample_net.run(session, initial_seqs[:, i:i+1], initial_seqs[:, i+1:i+2], [m.sample_net.cost])
        #     # print(initial_seqs)

        for key, t in sorted(recoder.items(), key=lambda x:x[0]):
            print('{}={:.2f}'.format(key, t * 60))  # t appears to be in minutes; *60 gives seconds
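
wb.clock() is used above as a named stopwatch whose times print as t * 60, suggesting they are stored in minutes. One plausible implementation of such a recorder (an assumption, not the wblib source):

import time
from contextlib import contextmanager

class Clock(object):
    """Accumulate elapsed wall-clock time (in minutes) under string keys."""
    def __init__(self):
        self._times = {}

    @contextmanager
    def recode(self, name):  # spelling follows the API used in the example
        beg = time.time()
        yield
        self._times[name] = self._times.get(name, 0.0) + (time.time() - beg) / 60.0

    def items(self):
        return self._times.items()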
Example #5
# imports assumed from the full script (this excerpt omits them);
# create_name is a helper defined elsewhere in it
import os
import sys
import tensorflow as tf

from model import reader
from model import trfbase
from model import trfnce
from model import wblib as wb
def main(_):
    data = reader.Data().load_raw_data(reader.ptb_raw_dir(),
                                       add_beg_token='<s>',
                                       add_end_token='</s>',
                                       add_unknwon_token='<unk>')
    nbest = reader.NBest(*reader.wsj0_nbest())
    nbest_list = data.load_data(nbest.nbest, is_nbest=True)
    print('nbest list info=', wb.TxtInfo(nbest.nbest))

    config = trfnce.Config(data)
    config.structure_type = 'rnn'
    config.embedding_dim = 200
    config.rnn_hidden_layers = 2
    config.rnn_hidden_size = 200
    config.batch_size = 20
    config.noise_factor = 100
    config.noise_sampler = 2
    config.init_weight = 0.1
    config.lr_param = trfbase.LearningRateTime(1e-3)
    config.max_epoch = 100
    # config.dropout = 0.75
    # config.init_zeta = config.get_initial_logz(20)
    config.update_zeta = False
    config.write_dbg = False
    config.pprint()

    name = create_name(config)
    logdir = 'trf_nce/' + name
    wb.mkdir(logdir, is_recreate=True)
    sys.stdout = wb.std_log(os.path.join(logdir, 'trf.log'))
    print(logdir)

    data.write_vocab(logdir + '/vocab.txt')
    data.write_data(data.datas[1], logdir + '/valid.id')
    data.write_data(data.datas[2], logdir + '/test.id')
    data.write_data(nbest_list, logdir + '/nbest.id')

    # wb.rmdir(logdirs)
    with tf.Graph().as_default():
        m = trfnce.TRF(config, data, logdir=logdir, device='/gpu:0')
        # noise_lstm = lstmlm.LM(run_lstmlm_withBegToken.small_config(data), device='/gpu:1')
        # m.lstm = noise_lstm

        sv = tf.train.Supervisor(logdir=os.path.join(logdir, 'logs'),
                                 global_step=m.train_net.global_step)
        sv.summary_writer.add_graph(
            tf.get_default_graph())  # write the graph to logs
        session_config = tf.ConfigProto(allow_soft_placement=True,
                                        log_device_placement=False)
        session_config.gpu_options.allow_growth = True
        with sv.managed_session(config=session_config) as session:
            m.set_session(session)

            # print('load lstmlm for noise generator')
            # noise_lstm.restore(session,
            #                    './lstm/' + run_lstmlm_withBegToken.create_name(noise_lstm.config) + '/model.ckpt')

            m.train(sv,
                    session,
                    print_per_epoch=0.1,
                    nbest=nbest,
                    nbest_list=nbest_list)
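
In this config, noise_factor = 100 means 100 noise samples per data sample. For reference, the generic NCE criterion that such trainers optimize looks like this (a sketch, not the trfnce implementation):

import numpy as np

def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

def nce_loss(s_data, s_noise, log_pn_data, log_pn_noise, k):
    # s_*: unnormalized model log-scores; log_pn_*: noise log-probabilities;
    # k: the noise factor, i.e. noise samples per data sample (100 above).
    # P(class=data | x) = sigmoid(s(x) - log(k * p_noise(x)))
    logit_data = s_data - (np.log(k) + log_pn_data)
    logit_noise = s_noise - (np.log(k) + log_pn_noise)
    return (-np.mean(np.log(sigmoid(logit_data)))
            - k * np.mean(np.log(1.0 - sigmoid(logit_noise))))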
Example #6
import os
import sys
import time
import numpy as np

import task
from model import wblib as wb
from model import reader
from model import trfbase
from model import trfnce
from model import lstmlm
import run_lstmlm

# [data]
data = reader.Data().load_raw_data([task.train, task.valid, task.test],
                                   add_beg_token='<s>',
                                   add_end_token='</s>')


def create_name(config, q_config):
    s = str(config)
    if q_config is not None:
        s += '_with_' + run_lstmlm.create_name(q_config)
    return s


def main(_):
    config = trfnce.Config(data)

    config.structure_type = 'mix'
    config.embedding_dim = 128
Example #7
# imports assumed from the full script (this excerpt omits them);
# corpus and create_name are defined elsewhere in it
import sys
import tensorflow as tf

from model import reader
from model import trfnnbase as trf  # assumed alias, matching Example #3
from model import wblib as wb


def main(_):
    data = reader.Data().load_raw_data(corpus.char_raw_dir(),
                                       add_beg_token='<s>',
                                       add_end_token='</s>',
                                       add_unknwon_token=None,
                                       max_length=1000)
    nbest = reader.NBest(*reader.wsj0_nbest())
    print(nbest.wer())

    config = trf.trfbase.Config(data)
    config.embedding_dim = 12
    config.cnn_filters = [(i, 12) for i in range(1, 11)]
    config.cnn_layers = 3
    config.cnn_hidden = 12
    config.cnn_shared_over_layers = False
    config.cnn_residual = True
    config.cnn_skip_connection = True
    config.max_epoch = 1000
    config.sample_sub = 100
    config.jump_width = 10
    config.init_weight = 0.1
    config.opt_method = 'adam'
    config.lr_cnn = trf.trfbase.LearningRateTime(1, 1.5, tc=1e4)
    config.lr_zeta = trf.trfbase.LearningRateTime(1.0, 0.2)
    config.load_embedding_path = './embedding/ptb_{}x{}.emb'.format(
        config.vocab_size, config.embedding_dim)
    config.auxiliary_hidden = 12
    config.auxiliary_lr = 1.0

    name = create_name(config)
    logdir = name
    wb.mkdir(logdir, is_recreate=True)
    sys.stdout = wb.std_log(logdir + '/trf.log')
    print(logdir)
    config.pprint()

    # prepare embedding
    if (wb.is_linux() and config.load_embedding_path is not None) or \
            (config.feat_type_file and config.feat_cluster > 0):
        if config.load_embedding_path is None:
            fvectors = './embedding/ptb_{}x{}.emb'.format(
                config.vocab_size, config.embedding_dim)
        else:
            fvectors = config.load_embedding_path
        data.word2vec(fvectors,
                      dim=config.embedding_dim,
                      cnum=config.feat_cluster)
    else:
        config.load_embedding_path = None

    # write data
    data.write_vocab(logdir + '/vocab.txt')
    data.write_data(data.datas[0], logdir + '/train.id')
    data.write_data(data.datas[1], logdir + '/valid.id')
    data.write_data(data.datas[2], logdir + '/test.id')

    nbest_char_txt = logdir + '/nbest.char.txt'
    corpus.word_text_to_char_text(reader.wsj0_nbest()[0],
                                  nbest_char_txt,
                                  is_nbest=True)
    nbest_list = data.load_data(nbest_char_txt, is_nbest=False)
    data.write_data(nbest_list, logdir + '/nbest.id')

    with tf.Graph().as_default():
        m = trf.TRF(config,
                    data,
                    logdir=logdir,
                    device='/gpu:2',
                    simulater_device='/gpu:1')

        sv = tf.train.Supervisor(logdir=logdir + '/logs',
                                 summary_op=None,
                                 global_step=m._global_step)
        # sv.summary_writer.add_graph(tf.get_default_graph())  # write the graph to logs

        session_config = tf.ConfigProto(allow_soft_placement=True,
                                        log_device_placement=False)
        session_config.gpu_options.allow_growth = True
        with sv.managed_session(config=session_config) as session:

            # s = ['it was not black monday', 'we did n\'t even get a chance']
            # eval_list = data.load_data([[data.beg_token_str] + w.split() + [data.end_token_str] for w in s])
            # print(eval_list)

            # import sampling as sp
            # x_batch = [x for x in sp.SeqIter(3, config.vocab_size,
            #                                  beg_token=config.beg_token,
            #                                  end_token=config.end_token)]
            # logprobs = m.get_log_probs(x_batch, False)
            # logz = sp.log_sum(logprobs)
            # print(logprobs)
            # print(logz)

            m.train(session,
                    sv,
                    print_per_epoch=0.1,
                    nbest=nbest,
                    nbest_list=nbest_list)