def main(_):
    data = reader.Data().load_raw_data(reader.ptb_raw_dir(),
                                       add_beg_token=None,
                                       add_end_token='</s>',
                                       add_unknwon_token='<unk>')
    nbest = reader.NBest(*reader.wsj0_nbest())
    nbest_list = data.load_data(reader.wsj0_nbest()[0], is_nbest=True)

    config = small_config(data)
    # config = medium_config(data)
    # config = large_config(data)

    work_dir = './lstm/' + create_name(config)
    wb.mkdir(work_dir, is_recreate=True)
    sys.stdout = wb.std_log(os.path.join(work_dir, 'lstm.log'))
    print(work_dir)
    wb.pprint_dict(config.__dict__)

    data.write_vocab(work_dir + '/vocab.txt')
    data.write_data(data.datas[0], work_dir + '/train.id')
    data.write_data(data.datas[1], work_dir + '/valid.id')
    data.write_data(data.datas[2], work_dir + '/test.id')
    data.write_data(nbest_list, work_dir + '/nbest.id')

    write_model = os.path.join(work_dir, 'model.ckpt')

    with tf.Graph().as_default():
        # lm = lstmlm.FastLM(config, device_list=['/gpu:0', '/gpu:0'])
        lm = blocklm.LM(config, device='/gpu:0')
        param_num = tf.add_n([tf.size(v) for v in tf.trainable_variables()])

        for v in lm.train_net.variables:
            print(v.name)

        save = tf.train.Saver()

        # used to write ppl on valid/test set
        summ_bank = blocklm.layers.SummaryScalarBank(['ppl_valid', 'ppl_test'])
        summ_var = blocklm.layers.SummaryVariables()

        sv = tf.train.Supervisor(logdir=os.path.join(work_dir, 'logs'),
                                 summary_op=None,
                                 global_step=lm.global_step())
        sv.summary_writer.add_graph(tf.get_default_graph())  # write the graph to logs

        session_config = tf.ConfigProto(allow_soft_placement=True,
                                        log_device_placement=False)
        session_config.gpu_options.allow_growth = True
        with sv.managed_session(config=session_config) as session:
            print('param_num={:,}'.format(session.run(param_num)))

            lm.train(sv, session, data.datas[0], data.datas[1], data.datas[2])

            save.save(session, write_model)
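# `wb.std_log(path)` above replaces sys.stdout so that everything printed is
# also written to a log file. The real implementation lives in the repo's
# wblib; the class below is only a minimal stand-in sketch with the same
# usage pattern (the name StdLogSketch is hypothetical):
import sys


class StdLogSketch(object):
    """Tee stdout to a log file; sketch of the assumed wb.std_log behaviour."""

    def __init__(self, log_path):
        self.log_file = open(log_path, 'w')
        self.console = sys.stdout

    def write(self, text):
        # mirror every write to both the console and the log file
        self.console.write(text)
        self.log_file.write(text)

    def flush(self):
        self.console.flush()
        self.log_file.flush()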
def main():
    nbest_cmp = task.NBestComputer()

    data = reader.Data().load_raw_data([task.train, task.valid, task.valid],
                                       add_beg_token='<s>',
                                       add_end_token='</s>')

    config = ngramlm.Config(data)
    config.res_file = 'results.txt'

    order_reg = [4, 5, 6]
    for order in order_reg:
        config.order = order
        config.cutoff = [0] * order

        workdir = wb.mkdir('ngramlm/' + str(config), is_recreate=False)
        sys.stdout = wb.std_log(workdir + '/ngram.log')
        print(workdir)

        m = ngramlm.Model(config, data, bindir, workdir)

        # train
        print('training...')
        m.train()

        # rescore
        print('rescoring...')
        time_beg = time.time()
        for nbest in nbest_cmp.nbests:
            nbest.lmscore = m.rescore(nbest.get_nbest_list(data))
            # print(len(nbest.lmscore))
        print('rescore time={:.2f}m'.format((time.time() - time_beg) / 60))
        nbest_cmp.write_lmscore(workdir + '/model')

        # tune lm-scale
        print('computing wer...')
        nbest_cmp.cmp_wer()
        nbest_cmp.write_to_res(config.res_file, str(config))
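# The rescoring loop above replaces each hypothesis's LM score and then tunes
# the LM scale against WER. A minimal sketch of that score combination
# (hypothetical helper, not part of the repo; assumes scores are negative
# log-likelihoods, so a lower combined score is better):
import numpy as np


def select_best(acscores, lmscores, lmscale):
    """Return the index of the best hypothesis under acscore + lmscale * lmscore."""
    total = np.asarray(acscores) + lmscale * np.asarray(lmscores)
    return int(np.argmin(total))


# Tuning then amounts to sweeping lmscale over a grid and keeping the value
# that minimizes WER on a development set, e.g.:
#   best_scale = min(scale_grid, key=lambda s: wer_at_scale(s))  # wer_at_scale is hypothetical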
import tensorflow as tf
import sys
import os
import numpy as np
import time

from model import reader
from model import trfnnbase as trf
from model import wblib as wb

# [data]
data = reader.Data().load_raw_data(reader.ptb_raw_dir(),
                                   add_beg_token='<s>',
                                   add_end_token='</s>')
data.build_char_vocab(add_beg_end_tokens=True)  # build char vocabulary
nbest = reader.NBest(*reader.wsj0_nbest())
nbest_list = data.load_data(nbest.nbest, is_nbest=True)


def create_name(config):
    return 'trf_' + str(config.config_trf) + '_maxlen{}'.format(config.max_len)


def get_config():
    config = trf.Config(data, 'rnn_char')
    config.jump_width = 2
    config.chain_num = 10
    config.batch_size = 100
    config.lr_cnn = trf.trfbase.LearningRateTime(beta=1.0, tc=1e4)
    config.lr_zeta = trf.trfbase.LearningRateTime(1.0, 0.2)
    config.max_epoch = 1000
    return config
def main(_):
    # [data]
    data = reader.Data().load_raw_data(reader.ptb_raw_dir(),
                                       add_beg_token=None,
                                       add_end_token='</s>')

    config = blocklm.Config()
    config.vocab_size = data.get_vocab_size()
    config.block_size = 5
    config.hidden_layers = 1

    m = lstmlm.LM(config)
    m2 = blocklm.LM(config)

    for v in m2.train_net.variables:
        print(v.name)

    wb.pprint_dict(config.__dict__)

    recoder = wb.clock()
    sv = tf.train.Supervisor()
    session_config = tf.ConfigProto(allow_soft_placement=True,
                                    log_device_placement=False)
    session_config.gpu_options.allow_growth = True
    with sv.managed_session(config=session_config) as session:
        batch = m.sample_net.config.batch_size
        length = 1000
        initial_seqs = np.random.choice(config.vocab_size, size=(batch, 1))

        with recoder.recode('sample_block'):
            final_seqs = m2.simulate(session, initial_seqs, length, True)
            # for i in range(length // config.block_size):
            #     append_seqs, _ = m2.sample_net.run_predict(session, initial_seqs[:, -1:], m2.sample_net.draw)

        with recoder.recode('sample_lstm'):
            final_seqs = m.simulate(session, initial_seqs, length, True)
            # for i in range(length):
            #     append_seqs, _ = m.sample_net.run_predict(session, initial_seqs[:, -1:], m.sample_net.draw)

        # with recoder.recode('sample'):
        #     m.sample_net.set_zero_state(session)
        #     for i in range(length):
        #         append_seqs, _ = m.sample_net.run_predict(session, initial_seqs[:, -1:], m.sample_net.draw)
        #         initial_seqs = np.concatenate([initial_seqs, append_seqs], axis=-1)
        #     # print(initial_seqs)
        #
        # with recoder.recode('probs'):
        #     m.sample_net.set_zero_state(session)
        #     for i in range(length):
        #         probs = m.sample_net.run_predict(session, initial_seqs[:, i:i+1], [m.sample_net.softmax.probs])
        #     # print(initial_seqs)
        #
        # with recoder.recode('condition'):
        #     m.sample_net.set_zero_state(session)
        #     for i in range(length):
        #         m.sample_net.run(session, initial_seqs[:, i:i+1], initial_seqs[:, i+1:i+2], [m.sample_net.cost])
        #     # print(initial_seqs)

        for key, t in sorted(recoder.items(), key=lambda x: x[0]):
            print('{}={:.2f}'.format(key, t * 60))
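# `wb.clock()` above is a named-region timer: `with recoder.recode(name):`
# accumulates the wall-clock time spent in the block, and `recoder.items()`
# yields (name, minutes) pairs (hence the `t * 60` in the final print).
# A minimal stand-in sketch consistent with that usage (hypothetical class;
# the real one lives in the repo's wblib):
import time
from contextlib import contextmanager


class ClockSketch(dict):
    """Accumulate wall-clock minutes per named region."""

    @contextmanager
    def recode(self, name):
        beg = time.time()
        yield
        # accumulate elapsed time in minutes, matching the `t * 60` print above
        self[name] = self.get(name, 0.0) + (time.time() - beg) / 60.0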
def main(_):
    data = reader.Data().load_raw_data(reader.ptb_raw_dir(),
                                       add_beg_token='<s>',
                                       add_end_token='</s>',
                                       add_unknwon_token='<unk>')
    nbest = reader.NBest(*reader.wsj0_nbest())
    nbest_list = data.load_data(nbest.nbest, is_nbest=True)
    print('nbest list info=', wb.TxtInfo(nbest.nbest))

    config = trfnce.Config(data)
    config.structure_type = 'rnn'
    config.embedding_dim = 200
    config.rnn_hidden_layers = 2
    config.rnn_hidden_size = 200
    config.batch_size = 20
    config.noise_factor = 100
    config.noise_sampler = 2
    config.init_weight = 0.1
    config.lr_param = trfbase.LearningRateTime(1e-3)
    config.max_epoch = 100
    # config.dropout = 0.75
    # config.init_zeta = config.get_initial_logz(20)
    config.update_zeta = False
    config.write_dbg = False
    config.pprint()

    name = create_name(config)
    logdir = 'trf_nce/' + name
    wb.mkdir(logdir, is_recreate=True)
    sys.stdout = wb.std_log(os.path.join(logdir, 'trf.log'))
    print(logdir)

    data.write_vocab(logdir + '/vocab.txt')
    data.write_data(data.datas[1], logdir + '/valid.id')
    data.write_data(data.datas[2], logdir + '/test.id')
    data.write_data(nbest_list, logdir + '/nbest.id')

    # wb.rmdir(logdirs)
    with tf.Graph().as_default():
        m = trfnce.TRF(config, data, logdir=logdir, device='/gpu:0')
        # noise_lstm = lstmlm.LM(run_lstmlm_withBegToken.small_config(data), device='/gpu:1')
        # m.lstm = noise_lstm

        sv = tf.train.Supervisor(logdir=os.path.join(logdir, 'logs'),
                                 global_step=m.train_net.global_step)
        sv.summary_writer.add_graph(tf.get_default_graph())  # write the graph to logs
        session_config = tf.ConfigProto(allow_soft_placement=True,
                                        log_device_placement=False)
        session_config.gpu_options.allow_growth = True
        with sv.managed_session(config=session_config) as session:
            m.set_session(session)
            # print('load lstmlm for noise generator')
            # noise_lstm.restore(session,
            #                    './lstm/' + run_lstmlm_withBegToken.create_name(noise_lstm.config) + '/model.ckpt')
            m.train(sv, session,
                    print_per_epoch=0.1,
                    nbest=nbest,
                    nbest_list=nbest_list)
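# The config above trains the TRF with noise-contrastive estimation:
# noise_factor = 100 means 100 noise sequences are drawn per data sequence.
# A sketch of the standard binary-classification NCE objective (illustrative
# only, not the repo's exact implementation; log_p_model / log_p_noise are
# assumed per-sequence log-densities and nu is the noise factor):
import numpy as np


def nce_loss_sketch(log_p_model, log_p_noise, is_data, nu):
    # posterior probability that a sequence came from data rather than noise:
    # P(data|x) = p_m(x) / (p_m(x) + nu * p_n(x)) = sigmoid(log p_m - log p_n - log nu)
    log_ratio = log_p_model - log_p_noise - np.log(nu)
    p_data = 1.0 / (1.0 + np.exp(-log_ratio))
    # maximize log P(data|x) on data samples and log P(noise|x) on noise samples
    eps = 1e-12
    return -np.mean(np.where(is_data,
                             np.log(p_data + eps),
                             np.log(1.0 - p_data + eps)))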
import os
import sys
import time
import numpy as np

import task
from model import wblib as wb
from model import reader
from model import trfbase
from model import trfnce
from model import lstmlm
import run_lstmlm

# [data]
data = reader.Data().load_raw_data([task.train, task.valid, task.test],
                                   add_beg_token='<s>',
                                   add_end_token='</s>')


def create_name(config, q_config):
    s = str(config)
    if q_config is not None:
        s += '_with_' + run_lstmlm.create_name(q_config)
    return s


def main(_):
    config = trfnce.Config(data)
    config.structure_type = 'mix'
    config.embedding_dim = 128
def main(_):
    data = reader.Data().load_raw_data(corpus.char_raw_dir(),
                                       add_beg_token='<s>',
                                       add_end_token='</s>',
                                       add_unknwon_token=None,
                                       max_length=1000)

    nbest = reader.NBest(*reader.wsj0_nbest())
    print(nbest.wer())

    config = trf.trfbase.Config(data)
    config.embedding_dim = 12
    config.cnn_filters = [(i, 12) for i in range(1, 11)]
    config.cnn_layers = 3
    config.cnn_hidden = 12
    config.cnn_shared_over_layers = False
    config.cnn_residual = True
    config.cnn_skip_connection = True
    config.max_epoch = 1000
    config.sample_sub = 100
    config.jump_width = 10
    config.init_weight = 0.1
    config.opt_method = 'adam'
    config.lr_cnn = trf.trfbase.LearningRateTime(1, 1.5, tc=1e4)
    config.lr_zeta = trf.trfbase.LearningRateTime(1.0, 0.2)
    config.load_embedding_path = './embedding/ptb_{}x{}.emb'.format(config.vocab_size, config.embedding_dim)
    config.auxiliary_hidden = 12
    config.auxiliary_lr = 1.0

    name = create_name(config)
    logdir = name
    wb.mkdir(logdir, is_recreate=True)
    sys.stdout = wb.std_log(logdir + '/trf.log')
    print(logdir)
    config.pprint()

    # prepare embedding
    if (wb.is_linux() and config.load_embedding_path is not None) or \
            (config.feat_type_file and config.feat_cluster > 0):
        if config.load_embedding_path is None:
            fvectors = './embedding/ptb_{}x{}.emb'.format(config.vocab_size, config.embedding_dim)
        else:
            fvectors = config.load_embedding_path
        data.word2vec(fvectors, dim=config.embedding_dim, cnum=config.feat_cluster)
    else:
        config.load_embedding_path = None

    # write data
    data.write_vocab(logdir + '/vocab.txt')
    data.write_data(data.datas[0], logdir + '/train.id')
    data.write_data(data.datas[1], logdir + '/valid.id')
    data.write_data(data.datas[2], logdir + '/test.id')

    nbest_char_txt = logdir + '/nbest.char.txt'
    corpus.word_text_to_char_text(reader.wsj0_nbest()[0], nbest_char_txt, is_nbest=True)
    nbest_list = data.load_data(nbest_char_txt, is_nbest=False)
    data.write_data(nbest_list, logdir + '/nbest.id')

    with tf.Graph().as_default():
        m = trf.TRF(config, data, logdir=logdir,
                    device='/gpu:2', simulater_device='/gpu:1')

        sv = tf.train.Supervisor(logdir=logdir + '/logs',
                                 summary_op=None,
                                 global_step=m._global_step)
        # sv.summary_writer.add_graph(tf.get_default_graph())  # write the graph to logs
        session_config = tf.ConfigProto(allow_soft_placement=True,
                                        log_device_placement=False)
        session_config.gpu_options.allow_growth = True
        with sv.managed_session(config=session_config) as session:
            # s = ['it was not black monday', 'we did n\'t even get a chance']
            # eval_list = data.load_data([[data.beg_token_str] + w.split() + [data.end_token_str] for w in s])
            # print(eval_list)

            # import sampling as sp
            # x_batch = [x for x in sp.SeqIter(3, config.vocab_size,
            #                                  beg_token=config.beg_token,
            #                                  end_token=config.end_token)]
            # logprobs = m.get_log_probs(x_batch, False)
            # logz = sp.log_sum(logprobs)
            # print(logprobs)
            # print(logz)

            m.train(session, sv,
                    print_per_epoch=0.1,
                    nbest=nbest,
                    nbest_list=nbest_list)
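# `corpus.word_text_to_char_text` above converts the word-level n-best file
# into a character-level one before rescoring with the character TRF. A sketch
# of the assumed conversion (hypothetical implementation; the real one lives
# in `corpus`): each word is split into characters, and with is_nbest=True
# the leading utterance label of every line is kept intact.
def word_text_to_char_text_sketch(fread, fwrite, is_nbest=False):
    with open(fread) as fin, open(fwrite, 'w') as fout:
        for line in fin:
            words = line.split()
            label, body = (words[:1], words[1:]) if is_nbest else ([], words)
            chars = [c for w in body for c in w]
            fout.write(' '.join(label + chars) + '\n')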