def main(_):
    data = reader.Data().load_raw_data(reader.ptb_raw_dir(),
                                       add_beg_token=None,
                                       add_end_token='</s>',
                                       add_unknwon_token='<unk>')  # sic: keyword spelling follows the reader API
    nbest = reader.NBest(*reader.wsj0_nbest())
    nbest_list = data.load_data(reader.wsj0_nbest()[0], is_nbest=True)

    config = small_config(data)
    # config = medium_config(data)
    # config = large_config(data)

    work_dir = './lstm/' + create_name(config)
    wb.mkdir(work_dir, is_recreate=True)
    sys.stdout = wb.std_log(os.path.join(work_dir, 'lstm.log'))
    print(work_dir)
    wb.pprint_dict(config.__dict__)

    data.write_vocab(work_dir + '/vocab.txt')
    data.write_data(data.datas[0], work_dir + '/train.id')
    data.write_data(data.datas[1], work_dir + '/valid.id')
    data.write_data(data.datas[2], work_dir + '/test.id')
    data.write_data(nbest_list, work_dir + '/nbest.id')

    write_model = os.path.join(work_dir, 'model.ckpt')

    with tf.Graph().as_default():
        # lm = lstmlm.FastLM(config, device_list=['/gpu:0', '/gpu:0'])
        lm = blocklm.LM(config, device='/gpu:0')
        param_num = tf.add_n([tf.size(v) for v in tf.trainable_variables()])

        for v in lm.train_net.variables:
            print(v.name)

        save = tf.train.Saver()

        # used to write ppl on valid/test set
        summ_bank = blocklm.layers.SummaryScalarBank(['ppl_valid', 'ppl_test'])
        summ_var = blocklm.layers.SummaryVariables()

        sv = tf.train.Supervisor(logdir=os.path.join(work_dir, 'logs'),
                                 summary_op=None,
                                 global_step=lm.global_step())
        sv.summary_writer.add_graph(
            tf.get_default_graph())  # write the graph to logs
        session_config = tf.ConfigProto(allow_soft_placement=True,
                                        log_device_placement=False)
        session_config.gpu_options.allow_growth = True
        with sv.managed_session(config=session_config) as session:

            print('param_num={:,}'.format(session.run(param_num)))

            lm.train(sv, session, data.datas[0], data.datas[1], data.datas[2])

            save.save(session, write_model)
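
Every script in this listing uses the same TF1 session scaffolding: build the graph, hand the log directory to a tf.train.Supervisor, and let managed_session handle variable initialization and checkpoint recovery. A minimal, self-contained sketch of just that pattern (assuming TensorFlow 1.x; tf.train.Supervisor does not exist in TF 2.x, and the logdir below is a hypothetical path):

import tensorflow as tf

with tf.Graph().as_default():
    step = tf.train.get_or_create_global_step()
    inc = tf.assign_add(step, 1)

    sv = tf.train.Supervisor(logdir='/tmp/sv_demo/logs',  # hypothetical path
                             summary_op=None,  # disable the automatic summary thread
                             global_step=step)
    session_config = tf.ConfigProto(allow_soft_placement=True)
    session_config.gpu_options.allow_growth = True  # allocate GPU memory on demand
    with sv.managed_session(config=session_config) as session:
        # variables are initialized, or restored from logdir, automatically
        print(session.run(inc))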
# constructor of the Ops operation hook handed to m.train(..., operation=ops)
# below; the base class, which stores the model as self.m, is not shown here
def __init__(self, trf_model):
    super().__init__(trf_model)
    self.wer_next_epoch = 0
    self.wer_per_epoch = 0.2  # WER-evaluation interval in epochs (presumably)
    self.opt_det_wer = 100    # best (lowest) detailed WER seen so far
    self.opt_txt_wer = 100    # best (lowest) text WER seen so far
    self.write_models = wb.mkdir(os.path.join(self.m.logdir, 'wer_models'))
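
The base class of this Ops hook is not shown in this listing; judging from m.train(..., operation=ops) below, the trainer invokes the hook periodically during training. Purely as an illustration of what the fields above are for (the run method name, compute_wer helper, and save call are assumptions, not the repo's API):

# hypothetical per-epoch callback: rescore every wer_per_epoch epochs and
# keep the best-WER model under write_models
def run(self, step, epoch):
    if epoch >= self.wer_next_epoch:
        self.wer_next_epoch += self.wer_per_epoch
        det_wer, txt_wer = self.compute_wer()  # assumed helper
        if txt_wer < self.opt_txt_wer:
            self.opt_det_wer = det_wer
            self.opt_txt_wer = txt_wer
            self.m.save(os.path.join(self.write_models, 'best'))  # assumed save API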
def main(_):

    config = get_config()
    q_config = run_lstmlm_withBegToken.small_config(data)
    name = create_name(config, q_config)
    logdir = wb.mkdir('./trf_nn/' + name, is_recreate=True)
    sys.stdout = wb.std_log(logdir + '/trf.log')
    config.pprint()
    print(logdir)

    # write data
    data.write_vocab(logdir + '/vocab.txt')
    data.write_data(data.datas[0], logdir + '/train.id')
    data.write_data(data.datas[1], logdir + '/valid.id')
    data.write_data(data.datas[2], logdir + '/test.id')

    m = trf.TRF(config, data, logdir=logdir, device='/gpu:0', simulater_device='/gpu:0',
                q_model=lstmlm.LM(q_config, device='/gpu:0'))  # reuse q_config from above
    ops = Ops(m)

    sv = tf.train.Supervisor(logdir=logdir + '/logs', summary_op=None, global_step=m._global_step)
    # sv.summary_writer.add_graph(tf.get_default_graph())  # write the graph to logs
    session_config = tf.ConfigProto(allow_soft_placement=True, log_device_placement=False)
    session_config.gpu_options.allow_growth = True
    with sv.managed_session(config=session_config) as session:
        m.set_session(session)

        print('load lstmlm q(x)')
        m.q_model.restore(session,
                          './lstm/' + run_lstmlm_withBegToken.create_name(m.q_model.config) + '/model.ckpt')

        m.train(sv, session,
                print_per_epoch=0.05,
                operation=ops,
                model_per_epoch=None)
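
Note how the q-model checkpoint is located purely by naming convention: each LSTM run saves under './lstm/' + create_name(config), so any later script can rebuild the path from the config alone. create_name itself is not part of this listing; the literal checkpoint string 'lstm/lstm_e200_h200x2/model.ckpt' in Example #12 suggests it is roughly the following (a guess, with the config field names assumed):

def create_name(config):
    # e.g. embedding 200, two hidden layers of 200 units -> 'lstm_e200_h200x2'
    return 'lstm_e{}_h{}x{}'.format(config.embedding_dim,
                                    config.hidden_size,
                                    config.hidden_layers)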
Example #4
def main(_):

    config = get_config()
    name = create_name(config)
    logdir = wb.mkdir('./trf_nn_char/' + name, is_recreate=False)
    sys.stdout = wb.std_log(logdir + '/trf.log')
    config.pprint()
    print(logdir)

    # word-embedding
    # config.config_trf.load_embedding_path = os.path.join(logdir, 'word_emb.txt')
    # config.config_trf.update_embedding = False
    # data.word2vec(config.config_trf.load_embedding_path, config.config_trf.embedding_dim, 0)

    # write data
    data.write_vocab(logdir + '/vocab.txt')
    data.write_char_vocab(logdir + '/vocab_char.txt',
                          logdir + '/vocab_w2c.txt')
    data.write_data(data.datas[0], logdir + '/train.id')
    data.write_data(data.datas[1], logdir + '/valid.id')
    data.write_data(data.datas[2], logdir + '/test.id')
    data.write_data(nbest_list, logdir + '/nbest.id')

    with tf.Graph().as_default():
        m = trf.TRF(config, data, logdir=logdir, device=['/gpu:0'])

        sv = tf.train.Supervisor(logdir=logdir + '/logs',
                                 summary_op=None,
                                 global_step=m.global_steps)
        # sv.summary_writer.add_graph(tf.get_default_graph())  # write the graph to logs
        session_config = tf.ConfigProto(allow_soft_placement=True,
                                        log_device_placement=False)
        session_config.gpu_options.allow_growth = True
        with sv.managed_session(config=session_config) as session:
            m.set_session(session)

            # print(m.get_log_probs(data.datas[1][1]))

            # train_seqs = data.datas[0][0: config.config_trf.train_batch_size]
            # sample_seqs = data.datas[0][0: config.config_trf.sample_batch_size]
            # inputs, lengths = reader.produce_data_to_trf(train_seqs + sample_seqs)
            # m.net_trf.run_train(session, inputs, lengths, len(train_seqs))

            # m.pre_train(sv, session, batch_size=20, max_epoch=10, lr=1e-3)

            m.train(sv,
                    session,
                    print_per_epoch=0.1,
                    nbest=nbest,
                    nbest_list=nbest_list)
Example #5
def main(_):

    # print(data.datas[0][0: 10])
    # print(data.get_max_len())
    # return

    config = get_config()
    name = create_name(config)
    logdir = wb.mkdir('./trf_cnn_new/' + name, is_recreate=True)
    sys.stdout = wb.std_log(logdir + '/trf.log')
    config.pprint()
    print(logdir)

    # word-embedding
    # config.config_trf.load_embedding_path = os.path.join(logdir, 'word_emb.txt')
    # config.config_trf.update_embedding = False
    # data.word2vec(config.config_trf.load_embedding_path, config.config_trf.embedding_dim, 0)

    # write data
    data.write_vocab(logdir + '/vocab.txt')
    data.write_data(data.datas[0], logdir + '/train.id')
    data.write_data(data.datas[1], logdir + '/valid.id')
    data.write_data(data.datas[2], logdir + '/test.id')
    data.write_data(nbest_list, logdir + '/nbest.id')

    with tf.Graph().as_default():
        m = trf.TRF(config, data, logdir=logdir, device='/gpu:0')

        sv = tf.train.Supervisor(logdir=logdir + '/logs',
                                 summary_op=None,
                                 global_step=m.global_steps)
        # sv.summary_writer.add_graph(tf.get_default_graph())  # write the graph to logs
        session_config = tf.ConfigProto(allow_soft_placement=True,
                                        log_device_placement=False)
        session_config.gpu_options.allow_growth = True
        with sv.managed_session(config=session_config) as session:
            m.set_session(session)

            # m.pre_train(sv, session, batch_size=100, max_epoch=3, lr=1.)

            m.train(sv,
                    session,
                    print_per_epoch=0.1,
                    nbest=nbest,
                    nbest_list=nbest_list)
Example #6
def main(_):
    config = get_config()
    # config.auxiliary_shortlist = [4000, config.vocab_size]
    # config.sample_sub = 100
    # config.multiple_trial = 10
    name = create_name(config)
    logdir = wb.mkdir('./trf_rnn/' + name, is_recreate=True)
    sys.stdout = wb.std_log(logdir + '/trf.log')
    config.pprint()
    print(logdir)

    # write data
    data.write_vocab(logdir + '/vocab.txt')
    data.write_data(data.datas[0], logdir + '/train.id')
    data.write_data(data.datas[1], logdir + '/valid.id')
    data.write_data(data.datas[2], logdir + '/test.id')
    data.write_data(nbest_list, logdir + '/nbest.id')

    with tf.Graph().as_default():
        m = trf.TRF(config,
                    data,
                    logdir=logdir,
                    device='/gpu:0',
                    simulater_device='/gpu:0')

        sv = tf.train.Supervisor(logdir=logdir + '/logs',
                                 summary_op=None,
                                 global_step=m._global_step)
        # sv.summary_writer.add_graph(tf.get_default_graph())  # write the graph to logs
        session_config = tf.ConfigProto(allow_soft_placement=True,
                                        log_device_placement=False)
        session_config.gpu_options.allow_growth = True
        with sv.managed_session(config=session_config) as session:
            m.set_session(session)

            m.train(sv,
                    session,
                    print_per_epoch=0.1,
                    nbest=nbest,
                    nbest_list=nbest_list)
Example #7
def main(_):

    config = get_config()
    name = create_name(config)
    logdir = wb.mkdir('./trf_nn_char/' + name, is_recreate=True)
    sys.stdout = wb.std_log(logdir + '/trf.log')
    config.pprint()
    print(logdir)

    # write data
    data.write_vocab(logdir + '/vocab.txt')
    data.write_char_vocab(logdir + '/vocab_char.txt',
                          logdir + '/vocab_w2c.txt')
    data.write_data(data.datas[0], logdir + '/train.id')
    data.write_data(data.datas[1], logdir + '/valid.id')
    data.write_data(data.datas[2], logdir + '/test.id')
    data.write_data(nbest_list, logdir + '/nbest.id')

    with tf.Graph().as_default():
        m = trf.TRF(config, data, logdir=logdir, device=['/gpu:0'])

        sv = tf.train.Supervisor(logdir=logdir + '/logs',
                                 summary_op=None,
                                 global_step=m.global_steps)
        # sv.summary_writer.add_graph(tf.get_default_graph())  # write the graph to logs
        session_config = tf.ConfigProto(allow_soft_placement=True,
                                        log_device_placement=False)
        session_config.gpu_options.allow_growth = True
        with sv.managed_session(config=session_config) as session:
            m.set_session(session)

            # print(m.get_log_probs(data.datas[1][1]))

            m.train(sv,
                    session,
                    print_per_epoch=0.1,
                    nbest=nbest,
                    nbest_list=nbest_list)
Example #8
def main():
    nbest_cmp = task.NBestComputer()
    # note: the valid set is passed twice, doubling as the test split
    data = reader.Data().load_raw_data([task.train, task.valid, task.valid],
                                       add_beg_token='<s>',
                                       add_end_token='</s>')

    config = ngramlm.Config(data)
    config.res_file = 'results.txt'

    order_reg = [4, 5, 6]
    for order in order_reg:
        config.order = order
        config.cutoff = [0] * order

        workdir = wb.mkdir('ngramlm/' + str(config), is_recreate=False)
        sys.stdout = wb.std_log(workdir + '/ngram.log')
        print(workdir)

        m = ngramlm.Model(config, data, bindir, workdir)

        # train
        print('training...')
        m.train()

        # rescore
        print('rescoring...')
        time_beg = time.time()
        for nbest in nbest_cmp.nbests:
            nbest.lmscore = m.rescore(nbest.get_nbest_list(data))
            # print(len(nbest.lmscore))
        print('rescore time={:.2f}m'.format((time.time() - time_beg) / 60))
        nbest_cmp.write_lmscore(workdir + '/model')

        # tune lm-scale
        print('computing wer...')
        nbest_cmp.cmp_wer()
        nbest_cmp.write_to_res(config.res_file, str(config))
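
cmp_wer() above tunes the language-model scale on the n-best lists. The standard recipe behind such a step: combine each hypothesis's acoustic and LM scores at a given scale, pick the best hypothesis per utterance, and sweep the scale for the lowest WER. A generic sketch (field names and the wer_fn callback are illustrative, not the NBestComputer API; the 1..20 grid mirrors the lmscale_vec seen in Example #10):

import numpy as np

def select_hyps(am_scores, lm_scores, lmscale):
    # am_scores, lm_scores: one array of per-hypothesis scores per utterance;
    # a lower combined score is taken as better here
    return [int(np.argmin(am + lmscale * lm))
            for am, lm in zip(am_scores, lm_scores)]

def tune_lmscale(am_scores, lm_scores, wer_fn, lmscale_vec=np.linspace(1, 20, 20)):
    # wer_fn maps the chosen hypothesis indices to a WER value
    return min(lmscale_vec,
               key=lambda s: wer_fn(select_hyps(am_scores, lm_scores, s)))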
Example #9
def main(_):
    data = reader.Data().load_raw_data(reader.ptb_raw_dir(),
                                       add_beg_token='<s>',
                                       add_end_token='</s>',
                                       add_unknwon_token='<unk>')
    nbest = reader.NBest(*reader.wsj0_nbest())
    nbest_list = data.load_data(nbest.nbest, is_nbest=True)
    print('nbest list info=', wb.TxtInfo(nbest.nbest))

    config = trfnce.Config(data)
    config.structure_type = 'rnn'
    config.embedding_dim = 200
    config.rnn_hidden_layers = 2
    config.rnn_hidden_size = 200
    config.batch_size = 20
    config.noise_factor = 100
    config.noise_sampler = 2  # n-gram order of the noise sampler (cf. NoiseSamplerNgram in Example #12)
    config.init_weight = 0.1
    config.lr_param = trfbase.LearningRateTime(1e-3)
    config.max_epoch = 100
    # config.dropout = 0.75
    # config.init_zeta = config.get_initial_logz(20)
    config.update_zeta = False
    config.write_dbg = False
    config.pprint()

    name = create_name(config)
    logdir = 'trf_nce/' + name
    wb.mkdir(logdir, is_recreate=True)
    sys.stdout = wb.std_log(os.path.join(logdir, 'trf.log'))
    print(logdir)

    data.write_vocab(logdir + '/vocab.txt')
    data.write_data(data.datas[1], logdir + '/valid.id')
    data.write_data(data.datas[2], logdir + '/test.id')
    data.write_data(nbest_list, logdir + '/nbest.id')

    # wb.rmdir(logdirs)
    with tf.Graph().as_default():
        m = trfnce.TRF(config, data, logdir=logdir, device='/gpu:0')
        # noise_lstm = lstmlm.LM(run_lstmlm_withBegToken.small_config(data), device='/gpu:1')
        # m.lstm = noise_lstm

        sv = tf.train.Supervisor(logdir=os.path.join(logdir, 'logs'),
                                 global_step=m.train_net.global_step)
        sv.summary_writer.add_graph(
            tf.get_default_graph())  # write the graph to logs
        session_config = tf.ConfigProto(allow_soft_placement=True,
                                        log_device_placement=False)
        session_config.gpu_options.allow_growth = True
        with sv.managed_session(config=session_config) as session:
            m.set_session(session)

            # print('load lstmlm for noise generator')
            # noise_lstm.restore(session,
            #                    './lstm/' + run_lstmlm_withBegToken.create_name(noise_lstm.config) + '/model.ckpt')

            m.train(sv,
                    session,
                    print_per_epoch=0.1,
                    nbest=nbest,
                    nbest_list=nbest_list)
Example #10
def main(_):
    config = trfnce.Config(data)

    config.structure_type = 'mix'
    config.embedding_dim = 128
    config.cnn_filters = [(i, 128) for i in range(1, 5)]
    config.cnn_hidden = 128
    config.cnn_layers = 1
    config.cnn_skip_connection = False
    config.cnn_residual = True
    config.cnn_activation = 'relu'
    config.rnn_hidden_layers = 1
    config.rnn_hidden_size = 128
    config.attention = True

    config.batch_size = 100
    config.noise_factor = 2
    config.noise_sampler = 2
    config.init_weight = 0.1
    config.optimize_method = ['sgd', 'sgd']
    config.lr_param = trfbase.LearningRateEpochDelay(1e-2, 0.5)
    config.lr_zeta = trfbase.LearningRateEpochDelay(1e-2, 0.5)
    config.max_epoch = 10
    # config.dropout = 0.75
    # config.init_zeta = config.get_initial_logz(0)
    config.update_zeta = True
    config.write_dbg = False
    config.pprint()

    q_config = run_lstmlm.small_config(data)
    # q_config = None

    name = create_name(config, q_config)
    logdir = 'trf_nce/' + name
    wb.mkdir(logdir, is_recreate=True)
    sys.stdout = wb.std_log(os.path.join(logdir, 'trf.log'))
    print(logdir)

    data.write_vocab(logdir + '/vocab.txt')
    data.write_data(data.datas[1], logdir + '/valid.id')
    data.write_data(data.datas[2], logdir + '/test.id')

    # wb.rmdir(logdirs)
    with tf.Graph().as_default():
        if q_config is None:
            m = trfnce.TRF(config, data, logdir=logdir, device='/gpu:0')
        else:
            m = trfnce.TRF(config,
                           data,
                           logdir=logdir,
                           device='/gpu:1',
                           q_model=lstmlm.LM(q_config, device='/gpu:1'))
        # noise_lstm = lstmlm.LM(run_lstmlm_withBegToken.small_config(data), device='/gpu:1')
        # m.lstm = noise_lstm

        sv = tf.train.Supervisor(logdir=os.path.join(logdir, 'logs'),
                                 global_step=m.train_net.global_step)
        sv.summary_writer.add_graph(
            tf.get_default_graph())  # write the graph to logs
        session_config = tf.ConfigProto(allow_soft_placement=True,
                                        log_device_placement=False)
        session_config.gpu_options.allow_growth = True
        with sv.managed_session(config=session_config) as session:
            m.set_session(session)

            if m.q_model is not None:
                print('load lstmlm for q model')
                m.q_model.restore(
                    session, './lstm/' + run_lstmlm.create_name(q_config) +
                    '/model.ckpt')

            m.train(
                sv,
                session,
                print_per_epoch=0.1,
                operation=task.Ops(m),
                # nbest=nbest,
                # lmscale_vec=np.linspace(1, 20, 20)
            )
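
The trf_nce runs train the TRF with noise-contrastive estimation: each data sentence is contrasted against noise_factor noise sentences drawn from the configured noise sampler. The repo's exact loss is not shown here; the textbook NCE objective it builds on, in the standard Gutmann-Hyvarinen form, is sketched below (numpy; s(x) is an unnormalized model log-score, log q(x) the noise log-probability, k = noise_factor):

import numpy as np

def nce_loss(s_data, logq_data, s_noise, logq_noise, k):
    # log-odds that a sentence came from data rather than the k-fold noise
    logit_data = s_data - (np.log(k) + logq_data)
    logit_noise = s_noise - (np.log(k) + logq_noise)
    # -log sigmoid(z) == logaddexp(0, -z), numerically stable
    loss_data = np.logaddexp(0, -logit_data).mean()   # data classified as data
    loss_noise = np.logaddexp(0, logit_noise).mean()  # noise classified as noise
    return loss_data + k * loss_noise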
Example #11
def main(_):
    data = reader.Data().load_raw_data(corpus.char_raw_dir(),
                                       add_beg_token='<s>',
                                       add_end_token='</s>',
                                       add_unknwon_token=None,
                                       max_length=1000)
    nbest = reader.NBest(*reader.wsj0_nbest())
    print(nbest.wer())

    config = trf.trfbase.Config(data)
    config.embedding_dim = 12
    config.cnn_filters = [(i, 12) for i in range(1, 11)]
    config.cnn_layers = 3
    config.cnn_hidden = 12
    config.cnn_shared_over_layers = False
    config.cnn_residual = True
    config.cnn_skip_connection = True
    config.max_epoch = 1000
    config.sample_sub = 100
    config.jump_width = 10
    config.init_weight = 0.1
    config.opt_method = 'adam'
    config.lr_cnn = trf.trfbase.LearningRateTime(1, 1.5, tc=1e4)
    config.lr_zeta = trf.trfbase.LearningRateTime(1.0, 0.2)
    config.load_embedding_path = './embedding/ptb_{}x{}.emb'.format(
        config.vocab_size, config.embedding_dim)
    config.auxiliary_hidden = 12
    config.auxiliary_lr = 1.0

    name = create_name(config)
    logdir = name
    wb.mkdir(logdir, is_recreate=True)
    sys.stdout = wb.std_log(logdir + '/trf.log')
    print(logdir)
    config.pprint()

    # prepare embedding
    # (parenthesized so the is_linux() guard covers both alternatives)
    if wb.is_linux() and (config.load_embedding_path is not None or
                          (config.feat_type_file and config.feat_cluster > 0)):
        if config.load_embedding_path is None:
            fvectors = './embedding/ptb_{}x{}.emb'.format(
                config.vocab_size, config.embedding_dim)
        else:
            fvectors = config.load_embedding_path
        data.word2vec(fvectors,
                      dim=config.embedding_dim,
                      cnum=config.feat_cluster)
    else:
        config.load_embedding_path = None

    # write data
    data.write_vocab(logdir + '/vocab.txt')
    data.write_data(data.datas[0], logdir + '/train.id')
    data.write_data(data.datas[1], logdir + '/valid.id')
    data.write_data(data.datas[2], logdir + '/test.id')

    nbest_char_txt = logdir + '/nbest.char.txt'
    corpus.word_text_to_char_text(reader.wsj0_nbest()[0],
                                  nbest_char_txt,
                                  is_nbest=True)
    nbest_list = data.load_data(nbest_char_txt, is_nbest=False)
    data.write_data(nbest_list, logdir + '/nbest.id')

    with tf.Graph().as_default():
        m = trf.TRF(config,
                    data,
                    logdir=logdir,
                    device='/gpu:2',
                    simulater_device='/gpu:1')

        sv = tf.train.Supervisor(logdir=logdir + '/logs',
                                 summary_op=None,
                                 global_step=m._global_step)
        # sv.summary_writer.add_graph(tf.get_default_graph())  # write the graph to logs

        session_config = tf.ConfigProto(allow_soft_placement=True,
                                        log_device_placement=False)
        session_config.gpu_options.allow_growth = True
        with sv.managed_session(config=session_config) as session:

            # s = ['it was not black monday', 'we did n\'t even get a chance']
            # eval_list = data.load_data([[data.beg_token_str] + w.split() + [data.end_token_str] for w in s])
            # print(eval_list)

            # import sampling as sp
            # x_batch = [x for x in sp.SeqIter(3, config.vocab_size,
            #                                  beg_token=config.beg_token,
            #                                  end_token=config.end_token)]
            # logprobs = m.get_log_probs(x_batch, False)
            # logz = sp.log_sum(logprobs)
            # print(logprobs)
            # print(logz)

            # note: (sv, session) argument order as in the other trf.TRF examples
            m.train(sv,
                    session,
                    print_per_epoch=0.1,
                    nbest=nbest,
                    nbest_list=nbest_list)
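
corpus.word_text_to_char_text above turns the word-level n-best transcriptions into character sequences so the character-level TRF can rescore them. The exact token scheme is not shown in this listing; a plausible conversion, with the word-boundary token name assumed, would be:

def word_line_to_char_line(line, space_token='<space>'):  # boundary token assumed
    # 'it was' -> 'i t <space> w a s'
    return (' ' + space_token + ' ').join(' '.join(w) for w in line.split())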
Example #12
def main(_):
    config = trfnce.Config(data)
    config.structure_type = 'cnn'
    config.embedding_dim = 200
    config.cnn_filters = [(i, 100) for i in range(1, 11)]
    config.cnn_width = 3
    config.cnn_layers = 3
    config.cnn_hidden = 200
    config.rnn_hidden_layers = 2
    config.rnn_hidden_size = 200
    config.rnn_predict = True
    config.batch_size = 10
    config.noise_factor = 10
    config.noise_sampler = 'lstm:lstm/lstm_e200_h200x2/model.ckpt'
    config.init_weight = 0.1
    config.optimize_method = ['adam', 'adam']
    config.lr_param = trfbase.LearningRateEpochDelay(0.001)
    config.lr_zeta = trfbase.LearningRateEpochDelay(0.01)
    config.max_epoch = 100
    # config.dropout = 0.75
    # config.init_zeta = config.get_initial_logz(20)
    config.update_zeta = True
    config.write_dbg = False
    config.pprint()

    # q_config = run_lstmlm.small_config(data)
    q_config = None

    name = create_name(config, q_config)
    logdir = 'trf_nce/' + name
    wb.mkdir(logdir, is_recreate=True)
    sys.stdout = wb.std_log(os.path.join(logdir, 'trf.log'))
    print(logdir)

    data.write_vocab(logdir + '/vocab.txt')
    data.write_data(data.datas[1], logdir + '/valid.id')
    data.write_data(data.datas[2], logdir + '/test.id')

    # wb.rmdir(logdirs)
    with tf.Graph().as_default():
        if q_config is None:
            m = trfnce.TRF(config, data, logdir=logdir, device='/gpu:0')
        else:
            m = trfnce.TRF(config,
                           data,
                           logdir=logdir,
                           device='/gpu:0',
                           q_model=lstmlm.LM(q_config, device='/gpu:0'))

        # s1 = trfnce.NoiseSamplerNgram(config, data, 2)
        # s2 = trfnce.NoiseSamplerLSTMEval(config, data, config.noise_sampler.split(':')[-1])

        sv = tf.train.Supervisor(logdir=os.path.join(logdir, 'logs'),
                                 global_step=m.train_net.global_step)
        sv.summary_writer.add_graph(
            tf.get_default_graph())  # write the graph to logs
        session_config = tf.ConfigProto(allow_soft_placement=True,
                                        log_device_placement=False)
        session_config.gpu_options.allow_growth = True
        with sv.managed_session(config=session_config) as session:
            with session.as_default():

                if m.q_model is not None:
                    print('load lstmlm for q model')
                    m.q_model.restore(
                        session, './lstm/' + run_lstmlm.create_name(q_config) +
                        '/model.ckpt')

                m.train(
                    sv,
                    session,
                    print_per_epoch=0.1,
                    operation=task.Ops(m),
                )
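
Here noise_sampler is a string of the form 'lstm:<checkpoint>', and the commented NoiseSamplerLSTMEval line shows that everything after the colon is the checkpoint to load (Example #9 used the integer 2 instead, matching NoiseSamplerNgram). A dispatcher in that spirit might look like this (a sketch assembled from the commented lines above, not the repo's actual factory):

def build_noise_sampler(config, data):
    ns = config.noise_sampler
    if isinstance(ns, int):
        # integer setting -> n-gram noise of that order
        return trfnce.NoiseSamplerNgram(config, data, ns)
    if isinstance(ns, str) and ns.startswith('lstm:'):
        # 'lstm:<ckpt>' -> LSTM noise model evaluated from the checkpoint
        return trfnce.NoiseSamplerLSTMEval(config, data, ns.split(':')[-1])
    raise ValueError('unknown noise_sampler: {!r}'.format(ns))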