def exec_(param):
    '''Initialize the vocabulary for the specified problem, then train and/or evaluate the QA model.'''
    write_log("--------------------------------------------------------\n")

    global time_point
    time_point = time.perf_counter()  # time.clock() was removed in Python 3.8
    # common config and user control parameter loading
    qa_data_mode = param['data_mode'].qa_data
    wdim = cfg.ModelConfig.WORD_DIM_DICT[qa_data_mode]
    train_wv = param["preprocess_mode"].train_wv
    clean_corpus = param["preprocess_mode"].clean_corpus
    corpus_mode = cfg.PreProcessConfig.CORPUS_MODE
    ling_unit = cfg.PreProcessConfig.LING_UNIT  # linguistic unit mode
    assert ling_unit in ("WORD", "CHAR")
    s_w_rmvl = cfg.PreProcessConfig.STOP_WORD_REMOVAL  # stop-word removal mode

    train_set = param['model_mode'].train_set
    eval_set = param['model_mode'].eval_set
    use_saved_4_training = param['model_mode'].use_saved_4_training
    use_saved_4_testing = param['model_mode'].use_saved_4_testing

    wv_path, qa_data_path_t, qa_data_path_e, model_weight_path, score_path = generate_model_paths(
        qa_data_mode, ling_unit, s_w_rmvl, train_set, eval_set)

    # initialize a vocab instance
    vocab = du.Vocab(wv_path)
    if train_wv:  # train new vectors using corpus
        # clean the corpus text
        corpus_path = cfg.DirConfig.CORPUS_PATH_DICT[corpus_mode]
        clean_corpus_path = cfg.DirConfig.CLEAN_CORPUS_PATH_DICT[corpus_mode]

        if clean_corpus:
            print("---starting to clean corpus text at running time %f---" %
                  (time.clock() - time_point))
            write_log("started to clean corpus text at running time %f\n" %
                      (time.clock() - time_point))
            text_cleaner = du.TextCleaner(corpus_path)

            text_cleaner.clean_chn_corpus_2file(clean_corpus_path)

        # copy cleaned text to word2vec directory
        clean_corpus_filename = cfg.DirConfig.CLEAN_CORPUS_FILENAME_DICT[
            corpus_mode]
        dst_corpus_path = cfg.DirConfig.W2V_DIR + clean_corpus_filename
        copyfile(clean_corpus_path, dst_corpus_path)

        print("---starting to build vocab database at running time %f---" %
              (time.clock() - time_point))
        write_log("started to build vocab database at running time %f\n" %
                  (time.clock() - time_point))
        vocab.build_vocab_database(qa_data_mode, clean_corpus_filename)
    else:
        print("---starting to load vocab from database at running time %f---" %
              (time.clock() - time_point))
        write_log("started to load vocab from database at running time %f\n" %
                  (time.clock() - time_point))
        vocab.load_wv_from_db(qa_data_mode)
    # Initialize the data stream for the model and start training.
    # (canceled) The data stream used to initiate unknown-word mapping,
    # (canceled) so beware of deadlocks in database operations.
    if train_set != 'NAH' and eval_set != 'NAH':  # run one evaluation before training as a baseline
        print("evaluation before training:")
        write_log("evaluation before training:\n")
        print("---starting to prepare evaluation data stream at elapsed time %f---"
              % (time.perf_counter() - time_point))
        write_log("started to prepare evaluation data stream at elapsed time %f\n" %
                  (time.perf_counter() - time_point))

        predicted_score_lst = []

        data_stream = du.SentenceDataStream(qa_data_path_e, vocab,
                                            (qa_data_mode, 'e'))
        print("---starting to initialize model for evaluation at elapsed time %f---"
              % (time.perf_counter() - time_point))
        write_log("started to initialize model for evaluation at elapsed time %f\n" %
                  (time.perf_counter() - time_point))
        model_graph = ConvQAModelGraph(wdim)
        model_in = model_graph.get_model_inputs()
        model_out = model_graph.get_model_outputs()
        my_model = Model(inputs=model_in, outputs=model_out)
        loss_func = cfg.ModelConfig.LOSS_FUNC
        optm = cfg.ModelConfig.OPT
        my_model.compile(optimizer=optm, loss=loss_func, metrics=['accuracy'])
        if use_saved_4_training:  # consistent with the weight-loading choice used for training
            try:
                my_model.load_weights(model_weight_path)
            except Exception as e:
                print("%s" % e)
                write_log("%s" % e)
        # some hyper-parameters
        batch_size = data_stream.get_batch_size()
        for q_batch, a_batch, add_feat_batch in data_stream.get_batch():
            x = [q_batch, a_batch, add_feat_batch]
            predicted_batch = list(my_model.predict(x, batch_size=batch_size))
            predicted_score_lst.extend(predicted_batch)
        with open(score_path, 'wb') as score_file:
            for sc in predicted_score_lst:
                score_file.write((str(sc[0]) + '\n').encode('utf-8'))

        res = eval_in_model(qa_data_path_e, score_path, '')
        write_log(res + '\n')
        write_log("before-training evaluation ends\n")
        write_log("----------------------------\n")

    if train_set != 'NAH':  # training
        print("---starting to prepare training data stream at elapsed time %f---"
              % (time.perf_counter() - time_point))
        write_log("started to prepare training data stream at elapsed time %f\n" %
                  (time.perf_counter() - time_point))
        data_stream = du.SentenceDataStream(qa_data_path_t, vocab,
                                            (qa_data_mode, 't'))

        print("---starting to initialize model for training at elapsed time %f---"
              % (time.perf_counter() - time_point))
        write_log("started to initialize model for training at elapsed time %f\n" %
                  (time.perf_counter() - time_point))
        model_graph = ConvQAModelGraph(wdim)
        model_in = model_graph.get_model_inputs()
        model_out = model_graph.get_model_outputs()
        my_model = Model(inputs=model_in, outputs=model_out)

        # some hyper-parameters
        batch_size = data_stream.get_batch_size()
        train_epoch = cfg.ModelConfig.TRAIN_EPOCH
        loss_func = cfg.ModelConfig.LOSS_FUNC
        optm = cfg.ModelConfig.OPT
        # model initialization
        my_model.compile(optimizer=optm, loss=loss_func, metrics=['accuracy'])
        if use_saved_4_training:
            try:
                my_model.load_weights(model_weight_path)
            except Exception as e:
                print("%s" % e)
                write_log("%s" % e)

        # start training
        print("---starting to feed model at running time %f---" %
              (time.clock() - time_point))
        write_log("started to feed model at running time %f\n" %
                  (time.clock() - time_point))
        for _ in range(train_epoch):
            for q_batch, a_batch, label_batch, add_feat_batch in data_stream.get_batch():
                x = [q_batch, a_batch, add_feat_batch]
                y = [label_batch]
                my_model.fit(x, y, batch_size=batch_size)
        my_model.save_weights(model_weight_path)

        print("unknown words: %d, known words: %d" %
              (vocab._unk_num, len(vocab.kn_set)))
        write_log("In %s set, %d known words and %d unknown words found.\n" %
                  (train_set, len(vocab.kn_set), vocab._unk_num))

    if eval_set != 'NAH':  # evaluation
        print("---starting to prepare evaluation data stream at elapsed time %f---"
              % (time.perf_counter() - time_point))
        write_log("started to prepare evaluation data stream at elapsed time %f\n" %
                  (time.perf_counter() - time_point))

        predicted_score_lst = []

        data_stream = du.SentenceDataStream(qa_data_path_e, vocab,
                                            (qa_data_mode, 'e'))
        print("---starting to initialize model for evaluation at elapsed time %f---"
              % (time.perf_counter() - time_point))
        write_log("started to initialize model for evaluation at elapsed time %f\n" %
                  (time.perf_counter() - time_point))
        model_graph = ConvQAModelGraph(wdim)
        model_in = model_graph.get_model_inputs()
        model_out = model_graph.get_model_outputs()
        my_model = Model(inputs=model_in, outputs=model_out)
        loss_func = cfg.ModelConfig.LOSS_FUNC
        optm = cfg.ModelConfig.OPT
        my_model.compile(optimizer=optm, loss=loss_func, metrics=['accuracy'])
        if use_saved_4_testing:
            try:
                my_model.load_weights(model_weight_path)
            except Exception as e:
                print("%s" % e)
                write_log("%s" % e)
        # some hyper-parameters
        batch_size = data_stream.get_batch_size()
        for q_batch, a_batch, add_feat_batch in data_stream.get_batch():
            x = [q_batch, a_batch, add_feat_batch]
            predicted_batch = list(my_model.predict(x, batch_size=batch_size))
            predicted_score_lst.extend(predicted_batch)
        with open(score_path, 'wb') as score_file:
            for sc in predicted_score_lst:
                score_file.write((str(sc[0]) + '\n').encode('utf-8'))

        print("unknown words: %d, known words: %d" %
              (vocab._unk_num, len(vocab.kn_set)))
        write_log("In %s set, %d known words and %d unknown words found.\n" %
                  (eval_set, len(vocab.kn_set), vocab._unk_num))

        res = eval_in_model(qa_data_path_e, score_path, '')
        write_log(res + '\n')

    write_log("Finished at time %f.\n" % (time.clock() - time_point))
    write_log("--------------------------------------------------------\n\n")
    time_point = time.clock()
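
For reference, here is a minimal sketch of how exec_ might be driven. The dictionary keys and attribute names mirror the lookups inside the function, but the SimpleNamespace stand-ins and the concrete mode values are assumptions, not part of the original project.

from types import SimpleNamespace

# Hypothetical driver: attribute names follow the accesses inside exec_;
# all concrete values below are placeholder assumptions.
param = {
    'data_mode': SimpleNamespace(qa_data='NLPCC_DBQA'),
    'preprocess_mode': SimpleNamespace(train_wv=False, clean_corpus=False),
    'model_mode': SimpleNamespace(train_set='TRAIN', eval_set='EVAL',
                                  use_saved_4_training=True,
                                  use_saved_4_testing=True),
}
exec_(param)
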
Example #2
import tensorflow as tf
import numpy as np
import data_util
import os
from text_cnn import TextCNN
from config import Config
import csv

test_data_file = "data/test/tokenized_reviews.txt"
test_label_file = "data/test/labels.txt"
vocab_file = "data/vocab"
checkpoint_dir = "./save/checkpoints/cnn"
result_file = "./data/cnn_result.csv"
checkpoint_prefix = os.path.join(checkpoint_dir, "cnn")
max_vocab_size = int(5e5)  # keep as int: a float vocab size would break shape arguments downstream
vocab = data_util.Vocab(vocab_file, max_vocab_size)
# load test data
test_docs, seq_len, max_len, test_labels = data_util.load_data(
    test_data_file, test_label_file, vocab)
config = Config(max_vocab_size, max_len)
model = TextCNN(config)
model.build()
sess_config = tf.ConfigProto()
sess_config.gpu_options.allow_growth = True
sess_config.gpu_options.per_process_gpu_memory_fraction = 0.9
sess = tf.Session(config=sess_config)
init = tf.global_variables_initializer()
sess.run(init)
ckpt = tf.train.get_checkpoint_state(checkpoint_dir)
if ckpt and ckpt.model_checkpoint_path:
    model.restore(sess, ckpt.model_checkpoint_path)
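
The snippet imports csv and defines result_file but stops after restoring the checkpoint. Below is a minimal sketch of the missing inference step; the tensor attributes on model (input_x, dropout_keep_prob, predictions) are assumptions about the TextCNN class, not confirmed by the original code.

# Hypothetical inference pass: the attribute names on `model` are assumptions.
feed = {model.input_x: test_docs, model.dropout_keep_prob: 1.0}
preds = sess.run(model.predictions, feed_dict=feed)

# Write one row per test document to the result CSV.
with open(result_file, "w", newline="") as f:
    writer = csv.writer(f)
    writer.writerow(["id", "prediction"])
    for i, p in enumerate(preds):
        writer.writerow([i, int(p)])
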
Example #3

import os
import tensorflow as tf
import data_util
# NOTE: the import path for DenseQuasiGRU is assumed; adjust it to the project layout.
from dense_quasi_gru import DenseQuasiGRU

dropout = 1.0
zoneout = 0.0
filter_width = 3
embedding_size = 300
num_layers = 1
summary_len = 100
attention_hidden_size = 100
beam_depth = 5
state_size = 120
max_vocab_size = int(5e5)  # assumed value; not defined in the original snippet
max_num_tokens = 400  # assumed value; not defined in the original snippet
mode = "test"
doc_file = "data/test_article.txt"
sum_file = "data/test_abstract.txt"
vocab_file = "data/vocab"
checkpoint_dir = "./save/baseline/checkpoints"
checkpoint_prefix = os.path.join(checkpoint_dir, "baseline")
vocab = data_util.Vocab(vocab_file, max_vocab_size)
docs = data_util.load_test_data(doc_file, vocab, max_num_tokens)
summary_file = "result/summaries.txt"
with tf.Graph().as_default():
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    config.gpu_options.per_process_gpu_memory_fraction = 0.9
    sess = tf.Session(config=config)
    log_writer = tf.summary.FileWriter(checkpoint_dir, graph=sess.graph)
    model = DenseQuasiGRU(vocab_size=max_vocab_size,
                          embedding_size=embedding_size,
                          num_layers=num_layers,
                          state_size=state_size,
                          decoder_vocab_size=max_vocab_size,
                          filter_width=filter_width,
                          zoneout=zoneout,