Example 1
def decode():
    '''Decode the test file and write the generated summaries to FLAGS.test_output.'''
    # Load vocabularies.
    print(os.getcwd())
    doc_dict = data_util.load_dict(FLAGS.data_dir + "/doc_dict.txt")
    sum_dict = data_util.load_dict(FLAGS.data_dir + "/sum_dict.txt")
    if doc_dict is None or sum_dict is None:
        logging.warning("Dict not found.")
    print("Loading testing data")
    data = data_util.load_test_data(FLAGS.test_file, doc_dict)

    with tf.Session() as sess:
        # Create model and load parameters.
        logging.info("Creating %d layers of %d units." %
                     (FLAGS.num_layers, FLAGS.size))
        model = create_model(sess, True)

        result = []
        for idx, token_ids in enumerate(data):

            # Get a 1-element batch to feed the sentence to the model.
            encoder_inputs, decoder_inputs, encoder_len, decoder_len =\
                model.get_batch(
                    {0: [(token_ids, [data_util.ID_GO, data_util.ID_EOS])]}, 0)

            if FLAGS.batch_size == 1 and FLAGS.geneos:
                print('ran code')
                loss, outputs = model.step(sess, encoder_inputs,
                                           decoder_inputs, encoder_len,
                                           decoder_len, True)

                outputs = [np.argmax(item) for item in outputs[0]]

            else:
                outputs = model.step_beam(sess,
                                          encoder_inputs,
                                          encoder_len,
                                          geneos=FLAGS.geneos)

            # If there is an EOS symbol in outputs, cut them at that point.
            if data_util.ID_EOS in outputs:
                outputs = outputs[:outputs.index(data_util.ID_EOS)]
            gen_sum = " ".join(data_util.sen_map2tok(outputs, sum_dict[1]))
            print(gen_sum)
            gen_sum = data_util.sen_postprocess(gen_sum)
            print(gen_sum)
            result.append(gen_sum)
            logging.info("Finish {} samples. :: {}".format(idx, gen_sum[:75]))
        with open(FLAGS.test_output, "w") as f:
            for item in result:
                print(item, file=f)
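The decode() examples on this page (Examples 1, 4, 9, and 10) call data_util.load_dict and then use the second element of the returned pair (e.g. sum_dict[1]) to map ids back to tokens with sen_map2tok. As a rough orientation only, a loader compatible with that usage might look like the sketch below; the one-token-per-line file format and the max_vocab parameter are assumptions, not the actual data_util implementation.

# Hypothetical sketch of a load_dict compatible with the usage in Example 1.
# Assumes one token per line (optionally followed by a count) and returns
# (token->id dict, id->token list), or None when the file is missing.
import os


def load_dict(dict_path, max_vocab=None):
    if not os.path.exists(dict_path):
        return None
    id2tok = []
    with open(dict_path) as f:
        for line in f:
            parts = line.strip().split()
            if not parts:
                continue
            id2tok.append(parts[0])  # ignore any frequency column
            if max_vocab and len(id2tok) >= max_vocab:
                break
    tok2id = {tok: idx for idx, tok in enumerate(id2tok)}
    return tok2id, id2tok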
Example 2
def __init__(self, batch, train_kbest=None, train_gold=None, dev_kbest=None,
             dev_gold=None, test_kbest=None, test_gold=None, vocab_path=None):
    self.vocab = None
    self.train_kbest = train_kbest
    self.train_gold = train_gold
    self.dev_kbest = dev_kbest
    self.dev_gold = dev_gold
    self.batch = batch
    self.test_kbest = test_kbest
    self.test_gold = test_gold
    if os.path.exists(vocab_path):
        print 'load vocab'
        self.max_degree, self.vocab = data_util.load_dict(vocab_path)
    else:
        print 'create vocab'
        self.vocab = Vocab.Vocab(self.train_gold)
        print 'get max_degree'
        self.max_degree = self.get_max_degree()
        print 'save dictionary'
        data_util.save_dict(self.vocab, self.max_degree, vocab_path)
    print 'vocab size: ' + str(self.vocab.size())
    print 'max_degree: ' + str(self.max_degree)
    print 'get dev data'
    self.dev_data = dev_reader.read_dev(dev_kbest, dev_gold, self.vocab)
    print 'number of dev: ' + str(len(self.dev_data))
    # self.test_data = dev_reader.read_dev(test_kbest, test_gold, self.vocab)
    # print 'create train batch'
    # self.train_iter = train_iterator.train_iterator(train_kbest, train_gold, self.vocab, self.batch)
    print 'get train data'
    self.train_data = dev_reader.read_dev(train_kbest, train_gold, self.vocab)
    print 'number of train: ' + str(len(self.train_data))
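In this example (and in Examples 3, 5-7, and 11) data_util.load_dict returns a (max_degree, vocab) pair that data_util.save_dict wrote earlier. A minimal sketch of such a pair, assuming the two values are simply pickled together (the real repository may use a different serialization), could be:

# Hypothetical save_dict/load_dict pair; pickling the (max_degree, vocab)
# tuple together is an assumption, not the repository's actual format.
import pickle


def save_dict(vocab, max_degree, vocab_path):
    with open(vocab_path, 'wb') as f:
        pickle.dump((max_degree, vocab), f)


def load_dict(vocab_path):
    with open(vocab_path, 'rb') as f:
        return pickle.load(f)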
Example 3
def test_model():
    max_degree, vocab = data_util.load_dict(os.path.join(DIR, OUTPUT_DICT))
    dev_data = dev_reader.read_dev(os.path.join(DIR, DEV + '.kbest'),
                                   os.path.join(DIR, DEV + '.gold'), vocab)
    test_data = dev_reader.read_dev(os.path.join(DIR, TEST + '.kbest'),
                                    os.path.join(DIR, TEST + '.gold'), vocab)
    print 'model file name %s' % OUTPUT_MODEL
    print 'build model'
    #model = dependency_model.get_model(vocab.size(),vocab.tagsize(),max_degree,PAIR_WISE)
    print 'load params'
    #model.set_parmas(os.path.join(DIR,OUTPUT_MODEL))
    model = 0
    max = 0
    max_r = 0
    for i in range(200):
        if PAIR_WISE:
            res = evaluate_dataset_pair(model, dev_data, True, ratio=0.005 * i)
        else:
            res = evaluate_dataset_point(model,
                                         dev_data,
                                         True,
                                         ratio=0.005 * i)
        if res[0] > max:
            max = res[0]
            max_r = 0.005 * i
    print max_r, max
Example 4
def decode():
    # Load vocabularies.
    doc_dict = data_util.load_dict(FLAGS.data_dir + "/doc_dict.txt")
    sum_dict = data_util.load_dict(FLAGS.data_dir + "/sum_dict.txt")
    if doc_dict is None or sum_dict is None:
        logging.warning("Dict not found.")
    data = data_util.load_test_data(FLAGS.data_dir + "/" + FLAGS.test_file,
                                    doc_dict)

    with tf.Session() as sess:
        # Create model and load parameters.
        logging.info("Creating %d layers of %d units." %
                     (FLAGS.num_layers, FLAGS.size))
        # create reverse table
        reverse_table = tf.contrib.lookup.index_to_string_table_from_file(
            vocabulary_file=FLAGS.data_dir + "/sum_ordered_words.txt",
            default_value="<UNK>")
        reverse_table.init.run()
        model = create_model(sess, reverse_table, is_training=False)
        result = []
        for idx, token_ids in enumerate(data):
            # Get a 1-element batch to feed the sentence to the model.
            encoder_inputs, decoder_inputs, encoder_len, decoder_len = model.get_batch(
                {0: [(token_ids, [data_util.ID_GO, data_util.ID_EOS])]}, 0)
            # repeat
            if encoder_inputs.shape[0] == 1:
                encoder_inputs = np.repeat(encoder_inputs,
                                           FLAGS.batch_size,
                                           axis=0)
                encoder_len = np.repeat(encoder_len, FLAGS.batch_size, axis=0)
            # outputs = [batch_size,length]
            step, outputs = model.inference(sess, encoder_inputs, encoder_len)
            # If there is an EOS symbol in outputs, cut them at that point.
            target_output = [item[0].decode() for item in outputs]
            if data_util.MARK_EOS in target_output:
                target_output = target_output[:target_output.index(
                    data_util.MARK_EOS)]
            gen_sum = " ".join(target_output)
            result.append(gen_sum)
            logging.info("Finish {} samples. :: {}".format(idx, gen_sum[:75]))
        with open(FLAGS.test_output, "w") as f:
            for item in result:
                print(item, file=f)
Example 5
def test_model():
    max_degree, vocab = data_util.load_dict(os.path.join(DIR, OUTPUT_DICT))
    dev_data = dev_reader.read_dev(os.path.join(DIR, DEV + '.kbest'),
                                   os.path.join(DIR, DEV + '.gold'), vocab)
    test_data = dev_reader.read_dev(os.path.join(DIR, TEST + '.kbest'),
                                    os.path.join(DIR, TEST + '.gold'), vocab)
    print 'build model'
    model = dependency_model.get_model(vocab.size(), max_degree)
    print 'load params'
    model.set_parmas(os.path.join(DIR, OUTPUT_MODEL))
    print 'addbase'
    max = 0
    for i in range(200):
        res = evaluate_dataset(model, dev_data, True, ratio=0.005 * i)
        if res[0] > max:
            max = res[0]
Example 6
def test_model():
    max_degree, vocab = data_util.load_dict(os.path.join(DIR, OUTPUT_DICT))
    dev_data = dev_reader.read_dev(os.path.join(DIR, DEV + '.kbest'),
                                   os.path.join(DIR, DEV + '.gold'), vocab)
    # test_data = dev_reader.read_dev(os.path.join(DIR, TEST + '.kbest'),
    #                                 os.path.join(DIR, TEST + '.gold'), vocab)
    # evaluate_oracle_worst(test_data)
    evaluate_baseline_random(dev_data)
    evaluate_oracle_worst(dev_data)
    print 'build model'
    model = dependency_model.get_model(vocab.size(), vocab.tagsize(),
                                       max_degree, PAIR_WISE)
    print 'load params'
    model.set_parmas(os.path.join(DIR, OUTPUT_MODEL))
    if model.Pairwise:
        evaluate_dataset_pair(model, dev_data)
    else:
        evaluate_dataset_point(model, dev_data, False)
Example 7
def test_model():
    max_degree, vocab = data_util.load_dict(os.path.join(DIR, OUTPUT_DICT))
    dev_data = dev_reader.read_dev(os.path.join(DIR, DEV + '.kbest'),
                                   os.path.join(DIR, DEV + '.gold'), vocab)
    # test_data = dev_reader.read_dev(os.path.join(DIR, TEST + '.kbest'),
    #                                 os.path.join(DIR, TEST + '.gold'), vocab)
    # evaluate_oracle_worst(test_data)
    evaluate_oracle_worst(dev_data)
    print 'build model'
    model = dependency_model.get_model(vocab.size(), max_degree)
    print 'load params'
    model.set_parmas(os.path.join(DIR, OUTPUT_MODEL))
    print 'addbase'
    evaluate_dataset(model, dev_data, True)
    # evaluate_dataset(model, test_data, True)
    print 'withoutbase'
    evaluate_dataset(model, dev_data, False)
Example 8
import logging

from models.model_BiLSTM import BiLSTM
from models.model_FastText import FastText
from models.model_FastText2 import FastText2
from keras.utils.np_utils import to_categorical
from keras.callbacks import ModelCheckpoint
from keras.callbacks import TensorBoard

#log config
logger = logging.getLogger()
logging.basicConfig(level=logging.INFO)
file_handler = logging.FileHandler('log/Bi_CNN3.log', mode='w')
fmt = logging.Formatter('%(asctime)s %(levelname)s %(message)s')
file_handler.setFormatter(fmt)
logger.addHandler(file_handler)

word_dict = load_dict()
print('loading data ... ... ...')
#load data
train_tensor, train_label, test_tensor, test_label, num_class = get_train_test_tensor(
)
print('---------------------------------------------')
#parameter
model_path = 'save_models/Bi_CNN3.h5'

iter_num = 10
maxlen = 61
embedding_dim = 300
batch_size = 128
# num_class=20
print('Build model...')
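Example 8 stops right before the model is built. Purely as an illustration of how the loaded word_dict and the parameters above could feed a model, a minimal Keras bidirectional-LSTM classifier might be assembled as sketched below; it uses plain Keras layers rather than the repository's BiLSTM/FastText classes, and the vocabulary size (len(word_dict) + 1) and layer width are assumptions.

# Minimal sketch only: reuses word_dict, maxlen, embedding_dim, num_class,
# batch_size, iter_num, model_path and the callback imports from the snippet above.
from keras.models import Sequential
from keras.layers import Embedding, Bidirectional, LSTM, Dense

model = Sequential()
model.add(Embedding(input_dim=len(word_dict) + 1,   # assumed vocabulary size
                    output_dim=embedding_dim,
                    input_length=maxlen))
model.add(Bidirectional(LSTM(128)))                 # assumed hidden width
model.add(Dense(num_class, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam',
              metrics=['accuracy'])

checkpoint = ModelCheckpoint(model_path, save_best_only=True)
model.fit(train_tensor, to_categorical(train_label, num_class),
          batch_size=batch_size, epochs=iter_num,
          validation_data=(test_tensor, to_categorical(test_label, num_class)),
          callbacks=[checkpoint, TensorBoard(log_dir='log')])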
Example 9
def decode():
    # Load vocabularies.
    doc_dict = data_util.load_dict(FLAGS.data_dir + "/doc_dict.txt")
    sum_dict = data_util.load_dict(FLAGS.data_dir + "/sum_dict.txt")
    en_dict = data_util.load_dict(FLAGS.data_dir + "/en_dict.txt")
    if doc_dict is None or sum_dict is None:
        logging.warning("Dict not found.")
    data, en_data = data_util.load_test_data(
        FLAGS.test_file, doc_dict, FLAGS.data_dir + "/test.entity.txt",
        en_dict)

    with tf.Session() as sess:
        # Create model and load parameters.
        logging.info("Creating %d layers of %d units." %
                     (FLAGS.num_layers, FLAGS.size))
        model = create_model(sess, True, None, None, None)

        result = []
        for idx, token_ids in enumerate(data):
            en_ids = en_data[idx]
            if len(en_ids) == 0:
                en_ids = [data_util.ID_PAD]

            # token_ids, en_ids = d
            # print(idx)
            # print(token_ids)

            # Get a 1-element batch to feed the sentence to the model.
            shiva = model.get_batch(
                {
                    0:
                    [(token_ids, [data_util.ID_GO, data_util.ID_EOS],
                      [data_util.ID_PAD, data_util.ID_PAD, data_util.ID_PAD] +
                      en_ids +
                      [data_util.ID_PAD, data_util.ID_PAD, data_util.ID_PAD])]
                }, 0)
            #print(shiva)
            encoder_inputs, decoder_inputs, encoder_len, decoder_len, entity_inputs, entity_len = shiva
            K = min(FLAGS.K, np.amax(entity_len) - 6)
            #print("K", K)

            if FLAGS.batch_size == 1 and FLAGS.geneos:
                loss, outputs, att, t = model.step(sess, encoder_inputs,
                                                   decoder_inputs,
                                                   entity_inputs, encoder_len,
                                                   decoder_len, entity_len, K,
                                                   True)

                #outputs = [np.argmax(item) for item in outputs[0]]
            else:
                outputs = model.step_beam(sess,
                                          encoder_inputs,
                                          encoder_len,
                                          entity_inputs,
                                          entity_len,
                                          K,
                                          geneos=FLAGS.geneos)

            # If there is an EOS symbol in outputs, cut them at that point.
            #print(outputs)
            f2 = open(FLAGS.test_output + '.disambig', 'a')
            f2.write(' '.join(
                str(y) + ":" + str(x.mean())
                for x, y in zip(t[0], entity_inputs[0][3:])) + '\n')
            f2.close()
            f2 = open(FLAGS.test_output + '.attention', 'a')
            f2.write(' '.join(
                str(y) + ":" + str(x)
                for x, y in zip(att[0], entity_inputs[0][3:])) + '\n')
            f2.close()
            outputs = list(outputs[0])
            if data_util.ID_EOS in outputs:
                outputs = outputs[:outputs.index(data_util.ID_EOS)]
            #outputs = list(outputs)
            gen_sum = " ".join(data_util.sen_map2tok(
                outputs, sum_dict[1]))  #sum_dict[1])) #lvt_str
            gen_sum = data_util.sen_postprocess(gen_sum)
            result.append(gen_sum)
            logging.info("Finish {} samples. :: {}".format(idx, gen_sum[:75]))

        with open(FLAGS.test_output, "w") as f:
            for item in result:
                print(item, file=f)
Example 10
def decode():
    # Load vocabularies.
    doc_dict = data_util.load_dict(FLAGS.data_dir + "/doc_dict.txt")
    sum_dict = data_util.load_dict(FLAGS.data_dir + "/sum_dict.txt")
    if doc_dict is None or sum_dict is None:
        logging.warning("Dict not found.")
    docs, data = data_util.load_test_data(FLAGS.test_file, doc_dict)

    with tf.Session() as sess:
        # Create model and load parameters.
        logging.info("Creating %d layers of %d units." %
                     (FLAGS.num_layers, FLAGS.size))
        model = create_model(sess, True)
        class_model = create_class_model(sess, True)

        result = []
        for idx, token_ids in enumerate(data):

            # Get a 1-element batch to feed the sentence to the model.
            encoder_inputs, decoder_inputs, encoder_len, decoder_len, class_output, class_len =\
                data_util.get_batch(
                    {0: [(token_ids, [data_util.ID_GO, data_util.ID_EOS],[0,0])]}, _buckets, 0, FLAGS.batch_size, False, 0)

            if FLAGS.batch_size == 1 and FLAGS.geneos:
                loss, outputs = model.step(sess, encoder_inputs,
                                           decoder_inputs, encoder_len,
                                           decoder_len, True)

                outputs = [np.argmax(item) for item in outputs[0]]
            else:
                outputs = model.step_beam(sess,
                                          encoder_inputs,
                                          encoder_len,
                                          geneos=FLAGS.geneos)

            # If there is an EOS symbol in outputs, cut them at that point.
            if data_util.ID_EOS in outputs:
                outputs = outputs[:outputs.index(data_util.ID_EOS)]
            gen_sum = " ".join(data_util.sen_map2tok(outputs, sum_dict[1]))
            gen_sum = data_util.sen_postprocess(gen_sum)
            result.append(gen_sum)
            logging.info("Finish {} samples. :: {}".format(idx, gen_sum[:75]))

        #Get Encoder outputs
        batchidx = 0
        final_inputs = []
        final_outputs = []
        final_len = []
        while batchidx + FLAGS.batch_size <= len(data):
            encoder_inputs, decoder_inputs, encoder_len, decoder_len, class_output, class_len =\
            data_util.get_batch(
                {0: [(token_ids, [data_util.ID_GO, data_util.ID_EOS],[0,0])]}, _buckets, 0, FLAGS.batch_size, False, 0)

            _, _, enc_outputs = model.step(sess, encoder_inputs,
                                           decoder_inputs, encoder_len,
                                           decoder_len, True)

            enc_outputs = data_util.add_pad_for_hidden(enc_outputs,
                                                       _buckets[0][0])

            final_inputs.append(enc_outputs)
            final_outputs.append(class_output)
            final_len.append(class_len)

            batchidx += FLAGS.batch_size

        final_inputs = np.asarray(final_inputs)
        final_inputs = np.concatenate(final_inputs, 0)
        final_outputs = np.asarray(final_outputs)
        final_outputs = np.concatenate(final_outputs, 0)
        final_len = np.asarray(final_len)
        final_len = np.concatenate(final_len, 0)
        print(final_inputs.shape, final_outputs.shape, final_len.shape)

        #Hidden classifier
        step_loss, output = class_model.step(sess, final_inputs[:],
                                             final_outputs[:], final_len[:],
                                             True)

        clipped = np.array(output > 0.5, dtype=int)
        #label = data_util.hidden_label_gen(FLAGS.test_file, "data/test.1981.msg.txt")
        #make confusion matrix to get precision
        #tn, fp, fn, tp = confusion_matrix(label.flatten(), clipped.flatten()).ravel()
        #print("Test precision : ", tp/(tp+fp))

        with open(FLAGS.test_output, "w") as f:
            for idx, item in enumerate(result):
                print(item, file=f)
                for j in range(len(docs[idx])):
                    if clipped[idx][j] == 1:
                        print("Recommended identifier: " + docs[idx][j] + " ",
                              file=f)
                print("\n", file=f)
Example 11
def check_tree(root_node):
    x, tree = tree_rnn.gen_nn_inputs(root_node,
                                     max_degree=12,
                                     only_leaves_have_vals=False)
    # x list the val of leaves and internal nodes
    child_exists = tree[0] > -1
    offset = 5 * 1 - child_exists * 1

    check_input(x, tree)


if __name__ == '__main__':
    print 'load vocab'
    max_degree, vocab = data_util.load_dict(os.path.join(DIR, OUTPUT_DICT))
    print 'vocab size:' + str(vocab.size())
    print 'max_degree' + str(max_degree)
    print 'create train batch'
    train_iter = train_iterator.train_iterator(
        os.path.join(DIR, TRAIN + '.kbest'),
        os.path.join(DIR, TRAIN + '.gold'), vocab, TRAIN_BATCH_SIZE)
    print 'get 3237'
    inst = train_iter.read_give_tree(1)
    i = 0
    for root in inst.kbest:
        print i
        check_tree(root)
        print inst.gold_lines[0]
        print inst.lines[0]
        i += 1