def decode():
    """Decode the test set with a trained summarization model."""
    # Load vocabularies.
    doc_dict = data_util.load_dict(FLAGS.data_dir + "/doc_dict.txt")
    sum_dict = data_util.load_dict(FLAGS.data_dir + "/sum_dict.txt")
    if doc_dict is None or sum_dict is None:
        logging.warning("Dict not found.")

    print("Loading testing data")
    data = data_util.load_test_data(FLAGS.test_file, doc_dict)

    with tf.Session() as sess:
        # Create model and load parameters.
        logging.info("Creating %d layers of %d units." %
                     (FLAGS.num_layers, FLAGS.size))
        model = create_model(sess, True)

        result = []
        for idx, token_ids in enumerate(data):
            # Get a 1-element batch to feed the sentence to the model.
            encoder_inputs, decoder_inputs, encoder_len, decoder_len = \
                model.get_batch(
                    {0: [(token_ids, [data_util.ID_GO, data_util.ID_EOS])]}, 0)

            if FLAGS.batch_size == 1 and FLAGS.geneos:
                loss, outputs = model.step(sess, encoder_inputs,
                                           decoder_inputs, encoder_len,
                                           decoder_len, True)
                # Greedy decoding: take the most likely token at each step.
                outputs = [np.argmax(item) for item in outputs[0]]
            else:
                outputs = model.step_beam(sess, encoder_inputs, encoder_len,
                                          geneos=FLAGS.geneos)

            # If there is an EOS symbol in outputs, cut them at that point.
            if data_util.ID_EOS in outputs:
                outputs = outputs[:outputs.index(data_util.ID_EOS)]

            gen_sum = " ".join(data_util.sen_map2tok(outputs, sum_dict[1]))
            gen_sum = data_util.sen_postprocess(gen_sum)
            result.append(gen_sum)
            logging.info("Finish {} samples. :: {}".format(idx, gen_sum[:75]))

    with open(FLAGS.test_output, "w") as f:
        for item in result:
            print(item, file=f)
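# A self-contained illustration of the EOS-truncation step used in decode()
# above. The id values are made up for the example, and the EOS id is assumed
# to be 3 here rather than taken from data_util.
def truncate_at_eos(outputs, eos_id=3):
    """Cut a decoded id sequence at the first EOS symbol, if present."""
    if eos_id in outputs:
        return outputs[:outputs.index(eos_id)]
    return outputs

assert truncate_at_eos([5, 9, 3, 7]) == [5, 9]
assert truncate_at_eos([5, 9, 7]) == [5, 9, 7]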
def __init__(self, batch, train_kbest=None, train_gold=None, dev_kbest=None,
             dev_gold=None, test_kbest=None, test_gold=None, vocab_path=None):
    self.vocab = None
    self.train_kbest = train_kbest
    self.train_gold = train_gold
    self.dev_kbest = dev_kbest
    self.dev_gold = dev_gold
    self.batch = batch
    self.test_kbest = test_kbest
    self.test_gold = test_gold
    # Reuse a saved vocabulary if one exists; otherwise build and save it.
    if vocab_path is not None and os.path.exists(vocab_path):
        print 'load vocab'
        self.max_degree, self.vocab = data_util.load_dict(vocab_path)
    else:
        print 'create vocab'
        self.vocab = Vocab.Vocab(self.train_gold)
        print 'get max_degree'
        self.max_degree = self.get_max_degree()
        print 'save dictionary'
        data_util.save_dict(self.vocab, self.max_degree, vocab_path)
    print 'vocab size: ' + str(self.vocab.size())
    print 'max_degree: ' + str(self.max_degree)
    print 'get dev data'
    self.dev_data = dev_reader.read_dev(dev_kbest, dev_gold, self.vocab)
    print 'number of dev: ' + str(len(self.dev_data))
    # self.test_data = dev_reader.read_dev(test_kbest, test_gold, self.vocab)
    # print 'create train batch'
    # self.train_iter = train_iterator.train_iterator(train_kbest, train_gold,
    #                                                 self.vocab, self.batch)
    print 'get train data'
    self.train_data = dev_reader.read_dev(train_kbest, train_gold, self.vocab)
    print 'number of train: ' + str(len(self.train_data))
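# A minimal usage sketch for the loader above. The enclosing class name
# (DataReader) and the corpus paths are assumptions for illustration, not
# names taken from this codebase.
reader = DataReader(batch=32,
                    train_kbest='data/train.kbest',
                    train_gold='data/train.gold',
                    dev_kbest='data/dev.kbest',
                    dev_gold='data/dev.gold',
                    vocab_path='data/vocab.dict')
print 'dev size: ' + str(len(reader.dev_data))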
def test_model():
    max_degree, vocab = data_util.load_dict(os.path.join(DIR, OUTPUT_DICT))
    dev_data = dev_reader.read_dev(os.path.join(DIR, DEV + '.kbest'),
                                   os.path.join(DIR, DEV + '.gold'), vocab)
    test_data = dev_reader.read_dev(os.path.join(DIR, TEST + '.kbest'),
                                    os.path.join(DIR, TEST + '.gold'), vocab)
    print 'model file name %s' % OUTPUT_MODEL
    print 'build model'
    # model = dependency_model.get_model(vocab.size(), vocab.tagsize(),
    #                                    max_degree, PAIR_WISE)
    print 'load params'
    # model.set_parmas(os.path.join(DIR, OUTPUT_MODEL))
    model = 0  # placeholder while model construction is commented out

    # Sweep the interpolation ratio and keep the best dev score.
    best = 0
    best_ratio = 0
    for i in range(200):
        if PAIR_WISE:
            res = evaluate_dataset_pair(model, dev_data, True, ratio=0.005 * i)
        else:
            res = evaluate_dataset_point(model, dev_data, True, ratio=0.005 * i)
        if res[0] > best:
            best = res[0]
            best_ratio = 0.005 * i
    print best_ratio, best
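# The loop above is a grid search over the interpolation ratio. A generic
# sketch of the same pattern, where score_fn stands in for the
# evaluate_dataset_* calls:
def sweep_ratio(score_fn, steps=200, step_size=0.005):
    best, best_ratio = 0.0, 0.0
    for i in range(steps):
        r = step_size * i
        s = score_fn(r)
        if s > best:
            best, best_ratio = s, r
    return best_ratio, best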
def decode():
    # Load vocabularies.
    doc_dict = data_util.load_dict(FLAGS.data_dir + "/doc_dict.txt")
    sum_dict = data_util.load_dict(FLAGS.data_dir + "/sum_dict.txt")
    if doc_dict is None or sum_dict is None:
        logging.warning("Dict not found.")
    data = data_util.load_test_data(FLAGS.data_dir + "/" + FLAGS.test_file,
                                    doc_dict)

    with tf.Session() as sess:
        # Create model and load parameters.
        logging.info("Creating %d layers of %d units." %
                     (FLAGS.num_layers, FLAGS.size))

        # Create a reverse table mapping output ids back to words.
        reverse_table = tf.contrib.lookup.index_to_string_table_from_file(
            vocabulary_file=FLAGS.data_dir + "/sum_ordered_words.txt",
            default_value="<UNK>")
        reverse_table.init.run()
        model = create_model(sess, reverse_table, is_training=False)

        result = []
        for idx, token_ids in enumerate(data):
            # Get a 1-element batch to feed the sentence to the model.
            encoder_inputs, decoder_inputs, encoder_len, decoder_len = \
                model.get_batch(
                    {0: [(token_ids, [data_util.ID_GO, data_util.ID_EOS])]}, 0)

            # Tile the single example up to the fixed batch size.
            if encoder_inputs.shape[0] == 1:
                encoder_inputs = np.repeat(encoder_inputs, FLAGS.batch_size,
                                           axis=0)
                encoder_len = np.repeat(encoder_len, FLAGS.batch_size, axis=0)

            # outputs: [batch_size, length]
            step, outputs = model.inference(sess, encoder_inputs, encoder_len)

            # If there is an EOS symbol in outputs, cut them at that point.
            target_output = [item[0].decode() for item in outputs]
            if data_util.MARK_EOS in target_output:
                target_output = target_output[
                    :target_output.index(data_util.MARK_EOS)]

            gen_sum = " ".join(target_output)
            result.append(gen_sum)
            logging.info("Finish {} samples. :: {}".format(idx, gen_sum[:75]))

    with open(FLAGS.test_output, "w") as f:
        for item in result:
            print(item, file=f)
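# What the np.repeat calls above do: tile a single encoded sentence along the
# batch axis so a fixed-batch-size graph can decode one example. The shapes
# and id values below are made up for illustration.
import numpy as np

x = np.array([[4, 8, 15]])           # one sentence of ids, shape (1, 3)
tiled = np.repeat(x, 4, axis=0)      # four identical rows, shape (4, 3)
assert tiled.shape == (4, 3)
assert (tiled[0] == tiled[3]).all()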
def test_model():
    max_degree, vocab = data_util.load_dict(os.path.join(DIR, OUTPUT_DICT))
    dev_data = dev_reader.read_dev(os.path.join(DIR, DEV + '.kbest'),
                                   os.path.join(DIR, DEV + '.gold'), vocab)
    test_data = dev_reader.read_dev(os.path.join(DIR, TEST + '.kbest'),
                                    os.path.join(DIR, TEST + '.gold'), vocab)
    print 'build model'
    model = dependency_model.get_model(vocab.size(), max_degree)
    print 'load params'
    model.set_parmas(os.path.join(DIR, OUTPUT_MODEL))
    print 'addbase'
    # Sweep the interpolation ratio and keep the best dev score.
    best = 0
    for i in range(200):
        res = evaluate_dataset(model, dev_data, True, ratio=0.005 * i)
        if res[0] > best:
            best = res[0]
    print best
def test_model():
    max_degree, vocab = data_util.load_dict(os.path.join(DIR, OUTPUT_DICT))
    dev_data = dev_reader.read_dev(os.path.join(DIR, DEV + '.kbest'),
                                   os.path.join(DIR, DEV + '.gold'), vocab)
    # test_data = dev_reader.read_dev(os.path.join(DIR, TEST + '.kbest'),
    #                                 os.path.join(DIR, TEST + '.gold'), vocab)
    # evaluate_oracle_worst(test_data)
    evaluate_baseline_random(dev_data)
    evaluate_oracle_worst(dev_data)
    print 'build model'
    model = dependency_model.get_model(vocab.size(), vocab.tagsize(),
                                       max_degree, PAIR_WISE)
    print 'load params'
    model.set_parmas(os.path.join(DIR, OUTPUT_MODEL))
    if model.Pairwise:
        evaluate_dataset_pair(model, dev_data)
    else:
        evaluate_dataset_point(model, dev_data, False)
def test_model():
    max_degree, vocab = data_util.load_dict(os.path.join(DIR, OUTPUT_DICT))
    dev_data = dev_reader.read_dev(os.path.join(DIR, DEV + '.kbest'),
                                   os.path.join(DIR, DEV + '.gold'), vocab)
    # test_data = dev_reader.read_dev(os.path.join(DIR, TEST + '.kbest'),
    #                                 os.path.join(DIR, TEST + '.gold'), vocab)
    # evaluate_oracle_worst(test_data)
    evaluate_oracle_worst(dev_data)
    print 'build model'
    model = dependency_model.get_model(vocab.size(), max_degree)
    print 'load params'
    model.set_parmas(os.path.join(DIR, OUTPUT_MODEL))
    print 'addbase'
    evaluate_dataset(model, dev_data, True)
    # evaluate_dataset(model, test_data, True)
    print 'withoutbase'
    evaluate_dataset(model, dev_data, False)
import logging

from models.model_BiLSTM import BiLSTM
from models.model_FastText import FastText
from models.model_FastText2 import FastText2
from keras.utils.np_utils import to_categorical
from keras.callbacks import ModelCheckpoint
from keras.callbacks import TensorBoard

# Log config: mirror INFO messages to both the console and a log file.
logger = logging.getLogger()
logging.basicConfig(level=logging.INFO)
file_handler = logging.FileHandler('log/Bi_CNN3.log', mode='w')
fmt = logging.Formatter('%(asctime)s %(levelname)s %(message)s')
file_handler.setFormatter(fmt)
logger.addHandler(file_handler)

word_dict = load_dict()

print('loading data ... ... ...')
# Load data.
train_tensor, train_label, test_tensor, test_label, num_class = \
    get_train_test_tensor()
print('---------------------------------------------')

# Parameters.
model_path = 'save_models/Bi_CNN3.h5'
iter_num = 10
maxlen = 61
embedding_dim = 300
batch_size = 128
# num_class = 20

print('Build model...')
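# A hedged sketch of how the imported callbacks are typically wired into
# Keras training. The fit call is an assumption about code further down this
# script, not an excerpt from it.
checkpoint = ModelCheckpoint(model_path, monitor='val_acc',
                             save_best_only=True)
tensorboard = TensorBoard(log_dir='log')
# model.fit(train_tensor, to_categorical(train_label, num_class),
#           validation_data=(test_tensor, to_categorical(test_label, num_class)),
#           batch_size=batch_size, epochs=iter_num,
#           callbacks=[checkpoint, tensorboard])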
def decode():
    # Load vocabularies.
    doc_dict = data_util.load_dict(FLAGS.data_dir + "/doc_dict.txt")
    sum_dict = data_util.load_dict(FLAGS.data_dir + "/sum_dict.txt")
    en_dict = data_util.load_dict(FLAGS.data_dir + "/en_dict.txt")
    if doc_dict is None or sum_dict is None:
        logging.warning("Dict not found.")
    data, en_data = data_util.load_test_data(
        FLAGS.test_file, doc_dict,
        FLAGS.data_dir + "/test.entity.txt", en_dict)

    with tf.Session() as sess:
        # Create model and load parameters.
        logging.info("Creating %d layers of %d units." %
                     (FLAGS.num_layers, FLAGS.size))
        model = create_model(sess, True, None, None, None)

        result = []
        for idx, token_ids in enumerate(data):
            en_ids = en_data[idx]
            if len(en_ids) == 0:
                en_ids = [data_util.ID_PAD]

            # Get a 1-element batch to feed the sentence to the model.
            # The entity sequence is padded with three PAD ids on each side.
            batch = model.get_batch(
                {0: [(token_ids,
                      [data_util.ID_GO, data_util.ID_EOS],
                      [data_util.ID_PAD] * 3 + en_ids + [data_util.ID_PAD] * 3)]},
                0)
            (encoder_inputs, decoder_inputs, encoder_len, decoder_len,
             entity_inputs, entity_len) = batch

            # The 6 accounts for the three PAD ids added on each side.
            K = min(FLAGS.K, np.amax(entity_len) - 6)

            if FLAGS.batch_size == 1 and FLAGS.geneos:
                loss, outputs, att, t = model.step(
                    sess, encoder_inputs, decoder_inputs, entity_inputs,
                    encoder_len, decoder_len, entity_len, K, True)
                # Dump per-entity disambiguation scores and attention weights.
                with open(FLAGS.test_output + '.disambig', 'a') as f2:
                    f2.write(' '.join(
                        str(y) + ":" + str(x.mean())
                        for x, y in zip(t[0], entity_inputs[0][3:])) + '\n')
                with open(FLAGS.test_output + '.attention', 'a') as f2:
                    f2.write(' '.join(
                        str(y) + ":" + str(x)
                        for x, y in zip(att[0], entity_inputs[0][3:])) + '\n')
            else:
                outputs = model.step_beam(
                    sess, encoder_inputs, encoder_len, entity_inputs,
                    entity_len, K, geneos=FLAGS.geneos)

            # If there is an EOS symbol in outputs, cut them at that point.
            outputs = list(outputs[0])
            if data_util.ID_EOS in outputs:
                outputs = outputs[:outputs.index(data_util.ID_EOS)]

            gen_sum = " ".join(data_util.sen_map2tok(outputs, sum_dict[1]))
            gen_sum = data_util.sen_postprocess(gen_sum)
            result.append(gen_sum)
            logging.info("Finish {} samples. :: {}".format(idx, gen_sum[:75]))

    with open(FLAGS.test_output, "w") as f:
        for item in result:
            print(item, file=f)
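# Format of the .attention dump written above: space-separated
# "entity_id:weight" pairs, one line per sample. A standalone illustration
# with made-up values:
att_row = [0.7, 0.2, 0.1]
entity_ids = [42, 7, 99]
line = ' '.join(str(y) + ":" + str(x) for x, y in zip(att_row, entity_ids))
assert line == "42:0.7 7:0.2 99:0.1"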
def decode():
    # Load vocabularies.
    doc_dict = data_util.load_dict(FLAGS.data_dir + "/doc_dict.txt")
    sum_dict = data_util.load_dict(FLAGS.data_dir + "/sum_dict.txt")
    if doc_dict is None or sum_dict is None:
        logging.warning("Dict not found.")
    docs, data = data_util.load_test_data(FLAGS.test_file, doc_dict)

    with tf.Session() as sess:
        # Create models and load parameters.
        logging.info("Creating %d layers of %d units." %
                     (FLAGS.num_layers, FLAGS.size))
        model = create_model(sess, True)
        class_model = create_class_model(sess, True)

        result = []
        for idx, token_ids in enumerate(data):
            # Get a 1-element batch to feed the sentence to the model.
            (encoder_inputs, decoder_inputs, encoder_len, decoder_len,
             class_output, class_len) = data_util.get_batch(
                {0: [(token_ids, [data_util.ID_GO, data_util.ID_EOS], [0, 0])]},
                _buckets, 0, FLAGS.batch_size, False, 0)

            if FLAGS.batch_size == 1 and FLAGS.geneos:
                loss, outputs = model.step(sess, encoder_inputs,
                                           decoder_inputs, encoder_len,
                                           decoder_len, True)
                outputs = [np.argmax(item) for item in outputs[0]]
            else:
                outputs = model.step_beam(sess, encoder_inputs, encoder_len,
                                          geneos=FLAGS.geneos)

            # If there is an EOS symbol in outputs, cut them at that point.
            if data_util.ID_EOS in outputs:
                outputs = outputs[:outputs.index(data_util.ID_EOS)]

            gen_sum = " ".join(data_util.sen_map2tok(outputs, sum_dict[1]))
            gen_sum = data_util.sen_postprocess(gen_sum)
            result.append(gen_sum)
            logging.info("Finish {} samples. :: {}".format(idx, gen_sum[:75]))

        # Collect encoder outputs for the hidden classifier, batch by batch.
        batchidx = 0
        final_inputs = []
        final_outputs = []
        final_len = []
        while batchidx + FLAGS.batch_size <= len(data):
            # Build the bucket dict from the current slice of test samples.
            batch_data = {0: [(ids, [data_util.ID_GO, data_util.ID_EOS], [0, 0])
                              for ids in
                              data[batchidx:batchidx + FLAGS.batch_size]]}
            (encoder_inputs, decoder_inputs, encoder_len, decoder_len,
             class_output, class_len) = data_util.get_batch(
                batch_data, _buckets, 0, FLAGS.batch_size, False, 0)
            _, _, enc_outputs = model.step(sess, encoder_inputs,
                                           decoder_inputs, encoder_len,
                                           decoder_len, True)
            enc_outputs = data_util.add_pad_for_hidden(enc_outputs,
                                                       _buckets[0][0])
            final_inputs.append(enc_outputs)
            final_outputs.append(class_output)
            final_len.append(class_len)
            batchidx += FLAGS.batch_size

        final_inputs = np.concatenate(np.asarray(final_inputs), 0)
        final_outputs = np.concatenate(np.asarray(final_outputs), 0)
        final_len = np.concatenate(np.asarray(final_len), 0)
        print(final_inputs.shape, final_outputs.shape, final_len.shape)

        # Hidden classifier: threshold sigmoid scores into 0/1 labels.
        step_loss, output = class_model.step(sess, final_inputs[:],
                                             final_outputs[:], final_len[:],
                                             True)
        clipped = np.array(output > 0.5, dtype=int)
        # label = data_util.hidden_label_gen(FLAGS.test_file,
        #                                    "data/test.1981.msg.txt")
        # Make a confusion matrix to get precision.
        # tn, fp, fn, tp = confusion_matrix(label.flatten(),
        #                                   clipped.flatten()).ravel()
        # print("Test precision : ", tp / (tp + fp))

    with open(FLAGS.test_output, "w") as f:
        for idx, item in enumerate(result):
            print(item, file=f)
            for j in range(len(docs[idx])):
                if clipped[idx][j] == 1:
                    print("Recommended identifier: " + docs[idx][j] + " ",
                          file=f)
            print("\n", file=f)
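# The clipping step above turns the classifier's sigmoid scores into 0/1
# recommendation labels with a fixed 0.5 threshold. A standalone illustration
# with made-up scores:
import numpy as np

scores = np.array([[0.9, 0.3], [0.4, 0.6]])
clipped = np.array(scores > 0.5, dtype=int)
assert (clipped == np.array([[1, 0], [0, 1]])).all()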
def check_tree(root_node):
    # x lists the values of the leaves and internal nodes.
    x, tree = tree_rnn.gen_nn_inputs(root_node, max_degree=12,
                                     only_leaves_have_vals=False)
    child_exists = tree[0] > -1
    offset = 5 * 1 - child_exists * 1
    check_input(x, tree)


if __name__ == '__main__':
    print 'load vocab'
    max_degree, vocab = data_util.load_dict(os.path.join(DIR, OUTPUT_DICT))
    print 'vocab size: ' + str(vocab.size())
    print 'max_degree: ' + str(max_degree)
    print 'create train batch'
    train_iter = train_iterator.train_iterator(
        os.path.join(DIR, TRAIN + '.kbest'),
        os.path.join(DIR, TRAIN + '.gold'),
        vocab, TRAIN_BATCH_SIZE)
    print 'get 3237'
    inst = train_iter.read_give_tree(1)
    i = 0
    for root in inst.kbest:
        print i
        check_tree(root)
        print inst.gold_lines[0]
        print inst.lines[0]
        i += 1