def analyze_no_relations():
    key_file = os.path.join(FLAGS.train_dir, 'shuffled.test.key.tmp')
    prob_file = os.path.join(FLAGS.train_dir, 'shuffled.test.probs.tmp')
    key_labels = read_key_pred_file(key_file)
    pred_labels, all_probs = read_prob_file(prob_file)
    label2id = data_utils.load_from_dump(os.path.join(FLAGS.data_dir, 'label2id.dict'))

    correct_probs = []   # the probs of the correct labels
    predicted_probs = []
    prob_diffs = []      # the diff between probs of correct label and predicted label
    for i, correct in enumerate(key_labels):
        pred = pred_labels[i]
        if pred == 'no_relation' and pred != correct:  # wrongly predicted as no_relation
            correct, pred = label2id[correct], label2id[pred]
            correct_prob = all_probs[i, correct]
            pred_prob = all_probs[i, pred]
            correct_probs.append(correct_prob)
            predicted_probs.append(pred_prob)
            # the predicted label is the argmax over classes, so its prob must be at least as large
            assert pred_prob >= correct_prob
            prob_diffs.append(pred_prob - correct_prob)
    print max(prob_diffs)
    print min(prob_diffs)

    plt.figure()
    plt.subplot(131)
    plt.hist(correct_probs, 50, facecolor='blue', alpha=0.75)
    plt.title("Histogram of P(correct label)\n when there is a relation\n but predicted to be no_relation")
    plt.subplot(132)
    plt.hist(predicted_probs, 50, facecolor='red', alpha=0.75)
    plt.title("Histogram of P(predicted label)\n when there is a relation\n but predicted to be no_relation")
    plt.subplot(133)
    plt.hist(prob_diffs, 50, facecolor='green', alpha=0.75)
    plt.title("Histogram of P(predicted) - P(correct)\n when there is a relation\n but predicted to be no_relation")
    plt.show()
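
# Hedged sketch: `read_key_pred_file` and `read_prob_file` are defined elsewhere
# in this module. The illustrative versions below (note the hypothetical
# `_example_` names) assume the key file holds one gold label per line, and the
# prob file holds, per line, the predicted label followed by the per-class
# probabilities; the actual file formats are not confirmed here.
def _example_read_key_pred_file(filename):
    # one label string per line
    with open(filename) as f:
        return [l.strip() for l in f]

def _example_read_prob_file(filename):
    # each line assumed to be: "<pred_label> <p_0> <p_1> ... <p_K-1>"
    pred_labels, prob_rows = [], []
    with open(filename) as f:
        for l in f:
            fields = l.strip().split()
            pred_labels.append(fields[0])                     # predicted label string
            prob_rows.append([float(x) for x in fields[1:]])  # per-class probs
    return pred_labels, np.array(prob_rows)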
def analyze_unk():
    corruption_prob = 0.06
    print "Loading data using vocab size %d..." % FLAGS.vocab_size
    word2id = data_utils.load_from_dump(FLAGS.data_dir + '%d.vocab' % FLAGS.vocab_size)
    # only the training loader corrupts tokens to unk, with probability corruption_prob
    train_loader = data_utils.DataLoader(FLAGS.data_dir + 'train.vocab%d.id' % FLAGS.vocab_size,
                                         50, FLAGS.sent_len, unk_prob=corruption_prob)
    dev_loader = data_utils.DataLoader(FLAGS.data_dir + 'dev.vocab%d.id' % FLAGS.vocab_size,
                                       50, FLAGS.sent_len)
    test_loader = data_utils.DataLoader(FLAGS.data_dir + 'test.vocab%d.id' % FLAGS.vocab_size,
                                        50, FLAGS.sent_len)

    print "Counting..."
    train_unk, train_total = get_unk_count_in_dataset(train_loader)
    dev_unk, dev_total = get_unk_count_in_dataset(dev_loader)
    test_unk, test_total = get_unk_count_in_dataset(test_loader)
    print "Training token count:"
    print "\tunk:%d\ttotal:%d\tratio:%g" % (train_unk, train_total, 1.0 * train_unk / train_total)
    print "Dev token count:"
    print "\tunk:%d\ttotal:%d\tratio:%g" % (dev_unk, dev_total, 1.0 * dev_unk / dev_total)
    print "Test token count:"
    print "\tunk:%d\ttotal:%d\tratio:%g" % (test_unk, test_total, 1.0 * test_unk / test_total)
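
# Hedged sketch of `get_unk_count_in_dataset` (the real helper lives elsewhere
# in this file). The `_example_` version assumes the DataLoader yields id
# matrices via get_batches() and that data_utils exposes PAD_ID and UNK_ID
# constants; both are assumptions, not confirmed APIs.
def _example_get_unk_count_in_dataset(loader):
    unk_count, total_count = 0, 0
    for sent_ids, _ in loader.get_batches():  # assumed iterator over (x, y) batches
        for token_id in np.asarray(sent_ids).flatten():
            if token_id == data_utils.PAD_ID:  # padding does not count as a token
                continue
            total_count += 1
            if token_id == data_utils.UNK_ID:
                unk_count += 1
    return unk_count, total_count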
def main():
    parser = argparse.ArgumentParser(
        description="Create initial word embedding matrix from pretrained word vectors.")
    parser.add_argument('--vocab_size', default=36002, type=int,
                        help='The vocabulary size for the embedding matrix.')
    parser.add_argument('--dim', default=300, type=int,
                        help='The dimension of embeddings.')
    args = parser.parse_args()
    dim = args.dim
    vocab_size = args.vocab_size

    print "Creating embedding matrix of size %d x %d" % (vocab_size, dim)
    emb_file = EMB_ROOT + "/glove.6B.%dd.txt" % dim
    print "Creating embeddings from file " + emb_file
    word2id = data_utils.load_from_dump(
        os.path.join(DATA_ROOT, "dependency/%d.vocab" % vocab_size))
    embedding = prepare_pretrained_embedding(emb_file, word2id, dim)
    # use os.path.join so the path is correct whether or not DATA_ROOT has a trailing slash
    np.save(os.path.join(DATA_ROOT, 'dependency/emb-v%d-d%d.npy' % (vocab_size, dim)), embedding)
    print "Embedding matrix of size %d x %d has been created and saved!" % (
        embedding.shape[0], embedding.shape[1])
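
# Hedged sketch of `prepare_pretrained_embedding` (illustrative only, under the
# assumption that it reads a GloVe text file and fills a [vocab_size x dim]
# matrix, leaving words without a pretrained vector at a small random init).
def _example_prepare_pretrained_embedding(emb_file, word2id, dim):
    emb = np.random.uniform(-0.25, 0.25, (len(word2id), dim)).astype(np.float32)
    with open(emb_file) as f:
        for line in f:
            # GloVe format: "<word> <v_0> <v_1> ... <v_dim-1>"
            fields = line.rstrip().split(' ')
            if fields[0] in word2id and len(fields) == dim + 1:
                emb[word2id[fields[0]]] = np.array(fields[1:], dtype=np.float32)
    return emb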
def predict():
    # create graph based on the model flags and load trained parameters
    with tf.Graph().as_default():
        with tf.variable_scope('model'):
            m = _get_model(is_train=False)
        saver = tf.train.Saver(tf.all_variables())

        gpu_options = tf.GPUOptions(allow_growth=True)
        config = tf.ConfigProto(device_count={"GPU": 1}, gpu_options=gpu_options)
        sess = tf.Session(config=config)
        ckpt = tf.train.get_checkpoint_state(FLAGS.train_dir)
        print >> sys.stderr, "Preparing to load model from " + FLAGS.train_dir
        if ckpt and ckpt.model_checkpoint_path:
            saver.restore(sess, ckpt.model_checkpoint_path)
        else:
            raise IOError("Loading checkpoint file failed!")

        # load vocab and create the inverse mapping
        word2id = data_utils.load_from_dump(
            os.path.join(FLAGS.data_dir, '%d.vocab' % FLAGS.vocab_size))
        id2word = dict([(v, k) for k, v in word2id.iteritems()])
        # load label2id mapping and create the inverse mapping
        label2id = data_utils.LABEL_TO_ID
        id2label = dict([(v, k) for k, v in label2id.iteritems()])

        # read batches from stdin, and run each batch through the model
        for line in sys.stdin:
            batch = json.loads(line.strip())
            # batch should be a list of length-11 lists:
            # [words, pos_tags, ner_tags, subj_id, obj_id, subj_ner, obj_ner,
            #  subj_begin, subj_end, obj_begin, obj_end]
            batch_map = preprocess_batch(batch, word2id)
            # build the feed dict according to the model type
            if FLAGS.model == 'lstm':
                use_position = (FLAGS.attn and FLAGS.attn_pos_emb > 0)
                feed = _get_feed_dict(m, batch_map, use_pos=(FLAGS.pos_size > 0),
                                      use_ner=(FLAGS.ner_size > 0),
                                      use_position=use_position, position_type='zero')
            elif FLAGS.model == 'cnn':
                use_position = (FLAGS.pos_emb_size > 0)
                feed = _get_feed_dict(m, batch_map, use_pos=(FLAGS.pos_size > 0),
                                      use_ner=(FLAGS.ner_size > 0),
                                      use_position=use_position, position_type='separate')
            else:
                raise ValueError("Unsupported model type: %s" % FLAGS.model)
            predictions, confidences = sess.run([m.prediction, m.confidence], feed_dict=feed)
            outputs = postprocess_batch(predictions, confidences, batch_map, id2label)
            for out_line in outputs:
                print out_line
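
# Hedged sketch of `postprocess_batch` (the actual helper is defined elsewhere):
# map integer predictions back to label strings and pair them with confidences.
# The JSON output schema here is an assumption for illustration, not the
# module's confirmed output format.
def _example_postprocess_batch(predictions, confidences, batch_map, id2label):
    outputs = []
    for pred, conf in zip(predictions, confidences):
        outputs.append(json.dumps({'relation': id2label[int(pred)],
                                   'confidence': float(conf)}))
    return outputs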
def get_confusion_matrix():
    key_file = os.path.join(FLAGS.train_dir, 'shuffled.test.key.tmp')
    pred_file = os.path.join(FLAGS.train_dir, 'shuffled.test.prediction.tmp')
    key_labels = read_key_pred_file(key_file)
    pred_labels = read_key_pred_file(pred_file)
    assert len(key_labels) == len(pred_labels)
    label2id = data_utils.load_from_dump(os.path.join(FLAGS.data_dir, 'label2id.dict'))
    id2label = {v: k for k, v in label2id.iteritems()}

    # rows are gold labels, columns are predicted labels
    conf_matrix = np.zeros([len(label2id), len(label2id)])
    for k, p in zip(key_labels, pred_labels):
        conf_matrix[label2id[k], label2id[p]] += 1
    print "Constructed confusion matrix with size %d x %d" % tuple(conf_matrix.shape)
    # normalize each row to sum to 1
    normalized_conf_matrix = conf_matrix / conf_matrix.sum(axis=1)[:, np.newaxis]

    conf_matrix_file = os.path.join(FLAGS.train_dir, 'conf.matrix.test.tmp')
    normalized_conf_matrix_file = os.path.join(FLAGS.train_dir, 'conf.matrix.normalized.test.tmp')
    write_confusion_matrix(conf_matrix_file, conf_matrix, id2label, force_int=True)
    write_confusion_matrix(normalized_conf_matrix_file, normalized_conf_matrix, id2label)
    print "Confusion matrix written to files."
    plot_confusion_matrix(normalized_conf_matrix, id2label)
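
# Hedged sketch of `write_confusion_matrix` (illustrative only): write a
# tab-separated matrix with the label names as row and column headers. The
# real helper's output layout is not confirmed here.
def _example_write_confusion_matrix(filename, matrix, id2label, force_int=False):
    labels = [id2label[i] for i in range(len(id2label))]
    fmt = '%d' if force_int else '%.4f'
    with open(filename, 'w') as f:
        f.write('\t' + '\t'.join(labels) + '\n')  # column header row
        for i, row in enumerate(matrix):
            f.write(labels[i] + '\t' + '\t'.join(fmt % x for x in row) + '\n')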