Code example #1
def analyze_no_relations():
    key_file = os.path.join(FLAGS.train_dir, 'shuffled.test.key.tmp')
    prob_file = os.path.join(FLAGS.train_dir, 'shuffled.test.probs.tmp')
    key_labels = read_key_pred_file(key_file)
    pred_labels, all_probs = read_prob_file(prob_file)
    label2id = data_utils.load_from_dump(os.path.join(FLAGS.data_dir, 'label2id.dict'))

    correct_probs = [] # the probs of the correct labels
    predicted_probs = []
    prob_diffs = [] # the diff between probs of correct label and predicted label
    for i, correct in enumerate(key_labels):
        pred = pred_labels[i]
        if pred == 'no_relation' and pred != correct: # wrongly predicted as no_relation
            correct, pred = label2id[correct], label2id[pred]
            correct_prob = all_probs[i, correct]
            pred_prob = all_probs[i, pred]
            correct_probs.append(correct_prob)
            predicted_probs.append(pred_prob)
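            # pred is presumably the argmax of all_probs[i], so its probability is at least the correct label's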
            assert pred_prob >= correct_prob
            prob_diffs.append(pred_prob - correct_prob)
    print max(prob_diffs)
    print min(prob_diffs)
    plt.figure()
    plt.subplot(131)
    plt.hist(correct_probs, 50, facecolor='blue', alpha=0.75)
    plt.title("Histogram of P(correct label)\n when there is a relation\n but predict to be no_relation")
    plt.subplot(132)
    plt.hist(predicted_probs, 50, facecolor='red', alpha=0.75)
    plt.title("Histogram of P(predicted label)\n when there is a relation\n but predict to be no_relation")
    plt.subplot(133)
    plt.hist(prob_diffs, 50, facecolor='green', alpha=0.75)
    plt.title("Histogram of P(predicted) - P(correct)\n when there is a relation\n but predict to be no_relation")
    plt.show()
Code example #2
def analyze_unk():
    corruption_prob = 0.06
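    # fraction of training tokens presumably replaced with UNK by the DataLoader (via unk_prob below)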
    print "Loading data using vocab size %d..." % FLAGS.vocab_size
    word2id = data_utils.load_from_dump(FLAGS.data_dir +
                                        '%d.vocab' % FLAGS.vocab_size)
    train_loader = data_utils.DataLoader(FLAGS.data_dir +
                                         'train.vocab%d.id' % FLAGS.vocab_size,
                                         50,
                                         FLAGS.sent_len,
                                         unk_prob=corruption_prob)
    dev_loader = data_utils.DataLoader(
        FLAGS.data_dir + 'dev.vocab%d.id' % FLAGS.vocab_size, 50,
        FLAGS.sent_len)
    test_loader = data_utils.DataLoader(
        FLAGS.data_dir + 'test.vocab%d.id' % FLAGS.vocab_size, 50,
        FLAGS.sent_len)

    print "Counting..."
    train_unk, train_total = get_unk_count_in_dataset(train_loader)
    dev_unk, dev_total = get_unk_count_in_dataset(dev_loader)
    test_unk, test_total = get_unk_count_in_dataset(test_loader)

    print "Training token count:"
    print "\tunk:%d\ttotal:%d\tratio:%g" % (train_unk, train_total,
                                            1.0 * train_unk / train_total)
    print "Dev token count:"
    print "\tunk:%d\ttotal:%d\tratio:%g" % (dev_unk, dev_total,
                                            1.0 * dev_unk / dev_total)
    print "Test token count:"
    print "\tunk:%d\ttotal:%d\tratio:%g" % (test_unk, test_total,
                                            1.0 * test_unk / test_total)
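Note that unk_prob is only passed to the training loader, so the training UNK ratio presumably reflects both natural out-of-vocabulary tokens and the 6% artificial corruption, while the dev and test ratios measure the natural rate alone.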
Code example #3
def main():
    parser = argparse.ArgumentParser(
        description=
        "Create initial word embedding matrix from pretrained word vectors.")
    parser.add_argument('--vocab_size',
                        default=36002,
                        type=int,
                        help='The vocabulary size for the embedding matrix.')
    parser.add_argument('--dim',
                        default=300,
                        type=int,
                        help='The dimension of embeddings.')
    args = parser.parse_args()

    dim = args.dim
    vocab_size = args.vocab_size
    print "Creating embedding matrix of size %d x %d" % (vocab_size, dim)

    emb_file = EMB_ROOT + "/glove.6B.%dd.txt" % dim
    print "Creating embeddings from file " + emb_file
    word2id = data_utils.load_from_dump(
        os.path.join(DATA_ROOT, "dependency/%d.vocab" % vocab_size))
    embedding = prepare_pretrained_embedding(emb_file, word2id, dim)
    np.save(os.path.join(DATA_ROOT, 'dependency/emb-v%d-d%d.npy' % (vocab_size, dim)),
            embedding)
    print "Embedding matrix of size %d x %d has been created and saved!" % (
        embedding.shape[0], embedding.shape[1])
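prepare_pretrained_embedding is defined elsewhere in the project; a plausible sketch of what it might do, assuming the GloVe file stores one space-separated vector per line and that out-of-vocabulary rows stay randomly initialized, is:

import numpy as np

def prepare_pretrained_embedding_sketch(emb_file, word2id, dim):
    # random init for every row; rows for words found in the GloVe file get overwritten
    emb = np.random.uniform(-0.25, 0.25, (len(word2id), dim)).astype(np.float32)
    with open(emb_file) as f:
        for line in f:
            parts = line.rstrip().split(' ')
            word, vec = parts[0], parts[1:]
            if word in word2id and len(vec) == dim:
                emb[word2id[word]] = np.array(vec, dtype=np.float32)
    return emb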
Code example #4
File: predict.py  Project: frankxu2004/feedforward-RE
def predict():
    # create graph based on model and load trained parameters
    with tf.Graph().as_default():
        # build the model graph in inference mode
        with tf.variable_scope('model'):
            m = _get_model(is_train=False)
        saver = tf.train.Saver(tf.all_variables())

        gpu_options = tf.GPUOptions(allow_growth=True)
        config = tf.ConfigProto(device_count={"GPU": 1}, gpu_options=gpu_options)
        sess = tf.Session(config=config)
        ckpt = tf.train.get_checkpoint_state(FLAGS.train_dir)
        print >> sys.stderr, "Preparing to load model from " + FLAGS.train_dir
        if ckpt and ckpt.model_checkpoint_path:
            saver.restore(sess, ckpt.model_checkpoint_path)
        else:
            raise IOError("Loading checkpoint file failed!")

    # load vocab
    word2id = data_utils.load_from_dump(
        os.path.join(FLAGS.data_dir, '%d.vocab' % FLAGS.vocab_size))
    id2word = dict([(v, k) for k, v in word2id.iteritems()])

    # load label2id mapping and create inverse mapping
    label2id = data_utils.LABEL_TO_ID
    id2label = dict([(v, k) for k, v in label2id.iteritems()])

    # load data from stdin, and run through model
    for line in sys.stdin:
        batch = json.loads(line.strip())
        # each element of batch should be a list of fields:
        # [words, pos_tags, ner_tags, subj_id, obj_id, subj_ner, obj_ner, subj_begin, subj_end, obj_begin, obj_end]
        batch_map = preprocess_batch(batch, word2id)

        # run model through a batch
        if FLAGS.model == 'lstm':
            use_position = (FLAGS.attn and FLAGS.attn_pos_emb > 0)
            feed = _get_feed_dict(m,
                                  batch_map,
                                  use_pos=(FLAGS.pos_size > 0),
                                  use_ner=(FLAGS.ner_size > 0),
                                  use_position=use_position,
                                  position_type='zero')
        elif FLAGS.model == 'cnn':
            use_position = (FLAGS.pos_emb_size > 0)
            feed = _get_feed_dict(m,
                                  batch_map,
                                  use_pos=(FLAGS.pos_size > 0),
                                  use_ner=(FLAGS.ner_size > 0),
                                  use_position=use_position,
                                  position_type='separate')
        else:
            raise ValueError("Unsupported model type: %s" % FLAGS.model)
        predictions, confidences = sess.run([m.prediction, m.confidence],
                                            feed_dict=feed)

        outputs = postprocess_batch(predictions, confidences, batch_map,
                                    id2label)
        for output_line in outputs:
            print output_line
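A minimal sketch of how one might drive predict() from the shell, assuming the field meanings implied by the comment above; the driver script, the concrete values, and the index conventions are all illustrative, and the model FLAGS (train_dir, data_dir, vocab_size, model, ...) still have to be supplied on the predict.py command line.

# build_batch.py (hypothetical driver): emit one JSON-encoded batch per line,
# e.g. `python build_batch.py | python predict.py <FLAGS for your checkpoint>`.
import json

example = [
    ["Bill", "Gates", "founded", "Microsoft", "."],  # words
    ["NNP", "NNP", "VBD", "NNP", "."],                # pos_tags
    ["PERSON", "PERSON", "O", "ORGANIZATION", "O"],   # ner_tags
    0, 3,                                             # subj_id, obj_id (assumed to be token indices)
    "PERSON", "ORGANIZATION",                         # subj_ner, obj_ner
    0, 2,                                             # subj_begin, subj_end (assumed token span)
    3, 4,                                             # obj_begin, obj_end
]
print json.dumps([example])  # predict() parses one batch (a list of examples) per stdin line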
Code example #5
def get_confusion_matrix():
    key_file = os.path.join(FLAGS.train_dir, 'shuffled.test.key.tmp')
    pred_file = os.path.join(FLAGS.train_dir, 'shuffled.test.prediction.tmp')
    key_labels = read_key_pred_file(key_file)
    pred_labels = read_key_pred_file(pred_file)
    assert len(key_labels) == len(pred_labels)
    label2id = data_utils.load_from_dump(os.path.join(FLAGS.data_dir, 'label2id.dict'))
    id2label = {v:k for k,v in label2id.iteritems()}

    conf_matrix = np.zeros([len(label2id), len(label2id)])
    for k,p in zip(key_labels, pred_labels):
        conf_matrix[label2id[k], label2id[p]] += 1
    print "Constructed confusion matrix with size %d x %d" % tuple(conf_matrix.shape)
    normalized_conf_matrix = conf_matrix / conf_matrix.sum(axis=1)[:,np.newaxis]
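    # note: a gold label that never occurs would make its row 0/0 = NaN here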

    conf_matrix_file = os.path.join(FLAGS.train_dir, 'conf.matrix.test.tmp')
    normalized_conf_matrix_file = os.path.join(FLAGS.train_dir, 'conf.matrix.normalized.test.tmp')
    write_confusion_matrix(conf_matrix_file, conf_matrix, id2label, force_int=True)
    write_confusion_matrix(normalized_conf_matrix_file, normalized_conf_matrix, id2label)
    print "Confusion matrix written to files."
    plot_confusion_matrix(normalized_conf_matrix, id2label)
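write_confusion_matrix and plot_confusion_matrix are project helpers not shown here; a minimal matplotlib sketch of the plotting side, assuming it takes the row-normalized matrix and the id2label mapping built above, could look like:

import numpy as np
import matplotlib.pyplot as plt

def plot_confusion_matrix_sketch(matrix, id2label):
    labels = [id2label[i] for i in range(len(id2label))]
    plt.figure(figsize=(10, 10))
    plt.imshow(matrix, interpolation='nearest', cmap=plt.cm.Blues)
    plt.colorbar()
    ticks = np.arange(len(labels))
    plt.xticks(ticks, labels, rotation=90)
    plt.yticks(ticks, labels)
    plt.xlabel('Predicted label')  # columns: predicted labels
    plt.ylabel('Gold label')       # rows: gold labels (each row sums to 1 after normalization)
    plt.tight_layout()
    plt.show()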