コード例 #1
0
ファイル: evaluations.py プロジェクト: ruihangdu/Satire
def get_attentions(fn, all_examples, word_dict, char_dict, args,
                   examples_size):
    """Collect per-sentence attention weights for every example.

    Runs the compiled attention function ``fn`` on each minibatch and
    writes its output into the rows of a matrix addressed by the
    minibatch's example indices.

    Returns the ``(examples_size, args.max_sent)`` matrix as nested lists.
    """
    attention = np.zeros((examples_size, args.max_sent))
    for minibatch, indices in all_examples:
        # Each item is (text, sentence-ling, doc-ling, label); only the
        # text and sentence-level linguistic features are needed here.
        texts, sent_feats, _doc_feats, _labels = zip(*minibatch)
        vectors = util.vectorization(list(texts),
                                     word_dict,
                                     char_dict,
                                     max_char_length=args.max_char)
        rnn_in, sent_mask, word_mask, cnn_in = util.mask_padding(
            vectors, args.max_sent, args.max_word, args.max_char)
        sent_feats = util.sent_ling_padding(list(sent_feats),
                                            args.max_sent, args.max_ling)
        attention[indices, :] = fn(rnn_in, cnn_in, word_mask, sent_mask,
                                   sent_feats)
    return attention.tolist()
コード例 #2
0
def main(args):
    """Train and evaluate the hierarchical char-level satire classifier.

    Loads the fake/true splits, builds vocabularies and embedding
    matrices, compiles the Theano train/eval functions, then trains with
    a periodic dev-set evaluation, tracking the best dev F1/accuracy and
    saving parameters when the test F1 clears a threshold.
    """
    logging.info("loading data...")
    fake_train, fake_dev, fake_test = du.load_fake()
    true_train, true_dev, true_test = du.load_true()
    if args.debug:
        # Tiny slices of every split so a debug run finishes quickly.
        true_train = [true_train[0][:100]]
        fake_train = fake_train[:10]
        true_dev = true_dev[:100]
        fake_dev = fake_dev[:10]
        true_test = true_test[:100]
        fake_test = fake_test[:10]
    # Resolve the recurrent layer class from the CLI flag (default: vanilla RNN).
    if args.rnn_type == 'gru':
        args.rnn = lasagne.layers.GRULayer
    elif args.rnn_type == 'lstm':
        args.rnn = lasagne.layers.LSTMLayer
    else:
        args.rnn = lasagne.layers.RecurrentLayer

    logging.info("building dictionary...")
    word_dict, char_dict = util.build_dict(
        None, max_words=0, dict_file=["word_dict", "char_dict"])
    logging.info("creating embedding matrix...")
    word_embed = util.words2embedding(word_dict, 100, args.embedding_file)
    char_embed = util.char2embedding(char_dict, 30)
    (args.word_vocab_size, args.word_embed_size) = word_embed.shape
    (args.char_vocab_size, args.char_embed_size) = char_embed.shape
    logging.info("compiling Theano function...")
    att_fn, eval_fn, train_fn, params = \
        tf.char_hierarchical_linguistic_fn(args, word_embed, char_embed,
                                           values=None)

    logging.info("batching examples...")
    dev_examples = mb.vec_minibatch(fake_dev + true_dev, word_dict, char_dict,
                                    args, False)
    test_examples = mb.vec_minibatch(fake_test + true_test, word_dict,
                                     char_dict, args, False)
    train_examples = mb.train_doc_minibatch(fake_train,
                                            true_train,
                                            args,
                                            over_sample=True)
    logging.info("checking network...")
    dev_acc = evals.eval_vec_batch(eval_fn, dev_examples)
    print('Dev A: %.2f P:%.2f R:%.2f F:%.2f' % dev_acc)
    test_acc = evals.eval_vec_batch(eval_fn, test_examples)
    print('Performance on Test set: A: %.2f P:%.2f R:%.2f F:%.2f' % test_acc)
    prev_fsc = 0
    stop_count = 0
    best_fsc = 0
    best_acc = 0
    # BUG FIX: `record` was previously only assigned inside the
    # best-dev-score branch, so the final print(record) raised
    # UnboundLocalError whenever the dev score never improved.
    record = 'No improvement on dev set recorded.'
    logging.info("training %d examples" % len(train_examples))
    start_time = time.time()
    n_updates = 0
    for epoch in range(args.epoches):
        np.random.shuffle(train_examples)
        if epoch > 3:
            # Decay the learning rate and recompile the Theano functions,
            # carrying the current parameter values over.
            logging.info("compiling Theano function again...")
            args.learning_rate *= 0.9
            att_fn, eval_fn, train_fn, params = \
                tf.char_hierarchical_linguistic_fn(
                    args, word_embed, char_embed,
                    values=[x.get_value() for x in params])
        for batch_x, _ in train_examples:
            batch_x, batch_sent, batch_doc, batch_y = zip(*batch_x)
            batch_x = util.vectorization(list(batch_x),
                                         word_dict,
                                         char_dict,
                                         max_char_length=args.max_char)
            batch_rnn, batch_sent_mask, batch_word_mask, batch_cnn = \
                util.mask_padding(batch_x, args.max_sent, args.max_word,
                                  args.max_char)
            batch_sent = util.sent_ling_padding(list(batch_sent),
                                                args.max_sent, args.max_ling)
            batch_doc = util.doc_ling_padding(list(batch_doc), args.max_ling)
            batch_y = np.array(list(batch_y))
            train_loss = train_fn(batch_rnn, batch_cnn, batch_word_mask,
                                  batch_sent_mask, batch_sent, batch_doc,
                                  batch_y)
            n_updates += 1
            # Periodic dev evaluation, only after the warm-up epochs.
            if n_updates % 100 == 0 and epoch > 6:
                logging.info(
                    'Epoch = %d, loss = %.2f, elapsed time = %.2f (s)' %
                    (epoch, train_loss, time.time() - start_time))
                dev_acc = evals.eval_vec_batch(eval_fn, dev_examples)
                logging.info('Dev A: %.2f P:%.2f R:%.2f F:%.2f' % dev_acc)
                # dev_acc is (accuracy, precision, recall, f1).
                if dev_acc[3] >= best_fsc and dev_acc[0] > best_acc:
                    best_fsc = dev_acc[3]
                    best_acc = dev_acc[0]
                    logging.info(
                        'Best dev f1: epoch = %d, n_udpates = %d, f1 = %.2f %%'
                        % (epoch, n_updates, dev_acc[3]))
                    record = 'Best dev accuracy: epoch = %d, n_udpates = %d ' % \
                             (epoch, n_updates) + ' Dev A: %.2f P:%.2f R:%.2f F:%.2f' % dev_acc
                    test_acc = evals.eval_vec_batch(eval_fn, test_examples)
                    print(
                        'Performance on Test set: A: %.2f P:%.2f R:%.2f F:%.2f'
                        % test_acc)
                    if test_acc[3] > 91.4:
                        util.save_params(
                            'char_hierarchical_rnn_params_%.2f_%.2f' %
                            (dev_acc[3], test_acc[3]),
                            params,
                            epoch=epoch,
                            n_updates=n_updates)
                if prev_fsc > dev_acc[3]:
                    stop_count += 1
                else:
                    stop_count = 0
                if stop_count == 6:
                    # NOTE(review): this only announces the stall; training is
                    # not actually halted — confirm whether a break/return was
                    # intended for early stopping.
                    print("stopped")
                prev_fsc = dev_acc[3]

    print(record)
    print('Performance on Test set: A: %.2f P:%.2f R:%.2f F:%.2f' % test_acc)
コード例 #3
0
ファイル: evaluations.py プロジェクト: ruihangdu/Satire
def eval_batch(fn,
               examples,
               word_dict,
               char_dict,
               args,
               char=True,
               sent_ling=True,
               doc_ling=True):
    """Evaluate ``fn`` over pre-batched examples.

    Each batch item is (text[, sent_ling][, doc_ling], label); the
    optional columns are present exactly when the matching flag is True.
    Accumulates a 2x2 confusion over all batches and returns
    ``(accuracy, precision, recall, f1)`` in percent.

    BUG FIX: the original if/elif ladder covered only six of the eight
    flag combinations — for (char=False, sent_ling != doc_ling) it fell
    through every branch and crashed on unbound ``predict``/``batch_y``.
    The shared pipeline below handles all combinations, with argument
    ordering identical to the original branches.
    """
    a = b = c = d = 0
    for batch, _ in examples:
        # Unpack the variable-width tuples: text first, label last,
        # optional linguistic-feature columns in between.
        columns = list(zip(*batch))
        col = 1
        batch_sent = batch_doc = None
        if sent_ling:
            batch_sent = columns[col]
            col += 1
        if doc_ling:
            batch_doc = columns[col]
            col += 1
        batch_y = np.array(list(columns[col]))

        vectors = util.vectorization(list(columns[0]),
                                     word_dict,
                                     char_dict,
                                     max_char_length=args.max_char)
        batch_rnn, batch_sent_mask, batch_word_mask, batch_cnn = \
            util.mask_padding(vectors, args.max_sent, args.max_word,
                              args.max_char)

        # Argument order mirrors the original branches: char models take
        # the CNN input right after the RNN input; char-free models skip it.
        if char:
            inputs = [batch_rnn, batch_cnn, batch_word_mask, batch_sent_mask]
        else:
            inputs = [batch_rnn, batch_word_mask, batch_sent_mask]
        if sent_ling:
            inputs.append(util.sent_ling_padding(list(batch_sent),
                                                 args.max_sent,
                                                 args.max_ling))
        if doc_ling:
            inputs.append(util.doc_ling_padding(list(batch_doc),
                                                args.max_ling))
        predict = fn(*inputs)

        matrix = confusion_matrix(predict, batch_y)
        a += matrix[0]
        b += matrix[1]
        c += matrix[2]
        d += matrix[3]
    # NOTE(review): raises ZeroDivisionError if a class is empty
    # (a + c == 0 or a + b == 0) — preserved from the original.
    acc = 100.0 * (a + d) / (a + b + c + d)
    pre = 100.0 * a / (a + c)
    rec = 100.0 * a / (a + b)
    fsc = 2 * pre * rec / (pre + rec)
    return acc, pre, rec, fsc
コード例 #4
0
ファイル: minibatch.py プロジェクト: ruihangdu/Satire
def vec_minibatch(docs,
                  word_dict,
                  char_dict,
                  args,
                  shuffle=True,
                  char=True,
                  sent_ling=True,
                  doc_ling=True):
    """Vectorize and pad ``docs`` into ready-to-feed minibatches.

    Each doc is (text[, sent_ling][, doc_ling], label); the optional
    columns are present exactly when the matching flag is True. Returns a
    list of tuples laid out as
    (rnn, sent_mask, word_mask[, cnn][, sent][, doc], y, mb_indices),
    matching the original per-branch layouts.

    BUG FIX: the original if/elif ladder covered only six of the eight
    flag combinations; for (char=False, sent_ling != doc_ling) it
    silently appended nothing and dropped every batch. The shared
    pipeline below covers all combinations.

    NOTE(review): when shuffle=True the input ``docs`` list is shuffled
    in place (caller's list is mutated) — preserved from the original.
    """
    examples = []
    if shuffle:
        random.shuffle(docs)
    doc_length = len(docs)
    starts = np.arange(0, doc_length, args.batch_size)
    if shuffle:
        np.random.shuffle(starts)
    minibatches = [
        np.arange(start, min(start + args.batch_size, doc_length))
        for start in starts
    ]
    for mb in minibatches:
        batch = [docs[i] for i in mb]
        # Unpack variable-width tuples: text first, label last,
        # optional linguistic-feature columns in between.
        columns = list(zip(*batch))
        col = 1
        batch_sent = batch_doc = None
        if sent_ling:
            batch_sent = columns[col]
            col += 1
        if doc_ling:
            batch_doc = columns[col]
            col += 1
        batch_y = np.array(list(columns[col]))

        vectors = util.vectorization(list(columns[0]),
                                     word_dict,
                                     char_dict,
                                     max_char_length=args.max_char)
        batch_rnn, batch_sent_mask, batch_word_mask, batch_cnn = \
            util.mask_padding(vectors, args.max_sent, args.max_word,
                              args.max_char)

        # Assemble the tuple in the exact field order the original
        # branches produced.
        item = [batch_rnn, batch_sent_mask, batch_word_mask]
        if char:
            item.append(batch_cnn)
        if sent_ling:
            item.append(util.sent_ling_padding(list(batch_sent),
                                               args.max_sent,
                                               args.max_ling))
        if doc_ling:
            item.append(util.doc_ling_padding(list(batch_doc),
                                              args.max_ling))
        item.append(batch_y)
        item.append(mb)
        examples.append(tuple(item))
    return examples