Example #1
def get_sentences(fn, all_examples, word_dict, char_dict, args, examples_size):
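    # Apply the compiled encoder function `fn` to every mini-batch and collect its
    # fixed-size sentence vectors (160-dimensional here) into one
    # (examples_size, max_sent, 160) matrix indexed by the batch's document ids.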
    sentence_matrix = np.zeros((examples_size, args.max_sent, 160))
    for batch_x, mb_idx in all_examples:
        batch_x, batch_sent, batch_doc, batch_y = zip(*batch_x)
        batch_x = util.vectorization(list(batch_x),
                                     word_dict,
                                     char_dict,
                                     max_char_length=args.max_char)
        batch_rnn, batch_sent_mask, batch_word_mask, batch_cnn = \
            util.mask_padding(batch_x, args.max_sent, args.max_word, args.max_char)
        sentence_array = fn(batch_rnn, batch_cnn, batch_word_mask,
                            batch_sent_mask)
        sentence_matrix[mb_idx, :, :] = sentence_array
    return sentence_matrix
Example #2
def get_attentions(fn, all_examples, word_dict, char_dict, args,
                   examples_size):
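    # Same batching pattern as get_sentences, but here `fn` returns one attention
    # weight per sentence, collected into an (examples_size, max_sent) matrix.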
    att_matrix = np.zeros((examples_size, args.max_sent))
    for batch_x, mb_idx in all_examples:
        batch_x, batch_sent, batch_doc, batch_y = zip(*batch_x)
        batch_x = util.vectorization(list(batch_x),
                                     word_dict,
                                     char_dict,
                                     max_char_length=args.max_char)
        batch_rnn, batch_sent_mask, batch_word_mask, batch_cnn = \
            util.mask_padding(batch_x, args.max_sent, args.max_word, args.max_char)
        batch_sent = util.sent_ling_padding(list(batch_sent), args.max_sent,
                                            args.max_ling)
        att = fn(batch_rnn, batch_cnn, batch_word_mask, batch_sent_mask,
                 batch_sent)
        att_matrix[mb_idx, :] = att
    return att_matrix.tolist()
Example #3
def main(args):
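    # End-to-end driver: load the fake/true news splits, build dictionaries and
    # embedding matrices, compile the Theano functions, then train with periodic
    # evaluation on the dev and test sets.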
    logging.info("loading data...")
    fake_train, fake_dev, fake_test = du.load_fake(doc_ling=False,
                                                   sent_ling=False)
    true_train, true_dev, true_test = du.load_true(doc_ling=False,
                                                   sent_ling=False)
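    # in debug mode, keep only a small slice of each split for fast iteration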
    if args.debug:
        true_train = true_train[0][:100]
        fake_train = fake_train[:10]
        true_dev = true_dev[:100]
        fake_dev = fake_dev[:10]
        true_test = true_test[:100]
        fake_test = fake_test[:10]
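    # map args.rnn_type to the corresponding Lasagne recurrent layer class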
    if args.rnn_type == 'gru':
        args.rnn = lasagne.layers.GRULayer
    elif args.rnn_type == 'lstm':
        args.rnn = lasagne.layers.LSTMLayer
    else:
        args.rnn = lasagne.layers.RecurrentLayer

    logging.info("building dictionary...")
    word_dict, char_dict = util.build_dict(
        None, max_words=0, dict_file=["word_dict", "char_dict"])
    logging.info("creating embedding matrix...")
    word_embed = util.words2embedding(word_dict, 100, args.embedding_file)
    char_embed = util.char2embedding(char_dict, 30)
    (args.word_vocab_size, args.word_embed_size) = word_embed.shape
    (args.char_vocab_size, args.char_embed_size) = char_embed.shape
    logging.info("compiling Theano function...")
    att_fn, eval_fn, train_fn, params = create_theano_function(word_embed,
                                                               char_embed,
                                                               values=None)
    logging.info("batching examples...")
    dev_examples = mb.vec_minibatch(fake_dev + true_dev,
                                    word_dict,
                                    char_dict,
                                    args,
                                    False,
                                    char=False,
                                    sent_ling=False,
                                    doc_ling=False)
    test_examples = mb.vec_minibatch(fake_test + true_test,
                                     word_dict,
                                     char_dict,
                                     args,
                                     False,
                                     char=False,
                                     sent_ling=False,
                                     doc_ling=False)

    temp = []
    for true_batch in true_train:
        temp += true_batch
    true_train = temp
    del temp
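    # re-batch the flattened fake + true training documents into mini-batches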
    train_examples = mb.doc_minibatch(fake_train + true_train, args.batch_size)

    # train_examples = mb.train_doc_minibatch(fake_train, true_train, args)
    logging.info("checking network...")
    dev_acc = evals.eval_vec_batch(eval_fn,
                                   dev_examples,
                                   char=False,
                                   sent_ling=False,
                                   doc_ling=False)
    print('Dev A: %.2f P:%.2f R:%.2f F:%.2f' % dev_acc)
    test_acc = evals.eval_vec_batch(eval_fn,
                                    test_examples,
                                    char=False,
                                    sent_ling=False,
                                    doc_ling=False)
    print('Performance on Test set: A: %.2f P:%.2f R:%.2f F:%.2f' % test_acc)
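    # bookkeeping for the best dev scores, the best-result record string and early stopping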
    prev_fsc = 0
    stop_count = 0
    best_fsc = 0
    best_acc = 0
    record = ''
    logging.info("training %d examples" % len(train_examples))
    start_time = time.time()
    n_updates = 0
    for epoch in range(args.epoches):
        np.random.shuffle(train_examples)
        # if epoch > 3:
        #     logging.info("compiling Theano function again...")
        #     args.learning_rate *= 0.9
        #     att_fn, eval_fn, train_fn, params = create_theano_function(
        #         word_embed, char_embed, values=[x.get_value() for x in params])
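        # one pass over the shuffled training mini-batches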
        for batch_x, _ in train_examples:
            batch_x, batch_y = zip(*batch_x)
            batch_x = util.vectorization(list(batch_x),
                                         word_dict,
                                         char_dict,
                                         max_char_length=args.max_char)
            batch_rnn, batch_sent_mask, batch_word_mask, _ = \
                util.mask_padding(batch_x, args.max_sent, args.max_word, args.max_char)
            batch_y = np.array(list(batch_y))
            train_loss = train_fn(batch_rnn, batch_word_mask, batch_sent_mask,
                                  batch_y)
            n_updates += 1
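            # every 100 updates (after epoch 7), log the loss and re-evaluate on the dev set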
            if n_updates % 100 == 0 and epoch > 7:
                logging.info(
                    'Epoch = %d, loss = %.2f, elapsed time = %.2f (s)' %
                    (epoch, train_loss, time.time() - start_time))
                # dev_acc = evals.eval_batch(eval_fn, dev_examples, word_dict, char_dict, args)
                dev_acc = evals.eval_vec_batch(eval_fn,
                                               dev_examples,
                                               char=False,
                                               sent_ling=False,
                                               doc_ling=False)
                logging.info('Dev A: %.2f P:%.2f R:%.2f F:%.2f' % dev_acc)
                if dev_acc[3] > best_fsc and dev_acc[0] > best_acc:
                    best_fsc = dev_acc[3]
                    best_acc = dev_acc[0]
                    logging.info(
                        'Best dev f1: epoch = %d, n_updates = %d, f1 = %.2f %%'
                        % (epoch, n_updates, dev_acc[3]))
                    record = 'Best dev accuracy: epoch = %d, n_updates = %d ' % \
                             (epoch, n_updates) + ' Dev A: %.2f P:%.2f R:%.2f F:%.2f' % dev_acc
                    test_acc = evals.eval_vec_batch(eval_fn,
                                                    test_examples,
                                                    char=False,
                                                    sent_ling=False,
                                                    doc_ling=False)
                    print(
                        'Performance on Test set: A: %.2f P:%.2f R:%.2f F:%.2f'
                        % test_acc)
                    # util.save_params('char_not_params_%.2f' % test_acc[3], params,
                    #                  epoch=epoch, n_updates=n_updates)
                if prev_fsc > dev_acc[3]:
                    stop_count += 1
                else:
                    stop_count = 0
                if stop_count == 6:
                    print("stopped")
                prev_fsc = dev_acc[3]

    print(record)
    print('Performance on Test set: A: %.2f P:%.2f R:%.2f F:%.2f' % test_acc)
    return
Example #4
def eval_batch(fn,
               examples,
               word_dict,
               char_dict,
               args,
               char=True,
               sent_ling=True,
               doc_ling=True):
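    # Evaluate `fn` on pre-batched examples, unpacking each batch according to which
    # optional feature groups (char CNN input, sentence-level and document-level
    # linguistic features) the model expects, and accumulate confusion-matrix counts;
    # by the metric formulas below, a/b/c/d behave as TP/FN/FP/TN respectively.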
    a = b = c = d = 0
    for batch_x, _ in examples:
        if char and sent_ling and doc_ling:
            batch_x, batch_sent, batch_doc, batch_y = zip(*batch_x)
            batch_x = util.vectorization(list(batch_x),
                                         word_dict,
                                         char_dict,
                                         max_char_length=args.max_char)
            batch_rnn, batch_sent_mask, batch_word_mask, batch_cnn = \
                util.mask_padding(batch_x, args.max_sent, args.max_word, args.max_char)
            batch_sent = util.sent_ling_padding(list(batch_sent),
                                                args.max_sent, args.max_ling)
            batch_doc = util.doc_ling_padding(list(batch_doc), args.max_ling)
            batch_y = np.array(list(batch_y))
            predict = fn(batch_rnn, batch_cnn, batch_word_mask,
                         batch_sent_mask, batch_sent, batch_doc)
        elif char and not sent_ling and doc_ling:
            batch_x, batch_doc, batch_y = zip(*batch_x)
            batch_x = util.vectorization(list(batch_x),
                                         word_dict,
                                         char_dict,
                                         max_char_length=args.max_char)
            batch_rnn, batch_sent_mask, batch_word_mask, batch_cnn = \
                util.mask_padding(batch_x, args.max_sent, args.max_word, args.max_char)
            batch_doc = util.doc_ling_padding(list(batch_doc), args.max_ling)
            batch_y = np.array(list(batch_y))
            predict = fn(batch_rnn, batch_cnn, batch_word_mask,
                         batch_sent_mask, batch_doc)
        elif char and sent_ling and not doc_ling:
            batch_x, batch_sent, batch_y = zip(*batch_x)
            batch_x = util.vectorization(list(batch_x),
                                         word_dict,
                                         char_dict,
                                         max_char_length=args.max_char)
            batch_rnn, batch_sent_mask, batch_word_mask, batch_cnn = \
                util.mask_padding(batch_x, args.max_sent, args.max_word, args.max_char)
            batch_sent = util.sent_ling_padding(list(batch_sent),
                                                args.max_sent, args.max_ling)
            batch_y = np.array(list(batch_y))
            predict = fn(batch_rnn, batch_cnn, batch_word_mask,
                         batch_sent_mask, batch_sent)
        elif char and not sent_ling and not doc_ling:
            batch_x, batch_y = zip(*batch_x)
            batch_x = util.vectorization(list(batch_x),
                                         word_dict,
                                         char_dict,
                                         max_char_length=args.max_char)
            batch_rnn, batch_sent_mask, batch_word_mask, batch_cnn = \
                util.mask_padding(batch_x, args.max_sent, args.max_word, args.max_char)
            batch_y = np.array(list(batch_y))
            predict = fn(batch_rnn, batch_cnn, batch_word_mask,
                         batch_sent_mask)
        elif not char and not sent_ling and not doc_ling:
            batch_x, batch_y = zip(*batch_x)
            batch_x = util.vectorization(list(batch_x),
                                         word_dict,
                                         char_dict,
                                         max_char_length=args.max_char)
            batch_rnn, batch_sent_mask, batch_word_mask, _ = \
                util.mask_padding(batch_x, args.max_sent, args.max_word, args.max_char)
            batch_y = np.array(list(batch_y))
            predict = fn(batch_rnn, batch_word_mask, batch_sent_mask)
        elif not char and sent_ling and doc_ling:
            batch_x, batch_sent, batch_doc, batch_y = zip(*batch_x)
            batch_x = util.vectorization(list(batch_x),
                                         word_dict,
                                         char_dict,
                                         max_char_length=args.max_char)
            batch_rnn, batch_sent_mask, batch_word_mask, _ = \
                util.mask_padding(batch_x, args.max_sent, args.max_word, args.max_char)
            batch_sent = util.sent_ling_padding(list(batch_sent),
                                                args.max_sent, args.max_ling)
            batch_doc = util.doc_ling_padding(list(batch_doc), args.max_ling)
            batch_y = np.array(list(batch_y))
            predict = fn(batch_rnn, batch_word_mask, batch_sent_mask,
                         batch_sent, batch_doc)

        matrix = confusion_matrix(predict, batch_y)
        a += matrix[0]
        b += matrix[1]
        c += matrix[2]
        d += matrix[3]
    acc = 100.0 * (a + d) / (a + b + c + d)
    pre = 100.0 * a / (a + c)
    rec = 100.0 * a / (a + b)
    fsc = 2 * pre * rec / (pre + rec)
    return acc, pre, rec, fsc
Example #5
def vec_minibatch(docs,
                  word_dict,
                  char_dict,
                  args,
                  shuffle=True,
                  char=True,
                  sent_ling=True,
                  doc_ling=True):
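    # Vectorize, pad and mask every mini-batch up front and return ready-to-use tuples
    # of arrays (plus the index array `mb`), so evaluation code can feed them straight
    # to the compiled Theano function without repeating this preprocessing.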
    examples = []
    if shuffle:
        random.shuffle(docs)
    doc_length = len(docs)
    id_list = np.arange(0, doc_length, args.batch_size)
    if shuffle:
        np.random.shuffle(id_list)
    mbs = [
        np.arange(id, min(id + args.batch_size, doc_length)) for id in id_list
    ]
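    # each entry of mbs is the array of document indices that form one mini-batch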
    for mb in mbs:
        batch_x = [docs[i] for i in mb]
        if char and sent_ling and doc_ling:
            batch_x, batch_sent, batch_doc, batch_y = zip(*batch_x)
            batch_x = util.vectorization(list(batch_x),
                                         word_dict,
                                         char_dict,
                                         max_char_length=args.max_char)
            batch_rnn, batch_sent_mask, batch_word_mask, batch_cnn = \
                util.mask_padding(batch_x, args.max_sent, args.max_word, args.max_char)
            batch_sent = util.sent_ling_padding(list(batch_sent),
                                                args.max_sent, args.max_ling)
            batch_doc = util.doc_ling_padding(list(batch_doc), args.max_ling)
            batch_y = np.array(list(batch_y))
            examples.append((batch_rnn, batch_sent_mask, batch_word_mask,
                             batch_cnn, batch_sent, batch_doc, batch_y, mb))
        elif char and sent_ling and not doc_ling:
            batch_x, batch_sent, batch_y = zip(*batch_x)
            batch_x = util.vectorization(list(batch_x),
                                         word_dict,
                                         char_dict,
                                         max_char_length=args.max_char)
            batch_rnn, batch_sent_mask, batch_word_mask, batch_cnn = \
                util.mask_padding(batch_x, args.max_sent, args.max_word, args.max_char)
            batch_sent = util.sent_ling_padding(list(batch_sent),
                                                args.max_sent, args.max_ling)
            batch_y = np.array(list(batch_y))
            examples.append((batch_rnn, batch_sent_mask, batch_word_mask,
                             batch_cnn, batch_sent, batch_y, mb))
        elif char and not sent_ling and doc_ling:
            batch_x, batch_doc, batch_y = zip(*batch_x)
            batch_x = util.vectorization(list(batch_x),
                                         word_dict,
                                         char_dict,
                                         max_char_length=args.max_char)
            batch_rnn, batch_sent_mask, batch_word_mask, batch_cnn = \
                util.mask_padding(batch_x, args.max_sent, args.max_word, args.max_char)
            batch_doc = util.doc_ling_padding(list(batch_doc), args.max_ling)
            batch_y = np.array(list(batch_y))
            examples.append((batch_rnn, batch_sent_mask, batch_word_mask,
                             batch_cnn, batch_doc, batch_y, mb))
        elif char and not sent_ling and not doc_ling:
            batch_x, batch_y = zip(*batch_x)
            batch_x = util.vectorization(list(batch_x),
                                         word_dict,
                                         char_dict,
                                         max_char_length=args.max_char)
            batch_rnn, batch_sent_mask, batch_word_mask, batch_cnn = \
                util.mask_padding(batch_x, args.max_sent, args.max_word, args.max_char)
            batch_y = np.array(list(batch_y))
            examples.append((batch_rnn, batch_sent_mask, batch_word_mask,
                             batch_cnn, batch_y, mb))
        elif not char and not sent_ling and not doc_ling:
            batch_x, batch_y = zip(*batch_x)
            batch_x = util.vectorization(list(batch_x),
                                         word_dict,
                                         char_dict,
                                         max_char_length=args.max_char)
            batch_rnn, batch_sent_mask, batch_word_mask, _ = \
                util.mask_padding(batch_x, args.max_sent, args.max_word, args.max_char)
            batch_y = np.array(list(batch_y))
            examples.append(
                (batch_rnn, batch_sent_mask, batch_word_mask, batch_y, mb))
        elif not char and sent_ling and doc_ling:
            batch_x, batch_sent, batch_doc, batch_y = zip(*batch_x)
            batch_x = util.vectorization(list(batch_x),
                                         word_dict,
                                         char_dict,
                                         max_char_length=args.max_char)
            batch_rnn, batch_sent_mask, batch_word_mask, _ = \
                util.mask_padding(batch_x, args.max_sent, args.max_word, args.max_char)
            batch_sent = util.sent_ling_padding(list(batch_sent),
                                                args.max_sent, args.max_ling)
            batch_doc = util.doc_ling_padding(list(batch_doc), args.max_ling)
            batch_y = np.array(list(batch_y))
            examples.append((batch_rnn, batch_sent_mask, batch_word_mask,
                             batch_sent, batch_doc, batch_y, mb))
    return examples