Example #1
def load_data(args):
    global word_dict, word_embed
    docs = []
    docs += du.load_sent('../datasets/bbcnews.txt') # BBC_news
    # docs += du.load_sent('../datasets/BBC_news.txt')
    word_dict = util.build_dict(docs)
    # inv_dict = util.build_inv_dict(word_dict)
    word_embed = util.words2embedding(word_dict, 100, args.embedding_file) 
    print('word_dict:', word_dict)
    with open('../datasets/word_dict', 'wb') as fid:
        dump(word_dict, fid)
    doc = ' '.join(docs)
    return doc
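
The word_dict pickled above is what the last train() example below reads back from '../datasets/word_dict'; assuming dump and load come from pickle, the reading side of the round trip is simply:

from pickle import load  # counterpart of the dump() call above (assumed to be pickle)

with open('../datasets/word_dict', 'rb') as fid:
    word_dict = load(fid)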
def test1(args):
    docs = []
    docs += du.load_sent('../datasets/bbcnews.txt')
    logging.info('docs: {}'.format(len(docs)))
    logging.info("building dictionary...")
    word_dict, char_dict = util.build_dict(docs)
    word_embed = util.words2embedding(word_dict, 100, args.embedding_file) 
    (args.word_vocab_size, args.word_embed_size) = word_embed.shape
  
    logging.info('word_embed shape: {}'.format(word_embed.shape))  # (119, 100) # Words: 117 -> 117
    print(word_dict)
    doc = ' '.join(docs[0])
    # with open('bbcnews.txt') as f:
    #     docs = f.read()
    # sp.build_graph(doc)
    vertice_map = sp.hash_vertex(doc)
    for vertice in vertice_map:
        print(words2word(vertice[0], word_embed, word_dict))
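
The examples on this page treat util.words2embedding(word_dict, dim, embedding_file) as returning a matrix of shape (len(word_dict), dim); the snippet above unpacks word_embed.shape into (word_vocab_size, word_embed_size) on exactly that assumption. A minimal sketch of such a helper, assuming a GloVe-style text file ("word v1 v2 ... v_dim" per line) and random initialization for words missing from the file, could look like this (the function name and file format are assumptions, not the project's actual implementation):

import numpy as np

def words2embedding_sketch(word_dict, dim, embedding_file):
    # start from small random vectors so out-of-vocabulary words still get an embedding
    embed = np.random.uniform(-0.1, 0.1, (len(word_dict), dim))
    with open(embedding_file, encoding='utf-8') as f:
        for line in f:
            parts = line.rstrip().split(' ')
            word, values = parts[0], parts[1:]
            if word in word_dict and len(values) == dim:
                embed[word_dict[word]] = np.asarray(values, dtype=np.float64)
    return embed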
Example #3
def main(args):
    logging.info("loading data...")
    fake_train, fake_dev, fake_test = du.load_fake(doc_ling=False,
                                                   sent_ling=False)
    true_train, true_dev, true_test = du.load_true(doc_ling=False,
                                                   sent_ling=False)
    if args.debug:
        true_train = true_train[0][:100]
        fake_train = fake_train[:10]
        true_dev = true_dev[:100]
        fake_dev = fake_dev[:10]
        true_test = true_test[:100]
        fake_test = fake_test[:10]
    if args.rnn_type == 'gru':
        args.rnn = lasagne.layers.GRULayer
    elif args.rnn_type == 'lstm':
        args.rnn = lasagne.layers.LSTMLayer
    else:
        args.rnn = lasagne.layers.RecurrentLayer

    logging.info("building dictionary...")
    word_dict, char_dict = util.build_dict(
        None, max_words=0, dict_file=["word_dict", "char_dict"])
    logging.info("creating embedding matrix...")
    word_embed = util.words2embedding(word_dict, 100, args.embedding_file)
    char_embed = util.char2embedding(char_dict, 30)
    (args.word_vocab_size, args.word_embed_size) = word_embed.shape
    (args.char_vocab_size, args.char_embed_size) = char_embed.shape
    logging.info("compiling Theano function...")
    att_fn, eval_fn, train_fn, params = create_theano_function(word_embed,
                                                               char_embed,
                                                               values=None)
    logging.info("batching examples...")
    dev_examples = mb.vec_minibatch(fake_dev + true_dev,
                                    word_dict,
                                    char_dict,
                                    args,
                                    False,
                                    char=False,
                                    sent_ling=False,
                                    doc_ling=False)
    test_examples = mb.vec_minibatch(fake_test + true_test,
                                     word_dict,
                                     char_dict,
                                     args,
                                     False,
                                     char=False,
                                     sent_ling=False,
                                     doc_ling=False)

    temp = []
    for true_batch in true_train:
        temp += true_batch
    true_train = temp
    del temp
    train_examples = mb.doc_minibatch(fake_train + true_train, args.batch_size)

    # train_examples = mb.train_doc_minibatch(fake_train, true_train, args)
    logging.info("checking network...")
    dev_acc = evals.eval_vec_batch(eval_fn,
                                   dev_examples,
                                   char=False,
                                   sent_ling=False,
                                   doc_ling=False)
    print('Dev A: %.2f P:%.2f R:%.2f F:%.2f' % dev_acc)
    test_acc = evals.eval_vec_batch(eval_fn,
                                    test_examples,
                                    char=False,
                                    sent_ling=False,
                                    doc_ling=False)
    print('Performance on Test set: A: %.2f P:%.2f R:%.2f F:%.2f' % test_acc)
    prev_fsc = 0
    stop_count = 0
    best_fsc = 0
    best_acc = 0
    record = ''  # initialized so the final print(record) never raises a NameError
    logging.info("training %d examples" % len(train_examples))
    start_time = time.time()
    n_updates = 0
    for epoch in range(args.epoches):
        np.random.shuffle(train_examples)
        # if epoch > 3:
        #     logging.info("compiling Theano function again...")
        #     args.learning_rate *= 0.9
        #     att_fn, eval_fn, train_fn, params = create_theano_function(
        #         word_embed, char_embed, values=[x.get_value() for x in params])
        for batch_x, _ in train_examples:
            batch_x, batch_y = zip(*batch_x)
            batch_x = util.vectorization(list(batch_x),
                                         word_dict,
                                         char_dict,
                                         max_char_length=args.max_char)
            batch_rnn, batch_sent_mask, batch_word_mask, _ = \
                util.mask_padding(batch_x, args.max_sent, args.max_word, args.max_char)
            batch_y = np.array(list(batch_y))
            train_loss = train_fn(batch_rnn, batch_word_mask, batch_sent_mask,
                                  batch_y)
            n_updates += 1
            if n_updates % 100 == 0 and epoch > 7:
                logging.info(
                    'Epoch = %d, loss = %.2f, elapsed time = %.2f (s)' %
                    (epoch, train_loss, time.time() - start_time))
                # dev_acc = evals.eval_batch(eval_fn, dev_examples, word_dict, char_dict, args)
                dev_acc = evals.eval_vec_batch(eval_fn,
                                               dev_examples,
                                               char=False,
                                               sent_ling=False,
                                               doc_ling=False)
                logging.info('Dev A: %.2f P:%.2f R:%.2f F:%.2f' % dev_acc)
                if dev_acc[3] > best_fsc and dev_acc[0] > best_acc:
                    best_fsc = dev_acc[3]
                    best_acc = dev_acc[0]
                    logging.info(
                        'Best dev f1: epoch = %d, n_updates = %d, f1 = %.2f %%'
                        % (epoch, n_updates, dev_acc[3]))
                    record = 'Best dev accuracy: epoch = %d, n_updates = %d ' % \
                             (epoch, n_updates) + ' Dev A: %.2f P:%.2f R:%.2f F:%.2f' % dev_acc
                    test_acc = evals.eval_vec_batch(eval_fn,
                                                    test_examples,
                                                    char=False,
                                                    sent_ling=False,
                                                    doc_ling=False)
                    print(
                        'Performance on Test set: A: %.2f P:%.2f R:%.2f F:%.2f'
                        % test_acc)
                    # util.save_params('char_not_params_%.2f' % test_acc[3], params,
                    #                  epoch=epoch, n_updates=n_updates)
                if prev_fsc > dev_acc[3]:
                    stop_count += 1
                else:
                    stop_count = 0
                if stop_count == 6:
                    # note: this only reports the stall; the training loop keeps running
                    print("stopped")
                prev_fsc = dev_acc[3]

    print(record)
    print('Performance on Test set: A: %.2f P:%.2f R:%.2f F:%.2f' % test_acc)
    return
def train(trainloader, args):
    device = 'cuda' if args.cuda else 'cpu'

    losses = []

    # TODO: concretize the embedding step

    # BASE = '/homes/du113/scratch/satire-models/'
    print('load word dict...')
    BASE = '../datasets/dict/'
    with open(BASE + 'cnn_dict.pkl', 'rb') as fid:
        word_dict = load(fid)
    print('loading completed!')
    print(len(word_dict))
    print('trump:', word_dict['trump'])
    print('obama:', word_dict['obama'])

    # load word embedding
    word_embed = util.words2embedding(word_dict, 100, args.embedding_file)

    trump_emb_before = word_embed[word_dict['trump']]
    obama_emb_before = word_embed[word_dict['obama']]
    dist_abs_before = distance_abs(trump_emb_before, obama_emb_before)
    dist_mul_before = distance_mul(trump_emb_before, obama_emb_before)

    # print(word_embed.shape)

    model = FEELModel(pretrained=True, embeddings=word_embed)
    model = model.double()

    if device == 'cuda':
        model = nn.DataParallel(model).cuda()

    optimizer = optim.SGD(model.parameters(), lr=args.lr)

    for e in range(args.epochs):
        logging.warning('{}th epoch'.format(e))
        total_loss = 0
        for i, batch in enumerate(trainloader):
            # each tensor has size (batchsize x 4 x arg.max_word_length)
            # most probably a 3 dimensional tensor
            # query, pos, neg = batch

            # query_zip, pos_zip, neg_zip = batch

            # # TODO: query should contain both vertex id(query[0]) and word_idx_list(query[1])
            # qtree = du.build_tree(query_zip[0])
            # ptree = du.build_tree(pos_zip[0])
            # ntree = du.build_tree(neg_zip[0])

            # query, pos, neg = query_zip[1], pos_zip[1], neg_zip[1]

            query, pos, neg = batch
            print('neg.shape:', neg.shape)  # torch.Size([16, 4, 203])

            qtree = du.build_tree(query)
            ptree = du.build_tree(pos)
            ntree = du.build_tree(neg)

            # print(query.shape) 2d
            # print(pos.shape)
            '''
            q_a0, q_v, q_a1, q_a2, q_m = query[:,0], query[:,1], \
                    query[:,2], query[:,3], query[:,4]
            p_a0, p_v, p_a1, p_a2, p_m = pos[:,0], pos[:,1], \
                    pos[:,2], pos[:,3], pos[:,4]
            n_a0, n_v, n_a1, n_a2, n_m = neg[:,0], neg[:,1], \
                    neg[:,2], neg[:,3], neg[:,4]

            q_a0, q_v, q_a1, q_a2, q_m = Variable(q_a0).to(device), \
                    Variable(q_v).to(device), \
                    Variable(q_a1).to(device), \
                    Variable(q_a2).to(device), \
                    Variable(q_m).to(device)

            p_a0, p_v, p_a1, p_a2, p_m = Variable(p_a0).to(device), \
                    Variable(p_v).to(device), \
                    Variable(p_a1).to(device), \
                    Variable(p_a2).to(device), \
                    Variable(p_m).to(device)

            n_a0, n_v, n_a1, n_a2, n_m = Variable(n_a0).to(device), \
                    Variable(n_v).to(device), \
                    Variable(n_a1).to(device), \
                    Variable(n_a2).to(device), \
                    Variable(n_m).to(device)

            model.zero_grad()
            
            output = model((q_v, p_v, n_v)) + model((q_v, q_a0, n_a0)) \
                    + model((q_v, q_a1, n_a1)) + model((q_v, q_a2, n_a2)) \
                    + model((q_v, q_m, n_m))
            '''
            q_v, q_a0, q_a1, q_a2 = query[:, 0, :], query[:, 1, :], \
                query[:, 2, :], query[:, 3, :]
            p_v, p_a0, p_a1, p_a2 = pos[:, 0, :], pos[:, 1, :], \
                pos[:, 2, :], pos[:, 3, :]
            n_v, n_a0, n_a1, n_a2 = neg[:, 0, :], neg[:, 1, :], \
                neg[:, 2, :], neg[:, 3, :]

            q_v, q_a0, q_a1, q_a2 = \
                Variable(q_v).to(device), \
                Variable(q_a0).to(device), \
                Variable(q_a1).to(device), \
                Variable(q_a2).to(device)

            p_v, p_a0, p_a1, p_a2 = \
                Variable(p_v).to(device), \
                Variable(p_a0).to(device), \
                Variable(p_a1).to(device), \
                Variable(p_a2).to(device)

            n_v, n_a0, n_a1, n_a2 = \
                Variable(n_v).to(device), \
                Variable(n_a0).to(device), \
                Variable(n_a1).to(device), \
                Variable(n_a2).to(device)

            query, pos, neg = \
                Variable(query).to(device), \
                Variable(pos).to(device), \
                Variable(neg).to(device)

            model.zero_grad()

            inputs = (q_v, q_a0, n_a0), (q_v, q_a1, n_a1), (q_v, q_a2, n_a2),\
                     (query, pos, neg), (qtree, ptree, ntree)
            output = model(inputs)

            loss = my_loss_fn(output)
            loss.backward()

            optimizer.step()

            total_loss += loss.item()
            if i % 20 == 0:
                logging.warning('{}th iteration'.format(i))
        logging.warning('loss: {}'.format(total_loss))
        losses.append(total_loss)

        trump_emb_after = model.embeddings.weight[word_dict['trump']]
        obama_emb_after = model.embeddings.weight[word_dict['obama']]
        dist_abs_after = distance_abs(trump_emb_after, obama_emb_after)
        dist_mul_after = distance_mul(trump_emb_after, obama_emb_after)

        print(
            '{}th epoch, distance abs before: {}, distance abs after: {}, distance mul before: {}, distance mul after: {}'
            .format(e, dist_abs_before, dist_abs_after, dist_mul_before,
                    dist_mul_after))

    trump_emb_after = model.embeddings.weight[word_dict['trump']]
    obama_emb_after = model.embeddings.weight[word_dict['obama']]
    dist_abs_after = distance_abs(trump_emb_after, obama_emb_after)
    dist_mul_after = distance_mul(trump_emb_after, obama_emb_after)

    print(
        'distance abs before: {}, distance abs after: {}, distance mul before: {}, distance mul after: {}'
        .format(dist_abs_before, dist_abs_after, dist_mul_before,
                dist_mul_after))

    # plot
    x = list(range(len(losses)))
    plt.plot(x, losses, 'ro')
    # plt.axis([-2, 2, -2, 2])
    plt.xlabel('epoch')
    plt.ylabel('loss')
    plt.title('loss')
    # plt.show()
    plt.savefig("../datasets/plot/loss_treeLSTM.png")
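
Both train() examples (the one above and the one that follows) iterate a trainloader yielding (query, pos, neg) triples; per the inline comments each tensor has shape (batch_size, 4, max_word_length) of word indices, which is then sliced into the four components v, a0, a1 and a2. A minimal sketch of building such a loader from toy index tensors, assuming torch.utils.data.TensorDataset is an acceptable stand-in for the project's dataset class, might be:

import torch
from torch.utils.data import DataLoader, TensorDataset

# toy data: 32 samples, 4 components per sample, max_word_length of 203 (as in the shape printed above)
max_word_length = 203
query = torch.randint(0, 5000, (32, 4, max_word_length))
pos = torch.randint(0, 5000, (32, 4, max_word_length))
neg = torch.randint(0, 5000, (32, 4, max_word_length))

trainloader = DataLoader(TensorDataset(query, pos, neg), batch_size=16, shuffle=True)
for query_b, pos_b, neg_b in trainloader:
    print(query_b.shape)  # torch.Size([16, 4, 203])
    break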
def train(trainloader, args):
    device = 'cuda' if args.cuda else 'cpu'

    losses = []

    # TODO: concretize the embedding step

    # BASE = '/homes/du113/scratch/satire-models/'
    BASE = '../datasets/'
    with open(BASE + 'word_dict', 'rb') as fid:
        word_dict = load(fid)

    # load word embedding
    word_embed = util.words2embedding(word_dict, 100, args.embedding_file)
    # print(word_embed.shape)

    model_intra = FEELModel_intra(pretrained=True, embeddings=word_embed)
    model_inter = FEELModel_inter()

    if device == 'cuda':
        model_inter = nn.DataParallel(model_inter).cuda()
        model_intra = nn.DataParallel(model_intra).cuda()

    optimizer = optim.SGD(model_intra.parameters(), lr=args.lr)

    for e in range(args.epochs):
        logging.warning('{}th epoch'.format(e))
        total_loss = 0
        for i, batch in enumerate(trainloader):
            # each tensor has size (batchsize x 4 x arg.max_word_length)
            # most probably a 3 dimensional tensor
            query, pos, neg = batch

            # print(pos.shape)
            # print(neg.shape)
            '''
            q_a0, q_v, q_a1, q_a2, q_m = query[:,0], query[:,1], \
                    query[:,2], query[:,3], query[:,4]
            p_a0, p_v, p_a1, p_a2, p_m = pos[:,0], pos[:,1], \
                    pos[:,2], pos[:,3], pos[:,4]
            n_a0, n_v, n_a1, n_a2, n_m = neg[:,0], neg[:,1], \
                    neg[:,2], neg[:,3], neg[:,4]

            q_a0, q_v, q_a1, q_a2, q_m = Variable(q_a0).to(device), \
                    Variable(q_v).to(device), \
                    Variable(q_a1).to(device), \
                    Variable(q_a2).to(device), \
                    Variable(q_m).to(device)

            p_a0, p_v, p_a1, p_a2, p_m = Variable(p_a0).to(device), \
                    Variable(p_v).to(device), \
                    Variable(p_a1).to(device), \
                    Variable(p_a2).to(device), \
                    Variable(p_m).to(device)

            n_a0, n_v, n_a1, n_a2, n_m = Variable(n_a0).to(device), \
                    Variable(n_v).to(device), \
                    Variable(n_a1).to(device), \
                    Variable(n_a2).to(device), \
                    Variable(n_m).to(device)

            model.zero_grad()
            
            output = model((q_v, p_v, n_v)) + model((q_v, q_a0, n_a0)) \
                    + model((q_v, q_a1, n_a1)) + model((q_v, q_a2, n_a2)) \
                    + model((q_v, q_m, n_m))
            '''
            q_v, q_a0, q_a1, q_a2 = query[:, 0, :], query[:, 1, :], \
                query[:, 2, :], query[:, 3, :]
            p_v, p_a0, p_a1, p_a2 = pos[:, 0, :], pos[:, 1, :], \
                pos[:, 2, :], pos[:, 3, :]
            n_v, n_a0, n_a1, n_a2 = neg[:, 0, :], neg[:, 1, :], \
                neg[:, 2, :], neg[:, 3, :]

            # torch.Size([2, 4, 3]) -> torch.Size([2, 3])


            q_v, q_a0, q_a1, q_a2 = \
                Variable(q_v).to(device), \
                Variable(q_a0).to(device), \
                Variable(q_a1).to(device), \
                Variable(q_a2).to(device)

            p_v, p_a0, p_a1, p_a2 = \
                Variable(p_v).to(device), \
                Variable(p_a0).to(device), \
                Variable(p_a1).to(device), \
                Variable(p_a2).to(device)

            n_v, n_a0, n_a1, n_a2 = \
                Variable(n_v).to(device), \
                Variable(n_a0).to(device), \
                Variable(n_a1).to(device), \
                Variable(n_a2).to(device)


            query, pos, neg = \
                Variable(query).to(device), \
                Variable(pos).to(device), \
                Variable(neg).to(device)

            # 'model' is not defined in this function; clear gradients on the two sub-models instead
            model_intra.zero_grad()
            model_inter.zero_grad()

            output1 = model_intra((q_v, q_a0, n_a0))
            output2 = model_intra((q_v, q_a1, n_a1))
            output3 = model_intra((q_v, q_a2, n_a2))
            output_inter = model_inter((query, pos, neg))
            # query, pos, neg: 2d wordidx matrix -> average to 1d list emb
            # the output should be of dimensions (batch_size)
            output = output1 + output2 + output3 + output_inter

            loss = my_loss_fn(output)
            loss.backward()

            optimizer.step()

            total_loss += loss.item()
            if i % 20 == 0:
                logging.warning('{}th iteration'.format(i))
        logging.warning('loss: {}'.format(total_loss))
        losses.append(total_loss)