Example #1
def print_vectors():
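    # Print one line per training item: the item id, a tab, then the values of
    # its sentence embedding joined by spaces.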
    vocab = Vocab()
    vocab.build(TRAIN_PATH)

    # pre-trained embedding matrix, shape [2764, 64] (vocab size x embedding dim)
    pre_trained_embedding = vocab.load_weight(EMBED_PATH)

    train_data, test_data, test_answer = ev.prepare_evaluate()

    obj = train_data

    for d in obj.keys():
        t_in = prep.tensorFromSentence(vocab, obj[d])
        embedded = get_sent_embed(t_in, pre_trained_embedding)
        print("%s\t%s" %
              (d, ' '.join([str(e)
                            for e in embedded.squeeze().data.tolist()])))
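
get_sent_embed is used above but not shown in the excerpt. A minimal sketch of what it plausibly does, assuming it looks up each token's pre-trained vector and mean-pools them into a single sentence vector (the name and arguments come from the example; the body is an assumption):

import torch

def get_sent_embed(tensor_in, pre_trained_embedding):
    # Hypothetical implementation: index the [vocab_size, 64] embedding matrix
    # with the token ids and average over the sentence.
    token_vectors = pre_trained_embedding[tensor_in.view(-1)]  # [seq_len, 64]
    return token_vectors.mean(dim=0, keepdim=True)             # [1, 64]

With this shape, embedded.squeeze().data.tolist() in the loop above yields a flat list of 64 floats per item.
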
Example #2
def evaluate():
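    # For every test question, rank all training questions by cosine similarity
    # of their sentence embeddings and report accuracy@5 and accuracy@1.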
    vocab = Vocab()
    vocab.build(TRAIN_PATH)
    # pre-trained embedding matrix, shape [2764, 64] (vocab size x embedding dim)
    pre_trained_embedding = vocab.load_weight(EMBED_PATH)

    train_data, test_data, test_answer = ev.prepare_evaluate()

    train_embed = get_embed(vocab, train_data, pre_trained_embedding)

    # evaluation
    print("[INFO] start evaluating!")
    total = len(test_data)
    answer5 = 0
    answer1 = 0

    for tk in test_data:
        print("Q.%s %s" % (tk, pretty_printer2(test_data[tk])))
        test_in = prep.tensorFromSentence(vocab, test_data[tk])
        embedded = get_sent_embed(test_in, pre_trained_embedding)

        temp = {}
        for candi in train_embed.keys():
            t = train_embed[candi]
            e = embedded
            temp[candi] = cosine_similarity(t, e)

        top_n = get_top_n(temp, 5)
        for e in top_n.keys():
            print("%.4f %4s %s" %
                  (top_n[e], e, pretty_printer2(train_data[e])))
            if ev.isAnswer(e, test_answer[tk]):
                answer5 += 1
                break
        top1 = list(top_n.keys())[0]
        if ev.isAnswer(top1, test_answer[tk]):
            answer1 += 1
        print("------------------------------------------")

    accuracy_at_5 = answer5 / total * 100
    accuracy_at_1 = answer1 / total * 100

    print("total: %d, accuracy@5: %.4f, accuracy@1: %.4f" %
          (total, accuracy_at_5, accuracy_at_1))
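
cosine_similarity and get_top_n are helpers the loop relies on but that are not part of this excerpt. A minimal sketch under the assumption that cosine_similarity compares two sentence vectors and get_top_n keeps the n highest-scoring candidates in descending order (so that the first key of top_n is the best match):

import torch
import torch.nn.functional as F

def cosine_similarity(a, b):
    # Cosine similarity between two [1, dim] (or 1-D) sentence embeddings.
    return F.cosine_similarity(a.view(1, -1), b.view(1, -1)).item()

def get_top_n(scores, n):
    # Return the n best candidates as a dict ordered by descending similarity,
    # matching how the evaluation loop iterates over top_n.keys().
    ranked = sorted(scores.items(), key=lambda kv: kv[1], reverse=True)
    return dict(ranked[:n])
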
Example #3
def evaluate(args):
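    # Rebuild the vocabulary, restore trained encoder/decoder weights and run
    # the similarity evaluation with the pre-trained embeddings.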
    vocab = Vocab()
    vocab.build(train_file)

    batch_size = args.batch_size
    hidden_size = args.hidden_size
    w_embed_size = args.w_embed_size

    if args.pre_trained_embed == 'n':
        encoder = Encoder(vocab.n_words, w_embed_size, hidden_size,
                          batch_size).to(device)
        decoder = AttentionDecoder(vocab.n_words, w_embed_size, hidden_size,
                                   batch_size).to(device)
        # decoder = Decoder(vocab.n_words, w_embed_size, hidden_size, batch_size).to(device)
    else:
        # load pre-trained embedding
        weight = vocab.load_weight(path="data/komoran_hd_2times.vec")
        encoder = Encoder(vocab.n_words, w_embed_size, hidden_size, batch_size,
                          weight).to(device)
        decoder = AttentionDecoder(vocab.n_words, w_embed_size, hidden_size,
                                   batch_size, weight).to(device)
        # decoder = Decoder(vocab.n_words, w_embed_size, hidden_size, batch_size, weight).to(device)

    if args.encoder:
        encoder.load_state_dict(torch.load(args.encoder))
        print("[INFO] load encoder with %s" % args.encoder)
    if args.decoder:
        decoder.load_state_dict(torch.load(args.decoder))
        print("[INFO] load decoder with %s" % args.decoder)

    # evaluate_similarity(encoder, vocab, batch_size, decoder=decoder)

    pre_trained_embedding = vocab.load_weight(EMBED_PATH)
    eval_sim_lc(encoder,
                vocab,
                batch_size,
                pre_trained_embedding,
                decoder=decoder)
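
Note that torch.load with no map_location expects the checkpoint's original device. If the encoder/decoder were saved on a GPU and evaluation runs elsewhere, a variant of the loading step (reusing the names from the example) would be:

    if args.encoder:
        # map_location re-maps GPU-saved tensors onto the device used here
        encoder.load_state_dict(torch.load(args.encoder, map_location=device))
    if args.decoder:
        decoder.load_state_dict(torch.load(args.decoder, map_location=device))
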
Example #4
def main(args):
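    # Training entry point: build (or restore) the encoder and attention decoder,
    # then train for args.epoch epochs, evaluating similarity and BLEU every
    # 20 epochs and saving the best checkpoints when --save y is given.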
    global batch_size
    batch_size = args.batch_size
    hidden_size = args.hidden_size
    w_embed_size = args.w_embed_size
    lr = args.lr

    train_file = 'data/train_data_nv.txt'

    vocab = Vocab()
    vocab.build(train_file)

    if args.pre_trained_embed == 'n':
        encoder = Encoder(vocab.n_words, w_embed_size, hidden_size,
                          batch_size).to(device)
        decoder = AttentionDecoder(vocab.n_words, w_embed_size, hidden_size,
                                   batch_size).to(device)
    else:
        # load pre-trained embedding
        weight = vocab.load_weight(path="data/komoran_hd_2times.vec")
        encoder = Encoder(vocab.n_words, w_embed_size, hidden_size, batch_size,
                          weight).to(device)
        decoder = AttentionDecoder(vocab.n_words, w_embed_size, hidden_size,
                                   batch_size, weight).to(device)

    if args.encoder:
        encoder.load_state_dict(torch.load(args.encoder))
        print("[INFO] load encoder with %s" % args.encoder)
    if args.decoder:
        decoder.load_state_dict(torch.load(args.decoder))
        print("[INFO] load decoder with %s" % args.decoder)

    train_data = prep.read_train_data(train_file)
    train_loader = data.DataLoader(train_data,
                                   batch_size=batch_size,
                                   shuffle=True)

    # ev.evaluateRandomly(encoder, decoder, train_data, vocab, batch_size)
    # ev.evaluate_with_print(encoder, vocab, batch_size)

    # initialize
    max_a_at_5, max_a_at_1 = ev.evaluate_similarity(encoder,
                                                    vocab,
                                                    batch_size,
                                                    decoder=decoder)
    # max_a_at_5, max_a_at_1 = 0, 0
    max_bleu = 0

    total_epoch = args.epoch
    print(args)
    for epoch in range(1, total_epoch + 1):
        random.shuffle(train_data)
        trainIters(args,
                   epoch,
                   encoder,
                   decoder,
                   total_epoch,
                   train_data,
                   vocab,
                   train_loader,
                   print_every=2,
                   learning_rate=lr)

        if epoch % 20 == 0:
            a_at_5, a_at_1 = ev.evaluate_similarity(encoder,
                                                    vocab,
                                                    batch_size,
                                                    decoder=decoder)

            if a_at_1 > max_a_at_1:
                max_a_at_1 = a_at_1
                print("[INFO] New record! accuracy@1: %.4f" % a_at_1)

            if a_at_5 > max_a_at_5:
                max_a_at_5 = a_at_5
                print("[INFO] New record! accuracy@5: %.4f" % a_at_5)
                if args.save == 'y':
                    torch.save(encoder.state_dict(), 'encoder-max.model')
                    torch.save(decoder.state_dict(), 'decoder-max.model')
                    print("[INFO] new model saved")

            bleu = ev.evaluateRandomly(encoder, decoder, train_data, vocab,
                                       batch_size)
            if bleu > max_bleu:
                max_bleu = bleu
                if args.save == 'y':
                    torch.save(encoder.state_dict(), 'encoder-max-bleu.model')
                    torch.save(decoder.state_dict(), 'decoder-max-bleu.model')
                    print("[INFO] new model saved")

    print("Done! max accuracy@5: %.4f, max accuracy@1: %.4f" %
          (max_a_at_5, max_a_at_1))
    print("max bleu: %.2f" % max_bleu)
    if args.save == 'y':
        torch.save(encoder.state_dict(), 'encoder-last.model')
        torch.save(decoder.state_dict(), 'decoder-last.model')
Example #5
    parser.add_argument('--w_embed_size', type=int, default=64)
    parser.add_argument('--lr', type=float, default=0.001)
    parser.add_argument('--epoch', type=int, default=400)
    parser.add_argument('--save', choices=['y', 'n'], default='n')
    parser.add_argument('--pre_trained_embed', choices=['y', 'n'], default='y')
    args = parser.parse_args()

    global batch_size
    batch_size = args.batch_size
    hidden_size = args.hidden_size
    w_embed_size = args.w_embed_size

    train_file = 'data/train_data_nv.txt'

    vocab = Vocab()
    vocab.build(train_file)

    if args.pre_trained_embed == 'n':
        encoder = Encoder(vocab.n_words, w_embed_size, hidden_size,
                          batch_size).to(device)
        decoder = AttentionDecoder(vocab.n_words, w_embed_size, hidden_size,
                                   batch_size).to(device)
        # decoder = Decoder(vocab.n_words, w_embed_size, hidden_size, batch_size).to(device)
    else:
        # load pre-trained embedding
        weight = vocab.load_weight(path="data/komoran_hd_2times.vec")
        encoder = Encoder(vocab.n_words, w_embed_size, hidden_size, batch_size,
                          weight).to(device)
        decoder = AttentionDecoder(vocab.n_words, w_embed_size, hidden_size,
                                   batch_size, weight).to(device)
        # decoder = Decoder(vocab.n_words, w_embed_size, hidden_size, batch_size, weight).to(device)
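
This snippet starts in the middle of the argument parsing, so the parser setup itself is not part of the excerpt. For orientation only, a hypothetical reconstruction of the omitted lines; the argument names are implied by how args.batch_size, args.hidden_size, args.encoder and args.decoder are used in Examples #3 and #4, and the defaults below are pure assumptions:

import argparse

parser = argparse.ArgumentParser()
# Assumed defaults; only the argument names are grounded in the other examples.
parser.add_argument('--batch_size', type=int, default=32)
parser.add_argument('--hidden_size', type=int, default=128)
parser.add_argument('--encoder', default=None, help='path to a saved encoder state_dict')
parser.add_argument('--decoder', default=None, help='path to a saved decoder state_dict')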