Exemple #1
0
def print_vectors():
    """Print the sentence embedding of every training sentence.

    Builds the vocabulary from ``TRAIN_PATH``, loads the pre-trained
    embedding weights from ``EMBED_PATH`` and, for each key of the training
    data returned by ``ev.prepare_evaluate()``, prints one line of the form
    ``<key>\\t<v1> <v2> ...`` where the values are the squeezed sentence
    embedding converted to a Python list.
    """
    vocab = Vocab()
    vocab.build(TRAIN_PATH)

    # Original author's note: torch.tensor([2764, 64])
    # (presumably the shape of the embedding matrix -- TODO confirm)
    pre_trained_embedding = vocab.load_weight(EMBED_PATH)

    # Only the training split is printed; the other two results are unused.
    train_data, _test_data, _test_answer = ev.prepare_evaluate()

    for sent_id in train_data.keys():
        sentence_tensor = prep.tensorFromSentence(vocab, train_data[sent_id])
        sentence_vec = get_sent_embed(sentence_tensor, pre_trained_embedding)
        components = sentence_vec.squeeze().data.tolist()
        print("%s\t%s" % (sent_id, ' '.join(map(str, components))))
Exemple #2
0
def evaluate():
    """Evaluate sentence retrieval by cosine similarity and print accuracy@5 / @1.

    Each test sentence is embedded with ``get_sent_embed`` and compared
    against the embedding of every training sentence using
    ``cosine_similarity``. The five best candidates (as returned by
    ``get_top_n``) are printed until the first correct one is reached; a
    question counts towards accuracy@5 when any of them is accepted by
    ``ev.isAnswer`` and towards accuracy@1 when the first one is.

    An empty test set reports 0.0 for both accuracies instead of raising
    ``ZeroDivisionError``, and a question with no candidates is counted as
    a miss instead of raising ``IndexError``.
    """
    vocab = Vocab()
    vocab.build(TRAIN_PATH)
    # Original author's note: torch.tensor([2764, 64])
    # (presumably the shape of the embedding matrix -- TODO confirm)
    pre_trained_embedding = vocab.load_weight(EMBED_PATH)

    train_data, test_data, test_answer = ev.prepare_evaluate()

    train_embed = get_embed(vocab, train_data, pre_trained_embedding)

    # evaluation
    print("[INFO] start evaluating!")
    total = len(test_data)
    answer5 = 0
    answer1 = 0

    for tk in test_data:
        print("Q.%s %s" % (tk, pretty_printer2(test_data[tk])))
        test_in = prep.tensorFromSentence(vocab, test_data[tk])
        embedded = get_sent_embed(test_in, pre_trained_embedding)

        # Similarity between the query and every training sentence.
        scores = {
            candi: cosine_similarity(train_embed[candi], embedded)
            for candi in train_embed.keys()
        }

        top_n = get_top_n(scores, 5)
        ranked = list(top_n.keys())
        for candi in ranked:
            print("%.4f %4s %s" %
                  (top_n[candi], candi, pretty_printer2(train_data[candi])))
            if ev.isAnswer(candi, test_answer[tk]):
                answer5 += 1
                # Stop listing candidates once a correct one has been shown.
                break
        # Guard against an empty candidate list before looking at rank 1.
        if ranked and ev.isAnswer(ranked[0], test_answer[tk]):
            answer1 += 1
        print("------------------------------------------")

    # Avoid dividing by zero when there is nothing to evaluate.
    accuracy_at_5 = answer5 / total * 100 if total else 0.0
    accuracy_at_1 = answer1 / total * 100 if total else 0.0

    print("total: %d, accuracy@5: %.4f, accuracy@1: %.4f" %
          (total, accuracy_at_5, accuracy_at_1))
Exemple #3
0
    def process(self):
        """Tokenize and numericalize the corpus, then build the dataset splits.

        The text columns of ``self.df`` (and of ``self.test_df`` when
        ``self.test_csv`` is set) are tokenized, a vocabulary is created
        from all tokens, and every document is numericalized. The result is
        split according to ``self.valid_pct`` and ``self.test_csv``; when a
        validation split is requested, the first ``self.cut`` training rows
        form the validation set and the remaining rows the training set.

        Returns:
            A tuple ``(vocab, trn_ds, vld_ds, tst_ds)`` where each dataset
            is a ``(numericalized_tokens, labels)`` pair; unused splits are
            ``([], [])`` and the test split never carries labels.
        """
        tok = Tokenizer()

        # Consider the entire corpus as text (train + test text columns).
        text = list(self.df.loc[:, self.text_cols].values)
        if self.test_csv:
            # Use `.values` here as for the training frame: iterating a
            # DataFrame directly yields its column labels, not its rows.
            text += list(self.test_df.loc[:, self.text_cols].values)

        self.tokens = [tok.tokenizer(x) for x in text]
        self.vocab = Vocab.create(self.tokens, self.max_vocab, self.min_freq)

        self.ntokens = [self.vocab.numericalize(t) for t in self.tokens]

        labels = self.df.loc[:, self.label_cols].values
        n_train = len(self.df)

        # only full training
        if self.valid_pct == 0 and self.test_csv is None:
            self.trn_ds = (self.ntokens, labels)
            self.vld_ds = ([], [])
            # The original assigned only `self.test_ds` here, leaving the
            # returned `self.tst_ds` unset in this branch.
            self.tst_ds = ([], [])
            # Kept for any caller that read the misspelled attribute.
            self.test_ds = ([], [])

        # holdout
        elif self.valid_pct > 0 and self.test_csv is None:
            self.trn_ds = (self.ntokens[self.cut:], labels[self.cut:])
            self.vld_ds = (self.ntokens[:self.cut], labels[:self.cut])
            self.tst_ds = ([], [])

        # holdout and test prediction
        elif self.valid_pct > 0 and self.test_csv is not None:
            # Train documents come first in `ntokens`, test documents after.
            self.trn_tokens = self.ntokens[:n_train]
            self.tst_ds = (self.ntokens[n_train:], [])

            trn_tokens = self.trn_tokens[self.cut:]
            vld_tokens = self.trn_tokens[:self.cut]

            self.trn_ds = (trn_tokens, labels[self.cut:])
            self.vld_ds = (vld_tokens, labels[:self.cut])

        # full training and test prediction
        else:
            self.trn_ds = (self.ntokens[:n_train], labels)
            self.vld_ds = ([], [])
            self.tst_ds = (self.ntokens[n_train:], [])

        return self.vocab, self.trn_ds, self.vld_ds, self.tst_ds
Exemple #4
0
def CaptchaGenerator4(samples, batch_size):
    """Endlessly yield random captcha batches with four separate label arrays.

    Each iteration draws ``batch_size`` paths from ``samples`` with
    ``np.random.choice``, loads every image as an array and encodes the
    substring ``path[-8:-4]`` with ``Vocab().text_to_one_hots``
    (presumably the four captcha characters before a 4-character
    extension such as ``.jpg`` -- confirm against the data layout).

    Yields:
        ``(X, [y[:, 0], y[:, 1], y[:, 2], y[:, 3]])`` where ``X`` is the
        stacked image array and ``y`` the stacked encoded labels. The four
        label slices are also printed before each yield.
    """
    while True:
        images, labels = [], []
        for path in np.random.choice(samples, batch_size):
            pixels = np.asarray(Image.open(path))
            encoded = Vocab().text_to_one_hots(path[-8:-4])
            images.append(pixels)
            labels.append(encoded)

        X = np.asarray(images)
        y = np.asarray(labels)

        # Print the label slice for each of the four positions.
        for position in range(4):
            print(y[:, position])

        yield X, [y[:, position] for position in range(4)]
Exemple #5
0
def CaptchaGenerator(samples, batch_size):
    """Endlessly yield random ``(X, y)`` captcha batches for one character.

    Each iteration draws ``batch_size`` paths from ``samples`` with
    ``np.random.choice``, loads every image as an array and encodes the
    single character ``path[-8:-7]`` with ``Vocab().text_to_one_hot``
    (presumably the first captcha character of a file named
    ``<4 chars>.<3-char ext>`` -- confirm against the data layout).

    Yields:
        ``(X, y)`` as arrays built from the stacked images and the stacked
        encoded labels.
    """
    while True:
        images, labels = [], []
        for path in np.random.choice(samples, batch_size):
            pixels = np.asarray(Image.open(path))
            encoded = Vocab().text_to_one_hot(path[-8:-7])
            images.append(pixels)
            labels.append(encoded)

        yield np.asarray(images), np.asarray(labels)
Exemple #6
0
def evaluate(args):
    """Build the encoder/decoder described by ``args`` and run ``eval_sim_lc``.

    Args:
        args: Parsed command-line namespace. Reads ``batch_size``,
            ``hidden_size``, ``w_embed_size``, ``pre_trained_embed``
            (``'n'`` builds the models without pre-trained weights; any
            other value loads ``data/komoran_hd_2times.vec`` and passes it
            to both constructors), and the optional ``encoder`` /
            ``decoder`` checkpoint paths.
    """
    vocab = Vocab()
    vocab.build(train_file)

    batch_size = args.batch_size

    # Positional arguments shared by both model constructors.
    model_args = (vocab.n_words, args.w_embed_size, args.hidden_size,
                  batch_size)
    if args.pre_trained_embed != 'n':
        # load pre-trained embedding
        weight = vocab.load_weight(path="data/komoran_hd_2times.vec")
        model_args = model_args + (weight,)

    encoder = Encoder(*model_args).to(device)
    decoder = AttentionDecoder(*model_args).to(device)

    # Restore saved parameters when checkpoint paths were supplied.
    encoder_path = args.encoder
    if encoder_path:
        encoder.load_state_dict(torch.load(encoder_path))
        print("[INFO] load encoder with %s" % encoder_path)
    decoder_path = args.decoder
    if decoder_path:
        decoder.load_state_dict(torch.load(decoder_path))
        print("[INFO] load decoder with %s" % decoder_path)

    pre_trained_embedding = vocab.load_weight(EMBED_PATH)
    eval_sim_lc(encoder, vocab, batch_size, pre_trained_embedding,
                decoder=decoder)
Exemple #7
0
def main(args):
    """Train the encoder/decoder pair and track the best evaluation scores.

    Reads ``batch_size``, ``hidden_size``, ``w_embed_size``, ``lr``,
    ``pre_trained_embed``, ``encoder``, ``decoder``, ``epoch`` and ``save``
    from ``args``. The models are evaluated once before training and then
    every 20 epochs. When ``args.save == 'y'``, checkpoints are written to
    the current directory: ``*-max.model`` on a new best accuracy@5,
    ``*-max-bleu.model`` on a new best BLEU and ``*-last.model`` at the end.

    Side effect: assigns the module-level ``batch_size``.
    """
    global batch_size
    batch_size = args.batch_size
    hidden_size = args.hidden_size
    w_embed_size = args.w_embed_size
    lr = args.lr

    train_file = 'data/train_data_nv.txt'

    vocab = Vocab()
    vocab.build(train_file)

    # 'n' builds the models without pre-trained weights; any other value
    # loads the embedding file and passes it to both constructors.
    if args.pre_trained_embed == 'n':
        encoder = Encoder(vocab.n_words, w_embed_size, hidden_size,
                          batch_size).to(device)
        decoder = AttentionDecoder(vocab.n_words, w_embed_size, hidden_size,
                                   batch_size).to(device)
    else:
        # load pre-trained embedding
        weight = vocab.load_weight(path="data/komoran_hd_2times.vec")
        encoder = Encoder(vocab.n_words, w_embed_size, hidden_size, batch_size,
                          weight).to(device)
        decoder = AttentionDecoder(vocab.n_words, w_embed_size, hidden_size,
                                   batch_size, weight).to(device)

    # Optionally resume from saved parameters.
    if args.encoder:
        encoder.load_state_dict(torch.load(args.encoder))
        print("[INFO] load encoder with %s" % args.encoder)
    if args.decoder:
        decoder.load_state_dict(torch.load(args.decoder))
        print("[INFO] load decoder with %s" % args.decoder)

    train_data = prep.read_train_data(train_file)
    train_loader = data.DataLoader(train_data,
                                   batch_size=batch_size,
                                   shuffle=True)

    # ev.evaluateRandomly(encoder, decoder, train_data, vocab, batch_size)
    # ev.evaluate_with_print(encoder, vocab, batch_size)

    # initialize
    # The pre-training scores are the baseline that later evaluations must
    # beat before a "new record" is reported.
    max_a_at_5, max_a_at_1 = ev.evaluate_similarity(encoder,
                                                    vocab,
                                                    batch_size,
                                                    decoder=decoder)
    # max_a_at_5, max_a_at_1 = 0, 0
    max_bleu = 0

    total_epoch = args.epoch
    print(args)
    for epoch in range(1, total_epoch + 1):
        # NOTE(review): train_loader was created from train_data before this
        # in-place shuffle; whether the loader observes the new order
        # depends on what prep.read_train_data returns -- confirm.
        random.shuffle(train_data)
        trainIters(args,
                   epoch,
                   encoder,
                   decoder,
                   total_epoch,
                   train_data,
                   vocab,
                   train_loader,
                   print_every=2,
                   learning_rate=lr)

        # Evaluate every 20 epochs.
        if epoch % 20 == 0:
            a_at_5, a_at_1 = ev.evaluate_similarity(encoder,
                                                    vocab,
                                                    batch_size,
                                                    decoder=decoder)

            # A new best accuracy@1 is only reported; no checkpoint is saved.
            if a_at_1 > max_a_at_1:
                max_a_at_1 = a_at_1
                print("[INFO] New record! accuracy@1: %.4f" % a_at_1)

            # A new best accuracy@5 also triggers the "-max" checkpoint.
            if a_at_5 > max_a_at_5:
                max_a_at_5 = a_at_5
                print("[INFO] New record! accuracy@5: %.4f" % a_at_5)
                if args.save == 'y':
                    torch.save(encoder.state_dict(), 'encoder-max.model')
                    torch.save(decoder.state_dict(), 'decoder-max.model')
                    print("[INFO] new model saved")

            # BLEU is tracked separately with its own "-max-bleu" checkpoint.
            bleu = ev.evaluateRandomly(encoder, decoder, train_data, vocab,
                                       batch_size)
            if bleu > max_bleu:
                max_bleu = bleu
                if args.save == 'y':
                    torch.save(encoder.state_dict(), 'encoder-max-bleu.model')
                    torch.save(decoder.state_dict(), 'decoder-max-bleu.model')
                    print("[INFO] new model saved")

    print("Done! max accuracy@5: %.4f, max accuracy@1: %.4f" %
          (max_a_at_5, max_a_at_1))
    print("max bleu: %.2f" % max_bleu)
    # Always keep the final-epoch parameters when saving is enabled.
    if args.save == 'y':
        torch.save(encoder.state_dict(), 'encoder-last.model')
        torch.save(decoder.state_dict(), 'decoder-last.model')
Exemple #8
0
    parser.add_argument('--hidden_size', type=int, default=128)
    parser.add_argument('--w_embed_size', type=int, default=64)
    parser.add_argument('--lr', type=float, default=0.001)
    parser.add_argument('--epoch', type=int, default=400)
    parser.add_argument('--save', choices=['y', 'n'], default='n')
    parser.add_argument('--pre_trained_embed', choices=['y', 'n'], default='y')
    args = parser.parse_args()

    global batch_size
    batch_size = args.batch_size
    hidden_size = args.hidden_size
    w_embed_size = args.w_embed_size

    train_file = 'data/train_data_nv.txt'

    vocab = Vocab()
    vocab.build(train_file)

    if args.pre_trained_embed == 'n':
        encoder = Encoder(vocab.n_words, w_embed_size, hidden_size,
                          batch_size).to(device)
        decoder = AttentionDecoder(vocab.n_words, w_embed_size, hidden_size,
                                   batch_size).to(device)
        # decoder = Decoder(vocab.n_words, w_embed_size, hidden_size, batch_size).to(device)
    else:
        # load pre-trained embedding
        weight = vocab.load_weight(path="data/komoran_hd_2times.vec")
        encoder = Encoder(vocab.n_words, w_embed_size, hidden_size, batch_size,
                          weight).to(device)
        decoder = AttentionDecoder(vocab.n_words, w_embed_size, hidden_size,
                                   batch_size, weight).to(device)
Exemple #9
0
from preprocess import Vocab
import matplotlib.pyplot as plt  # plt is used to display images

# Load the trained single-character captcha model and collect test images.
model = load_model('capcha_model_one_char.h5')
samples = glob.glob('data/test/*.jpg')

# Draw a random batch of image paths to inspect.
batch_size = 10
batch = np.random.choice(samples, batch_size)
print(batch)

for sample in batch:
    # Open each file once and reuse it for both the model input and the plot.
    image = Image.open(sample)
    img = np.asarray(image).reshape(1, 60, 240, 3)
    plt.imshow(image)

    # The four characters before the ".jpg" extension are shown as the
    # expected text.
    text = sample[-8:-4]
    # Run inference once and reuse the result for decoding and printing.
    prediction = model.predict(img)
    pre_text = Vocab().one_hot_to_text(prediction[0])
    print(prediction)
    print('prediction is:{}'.format(pre_text))
    print('real is {}'.format(text))

# batch = np.random.choice(samples, batch_size)
# X = []
# y = []
# for sample in batch:
#     img = np.asarray(Image.open(sample))
#     text = Vocab().text_to_one_hot(sample[-8:-7])
#     X.append(img)
#     y.append(text)
# X = np.asarray(X)
# y = np.asarray(y)
# # y1 = [y[:,i] for i in range(4)]
Exemple #10
0
# Load the training examples: one tab-separated "s<TAB>t<TAB>f" record per
# line, where the third field is further split on ';'.
with open(trainfile, 'r') as file:
    for line in file:
        s, t, f = line.split('\t')
        data[0].append((s, t, f.split(';')))

# Load the examples to parse, in the same format.
with open(parsefile, 'r') as file:
    for line in file:
        s, t, f = line.split('\t')
        data[1].append((s, t, f.split(';')))

# Prepare vocabulary. Here, it doesn't add words that appear only in evaluation set.
vocab = Vocab(data[0])
vocab.add_parsefile(data[1])

# Instantiate a model sized from the vocabulary's character and feature
# dictionaries.
char_size = len(vocab._char_dict.x2i)
feat_sizes = [len(feat_dict.x2i) for feat_dict in vocab._feat_dicts]
mdl = model.Model(
    char_dim=config.char_dim,
    feat_dim=config.feat_dim,
    hidden_dim=config.hidden_dim,
    char_size=char_size,
    feat_sizes=feat_sizes,
)

# Train and validate the model.
# It stops training when the maximum accuracy in validation set does not improve for more than specified epochs.
max_acc = 0
has_not_been_updated_for = 0

for epc in range(config.epochs):
Exemple #11
0
    for i in test_in[1:]:
        x = torch.cat((x, _pre_trained_embedding[i].view(1, -1)), 0)

    return x


def get_sentence_embed(vocab, sentence, pre_trained_embedding):
    """Represent a sentence by averaging its word embeddings.

    Args:
        vocab: Vocabulary object forwarded to ``get_word_embed_matrix``.
        sentence: Sentence to embed.
        pre_trained_embedding: Pre-trained embedding weights.

    Returns:
        The mean of the word-embedding matrix over dimension 0.
    """
    # Pass (vocab, sentence, ...) in the same order as the other call sites
    # of get_word_embed_matrix in this file; the original swapped the first
    # two arguments.
    we_matrix = get_word_embed_matrix(vocab, sentence, pre_trained_embedding)
    return torch.mean(we_matrix, 0)


def get_sentence_embed_sa(vocab, sentence, pre_trained_embedding):
    """Return the sentence's word-embedding matrix after self-attention.

    The matrix produced by ``get_word_embed_matrix`` is passed as all three
    arguments of ``scaled_dot_product_attn`` (presumably query, key and
    value -- confirm against that helper's signature).
    """
    word_vectors = get_word_embed_matrix(vocab, sentence,
                                         pre_trained_embedding)
    return scaled_dot_product_attn(word_vectors, word_vectors, word_vectors)


# Demo: embed one sample sentence with self-attention and plot the result
# as a heatmap.
if __name__ == "__main__":
    vocab = Vocab()
    vocab.build(train_file)
    pre_trained_embedding = vocab.load_weight(EMBED_PATH)

    # Sample input: space-separated "token/TAG" pairs (presumably
    # morpheme/part-of-speech output of a Korean analyzer -- confirm).
    sentence = '에어컨/NNG 작동/NNG 시/NNB 냉방/NNG 성능/NNG 떨어지/VV 그렇/VA 모르/VV 어떻/VA 하/VV 하/VX'
    #sentence = '에어컨/NNG 시원/XR 나오/VV 않/VX 그렇/VA 자동차/NNG 고장/NNG 아니/VCN 하/VV 연락/NNG 드리/VV'
    #data = get_word_embed_matrix(vocab, sentence, pre_trained_embedding)
    data = get_sentence_embed_sa(vocab, sentence, pre_trained_embedding)
    print(data.size())
    # Visualize the attended embedding matrix.
    sns.heatmap(data)
    plt.show()
Exemple #12
0
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import os
from config import config
from preprocess import Vocab, Preprocess, dataset
from model import Encoder, Attention, Decoder
from torch.utils.data import DataLoader, Dataset

# Smoke test of the preprocessing pipeline: build the vocabulary, generate
# and tokenize sentence pairs, and pull a single batch from the DataLoader.
if __name__ == "__main__":
    print('==> Loading config......')
    cfg = config()
    print('==> Preprocessing data......')
    voc = Vocab(cfg)
    voc.gen_counter_dict()
    voc.gen_vocab()
    # Record the vocabulary size on the config object.
    cfg.vocab_len = voc.vocab_len
    print('The length of vocab is: {}'.format(cfg.vocab_len))

    prep = Preprocess(cfg, voc.vocab)
    pairs = prep.gen_pair_sen()
    print('pairs sentences generated.')
    pairs = prep.tokenize(pairs)
    print('sentences tokenized.')

    traindataset = dataset(pairs, voc.vocab)
    traindataloader = DataLoader(traindataset, batch_size=5, shuffle=False)
    # Use the built-in next(): the Python 2 style `.next()` method is not
    # part of the Python 3 iterator protocol and is absent from newer
    # DataLoader iterators.
    one_iter = next(iter(traindataloader))
Exemple #13
0
    print("total: %d, accuracy@5: %.4f, accuracy@1: %.4f" % (total, accuracy_at_5, accuracy_at_1))
    """


# Command-line entry point: parse the model options, build the vocabulary
# and load the pre-trained embedding weights when requested.
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    # Optional paths to previously saved encoder/decoder parameters.
    parser.add_argument('--encoder', help='load exisiting model')
    parser.add_argument('--decoder', help='load exisiting model')
    parser.add_argument('--batch_size', type=int, default=40)
    parser.add_argument('--hidden_size', type=int, default=128)
    parser.add_argument('--w_embed_size', type=int, default=64)
    # 'n' (the default here) builds the models without pre-trained weights.
    parser.add_argument('--pre_trained_embed', choices=['y', 'n'], default='n')
    args = parser.parse_args()

    vocab = Vocab()
    vocab.build(train_file)

    batch_size = args.batch_size
    hidden_size = args.hidden_size
    w_embed_size = args.w_embed_size

    if args.pre_trained_embed == 'n':
        encoder = Encoder(vocab.n_words, w_embed_size, hidden_size,
                          batch_size).to(device)
        decoder = AttentionDecoder(vocab.n_words, w_embed_size, hidden_size,
                                   batch_size).to(device)
        # decoder = Decoder(vocab.n_words, w_embed_size, hidden_size, batch_size).to(device)
    else:
        # load pre-trained embedding
        weight = vocab.load_weight(path="data/komoran_hd_2times.vec")