Example 1
# NOTE: this snippet assumes the globals from the setup examples below
# (gpu_use, device, n_iter, mb_size, vocab, bert_model, bert_tokenizer)
# and the helpers embedding, padding_values, padding_tokens, postprocessing.
import re

import numpy as np
import torch


def main():
    from model import TowardModel
    model = TowardModel(gpu=gpu_use)
    model_name = 'simple_model_50000'
    model.load_state_dict(torch.load('models/{}'.format(model_name)))

    model = model.to(device)
    model.eval()

    f = open("../../sentiment_data/nsmc-master/ratings_train.txt", 'r')
    lines = f.readlines()
    data_number = len(lines) - 1

    line_number = 0
    accuracy = 0
    for it in range(n_iter):
        inputs_value = []
        inputs_token = []
        labels = []
        fake_labels = []

        inputsize = 0  # count batch size
        while inputsize < mb_size:
            line_number = line_number % data_number + 1  # skip the header at index 0

            token_value_list = []
            token_list = []

            input_line = lines[line_number]

            input_split = re.split('\t', input_line)

            input_sentence = input_split[1]
            input_label = input_split[2].strip()

            condition = True
            try:
                input_label = float(input_label)

                if len(input_sentence) < 1 or (
                        len(bert_tokenizer.tokenize(input_sentence)) == 1 and
                        bert_tokenizer.tokenize(input_sentence)[0] == '[UNK]'):
                    condition = False
                else:
                    condition = True
            except Exception:  # skip lines that fail to parse or tokenize
                condition = False

            if condition:
                if input_label == 0:
                    input_label = [1, 0]
                    fake_label = [0, 1]
                else:
                    input_label = [0, 1]
                    fake_label = [1, 0]
                labels.append(input_label)
                fake_labels.append(fake_label)

                output_bert = embedding(input_sentence, bert_model,
                                        bert_tokenizer)

                for token_order in range(len(output_bert['features'])):
                    token_value = np.asarray(
                        output_bert['features'][token_order]['layers'][0]
                        ['values'])
                    token_value = torch.from_numpy(token_value)
                    token_value = token_value.unsqueeze(0)
                    token_value_list.append(token_value)

                    try:
                        token_list.append(vocab[output_bert['features']
                                                [token_order]['token']])
                    except KeyError:  # token missing from the reduced vocab
                        token_list.append(vocab['[UNK]'])
                token_value_emb = torch.cat(token_value_list, 0)
                token_value_emb = token_value_emb.unsqueeze(1).type(
                    torch.FloatTensor)  # [token_len, 1, emb_dim]
                token_value_emb = token_value_emb[1:-1, :, :]  # drop [CLS], [SEP]

                inputs_value.append(token_value_emb)

                tokens = np.asarray(token_list[:-1])  # without [SEP]
                tokens = torch.from_numpy(tokens)
                inputs_token.append(tokens)  # [[token_len], ...]

                inputsize += 1

        enc_value = padding_values(inputs_value).to(device)
        dec_token = padding_tokens(inputs_token).to(device)

        attributes = torch.from_numpy(np.asarray(labels)).type(
            torch.FloatTensor).to(device)
        fake_attributes = torch.from_numpy(np.asarray(fake_labels)).type(
            torch.FloatTensor).to(device)

        # inference
        enc_out = model.encoder(enc_value)
        dec_out, dec_out_vocab = model.decoder(enc_value, dec_token,
                                               attributes)

        recon_gen_tokens = model.generator(enc_value, attributes, train=False)
        real_gen_sentences = model.gen2sentence(recon_gen_tokens)

        fake_gen_tokens = model.generator(enc_value,
                                          fake_attributes,
                                          train=False)
        fake_gen_sentences = model.gen2sentence(fake_gen_tokens)

        gen_value = []
        for i in range(len(fake_gen_sentences)):
            output_bert = embedding(fake_gen_sentences[i], bert_model,
                                    bert_tokenizer)

            token_value_list = []
            for token_order in range(len(output_bert['features'])):
                token_value = np.asarray(output_bert['features'][token_order]
                                         ['layers'][0]['values'])
                token_value = torch.from_numpy(token_value)
                token_value = token_value.unsqueeze(0)
                token_value_list.append(token_value)

            token_value_emb = torch.cat(token_value_list, 0)
            token_value_emb = token_value_emb.unsqueeze(1).type(
                torch.FloatTensor)  # [token_len, 1, emb_dim]
            token_value_emb = token_value_emb[1:
                                              -1, :, :]  # without [CLS], [SEP]

            gen_value.append(token_value_emb)

        gen_enc_value = padding_values(gen_value).to(device)

        gen_enc_out = model.encoder(gen_enc_value)
        gen_dec_out, gen_dec_out_vocab = model.decoder(gen_enc_value,
                                                       dec_token, attributes)

        if attributes[0].cpu().numpy().argmax() == 0:
            gt = 'negative'
        else:
            gt = 'positive'

        print("{}번째 입력과 문장 생성!!".format(it + 1))
        print(input_sentence, gt)

        cls_1 = model.discriminator(recon_gen_tokens)
        if cls_1.argmax() == 0:
            cls_1 = 'negative'
        else:
            cls_1 = 'positive'
        real_gen_sentences = postprocessing(real_gen_sentences)
        print("Real attributes recon: ", real_gen_sentences, ": ", cls_1)

        cls_2 = model.discriminator(fake_gen_tokens)
        if cls_2.argmax() == 0:
            cls_2 = 'negative'
        else:
            cls_2 = 'positive'
        fake_gen_sentences = postprocessing(fake_gen_sentences)
        print("Fake attributes recon: ", fake_gen_sentences, ": ", cls_2)

        # alternative metric (disabled): count real-attribute reconstructions
        # that keep the ground-truth sentiment
        # if gt == cls_1:
        #     accuracy += 1 / n_iter

        # success: the fake attribute flipped the discriminator's prediction
        if gt != cls_2:
            accuracy += 1 / n_iter

        a1 = torch.argmax(dec_out_vocab, 2)
        b1 = a1.transpose(0, 1)
        ed_recon = postprocessing(model.gen2sentence(b1))
        print("Encoder-Decoder Recurrent recon: ", ed_recon)

        a = torch.argmax(gen_dec_out_vocab, 2)
        b = a.transpose(0, 1)
        rfr_recon = postprocessing(model.gen2sentence(b))
        print("real-fake-real Recurrent recon: ", rfr_recon)
        print("")

    print("정확도: {}%".format(accuracy * 100))
Example 2
import torch
import torch.optim as optim
from tensorboardX import SummaryWriter

# NOTE: assumes load_vocab and the TowardModel training helpers are in
# scope, as in the other snippets.

summary = SummaryWriter(logdir='./logs')

n_iter = 50000
vocab_size = 6222
mb_size = 32

vocab = load_vocab(
    '/DATA/joosung/pytorch_pretrained_BERT_master/korea_vocab.txt')
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
gpu_use = True

from model import TowardModel

model = TowardModel(gpu=gpu_use)

model = model.to(device)
model.train()


def main():
    initial_lr = 0.001
    cls_trainer = optim.Adamax(model.cls_params,
                               lr=initial_lr)  # initial 0.001

    max_grad_norm = 10
    weight_decay = 5000

    f = open("../../sentiment_data/nsmc-master/ratings_train.txt", 'r')
    lines = f.readlines()
    f.close()
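
The snippet is cut off before the training loop, but the SummaryWriter created above would typically log scalars inside it. An illustrative sketch (the tag name and the loss value are placeholders, not taken from the original):

# Illustrative logging inside the (omitted) training loop; cls_loss here
# is a placeholder for the classifier loss each step would produce.
for it in range(n_iter):
    cls_loss = 0.0  # placeholder: the real loss comes from the training step
    summary.add_scalar('loss/classifier', cls_loss, it)
summary.close()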
Example 3
import torch
import torch.optim as optim

from pytorch_bert_embedding import *

bert_model, bert_tokenizer = bert_model_load('bert-base-multilingual-cased')

vocab_size = 6222
mb_size = 1

vocab = load_vocab(
    '/DATA/joosung/pytorch_pretrained_BERT_master/korea_vocab.txt')
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
gpu_use = True

from model import TowardModel

model = TowardModel(gpu=gpu_use)
model_name = 'simple_model_50000'
model.load_state_dict(torch.load('models/{}'.format(model_name)))

model = model.to(device)
model.eval()


def main():
    f = open("../../sentiment_data/nsmc-master/ratings_test.txt", 'r')
    lines = f.readlines()
    f.close()
    data_number = len(lines)
    n_iter = data_number

    print("id	document	label")
Example 4
# NOTE: like Example 1, this assumes the globals from the setup snippets
# (gpu_use, device, mb_size, vocab, bert_model, bert_tokenizer) and the
# helpers embedding, padding_values and padding_tokens.
import re

import numpy as np
import torch
from tqdm import tqdm


def main():
    from model import TowardModel
    model = TowardModel(gpu=gpu_use)
    model_name = 'simple_model_50000'
    model.load_state_dict(torch.load('models/{}'.format(model_name)))

    model = model.to(device)
    model.eval()

    f = open("../../sentiment_data/nsmc-master/ratings_test.txt", 'r')
    lines = f.readlines()
    f.close()
    data_number = len(lines) - 1  # first line is the "id\tdocument\tlabel" header
    n_iter = data_number

    line_number = 0
    accuracy = 0
    for it in tqdm(range(n_iter)):
        inputs_value = []
        inputs_token = []
        labels = []
        fake_labels = []

        inputsize = 0  # count batch size
        while inputsize < mb_size:
            line_number = line_number % data_number + 1  # skip the header at index 0

            token_value_list = []
            token_list = []

            input_line = lines[line_number]

            input_split = re.split('\t', input_line)

            input_sentence = input_split[1]
            input_label = input_split[2].strip()

            condition = True
            try:
                input_label = float(input_label)

                if len(input_sentence) < 1 or (
                        len(bert_tokenizer.tokenize(input_sentence)) == 1 and
                        bert_tokenizer.tokenize(input_sentence)[0] == '[UNK]'):
                    condition = False
                else:
                    condition = True
            except Exception:  # skip lines that fail to parse or tokenize
                condition = False

            if condition:
                if input_label == 0:
                    input_label = [1, 0]
                    fake_label = [0, 1]
                else:
                    input_label = [0, 1]
                    fake_label = [1, 0]
                labels.append(input_label)
                fake_labels.append(fake_label)

                output_bert = embedding(input_sentence, bert_model,
                                        bert_tokenizer)

                for token_order in range(len(output_bert['features'])):
                    token_value = np.asarray(
                        output_bert['features'][token_order]['layers'][0]
                        ['values'])
                    token_value = torch.from_numpy(token_value)
                    token_value = token_value.unsqueeze(0)
                    token_value_list.append(token_value)

                    try:
                        token_list.append(vocab[output_bert['features']
                                                [token_order]['token']])
                    except KeyError:  # token missing from the reduced vocab
                        token_list.append(vocab['[UNK]'])
                token_value_emb = torch.cat(token_value_list, 0)
                token_value_emb = token_value_emb.unsqueeze(1).type(
                    torch.FloatTensor)  # [token_len, 1, emb_dim]
                token_value_emb = token_value_emb[1:-1, :, :]  # drop [CLS], [SEP]

                inputs_value.append(token_value_emb)

                tokens = np.asarray(token_list[:-1])  # without [SEP]
                tokens = torch.from_numpy(tokens)
                inputs_token.append(tokens)  # [[token_len], ...]

                inputsize += 1

        enc_value = padding_values(inputs_value).to(device)
        dec_token = padding_tokens(inputs_token).to(device)

        attributes = torch.from_numpy(np.asarray(labels)).type(
            torch.FloatTensor).to(device)
        fake_attributes = torch.from_numpy(np.asarray(fake_labels)).type(
            torch.FloatTensor).to(device)

        # inference
        recon_gen_tokens = model.generator(enc_value, attributes, train=False)

        if attributes[0].cpu().numpy().argmax() == 0:
            gt = 'negative'
        else:
            gt = 'positive'

        cls_1 = model.discriminator(recon_gen_tokens)
        if cls_1.argmax() == 0:
            cls_1 = 'negative'
        else:
            cls_1 = 'positive'

        if gt == cls_1:
            accuracy += 1

        if it % 10000 == 0:
            print("Running accuracy: {}%".format(accuracy / (it + 1) * 100))

    print("정확도: {}%".format(accuracy / n_iter * 100))