Code example #1
import os
import pickle

import numpy as np

import data_utils
import glove_utils

IMDB_PATH = 'aclImdb'
MAX_VOCAB_SIZE = 50000
GLOVE_PATH = 'glove.840B.300d.txt'

if not os.path.exists('aux_files'):
    os.mkdir('aux_files')
imdb_dataset = data_utils.IMDBDataset(path=IMDB_PATH,
                                      max_vocab_size=MAX_VOCAB_SIZE)

# save the dataset
with open(('aux_files/dataset_%d.pkl' % (MAX_VOCAB_SIZE)), 'wb') as f:
    pickle.dump(imdb_dataset, f)

# create the glove embeddings matrix (used by the classification model)
glove_model = glove_utils.loadGloveModel(GLOVE_PATH)
glove_embeddings, _ = glove_utils.create_embeddings_matrix(
    glove_model, imdb_dataset.dict, imdb_dataset.full_dict)
# save the glove_embeddings matrix
np.save('aux_files/embeddings_glove_%d.npy' % (MAX_VOCAB_SIZE),
        glove_embeddings)

print('All done')
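The script leans on two helpers from glove_utils. loadGloveModel is essentially a parser for the GloVe text format (one token per line followed by its vector components). A minimal sketch of such a loader, assuming whitespace-separated single-token entries, not the repo's exact implementation:

import numpy as np

def load_glove_model_sketch(path):
    # Parse a GloVe-style text file into a {word: vector} dict.
    # Sketch only: assumes each line is "<token> <f1> ... <fN>" with a
    # single-token word; some GloVe releases (e.g. glove.840B.300d) contain
    # a few multi-token entries that a robust loader has to special-case.
    model = {}
    with open(path, 'r', encoding='utf-8') as f:
        for line in f:
            parts = line.rstrip().split(' ')
            model[parts[0]] = np.asarray(parts[1:], dtype='float32')
    return model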
Code example #2
import os
import pickle

import numpy as np

import data_utils
import glove_utils

IMDB_PATH = 'aclImdb'
MAX_VOCAB_SIZE = 50000
GLOVE_PATH = 'glove.840B.300d.txt'

if not os.path.exists('aux_files'):
    os.mkdir('aux_files')
imdb_dataset = data_utils.IMDBDataset(path=IMDB_PATH,
                                      max_vocab_size=MAX_VOCAB_SIZE)

# save the dataset
with open(('aux_files/dataset_%d.pkl' % (MAX_VOCAB_SIZE)), 'wb') as f:
    pickle.dump(imdb_dataset, f)

# create the glove embeddings matrix (used by the classification model)
glove_model = glove_utils.loadGloveModel(GLOVE_PATH)
glove_embeddings, _ = glove_utils.create_embeddings_matrix(
    glove_model, imdb_dataset.dict, imdb_dataset.full_dict)
# save the glove_embeddings matrix
np.save('aux_files/embeddings_glove_%d.npy' % (MAX_VOCAB_SIZE),
        glove_embeddings)

# Load the counterfitted-vectors (used by our attack)
glove2 = glove_utils.loadGloveModel('counter-fitted-vectors.txt')
# create embeddings matrix for our vocabulary
counter_embeddings, missed = glove_utils.create_embeddings_matrix(
    glove2, imdb_dataset.dict, imdb_dataset.full_dict)

# save the counter-fitted embeddings matrix for our vocabulary
np.save(('aux_files/embeddings_counter_%d.npy' % (MAX_VOCAB_SIZE)),
        counter_embeddings)
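create_embeddings_matrix is called identically in every snippet: given the loaded vector dict and the dataset's vocabulary mappings, it returns the embedding matrix together with the words it could not find. A hedged sketch of that contract (the real glove_utils version also takes the full_dict argument and may treat missing words differently):

import numpy as np

def create_embeddings_matrix_sketch(vectors, word_to_id, dim=300):
    # Build a [dim, vocab_size] matrix whose column i is the vector of the
    # word with index i; collect the indices of words absent from `vectors`.
    vocab_size = max(word_to_id.values()) + 1
    embeddings = np.zeros((dim, vocab_size), dtype='float32')
    missed = []
    for word, idx in word_to_id.items():
        if word in vectors:
            embeddings[:, idx] = vectors[word]
        else:
            missed.append(idx)
    return embeddings, missed

The column-wise layout matches code example #4 below, which multiplies counter_embeddings.T by counter_embeddings to get word-by-word dot products.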
Code example #3

import os
import pickle

import numpy as np

import data_utils_yelp
import glove_utils

YELP_PATH = 'yelp'  # assumed path to the Yelp dataset; adjust to your setup
MAX_VOCAB_SIZE = 50000
GLOVE_PATH = '/content/drive/My Drive/Master_Final_Project/Genetic_attack/Code/nlp_adversarial_example_master_pytorch/glove.840B.300d.txt'
COUNTER_PATH = 'counter-fitted-vectors.txt'

if not os.path.exists('aux_files'):
    os.mkdir('aux_files')
yelp_dataset = data_utils_yelp.YELPDataset(path=YELP_PATH,
                                           max_vocab_size=MAX_VOCAB_SIZE)

# save the dataset (serialize it to a pickle file)
with open(('aux_files/dataset_%d.pkl' % (MAX_VOCAB_SIZE)), 'wb') as f:
    pickle.dump(yelp_dataset, f)

# create the glove embeddings matrix (used by the classification model)
glove_model = glove_utils.loadGloveModel(GLOVE_PATH)
# map every valid word to its vector; column i holds the embedding of the
# word with index i, giving a [300, n_words] matrix
glove_embeddings, _ = glove_utils.create_embeddings_matrix(
    glove_model, yelp_dataset.dict, yelp_dataset.full_dict)
# save the glove_embeddings matrix
np.save('aux_files/embeddings_glove_%d.npy' % (MAX_VOCAB_SIZE),
        glove_embeddings)

# Load the counterfitted-vectors (used by our attack)
glove2 = glove_utils.loadGloveModel(COUNTER_PATH)
# create embeddings matrix for our vocabulary
counter_embeddings, missed = glove_utils.create_embeddings_matrix(
    glove2, yelp_dataset.dict, yelp_dataset.full_dict)

# save the counter-fitted embeddings matrix for our vocabulary
np.save(('aux_files/embeddings_counter_%d.npy' % (MAX_VOCAB_SIZE)),
        counter_embeddings)
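Once these files exist under aux_files/, downstream training and attack scripts only need to read them back. A small usage sketch (note that unpickling the dataset requires the class definition, here data_utils_yelp, on the import path):

import pickle
import numpy as np

import data_utils_yelp  # needed so pickle can reconstruct the dataset object

MAX_VOCAB_SIZE = 50000

with open('aux_files/dataset_%d.pkl' % MAX_VOCAB_SIZE, 'rb') as f:
    yelp_dataset = pickle.load(f)

glove_embeddings = np.load('aux_files/embeddings_glove_%d.npy' % MAX_VOCAB_SIZE)
counter_embeddings = np.load('aux_files/embeddings_counter_%d.npy' % MAX_VOCAB_SIZE)
print(glove_embeddings.shape, counter_embeddings.shape)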
Code example #4
import os
import pickle

import numpy as np

import glove_utils

with open('./nli_tokenizer.pkl', 'rb') as fh:
    tokenizer = pickle.load(fh)

nli_words_index = tokenizer.word_index

inv_word_index = {i: w for (w, i) in nli_words_index.items()}
MAX_VOCAB_SIZE = len(nli_words_index)
# Load the counterfitted-vectors (used by our attack)
glove2 = glove_utils.loadGloveModel('counter-fitted-vectors.txt')
# create embeddings matrix for our vocabulary
counter_embeddings, missed = glove_utils.create_embeddings_matrix(
    glove2, nli_words_index, None)

# save the embeddings for both the words we found and the words we missed
if not os.path.exists('aux_files'):
    os.mkdir('aux_files')
np.save(('aux_files/nli_embeddings_counter_%d.npy' % (MAX_VOCAB_SIZE)),
        counter_embeddings)
np.save(('aux_files/nli_missed_embeddings_counter_%d.npy' % (MAX_VOCAB_SIZE)),
        missed)

print('Done preparing the embedding matrix.')
print('Computing the distance matrix... this may take a while')
# pairwise squared Euclidean distances via the identity
# ||x - y||^2 = ||x||^2 + ||y||^2 - 2 * (x . y);
# embeddings are stored one word per column
c_ = -2 * np.dot(counter_embeddings.T, counter_embeddings)
a = np.sum(np.square(counter_embeddings), axis=0).reshape((1, -1))
b = a.T
dist = a + b + c_
np.save(('aux_files/nli_dist_counter_%d.npy' % (MAX_VOCAB_SIZE)), dist)
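The three assignments above are the standard vectorized form of ||x - y||^2 = ||x||^2 + ||y||^2 - 2(x . y), applied to every pair of columns at once. A quick self-check against the naive definition, plus the nearest-neighbor lookup this matrix makes cheap (both on toy data, not the repo's code):

import numpy as np

rng = np.random.default_rng(0)
emb = rng.standard_normal((300, 5)).astype('float32')  # [dim, n_words] layout

# vectorized pairwise squared distances, exactly as in the script
c_ = -2 * np.dot(emb.T, emb)
a = np.sum(np.square(emb), axis=0).reshape((1, -1))
dist = a + a.T + c_

# naive double loop for comparison
naive = np.array([[np.sum((emb[:, i] - emb[:, j]) ** 2) for j in range(5)]
                  for i in range(5)])
assert np.allclose(dist, naive, atol=1e-3)

# a synonym-substitution attack can then rank candidate replacements for a
# word by sorting its row of the distance matrix
word_id = 2
nearest = np.argsort(dist[word_id])[1:4]  # index 0 is the word itself
print(nearest)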