Example #1
import os
import pickle
import re
from collections import Counter

import numpy as np

import data_utils
import glove_utils

IMDB_PATH = 'aclImdb'               # root of the IMDB Large Movie Review dataset
MAX_VOCAB_SIZE = 50000              # cap on the tokenizer's vocabulary size
GLOVE_PATH = 'glove.840B.300d.txt'  # pre-trained 300-d GloVe vectors

if not os.path.exists('aux_files'):
    os.mkdir('aux_files')
imdb_dataset = data_utils.IMDBDataset(path=IMDB_PATH,
                                      max_vocab_size=MAX_VOCAB_SIZE)

# save the dataset
with open(('aux_files/dataset_%d.pkl' % (MAX_VOCAB_SIZE)), 'wb') as f:
    pickle.dump(imdb_dataset, f)

# create the glove embeddings matrix (used by the classification model)
glove_model = glove_utils.loadGloveModel(GLOVE_PATH)
glove_embeddings, _ = glove_utils.create_embeddings_matrix(
    glove_model, imdb_dataset.dict, imdb_dataset.full_dict)
# save the glove_embeddings matrix
np.save('aux_files/embeddings_glove_%d.npy' % (MAX_VOCAB_SIZE),
        glove_embeddings)

print('All done')
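
The `glove_utils` helpers used above are not shown on this page. Purely as an illustration, a loader in the spirit of `loadGloveModel` might look like the sketch below; the body is an assumption based on the standard GloVe text format (one token per line followed by its vector components), not the actual `glove_utils` code. Tokens in glove.840B.300d.txt can themselves contain spaces, so the vector is taken as the last `d` fields.

# Illustrative sketch only -- NOT the actual glove_utils.loadGloveModel.
# Assumes the GloVe .txt format: "token v1 v2 ... v300" per line, where
# tokens in the 840B file may themselves contain spaces.
import numpy as np

def load_glove_model_sketch(glove_path, d=300):
    model = {}
    with open(glove_path, 'r', encoding='utf-8') as f:
        for line in f:
            parts = line.rstrip('\n').split(' ')
            token = ' '.join(parts[:-d])   # everything before the vector
            model[token] = np.asarray(parts[-d:], dtype='float32')
    return model

Loading the full 840B file this way is slow and needs several gigabytes of memory, which is presumably why the scripts compute the embedding matrix once and cache it under aux_files/.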
Example #2
import os
import pickle
import re
from collections import Counter

import numpy as np

import data_utils
import glove_utils

PATH = 'sst-2'                      # root of the SST-2 sentiment dataset
MAX_VOCAB_SIZE = 13837              # vocabulary size used for SST-2
GLOVE_PATH = 'glove.840B.300d.txt'  # pre-trained 300-d GloVe vectors

if not os.path.exists('aux_files'):
    os.mkdir('aux_files')
# the IMDB loader is reused for the SST-2 data
sst2_dataset = data_utils.IMDBDataset(path=PATH,
                                      max_vocab_size=MAX_VOCAB_SIZE)

# save the dataset
with open(('aux_files/dataset_%d.pkl' % (MAX_VOCAB_SIZE)), 'wb') as f:
    pickle.dump(sst2_dataset, f)

# create the glove embeddings matrix (used by the classification model)
glove_model = glove_utils.loadGloveModel(GLOVE_PATH)
glove_embeddings, _ = glove_utils.create_embeddings_matrix(
    glove_model, sst2_dataset.dict, sst2_dataset.full_dict, d=300)
# save the glove_embeddings matrix
np.save('aux_files/embeddings_glove_%d.npy' % (MAX_VOCAB_SIZE),
        glove_embeddings)
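
For context, `create_embeddings_matrix` plausibly maps each vocabulary id to its GloVe vector and falls back to a random vector for tokens missing from GloVe. The sketch below is an assumption, simplified to a single token-to-id dict (the real helper also takes the full dictionary and returns a second value, ignored with `_` above); it is not the actual `glove_utils` implementation.

# Illustrative sketch only -- NOT the actual glove_utils.create_embeddings_matrix.
# Assumes vocab maps token -> integer id and ids index rows of the matrix.
import numpy as np

def create_embeddings_matrix_sketch(glove_model, vocab, d=300):
    n = max(vocab.values()) + 1
    rng = np.random.default_rng(seed=0)
    # small random rows so out-of-GloVe tokens still get an embedding
    matrix = rng.uniform(-0.1, 0.1, size=(n, d)).astype('float32')
    missed = []
    for token, idx in vocab.items():
        vec = glove_model.get(token)
        if vec is not None:
            matrix[idx] = vec
        else:
            missed.append(token)
    return matrix, missed

The saved matrix can later be reloaded for the classifier with np.load('aux_files/embeddings_glove_%d.npy' % MAX_VOCAB_SIZE).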