# Example #1
def get_embeddings(hparams):
    """Create the model's word-embedding variable.

    If both a GloVe file and a vocabulary file are configured, the
    embedding matrix is initialized from the pretrained GloVe vectors
    (restricted to words present in the vocabulary); otherwise it falls
    back to a uniform random initializer in [-0.25, 0.25].

    Args:
        hparams: hyperparameter object with `glove_path`, `vocab_path`,
            `vocab_size`, and `embedding_dim` attributes.

    Returns:
        A `[vocab_size, embedding_dim]` variable named "word_embeddings".
    """
    if hparams.glove_path and hparams.vocab_path:
        tf.logging.info("Loading Glove embeddings...")
        vocab_array, vocab_dict = helpers.load_vocab(hparams.vocab_path)
        # Load only the vectors for words that actually occur in the vocab.
        glove_vectors, glove_dict = helpers.load_glove_vectors(
            hparams.glove_path, vocab=set(vocab_array))
        initializer = helpers.build_initial_embedding_matrix(
            vocab_dict, glove_dict, glove_vectors, hparams.embedding_dim)
    else:
        # Fixed typo in log message ("specificed" -> "specified").
        tf.logging.info(
            "No glove/vocab path specified, starting with random embeddings.")
        initializer = tf.random_uniform_initializer(-0.25, 0.25)

    return tf.get_variable("word_embeddings",
                           shape=[hparams.vocab_size, hparams.embedding_dim],
                           initializer=initializer)
#
# Read Training data
# ----------------------------------------------------------------------------
# Load pre-pickled training data (word-index sequences, labels, meta tags)
# from TRAIN_DATA_FILE.
print ("Loading data..")
train_texts = []
train_tags = []
train_labels = []
# NOTE(review): pickle.load is unsafe on untrusted files -- acceptable only
# if TRAIN_DATA_FILE is produced by this project. Also, the file handle is
# never closed; consider a `with` block.
word_index_pickle = open(TRAIN_DATA_FILE, 'rb')
pickling = pickle.load(word_index_pickle)
x = pickling['word_indices']  # training sequences as word indices
y = pickling['y']  # training labels
tags= pickling['meta_tag']  # per-sample meta tags
# NOTE(review): word_index is assigned a plain int here, yet
# len(word_index) below and the 4-argument form of
# helpers.build_initial_embedding_matrix used elsewhere in this file both
# expect a word->index dict. This looks broken -- confirm the intended type.
word_index = 60000 


# NOTE(review): this call passes 2 arguments, but the same helper is called
# elsewhere with (vocab_dict, glove_dict, glove_vectors, embedding_dim).
# Either a different signature is intended here or this is a bug -- verify.
embedding_matrix = helpers.build_initial_embedding_matrix(word_index, EMBEDDING_DIM)


#
# Prepare embeddings
# ----------------------------------------------------------------------------
print('Preparing embedding matrix')

#nb_words = min(MAX_NB_WORDS, len(word_index)) + 1
# Cap the effective vocabulary size at MAX_NB_WORDS.
nb_words = min(MAX_NB_WORDS, len(word_index))


#
# Sample train/validation data
# ----------------------------------------------------------------------------
# Fixed seed so the train/validation split below is reproducible.
np.random.seed(1234)
# Example #3
#
# Prepare embeddings
# ----------------------------------------------------------------------------
# NOTE: earlier revisions switched between GoogleNews word2vec (300-d,
# via gensim KeyedVectors) and GloVe Twitter (200-d); the GloVe path is
# the one in use, so the dead word2vec branch has been removed.
EMBEDDING_FILE = 'glove.twitter.27B.200d.txt'
EMBEDDING_DIM = 200
EMBEDDING_FILE = DATA_PATH + EMBEDDING_FILE

# +1 presumably reserves an extra row (index 0 for padding/OOV) --
# TODO confirm against how the model indexes the embedding matrix.
nb_words = min(MAX_NB_WORDS, len(word_index)) + 1

# Load only the GloVe vectors for words present in the vocabulary, then
# assemble the initial embedding matrix. (Removed the dead
# `embedding_matrix = ''` placeholder -- it was immediately overwritten.)
glove_vectors, glove_dict = helpers.load_glove_vectors(EMBEDDING_FILE,
                                                       vocab=set(word_index))
embedding_matrix = helpers.build_initial_embedding_matrix(
    word_index, glove_dict, glove_vectors, EMBEDDING_DIM)

########################################
## sample train/validation data
########################################