def getEmbeddingMatrix(word_vector_fn, word_index, dataset, orig_fn):
    """Build (or load a cached) embedding matrix for a vocabulary.

    Parameters
    ----------
    word_vector_fn : str
        Base name of the word-vector files (no extension).
    word_index : dict
        Maps word -> row index; only consulted for ``dataset == 0``.
    dataset : int
        0 = build from raw word-vector .npy/.txt files,
        1 or 2 = build from the pre-indexed ``raw/wv_vocab.dict``.
    orig_fn : str
        Root data directory; assumed to end with "/" -- TODO confirm.

    Returns
    -------
    tuple
        ``(embedding_matrix, embedding_dim, vocab_size)``.

    Raises
    ------
    ValueError
        If no cached matrix exists and ``dataset`` is not 0, 1 or 2.
        (Previously this case fell through to the return statement and
        crashed with an unhelpful ``NameError`` on ``embedding_matrix``.)
    """
    matrix_fn = orig_fn + "matrix/" + word_vector_fn + ".npy"
    if os.path.exists(matrix_fn):
        # Cached result from a previous run -- skip the expensive build.
        embedding_matrix = np.load(matrix_fn)
    elif dataset == 0:
        word_vectors = np.load(
            orig_fn + "wordvectors/vectors/" + word_vector_fn + ".npy")
        word_vector_entities = rt.importArray(
            orig_fn + "wordvectors/words/" + word_vector_fn + ".txt",
            encoding="cp1252")
        # Map each word string to its vector row.
        word_dict = {}
        for idx, entity in enumerate(word_vector_entities):
            word_dict[entity] = word_vectors[idx]
        # Row 0 is reserved (indices in word_index start at 1), hence +1.
        embedding_matrix = np.zeros(
            (len(word_index) + 1, len(word_vectors[0])))
        for word, w in word_index.items():
            embedding_vector = word_dict.get(word)
            # Words not found in the embedding index stay all-zeros.
            if embedding_vector is not None:
                embedding_matrix[w] = embedding_vector
        np.save(matrix_fn, embedding_matrix)
    elif dataset == 1 or dataset == 2:
        word_vectors = rt.load_dict(orig_fn + "raw/wv_vocab.dict")
        # Key 0 is assumed to exist; its vector length fixes the
        # embedding dimension ("movie is magic keyword" per original
        # comment -- we assume it exists as a word vector).
        embedding_matrix = np.zeros(
            (len(word_vectors), len(word_vectors[0])))
        for key, item in word_vectors.items():
            embedding_matrix[key] = item
        # NOTE(review): unlike dataset 0, this branch does not cache the
        # matrix to matrix_fn -- preserved as-is to avoid a behavior change.
    else:
        raise ValueError("Unknown dataset id: %r" % (dataset,))
    return embedding_matrix, len(embedding_matrix[0]), len(embedding_matrix)
import numpy as np
import read_text as rt
import parse_binary_treebank as pbt
import string
import re

# Root of the preprocessed SST data.
origin = "../data/sst/raw/"

# Vocabulary lookups.
# NOTE(review): these two use ".npy" while sibling scripts load
# "vocab.dict" / "wv_vocab.dict" -- confirm the extensions are intended.
vocab = rt.load_dict(origin + "vocab.npy")
wv = rt.load_dict(origin + "wv_vocab.npy")
reversed_vocab = rt.load_dict(origin + "reversed_vocab.dict")

# Index-encoded splits.
x_train_w = np.load(origin + "x_train_w.npy")
x_test_w = np.load(origin + "x_test_w.npy")
x_dev_w = np.load(origin + "x_dev_w.npy")

# Raw token/label splits from the binary treebank parser.
x_train, y_train, x_test, y_test, x_dev, y_dev = pbt.loadSplits(
    "../data/sentiment/sst_binary/")


def verifyIndexed(indexed, words, r_vocab):
    """Check that every index sequence decodes back to its token sequence.

    For each sentence, each id in ``indexed`` is looked up in ``r_vocab``
    and compared against the corresponding token in ``words``. On the
    first mismatch within a sentence, the offending pair is printed and
    checking moves on to the next sentence.
    """
    print("verifying index")
    for s, sentence_ids in enumerate(indexed):
        for w, token_id in enumerate(sentence_ids):
            if r_vocab[token_id] != words[s][w]:
                print(">>>>> Failed indexed equivalence")
                print(sentence_ids)
                print(words[s])
                break
def loadVocabs():
    """Load the vocabulary, reversed vocabulary and word-vector dicts.

    Returns
    -------
    tuple
        ``(vocab, reversed_vocab, word_vectors)``.

    Bug fix: the original called ``rt.save_dict`` for ``vocab`` and
    ``word_vectors``; a loader must call ``rt.load_dict``, otherwise the
    on-disk dictionaries would be clobbered instead of read.
    """
    reversed_vocab = rt.load_dict(origin + "reversed_vocab.dict")
    vocab = rt.load_dict(origin + "vocab.dict")
    word_vectors = rt.load_dict(origin + "wv_vocab.dict")
    return vocab, reversed_vocab, word_vectors
import parse_binary_treebank as pbt
import read_text as rt
import numpy as np
import scipy.sparse as sp

# Root of the preprocessed SST data.
origin = "../data/sst/raw/"

# Vocabulary lookups.
vocab = rt.load_dict(origin + "vocab.dict")
reversed_vocab = rt.load_dict(origin + "reversed_vocab.dict")

# Pre-indexed splits (train/test/dev inputs and labels).
x_train, x_test, x_dev, y_train, y_test, y_dev = pbt.loadProcessedSplits()

# Frequency-filter bounds and class selector used in the output filenames.
lowest_amt = 0
highest_amt = 5
classification = "all"

save_origin = "/mnt/62423FE6423FBD9B/PhD/Code/Paper 2/data/"

# Shared filename suffix, e.g. "0-5-all".
_suffix = str(lowest_amt) + "-" + str(highest_amt) + "-" + classification
all_fn = (save_origin + "sst/bow/frequency/phrases/class-all-"
          + _suffix + ".npz")
all_fn_binary = (save_origin + "sst/bow/binary/phrases/class-all-"
                 + _suffix + ".npz")
word_fn = save_origin + "sst/bow/names/" + _suffix + ".txt"

# All documents and labels, stacked in train/dev/test order.
vectors = np.concatenate((x_train, x_dev, x_test), axis=0)
classes = np.concatenate((y_train, y_dev, y_test), axis=0)

word_list = list(vocab.keys())

# Term-frequency and binary-occurrence matrices (documents x vocabulary).
tf = np.zeros(shape=(len(vectors), len(word_list)), dtype=np.int32)
tf_binary = np.zeros(shape=(len(vectors), len(word_list)), dtype=np.int32)