Example no. 1
import os

import numpy as np
import read_text as rt


def getEmbeddingMatrix(word_vector_fn, word_index, dataset, orig_fn):
    matrix_fn = orig_fn + "matrix/" + word_vector_fn + ".npy"
    if os.path.exists(matrix_fn):
        embedding_matrix = np.load(matrix_fn)
    else:
        if dataset == 0:
            word_vectors = np.load(orig_fn + "wordvectors/vectors/" +
                                   word_vector_fn + ".npy")
            word_vector_entities = rt.importArray(
                orig_fn + "wordvectors/words/" + word_vector_fn + ".txt",
                encoding="cp1252")
            word_dict = {}
            for w in range(len(word_vector_entities)):
                word_dict[word_vector_entities[w]] = word_vectors[w]
            embedding_matrix = np.zeros(
                (len(word_index) + 1, len(word_vectors[0])))
            for word, w in word_index.items():
                embedding_vector = word_dict.get(word)
                if embedding_vector is not None:
                    # words not found in embedding index will be all-zeros.
                    embedding_matrix[w] = embedding_vector
            np.save(matrix_fn, embedding_matrix)
        elif dataset == 1 or dataset == 2:
            word_vectors = rt.load_dict(orig_fn + "raw/wv_vocab.dict")
            # Index 0 ("movie") is assumed to exist as a word vector, so its
            # length gives the embedding dimension.
            embedding_matrix = np.zeros(
                (len(word_vectors), len(word_vectors[0])))
            for key, item in word_vectors.items():
                embedding_matrix[key] = item
            np.save(matrix_fn, embedding_matrix)
    return embedding_matrix, len(embedding_matrix[0]), len(embedding_matrix)
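
A minimal calling sketch, assuming a Keras-style word_index (word -> integer index) and a data folder laid out as the function expects (matrix/, wordvectors/vectors/ and wordvectors/words/ under orig_fn); the names below are placeholders, not paths from the original project.

# Hypothetical usage of getEmbeddingMatrix; "glove_300d" and "../data/sst/"
# are illustrative names only.
word_index = {"movie": 1, "great": 2, "boring": 3}
embedding_matrix, embedding_dim, vocab_size = getEmbeddingMatrix(
    "glove_300d", word_index, 0, "../data/sst/")
print(embedding_matrix.shape, embedding_dim, vocab_size)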
Example no. 2
import numpy as np
import read_text as rt
import parse_binary_treebank as pbt
import string
import re

origin = "../data/sst/raw/"

vocab = rt.load_dict(origin + "vocab.dict")
wv = rt.load_dict(origin + "wv_vocab.dict")
x_train_w = np.load(origin + "x_train_w.npy")
x_test_w = np.load(origin + "x_test_w.npy")
x_dev_w = np.load(origin + "x_dev_w.npy")

reversed_vocab = rt.load_dict(origin + "reversed_vocab.dict")

x_train, y_train, x_test, y_test, x_dev, y_dev = pbt.loadSplits(
    "../data/sentiment/sst_binary/")


def verifyIndexed(indexed, words, r_vocab):
    print("verifying index")
    for s in range(len(indexed)):
        for w in range(len(indexed[s])):
            if r_vocab[indexed[s][w]] != words[s][w]:
                print(">>>>> Failed indexed equivalence")
                print(indexed[s])
                print(words[s])
                break

def loadVocabs():
    reversed_vocab = rt.load_dict(origin + "reversed_vocab.dict")
    vocab = rt.load_dict(origin + "vocab.dict")
    word_vectors = rt.load_dict(origin + "wv_vocab.dict")
    return vocab, reversed_vocab, word_vectors
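
A small usage sketch: verifyIndexed checks that every integer-indexed token maps back to the original word through reversed_vocab. The pairing of x_*_w (indexed) with x_* (token lists) below is an assumption drawn from how the two arguments are compared, not from the original script.

# Assumed: x_train_w etc. hold integer-indexed sentences and x_train etc.
# the matching token lists; reversed_vocab maps index -> word.
vocab, reversed_vocab, word_vectors = loadVocabs()
verifyIndexed(x_train_w, x_train, reversed_vocab)
verifyIndexed(x_dev_w, x_dev, reversed_vocab)
verifyIndexed(x_test_w, x_test, reversed_vocab)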
Example no. 4
import parse_binary_treebank as pbt
import read_text as rt
import numpy as np
import scipy.sparse as sp

origin = "../data/sst/raw/"

vocab = rt.load_dict(origin + "vocab.dict")
reversed_vocab = rt.load_dict(origin + "reversed_vocab.dict")

x_train, x_test, x_dev, y_train, y_test, y_dev = pbt.loadProcessedSplits()

lowest_amt = 0
highest_amt = 5
classification = "all"

save_origin = "/mnt/62423FE6423FBD9B/PhD/Code/Paper 2/data/"
all_fn = (save_origin + "sst/bow/frequency/phrases/class-all-" +
          str(lowest_amt) + "-" + str(highest_amt) + "-" + classification + ".npz")
all_fn_binary = (save_origin + "sst/bow/binary/phrases/class-all-" +
                 str(lowest_amt) + "-" + str(highest_amt) + "-" + classification + ".npz")
word_fn = (save_origin + "sst/bow/names/" + str(lowest_amt) + "-" +
           str(highest_amt) + "-" + classification + ".txt")

vectors = np.concatenate((x_train, x_dev, x_test), axis=0)
classes = np.concatenate((y_train, y_dev, y_test), axis=0)

word_list = list(vocab.keys())

tf = np.zeros(shape=(len(vectors), len(word_list)), dtype=np.int32)
tf_binary = np.zeros(shape=(len(vectors), len(word_list)), dtype=np.int32)
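
One way the frequency and binary bag-of-words matrices could be filled and written to the .npz paths defined above; this is a sketch under the assumption that each entry of vectors is a sequence of integer word indices into vocab, not the original continuation of this script.

# Hypothetical fill-in: count each word index per document (tf) and record
# presence/absence (tf_binary).
for d, doc in enumerate(vectors):
    for w in doc:
        tf[d][w] += 1
        tf_binary[d][w] = 1

# Store both matrices sparsely, matching the .npz extensions used above,
# and dump the column names for reference.
sp.save_npz(all_fn, sp.csr_matrix(tf))
sp.save_npz(all_fn_binary, sp.csr_matrix(tf_binary))
with open(word_fn, "w") as f:
    f.write("\n".join(word_list))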