def initialize(filepath="preprocessed_texts/alice-only-spaced.txt"):
    # alternative corpus: "preprocessed_texts/AliceInWonderland.txt"
    # RI_letters = random_idx.generate_letter_id_vectors(N, k, alph)
    lv = random_idx.generate_letter_id_vectors(N, 15000)

    # read the corpus, keep only in-alphabet characters, truncate to 10k chars
    f = open(filepath, "r")
    text = f.read()
    f.close()
    text = ''.join([x for x in text if x in alphabet])[0:10000]

    # for each n, dictionary of key: n-gram and value: frequency
    n_gram_frequencies = [{} for _ in range(len(cluster_sizes) + 1)]

    # language vectors for cluster sizes 1-8, with a zero vector at index 0
    lang_vectors = []
    for size in cluster_sizes:
        lang_vectors.append(create_lang_vec(filepath, lv, [size]))
    lang_vectors.insert(0, np.zeros((1, N)))

    # save vectors to file (binary mode for pickle)
    fwrite = open("intermediate/lv", "wb")
    fwrite1 = open("intermediate/lang_vectors", "wb")
    fwrite2 = open("intermediate/n_gram_frequencies", "wb")
    pickle.dump(lv, fwrite)
    pickle.dump(lang_vectors, fwrite1)
    pickle.dump(n_gram_frequencies, fwrite2)
    fwrite.close()
    fwrite1.close()
    fwrite2.close()
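# A minimal usage sketch, assuming this module defines N, cluster_sizes,
# alphabet, and create_lang_vec at module level (as initialize() requires)
# and that pickle and numpy are imported. Reloading the saved pickles:
initialize()                        # writes the three pickles under intermediate/
fread = open("intermediate/lang_vectors", "rb")
lang_vectors = pickle.load(fread)
fread.close()
print len(lang_vectors)             # one vector per cluster size, plus the zero vector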
def __init__(self, N, k, b):
    self.N = N            # dimension of the vector space
    self.k = k            # each index vector is 2k-sparse
    self.wordz = {}       # dictionary mapping words to their random index vectors
    self.b = b            # number of basis elements to learn
    self.basis = np.random.rand(N, b)
    self.RI_letters = ridx.generate_letter_id_vectors(self.N, self.k, alph=alphabet)
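# Hypothetical usage sketch: the enclosing class is not shown in this excerpt,
# so `BasisLearner` below is an illustrative stand-in name, and `ridx` is
# assumed to be an import alias for the random_idx module.
learner = BasisLearner(N=1000, k=500, b=10)
print learner.basis.shape           # (1000, 10)
print len(learner.wordz)            # empty until words are indexed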
def gen_lets(N=N, k=k):
    # generate letter index vectors
    RI_letters = random_idx.generate_letter_id_vectors(N, k)
    # note: this divides by the Frobenius norm of the whole matrix,
    # not by each row's norm
    RI_letters_n = RI_letters / np.linalg.norm(RI_letters)
    return RI_letters, RI_letters_n
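# A sketch of how the letter vectors might be compared, assuming RI_letters
# is an ndarray with one row per character of the module-level alphabet:
# cosine similarity between distinct letters should be near zero, which is
# the point of random indexing.
RI_letters, RI_letters_n = gen_lets()
a_vec = RI_letters[alphabet.index('a')]
b_vec = RI_letters[alphabet.index('b')]
cos = np.dot(a_vec, b_vec) / (np.linalg.norm(a_vec) * np.linalg.norm(b_vec))
print cos   # near zero: random index vectors are nearly orthogonal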
ks = list(np.round(np.logspace(0, 4, 10)))
#V = np.zeros((len(Ns),len(sparsities)))
V = np.zeros((len(Ns), len(ks)))

# build V (matrix of variances)
for i in xrange(len(Ns)):
    N = int(Ns[i])
    #for j in xrange(len(sparsities)):
    for j in xrange(len(ks)):
        #sparsity = sparsities[j]
        #k = int(N*sparsity/2)
        k = int(ks[j])            # cast: np.round returns floats
        if k >= N:
            continue
        RI_letters = random_idx.generate_letter_id_vectors(N, k)
        total_vec = []
        print N, k
        print '=========='
        # iterate over cluster sizes (n-gram lengths)
        for cluster_sz in cluster_sizes:
            print "~~~~~~~~~~"
            print 'cz = ', cluster_sz
            # calculate language vectors
            lang_vectors = random_idx.generate_RI_lang(N, RI_letters, cluster_sz, ordered, languages=languages)
            total_vec.append(lang_vectors)
        # print cosine angles
        if ordered == 0:
            ord_str = 'unordered!'
        else:
            ord_str = 'ordered!'
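# Once the grid loop finishes, the variance matrix can be visualized directly;
# a minimal sketch using matplotlib, assuming V has been populated inside the
# loop (the excerpt truncates before showing how each V[i, j] is computed).
import matplotlib.pyplot as plt
plt.imshow(V, aspect='auto', interpolation='nearest')
plt.xlabel('k index (log-spaced from 1 to 10^4)')
plt.ylabel('N index')
plt.colorbar()
plt.show()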
import string
import pickle
import sys

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import manifold, datasets

import random_idx

k = 500
N = 1000
n_components = 2
n_neighbors = 10
method = 'hessian'
alphabet = string.lowercase + ' '

a = '../preprocessed_texts/english/with_spaces/alice_in_wonderland.txt'
one_hot_encoding = random_idx.generate_letter_id_vectors(N, k)

# build one random index vector per word in the corpus
lst = []
with open(a, 'r') as f:
    for line in f:
        for word in line.split():
            beta = random_idx.id_vector(N, word, alphabet, one_hot_encoding)
            lst.append(beta)

big_matrix = np.vstack(lst)
big_matrix = big_matrix[0:1000]     # keep the first 1000 word vectors
print big_matrix.shape
print "compressing data"
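# The excerpt stops at the "compressing data" print. Given the n_components,
# n_neighbors, and method='hessian' settings above, a plausible continuation
# is scikit-learn's locally linear embedding; this is a hedged sketch of the
# intended next step, not code taken from the repo.
# Hessian LLE requires n_neighbors > n_components * (n_components + 3) / 2,
# which holds here (10 > 5).
lle = manifold.LocallyLinearEmbedding(n_neighbors=n_neighbors,
                                      n_components=n_components,
                                      method=method)
embedded = lle.fit_transform(big_matrix)
plt.scatter(embedded[:, 0], embedded[:, 1], s=5)
plt.title('word vectors, hessian LLE')
plt.show()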
def generate_letter_id_vectors(N, k, alph=alphabet):
    # build row-wise k-sparse random index matrix;
    # each row is the random index vector for one letter of alph
    return random_idx.generate_letter_id_vectors(N, k, alph)
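# Quick shape check for the wrapper above; the (len(alph), N) shape is
# inferred from the row-wise comment, not verified against the library.
RI = generate_letter_id_vectors(1000, 500)
print RI.shape                      # expected: (len(alphabet), 1000)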