def initialize(filepath="preprocessed_texts/alice-only-spaced.txt"):
    """Build letter-id and language vectors for *filepath* and pickle them.

    Writes three pickles under intermediate/: the letter id vectors (lv),
    the per-cluster-size language vectors, and the (initially empty)
    n-gram frequency dictionaries.
    """
    # Letter id vectors; 15000 is presumably the sparsity k -- TODO confirm.
    lv = random_idx.generate_letter_id_vectors(N, 15000)

    # Read and filter the text down to the working alphabet, first 10k chars.
    # NOTE(review): `text` is computed but never used below -- looks like
    # leftover code; kept so the file-existence check still happens.
    with open(filepath, "r") as f:
        text = ''.join([x for x in f.read() if x in alphabet])[0:10000]

    # For each n: dictionary of key: n-gram and value: frequency.
    n_gram_frequencies = [{} for _ in range(len(cluster_sizes) + 1)]

    # Language vectors for each cluster size, with a zero vector inserted at
    # index 0 so lang_vectors[size] lines up with cluster_sizes.
    lang_vectors = [create_lang_vec(filepath, lv, [size]) for size in cluster_sizes]
    lang_vectors.insert(0, np.zeros((1, N)))

    # Persist intermediates; pickle files must be opened in binary mode.
    with open("intermediate/lv", "wb") as fwrite:
        pickle.dump(lv, fwrite)
    with open("intermediate/lang_vectors", "wb") as fwrite1:
        pickle.dump(lang_vectors, fwrite1)
    with open("intermediate/n_gram_frequencies", "wb") as fwrite2:
        pickle.dump(n_gram_frequencies, fwrite2)
Example #2
0
 def __init__(self, N, k, b):
     """Set up an N-dimensional, 2k-sparse random-index model with b basis elements."""
     # Plain configuration attributes.
     self.N = N  # dimensionality of the index space
     self.k = k  # half the number of nonzeros (2k-sparse vectors)
     self.b = b  # number of basis elements to learn
     # Mapping from word -> its random-index vector, populated later.
     self.wordz = {}
     # Random basis drawn first, then the letter id vectors, so the RNG is
     # consumed in the same order as before.
     self.basis = np.random.rand(N, b)
     self.RI_letters = ridx.generate_letter_id_vectors(self.N, self.k, alph=alphabet)
def gen_lets(N=N, k=k):
    """Return the letter id vectors and a copy scaled by their matrix norm.

    NOTE(review): the whole matrix is divided by a single (Frobenius) norm,
    not normalized row-by-row -- confirm that is intended.
    """
    letters = random_idx.generate_letter_id_vectors(N, k)
    scaled = letters / np.linalg.norm(letters)
    return letters, scaled
def gen_lets(N=N, k=k):
    """Generate letter id vectors plus a variant divided by their overall norm."""
    # NOTE(review): duplicate definition of gen_lets -- this later one wins
    # at import time; consider removing one of the two.
    ri = random_idx.generate_letter_id_vectors(N, k)
    return ri, ri / np.linalg.norm(ri)
# Candidate sparsities k, 10 points log-spaced from 1 to 10**4.
ks = list(np.round((np.logspace(0,4,10))))

#V = np.zeros((len(Ns),len(sparsities)))
V = np.zeros((len(Ns), len(ks)))  # variance matrix, rows = Ns, cols = ks

# build V (matrix of variances)
# NOTE(review): this fragment is truncated -- the final `else:` below has no
# body in this chunk, so the remainder must live elsewhere in the file.
for i in xrange(len(Ns)):
        N = int(Ns[i])
        #for j in xrange(len(sparsities)):
        for j in xrange(len(ks)):
                #sparsity = sparsities[j]
                #k = int(N*sparsity/2)
                k = ks[j]
                if k >= N:  # a 2k-sparse vector needs k < N; skip invalid combos
                        continue
                RI_letters = random_idx.generate_letter_id_vectors(N,k)
                total_vec = []
                print N,k
                print '=========='

                # iterate over ordered vs unordered
                for cluster_sz in cluster_sizes:
                        print "~~~~~~~~~~"
                        print 'cz = ', cluster_sz
                        # calculate language vectors
                        lang_vectors = random_idx.generate_RI_lang(N, RI_letters, cluster_sz, ordered, languages=languages)
                        total_vec.append(lang_vectors)
                        # print cosine angles 
                        if ordered == 0:
                                ord_str = 'unordered!'
                        else:
import string
import pandas as pd
import matplotlib.pyplot as plt
import pickle
import sys
from sklearn import manifold, datasets

k = 500
N = 1000
n_components = 2
n_neighbors = 10
method = 'hessian'
alphabet = string.lowercase + ' '
a = '../preprocessed_texts/english/with_spaces/alice_in_wonderland.txt'

one_hot_encoding = random_idx.generate_letter_id_vectors(N, k)
lst = []

with open(a, 'r') as f:
    for line in f:
        for word in line.split():
            beta = random_idx.id_vector(N, word, alphabet, one_hot_encoding)
            lst.append(beta)

tup = tuple(lst)

big_matrix = np.vstack(lst)
big_matrix = big_matrix[0:1000]
print big_matrix.shape

print "compressing data"
def generate_letter_id_vectors(N, k, alph=alphabet):
    """Build a row-wise k-sparse random index matrix.

    Each row is the random index vector for one letter of *alph*.
    Thin wrapper around random_idx.generate_letter_id_vectors.
    """
    # Bug fix: the original passed the undefined name `K` (NameError at call
    # time); the parameter is lowercase `k`.
    return random_idx.generate_letter_id_vectors(N, k, alph)
import pandas as pd
import matplotlib.pyplot as plt
import pickle
import sys
from sklearn import manifold, datasets


k = 500
N = 1000
n_components = 2 
n_neighbors = 10
method = 'hessian'
alphabet = string.lowercase + ' '
a = '../preprocessed_texts/english/with_spaces/alice_in_wonderland.txt'

one_hot_encoding = random_idx.generate_letter_id_vectors(N, k)
lst = []

with open(a, 'r') as f:
	for line in f:
		for word in line.split():
			beta = random_idx.id_vector(N, word, alphabet, one_hot_encoding)
			lst.append(beta)

tup = tuple(lst)

big_matrix = np.vstack(lst)
big_matrix = big_matrix[0:1000]
print big_matrix.shape

print "compressing data"
def generate_letter_id_vectors(N, k, alph=alphabet):
    """Build a row-wise k-sparse random index matrix.

    Each row is the random index vector for one letter of *alph*;
    delegates to random_idx.generate_letter_id_vectors.
    """
    # Bug fix: `K` was undefined (NameError) -- the parameter is `k`.
    return random_idx.generate_letter_id_vectors(N, k, alph)