import pickle
import string
import Queue

import numpy as np
import matplotlib.pyplot as plt
from sklearn import manifold

import random_idx
import utils

# Module-level configuration (N, k, alph/alphabet, RI_letters, ordered) is
# assumed to be defined alongside these functions; random_idx and utils are
# this repo's own modules.

def find_letter_partner(test_letter, english_vector=english_vector):
    # Test a letter for its "block partner": unbind the letter's vector from
    # the English language vector (roll by one position and multiply
    # pointwise), then rank the remaining letters against the result.
    test_vec = random_idx.id_vector(N, test_letter, alph, RI_letters, ordered=ordered)
    factored_eng = np.multiply(english_vector, np.roll(test_vec, 1))
    likely_block_partner, values, partners = utils.find_language(
        test_letter, factored_eng, RI_letters, alph, display=1)
    return likely_block_partner, values, partners

def find_letter_partner(test_letter, lang_vector):
    # Generalized version of the above: takes any language vector rather than
    # defaulting to the English one.
    test_vec = random_idx.id_vector(N, test_letter, alph, RI_letters, ordered=ordered)
    factored_lang = np.multiply(lang_vector, np.roll(test_vec, 1))
    likely_block_partner, values, partners = utils.find_language(
        test_letter, factored_lang, RI_letters, alph, display=1)
    return likely_block_partner, values, partners

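# Usage sketch (illustrative, not from the original source): find the likely
# block partner of "t". Assumes a language vector such as `english_vector`
# was built elsewhere in the repo, along with the module-level alph/RI_letters.
partner, values, partners = find_letter_partner("t", english_vector)
print partner
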
def generate_RI_block_vectors(cz=2):
    # Generate a vector for every ordered letter block of size cz, for
    # comparison against a whole-language vector.
    RI_blocks = utils.generate_ordered_clusters(alph, cluster_sz=cz)
    RI_block_vectors = np.zeros((len(RI_blocks), N))
    for i in xrange(len(RI_blocks)):
        block = RI_blocks[i]
        block_vec = random_idx.id_vector(N, block, alph, RI_letters, ordered=ordered)
        RI_block_vectors[i, :] = block_vec
    return RI_blocks, RI_block_vectors

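# Usage sketch (illustrative): build all bigram block vectors and score them
# against a language vector by dot product. `lang_vector` is assumed to be a
# (1, N) language vector built elsewhere, e.g. by create_full_words_vec below.
blocks, block_vectors = generate_RI_block_vectors(cz=2)
scores = np.dot(block_vectors, lang_vector.T)
print blocks[np.argmax(scores)]
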
def create_full_words_vec(vocab_file, N=N, k=k):
    # Sum the vectors of every word listed in the vocabulary file (the word is
    # whatever precedes the first ":" on each line) and pickle the total.
    fread = open(vocab_file, "r")
    fwrite = open("../intermediate/alice_full_words_vec", "w")
    total_lang = np.zeros((1, N))
    for line in fread:
        word = line[:line.index(":")]
        word_vector = random_idx.id_vector(N, word, alph, RI_letters, ordered)
        total_lang += word_vector
    pickle.dump(total_lang, fwrite)
    fread.close()
    fwrite.close()
    return total_lang

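# Usage sketch (illustrative): build the summed word vector from a vocabulary
# file and read it back later. The vocabulary path below is a placeholder.
total_lang = create_full_words_vec("../intermediate/alice_vocab.txt")
with open("../intermediate/alice_full_words_vec", "r") as f:
    reloaded = pickle.load(f)
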
def vocab_vector(lv, lang_vectors, filepath="preprocessed_texts/alice-only-spaced.txt"):
    # Sum the vector encodings of every space-separated word in the text file
    # into one vocabulary vector; also return the length of the longest word.
    f = open(filepath, "r")
    text = f.read().split(" ")
    vocab_vec = np.zeros((1, N))
    max_length = 0
    for word in text:
        word_vec = random_idx.id_vector(N, word, alphabet, lv, ordered)
        vocab_vec += word_vec
        if len(word) > max_length:
            max_length = len(word)
    f.close()
    return vocab_vec, max_length

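# Usage sketch (illustrative): build the vocabulary vector from the default
# text with the module-level letter vectors. The second argument is not used
# inside the function body, so None is passed here.
vocab_vec, max_len = vocab_vector(RI_letters, None)
print max_len
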
def predict2(lvl_n, pref):
    # Rank candidate next letters for the prefix `pref` against the n-gram
    # language vector `lvl_n`: accumulate ID vectors for every suffix of the
    # prefix, unbind them from the language vector (roll by one position and
    # multiply pointwise), then score each letter by dot product.
    v = ""
    prefix = np.zeros((1, 10000))
    for i in pref[::-1]:
        v = i + v
        p = random_idx.id_vector(N, v, alph, RI_letters, ordered)
        prefix = np.add(p, prefix)
    sprefix = np.roll(prefix, 1)
    t = np.multiply(lvl_n, sprefix)
    q = Queue.PriorityQueue()
    for i in range(26):
        result = np.dot(t, RI_letters[i])
        # Negate so the largest dot product comes out of the queue first.
        q.put((-result, result, len(pref) + 1, pref, alph[i]))
    return q

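# Usage sketch (illustrative): pull ranked next-letter candidates out of the
# returned priority queue. `lang_vec` is assumed to be an n-gram language
# vector built elsewhere in the repo.
q = predict2(lang_vec, "th")
while not q.empty():
    neg_score, score, new_length, prefix, letter = q.get()
    print prefix, "->", letter, score
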
def predict(pref, length, lang_vec):
    # Rank candidate next letters given the first `length` characters of the
    # current word: binarize the n-gram language vector, unbind the encoded
    # prefix from it, then score each letter by dot product. Note the prefix
    # is encoded from the module-level string `word`, not from `pref`.
    ngram = lang_vec
    clamp_to_binary(ngram, 0)
    prefix = random_idx.id_vector(N, word[0:length], alph, RI_letters, ordered)
    sprefix = np.roll(prefix, 1)
    prefix_ngram = np.multiply(ngram, sprefix)
    q = Queue.PriorityQueue()
    for i in range(26):
        result = np.dot(prefix_ngram, RI_letters[i])
        # Negative priority: the best-scoring letter is retrieved first.
        q.put((-result, result, length + 1, pref, alph[i]))
    return q

k = 500
N = 1000
n_components = 2
n_neighbors = 10
method = 'hessian'
alphabet = string.lowercase + ' '
a = '../preprocessed_texts/english/with_spaces/alice_in_wonderland.txt'
one_hot_encoding = random_idx.generate_letter_id_vectors(N, k)

# Encode every word of the text as a letter-cluster vector.
lst = []
with open(a, 'r') as f:
    for line in f:
        for word in line.split():
            beta = random_idx.id_vector(N, word, alphabet, one_hot_encoding)
            lst.append(beta)

# Stack the word vectors and keep the first 1000 for the embedding.
big_matrix = np.vstack(lst)
big_matrix = big_matrix[0:1000]
print big_matrix.shape

print "compressing data"
fig = plt.figure(figsize=(15, 8))
tsne = manifold.TSNE(n_components=n_components, init='pca', random_state=0)
Y = tsne.fit_transform(big_matrix)

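# Possible continuation (sketch, not from the original source): scatter the
# 2-D t-SNE embedding on the figure created above.
ax = fig.add_subplot(111)
ax.scatter(Y[:, 0], Y[:, 1], s=5)
plt.show()
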
def id_vector(N, cluster, alphabet, RI_letters, ordered=0):
    # Thin wrapper around random_idx.id_vector.
    return random_idx.id_vector(N, cluster, alphabet, RI_letters, ordered)