def find_letter_partner(test_letter, english_vector=english_vector):
    # testing letter blocks for their "block partners"
    test_vec = random_idx.id_vector(N, test_letter, alph, RI_letters, ordered=ordered)
    #test_vec = test_vec/np.linalg.norm(test_vec)
    '''
    sub_eng = np.copy(english_vector)
    for r in xrange(len(blocks)):
        block = blocks[r]
        if test_letter != block[0]:
            sub_eng[:, RI_blocks[r,:] != 0] = 1e-2
    print sub_eng
    '''
    cz = len(test_letter)
    #if cz > 1:
    #    for i in xrange(len(alph)):
    #        english_vector -= RI_letters[i,:]
    #english_vector /= np.linalg.norm(english_vector)

    # unbind: multiply the language vector by the permuted (rolled) probe vector
    factored_eng = np.multiply(english_vector, np.roll(test_vec, 1))

    likely_block_partner, values, partners = utils.find_language(test_letter, factored_eng, RI_letters, alph, display=1)
    return likely_block_partner, values, partners
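For context, the roll-and-multiply "unbind" step follows the permutation-binding scheme of random indexing: a bigram is encoded as the elementwise product of one letter's index vector with a cyclically shifted copy of the other's, so multiplying the language vector by the shifted probe approximately isolates the partner letter. A minimal self-contained demo, with hypothetical ±1 index vectors standing in for RI_letters:

import numpy as np

N = 1000                                      # hypothetical dimensionality
rng = np.random.RandomState(0)
A = rng.choice([-1, 1], N)                    # index vector for 'a'
B = rng.choice([-1, 1], N)                    # index vector for 'b'
C = rng.choice([-1, 1], N)                    # index vector for 'c'
lang = np.roll(A, 1) * B + np.roll(C, 1) * A  # encode bigrams "ab" and "ca"
probe = lang * np.roll(A, 1)                  # unbind: what follows 'a'?
print np.dot(probe, B), np.dot(probe, C)      # B scores near N, C near 0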
Example #2
def find_letter_partner(test_letter, lang_vector):
    # testing letter blocks for their "block partners"
    test_vec = random_idx.id_vector(N,
                                    test_letter,
                                    alph,
                                    RI_letters,
                                    ordered=ordered)
    #test_vec = test_vec/np.linalg.norm(test_vec)
    '''
    sub_eng = np.copy(english_vector)
    for r in xrange(len(blocks)):
        block = blocks[r]
        if test_letter != block[0]:
            sub_eng[:, RI_blocks[r,:] != 0] = 1e-2
    print sub_eng
    '''

    cz = len(test_letter)
    #if cz > 1:
    #    for i in xrange(len(alph)):
    #        english_vector -= RI_letters[i,:]
    #english_vector /= np.linalg.norm(english_vector)

    # unbind: multiply the language vector by the permuted (rolled) probe vector
    factored_lang = np.multiply(lang_vector, np.roll(test_vec, 1))

    #if len(test_letter) == 1:
    likely_block_partner, values, partners = utils.find_language(test_letter,
                                                                 factored_lang,
                                                                 RI_letters,
                                                                 alph,
                                                                 display=1)
    return likely_block_partner, values, partners
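A minimal call sketch for the version above, assuming lang_vector was accumulated elsewhere from corpus n-grams and that N, alph, RI_letters and ordered are module-level globals as in the surrounding snippets:

likely, values, partners = find_letter_partner("t", lang_vector)
print likely   # the block partner that utils.find_language ranks highest for "t"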
Example #4
def create_full_words_vec(vocab_file, N=N, k=k):
    fread = open(vocab_file, "r")
    fwrite = open("../intermediate/alice_full_words_vec", "wb")  # binary mode for pickle
    total_lang = np.zeros((1, N))

    for line in fread:
        word = line[:line.index(":")]  # vocab lines look like "word:count"
        word_vector = random_idx.id_vector(N, word, alph, RI_letters, ordered)
        total_lang += word_vector

    pickle.dump(total_lang, fwrite)
    fread.close()
    fwrite.close()
    return total_lang
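A usage sketch (path hypothetical; each vocab line is assumed to look like "word:count", matching the line[:line.index(":")] parse):

total_lang = create_full_words_vec("../intermediate/alice_vocab")
print total_lang.shape   # (1, N): one bundled vector for the whole vocabulary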
def generate_RI_block_vectors(cz=2):
    # generate letter block values for total comparison
    RI_blocks = utils.generate_ordered_clusters(alph, cluster_sz=cz)
    #print RI_blocks
    RI_block_vectors = np.zeros((len(RI_blocks), N))
    for i in xrange(len(RI_blocks)):
        block = RI_blocks[i]
        #print block
        block_vec = random_idx.id_vector(N,
                                         block,
                                         alph,
                                         RI_letters,
                                         ordered=ordered)
        RI_block_vectors[i, :] = block_vec
    return RI_blocks, RI_block_vectors
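A usage sketch: build the id vectors for every ordered letter block of size 2 (alph, N, RI_letters and ordered assumed global as above):

RI_blocks, RI_block_vectors = generate_RI_block_vectors(cz=2)
print len(RI_blocks), RI_block_vectors.shape   # one row of length N per block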
def vocab_vector(lv, lang_vectors, filepath="preprocessed_texts/alice-only-spaced.txt"):
    f = open(filepath, "r")
    text = f.read()
    text = text.split(" ")
    #text = ''.join([x for x in text if x in alphabet])[0:10000]
    vocab_vec = np.zeros((1, N))
    max_length = 0
    for word in text:
        #print "generating vocab vector of cluster size", len(word)
        word_vec = random_idx.id_vector(N, word, alphabet, lv, ordered)
        vocab_vec += word_vec
        if len(word) > max_length:
            max_length = len(word)
    f.close()
    return vocab_vec, max_length
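A usage sketch; lv is the letter id-vector matrix, and lang_vectors is accepted but never used by the body as written:

vocab_vec, max_length = vocab_vector(RI_letters, None)
print max_length   # length of the longest word in the text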
def predict2(lvl_n, pref):
    v = ""
    prefix = np.zeros((1, N))  # must match the id_vector dimensionality N
    pref_reversed = pref[::-1]
    for i in pref_reversed:
        v = i + v
        p = random_idx.id_vector(N, v, alph, RI_letters, ordered)
        prefix = np.add(p, prefix)

    sprefix = np.roll(prefix, 1)
    t = np.multiply(lvl_n, sprefix)

    q = Queue.PriorityQueue()

    for i in range(26):
        result = np.dot(t, RI_letters[i])
        q.put((-result, result, len(pref) + 1, pref, alph[i]))

    return q
def predict(pref, length, lang_vec):
    ngram = lang_vec
    clamp_to_binary(ngram, 0)
    #clamp(ngram, -27, 27)
    #clamp(ngram, -10, 10)
    prefix = random_idx.id_vector(N, pref[0:length], alph, RI_letters, ordered)
    sprefix = np.roll(prefix, 1)
    prefix_ngram = np.multiply(ngram, sprefix)
    #print prefix_ngram

    q = Queue.PriorityQueue()

    for i in range(26):
        #may need to np.transpose(vector)
        result = np.dot(prefix_ngram, RI_letters[i])
        # rank the next letters by their dot products: (priority, value, ...)
        q.put((-result, result, length + 1, pref, alph[i]))
    return q
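A draining sketch for the two predictors; both return a Queue.PriorityQueue whose smallest item carries the largest dot product, so pulling items off ranks candidate next letters (lang_vec is a hypothetical language vector; predict2 is called as predict2(lang_vec, pref) instead):

q = predict("th", 2, lang_vec)
while not q.empty():
    neg_score, score, next_len, prefix, letter = q.get()
    print letter, score   # best candidates print first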
Example #11
import string

import numpy as np
import matplotlib.pyplot as plt
from sklearn import manifold

import random_idx

k = 500
N = 1000
n_components = 2
n_neighbors = 10    # unused in this snippet
method = 'hessian'  # unused in this snippet
alphabet = string.lowercase + ' '
a = '../preprocessed_texts/english/with_spaces/alice_in_wonderland.txt'

one_hot_encoding = random_idx.generate_letter_id_vectors(N, k)
lst = []

with open(a, 'r') as f:
    for line in f:
        for word in line.split():
            beta = random_idx.id_vector(N, word, alphabet, one_hot_encoding)
            lst.append(beta)

tup = tuple(lst)

big_matrix = np.vstack(lst)
big_matrix = big_matrix[0:1000]
print big_matrix.shape

print "compressing data"

fig = plt.figure(figsize=(15, 8))

tsne = manifold.TSNE(n_components=n_components, init='pca', random_state=0)
Y = tsne.fit_transform(big_matrix)
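The script creates fig but never draws into it; a minimal continuation, assuming Y has shape (n_words, 2):

plt.scatter(Y[:, 0], Y[:, 1], s=4)
plt.title("t-SNE embedding of random-index word vectors")
plt.show()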
# thin wrapper that simply forwards to random_idx.id_vector
def id_vector(N, cluster, alphabet, RI_letters, ordered=0):
    return random_idx.id_vector(N, cluster, alphabet, RI_letters, ordered)