def find_letter_partner(test_letter, lang_vector):
    """Probe a language vector for the most likely "block partner" of a letter.

    Encodes *test_letter* as a random-index vector, "unbinds" it from
    *lang_vector* by element-wise multiplication against the circularly
    shifted letter vector, then asks utils.find_language which letter
    vector the residue most resembles.

    Parameters:
        test_letter: letter (or letter block) string to probe.
        lang_vector: aggregate language vector (numpy array, length N)
            to factor the letter out of.

    Returns:
        (likely_block_partner, values, partners) as reported by
        utils.find_language with display=1.

    NOTE(review): relies on module-level globals N, alph, RI_letters,
    ordered, random_idx and utils being defined before this is called —
    confirm against the surrounding module.
    """
    test_vec = random_idx.id_vector(N, test_letter, alph, RI_letters,
                                    ordered=ordered)
    # Unbinding step: multiplying by the rolled (shift-by-one) letter
    # vector isolates whatever was bound adjacent to this letter when the
    # language vector was built.
    factored_lang = np.multiply(lang_vector, np.roll(test_vec, 1))
    likely_block_partner, values, partners = utils.find_language(
        test_letter, factored_lang, RI_letters, alph, display=1)
    return likely_block_partner, values, partners
def find_letter_partner(test_letter, english_vector=english_vector):
    """Probe the English language vector for *test_letter*'s "block partner".

    Variant of find_letter_partner fixed on the English vector: encodes
    *test_letter* as a random-index vector, unbinds it from
    *english_vector* (element-wise multiply against the shifted letter
    vector), and scores the residue against every letter vector.

    Parameters:
        test_letter: letter (or letter block) string to probe.
        english_vector: language vector to factor the letter out of.
            NOTE(review): the default captures the module-level
            english_vector at def time, not at call time — later rebinding
            of the global is NOT picked up; confirm this is intended.

    Returns:
        (likely_block_partner, values, partners) as reported by
        utils.find_language with display=1.

    NOTE(review): relies on module-level globals N, alph, RI_letters,
    ordered, random_idx and utils — confirm against the surrounding module.
    """
    test_vec = random_idx.id_vector(N, test_letter, alph, RI_letters,
                                    ordered=ordered)
    # Unbinding step: the rolled letter vector recovers whatever was bound
    # next to this letter when the English vector was accumulated.
    factored_eng = np.multiply(english_vector, np.roll(test_vec, 1))
    likely_block_partner, values, partners = utils.find_language(
        test_letter, factored_eng, RI_letters, alph, display=1)
    return likely_block_partner, values, partners
# NOTE(review): incomplete fragment left byte-identical (collapsed line).
# The opening bilinguals.append((languages[i], languages[j])) sits inside
# i/j loops whose headers are outside this chunk, so the enclosing scope
# cannot be reconstructed here without guessing.
# What the visible code does: builds one vector per language pair as the
# sum of the two L2-normalized language vectors, then scores the unknown
# text against the stacked languages + bilinguals; a triple-quoted block
# of older scoring/print code is dead.
bilinguals.append((languages[i],languages[j])) #print bilinguals bilingual_vectors = np.zeros((len(bilinguals),N)) for i in xrange(len(bilinguals)): lang1, lang2 = bilinguals[i] lang1_idx = languages.index(lang1) lang2_idx = languages.index(lang2) bilingual_vectors[i,:] = final_lang[lang1_idx,:]/np.linalg.norm(final_lang[lang1_idx,:]) + final_lang[lang2_idx,:]/np.linalg.norm(final_lang[lang2_idx,:]) ''' print '\n' # compare with "unknown text" #final_unknown = sum(unknown_tots) utils.find_language(unknown_txt, final_unknown, final_lang, languages, display=1) ''' print '\n' # compare with "unknown text" on bilinguals print '========' utils.find_language(unknown_txt, final_unknown, np.vstack((final_lang, bilingual_vectors)), languages + bilinguals, display=1) ''' print '=========' print 'N = ' + str(N) + '; k = ' + str(k) + '; max size letters clusters are ' + str(cluster_max) + '\n' cosangles = utils.cosangles(final_lang, languages, display=0) print "variance of language values: " + str(utils.var_measure(cosangles)) '''
# NOTE(review): incomplete fragment left byte-identical (collapsed line).
# Starts with testf = test_fn[i] — the loop over i that supplies that
# index (and the correct/total counters used below) lives outside this
# chunk. It appears to be an auto-reformatted duplicate of the
# classification loop seen elsewhere in this file: vectorize one test
# file per (cluster size, ordering) combo, sum the vectors, classify,
# and tally accuracy.
testf = test_fn[i] actual_lang = re.findall('(\w+)_\d+_p.txt$', testf)[0] unknown_tots = [] #print len(testf),testf[91:93] #if testf == main_base + test_dir + '/da_432_p.txt': for cluster_sz in cluster_sizes: for ordered in ordy: #print testf[71:] # calculate unknown vector unknown_vector = random_idx.generate_RI_text( N, RI_letters, cluster_sz, ordered, testf) unknown_tots.append(unknown_vector) final_unknown = sum(unknown_tots) likely_lang = utils.find_language(testf, final_unknown, final_lang, languages, display=0) #print testf[91:], '=> ',likely_lang if lang_map[actual_lang] == likely_lang: correct += 1 print "correct: ", correct, "; total: ", total, "; final percentage correct: ", float( correct) / total ############################### # dimension reduction plot to view vectors in 2-d print '=========' print 'N = ' + str(N) + '; k = ' + str( k) + '; max size letters clusters are ' + str(cluster_max) + '\n'
##print "~~~~~~~~~~" total_vectors = [] # calculate language vectors lang_vectors = random_idx.generate_RI_lang(N, RI_letters, cluster_sz, ordered, languages=languages) #lang_vectors = random_idx.generate_RI_lang_words(N, RI_letters, languages=languages) total_vectors.append(lang_vectors) # print cosine angles #print '==========' # print 'N = ' + str(N) + '; k = ' + str(k) + '; letters clusters are ' + str(cluster_sz) + ', ' + ord_str + '\n' cosangles = utils.cosangles(lang_vectors, languages) variance = utils.var_measure(cosangles) #print "variance of language values: " + str(utils.var_measure(cosangles)) final_lang = sum(total_vectors) ############################### # iterate through test files and calculate correctness print "Now waiting for input" while True: sentence = raw_input().decode(sys.stdin.encoding or locale.getpreferredencoding(True)) unknown_tots = [] sentence = cleaner(sentence) unknown_vector = random_idx.generate_RI_sentence(N, RI_letters, cluster_sz, ordered,sentence) unknown_tots.append(unknown_vector) final_unknown = sum(unknown_tots) likely_lang = utils.find_language(sentence, final_unknown, final_lang, languages,display=0) print "likely language:", likely_lang
correct = 0 for i in trange(total): testf = test_fn[i] actual_lang = re.findall('(\w+)_\d+_p.txt$', testf)[0] unknown_tots = [] #print len(testf),testf[91:93] #if testf == main_base + test_dir + '/da_432_p.txt': for cluster_sz in cluster_sizes: for ordered in ordy: #print testf[71:] # calculate unknown vector unknown_vector = random_idx.generate_RI_text(N, RI_letters, cluster_sz, ordered,testf) unknown_tots.append(unknown_vector) final_unknown = sum(unknown_tots) likely_lang = utils.find_language(testf, final_unknown, final_lang, languages,display=0) #print testf[91:], '=> ',likely_lang if lang_map[actual_lang] == likely_lang: correct +=1 print "correct: ", correct, "; total: ", total,"; final percentage correct: ", float(correct)/total ############################### # dimension reduction plot to view vectors in 2-d print '=========' print 'N = ' + str(N) + '; k = ' + str(k) + '; max size letters clusters are ' + str(cluster_max) + '\n' cosangles = utils.cosangles(final_lang, languages, display=0) print "variance of language values: " + str(utils.var_measure(cosangles))
# NOTE(review): incomplete fragment left byte-identical (collapsed line).
# Starts with total_vectors.append(lang_vectors) — total_vectors,
# lang_vectors, cluster_sz and ordered are all defined outside this chunk
# (presumably in enclosing loops), so the surrounding scope cannot be
# reconstructed here. It appears to be an auto-reformatted duplicate of
# the interactive-classifier tail seen elsewhere in this file: sum the
# language vectors into final_lang, then loop forever reading a sentence
# from stdin, vectorizing it, and printing the most likely language.
total_vectors.append(lang_vectors) # print cosine angles #print '==========' # print 'N = ' + str(N) + '; k = ' + str(k) + '; letters clusters are ' + str(cluster_sz) + ', ' + ord_str + '\n' cosangles = utils.cosangles(lang_vectors, languages) variance = utils.var_measure(cosangles) #print "variance of language values: " + str(utils.var_measure(cosangles)) final_lang = sum(total_vectors) ############################### # iterate through test files and calculate correctness print "Now waiting for input" while True: sentence = raw_input().decode( sys.stdin.encoding or locale.getpreferredencoding(True)) unknown_tots = [] sentence = cleaner(sentence) unknown_vector = random_idx.generate_RI_sentence( N, RI_letters, cluster_sz, ordered, sentence) unknown_tots.append(unknown_vector) final_unknown = sum(unknown_tots) likely_lang = utils.find_language(sentence, final_unknown, final_lang, languages, display=0) print "likely language:", likely_lang