def fun(grid, alt):
    n = len(grid)
    tl, tr, bl, br = grid[0][n - 2], grid[0][n - 1], grid[n - 1][n - 2], alt
    s_top = similarity(modifier(tl), tr)
    s_bot = similarity(modifier(bl), br)
    # return min(s_top, s_bot)
    return s_top * s_bot
def remove_noise(data, cutoff, radius, colors=None):
    if colors == []:
        colors = None
    sim = similarity.similarity(data, cutoff, radius, colors=colors)
    sim.bin_data()
    sim.delete_bins()
    newdata = []
    newcolors = []
    for bin in sim.bins:
        for b in bin:
            newdata.append(sim.data[b])
            if colors is not None:
                newcolors.append(colors[b])
    if colors is not None:
        # Remap the surviving colours to contiguous indices in order of first appearance.
        indices = []
        for n in newcolors:
            if n not in indices:
                indices.append(n)
        for i in range(len(newcolors)):
            newcolors[i] = indices.index(newcolors[i])
    return newdata, newcolors
def filter(self, s, id_set):
    """ Return the set of ids that need an alert. """
    word_list = self.tf_idf_hd.get_top_n_tf_idf(s)
    word_list_len = len(word_list)
    print "/".join(word_list)
    repeat_tid_set = set()
    ret_set = id_set
    for i in range(word_list_len):
        key_word_list = [self.word_key_pre + word for word in word_list]
        if word_list_len > 1:
            del key_word_list[i]
        tid_set_s = self.r_hd.sinter(key_word_list)
        tid_set = set([int(i) for i in tid_set_s])
        # tid_set holds the duplicate ids; merge it into the overall duplicate id set
        repeat_tid_set |= tid_set
    key_word_list = [self.word_key_pre + word for word in word_list]
    if repeat_tid_set:
        repeat_tid_list = list(repeat_tid_set)
        if word_list_len < self.sim_judge_limit:
            title_key_list = [self.title_id_pre + str(i) for i in repeat_tid_list]
            # fetch all titles and compare their similarity
            title_list = self.r_hd.mget(title_key_list)
            idx = -1
            for title in title_list:
                idx += 1
                if similarity(s, title) > 0.5:
                    break
            if idx >= 0:
                l_id_set_s = self.r_hd.smembers(self.uid_pre + str(repeat_tid_list[idx]))
                l_id_set = set([int(i) for i in l_id_set_s])
                overtime_uid_set = self.check_for_uid_overtime(repeat_tid_set, id_set)
                ret_set = (id_set - l_id_set) | overtime_uid_set
                self.update_uid_to_redis(repeat_tid_list, key_word_list, ret_set)
            else:
                # if there is no similar title, insert it
                self.insert_s_to_redis(s, key_word_list, ret_set)
        else:
            tid_uid_key_list = [self.uid_pre + str(tid) for tid in repeat_tid_set]
            l_id_set_s = self.r_hd.sunion(tid_uid_key_list)
            l_id_set = set([int(i) for i in l_id_set_s])
            overtime_uid_set = self.check_for_uid_overtime(repeat_tid_set, id_set)
            print "overtime uid set:", overtime_uid_set
            print "repeat tid set:", repeat_tid_set
            ret_set = (id_set - l_id_set) | overtime_uid_set
            self.update_uid_to_redis(repeat_tid_list, key_word_list, ret_set)
    else:
        # if nothing matched at all, just insert it as new
        self.insert_s_to_redis(s, key_word_list, id_set)
    return ret_set
def xor_rows_then_compare(grid, alt):
    if len(grid) == 2:
        return 0
    grid = copy.deepcopy(grid)
    grid[2].append(alt)
    top = reduce(lambda a, b: a ^ b, grid[0])
    bot = reduce(lambda a, b: a ^ b, grid[2])
    a_top = reduce(lambda a, b: a & b, grid[0], top)
    a_bot = reduce(lambda a, b: a & b, grid[2], bot)
    top_xor = pymorph.open(top ^ a_top)
    bot_xor = pymorph.open(bot ^ a_bot)
    return similarity(top_xor, bot_xor)
def compare_vectors(word_vector1, word_vector2):
    all_words = list(set(word_vector1).union(set(word_vector2)))
    #print all_words
    frequency_dict1 = word_frequencies(word_vector1)
    #print frequency_dict1
    frequency_dict2 = word_frequencies(word_vector2)
    #print frequency_dict2
    frequency_vector1 = [frequency_dict1.get(word, 0) for word in all_words]
    frequency_vector2 = [frequency_dict2.get(word, 0) for word in all_words]
    #print frequency_vector1, frequency_vector2
    return similarity(frequency_vector1, frequency_vector2)
def compare_vectors(word_vector1, word_vector2):
    """Numerical similarity between lists of words. Higher is better.

    Uses cosine similarity.
    Result range: 0 (bad) - 1 (uses all the same words in the same proportions)
    """
    all_words = list(set(word_vector1).union(set(word_vector2)))
    frequency_dict1 = word_frequencies(word_vector1)
    frequency_dict2 = word_frequencies(word_vector2)
    frequency_vector1 = [frequency_dict1.get(word, 0) for word in all_words]
    frequency_vector2 = [frequency_dict2.get(word, 0) for word in all_words]
    return similarity(frequency_vector1, frequency_vector2)
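# The two snippets above rely on helpers that are not shown: word_frequencies and
# similarity. The docstring says cosine similarity is used, so here is a minimal
# sketch of what those helpers might look like, assuming a plain bag-of-words count
# and cosine similarity over equal-length numeric vectors. The implementations below
# are assumptions for illustration, not the original code.
import math
from collections import Counter

def word_frequencies(words):
    # Count how often each word occurs (assumed helper).
    return Counter(words)

def similarity(v1, v2):
    # Cosine similarity between two equal-length numeric vectors (assumed helper).
    dot = sum(a * b for a, b in zip(v1, v2))
    norm1 = math.sqrt(sum(a * a for a in v1))
    norm2 = math.sqrt(sum(b * b for b in v2))
    if norm1 == 0 or norm2 == 0:
        return 0.0
    return dot / (norm1 * norm2)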
def match(E1, E2, kpts1, kpts2, desc1, desc2, cang, crat, cdesc, th_e, th_p, verb):
    '''
    E1, E2: hyperedge lists of img1 and img2, respectively
    '''
    # indices_taken = []
    hyperedge_matches = []
    point_matches = []
    sel_point_matches = set()
    if verb:
        count = 0
        size = len(E1) * len(E2)
    for i, e_i in enumerate(E1):
        max_similarity = -float('inf')
        for j, e_j in enumerate(E2):
            p = [np.array(kpts1[e_i[k]].pt) for k in xrange(3)]
            q = [np.array(kpts2[e_j[k]].pt) for k in xrange(3)]
            dp = [np.array(desc1[e_i[k]]) for k in xrange(3)]
            dq = [np.array(desc2[e_j[k]]) for k in xrange(3)]
            _point_idx, _sim, sim_a, sim_r, sim_d = similarity(
                p, q, dp, dq, cang, crat, cdesc
            )
            if verb:
                count += 1
                print '{}/{} = {:.2}%'.format(count, size, count / size * 100)
            if _sim > max_similarity:
                best_index = j
                max_similarity = _sim
                s_ang = sim_a
                s_ratios = sim_r
                s_desc = sim_d
                e_idx = [(e_i[l], e_j[m]) for l, m in _point_idx]
        if max_similarity >= th_e:
            hyperedge_matches.append(
                (i, best_index, max_similarity, s_ang, s_ratios, s_desc)
            )
            for l, m in e_idx:
                dist = LA.norm(np.subtract(desc1[l], desc2[m]))
                sim = exp(-dist / SIGMA)
                if not (l, m) in sel_point_matches and sim >= th_p:
                    point_matches.append(cv2.DMatch(l, m, dist))
                    sel_point_matches.add((l, m))
    return hyperedge_matches, point_matches
def isomorphic(G1, G2, name=None):
    if G1.number_of_nodes() != G2.number_of_nodes():
        print "Non-isomorphic: different number of nodes."
        return False
    [B1, B2] = stable_colouring([G1, G2])
    print "%d colours found." % len(B1)
    L1 = [np.sum(M) for M in B1]
    L2 = [np.sum(M) for M in B2]
    if L1 != L2:
        print "Non-isomorphic: different stable colouring."
        return False
    #field = next_prime_field(len(B1))
    field = GF(2)
    p = field.getCharacteristic()
    print "Testing similarity over GF(%d)" % p
    # Convert to numpy matrices, ignoring colours with zero entries
    C1 = [numpy_to_nzmath(B1[i], field) for i in range(len(B1)) if L1[i] != 0]
    C2 = [numpy_to_nzmath(B2[i], field) for i in range(len(B2)) if L2[i] != 0]
    assert len(C1) == len(C2)
    if similarity(C1, C2, name) is None:
        print "Non-isomorphic: no simultaneous similarity in GF(%d)." % p
        return False
    print "Isomorphic or too difficult to tell."
    return True
def echo_all(updates):
    for update in updates["result"]:
        try:
            text = update["message"]["text"]
            prev_score = 0
            found = 0  # reset the match flag for each incoming message
            instances = faq.query.all()
            print(instances)
            for instance in instances:
                score, match = similarity(text, instance.question)
                if match and prev_score < score:
                    Atext = instance.answer
                    chat = update["message"]["chat"]["id"]
                    found = 1
                    prev_score = score
            if found != 1:
                Atext = "Sorry I didn't quite get you"
                chat = update["message"]["chat"]["id"]
                send_message(Atext, chat)
            else:
                send_message(Atext, chat)
        except Exception as e:
            print(e)
# Generate text matrix with help of simple class TmgSimple
tm = TmgSimple(filename='../Data/textDocs.txt',
               stopwords_filename='../Data/stopWords.txt', stem=True)

# Extract variables representing data
X = np.mat(tm.get_matrix(sort=True))
attributeNames = tm.get_words(sort=True)

# Query vector
q = np.matrix([0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0])

# Method 1 ('for' loop - slow)
N = np.shape(X)[0]  # get the number of data objects
sim = np.zeros((N, 1))  # allocate a vector for the similarity
for i in range(N):
    x = X[i, :]  # Get the i'th data object (here: document)
    sim[i] = q / linalg.norm(q) * x.T / linalg.norm(x)  # Compute cosine similarity

# Method 2 (one line of code with no iterations - faster)
sim = (q * X.T).T / (np.sqrt(np.power(X, 2).sum(axis=1)) * np.sqrt(np.power(q, 2).sum(axis=1)))

# Method 3 (use the "similarity" function)
sim = similarity(X, q, 'cos')

# Display the result
print('Query vector:\n {0}\n'.format(q))
print('Similarity results:\n {0}'.format(sim))
def w(s):
    sys.stdout.write(s)

if __name__ == '__main__':
    conn = sqlite3.connect(DATABASE)
    files = os.listdir(ROOT_DIR)[15:30]
    nb = len(files)
    for x in xrange(nb):
        f = open(os.path.join(ROOT_DIR, files[x]))
        count = len(json.loads(f.read())['tags'])
        print "%-3d - %s (%d tags)" % (x, files[x], count)
    w("\n\n")
    # Print the matrix.
    w("   ")
    for x in xrange(nb):
        w(" %5d" % x)
    w("\n")
    for x in xrange(nb):
        w("%3d" % x)
        fx = open(os.path.join(ROOT_DIR, files[x]))
        tx = json.loads(fx.read())
        for y in xrange(nb):
            fy = open(os.path.join(ROOT_DIR, files[y]))
            ty = json.loads(fy.read())
            sim = similarity.similarity(tx, ty, conn)
            w(" %5.2f" % sim)
        w("\n")
def test_similarity(self):
    self.assertAlmostEqual(similarity([1, 1], [-1, -1]), -1)
    self.assertAlmostEqual(similarity([1, 1], [1, 1]), 1)
    self.assertAlmostEqual(similarity([0, 1], [1, 0]), 0)
def test_exact_similiarity(self):
    s = similarity("layer 4 neuron", "layer 4 neuron", use_inter_similarity=False)
    self.assertEqual(s[0], 1.0)
    print filename
    # compute Mel filterbank features
    mel.compute_mfb(filename)
    # compute dynamic features
    dynfeat.compute_dynamic_features(filename)
    selecfeat.mutual_information_features(filename)
'''

for filename in filenames:
    # select a small set of features
    trainFilenames = filenames[:]  # copy the filename list
    trainFilenames.remove(filename)
    selectFeatures = selecfeat.select_features(filename, trainFilenames)

    # compute the full similarity matrix and the upper diagonal (frame to frame distance)
    similarityUpperDiag = similarity.similarity(filename)
    similarityMatrix = similarity.full_similarity(selectFeatures)

    # get the segment dates
    similarSegmentsInd = similarity.segments_indices(filename)

    # plot the upper diagonal
    nPoints = len(similarityUpperDiag)
    rate = numpy.load(settings.DIR_SAMPLE_RATE + filename + '.npy')
    x = timeconv.timeconv(numpy.arange(nPoints), rate, 'feat', 'second')
    plt.plot(x, similarityUpperDiag)
    plt.vlines(timeconv.timeconv(similarSegmentsInd, rate, 'feat', 'second'),
               min(similarityUpperDiag), 1)
    gtmap = numpy.loadtxt(settings.DIR_LABELS + filename + '.csv', dtype=int, delimiter=',')
doc_theta = [random.dirichlet([alpha[m]] * K) for m in range(M)]

#beta = random.gamma(1, 1)
beta = random.beta(1, 1)
#corpus_phi = [ random.dirichlet([ beta ] * K) for k in range(K) ]
corpus_phi = [random.dirichlet([beta] * V) for k in range(K)]

# Network.
graph = networkx.Graph()
for i in range(M):
    for j in range(M):
        if i == j:
            continue
        s_ij = similarity.similarity(doc_theta[i], doc_theta[j], 'kld')
        s_ji = similarity.similarity(doc_theta[j], doc_theta[i], 'kld')
        if s_ij < T and s_ji < T:
            graph.add_edge(i, j)

E_size_comp = (M * (M - 1)) / 2
E_size_graph = len(graph.edges())
print('%d / %d - %f' % (E_size_graph, E_size_comp, float(E_size_graph) / E_size_comp))

with file(sys.argv[1] + '.edges', 'w') as opened:
    opened.write(json.dumps(graph.edges()))

docs = []
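# The 'kld' mode of similarity.similarity above is not shown. Since edges are added
# only when both directed values fall below the threshold T, the measure behaves like
# a divergence (lower means closer). A minimal sketch of a Kullback-Leibler divergence
# between two discrete topic distributions, offered purely as an assumption about what
# that mode might compute:
import numpy as np

def kld(p, q, eps=1e-12):
    # D(p || q) for discrete distributions; eps avoids log(0) and division by zero.
    p = np.asarray(p, dtype=float) + eps
    q = np.asarray(q, dtype=float) + eps
    p /= p.sum()
    q /= q.sum()
    return float(np.sum(p * np.log(p / q)))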
def distance(q1, q2):
    if q1 == q2:
        return 0
    return similarity(q1, q2)
#beta = random.gamma(1, 1)
beta = random.beta(1, 1)
#corpus_phi = [ random.dirichlet([ beta ] * K) for k in range(K) ]
corpus_phi = [random.dirichlet([beta] * V) for k in range(K)]

# Network.
graph = networkx.Graph()
for i in range(M):
    for j in range(M):
        if i == j:
            continue
        s_ij = similarity.similarity(doc_theta[i], doc_theta[j], 'kld')
        s_ji = similarity.similarity(doc_theta[j], doc_theta[i], 'kld')
        if s_ij < T and s_ji < T:
            graph.add_edge(i, j)

E_size_comp = (M * (M - 1)) / 2
E_size_graph = len(graph.edges())
print('%d / %d - %f' % (E_size_graph, E_size_comp, float(E_size_graph) / E_size_comp))

with file(sys.argv[1] + '.edges', 'w') as opened:
    opened.write(json.dumps(graph.edges()))

docs = []
def test_exact_similiarity(self):
    s = similarity('layer 4 neuron', 'layer 4 neuron', use_inter_similarity=False)
    self.assertEqual(s[0], 1.0)
    print filename
    # compute Mel filterbank features
    mel.compute_mfb(filename)
    # compute dynamic features
    dynfeat.compute_dynamic_features(filename)
    selecfeat.mutual_information_features(filename)
'''

for filename in filenames:
    # select a small set of features
    trainFilenames = filenames[:]  # copy the filename list
    trainFilenames.remove(filename)
    selectFeatures = selecfeat.select_features(filename, trainFilenames)

    # compute the full similarity matrix and the upper diagonal (frame to frame distance)
    similarityUpperDiag = similarity.similarity(filename)
    similarityMatrix = similarity.full_similarity(selectFeatures)

    # get the segment dates
    similarSegmentsInd = similarity.segments_indices(filename)

    # plot the upper diagonal
    nPoints = len(similarityUpperDiag)
    rate = numpy.load(settings.DIR_SAMPLE_RATE + filename + '.npy')
    x = timeconv.timeconv(numpy.arange(nPoints), rate, 'feat', 'second')
    plt.plot(x, similarityUpperDiag)
    plt.vlines(timeconv.timeconv(similarSegmentsInd, rate, 'feat', 'second'),
               min(similarityUpperDiag), 1)
    gtmap = numpy.loadtxt(settings.DIR_LABELS + filename + '.csv', dtype=int, delimiter=',')
    plt.vlines(gtmap[:, 0], min(similarityUpperDiag), 1, color='r')
    plt.title('Segmentation based on frame to frame similarity : ' + filename)
    plt.show()
plt.ylabel("Density") plt.subplot(1, 3, 3) wsm = np.mean(writing_score) ws_m = writing_score - wsm sns.distplot(writing_score, hist=True, kde=True, bins=int(100 / 10), color='red', hist_kws={'edgecolor': 'black'}, kde_kws={'linewidth': 1}) plt.xlabel("Writing score") plt.ylabel("Density") # %% sim = similarity(writing_score, reading_score, 'cor') print(sim) score_attribute = attributeNames[5:8] print(score_attribute) i = 0 j = 1 fig7, ax8 = plt.subplots() for att in range(3): ax8.arrow(0, 0, V[att, i], V[att, j]) ax8.text(V[att, i], V[att, j], score_attribute[att]) ax8.set_xlim([-1, 1]) ax8.set_ylim([-1, 1]) ax8.set_xlabel('PC' + str(i + 1)) ax8.set_ylabel('PC' + str(j + 1)) ax8.grid()
def w(s):
    sys.stdout.write(s)

if __name__ == "__main__":
    conn = sqlite3.connect(DATABASE)
    files = os.listdir(ROOT_DIR)[15:30]
    nb = len(files)
    for x in xrange(nb):
        f = open(os.path.join(ROOT_DIR, files[x]))
        count = len(json.loads(f.read())["tags"])
        print "%-3d - %s (%d tags)" % (x, files[x], count)
    w("\n\n")
    # Print the matrix.
    w("   ")
    for x in xrange(nb):
        w(" %5d" % x)
    w("\n")
    for x in xrange(nb):
        w("%3d" % x)
        fx = open(os.path.join(ROOT_DIR, files[x]))
        tx = json.loads(fx.read())
        for y in xrange(nb):
            fy = open(os.path.join(ROOT_DIR, files[y]))
            ty = json.loads(fy.read())
            sim = similarity.similarity(tx, ty, conn)
            w(" %5.2f" % sim)
        w("\n")
# exercise 3.2.2
import numpy as np
from similarity import similarity

# Generate two data objects with M random attributes
M = 5
x = np.mat(np.random.rand(1, M))
y = np.mat(np.random.rand(1, M))

# Two constants
a = 1.5
b = 1.5

# Check the statements in the exercise
print "Cosine scaling: %.4f " % (similarity(x, y, 'cos') - similarity(a * x, y, 'cos'))[0, 0]
print "ExtendedJaccard scaling: %.4f " % (similarity(x, y, 'ext') - similarity(a * x, y, 'ext'))[0, 0]
print "Correlation scaling: %.4f " % (similarity(x, y, 'cor') - similarity(a * x, y, 'cor'))[0, 0]
print "Cosine translation: %.4f " % (similarity(x, y, 'cos') - similarity(b + x, y, 'cos'))[0, 0]
print "ExtendedJaccard translation: %.4f " % (similarity(x, y, 'ext') - similarity(b + x, y, 'ext'))[0, 0]
print "Correlation translation: %.4f " % (similarity(x, y, 'cor') - similarity(b + x, y, 'cor'))[0, 0]
def closest_center_index(vector):
    """Get the index of the cluster center closest to `vector`."""
    similarity_to_vector = lambda center: similarity(center, vector)
    center = max(self.centers, key=similarity_to_vector)
    return self.centers.index(center)
import similarity

!curl 'http://www.gutenberg.org/files/11/11-0.txt' -o aliceText.txt

# take all words from alice and store them in memory
aliceFile = open("aliceText.txt")
wordCorpus = []
for line in aliceFile:
    # remove newlines
    line = line.strip().lower()
    # get words
    words = line.split(" ")
    for word in words:
        if word.isalnum():
            if word not in wordCorpus:
                wordCorpus.append(word)

print similarity.similarity("rabbi", wordCorpus)
def filter(self, s, id_set):
    """ Return the set of ids that need an alert. """
    word_list = self.tf_idf_hd.get_top_n_tf_idf(s)
    word_list_len = len(word_list)
    print "/".join(word_list)
    repeat_tid_set = set()
    ret_set = id_set
    for i in range(word_list_len):
        key_word_list = [self.word_key_pre + word for word in word_list]
        if word_list_len > 1:
            del key_word_list[i]
        tid_set_s = self.r_hd.sinter(key_word_list)
        tid_set = set([int(i) for i in tid_set_s])
        # tid_set holds the duplicate ids; merge it into the overall duplicate id set
        repeat_tid_set |= tid_set
    key_word_list = [self.word_key_pre + word for word in word_list]
    if repeat_tid_set:
        repeat_tid_list = list(repeat_tid_set)
        if word_list_len < self.sim_judge_limit:
            title_key_list = [
                self.title_id_pre + str(i) for i in repeat_tid_list
            ]
            # fetch all titles and compare their similarity
            title_list = self.r_hd.mget(title_key_list)
            idx = -1
            for title in title_list:
                idx += 1
                if similarity(s, title) > 0.5:
                    break
            if idx >= 0:
                l_id_set_s = self.r_hd.smembers(self.uid_pre + str(repeat_tid_list[idx]))
                l_id_set = set([int(i) for i in l_id_set_s])
                overtime_uid_set = self.check_for_uid_overtime(
                    repeat_tid_set, id_set)
                ret_set = (id_set - l_id_set) | overtime_uid_set
                self.update_uid_to_redis(repeat_tid_list, key_word_list, ret_set)
            else:
                # if there is no similar title, insert it
                self.insert_s_to_redis(s, key_word_list, ret_set)
        else:
            tid_uid_key_list = [
                self.uid_pre + str(tid) for tid in repeat_tid_set
            ]
            l_id_set_s = self.r_hd.sunion(tid_uid_key_list)
            l_id_set = set([int(i) for i in l_id_set_s])
            overtime_uid_set = self.check_for_uid_overtime(
                repeat_tid_set, id_set)
            print "overtime uid set:", overtime_uid_set
            print "repeat tid set:", repeat_tid_set
            ret_set = (id_set - l_id_set) | overtime_uid_set
            self.update_uid_to_redis(repeat_tid_list, key_word_list, ret_set)
    else:
        # if nothing matched at all, just insert it as new
        self.insert_s_to_redis(s, key_word_list, id_set)
    return ret_set
def closest_center_index(vector):
    """Get the index of the cluster center closest to `vector`."""
    similarity_to_vector = lambda center: similarity(center, vector)
    center = max(self.centers, key=similarity_to_vector)
    return self.centers.index(center)
#
# Project the centered data onto principal component space
K = centeredMatrix * V
#print np.size(K,0)
#print np.size(K,1)

#
# Compute variance explained by principal components
var = (S * S) / (S * S).sum()
print sum(var[:2]), "= variance explained by the first two principal components"

#
# This matrix is used to check the correlation
KTranspose = np.mat(K).T

#
# Checking for correlation
correlationMat = np.mat(similarity(KTranspose, KTranspose, 'cor'))
#print np.size(correlationMat,0)
#print np.size(correlationMat,1)
for i in range(len(correlationMat)):
    for j in range(len(correlationMat)):
        if i != j:
            if round(correlationMat[i, j]) == -1 or round(correlationMat[i, j]) == 1:
                print 'correlation between', i, 'and', j

#
# Plot the first Direction
f = plt.figure(1)
plt.title('Direction of First Component')
plt.plot([np.arange(13)], V[0, :], 'o', color='black')
plt.xlabel('Attributes')
plt.ylabel('Weights')
tm = TmgSimple(filename='../02450Toolbox_Python/Data/textDocs.txt',
               stopwords_filename='../02450Toolbox_Python/Data/stopWords.txt',
               stem=True)

# Extract variables representing data
X = tm.get_matrix(sort=True)
attributeNamesWithStop = tm.get_words(sort=True)

# Display the result
print('Now with stopwords !!!')
print attributeNamesWithStop
print X

"""
3.1.5 calculating similarity
Using the similarity lib.
"""
# q is our desired similarity query; the words are "solving", "rank" & "matrix"
q = np.matrix([0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0])
sim = similarity(X, q, 'cos')
print('Similarity results:\n {0}'.format(sim))

"""
3.2.1
"""
def test_inter_similiarity_PV(self):
    s = similarity("PV neuron", "fast-spiking neuron", use_inter_similarity=True)
    self.assertEqual(s[0], 0.9, "inter similarity works for PV and fast-spiking")
    s_reverse = similarity("fast-spiking neuron", "PV neuron")
    self.assertEqual(s_reverse[0], 0.9, "inter similarity works in both directions")
#!/usr/bin/env python
import time
import pandas as pd
from collections import OrderedDict
from similarity import similarity
from helpertools import helperTools
import sys

start_time = time.time()
obj1 = helperTools()
obj = similarity()
df = pd.read_csv("/home/hadoop/50review.csv")
domainThesaurus = OrderedDict({
    "SERVICE": ["SERVICE", "WAITER", "STAFF", "SERVER"],
    "ROOM": ["BED", "ROOM", "BATHROOM"],
    "SHOPPING": ["MALL", "SHOPPING", "STORE", "MARKET"],
    "CLEANLINESS": ["DIRTY", "GRUBBY", "CLEAN", "NEAT"],
    "FOOD": ["EAT", "DISHES", "DINNER", "FOOD", "BREAKFAST", "DELICIOUS",
             "MEAL", "RESTAURANT", "LUNCH"],
    "VALUE": ["PRICE", "CHEAP", "WORTH", "MONEY", "EXPENSIVE", "PAY"],
    "TRANSPORTATION": ["RELATEDWAY", "STOP", "TRANSPORTATION", "BUS"],
    "FAMILY/FRIENDS": ["MOTHER", "FRIEND", "FATHER", "FAMILY", "DAUGHTER",
                       "HUSBAND", "CHILD", "SON", "KID", "WIFE"],
    "LOCATION": ["FAR", "NEAR", "LACATION"],
    "VIEW": ["VIEW"],
# exercise 3.2.2
import numpy as np
from similarity import similarity

# Generate two data objects with M random attributes
M = 5
x = np.mat(np.random.rand(1, M))
y = np.mat(np.random.rand(1, M))

# Two constants
a = 1.5
b = 1.5

# Check the statements in the exercise
print("Cosine scaling: %.4f " % (similarity(x, y, 'cos') - similarity(a * x, y, 'cos'))[0, 0])
print("ExtendedJaccard scaling: %.4f " % (similarity(x, y, 'ext') - similarity(a * x, y, 'ext'))[0, 0])
print("Correlation scaling: %.4f " % (similarity(x, y, 'cor') - similarity(a * x, y, 'cor'))[0, 0])
print("Cosine translation: %.4f " % (similarity(x, y, 'cos') - similarity(b + x, y, 'cos'))[0, 0])
print("ExtendedJaccard translation: %.4f " % (similarity(x, y, 'ext') - similarity(b + x, y, 'ext'))[0, 0])
print("Correlation translation: %.4f " % (similarity(x, y, 'cor') - similarity(b + x, y, 'cor'))[0, 0])
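# A quick plain-NumPy check of the invariances this exercise probes, written here as an
# independent sketch that does not rely on the course toolbox's similarity(): cosine
# similarity is invariant to positive scaling but generally not to translation, while
# correlation is invariant to both. The helper names below are mine, not the toolbox's.
import numpy as np

def cos_sim(u, v):
    # Cosine similarity between two 1-D vectors.
    return float(np.dot(u, v) / (np.linalg.norm(u) * np.linalg.norm(v)))

def corr_sim(u, v):
    # Correlation is the cosine similarity of the mean-centred vectors.
    return cos_sim(u - u.mean(), v - v.mean())

u, v = np.random.rand(5), np.random.rand(5)
a, b = 1.5, 1.5
print(np.isclose(cos_sim(u, v), cos_sim(a * u, v)))    # True: cosine is scale-invariant
print(np.isclose(cos_sim(u, v), cos_sim(b + u, v)))    # generally False: cosine vs translation
print(np.isclose(corr_sim(u, v), corr_sim(a * u, v)))  # True: correlation vs scaling
print(np.isclose(corr_sim(u, v), corr_sim(b + u, v)))  # True: correlation vs translation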
# exercise 3.2.2
import numpy as np
from similarity import similarity

# Generate two data objects with M random attributes
M = 5
x = np.mat(np.random.rand(1, M))
y = np.mat(np.random.rand(1, M))

# Two constants
a = 1.5
b = 1.5

# Check the statements in the exercise
print "Cosine scaling: %.4f " % (similarity(x, y, "cos") - similarity(a * x, y, "cos"))[0, 0]
print "ExtendedJaccard scaling: %.4f " % (similarity(x, y, "ext") - similarity(a * x, y, "ext"))[0, 0]
print "Correlation scaling: %.4f " % (similarity(x, y, "cor") - similarity(a * x, y, "cor"))[0, 0]
print "Cosine translation: %.4f " % (similarity(x, y, "cos") - similarity(b + x, y, "cos"))[0, 0]
print "ExtendedJaccard translation: %.4f " % (similarity(x, y, "ext") - similarity(b + x, y, "ext"))[0, 0]
print "Correlation translation: %.4f " % (similarity(x, y, "cor") - similarity(b + x, y, "cor"))[0, 0]
i = 1

# Similarity: 'SMC', 'Jaccard', 'ExtendedJaccard', 'Cosine', 'Correlation'
similarity_measure = 'smc'

# Load the CBCL face database
# Load Matlab data file to python dict structure
X = loadmat('../Data/wildfaces_grayscale.mat')['X']
N, M = shape(X)

# Search the face database for similar faces
# Index of all other images than i
noti = range(0, i) + range(i + 1, N)
# Compute similarity between image i and all others
sim = similarity(X[i, :], X[noti, :], similarity_measure)
sim = sim.tolist()[0]
# Tuples of sorted similarities and their indices
sim_to_index = sorted(zip(sim, noti))

# Visualize query image and 5 most/least similar images
figure(figsize=(12, 8))
subplot(3, 1, 1)
imshow(np.reshape(X[i], (40, 40)).T, cmap=cm.gray)
xticks([])
yticks([])
title('Query image')
ylabel('image #{0}'.format(i))

for ms in range(5):
if __name__ == '__main__':
    p = [
        np.array([119.91277313232422, 252.8047332763672]),
        np.array([139.75482177734375, 284.2823181152344]),
        np.array([109.2437744140625, 257.5870056152344])
    ]
    dp = [[1, 2, 3], [4, 5, 6], [7, 8, 9]]
    # q = [trans(rot(pp, 10 * math.pi / 180), 2, 3) for pp in p]
    q = [
        np.array([374.65393066, 144.14517212]),
        np.array([161.24133301, 217.84576416]),
        np.array([99.27768707, 161.40586853])
    ]
    dq = [[1, 2, 3.1], [4, 5.1, 6], [7.1, 8, 9]]

    point_match, max_sim, sim_a, sim_r, sim_d = similarity(
        p, q, dp, dq, 1, 1, 0.4, True)

    print p
    print q
    print 'max sim -> {}'.format(max_sim)
    print 'sim angles : {}'.format(sim_a)
    print 'sim ratios : {}'.format(sim_r)
    print 'sim desc : {}'.format(sim_d)

    xp, yp = zip(*p)
    xq, yq = zip(*q)
    plt.plot(xp + (xp[0], ), yp + (yp[0], ), 'r-')
    plt.plot(xq + (xq[0], ), yq + (yq[0], ), 'b-')
    plt.xlim((min(xp + xq), max(xp + xq)))
    plt.ylim((min(yp + yq), max(yp + yq)))
    plt.gca().invert_yaxis()
    plt.show()
def test_inter_similiarity_PV(self):
    s = similarity('PV neuron', 'fast-spiking neuron', use_inter_similarity=True)
    self.assertEqual(s[0], 0.9, 'inter similarity works for PV and fast-spiking')
    s_reverse = similarity('fast-spiking neuron', 'PV neuron')
    self.assertEqual(s_reverse[0], 0.9, 'inter similarity works in both directions')
def testing_news_all_values(d, test_news, threshold_prob_fake, min_common_en,
                            component_selector, Dice_intersection__intensity):
    ## Initialising the count for the "fake news" prediction
    count_predictions_Fake = 0
    predictions_Fake = []
    predictions_Fake_value = []

    ## There must be at least n_tested_news tested news
    # Initialising the index in the list of test news to read
    len_test_news = len(test_news)
    # Initialising the count of valid tested news
    valid_tested_news = 0

    for i in range(len_test_news):
        ## Reading test news number i
        test_news_dict = test_news[i]

        ## Filtering the knowledge base graph through the test news
        knowledge_filtered, error_size_KF = knowledge_filtered_fake(d, min_common_en, test_news_dict)

        ## If the filtered knowledge graph is too small, skip to the next news item
        if error_size_KF == 1:
            predictions_Fake_value.append(float("nan"))
        ## Otherwise continue the process
        else:
            ## Appending the fake news to the filtered knowledge graph. The fake news is in the last position
            knowledge_filtered[test_news_dict["SOURCE"]] = test_news_dict

            ## A preliminary filter (on Entity Names and Related Words) must be applied to the
            # filtered knowledge graph that now includes the fake news
            rwords_news_min = 10
            rwords_en_min = 1
            knowledge_filtered = previous_filter(knowledge_filtered, rwords_news_min, rwords_en_min)

            ## Obtaining the similarity and dissimilarity matrices with the selected components
            # Fixing some basic parameters for the similarity measure
            optionSimbSim = "Ichino_yaguchi"
            gamma = 0.2  # In case of Ichino-Yaguchi similarity
            # Similarity calculations
            dis_matrix = similarity(knowledge_filtered, component_selector, optionSimbSim,
                                    Dice_intersection__intensity, gamma)

            ## Automatic selection of the epsilon and min_samples parameters for the DBSCAN algorithm
            epsilon, min_samples, error_parameters_DBSCAN = DBSCAN_parameters_epsilon_minsamples(dis_matrix)

            ## If the filtered knowledge graph is too small, skip to the next news item
            if error_parameters_DBSCAN == 1:
                predictions_Fake_value.append(float("nan"))
            ## Otherwise continue the process
            else:
                ## DBSCAN algorithm results
                dbscan_labels, dbscan_n_clusters, dbscan_n_noise, var_exp, label_fake, \
                    prob_fake = dbscan_clustering(dis_matrix, epsilon, min_samples, None, False)
                predictions_Fake_value.append(prob_fake)

                ## Deciding whether the test news is fake and adding it to the count of
                # "Fake" predictions
                if prob_fake > threshold_prob_fake:
                    count_predictions_Fake += 1
                    predictions_Fake.append(i)
                valid_tested_news += 1
                print("i = ", i, "; valid_tested_news = ", valid_tested_news)

    ## Computing the number of "NotFake" predictions
    count_predictions_NotFake = valid_tested_news - count_predictions_Fake

    return count_predictions_Fake, count_predictions_NotFake, predictions_Fake, predictions_Fake_value
    arr_ins_textValid.append(arr_ins_text)
    arr_ins_imgValid.append(arr_ins_img)
    arr_ins_timeValid.append(arr_ins_time)
    arr_labelValid.append(arr_label)

executor.shutdown()
endtime = datetime.datetime.now()
print(" Time Cost: ", (endtime - starttime))

config = tf.ConfigProto()
# config.gpu_options.allow_growth = True
config.gpu_options.per_process_gpu_memory_fraction = 0.15
# tf.reset_default_graph()
sess = tf.Session(config=config)
print("Loading Model", end="")
similarityV = similarity(batch_size1)
saver = tf.train.Saver(max_to_keep=3)
sess.run(tf.global_variables_initializer())
if loadModel:
    model_file = tf.train.latest_checkpoint('./model/')
    saver.restore(sess, model_file)
    print(" finally", end="")
print(" finished")


def npyCos(vector1, vector2):
    return np.dot(vector1, vector2) / (np.linalg.norm(vector1) * np.linalg.norm(vector2))
def calc_price_sim(price1, price2):
    return similarity(price1, price2)
def main(args):
    print('Start test')

    creds = ReadDictJson(args.credentails)
    if not creds:
        print('Failed to load credentials file {}. Exiting'.format(args.credentails))
        return False

    s3def = creds['s3'][0]
    s3 = s3store(s3def['address'],
                 s3def['access key'],
                 s3def['secret key'],
                 tls=s3def['tls'],
                 cert_verify=s3def['cert_verify'],
                 cert_path=s3def['cert_path'])

    trainingset = '{}/{}/'.format(s3def['sets']['trainingset']['prefix'], args.trainingset)
    print('Load training set {}/{} to {}'.format(s3def['sets']['trainingset']['bucket'], trainingset, args.trainingset_dir))
    s3.Mirror(s3def['sets']['trainingset']['bucket'], trainingset, args.trainingset_dir)

    trainingsetDescriptionFile = '{}/description.json'.format(args.trainingset_dir)
    trainingsetDescription = json.load(open(trainingsetDescriptionFile))

    config = {
        'batch_size': args.batch_size,
        'trainingset': trainingsetDescription,
        'input_shape': [args.training_crop[0], args.training_crop[1], args.train_depth],
        'classScale': 0.001,  # scale value for each product class
        'augment_rotation': 5.,  # Rotation in degrees
        'augment_flip_x': False,
        'augment_flip_y': True,
        'augment_brightness': 0.,
        'augment_contrast': 0.,
        'augment_shift_x': 0.0,  # in fraction of image
        'augment_shift_y': 0.0,  # in fraction of image
        'scale_min': 0.75,  # in fraction of image
        'scale_max': 1.25,  # in fraction of image
        'ignore_label': trainingsetDescription['classes']['ignore'],
        'classes': trainingsetDescription['classes']['classes'],
        'epochs': 1,
        'area_filter_min': 25,
        'weights': None,
        'channel_order': args.channel_order,
        's3_address': s3def['address'],
        's3_sets': s3def['sets'],
        'initialmodel': args.initialmodel,
        'training_dir': None,  # used by LoadModel
        'learning_rate': 1e-3,  # used by LoadModel
        'clean': True,
        'test_archive': trainingset,
        'run_archive': '{}{}/'.format(trainingset, args.initialmodel),
        'min': args.min,
    }

    trainingsetDescriptionFile = '{}/description.json'.format(args.trainingset_dir)
    trainingsetDescription = json.load(open(trainingsetDescriptionFile))

    strategy = None
    if args.strategy == 'mirrored':
        strategy = tf.distribute.MirroredStrategy(devices=args.devices)
    else:
        device = "/gpu:0"
        if args.devices is not None and len(args.devices) > 0:
            device = args.devices[0]
        strategy = tf.distribute.OneDeviceStrategy(device=device)

    # Prepare datasets for similarity computation
    objTypes = {}
    for objType in trainingsetDescription['classes']['objects']:
        if objType['trainId'] not in objTypes:
            objTypes[objType['trainId']] = copy.deepcopy(objType)
            # set name to category for objTypes and id to trainId
            objTypes[objType['trainId']]['name'] = objType['category']
            objTypes[objType['trainId']]['id'] = objType['trainId']

    results = {'class similarity': {}, 'config': config, 'image': []}
    for objType in objTypes:
        results['class similarity'][objType] = {'union': 0, 'intersection': 0}

    with strategy.scope():  # Apply training strategy
        model = LoadModel(config, s3)
        accuracy = tf.keras.metrics.Accuracy()

    # Display model
    model.summary()

    #train_dataset = input_fn('train', args.trainingset_dir, config)
    val_dataset = input_fn('val', args.trainingset_dir, config)

    trainingsetdesc = {}
    validationsetdec = {}
    for dataset in config['trainingset']['sets']:
        if dataset['name'] == 'val':
            validationsetdec = dataset
        if dataset['name'] == 'train':
            trainingsetdesc = dataset

    print("Begin inferences")
    dtSum = 0.0
    accuracySum = 0.0
    total_confusion = None
    iterator = iter(val_dataset)
    numsteps = int(validationsetdec['length'] / config['batch_size'])
    if config['min']:
        numsteps = min(args.min_steps, numsteps)

    try:
        for i in tqdm(range(numsteps)):
            image, annotation = iterator.get_next()
            initial = datetime.now()
            logits = model.predict(image, batch_size=config['batch_size'], steps=1)
            segmentation = tf.argmax(logits, axis=-1)
            dt = (datetime.now() - initial).total_seconds()
            dtSum += dt
            imageTime = dt / config['batch_size']
            for j in range(config['batch_size']):
                img = tf.squeeze(image[j]).numpy().astype(np.uint8)
                ann = tf.squeeze(annotation[j]).numpy().astype(np.uint8)
                seg = tf.squeeze(segmentation[j]).numpy().astype(np.uint8)

                accuracy.update_state(ann, seg)
                seg_accuracy = accuracy.result().numpy()
                accuracySum += seg_accuracy

                imagesimilarity, results['class similarity'], unique = jaccard(ann, seg, objTypes, results['class similarity'])

                confusion = tf.math.confusion_matrix(ann.flatten(), seg.flatten(), config['classes']).numpy().astype(np.int64)
                if total_confusion is None:
                    total_confusion = confusion
                else:
                    total_confusion += confusion

                results['image'].append({'dt': imageTime,
                                         'similarity': imagesimilarity,
                                         'accuracy': seg_accuracy.astype(float),
                                         'confusion': confusion.tolist()})
    except Exception as e:
        print("Error: test exception {} step {}".format(e, i))
        numsteps = i
    except:
        print("Error: test exception step {}".format(i))
        numsteps = i

    num_images = numsteps * config['batch_size']
    average_time = dtSum / num_images
    average_accuracy = accuracySum / num_images

    sumIntersection = 0
    sumUnion = 0
    sumAccuracy = 0.0
    dataset_similarity = {}
    for key in results['class similarity']:
        intersection = results['class similarity'][key]['intersection']
        sumIntersection += intersection
        union = results['class similarity'][key]['union']
        sumUnion += union
        class_similarity = similarity(intersection, union)

        # convert to int from int64 for json.dumps
        dataset_similarity[key] = {'intersection': int(intersection),
                                   'union': int(union),
                                   'similarity': class_similarity}

    results['class similarity'] = dataset_similarity
    total_similarity = similarity(sumIntersection, sumUnion)

    now = datetime.now()
    date_time = now.strftime("%m/%d/%Y, %H:%M:%S")
    test_summary = {'date': date_time, 'model': config['initialmodel']}
    test_summary['accuracy'] = average_accuracy
    test_summary['class_similarity'] = dataset_similarity
    test_summary['similarity'] = total_similarity
    test_summary['confusion'] = total_confusion.tolist()
    test_summary['images'] = num_images
    test_summary['image time'] = average_time
    test_summary['batch size'] = config['batch_size']
    test_summary['test store'] = s3def['address']
    test_summary['test bucket'] = s3def['sets']['trainingset']['bucket']
    test_summary['results'] = results

    print("Average time {}".format(average_time))
    print('Similarity: {}'.format(dataset_similarity))

    # If there is a way to lock this object between read and write, it would prevent the possibility of losing data
    training_data = s3.GetDict(s3def['sets']['trainingset']['bucket'], config['test_archive'] + args.tests_json)
    if training_data is None:
        training_data = []
    training_data.append(test_summary)
    s3.PutDict(s3def['sets']['trainingset']['bucket'], config['test_archive'] + args.tests_json, training_data)

    test_url = s3.GetUrl(s3def['sets']['trainingset']['bucket'], config['test_archive'] + args.tests_json)
    print("Test results {}".format(test_url))
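# The similarity(intersection, union) calls above are not shown in this excerpt. Given
# that per-class pixel intersections and unions are accumulated by jaccard(), a minimal
# sketch of what that helper might compute, assuming it is the intersection-over-union
# (Jaccard index); this is an illustration, not the project's actual implementation:
def similarity(intersection, union):
    # Jaccard index (IoU): overlapping pixel count divided by the union pixel count.
    # Guard against empty classes so the division never fails.
    if union == 0:
        return 0.0
    return float(intersection) / float(union)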
def calc_ordering_sim(ordering1, ordering2, matrix):
    return similarity(ordering1, ordering2, matrix)
# Image to use as query
i = 1

# Similarity: 'SMC', 'Jaccard', 'ExtendedJaccard', 'Cosine', 'Correlation'
similarity_measure = 'smc'

# Load the CBCL face database
# Load Matlab data file to python dict structure
X = loadmat('../Data/wildfaces_grayscale.mat')['X']
N, M = shape(X)

# Search the face database for similar faces
# Index of all other images than i
noti = range(0, i) + range(i + 1, N)
# Compute similarity between image i and all others
sim = similarity(X[i, :], X[noti, :], similarity_measure)
sim = sim.tolist()[0]
# Tuples of sorted similarities and their indices
sim_to_index = sorted(zip(sim, noti))

# Visualize query image and 5 most/least similar images
figure(figsize=(12, 8))
subplot(3, 1, 1)
img_hw = int(sqrt(len(X[0])))
imshow(np.reshape(X[i], (img_hw, img_hw)).T, cmap=cm.gray)
xticks([])
yticks([])
title('Query image')
ylabel('image #{0}'.format(i))
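# The 'smc' measure selected above is the simple matching coefficient. As a reference,
# here is a minimal sketch of SMC between two binary vectors; the toolbox's similarity()
# presumably binarizes the grayscale images first, but that is an assumption, and the
# helper below is my own illustration rather than the toolbox code:
import numpy as np

def smc(x, y):
    # Simple Matching Coefficient: fraction of positions where the two binary
    # vectors agree (both 1 or both 0).
    x = np.asarray(x).ravel()
    y = np.asarray(y).ravel()
    return float(np.sum(x == y)) / len(x)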
def calc_cuisine_sim(cuisine1, cuisine2, matrix):
    return similarity(cuisine1, cuisine2, matrix)
for i in range(0, len(file_name)):
    for j in range(0, len(file_name)):
        if i != j:
            sentList1 = file_text[i]
            sentList2 = file_text[j]
            print(file_name[i], "&", file_name[j])
            for sentList1Text in sentList1:
                max_sim = 0  # best similarity for this sentence (renamed from `max` to avoid shadowing the builtin)
                sim = 0
                comparedStatement = None
                for sentList2Text in sentList2:
                    sim = similarity(sentList1Text, sentList2Text)
                    if sim > 0.5:
                        if max_sim < sim:
                            max_sim = sim
                            comparedStatement = sentList2Text
                print(sentList1Text, "&", comparedStatement)
                print(max_sim)
                simList.append(max_sim)
            simListArr = np.array(simList)
            print("Similarity:", np.sqrt(np.mean(simListArr**2)))
            print("")

# Calculate execution time
end = time.time()
dur = end - start
    np.array([119.91277313232422, 252.8047332763672]),
    np.array([139.75482177734375, 284.2823181152344]),
    np.array([109.2437744140625, 257.5870056152344])
]
dp = [[1, 2, 3], [4, 5, 6], [7, 8, 9]]
# q = [trans(rot(pp, 10 * math.pi / 180), 2, 3) for pp in p]
q = [
    np.array([374.65393066, 144.14517212]),
    np.array([161.24133301, 217.84576416]),
    np.array([99.27768707, 161.40586853])
]
dq = [[1, 2, 3.1], [4, 5.1, 6], [7.1, 8, 9]]

point_match, max_sim, sim_a, sim_r, sim_d = similarity(
    p, q, dp, dq, 1, 1, 0.4, True
)

print p
print q
print 'max sim -> {}'.format(max_sim)
print 'sim angles : {}'.format(sim_a)
print 'sim ratios : {}'.format(sim_r)
print 'sim desc : {}'.format(sim_d)

xp, yp = zip(*p)
xq, yq = zip(*q)
plt.plot(xp + (xp[0],), yp + (yp[0],), 'r-')
plt.plot(xq + (xq[0],), yq + (yq[0],), 'b-')
plt.xlim((min(xp + xq), max(xp + xq)))
plt.ylim((min(yp + yq), max(yp + yq)))