def calculate_asymmetry(frame, training_data, headpose_data):
    """Accumulate a left/right facial asymmetry score over every frame.

    For each frame in ``training_data`` and each mirrored landmark pair, both
    landmarks are measured against the reference landmark (index 12) with
    ``distance`` and the absolute difference of the two distances is added to
    the running total.

    The incoming ``frame`` value is overwritten by the frame loop, and
    ``headpose_data`` is only referenced by the disabled head-pose mapping,
    so neither argument influences the result.

    Returns the accumulated score (0 when ``training_data`` is empty).
    """
    # Mirrored feature pairs, left to right across the face.  Every index is
    # one less than its documented landmark number because it indexes the
    # data array directly.
    # NOTE(review): [24, 29] and [41, 39] each appear twice and are therefore
    # counted twice -- confirm this is intended.
    mirrored_pairs = [
        [4, 5], [3, 6], [2, 7], [1, 8], [0, 9],  # Eyebrows
        [19, 28], [20, 27], [21, 26], [22, 25], [23, 30], [24, 29],  # Eyes
        [14, 18], [24, 29],  # Nose
        [31, 37], [32, 36], [33, 35], [42, 38], [41, 39], [41, 39],
        [43, 45], [46, 48],  # Lips
    ]
    reference_index = 12
    total = 0
    for frame in range(len(training_data)):
        landmarks = training_data[frame]
        for left_index, right_index in mirrored_pairs:
            left_point = landmarks[left_index]
            right_point = landmarks[right_index]
            anchor = landmarks[reference_index]
            left_gap = distance(left_point, anchor)
            right_gap = distance(right_point, anchor)
            total += abs(left_gap - right_gap)
    return total
def test_points_near_linf(self):
    """Check a bounded L-infinity k-d tree query against brute force.

    Every finite hit must match the directly computed distance and lie
    within the bound, and the number of hits must equal the number of data
    points found within the bound by brute force.
    """
    query_point = self.x
    bound = self.d
    found_dists, found_idxs = self.kdtree.query(
        query_point, k=self.kdtree.n, p=np.inf, distance_upper_bound=bound)
    tolerance = 1e-8
    hit_count = 0
    for near_d, near_i in zip(found_dists, found_idxs):
        # Infinite distances mark unused result slots; skip them.
        if near_d != np.inf:
            hit_count += 1
            assert_almost_equal(
                near_d, distance(query_point, self.data[near_i], np.inf))
            assert_(near_d < bound + tolerance,
                    "near_d=%g should be less than %g" % (near_d, bound))
    brute_force = np.sum(
        distance(self.data, query_point, np.inf) < bound + tolerance)
    assert_equal(brute_force, hit_count)
def NewNodeEdgeDistibutions(graphs):
    """Save per-k-shell node and intra-edge distributions for each graph.

    For every entry of ``graphs`` the vertices are grouped by coreness, the
    vertex and edge counts of each shell's simplified subgraph are
    normalised to sum to one, and both vectors are appended to output files
    whose names are prefixed with ``sys.argv[1]``.  Existing output files
    are removed first.  Afterwards ``distance`` is run on both files and the
    per-graph computation times are saved.

    :param graphs: mapping of graph name to graph object.
    """
    # node + intra-edge
    nodedistnfilename = sys.argv[1] + "_NewNodeDistnProbabilities.txt"
    intraedgedistnfilename = sys.argv[1] + "_NewIntraEdgeDistnProbabilities.txt"
    for stale_file in (nodedistnfilename, intraedgedistnfilename):
        if os.path.exists(stale_file):
            os.remove(stale_file)

    computationtime = {}
    for g in graphs:
        print('Processing graph:', g)
        print("Extracting Node _ IntraEdge Distn Probabilities: %s" % g)
        graph = graphs[g]
        start_time = time.time()

        coreness = GraphBase.coreness(graph)
        highestcore = max(coreness)
        # One bucket of vertices per coreness value 0..highestcore.
        kshell = [[] for _ in range(highestcore + 1)]
        for vertex in graph.vs:
            kshell[coreness[vertex.index]].append(vertex)

        nodedistn = []
        intraedgedistn = []
        for shell_members in kshell:
            subgraph = graph.subgraph(shell_members)
            subgraph.simplify()
            nodedistn.append(float(subgraph.vcount()))
            intraedgedistn.append(float(subgraph.ecount()))

        # An all-zero distribution is saved as an empty list.
        node_total = sum(nodedistn)
        if node_total != 0:
            normalizednodedistn = [x / node_total for x in nodedistn]
        else:
            normalizednodedistn = []
        edge_total = sum(intraedgedistn)
        if edge_total != 0:
            normalizedintraedgedistn = [x / edge_total for x in intraedgedistn]
        else:
            normalizedintraedgedistn = []

        computationtime[g] = time.time() - start_time
        saveFeature(g, normalizednodedistn, nodedistnfilename)
        saveFeature(g, normalizedintraedgedistn, intraedgedistnfilename)

    distance(nodedistnfilename)
    distance(intraedgedistnfilename)
    saveDict(computationtime, sys.argv[1] + "_NewNCKDTimings.txt")
    return
def _compute_df(self, feature, timeline, distance): """Pre-compute feature distance dendrogram Parameters ---------- feature : timeline : Temporal units distance : """ # initialize feature distance matrix with zeros n = len(timeline) M = np.zeros((n, n)) for s, segment in enumerate(timeline): s_feature = feature[segment] for t in range(s+1, n): t_feature = feature[timeline[t]] # feature distance between two segments M[s, t] = distance(s_feature, t_feature) # feature distance is symmetric M[t, s] = M[s, t] y = scipy.spatial.distance.squareform(M, checks=False) df = scipy.cluster.hierarchy.complete(y) return df
def generate_distance_matrix(dist, peak_idxs, mode_dists, method='euclidean'):
    """-------------------------------------------------------------------------
    Builds the matrix of distances between the input distribution, shifted to
    each tonic candidate, and each candidate mode distribution. Entry [i][j]
    compares the distribution shifted by peak_idxs[i] with mode_dists[j].
    ----------------------------------------------------------------------------
    dist       : Input distribution that is to be estimated
    peak_idxs  : List of indices of dist's peaks (the tonic candidates)
    mode_dists : List of candidate mode distributions
    method     : The distance method to be used, passed on to distance()
    -------------------------------------------------------------------------"""
    distances = np.zeros((len(peak_idxs), len(mode_dists)))
    for row, peak_idx in enumerate(peak_idxs):
        # One shifted copy of the input per tonic candidate.
        shifted = dist.shift(peak_idx)
        for col, mode_dist in enumerate(mode_dists):
            distances[row][col] = distance(shifted.vals, mode_dist.vals,
                                           method=method)
    return np.array(distances)
def find_nearest_connector(self, detection_coord):
    """
    Find the stored connector closest to ``detection_coord``.

    Only blocks intersecting the box of half-width SEARCH_RADIUS around the
    (integer-truncated) coordinate are searched.  If none of them holds a
    connector, a default ConnectorInfo and a distance of 9999999.0 are
    returned.

    Returns:
        nearest_connector, distance to the nearest connector
    """
    center = detection_coord.astype(int)
    search_roi = (center - self.SEARCH_RADIUS, center + self.SEARCH_RADIUS)

    # Gather the connectors of every known block that intersects the ROI.
    candidates = []
    for block_start in map(tuple,
                           getIntersectingBlocks(self._blockshape, search_roi)):
        if block_start in self._blocks:
            candidates += self._blocks[block_start]

    def separation(conn):
        # Euclidean distance from the detection to the given connector.
        return scipy.spatial.distance.euclidean(
            (conn.x_nm, conn.y_nm, conn.z_nm), detection_coord)

    if not candidates:
        # No connectors nearby. Emit default values.
        return ConnectorInfo(-1, -1, -1, -1, [], []), 9999999.0

    closest = min(candidates, key=separation)
    return closest, separation(closest)
def test_consistency_with_neighbors(self):
    """Check the sparse distance matrix against ``query_ball_tree``.

    Every neighbour pair must carry the directly computed distance in the
    matrix, and every stored matrix entry must be a reported neighbour pair.
    """
    sparse = self.T1.sparse_distance_matrix(self.T2, self.r)
    neighbours = self.T1.query_ball_tree(self.T2, self.r)
    for i, matches in enumerate(neighbours):
        for j in matches:
            expected = distance(self.T1.data[i], self.T2.data[j])
            assert_equal(sparse[i, j], expected)
    for (i, j), _value in sparse.items():
        assert_(j in neighbours[i])
def nearest_neighbors(self, item_ratings, distance=distance.euclidean, limit=5):
    """Return the ``limit`` stored rating rows closest to the given ratings.

    ``item_ratings`` is converted to a user vector and compared with every
    row of the ``'ratings'`` dataset in the HDF5 file ``self.h5filename``
    using ``distance``.  The result is a list of ``(row_id, distance)``
    tuples sorted by ascending distance, where ``row_id`` is the 1-based row
    number as a string.
    """
    with h5py.File(self.h5filename, 'r') as model:
        user_vector = self.__item_rating_dictionary_to_user_vector(item_ratings)
        ratings = model['ratings']
        distances = {}
        for row in xrange(len(ratings)):
            row_id = str(row + 1)
            distances[row_id] = distance(user_vector, ratings[row])
        ranked = sorted(distances.iteritems(), key=operator.itemgetter(1))
        return ranked[:limit]
def testFunction(v):
    """Return the summed squared distance of every item to ``(item . v) * v``.

    Each item of the module-level ``data_set_done`` is wrapped into a
    one-row array, multiplied with ``v`` via ``np.dot`` and scaled back by
    ``v``; the squared ``distance`` between the row and that product is
    accumulated.
    """
    total = 0.0
    for item in data_set_done:
        row = np.array([item])
        # np.dot(row, v) is expected to be a constant; scaling v by it
        # gives the vector the row is compared against.
        projection = np.dot(row, v) * v
        total = total + distance(row, projection) ** 2
    return total
def data_clustering(data, distance=Euclidean, linkage=AVERAGE):
    """
    Cluster the rows of `data` hierarchically.

    The distance matrix is obtained by calling `distance` on the data and is
    then handed to `dist_matrix_clustering`.

    :param Orange.data.Table data: Data set to cluster.
    :param Orange.distance.Distance distance: A distance measure.
    :param str linkage: Linkage passed on to `dist_matrix_clustering`.
    """
    return dist_matrix_clustering(distance(data), linkage=linkage)
def feature_clustering(data, distance=PearsonR, linkage=AVERAGE):
    """
    Cluster the columns of `data` hierarchically.

    The distance matrix is obtained by calling `distance` on the data with
    ``axis=0`` and is then handed to `dist_matrix_clustering`.

    :param Orange.data.Table data: Data set to cluster.
    :param Orange.distance.Distance distance: A distance measure.
    :param str linkage: Linkage passed on to `dist_matrix_clustering`.
    """
    column_distances = distance(data, axis=0)
    return dist_matrix_clustering(column_distances, linkage=linkage)
def test_distance_matrix():
    """``distance_matrix`` must agree entry-wise with pairwise ``distance``."""
    rows, cols, dims = 10, 11, 4
    xs = np.random.randn(rows, dims)
    ys = np.random.randn(cols, dims)
    ds = distance_matrix(xs, ys)
    assert_equal(ds.shape, (rows, cols))
    for i, x_point in enumerate(xs):
        for j, y_point in enumerate(ys):
            assert_almost_equal(distance(x_point, y_point), ds[i, j])
def centroide(matrix, dim):
    """Return the centroid of ``matrix`` and the second value of ``distance``.

    The centroid is a ``(1, dim)`` array holding, for each of the first
    ``dim`` rows of ``matrix``, the mean over its columns.  ``distance`` is
    then called with the matrix and the centroid; the second element of its
    result is returned alongside the centroid.
    """
    column_count = matrix.shape[1]
    coord = np.zeros(shape=(1, dim))
    for axis in range(dim):
        for column in range(column_count):
            coord[0][axis] += matrix[axis][column]
    coord = coord * 1.0 / column_count
    _dist, r = distance(matrix, coord)
    return coord, r
def similarity(model,phr1,phr2,opts={}):
    """Return ``1 - distance(model, phr1, phr2, opts)``.

    The conversion is the same for every metric.  Disabled metric-specific
    variants (kept in the original as commented-out code) were
    ``1 - (d + 1) / 2`` for correlation and ``1 / (1 + d)`` for euclidean
    and seuclidean.
    """
    return 1 - distance(model, phr1, phr2, opts)
def SGD(alpha, is_base_model, model_file, is_batch_mode):
    """Fit the weight matrix ``W`` by stochastic gradient steps.

    Each iteration samples one row (or, in batch mode, a set of rows chosen
    by ``random_pick_data_vectors_100``), evaluates ``gradient`` and moves
    ``W`` by ``alpha`` times that gradient.  Training stops after 100000
    iterations, or earlier once more than ``count_threshold`` consecutive
    updates have moved ``W`` by less than ``threshold`` (1 in batch mode,
    0.01 otherwise) according to ``distance``.

    :param alpha: step size.
    :param is_base_model: forwarded to ``readInMatrix``.
    :param model_file: forwarded to ``readInMatrix``.
    :param is_batch_mode: sample a batch of rows per step instead of one.
    :return: the weight matrix of shape ``(5, feature_vector_length)``.
    """
    matrix, y_label = readInMatrix(model_file, is_base_model)
    lambda_regularization = 0.05
    threshold = 1 if is_batch_mode else 0.01
    W = np.zeros((5, feature_vector_length))
    matrix_csr = matrix.tocsr()
    lines_train_file = matrix.shape[0]
    count_threshold = 5
    count = 0  # consecutive updates smaller than ``threshold``
    for iteration in range(100000):
        if iteration % 10000 == 0:
            print(iteration)
        if not is_batch_mode:
            random_row = random_pick_data_vector(0, lines_train_file - 1)
            x_i = matrix_csr.getrow(random_row)
            y_c = y_label[random_row]
        else:
            random_100_rows = random_pick_data_vectors_100(0, lines_train_file - 1)
            x_i = matrix_csr[random_100_rows, :]
            y_c = y_label[random_100_rows]
        new_l_splash = gradient(y_c, W, x_i, lambda_regularization)
        newW = W + alpha * new_l_splash
        if iteration > 0:
            dist = distance(newW, W)
            if dist < threshold:
                if count > count_threshold:
                    break
                # Track the streak in ``count``.  The previous code
                # incremented ``count_threshold`` instead, so the break
                # above could never be reached.
                count += 1
            else:
                count = 0
        W = newW
    return W
def mode_estimate(dist, mode_dists, distance_method='euclidean', metric='pcd', step_size=7.5):
    """-------------------------------------------------------------------------
    Compares the recording's distribution with each candidate mode with respect
    to the given tonic and returns the resultant distance vector to higher level
    functions. Here the input distribution is expected to be aligned according
    to the tonic and tonic isn't explicitly used in this function. This is a
    wrapper function that handles the required preliminary tasks and calls
    generate_distance_matrix() accordingly.
    ----------------------------------------------------------------------------
    dist            : Distribution of the input recording
    mode_dists      : List of PitchDistribution objects. These are the model
                      pitch distributions of candidate modes.
    distance_method : The choice of distance method. See the full list at
                      distance()
    metric          : Whether PCD ('pcd') or PD ('pD') is used. Any other value
                      raises ValueError.
    step_size       : The step-size of the pitch distribution. Unit is cents
    -------------------------------------------------------------------------"""
    # TODO: step_size and pD/pcd information can be retrieved from the dist
    # object; try and test that.

    if metric == 'pcd':
        # There are no preliminaries, simply generate the distance vector.
        return np.array(generate_distance_matrix(dist, [0], mode_dists,
                                                 method=distance_method))[0]

    if metric == 'pD':
        distance_vector = np.zeros(len(mode_dists))
        # For each trial, a new instance of PitchDistribution is created and
        # its attributes are copied from dist, because it needs to be zero
        # padded according to the current mode distribution length. The
        # entries of the vector are generated one by one.
        for i, mode_dist in enumerate(mode_dists):
            trial = pD.PitchDistribution(dist.bins, dist.vals,
                                         kernel_width=dist.kernel_width,
                                         source=dist.source,
                                         ref_freq=dist.ref_freq,
                                         segment=dist.segmentation)
            trial, mode_trial = pd_zero_pad(trial, mode_dist,
                                            step_size=step_size)
            distance_vector[i] = distance(trial, mode_trial,
                                          method=distance_method)
        return distance_vector

    # Previously an unknown metric reached the return statement with
    # distance_vector unbound and failed with an UnboundLocalError.
    raise ValueError("metric must be 'pcd' or 'pD', got %r" % (metric,))
def test_distance_vectorization():
    """``distance`` must broadcast (10, 1, 3) against (1, 7, 3) to (10, 7)."""
    np.random.seed(1234)
    left = np.random.randn(10, 1, 3)
    right = np.random.randn(1, 7, 3)
    result_shape = distance(left, right).shape
    assert_equal(result_shape, (10, 7))
def test_distance_l1():
    """With a third argument of 1, [0, 0] and [1, 1] are 2 apart."""
    origin, corner = [0, 0], [1, 1]
    assert_almost_equal(distance(origin, corner, 1), 2)
def test_found_all(self):
    """Every point of T2 missing from a ball-tree result must be far away.

    For each point of the first data set, all points of the second set that
    ``query_ball_tree`` did not report must lie at distance of at least
    ``d / (1 + eps)``.
    """
    results = self.T1.query_ball_tree(self.T2, self.d, p=self.p, eps=self.eps)
    for i, reported in enumerate(results):
        not_reported = np.ones(self.T2.n, dtype=bool)
        not_reported[reported] = False
        gaps = distance(self.data2[not_reported], self.data1[i], self.p)
        assert_(np.all(gaps >= self.d / (1. + self.eps)))
def test_found_all(self):
    """Every point missing from a ball-point result must be far away.

    All data points that ``query_ball_point`` did not report must lie at
    distance of at least ``d / (1 + eps)`` from the query point.
    """
    not_reported = np.ones(self.T.n, dtype=bool)
    reported = self.T.query_ball_point(self.x, self.d, p=self.p, eps=self.eps)
    not_reported[reported] = False
    gaps = distance(self.data[not_reported], self.x, self.p)
    assert_(np.all(gaps >= self.d / (1. + self.eps)))
def simulate_step(self, action=None, loc=None, probability=1):
    """Simulate one move on the grid and compute its reward.

    :param action: action code; when ``None`` it is taken from
        ``self.brain.get_action()``.  0-3 move by one cell along the first
        or second grid axis, 4-6 leave the position unchanged.
    :param loc: position to move from; defaults to ``self.location``.
    :param probability: not used by this method.
    :return: ``(loc, action, r, done)`` -- the resulting position, the
        action finally applied (4 when the move ran into an occupied cell),
        the reward and whether ``self.grid.have_met()`` reported a meeting.

    NOTE(review): ``loc`` is modified in place, so when it defaults to
    ``self.location`` the agent's stored location changes as well (and is
    not restored when the move is blocked) -- confirm callers expect this.
    NOTE(review): the source layout was lost; the wall-penalty ``else`` is
    assumed to pair with ``if not wall`` -- confirm.
    """
    done = False
    wall = False
    # ``is None`` rather than ``== None``: equality against an array-valued
    # location is evaluated elementwise and cannot be used as a condition.
    if action is None:
        action = self.brain.get_action()
    if loc is None:
        loc = self.location
    last_loc = deepcopy(loc)

    if action == 0:  # Go up
        loc[0] = loc[0] - 1
    elif action == 1:
        loc[1] = loc[1] + 1
    elif action == 2:
        loc[0] = loc[0] + 1
    elif action == 3:
        loc[1] = loc[1] - 1
    # Actions 4, 5 and 6 do not move (communication is disabled).

    # Clamp the position to the grid.
    if loc[0] <= 0:
        loc[0] = 0
    if loc[1] <= 0:
        loc[1] = 0
    if loc[0] > self.grid.size1 - 1:
        loc[0] = self.grid.size1 - 1
    # NOTE(review): the second axis is also clamped with ``size1``.
    if loc[1] > self.grid.size1 - 1:
        loc[1] = self.grid.size1 - 1

    # A positive map value blocks the move: fall back to the saved position.
    if self.grid.Map[loc[0], loc[1]] > 0:
        loc = last_loc
        action = 10
        wall = True

    #### The reward
    r = 0  # total reward for step
    k = 50  # Gain on distance
    k2 = -.2  # Gain on the gradient
    if self.meeting_point is None:
        r += -1
    else:
        r += k / (
            distance([self.location], [self.meeting_point]).squeeze() + 1)

    if action is None:
        action = self.last_action

    if not wall:
        if action == 6:
            r -= .03
        elif action == 5:
            r -= .03
        elif action == 4:
            r += -.01
        else:
            r += -.03
        if self.grid.have_met() == True:
            r += 50
            done = True
    else:
        r += -1
        action = 4

    # The gradient contribution to the reward.
    r_fromgrad = k2 * (self.grid.Map[loc[0], loc[1]])
    r += r_fromgrad
    return loc, action, r, done
def compute_distance(_list, distance):
    """Return ``distance(a, b)`` for every pair of elements of ``_list``.

    Pairs are taken with ``a`` before ``b`` in list order and reported in
    that order: first all pairs starting at element 0, then at element 1,
    and so on.  Fewer than two elements give an empty list.
    """
    results = []
    for position, first in enumerate(_list[:-1]):
        for second in _list[position + 1:]:
            results.append(distance(first, second))
    return results
def min_dist_label_and_model(self):
    """Call ``distance`` on every location row of ``self.data``.

    The rows consist of the Latitude, Longitude, Country, Province and City
    columns.  The results of ``distance`` are discarded and nothing is
    returned.
    """
    columns = ['Latitude', 'Longitude', 'Country', 'Province', 'City']
    features_data = np.array(self.data[columns])
    for row in features_data:
        distance(row)
# fix the leanring rate problem learning_rate = 1.0/math.sqrt(index) index = index + 1 tmp = data_set_done[i] # convert to right form tmp = np.array([tmp]) #print "dataset ", tmp # array([[]]) #print "tranpose xt is ", np.transpose(tmp) #print "dot result ", np.dot(np.transpose(tmp),tmp) b = np.dot(np.transpose(tmp),tmp) b = np.dot(b,v1) # result will be a 110 * 1 vector #print "times learning plus ", tmp + learning_rate * b v1 = v1 + learning_rate * b #print "v1 only sum ", v1 v1 = normalizeVector(v1) if i % 400 == 0: tPoint.append(i) test = testFunction(v1) print "test " ,test testResult.append(test) print testResult plt.plot(tPoint, testResult, '-') #plt.axis([0, len(data_set_done), 0, 5000]) plt.ylabel('test function result') plt.show() print "global v1, ", v1 dis = distance(prev_v,v1) print "distance ", dis if dis <= 0.001: break prev_v = v1
## Se itera sobre corpus, frases train_output={} # [Pseudo: 4 ] Por cada corpus de entrenamiento for (filename, phrases) in train_data: filename_old=filename.replace('input', 'gs') train_output[filename_old]=[] # [Pseudo: 4.a ] Por cada frase de corpos de entrenamiento for phr1,phr2 in phrases: # [Pseudo: 4.a.i ] Preprocesamiento phr1,phr2=preprocessing(phr1,phr2,opts) # [Pseudo: 4.a.ii ] Sumar vectores frase uno # [Pseudo: 4.a.iii ] Sumar vectores frase dos # [Pseudo: 4.a.iv ] Calcular distancia num=distance(model,phr1,phr2,opts) train_output[filename_old].append([num]) # [Pseudo: 5 ] Entrenar regresor verbose('Training model') if opts.method=="svr": method = train_model_srv(train_gs, train_output,args={'kernel':'rbf'}) ## PARA AGREGAR UN MÉTODO MÁS # if opts.method == "nombre": # method = train_model_nombre(train_gs, train_output,args={'kernel':'rbf'}) # train_model_nombre tienen que estar en utils filenames_sys=[] distances=[] # [Pseudo: 6 ] Por cada corpus de prueba
def test_distance_vectorization():
    """``distance`` must broadcast (10, 1, 3) against (1, 7, 3) to (10, 7)."""
    left = np.random.randn(10, 1, 3)
    right = np.random.randn(1, 7, 3)
    result_shape = distance(left, right).shape
    assert_equal(result_shape, (10, 7))
def test_distance_linf():
    """With a third argument of infinity, [0, 0] and [1, 1] are 1 apart."""
    origin, corner = [0, 0], [1, 1]
    assert_almost_equal(distance(origin, corner, np.inf), 1)
def test_distance_l1():
    """With a third argument of 1, [0, 0] and [1, 1] are 2 apart."""
    origin, corner = [0, 0], [1, 1]
    assert_almost_equal(distance(origin, corner, 1), 2)
def test_distance_l2():
    """With a third argument of 2, [0, 0] and [1, 1] are sqrt(2) apart."""
    origin, corner = [0, 0], [1, 1]
    assert_almost_equal(distance(origin, corner, 2), np.sqrt(2))
def test_found_all(self):
    """Every point missing from a ball-point result must be far away.

    All data points that ``query_ball_point`` did not report must lie at
    distance of at least ``d / (1 + eps)`` from the query point.
    """
    # Use the builtin ``bool``: the ``np.bool`` alias was deprecated and
    # later removed from NumPy, where it raises AttributeError.
    not_reported = np.ones(self.T.n, dtype=bool)
    reported = self.T.query_ball_point(self.x, self.d, p=self.p, eps=self.eps)
    not_reported[reported] = False
    gaps = distance(self.data[not_reported], self.x, self.p)
    assert_(np.all(gaps >= self.d / (1. + self.eps)))
test_file = ['test1.csv', 'test2.csv', 'test3.csv', 'test4.csv', 'test5.csv'] #train_file = ['train1.csv'] #test_file = ['test1.csv'] distances = ['euclidean', 'cityblock', 'cosine'] k = 50 length = 0 MAD_list1 = [] MAD_list2 = [] for i in range(len(distances)): for j in range(len(train_file)): train, test = reading_csv(train_file[j], test_file[j]) train_pivot = train.pivot_table(index='user', columns='movie', values='rating', aggfunc='first', fill_value=0) distance_df = distance(train_pivot, distances[i]) users_similar = similar_user(train) users_neighbor = neighbors(distance_df) MAD_list1 += madlist1(test, train_pivot, k) MAD_list2 += madlist2(test, train_pivot, k) length += len(test.index) MAD1 = find_MAD1(MAD_list1, test, length) MAD2 = find_MAD2(MAD_list2, test, length) #proper algorithm is the one that is designed based on user i and each movie j they did not see, top k most similar users to i who have seen j, used this to infer i's rating on j print("MAD of proper algorithm is", distances[i], " is::", MAD2) #Basic algorithm is the simple algorithm which gives each user movie pair a rating that is equal to average score over all users who rated that movie print("MAD of basic algorithm is", distances[i], " is::", MAD1) print time.clock() - start
def getClustersHier(positions, diameter):
    """Cluster ``positions`` hierarchically and return level ``diameter``.

    ``HierarchicalClustering`` is built with a pairwise function that
    forwards to ``distance``; the result of its ``getlevel(diameter)`` is
    returned.
    """
    def pair_distance(x, y):
        return distance(x, y)

    return HierarchicalClustering(positions, pair_distance).getlevel(diameter)
#------------------------------ '''DISTANCE FORMULA Representing Points In this lesson, you will learn three different ways to define the distance between two points: Euclidean Distance Manhattan Distance Hamming Distance Before diving into the distance formulas, it is first important to consider how to represent points in your code. In this exercise, we will use a list, where each item in the list represents a dimension of the point. For example, the point (5, 8) could be represented in Python like this: pt1 = [5, 8] Points aren’t limited to just two dimensions. For example, a five-dimensional point could be represented as [4, 8, 15, 16, 23]. Ultimately, we want to find the distance between two points. We’ll be writing functions that look like this: distance([1, 2, 3], [5, 8, 9]) Note that we can only find the distance between two points if they have the same number of dimensions! ''' '''DISTANCE FORMULA Euclidean Distance Euclidean Distance is the most commonly used distance formula. To find the Euclidean distance between two points, we first calculate the squared difference in each dimension. If we add up all of these squared differences and take the square root, we’ve computed the Euclidean distance. Let’s take a look at the equation that represents what we just learned:
def re_ranking(feat, k1, k2, lambda_value, MemorySave=False, Minibatch=2000):
    """Re-rank the pairwise distances of ``feat`` with k-reciprocal neighbours.

    Function included from
    https://github.com/zhunzhong07/person-re-ranking/tree/master/python-version

    Every row of ``feat`` acts both as query and as gallery entry.  The
    original distance matrix is normalised by its column maxima, a weight
    matrix ``V`` is built from the expanded k-reciprocal neighbour sets, a
    Jaccard distance is derived from ``V`` and the result is the blend
    ``jaccard * (1 - lambda_value) + original * lambda_value``.

    :param feat: appearance feature from the last layer for each gallery
        image, one row per image (cast to float16 on entry).
    :param k1: number of nearest neighbours considered for a given image.
    :param k2: number of neighbours whose ``V`` rows are averaged for each
        image; 1 disables this step.
    :param lambda_value: weight of the original distance in the result; with
        1 only the original (appearance) distance is used.
    :param MemorySave: compute the original distance in row chunks of
        ``Minibatch`` with ``cdist`` (squared) instead of calling
        ``distance(feat)``.
    :param Minibatch: chunk size used when ``MemorySave`` is set.
    :return: float16 array of shape ``(len(feat), len(feat))``.
    """
    all_num = feat.shape[0]
    query_num = feat.shape[0]
    feat = feat.astype(np.float16)
    print('computing original distance')
    if MemorySave:
        original_dist = np.zeros(shape=[all_num, all_num], dtype=np.float16)
        i = 0
        # Fill the matrix chunk by chunk; the last (possibly short) chunk
        # ends the loop.
        while True:
            it = i + Minibatch
            if it < np.shape(feat)[0]:
                original_dist[i:it, ] = np.power(cdist(feat[i:it, ], feat), 2).astype(np.float16)
            else:
                original_dist[i:, :] = np.power(cdist(feat[i:, ], feat), 2).astype(np.float16)
                break
            i = it
    else:
        # Disabled alternative: squared cdist(feat, feat) cast to float16.
        # NOTE(review): presumably distance(feat) returns the same squared
        # pairwise matrix -- confirm against its definition.
        original_dist = distance(feat).astype(np.float16)
    del feat
    gallery_num = original_dist.shape[0]
    # Scale each column by its maximum, then transpose.
    original_dist = np.transpose(original_dist / np.max(original_dist, axis=0))
    V = np.zeros_like(original_dist).astype(np.float16)
    # Row i lists all indices ordered by increasing distance to i.
    initial_rank = np.argsort(original_dist).astype(np.int32)

    print('starting re_ranking')
    for i in range(all_num):
        # k-reciprocal neighbors: neighbours of i that also rank i among
        # their own first k1 + 1 entries.
        forward_k_neigh_index = initial_rank[i, :k1 + 1]
        backward_k_neigh_index = initial_rank[forward_k_neigh_index, :k1 + 1]
        fi = np.where(backward_k_neigh_index == i)[0]
        k_reciprocal_index = forward_k_neigh_index[fi]
        k_reciprocal_expansion_index = k_reciprocal_index
        for j in range(len(k_reciprocal_index)):
            candidate = k_reciprocal_index[j]
            candidate_forward_k_neigh_index = initial_rank[candidate, :int(np.around(k1 / 2)) + 1]
            candidate_backward_k_neigh_index = initial_rank[candidate_forward_k_neigh_index, :int(np.around(k1 / 2)) + 1]
            fi_candidate = np.where(candidate_backward_k_neigh_index == candidate)[0]
            candidate_k_reciprocal_index = candidate_forward_k_neigh_index[fi_candidate]
            # Absorb the candidate's reciprocal set when more than 2/3 of
            # it overlaps with the set of i.
            # NOTE(review): ``2 / 3`` is 0 under Python 2 integer division.
            if len(np.intersect1d(candidate_k_reciprocal_index, k_reciprocal_index)) > 2 / 3 * len(candidate_k_reciprocal_index):
                k_reciprocal_expansion_index = np.append(k_reciprocal_expansion_index, candidate_k_reciprocal_index)

        k_reciprocal_expansion_index = np.unique(k_reciprocal_expansion_index)
        # Weights decay exponentially with distance and sum to one per row.
        weight = np.exp(-original_dist[i, k_reciprocal_expansion_index])
        V[i, k_reciprocal_expansion_index] = weight / np.sum(weight)
    original_dist = original_dist[:query_num, ]
    if k2 != 1:
        # Replace each row of V by the mean over its k2 nearest rows.
        V_qe = np.zeros_like(V, dtype=np.float16)
        for i in range(all_num):
            V_qe[i, :] = np.mean(V[initial_rank[i, :k2], :], axis=0)
        V = V_qe
        del V_qe
    del initial_rank
    # invIndex[i]: rows of V with a non-zero entry in column i.
    invIndex = []
    for i in range(gallery_num):
        invIndex.append(np.where(V[:, i] != 0)[0])

    jaccard_dist = np.zeros_like(original_dist, dtype=np.float16)

    for i in range(query_num):
        # temp_min accumulates the element-wise minima of row i against
        # every row sharing a non-zero column with it.
        temp_min = np.zeros(shape=[1, gallery_num], dtype=np.float16)
        indNonZero = np.where(V[i, :] != 0)[0]
        indImages = []
        indImages = [invIndex[ind] for ind in indNonZero]
        for j in range(len(indNonZero)):
            temp_min[0, indImages[j]] = temp_min[0, indImages[j]] + np.minimum(
                V[i, indNonZero[j]], V[indImages[j], indNonZero[j]])
        jaccard_dist[i] = 1 - temp_min / (2 - temp_min)

    final_dist = jaccard_dist * (1 - lambda_value) + original_dist * lambda_value
    # Disabled in the original: deleting original_dist / V and slicing
    # final_dist[:query_num, query_num:].
    return final_dist
def all_features(vec1, sent1, vec2, sent2):
    """Return one averaged dissimilarity score for two sentences.

    Twelve terms are summed and divided by 12: five vector distances between
    ``vec1`` and ``vec2`` (euclidean, cityblock, cosine, dice, jaccard),
    word-set statistics of ``sent1`` and ``sent2`` (share of common words in
    each sentence, the inclusion/exclusion count, the overlap coefficient),
    a Jaro term that is fixed at 0, the Levenshtein distance between the
    space-joined string forms of the vectors, and the LCS length computed on
    the two word sets.

    Raises ZeroDivisionError when either sentence has no words.
    """
    from scipy.spatial import distance
    euclidean_term = distance.euclidean(vec1, vec2)
    cityblock_term = distance.cityblock(vec1, vec2)
    cosine_term = distance.cosine(vec1, vec2)
    dice_term = distance.dice(vec1, vec2)
    jaccard_term = distance.jaccard(vec1, vec2)

    def lcs_length(a, b):
        # Dynamic-programming table for the longest common subsequence.
        table = [[0] * (len(b) + 1) for _ in xrange(len(a) + 1)]
        for i, item_a in enumerate(a, 1):
            for j, item_b in enumerate(b, 1):
                if item_a == item_b:
                    table[i][j] = table[i - 1][j - 1] + 1
                else:
                    table[i][j] = max(table[i][j - 1], table[i - 1][j])
        return table[-1][-1]

    sent1 = set(sent1)
    sent2 = set(sent2)
    common = len(sent1.intersection(sent2))
    s1ins2 = (common * 1.00) / len(sent1)
    s2ins1 = (common * 1.00) / len(sent2)
    in_and_ex = common + (len(sent1) + len(sent2) - common)
    word_overlap = (common * 1.00) / min(len(sent1), len(sent2))

    v1 = ' '.join([str(x) for x in vec1])
    v2 = ' '.join([str(x) for x in vec2])

    # From here on ``distance`` is the Levenshtein function, not the scipy
    # module imported above.
    from Levenshtein import distance
    Levenshtein_dist = distance(v1, v2)
    lcs_dist = lcs_length(sent1, sent2)

    v1 = unicode(v1, 'utf-8')
    v2 = unicode(v2, 'utf-8')
    import jellyfish
    # The jellyfish.jaro_distance(v1, v2) call is disabled.
    jaro_dist = 0

    total_dist = (euclidean_term + cityblock_term + cosine_term + dice_term +
                  jaccard_term + s1ins2 + s2ins1 + in_and_ex + word_overlap +
                  jaro_dist + Levenshtein_dist + lcs_dist)
    return (total_dist) / 12
def test_in_ball(self):
    """Every point reported by ``query_ball_point`` must be close enough.

    Each reported data point must lie within ``d * (1 + eps)`` of the query
    point.
    """
    reported = self.T.query_ball_point(self.x, self.d, p=self.p, eps=self.eps)
    for index in reported:
        gap = distance(self.data[index], self.x, self.p)
        assert_(gap <= self.d * (1. + self.eps))
def k_means(data, K):
    """
    Implements the K-means algorithm following the slides' notation.

    Each row of ``data`` is read as ``[index, label, x, y, ...]``; only the
    first two coordinates after the label are used.  ``K`` rows are drawn at
    random (``np.random.randint``, so duplicates are possible) as initial
    cluster representatives.

    Returns ``(C, C_with_labels, m)``:
    ``C`` maps cluster id to the list of ``(x, y)`` points assigned to it,
    ``C_with_labels`` maps cluster id to an array of
    ``[index, label, x, y]`` rows, and ``m[i]`` is the cluster id of the
    i-th row of ``data``.
    """
    MAX_ITERATIONS = 50
    N = len(data)
    C, C_with_labels, m = {}, {}, [None] * N
    iters = 0

    # initialize cluster representatives
    random_idxs = np.random.randint(0, N, size=K)
    for i in range(len(random_idxs)):
        point = data[random_idxs[i], 2:]
        point_idx, point_label = data[random_idxs[i], 0], data[random_idxs[i], 1]
        C.setdefault(i, []).append((point[0], point[1]))
        C_with_labels.setdefault(i, []).append(
            [point_idx, point_label, point[0], point[1]])

    # A centroid may end up closest to no point at all; its cluster is then
    # empty and its mean would be nan.  To prevent this, the last non-empty
    # content of every cluster is kept here and reused for the mean.
    C_old = C.copy()

    while (iters < MAX_ITERATIONS):
        cluster_means = dict()
        for c in C:
            if (len(C[c]) > 0):  # in case there is no point belonging to a cluster
                cluster_means[c] = get_cluster_mean(C[c])
                C_old[c] = C[c]
            else:  # use the old centroid and do not update its value
                cluster_means[c] = get_cluster_mean(C_old[c])

        # reassign points in D to closest cluster mean
        C = {c: [] for c in C}
        C_with_labels = {c: [] for c in C}
        for i in range(N):
            point = data[i, 2:]
            point = (point[0], point[1])
            point_idx, point_label = data[i, 0], data[i, 1]

            # get the closest cluster to this point
            closest_dist = float('inf')
            closest_cluster = None
            for c in cluster_means:
                dist = distance(point, cluster_means[c])
                if dist < closest_dist:
                    closest_dist = dist
                    closest_cluster = c

            # assign this point to the closest cluster
            C[closest_cluster].append(point)
            C_with_labels[closest_cluster].append(
                [point_idx, point_label, point[0], point[1]])

            # update m s.t. m_i is cluster ID of ith point in D
            m[i] = closest_cluster

        # if there is no change in the centroids, then stop
        # NOTE(review): an empty cluster is passed to get_cluster_mean here
        # without the C_old fallback used above -- confirm that is handled.
        new_cluster_means = {c: get_cluster_mean(C[c]) for c in C}
        if (centroids_did_not_change(new_cluster_means, cluster_means)):
            if DEBUG:
                print('Old cluster_means = {}'.format(cluster_means))
                print('New cluster_means = {}'.format(new_cluster_means))
                print('No change in the centroids, breaking out of the loop.')
            break

        # if this is the last iteration then break out of the loop after the
        # assignment of points to the cluster and before recomputing the new
        # centroids
        if iters == MAX_ITERATIONS - 1:
            if DEBUG:
                print(
                    'this is the last ({}) iteration, break out of the loop after the assignment of points to the cluster and before recomputing the new centroids'
                    .format(iters))
            break

        iters += 1
        if DEBUG:
            print('iters = ', iters)

    # Convert the labelled rows of every cluster into an array.
    for c in C_with_labels:
        C_with_labels[c] = np.asarray(C_with_labels[c])

    return (C, C_with_labels, m)
def test_all_in_ball(self):
    """Every pair reported by ``query_ball_tree`` must be close enough.

    Each reported pair of points must lie within ``d * (1 + eps)`` of each
    other.
    """
    results = self.T1.query_ball_tree(self.T2, self.d, p=self.p, eps=self.eps)
    for i, reported in enumerate(results):
        for j in reported:
            gap = distance(self.data1[i], self.data2[j], self.p)
            assert_(gap <= self.d * (1. + self.eps))
def test_found_all(self):
    """Every point of T2 missing from a ball-tree result must be far away.

    For each point of the first data set, all points of the second set that
    ``query_ball_tree`` did not report must lie at distance of at least
    ``d / (1 + eps)``.
    """
    results = self.T1.query_ball_tree(self.T2, self.d, p=self.p, eps=self.eps)
    for i, reported in enumerate(results):
        # Use the builtin ``bool``: the ``np.bool`` alias was deprecated and
        # later removed from NumPy, where it raises AttributeError.
        not_reported = np.ones(self.T2.n, dtype=bool)
        not_reported[reported] = False
        gaps = distance(self.data2[not_reported], self.data1[i], self.p)
        assert_(np.all(gaps >= self.d / (1. + self.eps)))
def test_distance_l2():
    """With a third argument of 2, [0, 0] and [1, 1] are sqrt(2) apart."""
    origin, corner = [0, 0], [1, 1]
    assert_almost_equal(distance(origin, corner, 2), np.sqrt(2))
def NodeEdgeDistribution(graphs):
    """Save node-coreness and edge-probability features for each graph.

    For every graph the share of vertices per coreness value is written to
    the node distribution file; afterwards the edge probabilities from
    ``getEdgeProbabilities`` (using the highest coreness seen in any graph)
    are written to the edge file.  Output file names are prefixed with
    ``sys.argv[1]`` and existing files are removed first.  The time of each
    stage is recorded per graph and saved, and ``distance`` is run on both
    feature files.

    :param graphs: mapping of graph name to graph object.
    """
    Coreness = {}
    KCoreSignatures = {}
    KCoreTimings = {}
    CountTimings = {}
    NodeDistnTimings = {}
    EdgeMatrices = {}
    EdgeDistnTimings = {}
    AllTimings = {}

    nodedistnfilename = sys.argv[1] + "_NodeDistnProbabilities.txt"
    edgedistnfilename = sys.argv[1] + "_EdgeProbabilities.txt"
    for stale_file in (nodedistnfilename, edgedistnfilename):
        if os.path.exists(stale_file):
            os.remove(stale_file)

    maximumcore = 0
    for g in graphs:
        print('Processing graph:', g)
        print("Extracting Node Distn Probabilities: %s" % g)
        graph = graphs[g].simplify()

        # Stage 1: coreness of every vertex.
        tick = time.time()
        coreness = GraphBase.coreness(graph)
        KCoreTimings[g] = time.time() - tick
        Coreness[g] = coreness
        vertex_total = len(coreness)

        # Stage 2: number of vertices per coreness value.
        tick = time.time()
        shell_sizes = {n: coreness.count(n) for n in range(max(coreness) + 1)}
        CountTimings[g] = time.time() - tick
        print('max(coreness): ', max(coreness))

        # Stage 3: normalise the counts by the number of vertices.
        tick = time.time()
        KCoreSignature = [shell_sizes[key] / (vertex_total * 1.0)
                          for key in sorted(shell_sizes)]
        NodeDistnTimings[g] = time.time() - tick
        KCoreSignatures[g] = KCoreSignature
        saveFeature(g, KCoreSignature, nodedistnfilename)

        highestcore = max(Coreness[g])
        if highestcore > maximumcore:
            maximumcore = highestcore
    saveDict(NodeDistnTimings, sys.argv[1] + "_NodeDistnTimings.txt")

    for g in graphs:
        tick = time.time()
        EdgeMatrix = getEdgeProbabilities(g, graphs[g], Coreness[g], maximumcore)
        EdgeMatrices[g] = EdgeMatrix
        EdgeDistnTimings[g] = time.time() - tick
        saveFeature(g, EdgeMatrix, edgedistnfilename)
    saveDict(EdgeDistnTimings, sys.argv[1] + "_EdgeProbabilitiesTiming.txt")

    # Per graph: the four stage timings followed by their sum.
    for g in graphs:
        stage_times = [KCoreTimings[g], CountTimings[g],
                       NodeDistnTimings[g], EdgeDistnTimings[g]]
        overall = stage_times[0] + stage_times[1] + stage_times[2] + stage_times[3]
        AllTimings[g] = stage_times + [overall]

    distance(nodedistnfilename)
    distance(edgedistnfilename)
    saveDict(AllTimings, sys.argv[1] + "_AllNCKDTimings.txt")
    return
def test_distance_linf():
    """With a third argument of infinity, [0, 0] and [1, 1] are 1 apart."""
    origin, corner = [0, 0], [1, 1]
    assert_almost_equal(distance(origin, corner, np.inf), 1)
def recommendationScores(ratings_dict_train, ratings_dict_test):
    """
    Calculates recommendation scores for each user/business combination in the
    test set using collaborative filtering.

    For every test user the distance to each training user sharing at least
    one reviewed business is computed with ``distance``.  The score of a
    business is the mean of ``rating / (distance + 0.1)`` over the (up to)
    five closest of those training users who reviewed that business.

    :param ratings_dict_train: Nested dictionary with structure
        {user_id: {business_id: rating}}
    :param ratings_dict_test: Nested dictionary with structure
        {user_id: {business_id: rating}}
    :return: nested dictionary, identical to ratings_dict_test except that it
        contains recommendation scores instead of ratings
    """
    rec_scores = {}

    for test_user, test_ratings in ratings_dict_test.items():
        # Distances between the current test user and all training users
        # that can be compared with it.
        distances = {}
        rec_scores[test_user] = {}
        test_businesses = set(test_ratings)

        for train_user, train_ratings in ratings_dict_train.items():
            # Do not consider the same user
            if train_user == test_user:
                continue
            # Do not consider users who haven't reviewed any of the same
            # businesses
            if not test_businesses & set(train_ratings):
                continue
            distances[train_user] = distance(test_ratings, train_ratings)

        for bus_id in test_ratings:
            # Comparable training users who also reviewed this business.
            similar_users_dist = {
                user_id: distances[user_id]
                for user_id, ratings in ratings_dict_train.items()
                if bus_id in ratings and user_id in distances
            }

            # The most similar users are those with the SMALLEST distance
            # (the weights below are 1 / distance), so sort ascending.  The
            # previous reverse=True selected the five farthest users.
            # The "top 5" has fewer entries if fewer users qualify.
            top5 = sorted(similar_users_dist, key=similar_users_dist.get)[0:5]

            # sum(similarity * rating) over the top 5's ratings of this
            # business, scaled by the number of contributing users.
            rec_score_unscaled = np.sum(
                [1.0 / (distances[user_id] + 0.1) * ratings_dict_train[user_id][bus_id]
                 for user_id in top5])
            # NOTE(review): with no contributing user this divides a NumPy
            # zero by 0, which presumably yields NaN with a warning.
            rec_score = rec_score_unscaled / len(top5)
            rec_scores[test_user][bus_id] = rec_score

            # Print if there's a strange value
            if rec_score == 0 or rec_score == np.inf:
                print("Test User: " + str(test_user) + ", Business: " + str(bus_id))
                print("    " + str(len(similar_users_dist)) + " users also reviewed this business")
                print("    " + str(top5))
                print("    Recommendation Score: " + str(rec_score_unscaled) + "/" + str(len(top5)) + '=' + str(rec_score))
                print("\n")

    return rec_scores
def test_in_ball(self):
    """Every point reported by ``query_ball_point`` must be close enough.

    Each reported data point must lie within ``d * (1 + eps)`` of the query
    point.
    """
    reported = self.T.query_ball_point(self.x, self.d, p=self.p, eps=self.eps)
    radius_limit_ok = [
        distance(self.data[index], self.x, self.p) <= self.d * (1. + self.eps)
        for index in reported[:0]
    ]
    del radius_limit_ok
    for index in reported:
        gap = distance(self.data[index], self.x, self.p)
        assert_(gap <= self.d * (1. + self.eps))
def distance(user, measurement):
    """Return the row-wise city-block distance to a user's stored mean.

    The measurements saved for ``user`` in the ``measurmentsDB`` shelf are
    averaged along their first axis and compared with ``measurement``.

    :param user: key of the user in the shelf.
    :param measurement: new measurement; assumed to be 2-D with the same
        shape as the stored mean -- TODO confirm against the caller.
    :return: 1-D array whose i-th entry is the city-block distance between
        row i of the stored mean and row i of ``measurement``.
    :raises KeyError: if ``user`` has no saved measurements.
    """
    # Local imports: the pairwise metric must not be looked up under the
    # name ``distance``, which is this function itself.  The previous body
    # called itself with three arguments and failed with a TypeError.
    import contextlib
    from scipy.spatial.distance import cdist

    # Always close the shelf, even if the lookup fails.
    with contextlib.closing(shelve.open('measurmentsDB')) as measurementsDB:
        savedMeasurement = measurementsDB[user]

    meanMeasurement = mean(savedMeasurement, axis=0)
    distMatrix = cdist(meanMeasurement, measurement, 'cityblock')
    # Only matching rows are of interest, i.e. the diagonal of the matrix.
    distArray = distMatrix.diagonal()
    return distArray
coords_n=[] for i in range(0,len(coords)): coords_n.append([x[i],y[i]]) #store the new coordinates in a list # N=[] #list to store count of each root hair D=[] #list to store the distances #centroid length(cl) for (x,y) in coords_n: k=distance(x,y,xc,yc) D.append(k) cs=(sum(D))**.5 CSN1.append(cs) print("Area of the shape") print(cs) Filename.append('42.jpg') #print(len(Filename)) #print(len(CSN1)) data={'ID': Filename, 'Area':CSN1} NS=pd.DataFrame(data)