def test_pairwise_distances(): """ Test the pairwise_distance helper function. """ rng = np.random.RandomState(0) # Euclidean distance should be equivalent to calling the function. X = rng.random_sample((5, 4)) S = pairwise_distances(X, metric="euclidean") S2 = euclidean_distances(X) assert_array_almost_equal(S, S2) # Euclidean distance, with Y != X. Y = rng.random_sample((2, 4)) S = pairwise_distances(X, Y, metric="euclidean") S2 = euclidean_distances(X, Y) assert_array_almost_equal(S, S2) # Test with tuples as X and Y X_tuples = tuple([tuple([v for v in row]) for row in X]) Y_tuples = tuple([tuple([v for v in row]) for row in Y]) S2 = pairwise_distances(X_tuples, Y_tuples, metric="euclidean") assert_array_almost_equal(S, S2) # "cityblock" uses sklearn metric, cityblock (function) is scipy.spatial. S = pairwise_distances(X, metric="cityblock") S2 = pairwise_distances(X, metric=cityblock) assert_equal(S.shape[0], S.shape[1]) assert_equal(S.shape[0], X.shape[0]) assert_array_almost_equal(S, S2) # The manhattan metric should be equivalent to cityblock. S = pairwise_distances(X, Y, metric="manhattan") S2 = pairwise_distances(X, Y, metric=cityblock) assert_equal(S.shape[0], X.shape[0]) assert_equal(S.shape[1], Y.shape[0]) assert_array_almost_equal(S, S2) # Test cosine as a string metric versus cosine callable S = pairwise_distances(X, Y, metric="cosine") S2 = pairwise_distances(X, Y, metric=cosine) assert_equal(S.shape[0], X.shape[0]) assert_equal(S.shape[1], Y.shape[0]) assert_array_almost_equal(S, S2) # Tests that precomputed metric returns pointer to, and not copy of, X. S = np.dot(X, X.T) S2 = pairwise_distances(S, metric="precomputed") assert_true(S is S2) # Test with sparse X and Y X_sparse = csr_matrix(X) Y_sparse = csr_matrix(Y) S = pairwise_distances(X_sparse, Y_sparse, metric="euclidean") S2 = euclidean_distances(X_sparse, Y_sparse) assert_array_almost_equal(S, S2) # Test with scipy.spatial.distance metric, with a kwd kwds = {"p": 2.0} S = pairwise_distances(X, Y, metric="minkowski", **kwds) S2 = pairwise_distances(X, Y, metric=minkowski, **kwds) assert_array_almost_equal(S, S2) # same with Y = None kwds = {"p": 2.0} S = pairwise_distances(X, metric="minkowski", **kwds) S2 = pairwise_distances(X, metric=minkowski, **kwds) assert_array_almost_equal(S, S2) # Test that scipy distance metrics throw an error if sparse matrix given assert_raises(TypeError, pairwise_distances, X_sparse, metric="minkowski") assert_raises(TypeError, pairwise_distances, X, Y_sparse, metric="minkowski")
def kmeans(X, k, centroids=None, steps=20, verbose=0):
    # `if not centroids` raises on arrays; test for None explicitly.
    if centroids is None:
        centroids = X[np.random.choice(np.arange(X.shape[0]), size=k)]
    if sp.sparse.issparse(centroids):
        centroids = centroids.toarray()
    for step in range(steps):
        # Since rows are normalized, ranking by Euclidean distance matches cosine.
        D = euclidean_distances(centroids, X, squared=False)
        clusters = D.argmin(axis=0)
        new_centroids, k = cluster_centroids(X, clusters, k)
        J = np.abs((new_centroids ** 2).sum() - (centroids ** 2).sum())
        if verbose and step % 10 == 0:
            print('step %d... J=%0.4f' % (step, J))
        if J < 1e-6:
            break
        centroids = new_centroids
    if verbose:
        print('converged after step=%d, final J=%0.4f' % (step, J))
    D = euclidean_distances(centroids, X, squared=False)
    clusters = D.argmin(axis=0)
    return clusters, k, centroids
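# A minimal usage sketch for the kmeans() helper above, assuming `np` is NumPy and
# `sp` is SciPy. It also assumes a cluster_centroids(X, clusters, k) helper exists in
# the surrounding module; the stand-in below is hypothetical and for illustration only.
import numpy as np
import scipy as sp
import scipy.sparse
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.preprocessing import normalize

def cluster_centroids(X, clusters, k):
    # Mean vector of each non-empty cluster; also return the surviving cluster count.
    centers = [X[clusters == c].mean(axis=0) for c in range(k) if np.any(clusters == c)]
    return np.vstack(centers), len(centers)

X_demo = normalize(np.random.rand(200, 16))   # rows normalized, as the comment assumes
clusters, k_eff, centers = kmeans(X_demo, k=5, steps=50, verbose=1)
print(clusters.shape, k_eff, centers.shape)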
def get_top_k_match(k, source, targets, source_embeddings,target_embeddings ): result_dict_average = {} result_dict_average_tfidf = {} result_dict_sum = {} for t in targets: distance_average = euclidean_distances(vector_averaging(source.split(" "),source_embeddings,DIMENSION),vector_averaging(t.split(" "),target_embeddings,DIMENSION))[0][0] distance_average_tfidf = euclidean_distances(vector_averaging_with_tfidf(source.split(" "),source_embeddings,cs_word2weight,DIMENSION),vector_averaging_with_tfidf(t.split(" "),target_embeddings,java_word2weight,DIMENSION))[0][0] # distance_sum = euclidean_distances(vector_summing(source.split(" "),source_embeddings,DIMENSION),vector_summing(t.split(" "),target_embeddings,DIMENSION))[0][0] # distance_sum_tfidf = euclidean_distances(vector_summing_with_tfidf(source.split(" "),source_embeddings,cs_word2weight,DIMENSION),vector_averaging_with_tfidf(t.split(" "),target_embeddings,java_word2weight,DIMENSION))[0][0] result_dict_average[t] = distance_average result_dict_average_tfidf[t] = distance_average_tfidf # result_dict_sum[t] = distance_sum sorted_result_average = sorted(result_dict_average.items(), key=operator.itemgetter(1)) sorted_result_average_tfidf = sorted(result_dict_average_tfidf.items(), key=operator.itemgetter(1)) # sorted_result_sum = sorted(result_dict_sum.items(), key=operator.itemgetter(1)) return sorted_result_average[:k], sorted_result_average_tfidf[:k] #sorted_result_sum[:k]
def make_sample_df(labels, np, labeled_data, limit, algorithm_name, dims, cores): used_labels = np.unique(labels)[0:3] label_dfs = [] label = used_labels[0] # sub-sample the stratified subset subset = labeled_data[labeled_data[:,0] == label,1:] # select all those elements with this label num_samples = min(limit,subset.shape[0]) indices = np.arange(subset.shape[0]) np.random.shuffle(indices) label_pts = subset[indices[:num_samples],:] # repeat for the same number of pts from one opposing label first_comparators = labeled_data[labeled_data[:,0] == label_opposites[label][0],1:] num_samples = min(limit,first_comparators.shape[0]) indices = np.arange(first_comparators.shape[0]) np.random.shuffle(indices) opposing_pts = first_comparators[indices[:num_samples],:] distances = euclidean_distances(label_pts,opposing_pts) num_records = distances.size label_dfs.append(pd.DataFrame({"distances": distances.ravel(), "dimension": [dims for i in range(num_records)], "label": [label_dict[label] for i in range(num_records)], "opposing label": [label_dict[label_opposites[label][0]] for i in range(num_records)], "algorithm": [algorithm_name for i in range(num_records)]})) # repeat for the same number of pts from the other opposing label second_comparators = labeled_data[labeled_data[:,0] == label_opposites[label][1],1:] num_samples = min(limit,second_comparators.shape[0]) indices = np.arange(second_comparators.shape[0]) np.random.shuffle(indices) opposing_pts = second_comparators[indices[:num_samples],:] distances = euclidean_distances(label_pts,opposing_pts) num_records = distances.size label_dfs.append(pd.DataFrame({"distances": distances.ravel(), "dimension": [dims for i in range(num_records)], "label": [label_dict[label] for i in range(num_records)], "opposing label": [label_dict[label_opposites[label][1]] for i in range(num_records)], "algorithm": [algorithm_name for i in range(num_records)]})) return label_dfs
def _fit_process_bagirov(self, X): """ Clusters using the global K-means algorithm Bagirov variation :param X: :return: """ # Create a KNN structure for fast search self._neighbors = NearestNeighbors() self._neighbors.fit(X) # Compute the centroid of the dataset centroids = sum(X) / X.shape[0] assignments = [0 for i in range(X.shape[0])] centroids.shape = (1, X.shape[1]) # compute the distance of the examples to the centroids mindist = np.zeros(X.shape[0]) for i in range(X.shape[0]): mindist[i] = \ euclidean_distances(X[i].reshape(1, -1), centroids[assignments[i]].reshape(1, -1), squared=True)[0] for k in range(2, self.n_clusters + 1): newCentroid = self._compute_next_centroid(X, centroids, assignments, mindist) centroids = np.vstack((centroids, newCentroid)) km = KMeans(n_clusters=k, init=centroids, n_init=1) km.fit(X) assignments = km.labels_ for i in range(X.shape[0]): mindist[i] = \ euclidean_distances(X[i].reshape(1, -1), centroids[assignments[i]].reshape(1, -1), squared=True)[0] return km.cluster_centers_, km.labels_, km.inertia_
def test_euclidean_distances_with_norms(dtype, y_array_constr): # check that we still get the right answers with {X,Y}_norm_squared # and that we get a wrong answer with wrong {X,Y}_norm_squared rng = np.random.RandomState(0) X = rng.random_sample((10, 10)).astype(dtype, copy=False) Y = rng.random_sample((20, 10)).astype(dtype, copy=False) # norms will only be used if their dtype is float64 X_norm_sq = (X.astype(np.float64) ** 2).sum(axis=1).reshape(1, -1) Y_norm_sq = (Y.astype(np.float64) ** 2).sum(axis=1).reshape(1, -1) Y = y_array_constr(Y) D1 = euclidean_distances(X, Y) D2 = euclidean_distances(X, Y, X_norm_squared=X_norm_sq) D3 = euclidean_distances(X, Y, Y_norm_squared=Y_norm_sq) D4 = euclidean_distances(X, Y, X_norm_squared=X_norm_sq, Y_norm_squared=Y_norm_sq) assert_allclose(D2, D1) assert_allclose(D3, D1) assert_allclose(D4, D1) # check we get the wrong answer with wrong {X,Y}_norm_squared wrong_D = euclidean_distances(X, Y, X_norm_squared=np.zeros_like(X_norm_sq), Y_norm_squared=np.zeros_like(Y_norm_sq)) with pytest.raises(AssertionError): assert_allclose(wrong_D, D1)
def bhargavi_gowda_score(X, labels): """ Score from: Bhargavi, M. & Gowda, S. D. "A novel validity index with dynamic cut-off for determining true clusters" Pattern Recognition , 2015, 48, 3673 - 3687 :param X: :param labels: :return: """ llabels = np.unique(labels) poslabels = maplabels(llabels) nclust = len(llabels) nex = len(labels) # Centroid of the data centroid = np.zeros((1, X.shape[1])) centroid += np.sum(X, axis=0) centroid /= X.shape[0] # Compute SSB and intracluster distance ccentroid = np.zeros((nclust, X.shape[1])) dist = 0.0 for idx in llabels: center = np.zeros((1, X.shape[1])) center_mask = labels == idx center += np.sum(X[center_mask], axis=0) center /= center_mask.sum() ccentroid[poslabels[idx]] = center dvector = euclidean_distances(centroid.reshape(1, -1), ccentroid[poslabels[idx]].reshape(1, -1), squared=True) dist += dvector.sum() * center_mask.sum() SSB = dist / len(labels) # Compute SSW dist = 0.0 Intra = 0.0 for idx in llabels: center_mask = labels == idx dvector = euclidean_distances(X[center_mask], ccentroid[poslabels[idx]].reshape(1, -1), squared=True) dist += dvector.sum() sdvector = euclidean_distances(X[center_mask], ccentroid[poslabels[idx]].reshape(1, -1), squared=False) Intra += sdvector.sum() SSW = dist / len(labels) SST = SSB + SSW # Centroids distance matrix cdistances = euclidean_distances(ccentroid, squared=False) Inter = np.sum(cdistances)/(nclust**2) return(np.abs((SSW/SSB)*SST) - (Intra/Inter) - (nex - nclust))
def getSimMat(self, type = 'euclidean', ftr_type = 'data', orderFlag = True, pca_dim=20): if ftr_type == 'ftr': #use input features self.slctData = [ts for ts in self.slctData if ((ts.ftr is not None) and (len(ts.ftr) > 0))] dataMat = [ts.ftr for ts in self.slctData] elif ftr_type == 'data': #use input data dataMat = [ts.val for ts in self.slctData] else: print 'unknown ftr_type for ftr_type:', ftr_type if pca_dim > len(dataMat): pca_dim = int(math.ceil(len(dataMat)/2.0)) if type == 'euclidean': #euclidean distance based on time series data self.simMat = skmpw.euclidean_distances(dataMat) elif type == 'pca_euc': #extract feature based on PCA, then use Euclidean distance pca = skd.PCA(n_components=pca_dim) dataMat = pca.fit_transform(dataMat) self.simMat = skmpw.euclidean_distances(dataMat) elif type == 'nmf_euc': #extract feature based on NMF, then use Euclidean distance nmf = skd.NMF(n_components=pca_dim) dataMat = nmf.fit_transform(dataMat) self.simMat = skmpw.euclidean_distances(dataMat) elif type =='ica_euc': #extract feature based on ICA, then use Euclidean distance ica = skd.FastICA(n_components=pca_dim) dataMat = ica.fit_transform(dataMat) self.simMat = skmpw.euclidean_distances(dataMat) elif type =='cosine': self.simMat = skmpw.pairwise_distances(dataMat, metric='cosine') elif type == 'pca_cos': #extract feature based on PCA, then use cosine distance pca = skd.PCA(n_components=pca_dim) dataMat = pca.fit_transform(dataMat) self.simMat = skmpw.pairwise_distances(dataMat, metric='cosine') elif type == 'nmf_cos': #extract feature based on NMF, then use cosine distance nmf = skd.NMF(n_components=pca_dim) dataMat = nmf.fit_transform(dataMat) self.simMat = skmpw.pairwise_distances(dataMat, metric='cosine') elif type =='ica_cos': #extract feature based on ICA, then use cosine distance ica = skd.FastICA(n_components=pca_dim) dataMat = ica.fit_transform(dataMat) self.simMat = skmpw.pairwise_distances(dataMat, metric='cosine') else: print 'unknown type for similarity matrix: ', type #rearrange the order of data in simMat self.slctDataMat = dataMat if orderFlag: link = spc.hierarchy.linkage(self.simMat) dend = spc.hierarchy.dendrogram(link, no_plot=True) order = dend['leaves'] self.slctData = [self.slctData[i] for i in order] #rearrange order self.simMat = [self.simMat[i] for i in order] for i in xrange(len(self.simMat)): self.simMat[i] = [self.simMat[i][j] for j in order] self.slctDataMat = [self.slctDataMat[i] for i in order] # self.patchOrdering = [ts.ptchNm for ts in self.slctData] #record new ordering self.patchOrdering = JSONifyData(self.slctData) # Deok wants all the data for each patch in the response self.clstData = self.slctData self.clstSimMat = self.simMat
def bagOfWordsModel():
    # Simple vectorization example
    from sklearn.feature_extraction.text import CountVectorizer
    corpus = [
        'UNC played Duke in basketball',
        'Duke lost the basketball game',
        'I ate a sandwich'
    ]
    vectorizer = CountVectorizer()
    print(vectorizer.fit_transform(corpus).todense())
    print(vectorizer.vocabulary_)

    # Viewing the Euclidean distance between feature vectors
    # (each count vector is wrapped in a list so sklearn sees a 2-D array).
    from sklearn.metrics.pairwise import euclidean_distances
    counts = [[0, 1, 1, 0, 0, 1, 0, 1],
              [0, 1, 1, 1, 1, 0, 0, 0],
              [1, 0, 0, 0, 0, 0, 1, 0]]
    print('Distances between 1st and 2nd documents:', euclidean_distances([counts[0]], [counts[1]]))
    print('Distances between 1st and 3rd documents:', euclidean_distances([counts[0]], [counts[2]]))
    print('Distances between 2nd and 3rd documents:', euclidean_distances([counts[1]], [counts[2]]))

    # Filtering stop words
    vectorizer = CountVectorizer(stop_words='english')
    print(vectorizer.fit_transform(corpus).todense())
    print(vectorizer.vocabulary_)

    # Stemming and lemmatization
    """Stemming removes patterns of characters that appear to be affixes, so the
    resulting token is not necessarily a valid word. Lemmatization finds the root
    of a word, e.g. "jumping" becomes "jump". Lemmatization usually requires a
    lexical resource, like WordNet, and the word's part of speech; stemming
    algorithms frequently use rules instead of lexical resources to produce stems
    and can operate on any token, even without its context. Stemming will not
    distinguish "gathering" as a noun from "gathering" as a verb and maps both to
    "gather"; lemmatization needs the context to reduce verbs to their roots while
    keeping nouns as they are. Because stemming applies rules that strip characters
    that look like affixes, it can mangle a word (e.g. "was" -> "wa"), whereas
    lemmatization uses the context."""
    from nltk import word_tokenize
    from nltk import pos_tag
    from nltk.stem import PorterStemmer
    from nltk.stem.wordnet import WordNetLemmatizer

    wordnet_tags = ['n', 'v']
    corpus = [
        'He ate the sandwiches',
        'Every sandwich was eaten by him'
    ]
    stemmer = PorterStemmer()
    print('Stemmed:', [[stemmer.stem(token) for token in word_tokenize(document)]
                       for document in corpus])

    lemmatizer = WordNetLemmatizer()

    def lemmatize(token, tag):
        # Lemmatize nouns and verbs via WordNet; pass other tokens through unchanged.
        if tag[0].lower() in wordnet_tags:
            return lemmatizer.lemmatize(token, tag[0].lower())
        return token

    tagged_corpus = [pos_tag(word_tokenize(document)) for document in corpus]
    print('Lemmatized:', [[lemmatize(token, tag) for token, tag in document]
                          for document in tagged_corpus])

    # TF-IDF: the frequencies of the tokens are taken into consideration
    from sklearn.feature_extraction.text import CountVectorizer
    corpus = ['The dog ate a sandwich, the wizard transfigured a sandwich, and I ate a sandwich']
    vectorizer = CountVectorizer(stop_words='english')
    """The binary argument defaults to False, so instead of a binary representation
    we get the number of occurrences of each token."""
    print(vectorizer.fit_transform(corpus).todense())
def distToSeed(tweetVecs, seedTweetVecs):
    #seedNews = []
    distToSeedTweets = pairwise.euclidean_distances(tweetVecs, seedTweetVecs[range(10), :])
    distToSeedTweets = np.mean(distToSeedTweets)  #/len(tweetVecs)
    distToSeedNews = pairwise.euclidean_distances(tweetVecs, seedTweetVecs[range(10, 20), :])
    distToSeedNews = np.mean(distToSeedNews)  #/len(tweetVecs)
    return distToSeedTweets, distToSeedNews
def cluster_centers(data, n_clusters):
    centers_idxs = []
    data_new = data.copy()
    for i in range(n_clusters):
        dist_matrix = euclidean_distances(data_new, data_new)
        c_idx = dist_matrix.sum(axis=1).argsort()[::-1][0]
        centers_idxs.append(c_idx)
        data_new = np.delete(data_new, c_idx, axis=0)
    return euclidean_distances(data, data), np.array(centers_idxs)
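# Minimal usage sketch for cluster_centers() above (assumes NumPy data as rows).
# Note that each returned index is taken from the progressively shrunken copy, so
# it indexes `data_new` at that step rather than the original `data`.
import numpy as np
from sklearn.metrics.pairwise import euclidean_distances

data = np.random.RandomState(0).rand(50, 3)
dist_matrix, center_idxs = cluster_centers(data, n_clusters=4)
print(dist_matrix.shape)   # (50, 50) full pairwise distance matrix
print(center_idxs)         # 4 candidate-center indices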
def test_euclidean_distances():
    """Check the pairwise Euclidean distances computation"""
    X = [[0]]
    Y = [[1], [2]]
    D = euclidean_distances(X, Y)
    assert_array_almost_equal(D, [[1., 2.]])

    X = csr_matrix(X)
    Y = csr_matrix(Y)
    D = euclidean_distances(X, Y)
    assert_array_almost_equal(D, [[1., 2.]])
def _k_init(X, n_clusters, x_squared_norms, random_state, n_local_trials=None): random_state = check_random_state(random_state) n_samples, n_features = X.shape centers = np.empty((n_clusters, n_features)) assert x_squared_norms is not None, 'x_squared_norms None in _k_init' # Set the number of local seeding trials if none is given if n_local_trials is None: n_local_trials = 2 + int(np.log(n_clusters)) # Pick the first center randomly center_id = random_state.randint(0, n_samples-1) centers[0] = X[center_id] # Initialize list of closest distances and calculate current potential closest_dist_sq = euclidean_distances(centers[0, np.newaxis], X, Y_norm_squared=x_squared_norms, squared=True) current_pot = closest_dist_sq.sum() # Pick the remaining n_clusters-1 points for c in range(1, n_clusters): # Choose center candidates by sampling with probability proportional # to the squared distance to the closest existing center rand_vals = random_state.random_sample(n_local_trials) * current_pot candidate_ids = np.searchsorted(closest_dist_sq.cumsum(), rand_vals) # Compute distances to center candidates distance_to_candidates = euclidean_distances(X[candidate_ids], X, Y_norm_squared=x_squared_norms, squared=True) # Decide which candidate is the best best_candidate = None best_pot = None best_dist_sq = None for trial in range(n_local_trials): # Compute potential when including center candidate new_dist_sq = np.minimum(closest_dist_sq, distance_to_candidates[trial]) new_pot = new_dist_sq.sum() # Store result if it is the best local trial so far if (best_candidate is None) or (new_pot < best_pot): best_candidate = candidate_ids[trial] best_pot = new_pot best_dist_sq = new_dist_sq # Permanently add best center candidate found in local tries centers[c] = X[best_candidate] current_pot = best_pot closest_dist_sq = best_dist_sq return centers
def test_pairwise_parallel():
    rng = np.random.RandomState(0)
    for func in (np.array, csr_matrix):
        X = func(rng.random_sample((5, 4)))
        Y = func(rng.random_sample((3, 4)))

        S = euclidean_distances(X)
        S2 = _parallel_pairwise(X, None, euclidean_distances, n_jobs=-1)
        assert_array_almost_equal(S, S2)

        S = euclidean_distances(X, Y)
        S2 = _parallel_pairwise(X, Y, euclidean_distances, n_jobs=-1)
        assert_array_almost_equal(S, S2)
def sammon(data, target_dim=2, max_iterations=250, max_halves=10): """ Adopted from the Matlab implementation by Dr. Gavin C. Cawley. Matlab source can be found here: https://people.sc.fsu.edu/~jburkardt/m_src/profile/sammon_test.m """ TolFun = 1 * 10 ** (-9) D = euclidean_distances(data, data) N = data.shape[0] scale = np.sum(D.flatten('F')) D = D + np.identity(N) D_inv = np.linalg.inv(D) y = np.random.randn(N, target_dim) one = np.ones((N, target_dim)) d = euclidean_distances(y, y) + np.identity(N) d_inv = np.linalg.inv(d) delta = D - d E = np.sum(np.sum(np.power(delta, 2) * D_inv)) for i in range(max_iterations): delta = d_inv - D_inv deltaone = np.dot(delta, one) g = np.dot(delta, y) - y * deltaone dinv3 = np.power(d_inv, 3) y2 = np.power(y, 2) H = np.dot(dinv3, y2) - deltaone - 2 * np.multiply(y, np.dot(dinv3, y)) + np.multiply(y2, np.dot(dinv3, one)) s = np.divide(-np.transpose(g.flatten('F')), np.transpose(np.abs(H.flatten('F')))) y_old = y for j in range(max_halves): [rows, columns] = y.shape y = y_old.flatten('F') + s y = y.reshape(rows, columns) d = euclidean_distances(y, y) + np.identity(N) d_inv = np.linalg.inv(d) delta = D - d E_new = np.sum(np.sum(np.power(delta, 2) * D_inv)) if E_new < E: break else: s = 0.5 * s E = E_new E = E * scale return (y, E)
def get_sim(dt_frame, n_rows=2000, plt_flag=False, sort_flag=True, out_file="sim.png", plot_every=1):
    if sort_flag:
        # euclidean_distances expects 2-D input, so the first row is reshaped to a single sample.
        dist = euclidean_distances(dt_frame.values, dt_frame.values[0].reshape(1, -1))
        dt_temp = dt_frame.copy()
        dt_temp["dist"] = dist.ravel()
        # DataFrame.sort() has been removed from pandas; sort_values() is the replacement.
        dt_sort = dt_temp.sort_values("dist").drop("dist", axis=1)
    else:
        dt_sort = dt_frame.copy()
    dist_full = euclidean_distances(dt_sort[0:n_rows].values)
    plt.figure()
    plt.imshow(dist_full[::plot_every, ::plot_every], extent=(0, n_rows, n_rows, 0))
    plt.colorbar()
    plt.savefig(out_file)
    if plt_flag:
        plt.show()
def rbf_kernel(Z, X, gamma=None):
    """
    Compute the RBF (Gaussian) kernel between X and Z::

        K(x, z) = exp(-gamma ||x - z||^2)

    for each pair of rows x in X and z in Z.

    Parameters
    ----------
    Z : array of shape (n_samples_Z, n_features)
    X : array of shape (n_samples_X, n_features)
    gamma : float, defaults to 1 / n_features

    Returns
    -------
    kernel_matrix : array of shape (n_samples_X, n_samples_Z)
    """
    if gamma is None:
        gamma = 1.0 / X.shape[1]
    K = pw.euclidean_distances(X, Z, squared=True)
    K *= -gamma
    np.exp(K, K)  # exponentiate K in-place
    return K
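# Quick sanity check of the helper above against scikit-learn's built-in RBF kernel.
# Note the argument order: the helper takes (Z, X) but returns an (n_X, n_Z) matrix,
# so it should match rbf_kernel(X, Z, gamma=...) from sklearn. Names below are local
# to this sketch.
import numpy as np
import sklearn.metrics.pairwise as pw
from sklearn.metrics.pairwise import rbf_kernel as sk_rbf_kernel

rng = np.random.RandomState(0)
X_demo = rng.rand(5, 3)
Z_demo = rng.rand(4, 3)
K_custom = rbf_kernel(Z_demo, X_demo, gamma=0.7)
K_sklearn = sk_rbf_kernel(X_demo, Z_demo, gamma=0.7)
assert np.allclose(K_custom, K_sklearn)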
def partition_FOV_KMeans(self,tradeoff_weight=.5,fx=.25,fy=.25,n_clusters=4,max_iter=500): """ Partition the FOV in clusters that are grouping pixels close in space and in mutual correlation Parameters ------------------------------ tradeoff_weight:between 0 and 1 will weight the contributions of distance and correlation in the overall metric fx,fy: downsampling factor to apply to the movie n_clusters,max_iter: KMeans algorithm parameters Outputs ------------------------------- fovs:array 2D encoding the partitions of the FOV mcoef: matric of pairwise correlation coefficients distanceMatrix: matrix of picel distances Example """ _,h1,w1=self.shape self.resize(fx,fy) T,h,w=self.shape Y=np.reshape(self,(T,h*w)) mcoef=np.corrcoef(Y.T) idxA,idxB = np.meshgrid(list(range(w)),list(range(h))); coordmat=np.vstack((idxA.flatten(),idxB.flatten())) distanceMatrix=euclidean_distances(coordmat.T); distanceMatrix=old_div(distanceMatrix,np.max(distanceMatrix)) estim=KMeans(n_clusters=n_clusters,max_iter=max_iter); kk=estim.fit(tradeoff_weight*mcoef-(1-tradeoff_weight)*distanceMatrix) labs=kk.labels_ fovs=np.reshape(labs,(h,w)) fovs=cv2.resize(np.uint8(fovs),(w1,h1),old_div(1.,fx),old_div(1.,fy),interpolation=cv2.INTER_NEAREST) return np.uint8(fovs), mcoef, distanceMatrix
def getClusterFeatures(tweetClusters, documents, feaVecs, seedTweetVecs, snp_comp, symCompHash): cLabels, tLabels, centroids, docDist = tweetClusters cTexts = [] cDocs_zip = [] cComps = [] cDensity = [] cDistToST = [] for clbl in cLabels: dataIn = [item[0] for item in enumerate(tLabels) if item[1] == clbl] vecsIn = feaVecs[dataIn, :] textsIn = [documents[docid] for docid in dataIn] textsIn = Counter(textsIn).items() dataIn_zip = [(documents.index(text), num) for text, num in textsIn] compsIn = compInCluster(textsIn, snp_comp, symCompHash, False, True) inDist = pairwise.euclidean_distances(vecsIn, vecsIn) distToST = distToSeed(vecsIn, seedTweetVecs) cTexts.append(textsIn) cComps.append(compsIn) cDocs_zip.append(dataIn_zip) cDensity.append(np.mean(inDist)) cDistToST.append(distToST) if 0: print clbl, cDensity[-1] for item in textsIn: print item print compsIn return docDist, cDensity, cTexts, cComps, cDocs_zip, cDistToST
def PrecomputeSimilarities(self):
    from sklearn.metrics.pairwise import euclidean_distances
    if self.verbose > 10:
        print('Precomputing similarities...')
    X = np.matrix(self.usedTrainingData['length']).transpose()
    self.Similarities = \
        np.exp(-self.choice_parameter * euclidean_distances(X))
def neighbor_pixel_check(cluster_coords, max_distance=2):
    """
    Check that all events in the cluster have a maximum distance smaller than max_distance
    :param cluster_coords:
    :param max_distance:
    :return:
    """
    hot_pix_flag = True
    all_distances = euclidean_distances(cluster_coords, cluster_coords)
    if args.debug == "yes":
        print(all_distances)
    for distances in all_distances:
        for distance in distances:
            if distance < max_distance:
                pass
            else:
                hot_pix_flag = False
                break
        if hot_pix_flag == False:
            break
    return hot_pix_flag
def NN(self,datas, centroids): # start = time.time() ####### find which centroids the x is closet to, and put x into the centroids location ##### group = [[] for n in range(len(centroids))] # group = [[np.zeros(len(centroids[0]))] for n in range(len(centroids))] all_distances = euclidean_distances(centroids, datas, squared=True) labels = np.empty(len(datas), dtype=np.int32) labels.fill(-1) mindist = np.empty(len(datas)) mindist.fill(np.infty) for center_id in range(len(centroids)): dist = all_distances[center_id] labels[dist < mindist] = center_id mindist = np.minimum(dist, mindist) #for k in range(len(centroids)): #group.append(list) for i in range(len(labels)): group[labels[i]].append(datas[i]-centroids[labels[i]]) # end for # print("End of NN: ",(time.time()-start)) return group
def image_similarity(self, img1): """ returns closest nth image to image """ list_img_score = [] closest = float("Inf") closest_id = "" value_img = self.img_dict_train[img1] current = euclidean_distances(self.matrix_test, value_img.reshape(1, -1)) values_array = np.squeeze(np.asarray(current)) # current = current.tolist() # print values_array # print np.argmax(current) max_indexes = values_array.argsort()[:-100][::1] max_list = max_indexes.tolist() # print len(max_indexes) # print max_indexes # current = current.tolist() # print 'error' for idx in max_list: # print idx # tuple_=[] print self.img_dict_test[idx], values_array[idx] float_val = float(values_array[idx]) rounded_val = round(float_val, 5) # tuple_.append(self.img_dict_test[idx[0]]) # tuple_.append(current[idx]) list_img_score.append([self.img_dict_test[idx], rounded_val]) return list_img_score
def action(self, tweets_list): corpus = [] for tweet in tweets_list: #corpus += [t["text"]] tweet_str = tweet["text"].encode("utf-8") tweet_str = unicode(tweet_str,'utf-8') corpus.append(tweet_str) print(corpus) vectorizer = CountVectorizer() X = vectorizer.fit_transform(corpus) M,P=X.shape dist_corpus=euclidean_distances(X) stwf=stopwords.words('french') stwf.append('les') vectorizer=CountVectorizer(stop_words=stwf) X = vectorizer.fit_transform(corpus) dico=vectorizer.vocabulary_ #Tous les print regroupés ici print("Results of Birch algorithm") clusters = birch_algo(X.toarray(), None) quit()
def between_scatter_matrix_score(X, labels):
    """
    Computes the between scatter matrix score of a labeling of a clustering
    :param X:
    :param labels:
    :return:
    """
    llabels = np.unique(labels)

    # Centroid of the data
    centroid = np.zeros((1, X.shape[1]))
    centroid += np.sum(X, axis=0)
    centroid /= X.shape[0]

    dist = 0.0
    for idx in llabels:
        center = np.zeros((1, X.shape[1]))
        center_mask = labels == idx
        center += np.sum(X[center_mask], axis=0)
        center /= center_mask.sum()
        dvector = euclidean_distances(centroid, center, squared=True)
        dist += dvector.sum() * center_mask.sum()
    return dist / len(labels)
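# Minimal usage sketch for between_scatter_matrix_score() above, on synthetic blobs.
import numpy as np
from sklearn.datasets import make_blobs
from sklearn.metrics.pairwise import euclidean_distances

X_demo, labels_demo = make_blobs(n_samples=300, centers=3, random_state=0)
ssb = between_scatter_matrix_score(X_demo, labels_demo)
print('between-cluster scatter (SSB):', ssb)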
def trimmedrbf_kernel(X, Y=None, gamma=None, robust_gamma=None):
    """
    Compute the rbf (gaussian) kernel between X and Y::

        K(x, y) = exp(-gamma ||x-y||**2)

    for each pair of rows x in X and y in Y, with distances larger than
    robust_gamma clipped to robust_gamma before the kernel is applied.

    Parameters
    ----------
    X : array of shape (n_samples_X, n_features)
    Y : array of shape (n_samples_Y, n_features)
    gamma : float
    robust_gamma : float, clipping threshold on the (unsquared) distance

    Returns
    -------
    kernel_matrix : array of shape (n_samples_X, n_samples_Y)
    """
    X, Y = check_pairwise_arrays(X, Y)
    if gamma is None:
        gamma = 1.0 / X.shape[1]

    K = euclidean_distances(X, Y, squared=True)
    print(K)
    print("Shape kernel" + str(np.where(np.sqrt(K) > robust_gamma)[0].shape))
    K[np.where(np.sqrt(K) > robust_gamma)] = robust_gamma ** 2
    K *= -gamma
    np.exp(K, K)  # exponentiate K in-place
    return K
def sumACluster(dist, vecsIn, topK_t, sameTweetThred): if dist == "cosine": distMatrix = pairwise.cosine_distances(vecsIn) elif dist == "eu": distMatrix = pairwise.euclidean_distances(vecsIn, vecsIn) sameTweetClusters = [[0]] for seqid, text in enumerate(vecsIn[1:], start=1): added = None for stcid, stc in enumerate(sameTweetClusters): sameFlag = False if distMatrix[seqid][stc[0]] <= sameTweetThred: sameFlag = True if sameFlag: stc.append(seqid) added = (stcid, stc) break if added is None: sameTweetClusters.append([seqid]) else: sameTweetClusters[added[0]] = added[1] sameTweetClusterNum = [(stcid, len(stc)) for stcid, stc in enumerate(sameTweetClusters)] numIn = len(sameTweetClusterNum) top = sorted(sameTweetClusterNum, key = lambda a:a[1], reverse=True)[:min(topK_t, numIn)] top = [(sameTweetClusters[item[0]][0], item[1]) for item in top] return top
def cluster_sentence_vectors(sentences, X, N_CLUSTERS=5): """ given vector results and number of clusters return cluster objects """ kmeans = KMeans(n_clusters=N_CLUSTERS, random_state=45) cluster_assignments = kmeans.fit_predict(X) centroids = kmeans.cluster_centers_ cluster_dict = { x: {"vector": centroids[x], "sentences": [], "reduced": False} for x in xrange(len(centroids))} temp_cluster_keywords = {x: {} for x in xrange(len(centroids))} for i, sent in enumerate(sentences): sent['feature_vector'] = X.toarray()[i] cluster_num = cluster_assignments[i] sent["cluster_num"] = cluster_num dist_to_centroid = euclidean_distances(centroids[cluster_num], sent["feature_vector"])[0][0] sent["dist_to_centroid"] = dist_to_centroid # add to cluster object cluster_dict[cluster_num]["sentences"].append(sent) # merge keyword dictionaries together temp_cluster_keywords[cluster_num] = merge_kwd_counts([temp_cluster_keywords[cluster_num], sent["key_terms"]]) NUM_KEYWORDS = 5 for cluster_num in temp_cluster_keywords: clustered = sorted(temp_cluster_keywords[cluster_num].items(), key=lambda x: x[1])[0:NUM_KEYWORDS] cluster_dict[cluster_num]["keywords"] = [x[0] for x in clustered] return cluster_dict
def calculate_similarity(movie_name_1, movie_name_2, min_common_users=0):
    movie1 = movie_name_to_id_dictionary[movie_name_1]
    movie2 = movie_name_to_id_dictionary[movie_name_2]

    # This is the set of UNIQUE user ids who reviewed movie1
    users_who_rated_movie1 = set((movielens_df[(movielens_df.MovieId == movie1)].UserId).tolist())
    # This is the set of UNIQUE user ids who reviewed movie2
    users_who_rated_movie2 = set((movielens_df[(movielens_df.MovieId == movie2)].UserId).tolist())

    # Compute the common users who rated both movies
    common_users = users_who_rated_movie1.intersection(users_who_rated_movie2)

    # Get the reviews for the movies restricted to the common users
    movie1_reviews = get_movie_reviews(movie1, common_users)
    movie2_reviews = get_movie_reviews(movie2, common_users)

    # Use the euclidean_distances function from sklearn to compute the distance between
    # the rating vectors; it expects 2-D input, so each vector is reshaped to one row.
    distance = euclidean_distances(movie1_reviews['Rating'].values.reshape(1, -1),
                                   movie2_reviews['Rating'].values.reshape(1, -1))
    if len(common_users) < min_common_users:
        return [[float('inf')]]
    return distance
def make_connectivity_matrix(self):
    """
    Computes the connectivity matrix of this Population.
    Each point is connected to each other within a radius.
    """
    if self.connectivity_matrix:
        return
    points_arr = np.array([[p.x, p.y] for p in self.points])
    distance_mat = euclidean_distances(points_arr, points_arr)
    # Every point p will be connected to each other point whose distance
    # to p is less than a cut-off value. This value is computed as the
    # mean of {min_nonzero(dist_mat(p)) | p is a point}, times a factor
    min_nonzero = lambda r: min(r[r > 0])
    # apply_along_axis(f, axis=1, arr) applies f to each row
    min_neighbor_distances = np.apply_along_axis(min_nonzero, axis=1, arr=distance_mat)
    factor = 2.2
    neighbor_cutoff = np.mean(min_neighbor_distances) * factor
    connectivity_matrix = distance_mat < neighbor_cutoff
    self.connectivity_matrix = connectivity_matrix
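# Standalone sketch of the same radius-based connectivity rule, outside the class,
# for a quick check on random 2-D points (names here are illustrative only).
import numpy as np
from sklearn.metrics.pairwise import euclidean_distances

pts = np.random.RandomState(1).rand(30, 2)
dist_mat = euclidean_distances(pts, pts)
min_nonzero_per_row = np.array([row[row > 0].min() for row in dist_mat])
cutoff = min_nonzero_per_row.mean() * 2.2
connectivity = dist_mat < cutoff          # boolean (30, 30) adjacency matrix
print(connectivity.sum(axis=1))           # neighbor counts (each point counts itself)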
def kmeans_step(frame, K): rng = np.random.RandomState(2) cluster_ids = np.zeros(X.shape[0]) centroids = rng.randn(K, 2) nsteps = frame // 3 for i in range(nsteps + 1): old_centroids = centroids if i < nsteps or frame % 3 > 0: dist = euclidean_distances(X, centroids) cluster_ids = dist.argmin(1) if i < nsteps or frame % 3 > 1: centroids = np.array( [X[cluster_ids == k].mean(0) for k in range(K)]) nans = np.isnan(centroids) centroids[nans] = old_centroids[nans] # plot data c = cluster_ids if frame > 0 else 'w' plt.scatter(X[:, 0], X[:, 1], c=c, s=50, edgecolors='k', vmin=0, vmax=K - 1, alpha=0.6) # plot centroids plt.scatter(old_centroids[:, 0], old_centroids[:, 1], marker='o', c=range(K), s=200) plt.scatter(old_centroids[:, 0], old_centroids[:, 1], marker='o', c='black', s=50) # plot new centers if third frame if frame % 3 == 2: for i in range(K): plt.annotate('', xy=centroids[i], xytext=old_centroids[i], arrowprops=dict(arrowstyle='->', linewidth=1, color='k')) plt.scatter(centroids[:, 0], centroids[:, 1], marker='o', c=range(K), s=200) plt.scatter(centroids[:, 0], centroids[:, 1], marker='o', c='black', s=50) plt.xlim(-4, 4) plt.ylim(-2, 10) if frame % 3 == 1: plt.title("Assign data to nearest centroid", size=14) elif frame % 3 == 2: plt.title("Update centroids to cluster means", size=14) else: plt.title(" ", size=14)
def find_rating(nearby_rid, cuisine): print("I am starting to find rating algo") #print nearby_rid # First read csv files and store it in dataframe # Second convert dataframe to array df_restaurant = pd.read_csv('data/restaurant.csv', header=0) array_restaurant = df_restaurant.values #print array_restaurant df_cuisine = pd.read_csv('data/cuisine.csv', header=0) array_cuisine = df_cuisine.values # # Perform natural join on cuisine and restaurant based on key 'rid' and store it in dataframe # # convert that dataframe into an array # combine = df_cuisine.set_index('rid').join(df_restaurant.set_index('id')) # array_combine = combine.values # #print array_combine #--------------------------------------------------------------------------- # Select only those restaurant from all which are nearby # Convert 2d numpy array to 1d array. For eg. [[1, 2, 3]] into [1, 2, 3] nearby_rid = nearby_rid.ravel() filter_nearby = df_restaurant.loc[df_restaurant['id'].isin(nearby_rid)] array_filter_nearby = filter_nearby.values #print array_filter_nearby filter_cuisine_id = array_cuisine[array_cuisine[:, 2] == 'Italian'] #print "I WANT THISSSSSSSSSS" #print filter_cuisine_id filter_cuisine_id = filter_cuisine_id[:, 1] #print filter_cuisine_id.astype(int) filter_cuisine = filter_nearby.loc[filter_nearby['id'].isin( filter_cuisine_id.astype(int))] print(filter_cuisine) filter_cuisine = filter_cuisine.values #--------------------------------------------------------------------------- # Extract latitude and longitude of above filtered restaurant lat_long = filter_cuisine[:, 2:4] #print lat_long # Apply clustering algo on filtered restaurant data kmeans = KMeans(n_clusters=3, random_state=0).fit(lat_long) # Cluster number for all the above filtered restaurant in which cluster they fall print(kmeans.labels_) #print kmeans.predict([[18.95618666,72.81199761], [18.99120402, 72.81458057]]) print("Clustering centre") print(kmeans.cluster_centers_) #---------------------------------------------------------------------------- # calculate distance of each cluster from user's current location distance = euclidean_distances([[19.044497, 72.8204535]], kmeans.cluster_centers_) print(np.transpose(distance)) print(len(distance)) # append cluster number with above distance array, for knowing which cluster distance is that # because after we are sorting these distances distance_cluster_centre = np.insert(np.transpose(distance), 1, np.array([0, 1, 2]), axis=1) print(distance_cluster_centre) # sorted distances print("sorted distance") arr = distance_cluster_centre[distance_cluster_centre[:, 0].argsort()] #------------------------------------------------------------------------------ # make numpy array with columns [id, lat, long, rating, cid] # cid = cluster id id_after_cuisine = filter_cuisine[:, 0] id_lat_long = np.insert(lat_long, 0, id_after_cuisine, axis=1) id_lat_long_cid = np.insert(id_lat_long, 3, kmeans.labels_, axis=1) id_lat_long_rating_cid = np.insert(id_lat_long_cid, 3, filter_cuisine[:, 8], axis=1) print(id_lat_long_rating_cid) # convert above array to dataframe columns = ['id', 'latitude', 'longitude', 'rating', 'cid'] df = pd.DataFrame(id_lat_long_rating_cid, columns=columns) #----------------------------------------------------------------------------------------------- # SORT CLUSTER ACCORDING TO CLUSTER CENTRE DISTANCES FROM USER'S LOCATION # select [[12.313, 12.375843, 24.7364],[0, 2, 1]] - [[centre distances][cluster id]] print(np.array(arr[:, 1][0])) #initialize empty dataframe sorted_cluster = pd.DataFrame() # 
sort cluster according to cluster centre distance for i in range(0, len(arr[:, 1])): # dataframe of single cluster single_cluster = df.loc[df['cid'].isin(np.array(arr[:, 1][i]).ravel())] single_cluster = single_cluster.sort_values(by='rating', ascending=False) sorted_cluster = sorted_cluster.append(single_cluster) print(sorted_cluster) #df_groupby = sorted_cluster.groupby('cid') #print len(df_groupby) #for group in df_groupby: # print group #print df_groupby.sort_values('rating', ascending=False) #print df_groupby.get_group(0) # convert dataframe to array and extract only rid sorted_cluster_rid = sorted_cluster.as_matrix(columns=None)[:, 0] # convert long datatype of rid into int return sorted_cluster_rid.astype(int) #df_groupby = df.groupby('cid') #print len(df_groupby) #for group in df_groupby: # print group #print df_groupby.sort_values('rating', ascending=False) #print df_groupby.get_group(0) #-------------------------------------------------------------------------------------------------- # featureset_all = np.delete(filter_cuisine, np.s_[2:10], axis=1) # print "CONVERT THIS ARRAY TO DATFRAMEEEEEEEEEEEEEE" # print featureset_all # #featureset_all = featureset_all[0:6,:] # featureset_X = np.delete(featureset_all, np.s_[0], axis=1) # print featureset_X # featureset_Y = np.delete(featureset_all, np.s_[1:], axis=1) # print featureset_Y # columns=['cuisine','homedelivery','smoking','alcohol','wifi', 'valetparking','rooftop'] # df = pd.DataFrame(featureset_X ,columns=columns) # print "CONVERTEDDDDDDDDDDDDDDDDDDD" # print df # cols_to_retain = ['cuisine', 'homedelivery', 'smoking', 'alcohol', 'wifi', 'valetparking', 'rooftop'] # #cols_to_retain = ['homedelivery', 'smoking', 'alcohol', 'wifi'] # feature = df[cols_to_retain].to_dict( orient = 'records' ) # print "DICTIONARYYYYYYYYY" # print feature # vec = DictVectorizer() # X = vec.fit_transform(feature).toarray() # print X # columns=['id'] # df = pd.DataFrame(featureset_Y ,columns=columns) # cols_to_retain = ['id'] # Y = df[cols_to_retain].to_dict( orient = 'records' ) # vec = DictVectorizer() # Y = vec.fit_transform(Y).toarray() # print Y # X_train, X_test, Y_train_labels, Y_test_labels = train_test_split(X, Y, test_size=0.3, random_state=100) # print "-----------Training feature---------------" # print X_train # print "------------Testing feature--------------" # print X_test # print "------------Training label--------------" # print Y_train_labels # print "-----------Testing label---------------" # print Y_test_labels # print "--------------------------" # clf_entropy = DecisionTreeClassifier(criterion = "gini", random_state = 100, max_depth=3, min_samples_leaf=5) # clf_entropy.fit(X_train, Y_train_labels) # print "Fitting done" # # Make predictions # y_pred_en = clf_entropy.predict(X_test) # print y_pred_en # # columns=['cuisine','homedelivery','smoking','alcohol','wifi', 'valetparking','rooftop'] # # df = pd.DataFrame([['Italian', 'yes', 'no', 'yes', 'no', 'no', 'no'], ['Italian', 'yes', 'no', 'yes', 'no', 'no', 'no']] ,columns=columns) # # cols_to_retain = ['cuisine', 'homedelivery', 'smoking', 'alcohol', 'wifi', 'valetparking', 'rooftop'] # # feature = df[cols_to_retain].to_dict( orient = 'records' ) # # print feature # # vec = DictVectorizer() # # user_input = vec.fit_transform(feature).toarray() # # print user_input # print clf_entropy.predict([[0. ,1. ,1. ,1. , 0., 0., 1., 0., 1., 0., 0., 1., 0.]]) print("shraddha")
from sklearn.manifold import MDS df = pd.DataFrame.from_csv('./train_lyrics_1000.csv') X_train = df['lyrics'].values names = df['title'].values count_vect = CountVectorizer() dtm = count_vect.fit_transform(X_train.ravel()) vocab = count_vect.get_feature_names() dtm = dtm.toarray() vocab = np.array(vocab) dist = euclidean_distances(dtm) dist = np.round(dist, 1) mds = MDS(n_components=2, dissimilarity="precomputed", random_state=1) pos = mds.fit_transform(dist) # shape (n_components, n_samples) xs, ys = pos[:, 0], pos[:, 1] for x, y, name in zip(xs, ys, names): color = 'skyblue' plt.scatter(x, y, c=color) plt.text(x, y, name) plt.show()
__author__ = 'zelalem'

import textmining
from math import *
import numpy as np
from scipy.spatial.distance import pdist, euclidean, squareform

doc1 = 'John and Bob are brothers.'
doc2 = 'John went to the store. The store was closed.'
doc3 = 'Bob went to the store too.'

tdm = textmining.TermDocumentMatrix()
tdm.add_doc(doc1)
tdm.add_doc(doc2)
tdm.add_doc(doc3)
tdm.write_csv('/home/zelalem/Downloads//matrix.csv', cutoff=1)

a = list(tdm.rows(cutoff=1))[1:]
x = a[0]
y = a[1]
print(a[2])

from sklearn.metrics.pairwise import euclidean_distances
X = [[0, 1], [1, 1]]
print(euclidean_distances(X, X))
def angular_distances(X, Y):
    return euclidean_distances(normalize(X), normalize(Y))
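# Relation check for angular_distances() above: for L2-normalized rows,
# ||x - y||^2 = 2 - 2*cos(x, y), so the value returned equals
# sqrt(2 * (1 - cosine_similarity)) and is monotone in the angle between x and y.
import numpy as np
from sklearn.metrics.pairwise import euclidean_distances, cosine_similarity
from sklearn.preprocessing import normalize

rng = np.random.RandomState(0)
A = rng.rand(4, 6)
B = rng.rand(3, 6)
d = angular_distances(A, B)
expected = np.sqrt(2.0 * (1.0 - cosine_similarity(A, B)))
assert np.allclose(d, expected)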
def metrics(self, epsilon=1.0, k=None): """ Calculate the metrics (correctness and coverage) for all possible pairs of groups. Epsilon defines threshold (if a mapped initial point is further away from a target point by a length > epsilon, the point is considered false. """ num_dimensions = self.original_X.shape[1] correctness = np.zeros((self.num_clusters, self.num_clusters)) coverage = np.zeros((self.num_clusters, self.num_clusters)) for initial in range(self.num_clusters): for target in range(self.num_clusters): x_init = [ self.original_X[i] for i in range(len(self.original_X)) if self.original_Y[i] == initial ] x_target = [ self.original_X[i] for i in range(len(self.original_X)) if self.original_Y[i] == target ] # Construct the explanation between the initial and target regions if initial == target: d = np.zeros((1, num_dimensions)) elif initial == 0: d = self.delta[target - 1] elif target == 0: d = -1.0 * self.delta[initial - 1] else: d = -1.0 * self.delta[initial - 1] + self.delta[target - 1] if k is not None: d = truncate(d, k) r_init = self.transformer(x_init + d) #r_target = self.transformer(x_target) r_target = [ self.latent_X[i] for i in range(len(self.latent_X)) if self.latent_Y[i] == target ] dists = euclidean_distances(r_init, Y=r_target) close_enough = 1.0 * (dists <= epsilon) if initial == target: threshold = 2.0 else: threshold = 1.0 correctness[initial, target] = np.mean( 1.0 * (np.sum(close_enough, axis=1) >= threshold)) coverage[initial, target] = np.mean( 1.0 * (np.sum(close_enough, axis=0) >= threshold)) self.correctness = correctness self.coverage = coverage return correctness, coverage
def findTPs(self): locals = self.locals model = self.supportmodel epsilon = self.options['epsilon'] R = model.R + 10**(-7) ts = {} ts['x'] = [] ts['f'] = [] ts['neighbor'] = [] ts['purturb'] = [] [N, attr] = locals.shape tmp_x = [] if model.support == 'GP': for i in range(N): for j in range(i, N): for k in range(10): x0 = locals[i] + 0.1 * (k + 1) * (locals[j] - locals[i]) sep = fsolve(func=fsolve_R_GP, x0=x0, fprime=Hess, args=model, xtol=10**(-6)) tmp_x.append(sep) tmp_x = np.array(tmp_x) [dummy, I, J] = np.unique(np.round(10 * tmp_x), axis=0, return_index=True, return_inverse=True) tmp_x = tmp_x[I, :] for i in range(list(tmp_x.shape)[0]): sep = tmp_x[i] [f, g, H] = my_R_GP2(sep, model) [D, V] = la.eig(H) ind = [] if np.sum(D < 0) == 1: sep1 = sep + epsilon * V[np.where(D < 0)[0]] sep2 = sep - epsilon * V[np.where(D < 0)[0]] if attr == 2: res1 = minimize(fun=my_R_GP1, x0=sep1, args=model, method='Nelder-Mead') [temp1, val] = [res1.x, res1.fun] res2 = minimize(fun=my_R_GP1, x0=sep2, args=model, method='Nelder-Mead') [temp2, val] = [res2.x, res2.fun] else: res1 = minimize(fun=my_R_GP1, x0=sep1, args=model, hess=True) [temp1, val] = [res1.x, res1.fun] res2 = minimize(fun=my_R_GP1, x0=sep2, args=model, hess=True) [temp2, val] = [res2.x, res2.fun] [dummy, ind1] = [ np.min( euclidean_distances(temp1.reshape(1, -1), locals)), np.argmin( euclidean_distances(temp1.reshape(1, -1), locals)) ] [dummy, ind2] = [ np.min( euclidean_distances(temp2.reshape(1, -1), locals)), np.argmin( euclidean_distances(temp2.reshape(1, -1), locals)) ] if ind1 != ind2: ts['x'].append(sep) ts['f'].append(f) ts['neighbor'].append([ind1, ind2]) ts['purturb'].append([sep1, sep2]) if model.support == 'SVDD': for i in range(N): for j in range(i, N): for k in range(10): x0 = locals[i] + 0.1 * (k + 1) * (locals[j] - locals[i]) sep = fsolve(func=fsolve_R, x0=x0, args=model, maxfev=300, xtol=10**(-6)) tmp_x.append(sep) tmp_x = np.array(tmp_x) [dummy, I, J] = np.unique(np.round(10 * tmp_x), axis=0, return_index=True, return_inverse=True) tmp_x = tmp_x[I, :] for i in range(list(tmp_x.shape)[0]): sep = tmp_x[i] [f, g, H] = my_R2(sep, model) [D, V] = la.eig(H) ind = [] if np.sum(D < 0) == 1: sep1 = sep + epsilon * V[np.where(D < 0)[0]] sep2 = sep - epsilon * V[np.where(D < 0)[0]] if attr == 2: res1 = minimize(fun=my_R1, x0=sep1, args=model, method='Nelder-Mead') [temp1, val] = [res1.x, res1.fun] res2 = minimize(fun=my_R1, x0=sep2, args=model, method='Nelder-Mead') [temp2, val] = [res2.x, res2.fun] else: res1 = minimize(fun=my_R1, x0=sep1, args=model, hess=True) [temp1, val] = [res1.x, res1.fun] res2 = minimize(fun=my_R1, x0=sep2, args=model, hess=True) [temp2, val] = [res2.x, res2.fun] [dummy, ind1] = [ np.min( euclidean_distances(temp1.reshape(1, -1), locals)), np.argmin( euclidean_distances(temp1.reshape(1, -1), locals)) ] [dummy, ind2] = [ np.min( euclidean_distances(temp2.reshape(1, -1), locals)), np.argmin( euclidean_distances(temp2.reshape(1, -1), locals)) ] if ind1 != ind2: ts['x'].append(sep) ts['f'].append(f) ts['neighbor'].append([ind1, ind2]) ts['purturb'].append([sep1, sep2]) ts['x'] = np.array(ts['x']) print(ts['x']) ts['f'] = np.array(ts['f']) ts['neighbor'] = np.array(ts['neighbor']) ts['purturb'] = np.array(ts['purturb']) self.ts = ts
def between(self, A, B):
    return euclidean_distances(A, B)
def within(self, A):
    return euclidean_distances(A, A)
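# Note on within() above: euclidean_distances(A) with a single argument computes the
# same A-to-A matrix, and the diagonal of the result is zero. Small standalone check:
import numpy as np
from sklearn.metrics.pairwise import euclidean_distances

A = np.random.RandomState(0).rand(5, 3)
assert np.allclose(euclidean_distances(A, A), euclidean_distances(A))
assert np.allclose(np.diag(euclidean_distances(A, A)), 0.0)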
y = np.array(movie) simr = pearsonr(x, y) # simmink = minkowski(x, y, 3) # simr_hybrid = pearsonr(vI, vJ) # simmink_hybrid = minkowski(vI, vJ, 5) x = x.reshape(1, -1) y = y.reshape(1, -1) # vI = vI.reshape(1, -1) # vJ = vJ.reshape(1, -1) sim = cosine_similarity([movie], [movief]) sime = euclidean_distances(x, y) # sim_hybrid = cosine_similarity(vI, vJ) # sime_hybrid = euclidean_distances(vI, vJ) q = "SELECT m.title FROM movies m JOIN trailers t on t.imdbid = m.imdbid WHERE t.id = ? AND t.best_file = 1" c = conn.cursor() c.execute(q, (key, )) title = c.fetchone() if (type(title) is tuple): # if (len(ratingsI) > 0 and len(ratingsJ) > 0): # simratings = cosine_similarity(ratingsI.reshape(1, -1), ratingsJ.reshape(1, -1)) # similarities_ratings.append((title[0], simratings)) similarities_cosine.append((title[0], sim))
def calc_distance(x, y):
    nx = np.asarray(x).reshape(1, -1)
    ny = np.asarray(y).reshape(1, -1)
    dist = euclidean_distances(nx, ny)
    return dist[0][0]
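# For two single vectors, calc_distance() above agrees with np.linalg.norm of the
# difference; a quick check (illustrative values only):
import numpy as np
from sklearn.metrics.pairwise import euclidean_distances

u = [1.0, 2.0, 3.0]
v = [4.0, 6.0, 8.0]
assert np.isclose(calc_distance(u, v), np.linalg.norm(np.asarray(u) - np.asarray(v)))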
def fillBox(self, solv, molNum, checkCollisions=False, replaceCollisions=False, applyPCBs=True, progress=None): import math, random import numpy as np from lib.chemicalGraph.molecule.solvent.Solvent import Solvent solventMolecules = Solvent(solv) #.__class__) remaining = -1 # to track removng collissions totalDim = self.getDrawer().getBoxDimension() boxmin = self.getDrawer().getCellOrigin() boxmax = [ boxmin[0] + totalDim[0], boxmin[1] + totalDim[1], boxmin[2] + totalDim[2] ] #calculate the volume for a single solvent molecule center = [0., 0., 0.] pos = solv.massCenter() solv.moveBy( [center[0] - pos[0], center[1] - pos[1], center[2] - pos[2]]) #solvDiameter = solv.diameter()+ _SEPARATION_BETWEEN_SOLVENTS_ solvDiameter = math.pow(self.getDrawer().getBoxVolume() / molNum, 1. / 3.) if solvDiameter == 0: return row = math.floor(totalDim[1] / solvDiameter) col = math.floor(totalDim[0] / solvDiameter) dep = math.floor(totalDim[2] / solvDiameter) progressMax = molNum progressCount = 0 if progress != None: progress.setLabelText("Adding solvent") progress.setRange(0, molNum - 1) progress.setValue(0) solvRadius = solvDiameter / 2 newPos = solv.massCenter() refCoor = boxmin #boxmax[0] -= solvDiameter/2. #boxmax[1] -= solvDiameter/2. #boxmax[2] -= solvDiameter/2. if progress != None: progress.setLabelText("Adding solvent") progress.setRange(progressCount, progressMax) progress.setValue(progressCount) # lopps over a sequence of adding solvent and removing collisions originalMolecules = self.getMixture().molecules() originalCoords = self.getMixture().getAtomsCoordinatesAsArray() solvRadius = solvDiameter / 2.0 + 1.5 #print "anadiendo... ", progressMax-progressCount while progressCount < progressMax: # adds solvent #random rotation for solvent atoms rx = random.uniform(0, 360) ry = random.uniform(0, 360) rz = random.uniform(0, 360) #random displacement for solvent atoms rdx = random.uniform(boxmin[0], boxmax[0]) rdy = random.uniform(boxmin[1], boxmax[1]) rdz = random.uniform(boxmin[2], boxmax[2]) #generate new molecule and assign next position mol = solv.copy() mol.rotateDeg(rx, ry, rz) newPos = np.array([[rdx, rdy, rdz]]) if checkCollisions: atomDistances = euclidean_distances(originalCoords, newPos) if np.min(atomDistances) > solvRadius: #print 'fillBox: añadiendo', progressCount, np.min(atomDistances), newPos mol.moveBy(list(newPos)[0]) #nodeName = self.addMolecule(mol, checkForInconsistentNames=False) #self.shownMolecules.show(nodeName) solventMolecules.addCoordinates(mol) else: #print 'fillBox: colision' if replaceCollisions: progressCount -= 1 progressCount += 1 if progress != None: progress.setValue(progressCount) del mol # add solvent and rename with the given name nodeName = self.addMolecule(solventMolecules, checkForInconsistentNames=True) self.shownMolecules.show(nodeName) #print "fillBox ", mol.molname(),progressMax solv.rename(solventMolecules.molname()) progressMax -= 1
def test_euclidean_distances_known_result(x_array_constr, y_array_constr):
    # Check the pairwise Euclidean distances computation on known result
    X = x_array_constr([[0]])
    Y = y_array_constr([[1], [2]])
    D = euclidean_distances(X, Y)
    assert_allclose(D, [[1., 2.]])
def estimate_doc2vec_euclidean_dist(self):
    mat = self.word2vec_model.docvecs.get_normed_vectors()
    ecl_sim = euclidean_distances(mat, mat)
    return ecl_sim
def d2(c1, vec):
    # Squared Euclidean distance between two single vectors; index into the 1x1
    # result matrix before squaring (math.pow needs a scalar, not an array).
    return math.pow(euclidean_distances([c1], [vec])[0][0], 2)
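# The same squared distance can be obtained directly with squared=True, which skips
# the square root and the explicit squaring; a small check of d2() above:
import math
import numpy as np
from sklearn.metrics.pairwise import euclidean_distances

a = [0.0, 3.0]
b = [4.0, 0.0]
assert np.isclose(d2(a, b), 25.0)
assert np.isclose(euclidean_distances([a], [b], squared=True)[0][0], 25.0)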
def test_pairwise_distances(): # Test the pairwise_distance helper function. rng = np.random.RandomState(0) # Euclidean distance should be equivalent to calling the function. X = rng.random_sample((5, 4)) S = pairwise_distances(X, metric="euclidean") S2 = euclidean_distances(X) assert_array_almost_equal(S, S2) # Euclidean distance, with Y != X. Y = rng.random_sample((2, 4)) S = pairwise_distances(X, Y, metric="euclidean") S2 = euclidean_distances(X, Y) assert_array_almost_equal(S, S2) # Test with tuples as X and Y X_tuples = tuple([tuple([v for v in row]) for row in X]) Y_tuples = tuple([tuple([v for v in row]) for row in Y]) S2 = pairwise_distances(X_tuples, Y_tuples, metric="euclidean") assert_array_almost_equal(S, S2) # "cityblock" uses scikit-learn metric, cityblock (function) is # scipy.spatial. S = pairwise_distances(X, metric="cityblock") S2 = pairwise_distances(X, metric=cityblock) assert_equal(S.shape[0], S.shape[1]) assert_equal(S.shape[0], X.shape[0]) assert_array_almost_equal(S, S2) # The manhattan metric should be equivalent to cityblock. S = pairwise_distances(X, Y, metric="manhattan") S2 = pairwise_distances(X, Y, metric=cityblock) assert_equal(S.shape[0], X.shape[0]) assert_equal(S.shape[1], Y.shape[0]) assert_array_almost_equal(S, S2) # Test cosine as a string metric versus cosine callable # The string "cosine" uses sklearn.metric, # while the function cosine is scipy.spatial S = pairwise_distances(X, Y, metric="cosine") S2 = pairwise_distances(X, Y, metric=cosine) assert_equal(S.shape[0], X.shape[0]) assert_equal(S.shape[1], Y.shape[0]) assert_array_almost_equal(S, S2) # Test with sparse X and Y, # currently only supported for Euclidean, L1 and cosine. X_sparse = csr_matrix(X) Y_sparse = csr_matrix(Y) S = pairwise_distances(X_sparse, Y_sparse, metric="euclidean") S2 = euclidean_distances(X_sparse, Y_sparse) assert_array_almost_equal(S, S2) S = pairwise_distances(X_sparse, Y_sparse, metric="cosine") S2 = cosine_distances(X_sparse, Y_sparse) assert_array_almost_equal(S, S2) S = pairwise_distances(X_sparse, Y_sparse.tocsc(), metric="manhattan") S2 = manhattan_distances(X_sparse.tobsr(), Y_sparse.tocoo()) assert_array_almost_equal(S, S2) S2 = manhattan_distances(X, Y) assert_array_almost_equal(S, S2) # Test with scipy.spatial.distance metric, with a kwd kwds = {"p": 2.0} S = pairwise_distances(X, Y, metric="minkowski", **kwds) S2 = pairwise_distances(X, Y, metric=minkowski, **kwds) assert_array_almost_equal(S, S2) # same with Y = None kwds = {"p": 2.0} S = pairwise_distances(X, metric="minkowski", **kwds) S2 = pairwise_distances(X, metric=minkowski, **kwds) assert_array_almost_equal(S, S2) # Test that scipy distance metrics throw an error if sparse matrix given assert_raises(TypeError, pairwise_distances, X_sparse, metric="minkowski") assert_raises(TypeError, pairwise_distances, X, Y_sparse, metric="minkowski") # Test that a value error is raised if the metric is unknown assert_raises(ValueError, pairwise_distances, X, Y, metric="blah")
def reachability_distance(a, b):
    distance = euclidean_distances(id_to_embedding[a].reshape(1, -1),
                                   id_to_embedding[b].reshape(1, -1))[0][0]
    return max(k_distance(b), distance)
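# reachability_distance() above follows the Local Outlier Factor definition
# reach-dist_k(a, b) = max(k-distance(b), d(a, b)). A self-contained sketch with
# hypothetical stand-ins for the module-level id_to_embedding and k_distance:
import numpy as np
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.neighbors import NearestNeighbors

rng = np.random.RandomState(0)
id_to_embedding = {i: rng.rand(4) for i in range(20)}
_X = np.vstack([id_to_embedding[i] for i in range(20)])
_nn = NearestNeighbors(n_neighbors=4).fit(_X)

def k_distance(i, k=3):
    # Distance from point i to its k-th nearest neighbor (excluding itself).
    dists, _ = _nn.kneighbors(_X[i].reshape(1, -1), n_neighbors=k + 1)
    return dists[0][-1]

print(reachability_distance(0, 1))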
# Start with a list, but to use NumPy it has to become an array
lista_X = []
# build an array of arrays
for i in X:
    lista_X.append([i])
array_lista_X = np.array([np.array(xi) for xi in lista_X])
X = array_lista_X
Y = (X) ** 2

centroides_num = 10  # number of RBF centers
# the centers are picked via random.choice, not via cross-validation
index = np.random.choice(a=n, size=centroides_num)
subsample = X[index, :]

gamma = 0.5
kernel = np.exp(-gamma * euclidean_distances(X=X, Y=subsample, squared=True))
para = np.linalg.lstsq(kernel, Y, rcond=None)[0]
predict_Y = np.dot(kernel, para)

plt.plot(X, Y, 'r', label='Original data')
plt.plot(X, predict_Y, 'b', label='After the data fit')
plt.legend()
plt.show()
def find_lines(lines_mask: np.ndarray) -> list: """ Finds the longest central line for each connected component in the given binary mask. :param lines_mask: Binary mask of the detected line-areas :return: a list of Opencv-style polygonal lines (each contour encoded as [N,1,2] elements where each tuple is (x,y) ) """ # Make sure one-pixel wide 8-connected mask lines_mask = skeletonize(lines_mask) class MakeLineMCP(MCP_Connect): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) self.connections = dict() self.scores = defaultdict(lambda: np.inf) def create_connection(self, id1, id2, pos1, pos2, cost1, cost2): k = (min(id1, id2), max(id1, id2)) s = cost1 + cost2 if self.scores[k] > s: self.connections[k] = (pos1, pos2, s) self.scores[k] = s def get_connections(self, subsample=5): results = dict() for k, (pos1, pos2, s) in self.connections.items(): path = np.concatenate( [self.traceback(pos1), self.traceback(pos2)[::-1]]) results[k] = path[::subsample] return results def goal_reached(self, int_index, float_cumcost): if float_cumcost > 0: return 2 else: return 0 if np.sum(lines_mask) == 0: return [] # Find extremities points end_points_candidates = np.stack( np.where((convolve2d(lines_mask, np.ones((3, 3)), mode='same') == 2) & lines_mask)).T connected_components = skimage_label(lines_mask, connectivity=2) # Group endpoint by connected components and keep only the two points furthest away d = defaultdict(list) for pt in end_points_candidates: d[connected_components[pt[0], pt[1]]].append(pt) end_points = [] for pts in d.values(): d = euclidean_distances(np.stack(pts), np.stack(pts)) i, j = np.unravel_index(d.argmax(), d.shape) end_points.append(pts[i]) end_points.append(pts[j]) end_points = np.stack(end_points) mcp = MakeLineMCP(~lines_mask) mcp.find_costs(end_points) connections = mcp.get_connections() # print(type(connections)) # print(connections.keys()) a = connections[(8, 9)][:, None, ::-1] print(type(a)) print(a) img = np.zeros((lines_mask.shape[0], lines_mask.shape[1], 3), dtype=np.uint8) img += 255 # for c in connections.values(): # c = c.astype(np.uint8) # print(type(c)) # print(c) res = [connections[c][:, None, ::-1] for c in connections.keys()] for c in res: cv2.polylines(img, c, isClosed=True, color=(0, 0, 255), thickness=10) # cv2.fillPoly(img, [c], (255, 0, 0)) Image.fromarray(img).show() if not np.all( np.array(sorted([i for k in connections.keys() for i in k])) == np.arange(len(end_points))): print('Warning : find_lines seems weird') return [c[:, None, ::-1] for c in connections.values()]
gt_pred = []
gt_head = []  # Ground Truth of head entity
gt_tail = []  # Ground Truth of tail entity
for line in open(os.path.join(args.output, 'test.txt'), 'r'):
    items = line.strip().split("\t")
    gt_head.append(items[1])
    gt_pred.append(items[3])
    gt_tail.append(items[4])

notmatch = list(set(range(0, total_num)).symmetric_difference(id_match))

notmatch_idx = euclidean_distances(head_emb[notmatch], entities_emb,
                                   squared=True).argsort(axis=1)

for idx, i in enumerate(notmatch):
    for j in notmatch_idx[idx, 0:40]:
        mid = mid_num_dic[j]
        head_mid_idx[i].append((mid, None))
        match_mid_list.append(mid)

correct, mid_num = 0, 0
for i, head_ids in enumerate(head_mid_idx):
    mids = set()
def closest_docs(self, point, docs, num_docs=5):
    # Return the indices of the num_docs documents closest (in Euclidean distance)
    # to each query point, nearest first.
    distances = euclidean_distances(point, docs)
    return distances.argsort()[:, :num_docs]
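A quick usage sketch (assuming closest_docs lives on some retriever-like class): the same ranking can be reproduced directly on a small random embedding matrix.

import numpy as np
from sklearn.metrics.pairwise import euclidean_distances

query = np.random.rand(1, 16)    # one query embedding
docs = np.random.rand(100, 16)   # one hundred document embeddings
order = euclidean_distances(query, docs).argsort()[:, :5]
print(order)  # indices of the 5 closest documents, nearest first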
def count(thresholded, segmented):
    # find the convex hull of the segmented hand region
    chull = cv2.convexHull(segmented)

    # find the most extreme points in the convex hull
    extreme_top = tuple(chull[chull[:, :, 1].argmin()][0])
    extreme_bottom = tuple(chull[chull[:, :, 1].argmax()][0])
    extreme_left = tuple(chull[chull[:, :, 0].argmin()][0])
    extreme_right = tuple(chull[chull[:, :, 0].argmax()][0])

    # find the center of the palm
    cX = int((extreme_left[0] + extreme_right[0]) / 2)
    cY = int((extreme_top[1] + extreme_bottom[1]) / 2)

    # find the maximum euclidean distance between the center of the palm
    # and the most extreme points of the convex hull
    distance = pairwise.euclidean_distances(
        [(cX, cY)], Y=[extreme_left, extreme_right, extreme_top, extreme_bottom])[0]
    maximum_distance = distance[distance.argmax()]

    # calculate the radius of the circle as 50% of the max euclidean distance obtained
    radius = int(0.5 * maximum_distance)

    # find the circumference of the circle
    circumference = (2 * np.pi * radius)

    # take out the circular region of interest which has the palm and the fingers
    circular_roi = np.zeros(thresholded.shape[:2], dtype="uint8")

    # draw the circular ROI
    cv2.circle(circular_roi, (cX, cY), radius, 255, 1)

    # take a bit-wise AND between the thresholded hand and the circular ROI as the mask,
    # which gives the cuts obtained using the mask on the thresholded hand image
    circular_roi = cv2.bitwise_and(thresholded, thresholded, mask=circular_roi)

    # compute the contours in the circular ROI
    (cnts, _) = cv2.findContours(circular_roi.copy(), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_NONE)

    # initialize the finger count
    count = 0

    # loop through the contours found
    for c in cnts:
        # compute the bounding box of the contour
        (x, y, w, h) = cv2.boundingRect(c)

        # count the contour as a finger only if it is not the wrist region and
        # it does not cover more than 25% of the circumference
        if ((cY + (cY * 0.25)) > (y + h)) and ((circumference * 0.25) > c.shape[0]):
            count += 1

    return count
def test_bd_test(self):
    x = [1, 2, 3, 4, 5]
    y = [1, 2, 3, 4, 5]
    bd_value = bd_test(x, y)
    bd_value = bd_value[0]
    self.assertAlmostEqual(bd_value, 0.0)

    np.random.seed(7654567)
    x = np.random.normal(0, 1, 50)
    y = np.random.normal(1, 1, 50)
    bd_value = bd_test(x, y)
    bd_value = bd_value[0]
    self.assertAlmostEqual(bd_value, 0.196408479999)

    x = np.random.normal(0, 1, 100).reshape(50, 2)
    y = np.random.normal(3, 1, 100).reshape(50, 2)
    bd_value = bd_test(x, y)
    bd_value = bd_value[0]
    self.assertAlmostEqual(bd_value, 0.5681075200000011)

    x = np.random.normal(0, 1, 100).reshape(50, 2)
    y = np.random.normal(10, 1, 100).reshape(50, 2)
    z = np.random.normal(100, 1, 100).reshape(50, 2)
    bd_value = bd_test(x, y, z)
    bd_value = bd_value[0]
    self.assertAlmostEqual(bd_value, 2.0604000000000022)

    bd_value = bd_test(x, y, z, weight="max")
    bd_value = bd_value[0]
    self.assertAlmostEqual(bd_value, 1.3736000000000015)

    n = 90
    x = np.random.normal(0, 1, n)
    bd_value = bd_test(x, size=np.array([40, 50]))
    bd_value = bd_value[0]
    self.assertAlmostEqual(bd_value, 0.009086599999999997)

    x = [np.random.normal(0, 1, num) for num in [40, 50]]
    x = np.hstack(x)
    bd_value = bd_test(x, [40, 50])
    bd_value = bd_value[0]
    self.assertAlmostEqual(bd_value, 0.9639094650205713)

    x = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
    x = np.array(x, dtype=np.double)
    bd_value = bd_test(x, size=np.array([5, 5]))
    bd_value = bd_value[0]
    self.assertAlmostEqual(bd_value, 0.7231999999999997)

    x = [[1, 2, 3, 4, 5], [6, 7, 8, 9, 10], [11, 12, 13, 14, 15]]
    bd_value = bd_test(x)
    bd_value = bd_value[0]
    self.assertAlmostEqual(bd_value, 2.403199999999999)

    from sklearn.metrics.pairwise import euclidean_distances
    sigma = [[1, 0], [0, 1]]
    x = np.random.multivariate_normal(mean=[0, 0], cov=sigma, size=50)
    y = np.random.multivariate_normal(mean=[1, 1], cov=sigma, size=50)
    x = np.row_stack((x, y))
    dx = euclidean_distances(x, x)
    data_size = [50, 50]
    bd_value = bd_test(dx, size=data_size, dst=True)
    bd_value = bd_value[0]
    self.assertAlmostEqual(bd_value, 0.10779759999999977)
def hierarchicalLabelTSVC(self):
    print("hierarchicalLabelTSVC")
    nOfLocals = self.locals.shape[0]
    ts = self.ts
    nOfTS = len(ts['f'])
    K = self.options['K']
    local_clusters_assignments = []
    levels = []  # the cutting-level indices for which an assignment was computed
    f_sort = np.sort(ts['f'], 0)  # small --> large
    print("f_sort:", f_sort)
    adjacent = np.zeros([nOfLocals, nOfLocals, nOfTS])
    a = []
    tmp_ts = {}  # candidate transition-point sets, one entry per cutting level
    flag = 0
    for m in range(nOfTS):
        # cutting level: large --> small (small number of clusters --> large number of clusters)
        cur_f = f_sort[-m - 1]
        # cur_f = f_sort[m]  # cutting level: small --> large (large number of clusters --> small)
        tmp = np.nonzero(ts['f'] < cur_f)[0]
        if len(tmp) > 0:
            # TSs inside the sphere
            for j in range(len(tmp)):
                adjacent[ts['neighbor'][tmp[j], 0], ts['neighbor'][tmp[j], 1], m] = 1
                adjacent[ts['neighbor'][tmp[j], 1], ts['neighbor'][tmp[j], 0], m] = 1
            # Connect nodes which can be connected via directly connected edges.
            for i in range(nOfLocals):
                for j in range(i):
                    if adjacent[i, j, m] == 1:
                        adjacent[i, :, m] = np.logical_or(adjacent[i, :, m], adjacent[j, :, m])
                adjacent[i, i] = 1
            a.append(cur_f)

            my_ts = {}
            my_ts['x'] = ts['x'][tmp, :]
            my_ts['f'] = ts['f'][tmp, :]
            my_ts['purturb'] = ts['purturb'][tmp, :]
            my_ts['neighbor'] = ts['neighbor'][tmp, :]
            my_ts['cuttingLevel'] = cur_f
            ind = np.nonzero(ts['f'] == cur_f)[0]
            my_ts['levelx'] = ts['x'][ind[0], :]
            tmp_ts[m] = my_ts

            assignment = cg.connected_components(adjacent[:, :, m])[1]
            print("assignment:", assignment)
            print("N_clusters:", np.max(assignment) + 1)
            if np.max(assignment) == K - 1:
                print('We can find the number of K clusters')
                # clstmodel update
                self.out_ts = tmp_ts[m]
                # cluster assignment of the entire data points
                self.local_ass = assignment
                self.cluster_labels = self.local_ass[self.match_local].T
                flag = 1
                break
            local_clusters_assignments.append(assignment)
            levels.append(m)

    # cannot find exactly K clusters
    if flag == 0:
        print('Cannot find cluster assignments with K clusters; instead we use the '
              'cluster assignment with the number of clusters nearest to K!')
        n_clusters_per_level = np.array([np.max(assgn) + 1 for assgn in local_clusters_assignments])
        best = int(np.argmin(np.abs(n_clusters_per_level - K)))
        self.out_ts = tmp_ts[levels[best]]
        self.local_ass = local_clusters_assignments[best]
        self.cluster_labels = self.local_ass[self.match_local]
    print(self.cluster_labels)
def _labels_inertia_precompute_dense(X, x_squared_norms, centers, distances):
    """Compute labels and inertia using a full distance matrix.

    This will overwrite the 'distances' array in-place.

    Parameters
    ----------
    X : numpy array, shape (n_samples, n_features)
        Input data.

    x_squared_norms : numpy array, shape (n_samples,)
        Precomputed squared norms of X.

    centers : numpy array, shape (n_clusters, n_features)
        Cluster centers which data is assigned to.

    distances : numpy array, shape (n_samples,)
        Pre-allocated array in which distances are stored.

    Returns
    -------
    labels : numpy array, dtype=np.int32, shape (n_samples,)
        Indices of clusters that samples are assigned to.

    inertia : float
        Sum of distances of samples to their closest cluster center.
    """
    n_samples = X.shape[0]
    k = centers.shape[0]
    all_distances = euclidean_distances(centers, X, x_squared_norms, squared=True)
    labels = np.empty(n_samples, dtype=np.int32)
    labels.fill(-1)
    mindist = np.empty(n_samples)
    mindist.fill(np.infty)

    max_cluster_size = get_clusters_size(n_samples, k)
    labels, mindist = initial_assignment(labels, mindist, n_samples, all_distances,
                                         max_cluster_size)
    all_points = np.arange(n_samples)

    for point in all_points:
        for point_dist in get_best_point_distances(point, all_distances):
            cluster_id, point_dist = point_dist
            # initial assignment: take the closest cluster that still has room
            if not is_cluster_full(cluster_id, max_cluster_size, labels):
                labels[point] = cluster_id
                mindist[point] = point_dist
                break

    # refinement of the clustering
    transfer_list = []
    best_mindist = mindist.copy()
    best_labels = labels.copy()

    # sort all of the points from largest distance to smallest
    points_by_high_distance = np.argsort(mindist)[::-1]
    for point in points_by_high_distance:
        point_cluster = labels[point]

        # see if there is an opening on the best cluster for this point
        cluster_id, point_dist = get_best_cluster_for_point(point, all_distances)
        if not is_cluster_full(cluster_id, max_cluster_size, labels) and point_cluster != cluster_id:
            labels[point] = cluster_id
            mindist[point] = point_dist
            best_labels = labels.copy()
            best_mindist = mindist.copy()
            continue  # on to the next point

        for swap_candidate in transfer_list:
            cand_cluster = labels[swap_candidate]
            if point_cluster != cand_cluster:
                # get the current dist of the swap candidate
                cand_distance = mindist[swap_candidate]
                # get the potential dist of the point
                point_distance = all_distances[cand_cluster, point]

                # compare
                if point_distance < cand_distance:
                    labels[point] = cand_cluster
                    mindist[point] = all_distances[cand_cluster, point]
                    labels[swap_candidate] = point_cluster
                    mindist[swap_candidate] = all_distances[point_cluster, swap_candidate]

                    if np.absolute(mindist).sum() < np.absolute(best_mindist).sum():
                        # update the labels since the transfer was a success
                        best_labels = labels.copy()
                        best_mindist = mindist.copy()
                        break
                    else:
                        # reset since the transfer was not a success
                        labels = best_labels.copy()
                        mindist = best_mindist.copy()

        transfer_list.append(point)

    if n_samples == distances.shape[0]:
        # distances will be changed in-place
        distances[:] = mindist
    inertia = best_mindist.sum()
    return best_labels, inertia
def calculate_distance(cent, player_values):
    # Euclidean distance between a single centroid and a single row of player features.
    dist = euclidean_distances(cent, player_values)
    return dist[0][0]
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--threshold', type=float, default=0.9)
    parser.add_argument('--delete', action='store_true', help='Delete the outliers.')
    parser.add_argument('--doSubDirs', action='store_true', help='Work on all direct subdirectories.')
    parser.add_argument('--duplicates', action='store_true', help='Identify duplicates rather than outliers.')
    parser.add_argument('--checkDirs', action='store_true',
                        help='Check directories for high variance, indicating previous clean-up has not worked well.')
    parser.add_argument('directory')
    args = parser.parse_args()

    if args.duplicates and args.checkDirs:
        sys.exit("Combination of --duplicates and --checkDirs is not allowed.")
    if not os.path.isdir(args.directory):
        sys.exit("Input directory not found.")

    if args.doSubDirs:
        d = next(os.walk(args.directory))[1]
    else:
        d = [args.directory]

    for thisdir in d:
        print("=== {} ===".format(os.path.join(args.directory, thisdir)))
        outliers = []
        duplicates = []
        featList = list()
        allfiles = os.listdir(os.path.join(args.directory, thisdir))
        allfilesFaces = list()
        for thisfile in allfiles:
            if thisfile.endswith(".vgg1"):
                with open(os.path.join(args.directory, thisdir, thisfile), 'rb') as f:
                    reader = csv.reader(f)
                    for row in reader:
                        featList.append(row)
                        # multiple faces may come from a single image file
                        allfilesFaces.append(os.path.join(args.directory, thisdir, thisfile))
        thisEmbeddings = np.vstack(featList)
        thisEmbeddings = thisEmbeddings.astype(float)

        if args.duplicates:
            for p1 in range(0, thisEmbeddings.shape[0]):
                for p2 in range(p1 + 1, thisEmbeddings.shape[0]):
                    dist = euclidean_distances(thisEmbeddings[p1].reshape(1, -1),
                                               thisEmbeddings[p2].reshape(1, -1))
                    if dist < args.threshold:
                        duplicates.append((allfilesFaces[p1], allfilesFaces[p2], dist))
            print("Found {} duplicate pairs from {} images.".format(len(duplicates), len(allfiles)))
            for p1, p2, dist in duplicates:
                print("{} - {}: {:0.4f}".format(p1, p2, dist[0][0]))
                if args.delete:
                    try:
                        os.remove(p2)
                    except OSError:
                        # might already be removed if 3 or more are identical
                        print("could not remove: ", p2)
        elif args.checkDirs:
            std = np.std(thisEmbeddings, axis=0)
            # mean = np.mean(thisEmbeddings, axis=0)
            dists = euclidean_distances(thisEmbeddings, thisEmbeddings)
            o = np.std(dists)
            # A small reduction of std in the cleaned-up version after outlier removal could be a hint,
            # but could also indicate a perfect start, and would need keeping both directories.
            # std < 0.2 means probably mostly images of one person, OK
            # std > 0.25 means probably images of two or more persons, not OK
            # std between 0.2 and 0.25 is a bit unclear: either a very varied face, young to old, or multiple persons
            print(o)
        else:
            mean = np.mean(thisEmbeddings, axis=0)
            dists = euclidean_distances(thisEmbeddings, mean.reshape(1, -1))
            for path, dist in zip(allfilesFaces, dists):
                dist = dist.take(0)
                if dist > args.threshold:
                    outliers.append((path, dist))
            print("Found {} outlier(s) from {} images.".format(len(outliers), len(allfiles)))
            for path, dist in outliers:
                print(" + {} ({:0.2f})".format(path, dist))
                if args.delete:
                    try:
                        os.remove(path)
                    except OSError:
                        # ignore files that cannot be removed
                        pass
def _transform(self, X):
    """Guts of the transform method; no input validation."""
    return euclidean_distances(X, self.cluster_centers_)
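A brief sketch of what the one-liner above returns: one column of distances per cluster center, i.e. the cluster-distance space that KMeans.transform exposes. Shapes are illustrative.

import numpy as np
from sklearn.metrics.pairwise import euclidean_distances

X = np.random.rand(6, 4)                 # six samples, four features
cluster_centers_ = np.random.rand(3, 4)  # three centers
features = euclidean_distances(X, cluster_centers_)
print(features.shape)  # (6, 3): distance of each sample to each center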
def _kmedoids_run(X, n_clusters, max_iter, tolerance):
    '''
    Main function for running the k-medoids clustering.
    -------------
    X: the input data ndarray for k-medoids clustering, (#samples, #features)
    n_clusters: number of clusters
    max_iter: maximum number of iterations
    tolerance: the tolerance to stop the iterations, as a fraction;
               i.e. if tolerance=0.01, the iteration stops once the cost
               function decreases by less than 1%.
    '''
    n_samples = len(X)

    '''Calculate the pairwise Euclidean distances'''
    dist_mat = euclidean_distances(X)

    '''Initialize the medoids'''
    currentMedoids = np.asarray(_get_init_centers(n_clusters, n_samples))

    '''Calculate the total cost of the initial medoids'''
    costs_iters = []
    dist_meds = dist_mat[currentMedoids]
    tot_cos = _get_cost(dist_meds, currentMedoids)
    costs_iters.append(tot_cos)

    cc = 0
    for i in range(max_iter):
        dist_meds = dist_mat[currentMedoids]

        '''Associate each data point to the closest medoid and calculate the total cost'''
        tot_cos = _get_cost(dist_meds, currentMedoids)

        '''Get candidate new medoids'''
        newMedoids = []
        for j in range(n_clusters):
            o = np.random.choice(n_samples)
            if (not o in currentMedoids and not o in newMedoids):
                newMedoids.append(o)
        newMedoids = np.asarray(newMedoids).astype(int)
        dist_meds_ = dist_mat[newMedoids]
        tot_cos_ = _get_cost(dist_meds_, newMedoids)

        '''Swap the new medoids with the current medoids if the cost decreases'''
        if (tot_cos_ - tot_cos) < 0:
            currentMedoids = newMedoids
            costs_iters.append(tot_cos_)
            cc += 1
            if abs(costs_iters[cc] / costs_iters[cc - 1] - 1) < tolerance:
                '''Associate data points to the final medoids (reached by tolerance)'''
                clsts_membr_ids = []
                dis_min = np.min(dist_meds_, axis=0)
                for k in range(n_clusters):
                    clst_mem_ids = np.where(dist_meds_[k] == dis_min)[0]
                    clsts_membr_ids.append(clst_mem_ids)
                return currentMedoids, clsts_membr_ids, costs_iters

    costs_iters = np.asarray(costs_iters)

    '''Associate data points to the final medoids (reached by maximum iters)'''
    clsts_membr_ids = []
    dist_meds = dist_mat[currentMedoids]
    dis_min = np.min(dist_meds, axis=0)
    for k in range(n_clusters):
        clst_mem_ids = np.where(dist_meds[k] == dis_min)[0]
        clsts_membr_ids.append(clst_mem_ids)
    return currentMedoids, clsts_membr_ids, costs_iters
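A small sketch of the assignment step used above: given medoid indices, rows of the precomputed distance matrix give each point's distance to every medoid, and the column-wise minimum identifies the owning medoid. The snippet's helpers (_get_init_centers, _get_cost) are assumed to be defined elsewhere and are not needed here.

import numpy as np
from sklearn.metrics.pairwise import euclidean_distances

rng = np.random.RandomState(0)
X = rng.rand(30, 2)
dist_mat = euclidean_distances(X)          # (30, 30) pairwise distances
medoids = np.array([3, 17, 25])            # example medoid indices
dist_meds = dist_mat[medoids]              # (3, 30) point-to-medoid distances
membership = dist_meds.argmin(axis=0)      # index of the closest medoid per point
total_cost = dist_meds.min(axis=0).sum()   # cost of this medoid configuration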
def _predict(self, X_predict):
    """
    Auxiliary function to do the kNN prediction based on an approximated geodesic metric,
    while possibly intersecting each new sample with a Euclidean ball of radius
    self.ball_radius first.

    Parameters
    ------------
    X_predict: np.array of size (D) or of size (N, D)
        Test points at which a prediction is done.

    Returns
    -------------
    An np.array of size (N, len(self.n_neighbors)) containing, in row n, the predictions
    for the n-th test point for all desired choices of n_neighbors.
    """
    # Handle only the case where n_neighbors is a list here
    if isinstance(self.n_neighbors, int):
        tmp_n_neighbors = [self.n_neighbors]
    else:
        tmp_n_neighbors = self.n_neighbors
    if len(X_predict.shape) == 1:
        # Single sample case
        tmp_X_predict = np.reshape(X_predict, (1, -1))
    else:
        tmp_X_predict = X_predict
    n_test_samples = tmp_X_predict.shape[0]
    if self.ball_radius is None:
        ball_radius = 1e16
    else:
        ball_radius = self.ball_radius

    # Container
    prediction = np.zeros((n_test_samples, len(tmp_n_neighbors)))

    # Boolean matrix with True in (i, j) if training sample X_j is inside the
    # Euclidean ball around test sample i
    inside_euclidean_ball = np.less(
        euclidean_distances(tmp_X_predict, self.X_), ball_radius).astype('bool')

    # Get training samples belonging to a certain level set
    assignment = [[] for _ in range(self.n_levelsets)]
    for j in range(self.n_levelsets):
        assignment[j] = np.where(self.labels_ == j)[0]

    # Track the minimum number of usable neighbors over all test points
    # (copy so that tmp_n_neighbors is not modified in place)
    min_idx = list(tmp_n_neighbors)
    for k in range(n_test_samples):
        distances = 100.0 * np.ones(self.N)
        for i in range(self.n_levelsets):
            # Find indices that are inside the Euclidean ball
            idx = np.where(inside_euclidean_ball[k, assignment[i]])[0]
            PX_predict = self.tangents_[i, :].dot(tmp_X_predict[k, :].T)
            # Set distances inside the level set and the Euclidean ball
            distances[assignment[i][idx]] = np.abs(self.PX_[assignment[i][idx]] - PX_predict)
        for l, nNei in enumerate(tmp_n_neighbors):
            idx_for_pred = np.argpartition(distances, tmp_n_neighbors[l])[:tmp_n_neighbors[l]]
            idx_below_bound = np.where(distances[idx_for_pred] < 90.0)[0]
            if len(idx_below_bound) < min_idx[l]:
                min_idx[l] = len(idx_below_bound)
            if len(idx_for_pred[idx_below_bound]) == 0:
                raise RuntimeError("kNN Prediction: No neighbours satisfy the requirements.")
            prediction[k, l] = np.mean(self.Y_[idx_for_pred[idx_below_bound]])
    if any(np.array(min_idx) < np.array(tmp_n_neighbors)):
        print("Could use only {0} samples for some predictions".format(min_idx))
    return prediction