def test_init_corr(self, other, T=5e-3, outlierprior=1e-1, outlierfrac=1e-2, outliercutoff=1e-2):
    import scipy.spatial.distance as ssd
    import sys
    self.transform_points()
    other.transform_points()
    init_prob_nm(self.pt_ptrs, other.pt_ptrs, self.pt_w_ptrs, other.pt_w_ptrs,
                 self.dims_gpu, other.dims_gpu, self.N, outlierprior, outlierfrac, T,
                 self.corr_cm_ptrs, self.corr_rm_ptrs)
    gpu_corr_rm = self.corr_rm[0].get()
    gpu_corr_rm = gpu_corr_rm.flatten()[:(self.dims[0] + 1) * (other.dims[0] + 1)]\
                             .reshape(self.dims[0] + 1, other.dims[0] + 1)
    s_pt_w = self.pts_w[0].get()
    s_pt = self.pts[0].get()
    o_pt_w = other.pts_w[0].get()
    o_pt = other.pts[0].get()
    d1 = ssd.cdist(s_pt_w, o_pt, 'euclidean')
    d2 = ssd.cdist(s_pt, o_pt_w, 'euclidean')
    p_nm = np.exp(-(d1 + d2) / (2 * T))
    for i in range(self.dims[0]):
        for j in range(other.dims[0]):
            if abs(p_nm[i, j] - gpu_corr_rm[i, j]) > 1e-7:
                print "INIT CORR MATRICES DIFFERENT"
                print i, j, p_nm[i, j], gpu_corr_rm[i, j]
                ipy.embed()
                sys.exit(1)
def test_pairwise_distances_data_derived_params(n_jobs, metric, dist_function, y_is_x):
    # check that pairwise_distances give the same result in sequential and
    # parallel, when metric has data-derived parameters.
    with config_context(working_memory=1):  # to have more than 1 chunk
        rng = np.random.RandomState(0)
        X = rng.random_sample((1000, 10))

        if y_is_x:
            Y = X
            expected_dist_default_params = squareform(pdist(X, metric=metric))
            if metric == "seuclidean":
                params = {'V': np.var(X, axis=0, ddof=1)}
            else:
                params = {'VI': np.linalg.inv(np.cov(X.T)).T}
        else:
            Y = rng.random_sample((1000, 10))
            expected_dist_default_params = cdist(X, Y, metric=metric)
            if metric == "seuclidean":
                params = {'V': np.var(np.vstack([X, Y]), axis=0, ddof=1)}
            else:
                params = {'VI': np.linalg.inv(np.cov(np.vstack([X, Y]).T)).T}

        expected_dist_explicit_params = cdist(X, Y, metric=metric, **params)
        dist = np.vstack(tuple(dist_function(X, Y, metric=metric, n_jobs=n_jobs)))

        assert_allclose(dist, expected_dist_explicit_params)
        assert_allclose(dist, expected_dist_default_params)
def ch(X, cIDX, distance="euclidean"):
    # Calinski-Harabasz index: ratio of between-cluster to within-cluster dispersion
    Nclusters = cIDX.max() + 1
    Npoints = len(X)

    # Number of points in each cluster
    n = np.ndarray(shape=(Nclusters), dtype=float)
    j = 0
    for i in range(cIDX.min(), cIDX.max() + 1):
        aux = np.asarray([float(b) for b in (cIDX == i)])
        n[j] = aux.sum()
        j = j + 1

    # Clusters (a plain list avoids building a ragged np.array)
    A = [X[np.where(cIDX == i)] for i in range(Nclusters)]
    # Centroids
    v = np.array([np.sum(Ai, axis=0) / float(Ai.shape[0]) for Ai in A])

    # Between-cluster sum of squares
    ssb = 0
    for i in range(Nclusters):
        ssb = n[i] * (cdist([v[i]], [np.mean(X, axis=0)], metric=distance)[0][0] ** 2) + ssb

    # Within-cluster sum of squares
    z = np.ndarray(shape=(Nclusters), dtype=float)
    for i in range(cIDX.min(), cIDX.max() + 1):
        aux = np.array([(cdist([x], [v[i]], metric=distance)[0][0] ** 2)
                        for x in X[cIDX == i]])
        z[i] = aux.sum()
    ssw = z.sum()

    return (ssb / (Nclusters - 1)) / (ssw / (Npoints - Nclusters))
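# A hedged usage sketch for ch() above: with the default euclidean metric the value
# should agree with scikit-learn's calinski_harabasz_score, since ch() computes the
# same Calinski-Harabasz ratio. The data, cluster count and the sklearn cross-check
# are illustrative assumptions, not part of the original code.
import numpy as np
from scipy.spatial.distance import cdist     # ch() expects cdist and np in scope
from sklearn.cluster import KMeans
from sklearn.metrics import calinski_harabasz_score

X_demo = np.random.RandomState(0).rand(60, 2)
labels_demo = KMeans(n_clusters=3, n_init=10, random_state=0).fit_predict(X_demo)
print(ch(X_demo, labels_demo))                        # index from the function above
print(calinski_harabasz_score(X_demo, labels_demo))   # reference value, should match closely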
def bandwidth(self, X):
    """
    Estimate bandwidth

    TODO Replace this with a method which treats the data like a distorted
    doughnut by estimating the limit cycle, and integrating an ellipsoid swept
    along the trajectory of the limit cycle with appropriate lengths
    """
    N = X.shape[0]
    D = X.shape[1]
    points_in_cluster = N / float(self.Ncl)  # Wanted points in a cluster

    debugLog(self, "Estimating bandwidth")

    # Sample points from an n-box for numerical integration
    S = 10 * (6 ** D)
    # Grab subsamples of points to look at
    idx = randint(0, N, (S))
    v = X[idx]
    # Sample points
    y = (X.max() - X.min()) * randn(S, D) + X.min()
    # Find how close together points in our subsample typically are
    w = std(cdist(X[list(set(arange(N)).difference(idx))], v).min(0))
    # Count points in our box that are approximately this close
    c = sum(cdist(X, y).min(0) < self.sf * w)
    # Compute volume from length with sphere prefactor
    V = nSphereVolume(D) * ((c / float(S)) * (X.max() - X.min()) ** D)
    # Calculate bandwidth from the estimated volume
    return ((V * points_in_cluster) / N) ** (1.0 / D)
def cdist_sparse( X, Y, **kwargs ):
    """ -> |X| x |Y| cdist array, any cdist metric
        X or Y may be sparse -- best csr
    """
    # todense row at a time, v slow if both v sparse
    sxy = 2*issparse(X) + issparse(Y)
    if sxy == 0:
        if kwargs.get("metric") == "cosine":
            return 1 - cdist( X, Y, **kwargs )
        else:
            return cdist( X, Y, **kwargs )
    d = np.empty( (X.shape[0], Y.shape[0]), np.float64 )
    if sxy == 2:
        for j, x in enumerate(X):
            d[j] = cdist( x.todense(), Y, **kwargs )[0]
    elif sxy == 1:
        for k, y in enumerate(Y):
            # column of distances from every row of X to this row of Y
            d[:,k] = cdist( X, y.todense(), **kwargs )[:,0]
    else:
        for j, x in enumerate(X):
            for k, y in enumerate(Y):
                d[j,k] = cdist( x.todense(), y.todense(), **kwargs )[0]
    if kwargs.get("metric") == "cosine":
        return 1 - d
    else:
        return d
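# A hedged usage sketch for cdist_sparse() above, mixing a dense array with a CSR
# matrix; the shapes, seeds and metric are illustrative assumptions.
import numpy as np
from scipy.sparse import csr_matrix, issparse
from scipy.spatial.distance import cdist

A = np.random.RandomState(1).rand(4, 3)
B = csr_matrix(np.random.RandomState(2).rand(5, 3))
S = cdist_sparse(A, B, metric="cosine")   # 4 x 5 array; with 'cosine' the values are similarities (1 - distance)
print(S.shape)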
def sim_compute(word, dis_type, topk=100):
    # Get the index of word and the corresponding vector
    try:
        index = word2idx[word]
        wordvec = myembed[index, :].reshape(1, -1)
    except KeyError:
        print "Word %s is not present in Vocabulary" % sys.exc_value
        return

    # For cosine and correlation the similarity is 1 - distance
    # For the others just negate the distance by multiplying with -1
    if (dis_type == 'cosine' or dis_type == 'correlation'):
        sim = 1 - cdist(wordvec, myembed, dis_type)
    else:
        sim = -1 * cdist(wordvec, myembed, dis_type)

    # Now reshape sim into the shape we need, i.e. from (1, N) to (N,)
    final = sim[0].T
    zipped = zip(range(len(final)), final)
    del zipped[index]
    zipped.sort(key=lambda t: t[1], reverse=True)
    return zipped
def covSEisoU(hyp=None, x=None, z=None, der=None):
    # Squared Exponential covariance function with isotropic distance measure with
    # unit magnitude. The covariance function is parameterized as:
    #
    # k(x^p,x^q) = exp( -(x^p - x^q)' * inv(P) * (x^p - x^q) / 2 )
    #
    # where the P matrix is ell^2 times the unit matrix.
    #
    # The hyperparameters of the function are:
    #
    # hyp = [ log(ell) ]

    if hyp == None:                 # report number of parameters
        return [1]

    ell = np.exp(hyp[0])            # characteristic length scale
    n, D = x.shape

    if z == 'diag':
        A = np.zeros((n,1))
    elif z == None:
        A = spdist.cdist(x/ell, x/ell, 'sqeuclidean')
    else:                           # compute covariance between data sets x and z
        A = spdist.cdist(x/ell, z/ell, 'sqeuclidean')   # self covariances

    if der == None:                 # compute covariance matix for dataset x
        A = np.exp(-0.5*A)
    else:
        if der == 0:                # compute derivative matrix wrt 1st parameter
            A = np.exp(-0.5*A) * A
        else:
            raise Exception("Wrong derivative index in covSEisoU")
    return A
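# A hedged usage sketch for covSEisoU() above: evaluate the kernel on a tiny 1-D
# dataset and check one entry against k(x,x') = exp(-(x-x')^2 / (2*ell^2)).
# The inputs and hyperparameter value are illustrative assumptions.
import numpy as np
import scipy.spatial.distance as spdist   # covSEisoU() expects spdist and np in scope

x_demo = np.array([[0.0], [1.0], [2.5]])
hyp_demo = [np.log(0.7)]                          # hyp = [ log(ell) ], i.e. ell = 0.7
K = covSEisoU(hyp=hyp_demo, x=x_demo)             # 3 x 3 covariance matrix
print(K[0, 1], np.exp(-0.5 * (1.0 / 0.7) ** 2))   # the two numbers should agree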
def K_SE(xs, ys=None, l=1, deriv=False, wrt='l'):
    l = asarray(l)
    sig = 1  #l[0]
    #l = l[1:]
    xs = ascolumn(xs)
    if ys is None:
        d = squareform(pdist(xs/l, 'sqeuclidean'))
    else:
        ys = ascolumn(ys)
        d = cdist(xs/l, ys/l, 'sqeuclidean')
    cov = exp(-d/2)
    if not deriv:
        return sig * cov

    grads = []
    if wrt == 'l':
        #grads.append(cov)  # grad of sig
        for i in xrange(shape(xs)[1]):
            if ys is None:
                grad = sig * cov * squareform(pdist(ascolumn(xs[:,i]), 'sqeuclidean'))
            else:
                grad = sig * cov * cdist(ascolumn(xs[:,i]), ascolumn(ys[:,i]), 'sqeuclidean')
            grad /= l[i] ** 3
            grads.append(grad)
        return sig * cov, grads
    elif wrt == 'y':
        if shape(xs)[0] != 1:
            print '*** x not a row vector ***'
        jac = sig * cov * ((ys - xs) / l**2).T
        return sig * cov, jac
def computeScipySimilarity(Xs1, Xs2, sparse=False):
    Xall_new = np.zeros((Xs1.shape[0], 4))
    if sparse:
        print Xs1.shape
        print Xs2.shape
        Xs1 = np.asarray(Xs1.todense())
        Xs2 = np.asarray(Xs2.todense())
    for i, (a, b) in enumerate(zip(Xs1, Xs2)):
        a = a.reshape(-1, a.shape[0])
        b = b.reshape(-1, b.shape[0])
        #print a.shape
        #print type(a)
        dist = cdist(a, b, 'cosine')
        Xall_new[i, 0] = dist
        #Xall_new[i,3] = dist
        dist = cdist(a, b, 'cityblock')
        Xall_new[i, 1] = dist
        dist = cdist(a, b, 'hamming')
        Xall_new[i, 2] = dist
        dist = cdist(a, b, 'euclidean')
        Xall_new[i, 3] = dist
    Xall_new = pd.DataFrame(Xall_new, columns=['cosine', 'cityblock', 'hamming', 'euclidean'])
    print "NA:", Xall_new.isnull().values.sum()
    Xall_new = Xall_new.fillna(0.0)
    print "NA:", Xall_new.isnull().values.sum()
    print Xall_new.corr(method='spearman')
    return Xall_new
def fine_tune_transform(feature1, feature2, init_pair_idx):
    ind = []
    k = 1
    while len(ind) < 0.6 * min(len(feature1["pts"]), len(feature2["pts"])) and k < 10:
        # Step 1. Randomly choose 20 points evenly distributed on the image
        rand_pts = np.random.rand(20, 2) * (np.amax(feature1["pts"], axis=0) -
                                            np.amin(feature1["pts"], axis=0)) * \
                   np.array([1, 0.8]) + np.amin(feature1["pts"], axis=0)
        # Step 2. Find nearest points from feature1
        dist_mat = spd.cdist(rand_pts, feature1["pts"][init_pair_idx[:, 0]])
        tmp_ind = np.argmin(dist_mat, axis=1)
        # Step 3. Use these points to find a homography
        tf = cv2.findHomography(feature1["pts"][init_pair_idx[tmp_ind, 0]],
                                feature2["pts"][init_pair_idx[tmp_ind, 1]],
                                method=cv2.RANSAC, ransacReprojThreshold=5)
        # Then use the transform to find more matched points
        pts12 = cv2.perspectiveTransform(np.array([[p] for p in feature1["pts"]],
                                                  dtype="float32"), tf[0])[:, 0, :]
        dist_mat = spd.cdist(pts12, feature2["pts"])
        num1, num2 = dist_mat.shape
        idx12 = np.argsort(dist_mat, axis=1)
        tmp_ind = np.argwhere(np.array([dist_mat[i, idx12[i, 0]] for i in range(num1)]) < 5)
        if len(tmp_ind) > len(ind):
            ind = tmp_ind
        logging.debug("len(ind) = %d, len(feature) = %d", len(ind),
                      min(len(feature1["pts"]), len(feature2["pts"])))
        k += 1

    pair_idx = np.hstack((ind, idx12[ind, 0]))
    tf = cv2.findHomography(feature1["pts"][pair_idx[:, 0]], feature2["pts"][pair_idx[:, 1]],
                            method=cv2.RANSAC, ransacReprojThreshold=5)
    return tf, pair_idx
def covMatrix(X, Y, theta, symmetric=True,
              kernel=lambda u, theta: theta[0]*theta[0]*np.exp(-0.5*u*u/(theta[1]*theta[1])),
              dist_f=None):
    if len(np.array(X).shape) == 1:
        _X = np.array([X]).T
    else:
        _X = np.array(X)
    if len(np.array(Y).shape) == 1:
        _Y = np.array([Y]).T
    else:
        _Y = np.array(Y)

    if dist_f == None:
        if symmetric:
            cM = pdist(_X)
            M = squareform(cM)
            M = kernel(M, theta)
            return M
        else:
            cM = cdist(_X, _Y)
            M = kernel(cM, theta)
            return M
    else:
        if symmetric:
            cM = pdist(_X, dist_f)
            M = squareform(cM)
            M = kernel(M, theta)
            return M
        else:
            cM = cdist(_X, _Y, dist_f)
            M = kernel(cM, theta)
            return M
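# A hedged usage sketch for covMatrix() above with its default squared-exponential
# kernel; theta = [amplitude, length scale] and the sample points are illustrative
# assumptions.
import numpy as np
from scipy.spatial.distance import pdist, cdist, squareform

xs = np.linspace(0, 1, 5)
ys = np.linspace(0, 1, 3)
K_sym = covMatrix(xs, xs, theta=[1.0, 0.2])                     # 5 x 5, symmetric case uses pdist/squareform
K_cross = covMatrix(xs, ys, theta=[1.0, 0.2], symmetric=False)  # 5 x 3, cross case uses cdist
print(K_sym.shape, K_cross.shape)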
def pairwise_between_groups(fullsplit, i_own, split_lens, CDR3_similarity_cutoff):
    i_others = len(split_lens)
    if i_own != i_others-1:
        # if this is not the last group, stack groups being compared against
        fso = np.vstack(fullsplit[i_own+1:i_others])
    else:
        fso = fullsplit[i_others-1]
    bool_dis = cdist(fullsplit[i_own], fso, 'hamming').flatten()
    bool_inf = cdist(np.isinf(fullsplit[i_own]), np.isinf(fso), 'hamming').flatten()
    finite_own = np.isfinite(fullsplit[i_own]).sum(axis=1)
    finite_others = np.isfinite(fso).sum(axis=1)
    pdf_all = np.empty(len(bool_dis))
    for c, fin in enumerate(product(finite_own, finite_others)):
        pdf_all[c] = min(fin)
    norm_dist_all = (bool_dis-bool_inf)/pdf_all*fullsplit[0].shape[1]
    bool_all = norm_dist_all < (1-CDR3_similarity_cutoff)
    # given boolean array, find sequences belonging in a cluster, row-wise
    bool_all = bool_all.reshape(fullsplit[i_own].shape[0], fso.shape[0])
    sets = []
    col_offset = sum(split_lens[:i_own]) + fullsplit[i_own].shape[0]
    for cnt, row in enumerate(bool_all):
        row_offset = cnt + sum(split_lens[:i_own])
        sets_from_group = set(np.add(np.nonzero(row)[0], col_offset))
        sets_from_group.add(row_offset)
        sets.append(sets_from_group)
    return sets
def __init__(self, data, bandwidth=None, fixed=True, k=None, function='triangular',
             eps=1.0000001, ids=None, truncate=True, points=None):  # Added truncate flag
    if issubclass(type(data), scipy.spatial.KDTree):
        self.data = data.data
        data = self.data
    else:
        self.data = data
    if k is not None:
        self.k = int(k) + 1
    else:
        self.k = k
    if points is None:
        self.dmat = cdist(self.data, self.data)
    else:
        self.points = points
        self.dmat = cdist(self.points, self.data)
    self.function = function.lower()
    self.fixed = fixed
    self.eps = eps
    self.trunc = truncate
    if bandwidth:
        try:
            bandwidth = np.array(bandwidth)
            bandwidth.shape = (len(bandwidth), 1)
        except:
            bandwidth = np.ones((len(data), 1), 'float') * bandwidth
        self.bandwidth = bandwidth
    else:
        self._set_bw()
    self.kernel = self._kernel_funcs(self.dmat/self.bandwidth)
    if self.trunc:
        mask = np.repeat(self.bandwidth, len(self.data), axis=1)
        self.kernel[(self.dmat >= mask)] = 0
def compute_bic(kmeans, X):
    """
    Computes the BIC metric for a given set of clusters

    Parameters:
    -----------------------------------------
    kmeans: fitted KMeans clustering object from scikit-learn

    X     : multidimensional np array of data points

    Returns:
    -----------------------------------------
    BIC value
    """
    # assign centers and labels
    centers = [kmeans.cluster_centers_]
    labels = kmeans.labels_
    # number of clusters
    m = kmeans.n_clusters
    # size of the clusters
    n = np.bincount(labels)
    # size of data set
    N, d = X.shape

    # compute variance for all clusters beforehand
    cl_var = []
    for i in xrange(m):
        if not n[i] - m == 0:
            cl_var.append((1.0 / (n[i] - m)) *
                          sum(distance.cdist(X[np.where(labels == i)], [centers[0][i]], 'euclidean')**2))
        else:
            cl_var.append(float(10**20) *
                          sum(distance.cdist(X[np.where(labels == i)], [centers[0][i]], 'euclidean')**2))

    const_term = 0.5 * m * np.log10(N)

    BIC = np.sum([n[i] * np.log10(n[i]) -
                  n[i] * np.log10(N) -
                  ((n[i] * d) / 2) * np.log10(2*np.pi) -
                  (n[i] / 2) * np.log10(cl_var[i]) -
                  ((n[i] - m) / 2) for i in xrange(m)]) - const_term

    return(BIC)
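# A hedged usage sketch for compute_bic() above: fit scikit-learn's KMeans on toy
# data and score it. The data and cluster count are illustrative assumptions, and
# the function body is Python 2 style (xrange), so run it under Python 2 or swap
# xrange for range first.
import numpy as np
from scipy.spatial import distance   # compute_bic() expects distance.cdist and np in scope
from sklearn.cluster import KMeans

X_demo = np.random.RandomState(0).rand(200, 3)
km = KMeans(n_clusters=4, n_init=10, random_state=0).fit(X_demo)
print(compute_bic(km, X_demo))   # BIC score for this clustering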
def remove_redundant_mot(mot, rev_mot, max_d):
    """Reads a list of PWMs and removes redundant ones based on correlation.

    Args:
    - mot: 2D array of motifs, one motif per row.
    - rev_mot: reverse-complement motifs, same shape as mot.
    - max_d: Maximum distance for motifs to be considered redundant.

    Return value:
    A boolean numpy array is_good, that indicates whether each motif should be
    kept or not.

    Note: relies on a per-motif significance array pvals (indexed like mot)
    being available in the enclosing scope.
    """

    nmot = mot.shape[0]
    assert(rev_mot.shape[0] == nmot)
    assert(rev_mot.shape[1] == mot.shape[1])

    is_good = np.ones((nmot, ), dtype=bool)
    for i in range(nmot - 1):
        if not is_good[i]:
            continue
        # Get all the indices that are higher than i and have not been removed already.
        others = np.argwhere(np.logical_and(is_good, np.arange(0, nmot) > i)).flatten()
        d = cdist(np.reshape(mot[i, :], (1, mot.shape[1])), mot[others, :],
                  metric='correlation').flatten()
        rev_d = cdist(np.reshape(mot[i, :], (1, mot.shape[1])), rev_mot[others, :],
                      metric='correlation').flatten()
        # Get minimum distance so maximum correlation.
        d = np.minimum(d, rev_d)
        # PWMs that are similar to the i-th one and have a worse pvalue
        # get marked as redundant.
        bad = others[np.logical_and(d < max_d, pvals[i] < pvals[others])]
        is_good[bad] = False
        if np.any(np.logical_and(d < max_d, pvals[i] > pvals[others])):
            is_good[i] = False
    return is_good
def evaluate(self, individual):
    # Distance between the candidate individual and the target pattern
    dist = cdist(np.atleast_2d(individual), np.atleast_2d(self.target))

    if (self.model_name == "CNN"):
        X = np.array([np.array(individual).reshape(28, 28, 1)])
    else:
        X = np.array([individual])

    if self.model_name.startswith("SVM") or self.model_name.startswith("DT"):
        model_output = self.model.predict_proba(X)
    else:
        model_output = self.model.predict(X)

    desired_output = np.zeros(10)
    desired_output[self.target_output] = 1.0
    # Distance between the model output and the desired one-hot output
    dist2 = cdist(np.atleast_2d(model_output), np.atleast_2d(desired_output))

    fit = dist*0.5 + 0.5*dist2
    #fit = dist2
    #fit = dist
    return fit,
def lloyd2(data, init_cent, metric='e', verbose=False):
    k = init_cent.shape[0]
    cent = np.copy(init_cent)
    labels = spdist.cdist(data, cent, metric).argmin(axis=1)
    converged = False
    t, tmax = 0, 1000
    while not converged and t < tmax:
        t += 1
        converged = True
        cent_ = np.array([np.mean(data[labels == l], axis=0) for l in range(k)])
        labels_ = spdist.cdist(data, cent_, metric).argmin(axis=1)
        if not np.allclose(cent_, cent) or not np.all(labels == labels_):
            converged = False
        labels = labels_
        cent = cent_
    if not converged:
        # raise UserWarning("did not converge after {} iterations".format(t))
        print("did not converge after {} iterations".format(t))
    elif verbose:
        print("Converged after {} iterations".format(t))
    return cent, labels
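# A hedged usage sketch for lloyd2() above: seed Lloyd's iterations with a few
# points drawn from the data. Data, k and the explicit 'euclidean' metric string
# are illustrative assumptions.
import numpy as np
import scipy.spatial.distance as spdist   # lloyd2() expects spdist and np in scope

rng = np.random.RandomState(0)
data_demo = rng.rand(300, 2)
seeds = data_demo[rng.choice(len(data_demo), size=3, replace=False)]
centers, labels = lloyd2(data_demo, seeds, metric='euclidean', verbose=True)
print(centers.shape, np.bincount(labels))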
def proceed(self, x=None, z=None, der=None):
    ell = np.exp(self.hyp[0])         # characteristic length scale
    sf2 = np.exp(2.*self.hyp[1])      # signal variance
    v = self.para[0]                  # degree (v = 0,1,2 or 3 only)
    if np.abs(v-np.round(v)) < 1e-8:  # remove numerical error from format of parameter
        v = int(round(v))
    assert(int(v) in range(4))        # Only allowed degrees: 0,1,2 or 3
    v = int(v)
    n, D = x.shape
    j = np.floor(0.5*D) + v + 1

    if z == 'diag':
        A = np.zeros((n,1))
    elif z == None:
        A = np.sqrt( spdist.cdist(x/ell, x/ell, 'sqeuclidean') )
    else:                             # compute covariance between data sets x and z
        A = np.sqrt( spdist.cdist(x/ell, z/ell, 'sqeuclidean') )   # cross covariances

    if der == None:                   # compute covariance matix for dataset x
        A = sf2 * self.pp(A,j,v,self.func)
    else:
        if der == 0:                  # compute derivative matrix wrt 1st parameter
            A = sf2 * self.dpp(A,j,v,self.func,self.dfunc)
        elif der == 1:                # compute derivative matrix wrt 2nd parameter
            A = 2. * sf2 * self.pp(A,j,v,self.func)
        elif der == 2:                # wants to compute derivative wrt order
            A = np.zeros_like(A)
        else:
            raise Exception("Wrong derivative entry in PiecePoly")
    return A
def learn(self, learndataset, pipp_normalise=True):
    """learn the tree structure required to perform evaluation

    :param learndataset: learning instances
    :type learndataset: :class:`~classifip.dataset.arff.ArffFile`
    :param pipp_normalise: normalise the input features or not
    :type pipp_normalise: boolean

    .. note::

        learndataset should come from a xarff file tailored for label ranking
    """
    self.labels = learndataset.attribute_data['L'][:]
    learndata = [row[0:len(row)-1] for row in learndataset.data]
    data_array = np.array(learndata).astype(float)
    if pipp_normalise == True:
        span = data_array.max(axis=0) - data_array.min(axis=0)
        self.normal.append(True)
        self.normal.append(span)
        self.normal.append(data_array.min(axis=0))
        data_array = (data_array - data_array.min(axis=0)) / span
    else:
        self.normal.append(False)

    # Initialise radius as average distance between all learning instances
    if len(data_array) > 1000:
        data_red = np.random.permutation(data_array)[0:1000]
        distances = distance.cdist(data_red, data_red)
    else:
        distances = distance.cdist(data_array, data_array)
    self.radius = distances.sum() / (2*(len(distances)**2 - len(distances)))
    self.tree = kdtree.KDTree(data_array)
    self.truerankings = [ranking_matrices(row[-1], self.labels) for row in learndataset.data]
def proceed(self, x=None, z=None, der=None):
    n, D = x.shape
    ell = 1./np.exp(self.hyp[0:D])    # characteristic length scale
    sf2 = np.exp(2.*self.hyp[D])      # signal variance

    if z == 'diag':
        A = np.zeros((n,1))
    elif z == None:
        tem = np.dot(np.diag(ell),x.T).T
        A = spdist.cdist(tem, tem, 'sqeuclidean')
    else:                             # compute covariance between data sets x and z
        A = spdist.cdist(np.dot(np.diag(ell),x.T).T, np.dot(np.diag(ell),z.T).T, 'sqeuclidean')
    A = sf2*np.exp(-0.5*A)

    if der is not None:               # note: "if der:" would wrongly skip the der == 0 case
        if der < D:                   # compute derivative matrix wrt length scale parameters
            if z == 'diag':
                A = A*0
            elif z == None:
                tem = np.atleast_2d(x[:,der]).T/ell[der]   # column vector, as in the cross-covariance case
                A *= spdist.cdist(tem, tem, 'sqeuclidean')
            else:
                A *= spdist.cdist(np.atleast_2d(x[:,der]).T/ell[der],
                                  np.atleast_2d(z[:,der]).T/ell[der], 'sqeuclidean')
        elif der == D:                # compute derivative matrix wrt magnitude parameter
            A = 2.*A
        else:
            raise Exception("Wrong derivative index in RDFard")
    return A
def eccentricity(data, exponent=1., metricpar={}, callback=None):
    if data.ndim == 1:
        assert metricpar == {}, 'No optional parameter is allowed for a dissimilarity matrix.'
        ds = squareform(data, force='tomatrix')
        if exponent in (np.inf, 'Inf', 'inf'):
            return ds.max(axis=0)
        elif exponent == 1.:
            ds = np.power(ds, exponent)
            return ds.sum(axis=0)/float(np.alen(ds))
        else:
            ds = np.power(ds, exponent)
            return np.power(ds.sum(axis=0)/float(np.alen(ds)), 1./exponent)
    else:
        progress = progressreporter(callback)
        N = np.alen(data)
        ecc = np.empty(N)
        if exponent in (np.inf, 'Inf', 'inf'):
            for i in range(N):
                ecc[i] = cdist(data[(i,),:], data, **metricpar).max()
                progress((i+1)*100//N)
        elif exponent == 1.:
            for i in range(N):
                ecc[i] = cdist(data[(i,),:], data, **metricpar).sum()/float(N)
                progress((i+1)*100//N)
        else:
            for i in range(N):
                dsum = np.power(cdist(data[(i,),:], data, **metricpar), exponent).sum()
                ecc[i] = np.power(dsum/float(N), 1./exponent)
                progress((i+1)*100//N)
        return ecc
def proceed(self, x=None, z=None, der=None):
    ell = np.exp(self.hyp[0])        # characteristic length scale
    sf2 = np.exp(2.*self.hyp[1])     # signal variance
    d = self.para[0]                 # 2 times nu
    if np.abs(d-np.round(d)) < 1e-8: # remove numerical error from format of parameter
        d = int(round(d))
    d = int(d)
    try:
        assert(d in [1,3,5])         # check for valid values of d
    except AssertionError:
        print "Warning: You specified d to be neither 1,3 nor 5. We set d=3. "
        d = 3

    if z == 'diag':
        A = np.zeros((x.shape[0],1))
    elif z == None:
        x = np.sqrt(d)*x/ell
        A = np.sqrt(spdist.cdist(x, x, 'sqeuclidean'))
    else:
        x = np.sqrt(d)*x/ell
        z = np.sqrt(d)*z/ell
        A = np.sqrt(spdist.cdist(x, z, 'sqeuclidean'))

    if der == None:                  # compute covariance matix for dataset x
        A = sf2 * self.mfunc(d,A)
    else:
        if der == 0:                 # compute derivative matrix wrt 1st parameter
            A = sf2 * self.dmfunc(d,A)
        elif der == 1:               # compute derivative matrix wrt 2nd parameter
            A = 2 * sf2 * self.mfunc(d,A)
        elif der == 2:               # no derivative wrt 3rd parameter
            A = np.zeros_like(A)     # do nothing (d is not learned)
        else:
            raise Exception("Wrong derivative value in Matern")
    return A
def __init__(self, gifts, nb_neighbors=50, metric=None):
    """ metric=None uses the chord distance """
    self.gifts = gifts
    self.X = gifts[['Latitude','Longitude']].values
    self.N = len(self.X)
    self.wgt = gifts.Weight.values

    # root of subtree -> list of nodes in this subtree
    self.subtrees = {i: [i] for i in range(self.N)}
    # node -> root of subtree
    self.Xto = range(self.N)
    # weight of subtrees
    self.subtree_weights = {i: self.wgt[i] for i in range(self.N)}
    # cartesian coordinates (ignoring earth radius)
    self.Z = np.apply_along_axis(self.to_cartesian, 1, self.X)

    # distance from north pole to root points
    to_pole = cdist(np.atleast_2d(self.to_cartesian(north_pole)), self.Z)
    if metric is None:
        self.gates = to_pole[0].tolist()
    else:
        if isinstance(metric, Thin_Metric):
            self.gates = (AVG_EARTH_RADIUS * to_pole[0]).tolist()
        else:
            self.gates = cdist(np.atleast_2d(north_pole), self.X)[0].tolist()
    self.subtree_costs = {i: self.gates[i] for i in range(self.N)}
    self.total_cost = sum(self.subtree_costs.values())

    self.nb_neighbors = nb_neighbors
    import sklearn.neighbors
    self.kdtree = sklearn.neighbors.KDTree(self.Z)
    self.metric = metric
def proceed(self, x=None, z=None, der=None):
    ell = np.exp(self.hyp[0])       # characteristic length scale
    p = np.exp(self.hyp[1])         # period
    sf2 = np.exp(2.*self.hyp[2])    # signal variance
    n, D = x.shape

    if z == 'diag':
        A = np.zeros((n,1))
    elif z == None:
        A = np.sqrt(spdist.cdist(x, x, 'sqeuclidean'))
    else:
        A = np.sqrt(spdist.cdist(x, z, 'sqeuclidean'))
    A = np.pi*A/p

    if der == None:                 # compute covariance matix for dataset x
        A = np.sin(A)/ell
        A = A * A
        A = sf2 * np.exp(-2.*A)
    else:
        if der == 0:                # compute derivative matrix wrt 1st parameter
            A = np.sin(A)/ell
            A = A * A
            A = 4. * sf2 * np.exp(-2.*A) * A
        elif der == 1:              # compute derivative matrix wrt 2nd parameter
            R = np.sin(A)/ell
            A = 4 * sf2/ell * np.exp(-2.*R*R)*R*np.cos(A)*A
        elif der == 2:              # compute derivative matrix wrt 3rd parameter
            A = np.sin(A)/ell
            A = A * A
            A = 2. * sf2 * np.exp(-2.*A)
        else:
            raise Exception("Wrong derivative index in covPeriodic")
    return A
def _compute_nearest(xhs, rr, use_balltree=True, return_dists=False):
    """Find nearest neighbors

    Note: The rows in xhs and rr must all be unit-length vectors, otherwise
    the result will be incorrect.

    Parameters
    ----------
    xhs : array, shape=(n_samples, n_dim)
        Points of data set.
    rr : array, shape=(n_query, n_dim)
        Points to find nearest neighbors for.
    use_balltree : bool
        Use fast BallTree based search from scikit-learn. If scikit-learn
        is not installed it will fall back to the slow brute force search.
    return_dists : bool
        If True, return associated distances.

    Returns
    -------
    nearest : array, shape=(n_query,)
        Index of nearest neighbor in xhs for every point in rr.
    distances : array, shape=(n_query,)
        The distances. Only returned if return_dists is True.
    """
    if use_balltree:
        try:
            from sklearn.neighbors import BallTree
        except ImportError:
            logger.info('Nearest-neighbor searches will be significantly '
                        'faster if scikit-learn is installed.')
            use_balltree = False

    if xhs.size == 0 or rr.size == 0:
        if return_dists:
            return np.array([], int), np.array([])
        return np.array([], int)

    if use_balltree is True:
        ball_tree = BallTree(xhs)
        if return_dists:
            out = ball_tree.query(rr, k=1, return_distance=True)
            return out[1][:, 0], out[0][:, 0]
        else:
            nearest = ball_tree.query(rr, k=1, return_distance=False)[:, 0]
            return nearest
    else:
        from scipy.spatial.distance import cdist
        if return_dists:
            nearest = list()
            dists = list()
            for r in rr:
                d = cdist(r[np.newaxis, :], xhs)
                idx = np.argmin(d)
                nearest.append(idx)
                dists.append(d[0, idx])
            return (np.array(nearest), np.array(dists))
        else:
            nearest = np.array([np.argmin(cdist(r[np.newaxis, :], xhs))
                                for r in rr])
            return nearest
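# A hedged usage sketch for _compute_nearest() above: both point sets are normalised
# to unit-length rows, as the docstring requires. Shapes and the random seed are
# illustrative assumptions.
import numpy as np

rng = np.random.RandomState(0)
xhs = rng.randn(100, 3)
xhs /= np.linalg.norm(xhs, axis=1, keepdims=True)   # unit-length data points
rr = rng.randn(5, 3)
rr /= np.linalg.norm(rr, axis=1, keepdims=True)     # unit-length query points
idx, dists = _compute_nearest(xhs, rr, return_dists=True)
print(idx, dists)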
def proceed(self, x=None, z=None, der=None):
    ell = np.exp(self.hyp[0])       # characteristic length scale
    sf2 = np.exp(2.*self.hyp[1])    # signal variance
    alpha = np.exp(self.hyp[2])
    n, D = x.shape

    if z == 'diag':
        D2 = np.zeros((n,1))
    elif z == None:
        D2 = spdist.cdist(x/ell, x/ell, 'sqeuclidean')
    else:
        D2 = spdist.cdist(x/ell, z/ell, 'sqeuclidean')

    if der == None:                 # compute covariance matix for dataset x
        A = sf2 * ( ( 1.0 + 0.5*D2/alpha )**(-alpha) )
    else:
        if der == 0:                # compute derivative matrix wrt 1st parameter
            A = sf2 * ( 1.0 + 0.5*D2/alpha )**(-alpha-1) * D2
        elif der == 1:              # compute derivative matrix wrt 2nd parameter
            A = 2.* sf2 * ( ( 1.0 + 0.5*D2/alpha )**(-alpha) )
        elif der == 2:              # compute derivative matrix wrt 3rd parameter
            K = ( 1.0 + 0.5*D2/alpha )
            A = sf2 * K**(-alpha) * ( 0.5*D2/K - alpha*np.log(K) )
        else:
            raise Exception("Wrong derivative index in covRQ")
    return A
def online_k_means(k, b, t, X_in):
    random_number = 11232015

    # Initialise centers with k-means++ on a random subsample of 300 points
    random_num = np.random.randint(X_in.shape[0], size=300)
    rng = np.random.RandomState(random_number)
    permutation1 = rng.permutation(len(random_num))
    random_num = random_num[permutation1]
    x_input = X_in[random_num]
    c, l = mykmeansplusplus(x_input, k, t)

    v = np.zeros((k))
    for i in range(t):
        # Draw a mini-batch of size b
        random_num = np.random.randint(X_in.shape[0], size=b)
        rng = np.random.RandomState(random_number)
        permutation1 = rng.permutation(len(random_num))
        random_num = random_num[permutation1]
        M = X_in[random_num]
        Y = cdist(M, c, metric='euclidean')
        clust_index = np.argmin(Y, axis=1)
        for j in range(M.shape[0]):   # renamed from i to avoid shadowing the outer loop variable
            c_in = clust_index[j]
            v[c_in] += 1
            ita = 1 / v[c_in]
            c[c_in] = np.add(np.multiply((1 - ita), c[c_in]), np.multiply(ita, M[j]))

    Y_l = cdist(X_in, c, metric='euclidean')
    l = np.argmin(Y_l, axis=1)
    return c, l
def proceed(self, x=None, z=None, der=None):
    n, D = x.shape
    ell = 1./np.exp(self.hyp[0:D])    # characteristic length scale
    sf2 = np.exp(2.*self.hyp[D])      # signal variance
    alpha = np.exp(self.hyp[D+1])

    if z == 'diag':
        D2 = np.zeros((n,1))
    elif z == None:
        tmp = np.dot(np.diag(ell),x.T).T
        D2 = spdist.cdist(tmp, tmp, 'sqeuclidean')
    else:
        D2 = spdist.cdist(np.dot(np.diag(ell),x.T).T, np.dot(np.diag(ell),z.T).T, 'sqeuclidean')

    if der == None:                   # compute covariance matix for dataset x
        A = sf2 * ( ( 1.0 + 0.5*D2/alpha )**(-alpha) )
    else:
        if der < D:                   # compute derivative matrix wrt length scale parameters
            if z == 'diag':
                A = D2*0
            elif z == None:
                tmp = np.atleast_2d(x[:,der]).T/ell[der]   # column vector, as in the cross-covariance case
                A = sf2 * ( 1.0 + 0.5*D2/alpha )**(-alpha-1) * spdist.cdist(tmp, tmp, 'sqeuclidean')
            else:
                A = sf2 * ( 1.0 + 0.5*D2/alpha )**(-alpha-1) * \
                    spdist.cdist(np.atleast_2d(x[:,der]).T/ell[der],
                                 np.atleast_2d(z[:,der]).T/ell[der], 'sqeuclidean')
        elif der == D:                # compute derivative matrix wrt magnitude parameter
            A = 2. * sf2 * ( ( 1.0 + 0.5*D2/alpha )**(-alpha) )
        elif der == (D+1):            # compute derivative matrix wrt the shape parameter alpha
            K = ( 1.0 + 0.5*D2/alpha )
            A = sf2 * K**(-alpha) * ( 0.5*D2/K - alpha*np.log(K) )
        else:
            raise Exception("Wrong derivative index in covRQard")
    return A
def compute_matrices_for_gradient_totalcverr(self, train_x, train_y, train_z):
    if self.kernelX_use_median:
        sigmax = self.kernelX.get_sigma_median_heuristic(train_x)
        self.kernelX.set_width(float(sigmax))
    if self.kernelY_use_median:
        sigmay = self.kernelY.get_sigma_median_heuristic(train_y)
        self.kernelY.set_width(float(sigmay))
    kf = KFold(n_splits=self.K_folds)
    matrix_results = [[[None] for _ in range(self.K_folds)] for _ in range(8)]
    # xx=[[None]*10]*6 will give the same id to xx[0][0] and xx[1][0] etc. as
    # this command simply copied [None] many times. But the above gives different ids.
    count = 0
    for train_index, test_index in kf.split(np.ones((self.num_samples, 1))):
        X_tr, X_tst = train_x[train_index], train_x[test_index]
        Y_tr, Y_tst = train_y[train_index], train_y[test_index]
        Z_tr, Z_tst = train_z[train_index], train_z[test_index]
        matrix_results[0][count] = self.kernelX.kernel(X_tst, X_tr)   # Kx_tst_tr
        matrix_results[1][count] = self.kernelX.kernel(X_tr, X_tr)    # Kx_tr_tr
        matrix_results[2][count] = self.kernelX.kernel(X_tst, X_tst)  # Kx_tst_tst
        matrix_results[3][count] = self.kernelY.kernel(Y_tst, Y_tr)   # Ky_tst_tr
        matrix_results[4][count] = self.kernelY.kernel(Y_tr, Y_tr)    # Ky_tr_tr
        matrix_results[5][count] = self.kernelY.kernel(Y_tst, Y_tst)  # Ky_tst_tst
        matrix_results[6][count] = cdist(Z_tst, Z_tr, 'sqeuclidean')  # D_tst_tr: square distance matrix
        matrix_results[7][count] = cdist(Z_tr, Z_tr, 'sqeuclidean')   # D_tr_tr: square distance matrix
        count = count + 1
    return matrix_results
def euclidean_distances(X, Y, squared=False, inverse=True):
    """
    Considering the rows of X (and Y=X) as vectors, compute the
    distance matrix between each pair of vectors.

    An implementation of a "similarity" based on the Euclidean "distance"
    between two vectors X and Y. Thinking of items as dimensions and
    preferences as points along those dimensions, a distance is computed using
    all items (dimensions) where both users have expressed a preference for
    that item. This is simply the square root of the sum of the squares of
    differences in position (preference) along each dimension.

    Parameters
    ----------
    X: array of shape (n_samples_1, n_features)

    Y: array of shape (n_samples_2, n_features)

    squared: boolean, optional
        This routine will return squared Euclidean distances instead.

    inverse: boolean, optional
        This routine will return the inverse Euclidean distances instead.

    Returns
    -------
    distances: array of shape (n_samples_1, n_samples_2)

    Examples
    --------
    >>> from scikits.crab.metrics.pairwise import euclidean_distances
    >>> X = [[2.5, 3.5, 3.0, 3.5, 2.5, 3.0],[3.0, 3.5, 1.5, 5.0, 3.5,3.0]]
    >>> # distance between rows of X
    >>> euclidean_distances(X, X)
    array([[ 1.        ,  0.29429806],
           [ 0.29429806,  1.        ]])
    >>> # get distance to origin
    >>> X = [[1.0, 0.0],[1.0,1.0]]
    >>> euclidean_distances(X, [[0.0, 0.0]])
    array([[ 0.5       ],
           [ 0.41421356]])
    """
    # should not need X_norm_squared because if you could precompute that as
    # well as Y, then you should just pre-compute the output and not even
    # call this function.
    if X is Y:
        X = Y = np.asanyarray(X)
    else:
        X = np.asanyarray(X)
        Y = np.asanyarray(Y)

    if X.shape[1] != Y.shape[1]:
        raise ValueError("Incompatible dimension for X and Y matrices")

    if squared:
        return ssd.cdist(X, Y, 'sqeuclidean')

    XY = ssd.cdist(X, Y)
    return np.divide(1.0, (1.0 + XY)) if inverse else XY