Example #1
    def test_init_corr(self, other, T=5e-3, outlierprior=1e-1, outlierfrac=1e-2, outliercutoff=1e-2):
        import scipy.spatial.distance as ssd
        import sys
        self.transform_points()
        other.transform_points()
        init_prob_nm(self.pt_ptrs, other.pt_ptrs, 
                     self.pt_w_ptrs, other.pt_w_ptrs, 
                     self.dims_gpu, other.dims_gpu,
                     self.N, outlierprior, outlierfrac, T, 
                     self.corr_cm_ptrs, self.corr_rm_ptrs)
        gpu_corr_rm = self.corr_rm[0].get()
        gpu_corr_rm = gpu_corr_rm.flatten()[:(self.dims[0] + 1) * (other.dims[0] + 1)].reshape(self.dims[0]+1, other.dims[0]+1)
        s_pt_w = self.pts_w[0].get()
        s_pt   = self.pts[0].get()
        o_pt_w = other.pts_w[0].get()
        o_pt   = other.pts[0].get()

        d1 = ssd.cdist(s_pt_w, o_pt, 'euclidean')
        d2 = ssd.cdist(s_pt, o_pt_w, 'euclidean')

        p_nm = np.exp( -(d1 + d2) / (2 * T))

        for i in range(self.dims[0]):
            for j in range(other.dims[0]):
                if abs(p_nm[i, j] - gpu_corr_rm[i, j]) > 1e-7:
                    print "INIT CORR MATRICES DIFFERENT"
                    print i, j, p_nm[i, j], gpu_corr_rm[i, j]
                    ipy.embed()
                    sys.exit(1)
Example #2
def test_pairwise_distances_data_derived_params(n_jobs, metric, dist_function,
                                                y_is_x):
    # Check that pairwise_distances gives the same result sequentially and in
    # parallel when the metric has data-derived parameters.
    with config_context(working_memory=1):  # to have more than 1 chunk
        rng = np.random.RandomState(0)
        X = rng.random_sample((1000, 10))

        if y_is_x:
            Y = X
            expected_dist_default_params = squareform(pdist(X, metric=metric))
            if metric == "seuclidean":
                params = {'V': np.var(X, axis=0, ddof=1)}
            else:
                params = {'VI': np.linalg.inv(np.cov(X.T)).T}
        else:
            Y = rng.random_sample((1000, 10))
            expected_dist_default_params = cdist(X, Y, metric=metric)
            if metric == "seuclidean":
                params = {'V': np.var(np.vstack([X, Y]), axis=0, ddof=1)}
            else:
                params = {'VI': np.linalg.inv(np.cov(np.vstack([X, Y]).T)).T}

        expected_dist_explicit_params = cdist(X, Y, metric=metric, **params)
        dist = np.vstack(tuple(dist_function(X, Y,
                                             metric=metric, n_jobs=n_jobs)))

        assert_allclose(dist, expected_dist_explicit_params)
        assert_allclose(dist, expected_dist_default_params)
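For reference, the same check in a minimal standalone form: with scipy's cdist, passing the data-derived V for seuclidean explicitly must reproduce the default behaviour (this sketch is illustrative and independent of the test harness above):

import numpy as np
from scipy.spatial.distance import cdist

rng = np.random.RandomState(0)
X = rng.random_sample((50, 4))
Y = rng.random_sample((60, 4))

# By default, seuclidean derives its variance vector from vstack([X, Y]).
d_default = cdist(X, Y, metric="seuclidean")

# Passing the same variances explicitly must give an identical result.
V = np.var(np.vstack([X, Y]), axis=0, ddof=1)
d_explicit = cdist(X, Y, metric="seuclidean", V=V)

assert np.allclose(d_default, d_explicit)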
Example #3
def ch(X, cIDX, distance="euclidean"):
    Nclusters = cIDX.max() + 1
    Npoints = len(X)

    # Cluster sizes (assumes labels are consecutive integers)
    n = np.array([(cIDX == i).sum() for i in range(cIDX.min(), cIDX.max() + 1)],
                 dtype=float)

    # Clusters (kept as a list, since clusters generally differ in size)
    A = [X[cIDX == i] for i in range(Nclusters)]
    # Centroids
    v = np.array([Ai.mean(axis=0) for Ai in A])

    ssb = 0

    for i in range(Nclusters):
        ssb += n[i] * (cdist([v[i]], [np.mean(X, axis=0)], metric=distance)[0][0] ** 2)

    z = np.ndarray(shape=(Nclusters), dtype=float)

    for i in range(cIDX.min(), cIDX.max() + 1):
        aux = np.array([(cdist([x], [v[i]], metric=distance)[0][0] ** 2) for x in X[cIDX == i]])
        z[i] = aux.sum()

    ssw = z.sum()

    return (ssb / (Nclusters - 1)) / (ssw / (Npoints - Nclusters))
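A quick sanity check of ch on synthetic blobs (illustrative; assumes ch is defined in a module that imports numpy as np and cdist, as its body requires):

import numpy as np
from scipy.spatial.distance import cdist  # required by ch()

rng = np.random.RandomState(0)
# Two well-separated 2-D blobs and their labels
X = np.vstack([rng.randn(50, 2), rng.randn(50, 2) + 10])
cIDX = np.repeat([0, 1], 50)

# Well-separated clusters should give a large Calinski-Harabasz index
print(ch(X, cIDX))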
Example #4
 def bandwidth(self, X):
     """
 Estimate bandwidth
 TODO Replace this with a method which treats the data like a 
 distorted doughnut by estimating limit cycle, and integrating an ellipsoid
 swept along the trajectory of the limit cycle with apropriate lengths
 """
     N = X.shape[0]
     D = X.shape[1]
     points_in_cluster = N / float(self.Ncl)  # Wanted points in a cluster
     debugLog(self, "Estimating bandwidth")
     # Sample points from an n-box for numerical integration
     S = 10 * (6 ** D)
     # Grab a subsample of points to look at
     idx = randint(0, N, (S))
     v = X[idx]
     # Sample points
     y = (X.max() - X.min()) * randn(S, D) + X.min()
     # Find how close together points in our subsample typically are
     w = std(cdist(X[list(set(arange(N)).difference(idx))], v).min(0))
     # Count points in our box that are approximately this close
     c = sum(cdist(X, y).min(0) < self.sf * w)
     # Compute volume from length with sphere prefactor
     V = nSphereVolume(D) * ((c / float(S)) * (X.max() - X.min()) ** D)
     # Calculate bandwidth as the length scale of the volume per cluster
     return ((V * points_in_cluster) / N) ** (1.0 / D)
Example #5
def cdist_sparse( X, Y, **kwargs ):
    """ -> |X| x |Y| cdist array, any cdist metric
        X or Y may be sparse -- best csr
    """
    # todense one row at a time -- very slow if both X and Y are sparse
    sxy = 2*issparse(X) + issparse(Y)
    if sxy == 0:                       # both dense: plain cdist
        if kwargs.get("metric") == "cosine":
            return 1 - cdist( X, Y, **kwargs )
        else:
            return cdist( X, Y, **kwargs )   # fixed: `d` was not yet defined here
    d = np.empty( (X.shape[0], Y.shape[0]), np.float64 )
    if sxy == 2:
        for j, x in enumerate(X):
            d[j] = cdist( x.todense(), Y, **kwargs ) [0]
    elif sxy == 1:
        for k, y in enumerate(Y):
            d[:,k] = cdist( X, y.todense(), **kwargs ) [0]
    else:
        for j, x in enumerate(X):
            for k, y in enumerate(Y):
                d[j,k] = cdist( x.todense(), y.todense(), **kwargs ) [0]
    if kwargs["metric"] == "cosine":
        return 1 - d
    else:
        return d
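A small usage sketch (illustrative; assumes cdist_sparse lives in a module importing numpy as np, scipy.sparse.issparse and cdist, as above):

import numpy as np
from scipy.sparse import csr_matrix
from scipy.spatial.distance import cdist

X = csr_matrix(np.eye(3))   # sparse input
Y = np.random.rand(4, 3)    # dense input

D = cdist_sparse(X, Y, metric="euclidean")
print(D.shape)              # (3, 4)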
Example #6
def sim_compute(word, dis_type, topk=100):
    # Get the index of word and the corresponding vector
    try:
        index = word2idx[word]
        wordvec = myembed[index, :].reshape(1,-1)
    except KeyError as e:
        print("Word %s is not present in vocabulary" % e)
        return

    # For cosine and correlation the similarity is 1 - distance;
    # for the others, just negate the distance by multiplying with -1
    if (dis_type == 'cosine' or dis_type == 'correlation' ):
        sim = 1 - cdist(wordvec, myembed, dis_type)
    else:
        sim = -1 * cdist(wordvec, myembed, dis_type)

    # Now operations to get sim the shape we need i.e from (1,N) to (N,)
    final = sim[0].T

    zipped = list(zip(range(len(final)), final))
    del zipped[index]
    zipped.sort(key=lambda t: t[1], reverse=True)

    return zipped
Example #7
def covSEisoU(hyp=None, x=None, z=None, der=None):
    # Squared Exponential covariance function with isotropic distance measure with
    # unit magnitude. The covariance function is parameterized as:
    # 
    # k(x^p,x^q) = exp( -(x^p - x^q)' * inv(P) * (x^p - x^q) / 2 )
    #
    # where the P matrix is ell^2 times the unit matrix. 
    # 
    # The hyperparameters of the function are:
    #
    # hyp = [ log(ell) ]
    

    if hyp is None:                 # report number of parameters
        return [1]

    ell = np.exp(hyp[0])            # characteristic length scale
    n,D = x.shape

    if z == 'diag':
        A = np.zeros((n,1))
    elif z is None:
        A = spdist.cdist(x/ell, x/ell, 'sqeuclidean')
    else:                            # compute covariance between data sets x and z
        A = spdist.cdist(x/ell, z/ell, 'sqeuclidean')   # cross covariances

    if der is None:                  # compute covariance matrix for dataset x
        A = np.exp(-0.5*A)
    else:
        if der == 0:                 # compute derivative matrix wrt 1st parameter
            A = np.exp(-0.5*A) * A
        else:
            raise Exception("Wrong derivative index in covSEisoU")

    return A
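A minimal call sketch (illustrative; assumes the module imports numpy as np and scipy.spatial.distance as spdist, which the body relies on):

import numpy as np

x = np.random.randn(5, 2)

print(covSEisoU())                              # -> [1], the number of hyperparameters
K = covSEisoU(hyp=[np.log(0.5)], x=x)           # 5x5 covariance matrix
dK = covSEisoU(hyp=[np.log(0.5)], x=x, der=0)   # derivative wrt log(ell)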
Example #8
def K_SE(xs, ys=None, l=1, deriv=False, wrt='l'):
    l = asarray(l)
    sig = 1 #l[0]
    #l = l[1:]
    xs = ascolumn(xs)
    if ys is None:
        d = squareform(pdist(xs/l, 'sqeuclidean'))
    else:
        ys = ascolumn(ys)
        d = cdist(xs/l, ys/l, 'sqeuclidean')
    cov = exp(-d/2)
    if not deriv: return sig * cov

    grads = []
    if wrt == 'l':
        #grads.append(cov) # grad of sig
        for i in range(shape(xs)[1]):
            if ys is None:
                grad = sig * cov * squareform(pdist(ascolumn(xs[:,i]), 'sqeuclidean'))
            else:
                grad = sig * cov * cdist(ascolumn(xs[:,i]), ascolumn(ys[:,i]), 'sqeuclidean')
            grad /= l[i] ** 3
            grads.append(grad)
        return sig * cov, grads
    elif wrt == 'y':
        if shape(xs)[0] != 1: print('*** x not a row vector ***')
        jac = sig * cov * ((ys - xs) / l**2).T
        return sig * cov, jac
Example #9
def computeScipySimilarity(Xs1, Xs2, sparse=False):
    Xall_new = np.zeros((Xs1.shape[0],4))

    if sparse:
        print(Xs1.shape)
        print(Xs2.shape)
        Xs1 = np.asarray(Xs1.todense())
        Xs2 = np.asarray(Xs2.todense())

    for i,(a,b) in enumerate(zip(Xs1,Xs2)):
        a = a.reshape(-1,a.shape[0])
        b = b.reshape(-1,b.shape[0])
        #print a.shape
        #print type(a)
        dist = cdist(a,b,'cosine')
        Xall_new[i,0] = dist
        #Xall_new[i,3] = dist
        dist = cdist(a,b,'cityblock')
        Xall_new[i,1] = dist
        dist = cdist(a,b,'hamming')
        Xall_new[i,2] = dist
        dist = cdist(a,b,'euclidean')
        Xall_new[i,3] = dist

    Xall_new = pd.DataFrame(Xall_new,columns=['cosine','cityblock','hamming','euclidean'])

    print "NA:",Xall_new.isnull().values.sum()
    Xall_new = Xall_new.fillna(0.0)
    print "NA:",Xall_new.isnull().values.sum()
    print Xall_new.corr(method='spearman')
    return Xall_new
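For example (illustrative; assumes the module imports numpy as np, pandas as pd and cdist, as the body requires):

import numpy as np

Xs1 = np.random.rand(10, 8)
Xs2 = np.random.rand(10, 8)

feats = computeScipySimilarity(Xs1, Xs2)
print(feats.head())   # four pairwise distances per row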
Example #10
    def fine_tune_transform(feature1, feature2, init_pair_idx):
        ind = []
        k = 1
        while len(ind) < 0.6 * min(len(feature1["pts"]), len(feature2["pts"])) and k < 10:
            # Step 1. Randomly choose 20 points evenly distributed on the image
            rand_pts = np.random.rand(20, 2) * (np.amax(feature1["pts"], axis=0) - np.amin(feature1["pts"], axis=0)) * \
                       np.array([1, 0.8]) + np.amin(feature1["pts"], axis=0)
            # Step 2. Find nearest points from feature1
            dist_mat = spd.cdist(rand_pts, feature1["pts"][init_pair_idx[:, 0]])
            tmp_ind = np.argmin(dist_mat, axis=1)
            # Step 3. Use these points to find a homography
            tf = cv2.findHomography(feature1["pts"][init_pair_idx[tmp_ind, 0]], feature2["pts"][init_pair_idx[tmp_ind, 1]],
                                    method=cv2.RANSAC, ransacReprojThreshold=5)

            # Then use the transform find more matched points
            pts12 = cv2.perspectiveTransform(np.array([[p] for p in feature1["pts"]], dtype="float32"), tf[0])[:, 0, :]
            dist_mat = spd.cdist(pts12, feature2["pts"])
            num1, num2 = dist_mat.shape

            idx12 = np.argsort(dist_mat, axis=1)
            tmp_ind = np.argwhere(np.array([dist_mat[i, idx12[i, 0]] for i in range(num1)]) < 5)
            if len(tmp_ind) > len(ind):
                ind = tmp_ind
            logging.debug("len(ind) = %d, len(feature) = %d", len(ind), min(len(feature1["pts"]), len(feature2["pts"])))
            k += 1

        pair_idx = np.hstack((ind, idx12[ind, 0]))

        tf = cv2.findHomography(feature1["pts"][pair_idx[:, 0]], feature2["pts"][pair_idx[:, 1]],
                                method=cv2.RANSAC, ransacReprojThreshold=5)
        return tf, pair_idx
Example #11
def covMatrix(X, Y, theta, symmetric = True, kernel = lambda u, theta: theta[0]*theta[0]*np.exp(-0.5*u*u/(theta[1]*theta[1])), \
        dist_f=None):
    if len(np.array(X).shape) == 1:
        _X = np.array([X]).T
    else:
        _X = np.array(X)
        
    if len(np.array(Y).shape) == 1:
        _Y = np.array([Y]).T
    else:
        _Y = np.array(Y)
        
    if dist_f is None:
        if symmetric:
            cM = pdist(_X)
            M = squareform(cM)
            M = kernel(M, theta)
            return M
        else:
            cM = cdist(_X, _Y)
            M = kernel(cM, theta)
            return M
    else:
        if symmetric:
            cM = pdist(_X, dist_f)
            M = squareform(cM)
            M = kernel(M, theta)
            return M
        else:
            cM = cdist(_X, _Y, dist_f)
            M = kernel(cM, theta)
            return M
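A short usage sketch with the default squared-exponential kernel (illustrative; assumes the module imports numpy as np and pdist, cdist, squareform from scipy.spatial.distance):

import numpy as np

X = np.linspace(0, 1, 5)   # 1-D inputs are promoted to column vectors internally
theta = [1.0, 0.2]         # [signal std, length scale] for the default kernel

K = covMatrix(X, X, theta)                         # symmetric 5x5 Gram matrix
Ks = covMatrix(X, [0.5], theta, symmetric=False)   # 5x1 cross-covariance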
Example #12
def pairwise_between_groups(fullsplit, i_own, split_lens, CDR3_similarity_cutoff):
    i_others = len(split_lens)
    if i_own != i_others-1:
        # if this is not the last group, stack groups being compared against
        fso = np.vstack(fullsplit[i_own+1:i_others])
    else:
        fso = fullsplit[i_others-1]
        
    bool_dis = cdist(fullsplit[i_own],fso,'hamming').flatten()
    bool_inf = cdist(np.isinf(fullsplit[i_own]),np.isinf(fso),'hamming').flatten()
    finite_own = np.isfinite(fullsplit[i_own]).sum(axis=1)
    finite_others = np.isfinite(fso).sum(axis=1)
    
    pdf_all=np.empty(len(bool_dis))
    for c,fin in enumerate(product(finite_own,finite_others)):
        pdf_all[c] = min(fin)

    norm_dist_all  = (bool_dis-bool_inf)/pdf_all*fullsplit[0].shape[1]
    bool_all = norm_dist_all < (1-CDR3_similarity_cutoff)
    
    # given boolean array, find sequences belonging in a cluster, row-wise
    bool_all = bool_all.reshape(fullsplit[i_own].shape[0],fso.shape[0])
    sets=[]
    
    col_offset = sum(split_lens[:i_own])+fullsplit[i_own].shape[0]
        
    for cnt,row in enumerate(bool_all):
        row_offset = cnt+sum(split_lens[:i_own])

        sets_from_group = set(np.add(np.nonzero(row)[0],col_offset))
        sets_from_group.add(row_offset)
        sets.append(sets_from_group)

    return sets
Example #13
    def __init__(self, data, bandwidth=None, fixed=True, k=None,
                 function='triangular', eps=1.0000001, ids=None, truncate=True,
                 points=None): #Added truncate flag
        if issubclass(type(data), scipy.spatial.KDTree):
            self.data = data.data
            data = self.data
        else:
            self.data = data
        if k is not None:
            self.k = int(k) + 1
        else:
            self.k = k
        if points is None:
            self.dmat = cdist(self.data, self.data)
        else:
            self.points = points
            self.dmat = cdist(self.points, self.data)
        self.function = function.lower()
        self.fixed = fixed
        self.eps = eps
        self.trunc = truncate
        if bandwidth:
            try:
                bandwidth = np.array(bandwidth)
                bandwidth.shape = (len(bandwidth), 1)
            except Exception:
                bandwidth = np.ones((len(data), 1), 'float') * bandwidth
            self.bandwidth = bandwidth
        else:
            self._set_bw()
        self.kernel = self._kernel_funcs(self.dmat/self.bandwidth)

        if self.trunc:
            mask = np.repeat(self.bandwidth, len(self.data), axis=1)
            self.kernel[(self.dmat >= mask)] = 0
Example #14
def compute_bic(kmeans, X):
	"""
	Computes the BIC metric for a given clusters
	Parameters:
	-----------------------------------------
	kmeans:  List of clustering object from scikit learn
	X     :  multidimension np array of data points
	Returns:
	-----------------------------------------
	BIC value
	"""
	# assign centers and labels
	centers = [kmeans.cluster_centers_]
	labels  = kmeans.labels_
	#number of clusters
	m = kmeans.n_clusters
	# size of the clusters
	n = np.bincount(labels)
	#size of data set
	N, d = X.shape
	#compute variance for all clusters beforehand
	cl_var=[]
	for i in range(m):
		if not n[i] - m==0:
			cl_var.append((1.0 / (n[i] - m)) * sum(distance.cdist(X[np.where(labels == i)], [centers[0][i]], 'euclidean')**2))
		else:
			cl_var.append(float(10**20) * sum(distance.cdist(X[np.where(labels == i)], [centers[0][i]], 'euclidean')**2))
	const_term = 0.5 * m * np.log10(N)
	BIC = np.sum([n[i] * np.log10(n[i]) -
	       n[i] * np.log10(N) -
	     ((n[i] * d) / 2) * np.log10(2*np.pi) -
	      (n[i] / 2) * np.log10(cl_var[i]) -
	     ((n[i] - m) / 2) for i in range(m)]) - const_term
	return(BIC)
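For instance (illustrative; assumes the module imports numpy as np and from scipy.spatial import distance, which compute_bic uses):

import numpy as np
from sklearn.cluster import KMeans

X = np.random.rand(200, 3)
km = KMeans(n_clusters=4, n_init=10, random_state=0).fit(X)
print(compute_bic(km, X))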
Example #15
def remove_redundant_mot(mot, rev_mot, max_d):
    """Reads a list of PWMs and removes redundant ones based on correlation.
    
    Args:
    - mot: array of motifs, one (flattened) PWM per row.
    - rev_mot: the corresponding reverse-complement motifs (same shape as mot).
    - max_d: Maximum distance for motifs to be considered redundant.
    
    Return value:
    A boolean numpy array is_good, that indicates whether each motif should 
    be kept or not.
    """
    
    nmot = mot.shape[0]
    assert(rev_mot.shape[0] == nmot)
    assert(rev_mot.shape[1] == mot.shape[1])
    
    is_good = np.ones((nmot, ), dtype=bool)
    
    for i in range(nmot - 1):
        if not is_good[i]:
            continue
        # Get all the indices that are higher than i and have not been removed already.
        others = np.argwhere(np.logical_and(is_good, np.arange(0, nmot) > i)).flatten()
        d = cdist(np.reshape(mot[i, :], (1, mot.shape[1])), mot[others, :], metric = 'correlation').flatten()
        rev_d = cdist(np.reshape(mot[i, :], (1, mot.shape[1])), rev_mot[others, :], metric = 'correlation').flatten()
        # Get minimum distance so maximum correlation.
        d = np.minimum(d, rev_d)
        # PWMs that are similar to the i-th one and have a worse p-value get
        # marked as redundant (pvals is assumed to be available in scope)
        bad = others[np.logical_and(d < max_d, pvals[i] < pvals[others])]
        is_good[bad] = False
        if np.any(np.logical_and(d < max_d, pvals[i] > pvals[others])):
            is_good[i] = False
    return is_good
Example #16
    def evaluate(self, individual):

        dist = cdist(np.atleast_2d(individual), np.atleast_2d(self.target))

        if (self.model_name == "CNN"):
            X = np.array([np.array(individual).reshape(28,28,1)])
        else:
            X = np.array([individual]) 
       
        if self.model_name.startswith("SVM") or self.model_name.startswith("DT"):
            model_output = self.model.predict_proba(X)
        else:
            model_output = self.model.predict(X)


        desired_output = np.zeros(10)
        desired_output[self.target_output] = 1.0 
        
        dist2 = cdist(np.atleast_2d(model_output), np.atleast_2d(desired_output))            

        fit = dist*0.5 + 0.5*dist2
        #fit = dist2
        #fit = dist 

        return fit,  # trailing comma: returns a 1-tuple, the fitness format DEAP-style evaluators expect
Example #17
def lloyd2(data, init_cent, metric='e', verbose=False):
    k = init_cent.shape[0]
    cent = np.copy(init_cent)
    labels = spdist.cdist(data, cent, metric).argmin(axis=1)
    converged = False
    t, tmax = 0, 1000

    while not converged and t < tmax:
        t += 1
        converged = True

        cent_ = np.array([np.mean(data[labels == l], axis=0)
                         for l in range(k)])

        labels_ = spdist.cdist(data, cent_, metric).argmin(axis=1)

        if not np.allclose(cent_, cent) or \
                not np.all(labels == labels_):
            converged = False
            labels = labels_
            cent = cent_

    if not converged:
        # raise UserWarning("did not converge after {} iterations".format(t))
        print("did not converge after {} iterations".format(t))
    elif verbose:
        print("Converged after {} iterations".format(t))

    return cent, labels
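A quick run (illustrative; assumes the module imports numpy as np and scipy.spatial.distance as spdist, matching the function body):

import numpy as np

rng = np.random.RandomState(0)
data = np.vstack([rng.randn(100, 2), rng.randn(100, 2) + 5])
init = data[rng.choice(len(data), 2, replace=False)]

cent, labels = lloyd2(data, init, metric='euclidean', verbose=True)
print(cent)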
Example #18
    def proceed(self, x=None, z=None, der=None):
        ell = np.exp(self.hyp[0])            # characteristic length scale
        sf2 = np.exp(2.*self.hyp[1])         # signal variance
        v   = self.para[0]                   # degree (v = 0,1,2 or 3 only)
        if np.abs(v-np.round(v)) < 1e-8:     # remove numerical error from format of parameter
            v = int(round(v))
        assert(int(v) in range(4))           # Only allowed degrees: 0,1,2 or 3
        v = int(v)        
        n, D = x.shape
        j = np.floor(0.5*D) + v + 1
        if z == 'diag':
            A = np.zeros((n,1))
        elif z is None:
            A = np.sqrt( spdist.cdist(x/ell, x/ell, 'sqeuclidean') )
        else:                                       # compute covariance between data sets x and z
            A = np.sqrt( spdist.cdist(x/ell, z/ell, 'sqeuclidean') )     # cross covariances 
        if der is None:                             # compute covariance matrix for dataset x
            A = sf2 * self.pp(A,j,v,self.func)
        else:
            if der == 0:                            # compute derivative matrix wrt 1st parameter
                A = sf2 * self.dpp(A,j,v,self.func,self.dfunc)

            elif der == 1:                          # compute derivative matrix wrt 2nd parameter
                A = 2. * sf2 * self.pp(A,j,v,self.func)

            elif der == 2:                          # wants to compute derivative wrt order
                A = np.zeros_like(A)
            else:
                raise Exception("Wrong derivative entry in PiecePoly")
        return A
Example #19
 def learn(self,learndataset,pipp_normalise=True):
     """learn the tree structure required to perform evaluation
     
     :param learndataset: learning instances
     :type learndataset: :class:`~classifip.dataset.arff.ArffFile`
     :param pipp_normalise: normalise the input features or not
     :type pipp_normalise: boolean
     
     .. note::
 
         learndataset should come from an xarff file tailored for label ranking
     """
     self.labels=learndataset.attribute_data['L'][:]
     learndata=[row[0:len(row)-1] for row in learndataset.data]
     data_array=np.array(learndata).astype(float)
     if pipp_normalise:
         span=data_array.max(axis=0)-data_array.min(axis=0)
         self.normal.append(True)
         self.normal.append(span)
         self.normal.append(data_array.min(axis=0))
         data_array=(data_array-data_array.min(axis=0))/span
     else:
         self.normal.append(False)
         
     #Initialise radius as average distance between all learning instances
     if len(data_array) > 1000:
         data_red=np.random.permutation(data_array)[0:1000]
         distances=distance.cdist(data_red,data_red)
     else:
         distances=distance.cdist(data_array,data_array)
     self.radius=distances.sum()/(2*(len(distances)**2-len(distances)))
     self.tree=kdtree.KDTree(data_array)
     self.truerankings=[ranking_matrices(row[-1],self.labels) for row
                      in learndataset.data]
Example #20
 def proceed(self, x=None, z=None, der=None):
     n, D = x.shape  
     ell = 1./np.exp(self.hyp[0:D])    # characteristic length scale
     sf2 = np.exp(2.*self.hyp[D])      # signal variance
     if z == 'diag':
         A = np.zeros((n,1))
     elif z is None:
         tem = np.dot(np.diag(ell),x.T).T
         A = spdist.cdist(tem,tem,'sqeuclidean')
     else:                # compute covariance between data sets x and z
         A = spdist.cdist(np.dot(np.diag(ell),x.T).T,np.dot(np.diag(ell),z.T).T,'sqeuclidean')
     A = sf2*np.exp(-0.5*A)
     if der is not None:  # note: a bare "if der:" would wrongly skip der == 0
         if der < D:      # compute derivative matrix wrt length scale parameters
             if z == 'diag':
                 A = A*0
             elif z is None:
                 tem = np.atleast_2d(x[:,der])/ell[der]
                 A *= spdist.cdist(tem,tem,'sqeuclidean')
             else:
                 A *= spdist.cdist(np.atleast_2d(x[:,der]).T/ell[der],np.atleast_2d(z[:,der]).T/ell[der],'sqeuclidean')
         elif der==D:     # compute derivative matrix wrt magnitude parameter
             A = 2.*A
         else:
             raise Exception("Wrong derivative index in RDFard")   
     return A
Example #21
def eccentricity(data, exponent=1.,  metricpar={}, callback=None):
    if data.ndim==1:
        assert metricpar=={}, 'No optional parameter is allowed for a dissimilarity matrix.'
        ds = squareform(data, force='tomatrix')
        if exponent in (np.inf, 'Inf', 'inf'):
            return ds.max(axis=0)
        elif exponent==1.:
            return ds.sum(axis=0)/float(len(ds))
        else:
            ds = np.power(ds, exponent)
            return np.power(ds.sum(axis=0)/float(len(ds)), 1./exponent)
    else:
        progress = progressreporter(callback)
        N = len(data)
        ecc = np.empty(N)
        if exponent in (np.inf, 'Inf', 'inf'):
            for i in range(N):
                ecc[i] = cdist(data[(i,),:], data, **metricpar).max()
                progress((i+1)*100//N)
        elif exponent==1.:
            for i in range(N):
                ecc[i] = cdist(data[(i,),:], data, **metricpar).sum()/float(N)
                progress((i+1)*100//N)
        else:
            for i in range(N):
                dsum = np.power(cdist(data[(i,),:], data, **metricpar),
                                exponent).sum()
                ecc[i] = np.power(dsum/float(N), 1./exponent)
                progress((i+1)*100//N)
        return ecc
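For example, on a precomputed condensed distance matrix, which exercises the first branch (illustrative; the vector branch additionally needs the module's progressreporter helper and its numpy/squareform/cdist imports):

import numpy as np
from scipy.spatial.distance import pdist

X = np.random.rand(20, 3)
d = pdist(X)   # condensed (1-D) distance matrix

print(eccentricity(d))                    # average distance from each point to all points
print(eccentricity(d, exponent=np.inf))   # max distance per point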
Example #22
 def proceed(self, x=None, z=None, der=None):
     ell = np.exp(self.hyp[0])        # characteristic length scale
     sf2 = np.exp(2.* self.hyp[1])    # signal variance
     d   = self.para[0]               # 2 times nu
     if np.abs(d-np.round(d)) < 1e-8: # remove numerical error from format of parameter
         d = int(round(d))
     d = int(d)
     try:
         assert(d in [1,3,5])         # check for valid values of d
     except AssertionError:
         print("Warning: You specified d to be neither 1, 3 nor 5. We set d=3.")
         d = 3
     if z == 'diag':
         A = np.zeros((x.shape[0],1))
     elif z is None:
         x = np.sqrt(d)*x/ell   
         A = np.sqrt(spdist.cdist(x, x, 'sqeuclidean'))
     else:
         x = np.sqrt(d)*x/ell
         z = np.sqrt(d)*z/ell
         A = np.sqrt(spdist.cdist(x, z, 'sqeuclidean'))
     if der is None:                     # compute covariance matrix for dataset x
         A = sf2 * self.mfunc(d,A)
     else:
         if der == 0:                    # compute derivative matrix wrt 1st parameter
             A = sf2 * self.dmfunc(d,A)
         elif der == 1:                  # compute derivative matrix wrt 2nd parameter
             A = 2 * sf2 * self.mfunc(d,A)
         elif der == 2:                  # no derivative wrt 3rd parameter
             A = np.zeros_like(A)        # do nothing (d is not learned)
         else:
             raise Exception("Wrong derivative value in Matern")
     return A
Example #23
 def __init__(self,gifts,nb_neighbors=50,metric=None):
     """
     metric=None uses the chord distance
     """
     self.gifts = gifts
     self.X = gifts[['Latitude','Longitude']].values
     self.N = len(self.X)
     self.wgt = gifts.Weight.values
     #root of subtree -> list of nodes in this subtree
     self.subtrees = {i:[i] for i in range(self.N)}
     #node -> root of subtree
     self.Xto = list(range(self.N))
     #weight of subtrees
     self.subtree_weights = {i: self.wgt[i] for i in range(self.N)}
     #cartesian coordinates (ignoring earth radius)
     self.Z = np.apply_along_axis(self.to_cartesian,1,self.X)
     #distance from north pole to root points
     to_pole = cdist(np.atleast_2d(self.to_cartesian(north_pole)),self.Z)
     if metric is None:
         self.gates = to_pole[0].tolist()
     else:
         if isinstance(metric,Thin_Metric):
             self.gates = (AVG_EARTH_RADIUS * to_pole[0]).tolist()
         else:
             self.gates = cdist(np.atleast_2d(north_pole),self.X)[0].tolist()
     self.subtree_costs = {i:self.gates[i] for i in range(self.N)}
     self.total_cost = sum(self.subtree_costs.values())
     self.nb_neighbors = nb_neighbors
     import sklearn.neighbors
     self.kdtree = sklearn.neighbors.KDTree(self.Z)
     self.metric=metric
Example #24
 def proceed(self, x=None, z=None, der=None):
     ell = np.exp(self.hyp[0])        # characteristic length scale
     p   = np.exp(self.hyp[1])        # period
     sf2 = np.exp(2.*self.hyp[2])     # signal variance
     n,D = x.shape
     if z == 'diag':
         A = np.zeros((n,1))
     elif z is None:
         A = np.sqrt(spdist.cdist(x, x, 'sqeuclidean'))
     else:
         A = np.sqrt(spdist.cdist(x, z, 'sqeuclidean'))
     A = np.pi*A/p
     if der is None:             # compute covariance matrix for dataset x
         A = np.sin(A)/ell
         A = A * A
         A = sf2 *np.exp(-2.*A)
     else:
         if der == 0:            # compute derivative matrix wrt 1st parameter
             A = np.sin(A)/ell
             A = A * A
             A = 4. *sf2 *np.exp(-2.*A) * A
         elif der == 1:          # compute derivative matrix wrt 2nd parameter
             R = np.sin(A)/ell
             A = 4 * sf2/ell * np.exp(-2.*R*R)*R*np.cos(A)*A
         elif der == 2:          # compute derivative matrix wrt 3rd parameter
             A = np.sin(A)/ell
             A = A * A
             A = 2. * sf2 * np.exp(-2.*A)
         else:
             raise Exception("Wrong derivative index in covPeriodic")            
     return A
Example #25
def _compute_nearest(xhs, rr, use_balltree=True, return_dists=False):
    """Find nearest neighbors

    Note: The rows in xhs and rr must all be unit-length vectors, otherwise
    the result will be incorrect.

    Parameters
    ----------
    xhs : array, shape=(n_samples, n_dim)
        Points of data set.
    rr : array, shape=(n_query, n_dim)
        Points to find nearest neighbors for.
    use_balltree : bool
        Use fast BallTree based search from scikit-learn. If scikit-learn
        is not installed it will fall back to the slow brute force search.
    return_dists : bool
        If True, return associated distances.

    Returns
    -------
    nearest : array, shape=(n_query,)
        Index of nearest neighbor in xhs for every point in rr.
    distances : array, shape=(n_query,)
        The distances. Only returned if return_dists is True.
    """
    if use_balltree:
        try:
            from sklearn.neighbors import BallTree
        except ImportError:
            logger.info('Nearest-neighbor searches will be significantly '
                        'faster if scikit-learn is installed.')
            use_balltree = False

    if xhs.size == 0 or rr.size == 0:
        if return_dists:
            return np.array([], int), np.array([])
        return np.array([], int)
    if use_balltree is True:
        ball_tree = BallTree(xhs)
        if return_dists:
            out = ball_tree.query(rr, k=1, return_distance=True)
            return out[1][:, 0], out[0][:, 0]
        else:
            nearest = ball_tree.query(rr, k=1, return_distance=False)[:, 0]
            return nearest
    else:
        from scipy.spatial.distance import cdist
        if return_dists:
            nearest = list()
            dists = list()
            for r in rr:
                d = cdist(r[np.newaxis, :], xhs)
                idx = np.argmin(d)
                nearest.append(idx)
                dists.append(d[0, idx])
            return (np.array(nearest), np.array(dists))
        else:
            nearest = np.array([np.argmin(cdist(r[np.newaxis, :], xhs))
                                for r in rr])
            return nearest
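For example (illustrative; rows are unit-normalised as the docstring requires):

import numpy as np

rng = np.random.RandomState(0)
xhs = rng.randn(100, 3)
rr = rng.randn(10, 3)
xhs /= np.linalg.norm(xhs, axis=1, keepdims=True)
rr /= np.linalg.norm(rr, axis=1, keepdims=True)

nearest, dists = _compute_nearest(xhs, rr, return_dists=True)
print(nearest.shape, dists.shape)   # (10,) (10,)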
Example #26
    def proceed(self, x=None, z=None, der=None):
        ell   = np.exp(self.hyp[0])            # characteristic length scale
        sf2   = np.exp(2.*self.hyp[1])         # signal variance
        alpha = np.exp(self.hyp[2])            
        n,D = x.shape
        if z == 'diag':
            D2 = np.zeros((n,1))
        elif z is None:
            D2 = spdist.cdist(x/ell, x/ell, 'sqeuclidean')
        else:
            D2 = spdist.cdist(x/ell, z/ell, 'sqeuclidean')
        if der is None:                  # compute covariance matrix for dataset x
            A = sf2 * ( ( 1.0 + 0.5*D2/alpha )**(-alpha) )
        else:
            if der == 0:                # compute derivative matrix wrt 1st parameter
                A = sf2 * ( 1.0 + 0.5*D2/alpha )**(-alpha-1) * D2

            elif der == 1:              # compute derivative matrix wrt 2nd parameter
                A = 2.* sf2 * ( ( 1.0 + 0.5*D2/alpha )**(-alpha) )

            elif der == 2:              # compute derivative matrix wrt 3rd parameter
                K = ( 1.0 + 0.5*D2/alpha )
                A = sf2 * K**(-alpha) * (0.5*D2/K - alpha*np.log(K) )
            else:
                raise Exception("Wrong derivative index in covRQ")
        return A
Example #27
def online_k_means(k, b, t, X_in):
    random_number = 11232015
    random_num = np.random.randint(X_in.shape[0], size =300 )
    rng = np.random.RandomState(random_number)
    permutation1 = rng.permutation(len(random_num))
    random_num = random_num[permutation1]
    x_input = X_in[random_num]
    c,l = mykmeansplusplus(x_input,k,t)
    v = np.zeros((k))
    for i in range(t):
        random_num = np.random.randint(X_in.shape[0], size = b)
        rng = np.random.RandomState(random_number)
        permutation1 = rng.permutation(len(random_num))
        random_num = random_num[permutation1]
        M = X_in[random_num]
        Y = cdist(M, c, metric='euclidean')  # p/V/VI/w kwargs removed: invalid for this metric in current SciPy
        clust_index = np.argmin(Y,axis = 1)
        for j in range(M.shape[0]):   # j, to avoid shadowing the outer loop index i
            c_in = clust_index[j]
            v[c_in] += 1
            ita = 1 / v[c_in]
            c[c_in] = np.add(np.multiply((1 - ita),c[c_in]),np.multiply(ita,M[j]))
    Y_l = cdist(X_in, c, metric='euclidean')
    l = np.argmin(Y_l,axis = 1)        
    return c,l
Example #28
    def proceed(self, x=None, z=None, der=None):
        n, D = x.shape  
        ell = 1./np.exp(self.hyp[0:D])    # characteristic length scale
        sf2 = np.exp(2.*self.hyp[D])      # signal variance
        alpha = np.exp(self.hyp[D+1])
        if z == 'diag':
            D2 = np.zeros((n,1))
        elif z is None:
            tmp = np.dot(np.diag(ell),x.T).T
            D2 = spdist.cdist(tmp, tmp, 'sqeuclidean')
        else:
            D2 = spdist.cdist(np.dot(np.diag(ell),x.T).T, np.dot(np.diag(ell),z.T).T, 'sqeuclidean')
        if der is None:                 # compute covariance matrix for dataset x
            A = sf2 * ( ( 1.0 + 0.5*D2/alpha )**(-alpha) )
        else:
            if der < D:                 # compute derivative matrix wrt length scale parameters
                if z == 'diag':
                    A = D2*0
                elif z is None:
                    tmp = np.atleast_2d(x[:,der])/ell[der]
                    A = sf2 * ( 1.0 + 0.5*D2/alpha )**(-alpha-1) * spdist.cdist(tmp, tmp, 'sqeuclidean')
                else:
                    A = sf2 * ( 1.0 + 0.5*D2/alpha )**(-alpha-1) * spdist.cdist(np.atleast_2d(x[:,der]).T/ell[der], np.atleast_2d(z[:,der]).T/ell[der], 'sqeuclidean') 
            elif der==D:                # compute derivative matrix wrt magnitude parameter
                A = 2. * sf2 * ( ( 1.0 + 0.5*D2/alpha )**(-alpha) )

            elif der==(D+1):            # compute derivative matrix wrt the alpha (shape) parameter
                K = ( 1.0 + 0.5*D2/alpha )
                A = sf2 * K**(-alpha) * ( 0.5*D2/K - alpha*np.log(K) )
            else:
                raise Exception("Wrong derivative index in covRQard") 
        return A
Example #29
 def compute_matrices_for_gradient_totalcverr(self, train_x, train_y, train_z):
     if self.kernelX_use_median:
         sigmax = self.kernelX.get_sigma_median_heuristic(train_x)
         self.kernelX.set_width(float(sigmax))
     if self.kernelY_use_median:
         sigmay = self.kernelY.get_sigma_median_heuristic(train_y)
         self.kernelY.set_width(float(sigmay))
     kf = KFold( n_splits=self.K_folds)
     matrix_results = [[[None] for _ in range(self.K_folds)]for _ in range(8)] 
     # xx=[[None]*10]*6 would give the same id to xx[0][0], xx[1][0], etc., since
     # it simply copies [None] many times, whereas the above creates distinct lists.
     count = 0
     for train_index, test_index in kf.split(np.ones((self.num_samples,1))):
         X_tr, X_tst = train_x[train_index], train_x[test_index]
         Y_tr, Y_tst = train_y[train_index], train_y[test_index]
         Z_tr, Z_tst = train_z[train_index], train_z[test_index]
         matrix_results[0][count] = self.kernelX.kernel(X_tst, X_tr) #Kx_tst_tr
         matrix_results[1][count] = self.kernelX.kernel(X_tr, X_tr) #Kx_tr_tr
         matrix_results[2][count] = self.kernelX.kernel(X_tst, X_tst) #Kx_tst_tst
         matrix_results[3][count] = self.kernelY.kernel(Y_tst, Y_tr) #Ky_tst_tr
         matrix_results[4][count] = self.kernelY.kernel(Y_tr, Y_tr) #Ky_tr_tr
         matrix_results[5][count] = self.kernelY.kernel(Y_tst,Y_tst) #Ky_tst_tst
         matrix_results[6][count] = cdist(Z_tst, Z_tr, 'sqeuclidean') #D_tst_tr: square distance matrix
         matrix_results[7][count] = cdist(Z_tr, Z_tr, 'sqeuclidean') #D_tr_tr: square distance matrix
         count = count + 1
     return matrix_results
Example #30
def euclidean_distances(X, Y, squared=False, inverse=True):
    """
    Considering the rows of X (and Y=X) as vectors, compute the
    distance matrix between each pair of vectors.

    An implementation of a "similarity" based on the Euclidean "distance"
    between two vectors X and Y. Thinking of items as dimensions and
    preferences as points along those dimensions, a distance is computed using
    all items (dimensions) where both users have expressed a preference for
    that item. This is simply the square root of the sum of the squares of
    differences in position (preference) along each dimension.

    Parameters
    ----------
    X: array of shape (n_samples_1, n_features)

    Y: array of shape (n_samples_2, n_features)

    squared: boolean, optional
        This routine will return squared Euclidean distances instead.

    inverse: boolean, optional
        This routine will return the inverse Euclidean distances instead.

    Returns
    -------
    distances: array of shape (n_samples_1, n_samples_2)

    Examples
    --------
    >>> from scikits.crab.metrics.pairwise import euclidean_distances
    >>> X = [[2.5, 3.5, 3.0, 3.5, 2.5, 3.0],[3.0, 3.5, 1.5, 5.0, 3.5,3.0]]
    >>> # distance between rows of X
    >>> euclidean_distances(X, X)
    array([[ 1.        ,  0.29429806],
           [ 0.29429806,  1.        ]])
    >>> # inverse distance to the origin
    >>> X = [[1.0, 0.0],[1.0,1.0]]
    >>> euclidean_distances(X, [[0.0, 0.0]])
    array([[ 0.5       ],
           [ 0.41421356]])

    """
    # should not need X_norm_squared because if you could precompute that as
    # well as Y, then you should just pre-compute the output and not even
    # call this function.
    if X is Y:
        X = Y = np.asanyarray(X)
    else:
        X = np.asanyarray(X)
        Y = np.asanyarray(Y)

    if X.shape[1] != Y.shape[1]:
        raise ValueError("Incompatible dimension for X and Y matrices")

    if squared:
        return ssd.cdist(X, Y, 'sqeuclidean')

    XY = ssd.cdist(X, Y)
    return np.divide(1.0, (1.0 + XY)) if inverse else XY