Example #1
def euclidean_distances(X, Y, squared=False, inverse=True):
    """
    Considering the rows of X (and Y=X) as vectors, compute the
    distance matrix between each pair of vectors.

    An implementation of a "similarity" based on the Euclidean "distance"
    between two vectors X and Y. Thinking of items as dimensions and
    preferences as points along those dimensions, a distance is computed using
    all items (dimensions) where both users have expressed a preference for
    that item. This is simply the square root of the sum of the squares of
    differences in position (preference) along each dimension.

    Parameters
    ----------
    X: array of shape (n_samples_1, n_features)

    Y: array of shape (n_samples_2, n_features)

    squared: boolean, optional
        If True, return squared Euclidean distances instead.

    inverse: boolean, optional
        If True (the default), return inverse Euclidean distances,
        i.e. 1 / (1 + distance), instead of the raw distances.

    Returns
    -------
    distances: array of shape (n_samples_1, n_samples_2)

    Examples
    --------
    >>> from scikits.crab.metrics.pairwise import euclidean_distances
    >>> X = [[2.5, 3.5, 3.0, 3.5, 2.5, 3.0],[3.0, 3.5, 1.5, 5.0, 3.5,3.0]]
    >>> # inverse distance between rows of X (inverse=True is the default)
    >>> euclidean_distances(X, X)
    array([[ 1.        ,  0.29429806],
           [ 0.29429806,  1.        ]])
    >>> # inverse distance to the origin
    >>> X = [[1.0, 0.0],[1.0,1.0]]
    >>> euclidean_distances(X, [[0.0, 0.0]])
    array([[ 0.5       ],
           [ 0.41421356]])

    """
    # should not need X_norm_squared because if you could precompute that as
    # well as Y, then you should just pre-compute the output and not even
    # call this function.
    if X is Y:
        X = Y = np.asanyarray(X)
    else:
        X = np.asanyarray(X)
        Y = np.asanyarray(Y)

    if X.shape[1] != Y.shape[1]:
        raise ValueError("Incompatible dimension for X and Y matrices")

    if squared:
        return ssd.cdist(X, Y, 'sqeuclidean')

    XY = ssd.cdist(X, Y)
    return np.divide(1.0, (1.0 + XY)) if inverse else XY
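
A minimal usage sketch for the flags not exercised by the doctest above (squared and inverse); it assumes the function is defined in the same script together with the numpy/scipy imports its body relies on.

import numpy as np
import scipy.spatial.distance as ssd

X = [[1.0, 0.0], [1.0, 1.0]]
Y = [[0.0, 0.0]]
print(euclidean_distances(X, Y, inverse=False))   # plain distances: 1.0 and sqrt(2)
print(euclidean_distances(X, Y, squared=True))    # squared distances (inverse is ignored here)
print(euclidean_distances(X, Y))                  # default: 1 / (1 + distance)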
Example #2
 def compute_matrices_for_gradient_totalcverr(self, train_x, train_y, train_z):
     if self.kernelX_use_median:
         sigmax = self.kernelX.get_sigma_median_heuristic(train_x)
         self.kernelX.set_width(float(sigmax))
     if self.kernelY_use_median:
         sigmay = self.kernelY.get_sigma_median_heuristic(train_y)
         self.kernelY.set_width(float(sigmay))
     kf = KFold( n_splits=self.K_folds)
     matrix_results = [[[None] for _ in range(self.K_folds)] for _ in range(8)]
     # Note: xx = [[None]*10]*6 would make xx[0], xx[1], ... references to the
     # same inner list, since * only copies the reference. The nested
     # comprehension above creates a distinct list for each entry.
     count = 0
     for train_index, test_index in kf.split(np.ones((self.num_samples,1))):
         X_tr, X_tst = train_x[train_index], train_x[test_index]
         Y_tr, Y_tst = train_y[train_index], train_y[test_index]
         Z_tr, Z_tst = train_z[train_index], train_z[test_index]
         matrix_results[0][count] = self.kernelX.kernel(X_tst, X_tr) #Kx_tst_tr
         matrix_results[1][count] = self.kernelX.kernel(X_tr, X_tr) #Kx_tr_tr
         matrix_results[2][count] = self.kernelX.kernel(X_tst, X_tst) #Kx_tst_tst
         matrix_results[3][count] = self.kernelY.kernel(Y_tst, Y_tr) #Ky_tst_tr
         matrix_results[4][count] = self.kernelY.kernel(Y_tr, Y_tr) #Ky_tr_tr
         matrix_results[5][count] = self.kernelY.kernel(Y_tst,Y_tst) #Ky_tst_tst
         matrix_results[6][count] = cdist(Z_tst, Z_tr, 'sqeuclidean') #D_tst_tr: square distance matrix
         matrix_results[7][count] = cdist(Z_tr, Z_tr, 'sqeuclidean') #D_tr_tr: square distance matrix
         count = count + 1
     return matrix_results
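
A small stand-alone sketch of the list-aliasing pitfall the comment above warns about: multiplying a nested list copies references to one inner list, while the comprehension used above builds a distinct list per entry.

shared = [[None] * 3] * 2            # both rows reference the *same* inner list
shared[0][0] = 'x'
print(shared)                        # [['x', None, None], ['x', None, None]]

distinct = [[None] * 3 for _ in range(2)]   # each row is a fresh list
distinct[0][0] = 'x'
print(distinct)                      # [['x', None, None], [None, None, None]]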
def online_k_means(k,b,t,X_in):
    random_number = 11232015
    random_num = np.random.randint(X_in.shape[0], size =300 )
    rng = np.random.RandomState(random_number)
    permutation1 = rng.permutation(len(random_num))
    random_num = random_num[permutation1]
    x_input = X_in[random_num]
    c,l = mykmeansplusplus(x_input,k,t)
    v = np.zeros((k))
    for i in range(t):
        random_num = np.random.randint(X_in.shape[0], size = b)
        rng = np.random.RandomState(random_number)
        permutation1 = rng.permutation(len(random_num))
        random_num = random_num[permutation1]
        M = X_in[random_num]
        Y = cdist(M, c, metric='euclidean')
        clust_index = np.argmin(Y, axis=1)
        for j in range(M.shape[0]):
            c_in = clust_index[j]
            v[c_in] += 1
            ita = 1 / v[c_in]
            c[c_in] = np.add(np.multiply((1 - ita), c[c_in]), np.multiply(ita, M[j]))
    Y_l = cdist(X_in, c, metric='euclidean')
    l = np.argmin(Y_l,axis = 1)        
    return c,l
Example #4
    def test_init_corr(self, other, T = 5e-3, outlierprior=1e-1, outlierfrac=1e-2, outliercutoff=1e-2, ):
        import scipy.spatial.distance as ssd
        import sys
        self.transform_points()
        other.transform_points()
        init_prob_nm(self.pt_ptrs, other.pt_ptrs, 
                     self.pt_w_ptrs, other.pt_w_ptrs, 
                     self.dims_gpu, other.dims_gpu,
                     self.N, outlierprior, outlierfrac, T, 
                     self.corr_cm_ptrs, self.corr_rm_ptrs)
        gpu_corr_rm = self.corr_rm[0].get()
        gpu_corr_rm = gpu_corr_rm.flatten()[:(self.dims[0] + 1) * (other.dims[0] + 1)].reshape(self.dims[0]+1, other.dims[0]+1)
        s_pt_w = self.pts_w[0].get()
        s_pt   = self.pts[0].get()
        o_pt_w = other.pts_w[0].get()
        o_pt   = other.pts[0].get()

        d1 = ssd.cdist(s_pt_w, o_pt, 'euclidean')
        d2 = ssd.cdist(s_pt, o_pt_w, 'euclidean')

        p_nm = np.exp( -(d1 + d2) / (2 * T))

        for i in range(self.dims[0]):
            for j in range(other.dims[0]):
                if abs(p_nm[i, j] - gpu_corr_rm[i, j]) > 1e-7:
                    print "INIT CORR MATRICES DIFFERENT"
                    print i, j, p_nm[i, j], gpu_corr_rm[i, j]
                    ipy.embed()
                    sys.exit(1)
Example #5
def ch(X, cIDX, distance="euclidean"):
    Nclusters = cIDX.max() + 1
    Npoints = len(X)

    n = np.ndarray(shape=(Nclusters), dtype=float)

    j = 0
    for i in range(cIDX.min(), cIDX.max() + 1):
        aux = np.asarray([float(b) for b in (cIDX == i)])
        n[j] = aux.sum()
        j = j + 1

    # Clusters
    A = np.array([X[np.where(cIDX == i)] for i in range(Nclusters)])
    # Centroids
    v = np.array([np.sum(Ai, axis=0) / float(Ai.shape[0]) for Ai in A])

    ssb = 0

    for i in range(Nclusters):
        ssb = n[i] * (cdist([v[i]], [np.mean(X, axis=0)], metric=distance)[0][0] ** 2) + ssb

    z = np.ndarray(shape=(Nclusters), dtype=float)

    for i in range(cIDX.min(), cIDX.max() + 1):
        aux = np.array([(cdist([x], [v[i]], metric=distance)[0][0] ** 2) for x in X[cIDX == i]])
        z[i] = aux.sum()

    ssw = z.sum()

    return (ssb / (Nclusters - 1)) / (ssw / (Npoints - Nclusters))
Example #6
def _compute_nearest(xhs, rr, use_balltree=True, return_dists=False):
    """Find nearest neighbors

    Note: The rows in xhs and rr must all be unit-length vectors, otherwise
    the result will be incorrect.

    Parameters
    ----------
    xhs : array, shape=(n_samples, n_dim)
        Points of data set.
    rr : array, shape=(n_query, n_dim)
        Points to find nearest neighbors for.
    use_balltree : bool
        Use fast BallTree based search from scikit-learn. If scikit-learn
        is not installed it will fall back to the slow brute force search.
    return_dists : bool
        If True, return associated distances.

    Returns
    -------
    nearest : array, shape=(n_query,)
        Index of nearest neighbor in xhs for every point in rr.
    distances : array, shape=(n_query,)
        The distances. Only returned if return_dists is True.
    """
    if use_balltree:
        try:
            from sklearn.neighbors import BallTree
        except ImportError:
            logger.info('Nearest-neighbor searches will be significantly '
                        'faster if scikit-learn is installed.')
            use_balltree = False

    if xhs.size == 0 or rr.size == 0:
        if return_dists:
            return np.array([], int), np.array([])
        return np.array([], int)
    if use_balltree is True:
        ball_tree = BallTree(xhs)
        if return_dists:
            out = ball_tree.query(rr, k=1, return_distance=True)
            return out[1][:, 0], out[0][:, 0]
        else:
            nearest = ball_tree.query(rr, k=1, return_distance=False)[:, 0]
            return nearest
    else:
        from scipy.spatial.distance import cdist
        if return_dists:
            nearest = list()
            dists = list()
            for r in rr:
                d = cdist(r[np.newaxis, :], xhs)
                idx = np.argmin(d)
                nearest.append(idx)
                dists.append(d[0, idx])
            return (np.array(nearest), np.array(dists))
        else:
            nearest = np.array([np.argmin(cdist(r[np.newaxis, :], xhs))
                                for r in rr])
            return nearest
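
A rough usage sketch for _compute_nearest with unit-length rows, as the docstring requires; it assumes the snippet's module-level numpy import (and its logger, which is only touched when scikit-learn is missing).

import numpy as np

rng = np.random.RandomState(0)
xhs = rng.randn(100, 3)
xhs /= np.linalg.norm(xhs, axis=1, keepdims=True)   # normalize rows to unit length
rr = rng.randn(5, 3)
rr /= np.linalg.norm(rr, axis=1, keepdims=True)

idx, dists = _compute_nearest(xhs, rr, return_dists=True)
print(idx.shape, dists.shape)   # (5,), (5,)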
def test_pairwise_distances_data_derived_params(n_jobs, metric, dist_function,
                                                y_is_x):
    # check that pairwise_distances give the same result in sequential and
    # parallel, when metric has data-derived parameters.
    with config_context(working_memory=1):  # to have more than 1 chunk
        rng = np.random.RandomState(0)
        X = rng.random_sample((1000, 10))

        if y_is_x:
            Y = X
            expected_dist_default_params = squareform(pdist(X, metric=metric))
            if metric == "seuclidean":
                params = {'V': np.var(X, axis=0, ddof=1)}
            else:
                params = {'VI': np.linalg.inv(np.cov(X.T)).T}
        else:
            Y = rng.random_sample((1000, 10))
            expected_dist_default_params = cdist(X, Y, metric=metric)
            if metric == "seuclidean":
                params = {'V': np.var(np.vstack([X, Y]), axis=0, ddof=1)}
            else:
                params = {'VI': np.linalg.inv(np.cov(np.vstack([X, Y]).T)).T}

        expected_dist_explicit_params = cdist(X, Y, metric=metric, **params)
        dist = np.vstack(tuple(dist_function(X, Y,
                                             metric=metric, n_jobs=n_jobs)))

        assert_allclose(dist, expected_dist_explicit_params)
        assert_allclose(dist, expected_dist_default_params)
Example #8
 def __init__(self,gifts,nb_neighbors=50,metric=None):
     """
     metric=None uses the chord distance
     """
     self.gifts = gifts
     self.X = gifts[['Latitude','Longitude']].values
     self.N = len(self.X)
     self.wgt = gifts.Weight.values
     #root of subtree -> list of nodes in this subtree
     self.subtrees = {i:[i] for i in range(self.N)}
     #node -> root of subtree
     self.Xto = list(range(self.N))
     #weight of subtrees
     self.subtree_weights = {i: self.wgt[i] for i in range(self.N)}
     #cartesian coordinates (ignoring earth radius)
     self.Z = np.apply_along_axis(self.to_cartesian,1,self.X)
     #distance from north pole to root points
     to_pole = cdist(np.atleast_2d(self.to_cartesian(north_pole)),self.Z)
     if metric is None:
         self.gates = to_pole[0].tolist()
     else:
         if isinstance(metric,Thin_Metric):
             self.gates = (AVG_EARTH_RADIUS * to_pole[0]).tolist()
         else:
             self.gates = cdist(np.atleast_2d(north_pole),self.X)[0].tolist()
     self.subtree_costs = {i:self.gates[i] for i in range(self.N)}
     self.total_cost = sum(self.subtree_costs.values())
     self.nb_neighbors = nb_neighbors
     import sklearn.neighbors
     self.kdtree = sklearn.neighbors.KDTree(self.Z)
     self.metric=metric
Example #9
def cdist_sparse( X, Y, **kwargs ):
    """ -> |X| x |Y| cdist array, any cdist metric
        X or Y may be sparse -- best csr
    """
    # todense one row at a time; very slow if both inputs are very sparse
    sxy = 2*issparse(X) + issparse(Y)
    if sxy == 0:
        d = cdist( X, Y, **kwargs )
        if kwargs.get("metric") == "cosine":
            return 1 - d
        else:
            return d
    d = np.empty( (X.shape[0], Y.shape[0]), np.float64 )
    if sxy == 2:
        for j, x in enumerate(X):
            d[j] = cdist( x.todense(), Y, **kwargs ) [0]
    elif sxy == 1:
        for k, y in enumerate(Y):
            d[:,k] = cdist( X, y.todense(), **kwargs ) [0]
    else:
        for j, x in enumerate(X):
            for k, y in enumerate(Y):
                d[j,k] = cdist( x.todense(), y.todense(), **kwargs ) [0]
    if kwargs["metric"] == "cosine":
        return 1 - d
    else:
        return d
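
A hedged usage sketch for cdist_sparse with one sparse and one dense operand; the imports mirror what the function body expects (numpy as np, issparse and cdist in scope).

import numpy as np
from scipy.sparse import issparse, csr_matrix
from scipy.spatial.distance import cdist

X = csr_matrix(np.random.rand(4, 6))
Y = np.random.rand(3, 6)
D = cdist_sparse(X, Y, metric='euclidean')
print(D.shape)   # (4, 3)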
Example #10
def eccentricity(data, exponent=1.,  metricpar={}, callback=None):
    if data.ndim==1:
        assert metricpar=={}, 'No optional parameter is allowed for a dissimilarity matrix.'
        ds = squareform(data, force='tomatrix')
        if exponent in (np.inf, 'Inf', 'inf'):
            return ds.max(axis=0)
        elif exponent==1.:
            ds = np.power(ds, exponent)
            return ds.sum(axis=0)/float(len(ds))
        else:
            ds = np.power(ds, exponent)
            return np.power(ds.sum(axis=0)/float(len(ds)), 1./exponent)
    else:
        progress = progressreporter(callback)
        N = len(data)
        ecc = np.empty(N)
        if exponent in (np.inf, 'Inf', 'inf'):
            for i in range(N):
                ecc[i] = cdist(data[(i,),:], data, **metricpar).max()
                progress((i+1)*100//N)
        elif exponent==1.:
            for i in range(N):
                ecc[i] = cdist(data[(i,),:], data, **metricpar).sum()/float(N)
                progress((i+1)*100//N)
        else:
            for i in range(N):
                dsum = np.power(cdist(data[(i,),:], data, **metricpar),
                                exponent).sum()
                ecc[i] = np.power(dsum/float(N), 1./exponent)
                progress((i+1)*100//N)
        return ecc
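
A small sketch of the dissimilarity-matrix branch of eccentricity above (data.ndim == 1), which sidesteps the callback machinery; pdist/squareform and numpy are assumed to be imported as in the snippet's module.

import numpy as np
from scipy.spatial.distance import pdist, squareform

pts = np.random.rand(50, 3)
d_condensed = pdist(pts)             # condensed distance matrix, ndim == 1
ecc = eccentricity(d_condensed, exponent=1.)
print(ecc.shape)                     # (50,)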
Example #11
 def bandwidth(self, X):
     """
 Estimate bandwidth
 TODO Replace this with a method which treats the data like a 
 distorted doughnut by estimating limit cycle, and integrating an ellipsoid
 swept along the trajectory of the limit cycle with apropriate lengths
 """
     N = X.shape[0]
     D = X.shape[1]
     points_in_cluster = N / float(self.Ncl)  # Wanted points in a cluster
     debugLog(self, "Estimating bandwidth")
     # Sample points from an n-box for numerical integration
     S = 10 * (6 ** D)
     # Grab a subsample of points to look at
     idx = randint(0, N, (S))
     v = X[idx]
     # Sample points
     y = (X.max() - X.min()) * randn(S, D) + X.min()
     # Find how close together points in our subsample typically are
     w = std(cdist(X[list(set(arange(N)).difference(idx))], v).min(0))
     # Count points in our box that are approximately this close
     c = sum(cdist(X, y).min(0) < self.sf * w)
     # Compute volume from length with sphere prefactor
     V = nSphereVolume(D) * ((c / float(S)) * (X.max() - X.min()) ** D)
     # Calculate bandwidth from the volume needed to hold the wanted points per cluster
     return ((V * points_in_cluster) / N) ** (1.0 / D)
Example #12
 def learn(self,learndataset,pipp_normalise=True):
     """learn the tree structure required to perform evaluation
     
     :param learndataset: learning instances
     :type learndataset: :class:`~classifip.dataset.arff.ArffFile`
     :param pipp_normalise: normalise the input features or not
     :type pipp_normalise: boolean
     
     .. note::
 
         learndataset should come from an xarff file tailored for label ranking
     """
     self.labels=learndataset.attribute_data['L'][:]
     learndata=[row[0:len(row)-1] for row in learndataset.data]
     data_array=np.array(learndata).astype(float)
     if pipp_normalise == True:
         span=data_array.max(axis=0)-data_array.min(axis=0)
         self.normal.append(True)
         self.normal.append(span)
         self.normal.append(data_array.min(axis=0))
         data_array=(data_array-data_array.min(axis=0))/span
     else:
         self.normal.append(False)
         
     # Initialise radius as the average distance between all learning instances
     if len(data_array) > 1000:
         data_red=np.random.permutation(data_array)[0:1000]
         distances=distance.cdist(data_red,data_red)
     else:
         distances=distance.cdist(data_array,data_array)
     self.radius=distances.sum()/(2*(len(distances)**2-len(distances)))
     self.tree=kdtree.KDTree(data_array)
     self.truerankings=[ranking_matrices(row[-1],self.labels) for row
                      in learndataset.data]
Example #13
def covSEisoU(hyp=None, x=None, z=None, der=None):
    # Squared Exponential covariance function with isotropic distance measure with
    # unit magnitude. The covariance function is parameterized as:
    # 
    # k(x^p,x^q) = exp( -(x^p - x^q)' * inv(P) * (x^p - x^q) / 2 )
    #
    # where the P matrix is ell^2 times the unit matrix. 
    # 
    # The hyperparameters of the function are:
    #
    # hyp = [ log(ell) ]
    

    if hyp is None:                 # report number of parameters
        return [1]

    ell = np.exp(hyp[0])            # characteristic length scale
    n,D = x.shape

    if z == 'diag':
        A = np.zeros((n,1))
    elif z is None:
        A = spdist.cdist(x/ell, x/ell, 'sqeuclidean')
    else:                            # compute covariance between data sets x and z
        A = spdist.cdist(x/ell, z/ell, 'sqeuclidean')   # cross covariances

    if der is None:                  # compute covariance matrix for dataset x
        A = np.exp(-0.5*A)
    else:
        if der == 0:                 # compute derivative matrix wrt 1st parameter
            A = np.exp(-0.5*A) * A
        else:
            raise Exception("Wrong derivative index in covSEisoU")

    return A
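
A brief usage sketch for covSEisoU on random inputs; hyp holds log(ell), and spdist is assumed to be scipy.spatial.distance as in the rest of this file.

import numpy as np
import scipy.spatial.distance as spdist

X = np.random.rand(10, 2)
K = covSEisoU(hyp=[np.log(0.5)], x=X)                  # 10 x 10 covariance matrix
Kdiag = covSEisoU(hyp=[np.log(0.5)], x=X, z='diag')    # self-variances, shape (10, 1)
print(K.shape, Kdiag.shape)
print(covSEisoU())                                     # no hyp -> reports [1] hyperparameter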
Example #14
def lloyd2(data, init_cent, metric='e', verbose=False):
    k = init_cent.shape[0]
    cent = np.copy(init_cent)
    labels = spdist.cdist(data, cent, metric).argmin(axis=1)
    converged = False
    t, tmax = 0, 1000

    while not converged and t < tmax:
        t += 1
        converged = True

        cent_ = np.array([np.mean(data[labels == l], axis=0)
                         for l in range(k)])

        labels_ = spdist.cdist(data, cent_, metric).argmin(axis=1)

        if not np.allclose(cent_, cent) or \
                not np.all(labels == labels_):
            converged = False
            labels = labels_
            cent = cent_

    if not converged:
        # raise UserWarning("did not converge after {} iterations".format(t))
        print("did not converge after {} iterations".format(t))
    elif verbose:
        print("Converged after {} iterations".format(t))

    return cent, labels
def sim_compute(word, dis_type, topk=100):
    # Get the index of word and the corresponding vector
    try:
        index = word2idx[word]
        wordvec = myembed[index, :].reshape(1,-1)
    except KeyError as exc:
        print("Word %s is not present in Vocabulary" % exc)
        return



    # For cosine and correlation the similarity is 1 - distance
    # Else for others just inverse the distance by  multipying with -1
    if (dis_type == 'cosine' or dis_type == 'correlation' ):
        sim = 1 - cdist(wordvec, myembed, dis_type)
    else:
        sim = -1 * cdist(wordvec, myembed, dis_type)

    # Now operations to get sim the shape we need i.e from (1,N) to (N,)
    final = sim[0].T

    zipped = list(zip(range(len(final)), final))
    del zipped[index]
    zipped.sort(key=lambda t: t[1], reverse=True)

    return zipped
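
A stand-in sketch of the similarity idea described in the comments above (cosine similarity = 1 - cosine distance), using a random matrix in place of the word2idx/myembed globals this snippet depends on.

import numpy as np
from scipy.spatial.distance import cdist

myembed = np.random.rand(1000, 50)            # hypothetical embedding matrix
wordvec = myembed[42].reshape(1, -1)          # pretend this row is the query word
sim = 1 - cdist(wordvec, myembed, 'cosine')   # shape (1, N) of cosine similarities
top = np.argsort(-sim[0])[:5]                 # indices of the 5 most similar rows
print(top)                                    # index 42 itself comes first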
Example #16
File: cov.py Project: Comy/pyGPs
    def proceed(self, x=None, z=None, der=None):
        n, D = x.shape  
        ell = 1./np.exp(self.hyp[0:D])    # characteristic length scale
        sf2 = np.exp(2.*self.hyp[D])      # signal variance
        alpha = np.exp(self.hyp[D+1])
        if z == 'diag':
            D2 = np.zeros((n,1))
        elif z is None:
            tmp = np.dot(np.diag(ell),x.T).T
            D2 = spdist.cdist(tmp, tmp, 'sqeuclidean')
        else:
            D2 = spdist.cdist(np.dot(np.diag(ell),x.T).T, np.dot(np.diag(ell),z.T).T, 'sqeuclidean')
        if der is None:                 # compute covariance matrix for dataset x
            A = sf2 * ( ( 1.0 + 0.5*D2/alpha )**(-alpha) )
        else:
            if der < D:                 # compute derivative matrix wrt length scale parameters
                if z == 'diag':
                    A = D2*0
                elif z is None:
                    tmp = np.atleast_2d(x[:,der])/ell[der]
                    A = sf2 * ( 1.0 + 0.5*D2/alpha )**(-alpha-1) * spdist.cdist(tmp, tmp, 'sqeuclidean')
                else:
                    A = sf2 * ( 1.0 + 0.5*D2/alpha )**(-alpha-1) * spdist.cdist(np.atleast_2d(x[:,der]).T/ell[der], np.atleast_2d(z[:,der]).T/ell[der], 'sqeuclidean') 
            elif der==D:                # compute derivative matrix wrt magnitude parameter
                A = 2. * sf2 * ( ( 1.0 + 0.5*D2/alpha )**(-alpha) )

            elif der==(D+1):            # compute derivative matrix wrt the shape parameter alpha
                K = ( 1.0 + 0.5*D2/alpha )
                A = sf2 * K**(-alpha) * ( 0.5*D2/K - alpha*np.log(K) )
            else:
                raise Exception("Wrong derivative index in covRQard") 
        return A
Example #17
def remove_redundant_mot(mot, rev_mot, max_d):
    """Reads a list of PWMs and removes redundant ones based on correlation.
    
    Args:
    - mot: array of motifs, one (flattened) PWM per row.
    - rev_mot: reversed versions of the motifs, same shape as mot.
    - max_d: Maximum distance for motifs to be considered redundant.

    Note: relies on a `pvals` array of per-motif p-values from the enclosing scope.

    Return value:
    A boolean numpy array is_good, that indicates whether each motif should
    be kept or not.
    """
    
    nmot = mot.shape[0]
    assert(rev_mot.shape[0] == nmot)
    assert(rev_mot.shape[1] == mot.shape[1])
    
    is_good = np.ones((nmot, ), dtype = bool)
    
    for i in range(nmot - 1):
        if not is_good[i]:
            continue
        # Get all the indices that are higher than i and have not been removed already.
        others = np.argwhere(np.logical_and(is_good, np.arange(0, nmot) > i)).flatten()
        d = cdist(np.reshape(mot[i, :], (1, mot.shape[1])), mot[others, :], metric = 'correlation').flatten()
        rev_d = cdist(np.reshape(mot[i, :], (1, mot.shape[1])), rev_mot[others, :], metric = 'correlation').flatten()
        # Get minimum distance so maximum correlation.
        d = np.minimum(d, rev_d)
        # PWMs that are similar to the i-th one and have a worse p-value
        # get marked as redundant.
        bad = others[np.logical_and(d < max_d, pvals[i] < pvals[others])]
        is_good[bad] = False
        if np.any(np.logical_and(d < max_d, pvals[i] > pvals[others])):
            is_good[i] = False
    return is_good
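
A hypothetical usage sketch for remove_redundant_mot with random data; note that `pvals` (per-motif p-values, smaller = better) must exist in the function's enclosing scope, so it is defined at module level here.

import numpy as np
from scipy.spatial.distance import cdist

rng = np.random.RandomState(0)
mot = rng.rand(5, 12)        # 5 motifs, 12 flattened PWM entries each
rev_mot = mot[:, ::-1]       # reversed motifs, same shape
pvals = rng.rand(5)          # per-motif p-values used inside the function

keep = remove_redundant_mot(mot, rev_mot, max_d=0.2)
print(keep, mot[keep].shape)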
Example #18
File: cov.py Project: Comy/pyGPs
 def proceed(self, x=None, z=None, der=None):
     ell = np.exp(self.hyp[0])        # characteristic length scale
     p   = np.exp(self.hyp[1])        # period
     sf2 = np.exp(2.*self.hyp[2])     # signal variance
     n,D = x.shape
     if z == 'diag':
         A = np.zeros((n,1))
     elif z is None:
         A = np.sqrt(spdist.cdist(x, x, 'sqeuclidean'))
     else:
         A = np.sqrt(spdist.cdist(x, z, 'sqeuclidean'))
     A = np.pi*A/p
     if der is None:             # compute covariance matrix for dataset x
         A = np.sin(A)/ell
         A = A * A
         A = sf2 *np.exp(-2.*A)
     else:
         if der == 0:            # compute derivative matrix wrt 1st parameter
             A = np.sin(A)/ell
             A = A * A
             A = 4. *sf2 *np.exp(-2.*A) * A
         elif der == 1:          # compute derivative matrix wrt 2nd parameter
             R = np.sin(A)/ell
             A = 4 * sf2/ell * np.exp(-2.*R*R)*R*np.cos(A)*A
         elif der == 2:          # compute derivative matrix wrt 3rd parameter
             A = np.sin(A)/ell
             A = A * A
             A = 2. * sf2 * np.exp(-2.*A)
         else:
             raise Exception("Wrong derivative index in covPeriodic")            
     return A
Example #19
File: cov.py Project: Comy/pyGPs
    def proceed(self, x=None, z=None, der=None):
        ell   = np.exp(self.hyp[0])            # characteristic length scale
        sf2   = np.exp(2.*self.hyp[1])         # signal variance
        alpha = np.exp(self.hyp[2])            
        n,D = x.shape
        if z == 'diag':
            D2 = np.zeros((n,1))
        elif z is None:
            D2 = spdist.cdist(x/ell, x/ell, 'sqeuclidean')
        else:
            D2 = spdist.cdist(x/ell, z/ell, 'sqeuclidean')
        if der is None:                  # compute covariance matrix for dataset x
            A = sf2 * ( ( 1.0 + 0.5*D2/alpha )**(-alpha) )
        else:
            if der == 0:                # compute derivative matrix wrt 1st parameter
                A = sf2 * ( 1.0 + 0.5*D2/alpha )**(-alpha-1) * D2

            elif der == 1:              # compute derivative matrix wrt 2nd parameter
                A = 2.* sf2 * ( ( 1.0 + 0.5*D2/alpha )**(-alpha) )

            elif der == 2:              # compute derivative matrix wrt 3rd parameter
                K = ( 1.0 + 0.5*D2/alpha )
                A = sf2 * K**(-alpha) * (0.5*D2/K - alpha*np.log(K) )
            else:
                raise Exception("Wrong derivative index in covRQ")
        return A
Example #20
File: cov.py Project: Comy/pyGPs
 def proceed(self, x=None, z=None, der=None):
     n, D = x.shape  
     ell = 1./np.exp(self.hyp[0:D])    # characteristic length scale
     sf2 = np.exp(2.*self.hyp[D])      # signal variance
     if z == 'diag':
         A = np.zeros((n,1))
     elif z is None:
         tem = np.dot(np.diag(ell),x.T).T
         A = spdist.cdist(tem,tem,'sqeuclidean')
     else:                # compute covariance between data sets x and z
         A = spdist.cdist(np.dot(np.diag(ell),x.T).T,np.dot(np.diag(ell),z.T).T,'sqeuclidean')
     A = sf2*np.exp(-0.5*A)
     if der is not None:
         if der < D:      # compute derivative matrix wrt length scale parameters
             if z == 'diag':
                 A = A*0
             elif z is None:
                 tem = np.atleast_2d(x[:,der])/ell[der]
                 A *= spdist.cdist(tem,tem,'sqeuclidean')
             else:
                 A *= spdist.cdist(np.atleast_2d(x[:,der]).T/ell[der],np.atleast_2d(z[:,der]).T/ell[der],'sqeuclidean')
         elif der==D:     # compute derivative matrix wrt magnitude parameter
             A = 2.*A
         else:
             raise Exception("Wrong derivative index in RDFard")   
     return A
Example #21
File: cov.py Project: Comy/pyGPs
 def proceed(self, x=None, z=None, der=None):
     ell = np.exp(self.hyp[0])        # characteristic length scale
     sf2 = np.exp(2.* self.hyp[1])    # signal variance
     d   = self.para[0]               # 2 times nu
     if np.abs(d-np.round(d)) < 1e-8: # remove numerical error from format of parameter
         d = int(round(d))
     d = int(d)
     try:
         assert(d in [1,3,5])         # check for valid values of d
     except AssertionError:
         print "Warning: You specified d to be neither 1,3 nor 5. We set d=3. "
         d = 3
     if z == 'diag':
         A = np.zeros((x.shape[0],1))
     elif z is None:
         x = np.sqrt(d)*x/ell   
         A = np.sqrt(spdist.cdist(x, x, 'sqeuclidean'))
     else:
         x = np.sqrt(d)*x/ell
         z = np.sqrt(d)*z/ell
         A = np.sqrt(spdist.cdist(x, z, 'sqeuclidean'))
     if der is None:                     # compute covariance matrix for dataset x
         A = sf2 * self.mfunc(d,A)
     else:
         if der == 0:                    # compute derivative matrix wrt 1st parameter
             A = sf2 * self.dmfunc(d,A)
         elif der == 1:                  # compute derivative matrix wrt 2nd parameter
             A = 2 * sf2 * self.mfunc(d,A)
         elif der == 2:                  # no derivative wrt 3rd parameter
             A = np.zeros_like(A)        # do nothing (d is not learned)
         else:
             raise Exception("Wrong derivative value in Matern")
     return A
Example #22
    def evaluate(self, individual):

        dist = cdist(np.atleast_2d(individual), np.atleast_2d(self.target))

        if (self.model_name == "CNN"):
            X = np.array([np.array(individual).reshape(28,28,1)])
        else:
            X = np.array([individual]) 
       
        if self.model_name.startswith("SVM") or self.model_name.startswith("DT"):
            model_output = self.model.predict_proba(X)
        else:
            model_output = self.model.predict(X)


        desired_output = np.zeros(10)
        desired_output[self.target_output] = 1.0 
        
        dist2 = cdist(np.atleast_2d(model_output), np.atleast_2d(desired_output))            

        fit = dist*0.5 + 0.5*dist2
        #fit = dist2
        #fit = dist 

        return fit, 
Example #23
File: gp.py Project: davidar/gpo
def K_SE(xs, ys=None, l=1, deriv=False, wrt='l'):
    l = asarray(l)
    sig = 1 #l[0]
    #l = l[1:]
    xs = ascolumn(xs)
    if ys is None:
        d = squareform(pdist(xs/l, 'sqeuclidean'))
    else:
        ys = ascolumn(ys)
        d = cdist(xs/l, ys/l, 'sqeuclidean')
    cov = exp(-d/2)
    if not deriv: return sig * cov

    grads = []
    if wrt == 'l':
        #grads.append(cov) # grad of sig
        for i in range(shape(xs)[1]):
            if ys is None:
                grad = sig * cov * squareform(pdist(ascolumn(xs[:,i]), 'sqeuclidean'))
            else:
                grad = sig * cov * cdist(ascolumn(xs[:,i]), ascolumn(ys[:,i]), 'sqeuclidean')
            grad /= l[i] ** 3
            grads.append(grad)
        return sig * cov, grads
    elif wrt == 'y':
        if shape(xs)[0] != 1: print('*** x not a row vector ***')
        jac = sig * cov * ((ys - xs) / l**2).T
        return sig * cov, jac
Example #24
    def fine_tune_transform(feature1, feature2, init_pair_idx):
        ind = []
        k = 1
        while len(ind) < 0.6 * min(len(feature1["pts"]), len(feature2["pts"])) and k < 10:
            # Step 1. Randomly choose 20 points evenly distributed on the image
            rand_pts = np.random.rand(20, 2) * (np.amax(feature1["pts"], axis=0) - np.amin(feature1["pts"], axis=0)) * \
                       np.array([1, 0.8]) + np.amin(feature1["pts"], axis=0)
            # Step 2. Find nearest points from feature1
            dist_mat = spd.cdist(rand_pts, feature1["pts"][init_pair_idx[:, 0]])
            tmp_ind = np.argmin(dist_mat, axis=1)
            # Step 3. Use these points to find a homography
            tf = cv2.findHomography(feature1["pts"][init_pair_idx[tmp_ind, 0]], feature2["pts"][init_pair_idx[tmp_ind, 1]],
                                    method=cv2.RANSAC, ransacReprojThreshold=5)

            # Then use the transform find more matched points
            pts12 = cv2.perspectiveTransform(np.array([[p] for p in feature1["pts"]], dtype="float32"), tf[0])[:, 0, :]
            dist_mat = spd.cdist(pts12, feature2["pts"])
            num1, num2 = dist_mat.shape

            idx12 = np.argsort(dist_mat, axis=1)
            tmp_ind = np.argwhere(np.array([dist_mat[i, idx12[i, 0]] for i in range(num1)]) < 5)
            if len(tmp_ind) > len(ind):
                ind = tmp_ind
            logging.debug("len(ind) = %d, len(feature) = %d", len(ind), min(len(feature1["pts"]), len(feature2["pts"])))
            k += 1

        pair_idx = np.hstack((ind, idx12[ind, 0]))

        tf = cv2.findHomography(feature1["pts"][pair_idx[:, 0]], feature2["pts"][pair_idx[:, 1]],
                                method=cv2.RANSAC, ransacReprojThreshold=5)
        return tf, pair_idx
Example #25
def covMatrix(X, Y, theta, symmetric = True, kernel = lambda u, theta: theta[0]*theta[0]*np.exp(-0.5*u*u/(theta[1]*theta[1])), \
        dist_f=None):
    if len(np.array(X).shape) == 1:
        _X = np.array([X]).T
    else:
        _X = np.array(X)
        
    if len(np.array(Y).shape) == 1:
        _Y = np.array([Y]).T
    else:
        _Y = np.array(Y)
        
    if dist_f is None:
        if symmetric:
            cM = pdist(_X)
            M = squareform(cM)
            M = kernel(M, theta)
            return M
        else:
            cM = cdist(_X, _Y)
            M = kernel(cM, theta)
            return M
    else:
        if symmetric:
            cM = pdist(_X, dist_f)
            M = squareform(cM)
            M = kernel(M, theta)
            return M
        else:
            cM = cdist(_X, _Y, dist_f)
            M = kernel(cM, theta)
            return M
    return
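
A short usage sketch for covMatrix with its default Gaussian kernel, where theta[0] acts as the amplitude and theta[1] as the length scale of the lambda above; pdist/cdist/squareform and numpy are assumed to be imported as in the snippet's module.

import numpy as np
from scipy.spatial.distance import pdist, cdist, squareform

X = np.linspace(0, 1, 5)
Y = np.linspace(0, 1, 3)
K_sym = covMatrix(X, X, theta=[1.0, 0.2])                      # 5 x 5 symmetric covariance
K_cross = covMatrix(X, Y, theta=[1.0, 0.2], symmetric=False)   # 5 x 3 cross covariance
print(K_sym.shape, K_cross.shape)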
Example #26
    def __init__(self, data, bandwidth=None, fixed=True, k=None,
                 function='triangular', eps=1.0000001, ids=None, truncate=True,
                 points=None): #Added truncate flag
        if issubclass(type(data), scipy.spatial.KDTree):
            self.data = data.data
            data = self.data
        else:
            self.data = data
        if k is not None:
            self.k = int(k) + 1
        else:
            self.k = k
        if points is None:
            self.dmat = cdist(self.data, self.data)
        else:
            self.points = points
            self.dmat = cdist(self.points, self.data)
        self.function = function.lower()
        self.fixed = fixed
        self.eps = eps
        self.trunc = truncate
        if bandwidth:
            try:
                bandwidth = np.array(bandwidth)
                bandwidth.shape = (len(bandwidth), 1)
            except:
                bandwidth = np.ones((len(data), 1), 'float') * bandwidth
            self.bandwidth = bandwidth
        else:
            self._set_bw()
        self.kernel = self._kernel_funcs(self.dmat/self.bandwidth)

        if self.trunc:
            mask = np.repeat(self.bandwidth, len(self.data), axis=1)
            self.kernel[(self.dmat >= mask)] = 0
def pairwise_between_groups(fullsplit,i_own,split_lens,CDR3_similarity_cutoff):
    i_others = len(split_lens)
    if i_own != i_others-1:
        # if this is not the last group, stack groups being compared against
        fso = np.vstack(fullsplit[i_own+1:i_others])
    else:
        fso = fullsplit[i_others-1]
        
    bool_dis = cdist(fullsplit[i_own],fso,'hamming').flatten()
    bool_inf = cdist(np.isinf(fullsplit[i_own]),np.isinf(fso),'hamming').flatten()
    finite_own = np.isfinite(fullsplit[i_own]).sum(axis=1)
    finite_others = np.isfinite(fso).sum(axis=1)
    
    pdf_all=np.empty(len(bool_dis))
    for c,fin in enumerate(product(finite_own,finite_others)):
        pdf_all[c] = min(fin)

    norm_dist_all  = (bool_dis-bool_inf)/pdf_all*fullsplit[0].shape[1]
    bool_all = norm_dist_all < (1-CDR3_similarity_cutoff)
    
    # given boolean array, find sequences belonging in a cluster, row-wise
    bool_all = bool_all.reshape(fullsplit[i_own].shape[0],fso.shape[0])
    sets=[]
    
    col_offset = sum(split_lens[:i_own])+fullsplit[i_own].shape[0]
        
    for cnt,row in enumerate(bool_all):
        row_offset = cnt+sum(split_lens[:i_own])

        sets_from_group = set(np.add(np.nonzero(row)[0],col_offset))
        sets_from_group.add(row_offset)
        sets.append(sets_from_group)

    return sets
def computeScipySimilarity(Xs1,Xs2,sparse=False):
    Xall_new = np.zeros((Xs1.shape[0],4))

    if sparse:
        print(Xs1.shape)
        print(Xs2.shape)
        Xs1 = np.asarray(Xs1.todense())
        Xs2 = np.asarray(Xs2.todense())

    for i,(a,b) in enumerate(zip(Xs1,Xs2)):
        a = a.reshape(-1,a.shape[0])
        b = b.reshape(-1,b.shape[0])
        #print a.shape
        #print type(a)
        dist = cdist(a,b,'cosine')
        Xall_new[i,0] = dist
        #Xall_new[i,3] = dist
        dist = cdist(a,b,'cityblock')
        Xall_new[i,1] = dist
        dist = cdist(a,b,'hamming')
        Xall_new[i,2] = dist
        dist = cdist(a,b,'euclidean')
        Xall_new[i,3] = dist

    Xall_new = pd.DataFrame(Xall_new,columns=['cosine','cityblock','hamming','euclidean'])

    print "NA:",Xall_new.isnull().values.sum()
    Xall_new = Xall_new.fillna(0.0)
    print "NA:",Xall_new.isnull().values.sum()
    print Xall_new.corr(method='spearman')
    return Xall_new
def compute_bic(kmeans,X):
	"""
	Computes the BIC metric for a given clustering
	Parameters:
	-----------------------------------------
	kmeans:  List of clustering object from scikit learn
	X     :  multidimension np array of data points
	Returns:
	-----------------------------------------
	BIC value
	"""
	# assign centers and labels
	centers = [kmeans.cluster_centers_]
	labels  = kmeans.labels_
	#number of clusters
	m = kmeans.n_clusters
	# size of the clusters
	n = np.bincount(labels)
	#size of data set
	N, d = X.shape
	#compute variance for all clusters beforehand
	cl_var=[]
	for i in range(m):
		if not n[i] - m==0:
			cl_var.append((1.0 / (n[i] - m)) * sum(distance.cdist(X[np.where(labels == i)], [centers[0][i]], 'euclidean')**2))
		else:
			cl_var.append(float(10**20) * sum(distance.cdist(X[np.where(labels == i)], [centers[0][i]], 'euclidean')**2))
	const_term = 0.5 * m * np.log10(N)
	BIC = np.sum([n[i] * np.log10(n[i]) -
	       n[i] * np.log10(N) -
	     ((n[i] * d) / 2) * np.log10(2*np.pi) -
	      (n[i] / 2) * np.log10(cl_var[i]) -
	     ((n[i] - m) / 2) for i in range(m)]) - const_term
	return(BIC)
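
A minimal sketch that feeds a fitted scikit-learn KMeans model to compute_bic above; `from scipy.spatial import distance` matches the name the function body uses.

import numpy as np
from scipy.spatial import distance
from sklearn.cluster import KMeans

X = np.random.rand(200, 2)
kmeans = KMeans(n_clusters=3, n_init=10, random_state=0).fit(X)
print(compute_bic(kmeans, X))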
Example #30
File: cov.py Project: Comy/pyGPs
    def proceed(self, x=None, z=None, der=None):
        ell = np.exp(self.hyp[0])            # characteristic length scale
        sf2 = np.exp(2.*self.hyp[1])         # signal variance
        v   = self.para[0]                   # degree (v = 0,1,2 or 3 only)
        if np.abs(v-np.round(v)) < 1e-8:     # remove numerical error from format of parameter
            v = int(round(v))
        assert(int(v) in range(4))           # Only allowed degrees: 0,1,2 or 3
        v = int(v)        
        n, D = x.shape
        j = np.floor(0.5*D) + v + 1
        if z == 'diag':
            A = np.zeros((n,1))
        elif z is None:
            A = np.sqrt( spdist.cdist(x/ell, x/ell, 'sqeuclidean') )
        else:                                       # compute covariance between data sets x and z
            A = np.sqrt( spdist.cdist(x/ell, z/ell, 'sqeuclidean') )     # cross covariances 
        if der is None:                             # compute covariance matrix for dataset x
            A = sf2 * self.pp(A,j,v,self.func)
        else:
            if der == 0:                            # compute derivative matrix wrt 1st parameter
                A = sf2 * self.dpp(A,j,v,self.func,self.dfunc)

            elif der == 1:                          # compute derivative matrix wrt 2nd parameter
                A = 2. * sf2 * self.pp(A,j,v,self.func)

            elif der == 2:                          # wants to compute derivative wrt order
                A = np.zeros_like(A)
            else:
                raise Exception("Wrong derivative entry in PiecePoly")
        return A
Example #31
def radial_basis(x_l, x, epsilon, norm="euclidean"):
    """
    Calculates the radial basis for vector x
    """
    r = cdist(np.array([x_l]).T, np.array([x]).T, norm)
    return _radial_basis(r, epsilon)
def dataset3(K):
    df_3 = df[(df['label']==6) | (df['label']==7)].reset_index(drop= True)
    vertex = list(zip(df_3['dim1'], df_3['dim2']))
    centroid_index = np.random.randint(0,len(df_3), size = K)
    centroid = [vertex[x] for x in centroid_index]

    for iteration in range(50):
        #print(iteration)
        centroid_old = centroid
        cluster = []
        array_1 = (np.square(distance.cdist(vertex, centroid)))
        cluster = [np.argmin(array_1[i]) for i in range(len(array_1))]
        cluster = np.array(cluster)
        def index_cluster(cluster_number):
            return (np.where(cluster == cluster_number)[0])
        clust_points = {}
        for num in range(K):
            clust_points[num] = [vertex[i] for i in index_cluster(num)]
        centroid = [(sum((list(zip(*clust_points[num]))[0]))/len(clust_points[num]), sum((list(zip(*clust_points[num]))[1]))/len(clust_points[num])) for num in range(K)]

        if centroid == centroid_old:
            break

    list_ss = []
    for clust_index in range(K):
        for i in range(len(clust_points[clust_index])):
            list_ss.append(np.square(distance.euclidean(centroid[clust_index],clust_points[clust_index][i])))
    WC_SSD = sum(list_ss)

    cluster = []
    for i in range(K):
        cluster.extend(list(np.repeat(i,len(clust_points[i]))))
    dim_1 = []
    dim_2 = []
    for i in range(K):
        x = list(zip(*clust_points[i]))[0]
        y = list(zip(*clust_points[i]))[1]
        dim_1.extend(x)
        dim_2.extend(y)

    dt = pd.DataFrame()
    dt['dim_1'] = dim_1
    dt['dim_2'] = dim_2
    dt['cluster'] = cluster

    dt = dt.sort_values(by = ['dim_1', 'dim_2', 'cluster']).reset_index(drop = True)
    df_3 = df_3.sort_values(by = ['dim1', 'dim2', 'label']).reset_index(drop = True)

    result = pd.merge(df_3,dt,on = df_3.index ).drop(['key_0','dim_1', 'dim_2'], axis = 1)

    B_avg = []
    for i in range(K):
        for j in range(K):
            add = np.mean(distance.cdist(clust_points[i], clust_points[j], 'euclidean'))
            B_avg.append(add)
    B = np.mean(B_avg)
    A_avg = []
    for i in range(K):
        add = np.mean(distance.cdist(clust_points[i],clust_points[i], 'euclidean'))
        A_avg.append(add)
    A = np.mean(A_avg)
    SC = (B-A)/max(A,B)

    return WC_SSD, SC
def get_gaussian_kernel(X, Y, sigma):
       
    D = cdist(X,Y, 'euclidean')
    K = np.exp(-sigma * D**2)

    return K
Example #34
 def time_cdist(self, num_points, metric):
     """Time scipy.spatial.distance.cdist over a range of input data
     sizes and metrics.
     """
     distance.cdist(self.points, self.points, metric)
Example #35
def main(argv):
    # Parse arguments
    parser = argparse.ArgumentParser()
    parser.add_argument('-i', '--input_file', help='input file', required=True)
    parser.add_argument('-ids', '--ids_file', help='ids file', required=True)
    parser.add_argument('-n_components',
                        '--n_components',
                        help='number of components in pca',
                        required=True)
    parser.add_argument('-k', '--k', help='k of kmeans', required=True)
    ARGS = parser.parse_args()

    descriptors = load_dataset(ARGS.input_file)
    ids_list, news_groups = get_hash_ids(ARGS.ids_file)

    print("PCA")
    pca = PCA(n_components=int(ARGS.n_components))
    descriptors = pca.fit_transform(descriptors)

    # kmeanModel = KMeans(n_clusters=int(ARGS.k), init='k-means++')
    # kmeanModel.fit(descriptors)
    # predictions = kmeanModel.predict(descriptors)
    # cluster_centers_ = kmeanModel.cluster_centers_
    # print(predictions)

    print("Kmeans")
    kclusterer = KMeansClusterer(int(ARGS.k),
                                 distance=nltk.cluster.util.cosine_distance)
    predictions = np.array(
        kclusterer.cluster(descriptors, assign_clusters=True))
    cluster_centers_ = np.array(kclusterer.means())

    print("Distortions")
    # distortion_eu = sum(np.min(distance.cdist(descriptors, cluster_centers_, 'euclidean'), axis=1)) / descriptors.shape[0]
    distortion_cos = sum(
        np.min(distance.cdist(descriptors, cluster_centers_, 'cosine'),
               axis=1)) / descriptors.shape[0]

    print("Silhouettes")
    # silhouette_score_eu = metrics.silhouette_score(descriptors, predictions, metric='euclidean')
    silhouette_score_cos = metrics.silhouette_score(descriptors,
                                                    predictions,
                                                    metric='cosine')

    # print("EUCLIDEAN K:", ARGS.k, "distortion:", distortion_eu, "silhouette score:", silhouette_score_eu)
    print("COS K:", ARGS.k, "distortion:", distortion_cos, "silhouette score:",
          silhouette_score_cos)

    closest, _ = pairwise_distances_argmin_min(cluster_centers_, descriptors)

    medoids_ids = ids_list[closest]

    medoids = descriptors[closest]

    dist = distance.cdist(medoids, medoids, metric='cosine')
    # Five
    knns = dist.argsort(axis=1)[:, :6][:, 1:]

    for id_, knn in zip(medoids_ids, knns):
        print("\nMedoid id:", id_, "label:", news_groups[id_])
        print("Cercanos:")
        for nn in knn:
            print("\t id:", medoids_ids[nn], "labels:",
                  news_groups[medoids_ids[nn]])

    metric = []

    for i in range(int(225)):
        ids_l = ids_list[np.where(predictions == i)]

        #     if len(ids_l) == 0:
        #         counter_0+=1
        #         continue
        clusters_labels = []
        for id_l in ids_l:
            label_list = news_groups[id_l]
            for ll in label_list:
                clusters_labels.append(ll)

        clnp = np.array(clusters_labels)
        uni, con = np.unique(clnp, return_counts=True)
        #letter_counts = Counter(clusters_labels)
        #df = pandas.DataFrame.from_dict(letter_counts, orient='index')

        ind = np.argsort(con)[::-1]
        uni = uni[ind]
        con = con[ind]

        maxim = con.sum()
        cont = con[0]

        label = uni[0]
        uni = uni[1:]
        con = con[1:]
        marker = np.zeros(uni.shape)

        for s in label.split('.'):
            for j in range(uni.shape[0]):
                if marker[j] == 0 and s in uni[j]:
                    cont += con[j]
                    marker[j] = 1

    #     print("cluster:", i, "metrica:", cont/maxim  )
        metric.append(cont / maxim)

    metric = np.array(metric, dtype=float)

    print("mean:", metric.mean())
    print("std:", metric.std())
    print("median:", np.median(metric))
    print("Min:", np.min(metric))
    print("Max:", np.max(metric))

    return 0
Example #36
def graph():
    #
    # Pb Filter
    #
    mask_filenames = glob(PathManager.path_valid_masks + 'mask*.npy')
    valid_bits = np.sort(
        np.unique([
            int(name.replace('\\', '/').split('/')[-1].split('_')[1])
            for name in mask_filenames
        ]))

    #
    # Questions
    #

    reldist_filter = np.load(
        PathManager.path_questions_hamming_reldistance_keep_bit_idxs)

    questions = np.concatenate([
        np.load(PathManager.path_questions_hamming_angles),
        np.load(PathManager.path_questions_hamming_distances),
        np.load(
            PathManager.path_questions_hamming_reldistances)[reldist_filter]
    ])

    #
    # Posebyte
    #

    posebyte_conditional = np.load('../posebytes/posebyte_conditioned.npy')

    angles_val = np.load(PathManager.path_annotations_hamming_valtest_angle)
    distances_val = np.load(
        PathManager.path_annotations_hamming_valtest_distance)
    reldistances_val = np.load(
        PathManager.path_annotations_hamming_valtest_reldistance)
    posebyte_valtest = np.concatenate((
        angles_val,
        distances_val,
        reldistances_val,
    ),
                                      axis=1)[1919:]

    #
    # Embeddings
    #

    embedding_conditional = np.load('../embeddings/embeddings_conditional.npy')
    embedding_test = np.load(
        '../../image/hamming/embeddings/embeddings_valtest_0.npy')[1919:]

    #
    # Distances
    #

    distances = cdist(embedding_conditional, embedding_test)
    nearest_indices = np.argsort(distances, axis=1)

    #
    # Display
    #

    output_path = 'predictions/'

    root_img_dir = PathManager.path_image_root
    sequence_file = PathManager.path_dataset_valtest_txt
    with open(sequence_file, 'r') as in_file:
        label_lines = in_file.readlines()
        image_list = [x.strip() for x in label_lines]
        image_list = [[' '.join(x.strip().split(' ')[:-16]) + '/'] +
                      x.strip().split(' ')[-16:] for x in image_list]
    image_list = image_list[1919:]

    for anno_idx, anno in enumerate(embedding_conditional):
        question_idx = int(anno_idx / 2)
        answer = posebyte_conditional[anno_idx, question_idx]

        if question_idx in valid_bits:
            pass
        else:
            continue

        answer = bool(answer)
        question = str(question_idx) + ': ' + str(questions[question_idx])
        question = question.replace('angle:', 'is bent:')
        question = question.replace('distance:', 'is near:')
        question = question.replace('beyond:', 'is beyond:')
        question = question + '? ' + str(answer)

        output_file_name = output_path + question + '.png'

        nearest = nearest_indices[anno_idx]

        fig = plt.figure()
        fig.set_size_inches(8.0, 8.0)

        for frame_idx in range(25):
            near_idx = nearest[frame_idx]
            image_name = root_img_dir + image_list[near_idx][0] + image_list[
                near_idx][1].split('_')[1] + '.png'

            axes = fig.add_subplot(5, 5, frame_idx + 1)

            if posebyte_valtest[near_idx, question_idx] == answer:
                for spine in axes.spines.values():
                    spine.set_edgecolor('green')
                    spine.set_linewidth(8)
            else:
                for spine in axes.spines.values():
                    spine.set_edgecolor('red')
                    spine.set_linewidth(8)

            image_to_show = imread(image_name)
            plt.suptitle(question, fontsize=16)
            plt.imshow(imresize(image_to_show, (288, 288)))
            plt.setp(axes.get_xticklabels(), visible=False)
            plt.setp(axes.get_yticklabels(), visible=False)

        plt.show()
Example #37
    def update(self, rects):
        # check to see if the list of input bounding box rectangles
        # is empty
        if len(rects) == 0:
            # loop over any existing tracked objects and mark them
            # as disappeared
            for objectID in list(self.disappeared.keys()):
                self.disappeared[objectID] += 1

                # if we have reached a maximum number of consecutive
                # frames where a given object has been marked as
                # missing, deregister it
                if self.disappeared[objectID] > self.maxDisappeared:
                    self.deregister(objectID)

            # return early as there are no centroids or tracking info
            # to update
            return self.objects

        # initialize an array of input centroids for the current frame
        inputCentroids = np.zeros((len(rects), 2), dtype="int")

        # loop over the bounding box rectangles
        for (i, (startX, startY, endX, endY)) in enumerate(rects):
            # use the bounding box coordinates to derive the centroid
            cX = int((startX + endX) / 2.0)
            cY = int((startY + endY) / 2.0)
            print("cX : " + str(cX) + ", cY : " + str(cY))
            inputCentroids[i] = (cX, cY)

        # if we are currently not tracking any objects take the input
        # centroids and register each of them
        if len(self.objects) == 0:
            for i in range(0, len(inputCentroids)):
                self.register(inputCentroids[i])

        # otherwise, we are currently tracking objects so we need to
        # try to match the input centroids to existing object
        # centroids
        else:
            # grab the set of object IDs and corresponding centroids
            objectIDs = list(self.objects.keys())
            objectCentroids = list(self.objects.values())

            # compute the distance between each pair of object
            # centroids and input centroids, respectively -- our
            # goal will be to match an input centroid to an existing
            # object centroid
            D = dist.cdist(np.array(objectCentroids), inputCentroids)

            # in order to perform this matching we must (1) find the
            # smallest value in each row and then (2) sort the row
            # indexes based on their minimum values so that the row
            # with the smallest value is at the *front* of the index
            # list
            rows = D.min(axis=1).argsort()

            # next, we perform a similar process on the columns by
            # finding the smallest value in each column and then
            # sorting using the previously computed row index list
            cols = D.argmin(axis=1)[rows]

            # in order to determine if we need to update, register,
            # or deregister an object we need to keep track of which
            # of the rows and column indexes we have already examined
            usedRows = set()
            usedCols = set()

            # loop over the combination of the (row, column) index
            # tuples
            for (row, col) in zip(rows, cols):
                # if we have already examined either the row or
                # column value before, ignore it
                if row in usedRows or col in usedCols:
                    continue

                # otherwise, grab the object ID for the current row,
                # set its new centroid, and reset the disappeared
                # counter
                objectID = objectIDs[row]
                self.objects[objectID] = inputCentroids[col]
                self.disappeared[objectID] = 0

                # indicate that we have examined each of the row and
                # column indexes, respectively
                usedRows.add(row)
                usedCols.add(col)

            # compute both the row and column index we have NOT yet
            # examined
            unusedRows = set(range(0, D.shape[0])).difference(usedRows)
            unusedCols = set(range(0, D.shape[1])).difference(usedCols)

            # in the event that the number of object centroids is
            # equal or greater than the number of input centroids
            # we need to check and see if some of these objects have
            # potentially disappeared
            if D.shape[0] >= D.shape[1]:
                # loop over the unused row indexes
                for row in unusedRows:
                    # grab the object ID for the corresponding row
                    # index and increment the disappeared counter
                    objectID = objectIDs[row]
                    self.disappeared[objectID] += 1

                    # check to see if the number of consecutive
                    # frames the object has been marked "disappeared"
                    # for warrants deregistering the object
                    if self.disappeared[objectID] > self.maxDisappeared:
                        self.deregister(objectID)

            # otherwise, if the number of input centroids is greater
            # than the number of existing object centroids we need to
            # register each new input centroid as a trackable object
            else:
                for col in unusedCols:
                    self.register(inputCentroids[col])

        # return the set of trackable objects
        return self.objects
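
A toy sketch of the greedy row/column matching used in update() above: each tracked centroid (row) is paired with its closest input centroid (column), visiting rows in order of their smallest distance.

import numpy as np
from scipy.spatial import distance as dist

objectCentroids = np.array([[10, 10], [50, 50]])
inputCentroids = np.array([[52, 48], [9, 12], [200, 200]])

D = dist.cdist(objectCentroids, inputCentroids)
rows = D.min(axis=1).argsort()    # rows ordered by their best available match
cols = D.argmin(axis=1)[rows]     # matching column for each of those rows
print(list(zip(rows, cols)))      # e.g. [(0, 1), (1, 0)]; column 2 stays unmatched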
 def average_linkage(self, cluster1, cluster2):
     distances = cdist(cluster1, cluster2, 'euclidean')
     
     return distances.mean()
 def single_linkage(self, cluster1, cluster2):
     distances = cdist(cluster1, cluster2, 'euclidean')
     
     return distances.min()
Example #40
def test_live(message):
    app.queue.put(message['data'])
    img_bytes = base64.b64decode(app.queue.get())
    img = np.array(Image.open(io.BytesIO(img_bytes)))
    img = cv2.cvtColor(img, cv2.COLOR_RGB2BGR)
    try:
        rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
        gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
        boxes_c, landmarks = mtcnn_detector.detect(img)
        for i in range(boxes_c.shape[0]):
            bbox = boxes_c[i, :4]
            # we don't want boxes that are too small
            if (int(bbox[2]) - int(bbox[0])) > 100:
                # only detect husein
                cropped = rgb[int(bbox[1]):int(bbox[3]),
                              int(bbox[0]):int(bbox[2])]
                predicted = face_sess.run(face_model.logits,
                                          feed_dict={face_model.X: cropped})[0]
                person = data_Y[np.argmin(
                    cdist(embedded, [predicted], 'cosine')[:, 0])]
                if person == 0:
                    cropped = gray[int(bbox[1]):int(bbox[3]),
                                   int(bbox[0]):int(bbox[2])]
                    predicted = emotion_sess.run(emotion_model.logits,
                                                 feed_dict={
                                                     emotion_model.X:
                                                     np.expand_dims(
                                                         cropped, 2)
                                                 })[0]
                    emotion = np.argmax(predicted)
                    shape = predictor(
                        gray,
                        dlib.rectangle(
                            int(bbox[0]) - 10,
                            int(bbox[1]) - 10,
                            int(bbox[2]) - 10,
                            int(bbox[3]) - 10))
                    shape = shape_to_np(shape)
                    roll, pitch, yaw = face_orientation(
                        img, shape[[33, 8, 36, 45, 48, 54]])
                    left_eye = shape[42:48]
                    right_eye = shape[36:42]
                    left_EAR = eye_aspect_ratio(left_eye)
                    right_EAR = eye_aspect_ratio(right_eye)
                    ear = (left_EAR + right_EAR) / 2.0
                    ear = (ear - eyes[0]) / (eyes[1] - eyes[0])
                    MOUTH = mouth_aspect_ratio(shape[[61, 67, 63, 65, 60, 64]])
                    MOUTH = (MOUTH - mouths[0]) / (mouths[1] - mouths[0])
                    current_time = (
                        datetime.now() -
                        timedelta(hours=8)).strftime("%Y-%m-%d %H:%M:%S")
                    doc = {
                        'emotion': labels_emotion[emotion],
                        'roll': roll,
                        'pitch': pitch,
                        'yaw': yaw,
                        'mouth': MOUTH,
                        'eyes': ear,
                        'datetime': current_time.replace(' ', 'T')
                    }
                    es.index(index="huseinhouse_emotion",
                             doc_type='face',
                             id=current_time,
                             body=doc)
                    emit('camera_update', {'data': doc}, broadcast=True)
    except Exception as e:
        print(e)
        pass
    f"document in {matched_preprint_published_pairs.document.tolist()}"
).set_index("document").reindex(
    matched_preprint_published_pairs.document.tolist()).fillna(0))
biorxiv_documents.head()

pmc_documents = (pmc_embed_df.query(
    f"document in {matched_preprint_published_pairs.pmcid.tolist()}"
).set_index("document").reindex(
    matched_preprint_published_pairs.pmcid.tolist()).drop("journal",
                                                          axis=1).fillna(0))
pmc_documents.head()

# +
published_date_distances = (matched_preprint_published_pairs.assign(
    doc_distances=np.diag(
        cdist(biorxiv_documents.values, pmc_documents.values,
              "euclidean"))).replace(
                  0, np.nan).dropna().query("doc_distances.notnull()"))

published_date_distances.to_csv(
    "output/preprint_published_distances_rerun.tsv", sep="\t", index=False)

print(published_date_distances.shape)
published_date_distances.head()
# -

# # Construct Scatter Plot of Date vs Version Count

# Preprints are delayed by an average of 51 days for each new version posted to bioRxiv. This section regresses preprints' version counts against the time it takes for a preprint to be published. A scatter plot and a square-bin plot are generated below.

# +
# Get smoothed linear regression line
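# The rest of this cell is not included above; the lines below are a minimal sketch
# of the regression described in the preceding note. The column names "version_count"
# and "time_to_published" (in days) and the use of scipy.stats.linregress are
# assumptions, not the original analysis code.
from scipy.stats import linregress

x = published_date_distances["version_count"].astype(float)
y = published_date_distances["time_to_published"].astype(float)

slope, intercept, rvalue, pvalue, stderr = linregress(x, y)
print("time to publication ~ %.1f days per version + %.1f (R^2 = %.3f)"
      % (slope, intercept, rvalue ** 2))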
 def complete_linkage(self, cluster1, cluster2):
     distances = cdist(cluster1, cluster2, 'euclidean')
     
     return distances.max()
Beispiel #43
0
import numpy as np
from scipy.spatial import distance
from scipy.spatial import distance_matrix
from scipy.spatial.distance import euclidean

data = np.load('data/digit2.npy')
sample_size = data.shape[0]
image_size = data.shape[1]

data = data.reshape(sample_size, image_size * image_size)
print(data.shape)
print(data[0].shape)


def Euclidean_distance(x, y, p=2):
    x = x.reshape(1, -1)
    y = y.reshape(1, -1)
    return distance_matrix(x, y, p=p)


import time

start = time.time()
distance_matrix = distance.cdist(data, data, metric='euclidean')
print(distance_matrix)
end = time.time()
print(end - start)

np.save('distance_matrix.npy', distance_matrix)
Beispiel #44
0
            id.append(int(row[0]))
            metadata.append([float(i) for i in row[1:]])
            data[int(row[0])] = []
            data[int(row[0])] = ([float(i) for i in row[1:]])
    f.close()

    with open(data_filename) as f:
        position = []
        for line in f:
            row = line.strip('\n').split(',')
            position.append([float(i) for i in row])

    f.close()

    heap = []
    dis = cdist(metadata, metadata, 'euclidean')
    for i in range(len(dis)):
        for j in range(i + 1, len(dis[0])):
            heapq.heappush(heap, [dis[i][j], (id[i], id[j])])
    # hierarchical
    centroid = []
    results = hierarchical(data, k, id, heap)
    clusters = []
    for i in range(len(results)):
        centroid.append(data[results[i]])
        cur_class = sorted([
            int(j) for j in str(results[i]).replace('(', '').replace(
                ')', '').split(',')
        ])
        clusters.append(cur_class)
    print(clusters)
Beispiel #45
0
import numpy as np
from scipy.spatial.distance import cdist
from time import time
import mrpt

# Generate synthetic test data
k = 10; n_queries = 100
data = np.dot(np.random.rand(int(1e5),5), np.random.rand(5,100)).astype('float32')
queries = np.dot(np.random.rand(n_queries,5), np.random.rand(5,100)).astype('float32')

# Solve exact nearest neighbors with standard methods from scipy and numpy for reference
exact_search_time = time()
exact_neighbors = np.zeros((n_queries, k))
for i in range(n_queries):
    exact_neighbors[i] = np.argsort(cdist([queries[i]], data))[0,:k]
exact_search_time = time() - exact_search_time

# Offline phase: Indexing the data. This might take some time.
indexing_time = time()
index = mrpt.MRPTIndex(data, depth=5, n_trees=100)
index.build()
indexing_time = time() - indexing_time

# Online phase: Finding nearest neighbors stupendously fast.
approximate_search_time = time()
approximate_neighbors = np.zeros((n_queries, k))
for i in range(n_queries):
    approximate_neighbors[i] = index.ann(queries[i], k, votes_required=4)
approximate_search_time = time() - approximate_search_time

# Print some stats
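# The original stats block is not shown here; the lines below are a minimal sketch,
# assuming recall is measured as the average overlap between the approximate and
# exact k-nearest-neighbour index sets computed above.
recall = np.mean([
    len(np.intersect1d(approximate_neighbors[i], exact_neighbors[i])) / float(k)
    for i in range(n_queries)
])
print("Indexing time: %.3f s" % indexing_time)
print("Exact search time: %.3f s" % exact_search_time)
print("Approximate search time: %.3f s" % approximate_search_time)
print("Average recall@%d: %.3f" % (k, recall))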
Beispiel #46
0
#reading the data from the csv file using pandas
dataIn = pd.read_csv('C:\Crime_Analysis.csv')
x1 = dataIn['Y']  #considering latitudes as x1
y1 = dataIn['X']  #considering longitudes as y1

data = np.array(list(zip(x1, y1))).reshape(len(x1), 2)

#K Means Algorithm
distortions = []
#clusters in steps of 10
clusters = np.array([10, 20, 30, 40, 50, 60, 70, 80, 90, 100])

for k in clusters:
    kmean = KMeans(n_clusters=k).fit(data)
    kmean.fit(data)
    distortions.append(
        sum(np.min(cdist(data, kmean.cluster_centers_, 'euclidean'), axis=1)) /
        data.shape[0])

# Plotting elbow graph

plt.plot(clusters, distortions, 'bx-')
# setting the label for x-axis
plt.xlabel('clusters')
# setting the label for y-axis
plt.ylabel('distortion')
# setting the title
plt.title('plot to select the number of clusters')
plt.show()
def get_distances_between_keypoints(keypoints):
    return cdist(keypoints, keypoints)
Beispiel #48
0
 def closest_node(node, nodes):
     closest_index = distance.cdist([node], nodes).argmin()
     return nodes[closest_index]
Beispiel #49
0
def profile_axis(profile, rasterize_factor, min_length=10):

    [x0, y0], [x1, y1] = profile.min(axis=0), profile.max(axis=0)
    x0, y0, x1, y1 = int(x0) - 1, int(y0) - 1, int(x1) + 1, int(y1) + 1
    w, h = x1 - x0, y1 - y0

    img = Image.new("1", (w * rasterize_factor, h * rasterize_factor))
    draw = ImageDraw.Draw(img)
    profile = ((profile - [x0, y0]) * rasterize_factor).astype(int)
    draw.polygon([(x, y) for x, y in profile], fill=0, outline=1)
    outline = np.argwhere(np.array(img, dtype=bool).T)
    draw.polygon([(x, y) for x, y in profile], fill=1, outline=0)
    profile_mask = clean_mask(np.array(img, dtype=bool).T)
    img.close()

    skelet = skeletonize(profile_mask)
    connectivity = convolve2d(skelet,
                              square(3),
                              mode="same",
                              boundary="fill",
                              fillvalue=False)
    nodes = (skelet & ((connectivity > 3) | (connectivity == 2)))

    rcs = np.argwhere(skelet)
    idxs_nodes = set(np.where(nodes[rcs[:, 0], rcs[:, 1]])[0].tolist())
    d = np.abs(
        np.dstack((rcs[:, 0, None] - rcs[None, :, 0],
                   rcs[:, 1, None] - rcs[None, :, 1]))).max(axis=2)
    G = nx.from_numpy_matrix(d == 1)
    paths = []
    for i, j in combinations(idxs_nodes, 2):
        path = nx.shortest_path(G, i, j)
        if len(idxs_nodes.intersection(path)) > 2:
            continue
        paths.append(rcs[path])
    paths = sorted(paths, key=lambda path: len(path))

    axis = np.array(paths.pop(), dtype=float)

    if ((axis[-1] / rasterize_factor - [-x0, -y0])**
            2).sum() < ((axis[0] / rasterize_factor - [-x0, -y0])**2).sum():
        axis = axis[::-1]

    d_min = cdist(axis, profile).min(axis=1)
    d_min = d_min[d_min > d_min.max() / 2].mean() / 2

    d_max = rasterize_factor * 2
    while paths:
        path = paths.pop()

        if cdist(path, profile).min() < d_min:
            break

        d1 = np.sqrt(((path[0] - axis[0])**2).sum())
        d2 = np.sqrt(((path[0] - axis[-1])**2).sum())
        d3 = np.sqrt(((path[-1] - axis[0])**2).sum())
        d4 = np.sqrt(((path[-1] - axis[-1])**2).sum())

        if (d1 < d_max) and (d3 > d_max):
            axis = np.vstack((path[::-1], axis))
        elif (d2 < d_max) and (d4 > d_max):
            axis = np.vstack((axis, path))
        elif (d3 < d_max) and (d1 > d_max):
            axis = np.vstack((path, axis))
        elif (d4 < d_max) and (d2 > d_max):
            axis = np.vstack((axis, path[::-1]))

    d_min = cdist(axis, profile).min(axis=1)
    d_min = d_min[d_min > d_min.max() / 2].mean()

    axis0 = axis.copy()
    d = cdist([profile[np.argmin(cdist([axis[-1]], profile)[0])]], axis)[0]
    axis = axis[d > d_min]
    if axis.shape[0] < min_length:
        axis = axis0.copy()

    axis0 = axis.copy()
    d = cdist([profile[np.argmin(cdist([axis[0]], profile)[0])]], axis)[0]
    axis = axis[d > d_min]
    if axis.shape[0] < min_length:
        axis = axis0.copy()

    thickness = cdist(axis, profile).min(axis=1) * 2
    thickness = thickness[thickness > thickness.max() /
                          2].mean() / rasterize_factor

    axis = axis / rasterize_factor + [x0, y0]

    axis = smoothen_coords(axis)

    return axis, thickness
Beispiel #50
0
def distance_matrix(data,
                    numeric_distance="euclidean",
                    categorical_distance="jaccard"):
    """ Compute the pairwise distance attribute by attribute in order to account for different variables type:
        - Continuous
        - Categorical
        For ordinal values, provide a numerical representation taking the order into account.
        Categorical variables are transformed into a set of binary ones.
        If both continuous and categorical distance are provided, a Gower-like distance is computed and the numeric
        variables are all normalized in the process.
        If there are missing values, the mean is computed for numerical attributes and the mode for categorical ones.
        
        Note: If weighted-hamming distance is chosen, the computation time increases a lot since it is not coded in C 
        like other distance metrics provided by scipy.
        @params:
            - data                  = pandas dataframe to compute distances on.
            - numeric_distances     = the metric to apply to continuous attributes.
                                      "euclidean" and "cityblock" available.
                                      Default = "euclidean"
            - categorical_distances = the metric to apply to binary attributes.
                                      "jaccard", "hamming", "weighted-hamming" and "euclidean"
                                      available. Default = "jaccard"
        @returns:
            - the distance matrix
    """
    possible_continuous_distances = ["euclidean", "cityblock"]
    possible_binary_distances = [
        "euclidean", "jaccard", "hamming", "weighted-hamming"
    ]
    number_of_variables = data.shape[1]
    number_of_observations = data.shape[0]

    # Get the type of each attribute (Numeric or categorical)
    is_numeric = [
        all(isinstance(n, numbers.Number) for n in data.iloc[:, i])
        for i, x in enumerate(data)
    ]
    is_all_numeric = sum(is_numeric) == len(is_numeric)
    is_all_categorical = sum(is_numeric) == 0
    is_mixed_type = not is_all_categorical and not is_all_numeric

    # Check the content of the distances parameter
    if numeric_distance not in possible_continuous_distances:
        print "The continuous distance " + numeric_distance + " is not supported."
        return None
    elif categorical_distance not in possible_binary_distances:
        print "The binary distance " + categorical_distance + " is not supported."
        return None

    # Separate the data frame into categorical and numeric attributes and normalize numeric data
    if is_mixed_type:
        number_of_numeric_var = sum(is_numeric)
        number_of_categorical_var = number_of_variables - number_of_numeric_var
        data_numeric = data.iloc[:, is_numeric]
        data_numeric = (data_numeric - data_numeric.mean()) / (
            data_numeric.max() - data_numeric.min())
        data_categorical = data.iloc[:, [not x for x in is_numeric]]

    # Replace missing values with column mean for numeric values and mode for categorical ones. With the mode, it
    # triggers a warning: "SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame"
    # but the value are properly replaced
    if is_mixed_type:
        data_numeric.fillna(data_numeric.mean(), inplace=True)
        for x in data_categorical:
            data_categorical[x].fillna(data_categorical[x].mode()[0],
                                       inplace=True)
    elif is_all_numeric:
        data.fillna(data.mean(), inplace=True)
    else:
        for x in data:
            data[x].fillna(data[x].mode()[0], inplace=True)

    # "Dummifies" categorical variables in place
    if not is_all_numeric and not (categorical_distance == 'hamming' or
                                   categorical_distance == 'weighted-hamming'):
        if is_mixed_type:
            data_categorical = pd.get_dummies(data_categorical)
        else:
            data = pd.get_dummies(data)
    elif not is_all_numeric and categorical_distance == 'hamming':
        if is_mixed_type:
            data_categorical = pd.DataFrame([
                pd.factorize(data_categorical[x])[0] for x in data_categorical
            ]).transpose()
        else:
            data = pd.DataFrame([pd.factorize(data[x])[0]
                                 for x in data]).transpose()

    if is_all_numeric:
        result_matrix = cdist(data, data, metric=numeric_distance)
    elif is_all_categorical:
        if categorical_distance == "weighted-hamming":
            result_matrix = weighted_hamming(data)
        else:
            result_matrix = cdist(data, data, metric=categorical_distance)
    else:
        result_numeric = cdist(data_numeric,
                               data_numeric,
                               metric=numeric_distance)
        if categorical_distance == "weighted-hamming":
            result_categorical = weighted_hamming(data_categorical)
        else:
            result_categorical = cdist(data_categorical,
                                       data_categorical,
                                       metric=categorical_distance)
        result_matrix = np.array([[
            1.0 * (result_numeric[i, j] * number_of_numeric_var +
                   result_categorical[i, j] * number_of_categorical_var) /
            number_of_variables for j in range(number_of_observations)
        ] for i in range(number_of_observations)])

    # Fill the diagonal with NaN values
    np.fill_diagonal(result_matrix, np.nan)

    return pd.DataFrame(result_matrix)
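# A hypothetical usage sketch for distance_matrix() above, exercising the Gower-like
# combination of Euclidean (numeric) and Jaccard (dummified categorical) distances.
# The toy dataframe is illustrative only and assumes the module-level imports the
# function body already relies on (pandas as pd, numpy as np, numbers, cdist).
toy = pd.DataFrame({
    "age": [23, 35, None, 51],                 # numeric, with a missing value
    "height": [1.70, 1.82, 1.65, 1.75],        # numeric
    "city": ["Oslo", "Paris", "Paris", None],  # categorical, with a missing value
})
print(distance_matrix(toy, numeric_distance="euclidean",
                      categorical_distance="jaccard"))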
Beispiel #51
0
def main():
   global Q,E,S,Pk,e,T,pH,total_charged_residues,G,G0,indiv_data,Gqq,Q0

   file_pdb = arguments.f     
   pH = np.float(arguments.arg_pH)
   T = np.float(arguments.arg_T)
   
   ##################################################################################################
   # runs the standalone version of ©Surfrace
   ##################################################################################################

      
   print 'Running SASA - ©Surfrace'
   cmd1 = 'echo 1' + arguments.f.name + ' 1.4 1| ./surfrace5_0_linux_64bit > SASA_'+os.path.splitext(arguments.f.name)[0]+'_all.trash' ## runs the SASA program
   os.system(cmd1)
   try: 
     file_sasa = open(os.path.splitext(arguments.f.name)[0] + '_residue.txt', 'r') ## opens the file produced by the program above
   except (IOError) as errno:
     print ('I/O error - ** Check the files of SASA calculation - something went wrong **. %s' % errno)
     sys.exit()
   
   SASA_data=[]
   for line2 in file_sasa:
     list2 = line2.split()
     Area_norm = np.float(list2[2])/np.float(Area_residues[All_residues.index(list2[1])])
     if Area_norm >= 1.0:
       print "Warning - ** SASA greater than 1.0 **",list2[1],list2[0],list2[2],np.float(Area_residues[All_residues.index(list2[1])]),Area_norm
       print "Automatically changed to 0.75"
       Area_norm = 0.750000000001
     SASA_data.append([list2[1],list2[2],Area_norm])
   indiv_data=[]
   S=[]
   SAij=[]
   total_atoms=[]
   total_residues=[]
   total_charged_residues=[]
   for line in file_pdb: ## Reading file.pdb
     lista = line.split()
     id = lista[0]
     if id == 'ATOM':
       atom_index = np.int(lista[1]) 
       atom_type = lista[2]
       residue_type = lista[3]
       chain = lista[4]
       residue_index = np.int(lista[5])
       total_atoms.append([atom_index])
       if atom_type == 'CA' and chain == 'A':
         total_residues.append([residue_index])
       if atom_index == 1 and atom_type == 'N' and chain == 'A' and residue_index == 1 and not residue_type in Charged_residues: ## Select the charged residues
         total_charged_residues.append([atom_index])
         S.append(['N_T',residue_index,np.size(total_charged_residues),lista[1],lista[2],np.float(lista[6]),np.float(lista[7]),np.float(lista[8]),PKA[Charged_residues.index('N_TER')],SASA_data[np.size(total_residues)-1][2],Charge_values[Charged_residues.index('N_TER')]])
       if residue_type in Charged_residues and atom_type in Charged_atoms: ## Selects the charged residues
         total_charged_residues.append([atom_index])
         S.append([lista[3],residue_index,np.size(total_charged_residues),lista[1],lista[2],np.float(lista[6]),np.float(lista[7]),np.float(lista[8]),PKA[Charged_residues.index(residue_type)],SASA_data[np.size(total_residues)-1][2],Charge_values[Charged_residues.index(residue_type)]])
       if atom_type == 'OXT' and chain == 'A' and not residue_type in Charged_residues:
         total_charged_residues.append([atom_index])
         S.append(['C_T',residue_index,np.size(total_charged_residues),lista[1],lista[2],np.float(lista[6]),np.float(lista[7]),np.float(lista[8]),PKA[Charged_residues.index('C_TER')],SASA_data[np.size(total_residues)-1][2],Charge_values[Charged_residues.index('C_TER')]])

   print "There are: %d Charged_residues" % np.size(total_charged_residues)
   Restype=np.asarray([i[0] for i in S])
   X=np.asarray([i[5] for i in S])
   Y=np.asarray([i[6] for i in S])
   Z=np.asarray([i[7] for i in S])
   Pk=np.asarray([i[8] for i in S])
   SA=np.asarray([i[9] for i in S])
   Q=np.asarray([i[10] for i in S])
   Restype=np.char.replace(np.char.replace(np.char.replace(np.char.replace(np.char.replace(Restype, 'HIS','H'), 'ASP','D'), 'ARG','R'), 'GLU','E'), 'LYS','K')
   X = X - np.mean(X)
   Y = Y - np.mean(Y)
   Z = Z - np.mean(Z)
   XYZ = zip(X,Y,Z)
   Origin = np.zeros(np.shape(XYZ))
   dist = distance.cdist(XYZ, XYZ, 'euclidean')
   if arguments.arg_e == 'TK':
    dist_origin = distance.cdist(XYZ, Origin, 'euclidean')
    angle = distance.cdist(XYZ, XYZ, 'cosine')
    raio = (np.max(dist)*0.5 + 3.4+2.0, np.max(dist)*0.5 + 2.0+2.0)
    np.seterr(invalid='ignore')
    np.seterr(divide='ignore')
    theta = np.arccos(1-angle)
    NormA = np.matrix([LA.norm(v) for v in np.array(XYZ)])
    rirj = np.array(np.dot(np.transpose(NormA),NormA))
    A = np.divide(raio[1],e[0]*dist)
    B = (np.nansum(np.array([((e[1]-e[0])/(e[1]-(n*e[0])/(n+1)))*(np.power((rirj/(raio[1]*raio[1])),n))*(eval_legendre(n, np.cos(theta))) for n in range(0,60)]),axis=0))/(e[0]) 
    C = (np.divide(e[2],1+e[2]) + np.power(e[2],2)*np.sum(np.array([np.divide(np.divide(2*n+1,2*n-1)*np.divide(e[1],(n+1)*e[1]+n*e[0])*(np.power((rirj/(raio[0]*raio[0])),n))*(eval_legendre(n, np.cos(theta))),np.divide(Kn(n+1,e[2]),Kn(n-1,e[2])) + np.divide(n*(e[1]-e[0]),(n+1)*e[1]+n*e[0])*np.divide(np.power(e[2],2),4.0*np.power(n,2)-1)*np.power(np.divide(raio[1],raio[0]),2*n+1)) for n in range(1,60)]),axis=0))/(e[1])
    Qe = np.divide(e[3]*e[4]*e[4]*np.power(10,7),4*np.pi*e[5])
    SAij = distance.cdist(zip(SA), zip(SA), lambda u,v: (u+v)*0.5)
    E = Qe*(np.divide(A-B,2*raio[1])-np.divide(C,2*raio[0]))*(1-SAij)
 
    if np.sum(np.where(E<0)) > 0:
      print '###############################################################'
      print "There are: %d negatives TK energy values - Please check the radius of TK method!" % np.int(np.sum(np.where(E<0)))
      print "Sugestion - Increase b radius"
      print "Current radius ratio b/a=", np.divide(raio[1],raio[0])
      print '###############################################################'
    E[np.isinf(E)]= 0
    E[np.isnan(E)]= 0   
    E_out=np.vstack([np.vstack([Q,E]),Pk])
    np.savetxt('E.dat',E_out)
    
   if arguments.arg_s == 'EX': 
      
      print u"\U0001F63A", "### TK - Exact ###", u"\U0001F63A"
      start = time.time()        
      p = subprocess.Popen([r"c++","./src/tksaex.c",'-lm','-O3','-o','tksaex.exe'], stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=False)
      p.communicate()
      p = subprocess.Popen(["./tksaex.exe",np.str(pH),np.str(T)], stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=False)
      i=0
      j=1
      while p.poll() is None:
          sys.stdout.write('\r')
          sys.stdout.write("TKSA EX is running, please wait - [%20s%-20s]" % ('='*i,'='*i))
          sys.stdout.write(u"\U0001F63A")
          sys.stdout.flush()
          if i>19:
           j=j+1
          if j%2 == 0:
           i=i-1
          if j%2 == 1:
           i=i+1
          if i == 0:
            j=1
          sys.stdout.flush()
          time.sleep(0.1)
          
      output,err = p.communicate()
      print output
      print err
      end = time.time()
      elapsed = end - start
      print "Ran in %f sec" % elapsed
      
   if arguments.arg_s == 'MC': 
      
      print u"\U0001F63A", "### TKSA - MC ###", u"\U0001F63A"
      start = time.time()        
      p = subprocess.Popen([r"c++","./src/tksamc.c",'-lm','-O3','-o','tksamc.exe'], stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=False)
      p.communicate()
      p = subprocess.Popen(["./tksamc.exe",np.str(pH),np.str(T)], stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=False)
      i=0
      j=1
      while p.poll() is None:
          sys.stdout.write('\r')
          sys.stdout.write("TKSA MC is running, please wait - [%20s%-20s]" % ('='*i,'='*i))
          sys.stdout.write(u"\U0001F63A")
          sys.stdout.flush()
          if i>19:
           j=j+1
          if j%2 == 0:
           i=i-1
          if j%2 == 1:
           i=i+1
          if i == 0:
            j=1
          sys.stdout.flush()
          time.sleep(0.1)
          
      output,err = p.communicate()
      print output
      print err
      end = time.time()
      elapsed = end - start
      print "Ran in %f sec" % elapsed
      
   if arguments.arg_plot == 'yes' and arguments.arg_s =='EX': 
      try: 
        file_plot = open("out.dat", 'r')
      except (IOError) as errno:
        print ('I/O error - ** Output file with issues  - out.dat **. %s' % errno)
        sys.exit()
   
      plot_data=[]
      for line3 in file_plot: # Plotting
        list3 = line3.split()
        plot_data.append(list3)
      
      Restype=np.char.replace(np.char.replace(["%s%02d" % t for t in zip(Restype,np.asarray([i[1] for i in S]))],'C_T'+np.str(S[-1][1]),'CTR'),'N_T0'+np.str(S[0][1]),'NTR')
      S=np.hstack((S,plot_data))
      
      plot_data=list(map(float, np.asarray(plot_data).flatten()))
      print "Total dG Energy: ",np.sum(np.asarray(plot_data))
      x_pos = np.arange(len(total_charged_residues))
      fig = plt.figure()
      ax = fig.add_subplot(111)
      width=1.0
      colors = []
      for position, value in enumerate(plot_data):
        if value > 0 and SA[position] > 0.5:
           colors.append('r')
        else:
           colors.append('b')
      ax.bar(x_pos, plot_data,width=width,color=colors,linewidth=2)
      ax.tick_params('both', length=5, width=2, which='major',labelsize=13)   
      plt.setp(ax.spines.values(), linewidth=2)
      plt.xticks(x_pos+width/2.0,Restype,rotation=90,fontsize=15)
      plt.xlim([0,np.size(x_pos)])
      plt.ylabel(r'$\Delta G_{qq}$(kJ/mol)',fontsize=20)
      fig.savefig('Fig_EX_'+ os.path.splitext(arguments.f.name)[0]+'_pH_'+str(pH)+'_T_'+str(T)+'.jpg', dpi = 300)
   
      header='1-Name	2-Residue-index	3-Position	4-Atom	5-Atom-type	6-X	7-Y	8-Z	9-PKA	10-SASA	11-Charge	12-dG_Energy 13-Total_dG= '+str(np.sum(np.asarray(plot_data)))+''
      np.savetxt('Output_EX_'+os.path.splitext(arguments.f.name)[0]+'_pH_'+str(pH)+'_T_'+str(T)+'.dat',S,fmt='%s', delimiter="	",header=str(header))
   
   if arguments.arg_plot == 'yes' and(arguments.arg_s =='MC'): 
      try: 
        file_plot = open("out.dat", 'r')
      except (IOError) as errno:
        print ('I/O error - ** Output file with issues - out.dat **. %s' % errno)
        sys.exit()
   
      plot_data=[]
      for line3 in file_plot: ## Plotting
        list3 = line3.split()
        plot_data.append(list3)
      
      Restype=np.char.replace(np.char.replace(["%s%02d" % t for t in zip(Restype,np.asarray([i[1] for i in S]))],'C_T'+np.str(S[-1][1]),'CTR'),'N_T0'+np.str(S[0][1]),'NTR')
      S=np.hstack((S,plot_data))
      
      plot_data=list(map(float, np.asarray(plot_data).flatten()))
      print "Total dG Energy: ",np.sum(np.asarray(plot_data))
      x_pos = np.arange(len(total_charged_residues))
      fig = plt.figure()
      ax = fig.add_subplot(111)
      width=1.0
      colors = []
      for position, value in enumerate(plot_data):
        if value > 0 and SA[position] > 0.5:
           colors.append('r')
        else:
           colors.append('b')
      ax.bar(x_pos, plot_data,width=width,color=colors,linewidth=2)
      ax.tick_params('both', length=5, width=2, which='major',labelsize=13)   
      plt.setp(ax.spines.values(), linewidth=2)
      if np.size(total_charged_residues)>35:
        plt.xticks(x_pos+width/2.0,Restype,rotation=90,fontsize=8)
      elif np.size(total_charged_residues) >= 15 and np.size(total_charged_residues) <= 35:
        plt.xticks(x_pos+width/2.0,Restype,rotation=90,fontsize=12)
      else: 
        plt.xticks(x_pos+width/2.0,Restype,rotation=90,fontsize=15)
      plt.xlim([0,np.size(x_pos)])
      plt.ylabel(r'$\Delta G_{qq}$(kJ/mol)',fontsize=20)
      plt.show()
      fig.savefig('Fig_MC_'+ os.path.splitext(arguments.f.name)[0]+'_pH_'+str(pH)+'_T_'+str(T)+'.jpg', dpi = 300)
   
      header='1-Name	2-Residue-index	3-Position	4-Atom	5-Atom-type	6-X	7-Y	8-Z	9-PKA	10-SASA	11-Charge	12-dG_Energy 13-Total_dG= '+str(np.sum(np.asarray(plot_data)))+''
      np.savetxt('Output_MC_'+os.path.splitext(arguments.f.name)[0]+'_pH_'+str(pH)+'_T_'+str(T)+'.dat',S,fmt='%s', delimiter="	",header=str(header))
   cmd2 = 'mv result.txt *.exe E.dat out.dat SASA* '+os.path.splitext(arguments.f.name)[0]+'*.txt ./aux'
   os.system(cmd2)
   print u"\U0001F63A", "### Finished ###", u"\U0001F63A"
Beispiel #52
0
def dice_dist(profile1, keypoints1, profile2, keypoints2, r, rasterize_factor):
    def _make_mask(profile, keypoints, angle, r_r, rasterize_factor):

        if angle != 0:
            profile = rotate_coords(profile, angle)
            keypoints = rotate_coords(keypoints, angle)
        profile = np.round(profile * rasterize_factor).astype(int)
        keypoints = np.round(keypoints * rasterize_factor).astype(int)
        x0, y0 = profile.min(axis=0) - 4 * r_r
        profile -= [x0, y0]
        keypoints -= [x0, y0]
        w, h = profile.max(axis=0) + 4 * r_r
        img = Image.new("1", (w, h))
        draw = ImageDraw.Draw(img)
        draw.polygon([(x, y) for x, y in profile], fill=1)
        mask = np.array(img, dtype=bool).T
        img.close()
        return mask, keypoints

    def _get_kp_mask(mask, keypoints, i, shift_x, shift_y, r_r,
                     rasterize_factor):

        row, col = np.round(
            keypoints[i] -
            [shift_x * rasterize_factor, shift_y *
             rasterize_factor]).astype(int)
        col0 = col - r_r
        col1 = col0 + 2 * r_r + 1
        row0 = row - r_r
        row1 = row0 + 2 * r_r + 1
        return mask[:, col0:col1][row0:row1]

    r = int(round(r))
    r_r = r * rasterize_factor
    w = np.ones((2 * r_r + 1, 2 * r_r + 1), dtype=float)
    ijs = np.argwhere(w > 0)
    d = np.sqrt(((ijs - r_r)**2).sum(axis=1))
    d[d > r_r] = r_r
    w[ijs[:, 0], ijs[:, 1]] = ((r_r - d) / r_r)**2

    rot_step = 2 * np.arcsin(1 / (2 * r))
    angle_min = -np.pi / 8
    angle_max = np.pi / 8
    angles = np.linspace(angle_min, angle_max,
                         int(round((angle_max - angle_min) / rot_step)))
    angles = angles[angles != 0]
    angles = np.insert(angles, 0, 0)

    shifts = []
    for shift_x in range(-4, 5, 2):
        for shift_y in range(-4, 5, 2):
            if [shift_x, shift_y] == [0, 0]:
                continue
            shifts.append([shift_x, shift_y])
    shifts = [[0, 0]] + shifts

    d_dist_sum = 0
    d_dist_norm = 0
    d = cdist(keypoints1, keypoints2)
    mask_m1, keypoints_m1 = _make_mask(profile1, keypoints1, 0, r_r,
                                       rasterize_factor)
    for i in range(keypoints1.shape[0]):
        jj = np.where(d[i] < 2 * r)[0]
        if not jj.size:
            continue
        mask1 = _get_kp_mask(mask_m1, keypoints_m1, i, 0, 0, r_r,
                             rasterize_factor)
        mask1_sum = w[mask1].sum()
        d_dist_opt = np.inf
        for angle in angles:
            mask_m2, keypoints_m2 = _make_mask(profile2, keypoints2, angle,
                                               r_r, rasterize_factor)
            for j in jj:
                for shift_x, shift_y in shifts:
                    mask2 = _get_kp_mask(mask_m2, keypoints_m2, j, shift_x,
                                         shift_y, r_r, rasterize_factor)
                    d_dist = 1 - (2 * w[mask1 & mask2].sum()) / (
                        mask1_sum + w[mask2].sum())
                    if d_dist < d_dist_opt:
                        d_dist_opt = d_dist

        if d_dist_opt < np.inf:
            d_dist_sum += d_dist_opt**2
            d_dist_norm += 1

    return d_dist_sum, d_dist_norm
Beispiel #53
0
    def fit(self,
            target_mask,
            method='min_distance',
            r=5,
            n_exps=50,
            n_parcels=2,
            meta_estimator=SCALE,
            **kwargs):
        """
        Run CBP parcellation.

        Parameters
        ----------
        target_mask : img_like
            Image with binary mask for region of interest to be parcellated.
        n_parcels : :obj:`int` or array_like of :obj:`int`, optional
            Number of parcels to generate for ROI. If array_like, each parcel
            number will be evaluated and results for all will be returned.
            Default is 2.
        n_iters : :obj:`int`, optional
            Number of iterations to run for each parcel number.
            Default is 10000.
        n_cores : :obj:`int`, optional
            Number of cores to use for model fitting.

        Returns
        -------
        results
        """
        assert np.array_equal(self.mask.affine, target_mask.affine)
        kernel_args = {
            k: v
            for k, v in kwargs.items() if k.startswith('kernel__')
        }
        meta_args = {
            k.split('meta__')[1]: v
            for k, v in kwargs.items() if k.startswith('meta__')
        }

        if not isinstance(n_parcels, list):
            n_parcels = [n_parcels]

        # Step 1: Build correlation matrix
        target_data = apply_mask(target_mask, self.mask)
        target_map = unmask(target_data, self.mask)
        target_data = target_map.get_data()
        mask_idx = np.vstack(np.where(target_data))
        n_voxels = mask_idx.shape[1]
        voxel_arr = np.zeros((n_voxels, np.sum(self.mask)))

        ijk = self.coordinates[['i', 'j', 'k']].values
        temp_df = self.coordinates.copy()
        for i_voxel in range(n_voxels):
            voxel = mask_idx[:, i_voxel]
            temp_df['distance'] = cdist(ijk, voxel)

            if method == 'min_studies':
                # number of studies
                temp_df2 = temp_df.groupby('id')[['distance']].min()
                temp_df2 = temp_df2.sort_values(by='distance')
                sel_ids = temp_df2.iloc[:n_exps].index.values
            elif method == 'min_distance':
                # minimum distance
                temp_df2 = temp_df.groupby('id')[['distance']].min()
                sel_ids = temp_df2.loc[temp_df2['distance'] < r].index.values

            # Run MACM
            voxel_meta = meta_estimator(self.dataset,
                                        ids=sel_ids,
                                        **kernel_args)
            voxel_meta.fit(**meta_args)
            voxel_arr[i_voxel, :] = apply_mask(voxel_meta.results['ale'],
                                               self.mask)

        # Correlate voxel-specific MACMs across voxels in ROI
        voxel_corr = np.corrcoef(voxel_arr)
        corr_dist = 1 - voxel_corr

        # Step 2: Clustering
        labels = np.zeros((n_voxels, len(n_parcels)))
        metric_types = ['contiguous']
        metrics = pd.DataFrame(index=n_parcels,
                               columns=metric_types,
                               data=np.zeros(
                                   (len(n_parcels), len(metric_types))))
        for i_parc, n_clusters in enumerate(n_parcels):
            # K-Means clustering
            _, labeled, _ = k_means(corr_dist,
                                    n_clusters,
                                    init='k-means++',
                                    precompute_distances='auto',
                                    n_init=1000,
                                    max_iter=1023,
                                    verbose=False,
                                    tol=0.0001,
                                    random_state=1,
                                    copy_x=True,
                                    n_jobs=1,
                                    algorithm='auto',
                                    return_n_iter=False)
            labels[:, i_parc] = labeled

            # Check contiguity of clusters
            # Can nilearn do this?
            temp_mask = np.zeros(target_data.shape)
            for j_voxel in range(n_voxels):
                i, j, k = mask_idx[:, j_voxel]
                temp_mask[i, j, k] = labeled[j_voxel]
            labeled = meas.label(temp_mask, np.ones((3, 3, 3)))[0]
            n_contig = len(np.unique(labeled))
            metrics.loc[n_clusters,
                        'contiguous'] = int(n_contig > (n_clusters + 1))

        self.solutions = labels
        self.metrics = metrics
    def update(self, rects):

        if len(rects) == 0:

            for objectID in list(self.disappeared.keys()):
                self.disappeared[objectID] += 1

                if self.disappeared[objectID] > self.maxDisappeared:
                    self.deregister(objectID)

            return self.objects

        inputCentroids = np.zeros((len(rects), 2), dtype="int")

        for (i, (startX, startY, endX, endY)) in enumerate(rects):
            # use the bounding box coordinates to derive the centroid
            cX = int((startX + endX) / 2.0)
            cY = int((startY + endY) / 2.0)
            inputCentroids[i] = (cX, cY)

        if len(self.objects) == 0:
            for i in range(0, len(inputCentroids)):
                self.register(inputCentroids[i])

        else:
            objectIDs = list(self.objects.keys())
            objectCentroids = list(self.objects.values())

            D = dist.cdist(np.array(objectCentroids), inputCentroids)

            rows = D.min(axis=1).argsort()

            cols = D.argmin(axis=1)[rows]

            usedRows = set()
            usedCols = set()

            for (row, col) in zip(rows, cols):

                if row in usedRows or col in usedCols:
                    continue

                if D[row, col] > self.maxDistance:
                    continue

                objectID = objectIDs[row]
                self.objects[objectID] = inputCentroids[col]
                self.disappeared[objectID] = 0

                usedRows.add(row)
                usedCols.add(col)

            unusedRows = set(range(0, D.shape[0])).difference(usedRows)
            unusedCols = set(range(0, D.shape[1])).difference(usedCols)

            if D.shape[0] >= D.shape[1]:

                for row in unusedRows:

                    objectID = objectIDs[row]
                    self.disappeared[objectID] += 1

                    if self.disappeared[objectID] > self.maxDisappeared:
                        self.deregister(objectID)

            else:
                for col in unusedCols:
                    self.register(inputCentroids[col])

        return self.objects
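# A hypothetical usage sketch for the update() method above. The class name
# "CentroidTracker" and its constructor are assumptions; only the
# update(rects) -> {objectID: centroid} interface is taken from the code itself.
tracker = CentroidTracker()  # assumed constructor (typically maxDisappeared/maxDistance)
rects = [(10, 10, 50, 60), (120, 40, 180, 110)]  # (startX, startY, endX, endY) boxes
objects = tracker.update(rects)
for objectID, centroid in objects.items():
    print(objectID, centroid)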
Beispiel #55
0
    def __call__(self, X, Y=None, eval_gradient=False):
        X = np.atleast_2d(X)
        length_scale = _check_length_scale(X, self.length_scale)
        if Y is None:
            dists = pdist(X / length_scale, metric='euclidean')
        else:
            if eval_gradient:
                raise ValueError(
                    "Gradient can only be evaluated when Y is None.")
            dists = cdist(X / length_scale,
                          Y / length_scale,
                          metric='euclidean')

        if self.nu == 0.5:
            K = np.exp(-dists)
        elif self.nu == 1.5:
            K = dists * math.sqrt(3)
            K = (1. + K) * np.exp(-K)
        elif self.nu == 2.5:
            K = dists * math.sqrt(5)
            K = (1. + K + K**2 / 3.0) * np.exp(-K)
        else:  # general case; expensive to evaluate
            K = dists
            K[K == 0.0] += np.finfo(float).eps  # strict zeros result in nan
            tmp = (math.sqrt(2 * self.nu) * K)
            K.fill((2**(1. - self.nu)) / gamma(self.nu))
            K *= tmp**self.nu
            K *= kv(self.nu, tmp)

        if Y is None:
            # convert from upper-triangular matrix to square matrix
            K = squareform(K)
            np.fill_diagonal(K, 1)

        if eval_gradient:
            if self.hyperparameter_length_scale.fixed:
                # Hyperparameter l kept fixed
                K_gradient = np.empty((X.shape[0], X.shape[0], 0))
                return K, K_gradient

            # We need to recompute the pairwise dimension-wise distances
            if self.anisotropic:
                D = (X[:, np.newaxis, :] - X[np.newaxis, :, :])**2 \
                    / (length_scale ** 2)
            else:
                D = squareform(dists**2)[:, :, np.newaxis]

            if self.nu == 0.5:
                K_gradient = safe_divide(K[..., np.newaxis] * D,
                                         np.sqrt(D.sum(2))[:, :, np.newaxis])
                K_gradient[~np.isfinite(K_gradient)] = 0
            elif self.nu == 1.5:
                K_gradient = \
                    3 * D * np.exp(-np.sqrt(3 * D.sum(-1)))[..., np.newaxis]
            elif self.nu == 2.5:
                tmp = np.sqrt(5 * D.sum(-1))[..., np.newaxis]
                K_gradient = 5.0 / 3.0 * D * (tmp + 1) * np.exp(-tmp)
            else:
                # approximate gradient numerically
                def f(theta):  # helper function
                    return self.clone_with_theta(theta)(X, Y)

                return K, _approx_fprime(self.theta, f, 1e-10)

            if not self.anisotropic:
                return K, K_gradient[:, :].sum(-1)[:, :, np.newaxis]
            else:
                return K, K_gradient
        else:
            return K
Beispiel #56
0
 def is_at_goal(self, target_id, goal_id, state):
     target = np.array([state[target_id, :2]])
     goal = np.array([self.blueprint[goal_id]])
     mostly_still = abs(np.sum(state[target_id, 2:])) < 0.01
     close_to_goal = float(cdist(target, goal)) < 0.2
     return close_to_goal, mostly_still
Beispiel #57
0
        sess.run(inits)

        # Evaluation routine
        stat_hb1 = []
        stat_icarl = []
        stat_ncm = []

        for i in range(int(np.ceil(len(files_from_cl) / batch_size))):
            sc, l, loss, files_tmp, feat_map_tmp = sess.run([
                scores, label_batch, loss_class, file_string_batch,
                op_feature_map
            ])
            mapped_prototypes = feat_map_tmp[:, 0, 0, :]
            pred_inter = (mapped_prototypes.T) / np.linalg.norm(
                mapped_prototypes.T, axis=0)
            sqd_icarl = -cdist(class_means[:, :, 0, itera].T, pred_inter.T,
                               'sqeuclidean').T
            sqd_ncm = -cdist(class_means[:, :, 1, itera].T, pred_inter.T,
                             'sqeuclidean').T
            stat_hb1 += ([
                ll in best
                for ll, best in zip(l,
                                    np.argsort(sc, axis=1)[:, -top:])
            ])
            stat_icarl += ([
                ll in best
                for ll, best in zip(l,
                                    np.argsort(sqd_icarl, axis=1)[:, -top:])
            ])
            stat_ncm += ([
                ll in best
                for ll, best in zip(l,
                                    np.argsort(sqd_ncm, axis=1)[:, -top:])
            ])
Beispiel #58
0
 
     x = PC9_Shipment_Qty.PC9_Shipped_Qty
     X = []
     for i in range(len(x)):
         X.append([float(x[i]), 0]) # changes to 2D list
     X = np.asarray(X) # changes list into array
 
 # Elbow test to determine k
 # Run K-Means algorithm for all values between 1 to 10
     from scipy.cluster.vq import kmeans
     K = range(1,10)
     KM = [kmeans(X,k) for k in K]
 
 # Determine the distance between each PC9 Size combination and all calculated Centroids
     centroids = [cent for (cent,var) in KM]
     D_k = [cdist(X, cent, 'euclidean') for cent in centroids]
 
 # As all possible combinations are produced between PC9 Size and Centroids
 # Keep only the pairing with the shortest distance (or MINIMUM)
     dist = [np.min(D,axis=1) for D in D_k]
 
 # Stores all of the respective error results from each K cluster.
 # As 10 clusters were run, 10 cluster results were stored
     avgWithinSS[name] = [sum(d)/X.shape[0] for d in dist]
 
 # Initialize variables
     k = 2
     ratio = 1
     ratio2 = 1
 
 # Perform "Elbow" test to determine the best cluster
Beispiel #59
0
def main():
    """
    Create an aligned functional group based on command line arguments.
    """

    parser = argparse.ArgumentParser(
        description='Create a functional group from a smiles pattern',
        epilog=
        'Example usage: %(prog)s -s OPr -n PropylEther -c "Alkyl Ether" -m OCCC'
    )

    parser.add_argument('smi_string', help="Smiles string to generate group")
    parser.add_argument('-s',
                        '--short-name',
                        help='Short name (defaults to smiles string)')
    parser.add_argument('-n',
                        '--name',
                        required=True,
                        help='Descriptive name (e.g. PropylEther)')
    parser.add_argument('-m',
                        '--mepo-compatible',
                        action='store_true',
                        help='Record group as compatible with MEPO-QEq')
    parser.add_argument('-c',
                        '--classification',
                        help='General classification (e.g. "Alkyl Halide")')
    parser.add_argument('-t',
                        '--terminal',
                        action='store_true',
                        help='Output to terminal as well as files')

    args = parser.parse_args()

    fgroup = args.smi_string
    if '%99' in fgroup:
        print('Do not use ring closure 99')
        raise SystemExit
    if not args.short_name:
        args.short_name = fgroup

    # Use an explicitly defined benzene as a base
    # Do rings closure at 99 in case functional group has other closures
    attached = '[cH]%99[cH][cH][cH][cH]c%99'

    # make3D by default gives an optimised structure, great!
    pybel_mol = pybel.readstring('smi', attached + fgroup)
    pybel_mol.title = "[{}] {}".format(args.short_name, args.name)
    pybel_mol.make3D(forcefield='UFF')

    uff = ob.OBForceField_FindForceField('uff')
    uff.Setup(pybel_mol.OBMol)
    uff.GetAtomTypes(pybel_mol.OBMol)

    coordinates = []

    for ob_atom in pybel_mol:
        coordinates.append(ob_atom.coords)

    rotated_coordinates = realign(coordinates, 11, 10, 8)

    bonds = {}

    # look at all the bonds separately from the atoms
    for bond in ob.OBMolBondIter(pybel_mol.OBMol):
        # These rules are translated from ob/forcefielduff.cpp...
        start_idx = bond.GetBeginAtomIdx()
        end_idx = bond.GetEndAtomIdx()

        start_atom = bond.GetBeginAtom()
        end_atom = bond.GetEndAtom()

        bond_order = bond.GetBondOrder()
        if bond.IsAromatic():
            bond_order = 1.5
        # e.g., in Cp rings, may not be "aromatic" by OB
        # but check for explicit hydrogen counts
        #(e.g., biphenyl inter-ring is not aromatic)
        #FIXME(tdaff): aromatic C from GetType is "Car" is this correct?
        if (start_atom.GetType()[-1] == 'R' and end_atom.GetType()[-1] == 'R'
                and start_atom.ExplicitHydrogenCount() == 1
                and end_atom.ExplicitHydrogenCount() == 1):
            bond_order = 1.5
        if bond.IsAmide():
            bond_order = 1.41
        # Zero the indices for the connecting atom so that
        # negative indexes are benzene atoms
        bond_length = bond.GetLength()
        bond_id = tuple(sorted((start_idx - 12, end_idx - 12)))
        bonds[bond_id] = (bond_length, bond_order)

    # We can start building our output now!

    output_text = [
        "[{}]\n".format(args.short_name), "name = {}\n".format(args.name),
        "smiles = {}\n".format(fgroup),
        "mepo_compatible = {}\n".format(args.mepo_compatible)
    ]

    if args.classification:
        output_text.append("class = {}\n".format(args.classification))

    # functional group fingerprint
    nbins = 10
    max_distance = 10.0
    bin_width = max_distance / nbins
    fingerprint = [0.0] * (nbins * 3)

    atom_block = []
    base_atom = pybel_mol.atoms[10].OBAtom
    for ob_atom, coord in zip(pybel_mol, rotated_coordinates):
        atom_idx = ob_atom.OBAtom.GetIndex()
        if atom_idx > 10:
            atomicnum = ob_atom.atomicnum
            element = ATOMIC_NUMBER[atomicnum]
            ff_type = ob_atom.OBAtom.GetData("FFAtomType").GetValue()
            atom_block.append(
                "    {0:4} {1:5} {2[0]:10.6f} {2[1]:10.6f} {2[2]:10.6f}\n".
                format(element, ff_type, coord))

            # Generate fingerprint data
            distance = ob_atom.OBAtom.GetDistance(base_atom)
            if distance > max_distance:
                continue
            # Put in distance bin
            fingerprint[int(distance / bin_width)] += 1
            # Put in electronegativity bin
            electronegativity = ob.etab.GetElectroNeg(atomicnum)
            fingerprint[nbins + int(distance / bin_width)] += electronegativity
            # Put in vdw radii
            vdw_radius = ob.etab.GetVdwRad(atomicnum)
            fingerprint[2 * nbins + int(distance / bin_width)] += vdw_radius

    fingerprint = ",".join("{:.2f}".format(i) for i in fingerprint)

    #
    # 3D fingerprint
    #

    xmin, xmax = -5.658385, 6.758497
    ymin, ymax = -2.506779, 7.580274
    zmin, zmax = -2.469688, 4.024162
    spacing = 1.0

    # make gridpoints have the cartesian coordinates of all the
    # points of interest on the grid
    x_range = np.arange(xmin - 2.0 * spacing, xmax + 3.0 * spacing, spacing)
    y_range = np.arange(ymin - 2.0 * spacing, ymax + 3.0 * spacing, spacing)
    z_range = np.arange(zmin - 2.0 * spacing, zmax + 3.0 * spacing, spacing)
    gridpoints = [(x, y, z) for x in x_range for y in y_range for z in z_range]
    grid_shape = (len(x_range), len(y_range), len(z_range))

    # Calculate all the atom-point distances
    distance_matrix = cdist(rotated_coordinates, gridpoints)

    # Find charges for all the atoms manually.
    # Automatically would do gasteiger, but fails for
    # some elements and we use qeq anyway
    qeq = ob.OBChargeModel_FindType('qeq')
    qeq.ComputeCharges(pybel_mol.OBMol)

    # coulomb = q1q2/4pie0r no units yet...
    coulomb_matrix = np.zeros(len(gridpoints))
    for ob_atom, distances in zip(pybel_mol, distance_matrix):
        coulomb_matrix += ob_atom.partialcharge / distances

    # LJ potential based off UFF also no units yet...
    vdw_matrix = np.zeros(len(gridpoints))
    for ob_atom, distances in zip(pybel_mol, distance_matrix):
        # Lorentz-Berthelot mixing rules
        probe = (3.4309, 0.1050)  # Carbon
        source = UFF[ATOMIC_NUMBER[ob_atom.atomicnum]]
        sigma = (source[0] + probe[0]) / 2.0
        epsilon = (source[1] * probe[1])**0.5
        vdw_matrix += 4 * epsilon * ((sigma / distances)**12 -
                                     (sigma / distances)**6)

    # Make into 3D gridded data
    coulomb_matrix = np.reshape(coulomb_matrix, grid_shape)
    vdw_matrix = np.reshape(vdw_matrix, grid_shape)

    # Can clip the maximums here or elsewhere
    coulomb_matrix = np.clip(coulomb_matrix, -0.1, 0.1)
    vdw_matrix = np.clip(vdw_matrix, -10, 0)

    # 3D plotting for visualisation
    #from mayavi import mlab
    #s = mlab.contour3d(coulomb_matrix)
    #s = mlab.contour3d(vdw_matrix)
    #mlab.show()

    #
    # Output
    #

    output_text.append('atoms =\n')
    output_text.extend(atom_block)
    output_text.append('orientation = 0.0 1.0 0.0\n')
    output_text.append('normal = 0.0 0.0 1.0\n')
    output_text.append('carbon_bond = {:.3f}\n'.format(bonds[(-1, 0)][0]))
    output_text.append('fingerprint = {}\n'.format(fingerprint))

    bonds_block = []
    # no bonds < idx 11
    for bond in sorted(bonds):
        if not bond[0] < 0 and not bond[1] < 0:
            bonds_block.append("    {0[0]:4} {0[1]:4} {1[1]:5.2f}\n".format(
                bond, bonds[bond]))

    output_text.append('bonds =\n')
    output_text.extend(bonds_block[:])

    # Make some pictures; do this now so the ascii can go in the file
    # But first get rid of the benzene
    for _idx in range(10):
        pybel_mol.OBMol.DeleteAtom(pybel_mol.atoms[0].OBAtom)
    pybel_mol.atoms[0].OBAtom.SetType('R')

    if not 'ascii' in pybel.outformats:
        print("Ascii art not available, please upgrade openbabel")
    else:
        ascii_mol = pybel_mol.write(format='ascii', opt={'a': 2, 'w': 40})
        ascii_mol = [
            '# {}\n'.format(x) for x in ascii_mol.splitlines() if x.strip()
        ]
        output_text[2:2] = ['#\n'] + ascii_mol + ['#\n']

    basename = args.short_name

    pybel_mol.write(format='mol', filename='{}.mol'.format(basename))

    # Always output to a library
    with open('{}.flib'.format(basename), 'w') as out_lib:
        out_lib.writelines(output_text)

    # Make the image with R groups and implicit hydrogen
    unopt_mol = pybel.readstring('smi', "[*:1]" + fgroup)
    unopt_mol.write(format='svg',
                    filename='{}.svg'.format(basename),
                    opt={'C': None})

    # Make a table row in html
    with open('{}.html'.format(basename), 'w') as out_html:
        out_html.write("""\
                <td>{args.short_name}</td>
                <td><p>name: {args.name}</p>
                    <p>smiles: {args.smi_string}</p>
                    <p>MEPO-QEq compatible: {args.mepo_compatible}</td>
                <td><a href="img/{args.short_name}.svg">
                    <img src="img/{args.short_name}.svg"
                         alt="Group: {args.short_name}"
                         title="[{args.short_name}]
                                {args.name}
                                {args.smi_string})"
                         style="height: 75px"/></a>
                </td>
""".format(args=args))

    if args.terminal:
        print("".join(output_text))
Beispiel #60
0
    def _evaluate(self, X, std, gradient, hessian):
        F, dF, hF = [], [], []  # mean
        S, dS, hS = [], [], []  # std

        for gp in self.gps:

            # mean
            K = gp.kernel_(X, gp.X_train_)  # K: shape (N, N_train)
            y_mean = K.dot(gp.alpha_)

            F.append(y_mean)  # y_mean: shape (N,)

            if std:
                if gp._K_inv is None:
                    L_inv = solve_triangular(gp.L_.T, np.eye(gp.L_.shape[0]))
                    gp._K_inv = L_inv.dot(L_inv.T)

                y_var = gp.kernel_.diag(X)
                y_var -= np.einsum("ij,ij->i", np.dot(K, gp._K_inv), K)

                y_var_negative = y_var < 0
                if np.any(y_var_negative):
                    y_var[y_var_negative] = 0.0

                y_std = np.sqrt(y_var)

                S.append(y_std)  # y_std: shape (N,)

            if not (gradient or hessian): continue

            ell = np.exp(gp.kernel_.theta[1:-1])  # ell: shape (n_var,)
            sf2 = np.exp(gp.kernel_.theta[0])  # sf2: shape (1,)
            d = np.expand_dims(cdist(X / ell, gp.X_train_ / ell),
                               2)  # d: shape (N, N_train, 1)
            X_, X_train_ = np.expand_dims(X, 1), np.expand_dims(gp.X_train_, 0)
            dd_N = X_ - X_train_  # numerator
            dd_D = d * ell**2  # denominator
            dd = safe_divide(dd_N, dd_D)  # dd: shape (N, N_train, n_var)

            if self.nu == 1:
                dK = -sf2 * np.exp(-d) * dd

            elif self.nu == 3:
                dK = -3 * sf2 * np.exp(-np.sqrt(3) * d) * d * dd

            elif self.nu == 5:
                dK = -5. / 3 * sf2 * np.exp(
                    -np.sqrt(5) * d) * (1 + np.sqrt(5) * d) * d * dd

            else:  # RBF
                dK = -sf2 * np.exp(-0.5 * d**2) * d * dd

            dK_T = dK.transpose(
                0, 2, 1
            )  # dK: shape (N, N_train, n_var), dK_T: shape (N, n_var, N_train)

            if gradient:
                dy_mean = dK_T @ gp.alpha_  # gp.alpha_: shape (N_train,)
                dF.append(dy_mean)  # dy_mean: shape (N, n_var)

                # TODO: check
                if std:
                    K = np.expand_dims(K, 1)  # K: shape (N, 1, N_train)
                    K_Ki = K @ gp._K_inv  # gp._K_inv: shape (N_train, N_train), K_Ki: shape (N, 1, N_train)
                    dK_Ki = dK_T @ gp._K_inv  # dK_Ki: shape (N, n_var, N_train)

                    dy_var = -np.sum(dK_Ki * K + K_Ki * dK_T,
                                     axis=2)  # dy_var: shape (N, n_var)
                    dy_std = 0.5 * safe_divide(
                        dy_var, y_std)  # dy_std: shape (N, n_var)
                    dS.append(dy_std)

            if hessian:
                d = np.expand_dims(d, 3)  # d: shape (N, N_train, 1, 1)
                dd = np.expand_dims(dd, 2)  # dd: shape (N, N_train, 1, n_var)
                hd_N = d * np.expand_dims(np.eye(len(ell)), (
                    0, 1)) - np.expand_dims(X_ - X_train_, 3) * dd  # numerator
                hd_D = d**2 * np.expand_dims(ell**2, (0, 1, 3))  # denominator
                hd = safe_divide(hd_N,
                                 hd_D)  # hd: shape (N, N_train, n_var, n_var)

                if self.nu == 1:
                    hK = -sf2 * np.exp(-d) * (hd - dd**2)

                elif self.nu == 3:
                    hK = -3 * sf2 * np.exp(
                        -np.sqrt(3) * d) * (d * hd +
                                            (1 - np.sqrt(3) * d) * dd**2)

                elif self.nu == 5:
                    hK = -5. / 3 * sf2 * np.exp(
                        -np.sqrt(5) * d) * (-5 * d**2 * dd**2 +
                                            (1 + np.sqrt(5) * d) *
                                            (dd**2 + d * hd))

                else:  # RBF
                    hK = -sf2 * np.exp(-0.5 * d**2) * (
                        (1 - d**2) * dd**2 + d * hd)

                hK_T = hK.transpose(
                    0, 2, 3, 1
                )  # hK: shape (N, N_train, n_var, n_var), hK_T: shape (N, n_var, n_var, N_train)

                hy_mean = hK_T @ gp.alpha_  # hy_mean: shape (N, n_var, n_var)
                hF.append(hy_mean)

                # TODO: check
                if std:
                    K = np.expand_dims(K, 2)  # K: shape (N, 1, 1, N_train)
                    dK = np.expand_dims(dK_T,
                                        2)  # dK: shape (N, n_var, 1, N_train)
                    dK_Ki = np.expand_dims(
                        dK_Ki, 2)  # dK_Ki: shape (N, n_var, 1, N_train)
                    hK_Ki = hK_T @ gp._K_inv  # hK_Ki: shape (N, n_var, n_var, N_train)

                    hy_var = -np.sum(hK_Ki * K + 2 * dK_Ki * dK + K_Ki * hK_T,
                                     axis=3)  # hy_var: shape (N, n_var, n_var)
                    hy_std = 0.5 * safe_divide(
                        hy_var * y_std - dy_var * dy_std,
                        y_var)  # hy_std: shape (N, n_var, n_var)
                    hS.append(hy_std)

        F = np.stack(F, axis=1)
        dF = np.stack(dF, axis=1) if gradient else None
        hF = np.stack(hF, axis=1) if hessian else None

        S = np.stack(S, axis=1) if std else None
        dS = np.stack(dS, axis=1) if std and gradient else None
        hS = np.stack(hS, axis=1) if std and hessian else None

        out = {'F': F, 'dF': dF, 'hF': hF, 'S': S, 'dS': dS, 'hS': hS}
        return out