Code example #1
File: linkage.py Project: cheungzq/WebParser
import numpy as np
import scipy.cluster.hierarchy
from scipy.spatial import distance


def linkage(y, method):
    '''An extended edition of scipy.cluster.hierarchy.linkage allowing for a custom set-distance function.
        method can be a str indicator as in scipy's linkage,
        or a callable of the form f(dm, set1, set2) which computes the distance between two sets of
        observations given the square distance matrix dm
    '''

    if isinstance(method, str):
        return scipy.cluster.hierarchy.linkage(y, method=method)

    distance.is_valid_y(y, throw=True, name='y')
    d = distance.num_obs_y(y)

    Z = np.zeros((d-1,4))
    dm = distance.squareform(y)
    dmo = distance.squareform(y)
    
    dm[np.diag_indices(d)] = np.nan  # mask the diagonal so nanargmin skips self-distances
    #print(dm)
    idmap = {i:i for i in range(d)}
    active = [i for i in range(d)]
    nodes = {i:{i} for i in range(d)}
    for i in range(d-1):
        m = d - i
        mink = np.nanargmin(dm[np.ix_(active,active)])
        minh = active[mink//m]
        minw = active[mink % m]
        left = idmap[minh]
        right = idmap[minw]
        
        Z[i,0] = left
        Z[i,1] = right
        Z[i,2] = dm[minh,minw]
        Z[i,3] = len(nodes[left]) + len(nodes[right])
        nid = d+i
        
        idmap[minh] = nid
        nodes[nid] = nodes[left] | nodes[right]
        active.remove(minw)
        
        for j in active:
            if j == minh:
                continue
            dm[minh,j] = method(dmo, nodes[nid], nodes[idmap[j]])
            dm[j,minh] = dm[minh,j]
    return Z            
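A minimal usage sketch for the function above, on hypothetical random data: complete linkage expressed as the custom callable f(dm, set1, set2) over the original square matrix (the names below are illustrative, not part of the project).

# Hypothetical usage: complete linkage as a custom set-distance callable.
def complete_set_distance(dm, set1, set2):
    return max(dm[i, j] for i in set1 for j in set2)

y = distance.pdist(np.random.rand(6, 2))   # condensed distances
Z = linkage(y, complete_set_distance)
# Z should agree with scipy.cluster.hierarchy.linkage(y, method='complete'),
# up to tie-breaking.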
Code example #2
File: medoid.py Project: massimilian-89/eabc_v2
    def __fullEvalMat(self, cluster, Dissimilarity):

        # Evaluate all distances even if we have a diss matrix?
        M = Dissimilarity.pdist(cluster)
        if scpDist.is_valid_y(M):
            M = scpDist.squareform(M)
        #debug
        assert (M.shape[0] == M.shape[1])

        return M
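For context, a minimal self-contained sketch (hypothetical data) of the condensed/square round trip this helper relies on:

import numpy as np
from scipy.spatial import distance as scpDist

X = np.random.rand(5, 3)
y = scpDist.pdist(X)                 # condensed vector, length 5*4/2 = 10
assert scpDist.is_valid_y(y)
M = scpDist.squareform(y)            # 5x5 symmetric matrix, zero diagonal
assert scpDist.is_valid_dm(M)
assert np.allclose(scpDist.squareform(M), y)   # round trip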
Code example #3
    def get_linkage(self, d=None, method='average', skip_condensed=False):
        if d is None:
            d = self.G_sym or self.G

        # convert to numpy matrix if it isn't already
        if isinstance(d, nx.Graph):
            d = nx.to_numpy_array(d)  # to_numpy_matrix was removed in NetworkX 3.0
        if (skip_condensed is False) and (not is_valid_y(d)):
            d = squareform(d)
        Z = linkage(d, method=method)
        self.Z = Z
        return Z
Code example #4
File: setup.py Project: rahatzamancse/MPSE
def setup_distances(data, shortest_path=False, min_distance=1e-4, **kwargs):
    """\
    Sets up condensed distances.

    Parameters
    ----------

    data : array
    Distance/dissimilarity data. Options:
    1) a condensed array containing distances (n_samples*(n_samples-1)/2,)
    2) a square matrix containing distances (n_samples, n_samples)
    3) an array with features (length n_samples)

    shortest_path : boolean
    If True, alter distances by computing shortest path.

    kwargs
    ------

    metric : str
    If computing distances from an array of features, this is the metric to be
    passed to scipy.spatial.distance.pdist

    Returns
    -------

    distances : array, shape (n_samples*(n_samples-1)/2,)
    Condensed distances.
    """
    assert isinstance(data, np.ndarray)
    if len(data.shape) == 1:
        assert distance.is_valid_y(data)
        distances = data
    else:
        assert len(data.shape) == 2
        a, b = data.shape
        if b == a:
            distances = distance.squareform(data, checks=False)
        else:
            distances = distance.pdist(data, **kwargs)  # e.g. metric=...
    if shortest_path:
        distances = distance.squareform(distances)
        distances = csgraph.shortest_path(distances)
        distances = distance.squareform(distances, checks=False)
    if min_distance is not None:
        distances = np.maximum(distances, min_distance * np.max(distances))
    return distances
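A usage sketch with hypothetical random data, exercising the three accepted input forms; all three should reduce to the same condensed vector (up to the min_distance clamp):

import numpy as np
from scipy.spatial import distance

X = np.random.rand(10, 4)            # 3) feature array
y = distance.pdist(X)                # 1) condensed form, shape (45,)
D = distance.squareform(y)           # 2) square form, shape (10, 10)

assert np.allclose(setup_distances(y), setup_distances(D))
assert np.allclose(setup_distances(D), setup_distances(X))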
Code example #5
File: tetraedra_tools.py Project: paul-bd/pbmr
import numpy as np
from math import factorial
from scipy.spatial import distance


def simplex_volume(*, vertices=None, sides=None) -> float:
    """
    Return the volume of the simplex with given vertices or sides.

    If vertices are given they must be in a NumPy array with shape (N+1, N):
    the position vectors of the N+1 vertices in N dimensions. If the sides
    are given, they must be the compressed pairwise distance matrix as
    returned from scipy.spatial.distance.pdist.

    Raises a ValueError if the vertices do not form a simplex (for example,
    because they are coplanar, colinear or coincident).

    Warning: this algorithm has not been tested for numerical stability.
    """

    # Implements http://mathworld.wolfram.com/Cayley-MengerDeterminant.html

    if (vertices is None) == (sides is None):
        raise ValueError("Exactly one of vertices and sides must be given")

    # β_ij = |v_i - v_j|²
    if sides is None:
        vertices = np.asarray(vertices, dtype=float)
        sq_dists = distance.pdist(vertices, metric='sqeuclidean')

    else:
        sides = np.asarray(sides, dtype=float)
        if not distance.is_valid_y(sides):
            raise ValueError("Invalid number or type of side lengths")

        sq_dists = sides**2

    # Add border while compressed
    num_verts = distance.num_obs_y(sq_dists)
    bordered = np.concatenate((np.ones(num_verts), sq_dists))

    # Make matrix and find volume
    sq_dists_mat = distance.squareform(bordered)

    coeff = -(-2)**(num_verts - 1) * factorial(num_verts - 1)**2
    vol_square = np.linalg.det(sq_dists_mat) / coeff

    if vol_square <= 0:
        raise ValueError('Provided vertices do not form a simplex')

    return np.sqrt(vol_square)
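Two small worked checks of the Cayley-Menger computation above (hypothetical inputs): an equilateral triangle with unit sides has area sqrt(3)/4 ≈ 0.4330, and the unit right triangle has area 1/2.

area = simplex_volume(sides=np.array([1.0, 1.0, 1.0]))
assert abs(area - np.sqrt(3) / 4) < 1e-12

tri = np.array([[0.0, 0.0], [1.0, 0.0], [0.0, 1.0]])
assert abs(simplex_volume(vertices=tri) - 0.5) < 1e-12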
Code example #6
File: evaluate.py Project: VChristlein/wi_wacv14
def computeDistances(descriptors,
                     method,
                     parallel,
                     nprocs,
                     distance_func=None):
    num_desc = len(descriptors)

    indices = [(y, x) for y in range(num_desc - 1)
               for x in range(y + 1, num_desc)]
    splits = np.array_split(np.array(indices), 8)

    def loop(inds):
        dists = []
        for ind in inds:
            if distance_func is None:
                try:
                    dist = computeDistance(descriptors[ind[0]],
                                           descriptors[ind[1]], method)
                except:
                    print('method {} failed'.format(method))
                    raise
            else:
                dist = distance_func(descriptors[ind[0]], descriptors[ind[1]])
            dists.append(dist)
        return dists

    if parallel:
        dists = pc.parmap(loop, splits, nprocs)
    else:
        dists = list(map(loop, splits))

    # convert densed vector-form to matrix
    dense_vector = np.concatenate(dists)
    if spdistance.is_valid_y(dense_vector, warning=True):
        dist_matrix = spdistance.squareform(dense_vector)
    else:
        print('ERROR: not a valid condensed distance matrix!')
        n = dense_vector.shape[0]
        d = int(np.ceil(np.sqrt(n * 2)))
        should = d * (d - 1) // 2
        print('{} != {}, num: {}'.format(should, n, num_desc))
        sys.exit(1)

    # fill diagonal elements with max
    np.fill_diagonal(dist_matrix, np.finfo(float).max)
    return dist_matrix
Code example #7
import numpy as np
from scipy.spatial import distance


def most_and_least_similar_pairs(distance_matrix):
  if not distance.is_valid_dm(distance_matrix):
    if not distance.is_valid_y(distance_matrix):
      raise ValueError('Invalid distance matrix. Please supply a condensed or redundant distance matrix.')
    distance_matrix = distance.squareform(distance_matrix, force='tomatrix')
  # Initialise with infinities so a pair is always found; the original 1/0
  # bounds silently assumed scores in [0, 1].
  similar_score = np.inf
  dissimilar_score = -np.inf
  similar_indices = None
  dissimilar_indices = None
  n = distance_matrix.shape[0]
  for i in range(0, n):
    for j in range(i+1, n):
      score = distance_matrix[i, j]
      if score < similar_score:
        similar_score = score
        similar_indices = (i, j)
      if score > dissimilar_score:
        dissimilar_score = score
        dissimilar_indices = (i, j)
  return similar_score, similar_indices, dissimilar_score, dissimilar_indices
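Hypothetical usage: the function accepts either form, so a raw pdist output works directly.

pts = np.random.rand(8, 2)
y = distance.pdist(pts)              # condensed input
closest, closest_ij, farthest, farthest_ij = most_and_least_similar_pairs(y)
print('closest pair {} at distance {:.3f}'.format(closest_ij, closest))
print('farthest pair {} at distance {:.3f}'.format(farthest_ij, farthest))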
Code example #8
from itertools import permutations
from math import factorial
from numpy import asarray, random, sqrt, zeros
from scipy.spatial import distance
from scipy.stats import rankdata


def Test(X, Y, perms=10000, method='pearson', tail='upper'):
  """
  Takes two distance matrices (either redundant matrices or condensed vectors)
  and performs a Mantel test. The Mantel test is a significance test of the
  correlation between two distance matrices.

  Parameters
  ----------
  X : array_like
      First distance matrix (condensed or redundant).
  Y : array_like
      Second distance matrix (condensed or redundant), where the order of
      elements corresponds to the order of elements in the first matrix.
  perms : int, optional
      The number of permutations to perform (default: 10000). A larger number
      gives more reliable results but takes longer to run. If the actual number
      of possible permutations is smaller, the program will enumerate all
      permutations. Enumeration can be forced by setting this argument to 0.
  method : str, optional
      Type of correlation coefficient to use; either 'pearson' or 'spearman'
      (default: 'pearson').
  tail : str, optional
      Which tail to test in the calculation of the empirical p-value; either
      'upper' or 'lower' (default: 'upper').

  Returns
  -------
  r : float
      Veridical correlation
  p : float
      Empirical p-value
  z : float
      Standard score (z-score)
  """

  # Ensure X and Y are arrays.

  X = asarray(X, dtype=float)
  Y = asarray(Y, dtype=float)

  # Check that X and Y are valid distance matrices/vectors.

  if not distance.is_valid_dm(X) and not distance.is_valid_y(X):
    raise ValueError('X is not a valid distance matrix')

  if not distance.is_valid_dm(Y) and not distance.is_valid_y(Y):
    raise ValueError('Y is not a valid distance matrix')

  # If X or Y is a matrix, condense it to a vector.

  if len(X.shape) == 2:
    X = distance.squareform(X, force='tovector', checks=False)

  if len(Y.shape) == 2:
    Y = distance.squareform(Y, force='tovector', checks=False)

  # Check for size equality.

  if X.shape[0] != Y.shape[0]:
    raise ValueError('X and Y are not of equal size')

  # Check for minimum size.

  if X.shape[0] < 3:
    raise ValueError('X and Y should represent at least 3 objects')

  # If Spearman correlation is requested, convert X and Y to ranks.

  if method == 'spearman':
    X = rankdata(X)
    Y = rankdata(Y)

  elif method != 'pearson':
    raise ValueError('The method should be set to "pearson" or "spearman"')

  # Most parts of the correlation coefficient will be the same for every
  # permutation and can therefore be computed outside the loop.

  X_res = X - X.mean() # X residuals
  Y_res = Y - Y.mean() # Y residuals
  X_ss = (X_res * X_res).sum() # X sum-of-squares
  Y_ss = (Y_res * Y_res).sum() # Y sum-of-squares
  denominator = sqrt(X_ss * Y_ss) # Denominator of the correlation coefficient

  # Although Y_res will be the same set of numbers on every permutation, the
  # order will be different each time. Therefore, we reformat Y_res as a matrix
  # so that we can take matrix permutations of the Y residuals.
  Y_res_as_matrix = distance.squareform(Y_res, force='tomatrix', checks=False)

  # Determine the size of the matrix (i.e. number of rows/columns).
  n = Y_res_as_matrix.shape[0]

  # Initialize an empty array to store temporary vector permutations of Y_res.
  Y_res_permuted = zeros(Y_res.shape[0], dtype=float)

  # Either enumerate all permutations ...

  if perms >= factorial(n) or perms == 0:

    # Initialize an empty array to store the correlations.
    corrs = zeros(factorial(n), dtype=float)

    # Enumerate all permutations of row/column orders.
    orders = permutations(range(n))

    perms = 0

    for order in orders:

      # Take a permutation of the matrix.
      Y_res_as_matrix_permuted = Y_res_as_matrix[order, :][:, order]

      # Condense the permuted version of the matrix. Rather than use
      # distance.squareform(), we call directly into the C wrapper for speed.
      distance._distance_wrap.to_vector_from_squareform_wrap(Y_res_as_matrix_permuted, Y_res_permuted)

      # Compute the correlation coefficient and store it to corrs.
      corrs[perms] = (X_res * Y_res_permuted).sum() / denominator

      perms += 1

  # ... or randomly sample from the space of permutations.

  else:

    # Initialize an empty array to store the correlations.
    corrs = zeros(perms, dtype=float)

    # Store the veridical correlation coefficient first.
    corrs[0] = (X_res * Y_res).sum() / denominator

    for i in range(1, perms):

      # Choose a random order in which to permute the rows and columns.
      order = random.permutation(n)

      # Take a permutation of the matrix.
      Y_res_as_matrix_permuted = Y_res_as_matrix[order, :][:, order]

      # Condense the permuted version of the matrix. Rather than use
      # distance.squareform(), we call directly into the C wrapper for speed.
      distance._distance_wrap.to_vector_from_squareform_wrap(Y_res_as_matrix_permuted, Y_res_permuted)

      # Compute the correlation coefficient and store it to corrs.
      corrs[i] = (X_res * Y_res_permuted).sum() / denominator

  # Assign veridical correlation to r.
  r = corrs[0]

  # Calculate the empirical p-value for the upper or lower tail.

  if tail == 'upper':
    p = (corrs >= r).sum() / float(perms)

  elif tail == 'lower':
    p = (corrs <= r).sum() / float(perms)

  else:
    raise ValueError('The tail should be set to "upper" or "lower"')

  # Calculate the standard score.

  m = corrs.mean()
  sd = corrs.std()
  z = (r - m) / sd

  return r, p, z
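A hypothetical end-to-end run: correlating two condensed matrices over the same 20 objects, where the second set of points is a noisy copy of the first, so a strong positive correlation is expected.

import numpy as np
from scipy.spatial import distance

a = np.random.rand(20, 3)
b = a + np.random.normal(scale=0.05, size=a.shape)   # noisy copy of a
X = distance.pdist(a)
Y = distance.pdist(b)
r, p, z = Test(X, Y, perms=10000)
print('r = %.3f, p = %.4f, z = %.2f' % (r, p, z))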
Code example #9
def computeDistances(descriptors,
                     method,
                     distance=True,
                     parallel=True,
                     distance_func=None,
                     nprocs=4):
    num_desc = len(descriptors)

    if np.isnan(descriptors).any():
        raise ValueError('nan in descr!')
    if np.isinf(descriptors).any():
        raise ValueError('inf in descr!')

    for i in range(len(descriptors)):
        if not descriptors[i].any():  # faster
            print('WARNING: complete row {} is 0'.format(i))

    indices = [(y, x) for y in range(num_desc - 1)
               for x in range(y + 1, num_desc)]

    def loop(ind):
        if distance_func is None:
            try:
                dist = computeDistance(descriptors[ind[0]],
                                       descriptors[ind[1]], method)
            except:
                print('method {} failed'.format(method))
                raise
        else:
            dist = distance_func(descriptors[ind[0]], descriptors[ind[1]])
        return dist

    if parallel:
        dists = pc.parmap(loop, indices, nprocs=nprocs)
    else:
        dists = list(map(loop, indices))

    dense_vector = np.array(dists, dtype=float)

    if spdistance.is_valid_y(dense_vector, warning=True):
        dist_matrix = spdistance.squareform(dense_vector)
    else:
        print('ERROR: not a valid condensed distance matrix!')
        n = dense_vector.shape[0]
        d = int(np.ceil(np.sqrt(n * 2)))
        should = d * (d - 1) // 2
        print('{} != {}, num: {}'.format(should, n, num_desc))
        sys.exit(1)

    # do some checks
    if np.isnan(dist_matrix).any():
        print('WARNING: have a nan in the dist-matrix')
    if np.isinf(dist_matrix).any():
        print('WARNING: have an inf in the dist-matrix')

    if distance:
        if np.count_nonzero(
                dist_matrix == np.finfo(dist_matrix.dtype).max) > 0:
            raise ValueError('there is already a float-maximum')
        np.fill_diagonal(dist_matrix, np.finfo(dist_matrix.dtype).max)
    else:
        if np.count_nonzero(
                dist_matrix == np.finfo(dist_matrix.dtype).min) > 0:
            raise ValueError('there is already a float-min')
        np.fill_diagonal(dist_matrix, np.finfo(dist_matrix.dtype).min)

    return dist_matrix  #, dist_m
Code example #10
    group_output.add_argument('-f', '--output-format', default="newick", choices=["newick", "json", "png"], help='The output format. [Default: %(default)s]')
    args = parser.parse_args()

    # Load distance matrix
    dist_matrix_io = DistanceMatrixIO(args.input_distances)
    dist_matrix = dist_matrix_io.dist_matrix
    rows_names = dist_matrix_io.names

    # Process tree
    tree = None
    data_link = None
    if len(rows_names) == 1:
        tree = Node(rows_names[0])
    else:
        # Computing distance and linkage
        if not is_valid_y(dist_matrix):
            dist_matrix = squareform(dist_matrix)
        data_link = linkage(dist_matrix, args.linkage_method)
        # SciPy format to Node
        hc_tree = to_tree(data_link, rd=False)
        id_2_name = dict(zip(range(len(rows_names)), rows_names))
        tree = Node.fromClusterNode(hc_tree, id_2_name)

    # Write output
    if args.output_format != "png":  # Text outputs
        out_str = None
        if args.output_format == "newick":
            out_str = "{};".format(tree.toNewick())
        elif args.output_format == "json":
            out_str = json.dumps(tree.toDict(), default=lambda o: o.__dict__, sort_keys=False)
        with open(args.output_tree, "w") as FH_out:
            FH_out.write(out_str)
Code example #11
     dm = d.squareform(d.pdist(coords))  # Distance matrix using scipy.pdist
     contact = where(dm < cutoff, ones_like(dm), zeros_like(
         dm))  # An array with 1 if distance < cutoff, 0 otherwise
     dms.append(dm)  # List of distance matrices from all structures
     contacts.append(
         contact)  # List of contact matrices from all structures
     rg = sqrt(sum(dm**2) / (2 * nres**2))  # Calculates radius of gyration
     f_rg.write("%i\t%.3f\n" % (i / nres + 1, rg))
 f_rg.close()
 ave_dms = average(array(dms),
                   axis=0)  # Mean distance matrix from all structures
 std_dms = std(array(dms),
               axis=0)  # Std distance matrix from all structures
 ave_contacts = average(array(contacts),
                        axis=0)  # Mean contact matrix from all structures
 if d.is_valid_y(ave_dms): ave_dms = d.squareform(ave_dms)
 if d.is_valid_y(std_dms): std_dms = d.squareform(std_dms)
 if d.is_valid_y(ave_contacts): ave_contacts = d.squareform(ave_contacts)
 savetxt(f[:-4] + '.dm', ave_dms, fmt="%.3f")
 savetxt(f[:-4] + '.std', std_dms, fmt="%.3f")
 savetxt(f[:-4] + '.cm', ave_contacts, fmt="%.3f")
 m, n = ave_dms.shape
 scalingf = open(f[:-4] + '.nu', 'w')  # File with polymer scaling (r vs. N)
 scalingfs = open(f[:-4] + '.nus',
                  'w')  # File with std of polymer scaling (r vs. N)
 for i in range(m):
     dm_diag = diagonal(ave_dms, i)  # Diagonals of distance matrix
     scalingf.write("%s\t%.3f\n" %
                    (i, average(dm_diag)))  # Averaged to get mean r
     dm_diags = diagonal(std_dms, i)  # Diagonals of std distance matrix
     scalingfs.write("%s\t%.3f\n" %
                     (i, average(dm_diags)))  # Averaged to get std of r
Code example #12
File: fastcluster.py Project: dooog/Galaxy-Clusters
def linkage(D, method='single', metric='euclidean', preserve_input=True):
    '''Hierarchical (agglomerative) clustering on a dissimilarity matrix or on
Euclidean data.

The argument D is either a compressed distance matrix or a collection
of m observation vectors in n dimensions as an (m×n) NumPy array. Apart
from the argument preserve_input, the methods have the same input
parameters and output format as the functions of the same name in the
package scipy.cluster.hierarchy. Therefore, the documentation is not
duplicated here. Please refer to the SciPy documentation for further
details.

The additional, optional argument preserve_input specifies whether the
fastcluster package first copies the distance matrix or writes into
the existing array. If the distance matrix is only generated for the
clustering step and is not needed afterwards, half the memory can be
saved by specifying preserve_input=False.

Note that the input array D contains unspecified values after this
procedure. It is therefore safer to write

    linkage(D, method="…", preserve_input=False)
    del D

to make sure the matrix D is not accidentally used after it has been
used as scratch memory.

The single linkage algorithm does not write to the distance matrix or
its copy anyway, so the preserve_input flag has no effect in this
case.'''
    if not isinstance(D, ndarray):
        raise ValueError('The first argument must be of type numpy.ndarray.')
    if len(D.shape)==1:
        if method=='single':
            assert D.dtype==double
            D_ = require(D, dtype=double, requirements=['C'])
            if D_ is not D:
                stderr.write('The condensed distance matrix had to be copied since it has the following flags:\n')
                stderr.write(str(D.flags) + '\n')
        elif preserve_input:
            D_ = D.copy()
            assert D_.dtype == double
            assert D_.flags.c_contiguous
            assert D_.flags.owndata
            assert D_.flags.writeable
            assert D_.flags.aligned
        else:
            assert D.dtype==double
            D_ = require(D, dtype=double, requirements=['C', 'A', 'W', 'O'])
            if D_ is not D:
                stderr.write('The condensed distance matrix had to be copied since it has the following flags:\n')
                stderr.write(str(D.flags) + '\n')

        is_valid_y(D_, throw=True)

        N = num_obs_y(D_)
        Z = empty((N-1,4))
        if N > 1:
            linkage_wrap(N, D_, Z, mthidx[method])
        return Z
    else:
        assert len(D.shape)==2
        N = D.shape[0]
        Z = empty((N-1,4))
        D_ = pdist(D, metric)
        assert D_.dtype == double
        assert D_.flags.c_contiguous
        assert D_.flags.owndata
        assert D_.flags.writeable
        assert D_.flags.aligned
        if N > 1:
            linkage_wrap(N, D_, Z, mthidx[method])
        return Z
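A hedged usage sketch (mthidx and linkage_wrap are module internals assumed to be available alongside linkage): clustering a condensed matrix in place and discarding it afterwards, as the docstring recommends.

import numpy as np
from scipy.spatial.distance import pdist

D = pdist(np.random.rand(50, 4))     # condensed, dtype float64
Z = linkage(D, method='average', preserve_input=False)
del D   # D may now hold scratch values and must not be reused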
Code example #13
def linearized_fuzzy_c_medoids(data,
                               distance_matrix,
                               components=10,
                               eps=1e-4,
                               max_iter=1000,
                               fuzzifier=2,
                               membership_subset_size=None,
                               initialization_method="random_choice",
                               empty_clusters_method="nothing",
                               medoids_idx=None):
    """ Performs the linearized fuzzy c-medoids clustering algorithm on a dataset.

    :param data: The dataset on which the clustering will be performed. The dataset must be a 2D np.array with rows as
    examples and columns as features.
    :param distance_matrix: The pairwise distance matrix applied across all examples from the data matrix. The distance
    matrix must be encoded into a condensed distance vector (see:
    https://docs.scipy.org/doc/scipy/reference/generated/scipy.spatial.distance.squareform.html)
    :param components: The number of components (clusters) wanted.
    :param eps: Criterion used to define convergence. If the absolute difference between two consecutive losses is
    lower than `eps`, the clustering stops.
    :param max_iter: Criterion used to stop the clustering if the number of iterations exceeds `max_iter`.
    :param fuzzifier: Membership fuzzification coefficient.
    :param membership_subset_size: Size of the subset to inspect during the memberships matrix computation. Reduces
    computation time.
    :param initialization_method: Method used to initialise the centroids. Can take one of the following values :
    * "random_uniform" or "uniform", samples values between the min and max across each dimension.
    * "random_gaussian" or "gaussian", samples values from a gaussian with the same mean and std as each data's
    dimension.
    * "random_choice" or "choice", samples random examples from the data without replacement.
    * "central_dissimilar_medoids", sample the first medoid as the most central point of the dataset, then sample all
    successive medoids as the most dissimilar to all medoids that have already been picked.
    * "central_dissimilar_random_medoids", same as "central_dissimilar_medoids", but the first medoid is sampled
    randomly.
    :param empty_clusters_method: Method used at each iteration to handle empty clusters. Can take one of the following
    values :
    * "nothing", do absolutely nothing and ignore empty clusters.
    * "random_example", assign a random example to all empty clusters.
    * "furthest_example_from_its_centroid", assign the furthest example from its centroid to each empty cluster.
    :param medoids_idx: Initial medoid indexes to use instead of initializing them randomly.
    :return: A tuple containing :
    * The memberships matrix.
    * The medoids matrix.
    * An array with all losses at each iteration.
    """
    assert len(data.shape) == 2, "The data must be a 2D array"
    assert data.shape[0] > 0, "The data must have at least one example"
    assert data.shape[1] > 0, "The data must have at least one feature"
    assert is_valid_y(
        distance_matrix
    ), "The distance matrix is not encoded into a condensed distance vector"
    assert 1 <= components <= data.shape[0], \
        "The number of components wanted must be between 1 and %s" % data.shape[0]
    assert 0 <= max_iter, "The number of max iterations must be positive"
    assert fuzzifier > 1, "The fuzzifier must be greater than 1"
    assert (membership_subset_size is None) or (1 <= membership_subset_size <= data.shape[0]), \
        "The membership subset size wanted must be between 1 and %s" % data.shape[0]
    assert (medoids_idx is None) or (medoids_idx.shape == (components,)), \
        "The given medoids indexes do not have a correct shape. Expected shape : {}, given shape : {}".format(
            (components,), medoids_idx.shape
        )
    assert (medoids_idx is None) or np.all(medoids_idx < data.shape[0]), \
        "The provided medoid indexes array contains unreachable indexes"

    raise NotImplementedError("TODO")

    # If no `membership_subset_size` is specified, [1] suggest to use a value much smaller than the average of points
    # in a cluster
    if membership_subset_size is None:
        membership_subset_size = distance_matrix.shape[0] // components

    # Initialisation
    if medoids_idx is None:
        medoids_idx = cluster_initialization(distance_matrix,
                                             components,
                                             initialization_method,
                                             need_idx=True)

    with tqdm(total=max_iter, bar_format=_FORMAT_PROGRESS_BAR) as progress_bar:
        best_memberships = None
        best_medoids_idx = None
        best_loss = np.inf

        memberships = None
        medoids_idx_old = None
        losses = []
        current_iter = 0
        while (current_iter < max_iter) and \
                ((current_iter < 1) or (not all(medoids_idx == medoids_idx_old))) and \
                ((current_iter < 2) or not (abs(losses[-1] - losses[-2]) <= eps)):

            medoids_idx_old = medoids_idx
            memberships = _compute_memberships(distance_matrix, medoids_idx,
                                               fuzzifier)
            handle_empty_clusters(distance_matrix,
                                  medoids_idx,
                                  memberships,
                                  strategy=empty_clusters_method)

            top_memberships_mask = _compute_top_membership_subset(
                memberships, membership_subset_size)
            medoids_idx = _compute_medoids(distance_matrix, memberships,
                                           fuzzifier, top_memberships_mask)

            loss = _compute_loss(distance_matrix, medoids_idx, memberships,
                                 fuzzifier)
            losses.append(loss)
            if loss < best_loss:
                best_loss = loss
                best_memberships = memberships
                best_medoids_idx = medoids_idx

            # Update the progress bar
            current_iter += 1
            progress_bar.update()
            progress_bar.set_postfix({
                "Loss": "{0:.6f}".format(loss),
                "best_loss": "{0:.6f}".format(best_loss)
            })

    return {
        "memberships": best_memberships,
        "medoids_indexes": best_medoids_idx,
        "clusters_center": data[best_medoids_idx, :],
        "losses": np.array(losses),
        "affectations": best_memberships.argmax(axis=1),
        "ambiguity": ambiguity(best_memberships),
        "partition_coefficient": partition_coefficient(best_memberships),
        "partition_entropy": partition_entropy(best_memberships),
        "extended_time": progress_bar.last_print_t - progress_bar.start_t,
    }
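The docstring above requires the distances in condensed form; for reference, a sketch of the standard mapping from a pair (i, j) of n observations to its position in that vector (condensed_index is a hypothetical helper, not part of the project):

import numpy as np
from scipy.spatial.distance import num_obs_y, pdist

def condensed_index(n, i, j):
    # Position of pair (i, j), i < j, in the condensed vector of n points.
    if i > j:
        i, j = j, i
    return n * i - i * (i + 1) // 2 + (j - i - 1)

X = np.random.rand(7, 3)
y = pdist(X)
n = num_obs_y(y)
assert np.isclose(y[condensed_index(n, 2, 5)], np.linalg.norm(X[2] - X[5]))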
Code example #14
def computeDistances(descriptors,
                     distance=True,
                     parallel=True,
                     nprocs=None,
                     normalize=False):
    num_desc = len(descriptors)

    if np.isnan(descriptors).any():
        raise ValueError('nan in descr!')
    if np.isinf(descriptors).any():
        raise ValueError('inf in descr!')

    for i in range(len(descriptors)):
        #        if np.count_nonzero(descriptors[i]) == 0:
        if not descriptors[i].any():  # faster
            print('WARNING: complete row {} is 0'.format(i))

    indices = [(y, x) for y in range(num_desc - 1)
               for x in range(y + 1, num_desc)]
    splits = np.array_split(np.array(indices), 8)

    def loop(inds):
        dists = []
        for ind in inds:
            dist = spdistance.cosine(descriptors[ind[0]], descriptors[ind[1]])
            #            dist = 1.0 - np.dot(descriptors[ind[0]], descriptors[ind[1]]) / \
            #                ( np.sqrt(descriptors[ind[0]]**2) *\
            #                 np.sqrt(descriptors[ind[1]]**2))
            dists.append(dist)
        return dists

    if parallel:
        dists = parmap(loop, splits, nprocs)
    else:
        dists = list(map(loop, splits))

    # convert densed vector-form to matrix
    dense_vector = np.concatenate(dists)
    if spdistance.is_valid_y(dense_vector, warning=True):
        dist_matrix = spdistance.squareform(dense_vector)
    else:
        print('ERROR: not a valid condensed distance matrix!')
        n = dense_vector.shape[0]
        d = int(np.ceil(np.sqrt(n * 2)))
        should = d * (d - 1) // 2
        raise ValueError('{} != {}, num: {}'.format(should, n, num_desc))

    # do some checks
    if np.isnan(dist_matrix).any():
        print('WARNING: have a nan in the dist-matrix')
    if np.isinf(dist_matrix).any():
        print('WARNING: have an inf in the dist-matrix')

    if normalize:
        dist_matrix /= np.sum(dist_matrix)


#    if distance:
#        if np.count_nonzero(dist_matrix == np.finfo(float).max) > 0:
#            raise ValueError('there is already a float-maximum')
#        if normalize:
#            dist_matrix /= np.sum(dist_matrix)
#        np.fill_diagonal(dist_matrix, np.finfo(float).max)
#    else:
#        if np.count_nonzero(dist_matrix == np.finfo(float).min) > 0:
#            raise ValueError('there is already a float-min')
#        if normalize:
#            dist_matrix /= np.sum(dist_matrix)
#        np.fill_diagonal(dist_matrix, np.finfo(float).min)

    return dist_matrix  #, dist_m
Code example #15
from itertools import permutations
from math import factorial
from numpy import asarray, random, zeros
from scipy.spatial import distance
from scipy.stats import kendalltau, pearsonr, spearmanr


def Test(X, Y, perms=10000, method='pearson', tail='upper'):
    """
  Takes two distance matrices (either redundant matrices or condensed vectors)
  and performs a Mantel test. The Mantel test is a significance test of the
  correlation between two distance matrices.

  Parameters
  ----------
  X : array_like
      First distance matrix (condensed or redundant).
  Y : array_like
      Second distance matrix (condensed or redundant), where the order of
      elements corresponds to the order of elements in the first matrix.
  perms : int, optional
      The number of permutations to perform (default: 10000). A larger number
      gives more reliable results but takes longer to run. If the actual number
      of possible permutations is smaller, the program will enumerate all
      permutations. Enumeration can be forced by setting this argument to 0.
  method : str, optional
      Type of correlation coefficient to use; either 'pearson', 'spearman', or
      'kendall' (default: 'pearson'). N.B. Kendall's tau is far more expensive
      to compute per permutation, so it is slow for large matrices.
  tail : str, optional
      Which tail to test in the calculation of the empirical p-value; either
      'upper' or 'lower' (default: 'upper').

  Returns
  -------
  r : float
      Veridical correlation
  p : float
      Empirical p-value
  z : float
      Standard score (z-score)
  """

    # Ensure X and Y are arrays.

    X = asarray(X, dtype=float)
    Y = asarray(Y, dtype=float)

    # Check that X and Y are valid distance matrices/vectors.

    if not distance.is_valid_dm(X) and not distance.is_valid_y(X):
        raise ValueError('X is not a valid distance matrix')

    if not distance.is_valid_dm(Y) and not distance.is_valid_y(Y):
        raise ValueError('Y is not a valid distance matrix')

    # Figure out whether X and Y are matrices or vectors and convert both to
    # vectors and one to a matrix (as needed).

    # X is vector and Y is vector
    if len(X.shape) == 1 and len(Y.shape) == 1:
        Y_as_matrix = distance.squareform(Y, force='tomatrix', checks=False)

    # X is vector and Y is matrix
    elif len(X.shape) == 1 and len(Y.shape) == 2:
        Y_as_matrix = Y
        Y = distance.squareform(Y, force='tovector', checks=False)

    # X is matrix and Y is vector
    elif len(X.shape) == 2 and len(Y.shape) == 1:
        Y_as_matrix = X
        X, Y = Y, distance.squareform(X, force='tovector', checks=False)

    # X is matrix and Y is matrix
    elif len(X.shape) == 2 and len(Y.shape) == 2:
        Y_as_matrix = Y
        X = distance.squareform(X, force='tovector', checks=False)
        Y = distance.squareform(Y, force='tovector', checks=False)

    # Check for size equality.

    if X.shape[0] != Y.shape[0]:
        raise ValueError('X and Y are not of equal size')

    # Check for minimum size.

    if X.shape[0] < 3:
        raise ValueError('X and Y should represent at least 3 objects')

    # Assign the relevant correlation function to the variable 'correlate'.

    if method == 'pearson':
        correlate = pearsonr

    elif method == 'spearman':
        correlate = spearmanr

    elif method == 'kendall':
        correlate = kendalltau

    else:
        raise ValueError(
            'The method should be set to "pearson", "spearman", or "kendall"')

    # Determine the size of the matrix (i.e. number of rows/columns).
    n = Y_as_matrix.shape[0]

    # Initialize an empty array to store temporary vector permutations of Y.
    Y_permuted = zeros(Y.shape[0], dtype=float)

    # Either enumerate all permutations ...

    if perms >= factorial(n) or perms == 0:

        # Initialize an empty array to store the correlations.
        corrs = zeros(factorial(n), dtype=float)

        # Enumerate all permutations of row/column orders.
        orders = permutations(range(n))

        perms = 0

        for order in orders:

            # Take a permutation of the matrix.
            Y_as_matrix_permuted = Y_as_matrix[order, :][:, order]

            # Condense the permuted version of the matrix. Rather than use
            # distance.squareform(), we call directly into the C wrapper for speed.
            distance._distance_wrap.to_vector_from_squareform_wrap(
                Y_as_matrix_permuted, Y_permuted)

            # Compute the correlation coefficient and store it to corrs.
            corrs[perms] = correlate(X, Y_permuted)[0]

            perms += 1

    # ... or randomly sample from the space of permutations.

    else:

        # Initialize an empty array to store the correlations.
        corrs = zeros(perms, dtype=float)

        # Store the veridical correlation coefficient first.
        corrs[0] = correlate(X, Y)[0]

        for i in range(1, perms):

            # Choose a random order in which to permute the rows and columns.
            order = random.permutation(n)

            # Take a permutation of the matrix.
            Y_as_matrix_permuted = Y_as_matrix[order, :][:, order]

            # Condense the permuted version of the matrix. Rather than use
            # distance.squareform(), we call directly into the C wrapper for speed.
            distance._distance_wrap.to_vector_from_squareform_wrap(
                Y_as_matrix_permuted, Y_permuted)

            # Compute the correlation coefficient and store it to corrs.
            corrs[i] = correlate(X, Y_permuted)[0]

    # Assign veridical correlation to r.
    r = corrs[0]

    # Calculate the empirical p-value for the upper or lower tail.

    if tail == 'upper':
        p = (corrs >= r).sum() / float(perms)

    elif tail == 'lower':
        p = (corrs <= r).sum() / float(perms)

    else:
        raise ValueError('The tail should be set to "upper" or "lower"')

    # Calculate the standard score.

    m = corrs.mean()
    sd = corrs.std()
    z = (r - m) / sd

    return r, p, z