Esempio n. 1
0
def spread(d, t):
    """
    Measure how much of compound set A lies close to compound set B.

    The input holds the distances between the compounds of a set A (rows)
    and the compounds of a set B (columns). The result is the fraction of
    rows that contain at least one distance strictly smaller than ``t``.

    When ``d`` is a valid redundant distance matrix (symmetric, with zeros
    on the diagonal), the diagonal self-distances are excluded from the
    comparison.

    Parameters
    ----------
    d : ndarray
        Distance matrix with compounds from set A on first axis.
    t : float
        Distance threshold.

    Returns
    -------
    float
        Fraction of rows of ``d`` with at least one entry below ``t``.
    """
    distances = d
    if is_valid_dm(distances):
        # Work on a copy so the caller's matrix is left untouched, then
        # push the self-distances out of reach of any threshold.
        distances = np.copy(distances)
        diagonal = np.arange(distances.shape[0])
        distances[diagonal, diagonal] = np.inf

    has_close_neighbour = np.any(distances < t, axis=1)
    return np.mean(has_close_neighbour)
Esempio n. 2
0
    def test_fingerprint_pairwise_similarity(self):
        """
        Check that the pairwise similarity matrix built from the stored
        fingerprints is a valid redundant matrix covering 10 observations.
        """
        similarity_matrix = mol_fingerprint_pairwise_similarity(
            self.fps, self.toolkit_name)

        # Validate the matrix first; the observation count is only
        # queried once the matrix is known to be well formed.
        matrix_is_valid = hr.is_valid_dm(similarity_matrix)
        self.assertTrue(matrix_is_valid)

        observation_count = hr.num_obs_dm(similarity_matrix)
        self.assertEqual(observation_count, 10)
Esempio n. 3
0
def plot_clustering(D, groups):
    """Plot a hierarchical-clustering dendrogram for a pairwise distance matrix.

    Parameters
    ----------
    D : ndarray
        Square matrix of pairwise distances. It is symmetrized by averaging
        it with its transpose before clustering.
    groups : iterable
        Objects exposing a ``name`` attribute; the names are used as the
        leaf labels of the dendrogram.

    Raises
    ------
    AssertionError
        If the symmetrized matrix is not a valid distance matrix.
    """

    # Make the matrix D into a valid (symmetric) distance matrix.
    D = (D + D.transpose()) / 2

    # Raised explicitly (rather than via ``assert``) so the check is not
    # stripped when Python runs with -O; the exception type is unchanged.
    if not distance.is_valid_dm(D):
        raise AssertionError('D is not a valid distance matrix')

    # linkage() interprets a 2-D input as observation vectors, not as
    # distances, so the square matrix must be condensed first.
    Z = linkage(distance.squareform(D))

    # dendrogram() needs a sized, indexable sequence of labels; a lazy
    # map object (Python 3) does not qualify.
    labels = [g.name for g in groups]

    dendrogram(Z, labels=labels, leaf_rotation=90, leaf_font_size=20)
    show()
Esempio n. 4
0
def most_and_least_similar_pairs(distance_matrix):
  """
  Find the closest and the most distant pair of objects.

  Parameters
  ----------
  distance_matrix : array_like
      Distance matrix, either redundant (square) or condensed (vector).

  Returns
  -------
  similar_score
      Smallest distance between two distinct objects.
  similar_indices : tuple of int
      Indices ``(i, j)`` with ``i < j`` of the first pair having that
      smallest distance.
  dissimilar_score
      Largest distance between two distinct objects.
  dissimilar_indices : tuple of int
      Indices ``(i, j)`` with ``i < j`` of the first pair having that
      largest distance.

  Raises
  ------
  ValueError
      If the input is neither a valid redundant nor a valid condensed
      distance matrix, or if it describes fewer than two objects.
  """
  # Imported locally so the function does not depend on a module-level alias.
  import numpy as np

  # Accept plain nested lists as well as arrays (``.shape`` is needed below).
  distance_matrix = np.asarray(distance_matrix)

  if not distance.is_valid_dm(distance_matrix):
    if not distance.is_valid_y(distance_matrix):
      raise ValueError('Invalid distance matrix. Please supply a condensed or redundant distance matrix.')
    distance_matrix = distance.squareform(distance_matrix, force='tomatrix')

  n = distance_matrix.shape[0]
  if n < 2:
    raise ValueError('At least two objects are required to form a pair.')

  # Start from +/- infinity instead of 1 and 0 so that distances outside
  # the unit interval (or all-zero distances) are handled correctly.
  similar_score = float('inf')
  dissimilar_score = float('-inf')
  similar_indices = None
  dissimilar_indices = None

  # Only the upper triangle is scanned: the matrix is symmetric and the
  # diagonal holds self-distances.
  for i in range(n):
    for j in range(i + 1, n):
      score = distance_matrix[i, j]
      if score < similar_score:
        similar_score = score
        similar_indices = (i, j)
      if score > dissimilar_score:
        dissimilar_score = score
        dissimilar_indices = (i, j)

  return similar_score, similar_indices, dissimilar_score, dissimilar_indices
Esempio n. 5
0
    def compute_correlation_coefficient_matrix(self):
        """
        Build the correlation matrix and an average-linkage clustering of it.

        The matrix obtained from ``self.cosym.target.rij_matrix`` is edited
        through the same reference: its diagonal is set to 1 and its values
        are clipped to [-1, 1]. The dissimilarity ``1 - correlation`` is then
        clustered hierarchically.

        Returns
        -------
        tuple
            ``(correlation_matrix, linkage_matrix)`` where the second item
            is the result of ``hierarchy.linkage(..., method="average")``.
        """
        import scipy.spatial.distance as ssd

        rij = self.cosym.target.rij_matrix

        # Put ones on the diagonal.
        # NOTE(review): ``rij.all()[0]`` is presumably the number of rows of
        # the matrix -- confirm against the array type in use.
        n_rows = rij.all()[0]
        for k in range(n_rows):
            rij[k, k] = 1

        # Clip to [-1, 1] to account for floating point errors.
        rij.set_selected(rij < -1, -1)
        rij.set_selected(rij > 1, 1)

        # Turn correlation into a dissimilarity and hand it over to numpy.
        dissimilarity = (1 - rij).as_numpy_array()

        assert ssd.is_valid_dm(dissimilarity, tol=1e-12)
        # Convert the redundant n*n square form into a condensed nC2 array.
        condensed = ssd.squareform(dissimilarity, checks=False)

        linkage_matrix = hierarchy.linkage(condensed, method="average")

        return rij, linkage_matrix
Esempio n. 6
0
def Test(X, Y, perms=10000, method='pearson', tail='upper'):
  """
  Takes two distance matrices (either redundant matrices or condensed vectors)
  and performs a Mantel test. The Mantel test is a significance test of the
  correlation between two distance matrices.

  Parameters
  ----------
  X : array_like
      First distance matrix (condensed or redundant).
  Y : array_like
      Second distance matrix (condensed or redundant), where the order of
      elements corresponds to the order of elements in the first matrix.
  perms : int, optional
      The number of permutations to perform (default: 10000). A larger number
      gives more reliable results but takes longer to run. If the actual number
      of possible permutations is smaller, the program will enumerate all
      permutations. Enumeration can be forced by setting this argument to 0.
  method : str, optional
      Type of correlation coefficient to use; either 'pearson' or 'spearman'
      (default: 'pearson').
  tail : str, optional
      Which tail to test in the calculation of the empirical p-value; either
      'upper' or 'lower' (default: 'upper').

  Returns
  -------
  r : float
      Veridical correlation
  p : float
      Empirical p-value
  z : float
      Standard score (z-score)

  Raises
  ------
  ValueError
      If X or Y is not a valid distance matrix, if they differ in size, if
      they contain fewer than 3 distances, or if `method` or `tail` is not
      one of the supported values.
  """

  # Imported locally so the function does not depend on a module-level name
  # beyond those it already uses.
  from numpy import triu_indices

  # Ensure X and Y are arrays.
  X = asarray(X, dtype=float)
  Y = asarray(Y, dtype=float)

  # Check that X and Y are valid distance matrices/vectors.
  if not (distance.is_valid_dm(X) or distance.is_valid_y(X)):
    raise ValueError('X is not a valid distance matrix')

  if not (distance.is_valid_dm(Y) or distance.is_valid_y(Y)):
    raise ValueError('Y is not a valid distance matrix')

  # If X or Y is a matrix, condense it to a vector.
  if X.ndim == 2:
    X = distance.squareform(X, force='tovector', checks=False)

  if Y.ndim == 2:
    Y = distance.squareform(Y, force='tovector', checks=False)

  # Check for size equality.
  if X.shape[0] != Y.shape[0]:
    raise ValueError('X and Y are not of equal size')

  # Check for minimum size.
  if X.shape[0] < 3:
    raise ValueError('X and Y should represent at least 3 objects')

  # If Spearman correlation is requested, convert X and Y to ranks.
  if method == 'spearman':
    X = rankdata(X)
    Y = rankdata(Y)

  elif method != 'pearson':
    raise ValueError('The method should be set to "pearson" or "spearman"')

  # Reject an invalid tail before the (potentially long) permutation loop
  # rather than after it.
  if tail not in ('upper', 'lower'):
    raise ValueError('The tail should be set to "upper" or "lower"')

  # Most parts of the correlation coefficient will be the same for every
  # permutation and can therefore be computed outside the loop.
  X_res = X - X.mean() # X residuals
  Y_res = Y - Y.mean() # Y residuals
  X_ss = (X_res * X_res).sum() # X sum-of-squares
  Y_ss = (Y_res * Y_res).sum() # Y sum-of-squares
  denominator = sqrt(X_ss * Y_ss) # Denominator of the correlation coefficient

  # Although Y_res will be the same set of numbers on every permutation, the
  # order will be different each time. Therefore, we reformat Y_res as a matrix
  # so that we can take matrix permutations of the Y residuals.
  Y_res_as_matrix = distance.squareform(Y_res, force='tomatrix', checks=False)

  # Determine the size of the matrix (i.e. number of rows/columns).
  n = Y_res_as_matrix.shape[0]

  # Row/column indices of the upper triangle in condensed order. Entry (i, j)
  # of the matrix permuted by `order` is entry (order[i], order[j]) of the
  # original, so the condensed permuted vector can be read off directly with
  # public NumPy indexing (no private SciPy C wrapper, no n-by-n temporary).
  rows, cols = triu_indices(n, k=1)

  # Number of distinct row/column orders.
  n_orders = factorial(n)

  # Either enumerate all permutations ...
  if perms >= n_orders or perms == 0:

    # Initialize an empty array to store the correlations.
    corrs = zeros(n_orders, dtype=float)

    # The first order produced is the identity, so corrs[0] holds the
    # veridical correlation.
    for k, order in enumerate(permutations(range(n))):
      order = asarray(order)
      Y_res_permuted = Y_res_as_matrix[order[rows], order[cols]]
      corrs[k] = (X_res * Y_res_permuted).sum() / denominator

    # The p-value is computed over the number of orders actually evaluated.
    perms = n_orders

  # ... or randomly sample from the space of permutations.
  else:

    # Initialize an empty array to store the correlations.
    corrs = zeros(perms, dtype=float)

    # Store the veridical correlation coefficient first.
    corrs[0] = (X_res * Y_res).sum() / denominator

    for i in range(1, perms):

      # Choose a random order in which to permute the rows and columns.
      order = random.permutation(n)

      # Condensed form of the permuted matrix.
      Y_res_permuted = Y_res_as_matrix[order[rows], order[cols]]

      # Compute the correlation coefficient and store it to corrs.
      corrs[i] = (X_res * Y_res_permuted).sum() / denominator

  # Assign veridical correlation to r.
  r = corrs[0]

  # Calculate the empirical p-value for the upper or lower tail.
  if tail == 'upper':
    p = (corrs >= r).sum() / float(perms)
  else:
    p = (corrs <= r).sum() / float(perms)

  # Calculate the standard score.
  m = corrs.mean()
  sd = corrs.std()
  z = (r - m) / sd

  return r, p, z
Esempio n. 7
0
# Running sum of the rescaled, symmetrized matrices over all concentrations.
Xaverage = []
conc_list = ['0.0', '0.1', '0.2', '0.4', '0.6', '0.8', '0.9', '1.0']
#conc_list=['1.0']
numConc = len(conc_list)
bFirst = True

for e, conc in enumerate(conc_list):
    # Input matrix and output figure names for this concentration.
    in_file = './fitted_' + metric + '_' + conc + '_matrix.dat'
    out_file = 'matrix_' + metric + '_' + conc + '.pdf'

    X = gs.load_matrix(in_file)

    # Rescale and symmetrize matrix: clamp negatives to zero, then average
    # the matrix with its transpose.
    Xp = np.maximum(np.zeros(X.shape), X)
    Xp = 0.5 * (Xp + Xp.T)
    print(is_valid_dm(Xp))

    # The first matrix seeds the accumulator; later ones are added to it.
    Xaverage = Xp if bFirst else Xaverage + Xp
    bFirst = False

    # Fresh figure: a wide panel (5 of 7 columns) next to a narrow one.
    plt.clf()
    ax1 = plt.subplot2grid((1, 7), (0, 0), colspan=5)
    ax2 = plt.subplot2grid((1, 7), (0, 5), colspan=2)
    plt.subplots_adjust(left=0.04, right=0.99, bottom=0.03, top=0.76,
                        wspace=3.0, hspace=None)

    # Two-line caption giving the ratio for this concentration.
    plt.figtext(s='Prot:RNA-ratio', x=0.68, y=0.95,
                horizontalalignment='center', verticalalignment='center',
                **label_font1)
    plt.figtext(s='1.0:%s' % conc, x=0.68, y=0.90,
                horizontalalignment='center', verticalalignment='center',
                **label_font2)
def Test(X, Y, perms=10000, method='pearson', tail='upper'):
    """
    Takes two distance matrices (either redundant matrices or condensed
    vectors) and performs a Mantel test. The Mantel test is a significance
    test of the correlation between two distance matrices.

    Parameters
    ----------
    X : array_like
        First distance matrix (condensed or redundant).
    Y : array_like
        Second distance matrix (condensed or redundant), where the order of
        elements corresponds to the order of elements in the first matrix.
    perms : int, optional
        The number of permutations to perform (default: 10000). A larger
        number gives more reliable results but takes longer to run. If the
        actual number of possible permutations is smaller, the program will
        enumerate all permutations. Enumeration can be forced by setting
        this argument to 0.
    method : str, optional
        Type of correlation coefficient to use; either 'pearson',
        'spearman', or 'kendall' (default: 'pearson'). N.B. Kendall's tau
        can be slow for large matrices.
    tail : str, optional
        Which tail to test in the calculation of the empirical p-value;
        either 'upper' or 'lower' (default: 'upper').

    Returns
    -------
    r : float
        Veridical correlation
    p : float
        Empirical p-value
    z : float
        Standard score (z-score)

    Raises
    ------
    ValueError
        If X or Y is not a valid distance matrix, if they differ in size,
        if they contain fewer than 3 distances, or if `method` or `tail` is
        not one of the supported values.
    """

    # Imported locally so the function does not depend on a module-level
    # name beyond those it already uses.
    from numpy import triu_indices

    # Ensure X and Y are arrays.
    X = asarray(X, dtype=float)
    Y = asarray(Y, dtype=float)

    # Check that X and Y are valid distance matrices/vectors.
    if not (distance.is_valid_dm(X) or distance.is_valid_y(X)):
        raise ValueError('X is not a valid distance matrix')

    if not (distance.is_valid_dm(Y) or distance.is_valid_y(Y)):
        raise ValueError('Y is not a valid distance matrix')

    # Both inputs end up as condensed vectors, and one of them is also kept
    # in matrix form so that its rows/columns can be permuted.
    if Y.ndim == 2:
        # Y is a matrix (X may be either form).
        Y_as_matrix = Y
        Y = distance.squareform(Y, force='tovector', checks=False)
        if X.ndim == 2:
            X = distance.squareform(X, force='tovector', checks=False)

    elif X.ndim == 2:
        # X is a matrix and Y is a vector: swap the roles so the input that
        # already is a matrix is the one being permuted. The correlation is
        # symmetric in its arguments, so the swap does not change r.
        Y_as_matrix = X
        X, Y = Y, distance.squareform(X, force='tovector', checks=False)

    else:
        # X is a vector and Y is a vector.
        Y_as_matrix = distance.squareform(Y, force='tomatrix', checks=False)

    # Check for size equality.
    if X.shape[0] != Y.shape[0]:
        raise ValueError('X and Y are not of equal size')

    # Check for minimum size.
    if X.shape[0] < 3:
        raise ValueError('X and Y should represent at least 3 objects')

    # Assign the relevant correlation function to the variable 'correlate'.
    if method == 'pearson':
        correlate = pearsonr

    elif method == 'spearman':
        correlate = spearmanr

    elif method == 'kendall':
        correlate = kendalltau

    else:
        raise ValueError(
            'The method should be set to "pearson", "spearman", or "kendall"')

    # Reject an invalid tail before the (potentially long) permutation loop
    # rather than after it.
    if tail not in ('upper', 'lower'):
        raise ValueError('The tail should be set to "upper" or "lower"')

    # Determine the size of the matrix (i.e. number of rows/columns).
    n = Y_as_matrix.shape[0]

    # Row/column indices of the upper triangle in condensed order. Entry
    # (i, j) of the matrix permuted by `order` is entry (order[i], order[j])
    # of the original, so the condensed permuted vector can be read off
    # directly with public NumPy indexing (no private SciPy C wrapper and no
    # n-by-n temporary).
    rows, cols = triu_indices(n, k=1)

    # Number of distinct row/column orders.
    n_orders = factorial(n)

    # Either enumerate all permutations ...
    if perms >= n_orders or perms == 0:

        # Initialize an empty array to store the correlations.
        corrs = zeros(n_orders, dtype=float)

        # The first order produced is the identity, so corrs[0] holds the
        # veridical correlation.
        for k, order in enumerate(permutations(range(n))):
            order = asarray(order)
            Y_permuted = Y_as_matrix[order[rows], order[cols]]
            corrs[k] = correlate(X, Y_permuted)[0]

        # The p-value is computed over the number of orders evaluated.
        perms = n_orders

    # ... or randomly sample from the space of permutations.
    else:

        # Initialize an empty array to store the correlations.
        corrs = zeros(perms, dtype=float)

        # Store the veridical correlation coefficient first.
        corrs[0] = correlate(X, Y)[0]

        for i in range(1, perms):

            # Choose a random order in which to permute rows and columns.
            order = random.permutation(n)

            # Condensed form of the permuted matrix.
            Y_permuted = Y_as_matrix[order[rows], order[cols]]

            # Compute the correlation coefficient and store it to corrs.
            corrs[i] = correlate(X, Y_permuted)[0]

    # Assign veridical correlation to r.
    r = corrs[0]

    # Calculate the empirical p-value for the upper or lower tail.
    if tail == 'upper':
        p = (corrs >= r).sum() / float(perms)
    else:
        p = (corrs <= r).sum() / float(perms)

    # Calculate the standard score.
    m = corrs.mean()
    sd = corrs.std()
    z = (r - m) / sd

    return r, p, z