import numpy as np
from scipy.spatial.distance import is_valid_dm


def spread(d, t):
    """
    Calculate the spread between two sets of compounds.

    Given a matrix containing distances between two sets of compounds, A
    and B, calculate the fraction of compounds in set A that are closer
    than t to any compound in set B. If the distance matrix is symmetric
    and has zero diagonals, the diagonal values are ignored.

    Parameters
    ----------
    d : ndarray
        Distance matrix with compounds from set A on first axis.
    t : float
        Distance threshold.
    """
    # Test for a symmetric distance matrix with a zero diagonal;
    # in that case the diagonal values should be ignored.
    if is_valid_dm(d):
        d = np.copy(d)
        d[np.diag_indices_from(d)] = np.inf
    s = np.mean(np.any(d < t, axis=1))
    return s
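A minimal usage sketch for the spread function above, assuming it and its imports are in scope; the coordinates, set sizes and threshold are invented for illustration:

import numpy as np
from scipy.spatial.distance import cdist

# Hypothetical coordinates for two small compound sets A and B.
A = np.array([[0.0, 0.0], [1.0, 1.0], [5.0, 5.0]])
B = np.array([[0.1, 0.1], [4.0, 4.0]])

# Rectangular A-vs-B distance matrix; it is not a symmetric self-distance
# matrix, so is_valid_dm() returns False and no diagonal masking happens.
d = cdist(A, B)

# Fraction of compounds in A that lie within 1.3 of some compound in B.
print(spread(d, 1.3))  # 2 of the 3 compounds in A qualify -> ~0.667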
def test_fingerprint_pairwise_similarity(self):
    """
    Test pairwise similarity matrix creation.
    """
    simmat = mol_fingerprint_pairwise_similarity(self.fps, self.toolkit_name)
    self.assertTrue(hr.is_valid_dm(simmat))
    self.assertEqual(hr.num_obs_dm(simmat), 10)
from matplotlib.pyplot import show
from scipy.cluster.hierarchy import dendrogram, linkage
from scipy.spatial import distance


def plot_clustering(D, groups):
    """Given the pairwise distance matrix D, compute a hierarchical
    clustering of the groups and plot it as a dendrogram."""
    # Symmetrize D so that it passes SciPy's distance-matrix validation.
    D = (D + D.transpose()) / 2
    assert distance.is_valid_dm(D)
    Z = linkage(D)
    # Generate a set of labels as well.
    labels = [g.name for g in groups]
    dendrogram(Z, labels=labels, leaf_rotation=90, leaf_font_size=20)
    show()
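A small usage sketch for plot_clustering; the Group namedtuple below is a hypothetical stand-in for whatever group objects the real code passes in (only a .name attribute is needed), and the feature vectors are random:

from collections import namedtuple

import numpy as np
from scipy.spatial.distance import pdist, squareform

Group = namedtuple('Group', 'name')  # hypothetical stand-in for the real group objects
groups = [Group('a'), Group('b'), Group('c'), Group('d')]

# Build a symmetric distance matrix from made-up feature vectors.
features = np.random.rand(4, 3)
D = squareform(pdist(features))

plot_clustering(D, groups)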
from scipy.spatial import distance


def most_and_least_similar_pairs(distance_matrix):
    # Accept either a condensed or a redundant (square) distance matrix.
    if not distance.is_valid_dm(distance_matrix):
        if not distance.is_valid_y(distance_matrix):
            raise ValueError('Invalid distance matrix. Please supply a '
                             'condensed or redundant distance matrix.')
        distance_matrix = distance.squareform(distance_matrix, force='tomatrix')
    # Track the smallest (most similar) and largest (most dissimilar)
    # off-diagonal distances seen so far.
    similar_score = float('inf')
    dissimilar_score = float('-inf')
    similar_indices = None
    dissimilar_indices = None
    n = distance_matrix.shape[0]
    for i in range(n):
        for j in range(i + 1, n):
            score = distance_matrix[i, j]
            if score < similar_score:
                similar_score = score
                similar_indices = (i, j)
            if score > dissimilar_score:
                dissimilar_score = score
                dissimilar_indices = (i, j)
    return similar_score, similar_indices, dissimilar_score, dissimilar_indices
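A quick sketch of calling most_and_least_similar_pairs with a condensed distance vector from pdist; the points are made up, and the expected output is noted in comments:

import numpy as np
from scipy.spatial.distance import pdist

points = np.array([[0.0, 0.0], [0.0, 1.0], [3.0, 4.0], [0.1, 0.0]])
condensed = pdist(points)  # accepted because is_valid_y() is True

s_score, s_pair, d_score, d_pair = most_and_least_similar_pairs(condensed)
print(s_pair, s_score)  # (0, 3) 0.1 -- the closest pair
print(d_pair, d_score)  # (0, 2) 5.0 -- the farthest pair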
def compute_correlation_coefficient_matrix(self):
    import scipy.spatial.distance as ssd

    correlation_matrix = self.cosym.target.rij_matrix

    for i in range(correlation_matrix.all()[0]):
        correlation_matrix[i, i] = 1

    # Clip values of the correlation matrix to account for floating point errors.
    correlation_matrix.set_selected(correlation_matrix < -1, -1)
    correlation_matrix.set_selected(correlation_matrix > 1, 1)
    diffraction_dissimilarity = 1 - correlation_matrix

    dist_mat = diffraction_dissimilarity.as_numpy_array()
    assert ssd.is_valid_dm(dist_mat, tol=1e-12)
    # Convert the redundant n*n square matrix form into a condensed nC2 array.
    dist_mat = ssd.squareform(dist_mat, checks=False)

    linkage_matrix = hierarchy.linkage(dist_mat, method="average")

    return correlation_matrix, linkage_matrix
from itertools import permutations
from math import factorial, sqrt

from numpy import asarray, random, zeros
from scipy.spatial import distance
from scipy.stats import rankdata


def Test(X, Y, perms=10000, method='pearson', tail='upper'):
    """
    Takes two distance matrices (either redundant matrices or condensed
    vectors) and performs a Mantel test. The Mantel test is a significance
    test of the correlation between two distance matrices.

    Parameters
    ----------
    X : array_like
        First distance matrix (condensed or redundant).
    Y : array_like
        Second distance matrix (condensed or redundant), where the order of
        elements corresponds to the order of elements in the first matrix.
    perms : int, optional
        The number of permutations to perform (default: 10000). A larger
        number gives more reliable results but takes longer to run. If the
        actual number of possible permutations is smaller, the program will
        enumerate all permutations. Enumeration can be forced by setting
        this argument to 0.
    method : str, optional
        Type of correlation coefficient to use; either 'pearson' or
        'spearman' (default: 'pearson').
    tail : str, optional
        Which tail to test in the calculation of the empirical p-value;
        either 'upper' or 'lower' (default: 'upper').

    Returns
    -------
    r : float
        Veridical correlation
    p : float
        Empirical p-value
    z : float
        Standard score (z-score)
    """

    # Ensure X and Y are arrays.
    X = asarray(X, dtype=float)
    Y = asarray(Y, dtype=float)

    # Check that X and Y are valid distance matrices/vectors.
    if not distance.is_valid_dm(X) and not distance.is_valid_y(X):
        raise ValueError('X is not a valid distance matrix')
    if not distance.is_valid_dm(Y) and not distance.is_valid_y(Y):
        raise ValueError('Y is not a valid distance matrix')

    # If X or Y is a matrix, condense it to a vector.
    if len(X.shape) == 2:
        X = distance.squareform(X, force='tovector', checks=False)
    if len(Y.shape) == 2:
        Y = distance.squareform(Y, force='tovector', checks=False)

    # Check for size equality.
    if X.shape[0] != Y.shape[0]:
        raise ValueError('X and Y are not of equal size')

    # Check for minimum size.
    if X.shape[0] < 3:
        raise ValueError('X and Y should represent at least 3 objects')

    # If Spearman correlation is requested, convert X and Y to ranks.
    if method == 'spearman':
        X = rankdata(X)
        Y = rankdata(Y)
    elif method != 'pearson':
        raise ValueError('The method should be set to "pearson" or "spearman"')

    # Most parts of the correlation coefficient will be the same for every
    # permutation and can therefore be computed outside the loop.
    X_res = X - X.mean()  # X residuals
    Y_res = Y - Y.mean()  # Y residuals
    X_ss = (X_res * X_res).sum()  # X sum-of-squares
    Y_ss = (Y_res * Y_res).sum()  # Y sum-of-squares
    denominator = sqrt(X_ss * Y_ss)  # Denominator of the correlation coefficient

    # Although Y_res will be the same set of numbers on every permutation,
    # the order will be different each time. Therefore, we reformat Y_res as
    # a matrix so that we can take matrix permutations of the Y residuals.
    Y_res_as_matrix = distance.squareform(Y_res, force='tomatrix', checks=False)

    # Determine the size of the matrix (i.e. number of rows/columns).
    n = Y_res_as_matrix.shape[0]

    # Initialize an empty array to store temporary vector permutations of Y_res.
    Y_res_permuted = zeros(Y_res.shape[0], dtype=float)

    # Either enumerate all permutations ...
    if perms >= factorial(n) or perms == 0:

        # Initialize an empty array to store the correlations.
        corrs = zeros(factorial(n), dtype=float)

        # Enumerate all permutations of row/column orders.
        orders = permutations(range(n))

        perms = 0
        for order in orders:

            # Take a permutation of the matrix.
            Y_res_as_matrix_permuted = Y_res_as_matrix[order, :][:, order]

            # Condense the permuted version of the matrix. Rather than use
            # distance.squareform(), we call directly into the C wrapper for speed.
            distance._distance_wrap.to_vector_from_squareform_wrap(
                Y_res_as_matrix_permuted, Y_res_permuted)

            # Compute the correlation coefficient and store it to corrs.
            corrs[perms] = (X_res * Y_res_permuted).sum() / denominator

            perms += 1

    # ... or randomly sample from the space of permutations.
    else:

        # Initialize an empty array to store the correlations.
        corrs = zeros(perms, dtype=float)

        # Store the veridical correlation coefficient first.
        corrs[0] = (X_res * Y_res).sum() / denominator

        for i in range(1, perms):

            # Choose a random order in which to permute the rows and columns.
            order = random.permutation(n)

            # Take a permutation of the matrix.
            Y_res_as_matrix_permuted = Y_res_as_matrix[order, :][:, order]

            # Condense the permuted version of the matrix. Rather than use
            # distance.squareform(), we call directly into the C wrapper for speed.
            distance._distance_wrap.to_vector_from_squareform_wrap(
                Y_res_as_matrix_permuted, Y_res_permuted)

            # Compute the correlation coefficient and store it to corrs.
            corrs[i] = (X_res * Y_res_permuted).sum() / denominator

    # Assign veridical correlation to r.
    r = corrs[0]

    # Calculate the empirical p-value for the upper or lower tail.
    if tail == 'upper':
        p = (corrs >= r).sum() / float(perms)
    elif tail == 'lower':
        p = (corrs <= r).sum() / float(perms)
    else:
        raise ValueError('The tail should be set to "upper" or "lower"')

    # Calculate the standard score.
    m = corrs.mean()
    sd = corrs.std()
    z = (r - m) / sd

    return r, p, z
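A smoke-test sketch for the Test function above, assuming it and its module-level imports are in scope and that the installed SciPy still exposes the private _distance_wrap.to_vector_from_squareform_wrap helper the function relies on; the data are random, so the exact r, p and z values are illustrative only:

import numpy as np
from scipy.spatial.distance import pdist

rng = np.random.RandomState(0)
points = rng.rand(6, 2)

# Two related condensed distance vectors: Y is a noisy copy of X.
X = pdist(points)
Y = X + rng.normal(scale=0.05, size=X.shape)
Y[Y < 0] = 0

# With only 6 objects, 10000 >= 6! = 720, so all permutations are enumerated.
r, p, z = Test(X, Y, perms=10000, method='pearson', tail='upper')
print(r, p, z)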
Xaverage = []
conc_list = ['0.0', '0.1', '0.2', '0.4', '0.6', '0.8', '0.9', '1.0']
#conc_list = ['1.0']
numConc = len(conc_list)
bFirst = True
for e in range(numConc):
    in_file = './fitted_' + metric + '_' + conc_list[e] + '_matrix.dat'
    out_file = 'matrix_' + metric + '_' + conc_list[e] + '.pdf'
    X = gs.load_matrix(in_file)

    # Rescale and symmetrize matrix.
    Xp = np.maximum(np.zeros(X.shape), X)
    Xp = 0.5 * (Xp + Xp.T)
    print(is_valid_dm(Xp))

    if bFirst:
        bFirst = False
        Xaverage = Xp
    else:
        Xaverage = Xaverage + Xp

    plt.clf()
    ax1 = plt.subplot2grid((1, 7), (0, 0), colspan=5)
    ax2 = plt.subplot2grid((1, 7), (0, 5), colspan=2)
    plt.subplots_adjust(left=0.04, right=0.99, bottom=0.03, top=0.76,
                        wspace=3.0, hspace=None)
    plt.figtext(s='Prot:RNA-ratio', x=0.68, y=0.95,
                horizontalalignment='center', verticalalignment='center',
                **label_font1)
    plt.figtext(s='1.0:%s' % conc_list[e], x=0.68, y=0.90,
                horizontalalignment='center', verticalalignment='center',
                **label_font2)
from itertools import permutations
from math import factorial

from numpy import asarray, random, zeros
from scipy.spatial import distance
from scipy.stats import kendalltau, pearsonr, spearmanr


def Test(X, Y, perms=10000, method='pearson', tail='upper'):
    """
    Takes two distance matrices (either redundant matrices or condensed
    vectors) and performs a Mantel test. The Mantel test is a significance
    test of the correlation between two distance matrices.

    Parameters
    ----------
    X : array_like
        First distance matrix (condensed or redundant).
    Y : array_like
        Second distance matrix (condensed or redundant), where the order of
        elements corresponds to the order of elements in the first matrix.
    perms : int, optional
        The number of permutations to perform (default: 10000). A larger
        number gives more reliable results but takes longer to run. If the
        actual number of possible permutations is smaller, the program will
        enumerate all permutations. Enumeration can be forced by setting
        this argument to 0.
    method : str, optional
        Type of correlation coefficient to use; either 'pearson',
        'spearman', or 'kendall' (default: 'pearson'). N.B. Kendall's tau
        is considerably slower to compute, so it can be impractical for
        large matrices.
    tail : str, optional
        Which tail to test in the calculation of the empirical p-value;
        either 'upper' or 'lower' (default: 'upper').

    Returns
    -------
    r : float
        Veridical correlation
    p : float
        Empirical p-value
    z : float
        Standard score (z-score)
    """

    # Ensure X and Y are arrays.
    X = asarray(X, dtype=float)
    Y = asarray(Y, dtype=float)

    # Check that X and Y are valid distance matrices/vectors.
    if not distance.is_valid_dm(X) and not distance.is_valid_y(X):
        raise ValueError('X is not a valid distance matrix')
    if not distance.is_valid_dm(Y) and not distance.is_valid_y(Y):
        raise ValueError('Y is not a valid distance matrix')

    # Figure out whether X and Y are matrices or vectors and convert both to
    # vectors and one to a matrix (as needed).

    # X is vector and Y is vector
    if len(X.shape) == 1 and len(Y.shape) == 1:
        Y_as_matrix = distance.squareform(Y, force='tomatrix', checks=False)

    # X is vector and Y is matrix
    elif len(X.shape) == 1 and len(Y.shape) == 2:
        Y_as_matrix = Y
        Y = distance.squareform(Y, force='tovector', checks=False)

    # X is matrix and Y is vector: swap X and Y so that Y is always the one
    # permuted in matrix form.
    elif len(X.shape) == 2 and len(Y.shape) == 1:
        Y_as_matrix = X
        X, Y = Y, distance.squareform(X, force='tovector', checks=False)

    # X is matrix and Y is matrix
    elif len(X.shape) == 2 and len(Y.shape) == 2:
        Y_as_matrix = Y
        X = distance.squareform(X, force='tovector', checks=False)
        Y = distance.squareform(Y, force='tovector', checks=False)

    # Check for size equality.
    if X.shape[0] != Y.shape[0]:
        raise ValueError('X and Y are not of equal size')

    # Check for minimum size.
    if X.shape[0] < 3:
        raise ValueError('X and Y should represent at least 3 objects')

    # Assign the relevant correlation function to the variable 'correlate'.
    if method == 'pearson':
        correlate = pearsonr
    elif method == 'spearman':
        correlate = spearmanr
    elif method == 'kendall':
        correlate = kendalltau
    else:
        raise ValueError(
            'The method should be set to "pearson", "spearman", or "kendall"')

    # Determine the size of the matrix (i.e. number of rows/columns).
    n = Y_as_matrix.shape[0]

    # Initialize an empty array to store temporary vector permutations of Y.
    Y_permuted = zeros(Y.shape[0], dtype=float)

    # Either enumerate all permutations ...
    if perms >= factorial(n) or perms == 0:

        # Initialize an empty array to store the correlations.
        corrs = zeros(factorial(n), dtype=float)

        # Enumerate all permutations of row/column orders.
        orders = permutations(range(n))

        perms = 0
        for order in orders:

            # Take a permutation of the matrix.
            Y_as_matrix_permuted = Y_as_matrix[order, :][:, order]

            # Condense the permuted version of the matrix. Rather than use
            # distance.squareform(), we call directly into the C wrapper for speed.
            distance._distance_wrap.to_vector_from_squareform_wrap(
                Y_as_matrix_permuted, Y_permuted)

            # Compute the correlation coefficient and store it to corrs.
            corrs[perms] = correlate(X, Y_permuted)[0]

            perms += 1

    # ... or randomly sample from the space of permutations.
    else:

        # Initialize an empty array to store the correlations.
        corrs = zeros(perms, dtype=float)

        # Store the veridical correlation coefficient first.
        corrs[0] = correlate(X, Y)[0]

        for i in range(1, perms):

            # Choose a random order in which to permute the rows and columns.
            order = random.permutation(n)

            # Take a permutation of the matrix.
            Y_as_matrix_permuted = Y_as_matrix[order, :][:, order]

            # Condense the permuted version of the matrix. Rather than use
            # distance.squareform(), we call directly into the C wrapper for speed.
            distance._distance_wrap.to_vector_from_squareform_wrap(
                Y_as_matrix_permuted, Y_permuted)

            # Compute the correlation coefficient and store it to corrs.
            corrs[i] = correlate(X, Y_permuted)[0]

    # Assign veridical correlation to r.
    r = corrs[0]

    # Calculate the empirical p-value for the upper or lower tail.
    if tail == 'upper':
        p = (corrs >= r).sum() / float(perms)
    elif tail == 'lower':
        p = (corrs <= r).sum() / float(perms)
    else:
        raise ValueError('The tail should be set to "upper" or "lower"')

    # Calculate the standard score.
    m = corrs.mean()
    sd = corrs.std()
    z = (r - m) / sd

    return r, p, z
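The same kind of smoke test for this variant, under the same assumptions about the function, its imports, and the private SciPy _distance_wrap helper being available; a redundant matrix and a condensed vector are mixed to exercise the shape handling, and Spearman correlation is requested:

import numpy as np
from scipy.spatial.distance import pdist, squareform

rng = np.random.RandomState(1)
points = rng.rand(6, 2)

X = squareform(pdist(points))                        # redundant (square) matrix
Y = pdist(points) + rng.normal(scale=0.05, size=15)  # condensed vector
Y[Y < 0] = 0

r, p, z = Test(X, Y, method='spearman')
print(r, p, z)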