def clustDist(B1, B2, X=None, criterion='direct_euc'):
    """
    Compute the matrix of pairwise distances between the clusters of two partitions.

    Every column of B1 and of B2 is treated as one cluster (a membership vector over the rows).
    The entry D[k1, k2] of the result is the distance between column k1 of B1 and column k2 of B2.

    Supported criteria:
        'direct_euc':  nu.dist_matrices between the membership columns themselves.
        'centres_euc': nu.dist_matrices between the membership-weighted cluster centres in X.
                       NaN distances are replaced by (largest non-NaN distance + 1); if every
                       distance is NaN, a matrix of ones is returned.
        'union_std':   for each pair of clusters, the element-wise maximum of the two membership
                       columns is taken as their union, and the distance is the union-weighted
                       mean distance of the rows of X from the union's weighted centre.
        'hamming':     nu.dist_matrices with criterion='hamming' between the membership columns.

    :param B1: 2-D membership matrix (rows x K1 clusters)
    :param B2: 2-D membership matrix (rows x K2 clusters)
    :param X: data matrix with one row per row of B1/B2; required for 'centres_euc' and 'union_std'
    :param criterion: one of 'direct_euc', 'centres_euc', 'union_std', 'hamming'
    :return: D, the (K1 x K2) matrix of distances
    :raises ValueError: if the criterion is unknown, or if it needs X and X is None
    """
    # Fail early with a clear message instead of an opaque numpy error deep inside np.dot
    if criterion in ('centres_euc', 'union_std') and X is None:
        raise ValueError(
            "The data matrix X must be provided for the '{}' distance criterion.".format(criterion))

    if criterion == 'direct_euc':
        D = nu.dist_matrices(B1.transpose(), B2.transpose())
    elif criterion == 'centres_euc':
        centres1 = nu.divideaxis(np.dot(B1.transpose(), X), np.sum(B1, axis=0), axis=1)
        centres2 = nu.divideaxis(np.dot(B2.transpose(), X), np.sum(B2, axis=0), axis=1)
        D = nu.dist_matrices(centres1, centres2)
        valid = ~np.isnan(D)
        if np.any(valid):
            # Push undefined distances just beyond the largest defined one
            D[~valid] = np.max(D[valid]) + 1
        else:
            D = np.ones(D.shape)
    elif criterion == 'union_std':
        K1 = B1.shape[1]
        K2 = B2.shape[1]
        D = np.zeros([K1, K2])
        for k1 in range(K1):
            for k2 in range(K2):
                bUnion = np.max([B1[:, k1], B2[:, k2]], axis=0)  # (1)x(Ng)
                unionSize = np.sum(bUnion)
                bCentre = np.dot(bUnion, X) / unionSize  # (1)x(Xdim)
                distsFromCentre = nu.dist_matrices(X, bCentre)  # (Ng)x(1)
                D[k1, k2] = np.dot(bUnion, distsFromCentre) / unionSize  # (1)x(1)
    elif criterion == 'hamming':
        D = nu.dist_matrices(B1.transpose(), B2.transpose(), criterion='hamming')
    else:
        raise ValueError('Invalid distance criterion provided.')

    return D
def normaliseSampleFeatureMat(X, type):
    """
    Xout, codes = normaliseSampleFeatureMat(X, type)

    Normalise a 2-D matrix with a single normalisation code or a sequence of codes.
    The input X is copied first and is never modified.

    type:
        0 (none),
        1 (divide each row by its mean),
        2 (divide each row by its first value),
        3 (take log2; non-positive values become NaN and rows with NaNs are passed to fixnans),
        31 (take log2 after setting all values <= 1.0 to 1.0, i.e. guarantee a non-negative log),
        4 (subtract the row mean and divide by the row std; constant rows are set to zeros),
        5 (divide each row by its sum),
        6 (subtract the row mean),
        7 (divide each row by its max),
        8 (2 to the power X),
        9 (subtract the row min),
        10 (rank down each column: 0 for lowest, then 1, 2, ...; average on ties),
        11 (rank, like 10 but order arbitrarily on ties),
        12 (subtract the row min, then divide by the resulting row max, i.e. the [0 1] range),
        13 (filterBimodal: genes with low values everywhere are set to zeros; bimodal
            distribution is fit to maxima of rows),
        101 (quantile),
        102 (subtract columns (samples) means),
        103 (subtract global mean),
        1000 (automatically detect normalisation via autoNormalise)

    If (type) is a sequence like [3, 1], type (3) is applied over (X) and then type (1) is
    applied over the result, and so on. Codes that match none of the above leave the data as is.

    :param X: 2-D array-like of values
    :param type: a single normalisation code or a list/tuple/array of codes
    :return: (Xout, codes); codes echoes (type) as a numpy array, except that every automatic
             code (1000) is replaced by the concrete codes which were used in its place
    """
    Xout = np.array(X)
    # Stays as the input codes unless auto-normalisation (type 1000) expands it
    codes = np.array(type)

    if isinstance(type, (list, tuple, np.ndarray)):
        # Apply the codes one after another. A single code (1000) may come back as several
        # concrete codes, which are spliced into (codes) at the position j of the original.
        j = 0
        for code in type:
            Xout, codesi = normaliseSampleFeatureMat(Xout, code)
            if np.ndim(codesi) > 0:
                codes[j] = codesi[0]
                codes = np.insert(codes, j + 1, codesi[1:])
                j = j + len(codesi)
            else:
                j = j + 1
        return Xout, codes

    if type == 1:  # 1: Divide by the mean
        Xout = nu.divideaxis(Xout, np.mean(Xout, axis=1), 1)

    elif type == 2:  # 2: Divide by the first value (column index 0)
        Xout = nu.divideaxis(Xout, Xout[:, 0], 1)

    elif type == 3:  # 3: Take log2
        # NaN cannot be stored in an integer array, so work on a float copy
        Xout = Xout.astype(float)
        Xout[Xout <= 0] = float('nan')
        Xout = np.log2(Xout)
        ind1 = np.any(isnan(Xout), axis=1)
        Xout[ind1] = fixnans(Xout[ind1])

    elif type == 31:  # 31: Set all values < 1 to 1 then take log (guarantee a positive log)
        Xout[Xout <= 1] = 1
        Xout = np.log2(Xout)

    elif type == 4:  # 4: Subtract the mean and divide by the std
        Xout = nu.subtractaxis(Xout, np.mean(Xout, axis=1), axis=1)
        rowStds = np.std(Xout, axis=1)
        ConstGenesIndices = rowStds == 0
        Xout = nu.divideaxis(Xout, rowStds, axis=1)
        Xout[ConstGenesIndices] = 0  # constant rows would otherwise be divided by zero

    elif type == 5:  # 5: Divide by the sum
        Xout = nu.divideaxis(Xout, np.sum(Xout, axis=1), axis=1)

    elif type == 6:  # 6: Subtract the mean
        Xout = nu.subtractaxis(Xout, np.mean(Xout, axis=1), axis=1)

    elif type == 7:  # 7: Divide by the maximum
        Xout = nu.divideaxis(Xout, np.max(Xout, axis=1), axis=1)

    elif type == 8:  # 8: (2 to the power X)
        Xout = np.power(2, Xout)

    elif type == 9:  # 9: Subtract the min
        Xout = nu.subtractaxis(Xout, np.min(Xout, axis=1), axis=1)

    elif type == 10:  # 10: Rank: 0 for lowest, then 1, 2, ...; average on ties
        Xout = spmstats.rankdata(Xout, axis=0) - 1

    elif type == 11:  # 11: Rank: 0 for lowest, then 1, 2, ...; arbitrary order on ties
        Xout = np.argsort(np.argsort(Xout, axis=0), axis=0)

    elif type == 12:  # 12: Normalise to the [0 1] range
        Xout = nu.subtractaxis(Xout, np.min(Xout, axis=1), axis=1)
        Xout = nu.divideaxis(Xout, np.max(Xout, axis=1), axis=1)

    elif type == 13:  # 13: Genes with low values everywhere are set to zeros; bimodal distribution is fit to maxima of rows
        Xout = filterBimodal(X)

    # 100s
    elif type == 101:  # 101: quantile
        # Mean of the r-th smallest values over the columns, for every rank r
        av = np.mean(np.sort(Xout, axis=0), axis=1)
        # Rank of every value within its column; each value is replaced by its rank's mean
        II = np.argsort(np.argsort(Xout, axis=0), axis=0)
        Xout = av[II]

    elif type == 102:  # 102: subtract the mean of each sample (column) from it
        Xout = nu.subtractaxis(Xout, np.mean(Xout, axis=0), axis=0)

    elif type == 103:  # 103: subtract the global mean of the data
        # Not in-place: an in-place subtraction of a float mean fails on integer data
        Xout = Xout - np.mean(Xout)

    elif type == 1000:  # 1000: automatically detect normalisation
        codes = autoNormalise(Xout)
        Xout = normaliseSampleFeatureMat(Xout, codes)[0]
        # NOTE(review): 101 is reported ahead of the detected codes although it is not applied
        # here; presumably autoNormalise accounts for it - confirm against autoNormalise.
        codes = np.append([101], codes)

    return Xout, codes
def generateCoPaM(U, relabel_technique='minmin', w=None, X=None, distCriterion='direct_euc', K=0, GDM=None):
    """
    Combine several partitions into a single weighted-average partition matrix (CoPaM).

    The partitions are first ordered by their mean squared error (mn.mseclustersfuzzy), best
    first. The first one seeds the CoPaM; each of the following ones has its clusters relabelled
    against the current CoPaM (relabelClusts) and is then merged into it as a running weighted
    average. An entry of U may itself be a collection of partitions, in which case it is reduced
    to a single partition matrix by a recursive call first.

    :param U: collection of R partitions; each one is a 2-D (rows x K clusters) matrix or a
              nested collection of such matrices
    :param relabel_technique: passed to relabelClusts as (method)
    :param w: weights of the partitions: None, 'all' or 'equal' for unit weights, a single value
              used for all partitions, or one (possibly nested) entry per partition
    :param X: data passed on to mn.mseclustersfuzzy and relabelClusts
    :param distCriterion: passed on to relabelClusts
    :param K: forwarded to nested calls only; it is overwritten by the number of columns of the
              first partition before it is ever read
    :param GDM: boolean matrix (rows x R), or (rows x 1) to be repeated for every partition; row
                i only takes part in the weighted average through the partitions whose GDM
                entry is True for it. None means all True.
    :return: the CoPaM, a float matrix with the shape of a single partition matrix
    :raises ValueError: if a partition has a different number of clusters from the first one
    """
    # Helping functions
    def calwmeans(w):
        # Collapse (possibly nested) weights into one mean weight per top-level entry
        wm = [
            np.mean(calwmeans(ww)) if isinstance(ww, (list, tuple, np.ndarray)) else np.mean(ww)
            for ww in w
        ]
        return np.array(wm)

    def orderpartitions(U, method='rand', X=None, GDM=None):
        # Returns (order of the partitions, their sorted scores or None)
        if method == 'rand':
            return np.random.permutation(range(len(U))), None
        elif method == 'mn':
            # TODO: Implement ranking partitions based on M-N plots
            raise NotImplementedError(
                'Ranking partitions based on the M-N plots logic has not been implemented yet.'
            )
        elif method == 'mse':
            R = len(U)
            mses = np.zeros(R)
            for r in range(R):
                if isinstance(U[r][0][0], (list, tuple, np.ndarray)):
                    # Nested collection of partitions: score it by the mean of its members
                    mses[r] = np.mean(
                        orderpartitions(U[r], method=method, X=X, GDM=GDM)[1])
                else:
                    mses[r] = np.mean([
                        mn.mseclustersfuzzy(X, U[r], donormalise=False, GDM=GDM)
                    ])
            order = np.argsort(mses)
            return order, mses[order]
        raise ValueError('Invalid partitions ordering method: {}.'.format(method))

    # Fix parameters
    Uloc = ds.listofarrays2arrayofarrays(U)
    R = len(Uloc)
    if GDM is None:
        GDMloc = np.ones([Uloc[0].shape[0], R], dtype=bool)
    elif GDM.shape[1] == 1 and R > 1:
        GDMloc = np.tile(GDM, [1, R])
    else:
        GDMloc = np.array(GDM)
    # isinstance rather than (w is str): the identity test against the type never matches a string
    if w is None or (isinstance(w, str) and w in ['all', 'equal']):
        w = np.ones(R)
    elif ds.numel(w) == 1:
        w = np.array([w for i in range(R)])
    wmeans = calwmeans(w)

    # Work!
    if GDM is None:
        permR = orderpartitions(Uloc, method='mse', X=X, GDM=None)[0]
    else:
        permR = orderpartitions(Uloc, method='mse', X=X, GDM=GDMloc)[0]
    Uloc = Uloc[permR]
    if GDMloc.shape[1] > 1:
        GDMloc = GDMloc[:, permR]
    wmeans = wmeans[permR]
    # (w) itself stays in the original order, so the weights of the partition now at position r
    # are w[permR[r]]

    if isinstance(Uloc[0][0][0], (list, tuple, np.ndarray)):
        Uloc[0] = generateCoPaM(Uloc[0], relabel_technique=relabel_technique, w=w[permR[0]], X=X,
                                distCriterion=distCriterion, K=K, GDM=GDMloc)
    CoPaM = np.array(Uloc[0], dtype=float)
    K = CoPaM.shape[1]
    for r in range(1, R):
        if isinstance(Uloc[r][0][0], (list, tuple, np.ndarray)):
            Uloc[r] = generateCoPaM(Uloc[r], relabel_technique=relabel_technique, w=w[permR[r]], X=X,
                                    distCriterion=distCriterion, K=K, GDM=GDMloc)
        if Uloc[r].shape[1] != K:
            raise ValueError(
                'Inequal numbers of clusters in the partition {}.'.format(r))

        Uloc[r] = relabelClusts(CoPaM, Uloc[r], method=relabel_technique, X=X,
                                distCriterion=distCriterion)

        # Total weight merged into every row so far: (Mxr) * (rx1) = (Mx1)
        dotprod = np.dot(GDMloc[:, 0:r], wmeans[0:r].transpose())
        merged = dotprod > 0
        # Undo the previous normalisation, add the new weighted partition, then renormalise
        CoPaM[merged] = nu.multiplyaxis(CoPaM[merged], dotprod[merged], axis=1)
        CoPaM[merged] += wmeans[r] * Uloc[r][merged]
        dotprod = np.dot(GDMloc[:, 0:(r + 1)], wmeans[0:(r + 1)].transpose())
        merged = dotprod > 0
        CoPaM[merged] = nu.divideaxis(CoPaM[merged], dotprod[merged], axis=1)

    return CoPaM
def correcterrors_weighted_outliers2(B, X, GDM, clustdists=None, stds=3, smallestClusterSize=11):
    """
    Reassign genes to clusters according to their absolute deviations from the clusters' medians.

    For every dataset l and every cluster k, the cluster's centre is the per-dimension median of
    its genes, and the absolute deviations of the clusters' genes from their own centres are
    pooled over all clusters. Pooled deviations which are more than (stds) weighted standard
    deviations above the weighted mean are treated as outliers and discarded. A gene then belongs
    to cluster k in dataset l if its absolute deviation from the centre of k is, in every
    dimension, not larger than the largest remaining pooled deviation. The final membership of a
    gene requires belonging in all datasets; datasets which do not include a gene (per GDM) do
    not exclude it. Genes which end up in many clusters are kept in the one with the smallest
    index, and clusters smaller than (smallestClusterSize) are removed.

    :param B: boolean membership matrix (Ng genes x K clusters); it is used as a row mask
    :param X: collection of L datasets; dataset l is a 2-D matrix whose rows are assumed to be
              the genes for which GDM[:, l] is True, in the same order - TODO confirm
    :param GDM: boolean matrix (Ng genes x L datasets) stating which genes each dataset includes
    :param clustdists: one value per cluster; cluster k is weighted by min(clustdists) /
                       clustdists[k]. None gives unit weights to all clusters.
    :param stds: number of weighted standard deviations above which a deviation is an outlier
    :param smallestClusterSize: clusters with fewer genes than this are dropped from the result
    :return: boolean membership matrix (Ng genes x remaining clusters)
    """
    Bloc = np.array(B)
    Xloc = ds.listofarrays2arrayofarrays(X)

    [Ng, K] = Bloc.shape  # Ng genes and K clusters
    L = Xloc.shape[0]  # L datasets

    # Normalise clustdists to provide weights. If not provided, make it unity for all
    if clustdists is None:
        clustweights = np.ones(K)
    else:
        clustweights = np.min(clustdists) / clustdists

    # Find clusters' centres (Cmeans) and the absolute shifted clusters genes (SCG)
    Cmeans = np.array([None] * L, dtype=object)
    SCG = np.array([None] * L, dtype=object)
    for l in range(L):
        Bl = Bloc[GDM[:, l], :]  # Memberships of the genes included in this dataset
        Mstar = np.sum(np.sum(Bl, axis=0))  # M* is the total number of genes over all clusters
        Cmeans[l] = np.zeros([K, Xloc[l].shape[1]])  # K clusters x D dimensions
        SCG[l] = np.zeros([Mstar, Xloc[l].shape[1]])  # M* genes x D dimensions
        w = np.zeros(Mstar)  # M* genes
        gi = 0
        for k in range(K):
            members = Xloc[l][Bl[:, k], :]
            Cmeans[l][k] = np.median(members, axis=0)
            csize = np.sum(Bl[:, k])
            tmpSCG = nu.subtractaxis(members, Cmeans[l][k], axis=0)
            SCG[l][gi:(gi + csize), :] = np.abs(tmpSCG)
            w[gi:(gi + csize)] = clustweights[k]
            gi += csize

        # Remove all zeros genes (rows of SCG[l]) along with their weights, so that the weights
        # keep matching the rows which they belong to
        keep = np.any(SCG[l], axis=1)
        SCG[l] = SCG[l][keep]
        w = w[keep]

        # The rows are deliberately left unsorted: sorting every column on its own would detach
        # the values from their genes' weights, and nothing below depends on the rows' order
        SCGmeans = np.average(SCG[l], weights=w, axis=0)
        # NOTE(review): the keyword is spelled 'axix' here; it is left as is because the
        # signature of st.weighted_std_axis is not visible from this function - confirm.
        SCGstds = st.weighted_std_axis(SCG[l], weights=w, axix=0)
        SCGouts = nu.divideaxis(nu.subtractaxis(SCG[l], SCGmeans, axis=0), SCGstds, axis=0)  # No. of stds away
        SCGouts = SCGouts > stds  # TRUE for outliers and FALSE for others (bool: M* genes x D dimensions)
        SCG[l][SCGouts] = 0.0  # Set the outlier values to zeros so they do not affect decisions later on

    # Find who belongs
    belongs = np.ones([Ng, K, L], dtype=bool)  # Ng genes x K clusters x L datasets
    for l in range(L):
        worst = np.max(SCG[l], axis=0)  # Largest accepted deviation in every dimension
        for k in range(K):
            devs = np.abs(Xloc[l] - Cmeans[l][k])  # genes of dataset l x D dimensions
            belongs[GDM[:, l], k, l] = np.all(devs <= worst, axis=1)

    # Include in clusters genes which belongs everywhere
    B_out = np.all(belongs, axis=2)

    # Solve genes included in two clusters. The strategy is fixed to 2; strategy 1 is kept for
    # reference but is currently disabled.
    solution = 2
    if solution == 1:
        # Genes included in two clusters, include them in the closest in terms of its worst distance to any of the clusters
        # (guarantee that the worst belongingness of a gene to a cluster is optimised)
        f = np.nonzero(np.sum(B_out, axis=1) > 1)[0]
        for fi in f:
            ficlusts = np.nonzero(B_out[fi])[0]  # Clusters competing over gene fi
            fidatasets = np.nonzero(GDM[fi])[0]  # Datasets that have gene fi
            localdists = np.zeros([len(ficlusts), len(fidatasets)])  # (Clusts competing) x (datasets that have fi)
            for l in range(len(fidatasets)):
                ll = fidatasets[l]  # Actual dataset index
                fi_ll = np.sum(GDM[:fi, ll])  # Index of fi in this Xloc[ll]
                localdists[:, l] = nu.dist_matrices(Cmeans[ll][ficlusts], Xloc[ll][fi_ll]).reshape(
                    [len(ficlusts)])
            localdists = np.max(localdists, axis=1)  # (Clusts competing) x 1
            ficlosest = np.argmin(localdists)  # Closest cluster
            B_out[fi] = False
            B_out[fi, ficlusts[ficlosest]] = True
    elif solution == 2:
        # Genes included in two clusters, include them in the earlier cluster (smallest k)
        f = np.nonzero(np.sum(B_out, axis=1) > 1)[0]
        for fi in f:
            ficlusts = np.nonzero(B_out[fi])[0]  # Clusters competing over gene fi, in ascending order
            B_out[fi] = False
            B_out[fi, ficlusts[0]] = True

    # Remove clusters smaller than minimum cluster size
    ClusterSizes = np.sum(B_out, axis=0)
    B_out = B_out[:, ClusterSizes >= smallestClusterSize]

    return B_out