def clustDist(B1, B2, X=None, criterion='direct_euc'): if criterion == 'direct_euc': D = nu.dist_matrices(B1.transpose(), B2.transpose()) elif criterion == 'centres_euc': centres1 = nu.divideaxis(np.dot(B1.transpose(), X), np.sum(B1, axis=0), axis=1) centres2 = nu.divideaxis(np.dot(B2.transpose(), X), np.sum(B2, axis=0), axis=1) D = nu.dist_matrices(centres1, centres2) if np.any(~np.isnan(D)): m = np.max(D[~np.isnan(D)]) D[np.isnan(D)] = m + 1 else: D = np.ones(D.shape) elif criterion == 'union_std': K1 = B1.shape[1] K2 = B2.shape[1] D = np.zeros([K1, K2]) for k1 in range(K1): for k2 in range(K2): bUnion = np.max([B1[:,k1], B2[:,k2]], axis = 0) # (1)x(Ng) bCentre = np.dot(bUnion, X) / np.sum(bUnion) # (1)x(Xdim) distsFromCentre = nu.dist_matrices(X, bCentre) # (Ng)x(1) D[k1, k2] = np.dot(bUnion, distsFromCentre) / np.sum(bUnion) # (1)x(1) elif criterion == 'hamming': D = nu.dist_matrices(B1.transpose(), B2.transpose(), criterion='hamming') else: raise ValueError('Invalid distance criterion provided.') return D
def correcterrors_withinworse(B, X, GDM, falsepositivestrimmed=0.01): Bloc = np.array(B) Xloc = ds.listofarrays2arrayofarrays(X) [Ng, K] = Bloc.shape # Ng genes and K clusters L = Xloc.shape[0] # L datasets # Find clusters' means (Cmeans), absolute shifter clusters genes (SCG), # and the emperical CDF functions for them (cdfs) Cmeans = np.array([None] * L, dtype=object) SCG = np.array([None] * L, dtype=object) for l in range(L): Cmeans[l] = np.zeros([K, Xloc[l].shape[1]]) # K clusters x D dimensions SCG[l] = np.zeros( [np.sum(np.sum(Bloc[GDM[:, l], :], axis=0)), Xloc[l].shape[1]]) # M* genes x D dimensions ... # (M* are all # genes in any cluster) gi = 0 for k in range(K): Cmeans[l][k] = np.median(Xloc[l][Bloc[GDM[:, l], k], :], axis=0) csize = np.sum(Bloc[GDM[:, l], k]) tmpSCG = nu.subtractaxis(Xloc[l][Bloc[GDM[:, l], k], :], Cmeans[l][k], axis=0) SCG[l][gi:(gi + csize), :] = np.abs(tmpSCG) gi += csize SCG[l] = SCG[l][np.any( SCG[l], axis=1)] # Remove all zeros genes (rows of SCG[l]) SCG[l] = np.sort(SCG[l], axis=0) if falsepositivestrimmed > 0: trimmed = int(falsepositivestrimmed * SCG[l].shape[0]) if trimmed > 0: SCG[l] = SCG[l][ 0:-trimmed] # trim the lowest (trimmed) rows in SCG # Helping function def iswithinworse(ref, x): return x <= np.max(ref) # Find who belongs belongs = np.ones([Ng, K, L], dtype=bool) # Ng genes x K clusters x L datasets for l in range(L): for k in range(K): for d in range(Xloc[l].shape[1]): tmpX = np.abs(Xloc[l][:, d] - Cmeans[l][k, d]) belongs[GDM[:, l], k, l] &= iswithinworse(SCG[l][:, d], tmpX) # Include in clusters genes which belongs everywhere B_out = np.all(belongs, axis=2) # Genes included in two clusters, include them in the closest in terms of its worst distance to any of the clusters # (guarrantee that the worst belongingness of a gene to a cluster is optimised) f = np.nonzero(np.sum(B_out, axis=1) > 1)[0] for fi in f: ficlusts = np.nonzero(B_out[fi])[0] # Clusters competing over gene fi fidatasets = np.nonzero(GDM[fi])[0] # Datasets that have gene fi localdists = np.zeros( [len(ficlusts), len(fidatasets)]) # (Clusts competing) x (datasets that have fi) for l in range(len(fidatasets)): ll = fidatasets[l] # Actual dataset index fi_ll = np.sum(GDM[:fi, ll]) # Index of fi in this Xloc[ll] localdists[:, l] = nu.dist_matrices( Cmeans[ll][ficlusts], Xloc[ll][fi_ll]).reshape([len(ficlusts)]) localdists = np.max(localdists, axis=1) # (Clusts competing) x 1 ficlosest = np.argmin(localdists) # Closest cluster B_out[fi] = False B_out[fi, ficlusts[ficlosest]] = True return B_out
def optimise_tukey_sqrtSCG(B, X, GDM, clustdists=None, smallestClusterSize=11, tails=1, Q3s=2): Bloc = np.array(B) Xloc = ds.listofarrays2arrayofarrays(X) [Ng, K] = Bloc.shape # Ng genes and K clusters L = Xloc.shape[0] # L datasets # Normalise clustdists to provide weights. If not provided, make it unity for all if clustdists is None: clustdistsloc = np.ones(K) else: clustdistsloc = [c for c in clustdists] # Find clusters' means (Cmeans), absolute shifted clusters genes (SCG), # and the emperical CDF functions for them (cdfs) Cmeans = np.array([None] * L, dtype=object) SCG = np.array([None] * L, dtype=object) Cgood = mnplotsdistancethreshold(clustdistsloc, method='largestgap') for l in range(L): Cmeans[l] = np.zeros([K, Xloc[l].shape[1]]) # K clusters x D dimensions SCG[l] = np.zeros( [np.sum(np.sum(Bloc[GDM[:, l], :], axis=0)), Xloc[l].shape[1]]) # M* genes x D dimensions ... w = np.zeros(np.sum(np.sum(Bloc[GDM[:, l], :], axis=0))) # M* genes # (M* are all # genes in any cluster) gi = 0 for k in range(K): Cmeans[l][k] = np.median(Xloc[l][Bloc[GDM[:, l], k], :], axis=0) if k in Cgood: csize = np.sum(Bloc[GDM[:, l], k]) tmpSCG = nu.subtractaxis(Xloc[l][Bloc[GDM[:, l], k], :], Cmeans[l][k], axis=0) SCG[l][gi:(gi + csize), :] = np.abs(tmpSCG) gi += csize SCG[l] = SCG[l][np.any( SCG[l], axis=1)] # Remove all zeros genes (rows of SCG[l]) if ds.numel(SCG[l] > 0): if tails == 1: Q3 = np.percentile(SCG[l], q=75, axis=0) thresh = Q3s * Q3 SCGouts = SCG[l] > np.array( [thresh for ii in range(0, SCG[l].shape[0])]) SCG[l][ SCGouts] = 0.0 # Set the outlier values to zeros so they do not affect decisions later on elif tails == 2: Q1 = np.percentile(np.sqrt(SCG[l]), q=25, axis=0) Q3 = np.percentile(np.sqrt(SCG[l]), q=75, axis=0) IQR = np.subtract(Q3, Q1) thresh = np.add(Q3, 1.5 * IQR) SCGouts = np.sqrt(SCG[l]) > np.array( [thresh for ii in range(0, SCG[l].shape[0])]) SCG[l][ SCGouts] = 0.0 # Set the outlier values to zeros so they do not affect decisions later on else: raise ValueError( 'Invalid number of tails. It should be either 1 or 2.') else: SCG[l] = np.zeros((1, SCG[l].shape[1])) # Clusters mins and maxes (NEW) Cmins = np.array([None] * L, dtype=object) Cmaxes = np.array([None] * L, dtype=object) for l in range(L): Cmins[l] = np.zeros([K, Xloc[l].shape[1]]) # K clusters x D dimensions Cmaxes[l] = np.zeros([K, Xloc[l].shape[1]]) # K clusters x D dimensions for k in range(K): Cmins[l][k] = Cmeans[l][k] - np.max(SCG[l], axis=0) Cmaxes[l][k] = Cmeans[l][k] + np.max(SCG[l], axis=0) # Resolve overlaps between clusters (NEW) for k1 in range(K): for k2 in range(K): # Compare the pair of clusters only once, and don't compare a cluster with itself. This if statement # guarantees that k2 will always be a later cluster than k1. if (k1 >= k2): continue # Value of the smallest overlap between the ranges of the clusters k1 and k2, and ... # the dataset (l) and the dimension (d), at which this overlap is found # t_smallest overlap is the type of the overlap, (-1, 0, 1, or 2). Type (-1) means that the entire (min # to max) range of one cluster is within the range of the other cluster. This is the worse overlap. # Type (0) means that the max of (k1) is within the range of (min to max) of (k2), and type (1) is the other # way around. Type (2) means there is no overlap. This is the best and finding one of it breaks the loop v_smallestoverlap = 0 l_smallestoverlap = -1 d_smallestoverlap = -1 t_smallestoverlap = -1 # Overlap type, read above for l in range(L): Nd = len(Cmins[l][k1]) # Dimensions in this dataset for d in range(Nd): x1 = Cmaxes[l][k1][d] x2 = Cmaxes[l][k2][d] n1 = Cmins[l][k1][d] n2 = Cmins[l][k2][d] if (x1 > n2 and x1 <= x2): if (n1 < n2): ov = x1 - n2 if (t_smallestoverlap == -1 or ov < v_smallestoverlap): t_smallestoverlap = 0 v_smallestoverlap = ov l_smallestoverlap = l d_smallestoverlap = d elif (x2 > n1 and x2 <= x1): if (n2 < n1): ov = x2 - n1 if (t_smallestoverlap == -1 or ov < v_smallestoverlap): t_smallestoverlap = 1 v_smallestoverlap = ov l_smallestoverlap = l d_smallestoverlap = d else: t_smallestoverlap = 2 continue # Absolutely no overlap at this point, so k1 and k2 are distinct, so continue if (t_smallestoverlap == 2): continue # Absolutely no overlap at some point, so k1 and k2 are distinct, so continue # Sort out the overlap if exists between k1 and k2 if (t_smallestoverlap == -1): # Here one of the two clusters always swallows the other one. So effectively remove the later one (k2). # Cluster removal is by making its minimum larger than its maximum at a single point (at l=0, d=0), # so effectively no gene will ever be mapped to it! Cmins[0][k2][0] = 1 Cmaxes[0][k2][0] = 0 elif (t_smallestoverlap == 0): Cmins[l_smallestoverlap][k2][d_smallestoverlap] = \ Cmaxes[l_smallestoverlap][k1][d_smallestoverlap] + sys.float_info.epsilon elif (t_smallestoverlap == 1): Cmaxes[l_smallestoverlap][k2][d_smallestoverlap] = \ Cmins[l_smallestoverlap][k1][d_smallestoverlap] - sys.float_info.epsilon # Find who belongs (NEW) belongs = np.ones([Ng, K, L], dtype=bool) # Ng genes x K clusters x L datasets for l in range(L): for k in range(K): tmp1 = nu.largerthanaxis(Xloc[l], Cmins[l][k], axis=0, orequal=True) tmp2 = nu.lessthanaxis(Xloc[l], Cmaxes[l][k], axis=0, orequal=True) belongs[GDM[:, l], k, l] = np.all(np.logical_and(tmp1, tmp2), axis=1) # # Helping function (OLD - to be removed) # def iswithinworse(ref, x): # return x <= np.max(ref) # # # Find who belongs (OLD - to be removed) # belongs = np.ones([Ng, K, L], dtype=bool) # Ng genes x K clusters x L datasets # for l in range(L): # for k in range(K): # for d in range(Xloc[l].shape[1]): # tmpX = np.abs(Xloc[l][:, d] - Cmeans[l][k, d]) # belongs[GDM[:, l], k, l] &= iswithinworse(SCG[l][:, d], tmpX) # Include in clusters genes which belongs everywhere (OLD - to be removed) B_out = np.all(belongs, axis=2) # Solve genes included in two clusters (OLD - should not be needed now - TO BE REMOVED) solution = 2 if solution == 1: # Genes included in two clusters, include them in the closest in terms of its worst distance to any of the clusters # (guarrantee that the worst belongingness of a gene to a cluster is optimised) f = np.nonzero(np.sum(B_out, axis=1) > 1)[0] for fi in f: ficlusts = np.nonzero( B_out[fi])[0] # Clusters competing over gene fi fidatasets = np.nonzero(GDM[fi])[0] # Datasets that have gene fi localdists = np.zeros([ len(ficlusts), len(fidatasets) ]) # (Clusts competing) x (datasets that have fi) for l in range(len(fidatasets)): ll = fidatasets[l] # Actual dataset index fi_ll = np.sum(GDM[:fi, ll]) # Index of fi in this Xloc[ll] localdists[:, l] = nu.dist_matrices(Cmeans[ll][ficlusts], Xloc[ll][fi_ll]).reshape( [len(ficlusts)]) localdists = np.max(localdists, axis=1) # (Clusts competing) x 1 ficlosest = np.argmin(localdists) # Closest cluster B_out[fi] = False B_out[fi, ficlusts[ficlosest]] = True elif solution == 2: # Genes included in two clusters, include them in the earlier cluster (smallest k) f = np.nonzero(np.sum(B_out, axis=1) > 1)[0] for fi in f: ficlusts = np.nonzero( B_out[fi])[0] # Clusters competing over gene fi ficlosest = np.argmin(ficlusts) # earliest cluster (smallest k) B_out[fi] = False B_out[fi, ficlusts[ficlosest]] = True # Remove clusters smaller than minimum cluster size ClusterSizes = np.sum(B_out, axis=0) B_out = B_out[:, ClusterSizes >= smallestClusterSize] return B_out
def correcterrors_weighted_outliers2(B, X, GDM, clustdists=None, stds=3, smallestClusterSize=11): Bloc = np.array(B) Xloc = ds.listofarrays2arrayofarrays(X) [Ng, K] = Bloc.shape # Ng genes and K clusters L = Xloc.shape[0] # L datasets # Normalise clustdists to provide weights. If not provided, make it unity for all if clustdists is None: clustweights = np.ones(K) else: clustweights = np.min(clustdists) / clustdists # Find clusters' means (Cmeans), absolute shifted clusters genes (SCG), # and the emperical CDF functions for them (cdfs) Cmeans = np.array([None] * L, dtype=object) SCG = np.array([None] * L, dtype=object) for l in range(L): Cmeans[l] = np.zeros([K, Xloc[l].shape[1]]) # K clusters x D dimensions SCG[l] = np.zeros( [np.sum(np.sum(Bloc[GDM[:, l], :], axis=0)), Xloc[l].shape[1]]) # M* genes x D dimensions ... w = np.zeros(np.sum(np.sum(Bloc[GDM[:, l], :], axis=0))) # M* genes # (M* are all # genes in any cluster) gi = 0 for k in range(K): Cmeans[l][k] = np.median(Xloc[l][Bloc[GDM[:, l], k], :], axis=0) csize = np.sum(Bloc[GDM[:, l], k]) tmpSCG = nu.subtractaxis(Xloc[l][Bloc[GDM[:, l], k], :], Cmeans[l][k], axis=0) SCG[l][gi:(gi + csize), :] = np.abs(tmpSCG) # Added this in this version w[gi:(gi + csize)] = clustweights[k] gi += csize SCG[l] = SCG[l][np.any( SCG[l], axis=1)] # Remove all zeros genes (rows of SCG[l]) SCG[l] = np.sort(SCG[l], axis=0) SCGmeans = np.average(SCG[l], weights=w, axis=0) SCGstds = st.weighted_std_axis(SCG[l], weights=w, axix=0) SCGouts = nu.divideaxis(nu.subtractaxis(SCG[l], SCGmeans, axis=0), SCGstds, axis=0) # No. of stds away SCGouts = SCGouts > stds # TRUE for outliers and FALSE for others (bool: M* genex x D dimensions) SCG[l][ SCGouts] = 0.0 # Set the outlier values to zeros so they do not affect decisions later on # Helping function def iswithinworse(ref, x): return x <= np.max(ref) # Find who belongs belongs = np.ones([Ng, K, L], dtype=bool) # Ng genes x K clusters x L datasets for l in range(L): for k in range(K): for d in range(Xloc[l].shape[1]): tmpX = np.abs(Xloc[l][:, d] - Cmeans[l][k, d]) belongs[GDM[:, l], k, l] &= iswithinworse(SCG[l][:, d], tmpX) # Include in clusters genes which belongs everywhere B_out = np.all(belongs, axis=2) # Solve genes included in two clusters: solution = 2 if solution == 1: # Genes included in two clusters, include them in the closest in terms of its worst distance to any of the clusters # (guarrantee that the worst belongingness of a gene to a cluster is optimised) f = np.nonzero(np.sum(B_out, axis=1) > 1)[0] for fi in f: ficlusts = np.nonzero( B_out[fi])[0] # Clusters competing over gene fi fidatasets = np.nonzero(GDM[fi])[0] # Datasets that have gene fi localdists = np.zeros([ len(ficlusts), len(fidatasets) ]) # (Clusts competing) x (datasets that have fi) for l in range(len(fidatasets)): ll = fidatasets[l] # Actual dataset index fi_ll = np.sum(GDM[:fi, ll]) # Index of fi in this Xloc[ll] localdists[:, l] = nu.dist_matrices(Cmeans[ll][ficlusts], Xloc[ll][fi_ll]).reshape( [len(ficlusts)]) localdists = np.max(localdists, axis=1) # (Clusts competing) x 1 ficlosest = np.argmin(localdists) # Closest cluster B_out[fi] = False B_out[fi, ficlusts[ficlosest]] = True elif solution == 2: # Genes included in two clusters, include them in the earlier cluster (smallest k) f = np.nonzero(np.sum(B_out, axis=1) > 1)[0] for fi in f: ficlusts = np.nonzero( B_out[fi])[0] # Clusters competing over gene fi ficlosest = np.argmin(ficlusts) # earliest cluster (smallest k) B_out[fi] = False B_out[fi, ficlusts[ficlosest]] = True # Remove clusters smaller than minimum cluster size ClusterSizes = np.sum(B_out, axis=0) B_out = B_out[:, ClusterSizes >= smallestClusterSize] return B_out