def fuzzystretch(X, x0=None):
    # Stretch fuzzy membership values in the rows of X sinusoidally around the crossover
    # point x0: values below x0 are pushed towards 0 and values above it towards 1,
    # while 0, x0, and 1 map to themselves.
    Xloc = np.array(X)
    if x0 is None:
        x0 = np.array([np.mean(xrow[xrow > 0]) for xrow in Xloc])
        x0[x0 == 1] = 0.5
    elif ds.numel(x0) == 1:
        x0 = np.array([x0 for i in range(Xloc.shape[0])])
    elif ds.numel(x0) != Xloc.shape[0]:
        raise ValueError(
            'The parameter x0 should either be a single value or a vector of length equal to the number '
            'of rows in X. It can also be left ungiven as it has a default value.')

    y = np.zeros(Xloc.shape)
    for i in range(Xloc.shape[0]):
        xrow = Xloc[i]
        # The masks are computed once up front so that the in-place assignments
        # below cannot affect them.
        below = xrow < x0[i]
        above = ~below
        xt = np.zeros(len(xrow))
        xt[below] = (np.pi * xrow[below]) / (2 * x0[i]) - np.pi / 2
        xt[above] = (xrow[above] - x0[i]) * np.pi / (2 * (1 - x0[i]))

        yt = np.zeros(len(xt))
        yt[below] = x0[i] + x0[i] * np.sin(xt[below])
        yt[above] = x0[i] + (1 - x0[i]) * np.sin(xt[above])

        y[i] = yt
    return y

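# Illustrative usage sketch (a hypothetical helper, not part of the pipeline; assumes
# numpy as np). With a crossover of 0.5, memberships below 0.5 shrink towards 0 and
# those above grow towards 1, while 0, 0.5, and 1 map to themselves.
def _demo_fuzzystretch():
    U = np.array([[0.0, 0.2, 0.5, 0.8, 1.0]])
    print(fuzzystretch(U, x0=0.5))  # approx. [[0.  0.095  0.5  0.905  1.]]
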
def combineReplicates(X, replicatesIDs, flipSamples):
    Xloc = np.array(X)
    L = len(Xloc)
    for l in range(L):
        Xtmp = Xloc[l]
        arelogs = np.sum(abs(Xtmp) < 30) > 0.98 * ds.numel(Xtmp)  # More than 98% of absolute values are below 30.0
        if flipSamples is not None and flipSamples[l] is not None and len(flipSamples[l]) == Xtmp.shape[1]:
            if arelogs:
                Xtmp[:, flipSamples[l] == 1] = -Xtmp[:, flipSamples[l] == 1]
            else:
                Xtmp[:, flipSamples[l] == 1] = np.divide(1.0, Xtmp[:, flipSamples[l] == 1])
        uniqueSamples = np.unique(replicatesIDs[l])
        uniqueSamples = uniqueSamples[uniqueSamples != -1]
        Xloc[l] = np.zeros([Xtmp.shape[0], len(uniqueSamples)])
        ss = 0
        for s in range(len(uniqueSamples)):
            if uniqueSamples[s] > -1:
                Xloc[l][:, ss] = np.median(Xtmp[:, replicatesIDs[l] == uniqueSamples[s]], axis=1)
                ss += 1
    return Xloc

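# Illustrative usage sketch (a hypothetical helper, not part of the pipeline; assumes
# numpy as np). Columns 0 and 1 are replicates of sample 0, column 2 is sample 1, and
# the replicate ID -1 marks a column to drop. X is built as an object array so that the
# function's np.array(X) call keeps it as an array of per-dataset matrices.
def _demo_combineReplicates():
    X = np.empty(1, dtype=object)
    X[0] = np.array([[1.0, 3.0, 5.0, 9.0],
                     [2.0, 4.0, 6.0, 9.0]])
    replicatesIDs = [np.array([0, 0, 1, -1])]
    Xc = combineReplicates(X, replicatesIDs, flipSamples=None)
    print(Xc[0])  # medians over replicates: [[2. 5.] [3. 6.]]
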
def calculateGDMandUpdateDatasets(X, Genes, Map=None, mapheader=True, OGsFirstColMap=True, delimGenesInMap='\\W+',
                                  OGsIncludedIfAtLeastInDatasets=1):
    Xloc = ds.listofarrays2arrayofarrays(X)
    Genesloc = deepcopy(Genes)
    if Map is None:
        OGsDatasets = deepcopy(Genes)
        OGs = np.unique(ds.flattenAList(OGsDatasets))  # Unique list of genes (or mapped genes)
        MapNew = None
        MapSpecies = None
    else:
        (OGs, OGsDatasets, MapNew, MapSpecies) = mapGenesToCommonIDs(Genes, Map, mapheader, OGsFirstColMap,
                                                                     delimGenesInMap)

    L = len(Genesloc)  # Number of datasets
    # Ng = len(OGs)  # Number of unique genes

    GDMall = np.transpose([np.in1d(OGs, gs) for gs in OGsDatasets])  # GDM: (Ng)x(L) boolean

    # Exclude OGs that do not exist in at least (OGsIncludedIfAtLeastInDatasets) datasets
    IncludedOGs = np.sum(GDMall, axis=1) >= OGsIncludedIfAtLeastInDatasets
    GDM = GDMall[IncludedOGs]
    OGs = OGs[IncludedOGs]
    if MapNew is not None:
        MapNew = MapNew[IncludedOGs]

    Ngs = np.sum(GDM, axis=0)  # Numbers of unique mapped genes in each dataset

    Xnew = np.array([None] * L, dtype=object)
    GenesDatasets = np.array([None] * L, dtype=object)
    for l in range(L):
        arelogs = np.nansum(abs(Xloc[l][~isnan(Xloc[l])]) < 30) \
            > 0.98 * ds.numel(Xloc[l][~isnan(Xloc[l])])  # More than 98% of absolute non-NaN values are below 30.0
        d = Xloc[l].shape[1]  # Number of dimensions (samples) in this dataset
        Xnew[l] = np.zeros([Ngs[l], d], dtype=float)
        GenesDatasets[l] = np.empty(Ngs[l], dtype=object)
        OGsInThisDS = OGs[GDM[:, l]]  # Unique OGs in this dataset
        # TODO: Optimise the code below by exploiting ds.findArrayInSubArraysOfAnotherArray1D (like in line 203 above)
        for ogi in range(len(OGsInThisDS)):
            og = OGsInThisDS[ogi]
            if arelogs:
                Xnew[l][ogi] = np.log2(np.sum(np.power(2.0, Xloc[l][np.in1d(OGsDatasets[l], og)]), axis=0))
            else:
                Xnew[l][ogi] = np.sum(Xloc[l][np.in1d(OGsDatasets[l], og)], axis=0)
            GenesDatasets[l][ogi] = ds.concatenateStrings(Genesloc[l][np.in1d(OGsDatasets[l], og)])

    return Xnew, GDM, GDMall, OGs, MapNew, MapSpecies

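# Illustrative usage sketch (hypothetical toy inputs, not part of the pipeline; assumes
# numpy as np and this module's ds helpers). Two datasets share gene g2, while g1 is only
# in the first; with Map=None the gene names themselves act as the OGs, and GDM marks
# which dataset contains which gene.
def _demo_calculateGDM():
    X = [np.array([[1.0, 2.0], [3.0, 4.0]]), np.array([[5.0, 6.0, 7.0]])]
    Genes = [np.array(['g1', 'g2']), np.array(['g2'])]
    Xnew, GDM, GDMall, OGs, MapNew, MapSpecies = calculateGDMandUpdateDatasets(X, Genes)
    print(OGs)  # ['g1' 'g2']
    print(GDM)  # [[ True False]   <- g1: dataset 0 only
                #  [ True  True]]  <- g2: both datasets
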
def autoNormalise(X):
    """
    Automatically select the normalisation codes for dataset X, filtering it if needed

    :param X: Dataset matrix (numpy array)
    :return: array of normalisation codes
    """
    Xloc = np.array(X)
    twosided = np.sum(Xloc < 0) > 0.2 * np.sum(Xloc > 0)  # Negative values are at least 20% of positive values
    alreadylogs = np.sum(abs(Xloc) < 30) > 0.98 * ds.numel(Xloc)  # More than 98% of absolute values are below 30.0

    if twosided:
        return np.array([6])
        #return np.array([101, 4])
    else:
        Xloc[isnan(Xloc)] = 0.0
        Xloc[Xloc < 0] = 0.0
        if alreadylogs:
            Xf = normaliseSampleFeatureMat(Xloc, [13])[0]
            if isnormal_68_95_99p7_rule(Xf)[1] < isnormal_68_95_99p7_rule(Xloc)[1]:
                return np.array([13, 4])
            else:
                return np.array([4])
        else:
            Xl = normaliseSampleFeatureMat(Xloc, [3])[0]  # index 1 (Xloc, i.e. original X, is index 0)
            Xlp = normaliseSampleFeatureMat(Xloc, [31])[0]  # index 2
            Xf = normaliseSampleFeatureMat(Xloc, [13])[0]  # index 3
            Xlf = normaliseSampleFeatureMat(Xl, [13])[0]  # index 4
            Xlpf = normaliseSampleFeatureMat(Xlp, [13])[0]  # index 5
            isnormal_stats = [
                isnormal_68_95_99p7_rule(Xloc)[1],
                isnormal_68_95_99p7_rule(Xl)[1],
                isnormal_68_95_99p7_rule(Xlp)[1],
                isnormal_68_95_99p7_rule(Xf)[1],
                isnormal_68_95_99p7_rule(Xlf)[1],
                isnormal_68_95_99p7_rule(Xlpf)[1]
            ]
            most_normal_index = np.argmin(isnormal_stats)
            if most_normal_index == 0:
                return np.array([4])
            elif most_normal_index == 1:
                return np.array([3, 4])
            elif most_normal_index == 2:
                return np.array([31, 4])
            elif most_normal_index == 3:
                return np.array([13, 4])
            elif most_normal_index == 4:
                return np.array([3, 13, 4])
            elif most_normal_index == 5:
                return np.array([31, 13, 4])
            else:
                raise ValueError('You should never reach this error. Please contact {0}'.format(glob.email))

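# Illustrative usage sketch (a hypothetical helper, not part of the pipeline; assumes
# numpy as np). The function returns normalisation codes rather than normalised data;
# the codes are then applied via normaliseSampleFeatureMat. Raw-scale positive data like
# this typically yields a code sequence ending in 4 (e.g. [3, 4] or [31, 4]), but the
# exact choice is data-dependent.
def _demo_autoNormalise():
    rng = np.random.RandomState(0)
    X = rng.lognormal(mean=5.0, sigma=1.0, size=(100, 4))  # positive, raw-scale values
    print(autoNormalise(X))
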
def percentage_less_than(X, v):
    """
    Fraction of elements in matrix X that are less than the value v

    :param X: Matrix of numbers (numpy array)
    :param v: A value to be compared with
    :return: A fraction in the range [0.0, 1.0]
    """
    return np.sum(X < v) * 1.0 / ds.numel(X)

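# Quick sanity check (a hypothetical helper, not part of the pipeline; assumes numpy
# as np): 2 of the 4 elements are strictly less than 3, so the returned fraction is 0.5.
def _demo_percentage_less_than():
    assert percentage_less_than(np.array([1, 2, 3, 4]), 3) == 0.5
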
def isnan(X):
    # NaN test that accepts a scalar, a 1D array, or a 2D array, and returns a result
    # of the same shape (a single bool for scalars)
    if ds.numel(X) == 1:
        return math.isnan(X)
    elif len(np.shape(X)) == 1:
        res = np.zeros(np.shape(X), dtype=bool)
        for i in range(len(X)):
            res[i] = math.isnan(X[i])
        return res
    elif len(np.shape(X)) == 2:
        res = np.zeros(np.shape(X), dtype=bool)
        for i in range(np.size(X, 0)):
            for j in range(np.size(X, 1)):
                res[i, j] = math.isnan(X[i, j])
        return res
    else:
        raise ValueError('isnan supports scalars, 1D arrays, and 2D arrays only.')

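# Small illustration of the scalar / 1D / 2D paths of this helper (a hypothetical
# helper, not part of the pipeline; assumes numpy as np).
def _demo_isnan():
    print(isnan(float('nan')))                     # True (scalar path)
    print(isnan(np.array([1.0, float('nan')])))    # [False  True] (1D path)
    print(isnan(np.array([[1.0, float('nan')]])))  # [[False  True]] (2D path)
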
def isnormal_68_95_99p7_rule(X):
    """
    Test if data is normally distributed by checking the percentages of values below different stds away from the mean.
    Used by autoNormalise in this module to rank candidate normalisations by how close to normal they leave the data.

    :param X: Dataset matrix (numpy array)
    :return: (mean log10 of (1 - two-tailed p-values), mean absolute deviation from the theoretical percentages)
    """
    n = ds.numel(X)
    m = np.mean(X)
    s = np.std(X)
    bins = np.linspace(np.min(X), np.max(X), 100)
    d = np.digitize(np.concatenate(X), bins)
    xd = bins[d - 1]
    mode = spst.mode(xd)[0]

    # Find the percentages of elements less than these seven values and the mode
    m3s = percentage_less_than(X, m - 3 * s)  # mean minus 3s (theory ~= N(0.0013, s=0.0315/sqrt(n)))
    m2s = percentage_less_than(X, m - 2 * s)  # mean minus 2s (theory ~= N(0.0228, s=0.1153/sqrt(n)))
    m1s = percentage_less_than(X, m - 1 * s)  # mean minus 1s (theory ~= N(0.1587, s=0.2116/sqrt(n)))
    p0s = percentage_less_than(X, m)  # mean (theory ~= N(0.5000, s=0.3013/sqrt(n)))
    p1s = percentage_less_than(X, m + 1 * s)  # mean plus 1s (theory ~= N(0.8413, s=0.2116/sqrt(n)))
    p2s = percentage_less_than(X, m + 2 * s)  # mean plus 2s (theory ~= N(0.9772, s=0.1153/sqrt(n)))
    p3s = percentage_less_than(X, m + 3 * s)  # mean plus 3s (theory ~= N(0.9987, s=0.0315/sqrt(n)))
    md = percentage_less_than(X, mode)  # mode (theory ~= N(0.5000, s=0.3013/sqrt(n)))

    # How were these theoretical distributions calculated?
    # The distributions of these stds were found empirically by calculating them from 1000x26 randomly generated
    # normally distributed numbers ~N(0.0, 1.0). 26 different population sizes were considered "round(10.^(1:0.2:6))";
    # at each population size, 1000 random populations were generated. It was observed that at a fixed population size,
    # the percentages of elements less than (m-3*s), (m-2*s), ... (etc.) were normally distributed with an average
    # equal to the expected CDF at (m-3*s), (m-2*s), ... (etc.) and with a standard deviation that is inversely
    # proportional to the square root of the size of the population. The empirical values were calculated from
    # this experiment and are included above. For example: the percentage of elements that are less than (m-2*s) in a
    # population of n elements is expected to be 0.0228 (2.28%) with a standard deviation of 0.1153/sqrt(n).
    # This empirical test was run on MATLAB

    # For each of the eight statistics above, calculate one minus the two-tailed p-value of its deviation from
    # the theoretical value, based on the empirical normal distributions
    pv = np.array([i * 1.0 for i in range(8)])
    diff = np.array([i * 1.0 for i in range(8)])
    pv[0] = 1 - 2 * spst.norm.cdf(-abs(m3s - 0.0013), loc=0, scale=0.0315 / math.sqrt(n))
    diff[0] = abs(m3s - 0.0013)
    pv[1] = 1 - 2 * spst.norm.cdf(-abs(m2s - 0.0228), loc=0, scale=0.1153 / math.sqrt(n))
    diff[1] = abs(m2s - 0.0228)
    pv[2] = 1 - 2 * spst.norm.cdf(-abs(m1s - 0.1587), loc=0, scale=0.2116 / math.sqrt(n))
    diff[2] = abs(m1s - 0.1587)
    pv[3] = 1 - 2 * spst.norm.cdf(-abs(p0s - 0.5000), loc=0, scale=0.3013 / math.sqrt(n))
    diff[3] = abs(p0s - 0.5000)
    pv[4] = 1 - 2 * spst.norm.cdf(-abs(p1s - 0.8413), loc=0, scale=0.2116 / math.sqrt(n))
    diff[4] = abs(p1s - 0.8413)
    pv[5] = 1 - 2 * spst.norm.cdf(-abs(p2s - 0.9772), loc=0, scale=0.1153 / math.sqrt(n))
    diff[5] = abs(p2s - 0.9772)
    pv[6] = 1 - 2 * spst.norm.cdf(-abs(p3s - 0.9987), loc=0, scale=0.0315 / math.sqrt(n))
    diff[6] = abs(p3s - 0.9987)
    pv[7] = 1 - 2 * spst.norm.cdf(-abs(md - 0.5000), loc=0, scale=0.3013 / math.sqrt(n))
    diff[7] = abs(md - 0.5000)

    return np.mean(np.log10(pv)), np.mean(diff)

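# Illustrative comparison (a hypothetical helper, not part of the pipeline; assumes
# numpy as np): data drawn from a normal distribution should produce a smaller mean
# deviation (the second returned value) than heavily skewed data, which is exactly how
# autoNormalise uses this score.
def _demo_isnormal_rule():
    rng = np.random.RandomState(0)
    print(isnormal_68_95_99p7_rule(rng.normal(size=(100, 10)))[1])     # small deviation
    print(isnormal_68_95_99p7_rule(rng.lognormal(size=(100, 10)))[1])  # larger deviation
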
def generateCoPaM(U, relabel_technique='minmin', w=None, X=None, distCriterion='direct_euc', K=0, GDM=None):
    # Helping functions
    def calwmeans(w):
        wm = [np.mean(calwmeans(ww)) if isinstance(ww, (list, tuple, np.ndarray)) else np.mean(ww) for ww in w]
        return np.array(wm)

    def CoPaMsdist(CoPaM1, CoPaM2):
        return np.linalg.norm(CoPaM1 - CoPaM2)

    def orderpartitions(U, method='rand', X=None, GDM=None):
        if method == 'rand':
            return np.random.permutation(range(len(U))), None
        elif method == 'mn':
            # TODO: Implement ranking partitions based on M-N plots
            raise NotImplementedError('Ranking partitions based on the M-N plots logic has not been implemented yet.')
        elif method == 'mse':
            R = len(U)
            mses = np.zeros(R)
            for r in range(R):
                if isinstance(U[r][0][0], (list, tuple, np.ndarray)):
                    mses[r] = np.mean(orderpartitions(U[r], method=method, X=X, GDM=GDM)[1])
                else:
                    mses[r] = np.mean([mn.mseclustersfuzzy(X, U[r], donormalise=False, GDM=GDM)])
            order = np.argsort(mses)
            return order, mses[order]

    # Fix parameters
    Uloc = ds.listofarrays2arrayofarrays(U)
    R = len(Uloc)
    if GDM is None:
        GDMloc = np.ones([Uloc[0].shape[0], R], dtype=bool)
    elif GDM.shape[1] == 1:
        if R > 1:
            GDMloc = np.tile(GDM, [1, R])
        else:
            GDMloc = np.array(GDM)
    else:
        GDMloc = np.array(GDM)
    if w is None or (isinstance(w, str) and w in ['all', 'equal']):
        w = np.ones(R)
    elif ds.numel(w) == 1:
        w = np.array([w for i in range(R)])
    wmeans = calwmeans(w)

    # Work!
    #permR = orderpartitions(Uloc, method='rand', X=X, GDM=GDM)[0]
    if GDM is None:
        permR = orderpartitions(Uloc, method='mse', X=X, GDM=None)[0]
    else:
        permR = orderpartitions(Uloc, method='mse', X=X, GDM=GDMloc)[0]
    Uloc = Uloc[permR]
    if GDMloc.shape[1] > 1:
        GDMloc = GDMloc[:, permR]
    wmeans = wmeans[permR]

    if isinstance(Uloc[0][0][0], (list, tuple, np.ndarray)):
        Uloc[0] = generateCoPaM(Uloc[0], relabel_technique=relabel_technique, w=w[0], X=X,
                                distCriterion=distCriterion, K=K, GDM=GDMloc)
    #CoPaM = np.zeros([GDMloc.shape[0], Uloc[0].shape[1]], float)
    CoPaM = np.array(Uloc[0], dtype=float)
    K = CoPaM.shape[1]

    for r in range(1, R):
        if isinstance(Uloc[r][0][0], (list, tuple, np.ndarray)):
            Uloc[r] = generateCoPaM(Uloc[r], relabel_technique=relabel_technique, w=w[r], X=X,
                                    distCriterion=distCriterion, K=K, GDM=GDMloc)
        if Uloc[r].shape[1] != K:
            raise ValueError('Unequal numbers of clusters in partition {}.'.format(r))

        Uloc[r] = relabelClusts(CoPaM, Uloc[r], method=relabel_technique, X=X, distCriterion=distCriterion)

        dotprod = np.dot(GDMloc[:, 0:r], wmeans[0:r].transpose())  # (Mxr) * (rx1) = (Mx1)
        CoPaM[dotprod > 0] = nu.multiplyaxis(CoPaM[dotprod > 0], dotprod[dotprod > 0], axis=1)
        CoPaM[dotprod > 0] += wmeans[r] * Uloc[r][dotprod > 0]
        dotprod = np.dot(GDMloc[:, 0:(r + 1)], wmeans[0:(r + 1)].transpose())
        CoPaM[dotprod > 0] = nu.divideaxis(CoPaM[dotprod > 0], dotprod[dotprod > 0], axis=1)

    return CoPaM

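# The fusion loop above is an incremental weighted mean: after relabelling, partition r
# is merged into the running CoPaM with weight wmeans[r], normalised by the accumulated
# weight (per gene, via the GDM dot products). A minimal standalone sketch of that update
# rule on hypothetical toy arrays (not part of the pipeline; assumes numpy as np):
def _demo_copam_update():
    CoPaM = np.array([[1.0, 0.0], [0.0, 1.0]])  # running consensus after r-1 partitions
    Ur = np.array([[0.8, 0.2], [0.4, 0.6]])     # next (relabelled) partition
    w_prev, w_r = 2.0, 1.0                      # accumulated weight and new weight
    CoPaM = (CoPaM * w_prev + Ur * w_r) / (w_prev + w_r)
    print(CoPaM)  # approx. [[0.933, 0.067], [0.133, 0.867]]
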
def optimise_tukey_sqrtSCG(B, X, GDM, clustdists=None, smallestClusterSize=11, tails=1, Q3s=2):
    Bloc = np.array(B)
    Xloc = ds.listofarrays2arrayofarrays(X)
    [Ng, K] = Bloc.shape  # Ng genes and K clusters
    L = Xloc.shape[0]  # L datasets

    # Normalise clustdists to provide weights. If not provided, make it unity for all
    if clustdists is None:
        clustdistsloc = np.ones(K)
    else:
        clustdistsloc = [c for c in clustdists]

    # Find clusters' means (Cmeans), absolute shifted clusters genes (SCG),
    # and the empirical CDF functions for them (cdfs)
    Cmeans = np.array([None] * L, dtype=object)
    SCG = np.array([None] * L, dtype=object)
    Cgood = mnplotsdistancethreshold(clustdistsloc, method='largestgap')
    for l in range(L):
        Cmeans[l] = np.zeros([K, Xloc[l].shape[1]])  # K clusters x D dimensions
        SCG[l] = np.zeros([np.sum(np.sum(Bloc[GDM[:, l], :], axis=0)),
                           Xloc[l].shape[1]])  # M* genes x D dimensions
        w = np.zeros(np.sum(np.sum(Bloc[GDM[:, l], :], axis=0)))  # M* genes (M* are all genes in any cluster)

        gi = 0
        for k in range(K):
            Cmeans[l][k] = np.median(Xloc[l][Bloc[GDM[:, l], k], :], axis=0)
            if k in Cgood:
                csize = np.sum(Bloc[GDM[:, l], k])
                tmpSCG = nu.subtractaxis(Xloc[l][Bloc[GDM[:, l], k], :], Cmeans[l][k], axis=0)
                SCG[l][gi:(gi + csize), :] = np.abs(tmpSCG)
                gi += csize
        SCG[l] = SCG[l][np.any(SCG[l], axis=1)]  # Remove all-zero genes (rows of SCG[l])

        if ds.numel(SCG[l]) > 0:
            if tails == 1:
                Q3 = np.percentile(SCG[l], q=75, axis=0)
                thresh = Q3s * Q3
                SCGouts = SCG[l] > np.array([thresh for ii in range(0, SCG[l].shape[0])])
                SCG[l][SCGouts] = 0.0  # Set the outlier values to zeros so they do not affect decisions later on
            elif tails == 2:
                Q1 = np.percentile(np.sqrt(SCG[l]), q=25, axis=0)
                Q3 = np.percentile(np.sqrt(SCG[l]), q=75, axis=0)
                IQR = np.subtract(Q3, Q1)
                thresh = np.add(Q3, 1.5 * IQR)
                SCGouts = np.sqrt(SCG[l]) > np.array([thresh for ii in range(0, SCG[l].shape[0])])
                SCG[l][SCGouts] = 0.0  # Set the outlier values to zeros so they do not affect decisions later on
            else:
                raise ValueError('Invalid number of tails. It should be either 1 or 2.')
        else:
            SCG[l] = np.zeros((1, SCG[l].shape[1]))

    # Clusters mins and maxes (NEW)
    Cmins = np.array([None] * L, dtype=object)
    Cmaxes = np.array([None] * L, dtype=object)
    for l in range(L):
        Cmins[l] = np.zeros([K, Xloc[l].shape[1]])  # K clusters x D dimensions
        Cmaxes[l] = np.zeros([K, Xloc[l].shape[1]])  # K clusters x D dimensions
        for k in range(K):
            Cmins[l][k] = Cmeans[l][k] - np.max(SCG[l], axis=0)
            Cmaxes[l][k] = Cmeans[l][k] + np.max(SCG[l], axis=0)

    # Resolve overlaps between clusters (NEW)
    for k1 in range(K):
        for k2 in range(K):
            # Compare the pair of clusters only once, and don't compare a cluster with itself. This if statement
            # guarantees that k2 will always be a later cluster than k1.
            if k1 >= k2:
                continue

            # Value of the smallest overlap between the ranges of the clusters k1 and k2, and ...
            # the dataset (l) and the dimension (d) at which this overlap is found.
            # t_smallestoverlap is the type of the overlap (-1, 0, 1, or 2). Type (-1) means that the entire
            # (min to max) range of one cluster is within the range of the other cluster. This is the worst
            # overlap. Type (0) means that the max of (k1) is within the range (min to max) of (k2), and type
            # (1) is the other way around. Type (2) means there is no overlap. This is the best case; if it is
            # found, the pair is considered distinct and skipped.
            v_smallestoverlap = 0
            l_smallestoverlap = -1
            d_smallestoverlap = -1
            t_smallestoverlap = -1  # Overlap type, read above
            for l in range(L):
                Nd = len(Cmins[l][k1])  # Dimensions in this dataset
                for d in range(Nd):
                    x1 = Cmaxes[l][k1][d]
                    x2 = Cmaxes[l][k2][d]
                    n1 = Cmins[l][k1][d]
                    n2 = Cmins[l][k2][d]
                    if x1 > n2 and x1 <= x2:
                        if n1 < n2:
                            ov = x1 - n2
                            if t_smallestoverlap == -1 or ov < v_smallestoverlap:
                                t_smallestoverlap = 0
                                v_smallestoverlap = ov
                                l_smallestoverlap = l
                                d_smallestoverlap = d
                    elif x2 > n1 and x2 <= x1:
                        if n2 < n1:
                            ov = x2 - n1
                            if t_smallestoverlap == -1 or ov < v_smallestoverlap:
                                t_smallestoverlap = 1
                                v_smallestoverlap = ov
                                l_smallestoverlap = l
                                d_smallestoverlap = d
                    else:
                        t_smallestoverlap = 2
                        continue  # Absolutely no overlap at this point, so k1 and k2 are distinct, so continue
            if t_smallestoverlap == 2:
                continue  # Absolutely no overlap at some point, so k1 and k2 are distinct, so continue

            # Sort out the overlap if it exists between k1 and k2
            if t_smallestoverlap == -1:
                # Here one of the two clusters always swallows the other one, so effectively remove the later
                # one (k2). Cluster removal is by making its minimum larger than its maximum at a single point
                # (at l=0, d=0), so effectively no gene will ever be mapped to it!
                Cmins[0][k2][0] = 1
                Cmaxes[0][k2][0] = 0
            elif t_smallestoverlap == 0:
                Cmins[l_smallestoverlap][k2][d_smallestoverlap] = \
                    Cmaxes[l_smallestoverlap][k1][d_smallestoverlap] + sys.float_info.epsilon
            elif t_smallestoverlap == 1:
                Cmaxes[l_smallestoverlap][k2][d_smallestoverlap] = \
                    Cmins[l_smallestoverlap][k1][d_smallestoverlap] - sys.float_info.epsilon

    # Find who belongs (NEW)
    belongs = np.ones([Ng, K, L], dtype=bool)  # Ng genes x K clusters x L datasets
    for l in range(L):
        for k in range(K):
            tmp1 = nu.largerthanaxis(Xloc[l], Cmins[l][k], axis=0, orequal=True)
            tmp2 = nu.lessthanaxis(Xloc[l], Cmaxes[l][k], axis=0, orequal=True)
            belongs[GDM[:, l], k, l] = np.all(np.logical_and(tmp1, tmp2), axis=1)

    # # Helping function (OLD - to be removed)
    # def iswithinworse(ref, x):
    #     return x <= np.max(ref)
    #
    # # Find who belongs (OLD - to be removed)
    # belongs = np.ones([Ng, K, L], dtype=bool)  # Ng genes x K clusters x L datasets
    # for l in range(L):
    #     for k in range(K):
    #         for d in range(Xloc[l].shape[1]):
    #             tmpX = np.abs(Xloc[l][:, d] - Cmeans[l][k, d])
    #             belongs[GDM[:, l], k, l] &= iswithinworse(SCG[l][:, d], tmpX)

    # Include in clusters genes which belong everywhere (OLD - to be removed)
    B_out = np.all(belongs, axis=2)

    # Resolve genes included in two clusters (OLD - should not be needed now - TO BE REMOVED)
    solution = 2
    if solution == 1:
        # Genes included in two clusters are included in the closest cluster in terms of the worst distance
        # (guarantee that the worst belongingness of a gene to a cluster is optimised)
        f = np.nonzero(np.sum(B_out, axis=1) > 1)[0]
        for fi in f:
            ficlusts = np.nonzero(B_out[fi])[0]  # Clusters competing over gene fi
            fidatasets = np.nonzero(GDM[fi])[0]  # Datasets that have gene fi
            localdists = np.zeros([len(ficlusts), len(fidatasets)])  # (Clusts competing) x (datasets that have fi)
            for l in range(len(fidatasets)):
                ll = fidatasets[l]  # Actual dataset index
                fi_ll = np.sum(GDM[:fi, ll])  # Index of fi in this Xloc[ll]
                localdists[:, l] = nu.dist_matrices(Cmeans[ll][ficlusts],
                                                    Xloc[ll][fi_ll]).reshape([len(ficlusts)])
            localdists = np.max(localdists, axis=1)  # (Clusts competing) x 1
            ficlosest = np.argmin(localdists)  # Closest cluster
            B_out[fi] = False
            B_out[fi, ficlusts[ficlosest]] = True
    elif solution == 2:
        # Genes included in two clusters are included in the earlier cluster (smallest k)
        f = np.nonzero(np.sum(B_out, axis=1) > 1)[0]
        for fi in f:
            ficlusts = np.nonzero(B_out[fi])[0]  # Clusters competing over gene fi
            ficlosest = np.argmin(ficlusts)  # Earliest cluster (smallest k)
            B_out[fi] = False
            B_out[fi, ficlusts[ficlosest]] = True

    # Remove clusters smaller than the minimum cluster size
    ClusterSizes = np.sum(B_out, axis=0)
    B_out = B_out[:, ClusterSizes >= smallestClusterSize]

    return B_out

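# Standalone sketch of the two outlier rules used above, on a hypothetical toy column of
# absolute deviations (not part of the pipeline; assumes numpy as np). tails=1 flags
# values above Q3s*Q3; tails=2 applies Tukey's fences (Q3 + 1.5*IQR) on the square-root
# scale.
def _demo_tukey_rules():
    scg = np.array([0.1, 0.2, 0.3, 0.4, 5.0])   # 5.0 is an obvious outlier
    Q3 = np.percentile(scg, 75)
    print(scg > 2 * Q3)                          # tails=1 rule with Q3s=2: [F F F F T]
    q1, q3 = np.percentile(np.sqrt(scg), [25, 75])
    print(np.sqrt(scg) > q3 + 1.5 * (q3 - q1))   # tails=2 rule: [F F F F T]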