def readDataFromFiles(datafiles, delimiter='\t| |, |; |,|;', dtype=float, skiprows=1, skipcolumns=1,
                      returnSkipped=True, comm='#'):
    """Read the data part, the skipped rows and the skipped columns of each file.

    For every path in ``datafiles`` the number of columns is counted by
    splitting the first line of the file on the ``delimiter`` regular
    expression. The file is then parsed through ``pdreadcsv_regexdelim``:

    * columns ``skipcolumns..ncols-1`` after skipping ``skiprows`` rows,
      using ``dtype`` (the data matrix);
    * the same columns read as strings from the top of the file, keeping the
      first ``skiprows`` rows (the skipped rows);
    * columns ``0..skipcolumns-1`` read as strings after skipping
      ``skiprows`` rows (the skipped columns).

    :param datafiles: sequence of file paths.
    :param delimiter: regular expression used to split fields.
    :param dtype: type requested for the data matrix.
    :param skiprows: number of leading rows treated as skipped rows.
    :param skipcolumns: number of leading columns treated as skipped columns.
    :param returnSkipped: if True also return the skipped rows and columns.
    :param comm: forwarded as ``comments`` to ``pdreadcsv_regexdelim``.
    :return: ``ds.listofarrays2arrayofarrays(X)``, or the tuple
        ``(that, skippedRows, skippedCols)`` when ``returnSkipped`` is True.
    """
    X = []
    skippedRows = []
    skippedCols = []

    for path in datafiles:
        # The first line alone decides how many columns the file has
        with open(path) as handle:
            first_line = handle.readline()
        ncols = len(re.split(delimiter, first_line))

        # NOTE: reading goes through pandas' read_csv; if np.loadtxt is ever
        # used here again, ndmin=2 HAS TO be set for this call.
        data = pdreadcsv_regexdelim(path, delimiter=delimiter, dtype=dtype, skiprows=skiprows,
                                    usecols=range(skipcolumns, ncols), na_filter=True, comments=comm)
        X.append(data)

        if skiprows > 0:
            top_rows = pdreadcsv_regexdelim(path, delimiter=delimiter, dtype=str, skiprows=0,
                                            usecols=range(skipcolumns, ncols), na_filter=False,
                                            comments=comm)[0:skiprows]
            # A single skipped row is stored as that row itself rather than a one-row container
            skippedRows.append(top_rows[0] if skiprows == 1 else top_rows)
        else:
            skippedRows.append(np.array([]).reshape([0, data.shape[1]]))

        if skipcolumns > 0:
            skippedCols.append(pdreadcsv_regexdelim(path, delimiter=delimiter, dtype=str, skiprows=skiprows,
                                                    usecols=range(skipcolumns), na_filter=False,
                                                    comments=comm))
        else:
            skippedCols.append(np.array([]).reshape([0, data.shape[1]]))

    Xout = ds.listofarrays2arrayofarrays(X)
    if not returnSkipped:
        return Xout
    return (Xout, skippedRows, skippedCols)
def readDataFromFiles(datafiles, delimiter='\t| |, |; |,|;', dtype=float, skiprows=1, skipcolumns=1,
                      returnSkipped=True, comm='#'):
    """Read the data part, the skipped rows and the skipped columns of each file.

    This variant parses through ``nploadtxt_regexdelim``. For every path in
    ``datafiles`` the number of columns is counted by splitting the first
    line on the ``delimiter`` regular expression, then:

    * columns ``skipcolumns..ncols-1`` are read with ``dtype`` after skipping
      ``skiprows`` rows, with ``ndmin=2`` (the data matrix);
    * the same columns are read as strings from the top of the file and the
      first ``skiprows`` rows are kept (the skipped rows);
    * columns ``0..skipcolumns-1`` are read as strings after skipping
      ``skiprows`` rows (the skipped columns).

    :param datafiles: sequence of file paths.
    :param delimiter: regular expression used to split fields.
    :param dtype: type requested for the data matrix.
    :param skiprows: number of leading rows treated as skipped rows.
    :param skipcolumns: number of leading columns treated as skipped columns.
    :param returnSkipped: if True also return the skipped rows and columns.
    :param comm: forwarded as ``comments`` to ``nploadtxt_regexdelim``.
    :return: ``ds.listofarrays2arrayofarrays(X)``, or the tuple
        ``(that, skippedRows, skippedCols)`` when ``returnSkipped`` is True.
    """
    X = []
    skippedRows = []
    skippedCols = []

    for path in datafiles:
        # The first line alone decides how many columns the file has
        with open(path) as handle:
            first_line = handle.readline()
        ncols = len(re.split(delimiter, first_line))

        data = nploadtxt_regexdelim(path, delimiter=delimiter, dtype=dtype, skiprows=skiprows,
                                    usecols=range(skipcolumns, ncols), ndmin=2, comments=comm)
        X.append(data)

        if skiprows > 0:
            top_rows = nploadtxt_regexdelim(path, delimiter=delimiter, dtype=str, skiprows=0,
                                            usecols=range(skipcolumns, ncols), comments=comm)[0:skiprows]
            # A single skipped row is stored as that row itself rather than a one-row container
            skippedRows.append(top_rows[0] if skiprows == 1 else top_rows)
        else:
            skippedRows.append(np.array([]).reshape([0, data.shape[1]]))

        if skipcolumns > 0:
            skippedCols.append(nploadtxt_regexdelim(path, delimiter=delimiter, dtype=str, skiprows=skiprows,
                                                    usecols=range(skipcolumns), comments=comm))
        else:
            skippedCols.append(np.array([]).reshape([0, data.shape[1]]))

    Xout = ds.listofarrays2arrayofarrays(X)
    if not returnSkipped:
        return Xout
    return (Xout, skippedRows, skippedCols)
def calculateGDMandUpdateDatasets(X, Genes, Map=None, mapheader=True, OGsFirstColMap=True,
                                  delimGenesInMap='\\W+', OGsIncludedIfAtLeastInDatasets=1):
    """Build the gene-dataset matrix (GDM) and merge rows that share an ID.

    Without ``Map`` the gene names themselves are the common IDs ("OGs");
    otherwise ``mapGenesToCommonIDs`` supplies them. ``GDMall[i, l]`` is True
    when OG ``i`` occurs in dataset ``l``. OGs present in fewer than
    ``OGsIncludedIfAtLeastInDatasets`` datasets are dropped, giving ``GDM``.
    For each dataset, all rows mapped to the same OG are summed into one row.
    When more than 98% of the non-NaN absolute values of a dataset are below
    30, the dataset is treated as log2 data and the sum is taken on the
    ``2 ** x`` scale and converted back with ``log2``.

    :param X: datasets; converted with ``ds.listofarrays2arrayofarrays``.
    :param Genes: per-dataset gene names (indexed with boolean masks below).
    :param Map: optional mapping passed to ``mapGenesToCommonIDs``.
    :param mapheader: forwarded to ``mapGenesToCommonIDs``.
    :param OGsFirstColMap: forwarded to ``mapGenesToCommonIDs``.
    :param delimGenesInMap: forwarded to ``mapGenesToCommonIDs``.
    :param OGsIncludedIfAtLeastInDatasets: minimum number of datasets an OG
        must appear in to be kept.
    :return: ``(Xnew, GDM, GDMall, OGs, MapNew, MapSpecies)``.
    """
    Xloc = ds.listofarrays2arrayofarrays(X)
    Genesloc = deepcopy(Genes)

    if Map is not None:
        (OGs, OGsDatasets, MapNew, MapSpecies) = mapGenesToCommonIDs(Genes, Map, mapheader, OGsFirstColMap,
                                                                     delimGenesInMap)
    else:
        OGsDatasets = deepcopy(Genes)
        # Unique list of genes (or mapped genes)
        OGs = np.unique(ds.flattenAList(OGsDatasets))
        MapNew = None
        MapSpecies = None

    L = len(Genesloc)  # Number of datasets

    # GDM over all OGs: (number of OGs) x (L) boolean
    GDMall = np.transpose([np.in1d(OGs, dataset_ogs) for dataset_ogs in OGsDatasets])

    # Keep only the OGs found in at least (OGsIncludedIfAtLeastInDatasets) datasets
    IncludedOGs = np.sum(GDMall, axis=1) >= OGsIncludedIfAtLeastInDatasets
    GDM = GDMall[IncludedOGs]
    OGs = OGs[IncludedOGs]
    if MapNew is not None:
        MapNew = MapNew[IncludedOGs]

    # Number of retained OGs in each dataset
    Ngs = np.sum(GDM, axis=0)

    Xnew = np.array([None] * L, dtype=object)
    GenesDatasets = np.array([None] * L, dtype=object)
    for l in range(L):
        data = Xloc[l]

        # Log-scale heuristic: more than 98% of the non-NaN values are below 30.0 in magnitude
        valid = data[~isnan(data)]
        arelogs = np.nansum(abs(valid) < 30) > 0.98 * ds.numel(valid)

        nsamples = data.shape[1]  # Number of dimensions (samples) in this dataset
        merged = np.zeros([Ngs[l], nsamples], dtype=float)
        merged_names = np.empty(Ngs[l], dtype=object)
        Xnew[l] = merged
        GenesDatasets[l] = merged_names

        # TODO: Optimise the loop below by exploiting ds.findArrayInSubArraysOfAnotherArray1D
        for ogi, og in enumerate(OGs[GDM[:, l]]):
            rows = np.in1d(OGsDatasets[l], og)  # rows of this dataset that map to this OG
            if arelogs:
                merged[ogi] = np.log2(np.sum(np.power(2.0, data[rows]), axis=0))
            else:
                merged[ogi] = np.sum(data[rows], axis=0)
            merged_names[ogi] = ds.concatenateStrings(Genesloc[l][rows])

    return Xnew, GDM, GDMall, OGs, MapNew, MapSpecies
def reorderClusters(B, X, GDM, returnOrderIndices=False):
    """Drop empty clusters and order the rest so that neighbours are similar.

    Empty columns of ``B`` are removed first. For every dataset the mean
    profile of each remaining cluster is computed, pairwise Euclidean
    distances between those means are taken, and the per-dataset distance
    matrices are combined by their median. Starting from the first cluster,
    the next cluster is always the not-yet-placed one closest to the cluster
    placed last.

    :param B: boolean membership matrix, one column per cluster.
    :param X: datasets; converted with ``ds.listofarrays2arrayofarrays``.
    :param GDM: boolean matrix; column ``l`` selects the rows of ``B`` that
        correspond to the rows of dataset ``l``.
    :param returnOrderIndices: if True, also return the order indices.
    :return: the reordered membership matrix, or ``(B_ordered, I)`` when
        ``returnOrderIndices`` is True. ``I`` indexes the columns of ``B``
        after the empty clusters were removed.
    """
    Bloc = np.array(B)
    Bloc = Bloc[:, np.any(Bloc, axis=0)]  # Only keep non-empty clusters
    K = Bloc.shape[1]  # Number of clusters

    if K == 0:
        # Nothing to order. Honour the (B, I) contract so that callers which
        # unpack the result keep working when every cluster is empty.
        if returnOrderIndices:
            return (Bloc, np.zeros(0, dtype=int))
        return Bloc

    Xloc = ds.listofarrays2arrayofarrays(X)
    L = Xloc.shape[0]  # Number of datasets

    # Cluster means per dataset and the distances between them
    Cmeans = np.array([None] * L, dtype=object)
    D = np.zeros([K, K, L])  # KxKxL
    for l in range(L):
        Cmeans[l] = np.zeros([K, Xloc[l].shape[1]], dtype=float)  # (K) x (X[l] samples)
        for k in range(K):
            Cmeans[l][k] = np.mean(Xloc[l][Bloc[GDM[:, l], k], :], axis=0)
        D[:, :, l] = skdists.euclidean_distances(Cmeans[l])  # KxK
    D = np.median(D, axis=2)  # KxK

    # Keep the first cluster first, then repeatedly append the closest remaining one
    B_ordered = np.zeros(Bloc.shape, dtype=bool)
    B_ordered[:, 0] = Bloc[:, 0]
    I = np.zeros(K, dtype=int)
    clustersDone = np.zeros(K, dtype=bool)
    clustersDone[0] = True
    for k in range(1, K):
        clustersLeft = np.nonzero(~clustersDone)[0]
        # Distances from the cluster placed last to every cluster still unplaced
        relevantD = D[I[k - 1], ~clustersDone]
        nextCluster = clustersLeft[np.argmin(relevantD)]
        B_ordered[:, k] = Bloc[:, nextCluster]
        I[k] = nextCluster
        clustersDone[nextCluster] = True

    if returnOrderIndices:
        return (B_ordered, I)
    return B_ordered
def preprocess(X, GDM, normalise=1000, replicatesIDs=None, flipSamples=None, expressionValueThreshold=10.0,
               replacementVal=0.0, atleastinconditions=1, atleastindatasets=1, absvalue=False,
               usereplacementval=False, filteringtype='raw', filterflat=True, params=None, datafiles=None):
    """Run the pre-processing pipeline over all datasets.

    Steps, in order: NaN fixing (``fixnans``), an early pass of
    ``normaliseSampleFeatureMat`` with code 101 for datasets whose codes
    contain 101 or 1000, replicate combination (``combineReplicates``),
    low-expression filtering (``filterlowgenes``), the remaining
    normalisation codes, and optionally ``filterFlat``.

    :param X: datasets; converted with ``ds.listofarrays2arrayofarrays``.
    :param GDM: gene-dataset matrix forwarded to ``filterlowgenes``.
    :param normalise: one code applied to every dataset, or a sequence with
        one entry (a code or a sequence of codes) per dataset. A code of 1000
        is reported to the user as automatic normalisation.
    :param replicatesIDs: per-dataset replicate IDs; by default every column
        gets its own ID (``0..ncols-1``).
    :param flipSamples: per-dataset flip codes forwarded to
        ``combineReplicates``, or None.
    :param expressionValueThreshold: forwarded to ``filterlowgenes``.
    :param replacementVal: forwarded to ``filterlowgenes``.
    :param atleastinconditions: forwarded to ``filterlowgenes``.
    :param atleastindatasets: forwarded to ``filterlowgenes``.
    :param absvalue: forwarded to ``filterlowgenes``.
    :param usereplacementval: forwarded to ``filterlowgenes``.
    :param filteringtype: forwarded to ``filterlowgenes``.
    :param filterflat: if True, apply ``filterFlat`` at the end.
    :param params: dictionary extended with the parameters used here.
    :param datafiles: dataset names used as keys of ``applied_norm``;
        defaults to ``['X']`` for one dataset, otherwise ``X1, X2, ...``.
    :return: ``(Xproc, GDMnew, Iincluded, params, applied_norm)`` where
        ``applied_norm`` maps each dataset name to a string of the codes.
    """
    seqtypes = (list, tuple, np.ndarray)

    # Fixing parameters
    Xloc = ds.listofarrays2arrayofarrays(X)
    L = len(Xloc)

    if datafiles is None:
        if L == 1:
            datafiles = ['X']
        else:
            datafiles = np.array(['X{0}'.format(i + 1) for i in range(L)], dtype=str)

    if params is None:
        params = {}

    if replicatesIDs is None:
        # Every sample is its own replicate group
        replicatesIDsloc = [np.array([ii for ii in range(x.shape[1])]) for x in Xloc]
    else:
        replicatesIDsloc = ds.listofarrays2arrayofarrays(replicatesIDs)
        replicatesIDsloc = [np.array(x) for x in replicatesIDsloc]

    if flipSamples is None:
        flipSamplesloc = None
    else:
        flipSamplesloc = ds.listofarrays2arrayofarrays(flipSamples)
        flipSamplesloc = [np.array(x) for x in flipSamplesloc]

    # One list of normalisation codes per dataset.
    # NOTE(review): the original author flagged this branching for revision.
    if not isinstance(normalise, seqtypes):
        normaliseloc = [[normalise] for _ in range(L)]
    else:
        normaliseloc = [nor if isinstance(nor, seqtypes) else [nor] for nor in normalise]
    normaliseloc = ds.listofarrays2arrayofarrays(normaliseloc)

    # Get rid of nans by fixing.
    # NOTE(review): Xproc is the same object as Xloc, so the elements of Xloc
    # are replaced in place; whether that also touches the caller's X depends
    # on ds.listofarrays2arrayofarrays - confirm.
    Xproc = Xloc
    for l in range(L):
        Xproc[l] = fixnans(Xproc[l])

    # Prepare applied_norm dictionary before any normalisation takes place
    applied_norm = collec.OrderedDict(zip(datafiles, deepcopy(normaliseloc)))

    # Tell the user if any automatic normalisation is taking place
    has1000 = [1000 in normaliseloc[l] for l in range(L)]
    allare1000 = all(has1000)
    anyis1000 = any(has1000)
    if allare1000:
        io.log(' - Automatic normalisation mode (default in v1.7.0+).')
        io.log(' Clust automatically normalises your dataset(s).')
        io.log(' To switch it off, use the `-n 0` option (not recommended).')
        io.log(' Check https://github.com/BaselAbujamous/clust for details.')
    elif anyis1000:
        io.log(' - Some datasets are not assigned normalisation codes in the provided')
        io.log(' normalisation file. Clust automatically identifies and applies the')
        io.log(' most suitable normalisation to them (default in v1.7.0+).')
        # Previously two adjacent literals ('... don' 't want ...') that
        # concatenated to "dont"; the apostrophe is restored here.
        io.log(" If you don't want clust to normalise them, assign each of them a")
        io.log(' normalisation code of 0 in the normalisation file.')
        io.log(' Check https://github.com/BaselAbujamous/clust for details.')

    # Quantile normalisation (code 101) runs before replicates are combined
    for l in range(L):
        if 101 in normaliseloc[l] or 1000 in normaliseloc[l]:
            Xproc[l] = normaliseSampleFeatureMat(Xproc[l], 101)[0]
            if 101 in normaliseloc[l]:
                # Zero the first 101 so the later normalisation pass does not repeat it
                i = np.argwhere(np.array(normaliseloc[l]) == 101)
                i = i[0][0]
                normaliseloc[l][i] = 0

    # Combine replicates and sort out flipped samples
    Xproc = combineReplicates(Xproc, replicatesIDsloc, flipSamplesloc)

    # Filter genes not exceeding the threshold
    (Xproc, GDMnew, Iincluded) = filterlowgenes(Xproc, GDM, expressionValueThreshold, replacementVal,
                                                atleastinconditions, atleastindatasets, absvalue,
                                                usereplacementval, filteringtype)

    # Normalise
    for l in range(L):
        (Xproc[l], codes) = normaliseSampleFeatureMat(Xproc[l], normaliseloc[l])
        if np.all(codes == normaliseloc[l]):
            # Codes were applied as requested: report the codes recorded before any normalisation
            applied_norm[datafiles[l]] = op.arraytostring(applied_norm[datafiles[l]], delim=' ',
                                                          openbrac='', closebrac='')
        else:
            # Different codes came back: report the ones that were actually returned
            applied_norm[datafiles[l]] = op.arraytostring(codes, delim=' ', openbrac='', closebrac='')

    if filterflat:
        io.log(' - Flat expression profiles filtered out (default in v1.7.0+).')
        io.log(' To switch it off, use the --no-fil-flat option (not recommended).')
        io.log(' Check https://github.com/BaselAbujamous/clust for details.')
        (Xproc, GDMnew, Iincluded) = filterFlat(Xproc, GDMnew, Iincluded)

    # Prepare params for the output
    params = dict(
        params, **{
            'normalise': normaliseloc,
            'replicatesIDs': replicatesIDs,
            'flipSamples': flipSamplesloc,
            'L': L
        })

    return Xproc, GDMnew, Iincluded, params, applied_norm
def mnplotsgreedy(X, B, type='A', params=None, allMSE=None, tightnessweight=1, setsP=None, setsN=None,
                  Xtype='data', mseCache=None, wsets=None, GDM=None, msesummary='average',
                  percentageOfClustersKept=100, smallestClusterSize=11, Xnames=None, ncores=1):
    """Greedily select non-overlapping clusters using M-N plot distances.

    All clusters of all partitions in ``B`` are pooled into one matrix. Each
    cluster gets a point (scaled MSE times ``tightnessweight``, scaled log10
    of its size) and a Euclidean distance from that point to ``[0, 1]``.
    Clusters are then picked in order of increasing distance; every pick
    removes the candidates that share at least one member with it.

    :param X: datasets; converted with ``ds.listofarrays2arrayofarrays``.
    :param B: partitions; converted with
        ``ds.reduceToArrayOfNDArraysAsObjects(B, 2)``.
    :param type: 'A' (weighted MSE over all datasets) or 'B' (MSE over the
        ``setsP`` datasets minus MSE over the ``setsN`` datasets).
    :param params: dictionary extended with the parameters used here.
    :param allMSE: pre-computed per-cluster MSE summary; computed if None.
    :param tightnessweight: multiplier of the MSE axis.
    :param setsP: indices of the P datasets (default: first half).
    :param setsN: indices of the N datasets (default: second half).
    :param Xtype: only 'data' is implemented; 'files' raises.
    :param mseCache: pre-computed (clusters x datasets) MSE values.
    :param wsets: dataset weights (default: all ones).
    :param GDM: gene-dataset boolean matrix (default: all True).
    :param msesummary: 'average'/'mean' or 'worse'/'max'.
    :param percentageOfClustersKept: percentile of candidates considered; it
        is rescaled by the fraction of clusters that are not too small.
    :param smallestClusterSize: clusters with fewer members are excluded.
    :param Xnames: dataset names (default ``X0, X1, ...``).
    :param ncores: ``n_jobs`` for joblib's ``Parallel``.
    :return: namedtuple ``MNResults(B, I, allVecs, allDists, allMSE,
        mseCache, Ball, params)``; all per-cluster fields are sorted by
        distance.
    :raises NotImplementedError: if ``Xtype`` is 'files'.
    :raises ValueError: for an invalid ``Xtype``, ``type`` or ``msesummary``.
    """
    Xloc = ds.listofarrays2arrayofarrays(X)
    Bloc = ds.reduceToArrayOfNDArraysAsObjects(B, 2)
    L = Xloc.shape[0]  # Number of datasets

    # Fix parameters
    if params is None:
        params = {}
    if setsP is None:
        setsP = [x for x in range(int(math.floor(L / 2)))]
    if setsN is None:
        setsN = [x for x in range(int(math.floor(L / 2)), L)]
    setsPN = np.array(np.concatenate((setsP, setsN), axis=0), dtype=int)
    # From here on only the datasets listed in setsP/setsN are used, in that order
    Xloc = Xloc[setsPN]
    L = Xloc.shape[0]
    if wsets is None:
        wsets = np.array([1 for x in range(L)])
    if GDM is None:
        Ng = np.shape(Xloc[0])[0]
        GDMloc = np.ones([Ng, L], dtype='bool')
    else:
        Ng = np.shape(GDM)[0]
        GDMloc = GDM[:, setsPN]
    if Xnames is None:
        Xnames = ['X{0}'.format(l) for l in range(L)]

    # Put all clusters in one matrix
    N = Bloc.shape[0]  # Number of partitions
    K = [Bloc[i].shape[1] for i in range(N)]  # Number of clusters in each partition (not used below)

    # One big matrix for all clusters
    BB = Bloc[0]
    for n in range(1, N):
        BB = np.append(BB, Bloc[n], axis=1)
    VMc = np.sum(BB, axis=0)  # Number of members of each cluster
    NN = len(VMc)  # Total number of clusters

    # Fill Vmse if not provided
    if mseCache is None and allMSE is None:
        # Cache all mse values
        mseCache = np.zeros([NN, L])
        io.resetparallelprogress(NN * L)

        for l in range(L):
            if Xtype == 'files':
                # load files here
                raise NotImplementedError(
                    'Xtype "files" has not been implemented yet.')
            elif Xtype == 'data':
                Xtmp = Xloc[l]
            else:
                raise ValueError(
                    'Xtype has to be "files" or "data". The given Xtype is invalid.'
                )

            # One mseclusters call per cluster; only element [0] of each result is kept
            with warnings.catch_warnings():
                warnings.simplefilter("ignore")
                mseCachetmp = Parallel(n_jobs=ncores)\
                    (delayed(mseclusters)
                     (Xtmp, ds.matlablike_index2D(BB, GDMloc[:, l], nn), 0) for nn in range(NN))
                mseCachetmp = [mm[0] for mm in mseCachetmp]
                for nn in range(NN):
                    mseCache[nn, l] = mseCachetmp[nn]

                gc.collect()
                #io.updateparallelprogress(NN)

            # The string below is the former serial implementation, kept as a no-op expression
            ''' for nn in range(NN): mseCache[nn, l] = mseclusters(Xtmp, ds.matlablike_index2D(BB, GDMloc[:, l], nn), 0)[0] io.log('Done cluster evaluation for {0} have been calculated.'.format(Xnames[l])) '''

    # Calculate allMSE if needed (Nx1)
    if allMSE is None:
        if type == 'A':
            # NOTE(review): wsetsloc (the normalised weights) is computed but never
            # used; the un-normalised wsets is what is applied below - confirm intent.
            wsetsloc = wsets[setsPN]
            wsetsloc = [float(n) / sum(wsetsloc) for n in wsetsloc]
            if msesummary == 'average' or msesummary == 'mean':
                allMSE = np.dot(mseCache[:, setsPN], wsets)
            elif msesummary == 'worse' or msesummary == 'max':
                allMSE = np.max(np.multiply(mseCache[:, setsPN], wsets), axis=1)
            else:
                # NOTE(review): two arguments are passed, so the exception's args is a 2-tuple
                raise ValueError(
                    'msesummary value has to be "average", "mean", "worse", or "max".',
                    ' "average and "mean" behave similarly, and "worse" and "max" behave similarly.'
                )
        elif type == 'B':
            wsetsP = wsets[setsP]
            wsetsP = [n / sum(wsetsP) for n in wsetsP]
            wsetsN = wsets[setsN]
            wsetsN = [n / sum(wsetsN) for n in wsetsN]
            if msesummary == 'average' or msesummary == 'mean':
                allMSE = np.dot(mseCache[:, setsP], wsetsP) - np.dot(
                    mseCache[:, setsN], wsetsN)
            elif msesummary == 'worse' or msesummary == 'max':
                allMSE = np.max(np.multiply(mseCache[:, setsP], wsetsP), axis=1) \
                         - np.max(np.multiply(mseCache[:, setsN], wsetsN), axis=1)
            else:
                raise ValueError(
                    'msesummary value has to be "average", "mean", "worse", or "max".',
                    ' "average and "mean" behave similarly, and "worse" and "max" behave similarly.'
                )
        else:
            raise ValueError(
                'Type should be either A or B; given type is invalid.')

    # Find the distances
    # Both axes are rescaled: x by the observed MSE range, y by log10 of the largest cluster size
    maxx = np.max(allMSE[~np.isnan(allMSE)])
    minx = np.min(allMSE[~np.isnan(allMSE)])
    maxy = np.log10(np.max(VMc))
    miny = 0
    with np.errstate(divide='ignore'):
        allVecs = np.concatenate(
            ([(allMSE - minx) / (maxx - minx)],
             [(np.log10(VMc) - miny) / (maxy - miny)]),
            axis=0).transpose()
    allVecs[:, 0] *= tightnessweight
    # Points containing NaN get the fixed distance sqrt(1.1 + tightnessweight^2)
    allDists = np.array([
        np.sqrt(1.1 + np.power(tightnessweight, 2))
        if np.any(np.isnan(n)) else sp.spatial.distance.euclidean(n, [0, 1])
        for n in allVecs
    ])
    # Break ties: add small random noise to duplicated distances until all values are unique
    alpha = 0.0001
    tmp, uVdsI = np.unique(allDists, return_index=True)
    while len(uVdsI) != len(allDists):
        for n in range(len(allDists)):
            if n not in uVdsI:
                allDists[n] += alpha * sp.random.normal()
        tmp, uVdsI = np.unique(allDists, return_index=True)

    # Helper function for greedy solution below
    def mngreedy(Bloc, I, Vds, iter=float('inf')):
        # Recursively pick the candidate (True in I) with the smallest distance,
        # then clear from I every cluster sharing a member with it. I is modified in place.
        Vdsloc = np.array(Vds)
        res = np.array([False for n in Vdsloc])
        if iter == 0 or not any(I):
            return res
        for n in range(len(I)):
            if not I[n]:
                Vdsloc[n] = float('inf')
        p = np.argmin(Vdsloc)
        res[p] = True
        #II = I
        # Column p dotted with every column: > 0 where a cluster overlaps cluster p (including p itself)
        overlaps = np.dot(
            ds.matlablike_index2D(Bloc, 'all', p).transpose(), Bloc) > 0
        I &= ~overlaps
        return res | mngreedy(Bloc, I, Vdsloc, iter - 1)

    # ** Find greedy solution **
    # Sort clusters based on distances (not important, but benefits the output)
    II = np.argsort(allDists)
    allDists = allDists[II]
    BB = ds.matlablike_index2D(BB, 'a', II)
    allVecs = ds.matlablike_index2D(allVecs, II, 'a')
    allMSE = allMSE[II]
    mseCache = ds.matlablike_index2D(mseCache, II, 'a')
    VMc = VMc[II]

    # include the top XX% of the clusters that have at least smallestClusterSize
    Ismall = VMc < smallestClusterSize
    Inans = np.isnan(allDists)
    # Small or NaN clusters are given the maximum distance for the percentile computation
    tmpDists = [
        np.max(allDists) if Inans[n] | Ismall[n] else allDists[n]
        for n in range(len(allDists))
    ]
    percentageOfClustersKept *= float(np.sum(~Ismall)) / len(allDists)
    Iincluded = (tmpDists <= np.percentile(
        tmpDists, percentageOfClustersKept)) & (np.bitwise_not(Ismall))
    I = mngreedy(BB, Iincluded, allDists)
    B_out = ds.matlablike_index2D(BB, 'a', I)

    # Prepare and return the results:
    params = dict(
        params, **{
            'tightnessweight': tightnessweight,
            'msesummary': msesummary,
            'percentageofclusterskept': percentageOfClustersKept,
            'smallestclustersize': smallestClusterSize
        })

    MNResults = collections.namedtuple('MNResults', [
        'B', 'I', 'allVecs', 'allDists', 'allMSE', 'mseCache', 'Ball', 'params'
    ])
    return MNResults(B_out, I, allVecs, allDists, allMSE, mseCache, BB, params)
def uncles(X, type='A', Ks=[n for n in range(2, 21)], params=None, methods=None, methodsDetailed=None, U=None,
           Utype='PM', relabel_technique='minmin', setsP=None, setsN=None, dofuzzystretch=False, wsets=None,
           wmethods=None, GDM=None, smallestClusterSize=11, CoPaMfinetrials=1, CoPaMfinaltrials=1,
           binarise_techniqueP='DTB', binarise_paramP=np.arange(0.0, 1.1, 0.1, dtype='float'),
           binarise_techniqueN='DTB',
           binarise_paramN=np.concatenate(([sys.float_info.epsilon], np.arange(0.1, 1.1, 0.1, dtype='float'))),
           Xnames=None, deterministic=False, ncores=1):
    """Cluster several datasets and combine the results into consensus partitions.

    For every K in ``Ks`` each dataset is clustered by every base method
    (``clustDataset``), the per-dataset results are merged into a "fine"
    consensus partition matrix (CoPaM), the fine CoPaMs are merged across
    datasets, and the final CoPaMs are binarised at each binarisation
    parameter.

    :param X: datasets; converted with ``ds.listofarrays2arrayofarrays``.
    :param type: 'A' (one consensus over all datasets) or 'B' (members of the
        P-consensus clusters that are in no N-consensus cluster).
    :param Ks: numbers of clusters to try.
    :param params: dictionary extended with the parameters used here.
    :param methods: base clustering methods; a default set is chosen when
        None. 'HC' entries are dropped when the largest dataset has more than
        ``maxgenesinsetforpdist`` rows.
    :param methodsDetailed: per-dataset methods (default: ``methods`` for
        every dataset).
    :param U: pre-computed base partitions; clustering is skipped if given.
    :param Utype: 'PM' or 'idx' (case-insensitive); forced to 'PM' when ``U``
        is None.
    :param relabel_technique: forwarded to the CoPaM generators.
    :param setsP: indices of the P datasets (default: first half).
    :param setsN: indices of the N datasets (default: second half).
    :param dofuzzystretch: apply ``fuzzystretch`` to each fine CoPaM.
    :param wsets: dataset weights (default: all ones).
    :param wmethods: method weights; None (all ones), one flat list shared by
        all datasets, or one list per dataset.
    :param GDM: gene-dataset boolean matrix (default: all True).
    :param smallestClusterSize: forwarded to ``sortclusters``.
    :param CoPaMfinetrials: number of fine CoPaM trials per dataset and K.
    :param CoPaMfinaltrials: number of final CoPaM trials per K.
    :param binarise_techniqueP: binarisation technique for P (and type 'A').
    :param binarise_paramP: binarisation parameters for P (and type 'A').
    :param binarise_techniqueN: binarisation technique for N.
    :param binarise_paramN: binarisation parameters for N.
    :param Xnames: dataset names (default ``X0, X1, ...``).
    :param deterministic: if True, 'SOMs' is left out of the default methods.
    :param ncores: ``n_jobs`` for joblib's ``Parallel``.
    :return: namedtuple ``UnclesRes(B, Mc, params, X, U)``.
    :raises ValueError: for an invalid ``type`` or ``Utype``.
    """
    Xloc = ds.listofarrays2arrayofarrays(X)
    L = len(Xloc)  # Number of datasets

    # Fix parameters
    if params is None:
        params = {}
    if setsP is None:
        setsP = [x for x in range(int(math.floor(L / 2)))]
    if setsN is None:
        setsN = [x for x in range(int(math.floor(L / 2)), L)]
    setsPN = np.array(np.concatenate((setsP, setsN), axis=0), dtype=int)
    # From here on only the datasets listed in setsP/setsN are used, in that order
    Xloc = Xloc[setsPN]
    L = np.shape(Xloc)[0]  # Number of datasets
    if wsets is None:
        wsets = np.array([1 for x in range(L)])
    else:
        wsets = np.array(wsets)[setsPN]
    if GDM is None:
        Ng = np.shape(Xloc[0])[0]
        GDMloc = np.ones([Ng, L], dtype='bool')
    else:
        GDMloc = GDM[:, setsPN]
        Ng = GDMloc.shape[0]
    if Xnames is None:
        Xnames = ['X{0}'.format(l) for l in range(L)]

    # HC is only used while the largest dataset is small enough
    largest_DS = np.max([x.shape[0] for x in Xloc])
    if methods is None:
        if largest_DS <= maxgenesinsetforpdist:
            if deterministic:
                methods = [['k-means'], ['HC']]
            else:
                methods = [['k-means'], ['SOMs'], ['HC']]
        else:
            if deterministic:
                methods = [['k-means']]
            else:
                methods = [['k-means'], ['SOMs']]
    else:
        if largest_DS > maxgenesinsetforpdist:
            methods = [
                m for m in methods
                if 'hc' not in [entry.lower() for entry in m]
            ]
            if not methods:
                io.log('No valid base clustering can be used. Please note that clust would not use HC clustering ' \
                       'on datasets with more than {0} genes. You have a dataset with {1} genes.' \
                       ''.format(maxgenesinsetforpdist, largest_DS))
                io.log('Clust will terminate here.')
                io.log(op.bottomline(), addextrastick=False)
                sys.exit()
    if methodsDetailed is None:
        methodsDetailedloc = np.array([methods for l in range(L)])
    else:
        methodsDetailedloc = methodsDetailed[setsPN]
    if wmethods is None:
        wmethods = [[1 for x in m] for m in methodsDetailedloc]
    elif not isinstance(wmethods[0], (list, tuple, np.ndarray)):
        # One flat list of weights: repeat it for every dataset. (This used to
        # tile `methods`, which turned the weights into method names.)
        wmethods = np.tile(wmethods, [L, 1])
    else:
        wmethods = np.array(wmethods)[setsPN]

    # Positions of the P and N datasets inside the re-ordered Xloc
    setsPloc = [ii for ii in range(len(setsP))]
    if L > len(setsPloc):
        setsNloc = [ii for ii in range(len(setsPloc), L)]
    else:
        # No N datasets: keep the name defined so that building `params`
        # below does not fail with a NameError.
        setsNloc = []

    Ds = [nu.closest_to_square_factors(k) for k in Ks]  # Grid sizes for the SOMs method for each value of K
    NKs = len(Ks)  # Number of K values

    # Clustering
    if U is None:
        Utype = 'PM'
        Uloc = np.array([None] * (L * NKs)).reshape([L, NKs])
        # Expected amount of work, used for progress reporting
        totalparallel = np.sum(Ks) * np.sum(
            [len(meths) for meths in methodsDetailedloc])
        for meths in methodsDetailedloc:
            for meth in meths:
                if 'k-means' in meth:
                    totalparallel += np.max(Ks) * np.max(Ks)
        io.resetparallelprogress(totalparallel)

        for l in range(L):
            # Cache kmeans initialisations for the dataset once to save time:
            cl.cache_kmeans_init(Xloc[l], Ks, methodsDetailedloc[l], datasetID=l)

            # Now go to parallel clustering
            with warnings.catch_warnings():
                warnings.simplefilter("ignore")
                Utmp = Parallel(n_jobs=ncores)(
                    delayed(clustDataset)(Xloc[l], Ks[ki], Ds[ki], methodsDetailedloc[l], GDMloc[:, l], Ng, l)
                    for ki in range(NKs))
                Utmp = [u for u in Utmp]
                for ki in range(NKs):
                    Uloc[l, ki] = Utmp[ki]

                gc.collect()
    else:
        Uloc = ds.listofarrays2arrayofarrays(U)[setsPN]

    # Calculate a CoPaM for each dataset at each K
    CoPaMsFine = np.array([None] * (L * NKs)).reshape([L, NKs])
    for l in range(L):
        for ki in range(NKs):
            if Utype.lower() == 'pm':
                CoPaMsFineTmp = [
                    generateCoPaM(Uloc[l, ki], relabel_technique=relabel_technique, X=[Xloc[l]],
                                  w=wmethods[l], K=Ks[ki], GDM=GDMloc[:, l].reshape([-1, 1]))
                    for i in range(CoPaMfinetrials)
                ]
            elif Utype.lower() == 'idx':
                CoPaMsFineTmp = \
                    [generateCoPaMfromidx(Uloc[l, ki], relabel_technique=relabel_technique, X=Xloc,
                                          w=wmethods[l], K=Ks[ki])
                     for i in range(CoPaMfinetrials)]
            else:
                raise ValueError('Invalid Utype')
            # Merge the trials into one fine CoPaM for this dataset and K
            CoPaMsFine[l, ki] = generateCoPaM(CoPaMsFineTmp, relabel_technique=relabel_technique,
                                              X=[Xloc[l]], GDM=GDMloc[:, l].reshape([-1, 1]))

            if dofuzzystretch:
                CoPaMsFine[l, ki] = fuzzystretch(CoPaMsFine[l, ki])

    # Calculate the final CoPaM for each K
    CoPaMs = np.array([None] * (CoPaMfinaltrials * NKs)).reshape(
        [CoPaMfinaltrials, NKs])
    CoPaMsP = np.array([None] * (CoPaMfinaltrials * NKs)).reshape(
        [CoPaMfinaltrials, NKs])
    CoPaMsN = np.array([None] * (CoPaMfinaltrials * NKs)).reshape(
        [CoPaMfinaltrials, NKs])
    for t in range(CoPaMfinaltrials):
        for ki in range(NKs):
            if type == 'A':
                if Utype.lower() == 'pm':
                    CoPaMs[t, ki] = generateCoPaM(
                        CoPaMsFine[:, ki], relabel_technique=relabel_technique, w=wsets, X=Xloc, GDM=GDMloc)
                elif Utype.lower() == 'idx':
                    CoPaMs[t, ki] = generateCoPaMfromidx(
                        CoPaMsFine[:, ki], relabel_technique=relabel_technique, X=Xloc, w=wsets, GDM=GDMloc)
                else:
                    raise ValueError('Invalid Utype')
            elif type == 'B':
                if Utype.lower() == 'pm':
                    CoPaMsP[t, ki] = generateCoPaM(
                        CoPaMsFine[setsPloc, ki], relabel_technique=relabel_technique, X=Xloc,
                        w=wsets[setsPloc], GDM=GDMloc[:, setsPloc])
                    CoPaMsN[t, ki] = generateCoPaM(
                        CoPaMsFine[setsNloc, ki], relabel_technique=relabel_technique, X=Xloc,
                        w=wsets[setsNloc], GDM=GDMloc[:, setsNloc])
                elif Utype.lower() == 'idx':
                    CoPaMsP[t, ki] = generateCoPaMfromidx(
                        CoPaMsFine[setsPloc, ki], relabel_technique=relabel_technique, X=Xloc,
                        w=wsets[setsPloc], GDM=GDMloc[:, setsPloc])
                    CoPaMsN[t, ki] = generateCoPaMfromidx(
                        CoPaMsFine[setsNloc, ki], relabel_technique=relabel_technique, X=Xloc,
                        w=wsets[setsNloc], GDM=GDMloc[:, setsNloc])
                else:
                    raise ValueError('Invalid Utype')
            else:
                raise ValueError(
                    'Invalid UNCLES type. It has to be either A or B')

    # Binarise
    NPp = len(binarise_paramP)  # Number of P params
    NNp = len(binarise_paramN)  # Number of N params
    if type == 'A':
        B = np.zeros([CoPaMfinaltrials, NPp, 1, NKs], dtype=object)
        Mc = np.zeros([CoPaMfinaltrials, NKs], dtype=object)
    elif type == 'B':
        B = np.zeros([CoPaMfinaltrials, NPp, NNp, NKs], dtype=object)
        Mc = np.zeros([CoPaMfinaltrials, NKs], dtype=object)

    for t in range(CoPaMfinaltrials):
        for ki in range(NKs):
            if type == 'A':
                # Pre-sorting binarisation
                for p in range(NPp):
                    B[t, p, 0, ki] = binarise(CoPaMs[t, ki], binarise_techniqueP, binarise_paramP[p])
                Mc[t, ki] = [np.sum(Bp, axis=0) for Bp in B[t, :, 0, ki]]

                # Sorting
                CoPaMs[t, ki] = sortclusters(CoPaMs[t, ki], Mc[t, ki], smallestClusterSize)

                # Post-sorting binarisation
                for p in range(NPp):
                    B[t, p, 0, ki] = binarise(CoPaMs[t, ki], binarise_techniqueP, binarise_paramP[p])
                Mc[t, ki] = [np.sum(Bp, axis=0) for Bp in B[t, :, 0, ki]]
            elif type == 'B':
                # Pre-sorting binarisation
                BP = [
                    binarise(CoPaMsP[t, ki], binarise_techniqueP, binarise_paramP[p])
                    for p in range(NPp)
                ]
                McP = [np.sum(BPp, axis=0) for BPp in BP]

                BN = [
                    binarise(CoPaMsN[t, ki], binarise_techniqueN, binarise_paramN[p])
                    for p in range(NNp)
                ]
                McN = [np.sum(BNp, axis=0) for BNp in BN]

                # Sorting
                CoPaMsP[t, ki] = sortclusters(CoPaMsP[t, ki], McP, smallestClusterSize)
                CoPaMsN[t, ki] = sortclusters(CoPaMsN[t, ki], McN, smallestClusterSize)

                # Post-sorting binarisation
                BP = [
                    binarise(CoPaMsP[t, ki], binarise_techniqueP, binarise_paramP[p])
                    for p in range(NPp)
                ]
                BN = [
                    binarise(CoPaMsN[t, ki], binarise_techniqueN, binarise_paramN[p])
                    for p in range(NNp)
                ]

                # UNCLES B logic: P-cluster members that belong to no N-cluster
                for pp in range(NPp):
                    for pn in range(NNp):
                        # Work on a copy: storing BP[pp] itself made every pn share
                        # one array, so the rows cleared for one N threshold leaked
                        # into the results of all the other N thresholds.
                        B[t, pp, pn, ki] = np.array(BP[pp])
                        B[t, pp, pn, ki][np.any(BN[pn], axis=1)] = False

                # Fill Mc: for each cluster, a (P params) x (N params) table of cluster sizes
                Mc[t, ki] = [None] * Ks[ki]
                for k in range(Ks[ki]):
                    Mc[t, ki][k] = np.zeros([NPp, NNp])
                    for pp in range(NPp):
                        for pn in range(NNp):
                            Mc[t, ki][k][pp, pn] = np.sum(B[t, pp, pn, ki][:, k])

    # Prepare and return the results:
    params = dict(
        params, **{
            'methods': methods,
            'setsP': setsPloc,
            'setsN': setsNloc,
            'dofuzzystretch': dofuzzystretch,
            'type': type,
            'Ks': Ks,
            'NKs': NKs,
            'wsets': wsets,
            'wmethods': wmethods,
            'Ds': Ds,
            'L': L,
            'CoPaMs': CoPaMs,
            'smallestclustersize': smallestClusterSize,
            'GDM': GDMloc
        })

    UnclesRes = collections.namedtuple('UnclesRes', ['B', 'Mc', 'params', 'X', 'U'])
    return UnclesRes(B, Mc, params, Xloc, Uloc)
def generateCoPaM(U, relabel_technique='minmin', w=None, X=None, distCriterion='direct_euc', K=0, GDM=None):
    """Merge several partitions into one consensus partition matrix (CoPaM).

    The partitions are ordered by increasing fuzzy MSE
    (``mn.mseclustersfuzzy``). The first one seeds the CoPaM; each following
    partition is relabelled against the current CoPaM (``relabelClusts``) and
    folded in as a weighted running average. ``GDM`` restricts, per
    partition, which rows take part in the average. Entries of ``U`` that are
    themselves collections of partitions are merged recursively first.

    :param U: partitions; converted with ``ds.listofarrays2arrayofarrays``.
    :param relabel_technique: forwarded to ``relabelClusts`` as ``method``.
    :param w: partition weights. None, 'all' or 'equal' give equal weights; a
        single value is repeated for every partition; nested weights are
        reduced to their means.
    :param X: data forwarded to the MSE computation and to ``relabelClusts``.
    :param distCriterion: forwarded to ``relabelClusts``.
    :param K: only forwarded to recursive calls; the number of clusters is
        taken from the first partition.
    :param GDM: boolean (rows x partitions) matrix; a single column is
        repeated for all partitions; None means all True.
    :return: the CoPaM as a float array (rows x clusters).
    :raises ValueError: if a partition has a different number of clusters
        than the first one.
    """
    seqtypes = (list, tuple, np.ndarray)

    # Helping functions
    def calwmeans(w):
        # Reduce (possibly nested) weights to one mean weight per top-level entry
        wm = [
            np.mean(calwmeans(ww)) if isinstance(ww, seqtypes) else np.mean(ww)
            for ww in w
        ]
        return np.array(wm)

    def orderpartitions(U, method='rand', X=None, GDM=None):
        # Return (order, sorted scores); the scores are None for the random order
        if method == 'rand':
            return np.random.permutation(range(len(U))), None
        elif method == 'mn':
            # TODO: Implement ranking partitions based on M-N plots
            raise NotImplementedError(
                'Ranking partitions based on the M-N plots logic has not been implemented yet.'
            )
        elif method == 'mse':
            R = len(U)
            mses = np.zeros(R)
            for r in range(R):
                if isinstance(U[r][0][0], seqtypes):
                    # Nested collection of partitions: score it by the mean of its members' scores
                    mses[r] = np.mean(
                        orderpartitions(U[r], method=method, X=X, GDM=GDM)[1])
                else:
                    mses[r] = np.mean([
                        mn.mseclustersfuzzy(X, U[r], donormalise=False, GDM=GDM)
                    ])
            order = np.argsort(mses)
            return order, mses[order]
        # An unknown method used to fall through and return None silently
        raise ValueError('Invalid partitions ordering method: {0}'.format(method))

    # Fix parameters
    Uloc = ds.listofarrays2arrayofarrays(U)
    R = len(Uloc)
    if GDM is None:
        GDMloc = np.ones([Uloc[0].shape[0], R], dtype=bool)
    elif GDM.shape[1] == 1 and R > 1:
        # A single column applies to every partition
        GDMloc = np.tile(GDM, [1, R])
    else:
        GDMloc = np.array(GDM)

    # `w is str` compared w with the type object and was never True, so the
    # documented string shortcuts could not select equal weights.
    if w is None or (isinstance(w, str) and w in ['all', 'equal']):
        w = np.ones(R)
    elif ds.numel(w) == 1:
        w = np.array([w for i in range(R)])
    wmeans = calwmeans(w)

    # Work!
    # Order the partitions by MSE; the same permutation is applied to GDMloc and wmeans
    if GDM is None:
        permR = orderpartitions(Uloc, method='mse', X=X, GDM=None)[0]
    else:
        permR = orderpartitions(Uloc, method='mse', X=X, GDM=GDMloc)[0]
    Uloc = Uloc[permR]
    if GDMloc.shape[1] > 1:
        GDMloc = GDMloc[:, permR]
    wmeans = wmeans[permR]

    # NOTE(review): in the recursive calls w[0] / w[r] index the weights in
    # their original order although Uloc was re-ordered by permR - confirm.
    if isinstance(Uloc[0][0][0], seqtypes):
        Uloc[0] = generateCoPaM(Uloc[0], relabel_technique=relabel_technique, w=w[0], X=X,
                                distCriterion=distCriterion, K=K, GDM=GDMloc)
    CoPaM = np.array(Uloc[0], dtype=float)
    K = CoPaM.shape[1]
    for r in range(1, R):
        if isinstance(Uloc[r][0][0], seqtypes):
            Uloc[r] = generateCoPaM(Uloc[r], relabel_technique=relabel_technique, w=w[r], X=X,
                                    distCriterion=distCriterion, K=K, GDM=GDMloc)
        if Uloc[r].shape[1] != K:
            raise ValueError(
                'Inequal numbers of clusters in the partition {}.'.format(r))

        Uloc[r] = relabelClusts(CoPaM, Uloc[r], method=relabel_technique, X=X,
                                distCriterion=distCriterion)

        # Weighted running average: undo the previous normalisation
        # (multiply by the accumulated weight), add the new partition, then
        # divide by the new accumulated weight.
        dotprod = np.dot(GDMloc[:, 0:r], wmeans[0:r].transpose())  # (Mxr) * (rx1) = (Mx1)
        CoPaM[dotprod > 0] = nu.multiplyaxis(CoPaM[dotprod > 0], dotprod[dotprod > 0], axis=1)
        CoPaM[dotprod > 0] += wmeans[r] * Uloc[r][dotprod > 0]
        dotprod = np.dot(GDMloc[:, 0:(r + 1)], wmeans[0:(r + 1)].transpose())
        CoPaM[dotprod > 0] = nu.divideaxis(CoPaM[dotprod > 0], dotprod[dotprod > 0], axis=1)

    return CoPaM
def correcterrors_withinworse(B, X, GDM, falsepositivestrimmed=0.01):
    """Re-assign genes to clusters using the worst in-cluster deviations.

    Per dataset, the median profile of every cluster is computed and the
    absolute deviations of all cluster members from their cluster median are
    pooled. All-zero rows are removed, every column is sorted in ascending
    order and the last ``int(falsepositivestrimmed * rows)`` rows (the
    largest deviations) are dropped. A gene then belongs to a cluster when,
    in every dataset and every dimension, its absolute deviation from the
    cluster median does not exceed the largest remaining pooled deviation of
    that dimension. A gene that ends up in several clusters is given to the
    one whose largest distance over the datasets containing the gene is
    smallest.

    :param B: boolean membership matrix (genes x clusters).
    :param X: datasets; converted with ``ds.listofarrays2arrayofarrays``.
    :param GDM: boolean (genes x datasets) matrix marking the genes present
        in each dataset.
    :param falsepositivestrimmed: fraction of the largest pooled deviations
        to discard.
    :return: boolean membership matrix (genes x clusters).
    """
    Bloc = np.array(B)
    Xloc = ds.listofarrays2arrayofarrays(X)
    [Ng, K] = Bloc.shape  # Ng genes and K clusters
    L = Xloc.shape[0]  # L datasets

    # Cluster medians (Cmeans) and pooled absolute shifted cluster genes (SCG)
    Cmeans = np.array([None] * L, dtype=object)
    SCG = np.array([None] * L, dtype=object)
    for l in range(L):
        data = Xloc[l]
        ndims = data.shape[1]
        present = GDM[:, l]

        Cmeans[l] = np.zeros([K, ndims])  # K clusters x D dimensions
        # M* genes x D dimensions, where M* counts the genes found in any cluster
        SCG[l] = np.zeros([np.sum(np.sum(Bloc[present, :], axis=0)), ndims])

        filled = 0
        for k in range(K):
            inclust = Bloc[present, k]
            Cmeans[l][k] = np.median(data[inclust, :], axis=0)
            nmembers = np.sum(inclust)
            shifted = nu.subtractaxis(data[inclust, :], Cmeans[l][k], axis=0)
            SCG[l][filled:(filled + nmembers), :] = np.abs(shifted)
            filled += nmembers

        # Remove the all-zero rows, then sort every column independently
        SCG[l] = SCG[l][np.any(SCG[l], axis=1)]
        SCG[l] = np.sort(SCG[l], axis=0)

        if falsepositivestrimmed > 0:
            trimmed = int(falsepositivestrimmed * SCG[l].shape[0])
            if trimmed > 0:
                # Columns are ascending, so this discards the largest deviations
                SCG[l] = SCG[l][0:-trimmed]

    # Find who belongs: Ng genes x K clusters x L datasets
    belongs = np.ones([Ng, K, L], dtype=bool)
    for l in range(L):
        for k in range(K):
            for d in range(Xloc[l].shape[1]):
                deviation = np.abs(Xloc[l][:, d] - Cmeans[l][k, d])
                # Within the worst retained deviation of this dimension?
                belongs[GDM[:, l], k, l] &= deviation <= np.max(SCG[l][:, d])

    # A gene is included in a cluster only if it belongs to it in every dataset
    B_out = np.all(belongs, axis=2)

    # Genes claimed by more than one cluster go to the cluster whose worst
    # distance to the gene, over the datasets that have the gene, is smallest
    contested = np.nonzero(np.sum(B_out, axis=1) > 1)[0]
    for fi in contested:
        ficlusts = np.nonzero(B_out[fi])[0]  # Clusters competing over gene fi
        fidatasets = np.nonzero(GDM[fi])[0]  # Datasets that have gene fi

        # (Clusters competing) x (datasets that have fi)
        localdists = np.zeros([len(ficlusts), len(fidatasets)])
        for col, ll in enumerate(fidatasets):
            row = np.sum(GDM[:fi, ll])  # Index of gene fi within Xloc[ll]
            localdists[:, col] = nu.dist_matrices(Cmeans[ll][ficlusts],
                                                  Xloc[ll][row]).reshape([len(ficlusts)])

        worst = np.max(localdists, axis=1)  # Worst distance per competing cluster
        winner = ficlusts[np.argmin(worst)]
        B_out[fi] = False
        B_out[fi, winner] = True

    return B_out
def _smallest_overlap(Cmins, Cmaxes, k1, k2):
    """Classify how the per-dimension ranges of clusters k1 and k2 overlap.

    Scans every dataset ``l`` and dimension ``d`` and returns ``(otype, l, d)``:

    * ``2``  - the two ranges are disjoint at ``(l, d)``; the clusters are
      distinct and the scan stops there.
    * ``0``  - smallest partial overlap found, where the maximum of k1 lies
      inside the range of k2 and k1 starts below k2.
    * ``1``  - smallest partial overlap found, the other way around.
    * ``-1`` - neither a partial overlap nor a separation was found
      (``l`` and ``d`` are then -1).
    """
    otype, smallest, at_l, at_d = -1, 0, -1, -1
    for l in range(len(Cmins)):
        for d in range(len(Cmins[l][k1])):
            x1 = Cmaxes[l][k1][d]
            x2 = Cmaxes[l][k2][d]
            n1 = Cmins[l][k1][d]
            n2 = Cmins[l][k2][d]

            if n2 < x1 <= x2:
                if n1 < n2:
                    ov = x1 - n2
                    if otype == -1 or ov < smallest:
                        otype, smallest, at_l, at_d = 0, ov, l, d
            elif n1 < x2 <= x1:
                if n2 < n1:
                    ov = x2 - n1
                    if otype == -1 or ov < smallest:
                        otype, smallest, at_l, at_d = 1, ov, l, d
            else:
                # Absolutely no overlap here, so k1 and k2 are distinct.
                # Return immediately so that a later overlap cannot
                # overwrite this verdict.
                return 2, l, d
    return otype, at_l, at_d


def optimise_tukey_sqrtSCG(B, X, GDM, clustdists=None, smallestClusterSize=11, tails=1, Q3s=2):
    """Rebuild cluster memberships from outlier-filtered per-cluster value ranges.

    Steps:

    1. Per dataset, compute each cluster's median profile and pool the
       absolute deviations of the genes of the "good" clusters (as selected
       by ``mnplotsdistancethreshold``) from their own cluster median.
    2. Zero the outlying deviations. With ``tails == 1`` the threshold is
       ``Q3s`` times the third quartile; with ``tails == 2`` it is the Tukey
       fence ``Q3 + 1.5 * IQR`` computed on the square roots.
    3. Give every cluster the range ``median -/+ max deviation`` per dimension.
    4. Resolve overlapping ranges pairwise. The earlier cluster keeps its
       range, the later one is shrunk, or disabled when no partial overlap
       and no separation is found.
    5. Assign each gene to the clusters whose range contains it in every
       dataset, keep only the earliest cluster for multiply-assigned genes,
       and drop clusters smaller than ``smallestClusterSize``.

    Args:
        B: Ng x K cluster membership matrix (anything ``np.array`` accepts).
            It is used as a row mask, so it is expected to be boolean.
        X: The L datasets; converted with ``ds.listofarrays2arrayofarrays``.
            ``X[l]`` must be 2-D (rows x dimensions).
        GDM: Ng x L boolean matrix. ``GDM[:, l]`` selects the rows of ``B``
            that correspond, in order, to the rows of ``X[l]``.
        clustdists: One value per cluster, passed to
            ``mnplotsdistancethreshold``. Defaults to all ones.
        smallestClusterSize: Minimum number of genes for a cluster to be kept.
        tails: 1 or 2; selects the outlier rule described above.
        Q3s: Multiplier of the third quartile when ``tails == 1``.

    Returns:
        Ng x K' boolean membership matrix (K' <= K), at most one cluster per
        gene.

    Raises:
        ValueError: If ``tails`` is neither 1 nor 2 (raised while processing
            a dataset that has a non-empty deviation pool).
    """
    Bloc = np.array(B)
    Xloc = ds.listofarrays2arrayofarrays(X)

    Ng, K = Bloc.shape  # Ng genes and K clusters
    L = Xloc.shape[0]  # L datasets

    # Cluster distances used to choose the good clusters; unity for all
    # clusters if not provided.
    if clustdists is None:
        clustdistsloc = np.ones(K)
    else:
        clustdistsloc = list(clustdists)
    Cgood = mnplotsdistancethreshold(clustdistsloc, method='largestgap')

    # Cluster medians (Cmeans) and cluster ranges (Cmins, Cmaxes) per dataset
    Cmeans = np.array([None] * L, dtype=object)
    Cmins = np.array([None] * L, dtype=object)
    Cmaxes = np.array([None] * L, dtype=object)
    for l in range(L):
        inl = GDM[:, l]  # Genes present in dataset l
        D = Xloc[l].shape[1]
        Cmeans[l] = np.zeros([K, D])  # K clusters x D dimensions

        # Absolute shifted cluster genes: M* genes x D dimensions
        # (M* are all genes in any cluster). Rows of clusters that are not in
        # Cgood stay zero and are removed below.
        SCG = np.zeros([np.sum(Bloc[inl, :]), D])
        gi = 0
        for k in range(K):
            members = Xloc[l][Bloc[inl, k], :]
            Cmeans[l][k] = np.median(members, axis=0)
            if k in Cgood:
                csize = members.shape[0]
                SCG[gi:(gi + csize), :] = np.abs(
                    nu.subtractaxis(members, Cmeans[l][k], axis=0))
                gi += csize
        SCG = SCG[np.any(SCG, axis=1)]  # Remove all-zero rows

        if SCG.size > 0:
            # Outlier values are set to zero so that they do not affect the
            # maxima taken below.
            if tails == 1:
                thresh = Q3s * np.percentile(SCG, q=75, axis=0)
                SCG[SCG > thresh] = 0.0
            elif tails == 2:
                rooted = np.sqrt(SCG)
                Q1 = np.percentile(rooted, q=25, axis=0)
                Q3 = np.percentile(rooted, q=75, axis=0)
                thresh = Q3 + 1.5 * (Q3 - Q1)
                SCG[rooted > thresh] = 0.0
            else:
                raise ValueError(
                    'Invalid number of tails. It should be either 1 or 2.')
        else:
            SCG = np.zeros((1, D))

        # Cluster range: median -/+ the largest retained deviation
        maxdev = np.max(SCG, axis=0)
        Cmins[l] = Cmeans[l] - maxdev  # K clusters x D dimensions
        Cmaxes[l] = Cmeans[l] + maxdev  # K clusters x D dimensions

    # Resolve overlaps between clusters. Each pair is compared once, and k2 is
    # always the later cluster.
    for k1 in range(K):
        for k2 in range(k1 + 1, K):
            otype, ol, od = _smallest_overlap(Cmins, Cmaxes, k1, k2)
            if otype == -1:
                # No partial overlap and no separation: effectively remove
                # the later cluster (k2) by making its minimum larger than its
                # maximum at a single point (l=0, d=0), so that no gene can
                # ever be mapped to it.
                Cmins[0][k2][0] = 1
                Cmaxes[0][k2][0] = 0
            elif otype == 0:
                # k2 must start right above the end of k1.
                # NOTE: for magnitudes of about 2 or more, adding the machine
                # epsilon does not change the value; a gene exactly on the
                # shared bound is then resolved by the earliest-cluster rule
                # below.
                Cmins[ol][k2][od] = Cmaxes[ol][k1][od] + sys.float_info.epsilon
            elif otype == 1:
                # k2 must end right below the start of k1
                Cmaxes[ol][k2][od] = Cmins[ol][k1][od] - sys.float_info.epsilon
            # otype == 2: the clusters are already distinct, nothing to do

    # Find who belongs: inside the cluster range in every dimension
    belongs = np.ones([Ng, K, L], dtype=bool)  # Ng genes x K clusters x L datasets
    for l in range(L):
        for k in range(K):
            above = nu.largerthanaxis(Xloc[l], Cmins[l][k], axis=0, orequal=True)
            below = nu.lessthanaxis(Xloc[l], Cmaxes[l][k], axis=0, orequal=True)
            belongs[GDM[:, l], k, l] = np.all(np.logical_and(above, below), axis=1)

    # Include in clusters genes which belong everywhere
    B_out = np.all(belongs, axis=2)

    # Genes included in more than one cluster stay in the earliest one
    # (smallest k). np.nonzero returns ascending indices, so it is the first.
    for fi in np.nonzero(np.sum(B_out, axis=1) > 1)[0]:
        earliest = np.nonzero(B_out[fi])[0][0]
        B_out[fi] = False
        B_out[fi, earliest] = True

    # Remove clusters smaller than the minimum cluster size
    ClusterSizes = np.sum(B_out, axis=0)
    return B_out[:, ClusterSizes >= smallestClusterSize]
def correcterrors_weighted_outliers2(B, X, GDM, clustdists=None, stds=3, smallestClusterSize=11):
    """Re-assign genes to clusters after removing weighted-outlier deviations.

    For every dataset, the median profile of each cluster is computed and the
    absolute deviations of all clustered genes from their own cluster median
    are pooled, each row weighted by its cluster's weight
    (``min(clustdists) / clustdists``). Deviations more than ``stds`` weighted
    standard deviations above the weighted mean are zeroed, and the
    per-dimension maximum of what remains becomes the tolerance. A gene
    belongs to a cluster when its deviation from that cluster's median is
    within the tolerance in every dimension of every dataset. A gene assigned
    to several clusters stays in the earliest one, and clusters smaller than
    ``smallestClusterSize`` are dropped.

    Args:
        B: Ng x K cluster membership matrix (anything ``np.array`` accepts).
            It is used as a row mask, so it is expected to be boolean.
        X: The L datasets; converted with ``ds.listofarrays2arrayofarrays``.
            ``X[l]`` must be 2-D (rows x dimensions).
        GDM: Ng x L boolean matrix. ``GDM[:, l]`` selects the rows of ``B``
            that correspond, in order, to the rows of ``X[l]``.
        clustdists: One value per cluster; smaller values give larger
            weights. Defaults to equal weights.
        stds: Outlier threshold, in weighted standard deviations above the
            weighted mean.
        smallestClusterSize: Minimum number of genes for a cluster to be kept.

    Returns:
        Ng x K' boolean membership matrix (K' <= K), at most one cluster per
        gene.
    """
    Bloc = np.array(B)
    Xloc = ds.listofarrays2arrayofarrays(X)

    Ng, K = Bloc.shape  # Ng genes and K clusters
    L = Xloc.shape[0]  # L datasets

    # Normalise clustdists to provide weights; unity for all if not provided
    if clustdists is None:
        clustweights = np.ones(K)
    else:
        clustweights = np.min(clustdists) / clustdists

    # Per dataset: cluster medians (Cmeans) and the largest tolerated absolute
    # deviation per dimension (maxdevs).
    Cmeans = np.array([None] * L, dtype=object)
    maxdevs = [None] * L
    for l in range(L):
        inl = GDM[:, l]  # Genes present in dataset l
        D = Xloc[l].shape[1]
        Cmeans[l] = np.zeros([K, D])  # K clusters x D dimensions

        # Absolute shifted cluster genes (M* genes x D dimensions) and the
        # weight of every row (M* are all genes in any cluster)
        nrows = np.sum(Bloc[inl, :])
        SCG = np.zeros([nrows, D])
        w = np.zeros(nrows)
        gi = 0
        for k in range(K):
            members = Xloc[l][Bloc[inl, k], :]
            Cmeans[l][k] = np.median(members, axis=0)
            csize = members.shape[0]
            SCG[gi:(gi + csize), :] = np.abs(
                nu.subtractaxis(members, Cmeans[l][k], axis=0))
            w[gi:(gi + csize)] = clustweights[k]
            gi += csize

        # Remove all-zero rows, together with their weights. np.average
        # requires one weight per remaining row.
        keep = np.any(SCG, axis=1)
        SCG = SCG[keep]
        w = w[keep]

        # The rows are deliberately NOT sorted column-wise here: that would
        # decouple the values from their row weights, and only per-column
        # maxima are needed afterwards.
        if SCG.shape[0] == 0:
            # Nothing left to estimate from; use a zero tolerance
            SCG = np.zeros((1, D))
        else:
            SCGmeans = np.average(SCG, weights=w, axis=0)
            # NOTE(review): the keyword is spelled `axix` as in the original
            # call; confirm that it matches st.weighted_std_axis's signature.
            SCGstds = st.weighted_std_axis(SCG, weights=w, axix=0)
            # Number of stds away from the mean
            SCGaway = nu.divideaxis(nu.subtractaxis(SCG, SCGmeans, axis=0),
                                    SCGstds, axis=0)
            # Outlier values are set to zero so that they do not affect the
            # maxima taken below.
            SCG[SCGaway > stds] = 0.0
        maxdevs[l] = np.max(SCG, axis=0)

    # Find who belongs: within the tolerance in every dimension
    belongs = np.ones([Ng, K, L], dtype=bool)  # Ng genes x K clusters x L datasets
    for l in range(L):
        inl = GDM[:, l]
        for k in range(K):
            deviation = np.abs(Xloc[l] - Cmeans[l][k])
            belongs[inl, k, l] = np.all(deviation <= maxdevs[l], axis=1)

    # Include in clusters genes which belong everywhere
    B_out = np.all(belongs, axis=2)

    # Genes included in more than one cluster stay in the earliest one
    # (smallest k). np.nonzero returns ascending indices, so it is the first.
    for fi in np.nonzero(np.sum(B_out, axis=1) > 1)[0]:
        earliest = np.nonzero(B_out[fi])[0][0]
        B_out[fi] = False
        B_out[fi, earliest] = True

    # Remove clusters smaller than the minimum cluster size
    ClusterSizes = np.sum(B_out, axis=0)
    return B_out[:, ClusterSizes >= smallestClusterSize]