def _none_grid(nrows, ncols):
    """Return an ``nrows`` x ``ncols`` array in which every cell is ``None``."""
    return np.array([None] * (nrows * ncols)).reshape([nrows, ncols])


def uncles(X, type='A', Ks=[n for n in range(4, 21, 4)], params=None,
           methods=None, methodsDetailed=None, U=None, Utype='PM',
           relabel_technique='minmin', setsP=None, setsN=None,
           dofuzzystretch=False, wsets=None, wmethods=None, GDM=None,
           smallestClusterSize=11, CoPaMfinetrials=1, CoPaMfinaltrials=1,
           binarise_techniqueP='DTB',
           binarise_paramP=np.arange(0.0, 1.1, 0.1, dtype='float'),
           binarise_techniqueN='DTB',
           binarise_paramN=np.concatenate(
               ([sys.float_info.epsilon],
                np.arange(0.1, 1.1, 0.1, dtype='float'))),
           Xnames=None, deterministic=False, ncores=1):
    """Run UNCLES (type A or B) over a collection of datasets.

    The function proceeds in four stages:

    1. cluster every selected dataset at every K (skipped when ``U`` is
       supplied),
    2. build one "fine" CoPaM per dataset and K,
    3. combine the fine CoPaMs into final CoPaM(s) per K,
    4. binarise the final CoPaMs for every binarisation parameter.

    Parameters
    ----------
    X : datasets; converted with ``ds.listofarrays2arrayofarrays``.
    type : ``'A'`` or ``'B'``. Type A builds one CoPaM from all selected
        datasets; type B builds one from ``setsP`` and one from ``setsN``
        and removes from each P partition the rows that belong to any
        cluster of the N partition.
    Ks : K values to try. Values larger than the number of genes are dropped.
    params : optional dict; its entries are carried over into the returned
        ``params`` (entries produced here take precedence).
    methods : list of base-clustering method specifications (each itself a
        list). Defaults to ``[['k-means']]``. When given and the largest
        dataset has more than ``maxgenesinsetforpdist`` rows, every entry
        containing ``'hc'`` (case-insensitive) is dropped; if nothing is
        left the process logs a message and calls ``sys.exit()``.
    methodsDetailed : optional per-dataset methods, indexed by dataset.
    U : optional precomputed partitions; when ``None`` they are computed
        here and ``Utype`` is forced to ``'PM'``.
    Utype : ``'PM'`` or ``'idx'`` (case-insensitive); selects
        ``generateCoPaM`` or ``generateCoPaMfromidx``.
    relabel_technique : forwarded to the CoPaM generators.
    setsP, setsN : indices of the datasets forming the P and N sets. By
        default the first ``L // 2`` datasets and the remaining ones.
    dofuzzystretch : apply ``fuzzystretch`` to every fine CoPaM.
    wsets : per-dataset weights (default: all ones).
    wmethods : per-method weights; either one flat sequence shared by all
        datasets or one sequence per dataset (default: all ones).
    GDM : boolean genes-by-datasets matrix (default: all ``True``).
    smallestClusterSize : forwarded to ``sortclusters``.
    CoPaMfinetrials, CoPaMfinaltrials : number of fine / final CoPaM trials.
    binarise_techniqueP, binarise_paramP, binarise_techniqueN,
    binarise_paramN : technique and parameter values passed to ``binarise``
        for the P and N sides.
    Xnames, deterministic : accepted for interface compatibility; not used
        by this function.
    ncores : ``n_jobs`` for the joblib ``Parallel`` clustering step.

    Returns
    -------
    UnclesRes
        Named tuple ``(B, Mc, params, X, U)``: binarised partitions, cluster
        sizes, the effective parameters, the selected datasets and the base
        partitions.

    Raises
    ------
    ValueError
        If ``type`` is not ``'A'``/``'B'`` or ``Utype`` is not
        ``'PM'``/``'idx'``.
    """
    # Fail fast: previously an invalid type was only detected after all of
    # the clustering work (or surfaced as an unbound local for empty data).
    if type not in ('A', 'B'):
        raise ValueError('Invalid UNCLES type. It has to be either A or B')

    Xloc = ds.listofarrays2arrayofarrays(X)
    L = len(Xloc)  # Number of datasets

    # Fix parameters
    if params is None:
        params = {}
    if setsP is None:
        setsP = list(range(L // 2))
    if setsN is None:
        setsN = list(range(L // 2, L))
    setsPN = np.array(np.concatenate((setsP, setsN), axis=0), dtype=int)
    Xloc = Xloc[setsPN]
    L = np.shape(Xloc)[0]  # Number of selected datasets

    if wsets is None:
        wsets = np.array([1] * L)
    else:
        wsets = np.array(wsets)[setsPN]

    if GDM is None:
        Ng = np.shape(Xloc[0])[0]
        GDMloc = np.ones([Ng, L], dtype='bool')
    else:
        GDMloc = GDM[:, setsPN]
        Ng = GDMloc.shape[0]

    if methods is None:
        methods = [['k-means']]
    else:
        largest_DS = np.max([x.shape[0] for x in Xloc])
        if largest_DS > maxgenesinsetforpdist:
            # HC is not used on datasets above the size limit.
            methods = [
                m for m in methods
                if 'hc' not in [entry.lower() for entry in m]
            ]
            if not methods:
                io.log(('No valid base clustering can be used. Please note that clust would not use HC clustering '
                        'on datasets with more than {0} genes. You have a dataset with {1} genes.'
                        '').format(maxgenesinsetforpdist, largest_DS))
                io.log('Clust will terminate here.')
                io.log(op.bottomline(), addextrastick=False)
                sys.exit()

    if methodsDetailed is None:
        methodsDetailedloc = np.array([methods for _ in range(L)])
    else:
        methodsDetailedloc = methodsDetailed[setsPN]

    if wmethods is None:
        wmethods = [[1 for _ in m] for m in methodsDetailedloc]
    elif not isinstance(wmethods[0], (list, tuple, np.ndarray)):
        # A flat weight vector is shared by every dataset. (The previous code
        # tiled `methods` here, yielding method names instead of weights.)
        wmethods = np.tile(wmethods, [L, 1])
    else:
        wmethods = np.array(wmethods)[setsPN]

    # Positions of the P and N datasets inside the re-ordered Xloc. setsNloc
    # is always defined now; it is simply empty when there are no N datasets.
    setsPloc = list(range(len(setsP)))
    setsNloc = list(range(len(setsPloc), L))

    Ks = np.array(Ks)
    Ks = Ks[Ks <= Ng]  # Remove Ks that are larger than the number of genes Ng
    Ks = Ks.tolist()
    NKs = len(Ks)  # Number of K values

    NPp = len(binarise_paramP)  # Number of P params
    NNp = len(binarise_paramN)  # Number of N params

    # Output containers; type A has a single (dummy) N-parameter slot.
    B = np.zeros([CoPaMfinaltrials, NPp, 1 if type == 'A' else NNp, NKs],
                 dtype=object)
    Mc = np.zeros([CoPaMfinaltrials, NKs], dtype=object)

    UnclesRes = collections.namedtuple('UnclesRes',
                                       ['B', 'Mc', 'params', 'X', 'U'])

    def _pack(copams, partitions):
        # Merge the effective settings into the caller's params and wrap up.
        out_params = dict(
            params, **{
                'methods': methods,
                'setsP': setsPloc,
                'setsN': setsNloc,
                'dofuzzystretch': dofuzzystretch,
                'type': type,
                'Ks': Ks,
                'NKs': NKs,
                'wsets': wsets,
                'wmethods': wmethods,
                'L': L,
                'CoPaMs': copams,
                'smallestclustersize': smallestClusterSize,
                'GDM': GDMloc
            })
        return UnclesRes(B, Mc, out_params, Xloc, partitions)

    # If the dataset is empty, return basic output. (The former 'Ds' entry
    # referenced a name that is not defined in this function and is absent
    # from the regular result, so it is no longer emitted.)
    if Ng == 0:
        return _pack(_none_grid(CoPaMfinaltrials, NKs), _none_grid(L, NKs))

    # Clustering
    if U is None:
        Utype = 'PM'
        Uloc = _none_grid(L, NKs)

        # Size of the progress counter for the parallel clustering step.
        totalparallel = np.sum(Ks) * np.sum(
            [len(meths) for meths in methodsDetailedloc])
        for meths in methodsDetailedloc:
            for meth in meths:
                if 'k-means' in meth:
                    totalparallel += np.max(Ks) * np.max(Ks)
        io.resetparallelprogress(totalparallel)

        for d in range(L):
            # Cache kmeans initialisations for the dataset once to save time:
            cl.cache_kmeans_init(Xloc[d], Ks, methodsDetailedloc[d],
                                 datasetID=d)

            # Now go to parallel clustering
            with warnings.catch_warnings():
                warnings.simplefilter("ignore")
                Utmp = Parallel(n_jobs=ncores)(
                    delayed(clustDataset)(Xloc[d], K, methodsDetailedloc[d],
                                          GDMloc[:, d], Ng, d)
                    for K in Ks)
                for ki, u in enumerate(Utmp):
                    Uloc[d, ki] = u
                gc.collect()
    else:
        Uloc = ds.listofarrays2arrayofarrays(U)[setsPN]

    utype = Utype.lower()

    # Calculate a CoPaM for each dataset at each K
    CoPaMsFine = _none_grid(L, NKs)
    for d in range(L):
        gdm_d = GDMloc[:, d].reshape([-1, 1])
        for ki in range(NKs):
            if utype == 'pm':
                CoPaMsFineTmp = [
                    generateCoPaM(Uloc[d, ki],
                                  relabel_technique=relabel_technique,
                                  X=[Xloc[d]],
                                  w=wmethods[d],
                                  K=Ks[ki],
                                  GDM=GDMloc[:, d].reshape([-1, 1]))
                    for _ in range(CoPaMfinetrials)
                ]
            elif utype == 'idx':
                CoPaMsFineTmp = [
                    generateCoPaMfromidx(Uloc[d, ki],
                                         relabel_technique=relabel_technique,
                                         X=Xloc,
                                         w=wmethods[d],
                                         K=Ks[ki])
                    for _ in range(CoPaMfinetrials)
                ]
            else:
                raise ValueError('Invalid Utype')

            CoPaMsFine[d, ki] = generateCoPaM(
                CoPaMsFineTmp,
                relabel_technique=relabel_technique,
                X=[Xloc[d]],
                GDM=gdm_d)

            if dofuzzystretch:
                CoPaMsFine[d, ki] = fuzzystretch(CoPaMsFine[d, ki])

    # Calculate the final CoPaM for each K
    if utype == 'pm':
        combine = generateCoPaM
    elif utype == 'idx':
        combine = generateCoPaMfromidx
    else:
        combine = None  # Reported below, where the original raised it

    CoPaMs = _none_grid(CoPaMfinaltrials, NKs)
    CoPaMsP = _none_grid(CoPaMfinaltrials, NKs)
    CoPaMsN = _none_grid(CoPaMfinaltrials, NKs)
    for t in range(CoPaMfinaltrials):
        for ki in range(NKs):
            if combine is None:
                raise ValueError('Invalid Utype')
            if type == 'A':
                CoPaMs[t, ki] = combine(
                    CoPaMsFine[:, ki],
                    relabel_technique=relabel_technique,
                    w=wsets,
                    X=Xloc,
                    GDM=GDMloc)
            else:  # type == 'B'
                CoPaMsP[t, ki] = combine(
                    CoPaMsFine[setsPloc, ki],
                    relabel_technique=relabel_technique,
                    X=Xloc,
                    w=wsets[setsPloc],
                    GDM=GDMloc[:, setsPloc])
                CoPaMsN[t, ki] = combine(
                    CoPaMsFine[setsNloc, ki],
                    relabel_technique=relabel_technique,
                    X=Xloc,
                    w=wsets[setsNloc],
                    GDM=GDMloc[:, setsNloc])

    # Binarise
    def _binarise_all(copam, technique, bparams):
        # One binarised partition per binarisation parameter value.
        return [binarise(copam, technique, bp) for bp in bparams]

    for t in range(CoPaMfinaltrials):
        for ki in range(NKs):
            if type == 'A':
                # Pre-sorting binarisation: only needed for the cluster
                # sizes that drive the sorting.
                presorted = _binarise_all(CoPaMs[t, ki], binarise_techniqueP,
                                          binarise_paramP)
                sizes = [np.sum(Bp, axis=0) for Bp in presorted]

                # Sorting
                CoPaMs[t, ki] = sortclusters(CoPaMs[t, ki], sizes,
                                             smallestClusterSize)

                # Post-sorting binarisation
                for p in range(NPp):
                    B[t, p, 0, ki] = binarise(CoPaMs[t, ki],
                                              binarise_techniqueP,
                                              binarise_paramP[p])
                Mc[t, ki] = [np.sum(Bp, axis=0) for Bp in B[t, :, 0, ki]]
            else:  # type == 'B'
                # Pre-sorting binarisation
                BP = _binarise_all(CoPaMsP[t, ki], binarise_techniqueP,
                                   binarise_paramP)
                McP = [np.sum(BPp, axis=0) for BPp in BP]
                BN = _binarise_all(CoPaMsN[t, ki], binarise_techniqueN,
                                   binarise_paramN)
                McN = [np.sum(BNp, axis=0) for BNp in BN]

                # Sorting
                CoPaMsP[t, ki] = sortclusters(CoPaMsP[t, ki], McP,
                                              smallestClusterSize)
                CoPaMsN[t, ki] = sortclusters(CoPaMsN[t, ki], McN,
                                              smallestClusterSize)

                # Post-sorting binarisation
                BP = _binarise_all(CoPaMsP[t, ki], binarise_techniqueP,
                                   binarise_paramP)
                BN = _binarise_all(CoPaMsN[t, ki], binarise_techniqueN,
                                   binarise_paramN)

                # UNCLES B logic: start from the P partition and clear every
                # row that belongs to any cluster of the N partition. Each
                # cell gets its own copy; without it all pn cells shared the
                # BP[pp] array and the masks accumulated across pn.
                # NOTE(review): assumes binarise returns an ndarray, which
                # the boolean-mask assignment already requires.
                for pp in range(NPp):
                    for pn in range(NNp):
                        cell = BP[pp].copy()
                        cell[np.any(BN[pn], axis=1)] = False
                        B[t, pp, pn, ki] = cell

                # Fill Mc: per cluster, an NPp x NNp table of cluster sizes.
                sizes_per_cluster = []
                for k in range(Ks[ki]):
                    counts = np.zeros([NPp, NNp])
                    for pp in range(NPp):
                        for pn in range(NNp):
                            counts[pp, pn] = np.sum(B[t, pp, pn, ki][:, k])
                    sizes_per_cluster.append(counts)
                Mc[t, ki] = sizes_per_cluster

    # Prepare and return the results:
    return _pack(CoPaMs, Uloc)
def main(args=None):
    """Command-line entry point: parse the arguments and run the pipeline.

    Parameters
    ----------
    args : list of str, optional
        Command-line arguments without the program name. Defaults to
        ``sys.argv[1:]``. When the list is empty the help text is printed
        (argparse then exits the process).

    The parsed options are forwarded positionally to
    ``clustpipeline.clustpipeline``. Base methods given with ``-basemethods``
    are each wrapped in a single-element list before being forwarded.
    """
    if args is None:
        args = sys.argv[1:]

    # Parse arguments
    headertxt = op.topline()
    headertxt += op.msgformated(
        'Clust\n'
        'Optimised consensus clustering of multiple heterogeneous datasets\n'
        'Version {0}\n'
        '\n'
        'By Basel Abu-Jamous\n'
        'Department of Plant Sciences\n'
        'The University of Oxford\n'
        '*****@*****.**'.format(version), '^')
    headertxt += op.midline()
    headertxt += op.msgformated('Citation\n' '~~~~~~~~', '^')
    citationtxt = 'When publishing work that uses Clust, please cite:\n' \
                  'Basel Abu-Jamous and Steven Kelly (2018) Clust: automatic extraction of optimal co-expressed ' \
                  'gene clusters from gene expression data. Genome Biology 19:172; ' \
                  'doi: https://doi.org/10.1186/s13059-018-1536-8.'
    # TODO: citation
    headertxt += op.msgformated(citationtxt, '<')
    headertxt += op.midline()
    headertxt += op.msgformated(
        'Full description of usage can be found at:\n'
        'https://github.com/BaselAbujamous/clust', '<')
    headertxt += op.bottomline()

    parser = argparse.ArgumentParser(description=headertxt,
                                     formatter_class=RawTextHelpFormatter)
    parser.add_argument('datapath',
                        help='Data file path or directory with data file(s).',
                        default=None)
    parser.add_argument(
        '-n', metavar='<file or int>',
        help='Normalisation file or list of codes (default: 1000)',
        default=['1000'], nargs='+')
    parser.add_argument('-r', metavar='<file>',
                        help='Replicates structure file', default=None)
    parser.add_argument('-m', metavar='<file>',
                        help='OrthoGroups (OGs) mapping file', default=None)
    parser.add_argument('-o', metavar='<directory>',
                        help='Output directory', default=None)
    parser.add_argument('-t', metavar='<real number>', type=float,
                        help='Cluster tightness (default: 1.0).', default=1.0)
    parser.add_argument(
        '-basemethods', metavar='<string>', nargs='+',
        help='One or more base clustering methods (default: k-means)',
        default=None)
    parser.add_argument(
        '-K', metavar='<integer>', type=int, nargs='+',
        help='K values, e.g. 2 4 6 10 ... (default: 4 to 20 (step=4))',
        default=[n for n in range(4, 21, 4)])
    parser.add_argument('-s', metavar='<real number>', type=float,
                        help='Outlier standard deviations (default: 3.0)',
                        default=3.0)
    parser.add_argument(
        '-d', metavar='<integer>', type=int,
        help='Min datasets in which a gene must exist (default: 1)',
        default=1)
    parser.add_argument(
        '-fil-v', metavar='<real number>', dest='filv', type=float,
        help='Filtering: gene expression threshold (default: -inf)',
        default=-float("inf"))
    parser.add_argument('-fil-c', metavar='<integer>', dest='filc', type=int,
                        help='Filtering: number of conditions (default: 0)',
                        default=0)
    # The help text used to advertise "default: 1" although the actual
    # default is 0; the text now matches the behaviour.
    parser.add_argument('-fil-d', metavar='<integer>', dest='fild', type=int,
                        help='Filtering: number of datasets (default: 0)',
                        default=0)
    parser.add_argument('--fil-abs', dest='absval', action='store_true',
                        help='Filter using absolute values of expression')
    parser.add_argument(
        '--fil-perc', dest='filperc', action='store_true',
        help='-fil-v is a percentile of genes rather than raw value')
    parser.add_argument(
        '--fil-flat', dest='filflat', action='store_true',
        help='Filter out genes with flat expression profiles (default)')
    parser.add_argument('--no-fil-flat', dest='filflat', action='store_false',
                        help='Cancels the default --fil-flat option')
    parser.add_argument('-cs', metavar='<integer>', type=int,
                        help='Smallest cluster size (default: 11)', default=11)
    # The original literal 'Q3' 's ...' concatenated to "Q3s", losing the
    # apostrophe it was meant to contain.
    parser.add_argument('-q3s', metavar='<real number>', type=float,
                        help="Q3's defining outliers (default: 2.0)",
                        default=2.0)
    parser.add_argument('--no-optimisation', dest='optimisation',
                        action='store_false',
                        help='Skip cluster optimsation & completion')
    parser.add_argument(
        '--deterministic', dest='deterministic', action='store_true',
        help='Obsolete. All steps are already deterministic (v1.7.4+)')
    parser.add_argument('-np', metavar='<integer>', type=int,
                        help='Number of parallel processes (default: 1)',
                        default=1)
    parser.set_defaults(optimisation=True, deterministic=False, absval=False,
                        filperc=False, filflat=True)
    # parser.add_argument('-ec', type=int, help='Perform error correction, 1 or 0 (default: 1)', default=1)

    if not args:
        # No arguments at all: show the help text (argparse exits afterwards).
        parser.parse_args(['-h'])

    args = parser.parse_args(args)

    filtype = 'perc' if args.filperc else 'raw'

    if args.basemethods is not None:
        # The pipeline expects each base method wrapped in its own list.
        args.basemethods = [[m] for m in args.basemethods]

    # Call the clust function
    clustpipeline.clustpipeline(args.datapath, args.m, args.r, args.n, args.o,
                                args.K, args.t, args.s, args.d, args.filv,
                                args.filc, args.fild, args.absval, filtype,
                                args.filflat, args.cs, args.np,
                                args.optimisation, args.q3s, args.basemethods,
                                args.deterministic)