# NOTE: these functions are excerpted from separate clust modules; they assume
# the package's usual imports are in scope: os, sys, shutil, math, gc, warnings,
# collections (aliased as `collec` in preprocess), datetime as dt, numpy as np,
# pandas as pd, copy.deepcopy, joblib's Parallel and delayed, and the clust
# helper modules aliased as io, op, pp, unc, mn, ecorr, graph, eig, glob, ds,
# nu, val and cl, plus module-level helpers such as clustDataset, generateCoPaM,
# generateCoPaMfromidx, fuzzystretch, binarise, sortclusters, fixnans,
# combineReplicates, filterlowgenes, filterFlat, normaliseSampleFeatureMat and
# the constant maxgenesinsetforpdist.


def clustpipeline(datapath, mapfile=None, replicatesfile=None, normalisationfile=['1000'], outpath=None,
                  Ks=[n for n in range(4, 21, 4)], tightnessweight=1, stds=3.0,
                  OGsIncludedIfAtLeastInDatasets=1, expressionValueThreshold=-float("inf"),
                  atleastinconditions=0, atleastindatasets=0, absvalue=False, filteringtype='raw',
                  filflat=True, smallestClusterSize=11, ncores=1, optimisation=True, Q3s=2,
                  methods=None, deterministic=False):
    """Run the full clust pipeline over data file(s) on disk and write all
    results (processed data, clusters, plots, eigengenes, summary) to outpath.
    """
    # Set the global objects label
    if mapfile is None:
        glob.set_object_label_upper('Gene')
        glob.set_object_label_lower('gene')
    else:
        glob.set_object_label_upper('OG')
        glob.set_object_label_lower('OG')

    # Output: Prepare the output directory and the log file
    if outpath is None:
        outpathbase = os.getcwd()
        #outpathbase = os.path.abspath(os.path.join(datapath, '..'))
        outpathbase = '{0}/Results_{1}'.format(outpathbase, dt.datetime.now().strftime('%d_%b_%y'))
        outpath = outpathbase
        trial = 0
        while os.path.exists(outpath):
            trial += 1
            outpath = '{0}_{1}'.format(outpathbase, trial)
    if not os.path.exists(outpath):
        os.makedirs(outpath)

    glob.set_logfile(os.path.join(outpath, 'log.txt'))
    glob.set_tmpfile(os.path.join(outpath, 'tmp.txt'))

    # Output: Copy input files to the output
    in2out_path = outpath + '/Input_files_and_params'
    if not os.path.exists(in2out_path):
        os.makedirs(in2out_path)

    if mapfile is not None:
        shutil.copy(mapfile, os.path.join(in2out_path, 'Map.txt'))
    if replicatesfile is not None:
        shutil.copy(replicatesfile, os.path.join(in2out_path, 'Replicates.txt'))
    if normalisationfile is not None:
        if len(normalisationfile) == 1 and not nu.isint(normalisationfile[0]):
            shutil.copy(normalisationfile[0], os.path.join(in2out_path, 'Normalisation.txt'))

    in2out_X_unproc_path = in2out_path + '/Data'
    if not os.path.exists(in2out_X_unproc_path):
        os.makedirs(in2out_X_unproc_path)
    if os.path.isfile(datapath):
        shutil.copy(datapath, in2out_X_unproc_path)
    elif os.path.isdir(datapath):
        for df in io.getFilesInDirectory(datapath):
            shutil.copy(os.path.join(datapath, df), in2out_X_unproc_path)
    else:
        raise ValueError('Data path {0} does not exist. Either provide a path '.format(datapath) +
                         'of a data file or a path to a directory including data file(s)')

    # Output: Print initial message, and record the starting time:
    initialmsg, starttime = op.generateinitialmessage()
    io.log(initialmsg, addextrastick=False)

    # Read data
    io.log('1. Reading dataset(s)')
    (X, replicates, Genes, datafiles) = io.readDatasetsFromDirectory(datapath, delimiter='\t| |, |; |,|;',
                                                                     skiprows=1, skipcolumns=1,
                                                                     returnSkipped=True)
    datafiles_noext = [os.path.splitext(d)[0] for d in datafiles]

    # Read map, replicates, and normalisation files:
    Map = io.readMap(mapfile)
    (replicatesIDs, conditions) = io.readReplicates(replicatesfile, datapath, datafiles, replicates)
    normalise = io.readNormalisation(normalisationfile, datafiles)

    # Preprocessing (Mapping then top level preprocessing including summarising replicates, filtering
    # low expression genes, and normalisation)
    io.log('2. Data pre-processing')
    (X_OGs, GDM, GDMall, OGs, MapNew, MapSpecies) \
        = pp.calculateGDMandUpdateDatasets(X, Genes, Map, mapheader=True, OGsFirstColMap=True,
                                           delimGenesInMap='\\W+',
                                           OGsIncludedIfAtLeastInDatasets=OGsIncludedIfAtLeastInDatasets)
    (X_summarised_normalised, GDM, Iincluded, params, applied_norms) = \
        pp.preprocess(X_OGs, GDM, normalise, replicatesIDs, flipSamples=None,
                      expressionValueThreshold=expressionValueThreshold, replacementVal=0.0,
                      atleastinconditions=atleastinconditions, atleastindatasets=atleastindatasets,
                      absvalue=absvalue, filteringtype=filteringtype, filterflat=filflat,
                      params=None, datafiles=datafiles)
    io.writedic('{0}/Normalisation_actual.txt'.format(outpath), applied_norms, delim='\t')
    OGs = OGs[Iincluded]
    if MapNew is not None:
        MapNew = MapNew[Iincluded]

    # Output: Save processed data
    Xprocessed = op.processed_X(X_summarised_normalised, conditions, GDM, OGs, MapNew, MapSpecies)  # pandas DataFrames
    X_proc_path = outpath + '/Processed_Data'
    if not os.path.exists(X_proc_path):
        os.makedirs(X_proc_path)
    for l in range(len(datafiles)):
        pd.DataFrame.to_csv(Xprocessed[l], '{0}/{1}_processed.tsv'.format(X_proc_path, datafiles[l]),
                            sep='\t', encoding='utf-8', index=None, columns=None, header=False)
        #np.savetxt('{0}/{1}_processed.tsv'.format(X_proc_path, datafiles[l]), Xprocessed[l], fmt='%s', delimiter='\t')

    # UNCLES and M-N plots
    io.log('3. Seed clusters production (the Bi-CoPaM method)')
    ures = unc.uncles(X_summarised_normalised, type='A', GDM=GDM, Ks=Ks, params=params, methods=methods,
                      Xnames=datafiles_noext, ncores=ncores, deterministic=deterministic)
    io.log('4. Cluster evaluation and selection (the M-N scatter plots technique)')
    mnres = mn.mnplotsgreedy(X_summarised_normalised, ures.B, GDM=GDM, tightnessweight=tightnessweight,
                             params=ures.params, smallestClusterSize=smallestClusterSize,
                             Xnames=datafiles_noext, ncores=ncores)

    # Post-processing
    ppmethod = 'tukey_sqrtSCG'
    if optimisation:
        io.log('5. Cluster optimisation and completion')
        if len(mnres.I) > 0 and sum(mnres.I) > 0:  # Otherwise, there are no clusters, so nothing to be corrected
            try:
                if ppmethod == 'weighted_outliers':
                    B_corrected = ecorr.correcterrors_weighted_outliers(mnres.B, X_summarised_normalised, GDM,
                                                                        mnres.allDists[mnres.I], stds,
                                                                        smallestClusterSize)
                elif ppmethod == 'tukey_sqrtSCG':
                    B_corrected = ecorr.optimise_tukey_sqrtSCG(mnres.B, X_summarised_normalised, GDM,
                                                               mnres.allDists[mnres.I], smallestClusterSize,
                                                               tails=1, Q3s=Q3s)
                else:
                    raise ValueError('Invalid post processing method (ppmethod): {0}.'.format(ppmethod))
                B_corrected = ecorr.reorderClusters(B_corrected, X_summarised_normalised, GDM)
            except:
                io.logerror(sys.exc_info())
                io.log('\n* Failed to perform cluster optimisation and completion!\n'
                       '* Skipped cluster optimisation and completion!\n')
                B_corrected = mnres.B
        else:
            B_corrected = mnres.B
    else:
        io.log('5. Skipping cluster optimisation and completion')
        B_corrected = mnres.B

    # Output: Write input parameters:
    io.log('6. Saving results in\n{0}'.format(outpath))
    inputparams = op.params(mnres.params, Q3s, OGsIncludedIfAtLeastInDatasets, expressionValueThreshold,
                            atleastinconditions, atleastindatasets, deterministic,
                            ures.params['methods'], MapNew)
    io.writedic('{0}/input_params.tsv'.format(in2out_path), inputparams, delim='\t')

    # Output: Generating and saving clusters
    res_og = op.clusters_genes_OGs(B_corrected, OGs, MapNew, MapSpecies, '; ')  # pandas DataFrame
    if mapfile is None:
        pd.DataFrame.to_csv(res_og, '{0}/Clusters_Objects.tsv'.format(outpath), sep='\t', encoding='utf-8',
                            index=None, columns=None, header=False)
        #np.savetxt('{0}/Clusters_Objects.tsv'.format(outpath), res_og, fmt='%s', delimiter='\t')
    else:
        pd.DataFrame.to_csv(res_og, '{0}/Clusters_OGs.tsv'.format(outpath), sep='\t', encoding='utf-8',
                            index=None, columns=None, header=False)
        #np.savetxt('{0}/Clusters_OGs.tsv'.format(outpath), res_og, fmt='%s', delimiter='\t')
        res_sp = op.clusters_genes_Species(B_corrected, OGs, MapNew, MapSpecies)  # pandas DataFrames
        for sp in range(len(res_sp)):
            pd.DataFrame.to_csv(res_sp[sp], '{0}/Clusters_{1}.tsv'.format(outpath, MapSpecies[sp]),
                                sep='\t', encoding='utf-8', index=None, columns=None, header=False)
            #np.savetxt('{0}/Clusters_{1}.tsv'.format(outpath, MapSpecies[sp]), res_sp[sp], fmt='%s', delimiter='\t')

    # Output: Save figures to a PDF
    try:
        if np.shape(B_corrected)[1] > 0:
            clusts_fig_file_name = '{0}/Clusters_profiles.pdf'.format(outpath)
            graph.plotclusters(X_summarised_normalised, B_corrected, datafiles_noext, conditions,
                               clusts_fig_file_name, GDM=GDM, Cs='all', setPageToDefault=True,
                               printToPDF=True, showPlots=False)
    except:
        io.log('Error: could not save clusters plots in a PDF file.\n'
               'Resuming producing the other results files ...')

    # Output: Generating and writing eigengenes
    try:
        if np.shape(B_corrected)[1] > 0:
            if len(X_summarised_normalised) == 1:
                eigengene_matrix = eig.eigengenes_dataframe(X_summarised_normalised, B_corrected, conditions)
                eigengene_matrix.to_csv('{0}/Eigengenes.tsv'.format(outpath), sep='\t', encoding='utf-8')
            else:
                io.log('Eigengene computation is currently not supported for multiple datasets.')
    except:
        io.log('Error: could not save eigengenes into a file.\n'
               'Resuming producing the other results files ...')

    # Output: Prepare message to standard output and the summary then save the summary to a file and print the message
    summarymsg, endtime, timeconsumedtxt = \
        op.generateoutputsummaryparag(X, X_summarised_normalised, MapNew, GDMall, GDM, ures, mnres,
                                      B_corrected, starttime)
    summary = op.summarise_results(X, X_summarised_normalised, MapNew, GDMall, GDM, ures, mnres,
                                   B_corrected, starttime, endtime, timeconsumedtxt)
    io.writedic(outpath + '/Summary.tsv', summary, delim='\t')
    io.log(summarymsg, addextrastick=False)
    io.deletetmpfile()
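

# --- Editor's usage sketch (hypothetical paths, not part of clust) -----------
# clustpipeline is the file-based entry point: it reads the dataset file(s) at
# `datapath`, writes every result (processed data, clusters, plots, eigengenes,
# summary) under `outpath`, and returns nothing. The paths and Ks below are
# illustrative placeholders.
def _example_clustpipeline():
    clustpipeline('Data/',                     # a data file or a directory of data files
                  mapfile=None,                # no orthologue map: objects are genes
                  normalisationfile=['1000'],  # automatic normalisation (the default)
                  outpath='Results/',
                  Ks=[4, 8, 12],
                  ncores=4,
                  deterministic=True)          # reproducible base clustering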
def uncles(X, type='A', Ks=[n for n in range(4, 21, 4)], params=None, methods=None, methodsDetailed=None,
           U=None, Utype='PM', relabel_technique='minmin', setsP=None, setsN=None, dofuzzystretch=False,
           wsets=None, wmethods=None, GDM=None, smallestClusterSize=11, CoPaMfinetrials=1,
           CoPaMfinaltrials=1, binarise_techniqueP='DTB',
           binarise_paramP=np.arange(0.0, 1.1, 0.1, dtype='float'), binarise_techniqueN='DTB',
           binarise_paramN=np.concatenate(([sys.float_info.epsilon],
                                           np.arange(0.1, 1.1, 0.1, dtype='float'))),
           Xnames=None, deterministic=False, ncores=1):
    """Run the UNCLES method (type 'A' or 'B') over one or more datasets and
    return a namedtuple of binarised partitions (B), cluster sizes (Mc),
    parameters, the data, and the base clustering results.
    """
    Xloc = ds.listofarrays2arrayofarrays(X)
    L = len(Xloc)  # Number of datasets

    # Fix parameters
    if params is None:
        params = {}
    if setsP is None:
        setsP = [x for x in range(int(math.floor(L / 2)))]
    if setsN is None:
        setsN = [x for x in range(int(math.floor(L / 2)), L)]
    setsPN = np.array(np.concatenate((setsP, setsN), axis=0), dtype=int)
    Xloc = Xloc[setsPN]
    L = np.shape(Xloc)[0]  # Number of datasets
    if wsets is None:
        wsets = np.array([1 for x in range(L)])
    else:
        wsets = np.array(wsets)[setsPN]
    if GDM is None:
        Ng = np.shape(Xloc[0])[0]
        GDMloc = np.ones([Ng, L], dtype='bool')
    else:
        GDMloc = GDM[:, setsPN]
        Ng = GDMloc.shape[0]
    if Xnames is None:
        Xnames = ['X{0}'.format(l) for l in range(L)]

    if methods is None:
        methods = [['k-means']]
        # largest_DS = np.max([x.shape[0] for x in Xloc])
        # if (largest_DS <= maxgenesinsetforpdist):
        #     methods = [['k-means'], ['HC']]
        # else:
        #     methods = [['k-means']]
    else:
        largest_DS = np.max([x.shape[0] for x in Xloc])
        if (largest_DS > maxgenesinsetforpdist):
            methods = [m for m in methods if 'hc' not in [entry.lower() for entry in m]]
            if not methods:
                io.log('No valid base clustering can be used. Please note that clust would not use HC '
                       'clustering on datasets with more than {0} genes. You have a dataset with {1} genes.'
                       ''.format(maxgenesinsetforpdist, largest_DS))
                io.log('Clust will terminate here.')
                io.log(op.bottomline(), addextrastick=False)
                sys.exit()
    if methodsDetailed is None:
        methodsDetailedloc = np.array([methods for l in range(L)])
    else:
        methodsDetailedloc = methodsDetailed[setsPN]
    if wmethods is None:
        wmethods = [[1 for x in m] for m in methodsDetailedloc]
    elif not isinstance(wmethods[0], (list, tuple, np.ndarray)):
        # A flat list of method weights applies to every dataset
        wmethods = np.tile(wmethods, [L, 1])
    else:
        wmethods = np.array(wmethods)[setsPN]

    setsPloc = [ii for ii in range(len(setsP))]
    setsNloc = [ii for ii in range(len(setsPloc), L)]  # Empty if there are no negative sets

    Ks = np.array(Ks)
    Ks = Ks[Ks <= Ng]  # Remove Ks that are larger than the number of genes Ng
    Ks = Ks.tolist()
    NKs = len(Ks)  # Number of K values

    # If the dataset is empty, return basic output
    if Ng == 0:
        NPp = len(binarise_paramP)  # Number of P params
        NNp = len(binarise_paramN)  # Number of N params
        if type == 'A':
            B = np.zeros([CoPaMfinaltrials, NPp, 1, NKs], dtype=object)
            Mc = np.zeros([CoPaMfinaltrials, NKs], dtype=object)
        elif type == 'B':
            B = np.zeros([CoPaMfinaltrials, NPp, NNp, NKs], dtype=object)
            Mc = np.zeros([CoPaMfinaltrials, NKs], dtype=object)
        else:
            raise ValueError('Invalid UNCLES type. It has to be either A or B')

        params = dict(params, **{
            'methods': methods,
            'setsP': setsPloc,
            'setsN': setsNloc,
            'dofuzzystretch': dofuzzystretch,
            'type': type,
            'Ks': Ks,
            'NKs': NKs,
            'wsets': wsets,
            'wmethods': wmethods,
            'L': L,
            'CoPaMs': np.array([None] * (CoPaMfinaltrials * NKs)).reshape([CoPaMfinaltrials, NKs]),
            'smallestclustersize': smallestClusterSize,
            'GDM': GDMloc})

        Uloc = np.array([None] * (L * NKs)).reshape([L, NKs])

        UnclesRes = collections.namedtuple('UnclesRes', ['B', 'Mc', 'params', 'X', 'U'])
        return UnclesRes(B, Mc, params, Xloc, Uloc)

    # Clustering
    if U is None:
        Utype = 'PM'
        Uloc = np.array([None] * (L * NKs)).reshape([L, NKs])
        totalparallel = np.sum(Ks) * np.sum([len(meths) for meths in methodsDetailedloc])
        for meths in methodsDetailedloc:
            for meth in meths:
                if 'k-means' in meth:
                    totalparallel += np.max(Ks) * np.max(Ks)
                    continue
        io.resetparallelprogress(totalparallel)

        for l in range(L):
            # Cache kmeans initialisations for the dataset once to save time:
            cl.cache_kmeans_init(Xloc[l], Ks, methodsDetailedloc[l], datasetID=l)

            # Now go to parallel clustering
            with warnings.catch_warnings():
                warnings.simplefilter("ignore")
                Utmp = Parallel(n_jobs=ncores)\
                    (delayed(clustDataset)
                     (Xloc[l], Ks[ki], methodsDetailedloc[l], GDMloc[:, l], Ng, l) for ki in range(NKs))

                Utmp = [u for u in Utmp]
                for ki in range(NKs):
                    Uloc[l, ki] = Utmp[ki]

                gc.collect()
                #io.updateparallelprogress(np.sum(Ks) * len(methodsDetailedloc))
    else:
        Uloc = ds.listofarrays2arrayofarrays(U)[setsPN]

    # Calculate a CoPaM for each dataset at each K
    CoPaMsFine = np.array([None] * (L * NKs)).reshape([L, NKs])
    for l in range(L):
        for ki in range(NKs):
            if Utype.lower() == 'pm':
                CoPaMsFineTmp = [generateCoPaM(Uloc[l, ki], relabel_technique=relabel_technique,
                                               X=[Xloc[l]], w=wmethods[l], K=Ks[ki],
                                               GDM=GDMloc[:, l].reshape([-1, 1]))
                                 for i in range(CoPaMfinetrials)]
            elif Utype.lower() == 'idx':
                CoPaMsFineTmp = \
                    [generateCoPaMfromidx(Uloc[l, ki], relabel_technique=relabel_technique, X=Xloc,
                                          w=wmethods[l], K=Ks[ki])
                     for i in range(CoPaMfinetrials)]
            else:
                raise ValueError('Invalid Utype')
            CoPaMsFine[l, ki] = generateCoPaM(CoPaMsFineTmp, relabel_technique=relabel_technique,
                                              X=[Xloc[l]], GDM=GDMloc[:, l].reshape([-1, 1]))

            if dofuzzystretch:
                CoPaMsFine[l, ki] = fuzzystretch(CoPaMsFine[l, ki])

    # Calculate the final CoPaM for each K
    CoPaMs = np.array([None] * (CoPaMfinaltrials * NKs)).reshape([CoPaMfinaltrials, NKs])
    CoPaMsP = np.array([None] * (CoPaMfinaltrials * NKs)).reshape([CoPaMfinaltrials, NKs])
    CoPaMsN = np.array([None] * (CoPaMfinaltrials * NKs)).reshape([CoPaMfinaltrials, NKs])
    for t in range(CoPaMfinaltrials):
        for ki in range(NKs):
            if type == 'A':
                if Utype.lower() == 'pm':
                    CoPaMs[t, ki] = generateCoPaM(CoPaMsFine[:, ki], relabel_technique=relabel_technique,
                                                  w=wsets, X=Xloc, GDM=GDMloc)
                elif Utype.lower() == 'idx':
                    CoPaMs[t, ki] = generateCoPaMfromidx(CoPaMsFine[:, ki],
                                                         relabel_technique=relabel_technique,
                                                         X=Xloc, w=wsets, GDM=GDMloc)
                else:
                    raise ValueError('Invalid Utype')
            elif type == 'B':
                if Utype.lower() == 'pm':
                    CoPaMsP[t, ki] = generateCoPaM(CoPaMsFine[setsPloc, ki],
                                                   relabel_technique=relabel_technique, X=Xloc,
                                                   w=wsets[setsPloc], GDM=GDMloc[:, setsPloc])
                    CoPaMsN[t, ki] = generateCoPaM(CoPaMsFine[setsNloc, ki],
                                                   relabel_technique=relabel_technique, X=Xloc,
                                                   w=wsets[setsNloc], GDM=GDMloc[:, setsNloc])
                elif Utype.lower() == 'idx':
                    CoPaMsP[t, ki] = generateCoPaMfromidx(CoPaMsFine[setsPloc, ki],
                                                          relabel_technique=relabel_technique, X=Xloc,
                                                          w=wsets[setsPloc], GDM=GDMloc[:, setsPloc])
                    CoPaMsN[t, ki] = generateCoPaMfromidx(CoPaMsFine[setsNloc, ki],
                                                          relabel_technique=relabel_technique, X=Xloc,
                                                          w=wsets[setsNloc], GDM=GDMloc[:, setsNloc])
                else:
                    raise ValueError('Invalid Utype')
            else:
                raise ValueError('Invalid UNCLES type. It has to be either A or B')

    # Binarise
    NPp = len(binarise_paramP)  # Number of P params
    NNp = len(binarise_paramN)  # Number of N params
    if type == 'A':
        B = np.zeros([CoPaMfinaltrials, NPp, 1, NKs], dtype=object)
        Mc = np.zeros([CoPaMfinaltrials, NKs], dtype=object)
    elif type == 'B':
        B = np.zeros([CoPaMfinaltrials, NPp, NNp, NKs], dtype=object)
        Mc = np.zeros([CoPaMfinaltrials, NKs], dtype=object)

    for t in range(CoPaMfinaltrials):
        for ki in range(NKs):
            if type == 'A':
                # Pre-sorting binarisation
                for p in range(NPp):
                    B[t, p, 0, ki] = binarise(CoPaMs[t, ki], binarise_techniqueP, binarise_paramP[p])
                Mc[t, ki] = [np.sum(Bp, axis=0) for Bp in B[t, :, 0, ki]]

                # Sorting
                CoPaMs[t, ki] = sortclusters(CoPaMs[t, ki], Mc[t, ki], smallestClusterSize)

                # Post-sorting binarisation
                for p in range(NPp):
                    B[t, p, 0, ki] = binarise(CoPaMs[t, ki], binarise_techniqueP, binarise_paramP[p])
                Mc[t, ki] = [np.sum(Bp, axis=0) for Bp in B[t, :, 0, ki]]
            elif type == 'B':
                # Pre-sorting binarisation
                BP = [binarise(CoPaMsP[t, ki], binarise_techniqueP, binarise_paramP[p])
                      for p in range(NPp)]
                McP = [np.sum(BPp, axis=0) for BPp in BP]

                BN = [binarise(CoPaMsN[t, ki], binarise_techniqueN, binarise_paramN[p])
                      for p in range(NNp)]
                McN = [np.sum(BNp, axis=0) for BNp in BN]

                # Sorting
                CoPaMsP[t, ki] = sortclusters(CoPaMsP[t, ki], McP, smallestClusterSize)
                CoPaMsN[t, ki] = sortclusters(CoPaMsN[t, ki], McN, smallestClusterSize)

                # Post-sorting binarisation
                BP = [binarise(CoPaMsP[t, ki], binarise_techniqueP, binarise_paramP[p])
                      for p in range(NPp)]
                McP = [np.sum(BPp, axis=0) for BPp in BP]

                BN = [binarise(CoPaMsN[t, ki], binarise_techniqueN, binarise_paramN[p])
                      for p in range(NNp)]
                McN = [np.sum(BNp, axis=0) for BNp in BN]

                # UNCLES B logic: keep positive-set memberships not claimed by the negative set
                for pp in range(NPp):
                    for pn in range(NNp):
                        B[t, pp, pn, ki] = BP[pp]
                        B[t, pp, pn, ki][np.any(BN[pn], axis=1)] = False

                # Fill Mc
                Mc[t, ki] = [None] * Ks[ki]
                for k in range(Ks[ki]):
                    Mc[t, ki][k] = np.zeros([NPp, NNp])
                    for pp in range(NPp):
                        for pn in range(NNp):
                            Mc[t, ki][k][pp, pn] = np.sum(B[t, pp, pn, ki][:, k])

    # Prepare and return the results:
    params = dict(params, **{
        'methods': methods,
        'setsP': setsPloc,
        'setsN': setsNloc,
        'dofuzzystretch': dofuzzystretch,
        'type': type,
        'Ks': Ks,
        'NKs': NKs,
        'wsets': wsets,
        'wmethods': wmethods,
        'L': L,
        'CoPaMs': CoPaMs,
        'smallestclustersize': smallestClusterSize,
        'GDM': GDMloc})

    UnclesRes = collections.namedtuple('UnclesRes', ['B', 'Mc', 'params', 'X', 'U'])
    return UnclesRes(B, Mc, params, Xloc, Uloc)
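

# --- Editor's usage sketch (synthetic data, not part of clust) ---------------
# uncles consumes pre-processed datasets and returns a namedtuple whose B field
# holds binarised partitions with shape [CoPaMfinaltrials, NPp, NNp, NKs]
# (NNp == 1 for type 'A'); each entry is an Ng x K boolean membership matrix.
# The data and index choices below are purely illustrative.
def _example_uncles():
    rng = np.random.RandomState(0)
    X = [rng.rand(200, 5), rng.rand(200, 4)]   # two genes-by-conditions datasets
    GDM = np.ones([200, 2], dtype=bool)        # every gene present in both datasets
    ures = uncles(X, type='A', GDM=GDM, Ks=[4, 8], methods=[['k-means']],
                  deterministic=True, ncores=1)
    # Partition at the 6th binarisation parameter (0.5 under the default
    # binarise_paramP grid) and the first K (K = 4):
    return ures.B[0, 5, 0, 0]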
def runclust(X, Map=None, replicatesIDs=None, normalise=1000, Ks=[n for n in range(4, 21, 4)],
             tightnessweight=1, stds=3.0, OGsIncludedIfAtLeastInDatasets=1,
             expressionValueThreshold=-float("inf"), atleastinconditions=0, atleastindatasets=0,
             absvalue=False, filteringtype='raw', filflat=True, smallestClusterSize=11, ncores=1,
             optimisation=True, Q3s=2, methods=None, deterministic=False, showPlots=True,
             printToConsole=True):
    """Run the clust pipeline on in-memory datasets (DataFrames or arrays) and
    return (Bout, Xout, GDM) instead of writing result files.
    """
    # Set the global objects label
    glob.set_print_to_log_file(False)
    glob.set_print_to_console(printToConsole)
    if Map is None:
        glob.set_object_label_upper('Gene')
        glob.set_object_label_lower('gene')
    else:
        glob.set_object_label_upper('OG')
        glob.set_object_label_lower('OG')

    glob.set_tmpfile('clust_tmp.txt')

    # Output: Print initial message, and record the starting time:
    initialmsg, starttime = op.generateinitialmessage()
    io.log(initialmsg, addextrastick=False)

    # Consider X as a list of arrays or of data frames. Otherwise, make it as such first.
    # If the user entered a single dataset as an input (not as a list of arrays), save this fact
    # in a flag, so the result is returned as a single output
    input_is_one_dataset = False
    if isinstance(X, pd.DataFrame):
        input_is_one_dataset = True
        X = [X]
    elif isinstance(X, np.ndarray) and ds.maxDepthOfArray(X) == 2:
        input_is_one_dataset = True
        X = [X]

    # Format data (X: list of arrays, Genes: list of arrays of strings, replicates: list of arrays of strings)
    L = len(X)  # Number of datasets
    replicates = [None] * L
    Genes = [None] * L
    io.log('1. Reading dataset(s)')
    for l in range(L):
        if isinstance(X[l], pd.DataFrame):
            Genes[l] = np.array(X[l].index, dtype=str, ndmin=2).transpose()
            Genes[l] = np.array(Genes[l], dtype=object)
            replicates[l] = np.array(X[l].columns, dtype=str)
            X[l] = X[l].values
        else:
            X[l] = np.array(X[l])
            ngenes_digits = int(math.ceil(math.log10(X[l].shape[0])))
            nreps_digits = int(math.ceil(math.log10(X[l].shape[1])))
            Genes[l] = np.array([['{0}'.format(str(g).zfill(ngenes_digits))]
                                 for g in range(X[l].shape[0])])
            Genes[l] = np.array(Genes[l], dtype=object)
            replicates[l] = np.array(['X{0}'.format(str(r).zfill(nreps_digits))
                                      for r in range(X[l].shape[1])])
    ndatasets_digits = int(math.ceil(math.log10(L)))
    datafiles = np.array(['D{0}'.format(str(r).zfill(ndatasets_digits)) for r in range(L)])
    datafiles_noext = datafiles

    # Sort out conditions based on replicates structure if given
    if replicatesIDs is None:
        conditions = replicates
    else:
        valresult = val.validate_replicatesIDs(replicatesIDs, X)
        if valresult[0]:
            conditions = [None] * L
            for l in range(L):
                if replicatesIDs[l] is None:
                    conditions[l] = np.array(replicates[l])
                else:
                    uniq_reps, cond_indices = np.unique(replicatesIDs[l], return_index=True)
                    if -1 in uniq_reps:
                        cond_indices = cond_indices[1:]
                    conditions[l] = replicates[l][cond_indices]
        else:
            io.log(valresult[1])
            io.log("Terminating ...")
            raise Exception("Terminated by an invalid input argument.")

    # Validate normalisation
    valresult = val.validate_normalisation(normalise, X)
    if not valresult[0]:
        io.log(valresult[1])
        io.log("Terminating ...")
        raise Exception("Terminated by an invalid input argument.")

    # Preprocessing (Mapping then top level preprocessing including summarising replicates, filtering
    # low expression genes, and normalisation)
    io.log('2. Data pre-processing')
    (X_OGs, GDM, GDMall, OGs, MapNew, MapSpecies) \
        = pp.calculateGDMandUpdateDatasets(X, Genes, Map, mapheader=True, OGsFirstColMap=True,
                                           delimGenesInMap='\\W+',
                                           OGsIncludedIfAtLeastInDatasets=OGsIncludedIfAtLeastInDatasets)
    (X_summarised_normalised, GDM, Iincluded, params, applied_norms) = \
        pp.preprocess(X_OGs, GDM, normalise, replicatesIDs, flipSamples=None,
                      expressionValueThreshold=expressionValueThreshold, replacementVal=0.0,
                      atleastinconditions=atleastinconditions, atleastindatasets=atleastindatasets,
                      absvalue=absvalue, filteringtype=filteringtype, filterflat=filflat,
                      params=None, datafiles=datafiles)
    OGs = OGs[Iincluded]
    if MapNew is not None:
        MapNew = MapNew[Iincluded]

    # UNCLES and M-N plots
    io.log('3. Seed clusters production (the Bi-CoPaM method)')
    ures = unc.uncles(X_summarised_normalised, type='A', GDM=GDM, Ks=Ks, params=params, methods=methods,
                      Xnames=datafiles_noext, ncores=ncores, deterministic=deterministic)
    io.log('4. Cluster evaluation and selection (the M-N scatter plots technique)')
    mnres = mn.mnplotsgreedy(X_summarised_normalised, ures.B, GDM=GDM, tightnessweight=tightnessweight,
                             params=ures.params, smallestClusterSize=smallestClusterSize,
                             Xnames=datafiles_noext, ncores=ncores)

    # Post-processing
    ppmethod = 'tukey_sqrtSCG'
    if optimisation:
        io.log('5. Cluster optimisation and completion')
        if len(mnres.I) > 0 and sum(mnres.I) > 0:  # Otherwise, there are no clusters, so nothing to be corrected
            try:
                if ppmethod == 'weighted_outliers':
                    B_corrected = ecorr.correcterrors_weighted_outliers(mnres.B, X_summarised_normalised, GDM,
                                                                        mnres.allDists[mnres.I], stds,
                                                                        smallestClusterSize)
                elif ppmethod == 'tukey_sqrtSCG':
                    B_corrected = ecorr.optimise_tukey_sqrtSCG(mnres.B, X_summarised_normalised, GDM,
                                                               mnres.allDists[mnres.I], smallestClusterSize,
                                                               tails=1, Q3s=Q3s)
                else:
                    raise ValueError('Invalid post processing method (ppmethod): {0}.'.format(ppmethod))
                B_corrected = ecorr.reorderClusters(B_corrected, X_summarised_normalised, GDM)
            except:
                io.logerror(sys.exc_info())
                io.log('\n* Failed to perform cluster optimisation and completion!\n'
                       '* Skipped cluster optimisation and completion!\n')
                B_corrected = mnres.B
        else:
            B_corrected = mnres.B
    else:
        io.log('5. Skipping cluster optimisation and completion')
        B_corrected = mnres.B

    # Output: Preparing output parameters as DataFrames
    if Map is None:
        Bout = op.clusters_B_as_dataframes(B_corrected, OGs, None)
    else:
        Bout, B_species = op.clusters_B_as_dataframes(B_corrected, OGs, MapNew)
    Xout = op.processed_X_as_dataframes(X_summarised_normalised, OGs, conditions)
    if input_is_one_dataset:
        Xout = Xout[0]

    # Output: Plot figures
    if showPlots:
        try:
            if np.shape(B_corrected)[1] > 0:
                graph.plotclusters(X_summarised_normalised, B_corrected, datafiles_noext, conditions,
                                   GDM=GDM, Cs='all', setPageToDefault=True, showPlots=showPlots,
                                   printToPDF=False)
        except:
            io.log("Error: could not generate clusters' plots. Resuming the rest of steps ...")

    # Output: Prepare message to standard output and the summary then save the summary to a file and print the message
    summarymsg, endtime, timeconsumedtxt = \
        op.generateoutputsummaryparag(X, X_summarised_normalised, MapNew, GDMall, GDM, ures, mnres,
                                      B_corrected, starttime)
    io.log(summarymsg, addextrastick=False)
    io.deletetmpfile()

    return Bout, Xout, GDM
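

# --- Editor's usage sketch (synthetic data, not part of clust) ---------------
# runclust is the in-memory counterpart of clustpipeline: it takes one dataset
# (DataFrame or 2-D array) or a list of datasets, and returns (Bout, Xout, GDM)
# rather than writing result files. Gene and condition names are taken from the
# DataFrame's index and columns.
def _example_runclust():
    rng = np.random.RandomState(0)
    df = pd.DataFrame(rng.rand(500, 6),
                      index=['g{0}'.format(i) for i in range(500)],
                      columns=['c{0}'.format(j) for j in range(6)])
    Bout, Xout, GDM = runclust(df, Ks=[4, 8], ncores=1, deterministic=True,
                               showPlots=False, printToConsole=False)
    return Bout, Xout, GDM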
def preprocess(X, GDM, normalise=1000, replicatesIDs=None, flipSamples=None,
               expressionValueThreshold=10.0, replacementVal=0.0, atleastinconditions=1,
               atleastindatasets=1, absvalue=False, usereplacementval=False, filteringtype='raw',
               filterflat=True, params=None, datafiles=None):
    """Pre-process datasets: fix NaNs, normalise, summarise replicates, and filter
    low-expression and flat genes. Returns (Xproc, GDMnew, Iincluded, params,
    applied_norm).
    """
    # Fixing parameters
    Xloc = ds.listofarrays2arrayofarrays(X)
    L = len(Xloc)
    if datafiles is None:
        if L == 1:
            datafiles = ['X']
        else:
            datafiles = np.array([], dtype=str)
            for i in range(L):
                datafiles = np.append(datafiles, 'X{0}'.format(i + 1))
    if params is None:
        params = {}
    if replicatesIDs is None:
        replicatesIDsloc = [np.array([ii for ii in range(x.shape[1])]) for x in Xloc]
    else:
        replicatesIDsloc = ds.listofarrays2arrayofarrays(replicatesIDs)
        replicatesIDsloc = [np.array(x) for x in replicatesIDsloc]
    if flipSamples is None:
        flipSamplesloc = None
    else:
        flipSamplesloc = ds.listofarrays2arrayofarrays(flipSamples)
        flipSamplesloc = [np.array(x) for x in flipSamplesloc]
    # Make normaliseloc a per-dataset list of lists of normalisation codes
    if not isinstance(normalise, (list, tuple, np.ndarray)):
        normaliseloc = [[normalise] for i in range(L)]
        normaliseloc = ds.listofarrays2arrayofarrays(normaliseloc)
    else:
        normaliseloc = [nor if isinstance(nor, (list, tuple, np.ndarray)) else [nor]
                        for nor in normalise]
        normaliseloc = ds.listofarrays2arrayofarrays(normaliseloc)

    # Get rid of nans by fixing
    Xproc = Xloc
    for l in range(L):
        Xproc[l] = fixnans(Xproc[l])

    # Prepare applied_norm dictionary before any normalisation takes place
    applied_norm = collec.OrderedDict(zip(datafiles, deepcopy(normaliseloc)))

    # Tell the user if any automatic normalisation is taking place
    allare1000 = True
    anyis1000 = False
    for l in range(L):
        if 1000 in normaliseloc[l]:
            anyis1000 = True
        else:
            allare1000 = False
    if allare1000:
        io.log(' - Automatic normalisation mode (default in v1.7.0+).')
        io.log('   Clust automatically normalises your dataset(s).')
        io.log('   To switch it off, use the `-n 0` option (not recommended).')
        io.log('   Check https://github.com/BaselAbujamous/clust for details.')
    elif anyis1000:
        io.log(' - Some datasets are not assigned normalisation codes in the provided')
        io.log('   normalisation file. Clust automatically identifies and applies the')
        io.log('   most suitable normalisation to them (default in v1.7.0+).')
        io.log("   If you don't want clust to normalise them, assign each of them a")
        io.log('   normalisation code of 0 in the normalisation file.')
        io.log('   Check https://github.com/BaselAbujamous/clust for details.')

    # Quantile normalisation
    for l in range(L):
        if 101 in normaliseloc[l] or 1000 in normaliseloc[l]:
            Xproc[l] = normaliseSampleFeatureMat(Xproc[l], 101)[0]
            if 101 in normaliseloc[l]:
                i = np.argwhere(np.array(normaliseloc[l]) == 101)
                i = i[0][0]
                normaliseloc[l][i] = 0

    # Combine replicates and sort out flipped samples
    Xproc = combineReplicates(Xproc, replicatesIDsloc, flipSamplesloc)

    # Filter genes not exceeding the threshold
    (Xproc, GDMnew, Iincluded) = filterlowgenes(Xproc, GDM, expressionValueThreshold, replacementVal,
                                                atleastinconditions, atleastindatasets, absvalue,
                                                usereplacementval, filteringtype)

    # Normalise
    for l in range(L):
        (Xproc[l], codes) = normaliseSampleFeatureMat(Xproc[l], normaliseloc[l])
        if np.all(codes == normaliseloc[l]):
            applied_norm[datafiles[l]] = op.arraytostring(applied_norm[datafiles[l]], delim=' ',
                                                          openbrac='', closebrac='')
        else:
            applied_norm[datafiles[l]] = op.arraytostring(codes, delim=' ', openbrac='', closebrac='')

    if filterflat:
        io.log(' - Flat expression profiles filtered out (default in v1.7.0+).')
        io.log('   To switch it off, use the --no-fil-flat option (not recommended).')
        io.log('   Check https://github.com/BaselAbujamous/clust for details.')
        (Xproc, GDMnew, Iincluded) = filterFlat(Xproc, GDMnew, Iincluded)

    # Prepare params for the output
    params = dict(params, **{
        'normalise': normaliseloc,
        'replicatesIDs': replicatesIDs,
        'flipSamples': flipSamplesloc,
        'L': L})

    return Xproc, GDMnew, Iincluded, params, applied_norm
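

# --- Editor's usage sketch (synthetic data and illustrative codes, not part of clust) ---
# preprocess runs, per dataset: NaN fixing, quantile normalisation when code 101
# (or the automatic code 1000) is requested, replicate summarisation,
# low-expression filtering, then the remaining normalisation codes in order.
# The threshold values below are illustrative; consult the clust documentation
# for the full list of normalisation codes.
def _example_preprocess():
    rng = np.random.RandomState(0)
    X = [rng.rand(300, 6), rng.rand(300, 4)]    # two genes-by-samples datasets
    GDM = np.ones([300, 2], dtype=bool)         # every gene present in both datasets
    Xproc, GDMnew, Iincluded, params, applied_norm = preprocess(
        X, GDM, normalise=1000,                 # let clust pick the normalisation
        expressionValueThreshold=1.0, atleastinconditions=1, atleastindatasets=1)
    return Xproc, applied_norm                  # applied_norm records the codes actually used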