def clustpipeline(datapath, mapfile=None, replicatesfile=None, normalisationfile=['1000'], outpath=None,
                  Ks=[n for n in range(4, 21, 4)], tightnessweight=1, stds=3.0,
                  OGsIncludedIfAtLeastInDatasets=1, expressionValueThreshold=-float("inf"), atleastinconditions=0,
                  atleastindatasets=0, absvalue=False, filteringtype='raw', filflat=True, smallestClusterSize=11,
                  ncores=1, optimisation=True, Q3s=2, methods=None, deterministic=False):
    """
    Run the whole clustering pipeline on the dataset(s) found at datapath and write all results to outpath.

    The steps, in order, are: prepare the output directory and log/tmp files, copy the input files into the
    output directory, read the dataset(s) and the optional map / replicates / normalisation inputs,
    pre-process the data, produce seed clusters (unc.uncles), select clusters (mn.mnplotsgreedy), optionally
    optimise them (ecorr), and finally write the parameters, clusters, plots, eigengenes and summary.

    NOTE: normalisationfile and Ks have list defaults, which are shared between calls. They are only read here
    and forwarded to helpers; they are kept as they are so that the signature stays unchanged.

    :param datapath: path of a single data file or of a directory that includes data file(s)
    :param mapfile: optional map file; when None the global object labels are set to 'Gene'/'gene',
        otherwise to 'OG'/'OG'. Forwarded to io.readMap
    :param replicatesfile: optional replicates file, forwarded to io.readReplicates
    :param normalisationfile: None or a list; a single non-integer element is treated as a file name and is
        copied to the output directory. Forwarded to io.readNormalisation
    :param outpath: output directory; when None, a fresh 'Results_<date>' directory (with a numeric suffix if
        that name is taken) is created under the current working directory
    :param Ks: forwarded to unc.uncles
    :param tightnessweight: forwarded to mn.mnplotsgreedy
    :param stds: only used by the 'weighted_outliers' optimisation method, which is not the method that is
        currently hard-coded below
    :param OGsIncludedIfAtLeastInDatasets: forwarded to pp.calculateGDMandUpdateDatasets
    :param expressionValueThreshold: forwarded to pp.preprocess
    :param atleastinconditions: forwarded to pp.preprocess
    :param atleastindatasets: forwarded to pp.preprocess
    :param absvalue: forwarded to pp.preprocess
    :param filteringtype: forwarded to pp.preprocess
    :param filflat: forwarded to pp.preprocess as filterflat
    :param smallestClusterSize: forwarded to mn.mnplotsgreedy and to the optimisation step
    :param ncores: forwarded to unc.uncles and mn.mnplotsgreedy
    :param optimisation: when False the optimisation step is skipped and the selected clusters are used as is
    :param Q3s: forwarded to ecorr.optimise_tukey_sqrtSCG
    :param methods: forwarded to unc.uncles
    :param deterministic: forwarded to unc.uncles
    :return: None; all results are written to files under outpath
    :raises ValueError: if datapath is neither an existing file nor an existing directory
    """
    # Set the global objects label
    if mapfile is None:
        glob.set_object_label_upper('Gene')
        glob.set_object_label_lower('gene')
    else:
        glob.set_object_label_upper('OG')
        glob.set_object_label_lower('OG')

    # Output: Prepare the output directory and the log file
    if outpath is None:
        outpathbase = os.getcwd()
        outpathbase = '{0}/Results_{1}'.format(outpathbase, dt.datetime.now().strftime('%d_%b_%y'))
        outpath = outpathbase
        # Never overwrite the results of a previous run: append _1, _2, ... until the name is free
        trial = 0
        while os.path.exists(outpath):
            trial += 1
            outpath = '{0}_{1}'.format(outpathbase, trial)
    if not os.path.exists(outpath):
        os.makedirs(outpath)

    glob.set_logfile(os.path.join(outpath, 'log.txt'))
    glob.set_tmpfile(os.path.join(outpath, 'tmp.txt'))

    # Output: Copy input files to the output
    in2out_path = outpath + '/Input_files_and_params'
    if not os.path.exists(in2out_path):
        os.makedirs(in2out_path)

    if mapfile is not None:
        shutil.copy(mapfile, os.path.join(in2out_path, 'Map.txt'))
    if replicatesfile is not None:
        shutil.copy(replicatesfile, os.path.join(in2out_path, 'Replicates.txt'))
    if normalisationfile is not None:
        # Only a single non-integer element is a file name; anything else is a list of codes
        if len(normalisationfile) == 1 and not nu.isint(normalisationfile[0]):
            shutil.copy(normalisationfile[0], os.path.join(in2out_path, 'Normalisation.txt'))

    in2out_X_unproc_path = in2out_path + '/Data'
    if not os.path.exists(in2out_X_unproc_path):
        os.makedirs(in2out_X_unproc_path)
    if os.path.isfile(datapath):
        shutil.copy(datapath, in2out_X_unproc_path)
    elif os.path.isdir(datapath):
        for df in io.getFilesInDirectory(datapath):
            shutil.copy(os.path.join(datapath, df), in2out_X_unproc_path)
    else:
        raise ValueError('Data path {0} does not exist. Either provide a path '.format(datapath) +
                         'of a data file or a path to a directory including data file(s)')

    # Output: Print initial message, and record the starting time:
    initialmsg, starttime = op.generateinitialmessage()
    io.log(initialmsg, addextrastick=False)

    # Read data
    io.log('1. Reading dataset(s)')
    (X, replicates, Genes, datafiles) = io.readDatasetsFromDirectory(datapath, delimiter='\t| |, |; |,|;',
                                                                     skiprows=1, skipcolumns=1,
                                                                     returnSkipped=True)
    datafiles_noext = [os.path.splitext(d)[0] for d in datafiles]

    # Read map, replicates, and normalisation files:
    Map = io.readMap(mapfile)
    (replicatesIDs, conditions) = io.readReplicates(replicatesfile, datapath, datafiles, replicates)
    normalise = io.readNormalisation(normalisationfile, datafiles)

    # Preprocessing (Mapping then top level preprocessing including summarising replicates, filtering
    # low expression genes, and normalisation)
    io.log('2. Data pre-processing')
    (X_OGs, GDM, GDMall, OGs, MapNew, MapSpecies) \
        = pp.calculateGDMandUpdateDatasets(X, Genes, Map, mapheader=True, OGsFirstColMap=True,
                                           delimGenesInMap='\\W+',
                                           OGsIncludedIfAtLeastInDatasets=OGsIncludedIfAtLeastInDatasets)
    (X_summarised_normalised, GDM, Iincluded, params, applied_norms) = \
        pp.preprocess(X_OGs, GDM, normalise, replicatesIDs, flipSamples=None,
                      expressionValueThreshold=expressionValueThreshold, replacementVal=0.0,
                      atleastinconditions=atleastinconditions, atleastindatasets=atleastindatasets,
                      absvalue=absvalue, filteringtype=filteringtype, filterflat=filflat, params=None,
                      datafiles=datafiles)
    io.writedic('{0}/Normalisation_actual.txt'.format(outpath), applied_norms, delim='\t')
    # Keep only the objects that were retained by the pre-processing
    OGs = OGs[Iincluded]
    if MapNew is not None:
        MapNew = MapNew[Iincluded]

    # Output: Save processed data
    Xprocessed = op.processed_X(X_summarised_normalised, conditions, GDM, OGs, MapNew, MapSpecies)  # pandas DataFrames
    X_proc_path = outpath + '/Processed_Data'
    if not os.path.exists(X_proc_path):
        os.makedirs(X_proc_path)
    for l, datafile in enumerate(datafiles):
        pd.DataFrame.to_csv(Xprocessed[l], '{0}/{1}_processed.tsv'.format(X_proc_path, datafile), sep='\t',
                            encoding='utf-8', index=None, columns=None, header=False)

    # UNCLES and M-N plots
    io.log('3. Seed clusters production (the Bi-CoPaM method)')
    ures = unc.uncles(X_summarised_normalised, type='A', GDM=GDM, Ks=Ks, params=params, methods=methods,
                      Xnames=datafiles_noext, ncores=ncores, deterministic=deterministic)
    io.log('4. Cluster evaluation and selection (the M-N scatter plots technique)')
    mnres = mn.mnplotsgreedy(X_summarised_normalised, ures.B, GDM=GDM, tightnessweight=tightnessweight,
                             params=ures.params, smallestClusterSize=smallestClusterSize,
                             Xnames=datafiles_noext, ncores=ncores)

    # Post-processing
    ppmethod = 'tukey_sqrtSCG'
    if optimisation:
        io.log('5. Cluster optimisation and completion')
        if len(mnres.I) > 0 and sum(mnres.I) > 0:  # Otherwise, there are no clusters, so nothing to be corrected
            try:
                if ppmethod == 'weighted_outliers':
                    B_corrected = ecorr.correcterrors_weighted_outliers(mnres.B, X_summarised_normalised, GDM,
                                                                        mnres.allDists[mnres.I], stds,
                                                                        smallestClusterSize)
                elif ppmethod == 'tukey_sqrtSCG':
                    B_corrected = ecorr.optimise_tukey_sqrtSCG(mnres.B, X_summarised_normalised, GDM,
                                                               mnres.allDists[mnres.I], smallestClusterSize,
                                                               tails=1, Q3s=Q3s)
                else:
                    raise ValueError('Invalid post processing method (ppmethod): {0}.'.format(ppmethod))
                B_corrected = ecorr.reorderClusters(B_corrected, X_summarised_normalised, GDM)
            except Exception:
                # Best effort: fall back to the non-optimised clusters, but let KeyboardInterrupt and
                # SystemExit propagate rather than treating them as an optimisation failure
                io.logerror(sys.exc_info())
                io.log('\n* Failed to perform cluster optimisation and completion!\n'
                       '* Skipped cluster optimisation and completion!\n')
                B_corrected = mnres.B
        else:
            B_corrected = mnres.B
    else:
        io.log('5. Skipping cluster optimisation and completion')
        B_corrected = mnres.B

    # Output: Write input parameters:
    io.log('6. Saving results in\n{0}'.format(outpath))
    inputparams = op.params(mnres.params, Q3s, OGsIncludedIfAtLeastInDatasets,
                            expressionValueThreshold, atleastinconditions, atleastindatasets,
                            deterministic, ures.params['methods'], MapNew)
    io.writedic('{0}/input_params.tsv'.format(in2out_path), inputparams, delim='\t')

    # Output: Generating and saving clusters
    res_og = op.clusters_genes_OGs(B_corrected, OGs, MapNew, MapSpecies, '; ')  # pandas DataFrame
    if mapfile is None:
        pd.DataFrame.to_csv(res_og, '{0}/Clusters_Objects.tsv'.format(outpath), sep='\t', encoding='utf-8',
                            index=None, columns=None, header=False)
    else:
        pd.DataFrame.to_csv(res_og, '{0}/Clusters_OGs.tsv'.format(outpath), sep='\t', encoding='utf-8',
                            index=None, columns=None, header=False)
        res_sp = op.clusters_genes_Species(B_corrected, OGs, MapNew, MapSpecies)  # pandas DataFrame
        for sp in range(len(res_sp)):
            pd.DataFrame.to_csv(res_sp[sp], '{0}/Clusters_{1}.tsv'.format(outpath, MapSpecies[sp]), sep='\t',
                                encoding='utf-8', index=None, columns=None, header=False)

    # Output: Save figures to a PDF
    try:
        if np.shape(B_corrected)[1] > 0:
            clusts_fig_file_name = '{0}/Clusters_profiles.pdf'.format(outpath)
            graph.plotclusters(X_summarised_normalised, B_corrected, datafiles_noext, conditions,
                               clusts_fig_file_name, GDM=GDM, Cs='all', setPageToDefault=True,
                               printToPDF=True, showPlots=False)
    except Exception:
        # Plotting is optional: record why it failed, then carry on with the other results files
        io.logerror(sys.exc_info())
        io.log('Error: could not save clusters plots in a PDF file.\n'
               'Resuming producing the other results files ...')

    # Output: Generating and writing eigengenes
    try:
        if np.shape(B_corrected)[1] > 0:
            if len(X_summarised_normalised) == 1:
                eigengene_matrix = eig.eigengenes_dataframe(X_summarised_normalised, B_corrected, conditions)
                eigengene_matrix.to_csv('{0}/Eigengenes.tsv'.format(outpath), sep='\t', encoding='utf-8')
            else:
                io.log('Eigengene computation is currently not supported for multiple datasets.')
    except Exception:
        # Eigengenes are optional as well: record why they failed, then carry on
        io.logerror(sys.exc_info())
        io.log('Error: could not save eigengenes into a file.\n'
               'Resuming producing the other results files ...')

    # Output: Prepare message to standard output and the summary then save the summary to a file and print the message
    summarymsg, endtime, timeconsumedtxt = \
        op.generateoutputsummaryparag(X, X_summarised_normalised, MapNew, GDMall, GDM,
                                      ures, mnres, B_corrected, starttime)
    summary = op.summarise_results(X, X_summarised_normalised, MapNew, GDMall, GDM,
                                   ures, mnres, B_corrected, starttime, endtime, timeconsumedtxt)
    io.writedic(outpath + '/Summary.tsv', summary, delim='\t')
    io.log(summarymsg, addextrastick=False)

    io.deletetmpfile()
def readNormalisation(normalisefile, datafiles, delimiter='\t| |,|;', defaultnormalisation=1000):
    """
    Work out the integer normalisation codes that apply to each dataset.

    :param normalisefile: None, or a list of strings. A list of one non-integer string is taken as the name
        of a normalisation file; otherwise the elements are taken as normalisation codes that must be
        convertible to int. A bare integer-like value (as judged by nu.isint) is tolerated and wrapped
        into a one-element list.
    :param datafiles: dataset names; the result has one entry per name, in the same order. Names in the
        first column of a normalisation file must be members of this collection.
    :param delimiter: regular expression used to split each line of the normalisation file
    :param defaultnormalisation: returned as it is when normalisefile is None; used as the only code of any
        dataset for which the normalisation file lists no codes
    :return: defaultnormalisation itself if normalisefile is None, otherwise a list with one list of
        integer codes per dataset
    :raises ValueError: if a line of the file starts with a name that is not in datafiles, or if a code
        cannot be converted to int
    """
    if normalisefile is None:
        return defaultnormalisation

    # Tolerate a single bare code, although a list is what is expected here
    if nu.isint(normalisefile):
        normalisefile = [normalisefile]

    n_datasets = len(datafiles)

    # Codes given directly: every dataset gets its own copy of the same list of codes
    if len(normalisefile) > 1 or nu.isint(normalisefile[0]):
        return [[int(code) for code in normalisefile] for _ in range(n_datasets)]

    # Otherwise the single element is the name of a file with lines of: <dataset name> <code> [<code> ...]
    filename = normalisefile[0]
    collected = [None] * n_datasets
    with open(filename) as handle:
        for line_no, raw in enumerate(handle, 1):
            # Drop everything after '#', trailing whitespace, and the empty tokens left by the split
            content = raw.partition('#')[0].rstrip()
            tokens = [tok for tok in re.split(delimiter, content) if tok]
            if not tokens:
                continue

            name = tokens[0]
            if name not in datafiles:
                raise ValueError(
                    'Unrecognised data file name ({0}) in line {1} in {2}.'.
                    format(name, line_no, filename))
            idx = datafiles.index(name)

            # A line that only names a dataset sets nothing for it
            codes = tokens[1:]
            if not codes:
                continue

            # Codes of a dataset that appears on several lines are accumulated
            if collected[idx] is None:
                collected[idx] = codes
            else:
                collected[idx] = collected[idx] + codes

    return [[defaultnormalisation] if codes is None else [int(code) for code in codes]
            for codes in collected]