def reduceEachChromosome(self, chromosome=None, passingData=None,
        mapEachInputDataLs=None, chromosome2mapEachIntervalDataLs=None,
        reduceEachInputDataLs=None, transferOutput=True, **keywords):
    """
    Merge the per-interval liftover-probability output of one chromosome,
    then feed the merged file into the final reduce job.
    """
    returnData = PassingData(no_of_jobs=0)
    returnData.jobDataLs = []
    returnData.mapEachInputDataLs = mapEachInputDataLs
    returnData.reduceEachInputDataLs = reduceEachInputDataLs
    #reduce matrix by chosen column and average p-value
    outputFile = File(os.path.join(self.reduceEachChromosomeDirJob.output,
        'chr_%s_LocusLiftOverProbability.tsv.gz' % (chromosome)))
    reduceChromosomeJob = self.addStatMergeJob(
        statMergeProgram=self.mergeSameHeaderTablesIntoOne,
        outputF=outputFile,
        parentJobLs=[self.reduceEachChromosomeDirJob], extraOutputLs=None,
        extraDependentInputLs=None, transferOutput=False)
        #extraArgumentList=['--keyColumnLs 0-6 --valueColumnLs 7'],
    mapEachIntervalDataLs = chromosome2mapEachIntervalDataLs.get(chromosome)
    for mapEachIntervalData in mapEachIntervalDataLs:
        for jobData in mapEachIntervalData.jobDataLs:
            self.addInputToMergeJob(reduceChromosomeJob,
                parentJobLs=[jobData.job])
    #add the per-chromosome reduction job to the final stat-merge job
    self.addInputToMergeJob(self.reduceJob, parentJobLs=[reduceChromosomeJob])
    return returnData
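# Data flow set up by reduceEachChromosome() above, for orientation:
#
#   per-interval map jobs (jobData.job, one per interval of this chromosome)
#       -> reduceChromosomeJob (chr_<chromosome>_LocusLiftOverProbability.tsv.gz)
#           -> self.reduceJob (the workflow-wide, cross-chromosome merge)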
def linkMapToReduce(self, mapEachIntervalData=None, preReduceReturnData=None,
        passingData=None, transferOutput=True, **keywords):
    """
    Establish dependencies between map jobs and the reduce jobs.
    A no-op by default; subclasses override it.
    """
    returnData = PassingData(no_of_jobs=0)
    returnData.jobDataLs = []
    return returnData
def mapEachAlignment(self, alignmentData=None, passingData=None,
        transferOutput=True, **keywords):
    """
    2012.9.22
        similar to reduceBeforeEachAlignmentData() but for mapping programs
        that run on one alignment each.

        At this point, passingData already carries:
            passingData.alignmentJobAndOutputLs = []
            passingData.bamFnamePrefix = bamFnamePrefix
            passingData.individual_alignment = alignment
    """
    returnData = PassingData(no_of_jobs=0)
    returnData.jobDataLs = []

    topOutputDirJob = passingData.topOutputDirJob
    refFastaF = passingData.refFastaFList[0]

    alignment = alignmentData.alignment
    parentJobLs = alignmentData.jobLs
    bamF = alignmentData.bamF
    baiF = alignmentData.baiF
    bamFnamePrefix = alignment.getReadGroup()
    return returnData
def reduceEachInput(self, chromosome=None, passingData=None,
        mapEachIntervalDataLs=None, transferOutput=True, **keywords):
    """
    2013.07.10
        #. concatenate all the sub-Inputs into one
    """
    returnData = PassingData(no_of_jobs=0)
    returnData.jobDataLs = []
    returnData.mapEachIntervalDataLs = mapEachIntervalDataLs

    #intervalJobLs = [pdata for pdata in mapEachIntervalDataLs]
    """
    realInputVolume = passingData.jobData.file.noOfIndividuals * \
        passingData.jobData.file.noOfLoci
    baseInputVolume = 200*20000
    walltime = self.scaleJobWalltimeOrMemoryBasedOnInput(
        realInputVolume=realInputVolume,
        baseInputVolume=baseInputVolume, baseJobPropertyValue=60,
        minJobPropertyValue=60, maxJobPropertyValue=500).value
    job_max_memory = self.scaleJobWalltimeOrMemoryBasedOnInput(
        realInputVolume=realInputVolume,
        baseInputVolume=baseInputVolume, baseJobPropertyValue=5000,
        minJobPropertyValue=5000, maxJobPropertyValue=10000).value
    """
    return returnData
def addJobs(self, inputURL=None, relativePathList=None, outputDir="",
        username=None, password=None, transferOutput=True):
    """
    2012.6.27
    """
    if relativePathList is None:    #avoid a mutable default argument
        relativePathList = []
    sys.stderr.write("Adding wget jobs for %s input ... " % (
        len(relativePathList)))
    no_of_jobs = 0

    topOutputDir = outputDir
    topOutputDirJob = self.addMkDirJob(outputDir=topOutputDir)
    no_of_jobs += 1

    returnData = PassingData()
    returnData.jobDataLs = []
    for relativePath in relativePathList:
        #2013.06.26 replace every "/" in relativePath in case it's a folder
        relativePathNoFolder = relativePath.replace('/', '_')
        logFile = File('%s.log' % (relativePathNoFolder))
        wgetJob = self.addWgetJob(executable=self.wget, url=inputURL,
            relativePath=relativePath,
            username=username, password=password,
            targetFolder=outputDir, logFile=logFile,
            cut_dir_number=self.cut_dir_number,
            parentJobLs=[topOutputDirJob], extraDependentInputLs=[],
            transferOutput=transferOutput,
            extraArguments=None, job_max_memory=50)
        returnData.jobDataLs.append(PassingData(jobLs=[wgetJob],
            file=wgetJob.output, fileLs=wgetJob.outputLs))
        no_of_jobs += 1
    sys.stderr.write("%s jobs.\n" % (no_of_jobs))
    return returnData
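# Naming example for the wget jobs above: a relativePath of
# "data/v1/chr1.vcf.gz" (a hypothetical path, for illustration) has every
# "/" replaced by "_", so its log file becomes "data_v1_chr1.vcf.gz.log".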
def preReduce(self, passingData=None, transferOutput=True, **keywords):
    """
    Set up additional mkdir folder jobs, before mapEachAlignment(),
    mapEachChromosome(), and mapReduceOneAlignment().
    """
    returnData = PassingData(no_of_jobs=0)
    returnData.jobDataLs = []
    return returnData
def reduce(self, passingData=None, reduceAfterEachAlignmentDataLs=None,
        transferOutput=True, **keywords):
    """
    2012.9.17
    """
    returnData = PassingData(no_of_jobs=0)
    returnData.jobDataLs = []
    returnData.reduceAfterEachAlignmentDataLs = reduceAfterEachAlignmentDataLs
    return returnData
def reduceAfterEachChromosome(self, chromosome=None, passingData=None,
        transferOutput=True, mapEachIntervalDataLs=None, **keywords):
    """
    """
    returnData = PassingData(no_of_jobs=0)
    returnData.jobDataLs = []
    returnData.mapEachIntervalDataLs = mapEachIntervalDataLs
    return returnData
def mapEachChromosome(self, alignmentData=None, chromosome=None,
        VCFJobData=None, passingData=None, reduceBeforeEachAlignmentData=None,
        transferOutput=True, **keywords):
    """
    2012.9.17
    """
    returnData = PassingData(no_of_jobs=0)
    returnData.jobDataLs = []
    return returnData
def map(self, alignmentData=None, intervalData=None,
        VCFJobData=None, passingData=None, mapEachChromosomeData=None,
        transferOutput=True, **keywords):
    """
    2012.9.17
    """
    returnData = PassingData(no_of_jobs=0)
    returnData.jobDataLs = []
    return returnData
def reduceBeforeEachAlignment(self, passingData=None, transferOutput=True,
        **keywords):
    """
    2012.9
        Set up some reduce jobs before the loop over all intervals of one
        alignment begins. These reduce jobs will collect stuff from each
        map() job. The link will be established in linkMapToReduce().
    """
    returnData = PassingData(no_of_jobs=0)
    returnData.jobDataLs = []
    return returnData
def reduceAfterEachAlignment(self, passingData=None,
        mapEachChromosomeDataLs=None, reduceAfterEachChromosomeDataLs=None,
        transferOutput=True, **keywords):
    """
    """
    returnData = PassingData(no_of_jobs=0)
    returnData.jobDataLs = []
    returnData.mapEachChromosomeDataLs = mapEachChromosomeDataLs
    returnData.reduceAfterEachChromosomeDataLs = reduceAfterEachChromosomeDataLs
    return returnData
def mapEachAlignment(self, passingData=None, transferOutput=True, **keywords):
    """
    2012.9.22
        similar to reduceBeforeEachAlignmentData() but for mapping programs
        that run on one alignment each.
    """
    returnData = PassingData(no_of_jobs=0)
    returnData.jobDataLs = []
    return returnData
def mapEachInterval(self, inputJobData=None, selectIntervalJobData=None,
        chromosome=None, intervalData=None, mapEachChromosomeData=None,
        passingData=None, transferOutput=False, **keywords):
    """
    #. extract flanking sequences from the input Input
        (ref sequence file => contig ref sequence)
    #. blast them
    #. run FindSNPPositionOnNewRefFromFlankingBlastOutput.py
        #. where hit length matches query length, and no. of
            mismatches <=2 => good => infer new coordinates
    #. output a mapping file between old SNP and new SNP coordinates
    #. reduce this thing by combining everything
    #. make a new Input file based on the input split Input file
        (replace contig ID and position with the new ones; remove or
        replace the header part regarding chromosomes)
    """
    returnData = PassingData(no_of_jobs=0)
    returnData.jobDataLs = []
    #passingData.intervalFileBasenamePrefix
    #passingData.splitInputFile
    """
    ## 2013.06.19 structures available from passingData, specific to the interval
    passingData.splitInputFile = splitInputFile
    passingData.unitNumber = unitNumber
    passingData.intervalFileBasenamePrefix = '%s_%s_splitInput_u%s'%(
        chromosome, commonPrefix, unitNumber)
    passingData.noOfIndividuals = jobData.file.noOfIndividuals
    passingData.span = self.intervalSize + self.intervalOverlapSize*2
    """
    #add one computing job
    outputFile = File(os.path.join(self.mapDirJob.output,
        "%s.%s.probability.tsv.gz" % (passingData.fileBasenamePrefix,
            intervalData.interval)))
    locusIntervalDeltaOutputFile = File(os.path.join(self.mapDirJob.output,
        "%s.%s.locusIntervalDelta.tsv.gz" % (passingData.fileBasenamePrefix,
            intervalData.interval)))
    job = self.addAbstractMatrixFileWalkerJob(
        executable=self.ComputeLiftOverLocusProbability,
        inputFile=selectIntervalJobData.file, outputFile=outputFile,
        whichColumn=None, whichColumnHeader=None,
        logY=None, valueForNonPositiveYValue=-1,
        minNoOfTotal=1, samplingRate=1,
        inputFileFormat=None, outputFileFormat=None,
        extraArgumentList=["--locusIntervalDeltaOutputFname",
                locusIntervalDeltaOutputFile,
            "--startPosition %s" % (intervalData.start),
            "--stopPosition %s" % (intervalData.stop)],
        parentJobLs=[selectIntervalJobData.job],
        extraOutputLs=[locusIntervalDeltaOutputFile],
        transferOutput=transferOutput, job_max_memory=2000, sshDBTunnel=False)
    #For each interval, probabilities are not calculated for loci in the
    # extra segment (from overlapStart to start).
    returnData.jobDataLs.append(self.constructJobDataFromJob(job))
    return returnData
def mapEachChromosome(self, alignmentData=None, chromosome=None,
        VCFJobData=None, passingData=None, reduceBeforeEachAlignmentData=None,
        transferOutput=True, **keywords):
    """
    2012.9.17
    """
    returnData = PassingData(no_of_jobs=0)
    returnData.jobDataLs = []

    topOutputDirJob = passingData.topOutputDirJob

    alignment = alignmentData.alignment
    parentJobLs = alignmentData.jobLs
    bamF = alignmentData.bamF
    baiF = alignmentData.baiF
    bamFnamePrefix = passingData.bamFnamePrefix

    """
    #2012.9.21 perhaps a downsampling job
    outputFname = os.path.join(topOutputDirJob.output,
        '%s_%s.bam' % (bamFnamePrefix, overlapFileBasenameSignature))
    outputFile = File(outputFname)
    selectAlignmentJob, bamIndexJob1 = self.addSelectAlignmentJob(
        executable=self.samtools, inputFile=bamF,
        outputFile=outputFile, region=overlapInterval,
        parentJobLs=[topOutputDirJob] + parentJobLs,
        extraDependentInputLs=[baiF], transferOutput=False,
        extraArguments=None, job_max_memory=2000, needBAMIndexJob=True)
    """
    """
    #2012.9.21 count covariates job is moved to map()
    recalFile = File(os.path.join(topOutputDirJob.output,
        '%s_%s.recal_data.csv' % (bamFnamePrefix, chromosome)))
    countCovariatesJob = self.addGATKBaseRecalibratorJob(
        GenomeAnalysisTKJar=self.GenomeAnalysisTK2Jar, inputFile=bamF,
        VCFFile=VCFFile, interval=chromosome, outputFile=recalFile,
        refFastaFList=passingData.refFastaFList,
        parentJobLs=[topOutputDirJob]+parentJobLs,
        extraDependentInputLs=[baiF, VCFFile.tbi_F],
        transferOutput=False,
        extraArguments=None, job_max_memory=4000)
    self.no_of_jobs += 1
    returnData.countCovariatesJob = countCovariatesJob
    returnData.jobDataLs.append(PassingData(jobLs=[countCovariatesJob],
        file=countCovariatesJob.recalFile,
        fileLs=[countCovariatesJob.recalFile]))
    """
    return returnData
def reduce(self, passingData=None, reduceEachChromosomeDataLs=None,
        transferOutput=True, **keywords):
    """
    #. merge all output of input jobs
        (passingData.mapEachIntervalDataLsLs) into one big one
    """
    returnData = PassingData(no_of_jobs=0)
    returnData.jobDataLs = []
    reduceOutputDirJob = passingData.reduceOutputDirJob

    realInputVolume = passingData.jobData.file.noOfIndividuals * \
        passingData.jobData.file.noOfLoci
    baseInputVolume = 200 * 20000
    walltime = self.scaleJobWalltimeOrMemoryBasedOnInput(
        realInputVolume=realInputVolume,
        baseInputVolume=baseInputVolume,
        baseJobPropertyValue=60,
        minJobPropertyValue=60,
        maxJobPropertyValue=500).value
    job_max_memory = self.scaleJobWalltimeOrMemoryBasedOnInput(
        realInputVolume=realInputVolume,
        baseInputVolume=baseInputVolume,
        baseJobPropertyValue=5000,
        minJobPropertyValue=5000,
        maxJobPropertyValue=10000).value

    outputFile = File(os.path.join(reduceOutputDirJob.output,
        'sameSiteConcordance.tsv'))
    reduceJob = self.addStatMergeJob(
        statMergeProgram=self.mergeSameHeaderTablesIntoOne,
        outputF=outputFile,
        parentJobLs=[reduceOutputDirJob],
        transferOutput=transferOutput)
    returnData.jobDataLs.append(PassingData(jobLs=[reduceJob],
        file=reduceJob.output, fileLs=[reduceJob.output]))

    for mapEachIntervalDataLs in passingData.mapEachIntervalDataLsLs:
        for mapEachIntervalData in mapEachIntervalDataLs:
            self.addInputToMergeJob(reduceJob,
                parentJobLs=[mapEachIntervalData.mapJob])
    return returnData
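# A minimal stand-alone sketch of what scaleJobWalltimeOrMemoryBasedOnInput()
# presumably computes (the function below is hypothetical, for illustration
# only; the real method returns an object whose .value attribute carries the
# scaled number): grow a base property value linearly with the input-volume
# ratio, then clamp it to [minJobPropertyValue, maxJobPropertyValue].
def _scaleJobPropertySketch(realInputVolume, baseInputVolume,
        baseJobPropertyValue, minJobPropertyValue, maxJobPropertyValue):
    #linear scaling by the volume ratio is an assumption about the real method
    scaled = baseJobPropertyValue * float(realInputVolume) / baseInputVolume
    return int(min(max(scaled, minJobPropertyValue), maxJobPropertyValue))

# With the constants used in reduce() above: ten times the base volume of
# 200*20000 would scale the 60-minute base walltime to 600 minutes, which the
# 500-minute cap then clamps:
#   _scaleJobPropertySketch(10*200*20000, 200*20000, 60, 60, 500) == 500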
def preReduce(self, outputDirPrefix="", passingData=None, transferOutput=True,
        **keywords):
    """
    2013.06.14 move topOutputDirJob from addAllJobs to here.
    2012.9.17
    """
    returnData = PassingData(no_of_jobs=0)
    returnData.jobDataLs = []

    self.topOutputDirJob = self.addMkDirJob(outputDir="%sRun" % (outputDirPrefix))
    passingData.topOutputDirJob = self.topOutputDirJob

    mapDirJob = self.addMkDirJob(outputDir="%sMap" % (outputDirPrefix))
    passingData.mapDirJob = mapDirJob
    returnData.mapDirJob = mapDirJob
    self.mapDirJob = mapDirJob

    reduceOutputDirJob = self.addMkDirJob(outputDir="%sReduce" % (outputDirPrefix))
    passingData.reduceOutputDirJob = reduceOutputDirJob
    returnData.reduceOutputDirJob = reduceOutputDirJob
    self.reduceOutputDirJob = reduceOutputDirJob

    self.plotDirJob = self.addMkDirJob(outputDir="%sPlot" % (outputDirPrefix))
    self.statDirJob = self.addMkDirJob(outputDir="%sStat" % (outputDirPrefix))
    self.reduceStatDirJob = self.addMkDirJob(
        outputDir="%sReduceStat" % (outputDirPrefix))
    self.reduceEachInputDirJob = self.addMkDirJob(
        outputDir="%sReduceEachInput" % (outputDirPrefix))
    self.reduceEachChromosomeDirJob = self.addMkDirJob(
        outputDir="%sReduceEachChromosome" % (outputDirPrefix))
    return returnData
def reduce(self, reduceEachChromosomeDataLs=None,
        mapEachChromosomeDataLs=None, passingData=None, transferOutput=True,
        **keywords):
    """
    2013.07.18 return each processed-Input job data so that
        followup workflows could carry out map-reduce.
    2012.9.17
    """
    returnData = PassingData(no_of_jobs=0)
    returnData.jobDataLs = []
    returnData.mapEachChromosomeDataLs = mapEachChromosomeDataLs
    returnData.reduceEachChromosomeDataLs = reduceEachChromosomeDataLs
    """
    #2013.07.18 example to return each processed-Input job data
    # so that followup workflows could carry out map-reduce
    for reduceEachInputDataLs in passingData.reduceEachInputDataLsLs:
        if reduceEachInputDataLs:
            for reduceEachInputData in reduceEachInputDataLs:
                if reduceEachInputData:
                    returnData.jobDataLs.append(
                        reduceEachInputData.WHATEVERJobData)
    """
    return returnData
def mapEachInterval(self, inputJobData=None, selectIntervalJobData=None,
        chromosome=None, intervalData=None, mapEachChromosomeData=None,
        passingData=None, transferOutput=False, **keywords):
    """
    2013.04.08 use inputJobData
    2012.10.3
        #. extract flanking sequences from the input Input
            (ref sequence file => contig ref sequence)
        #. blast them
        #. run FindSNPPositionOnNewRefFromFlankingBlastOutput.py
            #. where hit length matches query length, and no. of
                mismatches <=2 => good => infer new coordinates
        #. output a mapping file between old SNP and new SNP coordinates
        #. reduce this thing by combining everything
        #. make a new Input file based on the input split Input file
            (replace contig ID and position with the new ones; remove or
            replace the header part regarding chromosomes)
    """
    returnData = PassingData(no_of_jobs=0)
    returnData.jobDataLs = []
    #passingData.intervalFileBasenamePrefix
    #passingData.splitInputFile
    #passingData.unitNumber
    """
    ## 2013.06.19 structures available from passingData, specific to the interval
    passingData.splitInputFile = splitInputFile
    passingData.unitNumber = unitNumber
    passingData.intervalFileBasenamePrefix = '%s_%s_splitInput_u%s'%(
        chromosome, commonPrefix, unitNumber)
    passingData.noOfIndividuals = jobData.file.noOfIndividuals
    passingData.span = self.intervalSize + self.intervalOverlapSize*2
        #2013.06.19 for memory/walltime gauging
    """
    return returnData
def addJobs(self, inputData=None, db_main=None, genotypeMethodShortName=None,
        commit=None, data_dir=None, checkEmptyVCFByReading=False,
        transferOutput=True, maxContigID=None, outputDirPrefix="",
        needSSHDBTunnel=False):
    """
    2012.5.9
    """
    sys.stderr.write("Adding VCF2DB jobs for %s vcf files ... " % (
        len(inputData.jobDataLs)))

    topOutputDir = "%sVCF2DB" % (outputDirPrefix)
    topOutputDirJob = self.addMkDirJob(outputDir=topOutputDir)

    firstVCFFile = inputData.jobDataLs[0].vcfFile
    logFile = File(os.path.join(topOutputDir, 'AddGenotypeMethod2DB.log'))
    addGM2DBJob = self.addAddGenotypeMethod2DBJob(
        executable=self.AddGenotypeMethod2DB, inputFile=firstVCFFile,
        genotypeMethodShortName=genotypeMethodShortName,
        logFile=logFile, data_dir=data_dir, commit=commit,
        parentJobLs=None, extraDependentInputLs=None, transferOutput=True,
        extraArguments=None, job_max_memory=10, sshDBTunnel=needSSHDBTunnel)
    updateGMlogFile = File(os.path.join(topOutputDir, 'updateGM.log'))
    updateGMNoOfLociJob = self.addUpdateGenotypeMethodNoOfLociJob(
        executable=self.UpdateGenotypeMethodNoOfLoci,
        genotypeMethodShortName=genotypeMethodShortName,
        logFile=updateGMlogFile, data_dir=data_dir, commit=commit,
        parentJobLs=[topOutputDirJob],
        extraDependentInputLs=[], transferOutput=True,
        extraArguments=None, job_max_memory=20, sshDBTunnel=needSSHDBTunnel)

    returnData = PassingData()
    returnData.jobDataLs = []
    for jobData in inputData.jobDataLs:
        inputF = jobData.vcfFile
        if maxContigID:
            contig_id = self.getContigIDFromFname(inputF.name)
            try:
                contig_id = int(contig_id)
                if contig_id > maxContigID:    #skip the small contigs
                    continue
            except Exception:
                sys.stderr.write('Except type: %s\n' % repr(sys.exc_info()))
                import traceback
                traceback.print_exc()
        logFile = File(os.path.join(topOutputDir,
            'AddVCFFile2DB_%s.log' % (self.getChrFromFname(inputF.name))))
        addVCFJob = self.addAddVCFFile2DBJob(
            executable=self.AddVCFFile2DB, inputFile=inputF,
            genotypeMethodShortName=genotypeMethodShortName,
            logFile=logFile, format="VCF", data_dir=data_dir,
            checkEmptyVCFByReading=checkEmptyVCFByReading, commit=commit,
            parentJobLs=[addGM2DBJob] + jobData.jobLs,
            extraDependentInputLs=[], transferOutput=True,
            extraArguments=None, job_max_memory=1000,
            sshDBTunnel=needSSHDBTunnel)
        self.add_dependency(updateGMNoOfLociJob, parents=[addVCFJob])
    sys.stderr.write("%s jobs.\n" % (self.no_of_jobs))
    returnData.jobDataLs.append(PassingData(jobLs=[updateGMNoOfLociJob],
        file=updateGMlogFile, fileLs=[updateGMlogFile]))
    return returnData
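# Job dependencies established by addJobs() above:
#
#   addGM2DBJob (registers the genotype method once, off the first VCF)
#       -> one addVCFJob per input VCF (parents: addGM2DBJob + that VCF's jobs)
#           -> updateGMNoOfLociJob (runs only after every addVCFJob,
#              via add_dependency)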
def addAllJobs(self, alignmentDataLs=None, chr2IntervalDataLs=None,
        skipDoneAlignment=False, registerReferenceData=None,
        needFastaIndexJob=False, needFastaDictJob=False,
        data_dir=None, outputDirPrefix="", transferOutput=True, **keywords):
    """
    2012.7.26
    """
    prePreprocessData = self.setup_chr()
    chrIDSet = prePreprocessData.chrIDSet
    chrSizeIDList = prePreprocessData.chrSizeIDList
    chr2VCFJobData = prePreprocessData.chr2VCFJobData

    print(f"Adding jobs that work on {len(alignmentDataLs)} alignments "
        f"(& possibly VCFs) for {len(chrIDSet)} chromosomes/contigs ...",
        flush=True)
    refFastaFList = registerReferenceData.refFastaFList
    refFastaF = refFastaFList[0]

    topOutputDirJob = self.addMkDirJob(outputDir="%sMap" % (outputDirPrefix))
    self.mapDirJob = topOutputDirJob

    plotOutputDirJob = self.addMkDirJob(outputDir="%sPlot" % (outputDirPrefix))
    self.plotOutputDirJob = plotOutputDirJob

    reduceOutputDirJob = self.addMkDirJob(outputDir="%sReduce" % (outputDirPrefix))
    self.reduceOutputDirJob = reduceOutputDirJob

    if needFastaDictJob or registerReferenceData.needPicardFastaDictJob:
        fastaDictJob = self.addRefFastaDictJob(refFastaF=refFastaF)
        refFastaDictF = fastaDictJob.refFastaDictF
    else:
        fastaDictJob = None
        refFastaDictF = registerReferenceData.refPicardFastaDictF

    if needFastaIndexJob or registerReferenceData.needSAMtoolsFastaIndexJob:
        fastaIndexJob = self.addRefFastaFaiIndexJob(refFastaF=refFastaF)
        refFastaIndexF = fastaIndexJob.refFastaIndexF
    else:
        fastaIndexJob = None
        refFastaIndexF = registerReferenceData.refSAMtoolsFastaIndexF

    returnData = PassingData()
    returnData.jobDataLs = []

    #2012.9.22 alignmentJobAndOutputLs is a relic.
    # It's similar to mapEachIntervalDataLs but designed for
    # addAlignmentMergeJob(), so alignmentJobAndOutputLs gets re-set
    # for every alignment.
    # mapEachAlignmentDataLs is never reset.
    # mapEachChromosomeDataLs is reset right after a new alignment is chosen.
    # mapEachIntervalDataLs is reset right after each chromosome is chosen.
    # all reduce dataLs never get reset.
    passingData = PassingData(alignmentJobAndOutputLs=[],
        alignmentDataLs=alignmentDataLs,
        bamFnamePrefix=None,
        outputDirPrefix=outputDirPrefix,
        topOutputDirJob=topOutputDirJob,
        plotOutputDirJob=plotOutputDirJob,
        reduceOutputDirJob=reduceOutputDirJob,
        refFastaFList=refFastaFList,
        registerReferenceData=registerReferenceData,
        refFastaF=refFastaFList[0],
        fastaDictJob=fastaDictJob,
        refFastaDictF=refFastaDictF,
        fastaIndexJob=fastaIndexJob,
        refFastaIndexF=refFastaIndexF,
        chromosome=None,
        chrIDSet=chrIDSet,
        chrSizeIDList=chrSizeIDList,
        chr2IntervalDataLs=chr2IntervalDataLs,
        mapEachAlignmentData=None,
        mapEachChromosomeData=None,
        mapEachIntervalData=None,
        reduceBeforeEachAlignmentData=None,
        reduceAfterEachAlignmentData=None,
        reduceAfterEachChromosomeData=None,
        mapEachAlignmentDataLs=[],
        mapEachChromosomeDataLs=[],
        mapEachIntervalDataLs=[],
        reduceBeforeEachAlignmentDataLs=[],
        reduceAfterEachAlignmentDataLs=[],
        reduceAfterEachChromosomeDataLs=[],
        gzipReduceAfterEachChromosomeFolderJob=None,
        gzipReduceBeforeEachAlignmentFolderJob=None,
        gzipReduceAfterEachAlignmentFolderJob=None,
        gzipPreReduceFolderJob=None,
        gzipReduceFolderJob=None)
    preReduceReturnData = self.preReduce(passingData=passingData,
        transferOutput=False, **keywords)
    passingData.preReduceReturnData = preReduceReturnData

    no_of_alignments_worked_on = 0
    for alignmentData in passingData.alignmentDataLs:
        alignment = alignmentData.alignment
        parentJobLs = alignmentData.jobLs + [fastaDictJob, fastaIndexJob]
        bamF = alignmentData.bamF
        baiF = alignmentData.baiF
        bamFnamePrefix = alignment.getReadGroup()

        passingData.alignmentJobAndOutputLs = []
        passingData.bamFnamePrefix = bamFnamePrefix
        passingData.individual_alignment = alignment
        passingData.alignmentData = alignmentData

        if skipDoneAlignment and self.isThisAlignmentComplete(
                individual_alignment=alignment, data_dir=data_dir):
            continue
        no_of_alignments_worked_on += 1

        mapEachAlignmentData = self.mapEachAlignment(
            alignmentData=alignmentData, passingData=passingData,
            transferOutput=False,
            preReduceReturnData=preReduceReturnData, **keywords)
        passingData.mapEachAlignmentDataLs.append(mapEachAlignmentData)
        passingData.mapEachAlignmentData = mapEachAlignmentData

        reduceBeforeEachAlignmentData = self.reduceBeforeEachAlignment(
            passingData=passingData,
            preReduceReturnData=preReduceReturnData,
            transferOutput=False, **keywords)
        passingData.reduceBeforeEachAlignmentData = reduceBeforeEachAlignmentData
        passingData.reduceBeforeEachAlignmentDataLs.append(
            reduceBeforeEachAlignmentData)

        mapReduceOneAlignmentReturnData = self.mapReduceOneAlignment(
            alignmentData=alignmentData,
            passingData=passingData,
            chrIDSet=chrIDSet, chrSizeIDList=chrSizeIDList,
            chr2IntervalDataLs=chr2IntervalDataLs,
            chr2VCFJobData=chr2VCFJobData,
            outputDirPrefix=outputDirPrefix, transferOutput=transferOutput)

        reduceAfterEachAlignmentData = self.reduceAfterEachAlignment(
            mapEachAlignmentData=mapEachAlignmentData,
            mapEachChromosomeDataLs=passingData.mapEachChromosomeDataLs,
            reduceAfterEachChromosomeDataLs=\
                passingData.reduceAfterEachChromosomeDataLs,
            passingData=passingData,
            transferOutput=False, data_dir=data_dir, **keywords)
        passingData.reduceAfterEachAlignmentData = reduceAfterEachAlignmentData
        passingData.reduceAfterEachAlignmentDataLs.append(
            reduceAfterEachAlignmentData)

        gzipReduceBeforeEachAlignmentData = self.addGzipSubWorkflow(
            inputData=reduceBeforeEachAlignmentData,
            transferOutput=transferOutput,
            outputDirPrefix="%sReduceBeforeEachAlignment" % (outputDirPrefix),
            topOutputDirJob=passingData.gzipReduceBeforeEachAlignmentFolderJob,
            report=False)
        passingData.gzipReduceBeforeEachAlignmentFolderJob = \
            gzipReduceBeforeEachAlignmentData.topOutputDirJob

        gzipReduceAfterEachAlignmentData = self.addGzipSubWorkflow(
            inputData=reduceAfterEachAlignmentData,
            transferOutput=transferOutput,
            outputDirPrefix="%sReduceAfterEachAlignment" % (outputDirPrefix),
            topOutputDirJob=passingData.gzipReduceAfterEachAlignmentFolderJob,
            report=False)
        passingData.gzipReduceAfterEachAlignmentFolderJob = \
            gzipReduceAfterEachAlignmentData.topOutputDirJob

    reduceReturnData = self.reduce(passingData=passingData,
        mapEachAlignmentData=passingData.mapEachAlignmentData,
        reduceAfterEachAlignmentDataLs=\
            passingData.reduceAfterEachAlignmentDataLs,
        **keywords)
    passingData.reduceReturnData = reduceReturnData

    #2012.9.18 gzip the final output
    newReturnData = self.addGzipSubWorkflow(inputData=preReduceReturnData,
        transferOutput=transferOutput,
        outputDirPrefix="%sGzipPreReduce" % (outputDirPrefix),
        topOutputDirJob=passingData.gzipPreReduceFolderJob,
        report=False)
    passingData.gzipPreReduceFolderJob = newReturnData.topOutputDirJob

    newReturnData = self.addGzipSubWorkflow(inputData=reduceReturnData,
        transferOutput=transferOutput,
        outputDirPrefix="%sGzipReduce" % (outputDirPrefix),
        topOutputDirJob=passingData.gzipReduceFolderJob,
        report=False)
    passingData.gzipReduceFolderJob = newReturnData.topOutputDirJob

    sys.stderr.write("%s alignments to be worked on. %s jobs.\n" % (
        no_of_alignments_worked_on, self.no_of_jobs))
    return returnData
def reduceAfterEachAlignment(self, passingData=None, transferOutput=False,
        data_dir=None, **keywords):
    """
    """
    returnData = PassingData(no_of_jobs=0)
    returnData.jobDataLs = []
    alignmentJobAndOutputLs = getattr(passingData, 'alignmentJobAndOutputLs', [])
    bamFnamePrefix = passingData.bamFnamePrefix
    topOutputDirJob = passingData.topOutputDirJob
    individual_alignment = passingData.individual_alignment
    reduceOutputDirJob = passingData.reduceOutputDirJob

    if len(alignmentJobAndOutputLs) > 0:
        #2012.3.29 merge alignment output only when there is something to merge!
        #2013.04.09 create a new child alignment with local_realigned=1, etc.
        new_individual_alignment = self.db.copyParentIndividualAlignment(
            parent_individual_alignment_id=individual_alignment.id,
            mask_genotype_method_id=self.new_mask_genotype_method_id,
            data_dir=self.data_dir, local_realigned=1)

        baseCoverage = 4    #baseline
        actualCoverage = getattr(individual_alignment.individual_sequence,
            'coverage', baseCoverage)
        minMergeAlignmentWalltime = 240
            #in minutes, 4 hours, when coverage is baseCoverage
        maxMergeAlignmentWalltime = 2880    #in minutes, 2 days
        minMergeAlignmentMaxMemory = 7000
            #in MB, when coverage is baseCoverage
        maxMergeAlignmentMaxMemory = 12000    #in MB

        mergeAlignmentWalltime = self.scaleJobWalltimeOrMemoryBasedOnInput(
            realInputVolume=actualCoverage, baseInputVolume=baseCoverage,
            baseJobPropertyValue=minMergeAlignmentWalltime * 2,
            minJobPropertyValue=minMergeAlignmentWalltime,
            maxJobPropertyValue=maxMergeAlignmentWalltime).value
        mergeAlignmentMaxMemory = self.scaleJobWalltimeOrMemoryBasedOnInput(
            realInputVolume=actualCoverage, baseInputVolume=baseCoverage,
            baseJobPropertyValue=minMergeAlignmentMaxMemory,
            minJobPropertyValue=minMergeAlignmentMaxMemory,
            maxJobPropertyValue=maxMergeAlignmentMaxMemory).value

        # replace read_group with the new one in each alignment job
        newAlignmentJobAndOutputLs = []
        for alignmentJobAndOutput in alignmentJobAndOutputLs:
            # add an AddReadGroup job
            alignmentJob, indexAlignmentJob = alignmentJobAndOutput.jobLs[:2]
            fileBasenamePrefix = os.path.splitext(alignmentJob.output.name)[0]
            outputRGBAM = File("%s.isq_RG.bam" % (fileBasenamePrefix))
            # needBAMIndexJob=False because addAlignmentMergeJob()
            # does not need .bai.
            addRGJob = self.addReadGroupJob(
                individual_alignment=new_individual_alignment,
                inputBamFile=alignmentJob.output,
                outputBamFile=outputRGBAM,
                needBAMIndexJob=False,
                parentJobLs=[alignmentJob, indexAlignmentJob],
                extraDependentInputLs=alignmentJob.outputLs[1:],
                job_max_memory=2500, transferOutput=False,
                walltime=max(180, mergeAlignmentWalltime // 20))
            newAlignmentJobAndOutputLs.append(
                PassingData(jobLs=[addRGJob], file=addRGJob.output))

        mergedBamFile = File(os.path.join(reduceOutputDirJob.output,
            '%s_recal.bam' % (bamFnamePrefix)))
        alignmentMergeJob, bamIndexJob = self.addAlignmentMergeJob(
            alignmentJobAndOutputLs=newAlignmentJobAndOutputLs,
            outputBamFile=mergedBamFile,
            needBAMIndexJob=True,
            parentJobLs=[reduceOutputDirJob],
            walltime=mergeAlignmentWalltime,
            job_max_memory=mergeAlignmentMaxMemory,
            transferOutput=False)

        #2012.9.19 add/copy the alignment file to db-affiliated storage;
        # pass the metric file to AddAlignmentFile2DB.py as well
        # (to be moved into db-affiliated storage)
        logFile = File(os.path.join(reduceOutputDirJob.output,
            '%s_2db.log' % (bamFnamePrefix)))
        alignment2DBJob = self.addAlignmentFile2DBJob(
            executable=self.AddAlignmentFile2DB,
            inputFile=alignmentMergeJob.output,
            baiFile=bamIndexJob.baiFile,
            individual_alignment_id=new_individual_alignment.id,
            mask_genotype_method_id=self.new_mask_genotype_method_id,
            logFile=logFile, data_dir=data_dir,
            otherInputFileList=None,
            parentJobLs=[alignmentMergeJob, bamIndexJob],
            transferOutput=transferOutput,
            sshDBTunnel=self.needSSHDBTunnel, commit=True,
            job_max_memory=2000,
            walltime=max(180, mergeAlignmentWalltime // 2))
        self.no_of_jobs += 1
        returnData.jobDataLs.append(PassingData(jobLs=[alignment2DBJob],
            file=alignment2DBJob.logFile,
            fileLs=[alignment2DBJob.logFile]))
    return returnData
def addAllJobs(self, data_dir=None, outputDirPrefix="", transferOutput=True,
        **keywords):
    """
    2013.2.27 pipeline outline:
        run ms
        estimate parameters from ms
        ms2SLiM
        SLiM forward simulator with estimated ms-parameters,
            or take the output of ms as input
        SLiM2PolymorphismTableFile
        AddPopGenSimulation2DB.py
    """
    sys.stderr.write("Adding jobs for pop-gen simulation #jobs=%s... \n" % (
        self.no_of_jobs))
    returnData = PassingData()
    returnData.jobDataLs = []

    passingData = PassingData(fileBasenamePrefix=None,
        outputDirPrefix=outputDirPrefix,
        jobData=None,
        preReduceReturnData=None,
        association_group_key2orderIndex={},
        association_group_key2resultList={},
        association_group_key2reduceAssociationPeakJobMatrix={},
        association_group_key2countAssociationLocusJobList={},
        resultID2defineLandscapeJobData={})

    preReduceReturnData = self.preReduce(outputDirPrefix=outputDirPrefix,
        passingData=passingData, transferOutput=False,
        **keywords)

    mapDirJob = preReduceReturnData.mapDirJob
    plotOutputDirJob = preReduceReturnData.plotOutputDirJob
    countAssociationLocusOutputDirJob = \
        preReduceReturnData.countAssociationLocusOutputDirJob
    reduceOutputDirJob = preReduceReturnData.reduceOutputDirJob

    passingData.preReduceReturnData = preReduceReturnData

    #add output pedigree job
    for i in range(self.noOfReplicates):
        popGenSimulationFolderJob = self.addMkDirJob(
            outputDir=os.path.join(mapDirJob.output, 'popGenSim%s' % (i)),
            parentJobLs=[mapDirJob])
        #pending user choice, use ms/sfs-code/slim/ms & slim combination
        msOutputFile = File(os.path.join(popGenSimulationFolderJob.output,
            'sim%s_msOutput.txt.gz' % (i)))
        popSimulationJob = self.addMSSimulationJob(outputFile=msOutputFile,
            recombinationRate=self.recombinationRate,
            mutationRate=self.mutationRate,
            initialEffectivePopulationSize=self.initialEffectivePopulationSize,
            otherParametersPassedToPopGenSimulator=\
                self.otherParametersPassedToPopGenSimulator,
            sampleSize=self.sampleSize,
            noOfLociToSimulate=self.noOfLociToSimulate,
            simulateLocusLengthList=self.simulateLocusLengthList,
            parentJobLs=[popGenSimulationFolderJob],
            extraDependentInputLs=None, extraOutputLs=None,
            transferOutput=False, extraArguments=None, extraArgumentList=None,
            job_max_memory=2000, walltime=180)

        #. convert ms pop-gen output to a polymorphism-table file
        msOutputHDF5File = File(os.path.join(popGenSimulationFolderJob.output,
            'sim%s_msOutput.h5' % (i)))
        msOutput2PolymorphismTableFileJob = self.addGenericJob(
            executable=self.msOutput2PolymorphismTableFile,
            inputFile=popSimulationJob.output,
            outputFile=msOutputHDF5File,
            parentJob=None,
            parentJobLs=[popGenSimulationFolderJob, popSimulationJob],
            extraDependentInputLs=None, extraOutputLs=None,
            transferOutput=False,
            frontArgumentList=None,
            extraArguments=None,
            extraArgumentList=None, job_max_memory=2000,
            no_of_cpus=None, walltime=None)

        #. add the polymorphism-table file to db
        logFile = File(os.path.join(popGenSimulationFolderJob.output,
            "sim%s_2DB.log" % (i)))
        extraArgumentList = ["--r %s" % self.recombinationRate,
            "--rho %s" % popSimulationJob.rho,
            "--mu %s" % self.mutationRate,
            "--theta %s" % popSimulationJob.theta,
            "--n0 %s" % self.initialEffectivePopulationSize,
            "--no_of_populations 1",
            "--no_of_chromosomes %s" % self.sampleSize,
            "--chromosome_length %s" % popSimulationJob.locusLength,
            "--replicate_index %s" % (i)]
        """
        extraArgumentList.append("--parent_pop_gen_simulation_type_id %s" % \
            self.parent_pop_gen_simulation_type_id)
        """
        simulation2DBJob = self.addPutStuffIntoDBJob(
            executable=self.AddPopGenSimulation2DB,
            inputFileList=[msOutput2PolymorphismTableFileJob.output],
            logFile=logFile, commit=True,
            parentJobLs=[popGenSimulationFolderJob,
                msOutput2PolymorphismTableFileJob],
            extraDependentInputLs=None, transferOutput=True,
            extraArguments=None,
            extraArgumentList=extraArgumentList,
            job_max_memory=10, sshDBTunnel=self.needSSHDBTunnel)
    return returnData
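# Per-replicate job chain built by addAllJobs() above:
#
#   popSimulationJob (ms)
#       -> msOutput2PolymorphismTableFileJob (ms output => HDF5)
#           -> simulation2DBJob (AddPopGenSimulation2DB, commits to the db)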
def mapEachInterval(self, VCFJobData=None, passingData=None,
        transferOutput=False, **keywords):
    """
    use VCFJobData
    #. extract flanking sequences from the input VCF
        (ref sequence file => contig ref sequence)
    #. blast them
    #. run FindSNPPositionOnNewRefFromFlankingBlastOutput.py
        #. where hit length matches query length, and no. of
            mismatches <=2 => good => infer new coordinates
    #. output a mapping file between old SNP and new SNP coordinates
    #. reduce this thing by combining everything
    #. make a new VCF file based on the input split VCF file
        (replace contig ID and position with the new ones; remove or
        replace the header part regarding chromosomes)
    """
    returnData = PassingData(no_of_jobs=0)
    returnData.jobDataLs = []

    topOutputDirJob = passingData.topOutputDirJob
    mapDirJob = passingData.mapDirJob
    reduceOutputDirJob = passingData.reduceOutputDirJob

    intervalFileBasenamePrefix = passingData.intervalFileBasenamePrefix
    jobData = passingData.jobData
    VCFFile = VCFJobData.file
    splitVCFJob = passingData.mapEachVCFData.splitVCFJob
    chromosome = passingData.chromosome

    # a same-site concordance job, scaled by noOfIndividuals X span
    realInputVolume = passingData.noOfIndividuals * passingData.span
    baseInputVolume = 600 * 2000    #600 individuals at 2000 sites
    #base volume => 60 minutes of walltime
    walltime = self.scaleJobWalltimeOrMemoryBasedOnInput(
        realInputVolume=realInputVolume,
        baseInputVolume=baseInputVolume,
        baseJobPropertyValue=60,
        minJobPropertyValue=60, maxJobPropertyValue=1200).value
    #base volume => 4000M of memory
    job_max_memory = self.scaleJobWalltimeOrMemoryBasedOnInput(
        realInputVolume=realInputVolume,
        baseInputVolume=baseInputVolume,
        baseJobPropertyValue=4000,
        minJobPropertyValue=4000, maxJobPropertyValue=8000).value

    outputFnamePrefix = os.path.join(mapDirJob.output,
        '%s.sameSite.concordance' % (intervalFileBasenamePrefix))
    outputFile = File('%s.tsv' % (outputFnamePrefix))

    returnData.mapJob = self.addAbstractMapperLikeJob(
        executable=self.CalculateSameSiteConcordanceInVCF,
        inputF=VCFFile, outputF=outputFile,
        parentJobLs=[mapDirJob] + VCFJobData.jobLs,
        transferOutput=transferOutput,
        job_max_memory=job_max_memory, walltime=walltime)

    return returnData
def reduceAfterEachAlignment(self, passingData=None, transferOutput=False,
        data_dir=None, **keywords):
    """
    """
    returnData = PassingData(no_of_jobs=0)
    returnData.jobDataLs = []
    alignmentJobAndOutputLs = getattr(passingData, 'alignmentJobAndOutputLs', [])
    bamFnamePrefix = passingData.bamFnamePrefix
    topOutputDirJob = passingData.topOutputDirJob
    individual_alignment = passingData.individual_alignment
    reduceOutputDirJob = passingData.reduceOutputDirJob

    if len(alignmentJobAndOutputLs) > 0:
        #2012.3.29 merge alignment output only when there is something to merge!
        #2013.04.09 create a new child alignment with reduce_reads=1, etc.
        new_individual_alignment = self.db.copyParentIndividualAlignment(
            parent_individual_alignment_id=individual_alignment.id,
            data_dir=self.data_dir,
            local_realigned=individual_alignment.local_realigned,
            reduce_reads=1)

        # replace read_group with the new one in each alignment job
        newAlignmentJobAndOutputLs = []
        for alignmentJobAndOutput in alignmentJobAndOutputLs:
            # add an AddReadGroup job
            alignmentJob, indexAlignmentJob = alignmentJobAndOutput.jobLs[:2]
            fileBasenamePrefix = os.path.splitext(alignmentJob.output.name)[0]
            outputRGBAM = File("%s.isq_RG.bam" % (fileBasenamePrefix))
            # needBAMIndexJob=False because addAlignmentMergeJob()
            # does not need .bai.
            addRGJob = self.addReadGroupJob(
                individual_alignment=new_individual_alignment,
                inputBamFile=alignmentJob.output,
                outputBamFile=outputRGBAM,
                needBAMIndexJob=False,
                parentJobLs=[alignmentJob, indexAlignmentJob],
                extraDependentInputLs=alignmentJob.outputLs[1:],
                job_max_memory=2500, transferOutput=False)
            newAlignmentJobAndOutputLs.append(
                PassingData(jobLs=[addRGJob], file=addRGJob.output))

        mergedBamFile = File(os.path.join(reduceOutputDirJob.output,
            '%s.merged.bam' % (bamFnamePrefix)))
        alignmentMergeJob, bamIndexJob = self.addAlignmentMergeJob(
            alignmentJobAndOutputLs=newAlignmentJobAndOutputLs,
            outputBamFile=mergedBamFile,
            needBAMIndexJob=True,
            parentJobLs=[reduceOutputDirJob],
            transferOutput=False)

        #2012.9.19 add/copy the alignment file to db-affiliated storage;
        # pass the metric file to AddAlignmentFile2DB.py as well
        # (to be moved into db-affiliated storage)
        logFile = File(os.path.join(reduceOutputDirJob.output,
            '%s_2db.log' % (bamFnamePrefix)))
        alignment2DBJob = self.addAlignmentFile2DBJob(
            executable=self.AddAlignmentFile2DB,
            inputFile=alignmentMergeJob.output,
            baiFile=bamIndexJob.baiFile,
            individual_alignment_id=new_individual_alignment.id,
            logFile=logFile, data_dir=data_dir,
            otherInputFileList=None,
            parentJobLs=[alignmentMergeJob, bamIndexJob],
            transferOutput=transferOutput,
            job_max_memory=2000,
            sshDBTunnel=self.needSSHDBTunnel, commit=True)
        self.no_of_jobs += 1
        returnData.jobDataLs.append(PassingData(jobLs=[alignment2DBJob],
            file=alignment2DBJob.logFile,
            fileLs=[alignment2DBJob.logFile]))
    return returnData
def mapEachInterval(self, alignmentData=None, intervalData=None,
        chromosome=None, VCFJobData=None, passingData=None,
        reduceBeforeEachAlignmentData=None, mapEachChromosomeData=None,
        transferOutput=False, **keywords):
    """
    2013.03.31 use VCFJobData to decide whether to add BQSR jobs,
        called in ShortRead2Alignment.py
    2012.9.17
    """
    returnData = PassingData(no_of_jobs=0)
    returnData.jobDataLs = []

    topOutputDirJob = passingData.topOutputDirJob

    alignment = alignmentData.alignment
    bamF = alignmentData.bamF
    baiF = alignmentData.baiF
    bamFnamePrefix = passingData.bamFnamePrefix

    #SNPVCFFile = VCFJobData.file
    #if SNPVCFFile is None or VCFJobData is None:
    #    #2013.04.09 BQSR requires a VCF input regardless of the chromosome
    #    VCFJobData = self.randomSNPVCFJobDataForBQSR
    #SNPVCFFile = VCFJobData.file
    #SNPVCFJobLs = VCFJobData.jobLs

    if intervalData.file:
        mpileupInterval = intervalData.interval
        bcftoolsInterval = intervalData.file
    else:
        mpileupInterval = intervalData.interval
        bcftoolsInterval = intervalData.interval
    intervalFileBasenameSignature = intervalData.intervalFileBasenameSignature
    overlapInterval = intervalData.overlapInterval
    overlapFileBasenameSignature = intervalData.overlapIntervalFileBasenameSignature
    span = intervalData.span

    if chromosome is None:
        chromosome = getattr(passingData, 'chromosome', None)

    median_depth = getattr(alignment, 'median_depth', 4)
    readSpace = median_depth * span
    #base is 4X coverage in a 20Mb region => 60 minutes
    reduceReadsJobWalltime = self.scaleJobWalltimeOrMemoryBasedOnInput(
        realInputVolume=readSpace,
        baseInputVolume=4*20000000, baseJobPropertyValue=60,
        minJobPropertyValue=60, maxJobPropertyValue=500).value
    #base is 4X => 4000M
    reduceReadsJobMaxMemory = self.scaleJobWalltimeOrMemoryBasedOnInput(
        realInputVolume=median_depth,
        baseInputVolume=4, baseJobPropertyValue=4000,
        minJobPropertyValue=4000, maxJobPropertyValue=8000).value

    reduceReadsBamFile = File(os.path.join(topOutputDirJob.output,
        '%s_%s.reduceReads.bam' % (bamFnamePrefix, overlapFileBasenameSignature)))
    #Default downsampling setting is 40 in GATK 2.4.9.
    # That downsampling happens at the ReadWalker level:
    #extraArgumentList = ["--downsample_to_coverage 250", "--downsampling_type BY_SAMPLE"]
    extraArgumentList = ["--downsample_coverage 250"]
        #this one is for ReduceReads. This level of downsampling only happens
        # after the region has been evaluated, therefore it can be combined
        # with the engine-level downsampling.
    reduceReadsJob = self.addGATKJob(executable=self.ReduceReadsJava,
        GenomeAnalysisTKJar=self.GenomeAnalysisTK2Jar,
        GATKAnalysisType='ReduceReads',
        inputFile=bamF, inputArgumentOption="-I",
        refFastaFList=passingData.refFastaFList, inputFileList=None,
        argumentForEachFileInInputFileList=None,
        interval=overlapInterval, outputFile=reduceReadsBamFile,
        parentJobLs=alignmentData.jobLs, transferOutput=False,
        job_max_memory=reduceReadsJobMaxMemory,
        frontArgumentList=None, extraArguments=None,
        extraArgumentList=extraArgumentList,
        extraOutputLs=[],
        extraDependentInputLs=[baiF], no_of_cpus=None,
        walltime=reduceReadsJobWalltime)
    indexBamJob = self.addBAMIndexJob(
        BuildBamIndexFilesJava=self.BuildBamIndexFilesJava,
        BuildBamIndexJar=self.BuildBamIndexJar,
        inputBamF=reduceReadsJob.output,
        parentJobLs=[reduceReadsJob],
        transferOutput=False, job_max_memory=3000,
        walltime=max(120, int(reduceReadsJobWalltime/3)))
    passingData.alignmentJobAndOutputLs.append(PassingData(
        jobLs=[reduceReadsJob, indexBamJob],
        file=reduceReadsJob.output, fileLs=[reduceReadsJob.output]))
    return returnData
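# Worked example of the walltime scaling above, assuming
# scaleJobWalltimeOrMemoryBasedOnInput() grows linearly with the volume
# ratio: the base is 4X coverage over 20Mb (4*20000000) => 60 minutes, so a
# 30X alignment over a 20Mb interval asks for 60 * 30/4 = 450 minutes, which
# sits inside the [60, 500]-minute clamp and is used as-is.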
def addAllJobs(self, inputData=None, chr2IntervalDataLs=None, data_dir=None,
        intervalSize=3000, intervalOverlapSize=0, outputDirPrefix="",
        passingData=None, transferOutput=True, job_max_memory=2000, **keywords):
    """
    2013.06.14 bugfix regarding noOfUnits, which was all inferred from one file.
    2012.7.26 architect of the whole map-reduce framework
    """
    print(f"Adding jobs for {len(inputData.jobDataLs)} input "
        "genome files ...", flush=True)

    returnData = PassingData()
    returnData.jobDataLs = []

    #2012.9.22
    # mapEachAlignmentDataLs is never reset.
    # mapEachChromosomeDataLs is reset upon a new alignment.
    # mapEachIntervalDataLs is reset upon each new chromosome.
    # all reduce lists never get reset.
    # fileBasenamePrefix is the prefix of the input file's basename,
    # to be used for temporary output files in reduceEachInput()
    # but not for output files in mapEachInterval().
    passingData = PassingData(
        fileBasenamePrefix=None,
        chromosome=None,
        outputDirPrefix=outputDirPrefix,
        intervalFileBasenamePrefix=None,
        registerReferenceData=None,
        refFastaFList=None,
        refFastaF=None,
        fastaDictJob=None,
        refFastaDictF=None,
        fastaIndexJob=None,
        refFastaIndexF=None,
        intervalOverlapSize=intervalOverlapSize,
        intervalSize=intervalSize,
        jobData=None,
        splitInputFile=None,
        intervalDataLs=None,
        preReduceReturnData=None,
        mapEachIntervalData=None,
        mapEachIntervalDataLs=None,
        mapEachIntervalDataLsLs=[],
        mapEachInputData=None,
        mapEachInputDataLs=None,
        mapEachInputDataLsLs=[],
        mapEachChromosomeData=None,
        mapEachChromosomeDataLs=[],
        chromosome2mapEachIntervalDataLs={},
        chromosome2mapEachInputDataLs={},
        reduceEachInputData=None,
        reduceEachChromosomeData=None,
        reduceEachInputDataLs=None,
        reduceEachInputDataLsLs=[],
        reduceEachChromosomeDataLs=[])
    # mapEachIntervalDataLsLs is a list of mapEachIntervalDataLs, one per Input file.
    # mapEachInputDataLsLs is a list of mapEachInputDataLs, one per chromosome.
    # reduceEachInputDataLsLs is a list of reduceEachInputDataLs, one per chromosome.

    preReduceReturnData = self.preReduce(outputDirPrefix=outputDirPrefix,
        passingData=passingData, transferOutput=True,
        **keywords)
    passingData.preReduceReturnData = preReduceReturnData

    #gzip folder jobs (to avoid repeatedly creating the same folder)
    gzipReduceEachInputFolderJob = None
    gzipReduceEachChromosomeFolderJob = None
    gzipReduceFolderJob = None
    gzipPreReduceFolderJob = None

    no_of_input_files = 0
    firstInterval = True

    for chromosome, intervalDataLs in chr2IntervalDataLs.items():
        passingData.chromosome = chromosome
        mapEachChromosomeData = self.mapEachChromosome(chromosome=chromosome,
            passingData=passingData, transferOutput=False, **keywords)
        passingData.mapEachChromosomeData = mapEachChromosomeData
        passingData.mapEachChromosomeDataLs.append(mapEachChromosomeData)

        passingData.mapEachInputDataLsLs.append([])
        #the last one in the double list is the current one
        passingData.mapEachInputDataLs = passingData.mapEachInputDataLsLs[-1]
        passingData.mapEachIntervalDataLs = []
        passingData.chromosome2mapEachIntervalDataLs[chromosome] = []

        passingData.reduceEachInputDataLsLs.append([])
        passingData.reduceEachInputDataLs = passingData.reduceEachInputDataLsLs[-1]

        for i in range(len(inputData.jobDataLs)):
            jobData = inputData.jobDataLs[i]
            passingData.jobData = jobData
            passingData.inputJobData = jobData

            InputFile = jobData.file
            commonFileBasenamePrefix = utils.getFileBasenamePrefixFromPath(
                InputFile.name)
            passingData.fileBasenamePrefix = commonFileBasenamePrefix

            no_of_input_files += 1
            if no_of_input_files % 10 == 0:
                sys.stderr.write("%s\t%s Inputs." % (
                    '\x08' * 40, no_of_input_files))

            for intervalData in intervalDataLs:
                selectIntervalJobData = self.selectIntervalFromInputFile(
                    jobData=jobData, chromosome=chromosome,
                    intervalData=intervalData,
                    mapEachChromosomeData=mapEachChromosomeData,
                    passingData=passingData,
                    transferOutput=firstInterval,
                    **keywords)
                mapEachIntervalData = self.mapEachInterval(
                    inputJobData=jobData,
                    selectIntervalJobData=selectIntervalJobData,
                    chromosome=chromosome, intervalData=intervalData,
                    mapEachChromosomeData=mapEachChromosomeData,
                    passingData=passingData,
                    transferOutput=firstInterval,
                    **keywords)
                passingData.mapEachIntervalData = mapEachIntervalData
                passingData.mapEachIntervalDataLs.append(mapEachIntervalData)
                passingData.chromosome2mapEachIntervalDataLs[chromosome].append(
                    mapEachIntervalData)

                linkMapToReduceData = self.linkMapToReduce(
                    mapEachIntervalData=mapEachIntervalData,
                    preReduceReturnData=preReduceReturnData,
                    passingData=passingData,
                    **keywords)
                if firstInterval:
                    firstInterval = False

            reduceEachInputData = self.reduceEachInput(chromosome=chromosome,
                passingData=passingData,
                mapEachIntervalDataLs=passingData.mapEachIntervalDataLs,
                transferOutput=False, data_dir=data_dir,
                **keywords)
            passingData.reduceEachInputData = reduceEachInputData
            passingData.reduceEachInputDataLs.append(reduceEachInputData)

            gzipReduceEachInputData = self.addGzipSubWorkflow(
                inputData=reduceEachInputData,
                outputDirPrefix="%sReduceEachInput" % (outputDirPrefix),
                topOutputDirJob=gzipReduceEachInputFolderJob,
                transferOutput=transferOutput, report=False)
            gzipReduceEachInputFolderJob = gzipReduceEachInputData.topOutputDirJob

        reduceEachChromosomeData = self.reduceEachChromosome(
            chromosome=chromosome, passingData=passingData,
            mapEachInputDataLs=passingData.mapEachInputDataLs,
            chromosome2mapEachIntervalDataLs=\
                passingData.chromosome2mapEachIntervalDataLs,
            reduceEachInputDataLs=passingData.reduceEachInputDataLs,
            transferOutput=False, data_dir=data_dir,
            **keywords)
        passingData.reduceEachChromosomeData = reduceEachChromosomeData
        passingData.reduceEachChromosomeDataLs.append(reduceEachChromosomeData)

        gzipReduceEachChromosomeData = self.addGzipSubWorkflow(
            inputData=reduceEachChromosomeData,
            outputDirPrefix="%sReduceEachChromosome" % (outputDirPrefix),
            topOutputDirJob=gzipReduceEachChromosomeFolderJob,
            transferOutput=transferOutput, report=False)
        gzipReduceEachChromosomeFolderJob = \
            gzipReduceEachChromosomeData.topOutputDirJob

    reduceReturnData = self.reduce(passingData=passingData,
        transferOutput=False,
        mapEachChromosomeDataLs=passingData.mapEachChromosomeDataLs,
        reduceEachChromosomeDataLs=passingData.reduceEachChromosomeDataLs,
        **keywords)
    passingData.reduceReturnData = reduceReturnData

    if self.needGzipPreReduceReturnData:
        gzipPreReduceReturnData = self.addGzipSubWorkflow(
            inputData=preReduceReturnData, transferOutput=transferOutput,
            outputDirPrefix="%sPreReduce" % (outputDirPrefix),
            topOutputDirJob=gzipPreReduceFolderJob,
            report=False)
        gzipPreReduceFolderJob = gzipPreReduceReturnData.topOutputDirJob

    if self.needGzipReduceReturnData:
        gzipReduceReturnData = self.addGzipSubWorkflow(
            inputData=reduceReturnData, transferOutput=transferOutput,
            outputDirPrefix="%sReduce" % (outputDirPrefix),
            topOutputDirJob=gzipReduceFolderJob,
            report=False)
        gzipReduceFolderJob = gzipReduceReturnData.topOutputDirJob

    print(f" {no_of_input_files} Input files.", flush=True)
    sys.stderr.write(f"{self.no_of_jobs} jobs.\n")
    return reduceReturnData
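# A minimal sketch of how a concrete workflow plugs into the skeleton above:
# subclass it and override only the hooks it needs. The base-class and
# subclass names here are hypothetical; the hook signature follows the
# mapEachInterval() definitions earlier in this file.
class MyIntervalWorkflow(AbstractMapReduceWorkflow):    #hypothetical base class
    def mapEachInterval(self, inputJobData=None, selectIntervalJobData=None,
            chromosome=None, intervalData=None, mapEachChromosomeData=None,
            passingData=None, transferOutput=False, **keywords):
        returnData = PassingData(no_of_jobs=0)
        returnData.jobDataLs = []
        #add one computing job per interval here and register it via
        # returnData.jobDataLs.append(self.constructJobDataFromJob(job))
        return returnData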
def addJobs(self, inputData=None, outputDirPrefix="", ntDatabaseFileList=None,
        noOfTotalSequences=None, transferOutput=True, makeBlastDBJob=None):
    """
    2012.5.24
    """
    sys.stderr.write("Adding blast jobs for %s input ... " % (
        len(inputData.jobDataLs)))
    no_of_jobs = 0

    topOutputDir = "%sBlast" % (outputDirPrefix)
    topOutputDirJob = self.addMkDirJob(outputDir=topOutputDir)
    no_of_jobs += 1

    allBlastResultFile = File(os.path.join(topOutputDir, 'blast.tsv'))
    allBlastMergeJob = self.addStatMergeJob(
        statMergeProgram=self.mergeSameHeaderTablesIntoOne,
        outputF=allBlastResultFile, transferOutput=transferOutput,
        parentJobLs=[topOutputDirJob])
    no_of_jobs += 1

    ntDatabaseFile = ntDatabaseFileList[0]
    returnData = PassingData()
    returnData.jobDataLs = []

    for jobData in inputData.jobDataLs:
        inputF = jobData.output
        outputFnamePrefix = os.path.join(topOutputDir,
            os.path.splitext(os.path.basename(inputF.name))[0])

        splitFastaJob = self.addSplitFastaFileJob(
            executable=self.SplitFastaFile,
            inputFile=inputF, outputFnamePrefix=outputFnamePrefix,
            noOfSequencesPerSplitFile=self.blockSize,
            filenameSuffix=".fasta", noOfTotalSequences=noOfTotalSequences,
            parentJobLs=jobData.jobLs + [topOutputDirJob],
            extraDependentInputLs=None, transferOutput=False,
            extraArguments=None, job_max_memory=500)
        no_of_jobs += 1
        for splitFastaOutput in splitFastaJob.outputList:
            outputFile = File('%s.tsv' % (splitFastaOutput.name))
            blastJob = self.addBlastWrapperJob(executable=self.BlastWrapper,
                inputFile=splitFastaOutput, outputFile=outputFile,
                outputFnamePrefix=splitFastaOutput.name,
                databaseFile=ntDatabaseFile,
                maxNoOfMismatches=self.maxNoOfMismatches,
                minNoOfIdentities=self.minNoOfIdentities,
                minIdentityPercentage=self.minIdentityPercentage,
                blastallPath=self.blastallPath,
                parentJobLs=[splitFastaJob, makeBlastDBJob],
                extraDependentInputLs=ntDatabaseFileList,
                transferOutput=False,
                extraArguments=None, job_max_memory=1000)

            #feed the blast output into the merge job
            self.addInputToMergeJob(allBlastMergeJob,
                inputF=blastJob.output, parentJobLs=[blastJob])
            no_of_jobs += 1
    sys.stderr.write("%s jobs. Done.\n" % (no_of_jobs))
    returnData.jobDataLs.append(PassingData(jobLs=[allBlastMergeJob],
        file=allBlastResultFile,
        fileLs=[allBlastResultFile]))
    return returnData
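# Fan-out/fan-in structure of the blast addJobs() above:
#
#   makeBlastDBJob --+
#   splitFastaJob  --+--> one blastJob per FASTA split --> allBlastMergeJob
#                                                          (blast.tsv)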