Example #1
 def reduceEachChromosome(self, chromosome=None, passingData=None,
     mapEachInputDataLs=None, 
     chromosome2mapEachIntervalDataLs=None,\
     reduceEachInputDataLs=None,\
     transferOutput=True, \
     **keywords):
     """
     """
     returnData = PassingData(no_of_jobs = 0)
     returnData.jobDataLs = []
     returnData.mapEachInputDataLs = mapEachInputDataLs
     returnData.reduceEachInputDataLs = reduceEachInputDataLs
     #reduce matrix by chosen column and average p-value
     
     outputFile = File(os.path.join(self.reduceEachChromosomeDirJob.output,
         'chr_%s_LocusLiftOverProbability.tsv.gz'%(chromosome)))
     reduceChromosomeJob = self.addStatMergeJob(
         statMergeProgram=self.mergeSameHeaderTablesIntoOne, \
         outputF=outputFile, \
         parentJobLs=[self.reduceEachChromosomeDirJob],extraOutputLs=None, \
         extraDependentInputLs=None, transferOutput=False)
         #extraArgumentList=['--keyColumnLs 0-6 --valueColumnLs 7'],\
     mapEachIntervalDataLs = chromosome2mapEachIntervalDataLs.get(chromosome)
     for mapEachIntervalData in mapEachIntervalDataLs:
         for jobData in mapEachIntervalData.jobDataLs:
             self.addInputToMergeJob(reduceChromosomeJob, parentJobLs=[jobData.job])
         
     #add the reduction job to final stat merge job
     self.addInputToMergeJob(self.reduceJob, parentJobLs=[reduceChromosomeJob])
     
     return returnData
 def linkMapToReduce(self, mapEachIntervalData=None,
     preReduceReturnData=None, passingData=None, transferOutput=True, **keywords):
     """
     """
     returnData = PassingData(no_of_jobs = 0)
     returnData.jobDataLs = []
     return returnData
    def mapEachAlignment(self, alignmentData=None,  passingData=None,
        transferOutput=True, **keywords):
        """
        2012.9.22
            similar to reduceBeforeEachAlignmentData() but
             for mapping programs that run on one alignment each.

            passingData.alignmentJobAndOutputLs = []
            passingData.bamFnamePrefix = bamFnamePrefix
            passingData.individual_alignment = alignment
        """
        returnData = PassingData(no_of_jobs = 0)
        returnData.jobDataLs = []

        topOutputDirJob = passingData.topOutputDirJob
        refFastaF = passingData.refFastaFList[0]

        alignment = alignmentData.alignment
        parentJobLs = alignmentData.jobLs
        bamF = alignmentData.bamF
        baiF = alignmentData.baiF

        bamFnamePrefix = alignment.getReadGroup()

        return returnData
    def reduceEachInput(self,
                        chromosome=None,
                        passingData=None,
                        mapEachIntervalDataLs=None,
                        transferOutput=True,
                        **keywords):
        """
        2013.07.10
            #. concatenate all the sub-Inputs into one
        """
        returnData = PassingData(no_of_jobs=0)
        returnData.jobDataLs = []
        returnData.mapEachIntervalDataLs = mapEachIntervalDataLs

        #intervalJobLs = [pdata for pdata in mapEachIntervalDataLs]
        """
        realInputVolume = passingData.jobData.file.noOfIndividuals * \
            passingData.jobData.file.noOfLoci
        baseInputVolume = 200*20000
        walltime = self.scaleJobWalltimeOrMemoryBasedOnInput(
            realInputVolume=realInputVolume, \
            baseInputVolume=baseInputVolume, baseJobPropertyValue=60,
            minJobPropertyValue=60, maxJobPropertyValue=500).value
        job_max_memory = self.scaleJobWalltimeOrMemoryBasedOnInput(
            realInputVolume=realInputVolume, \
            baseInputVolume=baseInputVolume, baseJobPropertyValue=5000,
            minJobPropertyValue=5000, maxJobPropertyValue=10000).value
        """
        return returnData
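
The commented-out block above shows how scaleJobWalltimeOrMemoryBasedOnInput() is meant to be used: a job property (walltime in minutes or memory in MB) is scaled by the ratio of realInputVolume to baseInputVolume and clamped between minJobPropertyValue and maxJobPropertyValue, with the result read from the .value attribute. The helper itself does not appear in these examples; a minimal sketch, assuming simple linear scaling, could look like this (not the framework's actual implementation):

    def scaleJobWalltimeOrMemoryBasedOnInput(self, realInputVolume=None,
            baseInputVolume=None, baseJobPropertyValue=None,
            minJobPropertyValue=None, maxJobPropertyValue=None):
        """
        Sketch only: scale a job property (walltime or memory) linearly by
        input volume, then clamp it to [minJobPropertyValue, maxJobPropertyValue].
        """
        scaled = baseJobPropertyValue * float(realInputVolume) / float(baseInputVolume)
        clamped = min(max(scaled, minJobPropertyValue), maxJobPropertyValue)
        # callers read .value, e.g. walltime = self.scaleJobWalltimeOrMemoryBasedOnInput(...).value
        return PassingData(value=int(clamped))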
Example #5
    def addJobs(self, inputURL=None, relativePathList=None, outputDir="", username=None,
            password=None, transferOutput=True):
        """
        2012.6.27
        """
        if relativePathList is None:
            relativePathList = []
        sys.stderr.write("Adding wget jobs for %s inputs ... " %
                         (len(relativePathList)))
        no_of_jobs = 0

        topOutputDir = outputDir
        topOutputDirJob = self.addMkDirJob(outputDir=topOutputDir)
        no_of_jobs += 1
        returnData = PassingData()
        returnData.jobDataLs = []

        for relativePath in relativePathList:
            #2013.06.26 replace all "/" in relativePath with "_" in case it is a folder path
            relativePathNoFolder = relativePath.replace('/', '_')
            logFile = File('%s.log' % (relativePathNoFolder))
            wgetJob = self.addWgetJob(executable=self.wget, url=inputURL,
                relativePath=relativePath, \
                username=username, password=password,\
                targetFolder=outputDir, logFile=logFile,
                cut_dir_number=self.cut_dir_number,
                parentJobLs=[topOutputDirJob], extraDependentInputLs=[], \
                transferOutput=transferOutput, \
                extraArguments=None, job_max_memory=50)
            #record the wget job, its primary output file, and all of its output files
            returnData.jobDataLs.append(PassingData(jobLs=[wgetJob], file=wgetJob.output, \
                fileLs=wgetJob.outputLs))
            no_of_jobs += 1
        sys.stderr.write("%s jobs.\n" % (no_of_jobs))

        return returnData
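
addWgetJob() itself is not shown in this example. Judging from the arguments passed above (inputURL, relativePath, username/password, targetFolder, cut_dir_number, logFile), the command it wraps is presumably something like the sketch below; the exact flags are an assumption about standard wget usage, not taken from the framework:

    # Hypothetical wget invocation such a job might run (flag choices are assumptions):
    wget_command = [
        "wget", "--recursive", "--no-parent", "--no-host-directories",
        "--cut-dirs=%s" % cut_dir_number,        # drop leading remote path components
        "--user=%s" % username, "--password=%s" % password,
        "--directory-prefix=%s" % targetFolder,  # download into the target folder
        "-o", logFile.name,                      # wget's own log file
        "%s/%s" % (inputURL.rstrip('/'), relativePath),
    ]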
 def preReduce(self, passingData=None, transferOutput=True, **keywords):
     """
     set up additional mkdir folder jobs, before mapEachAlignment,
         mapEachChromosome, and mapReduceOneAlignment
     """
     returnData = PassingData(no_of_jobs = 0)
     returnData.jobDataLs = []
     return returnData
 def reduce(self, passingData=None, reduceAfterEachAlignmentDataLs=None,
         transferOutput=True, **keywords):
     """
     2012.9.17
     """
     returnData = PassingData(no_of_jobs = 0)
     returnData.jobDataLs = []
     returnData.reduceAfterEachAlignmentDataLs = reduceAfterEachAlignmentDataLs
     return returnData
 def reduceAfterEachChromosome(self, chromosome=None, passingData=None,
     transferOutput=True,
     mapEachIntervalDataLs=None, **keywords):
     """
     """
     returnData = PassingData(no_of_jobs = 0)
     returnData.jobDataLs = []
     returnData.mapEachIntervalDataLs = mapEachIntervalDataLs
     return returnData
 def mapEachChromosome(self, alignmentData=None, chromosome=None,\
     VCFJobData=None, passingData=None,
     reduceBeforeEachAlignmentData=None, transferOutput=True, **keywords):
     """
     2012.9.17
     """
     returnData = PassingData(no_of_jobs = 0)
     returnData.jobDataLs = []
     return returnData
 def map(self, alignmentData=None, intervalData=None,\
     VCFJobData=None, passingData=None,
     mapEachChromosomeData=None, transferOutput=True, **keywords):
     """
     2012.9.17
     """
     returnData = PassingData(no_of_jobs = 0)
     returnData.jobDataLs = []
     return returnData
 def reduceBeforeEachAlignment(self, passingData=None,
     transferOutput=True, **keywords):
     """
     2012.9 set up some reduce jobs before the loop over all intervals of one alignment begins.
         These reduce jobs collect output from each map() job;
         the link is established in linkMapToReduce().
     """
     returnData = PassingData(no_of_jobs = 0)
     returnData.jobDataLs = []
     return returnData
 def reduceAfterEachAlignment(self, passingData=None,
     mapEachChromosomeDataLs=None,
     reduceAfterEachChromosomeDataLs=None,\
     transferOutput=True, **keywords):
     """
     """
     returnData = PassingData(no_of_jobs = 0)
     returnData.jobDataLs = []
     returnData.mapEachChromosomeDataLs = mapEachChromosomeDataLs
     returnData.reduceAfterEachChromosomeDataLs = reduceAfterEachChromosomeDataLs
     return returnData
 def mapEachAlignment(self,
                      passingData=None,
                      transferOutput=True,
                      **keywords):
     """
     2012.9.22
         similar to reduceBeforeEachAlignmentData()
          but for mapping programs that run on one alignment each.
     """
     returnData = PassingData(no_of_jobs=0)
     returnData.jobDataLs = []
     return returnData
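
All of these stub methods package their results in PassingData objects and hang extra attributes on them afterwards. PassingData is not defined in these examples; it behaves like a plain attribute bag. A minimal stand-in, assuming only that behavior, would be:

class PassingData(object):
    """Minimal stand-in (sketch only): keyword arguments become attributes."""
    def __init__(self, **keywords):
        for name, value in keywords.items():
            setattr(self, name, value)

# usage mirroring the stubs above
returnData = PassingData(no_of_jobs=0)
returnData.jobDataLs = []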
Example #14
    def mapEachInterval(self, inputJobData=None, selectIntervalJobData=None, \
        chromosome=None,intervalData=None,\
        mapEachChromosomeData=None, \
        passingData=None, transferOutput=False, **keywords):
        """
        #. extract flanking sequences from the input Input (ref sequence file => contig ref sequence)
        #. blast them
        #. run FindSNPPositionOnNewRefFromFlankingBlastOutput.py
            #. where hit length matches query length and
            #    the number of mismatches is <=2 => good => infer new coordinates
        #. output a mapping file between old SNP and new SNP coordinates.
            #. reduce this step by combining everything
        #. make a new Input file based on the input split Input file
            (replace contig ID and position with the new ones;
                remove or replace the header section regarding chromosomes)

        """
        returnData = PassingData(no_of_jobs = 0)
        returnData.jobDataLs = []
        #passingData.intervalFileBasenamePrefix
        #passingData.splitInputFile
        """
        ## 2013.06.19 structures available from passingData, specific to the interval
        passingData.splitInputFile = splitInputFile
        passingData.unitNumber = unitNumber
        passingData.intervalFileBasenamePrefix = '%s_%s_splitInput_u%s'%(
            chromosome, commonPrefix, unitNumber)
        passingData.noOfIndividuals = jobData.file.noOfIndividuals
        passingData.span = self.intervalSize + self.intervalOverlapSize*2
        """
        #add one computing job
        outputFile = File(os.path.join(self.mapDirJob.output,
            "%s.%s.probability.tsv.gz"%(passingData.fileBasenamePrefix,\
            intervalData.interval)))
        locusIntervalDeltaOutputFile = File(os.path.join(self.mapDirJob.output,
            "%s.%s.locusIntervalDelta.tsv.gz"%(passingData.fileBasenamePrefix,
            intervalData.interval)))
        job = self.addAbstractMatrixFileWalkerJob(
            executable=self.ComputeLiftOverLocusProbability, \
            inputFile=selectIntervalJobData.file, outputFile=outputFile, \
            whichColumn=None, whichColumnHeader=None, \
            logY=None, valueForNonPositiveYValue=-1, \
            minNoOfTotal=1, samplingRate=1, \
            inputFileFormat=None, outputFileFormat=None,\
            extraArgumentList=["--locusIntervalDeltaOutputFname", locusIntervalDeltaOutputFile, \
                "--startPosition %s"%(intervalData.start), "--stopPosition %s"%(intervalData.stop)],
            parentJobLs=[selectIntervalJobData.job],
            extraOutputLs=[locusIntervalDeltaOutputFile],\
            transferOutput=transferOutput, job_max_memory=2000, sshDBTunnel=False)
            #For each interval, probabilities are not calculated for loci in
            #  extra segment (from overlapStart to start).
        returnData.jobDataLs.append(self.constructJobDataFromJob(job))
        return returnData
    def mapEachChromosome(self, alignmentData=None, chromosome=None,\
        VCFJobData=None, passingData=None, reduceBeforeEachAlignmentData=None,
        transferOutput=True, **keywords):
        """
        2012.9.17
        """
        returnData = PassingData(no_of_jobs=0)
        returnData.jobDataLs = []

        topOutputDirJob = passingData.topOutputDirJob

        alignment = alignmentData.alignment
        parentJobLs = alignmentData.jobLs
        bamF = alignmentData.bamF
        baiF = alignmentData.baiF
        bamFnamePrefix = passingData.bamFnamePrefix
        """
        #2012.9.21 perhaps a downsampling job
        outputFname = os.path.join(topOutputDirJob.output, \
            '%s_%s.bam'%(bamFnamePrefix, overlapFileBasenameSignature))
        outputFile = File(outputFname)
        selectAlignmentJob, bamIndexJob1 = self.addSelectAlignmentJob(
            executable=self.samtools, inputFile=bamF, \
            outputFile=outputFile, region=overlapInterval,
            parentJobLs=[topOutputDirJob] + parentJobLs, \
            extraDependentInputLs=[baiF], transferOutput=False, \
            extraArguments=None, job_max_memory=2000, needBAMIndexJob=True)
        """
        """
        #2012.9.21 count covariates job is moved to map()
        recalFile = File(os.path.join(topOutputDirJob.output,
            '%s_%s.recal_data.csv'%(bamFnamePrefix, chromosome)))
        countCovariatesJob = self.addGATKBaseRecalibratorJob(
            GenomeAnalysisTKJar=self.GenomeAnalysisTK2Jar, inputFile=bamF, \
            VCFFile=VCFFile, interval=chromosome, outputFile=recalFile, \
            refFastaFList=passingData.refFastaFList,
            parentJobLs=[topOutputDirJob]+parentJobLs, 
            extraDependentInputLs=[baiF, VCFFile.tbi_F], \
            transferOutput=False, \
            extraArguments=None, job_max_memory=4000)

        self.no_of_jobs += 1
        returnData.countCovariatesJob = countCovariatesJob
        returnData.jobDataLs.append(PassingData(jobLs=[countCovariatesJob],
            file=countCovariatesJob.recalFile, \
            fileLs=[countCovariatesJob.recalFile]))
        """

        return returnData
    def reduce(self,
               passingData=None,
               reduceEachChromosomeDataLs=None,
               transferOutput=True,
               **keywords):
        """
        #. merge all outputs of the map jobs (passingData.mapEachIntervalDataLsLs) into one big file
        
        """
        returnData = PassingData(no_of_jobs=0)
        returnData.jobDataLs = []
        reduceOutputDirJob = passingData.reduceOutputDirJob

        realInputVolume = passingData.jobData.file.noOfIndividuals * passingData.jobData.file.noOfLoci
        baseInputVolume = 200 * 20000
        walltime = self.scaleJobWalltimeOrMemoryBasedOnInput(
            realInputVolume=realInputVolume,
            baseInputVolume=baseInputVolume,
            baseJobPropertyValue=60,
            minJobPropertyValue=60,
            maxJobPropertyValue=500).value
        job_max_memory = self.scaleJobWalltimeOrMemoryBasedOnInput(
            realInputVolume=realInputVolume,
            baseInputVolume=baseInputVolume,
            baseJobPropertyValue=5000,
            minJobPropertyValue=5000,
            maxJobPropertyValue=10000).value

        outputFile = File(
            os.path.join(reduceOutputDirJob.output, 'sameSiteConcordance.tsv'))
        reduceJob = self.addStatMergeJob(
            statMergeProgram=self.mergeSameHeaderTablesIntoOne,
            outputF=outputFile,
            parentJobLs=[reduceOutputDirJob],
            transferOutput=transferOutput,
        )
        returnData.jobDataLs.append(
            PassingData(jobLs=[reduceJob],
                        file=reduceJob.output,
                        fileLs=[reduceJob.output]))

        for mapEachIntervalDataLs in passingData.mapEachIntervalDataLsLs:
            for mapEachIntervalData in mapEachIntervalDataLs:
                self.addInputToMergeJob(reduceJob, \
                        parentJobLs=[mapEachIntervalData.mapJob])

        return returnData
    def preReduce(self,
                  outputDirPrefix="",
                  passingData=None,
                  transferOutput=True,
                  **keywords):
        """
        2013.06.14
            move topOutputDirJob from addAllJobs to here. 
        2012.9.17
        """
        returnData = PassingData(no_of_jobs=0)
        returnData.jobDataLs = []

        self.topOutputDirJob = self.addMkDirJob(outputDir="%sRun" %
                                                (outputDirPrefix))
        passingData.topOutputDirJob = self.topOutputDirJob

        mapDirJob = self.addMkDirJob(outputDir="%sMap" % (outputDirPrefix))
        passingData.mapDirJob = mapDirJob
        returnData.mapDirJob = mapDirJob
        self.mapDirJob = mapDirJob

        reduceOutputDirJob = self.addMkDirJob(outputDir="%sReduce" %
                                              (outputDirPrefix))
        passingData.reduceOutputDirJob = reduceOutputDirJob
        returnData.reduceOutputDirJob = reduceOutputDirJob

        self.plotDirJob = self.addMkDirJob(outputDir="%sPlot" %
                                           (outputDirPrefix))
        self.statDirJob = self.addMkDirJob(outputDir="%sStat" %
                                           (outputDirPrefix))
        self.reduceStatDirJob = self.addMkDirJob(outputDir="%sReduceStat" %
                                                 (outputDirPrefix))
        self.reduceEachInputDirJob = self.addMkDirJob(
            outputDir="%sReduceEachInput" % (outputDirPrefix))
        self.reduceEachChromosomeDirJob = self.addMkDirJob(
            outputDir="%sReduceEachChromosome" % (outputDirPrefix))
        self.reduceOutputDirJob = reduceOutputDirJob
        return returnData
 def reduce(self, reduceEachChromosomeDataLs=None, \
     mapEachChromosomeDataLs=None, passingData=None, transferOutput=True,
     **keywords):
     """
     2013.07.18 return each processed-Input job's data so that
      follow-up workflows can carry out further map-reduce steps
     2012.9.17
     """
     returnData = PassingData(no_of_jobs=0)
     returnData.jobDataLs = []
     returnData.mapEachChromosomeDataLs = mapEachChromosomeDataLs
     returnData.reduceEachChromosomeDataLs = reduceEachChromosomeDataLs
     """
     #2013.07.18 example of returning each processed-Input job's data
     #  so that follow-up workflows can carry out further map-reduce steps
     for reduceEachInputDataLs in passingData.reduceEachInputDataLsLs:
         if reduceEachInputDataLs:
             for reduceEachInputData in reduceEachInputDataLs:
                 if reduceEachInputData:
                     returnData.jobDataLs.append(reduceEachInputData.WHATEVERJobData)
     """
     return returnData
    def mapEachInterval(self, inputJobData=None, selectIntervalJobData=None,
        chromosome=None,intervalData=None,\
        mapEachChromosomeData=None, \
        passingData=None, transferOutput=False, **keywords):
        """
        2013.04.08 use inputJobData
        2012.10.3
            #. extract flanking sequences from the input Input
            #   (ref sequence file => contig ref sequence)
            #. blast them
            #. run FindSNPPositionOnNewRefFromFlankingBlastOutput.py
                #. where hit length matches query length and the number of
                #  mismatches is <=2 => good => infer new coordinates
            #. output a mapping file between old SNP and new SNP coordinates.
                #. reduce this step by combining everything
            #. make a new Input file based on the input split Input file
                (replace contig ID and position with the new ones;
                 remove or replace the header section regarding chromosomes)

        """
        returnData = PassingData(no_of_jobs=0)
        returnData.jobDataLs = []
        #passingData.intervalFileBasenamePrefix
        #passingData.splitInputFile
        #passingData.unitNumber
        """
        ## 2013.06.19 structures available from passingData, specific to the interval
        passingData.splitInputFile = splitInputFile
        passingData.unitNumber = unitNumber
        passingData.intervalFileBasenamePrefix = '%s_%s_splitInput_u%s'%(
            chromosome, commonPrefix, unitNumber)
        passingData.noOfIndividuals = jobData.file.noOfIndividuals
        passingData.span = self.intervalSize + self.intervalOverlapSize*2
        #2013.06.19 for memory/walltime gauging
        """
        return returnData
    def addJobs(self, inputData=None, db_main=None, genotypeMethodShortName=None, commit=None,\
            data_dir=None, checkEmptyVCFByReading=False, transferOutput=True,\
            maxContigID=None, outputDirPrefix="", needSSHDBTunnel=False):
        """
        2012.5.9
        """
        sys.stderr.write("Adding VCF2DB jobs for %s vcf files ... " %
                         (len(inputData.jobDataLs)))

        topOutputDir = "%sVCF2DB" % (outputDirPrefix)
        topOutputDirJob = self.addMkDirJob(outputDir=topOutputDir)

        firstVCFFile = inputData.jobDataLs[0].vcfFile
        logFile = File(os.path.join(topOutputDir, 'AddGenotypeMethod2DB.log'))
        addGM2DBJob = self.addAddGenotypeMethod2DBJob(
            executable=self.AddGenotypeMethod2DB, inputFile=firstVCFFile, \
            genotypeMethodShortName=genotypeMethodShortName,\
            logFile=logFile, data_dir=data_dir, commit=commit, parentJobLs=None,
            extraDependentInputLs=None, transferOutput=True, \
            extraArguments=None, job_max_memory=10, sshDBTunnel=needSSHDBTunnel)
        updateGMlogFile = File(os.path.join(topOutputDir, 'updateGM.log'))
        updateGMNoOfLociJob = self.addUpdateGenotypeMethodNoOfLociJob(
            executable=self.UpdateGenotypeMethodNoOfLoci, \
            genotypeMethodShortName=genotypeMethodShortName,\
            logFile=updateGMlogFile, data_dir=data_dir, commit=commit,
            parentJobLs=[topOutputDirJob], \
            extraDependentInputLs=[], transferOutput=True, \
            extraArguments=None, job_max_memory=20, sshDBTunnel=needSSHDBTunnel)

        returnData = PassingData()
        returnData.jobDataLs = []
        for jobData in inputData.jobDataLs:
            inputF = jobData.vcfFile
            if maxContigID:
                contig_id = self.getContigIDFromFname(inputF.name)
                try:
                    contig_id = int(contig_id)
                    if contig_id > maxContigID:  #skip the small contigs
                        continue
                except Exception:
                    sys.stderr.write('Except type: %s\n' %
                                     repr(sys.exc_info()))
                    import traceback
                    traceback.print_exc()
            logFile = File(
                os.path.join(
                    topOutputDir, 'AddVCFFile2DB_%s.log' %
                    (self.getChrFromFname(inputF.name))))
            addVCFJob = self.addAddVCFFile2DBJob(
                executable=self.AddVCFFile2DB, inputFile=inputF,
                genotypeMethodShortName=genotypeMethodShortName,\
                logFile=logFile, format="VCF", data_dir=data_dir,
                checkEmptyVCFByReading=checkEmptyVCFByReading, commit=commit, \
                parentJobLs=[addGM2DBJob]+jobData.jobLs, extraDependentInputLs=[], transferOutput=True, \
                extraArguments=None, job_max_memory=1000, sshDBTunnel=needSSHDBTunnel)
            self.add_dependency(updateGMNoOfLociJob, parents=[addVCFJob])
        sys.stderr.write("%s jobs.\n" % (self.no_of_jobs))
        #record the final genotype-method update job and its log file
        returnData.jobDataLs.append(PassingData(jobLs=[updateGMNoOfLociJob],
            file=updateGMlogFile, \
            fileLs=[updateGMlogFile]))
        return returnData
    def addAllJobs(self,
        alignmentDataLs=None, chr2IntervalDataLs=None,
        skipDoneAlignment=False,\
        registerReferenceData=None, \
        needFastaIndexJob=False, needFastaDictJob=False, \
        data_dir=None, \
        outputDirPrefix="", transferOutput=True, **keywords):
        """
        2012.7.26
        """
        prePreprocessData = self.setup_chr()
        chrIDSet = prePreprocessData.chrIDSet
        chrSizeIDList = prePreprocessData.chrSizeIDList
        chr2VCFJobData = prePreprocessData.chr2VCFJobData

        print(f"Adding jobs that work on {len(alignmentDataLs)} alignments "
            f"(& possibly VCFs) for {len(chrIDSet)} chromosomes/contigs ...",
            flush=True)
        refFastaFList = registerReferenceData.refFastaFList
        refFastaF = refFastaFList[0]

        topOutputDirJob = self.addMkDirJob(outputDir="%sMap"%(outputDirPrefix))
        self.mapDirJob = topOutputDirJob

        plotOutputDirJob = self.addMkDirJob(outputDir="%sPlot"%(outputDirPrefix))
        self.plotOutputDirJob = plotOutputDirJob

        reduceOutputDirJob = self.addMkDirJob(outputDir="%sReduce"%(outputDirPrefix))
        self.reduceOutputDirJob = reduceOutputDirJob

        if needFastaDictJob or registerReferenceData.needPicardFastaDictJob:
            fastaDictJob = self.addRefFastaDictJob(
                refFastaF=refFastaF)
            refFastaDictF = fastaDictJob.refFastaDictF
        else:
            fastaDictJob = None
            refFastaDictF = registerReferenceData.refPicardFastaDictF

        if needFastaIndexJob or registerReferenceData.needSAMtoolsFastaIndexJob:
            fastaIndexJob = self.addRefFastaFaiIndexJob(refFastaF=refFastaF)
            refFastaIndexF = fastaIndexJob.refFastaIndexF
        else:
            fastaIndexJob = None
            refFastaIndexF = registerReferenceData.refSAMtoolsFastaIndexF

        returnData = PassingData()
        returnData.jobDataLs = []

        #2012.9.22 alignmentJobAndOutputLs is a relic: it is similar to
        #  mapEachIntervalDataLs, but designed for addAlignmentMergeJob(),
        #  so alignmentJobAndOutputLs gets reset for every alignment.
        #  mapEachAlignmentDataLs is never reset.
        #  mapEachChromosomeDataLs is reset right after a new alignment is chosen.
        #  mapEachIntervalDataLs is reset right after each chromosome is chosen.
        #  none of the reduce dataLs ever gets reset.
        passingData = PassingData(alignmentJobAndOutputLs=[], \
            alignmentDataLs = alignmentDataLs,\
            bamFnamePrefix=None, \

            outputDirPrefix=outputDirPrefix, \
            topOutputDirJob=topOutputDirJob,\
            plotOutputDirJob=plotOutputDirJob,\
            reduceOutputDirJob = reduceOutputDirJob,\

            refFastaFList=refFastaFList, \
            registerReferenceData= registerReferenceData,\
            refFastaF=refFastaFList[0],\

            fastaDictJob = fastaDictJob,\
            refFastaDictF = refFastaDictF,\
            fastaIndexJob = fastaIndexJob,\
            refFastaIndexF = refFastaIndexF,\

            chromosome=None,\
            chrIDSet=chrIDSet,\
            chrSizeIDList = chrSizeIDList,\
            chr2IntervalDataLs=chr2IntervalDataLs,\

            mapEachAlignmentData = None,\
            mapEachChromosomeData=None, \
            mapEachIntervalData=None,\
            reduceBeforeEachAlignmentData = None, \
            reduceAfterEachAlignmentData=None,\
            reduceAfterEachChromosomeData=None,\

            mapEachAlignmentDataLs = [],\
            mapEachChromosomeDataLs=[], \
            mapEachIntervalDataLs=[],\
            reduceBeforeEachAlignmentDataLs = [], \
            reduceAfterEachAlignmentDataLs=[],\
            reduceAfterEachChromosomeDataLs=[],\

            gzipReduceAfterEachChromosomeFolderJob=None,\
            gzipReduceBeforeEachAlignmentFolderJob = None,\
            gzipReduceAfterEachAlignmentFolderJob = None,\
            gzipPreReduceFolderJob = None,\
            gzipReduceFolderJob=None,\
            )
        preReduceReturnData = self.preReduce(passingData=passingData,
            transferOutput=False, **keywords)
        passingData.preReduceReturnData = preReduceReturnData
        no_of_alignments_worked_on= 0
        for alignmentData in passingData.alignmentDataLs:
            alignment = alignmentData.alignment
            parentJobLs = alignmentData.jobLs + [fastaDictJob, fastaIndexJob]
            bamF = alignmentData.bamF
            baiF = alignmentData.baiF

            bamFnamePrefix = alignment.getReadGroup()

            passingData.alignmentJobAndOutputLs = []
            passingData.bamFnamePrefix = bamFnamePrefix
            passingData.individual_alignment = alignment
            passingData.alignmentData = alignmentData

            if skipDoneAlignment and self.isThisAlignmentComplete(
                individual_alignment=alignment, data_dir=data_dir):
                continue
            no_of_alignments_worked_on += 1
            mapEachAlignmentData = self.mapEachAlignment(
                alignmentData=alignmentData, passingData=passingData, \
                transferOutput=False, \
                preReduceReturnData=preReduceReturnData, **keywords)
            passingData.mapEachAlignmentDataLs.append(mapEachAlignmentData)
            passingData.mapEachAlignmentData = mapEachAlignmentData

            reduceBeforeEachAlignmentData = self.reduceBeforeEachAlignment(
                passingData=passingData,
                preReduceReturnData=preReduceReturnData, transferOutput=False, \
                **keywords)
            passingData.reduceBeforeEachAlignmentData = reduceBeforeEachAlignmentData
            passingData.reduceBeforeEachAlignmentDataLs.append(reduceBeforeEachAlignmentData)


            mapReduceOneAlignmentReturnData = self.mapReduceOneAlignment(
                alignmentData=alignmentData, \
                passingData=passingData, \
                chrIDSet=chrIDSet, chrSizeIDList=chrSizeIDList, \
                chr2IntervalDataLs=chr2IntervalDataLs,
                chr2VCFJobData=chr2VCFJobData,
                outputDirPrefix=outputDirPrefix, transferOutput=transferOutput)

            reduceAfterEachAlignmentData = self.reduceAfterEachAlignment(\
                mapEachAlignmentData=mapEachAlignmentData,\
                mapEachChromosomeDataLs=passingData.mapEachChromosomeDataLs,\
                reduceAfterEachChromosomeDataLs=passingData.reduceAfterEachChromosomeDataLs,\
                passingData=passingData, \
                transferOutput=False, data_dir=data_dir, **keywords)
            passingData.reduceAfterEachAlignmentData = reduceAfterEachAlignmentData
            passingData.reduceAfterEachAlignmentDataLs.append(reduceAfterEachAlignmentData)

            gzipReduceBeforeEachAlignmentData = self.addGzipSubWorkflow(\
                inputData=reduceBeforeEachAlignmentData, transferOutput=transferOutput,\
                outputDirPrefix="%sReduceBeforeEachAlignment"%(outputDirPrefix), \
                topOutputDirJob=passingData.gzipReduceBeforeEachAlignmentFolderJob, report=False)
            passingData.gzipReduceBeforeEachAlignmentFolderJob = \
                gzipReduceBeforeEachAlignmentData.topOutputDirJob

            gzipReduceAfterEachAlignmentData = self.addGzipSubWorkflow(\
                inputData=reduceAfterEachAlignmentData, transferOutput=transferOutput,\
                outputDirPrefix="%sReduceAfterEachAlignment"%(outputDirPrefix), \
                topOutputDirJob=passingData.gzipReduceAfterEachAlignmentFolderJob, \
                report=False)
            passingData.gzipReduceAfterEachAlignmentFolderJob = \
                gzipReduceAfterEachAlignmentData.topOutputDirJob
        reduceReturnData = self.reduce(passingData=passingData, \
            mapEachAlignmentData=passingData.mapEachAlignmentData, \
            reduceAfterEachAlignmentDataLs=passingData.reduceAfterEachAlignmentDataLs,\
            **keywords)
        passingData.reduceReturnData = reduceReturnData


        #2012.9.18 gzip the final output
        newReturnData = self.addGzipSubWorkflow(inputData=preReduceReturnData,
            transferOutput=transferOutput,\
            outputDirPrefix="%sGzipPreReduce"%(outputDirPrefix), \
            topOutputDirJob=passingData.gzipPreReduceFolderJob, \
            report=False)
        passingData.gzipPreReduceFolderJob = newReturnData.topOutputDirJob
        newReturnData = self.addGzipSubWorkflow(inputData=reduceReturnData,
            transferOutput=transferOutput,\
            outputDirPrefix="%sGzipReduce"%(outputDirPrefix), \
            topOutputDirJob=passingData.gzipReduceFolderJob, \
            report=False)
        passingData.gzipReduceFolderJob = newReturnData.topOutputDirJob

        sys.stderr.write("%s alignments to be worked on. %s jobs.\n"%(
            no_of_alignments_worked_on, self.no_of_jobs))
        return returnData
    def reduceAfterEachAlignment(self,
                                 passingData=None,
                                 transferOutput=False,
                                 data_dir=None,
                                 **keywords):
        """
        """
        returnData = PassingData(no_of_jobs=0)
        returnData.jobDataLs = []
        alignmentJobAndOutputLs = getattr(passingData,
                                          'alignmentJobAndOutputLs', [])
        bamFnamePrefix = passingData.bamFnamePrefix
        topOutputDirJob = passingData.topOutputDirJob
        individual_alignment = passingData.individual_alignment
        reduceOutputDirJob = passingData.reduceOutputDirJob

        if len(alignmentJobAndOutputLs) > 0:
            #2012.3.29	merge alignment output only when there is something to merge!
            #2013.04.09 create a new child alignment local_realigned =1, etc.
            new_individual_alignment = self.db.copyParentIndividualAlignment(
                parent_individual_alignment_id=individual_alignment.id,\
                mask_genotype_method_id=self.new_mask_genotype_method_id,\
                data_dir=self.data_dir, local_realigned=1)

            baseCoverage = 4  #baseline
            actualCoverage = getattr(individual_alignment.individual_sequence,
                                     'coverage', baseCoverage)
            minMergeAlignmentWalltime = 240
            #in minutes (4 hours), when coverage equals baseCoverage
            maxMergeAlignmentWalltime = 2880  #in minutes, 2 days
            minMergeAlignmentMaxMemory = 7000
            #in MB, when coverage equals baseCoverage
            maxMergeAlignmentMaxMemory = 12000  #in MB

            mergeAlignmentWalltime = self.scaleJobWalltimeOrMemoryBasedOnInput(
                realInputVolume=actualCoverage,
                baseInputVolume=baseCoverage,
                baseJobPropertyValue=minMergeAlignmentWalltime * 2,
                minJobPropertyValue=minMergeAlignmentWalltime,
                maxJobPropertyValue=maxMergeAlignmentWalltime).value
            mergeAlignmentMaxMemory = self.scaleJobWalltimeOrMemoryBasedOnInput(
                realInputVolume=actualCoverage,
                baseInputVolume=baseCoverage,
                baseJobPropertyValue=minMergeAlignmentMaxMemory,
                minJobPropertyValue=minMergeAlignmentMaxMemory,
                maxJobPropertyValue=maxMergeAlignmentMaxMemory).value

            # replace the read group of each alignment job with the new one
            newAlignmentJobAndOutputLs = []
            for alignmentJobAndOutput in alignmentJobAndOutputLs:
                # add an AddReadGroup job
                alignmentJob, indexAlignmentJob = alignmentJobAndOutput.jobLs[:2]
                fileBasenamePrefix = os.path.splitext(
                    alignmentJob.output.name)[0]
                outputRGBAM = File("%s.isq_RG.bam" % (fileBasenamePrefix))
                # needBAMIndexJob=False because addAlignmentMergeJob()
                # does not need .bai.
                addRGJob = self.addReadGroupJob(
                    individual_alignment=new_individual_alignment,
                    inputBamFile=alignmentJob.output,
                    outputBamFile=outputRGBAM,
                    needBAMIndexJob=False,
                    parentJobLs=[alignmentJob, indexAlignmentJob],
                    extraDependentInputLs=alignmentJob.outputLs[1:],
                    job_max_memory=2500,
                    transferOutput=False,
                    walltime=max(180, mergeAlignmentWalltime / 20))

                newAlignmentJobAndOutputLs.append(
                    PassingData(jobLs=[addRGJob], file=addRGJob.output))

            mergedBamFile = File(os.path.join(reduceOutputDirJob.output, \
                '%s_recal.bam'%(bamFnamePrefix)))
            alignmentMergeJob, bamIndexJob = self.addAlignmentMergeJob(
                alignmentJobAndOutputLs=newAlignmentJobAndOutputLs,
                outputBamFile=mergedBamFile,
                needBAMIndexJob=True,
                parentJobLs=[reduceOutputDirJob],
                walltime=mergeAlignmentWalltime,
                job_max_memory=mergeAlignmentMaxMemory,
                transferOutput=False)
            #2012.9.19 add/copy the alignment file to db-affiliated storage
            #add the metric file to AddAlignmentFile2DB.py as well
            #  (to be moved into db-affiliated storage)
            logFile = File(
                os.path.join(reduceOutputDirJob.output,
                             '%s_2db.log' % (bamFnamePrefix)))
            alignment2DBJob = self.addAlignmentFile2DBJob(
                executable=self.AddAlignmentFile2DB,
                inputFile=alignmentMergeJob.output,
                baiFile=bamIndexJob.baiFile,
                individual_alignment_id=new_individual_alignment.id,
                mask_genotype_method_id=self.new_mask_genotype_method_id,
                logFile=logFile,
                data_dir=data_dir,
                otherInputFileList=None,
                parentJobLs=[alignmentMergeJob, bamIndexJob],
                transferOutput=transferOutput,
                sshDBTunnel=self.needSSHDBTunnel,
                commit=True,
                job_max_memory=2000,
                walltime=max(180, mergeAlignmentWalltime / 2))
            self.no_of_jobs += 1
            returnData.jobDataLs.append(PassingData(jobLs=[alignment2DBJob],
                file=alignment2DBJob.logFile, \
                fileLs=[alignment2DBJob.logFile]))
        return returnData
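
To make the walltime/memory scaling above concrete: with baseCoverage = 4, a walltime base of minMergeAlignmentWalltime * 2 = 480 minutes, and clamping to [240, 2880] minutes (and [7000, 12000] MB for memory), a hypothetical 12X alignment would come out roughly as follows under the linear-scaling assumption sketched earlier (illustrative numbers, not framework output):

# worked example under the linear-scaling assumption
actualCoverage = 12
baseCoverage = 4
mergeAlignmentWalltime = min(max(480 * actualCoverage / baseCoverage, 240), 2880)
# => 1440 minutes (24 hours)
mergeAlignmentMaxMemory = min(max(7000 * actualCoverage / baseCoverage, 7000), 12000)
# => 21000 clamped down to 12000 MB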
Example #23
    def addAllJobs(self, \
                data_dir=None, \
                outputDirPrefix="", transferOutput=True, **keywords):
        """
        2013.2.27
            run ms
            estimate parameters from ms
            ms2SLiM
            SLiM forward simulator with estimated ms-parameters or take the output of ms as input
            SLiM2PolymorphismTableFile
            
            AddPopGenSimulation2DB.py
            
        """
        sys.stderr.write("Adding jobs for pop-gen simulation #jobs=%s... \n"%\
                            (self.no_of_jobs))

        returnData = PassingData()
        returnData.jobDataLs = []

        passingData = PassingData(fileBasenamePrefix=None, \
                    outputDirPrefix=outputDirPrefix, \
                    jobData=None,\
                    preReduceReturnData=None,\
                    association_group_key2orderIndex = {},\
                    association_group_key2resultList = {},\
                    association_group_key2reduceAssociationPeakJobMatrix = {},\
                    association_group_key2countAssociationLocusJobList = {},\
                    resultID2defineLandscapeJobData = {},
                    )

        preReduceReturnData = self.preReduce(outputDirPrefix=outputDirPrefix, \
                                    passingData=passingData, transferOutput=False,\
                                    **keywords)

        mapDirJob = preReduceReturnData.mapDirJob
        plotOutputDirJob = preReduceReturnData.plotOutputDirJob
        countAssociationLocusOutputDirJob = preReduceReturnData.countAssociationLocusOutputDirJob
        reduceOutputDirJob = preReduceReturnData.reduceOutputDirJob

        passingData.preReduceReturnData = preReduceReturnData

        #add output pedigree job

        for i in range(self.noOfReplicates):
            popGenSimulationFolderJob = self.addMkDirJob(outputDir=os.path.join(mapDirJob.output, 'popGenSim%s'%(i)), \
                                                        parentJobLs=[mapDirJob])
            #depending on user choice, use ms, sfs_code, slim, or an ms & slim combination
            msOutputFile = File(os.path.join(popGenSimulationFolderJob.output, \
                                    'sim%s_msOutput.txt.gz'%(i)))
            popSimulationJob = self.addMSSimulationJob(outputFile=msOutputFile, \
                                recombinationRate=self.recombinationRate, mutationRate=self.mutationRate, \
                                initialEffectivePopulationSize=self.initialEffectivePopulationSize, \
                                otherParametersPassedToPopGenSimulator=self.otherParametersPassedToPopGenSimulator, \
                                sampleSize=self.sampleSize, noOfLociToSimulate=self.noOfLociToSimulate, \
                                simulateLocusLengthList=self.simulateLocusLengthList, \
                                parentJobLs=[popGenSimulationFolderJob], \
                                extraDependentInputLs=None, extraOutputLs=None, \
                                transferOutput=False, extraArguments=None, extraArgumentList=None, \
                                job_max_memory=2000, walltime=180)

            #. convert the ms pop-gen output to a polymorphism-table file
            msOutputHDF5File = File(os.path.join(popGenSimulationFolderJob.output, \
                                    'sim%s_msOutput.h5'%(i)))
            msOutput2PolymorphismTableFileJob = self.addGenericJob(executable=self.msOutput2PolymorphismTableFile, \
                    inputFile=popSimulationJob.output, \
                    outputFile=msOutputHDF5File,\
                    parentJob=None, parentJobLs=[popGenSimulationFolderJob, popSimulationJob], \
                    extraDependentInputLs=None, extraOutputLs=None, transferOutput=False, \
                    frontArgumentList=None, \
                    extraArguments=None, \
                    extraArgumentList=None, job_max_memory=2000,  \
                    no_of_cpus=None, walltime=None)

            #. add polymorphism-table file to db
            logFile = File(
                os.path.join(popGenSimulationFolderJob.output,
                             "sim%s_2DB.log" % (i)))
            extraArgumentList = ["--r %s"%self.recombinationRate, "--rho %s"%popSimulationJob.rho, "--mu %s"%self.mutationRate,\
                                "--theta %s"%popSimulationJob.theta, "--n0 %s"%self.initialEffectivePopulationSize,\
                                "--no_of_populations 1", "--no_of_chromosomes %s"%self.sampleSize,\
                                "--chromosome_length %s"%popSimulationJob.locusLength,\
                                "--replicate_index %s"%(i)]
            """
            extraArgumentList.append("--parent_pop_gen_simulation_type_id %s"%self.parent_pop_gen_simulation_type_id)
            """
            simulation2DBJob = self.addPutStuffIntoDBJob(executable=self.AddPopGenSimulation2DB, \
                    inputFileList=[msOutput2PolymorphismTableFileJob.output], \
                    logFile=logFile, commit=True, \
                    parentJobLs=[popGenSimulationFolderJob, msOutput2PolymorphismTableFileJob], \
                    extraDependentInputLs=None, transferOutput=True, extraArguments=None, \
                    extraArgumentList=extraArgumentList,\
                    job_max_memory=10, sshDBTunnel=self.needSSHDBTunnel)
    def mapEachInterval(self,
                        VCFJobData=None,
                        passingData=None,
                        transferOutput=False,
                        **keywords):
        """
        use VCFJobData
        
        #. extract flanking sequences from the input VCF (ref sequence file => contig ref sequence)
        #. blast them
        #. run FindSNPPositionOnNewRefFromFlankingBlastOutput.py
            #. where hit length matches query length and the number of mismatches is <=2 => good => infer new coordinates
        #. output a mapping file between old SNP and new SNP coordinates.
            #. reduce this thing by combining everything
        #. make a new VCF file based on the input split VCF file
            #. (replace contig ID and position with the new ones; remove or replace the header section regarding chromosomes)
        """

        returnData = PassingData(no_of_jobs=0)
        returnData.jobDataLs = []

        topOutputDirJob = passingData.topOutputDirJob
        mapDirJob = passingData.mapDirJob
        reduceOutputDirJob = passingData.reduceOutputDirJob

        intervalFileBasenamePrefix = passingData.intervalFileBasenamePrefix
        jobData = passingData.jobData
        VCFFile = VCFJobData.file

        splitVCFJob = passingData.mapEachVCFData.splitVCFJob
        chromosome = passingData.chromosome

        # scale walltime & memory by input volume (noOfIndividuals x span)
        realInputVolume = passingData.noOfIndividuals * passingData.span
        baseInputVolume = 600 * 2000  #600 individuals at 2000 sites
        #at the base volume, walltime is 60 minutes
        walltime = self.scaleJobWalltimeOrMemoryBasedOnInput(
            realInputVolume=realInputVolume,
            baseInputVolume=baseInputVolume,
            baseJobPropertyValue=60,
            minJobPropertyValue=60,
            maxJobPropertyValue=1200).value
        #at the base volume, job_max_memory is 4000M
        job_max_memory = self.scaleJobWalltimeOrMemoryBasedOnInput(
            realInputVolume=realInputVolume,
            baseInputVolume=baseInputVolume,
            baseJobPropertyValue=4000,
            minJobPropertyValue=4000,
            maxJobPropertyValue=8000).value

        outputFnamePrefix = os.path.join(
            mapDirJob.output,
            '%s.sameSite.concordance' % (intervalFileBasenamePrefix))
        outputFile = File('%s.tsv' % (outputFnamePrefix))

        returnData.mapJob = self.addAbstractMapperLikeJob(
            executable=self.CalculateSameSiteConcordanceInVCF,
            inputF=VCFFile,
            outputF=outputFile,
            parentJobLs=[mapDirJob] + VCFJobData.jobLs,
            transferOutput=transferOutput,
            job_max_memory=job_max_memory,
            walltime=walltime)

        return returnData
Example #25
    def reduceAfterEachAlignment(self,
                                 passingData=None,
                                 transferOutput=False,
                                 data_dir=None,
                                 **keywords):
        """
        """
        returnData = PassingData(no_of_jobs=0)
        returnData.jobDataLs = []
        alignmentJobAndOutputLs = getattr(passingData,
                                          'alignmentJobAndOutputLs', [])
        bamFnamePrefix = passingData.bamFnamePrefix
        topOutputDirJob = passingData.topOutputDirJob
        individual_alignment = passingData.individual_alignment
        reduceOutputDirJob = passingData.reduceOutputDirJob

        if len(alignmentJobAndOutputLs) > 0:
            #2012.3.29	merge alignment output only when there is something to merge!
            #2013.04.09 create a new child alignment local_realigned =1, etc.
            new_individual_alignment = self.db.copyParentIndividualAlignment(
                parent_individual_alignment_id=individual_alignment.id,\
                data_dir=self.data_dir,
                local_realigned=individual_alignment.local_realigned,\
                reduce_reads=1)

            # replace the read group of each alignment job with the new one
            newAlignmentJobAndOutputLs = []
            for alignmentJobAndOutput in alignmentJobAndOutputLs:
                # add an AddReadGroup job
                alignmentJob, indexAlignmentJob = alignmentJobAndOutput.jobLs[:2]
                fileBasenamePrefix = os.path.splitext(
                    alignmentJob.output.name)[0]
                outputRGBAM = File("%s.isq_RG.bam" % (fileBasenamePrefix))
                # needBAMIndexJob=False because addAlignmentMergeJob()
                # does not need .bai.
                addRGJob = self.addReadGroupJob(
                    individual_alignment=new_individual_alignment,
                    inputBamFile=alignmentJob.output,
                    outputBamFile=outputRGBAM,
                    needBAMIndexJob=False,
                    parentJobLs=[alignmentJob, indexAlignmentJob],
                    extraDependentInputLs=alignmentJob.outputLs[1:],
                    job_max_memory=2500,
                    transferOutput=False)

                newAlignmentJobAndOutputLs.append(
                    PassingData(jobLs=[addRGJob], file=addRGJob.output))
            mergedBamFile = File(
                os.path.join(reduceOutputDirJob.output,
                             '%s.merged.bam' % (bamFnamePrefix)))
            alignmentMergeJob, bamIndexJob = self.addAlignmentMergeJob(
                alignmentJobAndOutputLs=newAlignmentJobAndOutputLs,
                outputBamFile=mergedBamFile,
                needBAMIndexJob=True,
                parentJobLs=[reduceOutputDirJob],
                transferOutput=False)
            #2012.9.19 add/copy the alignment file to db-affiliated storage
            #add the metric file to AddAlignmentFile2DB.py as well
            #  (to be moved into db-affiliated storage)
            logFile = File(
                os.path.join(reduceOutputDirJob.output,
                             '%s_2db.log' % (bamFnamePrefix)))
            alignment2DBJob = self.addAlignmentFile2DBJob(
                executable=self.AddAlignmentFile2DB,
                inputFile=alignmentMergeJob.output,
                baiFile=bamIndexJob.baiFile,
                individual_alignment_id=new_individual_alignment.id,
                logFile=logFile,
                data_dir=data_dir,
                otherInputFileList=None,
                parentJobLs=[alignmentMergeJob, bamIndexJob],
                transferOutput=transferOutput,
                job_max_memory=2000,
                sshDBTunnel=self.needSSHDBTunnel,
                commit=True)
            self.no_of_jobs += 1
            returnData.jobDataLs.append(
                PassingData(jobLs=[alignment2DBJob],
                file=alignment2DBJob.logFile, \
                fileLs=[alignment2DBJob.logFile]))
        return returnData
Example #26
    def mapEachInterval(self, alignmentData=None, intervalData=None, chromosome=None,
        VCFJobData=None, passingData=None, reduceBeforeEachAlignmentData=None,
        mapEachChromosomeData=None, transferOutput=False, \
        **keywords):
        """
        2013.03.31 use VCFJobData to decide whether to add BQSR jobs, called in ShortRead2Alignment.py
        2012.9.17
        """
        returnData = PassingData(no_of_jobs=0)
        returnData.jobDataLs = []

        topOutputDirJob = passingData.topOutputDirJob

        alignment = alignmentData.alignment
        bamF = alignmentData.bamF
        baiF = alignmentData.baiF
        bamFnamePrefix = passingData.bamFnamePrefix

        #SNPVCFFile = VCFJobData.file
        #if SNPVCFFile is None or VCFJobData is None:
        # #2013.04.09	BQSR requires a VCF input regardless of the chromosome
        #	VCFJobData = self.randomSNPVCFJobDataForBQSR

        #SNPVCFFile = VCFJobData.file
        #SNPVCFJobLs = VCFJobData.jobLs

        if intervalData.file:
            mpileupInterval = intervalData.interval
            bcftoolsInterval = intervalData.file
        else:
            mpileupInterval = intervalData.interval
            bcftoolsInterval = intervalData.interval
        intervalFileBasenameSignature = intervalData.intervalFileBasenameSignature
        overlapInterval = intervalData.overlapInterval
        overlapFileBasenameSignature = intervalData.overlapIntervalFileBasenameSignature
        span = intervalData.span

        if chromosome is None:
            chromosome = getattr(passingData, 'chromosome', None)

        median_depth = getattr(alignment, 'median_depth', 4)
        readSpace = median_depth * span
        #base is 4X coverage in a 20Mb region => 60 minutes
        reduceReadsJobWalltime = self.scaleJobWalltimeOrMemoryBasedOnInput(
            realInputVolume=readSpace, \
            baseInputVolume=4*20000000, baseJobPropertyValue=60, \
            minJobPropertyValue=60, maxJobPropertyValue=500).value
        #base is 4X => 4000M
        reduceReadsJobMaxMemory = self.scaleJobWalltimeOrMemoryBasedOnInput(
            realInputVolume=median_depth, \
            baseInputVolume=4, baseJobPropertyValue=4000, \
            minJobPropertyValue=4000, maxJobPropertyValue=8000).value

        reduceReadsBamFile = File(os.path.join(topOutputDirJob.output, \
            '%s_%s.reduceReads.bam'%\
            (bamFnamePrefix, overlapFileBasenameSignature)))
        #Default downsampling setting is 40 in GATK 2.4.9
        # this downsampling happens at the ReadWalker level,
        #extraArgumentList= ["--downsample_to_coverage 250", "--downsampling_type BY_SAMPLE"]

        extraArgumentList = ["--downsample_coverage 250"]
        #This level of downsampling only happens after the region has been evaluated,
        #  therefore it can be combined with the engine-level downsampling.

        reduceReadsJob = self.addGATKJob(executable=self.ReduceReadsJava,
            GenomeAnalysisTKJar=self.GenomeAnalysisTK2Jar, \
            GATKAnalysisType='ReduceReads',\
            inputFile=bamF, inputArgumentOption="-I",
            refFastaFList=passingData.refFastaFList, inputFileList=None,\
            argumentForEachFileInInputFileList=None,\
            interval=overlapInterval, outputFile=reduceReadsBamFile, \
            parentJobLs=alignmentData.jobLs, transferOutput=False, \
            job_max_memory=reduceReadsJobMaxMemory,\
            frontArgumentList=None, extraArguments=None, \
            extraArgumentList=extraArgumentList, \
            extraOutputLs=[], \
            extraDependentInputLs=[baiF], no_of_cpus=None, \
            walltime=reduceReadsJobWalltime)
        indexBamJob = self.addBAMIndexJob(
            BuildBamIndexFilesJava=self.BuildBamIndexFilesJava, \
            BuildBamIndexJar=self.BuildBamIndexJar, \
            inputBamF=reduceReadsJob.output,\
            parentJobLs=[reduceReadsJob], \
            transferOutput=False, job_max_memory=3000, \
            walltime=max(120, int(reduceReadsJobWalltime/3)))
        passingData.alignmentJobAndOutputLs.append(PassingData(
            jobLs=[reduceReadsJob, indexBamJob], \
            file=reduceReadsJob.output, fileLs=[reduceReadsJob.output]))
        return returnData
    def addAllJobs(self, inputData=None, chr2IntervalDataLs=None,
        data_dir=None,
        intervalSize=3000, intervalOverlapSize=0,
        outputDirPrefix="", passingData=None, \
        transferOutput=True, job_max_memory=2000, **keywords):
        """
        2013.06.14 bugfix regarding noOfUnits,
            which was all inferred from one file.
        2012.7.26
            architect of the whole map-reduce framework
        """
        print(
            f"Adding jobs for {len(inputData.jobDataLs)} input "
            "genome files ...",
            flush=True)

        returnData = PassingData()
        returnData.jobDataLs = []

        #2012.9.22
        #  mapEachAlignmentDataLs is never reset.
        #  mapEachChromosomeDataLs is reset upon each new alignment.
        #  mapEachIntervalDataLs is reset upon each new chromosome.
        #  none of the reduce lists ever gets reset.
        #  fileBasenamePrefix is the prefix of the input file's basename,
        #    used for temporary output files in reduceEachInput(),
        #    but not for output files in mapEachInterval().
        passingData = PassingData(
            fileBasenamePrefix=None,
            chromosome=None,

            outputDirPrefix=outputDirPrefix,
            intervalFileBasenamePrefix=None,

            registerReferenceData=None,
            refFastaFList=None,
            refFastaF=None,

            fastaDictJob=None,
            refFastaDictF=None,
            fastaIndexJob=None,
            refFastaIndexF=None,

            intervalOverlapSize=intervalOverlapSize,
            intervalSize=intervalSize,
            jobData=None,
            splitInputFile=None,
            intervalDataLs=None,
            preReduceReturnData=None,

            mapEachIntervalData=None,
            mapEachIntervalDataLs=None,
            mapEachIntervalDataLsLs=[],
            mapEachInputData=None,
            mapEachInputDataLs=None,
            mapEachInputDataLsLs=[],
            mapEachChromosomeData=None,
            mapEachChromosomeDataLs=[],

            chromosome2mapEachIntervalDataLs={},
            chromosome2mapEachInputDataLs={},

            reduceEachInputData=None,
            reduceEachChromosomeData=None,
            reduceEachInputDataLs=None,
            reduceEachInputDataLsLs=[],
            reduceEachChromosomeDataLs=[],
            )
        # mapEachIntervalDataLsLs is a list of mapEachIntervalDataLs, one per input file.
        # mapEachInputDataLsLs is a list of mapEachInputDataLs, one per chromosome.
        # reduceEachInputDataLsLs is a list of reduceEachInputDataLs, one per chromosome.
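        # For example, after two chromosomes have each seen two input files,
        # reduceEachInputDataLsLs looks like
        #   [[reduceData_chr1_in1, reduceData_chr1_in2],
        #    [reduceData_chr2_in1, reduceData_chr2_in2]]
        # and reduceEachInputDataLs always points at the innermost, current list.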

        preReduceReturnData = self.preReduce(outputDirPrefix=outputDirPrefix,
            passingData=passingData, transferOutput=True,\
            **keywords)
        passingData.preReduceReturnData = preReduceReturnData

        #gzip folder jobs (reused across iterations to avoid repeatedly creating the same folder)
        gzipReduceEachInputFolderJob = None
        gzipReduceEachChromosomeFolderJob = None
        gzipReduceFolderJob = None
        gzipPreReduceFolderJob = None
        no_of_input_files = 0

        firstInterval = True

        for chromosome, intervalDataLs in chr2IntervalDataLs.items():
            passingData.chromosome = chromosome
            mapEachChromosomeData = self.mapEachChromosome(
                chromosome=chromosome,
                passingData=passingData,
                transferOutput=False,
                **keywords)
            passingData.mapEachChromosomeData = mapEachChromosomeData
            passingData.mapEachChromosomeDataLs.append(mapEachChromosomeData)

            passingData.mapEachInputDataLsLs.append([])
            #the last element of the double list is the current one
            passingData.mapEachInputDataLs = passingData.mapEachInputDataLsLs[
                -1]
            passingData.mapEachIntervalDataLs = []
            passingData.chromosome2mapEachIntervalDataLs[chromosome] = []

            passingData.reduceEachInputDataLsLs.append([])
            passingData.reduceEachInputDataLs = passingData.reduceEachInputDataLsLs[
                -1]

            for jobData in inputData.jobDataLs:
                passingData.jobData = jobData
                passingData.inputJobData = jobData

                inputFile = jobData.file
                commonFileBasenamePrefix = utils.getFileBasenamePrefixFromPath(
                    inputFile.name)
                passingData.fileBasenamePrefix = commonFileBasenamePrefix

                no_of_input_files += 1
                if no_of_input_files % 10 == 0:
                    sys.stderr.write("%s\t%s Inputs." %
                                     ('\x08' * 40, no_of_input_files))

                for intervalData in intervalDataLs:
                    selectIntervalJobData = self.selectIntervalFromInputFile(
                        jobData=jobData,
                        chromosome=chromosome,
                        intervalData=intervalData,
                        mapEachChromosomeData=mapEachChromosomeData,
                        passingData=passingData,
                        transferOutput=firstInterval,
                        **keywords)
                    mapEachIntervalData = self.mapEachInterval(
                        inputJobData=jobData,
                        selectIntervalJobData=selectIntervalJobData,
                        chromosome=chromosome,
                        intervalData=intervalData,
                        mapEachChromosomeData=mapEachChromosomeData, \
                        passingData=passingData,
                        transferOutput=firstInterval,
                        **keywords)

                    passingData.mapEachIntervalData = mapEachIntervalData
                    passingData.mapEachIntervalDataLs.append(
                        mapEachIntervalData)
                    passingData.chromosome2mapEachIntervalDataLs[
                        chromosome].append(mapEachIntervalData)

                    linkMapToReduceData = self.linkMapToReduce(
                        mapEachIntervalData=mapEachIntervalData,
                        preReduceReturnData=preReduceReturnData,
                        passingData=passingData,
                        **keywords)
                    if firstInterval:
                        firstInterval = False
                reduceEachInputData = self.reduceEachInput(
                    chromosome=chromosome, passingData=passingData,
                    mapEachIntervalDataLs=passingData.mapEachIntervalDataLs,
                    transferOutput=False, data_dir=data_dir, \
                    **keywords)
                passingData.reduceEachInputData = reduceEachInputData
                passingData.reduceEachInputDataLs.append(reduceEachInputData)

                gzipReduceEachInputData = self.addGzipSubWorkflow(\
                    inputData=reduceEachInputData,
                    outputDirPrefix="%sReduceEachInput"%(outputDirPrefix),
                    topOutputDirJob=gzipReduceEachInputFolderJob, \
                    transferOutput=transferOutput,
                    report=False)
                gzipReduceEachInputFolderJob = gzipReduceEachInputData.topOutputDirJob
            reduceEachChromosomeData = self.reduceEachChromosome(
                chromosome=chromosome, passingData=passingData, \
                mapEachInputDataLs=passingData.mapEachInputDataLs, \
                chromosome2mapEachIntervalDataLs=passingData.chromosome2mapEachIntervalDataLs,
                reduceEachInputDataLs=passingData.reduceEachInputDataLs,\
                transferOutput=False, data_dir=data_dir, \
                **keywords)
            passingData.reduceEachChromosomeData = reduceEachChromosomeData
            passingData.reduceEachChromosomeDataLs.append(
                reduceEachChromosomeData)

            gzipReduceEachChromosomeData = self.addGzipSubWorkflow(
                inputData=reduceEachChromosomeData,
                outputDirPrefix="%sReduceEachChromosome"%(outputDirPrefix), \
                topOutputDirJob=gzipReduceEachChromosomeFolderJob,
                transferOutput=transferOutput,
                report=False)
            gzipReduceEachChromosomeFolderJob = \
                gzipReduceEachChromosomeData.topOutputDirJob

        reduceReturnData = self.reduce(passingData=passingData,
            transferOutput=False,
            mapEachChromosomeDataLs=passingData.mapEachChromosomeDataLs,
            reduceEachChromosomeDataLs=passingData.reduceEachChromosomeDataLs,
            **keywords)
        passingData.reduceReturnData = reduceReturnData

        if self.needGzipPreReduceReturnData:
            gzipPreReduceReturnData = self.addGzipSubWorkflow(
                inputData=preReduceReturnData, transferOutput=transferOutput,
                outputDirPrefix="%sPreReduce"%(outputDirPrefix), \
                topOutputDirJob= gzipPreReduceFolderJob, report=False)
            gzipPreReduceFolderJob = gzipPreReduceReturnData.topOutputDirJob

        if self.needGzipReduceReturnData:
            gzipReduceReturnData = self.addGzipSubWorkflow(
                inputData=reduceReturnData, transferOutput=transferOutput,
                outputDirPrefix="%sReduce"%(outputDirPrefix), \
                topOutputDirJob=gzipReduceFolderJob, report=False)
            gzipReduceFolderJob = gzipReduceReturnData.topOutputDirJob

        print(f" {no_of_input_files} Input files.", flush=True)
        sys.stderr.write(f"{self.no_of_jobs} jobs.\n")
        return reduceReturnData
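The control flow of addAllJobs() is easier to see stripped of the Pegasus job objects. Below is a minimal, self-contained sketch of the same nesting (per chromosome, per input file, per interval) using plain lists and tuples; run_map_reduce and its tuple payloads are hypothetical stand-ins for illustration only, not part of the original workflow API.

# A simplified sketch of the nesting that addAllJobs() expresses as Pegasus jobs.
# It only demonstrates the bookkeeping; it adds no real workflow jobs.
def run_map_reduce(chr2IntervalDataLs, inputFiles):
    reduceEachInputDataLsLs = []     # one inner list per chromosome
    reduceEachChromosomeDataLs = []
    for chromosome, intervalDataLs in chr2IntervalDataLs.items():
        reduceEachInputDataLsLs.append([])
        # the last element of the double list is the current chromosome's list
        reduceEachInputDataLs = reduceEachInputDataLsLs[-1]
        mapEachIntervalDataLs = []   # reset for each new chromosome
        for inputFile in inputFiles:
            for interval in intervalDataLs:
                # "map" step: one unit of work per (input, interval)
                mapEachIntervalDataLs.append((inputFile, chromosome, interval))
            # per-input reduce over the intervals mapped so far
            reduceEachInputDataLs.append(('reduceEachInput', inputFile, chromosome))
        # per-chromosome reduce over all inputs of that chromosome
        reduceEachChromosomeDataLs.append(
            ('reduceEachChromosome', chromosome, len(reduceEachInputDataLs)))
    # final reduce over all chromosomes
    return ('reduce', len(reduceEachChromosomeDataLs))

if __name__ == '__main__':
    print(run_map_reduce(
        {'chr1': ['0-3000', '3000-6000'], 'chr2': ['0-3000']},
        ['sample1.vcf', 'sample2.vcf']))   # ('reduce', 2)

The real method additionally threads a PassingData object through every hook so that subclasses can share state between the map and reduce callbacks.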
Example #28
0
 def addJobs(self, inputData=None, outputDirPrefix="", ntDatabaseFileList=None,
     noOfTotalSequences=None, transferOutput=True, makeBlastDBJob=None):
     """
     2012.5.24
     """
     
     sys.stderr.write("Adding blast jobs for %s input ... "%(len(inputData.jobDataLs)))
     no_of_jobs= 0
     
     topOutputDir = "%sBlast"%(outputDirPrefix)
     topOutputDirJob = self.addMkDirJob(outputDir=topOutputDir)
     no_of_jobs += 1
     
     allBlastResultFile = File(os.path.join(topOutputDir, 'blast.tsv'))
     allBlastMergeJob = self.addStatMergeJob(
         statMergeProgram=self.mergeSameHeaderTablesIntoOne, \
         outputF=allBlastResultFile, transferOutput=transferOutput,
         parentJobLs=[topOutputDirJob])
     no_of_jobs += 1
     
     ntDatabaseFile = ntDatabaseFileList[0]
     returnData = PassingData()
     returnData.jobDataLs = []
     
     for jobData in inputData.jobDataLs:
         inputF = jobData.output
         outputFnamePrefix = os.path.join(topOutputDir,
             os.path.splitext(os.path.basename(inputF.name))[0])
         
         splitFastaJob = self.addSplitFastaFileJob(executable=self.SplitFastaFile,
             inputFile=inputF, outputFnamePrefix=outputFnamePrefix, \
             noOfSequencesPerSplitFile=self.blockSize, filenameSuffix=".fasta",
             noOfTotalSequences=noOfTotalSequences,\
             parentJobLs=jobData.jobLs + [topOutputDirJob],
             extraDependentInputLs=None, transferOutput=False, \
             extraArguments=None, job_max_memory=500)
         no_of_jobs += 1
         for splitFastaOutput in splitFastaJob.outputList:
             outputFile = File('%s.tsv'%(splitFastaOutput.name))
             blastJob = self.addBlastWrapperJob(executable=self.BlastWrapper,
                 inputFile=splitFastaOutput, outputFile=outputFile,
                 outputFnamePrefix=splitFastaOutput.name,
                 databaseFile=ntDatabaseFile,
                 maxNoOfMismatches=self.maxNoOfMismatches,
                 minNoOfIdentities=self.minNoOfIdentities,
                 minIdentityPercentage=self.minIdentityPercentage,
                 blastallPath=self.blastallPath,
                 parentJobLs=[splitFastaJob, makeBlastDBJob],
                 extraDependentInputLs=ntDatabaseFileList,
                 transferOutput=False, \
                 extraArguments=None, job_max_memory=1000)
             
             #feed this chunk's blast output into the overall merge job
             self.addInputToMergeJob(allBlastMergeJob, \
                 inputF=blastJob.output, parentJobLs=[blastJob])
             no_of_jobs += 1
     sys.stderr.write("%s jobs. Done.\n"%(no_of_jobs))
     #register the merged blast table as this workflow's output
     returnData.jobDataLs.append(PassingData(jobLs=[allBlastMergeJob],
         file=allBlastResultFile, fileLs=[allBlastResultFile]))
     return returnData
     return returnData
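The shape of addJobs() above, splitting every input FASTA into blocks of self.blockSize sequences, running one blast job per block, and merging all per-block tables into blast.tsv, can be mimicked locally without Pegasus. In the sketch below, split_records and run_blast_on_chunk are hypothetical placeholders; nothing actually invokes blastall.

# A minimal local sketch of the split -> per-chunk work -> merge pattern.
def split_records(records, blockSize):
    """Yield lists of at most blockSize (header, sequence) records."""
    for i in range(0, len(records), blockSize):
        yield records[i:i + blockSize]

def run_blast_on_chunk(chunk):
    # placeholder for the real BlastWrapper call; one result row per query
    return [f"{header}\tno_hit" for header, _sequence in chunk]

def blast_all(records, blockSize=3):
    merged = []                       # plays the role of allBlastMergeJob
    for chunk in split_records(records, blockSize):
        merged.extend(run_blast_on_chunk(chunk))   # fan-in to one table
    return merged

if __name__ == '__main__':
    fake_records = [(f"seq{i}", "ACGT") for i in range(7)]
    print(len(blast_all(fake_records)))   # 7 rows, produced from 3 chunks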