Example #1
 def addMakeBlastDBJob(self, executable:Transformation=None, inputFile=None, \
     parentJobLs=None, extraDependentInputLs=None, transferOutput=False, \
     extraArguments=None, job_max_memory=500, **keywords):
     """
     2012.10.9 use addGenericJob() instead
     2012.5.24
         untested
     """
     extraOutputLs = []
     for suffix in ['.nin', '.nhr', '.nsq']:  #nucleotide db index files produced by formatdb
         dbIndexFile = File('%s%s'%(inputFile.name, suffix))
         extraOutputLs.append(dbIndexFile)
     # 2013.07.09
     extraOutputLs.append(File("formatdb.log"))
     
     extraArgumentList = ["-p F"]
     job = self.addGenericJob(executable=executable,
         inputFile=inputFile, outputFile=None, \
         extraArguments=extraArguments, extraArgumentList=extraArgumentList, \
         parentJobLs=parentJobLs, extraDependentInputLs=extraDependentInputLs, \
         extraOutputLs=extraOutputLs,\
         transferOutput=transferOutput, \
         key2ObjectForJob=None,\
         job_max_memory=job_max_memory)
     return job
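
A hedged usage sketch of the method above, written against the same workflow class. self.formatdb (a registered formatdb Transformation) and ntDatabaseFile (a registered fasta File) are illustrative assumptions, not names taken from this code.

# inside the same workflow class, while assembling the blast sub-workflow
makeBlastDBJob = self.addMakeBlastDBJob(
    executable=self.formatdb,   # assumed registered Transformation
    inputFile=ntDatabaseFile,   # assumed registered fasta File
    transferOutput=False,
    job_max_memory=500)
# makeBlastDBJob can then be handed to addJobs() (Example #18) so every blast
#   job depends on the finished .nin/.nhr/.nsq index files.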
Example #2
    def mapEachInterval(self, inputJobData=None, selectIntervalJobData=None, \
        chromosome=None,intervalData=None,\
        mapEachChromosomeData=None, \
        passingData=None, transferOutput=False, **keywords):
        """
        #. extract flanking sequences from the input (ref sequence file => contig ref sequence)
        #. blast them
        #. run FindSNPPositionOnNewRefFromFlankingBlastOutput.py
            #. a hit is good if its length matches the query length and
            #    it has no more than 2 mismatches => infer new coordinates
        #. output a mapping file between old and new SNP coordinates.
            #. reduce this step by combining everything
        #. make a new input file based on the split input file
            (replace contig IDs and positions with the new ones;
                remove or replace the header lines regarding chromosomes)

        """
        returnData = PassingData(no_of_jobs = 0)
        returnData.jobDataLs = []
        # passingData.intervalFileBasenamePrefix and passingData.splitInputFile
        #   are set upstream for this interval; see the note below.
        """
        ## 2013.06.19 structures available from passingData, specific to the interval
        passingData.splitInputFile = splitInputFile
        passingData.unitNumber = unitNumber
        passingData.intervalFileBasenamePrefix = '%s_%s_splitInput_u%s'%(
            chromosome, commonPrefix, unitNumber)
        passingData.noOfIndividuals = jobData.file.noOfIndividuals
        passingData.span = self.intervalSize + self.intervalOverlapSize*2
        """
        #add one computing job
        outputFile = File(os.path.join(self.mapDirJob.output,
            "%s.%s.probability.tsv.gz"%(passingData.fileBasenamePrefix,\
            intervalData.interval)))
        locusIntervalDeltaOutputFile = File(os.path.join(self.mapDirJob.output,
            "%s.%s.locusIntervalDelta.tsv.gz"%(passingData.fileBasenamePrefix,
            intervalData.interval)))
        job = self.addAbstractMatrixFileWalkerJob(
            executable=self.ComputeLiftOverLocusProbability, \
            inputFile=selectIntervalJobData.file, outputFile=outputFile, \
            whichColumn=None, whichColumnHeader=None, \
            logY=None, valueForNonPositiveYValue=-1, \
            minNoOfTotal=1, samplingRate=1, \
            inputFileFormat=None, outputFileFormat=None,\
            extraArgumentList=["--locusIntervalDeltaOutputFname", locusIntervalDeltaOutputFile, \
                "--startPosition %s"%(intervalData.start), "--stopPosition %s"%(intervalData.stop)],
            parentJobLs=[selectIntervalJobData.job],
            extraOutputLs=[locusIntervalDeltaOutputFile],\
            transferOutput=transferOutput, job_max_memory=2000, sshDBTunnel=False)
            #For each interval, probabilities are not calculated for loci in
            #  extra segment (from overlapStart to start).
        returnData.jobDataLs.append(self.constructJobDataFromJob(job))
        return returnData
Example #3
 def reduceEachChromosome(self, chromosome=None, passingData=None,
     mapEachInputDataLs=None, 
     chromosome2mapEachIntervalDataLs=None,\
     reduceEachInputDataLs=None,\
     transferOutput=True, \
     **keywords):
     """
     """
     returnData = PassingData(no_of_jobs = 0)
     returnData.jobDataLs = []
     returnData.mapEachInputDataLs = mapEachInputDataLs
     returnData.reduceEachInputDataLs = reduceEachInputDataLs
     #reduce matrix by chosen column and average p-value
     
     outputFile = File(os.path.join(self.reduceEachChromosomeDirJob.output,
         'chr_%s_LocusLiftOverProbability.tsv.gz'%(chromosome)))
     reduceChromosomeJob = self.addStatMergeJob(
         statMergeProgram=self.mergeSameHeaderTablesIntoOne, \
         outputF=outputFile, \
         parentJobLs=[self.reduceEachChromosomeDirJob],extraOutputLs=None, \
         extraDependentInputLs=None, transferOutput=False)
         #extraArgumentList=['--keyColumnLs 0-6 --valueColumnLs 7'],\
     mapEachIntervalDataLs = chromosome2mapEachIntervalDataLs.get(chromosome)
     for mapEachIntervalData in mapEachIntervalDataLs:
         for jobData in mapEachIntervalData.jobDataLs:
             self.addInputToMergeJob(reduceChromosomeJob, parentJobLs=[jobData.job])
         
     #add the reduction job to final stat merge job
     self.addInputToMergeJob(self.reduceJob, parentJobLs=[reduceChromosomeJob])
     
     return returnData
Example #4
 def addSplitFastaFileJob(self, executable:Transformation=None, inputFile:File=None,
     outputFnamePrefix=None, \
     noOfSequencesPerSplitFile=1000, filenameSuffix="", noOfTotalSequences=1000000,\
     parentJobLs=None, extraDependentInputLs=None, transferOutput=False, \
     extraArguments=None, job_max_memory=500, **keywords):
     """
     2012.5.24
     """
     noOfSplitFiles = int(math.ceil(noOfTotalSequences/float(noOfSequencesPerSplitFile)))
     suffixLength = len(repr(noOfSplitFiles))
     
     job = self.addGenericJob(executable=executable, inputArgumentOption="-i",
         inputFile=inputFile,
         extraArguments=extraArguments, \
         extraArgumentList=["--noOfSequences %s"%(noOfSequencesPerSplitFile), \
             "--outputFnamePrefix", outputFnamePrefix,
             '--filenameSuffix %s'%(filenameSuffix),
             '--suffixLength %s'%(suffixLength)],
         parentJobLs=parentJobLs,
         extraDependentInputLs=extraDependentInputLs,
         job_max_memory=job_max_memory)
     
     for i in range(noOfSplitFiles):	#start from 0
         splitFname = utils.comeUpSplitFilename(outputFnamePrefix=outputFnamePrefix,
             suffixLength=suffixLength, fileOrder=i,\
             filenameSuffix=filenameSuffix)
         splitFile = File(splitFname)
         self.addJobUse(job, file=splitFile, is_input=False, transfer=transferOutput)
     return job
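
The split-file names registered above come from utils.comeUpSplitFilename(), whose implementation is not shown here. A minimal sketch of the naming convention it is assumed to follow (zero-padded file order plus an optional suffix); the real helper may differ.

def comeUpSplitFilenameSketch(outputFnamePrefix, suffixLength, fileOrder, filenameSuffix=""):
    # e.g. ("contig1", 3, 5, ".fasta") -> "contig1_005.fasta"
    return "%s_%0*d%s" % (outputFnamePrefix, suffixLength, fileOrder, filenameSuffix)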
Example #5
    def addJobs(self, inputURL=None, relativePathList =[], outputDir="", username=None, password=None, \
            transferOutput=True):
        """
        2012.6.27
        """
        sys.stderr.write("Adding wget jobs for %s input ... " %
                         (len(relativePathList)))
        no_of_jobs = 0

        topOutputDir = outputDir
        topOutputDirJob = self.addMkDirJob(outputDir=topOutputDir)
        no_of_jobs += 1
        returnData = PassingData()
        returnData.jobDataLs = []

        for relativePath in relativePathList:
            #2013.06.26 remove all "/" from  relativePath in case it's a folder
            relativePathNoFolder = relativePath.replace('/', '_')
            logFile = File('%s.log' % (relativePathNoFolder))
            wgetJob = self.addWgetJob(executable=self.wget, url=inputURL,
                relativePath=relativePath, \
                username=username, password=password,\
                targetFolder=outputDir, logFile=logFile,
                cut_dir_number=self.cut_dir_number,
                parentJobLs=[topOutputDirJob], extraDependentInputLs=[], \
                transferOutput=transferOutput, \
                extraArguments=None, job_max_memory=50)
            # record the wget job, its main output, and all of its outputs (fileLs)
            returnData.jobDataLs.append(PassingData(jobLs=[wgetJob], file=wgetJob.output, \
                fileLs=wgetJob.outputLs))
            no_of_jobs += 1
        sys.stderr.write("%s jobs.\n" % (no_of_jobs))

        return returnData
    def reduce(self,
               passingData=None,
               reduceEachChromosomeDataLs=None,
               transferOutput=True,
               **keywords):
        """
        #. merge all output of input jobs (passingData.mapEachIntervalDataLsLs) into one big one
        
        """
        returnData = PassingData(no_of_jobs=0)
        returnData.jobDataLs = []
        reduceOutputDirJob = passingData.reduceOutputDirJob

        realInputVolume = passingData.jobData.file.noOfIndividuals * passingData.jobData.file.noOfLoci
        baseInputVolume = 200 * 20000
        walltime = self.scaleJobWalltimeOrMemoryBasedOnInput(
            realInputVolume=realInputVolume,
            baseInputVolume=baseInputVolume,
            baseJobPropertyValue=60,
            minJobPropertyValue=60,
            maxJobPropertyValue=500).value
        job_max_memory = self.scaleJobWalltimeOrMemoryBasedOnInput(
            realInputVolume=realInputVolume,
            baseInputVolume=baseInputVolume,
            baseJobPropertyValue=5000,
            minJobPropertyValue=5000,
            maxJobPropertyValue=10000).value

        outputFile = File(
            os.path.join(reduceOutputDirJob.output, 'sameSiteConcordance.tsv'))
        reduceJob = self.addStatMergeJob(
            statMergeProgram=self.mergeSameHeaderTablesIntoOne,
            outputF=outputFile,
            parentJobLs=[reduceOutputDirJob],
            transferOutput=transferOutput,
        )
        returnData.jobDataLs.append(
            PassingData(jobLs=[reduceJob],
                        file=reduceJob.output,
                        fileLs=[reduceJob.output]))

        for mapEachIntervalDataLs in passingData.mapEachIntervalDataLsLs:
            for mapEachIntervalData in mapEachIntervalDataLs:
                self.addInputToMergeJob(reduceJob, \
                        parentJobLs=[mapEachIntervalData.mapJob])

        return returnData
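
The walltime and memory values above rely on self.scaleJobWalltimeOrMemoryBasedOnInput(), whose formula is not shown in this section. A minimal sketch of the assumed behaviour, linear in the input volume and clamped to the given bounds; the real method may differ in detail.

def scale_job_property(realInputVolume, baseInputVolume, baseJobPropertyValue,
                       minJobPropertyValue, maxJobPropertyValue):
    # grow the base value proportionally to the input volume, then clamp to [min, max]
    scaled = baseJobPropertyValue * float(realInputVolume) / float(baseInputVolume)
    return int(min(max(scaled, minJobPropertyValue), maxJobPropertyValue))

# e.g. 400 individuals x 40000 loci against the 200 x 20000 baseline of 60 minutes:
# scale_job_property(400 * 40000, 200 * 20000, 60, 60, 500) -> 240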
 def run(self):
     """
     """
     self.setup_run()
     
     logDir = "Log"
     logDirJob = self.addMkDirJob(outputDir=logDir)
     individualSequenceID2FilePairLs = self.db_main.getIndividualSequenceID2FilePairLs(
         self.ind_seq_id_ls, data_dir=self.data_dir)
     for ind_seq_id, FilePairLs in individualSequenceID2FilePairLs.items():
         individual_sequence = self.db_main.queryTable(
             SunsetDB.IndividualSequence).get(ind_seq_id)
         if individual_sequence is None or individual_sequence.format!='fastq':
             continue
         for filePair in FilePairLs:
             for fileRecord in filePair:
                 relativePath = fileRecord[0]
                 prefix, suffix = utils.getRealPrefixSuffix(
                     os.path.basename(relativePath))
                 if suffix=='.fastq':
                     filepath = os.path.join(self.data_dir, relativePath)
                     #Do not register the input fastq because InspectBaseQuality
                     #  will output directly into self.data_dir.
                     logFile = File(os.path.join(logDir, f'{prefix}.log'))
                     job = self.addDBJob(
                         executable=self.InspectBaseQuality,
                         outputArgumentOption="--logFilename",
                         outputFile=logFile,
                         extraArgumentList=[
                             '-i', filepath,
                             '--read_sampling_rate', '0.01',
                             '--quality_score_format',
                             individual_sequence.quality_score_format,
                             ],
                         parentJobLs=[logDirJob],
                         transferOutput=True,
                         objectWithDBArguments=self,
                         job_max_memory=5000,
                         walltime=120)
     
     self.end_run()
 def selectIntervalFromInputFile(self, jobData=None, chromosome=None,\
     intervalData=None, mapEachChromosomeData=None,\
     passingData=None, transferOutput=False,\
     **keywords):
     """
     2013.11.24
     """
     inputSuffix = utils.getRealPrefixSuffix(jobData.file.name)[1]
     outputFile = File(os.path.join(self.mapDirJob.output, \
         '%s_%s%s'%(passingData.fileBasenamePrefix, \
         intervalData.overlapInterval, inputSuffix)))
     tabixRetrieveJob = self.addTabixRetrieveJob(
         executable=self.tabixRetrieve, \
         tabixPath=self.tabixPath, \
         inputF=jobData.file, outputF=outputFile, \
         regionOfInterest=intervalData.overlapInterval,
         includeHeader=True,\
         parentJobLs=jobData.jobLs + [self.mapDirJob],
         job_max_memory=100,
         extraDependentInputLs=jobData.fileLs[1:], \
         transferOutput=False)
     return self.constructJobDataFromJob(job=tabixRetrieveJob)
Example #9
    def addWgetJob(self, executable=None, url=None, relativePath=None, username=None, password=None,\
        targetFolder=None, logFile=None, cut_dir_number=1, parentJobLs=None,
        extraDependentInputLs=None, transferOutput=False, \
        extraArguments=None, job_max_memory=2000, **keywords):
        """
        2012.6.27
        """
        extraArgumentList = ['--user=%s'%(username), '--password=%s'%(password), '--recursive', '--no-parent',\
            '--continue', "--reject='index.html*'", "-nc -nH --cut-dirs=%s"%(cut_dir_number), "-P %s"%(targetFolder), \
            "%s/%s"%(url, relativePath)]
        """
        # unlike -nd, --cut-dirs does not lose with subdirectories---for instance, with
        # -nH --cut-dirs=1, a beta/ subdirectory will be placed to xemacs/beta, as one would expect.
        
        -c
        --continue
           Continue getting a partially-downloaded file.  This is useful when you want to finish up a download started
           by a previous instance of Wget, or by another program. 
        
        -nc
        --no-clobber
           If a file is downloaded more than once in the same directory, Wget's behavior depends on a few options,
           including -nc.  In certain cases, the local file will be clobbered, or overwritten, upon repeated download.
           In other cases it will be preserved.		
        
        -nd
        --no-directories
           Do not create a hierarchy of directories when retrieving recursively.  With this option turned on, all
           files will get saved to the current directory, without clobbering (if a name shows up more than once, the
           filenames will get extensions .n).
        
        -np
        --no-parent
           Do not ever ascend to the parent directory when retrieving recursively.  This is a useful option, since it
           guarantees that only the files below a certain hierarchy will be downloaded.
    
        -nH
        --no-host-directories
           Disable generation of host-prefixed directories.  By default, invoking Wget with -r http://fly.srk.fer.hr/
           will create a structure of directories beginning with fly.srk.fer.hr/.  This option disables such behavior.

        -P prefix
        --directory-prefix=prefix
           Set directory prefix to prefix.  The directory prefix is the directory where all other files and
           subdirectories will be saved to, i.e. the top of the retrieval tree.  The default is . (the current
           directory)
"""
        if extraArguments:
            extraArgumentList.append(extraArguments)
        #wget will add some portion of the URL path to the final output files depending on the cut_dir_number
        from urllib.parse import urlparse
        url_path_list = urlparse(url).path.split('/')[1:]
        #[0] is empty because the path starts with '/'
        subPath = '/'.join(url_path_list[cut_dir_number:])

        if relativePath.find(
                '/'
        ) >= 0:  #2013.06.26 it's a folder itself. so no straight output.
            sys.stderr.write("\n\tWarning: item %s is a folder and will not be staged out. "
                "You have to copy it out of the scratch folder manually.\n" % (relativePath))
            extraOutputLs = None
        else:
            extraOutputLs = [
                File(
                    os.path.join(targetFolder,
                                 os.path.join(subPath, relativePath)))
            ]
        #2012.6.27 don't pass the downloaded outputFile to argument outputFile of addGenericJob()
        # because it will add "-o" in front of it. "-o" of wget is reserved for logFile.
        return self.addGenericJob(executable=executable,
            inputFile=None, outputFile=logFile, \
            parentJobLs=parentJobLs,
            extraDependentInputLs=extraDependentInputLs, \
            extraOutputLs=extraOutputLs,\
            transferOutput=transferOutput, \
            extraArgumentList=extraArgumentList, job_max_memory=job_max_memory)
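
For orientation, the extraArgumentList assembled above corresponds roughly to the command line below. The values are illustrative; the actual command is composed by addGenericJob()/Pegasus, with -o reserved for the wget log file as noted in the code.

username, password = "user", "secret"
cut_dir_number, targetFolder = 1, "downloads"
url, relativePath = "http://example.org/data", "sample1.fastq.gz"
wget_cmd = " ".join([
    "wget", "-o %s.log" % relativePath,
    "--user=%s" % username, "--password=%s" % password,
    "--recursive", "--no-parent", "--continue",
    "--reject='index.html*'", "-nc -nH --cut-dirs=%s" % cut_dir_number,
    "-P %s" % targetFolder, "%s/%s" % (url, relativePath)])
print(wget_cmd)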
Example #10
def build_pegasus_wf(cwl_wf: cwl.Workflow, wf_files: dict,
                     wf_input_str: dict) -> Workflow:
    log.info("Building Pegasus workflow")

    wf = Workflow("cwl-converted-pegasus-workflow", infer_dependencies=True)

    for step in cwl_wf.steps:
        step_name = get_basename(step.id)
        log.info("Processing step: {}".format(step_name))
        cwl_cmd_ln_tool = (cwl.load_document(step.run) if isinstance(
            step.run, str) else step.run)

        job = Job(PurePath(cwl_cmd_ln_tool.baseCommand).name,
                  _id=get_basename(step.id))

        # collect current step inputs
        log.info("Collecting step inputs from {}".format(step_name))
        step_inputs = dict()
        for _input in step.in_:
            input_id = get_basename(_input.id)

            step_inputs[input_id] = get_basename(_input.source)
            log.debug("step_inputs[{}] = {}".format(input_id,
                                                    step_inputs[input_id]))

        # add inputs that are of type File
        for _input in cwl_cmd_ln_tool.inputs:
            if _input.type == "File":
                wf_file = File(wf_files[step_inputs[get_name(
                    step.id, _input.id)]])

                job.add_inputs(wf_file)
                log.info("Step: {} added input file: {}".format(
                    step_name, wf_file.lfn))
            """
            # TODO: handle File[] inputs
            elif isinstance(_input.type, cwl.CommandInputArraySchema):
                if _input.type.items == "File":
                    for f in step_inputs[get_name(step.id, _input.id)]:
                        wf_file = File(wf_files[f])

                        job.add_inputs(wf_file)
                        log.info(
                            "Step: {} added input file: {}".format(
                                step_name, wf_file.lfn
                            )
                        )
            """
        # add job outputs that are of type File
        log.info("Collecting step outputs from {}".format(step_name))
        for output in cwl_cmd_ln_tool.outputs:
            if output.type == "File":
                wf_file = File(wf_files[get_name(step.id, output.id)])

                job.add_outputs(wf_file)
                log.info("Step: {} added output file: {}".format(
                    step_name, wf_file.lfn))
            else:
                raise NotImplementedError(
                    "Support for output types other than File is in development"
                )

        # add job args
        args = (cwl_cmd_ln_tool.arguments
                if cwl_cmd_ln_tool.arguments is not None else list())

        # args will be added in the order of their assigned inputBinding
        def get_input_binding(_input):
            key = 0
            if hasattr(_input, "inputBinding") and hasattr(
                    _input.inputBinding, "position"):
                key = _input.inputBinding.position

            return key if key else 0

        cwl_cmd_ln_tool_inputs = sorted(cwl_cmd_ln_tool.inputs,
                                        key=get_input_binding)

        for _input in cwl_cmd_ln_tool_inputs:
            # indicates whether or not input will appear in args
            if _input.inputBinding is not None:
                prefix = _input.inputBinding.prefix
                separate = _input.inputBinding.separate

                current_arg = ""
                if prefix:
                    current_arg += prefix

                if separate:
                    current_arg += " "

                if _input.type == "File":
                    current_arg += wf_files[step_inputs[get_name(
                        step.id, _input.id)]]
                elif _input.type == "string":
                    current_arg += wf_input_str[step_inputs[get_name(
                        step.id, _input.id)]]

                # TODO: provide better support for array inputs being used in args (see https://www.commonwl.org/user_guide/09-array-inputs/index.html)
                elif isinstance(_input.type, cwl.CommandInputArraySchema):
                    separator = (" "
                                 if _input.inputBinding.itemSeparator is None
                                 else _input.inputBinding.itemSeparator)

                    if _input.type.items == "File":
                        current_arg += separator.join(
                            wf_files[f]
                            for f in step_inputs[get_name(step.id, _input.id)])
                    elif _input.type.items == "string":

                        current_arg += separator.join(
                            wf_input_str[step_inputs[get_name(
                                step.id, _input.id)]])

                args.append(current_arg)

        job.add_args(*args)
        wf.add_jobs(job)

        log.info("Added job: {}".format(step.run))
        log.info("\tcmd: {}".format(job.transformation))
        log.info("\targs: {}".format(job.args))
        log.info("\tinputs: {}".format([f.lfn for f in job.get_inputs()]))
        log.info("\toutputs: {}".format([f.lfn for f in job.get_outputs()]))

    log.info("Building workflow complete. {} jobs added".format(len(wf.jobs)))

    return wf
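
A hedged usage sketch for build_pegasus_wf(), reusing the cwl module alias from the code above. The file names, the wf_files LFN mapping, and the wf_input_str mapping are hypothetical, and Workflow.write() is assumed to be the Pegasus 5.x serialization call.

cwl_wf = cwl.load_document("workflow.cwl")          # hypothetical CWL workflow file
wf_files = {"input_fasta": "input.fasta",           # CWL input name -> Pegasus LFN
            "tar_output": "output.tar.gz"}
wf_input_str = {"compression_flag": "-z"}           # CWL string inputs

pegasus_wf = build_pegasus_wf(cwl_wf, wf_files, wf_input_str)
pegasus_wf.write("workflow.yml")                    # serialize the converted workflow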
Example #11
    def mapEachInterval(self, alignmentData=None, intervalData=None, chromosome=None,
        VCFJobData=None, passingData=None, reduceBeforeEachAlignmentData=None,
        mapEachChromosomeData=None, transferOutput=False, \
        **keywords):
        """
        2013.03.31 use VCFJobData to decide whether to add BQSR jobs, called in ShortRead2Alignment.py
        2012.9.17
        """
        returnData = PassingData(no_of_jobs=0)
        returnData.jobDataLs = []

        topOutputDirJob = passingData.topOutputDirJob

        alignment = alignmentData.alignment
        bamF = alignmentData.bamF
        baiF = alignmentData.baiF
        bamFnamePrefix = passingData.bamFnamePrefix

        #SNPVCFFile = VCFJobData.file
        #if SNPVCFFile is None or VCFJobData is None:
        # #2013.04.09	BQSR requires a VCF input regardless of the chromosome
        #	VCFJobData = self.randomSNPVCFJobDataForBQSR

        #SNPVCFFile = VCFJobData.file
        #SNPVCFJobLs = VCFJobData.jobLs

        if intervalData.file:
            mpileupInterval = intervalData.interval
            bcftoolsInterval = intervalData.file
        else:
            mpileupInterval = intervalData.interval
            bcftoolsInterval = intervalData.interval
        intervalFileBasenameSignature = intervalData.intervalFileBasenameSignature
        overlapInterval = intervalData.overlapInterval
        overlapFileBasenameSignature = intervalData.overlapIntervalFileBasenameSignature
        span = intervalData.span

        if chromosome is None:
            chromosome = getattr(passingData, 'chromosome', None)

        median_depth = getattr(alignment, 'median_depth', 4)
        readSpace = median_depth * span
        #base is 4X coverage in 20Mb region => 120 minutes
        reduceReadsJobWalltime = self.scaleJobWalltimeOrMemoryBasedOnInput(
            realInputVolume=readSpace, \
            baseInputVolume=4*20000000, baseJobPropertyValue=60, \
            minJobPropertyValue=60, maxJobPropertyValue=500).value
        #base is 4X, => 5000M
        reduceReadsJobMaxMemory = self.scaleJobWalltimeOrMemoryBasedOnInput(
            realInputVolume=median_depth, \
            baseInputVolume=4, baseJobPropertyValue=4000, \
            minJobPropertyValue=4000, maxJobPropertyValue=8000).value

        reduceReadsBamFile = File(os.path.join(topOutputDirJob.output, \
            '%s_%s.reduceReads.bam'%\
            (bamFnamePrefix, overlapFileBasenameSignature)))
        #Default downsampling setting is 40 in GATK 2.4.9
        # this downsampling happens at the ReadWalker level,
        #extraArgumentList= ["--downsample_to_coverage 250", "--downsampling_type BY_SAMPLE"]

        extraArgumentList = ["--downsample_coverage 250"]
        #This level of downsampling only happens after the region has been evaluated,
        #  therefore it can be combined with the engine level downsampling.

        reduceReadsJob = self.addGATKJob(executable=self.ReduceReadsJava,
            GenomeAnalysisTKJar=self.GenomeAnalysisTK2Jar, \
            GATKAnalysisType='ReduceReads',\
            inputFile=bamF, inputArgumentOption="-I",
            refFastaFList=passingData.refFastaFList, inputFileList=None,\
            argumentForEachFileInInputFileList=None,\
            interval=overlapInterval, outputFile=reduceReadsBamFile, \
            parentJobLs=alignmentData.jobLs, transferOutput=False, \
            job_max_memory=reduceReadsJobMaxMemory,\
            frontArgumentList=None, extraArguments=None, \
            extraArgumentList=extraArgumentList, \
            extraOutputLs=[], \
            extraDependentInputLs=[baiF], no_of_cpus=None, \
            walltime=reduceReadsJobWalltime)
        indexBamJob = self.addBAMIndexJob(
            BuildBamIndexFilesJava=self.BuildBamIndexFilesJava, \
            BuildBamIndexJar=self.BuildBamIndexJar, \
            inputBamF=reduceReadsJob.output,\
            parentJobLs=[reduceReadsJob], \
            transferOutput=False, job_max_memory=3000, \
            walltime=max(120, int(reduceReadsJobWalltime/3)))
        passingData.alignmentJobAndOutputLs.append(PassingData(
            jobLs=[reduceReadsJob, indexBamJob], \
            file=reduceReadsJob.output, fileLs=[reduceReadsJob.output]))
        return returnData
Example #12
    def doAllAccurityAlignmentJob(self, data_dir=None, normal_bam_bai=None,
        pair_bam_file_list = None,
        outputDirPrefix=None, parentJobLs=None,
        AccurityFolder=None, AccurityFolderJob=None):
        print("Adding Accurity jobs for %s pair individual sequences ..." % \
            (len(pair_bam_file_list)), flush=True)
        jobLs = []
        for pair_bam in pair_bam_file_list:
            tumor_bam = pair_bam[0]
            tumor_bam_bai = parentJobLs[1].baiFile
            normal_bam = pair_bam[1]

            if tumor_bam is None or normal_bam is None:
                sys.stderr.write("Error: the tumor/normal bam pair is incomplete "
                    "(one of the bam files is missing).\n")
                sys.exit(2)
            #tumor_bam_path = os.path.join(data_dir, tumor_bam)
            #tumor_bai_path = tumor_bam_path + ".bai"
            #normal_bam_path = os.path.join(data_dir, normal_bam)
            #normal_bai_path = normal_bam_path + ".bai"
            Accurity_configure_path = os.path.dirname(self.AccurityPath) + "/configure"

            outputList = []
            sample_id = os.path.splitext(os.path.basename(tumor_bam.name))[0]
            sample_folder = AccurityFolder + "/" + sample_id

            sample_folder_Job = self.addMkDirJob(outputDir=sample_folder,
                parentJobLs=parentJobLs + [AccurityFolderJob])
            outputList.append(File(sample_folder + "/infer.out.tsv"))
            outputList.append(File(sample_folder + "/infer.out.details.tsv"))
            outputList.append(File(sample_folder + "/auto.tsv"))
            outputList.append(File(sample_folder + "/cnv.plot.pdf"))
            outputList.append(File(sample_folder + "/cnv.output.tsv"))
            outputList.append(File(sample_folder + "/rc_ratio_window_count_smoothed.tsv"))
            outputList.append(File(sample_folder + "/rc_ratio_no_of_windows_by_chr.tsv"))
            outputList.append(File(sample_folder + "/cnv.intervel.tsv"))
            outputList.append(File(sample_folder + "/major_allele_fraction_exp_vs_obs.tsv"))
            outputList.append(File(sample_folder + "/peak_bounds.tsv"))
            outputList.append(File(sample_folder + "/rc_logLikelihood.log.tsv"))
            outputList.append(File(sample_folder + "/rc_ratios_of_peaks_based_on_period_from_autocor.tsv"))
            outputList.append(File(sample_folder + "/runTime.log.txt"))

            #tumor_bam_file = self.registerOneInputFile(tumor_bam_path)
            #tumor_bai_file = self.registerOneInputFile(tumor_bai_path)
            #normal_bam_file = self.registerOneInputFile(normal_bam_path)
            #normal_bai_file = self.registerOneInputFile(normal_bai_path)
            configure_file = self.registerOneInputFile(Accurity_configure_path)
            argumentList = ["-c", configure_file, "-t", tumor_bam, "-n", normal_bam, "-o", sample_folder, "-d", "1", "-l", "4"]
            inputFileList = [tumor_bam, tumor_bam_bai, normal_bam, normal_bam_bai, configure_file]

            job = self.addPurityJobToWorkflow(executable=self.AccurityExecutableFile,\
                argumentList=argumentList, \
                inputFileList=inputFileList, outputFileList=outputList, \
                parentJobLs=[sample_folder_Job], \
                job_max_memory=10000, no_of_cpus=8, walltime=400, sshDBTunnel=0)
            jobLs.append(job)
        return jobLs
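
For orientation, argumentList above corresponds roughly to the Accurity invocation sketched below. The executable name and paths are illustrative; the real command is composed by addPurityJobToWorkflow().

accurity_cmd = ["Accurity",
    "-c", "configure",                   # configuration file registered above
    "-t", "tumor.bam",                   # tumor alignment
    "-n", "normal.bam",                  # matched normal alignment
    "-o", "AccurityResult/sample1",      # per-sample output folder
    "-d", "1", "-l", "4"]
print(" ".join(accurity_cmd))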
Example #13
    def run(self):
        """
        """
        self.setup_run()

        isq_id2LibrarySplitOrder2FileLs = self.db_main.getISQ_ID2LibrarySplitOrder2FileLs(
            self.ind_seq_id_ls,
            data_dir=self.data_dir,
            filtered=0,
            ignoreEmptyReadFile=False)
        to_work_ind_seq_id_set = set()
        parent_individual_sequence_file_id_set = set()
        for ind_seq_id, LibrarySplitOrder2FileLs in isq_id2LibrarySplitOrder2FileLs.items(
        ):
            parent_individual_sequence = self.db_main.queryTable(
                SunsetDB.IndividualSequence).get(ind_seq_id)
            if parent_individual_sequence is not None and parent_individual_sequence.format == 'fastq':
                """
                check if the child individual_sequence already exists in db or not.
                if it does, what about its files?? if not, go add filtering jobs.
                """
                # 2012.6.8
                individual_sequence = self.db_main.copyParentIndividualSequence(
                    parent_individual_sequence=parent_individual_sequence,
                    parent_individual_sequence_id=ind_seq_id,
                    quality_score_format='Standard',
                    filtered=1,
                    data_dir=self.data_dir)
                library_split_order2filtered_db_entry_ls = self.getLibrarySplitOrder2DBEntryLs(
                    individual_sequence)

                sequenceOutputDirJob = None
                filteredReadOutputDirJob = None
                for key, fileObjLs in LibrarySplitOrder2FileLs.items():
                    if key in library_split_order2filtered_db_entry_ls:
                        sys.stderr.write(
                            "Warning: this pair of filtered individual_sequence_file(s), "
                            f"{repr(key)}, parent_individual_sequence "
                            f"(id={parent_individual_sequence.id}, {parent_individual_sequence.individual.code}), "
                            f"individual_sequence (id={individual_sequence.id}, {individual_sequence.individual.code}) "
                            "are already in db. skip.\n")
                        continue
                    else:
                        if sequenceOutputDirJob is None:
                            sequenceOutputDir = os.path.join(
                                self.data_dir, individual_sequence.path)
                            sequenceOutputDirJob = self.addMkDirJob(
                                outputDir=sequenceOutputDir)
                        if filteredReadOutputDirJob is None:
                            filteredReadOutputDir = os.path.join(
                                os.path.basename(individual_sequence.path))
                            filteredReadOutputDirJob = self.addMkDirJob(
                                outputDir=filteredReadOutputDir)

                    # add filter jobs
                    extraDependentInputLs = []
                    extraOutputLs = []
                    extraArgumentList = [
                        "-a", self.adapter, "-j", self.no_of_threads,
                        "--quality-base", self.quality_base, "-m",
                        self.minimum_length
                    ]
                    if self.adapter2 is not None:
                        extraArgumentList.extend(["-A", self.adapter2])
                    if self.maximum_length is not None:
                        extraArgumentList.extend(["-M", self.maximum_length])
                    if self.trim_n:
                        extraArgumentList.append("--trim-n")
                    if self.quality_cutoff is not None:
                        extraArgumentList.extend(["-q", self.quality_cutoff])
                    input_fastq_list = []
                    for i in range(len(fileObjLs)):
                        fileObj = fileObjLs[i]
                        try:  # 2012.7.2
                            inputFile = self.registerOneInputFile(
                                input_path=fileObj.path,
                                folderName='inputIndividualSequenceFile')
                        except Exception:
                            sys.stderr.write(
                                f"Error: failed to register input file {fileObj.path}.\n")
                            raise
                        # take the base filename as the output filename. it'll be in scratch/.
                        outputFname = os.path.join(
                            filteredReadOutputDir,
                            os.path.basename(fileObj.path))
                        outputFile = File(outputFname)
                        extraDependentInputLs.append(inputFile)
                        extraOutputLs.append(outputFile)
                        if i == 0:  # 1st mate
                            input_fastq_list.append(inputFile)
                            extraArgumentList.extend(["-o", outputFile])
                        elif i == 1:  # 2nd mate
                            input_fastq_list.append(inputFile)
                            extraArgumentList.extend(["-p", outputFile])
                        else:
                            sys.stderr.write(
                                "Error: unexpected mate %s (>2 files) in paired-end data "
                                "(individualSequenceID=%s).\n" % (i + 1, ind_seq_id))
                            sys.exit(4)

                    extraArgumentList.extend(input_fastq_list)
                    filterShortRead_job = self.addFilterReadJob(
                        executable=self.cutadapt,
                        extraOutputLs=extraOutputLs,
                        parentJobLs=[filteredReadOutputDirJob],
                        job_max_memory=2000,
                        walltime=120,
                        extraDependentInputLs=extraDependentInputLs,
                        extraArgumentList=extraArgumentList,
                        no_of_cpus=self.no_of_threads,
                        transferOutput=False)
                    for fileObj, outputFile in zip(fileObjLs, extraOutputLs):
                        logFile = File(
                            '%s_%s.register.log' %
                            (individual_sequence.id, fileObj.db_entry.id))
                        addFilteredSequences2DB_job = self.addAddFilteredSequences2DB_job(
                            executable=self.AddFilteredSequences2DB,
                            inputFile=outputFile,
                            individual_sequence_id=individual_sequence.id,
                            outputDir=sequenceOutputDir,
                            logFile=logFile,
                            parent_individual_sequence_file_id=fileObj.
                            db_entry.id,
                            parentJobLs=[
                                sequenceOutputDirJob, filterShortRead_job
                            ],
                            commit=self.commit,
                            extraDependentInputLs=None,
                            transferOutput=True,
                            sshDBTunnel=self.needSSHDBTunnel)
                    to_work_ind_seq_id_set.add(ind_seq_id)
                    parent_individual_sequence_file_id_set.add(
                        fileObj.db_entry.id)
        sys.stderr.write(
            f"{self.no_of_jobs} jobs, {len(to_work_ind_seq_id_set)} individual_sequence entries, "
            f"{len(parent_individual_sequence_file_id_set)} parent_individual_sequence_file_id s.\n"
        )

        self.end_run()
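
For one paired-end library, the extraArgumentList built above corresponds roughly to the cutadapt call sketched below. Adapter sequences and file names are illustrative; -A, -M, --trim-n and -q appear only when the corresponding options are set.

cutadapt_cmd = ["cutadapt",
    "-a", "ADAPTER_FWD", "-j", "4", "--quality-base", "33", "-m", "20",
    "-A", "ADAPTER_REV",                     # only if self.adapter2 is set
    "-q", "20",                              # only if self.quality_cutoff is set
    "-o", "filtered/read_1.fastq.gz",        # 1st-mate output
    "-p", "filtered/read_2.fastq.gz",        # 2nd-mate output
    "read_1.fastq.gz", "read_2.fastq.gz"]    # registered input fastq files
print(" ".join(cutadapt_cmd))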
Example #14
    pipe2File = pegaflow.registerExecutable(wflow,
                                            pipe2File_path,
                                            args.site_handler,
                                            cluster_size=args.cluster_size)
    mergeWC = pegaflow.registerExecutable(wflow,
                                          pipe2File_path,
                                          args.site_handler,
                                          executableName='mergeWC',
                                          cluster_size=args.cluster_size)
    sleep = pegaflow.registerExecutable(wflow,
                                        "/bin/sleep",
                                        args.site_handler,
                                        cluster_size=args.cluster_size)

    mergedOutputFile = File("merged.txt")
    # request 500MB memory, 30 minutes run time (walltime).
    mergeJob = pegaflow.addJob2workflow(
        wflow,
        mergeWC,
        argv=[mergedOutputFile, '/bin/cat'],
        input_file_list=None,
        output_file_transfer_list=[mergedOutputFile],
        output_file_notransfer_list=None,
        job_max_memory=500,
        walltime=30)

    mkdir = pegaflow.registerExecutable(wflow, '/bin/mkdir', args.site_handler)
    outputDir = 'output'
    outputDirJob = pegaflow.addMkDirJob(wflow, mkdir, outputDir)
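
A hedged continuation of the fragment above, reusing only the pegaflow calls already shown: the registered sleep executable is turned into a job with the same addJob2workflow() signature used for mergeJob. The argument and resource values are placeholders.

sleepJob = pegaflow.addJob2workflow(
    wflow,
    sleep,
    argv=['30'],                        # sleep for 30 seconds
    input_file_list=None,
    output_file_transfer_list=None,
    output_file_notransfer_list=None,
    job_max_memory=100,
    walltime=10)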
Example #15
    def run(self):
        ## setup_run() will call registerExecutables()
        self.setup_run()

        # Register all .py files from the input folder
        #  self.registerOneInputFile('/tmp/abc.txt') can be used to register
        #  one input file.
        inputData = self.registerFilesOfInputDir(
            inputDir=self.input_path,
            input_site_handler=self.input_site_handler,
            inputSuffixSet=self.inputSuffixSet,
            pegasusFolderName='input')

        # Pegasus jobs do NOT allow pipes. So use pipe2File (already
        #   registered in Workflow.py).
        # register wc and cat as they will be used by pipe2File.
        wcCommand = self.registerOneExecutableAsFile(path="/usr/bin/wc")
        catCommand = self.registerOneExecutableAsFile(path="/bin/cat")

        mergedOutputFile = File("merged.txt")
        # request 500MB memory, 30 minutes run time (walltime).
        # executable=self.mergeWC tells this function to use a different
        #  executable.
        #  In order to give this job a different name.
        #  If executable=None or not given, self.pipe2File is used.
        mergeJob = self.addPipe2FileJob(executable=self.mergeWC,
                                        commandFile=catCommand,
                                        outputFile=mergedOutputFile,
                                        transferOutput=True,
                                        job_max_memory=500,
                                        walltime=30)

        outputDir = 'output'
        outputDirJob = self.addMkDirJob(outputDir)
        for jobData in inputData.jobDataLs:
            outputFile = File(
                os.path.join(
                    outputDir,
                    f'{os.path.basename(jobData.file.name)}.wc.output.txt'))
            ## wc each input file
            # Argument "executable" is not given, use self.pipe2File.
            wcJob = self.addPipe2FileJob(commandFile=wcCommand,
                                         outputFile=outputFile,
                                         parentJob=None,
                                         parentJobLs=[outputDirJob],
                                         extraArgumentList=[jobData.file],
                                         extraDependentInputLs=[jobData.file],
                                         extraOutputLs=None,
                                         transferOutput=False)
            # add wcJob.output (outputFile passed to addPipe2FileJob() above)
            #   as the input of mergeJob.
            #   It appends the input to the end of the job's existing arguments.
            #   wcJob.output will be a dependent input of mergeJob.
            # addInputToMergeJob() also adds wcJob as a parent of mergeJob.
            self.addInputToMergeJob(mergeJob=mergeJob,
                                    inputF=wcJob.output,
                                    inputArgumentOption="",
                                    parentJobLs=[wcJob])
        # a sleep job to slow down the workflow for 30 seconds
        # sleepJob has no output.
        sleepJob = self.addGenericJob(executable=self.sleep,
                                      extraArgumentList=[30])
        # add sleepJob as mergeJob's parent.
        self.addInputToMergeJob(mergeJob=mergeJob, parentJobLs=[sleepJob])

        # end_run() will output the DAG to output_path
        self.end_run()
    def mapEachInterval(self,
                        VCFJobData=None,
                        passingData=None,
                        transferOutput=False,
                        **keywords):
        """
        use VCFJobData
        
        #. extract flanking sequences from the input VCF (ref sequence file => contig ref sequence)
        #. blast them
        #. run FindSNPPositionOnNewRefFromFlankingBlastOutput.py
            #. a hit is good if its length matches the query length and it has no more than 2 mismatches => infer new coordinates
        #. output a mapping file between old and new SNP coordinates.
            #. reduce this step by combining everything
        #. make a new VCF file based on the input split VCF file
            #. (replace contig IDs and positions with the new ones; remove or replace the header lines regarding chromosomes)
        """

        returnData = PassingData(no_of_jobs=0)
        returnData.jobDataLs = []

        topOutputDirJob = passingData.topOutputDirJob
        mapDirJob = passingData.mapDirJob
        reduceOutputDirJob = passingData.reduceOutputDirJob

        intervalFileBasenamePrefix = passingData.intervalFileBasenamePrefix
        jobData = passingData.jobData
        VCFFile = VCFJobData.file

        splitVCFJob = passingData.mapEachVCFData.splitVCFJob
        chromosome = passingData.chromosome

        # a flanking sequence extraction job
        #noOfIndividuals
        realInputVolume = passingData.noOfIndividuals * passingData.span
        baseInputVolume = 600 * 2000  #600 individuals at 2000 sites
        #base is 200 individual X 2Mb region => 120 minutes
        walltime = self.scaleJobWalltimeOrMemoryBasedOnInput(
            realInputVolume=realInputVolume,
            baseInputVolume=baseInputVolume,
            baseJobPropertyValue=60,
            minJobPropertyValue=60,
            maxJobPropertyValue=1200).value
        #base is 4X, => 5000M
        job_max_memory = self.scaleJobWalltimeOrMemoryBasedOnInput(
            realInputVolume=realInputVolume,
            baseInputVolume=baseInputVolume,
            baseJobPropertyValue=4000,
            minJobPropertyValue=4000,
            maxJobPropertyValue=8000).value

        outputFnamePrefix = os.path.join(
            mapDirJob.output,
            '%s.sameSite.concordance' % (intervalFileBasenamePrefix))
        outputFile = File('%s.tsv' % (outputFnamePrefix))

        returnData.mapJob = self.addAbstractMapperLikeJob(
            executable=self.CalculateSameSiteConcordanceInVCF,
            inputF=VCFFile,
            outputF=outputFile,
            parentJobLs=[mapDirJob] + VCFJobData.jobLs,
            transferOutput=transferOutput,
            job_max_memory=job_max_memory,
            walltime=walltime)

        return returnData
Example #17
    def addDownsamplejob(self, data_dir=None, idDict=None,
        DownSamplePrefix=None,
        downSampleJava=None, downSampleJar=None, transferOutput=False):
        AccurityFolder = "AccurityResult"
        AccurityFolderJob = self.addMkDirJob(outputDir=AccurityFolder)

        sys.stderr.write("Adding downsample jobs for %s individual sequences ..." % (len(idDict)))
        SampleFolder = "%swithSeed1.0" % (DownSamplePrefix)
        SampleFolderJob = self.addMkDirJob(outputDir=SampleFolder)

        alignNormal = self.db_main.queryTable(SunsetDB.IndividualAlignment).get(idDict['normalFile'])
        alignNormalFilePath = os.path.join(data_dir, alignNormal.path)
        inputNormalBamFile = self.registerOneInputFile(alignNormalFilePath)
        coverageNormal = int(alignNormal.mean_depth)
        #alignNormalIndiv = self.db_main.queryTable(SunsetDB.IndividualSequence).get(alignNormal.ind_seq_id)
        #coverageNormal = int(alignNormalIndiv.coverage)

        alignTumor = self.db_main.queryTable(SunsetDB.IndividualAlignment).get(idDict['tumorFile'])
        alignTumorFilePath = os.path.join(data_dir, alignTumor.path)
        inputTumorBamFile = self.registerOneInputFile(alignTumorFilePath)
        coverageTumor = int(alignTumor.mean_depth)
        #alignTumorIndiv = self.db_main.queryTable(SunsetDB.IndividualSequence).get(alignTumor.ind_seq_id)
        #coverageTumor = int(alignTumorIndiv.coverage)

        job_max_memory = "5000"
        walltime = '600'
        jobLs = []
        for i in range(1, 10):
            pair_bam_file_list = []
            probNormal = float(i) / float(coverageNormal)
            probTumor = float(10 - i) / float(coverageTumor)
            outputNormalFile = File(os.path.join(SampleFolder, str(probNormal) + "_normal_downsample.bam"))
            outputTumorFile = File(os.path.join(SampleFolder, str(probTumor) + "_tumor_downsample.bam"))
            mergeJobAndOutputLs = []
            normal_down_sample_job = self.addGenericJavaJob(
                executable=downSampleJava,
                jarFile=downSampleJar,
                inputFile=inputNormalBamFile, inputArgumentOption="INPUT=",
                inputFileList=None,
                argumentForEachFileInInputFileList=None,
                outputFile=outputNormalFile, outputArgumentOption="OUTPUT=",
                parentJobLs=[SampleFolderJob], transferOutput=False,
                job_max_memory=job_max_memory,
                frontArgumentList=['DownsampleSam'], extraArguments=None,
                extraArgumentList=['PROBABILITY=' + str(probNormal),
                                   'RANDOM_SEED=1',
                                   'STRATEGY=ConstantMemory',
                                   'VALIDATION_STRINGENCY=LENIENT'],
                extraOutputLs=None, \
                extraDependentInputLs=None, no_of_cpus=None, walltime=walltime,
                sshDBTunnel=None)
            mergeJobAndOutputLs.append(PassingData(
                jobLs=[normal_down_sample_job], file=outputNormalFile))

            tumor_down_sample_job = self.addGenericJavaJob(
                executable=downSampleJava,
                jarFile=downSampleJar,
                inputFile=inputTumorBamFile, inputArgumentOption="INPUT=",
                inputFileList=None,
                argumentForEachFileInInputFileList=None,
                outputFile=outputTumorFile, outputArgumentOption="OUTPUT=",
                parentJobLs=[SampleFolderJob],
                transferOutput=False,
                job_max_memory=job_max_memory,
                frontArgumentList=['DownsampleSam'], extraArguments=None,
                extraArgumentList=['PROBABILITY=' + str(probTumor),
                                   'RANDOM_SEED=1',
                                   'STRATEGY=ConstantMemory',
                                   'VALIDATION_STRINGENCY=LENIENT'],
                extraOutputLs=None, \
                extraDependentInputLs=None, no_of_cpus=None,
                walltime=walltime,
                sshDBTunnel=None)
            mergeJobAndOutputLs.append(PassingData(jobLs=[tumor_down_sample_job], file=outputTumorFile))

            puritySampleFolder = "puritySample"
            SampleFolderJob = self.addMkDirJob(outputDir=puritySampleFolder)
            purity = str((10-i) * 0.1)
            purityDir = "purity" + str(purity)
            purityFolderJob = self.addMkDirJob(outputDir=os.path.join(puritySampleFolder,purityDir))
            mergedBamFile = File(os.path.join(puritySampleFolder,purityDir, "purity_"+ purity + ".bam"))
            baseCoverage = 4 * 3000000000  # baseline
            minMergeAlignmentWalltime = 240
            # in minutes, 4 hours, when coverage is defaultCoverage
            maxMergeAlignmentWalltime = 2980  # in minutes, 2 days
            minMergeAlignmentMaxMemory = 8000
            # in MB, when coverage is defaultCoverage
            maxMergeAlignmentMaxMemory = 21000  # in MB

            mergeAlignmentWalltime = self.scaleJobWalltimeOrMemoryBasedOnInput(
                realInputVolume=max(i, 10-i) * 3000000000,
                baseInputVolume=baseCoverage,
                baseJobPropertyValue=minMergeAlignmentWalltime,
                minJobPropertyValue=minMergeAlignmentWalltime,
                maxJobPropertyValue=maxMergeAlignmentWalltime).value
            mergeAlignmentMaxMemory = self.scaleJobWalltimeOrMemoryBasedOnInput(
                realInputVolume=max(i, 10-i) * 3000000000, \
                baseInputVolume=baseCoverage,
                baseJobPropertyValue=minMergeAlignmentMaxMemory,
                minJobPropertyValue=minMergeAlignmentMaxMemory,
                maxJobPropertyValue=maxMergeAlignmentMaxMemory).value

            MergeJob, bamIndexJob = self.addAlignmentMergeJob(
                alignmentJobAndOutputLs=mergeJobAndOutputLs,
                outputBamFile=mergedBamFile,
                needBAMIndexJob=True,
                parentJobLs=[SampleFolderJob, purityFolderJob],
                transferOutput=transferOutput,
                job_max_memory=mergeAlignmentMaxMemory,
                walltime=mergeAlignmentWalltime)
            normal_part_refer = self.registerOneInputFile(
                inputFname="/y/Sunset/workflow/real_data/downsample/normal_0.2.bam", 
                folderName=os.path.join(puritySampleFolder,purityDir))
            normal_bam_bai = self.registerOneInputFile(
                inputFname="/y/Sunset/workflow/real_data/downsample/normal_0.2.bam.bai",
                folderName=os.path.join(puritySampleFolder,purityDir))
            pair_bam_file_list.append([mergedBamFile, normal_part_refer])
            accurityJobLs = self.doAllAccurityAlignmentJob(data_dir=None,
                normal_bam_bai=normal_bam_bai,
                pair_bam_file_list=pair_bam_file_list,
                outputDirPrefix=None, parentJobLs=[MergeJob, bamIndexJob],
                AccurityFolder=AccurityFolder,
                AccurityFolderJob=AccurityFolderJob)
            jobLs.extend(accurityJobLs)

        return jobLs
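
The downsampling arithmetic above, in isolation: at step i the normal and tumor bams are downsampled to roughly i X and (10-i) X respectively, so the merged bam approximates a tumor purity of (10-i)/10. The coverages in the example call are illustrative.

def downsample_fractions(i, coverageNormal, coverageTumor):
    probNormal = float(i) / float(coverageNormal)      # keep ~i X of the normal bam
    probTumor = float(10 - i) / float(coverageTumor)   # keep ~(10-i) X of the tumor bam
    purity = (10 - i) * 0.1                            # expected tumor purity of the mix
    return probNormal, probTumor, purity

# e.g. a 30X normal and a 50X tumor at step i=4:
# downsample_fractions(4, 30, 50) -> (0.1333..., 0.12, 0.6)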
Example #18
 def addJobs(self, inputData=None, outputDirPrefix="", ntDatabaseFileList=None,
     noOfTotalSequences=None, transferOutput=True, makeBlastDBJob=None):
     """
     2012.5.24
     """
     
     sys.stderr.write("Adding blast jobs for %s input ... "%(len(inputData.jobDataLs)))
     no_of_jobs= 0
     
     topOutputDir = "%sBlast"%(outputDirPrefix)
     topOutputDirJob = self.addMkDirJob(outputDir=topOutputDir)
     no_of_jobs += 1
     
     allBlastResultFile = File(os.path.join(topOutputDir, 'blast.tsv'))
     allBlastMergeJob = self.addStatMergeJob(
         statMergeProgram=self.mergeSameHeaderTablesIntoOne, \
         outputF=allBlastResultFile, transferOutput=transferOutput,
         parentJobLs=[topOutputDirJob])
     no_of_jobs += 1
     
     ntDatabaseFile = ntDatabaseFileList[0]
     returnData = PassingData()
     returnData.jobDataLs = []
     
     for jobData in inputData.jobDataLs:
         inputF = jobData.output
         outputFnamePrefix = os.path.join(topOutputDir,
             os.path.splitext(os.path.basename(inputF.name))[0])
         
         splitFastaJob = self.addSplitFastaFileJob(executable=self.SplitFastaFile,
             inputFile=inputF, outputFnamePrefix=outputFnamePrefix, \
             noOfSequencesPerSplitFile=self.blockSize, filenameSuffix=".fasta",
             noOfTotalSequences=noOfTotalSequences,\
             parentJobLs=jobData.jobLs + [topOutputDirJob],
             extraDependentInputLs=None, transferOutput=False, \
             extraArguments=None, job_max_memory=500)
         no_of_jobs += 1
         for splitFastaOutput in splitFastaJob.outputList:
             outputFile = File('%s.tsv'%(splitFastaOutput.name))
             blastJob = self.addBlastWrapperJob(executable=self.BlastWrapper,
                 inputFile=splitFastaOutput, outputFile=outputFile,
                 outputFnamePrefix=splitFastaOutput.name ,
                 databaseFile=ntDatabaseFile,
                 maxNoOfMismatches=self.maxNoOfMismatches,
                 minNoOfIdentities=self.minNoOfIdentities,
                 minIdentityPercentage=self.minIdentityPercentage,
                 blastallPath=self.blastallPath,
                 parentJobLs=[splitFastaJob, makeBlastDBJob],
                 extraDependentInputLs=ntDatabaseFileList,
                 transferOutput=False, \
                 extraArguments=None, job_max_memory=1000)
             
             #add output to some reduce job
             self.addInputToMergeJob(allBlastMergeJob, \
                 inputF=blastJob.output, parentJobLs=[blastJob])
             no_of_jobs += 1
     sys.stderr.write("%s jobs. Done.\n"%(no_of_jobs))
     # expose the merged blast result (job, file, fileLs) through returnData
     returnData.jobDataLs.append(PassingData(jobLs=[allBlastMergeJob], file=allBlastResultFile, \
         fileLs=[allBlastResultFile]))
     return returnData
Example #19
    def addJobs(self,
                inputData=None,
                topOutputDir="output",
                needSSHDBTunnel=0):
        """
        2012.3.14
        """
        sys.stderr.write("Adding read counting jobs on %s input ..."%\
            (len(inputData.jobDataLs)))
        no_of_jobs = 0
        if topOutputDir:
            topOutputDirJob = self.addMkDirJob(outputDir=topOutputDir)
            no_of_jobs += 1
        else:
            topOutputDirJob = None

        finalReduceFile = File(
            os.path.join(topOutputDir, 'read_base_count.tsv'))

        readBaseCountMergeJob = self.addStatMergeJob(
            statMergeProgram=self.mergeSameHeaderTablesIntoOne,
            outputF=finalReduceFile,
            transferOutput=True,
            extraArguments=None,
            parentJobLs=[topOutputDirJob])

        logFile = File(os.path.join(topOutputDir,
                                    'PutReadBaseCountIntoDB.log'))
        putCountIntoDBJob = self.addPutReadBaseCountIntoDBJob(
            executable=self.PutReadBaseCountIntoDB,
            inputFileLs=[finalReduceFile],
            logFile=logFile,
            commit=self.commit,
            parentJobLs=[readBaseCountMergeJob],
            extraDependentInputLs=[],
            transferOutput=True,
            extraArguments=None,
            job_max_memory=10,
            sshDBTunnel=needSSHDBTunnel)
        no_of_jobs += 2
        for jobData in inputData.jobDataLs:
            #add the read count job
            outputFile = File(os.path.join(topOutputDir,
                'read_count_isq_%s_isqf_%s.tsv'%\
                (jobData.isq_id, jobData.isqf_id)))
            readCountJob = self.addCountFastqReadBaseCountJob(
                executable=self.CountFastqReadBaseCount,
                inputFile=jobData.output,
                outputFile=outputFile,
                isq_id=jobData.isq_id,
                isqf_id=jobData.isqf_id,
                parentJobLs=jobData.jobLs + [topOutputDirJob],
                extraDependentInputLs=None,
                transferOutput=False,
                extraArguments=None,
                job_max_memory=10,
                no_of_cpus=4)

            no_of_jobs += 1
            self.addInputToMergeJob(readBaseCountMergeJob,
                                    inputF=readCountJob.output,
                                    parentJobLs=[readCountJob])

        sys.stderr.write("%s jobs.\n" % (no_of_jobs))
        return putCountIntoDBJob
Example #20
0
 def preReduce(self, outputDirPrefix="", passingData=None, transferOutput=True, **keywords):
     """
     2012.9.17
     """
     returnData = ParentClass.preReduce(self, outputDirPrefix=outputDirPrefix,\
         passingData=passingData, transferOutput=transferOutput, **keywords)
     #add a stat merge job and a genome wide plot job
     outputFile = File(os.path.join(self.reduceOutputDirJob.output,
         'locusLiftOverProbability.tsv'))
     self.reduceJob = self.addStatMergeJob(
         statMergeProgram=self.mergeSameHeaderTablesIntoOne, \
         outputF=outputFile, \
         parentJobLs=[self.reduceOutputDirJob],
         extraDependentInputLs=None, transferOutput=False)
     
     sortProbabilityFile = File(os.path.join(self.reduceOutputDirJob.output,
         'locusLiftOverProbability.sorted.tsv'))
     sortProbabilityJob = self.addSortJob(inputFile=self.reduceJob.output,
         outputFile=sortProbabilityFile, \
         parentJobLs=[self.reduceJob], \
         extraOutputLs=None, transferOutput=False, \
         extraArgumentList=["""-k1,1 -k2,3n """], \
         sshDBTunnel=None,\
         job_max_memory=4000, walltime=120)
     #2013.12.3 The tab-delimiter syntax (-t$'\t') was removed because it can't be passed through correctly.
     #2013.12.3 Tried -t "`/bin/echo -e '\t'`" as well; that didn't work either.
     # Since no column field contains a blank, the default field separator
     #   (the transition from non-blank to blank) is sufficient.
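     # For reference (illustration only, assuming addSortJob() wraps the Unix sort
     #   utility, as the -k syntax above suggests), the invocation is roughly:
     #   sort -k1,1 -k2,3n locusLiftOverProbability.tsv > locusLiftOverProbability.sorted.tsv
     #   i.e. sort on column 1, then numerically on columns 2-3.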
     
     returnData.jobDataLs.append(self.constructJobDataFromJob(sortProbabilityJob))
     
     outputFile = File(os.path.join(self.plotDirJob.output, 'locusLiftOverProbability.png'))
     self.addPlotGenomeWideDataJob(inputFileList=None,
         inputFile=self.reduceJob.output,
         outputFile=outputFile,\
         whichColumn=None,
         whichColumnHeader="mapPvalue", whichColumnPlotLabel="mapPvalue", \
         logX=None, logY=2, valueForNonPositiveYValue=-1, \
         xScaleLog=None, yScaleLog=None,\
         missingDataNotation='NA',\
         xColumnPlotLabel="genomePosition", xColumnHeader="oldStart", \
         xtickInterval=0,\
         drawCentromere=True, chrColumnHeader="oldChromosome", \
         minChrLength=None, minNoOfTotal=None, maxNoOfTotal=None, \
         figureDPI=100, formatString=".", ylim_type=2,
         samplingRate=1, logCount=False, need_svg=True,\
         tax_id=self.ref_genome_tax_id,
         sequence_type_id=self.ref_genome_sequence_type_id, chrOrder=1,\
         inputFileFormat=1, outputFileFormat=None,\
         parentJobLs=[self.reduceJob], \
         extraDependentInputLs=None, \
         extraArguments=None, extraArgumentList=None, \
         transferOutput=True, job_max_memory=1000, sshDBTunnel=self.needSSHDBTunnel)
     #xtickInterval=0 means no ticks on x-axis.
     
     outputFile = File( os.path.join(self.plotDirJob.output, 'locusLiftOverProbabilityHist.png'))
     #No spaces, parentheses, or other shell-sensitive characters in the x- or
     #   y-axis labels (whichColumnPlotLabel, xColumnPlotLabel).
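     #   e.g. prefer a label like "minusLogLiftOverPvalue" over "-log10(P value)".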
     self.addDrawHistogramJob(executable=self.DrawHistogram,
         inputFileList=[self.reduceJob.output], \
         outputFile=outputFile, \
         whichColumnHeader="mapPvalue", whichColumnPlotLabel="minusLogLiftOverPvalue", \
         xScaleLog=0, yScaleLog=1, \
         logCount=False, logY=2, valueForNonPositiveYValue=50,\
         minNoOfTotal=10,\
         figureDPI=100, samplingRate=1,legendType=1, \
         parentJobLs=[self.plotDirJob, self.reduceJob], \
         extraDependentInputLs=None, \
         transferOutput=True, job_max_memory=8000)
     
     return returnData
Example #21
0
    def addAllJobs(self, \
                data_dir=None, \
                outputDirPrefix="", transferOutput=True, **keywords):
        """
        2013.2.27
            run ms
            estimate parameters from ms
            ms2SLiM
            SLiM forward simulator with estimated ms-parameters or take the output of ms as input
            SLiM2PolymorphismTableFile
            
            AddPopGenSimulation2DB.py
            
        """
        sys.stderr.write("Adding jobs for pop-gen simulation #jobs=%s... \n"%\
                            (self.no_of_jobs))

        returnData = PassingData()
        returnData.jobDataLs = []

        passingData = PassingData(fileBasenamePrefix=None, \
                    outputDirPrefix=outputDirPrefix, \
                    jobData=None,\
                    preReduceReturnData=None,\
                    association_group_key2orderIndex = {},\
                    association_group_key2resultList = {},\
                    association_group_key2reduceAssociationPeakJobMatrix = {},\
                    association_group_key2countAssociationLocusJobList = {},\
                    resultID2defineLandscapeJobData = {},
                    )

        preReduceReturnData = self.preReduce(outputDirPrefix=outputDirPrefix, \
                                    passingData=passingData, transferOutput=False,\
                                    **keywords)

        mapDirJob = preReduceReturnData.mapDirJob
        plotOutputDirJob = preReduceReturnData.plotOutputDirJob
        countAssociationLocusOutputDirJob = preReduceReturnData.countAssociationLocusOutputDirJob
        reduceOutputDirJob = preReduceReturnData.reduceOutputDirJob

        passingData.preReduceReturnData = preReduceReturnData

        #add output pedigree job

        for i in range(self.noOfReplicates):
            popGenSimulationFolderJob = self.addMkDirJob(outputDir=os.path.join(mapDirJob.output, 'popGenSim%s'%(i)), \
                                                        parentJobLs=[mapDirJob])
            #depending on user choice, use ms, sfs_code, SLiM, or an ms & SLiM combination
            msOutputFile = File(os.path.join(popGenSimulationFolderJob.output, \
                                    'sim%s_msOutput.txt.gz'%(i)))
            popSimulationJob = self.addMSSimulationJob(outputFile=msOutputFile, \
                                recombinationRate=self.recombinationRate, mutationRate=self.mutationRate, \
                                initialEffectivePopulationSize=self.initialEffectivePopulationSize, \
                                otherParametersPassedToPopGenSimulator=self.otherParametersPassedToPopGenSimulator, \
                                sampleSize=self.sampleSize, noOfLociToSimulate=self.noOfLociToSimulate, \
                                simulateLocusLengthList=self.simulateLocusLengthList, \
                                parentJobLs=[popGenSimulationFolderJob], \
                                extraDependentInputLs=None, extraOutputLs=None, \
                                transferOutput=False, extraArguments=None, extraArgumentList=None, \
                                job_max_memory=2000, walltime=180)

            #. convert ms pop-gen output 2 polymorphism-table file
            msOutputHDF5File = File(os.path.join(popGenSimulationFolderJob.output, \
                                    'sim%s_msOutput.h5'%(i)))
            msOutput2PolymorphismTableFileJob = self.addGenericJob(executable=self.msOutput2PolymorphismTableFile, \
                    inputFile=popSimulationJob.output, \
                    outputFile=msOutputHDF5File,\
                    parentJob=None, parentJobLs=[popGenSimulationFolderJob, popSimulationJob], \
                    extraDependentInputLs=None, extraOutputLs=None, transferOutput=False, \
                    frontArgumentList=None, \
                    extraArguments=None, \
                    extraArgumentList=None, job_max_memory=2000,  \
                    no_of_cpus=None, walltime=None)

            #. add polymorphism-table file to db
            logFile = File(
                os.path.join(popGenSimulationFolderJob.output,
                             "sim%s_2DB.log" % (i)))
            extraArgumentList = ["--r %s"%self.recombinationRate, "--rho %s"%popSimulationJob.rho, "--mu %s"%self.mutationRate,\
                                "--theta %s"%popSimulationJob.theta, "--n0 %s"%self.initialEffectivePopulationSize,\
                                "--no_of_populations 1", "--no_of_chromosomes %s"%self.sampleSize,\
                                "--chromosome_length %s"%popSimulationJob.locusLength,\
                                "--replicate_index %s"%(i)]
            """
            extraArgumentList.append("--parent_pop_gen_simulation_type_id %s"%self.parent_pop_gen_simulation_type_id)
            """
            simulation2DBJob = self.addPutStuffIntoDBJob(executable=self.AddPopGenSimulation2DB, \
                    inputFileList=[msOutput2PolymorphismTableFileJob.output], \
                    logFile=logFile, commit=True, \
                    parentJobLs=[popGenSimulationFolderJob, msOutput2PolymorphismTableFileJob], \
                    extraDependentInputLs=None, transferOutput=True, extraArguments=None, \
                    extraArgumentList=extraArgumentList,\
                    job_max_memory=10, sshDBTunnel=self.needSSHDBTunnel)
    def addAddRG2BamJobsAsNeeded(self, alignmentDataLs=None, transferOutput=True):
        """
        2011-9-15
            add a read group only when the alignment doesn't have it according to db record
            DBVervet.pokeBamReadGroupPresence() from misc.py helps to fill in db records if it's unclear.
        2011-9-14
            The read-group adding jobs will have a "move" part that overwrites
                the original bam&bai if site_handler and input_site_handler is same.
            For those alignment files that don't need to. It doesn't matter.
             pegasus will transfer/symlink them.
        """
        print(f"Adding add-read-group2BAM jobs for {len(alignmentDataLs)} "
            f"alignments if read group is not detected ... ", flush=True)
        job_max_memory = 3500	#in MB

        addRG2BamDir = None
        addRG2BamDirJob = None

        no_of_rg_jobs = 0
        returnData = []
        for alignmentData in alignmentDataLs:
            alignment = alignmentData.alignment
            parentJobLs = alignmentData.jobLs
            bamF = alignmentData.bamF
            baiF = alignmentData.baiF
            if alignment.read_group_added!=1:
                if addRG2BamDir is None:
                    addRG2BamDir = "addRG2Bam"
                    addRG2BamDirJob = self.addMkDirJob(outputDir=addRG2BamDir)
                # add RG to this bam
                sequencer = alignment.individual_sequence.sequencer
                read_group = alignment.getReadGroup()
                if sequencer=='454':
                    platform_id = 'LS454'
                elif sequencer=='GA':
                    platform_id = 'ILLUMINA'
                else:
                    platform_id = 'ILLUMINA'
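                # map sequencer to the read-group platform (RGPL) tag:
                #   454 -> LS454; everything else (including 'GA') is treated as ILLUMINA.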
                outputRGSAM = File(os.path.join(addRG2BamDir,\
                    os.path.basename(alignment.path)))
                addRGJob:Job = self.addJavaJob(self.AddOrReplaceReadGroupsJava,
                    jarFile=self.AddOrReplaceReadGroupsJar,
                    inputArgumentOption="INPUT=", inputFile=bamF,
                    outputArgumentOption="OUTPUT=", outputFile=outputRGSAM,
                    transferOutput=transferOutput,
                    extraArgumentList=['RGID=%s'%(read_group), 
                        'RGLB=%s'%(platform_id),
                        'RGPL=%s'%(platform_id), 'RGPU=%s'%(read_group), 'RGSM=%s'%(read_group),
                        'SORT_ORDER=coordinate', "VALIDATION_STRINGENCY=LENIENT"],
                    parentJobLs=parentJobLs,
                    extraDependentInputLs=[baiF], job_max_memory=job_max_memory)
                    #(adding the SORT_ORDER doesn't do sorting but it marks the header
                    #  as sorted so that BuildBamIndexJar won't fail.)
                if self.tmpDir:
                    addRGJob.add_args("TMP_DIR=%s"%self.tmpDir)
                if addRG2BamDirJob:
                    self.add_dependency(addRGJob, parents=[addRG2BamDirJob])

                index_sam_job = self.addBAMIndexJob(
                    inputBamF=outputRGSAM, parentJobLs=[addRGJob],
                    transferOutput=transferOutput, javaMaxMemory=2000)
                newAlignmentData = PassingData(alignment=alignment)
                newAlignmentData.jobLs = [index_sam_job, addRGJob]
                newAlignmentData.bamF = index_sam_job.bamFile
                newAlignmentData.baiF = index_sam_job.baiFile
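                # downstream consumers should depend on both jobs and use the
                #   re-read-grouped bam/bai exposed by the index job.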
                no_of_rg_jobs += 1
            else:
                newAlignmentData = alignmentData
            returnData.append(newAlignmentData)
        print(f"{no_of_rg_jobs} alignments need read-group addition.",
            flush=True)
        return returnData
Example #23
0
    def reduceAfterEachAlignment(self,
                                 passingData=None,
                                 transferOutput=False,
                                 data_dir=None,
                                 **keywords):
        """
        """
        returnData = PassingData(no_of_jobs=0)
        returnData.jobDataLs = []
        alignmentJobAndOutputLs = getattr(passingData,
                                          'alignmentJobAndOutputLs', [])
        bamFnamePrefix = passingData.bamFnamePrefix
        topOutputDirJob = passingData.topOutputDirJob
        individual_alignment = passingData.individual_alignment
        reduceOutputDirJob = passingData.reduceOutputDirJob

        if len(alignmentJobAndOutputLs) > 0:
            #2012.3.29	merge alignment output only when there is something to merge!
            #2013.04.09 create a new child alignment local_realigned =1, etc.
            new_individual_alignment = self.db.copyParentIndividualAlignment(
                parent_individual_alignment_id=individual_alignment.id,\
                data_dir=self.data_dir,
                local_realigned=individual_alignment.local_realigned,\
                reduce_reads=1)

            # replace read_group with the new one to each alignment job
            newAlignmentJobAndOutputLs = []
            for alignmentJobAndOutput in alignmentJobAndOutputLs:
                # add a AddReadGroup job
                alignmentJob, indexAlignmentJob = alignmentJobAndOutput.jobLs[:2]
                fileBasenamePrefix = os.path.splitext(
                    alignmentJob.output.name)[0]
                outputRGBAM = File("%s.isq_RG.bam" % (fileBasenamePrefix))
                # needBAMIndexJob=False because addAlignmentMergeJob()
                # does not need .bai.
                addRGJob = self.addReadGroupJob(
                    individual_alignment=new_individual_alignment,
                    inputBamFile=alignmentJob.output,
                    outputBamFile=outputRGBAM,
                    needBAMIndexJob=False,
                    parentJobLs=[alignmentJob, indexAlignmentJob],
                    extraDependentInputLs=alignmentJob.outputLs[1:],
                    job_max_memory=2500,
                    transferOutput=False)

                newAlignmentJobAndOutputLs.append(
                    PassingData(jobLs=[addRGJob], file=addRGJob.output))
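            # each re-read-grouped BAM produced above becomes one input of the
            #   merge job below.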
            mergedBamFile = File(
                os.path.join(reduceOutputDirJob.output,
                             '%s.merged.bam' % (bamFnamePrefix)))
            alignmentMergeJob, bamIndexJob = self.addAlignmentMergeJob(
                alignmentJobAndOutputLs=newAlignmentJobAndOutputLs,
                outputBamFile=mergedBamFile,
                needBAMIndexJob=True,
                parentJobLs=[reduceOutputDirJob],
                transferOutput=False)
            #2012.9.19 add/copy the alignment file to db-affiliated storage
            #add the metric file to AddAlignmentFile2DB.py as well
            #  (to be moved into db-affiliated storage)
            logFile = File(
                os.path.join(reduceOutputDirJob.output,
                             '%s_2db.log' % (bamFnamePrefix)))
            alignment2DBJob = self.addAlignmentFile2DBJob(
                executable=self.AddAlignmentFile2DB,
                inputFile=alignmentMergeJob.output,
                baiFile=bamIndexJob.baiFile,
                individual_alignment_id=new_individual_alignment.id,
                logFile=logFile,
                data_dir=data_dir,
                otherInputFileList=None,
                parentJobLs=[alignmentMergeJob, bamIndexJob],
                transferOutput=transferOutput,
                job_max_memory=2000,
                sshDBTunnel=self.needSSHDBTunnel,
                commit=True)
            self.no_of_jobs += 1
            returnData.jobDataLs.append(
                PassingData(jobLs=[alignment2DBJob],
                file=alignment2DBJob.logFile, \
                fileLs=[alignment2DBJob.logFile]))
        return returnData
    def addJobs(self, inputData=None, db_main=None, genotypeMethodShortName=None, commit=None,\
            data_dir=None, checkEmptyVCFByReading=False, transferOutput=True,\
            maxContigID=None, outputDirPrefix="", needSSHDBTunnel=False):
        """
        2012.5.9
        """
        sys.stderr.write("Adding VCF2DB jobs for %s vcf files ... " %
                         (len(inputData.jobDataLs)))

        topOutputDir = "%sVCF2DB" % (outputDirPrefix)
        topOutputDirJob = self.addMkDirJob(outputDir=topOutputDir)

        firstVCFFile = inputData.jobDataLs[0].vcfFile
        logFile = File(os.path.join(topOutputDir, 'AddGenotypeMethod2DB.log'))
        addGM2DBJob = self.addAddGenotypeMethod2DBJob(
            executable=self.AddGenotypeMethod2DB, inputFile=firstVCFFile, \
            genotypeMethodShortName=genotypeMethodShortName,\
            logFile=logFile, data_dir=data_dir, commit=commit, parentJobLs=None,
            extraDependentInputLs=None, transferOutput=True, \
            extraArguments=None, job_max_memory=10, sshDBTunnel=needSSHDBTunnel)
        updateGMlogFile = File(os.path.join(topOutputDir, 'updateGM.log'))
        updateGMNoOfLociJob = self.addUpdateGenotypeMethodNoOfLociJob(
            executable=self.UpdateGenotypeMethodNoOfLoci, \
            genotypeMethodShortName=genotypeMethodShortName,\
            logFile=updateGMlogFile, data_dir=data_dir, commit=commit,
            parentJobLs=[topOutputDirJob], \
            extraDependentInputLs=[], transferOutput=True, \
            extraArguments=None, job_max_memory=20, sshDBTunnel=needSSHDBTunnel)

        returnData = PassingData()
        returnData.jobDataLs = []
        for jobData in inputData.jobDataLs:
            inputF = jobData.vcfFile
            if maxContigID:
                contig_id = self.getContigIDFromFname(inputF.name)
                try:
                    contig_id = int(contig_id)
                    if contig_id > maxContigID:  #skip the small contigs
                        continue
                except:
                    sys.stderr.write('Except type: %s\n' %
                                     repr(sys.exc_info()))
                    import traceback
                    traceback.print_exc()
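            # note: a non-numeric contig ID only triggers the warning above;
            #   the VCF file is still added to the db below.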
            logFile = File(
                os.path.join(
                    topOutputDir, 'AddVCFFile2DB_%s.log' %
                    (self.getChrFromFname(inputF.name))))
            addVCFJob = self.addAddVCFFile2DBJob(
                executable=self.AddVCFFile2DB, inputFile=inputF,
                genotypeMethodShortName=genotypeMethodShortName,\
                logFile=logFile, format="VCF", data_dir=data_dir,
                checkEmptyVCFByReading=checkEmptyVCFByReading, commit=commit, \
                parentJobLs=[addGM2DBJob]+jobData.jobLs, extraDependentInputLs=[], transferOutput=True, \
                extraArguments=None, job_max_memory=1000, sshDBTunnel=needSSHDBTunnel)
            self.add_dependency(updateGMNoOfLociJob, parents=[addVCFJob])
        sys.stderr.write("%s jobs.\n" % (self.no_of_jobs))
        #wrap the final genotype-method-update job and its log file into returnData
        returnData.jobDataLs.append(PassingData(jobLs=[updateGMNoOfLociJob],
            file=updateGMlogFile, \
            fileLs=[updateGMlogFile]))
        return returnData
    def reduceAfterEachAlignment(self,
                                 passingData=None,
                                 transferOutput=False,
                                 data_dir=None,
                                 **keywords):
        """
        """
        returnData = PassingData(no_of_jobs=0)
        returnData.jobDataLs = []
        alignmentJobAndOutputLs = getattr(passingData,
                                          'alignmentJobAndOutputLs', [])
        bamFnamePrefix = passingData.bamFnamePrefix
        topOutputDirJob = passingData.topOutputDirJob
        individual_alignment = passingData.individual_alignment
        reduceOutputDirJob = passingData.reduceOutputDirJob

        if len(alignmentJobAndOutputLs) > 0:
            #2012.3.29	merge alignment output only when there is something to merge!
            #2013.04.09 create a new child alignment local_realigned =1, etc.
            new_individual_alignment = self.db.copyParentIndividualAlignment(
                parent_individual_alignment_id=individual_alignment.id,\
                mask_genotype_method_id=self.new_mask_genotype_method_id,\
                data_dir=self.data_dir, local_realigned=1)

            baseCoverage = 4  #baseline
            actualCoverage = getattr(individual_alignment.individual_sequence,
                                     'coverage', baseCoverage)
            minMergeAlignmentWalltime = 240
            #in minutes (4 hours), when coverage is at baseCoverage
            maxMergeAlignmentWalltime = 2880  #in minutes, 2 days
            minMergeAlignmentMaxMemory = 7000
            #in MB, when coverage is at baseCoverage
            maxMergeAlignmentMaxMemory = 12000  #in MB

            mergeAlignmentWalltime = self.scaleJobWalltimeOrMemoryBasedOnInput(
                realInputVolume=actualCoverage,
                baseInputVolume=baseCoverage,
                baseJobPropertyValue=minMergeAlignmentWalltime * 2,
                minJobPropertyValue=minMergeAlignmentWalltime,
                maxJobPropertyValue=maxMergeAlignmentWalltime).value
            mergeAlignmentMaxMemory = self.scaleJobWalltimeOrMemoryBasedOnInput(
                realInputVolume=actualCoverage,
                baseInputVolume=baseCoverage,
                baseJobPropertyValue=minMergeAlignmentMaxMemory,
                minJobPropertyValue=minMergeAlignmentMaxMemory,
                maxJobPropertyValue=maxMergeAlignmentMaxMemory).value
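            # Illustration (assuming scaleJobWalltimeOrMemoryBasedOnInput() scales
            #   linearly with realInputVolume/baseInputVolume and clamps the result
            #   to [minJobPropertyValue, maxJobPropertyValue]):
            #   an 8x-coverage alignment against the 4x baseline would get roughly
            #     walltime  = min(2880, 480 * 8/4)   = 960 minutes
            #     maxMemory = min(12000, 7000 * 8/4) = 12000 MB (clamped at the maximum)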

            # replace read_group with the new one to each alignment job
            newAlignmentJobAndOutputLs = []
            for alignmentJobAndOutput in alignmentJobAndOutputLs:
                # add a AddReadGroup job
                alignmentJob, indexAlignmentJob = alignmentJobAndOutput.jobLs[:2]
                fileBasenamePrefix = os.path.splitext(
                    alignmentJob.output.name)[0]
                outputRGBAM = File("%s.isq_RG.bam" % (fileBasenamePrefix))
                # needBAMIndexJob=False because addAlignmentMergeJob()
                # does not need .bai.
                addRGJob = self.addReadGroupJob(
                    individual_alignment=new_individual_alignment,
                    inputBamFile=alignmentJob.output,
                    outputBamFile=outputRGBAM,
                    needBAMIndexJob=False,
                    parentJobLs=[alignmentJob, indexAlignmentJob],
                    extraDependentInputLs=alignmentJob.outputLs[1:],
                    job_max_memory=2500,
                    transferOutput=False,
                    walltime=max(180, mergeAlignmentWalltime / 20))

                newAlignmentJobAndOutputLs.append(
                    PassingData(jobLs=[addRGJob], file=addRGJob.output))

            mergedBamFile = File(os.path.join(reduceOutputDirJob.output, \
                '%s_recal.bam'%(bamFnamePrefix)))
            alignmentMergeJob, bamIndexJob = self.addAlignmentMergeJob(
                alignmentJobAndOutputLs=newAlignmentJobAndOutputLs,
                outputBamFile=mergedBamFile,
                needBAMIndexJob=True,
                parentJobLs=[reduceOutputDirJob],
                walltime=mergeAlignmentWalltime,
                job_max_memory=mergeAlignmentMaxMemory,
                transferOutput=False)
            #2012.9.19 add/copy the alignment file to db-affiliated storage
            #add the metric file to AddAlignmentFile2DB.py as well
            #  (to be moved into db-affiliated storage)
            logFile = File(
                os.path.join(reduceOutputDirJob.output,
                             '%s_2db.log' % (bamFnamePrefix)))
            alignment2DBJob = self.addAlignmentFile2DBJob(
                executable=self.AddAlignmentFile2DB,
                inputFile=alignmentMergeJob.output,
                baiFile=bamIndexJob.baiFile,
                individual_alignment_id=new_individual_alignment.id,
                mask_genotype_method_id=self.new_mask_genotype_method_id,
                logFile=logFile,
                data_dir=data_dir,
                otherInputFileList=None,
                parentJobLs=[alignmentMergeJob, bamIndexJob],
                transferOutput=transferOutput,
                sshDBTunnel=self.needSSHDBTunnel,
                commit=True,
                job_max_memory=2000,
                walltime=max(180, mergeAlignmentWalltime / 2))
            self.no_of_jobs += 1
            returnData.jobDataLs.append(PassingData(jobLs=[alignment2DBJob],
                file=alignment2DBJob.logFile, \
                fileLs=[alignment2DBJob.logFile]))
        return returnData