def addSplitFastaFileJobs(self, workflow, refFastaF, SelectAndSplitFastaRecords, fastaTitleLs, mkdirWrap=None,
                          site_handler=None, namespace='workflow', version='1.0', fastaOutputDir="fasta"):
    """
    2011-7-25
        split the whole fasta file into files, each containing one fasta record (from fastaTitleLs)
        return the data
    """
    sys.stderr.write("Adding job to split %s into %s records ..." % (refFastaF.name, len(fastaTitleLs)))
    # Add a mkdir job
    mkDirJob = yh_pegasus.addMkDirJob(workflow, mkdir=mkdirWrap, outputDir=fastaOutputDir,
                                      namespace=namespace, version=version)

    selectAndSplitFastaJob = Job(namespace=namespace, name=SelectAndSplitFastaRecords.name, version=version)
    selectAndSplitFastaJob.addArguments('-i', refFastaF, "-o", fastaOutputDir)
    selectAndSplitFastaJob.uses(refFastaF, transfer=True, register=True, link=Link.INPUT)
    workflow.addJob(selectAndSplitFastaJob)
    workflow.depends(parent=mkDirJob, child=selectAndSplitFastaJob)

    refName2jobDataLs = {}
    for refName in fastaTitleLs:
        if refName not in refName2jobDataLs:
            refName2jobDataLs[refName] = []
        selectAndSplitFastaJob.addArguments(refName)
        fastaFname = os.path.join(fastaOutputDir, '%s.fasta' % (refName))
        fastaFile = File(fastaFname)
        selectAndSplitFastaJob.uses(fastaFile, transfer=False, register=True, link=Link.OUTPUT)
        refName2jobDataLs[refName] = [selectAndSplitFastaJob, fastaFile]
    sys.stderr.write("Done.\n")
    return PassingData(refName2jobDataLs=refName2jobDataLs, workflow=workflow)
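# Standalone sketch, illustration only (not part of the original module): callers of
# addSplitFastaFileJobs() unpack the returned PassingData per fasta record, the way the
# nucmer run() further below does with refName2splitFastaJobDataLs. The helper name is hypothetical.
def _iterSplitFastaJobData(refName2jobDataLs):
    """Yield (refName, selectAndSplitFastaJob, fastaFile) from the returned mapping."""
    for refName, jobDataLs in refName2jobDataLs.items():
        selectAndSplitFastaJob, fastaFile = jobDataLs[:2]
        yield refName, selectAndSplitFastaJob, fastaFile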
def addJobs(self, inputURL=None, relativePathList=[], outputDir="", username=None, password=None,
            transferOutput=True):
    """
    2012.6.27
    """
    sys.stderr.write("Adding wget jobs for %s input ... " % (len(relativePathList)))
    no_of_jobs = 0

    topOutputDir = outputDir
    topOutputDirJob = yh_pegasus.addMkDirJob(self, mkdir=self.mkdirWrap, outputDir=topOutputDir)
    no_of_jobs += 1

    returnData = PassingData()
    returnData.jobDataLs = []
    for relativePath in relativePathList:
        # 2013.06.26 replace all "/" in relativePath in case it's a folder
        relativePathNoFolder = relativePath.replace('/', '_')
        logFile = File('%s.log' % (relativePathNoFolder))
        wgetJob = self.addWgetJob(executable=self.wget, url=inputURL, relativePath=relativePath,
                                  username=username, password=password,
                                  targetFolder=outputDir, logFile=logFile, cut_dir_number=self.cut_dir_number,
                                  parentJobLs=[topOutputDirJob], extraDependentInputLs=[],
                                  transferOutput=transferOutput,
                                  extraArguments=None, job_max_memory=50)
        # include the tfam (outputList[1]) into the fileLs
        returnData.jobDataLs.append(PassingData(jobLs=[wgetJob], file=wgetJob.output,
                                                fileLs=wgetJob.outputLs))
        no_of_jobs += 1
    sys.stderr.write("%s jobs.\n" % (no_of_jobs))
    return returnData
def addJobs(self, workflow, inputData=None, pegasusFolderName="", tmpDir="/tmp"):
    """
    2012.3.21
    """
    sys.stderr.write("Adding MarkDuplicates jobs on %s input datasets ..." % (len(inputData.jobDataLs)))
    returnJobData = PassingData(jobDataLs=[])

    topOutputDir = pegasusFolderName
    topOutputDirJob = yh_pegasus.addMkDirJob(workflow, mkdir=workflow.mkdirWrap, outputDir=topOutputDir)
    no_of_jobs = 1

    for jobData in inputData.jobDataLs:
        inputFile = jobData.output
        bamIndexJob = self.addBAMIndexJob(workflow, BuildBamIndexFilesJava=workflow.BuildBamIndexFilesJava,
                                          BuildBamIndexJar=workflow.BuildBamIndexJar,
                                          inputBamF=inputFile,
                                          parentJobLs=[topOutputDirJob] + jobData.jobLs, transferOutput=False)
        outputFname = self.getMarkDupOutputFnameBasedOnInputFname(inputFile.abspath)
        finalBamFileName = os.path.join(topOutputDir, outputFname)
        finalBamFile = File(finalBamFileName)
        markDupJob, markDupBamIndexJob = self.addMarkDupJob(workflow, parentJobLs=[bamIndexJob] + jobData.jobLs,
                                                            inputBamF=bamIndexJob.bamFile,
                                                            inputBaiF=bamIndexJob.output, outputBamFile=finalBamFile,
                                                            MarkDuplicatesJava=workflow.MarkDuplicatesJava,
                                                            MarkDuplicatesJar=workflow.MarkDuplicatesJar, tmpDir=tmpDir,
                                                            BuildBamIndexFilesJava=workflow.BuildBamIndexFilesJava,
                                                            BuildBamIndexJar=workflow.BuildBamIndexJar,
                                                            transferOutput=True)
        no_of_jobs += 3
        returnJobData.jobDataLs.append(PassingData(output=finalBamFile, jobLs=[markDupJob, markDupBamIndexJob]))
    sys.stderr.write("%s jobs.\n" % (no_of_jobs))
    return returnJobData
def addJobs(self, inputData=None, outputDirPrefix="", ntDatabaseFileList=None, noOfTotalSequences=None,
            transferOutput=True, makeBlastDBJob=None):
    """
    2012.5.24
    """
    sys.stderr.write("Adding blast jobs for %s input ... " % (len(inputData.jobDataLs)))
    no_of_jobs = 0

    topOutputDir = "%sBlast" % (outputDirPrefix)
    topOutputDirJob = yh_pegasus.addMkDirJob(self, mkdir=self.mkdirWrap, outputDir=topOutputDir)
    no_of_jobs += 1

    allBlastResultFile = File(os.path.join(topOutputDir, 'blast.tsv'))
    allBlastMergeJob = self.addStatMergeJob(statMergeProgram=self.mergeSameHeaderTablesIntoOne,
                                            outputF=allBlastResultFile, transferOutput=transferOutput,
                                            parentJobLs=[topOutputDirJob])
    no_of_jobs += 1

    ntDatabaseFile = ntDatabaseFileList[0]
    returnData = PassingData()
    returnData.jobDataLs = []

    for jobData in inputData.jobDataLs:
        inputF = jobData.output
        outputFnamePrefix = os.path.join(topOutputDir, os.path.splitext(os.path.basename(inputF.name))[0])

        splitFastaJob = self.addSplitFastaFileJob(executable=self.SplitFastaFile, inputFile=inputF,
                                                  outputFnamePrefix=outputFnamePrefix,
                                                  noOfSequencesPerSplitFile=self.blockSize, filenameSuffix=".fasta",
                                                  noOfTotalSequences=noOfTotalSequences,
                                                  parentJobLs=jobData.jobLs + [topOutputDirJob],
                                                  extraDependentInputLs=[], transferOutput=False,
                                                  extraArguments=None, job_max_memory=500)
        no_of_jobs += 1
        for splitFastaOutput in splitFastaJob.outputList:
            outputFile = File('%s.tsv' % (splitFastaOutput.name))
            blastJob = self.addBlastWrapperJob(executable=self.BlastWrapper, inputFile=splitFastaOutput,
                                               outputFile=outputFile,
                                               outputFnamePrefix=splitFastaOutput.name, databaseFile=ntDatabaseFile,
                                               maxNoOfMismatches=self.maxNoOfMismatches,
                                               minNoOfIdentities=self.minNoOfIdentities,
                                               minIdentityPercentage=self.minIdentityPercentage,
                                               blastallPath=self.blastallPath,
                                               parentJobLs=[splitFastaJob, makeBlastDBJob],
                                               extraDependentInputLs=ntDatabaseFileList, transferOutput=False,
                                               extraArguments=None, job_max_memory=1000)
            # add output to some reduce job
            self.addInputToStatMergeJob(statMergeJob=allBlastMergeJob,
                                        inputF=blastJob.output, parentJobLs=[blastJob])
            no_of_jobs += 1
    sys.stderr.write("%s jobs. Done.\n" % (no_of_jobs))
    # include the tfam (outputList[1]) into the fileLs
    returnData.jobDataLs.append(PassingData(jobLs=[allBlastMergeJob], file=allBlastResultFile,
                                            fileLs=[allBlastResultFile]))
    return returnData
def addJobs(self, workflow, inputData=None, db_vervet=None, smartpcaParameterFname="", pegasusFolderName="",
            maxContigID=None, missingCallAsRefBase=0, transferOutput=True):
    """
    2012.9.11 add argument missingCallAsRefBase
    2011.1.8
        add outputDirPrefix to differentiate one run from another if multiple trio call workflows
        are run simultaneously. outputDirPrefix could contain "/" to denote sub-folders.
    """
    sys.stderr.write("Adding smartpca jobs on %s VCFs (contig_id<=%s) ..." % (len(inputData.jobDataLs), maxContigID))
    returnJobData = PassingData()
    no_of_jobs = 0

    topOutputDir = pegasusFolderName
    topOutputDirJob = yh_pegasus.addMkDirJob(workflow, mkdir=workflow.mkdirWrap, outputDir=topOutputDir)

    smartpcaGenotypeInputFile = File(os.path.join(topOutputDir, "smartpca.geno"))
    smartpcaLocusInputFile = File(os.path.join(topOutputDir, "smartpca.snp"))
    smartpcaGenotypeMergeJob = self.addStatMergeJob(workflow, statMergeProgram=workflow.MergeFiles,
                                                    outputF=smartpcaGenotypeInputFile, transferOutput=transferOutput,
                                                    extraArguments="", parentJobLs=[topOutputDirJob])
    smartpcaLocusMergeJob = self.addStatMergeJob(workflow, statMergeProgram=workflow.MergeFiles,
                                                 outputF=smartpcaLocusInputFile, transferOutput=transferOutput,
                                                 extraArguments="", parentJobLs=[topOutputDirJob])
    smartpcaIndFile = None
    smartpcaIndJob = None
    no_of_jobs += 3

    for jobData in inputData.jobDataLs:
        inputF = jobData.vcfFile
        contig_id = self.getContigIDFromFname(inputF.name)
        try:
            if maxContigID:
                contig_id = int(contig_id)
                if contig_id > maxContigID:  # skip the small contigs
                    continue
        except:
            sys.stderr.write("Except type: %s\n" % repr(sys.exc_info()))
            import traceback
            traceback.print_exc()
        outputFnamePrefix = os.path.join(topOutputDir, "Contig%s" % (contig_id))
        convertJob = self.addConvertVCF2EigenStratJob(workflow, executable=workflow.ConvertVCF2EigenStrat,
                                                      inputF=inputF, outputFnamePrefix=outputFnamePrefix,
                                                      missingCallAsRefBase=missingCallAsRefBase,
                                                      parentJobLs=[topOutputDirJob] + jobData.jobLs,
                                                      extraDependentInputLs=[], transferOutput=False,
                                                      extraArguments=None, job_max_memory=100)
        if smartpcaIndFile is None:  # every VCF has the same order of individuals
            smartpcaIndFile = convertJob.indOutputF
            smartpcaIndJob = convertJob
        self.addInputToStatMergeJob(workflow, statMergeJob=smartpcaGenotypeMergeJob,
                                    inputF=convertJob.genoOutputF, parentJobLs=[convertJob])
        self.addInputToStatMergeJob(workflow, statMergeJob=smartpcaLocusMergeJob,
                                    inputF=convertJob.locusOutputF, parentJobLs=[convertJob])
        no_of_jobs += 1

    # smartpcaCorFile = File(os.path.join(topOutputDir, 'smartpca.cor'))
    smartpcaEvecFile = File(os.path.join(topOutputDir, "smartpca.evec"))
    smartpcaEvalFile = File(os.path.join(topOutputDir, "smartpca.eval"))
    self.outputSmartpcaParameters(smartpcaParameterFname=smartpcaParameterFname,
                                  smartpcaGenotypeInputFile=smartpcaGenotypeInputFile,
                                  smartpcaLocusInputFile=smartpcaLocusInputFile,
                                  smartpcaIndFile=smartpcaIndFile,
                                  smartpcaCorFile=None,
                                  smartpcaEvecFile=smartpcaEvecFile,
                                  smartpcaEvalFile=smartpcaEvalFile)
    smartpcaParameterFile = self.registerOneInputFile(workflow, smartpcaParameterFname, folderName=pegasusFolderName)
    smartpcaJob = self.addSmartpcaJob(workflow, executable=workflow.smartpca,
                                      smartpcaParameterFile=smartpcaParameterFile,
                                      parentJobLs=[smartpcaGenotypeMergeJob, smartpcaLocusMergeJob, smartpcaIndJob],
                                      extraDependentInputLs=[smartpcaGenotypeInputFile, smartpcaLocusInputFile,
                                                             smartpcaIndFile],
                                      transferOutput=transferOutput, extraArguments=None,
                                      outputFileList=[None, smartpcaEvecFile, smartpcaEvalFile],
                                      job_max_memory=18000)

    # 2012.9.5 add the job to append meta info (country, sex, latitude, etc. of each monkey)
    outputF = File(os.path.join(topOutputDir, "smartpca_evec_withMetaInfo.tsv"))
    appendInfo2SmartPCAOutputJob = self.addGenericJob(executable=self.AppendInfo2SmartPCAOutput,
                                                      inputFile=smartpcaEvecFile, outputFile=outputF,
                                                      parentJobLs=[smartpcaJob], extraDependentInputLs=None,
                                                      extraOutputLs=None, transferOutput=transferOutput,
                                                      extraArgumentList=None, extraArguments="--inversePCValue",
                                                      key2ObjectForJob=None, job_max_memory=2000)
    self.addDBArgumentsToOneJob(job=appendInfo2SmartPCAOutputJob, objectWithDBArguments=self)
    no_of_jobs += 1

    sys.stderr.write("%s jobs.\n" % (no_of_jobs))
    return smartpcaJob
def run(self):
    """
    2011-9-28
    """
    if self.debug:
        import pdb
        pdb.set_trace()

    db_250k = self.db_250k

    sameCategoryPhenotypeMethodLs = db_250k.getPhenotypeMethodLsGivenBiologyCategoryID(self.biology_category_id,
                                                                                       access=self.access)
    sameCategoryPhenotypeMethodIDLs = [pm.id for pm in sameCategoryPhenotypeMethodLs]
    # merge the two lists of phenotype method ids together
    phenotype_method_id_ls = list(set(self.phenotype_method_id_ls + sameCategoryPhenotypeMethodIDLs))
    phenotype_method_id_ls.sort()

    result_query = db_250k.getResultLs(call_method_id=self.call_method_id,
                                       analysis_method_id_ls=self.analysis_method_id_ls,
                                       phenotype_method_id_ls=phenotype_method_id_ls,
                                       cnv_method_id=self.cnv_method_id)
    result_id_ls = self.result_id_ls
    for result in result_query:
        result_id_ls.append(result.id)

    # make sure the entries with (result_id, self.association_peak_type_id) exist in AssociationPeak
    result_id_ls = db_250k.filterResultIDLsBasedOnAssociationPeak(result_id_ls, self.association_peak_type_id)

    # Create an abstract dag
    workflow = self.initiateWorkflow()

    self.registerExecutables(workflow)
    self.registerCustomExecutables(workflow)

    overlapStatDir = "overlapStat"
    overlapStatDirJob = yh_pegasus.addMkDirJob(workflow, mkdir=workflow.mkdirWrap, outputDir=overlapStatDir)
    overlapPlotDir = "overlapPlot"
    overlapPlotDirJob = yh_pegasus.addMkDirJob(workflow, mkdir=workflow.mkdirWrap, outputDir=overlapPlotDir)

    analysis_method_id_ls = map(str, self.analysis_method_id_ls)
    outputFnamePrefix = os.path.join(overlapPlotDir,
        'callMethod%s_cnvMethod%s_analysisMethod%s_biologyCategory%s_peakType%s_overlapPeak' % (
            self.call_method_id, self.cnv_method_id, '_'.join(analysis_method_id_ls),
            self.biology_category_id, self.association_peak_type_id))
    plotAssociationPeakOverlapJob = self.addPlotPeakOverlapJob(workflow,
        executable=workflow.plotAssociationPeakOverlap,
        outputFnamePrefix=outputFnamePrefix,
        parentJobLs=[overlapPlotDirJob], job_max_memory=100, walltime=60,
        extraDependentInputLs=[],
        transferOutput=True)

    counter = 0
    no_of_input = 0
    for i in xrange(len(result_id_ls)):
        for j in range(i + 1, len(result_id_ls)):
            result1_id = result_id_ls[i]
            result2_id = result_id_ls[j]
            outputFnamePrefix = 'result_%s_vs_%s_peak_type_%s' % (result1_id, result2_id,
                                                                  self.association_peak_type_id)
            outputF = File(os.path.join(overlapStatDir, '%s.tsv' % (outputFnamePrefix)))
            if no_of_input == 0:
                # add one random input, otherwise a replica catalog error occurs
                rm1 = Stock_250kDB.ResultsMethod.get(result1_id)
                inputFile1 = self.registerOneInputFile(workflow, rm1.filename)
                extraDependentInputLs = [inputFile1]
            else:
                extraDependentInputLs = []
            no_of_input += 1

            gwasPeakOverlapJob = self.addGWASPeakOverlapJob(workflow, executable=workflow.twoGWASPeakOverlap,
                result1_id=result1_id, result2_id=result2_id,
                association1_peak_type_id=self.association_peak_type_id,
                association2_peak_type_id=self.association_peak_type_id,
                peak_padding=self.peak_padding,
                outputF=outputF,
                commit=1, results_directory=None, logFile=None,
                parentJobLs=[overlapStatDirJob], job_max_memory=100, walltime=60,
                extraDependentInputLs=extraDependentInputLs,
                transferOutput=True)
            counter += 1
            self.addInputToStatMergeJob(workflow, statMergeJob=plotAssociationPeakOverlapJob, inputF=outputF,
                                        parentJobLs=[gwasPeakOverlapJob])
    sys.stderr.write("%s gwas peak overlap jobs.\n" % (counter))

    # Write the DAX to the output file
    outf = open(self.outputFname, 'w')
    self.writeXML(outf)
def addJobs(self, workflow, inputData=None, min_MAF=None, min_cor=None, chunkSize=None, pegasusFolderName=""):
    """
    2012.3.3
    """
    sys.stderr.write("Adding LD-calculating jobs on %s input datasets ..." % (len(inputData.jobDataLs)))
    returnJobData = PassingData()
    no_of_jobs = 0

    topOutputDir = pegasusFolderName
    topOutputDirJob = yh_pegasus.addMkDirJob(workflow, mkdir=workflow.mkdirWrap, outputDir=topOutputDir)

    finalCorrelationOutputFile = File(os.path.join(topOutputDir, 'correlation.h5'))
    correlationMergeJob = self.addStatMergeJob(workflow, statMergeProgram=workflow.MergeTwoLocusCorrelationHDF5,
                                               outputF=finalCorrelationOutputFile, transferOutput=True,
                                               extraArguments='-d correlation', parentJobLs=[topOutputDirJob])

    inputJobData1, inputJobData2 = inputData.jobDataLs[:2]
    inputFile1 = inputJobData1.output
    inputFile2 = inputJobData2.output

    outputFile = File(os.path.join(topOutputDir, 'input1_in_input2_order.tsv'))
    orderDatasetRowJob1 = self.addOrderDatasetRowJob(workflow, executable=workflow.Order2ndSNPDataRowsSameAs1stSNPData,
                                                     inputFile1=inputFile2, inputFile2=inputFile1,
                                                     outputFile=outputFile,
                                                     parentJobLs=[topOutputDirJob] + inputJobData1.jobLs,
                                                     extraDependentInputLs=[], transferOutput=False,
                                                     extraArguments=None, job_max_memory=1000)

    outputFile = File(os.path.join(topOutputDir, 'input2_in_same_order.tsv'))
    orderDatasetRowJob2 = self.addOrderDatasetRowJob(workflow, executable=workflow.Order2ndSNPDataRowsSameAs1stSNPData,
                                                     inputFile1=orderDatasetRowJob1.output, inputFile2=inputFile2,
                                                     outputFile=outputFile,
                                                     parentJobLs=[orderDatasetRowJob1] + inputJobData2.jobLs,
                                                     extraDependentInputLs=[], transferOutput=False,
                                                     extraArguments=None, job_max_memory=1000)

    outputFile = File(os.path.join(topOutputDir, 'input1.hdf5'))
    convertDataset2HDF5Job1 = self.addConvertSNPData2HDF5Job(workflow, executable=workflow.ConvertSNPData2HDF5,
                                                             inputFile=orderDatasetRowJob1.output,
                                                             outputFile=outputFile, min_MAF=min_MAF,
                                                             parentJobLs=[orderDatasetRowJob1],
                                                             extraDependentInputLs=[], transferOutput=False,
                                                             extraArguments=None, job_max_memory=100)

    outputFile = File(os.path.join(topOutputDir, 'input2.hdf5'))
    convertDataset2HDF5Job2 = self.addConvertSNPData2HDF5Job(workflow, executable=workflow.ConvertSNPData2HDF5,
                                                             inputFile=orderDatasetRowJob2.output,
                                                             outputFile=outputFile, min_MAF=min_MAF,
                                                             parentJobLs=[orderDatasetRowJob2],
                                                             extraDependentInputLs=[], transferOutput=False,
                                                             extraArguments=None, job_max_memory=100)
    no_of_jobs += 5

    no_of_cols_input1 = self.getNoOfLociFromSNPData(inputFile1.abspath)
    no_of_cols_input2 = self.getNoOfLociFromSNPData(inputFile2.abspath)
    for i1_start in range(0, no_of_cols_input1, chunkSize):
        i1_stop = min(i1_start + chunkSize - 1, no_of_cols_input1 - 1)
        for i2_start in range(0, no_of_cols_input2, chunkSize):
            i2_stop = min(i2_start + chunkSize - 1, no_of_cols_input2 - 1)
            outputFile = File(os.path.join(topOutputDir,
                                           'cor_i1_%s_%s_i2_%s_%s.h5' % (i1_start, i1_stop, i2_start, i2_stop)))
            corCalculationJob = self.addCalculateColCorBetweenTwoHDF5Job(workflow,
                executable=workflow.CalculateColCorBetweenTwoHDF5,
                inputFile1=convertDataset2HDF5Job1.output, inputFile2=convertDataset2HDF5Job2.output,
                outputFile=outputFile, i1_start=i1_start, i1_stop=i1_stop, i2_start=i2_start, i2_stop=i2_stop,
                min_cor=min_cor, parentJobLs=[convertDataset2HDF5Job1, convertDataset2HDF5Job2],
                extraDependentInputLs=[], transferOutput=True, extraArguments=None,
                job_max_memory=50)
            no_of_jobs += 1
            self.addInputToStatMergeJob(workflow, statMergeJob=correlationMergeJob,
                                        inputF=corCalculationJob.output,
                                        parentJobLs=[corCalculationJob])
    sys.stderr.write("%s jobs.\n" % (no_of_jobs))
    return correlationMergeJob
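# Standalone sketch, illustration only (not part of the original module): the nested loops above
# hand each CalculateColCorBetweenTwoHDF5 job an inclusive [start, stop] column range. The boundary
# arithmetic on its own looks like this (helper name is hypothetical).
def _chunkBoundaries(no_of_cols, chunkSize):
    """Yield inclusive (start, stop) column-index pairs covering no_of_cols columns."""
    for start in range(0, no_of_cols, chunkSize):
        yield start, min(start + chunkSize - 1, no_of_cols - 1)

# e.g. list(_chunkBoundaries(10, 4)) -> [(0, 3), (4, 7), (8, 9)]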
def addJobs(self, workflow, db_250k=None, inputData=None,
            biologyCategoryID2PhenotypeID2Data=None, pegasusFolderName="",
            genePadding=20000, tax_id=3702, peakPadding=10000, phenotypeFile=None, call_method_id_set=None,
            data_dir=None):
    """
    2012.11.13 change argument results_directory to data_dir
    2012.3.21
    """
    sys.stderr.write("Adding SNPRegion drawing jobs on %s biology categories ..." %
                     (len(biologyCategoryID2PhenotypeID2Data)))
    returnJobData = PassingData(jobDataLs=[])

    topOutputDir = pegasusFolderName
    topOutputDirJob = yh_pegasus.addMkDirJob(workflow, mkdir=workflow.mkdirWrap, outputDir=topOutputDir)

    # add the PickleGenomeRBDict job
    genomeRBDictPickleFile = os.path.join(topOutputDir, 'genomeRBDict_tax%s_padding%s.pickle' % (tax_id, genePadding))
    pickleGenomeRBDictJob = self.addPickleGenomeRBDictJob(workflow, workflow.PickleGenomeRBDict,
                                                          outputF=genomeRBDictPickleFile,
                                                          genePadding=genePadding, tax_id=tax_id,
                                                          parentJobLs=[topOutputDirJob], job_max_memory=200,
                                                          extraDependentInputLs=[], transferOutput=True)

    # add the PickleGeneAnnotation job
    geneAnnotationPickleFile = os.path.join(topOutputDir, 'geneAnnotation_tax%s.pickle' % (tax_id))
    geneAnnotationPickleJob = self.addPickleGeneAnnotationJob(workflow, workflow.GenomeDB,
                                                              outputF=geneAnnotationPickleFile,
                                                              tax_id=tax_id,
                                                              parentJobLs=[topOutputDirJob], job_max_memory=200,
                                                              extraDependentInputLs=[], transferOutput=True)
    no_of_jobs = 3

    # add a PickleSNPInfo job for each call method
    call_method_id2JobData = {}
    for call_method_id in call_method_id_set:
        call_method = Stock_250kDB.CallMethod.get(call_method_id)
        snpMatrixFile = self.registerOneInputFile(workflow,
                                                  inputFname=call_method.getFileAbsPath(oldDataDir=db_250k.data_dir,
                                                                                        newDataDir=data_dir),
                                                  folderName=pegasusFolderName)
        outputF = File(os.path.join(topOutputDir, 'SNPInfo_LocusType%s.pickle' % (call_method.locus_type_id)))
        pickleSNPInfoJob = self.addPickleSNPInfoJob(workflow, workflow.PickleSNPInfo,
                                                    outputF=outputF, call_method_id=call_method_id,
                                                    parentJobLs=[topOutputDirJob], job_max_memory=100,
                                                    extraDependentInputLs=[], transferOutput=True)
        call_method_id2JobData[call_method_id] = PassingData(job=pickleSNPInfoJob, snpMatrixFile=snpMatrixFile)
        no_of_jobs += 1

    # one folder for each biology category
    for biology_category_id, phenotype_id2data in biologyCategoryID2PhenotypeID2Data.iteritems():
        # add a mkdir job
        folderName = os.path.join(topOutputDir, 'biology_category_%s' % (biology_category_id))
        folderJob = yh_pegasus.addMkDirJob(workflow, mkdir=workflow.mkdirWrap, outputDir=folderName,
                                           parentJobLs=[topOutputDirJob])
        no_of_jobs += 1

        biologyCategory = Stock_250kDB.BiologyCategory.get(biology_category_id)
        list_type_id_list = biologyCategory.returnGeneListIDList()
        list_type_id_in_str_list = map(str, list_type_id_list)
        list_type_id_list_str = ','.join(list_type_id_in_str_list)

        for phenotype_id, result_data in phenotype_id2data.iteritems():
            result_peak_type_id_ls = result_data.result_peak_type_id_ls
            call_method_id_set = result_data.call_method_id_set
            result_id_peak_type_id_ls = []
            analysis_method_id_in_str_ls = []
            for result, peak_type_id in result_peak_type_id_ls:
                result_id_peak_type_id_ls.append('%s:%s' % (result.id, peak_type_id))
                analysis_method_id_in_str_ls.append(str(result.analysis_method_id))
            result_id_peak_type_id_ls_str = ','.join(result_id_peak_type_id_ls)

            peakSpanOutputFile = File(os.path.join(folderName,
                'phenotype_%s_result_%s.tsv' % (phenotype_id, result_id_peak_type_id_ls_str)))
            multiPeakSpanJob = self.addOutputMultiGWASOverlapPeakSpanJob(workflow,
                workflow.OutputMultiGWASOverlapPeakSpan,
                outputF=peakSpanOutputFile, peakPadding=peakPadding,
                list_type_id_list=list_type_id_list_str,
                result_id_peak_type_id_ls=result_id_peak_type_id_ls_str,
                genePadding=genePadding, tax_id=tax_id, genomeRBDictPickleFile=genomeRBDictPickleFile,
                parentJobLs=[folderJob, pickleGenomeRBDictJob], job_max_memory=500,
                extraDependentInputLs=[], transferOutput=True)
            no_of_jobs += 1

            for call_method_id in call_method_id_set:
                call_method = Stock_250kDB.CallMethod.get(call_method_id)
                if call_method.locus_type_id == 2:
                    # 2012.3.27 CNV locus type. no need to convert alleles into binary form.
                    # 2012.3.26 these CNV-derived SNP datasets don't need their alleles converted to binary
                    #     form as they are already binary.
                    snp_matrix_data_type = 4
                    # need_convert_alleles2binary = False
                    # useAlleleToDetermineAlpha = False
                else:
                    snp_matrix_data_type = 1
                callMethodJobData = call_method_id2JobData[call_method_id]
                pickleSNPInfoJob = callMethodJobData.job
                snpMatrixFile = callMethodJobData.snpMatrixFile
                output_dir = folderName  # go to the biology category folder
                logFile = File(os.path.join(folderName, 'call_%s_phenotype_%s_result_%s_drawSNPRegion.log' %
                                            (call_method_id, phenotype_id, result_id_peak_type_id_ls_str)))
                analysis_method_id_ls_str = ','.join(analysis_method_id_in_str_ls)
                drawSNPRegionJob = self.addDrawSNPRegionJob(workflow, executable=workflow.DrawSNPRegion,
                    inputF=peakSpanOutputFile, call_method_id=call_method_id, snpMatrixFile=snpMatrixFile,
                    phenotypeFile=phenotypeFile, output_dir=output_dir, results_directory=data_dir,
                    analysis_method_id_ls=analysis_method_id_ls_str,
                    geneAnnotationPickleFile=geneAnnotationPickleJob.output,
                    list_type_id_list=list_type_id_list_str,
                    snp_matrix_data_type=snp_matrix_data_type, exclude_accessions_with_NA_phenotype=0,
                    snpInfoPickleFile=pickleSNPInfoJob.output, label_gene=1, min_MAF=0.1, min_distance=20000,
                    logFile=logFile,
                    parentJobLs=[geneAnnotationPickleJob, pickleSNPInfoJob, multiPeakSpanJob],
                    job_max_memory=3500, extraDependentInputLs=[],
                    transferOutput=True)
                no_of_jobs += 1
    sys.stderr.write("%s jobs.\n" % (no_of_jobs))
    return returnJobData
def run(self):
    """
    2011-10
    """
    if self.debug:
        import pdb
        pdb.set_trace()

    pd = PassingData(min_MAF=self.min_MAF,
                     data_dir=self.data_dir,
                     need_chr_pos_ls=0,)

    result_query = self.db_250k.getResultLs(analysis_method_id_ls=self.analysis_method_id_ls,
                                            phenotype_method_id_ls=self.phenotype_method_id_ls,
                                            call_method_id_ls=self.call_method_id_ls,
                                            cnv_method_id=self.cnv_method_id)
    result_id_ls = self.result_id_ls
    for result in result_query:
        result_id_ls.append(result.id)

    workflow = self.initiateWorkflow()

    self.registerExecutables()
    self.registerCustomExecutables(workflow)

    counter = 0
    topOutputDir = "%sAssociationLandscape" % (self.pegasusFolderName)
    topOutputDirJob = yh_pegasus.addMkDirJob(workflow=self, mkdir=self.mkdirWrap, outputDir=topOutputDir)

    resultLandscapeType = self.db_250k.getResultLandscapeType(min_MAF=self.min_MAF,
                                                              neighbor_distance=self.neighbor_distance,
                                                              max_neighbor_distance=self.max_neighbor_distance)

    for result_id in result_id_ls:
        result = Stock_250kDB.ResultsMethod.get(result_id)
        associationResultFile = self.registerOneInputFile(
            inputFname=result.getFileAbsPath(oldDataDir=self.db_250k.data_dir, newDataDir=self.data_dir),
            folderName=self.pegasusFolderName)

        logFile = File(os.path.join(topOutputDirJob.output,
                                    'Result%s_LandscapeType%s.log' % (result_id, resultLandscapeType.id)))
        landscapeOutputFile = File(os.path.join(topOutputDirJob.output,
                                                'Result%s_LandscapeType%s.h5' % (result_id, resultLandscapeType.id)))
        defineLandscapeJob = self.addDefineLandscapeJob(workflow, executable=workflow.DefineAssociationLandscape,
                                                        result_id=result_id, neighbor_distance=self.neighbor_distance,
                                                        max_neighbor_distance=self.max_neighbor_distance,
                                                        min_MAF=self.min_MAF, tax_id=self.tax_id,
                                                        data_dir=self.data_dir, logFile=logFile,
                                                        landscapeOutputFile=landscapeOutputFile,
                                                        extraDependentInputLs=[associationResultFile],
                                                        parentJobLs=[topOutputDirJob],
                                                        sshDBTunnel=self.needSSHDBTunnel,
                                                        transferOutput=False)

        logFile = File(os.path.join(topOutputDirJob.output,
                                    'Result%s_LandscapeType%s_log.tsv' % (result_id, resultLandscapeType.id)))
        landscape2DBJob = self.addAssociationLandscape2DBJob(executable=self.AssociationLandscape2DB,
                                                             inputFile=defineLandscapeJob.output,
                                                             result_id=result_id,
                                                             data_dir=self.data_dir, logFile=logFile,
                                                             commit=self.commit,
                                                             min_MAF=self.min_MAF,
                                                             neighbor_distance=self.neighbor_distance,
                                                             max_neighbor_distance=self.max_neighbor_distance,
                                                             parentJobLs=[topOutputDirJob, defineLandscapeJob],
                                                             extraDependentInputLs=None, transferOutput=True,
                                                             extraArguments=None, job_max_memory=1000,
                                                             sshDBTunnel=self.needSSHDBTunnel)

        # add a landscape -> peak job; the peak file is named after the landscape file
        outputFnamePrefix = os.path.join(topOutputDirJob.output,
                                         'Result%s_LandscapeType%s' % (result_id, resultLandscapeType.id))
        outputFile = File('%s_peak.h5' % (outputFnamePrefix))
        self.addAssociationLandscape2PeakJob(executable=self.AssociationLandscape2Peak,
                                             inputFile=defineLandscapeJob.output, outputFile=outputFile,
                                             min_score=self.min_score, ground_score=self.ground_score,
                                             data_dir=self.data_dir,
                                             parentJobLs=[defineLandscapeJob], job_max_memory=100, walltime=60,
                                             extraDependentInputLs=None,
                                             transferOutput=False)
        counter += 1
    sys.stderr.write("%s total jobs.\n" % (self.no_of_jobs))

    # Write the DAX to the output file
    outf = open(self.outputFname, 'w')
    self.writeXML(outf)
def run(self):
    """
    2011-9-28
    """
    if self.debug:
        import pdb
        pdb.set_trace()

    workflow = self.initiateWorkflow()

    self.registerJars()
    self.registerExecutables()
    self.registerCustomExecutables(workflow)

    site_handler = self.site_handler
    input_site_handler = self.input_site_handler

    ref_seq_f = self.registerOneInputFile(workflow, self.ref_seq_fname, folderName=self.pegasusFolderName)
    query_seq_f = self.registerOneInputFile(workflow, self.query_seq_fname, folderName=self.pegasusFolderName)

    # Add mkdir jobs
    deltaOutputDir = "delta"
    deltaOutputDirJob = yh_pegasus.addMkDirJob(workflow, mkdir=self.mkdirWrap, outputDir=deltaOutputDir)
    coordsOutputDir = "coords"
    coordsOutputDirJob = yh_pegasus.addMkDirJob(workflow, mkdir=self.mkdirWrap, outputDir=coordsOutputDir)
    filterOutputDir = "filter"
    filterOutputDirJob = yh_pegasus.addMkDirJob(workflow, mkdir=self.mkdirWrap, outputDir=filterOutputDir)
    plotOutputDir = "plot"
    plotOutputDirJob = yh_pegasus.addMkDirJob(workflow, mkdir=self.mkdirWrap, outputDir=plotOutputDir)
    # plotScriptOutputDir = "plotScript"
    # plotScriptOutputDirJob = yh_pegasus.addMkDirJob(workflow, mkdir=self.mkdirWrap, outputDir=plotScriptOutputDir)

    refNameLs = self.getFastaRecordTitleLs(self.ref_seq_fname)
    returnData3 = self.addSplitFastaFileJobs(workflow, ref_seq_f, self.SelectAndSplitFastaRecords, refNameLs,
                                             mkdirWrap=self.mkdirWrap,
                                             site_handler=site_handler, namespace=self.namespace,
                                             version=self.version, fastaOutputDir='refFasta')
    refName2splitFastaJobDataLs = returnData3.refName2jobDataLs

    queryNameLs = self.getFastaRecordTitleLs(self.query_seq_fname)
    returnData3 = self.addSplitFastaFileJobs(workflow, query_seq_f, self.SelectAndSplitFastaRecords, queryNameLs,
                                             mkdirWrap=self.mkdirWrap,
                                             site_handler=site_handler, namespace=self.namespace,
                                             version=self.version, fastaOutputDir='queryFasta')
    queryName2splitFastaJobDataLs = returnData3.refName2jobDataLs

    noOfJobs = len(refName2splitFastaJobDataLs) + len(queryName2splitFastaJobDataLs)

    ref_seq_prefix = os.path.splitext(os.path.basename(ref_seq_f.name))[0]
    for queryName, jobDataLs in queryName2splitFastaJobDataLs.iteritems():
        for refName, refJobDataLs in refName2splitFastaJobDataLs.iteritems():
            refSelectAndSplitFastaJob, refFastaFile = refJobDataLs[:2]
            selectAndSplitFastaJob, fastaFile = jobDataLs[:2]

            nucmerJob = Job(namespace=self.namespace, name=self.nucmer.name, version=self.version)
            outputPrefix = "%s_vs_%s_%s" % (queryName, ref_seq_prefix, refName)
            deltaFnamePrefix = os.path.join(deltaOutputDir, outputPrefix)
            nucmerJob.addArguments("--maxgap=500", "--mincluster=100", "--prefix", deltaFnamePrefix,
                                   refFastaFile, fastaFile)
            nucmerJob.uses(refFastaFile, transfer=False, register=True, link=Link.INPUT)
            nucmerJob.uses(fastaFile, transfer=False, register=True, link=Link.INPUT)
            deltaFname = "%s.delta" % (deltaFnamePrefix)
            deltaF = File(deltaFname)
            nucmerJob.uses(deltaF, transfer=True, register=True, link=Link.OUTPUT)
            # 3000M for one nucmer job with human as ref
            job_max_memory = 5000  # in MB
            yh_pegasus.setJobProperRequirement(nucmerJob, job_max_memory=job_max_memory)
            workflow.addJob(nucmerJob)
            workflow.depends(parent=refSelectAndSplitFastaJob, child=nucmerJob)
            workflow.depends(parent=selectAndSplitFastaJob, child=nucmerJob)
            workflow.depends(parent=deltaOutputDirJob, child=nucmerJob)

            coordsFname = os.path.join(coordsOutputDir, "%s.coords" % (outputPrefix))
            coordsF = File(coordsFname)
            filterFname = os.path.join(filterOutputDir, "%s.filter" % (outputPrefix))
            filterF = File(filterFname)
            plotPrefix = os.path.join(plotOutputDir, "%s_plot" % (outputPrefix))
            png_plotF = File("%s.png" % plotPrefix)
            gp_plotF = File("%s.gp" % plotPrefix)
            fplot_plotF = File("%s.fplot" % plotPrefix)
            rplot_plotF = File("%s.rplot" % plotPrefix)

            postNucJob = Job(namespace=self.namespace, name=self.PostNucmer.name, version=self.version)
            postNucJob.addArguments(deltaF, coordsF, filterF, refFastaFile, fastaFile, plotPrefix)
            postNucJob.uses(deltaF, transfer=True, register=True, link=Link.INPUT)
            postNucJob.uses(refFastaFile, transfer=False, register=True, link=Link.INPUT)
            postNucJob.uses(fastaFile, transfer=False, register=True, link=Link.INPUT)
            postNucJob.uses(coordsF, transfer=True, register=True, link=Link.OUTPUT)
            postNucJob.uses(filterF, transfer=True, register=True, link=Link.OUTPUT)
            postNucJob.uses(png_plotF, transfer=True, register=True, link=Link.OUTPUT)
            # leave the files below behind
            # postNucJob.uses(gp_plotF, transfer=True, register=True, link=Link.OUTPUT)
            # postNucJob.uses(fplot_plotF, transfer=True, register=True, link=Link.OUTPUT)
            # postNucJob.uses(rplot_plotF, transfer=True, register=True, link=Link.OUTPUT)
            yh_pegasus.setJobProperRequirement(postNucJob, job_max_memory=2000)
            workflow.addJob(postNucJob)
            workflow.depends(parent=nucmerJob, child=postNucJob)
            workflow.depends(parent=coordsOutputDirJob, child=postNucJob)
            workflow.depends(parent=filterOutputDirJob, child=postNucJob)
            workflow.depends(parent=plotOutputDirJob, child=postNucJob)
            # workflow.depends(parent=plotScriptOutputDirJob, child=postNucJob)
            noOfJobs += 2
    sys.stderr.write(" %s jobs. \n" % (noOfJobs))

    # Write the DAX to the output file
    outf = open(self.outputFname, 'w')
    workflow.writeXML(outf)
def addAllJobs(self, workflow=None, inputVCFData=None, chr2IntervalDataLs=None,
               GenomeAnalysisTKJar=None, samtools=None,
               CreateSequenceDictionaryJava=None, CreateSequenceDictionaryJar=None,
               BuildBamIndexFilesJava=None, BuildBamIndexJar=None,
               mv=None,
               refFastaFList=None,
               needFastaIndexJob=False, needFastaDictJob=False,
               data_dir=None, no_of_gatk_threads=1,
               intervalSize=3000, intervalOverlapSize=0,
               outputDirPrefix="", transferOutput=True, job_max_memory=2000, **keywords):
    """
    2012.10.15
        architect of the whole map-reduce framework
        call the parent's addAllJobs in a loop
    """
    samplingReturnDataLs = []
    for i in xrange(self.noOfSamplings):
        oneSamplingReturnData = CompareAlleleFrequencyOfTwoPopulationFromOneVCFFolder.addAllJobs(self,
            workflow=workflow, inputVCFData=inputVCFData,
            chr2IntervalDataLs=chr2IntervalDataLs, samtools=samtools,
            GenomeAnalysisTKJar=GenomeAnalysisTKJar,
            CreateSequenceDictionaryJava=CreateSequenceDictionaryJava,
            CreateSequenceDictionaryJar=CreateSequenceDictionaryJar,
            BuildBamIndexFilesJava=BuildBamIndexFilesJava, BuildBamIndexJar=BuildBamIndexJar,
            mv=mv,
            refFastaFList=refFastaFList,
            needFastaIndexJob=needFastaIndexJob, needFastaDictJob=needFastaDictJob,
            data_dir=data_dir, no_of_gatk_threads=1,
            intervalSize=intervalSize, intervalOverlapSize=intervalOverlapSize,
            outputDirPrefix='%s_%s_' % (outputDirPrefix, i), transferOutput=transferOutput,
            job_max_memory=job_max_memory,
            **keywords)
        samplingReturnDataLs.append(oneSamplingReturnData)

    topOutputDir = "%sFinalReduce" % (outputDirPrefix)
    topOutputDirJob = yh_pegasus.addMkDirJob(workflow, mkdir=workflow.mkdirWrap, outputDir=topOutputDir)

    # a ReduceMatrixByAverageColumnsWithSameKey job
    outputFile = File(os.path.join(topOutputDir, 'medianAlleleSharingStatAcrossAllSampling.tsv'))
    medianReduceJob = self.addStatMergeJob(workflow, statMergeProgram=workflow.ReduceMatrixByAverageColumnsWithSameKey,
                                           outputF=outputFile, extraArguments='--keyColumnLs 0 -v 1-8',
                                           parentJobLs=[topOutputDirJob],
                                           extraDependentInputLs=None, transferOutput=True)

    # a MergeSameHeaderTablesIntoOne job
    outputFile = File(os.path.join(topOutputDir, 'alleleSharingStatAcrossAllSampling.tsv'))
    mergeJob = self.addStatMergeJob(workflow, statMergeProgram=workflow.MergeSameHeaderTablesIntoOne,
                                    outputF=outputFile, extraArguments=None, parentJobLs=[topOutputDirJob],
                                    extraDependentInputLs=None, transferOutput=True)

    for oneSamplingReturnData in samplingReturnDataLs:
        self.addInputToStatMergeJob(workflow=workflow, statMergeJob=medianReduceJob,
                                    parentJobLs=[oneSamplingReturnData.estimateOutlierJob])
        self.addInputToStatMergeJob(workflow=workflow, statMergeJob=mergeJob,
                                    parentJobLs=[oneSamplingReturnData.estimateOutlierJob])

    # no spaces or parenthesis or any other shell-vulnerable letters in the x or y axis labels
    # (whichColumnPlotLabel, xColumnPlotLabel)
    outputFile = File(os.path.join(topOutputDirJob.output, 'outlierFraction_Hist.png'))
    self.addDrawHistogramJob(workflow=workflow, executable=workflow.DrawHistogram, inputFileList=[mergeJob.output],
                             outputFile=outputFile,
                             whichColumn=None, whichColumnHeader="outlierFraction",
                             whichColumnPlotLabel="outlierFraction",
                             logY=False, positiveLog=True, logCount=False, valueForNonPositiveYValue=-1,
                             minNoOfTotal=5,
                             figureDPI=100, samplingRate=1,
                             parentJobLs=[topOutputDirJob, mergeJob],
                             extraDependentInputLs=None,
                             extraArguments=None, transferOutput=True, job_max_memory=2000)

    outputFile = File(os.path.join(topOutputDirJob.output, 'AFS_cor_Hist.png'))
    self.addDrawHistogramJob(workflow=workflow, executable=workflow.DrawHistogram, inputFileList=[mergeJob.output],
                             outputFile=outputFile,
                             whichColumn=None, whichColumnHeader="corr", whichColumnPlotLabel="AFSCorrelation",
                             logY=False, positiveLog=True, logCount=False, valueForNonPositiveYValue=-1,
                             minNoOfTotal=5,
                             figureDPI=100, samplingRate=1,
                             parentJobLs=[topOutputDirJob, mergeJob],
                             extraDependentInputLs=None,
                             extraArguments=None, transferOutput=True, job_max_memory=2000)

    sys.stderr.write("%s jobs.\n" % (self.no_of_jobs))
def addJobs(self, workflow=None, alignmentDataLs=None, refName2size=None, inputVCF=None, verifyBamID=None,
            data_dir=None, needPerContigJob=False, needSSHDBTunnel=0, outputDirPrefix="",
            transferOutput=True):
    """
    2012.8.30
    """
    if workflow is None:
        workflow = self
    sys.stderr.write("Adding jobs for %s references and %s alignments..." % (len(refName2size), len(alignmentDataLs)))
    if len(alignmentDataLs) == 0:
        sys.stderr.write("No alignment for verifyBamID. Exit now.\n")
        sys.exit(0)

    no_of_jobs = 0
    returnData = PassingData()
    returnData.jobDataLs = []

    topOutputDir = "%sverifyBAMOutput" % (outputDirPrefix)
    topOutputDirJob = yh_pegasus.addMkDirJob(workflow, mkdir=workflow.mkdirWrap, outputDir=topOutputDir)
    no_of_jobs += 1

    mergedOutputDir = "%smergedOutput" % (outputDirPrefix)
    mergedOutputDirJob = yh_pegasus.addMkDirJob(workflow, mkdir=workflow.mkdirWrap, outputDir=mergedOutputDir)
    no_of_jobs += 1

    plotOutputDir = "%splot" % (outputDirPrefix)
    plotOutputDirJob = yh_pegasus.addMkDirJob(workflow, mkdir=workflow.mkdirWrap, outputDir=plotOutputDir)
    no_of_jobs += 1

    selfSampleMixupMergeFile = File(os.path.join(mergedOutputDir, 'selfSMMerge.tsv'))
    selfSampleMixupMergeJob = self.addStatMergeJob(workflow, statMergeProgram=workflow.mergeSameHeaderTablesIntoOne,
                                                   outputF=selfSampleMixupMergeFile, transferOutput=False)
    returnData.jobDataLs.append(PassingData(jobLs=[selfSampleMixupMergeJob],
                                            fileLs=[selfSampleMixupMergeFile]))
    no_of_jobs += 1

    """
    output of *.selfSM from verifyBamID:

    #SEQ_ID RG CHIP_ID #SNPS #READS AVG_DP FREEMIX FREELK1 FREELK0 FREE_RH FREE_RA CHIPMIX CHIPLK1 CHIPLK0 CHIP_RH CHIP_RA DPREF RDPHET RDPALT
    1968_3017_2005001_GA_vs_524 ALL NA 24414 22222 0.91 0.00003 13245.89 13305.27 0.55489 0.05202 NA NA NA NA NA NA NA NA
    """
    # no spaces or parenthesis or any other shell-vulnerable letters in the x or y axis labels
    # (whichColumnPlotLabel, xColumnPlotLabel); this applies to every plot job below.
    outputFile = File(os.path.join(plotOutputDir, 'freeMix_Hist.png'))
    self.addDrawHistogramJob(workflow=workflow, executable=workflow.DrawHistogram,
                             inputFileList=[selfSampleMixupMergeFile], outputFile=outputFile,
                             whichColumn=None, whichColumnHeader="FREEMIX", whichColumnPlotLabel="mixFraction",
                             logY=False, logCount=True, valueForNonPositiveYValue=-1,
                             minNoOfTotal=10, figureDPI=100, samplingRate=1,
                             parentJobLs=[plotOutputDirJob, selfSampleMixupMergeJob],
                             extraDependentInputLs=None,
                             extraArguments=None, transferOutput=transferOutput, job_max_memory=2000)
    no_of_jobs += 1

    outputFile = File(os.path.join(plotOutputDir, 'refProbGivenHet_Hist.png'))
    self.addDrawHistogramJob(workflow=workflow, executable=workflow.DrawHistogram,
                             inputFileList=[selfSampleMixupMergeFile], outputFile=outputFile,
                             whichColumn=None, whichColumnHeader="FREE_RH",
                             whichColumnPlotLabel="refAlleleProbGivenHet",
                             logY=False, logCount=True, valueForNonPositiveYValue=-1,
                             minNoOfTotal=10, figureDPI=100, samplingRate=1,
                             parentJobLs=[plotOutputDirJob, selfSampleMixupMergeJob],
                             extraDependentInputLs=None,
                             extraArguments=None, transferOutput=transferOutput, job_max_memory=2000)
    no_of_jobs += 1

    outputFile = File(os.path.join(plotOutputDir, 'refProbGivenAlternative_Hist.png'))
    self.addDrawHistogramJob(workflow=workflow, executable=workflow.DrawHistogram,
                             inputFileList=[selfSampleMixupMergeFile], outputFile=outputFile,
                             whichColumn=None, whichColumnHeader="FREE_RA",
                             whichColumnPlotLabel="refAlleleProbGivenAlternativeAllele",
                             logY=False, logCount=True, valueForNonPositiveYValue=-1,
                             minNoOfTotal=10, figureDPI=100, samplingRate=1,
                             parentJobLs=[plotOutputDirJob, selfSampleMixupMergeJob],
                             extraDependentInputLs=None,
                             extraArguments=None, transferOutput=transferOutput, job_max_memory=2000)
    no_of_jobs += 1

    outputFile = File(os.path.join(plotOutputDir, 'freeMix_vs_chipMix.png'))
    self.addAbstractPlotJob(workflow=workflow, executable=workflow.AbstractPlot,
                            inputFileList=[selfSampleMixupMergeJob.output], outputFile=outputFile,
                            whichColumn=None, whichColumnHeader="FREEMIX", whichColumnPlotLabel="mixFractionByHet",
                            logY=False, valueForNonPositiveYValue=50,
                            xColumnHeader="CHIPMIX", xColumnPlotLabel="mixAgainstSelfGenotype",
                            minNoOfTotal=5, figureDPI=150, samplingRate=1,
                            parentJobLs=[plotOutputDirJob, selfSampleMixupMergeJob],
                            extraDependentInputLs=None,
                            extraArguments=None, transferOutput=transferOutput, job_max_memory=2000)
    no_of_jobs += 1

    outputFile = File(os.path.join(plotOutputDir, 'freeMix_vs_AVG_DP.png'))
    self.addAbstractPlotJob(workflow=workflow, executable=workflow.AbstractPlot,
                            inputFileList=[selfSampleMixupMergeJob.output], outputFile=outputFile,
                            whichColumn=None, whichColumnHeader="FREEMIX", whichColumnPlotLabel="mixFractionByHet",
                            logY=False, valueForNonPositiveYValue=50,
                            xColumnHeader="AVG_DP", xColumnPlotLabel="avgDepth",
                            minNoOfTotal=5, figureDPI=150, samplingRate=1,
                            parentJobLs=[plotOutputDirJob, selfSampleMixupMergeJob],
                            extraDependentInputLs=None,
                            extraArguments=None, transferOutput=transferOutput, job_max_memory=2000)
    no_of_jobs += 1

    outputFile = File(os.path.join(plotOutputDir, 'chipMix_vs_AVG_DP.png'))
    self.addAbstractPlotJob(workflow=workflow, executable=workflow.AbstractPlot,
                            inputFileList=[selfSampleMixupMergeJob.output], outputFile=outputFile,
                            whichColumn=None, whichColumnHeader="CHIPMIX",
                            whichColumnPlotLabel="mixAgainstSelfGenotype",
                            logY=False, valueForNonPositiveYValue=50,
                            xColumnHeader="AVG_DP", xColumnPlotLabel="avgDepth",
                            minNoOfTotal=5, figureDPI=150, samplingRate=1,
                            parentJobLs=[plotOutputDirJob, selfSampleMixupMergeJob],
                            extraDependentInputLs=None,
                            extraArguments=None, transferOutput=transferOutput, job_max_memory=2000)
    no_of_jobs += 1

    outputFile = File(os.path.join(plotOutputDir, 'freeMix_vs_refProbGivenHet.png'))
    self.addAbstractPlotJob(workflow=workflow, executable=workflow.AbstractPlot,
                            inputFileList=[selfSampleMixupMergeJob.output], outputFile=outputFile,
                            whichColumn=None, whichColumnHeader="FREEMIX", whichColumnPlotLabel="mixFraction",
                            logY=False, valueForNonPositiveYValue=50,
                            xColumnHeader="FREE_RH", xColumnPlotLabel="refAlleleProbGivenHet",
                            minNoOfTotal=5, figureDPI=150, samplingRate=1,
                            parentJobLs=[plotOutputDirJob, selfSampleMixupMergeJob],
                            extraDependentInputLs=None,
                            extraArguments=None, transferOutput=transferOutput, job_max_memory=2000)
    no_of_jobs += 1

    outputFile = File(os.path.join(plotOutputDir, 'freeMix_vs_refProbGivenAlternativeAllele.png'))
    self.addAbstractPlotJob(workflow=workflow, executable=workflow.AbstractPlot,
                            inputFileList=[selfSampleMixupMergeJob.output], outputFile=outputFile,
                            whichColumn=None, whichColumnHeader="FREEMIX", whichColumnPlotLabel="mixFraction",
                            logY=False, valueForNonPositiveYValue=50,
                            xColumnHeader="FREE_RA", xColumnPlotLabel="refAlleleProbGivenAlternativeAllele",
                            minNoOfTotal=5, figureDPI=150, samplingRate=1,
                            parentJobLs=[plotOutputDirJob, selfSampleMixupMergeJob],
                            extraDependentInputLs=None,
                            extraArguments=None, transferOutput=transferOutput, job_max_memory=2000)
    no_of_jobs += 1

    outputFile = File(os.path.join(plotOutputDir, 'refAlleleProbGivenHet_vs_refProbGivenAlternativeAllele.png'))
    self.addAbstractPlotJob(workflow=workflow, executable=workflow.AbstractPlot,
                            inputFileList=[selfSampleMixupMergeJob.output], outputFile=outputFile,
                            whichColumn=None, whichColumnHeader="FREE_RH",
                            whichColumnPlotLabel="refAlleleProbGivenHet",
                            logY=False, valueForNonPositiveYValue=-1,
                            xColumnHeader="FREE_RA", xColumnPlotLabel="refAlleleProbGivenAlternativeAllele",
                            minNoOfTotal=5, figureDPI=150, samplingRate=1,
                            parentJobLs=[plotOutputDirJob, selfSampleMixupMergeJob],
                            extraDependentInputLs=None,
                            extraArguments=None, transferOutput=transferOutput, job_max_memory=2000)
    no_of_jobs += 1

    # subtract FREELK1 - FREELK0
    likSubstractedSelfSampleMixupMergeFile = File(os.path.join(mergedOutputDir, 'selfSMMerge_likDelta.tsv'))
    likSubstractedSelfSampleMixupMergeJob = self.addStatMergeJob(workflow,
        statMergeProgram=workflow.ReduceMatrixBySumSameKeyColsAndThenDivide,
        outputF=likSubstractedSelfSampleMixupMergeFile, transferOutput=False,
        extraArguments="--operatorType 2 -k 0 -v 7,8,6")
    # column 0 is the sample ID; column 6 is FREEMIX, 7 is FREELK1, 8 is FREELK0
    self.addInputToStatMergeJob(workflow, statMergeJob=likSubstractedSelfSampleMixupMergeJob,
                                inputF=selfSampleMixupMergeJob.output,
                                parentJobLs=[selfSampleMixupMergeJob])
    returnData.jobDataLs.append(PassingData(jobLs=[likSubstractedSelfSampleMixupMergeJob],
                                            fileLs=[likSubstractedSelfSampleMixupMergeFile]))
    no_of_jobs += 1

    outputFile = File(os.path.join(plotOutputDir, 'freeMix_vs_deltaMinusLogLikelihood.png'))
    self.addAbstractPlotJob(workflow=workflow, executable=workflow.AbstractPlot,
                            inputFileList=[likSubstractedSelfSampleMixupMergeJob.output], outputFile=outputFile,
                            whichColumn=None, whichColumnHeader="FREEMIX", whichColumnPlotLabel="mixFraction",
                            logY=False, valueForNonPositiveYValue=-1,
                            xColumnHeader="FREELK1_by_FREELK0", xColumnPlotLabel="deltaMinusLogLikelihood",
                            minNoOfTotal=5, figureDPI=150, samplingRate=1,
                            parentJobLs=[plotOutputDirJob, likSubstractedSelfSampleMixupMergeJob],
                            extraDependentInputLs=None,
                            extraArguments=None, transferOutput=transferOutput, job_max_memory=2000)
    no_of_jobs += 1

    selfRGMixupMergeFile = File(os.path.join(mergedOutputDir, 'selfRGMerge.tsv'))
    selfRGMixupMergeJob = self.addStatMergeJob(workflow, statMergeProgram=workflow.mergeSameHeaderTablesIntoOne,
                                               outputF=selfRGMixupMergeFile, transferOutput=False)
    returnData.jobDataLs.append(PassingData(jobLs=[selfRGMixupMergeJob],
                                            fileLs=[selfRGMixupMergeFile]))
    no_of_jobs += 1

    # alignmentId2RGJobDataLs = returnData.alignmentId2RGJobDataLs
    i = 0
    for alignmentData in alignmentDataLs:
        alignment = alignmentData.alignment
        bamF = alignmentData.bamF
        baiF = alignmentData.baiF
        if i == 0:  # need at least one log file
            transferOutputForThisJob = True
        else:
            transferOutputForThisJob = False
        i += 1
        outputFnamePrefix = os.path.join(topOutputDir, alignment.getReadGroup())
        # pass transferOutput to the job so as to keep the log
        verifyBamIDJob = self.addVerifyBamIDJob(executable=self.verifyBamID, inputVCF=inputVCF, inputBAM=bamF,
                                                outputFnamePrefix=outputFnamePrefix,
                                                doFreeFull=True,
                                                doChipMix=None, doChipFull=None, doChipRefBias=None, doChipNone=None,
                                                minAF=0.01, genoError=1e-03, minCallRate=0.50,
                                                minMapQ=20, maxDepth=int(3 * alignment.median_depth), minQ=13, maxQ=40,
                                                parentJobLs=[topOutputDirJob] + alignmentData.jobLs,
                                                extraDependentInputLs=[baiF],
                                                transferOutput=transferOutputForThisJob,
                                                extraArguments=None, job_max_memory=5000)
        no_of_jobs += 1
        self.addInputToStatMergeJob(workflow, statMergeJob=selfSampleMixupMergeJob, inputF=verifyBamIDJob.selfSMFile,
                                    parentJobLs=[verifyBamIDJob])
        self.addInputToStatMergeJob(workflow, statMergeJob=selfRGMixupMergeJob, inputF=verifyBamIDJob.selfRGFile,
                                    parentJobLs=[verifyBamIDJob])
    sys.stderr.write("%s jobs.\n" % (no_of_jobs))

    # 2012.8.30 gzip the final output
    newReturnData = self.addGzipSubWorkflow(workflow=workflow, inputData=returnData, transferOutput=transferOutput,
                                            outputDirPrefix="%smergedOutputGzip" % (outputDirPrefix))
    return newReturnData
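# Standalone sketch, illustration only (not part of the original module): per the "-k 0 -v 7,8,6"
# configuration and the column comment above, a *.selfSM row keyed by SEQ_ID (column 0) carries
# FREEMIX in column 6, FREELK1 in column 7 and FREELK0 in column 8; the "subtract FREELK1 - FREELK0"
# step described above boils down, per row, to something like this (helper name is hypothetical).
def _freeLKDelta(selfSMRow):
    """Given one *.selfSM data row split into fields, return FREELK1 - FREELK0."""
    return float(selfSMRow[7]) - float(selfSMRow[8])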
def addJobs(self, db_250k=None, callMethodID2Data=None, kinshipFile=None, eigenVectorFile=None,
            phenotype_method_id_ls=[],
            analysis_method_id_ls=[], genotypeFileToGenerateKinship=None, data_dir=None, getPublicPhenotype=False,
            commit=False,
            transferOutput=True, needSSHDBTunnel=False, outputDirPrefix=""):
    """
    2013.1.7 use callMethod.locus_type_id to decide whether noSNPAlleleOrdinalConversion should be toggled or not
    2012.9.28 add argument getPublicPhenotype
    2012.6.5
    """
    sys.stderr.write("Adding association jobs for %s polymorphism data ... " % (len(callMethodID2Data)))
    returnData = PassingData()
    returnData.jobDataLs = []

    topOutputDir = "%sAssociation" % (outputDirPrefix)
    topOutputDirJob = yh_pegasus.addMkDirJob(workflow=self, mkdir=self.mkdirWrap, outputDir=topOutputDir)

    phenotypeFile = File(os.path.join(topOutputDir, 'phenotype.tsv'))
    outputPhenotypeJob = self.addOutputPhenotypeJob(executable=self.OutputPhenotype, outputFile=phenotypeFile,
                                                    getRawPhenotypeData=False, getPublicPhenotype=getPublicPhenotype,
                                                    parentJobLs=[topOutputDirJob], transferOutput=True,
                                                    job_max_memory=2000,
                                                    sshDBTunnel=needSSHDBTunnel)

    locusMapFile = File(os.path.join(topOutputDir, 'locusMap.h5'))
    locusMapJob = self.addStock_250kDBJob(executable=self.Stock_250kDB, outputFile=locusMapFile, run_type=2,
                                          parentJobLs=[topOutputDirJob], extraDependentInputLs=None,
                                          transferOutput=False,
                                          extraArguments=None, job_max_memory=2000, sshDBTunnel=needSSHDBTunnel)

    for analysis_method_id in analysis_method_id_ls:
        analysisMethod = Stock_250kDB.AnalysisMethod.get(analysis_method_id)
        if not analysisMethod:
            sys.stderr.write("Warning: analysis_method_id %s not in db. Skip.\n" % (analysis_method_id))
            continue
        for phenotype_method_id in phenotype_method_id_ls:
            phenotypeMethod = Stock_250kDB.PhenotypeMethod.get(phenotype_method_id)
            if not phenotypeMethod:
                sys.stderr.write("Warning: phenotype_method_id %s not in db. Skip.\n" % (phenotype_method_id))
                continue
            for callMethodID, callMethodData in callMethodID2Data.iteritems():
                test_type = analysisMethod.association_test_type
                if not test_type:
                    sys.stderr.write("Warning: analysis method %s has no association test_type (%s). Skip.\n" %
                                     (analysis_method_id, test_type))
                    continue
                # 2012.9.28 skip if the association result is already in db
                rm = db_250k.checkResultsMethod(call_method_id=callMethodID,
                                                phenotype_method_id=phenotype_method_id,
                                                analysis_method_id=analysis_method_id,
                                                cnv_method_id=None)
                if rm:
                    sys.stderr.write("Warning: skip association for c=%s, p=%s, a=%s.\n" %
                                     (callMethodID, phenotype_method_id, analysis_method_id))
                    continue
                outputFile = File(os.path.join(topOutputDir, '%s_%s_%s.h5' % (callMethodID, phenotype_method_id,
                                                                              analysis_method_id)))
                if callMethodData.db_entry.locus_type_id == 2:
                    # the CNV dataset is already in 0,1 binary format; no conversion needed
                    noSNPAlleleOrdinalConversion = 1
                    inputMissingGenotypeNotationType = 2
                else:
                    noSNPAlleleOrdinalConversion = 0
                    inputMissingGenotypeNotationType = 1
                associationJob = self.addAssociationJob(executable=self.Association,
                                                        datasetFile=callMethodData.datasetFile,
                                                        phenotypeFile=outputPhenotypeJob.output,
                                                        phenotype_method_id=phenotype_method_id,
                                                        outputFile=outputFile, kinshipFile=kinshipFile,
                                                        eigenVectorFile=eigenVectorFile,
                                                        genotypeFileToGenerateKinship=genotypeFileToGenerateKinship,
                                                        locusMapFile=locusMapJob.output,
                                                        test_type=test_type,
                                                        min_data_point=self.min_data_point,
                                                        noSNPAlleleOrdinalConversion=noSNPAlleleOrdinalConversion,
                                                        which_PC_index_ls=self.which_PC_index_ls,
                                                        inputMissingGenotypeNotationType=inputMissingGenotypeNotationType,
                                                        parentJobLs=[outputPhenotypeJob, locusMapJob],
                                                        job_max_memory=3500, walltime=200,
                                                        extraDependentInputLs=None, transferOutput=False)
                logFile = File(os.path.join(topOutputDir, '%s_%s_%s_2DB.log' % (callMethodID, phenotype_method_id,
                                                                                analysis_method_id)))
                result2DBJob = self.addResult2DBJob(executable=self.Results2DB_250k, inputFile=associationJob.output,
                                                    call_method_id=callMethodID,
                                                    phenotype_method_id=phenotype_method_id,
                                                    analysis_method_id=analysis_method_id, data_dir=data_dir,
                                                    results_method_type_id=1,
                                                    logFile=logFile, commit=commit,
                                                    parentJobLs=[associationJob], transferOutput=transferOutput,
                                                    job_max_memory=500, sshDBTunnel=needSSHDBTunnel)
                returnData.jobDataLs.append(PassingData(jobLs=[result2DBJob], file=logFile,
                                                        fileList=[logFile]))
    sys.stderr.write("%s jobs.\n" % (self.no_of_jobs))
    return returnData
def addJobs(self, workflow=None, result_peak_ls=None, inputData=None, datasetName=None, chunkSize=None,
            pegasusFolderName=""):
    """
    2012.3.3
    """
    if workflow is None:
        workflow = self
    returnJobData = PassingData()
    no_of_jobs = 0

    topOutputDir = pegasusFolderName
    topOutputDirJob = yh_pegasus.addMkDirJob(workflow, mkdir=workflow.mkdirWrap, outputDir=topOutputDir)
    no_of_jobs += 1

    biology_category_id2peak_data = {}
    no_of_peaks = 0
    for row in result_peak_ls:
        biology_category_id = row.result.phenotype_method.biology_category_id
        if biology_category_id not in biology_category_id2peak_data:
            biology_category_id2peak_data[biology_category_id] = PassingData(job=None, result_peak_ls=[])
            # add a mkdir job
            folderName = os.path.join(topOutputDir, 'biology_category_%s' % (biology_category_id))
            folderJob = yh_pegasus.addMkDirJob(workflow, mkdir=workflow.mkdirWrap, outputDir=folderName,
                                               parentJobLs=[topOutputDirJob])
            biology_category_id2peak_data[biology_category_id].job = folderJob
            no_of_jobs += 1
        biology_category_id2peak_data[biology_category_id].result_peak_ls.append(row)
        no_of_peaks += 1
    sys.stderr.write("%s peaks. %s biology categories.\n" % (no_of_peaks, len(biology_category_id2peak_data)))

    sys.stderr.write("Finding max LD between one type of loci and peaks on %s input correlation files ... " %
                     (len(inputData.jobDataLs)))
    prevReportedStr = ""
    for biology_category_id, peak_data in biology_category_id2peak_data.iteritems():
        # identify the proper output folder & its creation job
        outputDirJob = peak_data.job
        for peak in peak_data.result_peak_ls:
            outputFile = File(os.path.join(outputDirJob.folder, 'peak_%s_loci.h5' % (peak.id)))
            peakLociOutputJob = self.addOutputLociIDOfResultPeakInHDF5Job(workflow,
                executable=workflow.OutputLociIDOfResultPeakInHDF5,
                peak_id=peak.id, outputFile=outputFile,
                parentJobLs=[outputDirJob], extraDependentInputLs=[],
                transferOutput=False, extraArguments=None,
                job_max_memory=200)
            no_of_jobs += 1

            peakCorrelationFile = File(os.path.join(outputDirJob.folder,
                                                    'maxCorrelationBetweenFirstLociAndPeak%s.h5' % (peak.id)))
            maxLDJob = self.mapMaxLDJobsGivenInputData(workflow, inputData=inputData, datasetName=datasetName,
                peak_id=peak.id,
                peakLociH5File=peakLociOutputJob.output, outputFile=peakCorrelationFile, outputDirJob=outputDirJob,
                chunkSize=chunkSize, parentJobLs=[peakLociOutputJob], extraDependentInputLs=[], transferOutput=True,
                extraArguments=None,
                job_max_memory=200)
            no_of_jobs += maxLDJob.no_of_jobs

            # final output filename is call_method_biology_category_phenotype_analysis_chr_start_stop_peak_id
            outputFnamePrefix = os.path.join(outputDirJob.folder,
                'call_%s_category_%s_phenotype_%s_%s_analysis_%s_chr_%s_%s_%s_%s' % (
                    peak.result.call_method_id, peak.result.phenotype_method.biology_category_id,
                    peak.result.phenotype_method.id,
                    peak.result.phenotype_method.getProperShortName(), peak.result.analysis_method.id,
                    peak.chromosome, peak.start, peak.stop, peak.id))
            outputFile = File("%s.png" % (outputFnamePrefix))
            plotJob = self.addDrawManhattanPlotForLDInHDF5Job(workflow,
                executable=workflow.DrawManhattanPlotForLDInHDF5,
                correlationFile=maxLDJob.output, peak_id=peak.id,
                datasetName=datasetName, outputFile=outputFile,
                outputFnamePrefix=outputFnamePrefix, parentJobLs=[maxLDJob],
                extraDependentInputLs=[], transferOutput=True, extraArguments=None,
                job_max_memory=300)
            no_of_jobs += 1
            if no_of_jobs % 2 == 0:
                sys.stderr.write("%s%s" % ("\x08" * len(prevReportedStr), no_of_jobs))
                prevReportedStr = str(no_of_jobs)
    sys.stderr.write(" %s jobs.\n" % (no_of_jobs))
    return no_of_jobs
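# Standalone sketch, illustration only (not part of the original module): the bucketing above groups
# peaks by the biology category of their phenotype before any per-category folder or LD job is added.
# A plain-dict equivalent of that grouping (helper name is hypothetical):
def _groupPeaksByBiologyCategory(result_peak_ls):
    """Return {biology_category_id: [peak, ...]} mirroring biology_category_id2peak_data."""
    categoryID2peaks = {}
    for peak in result_peak_ls:
        categoryID = peak.result.phenotype_method.biology_category_id
        categoryID2peaks.setdefault(categoryID, []).append(peak)
    return categoryID2peaks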
def run(self): """ 2011-9-28 """ if self.debug: import pdb pdb.set_trace() db_250k = Stock_250kDB.Stock_250kDB(drivername=self.drivername, username=self.db_user, password=self.db_passwd, \ hostname=self.hostname, database=self.dbname) db_250k.setup(create_tables=False) sameCategoryPhenotypeMethodLs = db_250k.getPhenotypeMethodLsGivenBiologyCategoryID(self.biology_category_id, access=self.access) sameCategoryPhenotypeMethodIDLs = [pm.id for pm in sameCategoryPhenotypeMethodLs] #merge the two lists of phenotype method id together phenotype_method_id_ls = list(set(self.phenotype_method_id_ls + sameCategoryPhenotypeMethodIDLs)) result1_query = db_250k.getResultLs(call_method_id=self.call_method1_id, analysis_method_id_ls=[self.analysis_method1_id], \ phenotype_method_id_ls=phenotype_method_id_ls, cnv_method_id=self.cnv_method1_id) result2_query = db_250k.getResultLs(call_method_id=self.call_method2_id, analysis_method_id_ls=[self.analysis_method2_id], \ phenotype_method_id_ls=phenotype_method_id_ls, cnv_method_id=self.cnv_method2_id) result1_id_ls = [] for result in result1_query: result1_id_ls.append(result.id) result2_id_ls = [] for result in result2_query: result2_id_ls.append(result.id) #make sure the entries with (result_id, self.association_peak_type_id) exists in AssociationPeak result1_id_ls = db_250k.filterResultIDLsBasedOnAssociationPeak(result1_id_ls, self.association1_peak_type_id) result2_id_ls = db_250k.filterResultIDLsBasedOnAssociationPeak(result2_id_ls, self.association2_peak_type_id) phenotype_method_id2result1_id = self.getPhenotypeMethodId2ResultID(result1_id_ls) phenotype_method_id2result2_id = self.getPhenotypeMethodId2ResultID(result2_id_ls) phenotype_method_id2result_id_pair = {} for phenotype_method_id , result1_id in phenotype_method_id2result1_id.iteritems(): if phenotype_method_id in phenotype_method_id2result2_id: result2_id = phenotype_method_id2result2_id.get(phenotype_method_id) phenotype_method_id2result_id_pair[phenotype_method_id] = [result1_id, result2_id] # Create a abstract dag workflowName = os.path.splitext(os.path.basename(self.outputFname))[0] workflow = self.initiateWorkflow(workflowName) self.registerExecutables(workflow) self.registerCustomExecutables(workflow) counter = 0 overlapStatDir = "overlapStat" overlapStatDirJob = yh_pegasus.addMkDirJob(workflow, mkdir=workflow.mkdirWrap, outputDir=overlapStatDir) counter += 1 """ overlapPlotDir = "overlapPlot" overlapPlotDirJob = yh_pegasus.addMkDirJob(workflow, mkdir=workflow.mkdirWrap, outputDir=overlapPlotDir) outputFnamePrefix = os.path.join(overlapPlotDir, 'cm%s_cnvM%s_am%s_peakType%s_vs_cm%s_cnvM%s_am%s_peakType%s_biologyCategory%s_overlapPeak'%\ (self.call_method1_id, self.cnv_method1_id, self.analysis_method1_id, self.association1_peak_type_id, \ self.call_method2_id, self.cnv_method2_id, self.analysis_method2_id, self.association2_peak_type_id, \ self.biology_category_id)) plotAssociationPeakOverlapJob = slef.addPlotPeakOverlapJob(workflow, executable=workflow.plotAssociationPeakOverlapJob, \ outputFnamePrefix=outputFnamePrefix, \ parentJobLs=[overlapPlotDirJob], job_max_memory=100, walltime = 60, \ extraDependentInputLs=[], \ transferOutput=True) """ #each contig in each trio gets a summary. 
	peakOverlapStatMergeFile = File('peak_overlap_stat.tsv')
	peakOverlapStatMergeJob = self.addStatMergeJob(workflow, statMergeProgram=workflow.mergeSameHeaderTablesIntoOne, \
					outputF=peakOverlapStatMergeFile, transferOutput=True, parentJobLs=[])
	counter += 1

	no_of_input = 0
	for phenotype_method_id, result_id_pair in phenotype_method_id2result_id_pair.iteritems():
		result1_id = result_id_pair[0]
		result2_id = result_id_pair[1]
		outputFnamePrefix = 'result_%s_vs_%s_peak_type_%s_vs_%s'%(result1_id, result2_id, \
						self.association1_peak_type_id, self.association2_peak_type_id)
		outputF = File(os.path.join(overlapStatDir, '%s.tsv'%(outputFnamePrefix)))
		if no_of_input==0:
			#add one random input, otherwise a replica catalog error occurs
			rm1 = Stock_250kDB.ResultsMethod.get(result1_id)
			inputFile1 = self.registerOneInputFile(workflow, rm1.filename)
			extraDependentInputLs = [inputFile1]
		else:
			extraDependentInputLs = []
		no_of_input += 1
		gwasPeakOverlapJob = self.addGWASPeakOverlapJob(workflow, executable=workflow.twoGWASPeakOverlap, \
					result1_id=result1_id, result2_id=result2_id, association1_peak_type_id=self.association1_peak_type_id, \
					association2_peak_type_id=self.association2_peak_type_id, peak_padding=self.peak_padding, \
					outputF=outputF, \
					commit=1, results_directory=None, logFile=None, \
					parentJobLs=[overlapStatDirJob], job_max_memory=100, walltime=60, \
					extraDependentInputLs=extraDependentInputLs, \
					transferOutput=True)
		counter += 1

		self.addInputToStatMergeJob(workflow, statMergeJob=peakOverlapStatMergeJob, \
					inputF=outputF, parentJobLs=[gwasPeakOverlapJob])
		"""
		self.addInputToStatMergeJob(workflow, statMergeJob=plotAssociationPeakOverlapJob, inputF=outputF, \
					parentJobLs=[gwasPeakOverlapJob])
		"""
	sys.stderr.write("%s jobs.\n"%(counter))

	# Write the DAX to the output file
	outf = open(self.outputFname, 'w')
	workflow.writeXML(outf)
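# Illustrative sketch only (a hypothetical helper, not called by run() above):
# shows the pairing step that run() performs, intersecting two
# {phenotype_method_id: result_id} maps into {phenotype_method_id: [result1_id, result2_id]}
# before any peak-overlap jobs are added.
def _demoPairResultsByPhenotype(phenotype_method_id2result1_id, phenotype_method_id2result2_id):
	"""
	Hypothetical demo of the result-pairing logic in run(); the argument
	names mirror the local variables used there.
	"""
	phenotype_method_id2result_id_pair = {}
	for phenotype_method_id, result1_id in phenotype_method_id2result1_id.iteritems():
		if phenotype_method_id in phenotype_method_id2result2_id:
			result2_id = phenotype_method_id2result2_id.get(phenotype_method_id)
			phenotype_method_id2result_id_pair[phenotype_method_id] = [result1_id, result2_id]
	return phenotype_method_id2result_id_pair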
def run(self): """ 2012.2.8 """ if self.debug: import pdb pdb.set_trace() db_vervet = VervetDB.VervetDB(drivername=self.drivername, db_user=self.db_user, db_passwd=self.db_passwd, hostname=self.hostname, dbname=self.dbname, schema=self.schema) db_vervet.setup(create_tables=False) session = db_vervet.session session.begin() if not self.data_dir: self.data_dir = db_vervet.data_dir if not self.local_data_dir: self.local_data_dir = db_vervet.data_dir workflowName = os.path.splitext(os.path.basename(self.outputFname))[0] workflow = self.initiateWorkflow(workflowName) self.registerJars(workflow) self.registerExecutables(workflow) self.registerCustomExecutables(workflow) isq_ls = self.fetchIndividualSequenceFromDB(db_vervet, self.isq_id_ls) no_of_jobs = 1 individualSequenceID2FilePairLs = db_vervet.getIndividualSequenceID2FilePairLs([isq.id for isq in isq_ls], \ data_dir=self.local_data_dir, checkOldPath=True) for individualSequenceID, FilePairLs in individualSequenceID2FilePairLs.iteritems(): individual_sequence = VervetDB.IndividualSequence.get(individualSequenceID) newISQPath = individual_sequence.constructRelativePathForIndividualSequence() #newISQPath = '%s_split'%(newISQPath) if individual_sequence.path !=newISQPath: individual_sequence.path = newISQPath session.add(individual_sequence) session.flush() sequenceOutputDir = os.path.join(self.data_dir, individual_sequence.path) sequenceOutputDirJob = yh_pegasus.addMkDirJob(workflow, mkdir=workflow.mkdirWrap, outputDir=sequenceOutputDir) for filePair in FilePairLs: for fileRecord in filePair: filename = fileRecord[0] absPath = os.path.join(self.local_data_dir, filename) fastqFile = self.registerOneInputFile(workflow, absPath) library, mate_id = self.parseLibraryMateIDFromFilename(filename)[:2] if library is None: sys.stderr.write("Warning: can't parse library out of file %s of isq %s & skip.\n"%(filename, individualSequenceID)) continue if mate_id: prefix = '%s_%s'%(library, mate_id) else: prefix = library outputFilenamePrefix = '%s_%s'%(individual_sequence.id, prefix) splitOutputDir = outputFilenamePrefix splitOutputDirJob = yh_pegasus.addMkDirJob(workflow, mkdir=workflow.mkdirWrap, outputDir=splitOutputDir) splitFastQFnamePrefix = os.path.join(splitOutputDir, outputFilenamePrefix) logFile = File('%s_%s.split.log'%(individual_sequence.id, prefix)) splitReadFileJob1 = self.addSplitReadFileJob(workflow, executable=workflow.splitReadFile, \ inputF=fastqFile, outputFnamePrefix=splitFastQFnamePrefix, \ outputFnamePrefixTail="", minNoOfReads=self.minNoOfReads, \ logFile=logFile, parentJobLs=[splitOutputDirJob], \ job_max_memory=2000, walltime = 800, \ extraDependentInputLs=[], transferOutput=True) logFile = File('%s_%s.register.log'%(individual_sequence.id, prefix)) registerJob1 = self.addRegisterAndMoveSplitFileJob(workflow, executable=workflow.registerAndMoveSplitSequenceFiles, \ inputDir=splitOutputDir, outputDir=sequenceOutputDir, relativeOutputDir=individual_sequence.path, logFile=logFile,\ individual_sequence_id=individual_sequence.id, bamFile=None, library=library, mate_id=mate_id, \ parentJobLs=[splitReadFileJob1, sequenceOutputDirJob], job_max_memory=100, walltime = 60, \ commit=self.commit, extraDependentInputLs=[], \ transferOutput=True) no_of_jobs += 3 sys.stderr.write("%s jobs.\n"%(no_of_jobs)) # Write the DAX to stdout outf = open(self.outputFname, 'w') workflow.writeXML(outf) if self.commit: session.commit() else: session.rollback()