def addSplitFastaFileJobs(self, workflow, refFastaF, SelectAndSplitFastaRecords, fastaTitleLs, mkdirWrap=None,
                          site_handler=None, namespace='workflow', version='1.0', fastaOutputDir="fasta"):
    """
    2011-7-25
        split the whole fasta file into files, each containing one fasta record (from fastaTitleLs)
        return the data
    """
    sys.stderr.write("Adding job to split %s into %s records ..." % (refFastaF.name, len(fastaTitleLs)))
    # Add a mkdir job
    mkDirJob = yh_pegasus.addMkDirJob(workflow, mkdir=mkdirWrap, outputDir=fastaOutputDir,
                                      namespace=namespace, version=version)

    selectAndSplitFastaJob = Job(namespace=namespace, name=SelectAndSplitFastaRecords.name, version=version)
    selectAndSplitFastaJob.addArguments('-i', refFastaF, "-o", fastaOutputDir)
    selectAndSplitFastaJob.uses(refFastaF, transfer=True, register=True, link=Link.INPUT)
    workflow.addJob(selectAndSplitFastaJob)
    workflow.depends(parent=mkDirJob, child=selectAndSplitFastaJob)

    refName2jobDataLs = {}
    for refName in fastaTitleLs:
        if refName not in refName2jobDataLs:
            refName2jobDataLs[refName] = []
        selectAndSplitFastaJob.addArguments(refName)
        fastaFname = os.path.join(fastaOutputDir, '%s.fasta' % (refName))
        fastaFile = File(fastaFname)
        selectAndSplitFastaJob.uses(fastaFile, transfer=False, register=True, link=Link.OUTPUT)
        refName2jobDataLs[refName] = [selectAndSplitFastaJob, fastaFile]
    sys.stderr.write("Done.\n")
    return PassingData(refName2jobDataLs=refName2jobDataLs, workflow=workflow)
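# Standalone sketch, illustration only (not part of the original module): callers of
# addSplitFastaFileJobs() unpack the returned PassingData per fasta record, the way the
# nucmer run() further below does with refName2splitFastaJobDataLs. The helper name is hypothetical.
def _iterSplitFastaJobData(refName2jobDataLs):
    """Yield (refName, selectAndSplitFastaJob, fastaFile) from the returned mapping."""
    for refName, jobDataLs in refName2jobDataLs.items():
        selectAndSplitFastaJob, fastaFile = jobDataLs[:2]
        yield refName, selectAndSplitFastaJob, fastaFile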
def addJobs(self, inputURL=None, relativePathList=[], outputDir="", username=None, password=None,
            transferOutput=True):
    """
    2012.6.27
    """
    sys.stderr.write("Adding wget jobs for %s input ... " % (len(relativePathList)))
    no_of_jobs = 0

    topOutputDir = outputDir
    topOutputDirJob = yh_pegasus.addMkDirJob(self, mkdir=self.mkdirWrap, outputDir=topOutputDir)
    no_of_jobs += 1

    returnData = PassingData()
    returnData.jobDataLs = []
    for relativePath in relativePathList:
        # 2013.06.26 replace all "/" in relativePath in case it's a folder
        relativePathNoFolder = relativePath.replace('/', '_')
        logFile = File('%s.log' % (relativePathNoFolder))
        wgetJob = self.addWgetJob(executable=self.wget, url=inputURL, relativePath=relativePath,
                                  username=username, password=password,
                                  targetFolder=outputDir, logFile=logFile, cut_dir_number=self.cut_dir_number,
                                  parentJobLs=[topOutputDirJob], extraDependentInputLs=[],
                                  transferOutput=transferOutput,
                                  extraArguments=None, job_max_memory=50)
        # include the tfam (outputList[1]) into the fileLs
        returnData.jobDataLs.append(PassingData(jobLs=[wgetJob], file=wgetJob.output,
                                                fileLs=wgetJob.outputLs))
        no_of_jobs += 1
    sys.stderr.write("%s jobs.\n" % (no_of_jobs))
    return returnData
def addJobs(self, workflow, inputData=None, pegasusFolderName="", tmpDir="/tmp"):
    """
    2012.3.21
    """
    sys.stderr.write("Adding MarkDuplicates jobs on %s input datasets ..." % (len(inputData.jobDataLs)))
    returnJobData = PassingData(jobDataLs=[])

    topOutputDir = pegasusFolderName
    topOutputDirJob = yh_pegasus.addMkDirJob(workflow, mkdir=workflow.mkdirWrap, outputDir=topOutputDir)
    no_of_jobs = 1

    for jobData in inputData.jobDataLs:
        inputFile = jobData.output
        bamIndexJob = self.addBAMIndexJob(workflow, BuildBamIndexFilesJava=workflow.BuildBamIndexFilesJava,
                                          BuildBamIndexJar=workflow.BuildBamIndexJar,
                                          inputBamF=inputFile,
                                          parentJobLs=[topOutputDirJob] + jobData.jobLs, transferOutput=False)
        outputFname = self.getMarkDupOutputFnameBasedOnInputFname(inputFile.abspath)
        finalBamFileName = os.path.join(topOutputDir, outputFname)
        finalBamFile = File(finalBamFileName)
        markDupJob, markDupBamIndexJob = self.addMarkDupJob(workflow, parentJobLs=[bamIndexJob] + jobData.jobLs,
                                                            inputBamF=bamIndexJob.bamFile,
                                                            inputBaiF=bamIndexJob.output, outputBamFile=finalBamFile,
                                                            MarkDuplicatesJava=workflow.MarkDuplicatesJava,
                                                            MarkDuplicatesJar=workflow.MarkDuplicatesJar, tmpDir=tmpDir,
                                                            BuildBamIndexFilesJava=workflow.BuildBamIndexFilesJava,
                                                            BuildBamIndexJar=workflow.BuildBamIndexJar,
                                                            transferOutput=True)
        no_of_jobs += 3
        returnJobData.jobDataLs.append(PassingData(output=finalBamFile, jobLs=[markDupJob, markDupBamIndexJob]))
    sys.stderr.write("%s jobs.\n" % (no_of_jobs))
    return returnJobData
def addJobs(self, inputData=None, outputDirPrefix="", ntDatabaseFileList=None, noOfTotalSequences=None,
            transferOutput=True, makeBlastDBJob=None):
    """
    2012.5.24
    """
    sys.stderr.write("Adding blast jobs for %s input ... " % (len(inputData.jobDataLs)))
    no_of_jobs = 0

    topOutputDir = "%sBlast" % (outputDirPrefix)
    topOutputDirJob = yh_pegasus.addMkDirJob(self, mkdir=self.mkdirWrap, outputDir=topOutputDir)
    no_of_jobs += 1

    allBlastResultFile = File(os.path.join(topOutputDir, 'blast.tsv'))
    allBlastMergeJob = self.addStatMergeJob(statMergeProgram=self.mergeSameHeaderTablesIntoOne,
                                            outputF=allBlastResultFile, transferOutput=transferOutput,
                                            parentJobLs=[topOutputDirJob])
    no_of_jobs += 1

    ntDatabaseFile = ntDatabaseFileList[0]
    returnData = PassingData()
    returnData.jobDataLs = []

    for jobData in inputData.jobDataLs:
        inputF = jobData.output
        outputFnamePrefix = os.path.join(topOutputDir, os.path.splitext(os.path.basename(inputF.name))[0])

        splitFastaJob = self.addSplitFastaFileJob(executable=self.SplitFastaFile, inputFile=inputF,
                                                  outputFnamePrefix=outputFnamePrefix,
                                                  noOfSequencesPerSplitFile=self.blockSize, filenameSuffix=".fasta",
                                                  noOfTotalSequences=noOfTotalSequences,
                                                  parentJobLs=jobData.jobLs + [topOutputDirJob],
                                                  extraDependentInputLs=[], transferOutput=False,
                                                  extraArguments=None, job_max_memory=500)
        no_of_jobs += 1
        for splitFastaOutput in splitFastaJob.outputList:
            outputFile = File('%s.tsv' % (splitFastaOutput.name))
            blastJob = self.addBlastWrapperJob(executable=self.BlastWrapper, inputFile=splitFastaOutput,
                                               outputFile=outputFile,
                                               outputFnamePrefix=splitFastaOutput.name, databaseFile=ntDatabaseFile,
                                               maxNoOfMismatches=self.maxNoOfMismatches,
                                               minNoOfIdentities=self.minNoOfIdentities,
                                               minIdentityPercentage=self.minIdentityPercentage,
                                               blastallPath=self.blastallPath,
                                               parentJobLs=[splitFastaJob, makeBlastDBJob],
                                               extraDependentInputLs=ntDatabaseFileList, transferOutput=False,
                                               extraArguments=None, job_max_memory=1000)
            # add output to some reduce job
            self.addInputToStatMergeJob(statMergeJob=allBlastMergeJob,
                                        inputF=blastJob.output, parentJobLs=[blastJob])
            no_of_jobs += 1
    sys.stderr.write("%s jobs. Done.\n" % (no_of_jobs))
    # include the tfam (outputList[1]) into the fileLs
    returnData.jobDataLs.append(PassingData(jobLs=[allBlastMergeJob], file=allBlastResultFile,
                                            fileLs=[allBlastResultFile]))
    return returnData
def addJobs(self, workflow, inputData=None, db_vervet=None, smartpcaParameterFname="", pegasusFolderName="",
            maxContigID=None, missingCallAsRefBase=0, transferOutput=True):
    """
    2012.9.11 add argument missingCallAsRefBase
    2011.1.8
        add outputDirPrefix to differentiate one run from another if multiple trio call workflows
        are run simultaneously. outputDirPrefix could contain "/" to denote sub-folders.
    """
    sys.stderr.write("Adding smartpca jobs on %s VCFs (contig_id<=%s) ..." % (len(inputData.jobDataLs), maxContigID))
    returnJobData = PassingData()
    no_of_jobs = 0

    topOutputDir = pegasusFolderName
    topOutputDirJob = yh_pegasus.addMkDirJob(workflow, mkdir=workflow.mkdirWrap, outputDir=topOutputDir)

    smartpcaGenotypeInputFile = File(os.path.join(topOutputDir, "smartpca.geno"))
    smartpcaLocusInputFile = File(os.path.join(topOutputDir, "smartpca.snp"))
    smartpcaGenotypeMergeJob = self.addStatMergeJob(workflow, statMergeProgram=workflow.MergeFiles,
                                                    outputF=smartpcaGenotypeInputFile, transferOutput=transferOutput,
                                                    extraArguments="", parentJobLs=[topOutputDirJob])
    smartpcaLocusMergeJob = self.addStatMergeJob(workflow, statMergeProgram=workflow.MergeFiles,
                                                 outputF=smartpcaLocusInputFile, transferOutput=transferOutput,
                                                 extraArguments="", parentJobLs=[topOutputDirJob])
    smartpcaIndFile = None
    smartpcaIndJob = None
    no_of_jobs += 3

    for jobData in inputData.jobDataLs:
        inputF = jobData.vcfFile
        contig_id = self.getContigIDFromFname(inputF.name)
        try:
            if maxContigID:
                contig_id = int(contig_id)
                if contig_id > maxContigID:  # skip the small contigs
                    continue
        except:
            sys.stderr.write("Except type: %s\n" % repr(sys.exc_info()))
            import traceback
            traceback.print_exc()
        outputFnamePrefix = os.path.join(topOutputDir, "Contig%s" % (contig_id))
        convertJob = self.addConvertVCF2EigenStratJob(workflow, executable=workflow.ConvertVCF2EigenStrat,
                                                      inputF=inputF, outputFnamePrefix=outputFnamePrefix,
                                                      missingCallAsRefBase=missingCallAsRefBase,
                                                      parentJobLs=[topOutputDirJob] + jobData.jobLs,
                                                      extraDependentInputLs=[], transferOutput=False,
                                                      extraArguments=None, job_max_memory=100)
        if smartpcaIndFile is None:  # every VCF has the same order of individuals
            smartpcaIndFile = convertJob.indOutputF
            smartpcaIndJob = convertJob
        self.addInputToStatMergeJob(workflow, statMergeJob=smartpcaGenotypeMergeJob,
                                    inputF=convertJob.genoOutputF, parentJobLs=[convertJob])
        self.addInputToStatMergeJob(workflow, statMergeJob=smartpcaLocusMergeJob,
                                    inputF=convertJob.locusOutputF, parentJobLs=[convertJob])
        no_of_jobs += 1

    # smartpcaCorFile = File(os.path.join(topOutputDir, 'smartpca.cor'))
    smartpcaEvecFile = File(os.path.join(topOutputDir, "smartpca.evec"))
    smartpcaEvalFile = File(os.path.join(topOutputDir, "smartpca.eval"))
    self.outputSmartpcaParameters(smartpcaParameterFname=smartpcaParameterFname,
                                  smartpcaGenotypeInputFile=smartpcaGenotypeInputFile,
                                  smartpcaLocusInputFile=smartpcaLocusInputFile,
                                  smartpcaIndFile=smartpcaIndFile,
                                  smartpcaCorFile=None,
                                  smartpcaEvecFile=smartpcaEvecFile,
                                  smartpcaEvalFile=smartpcaEvalFile)
    smartpcaParameterFile = self.registerOneInputFile(workflow, smartpcaParameterFname, folderName=pegasusFolderName)
    smartpcaJob = self.addSmartpcaJob(workflow, executable=workflow.smartpca,
                                      smartpcaParameterFile=smartpcaParameterFile,
                                      parentJobLs=[smartpcaGenotypeMergeJob, smartpcaLocusMergeJob, smartpcaIndJob],
                                      extraDependentInputLs=[smartpcaGenotypeInputFile, smartpcaLocusInputFile,
                                                             smartpcaIndFile],
                                      transferOutput=transferOutput, extraArguments=None,
                                      outputFileList=[None, smartpcaEvecFile, smartpcaEvalFile],
                                      job_max_memory=18000)

    # 2012.9.5 add the job to append meta info (country, sex, latitude, etc. of each monkey)
    outputF = File(os.path.join(topOutputDir, "smartpca_evec_withMetaInfo.tsv"))
    appendInfo2SmartPCAOutputJob = self.addGenericJob(executable=self.AppendInfo2SmartPCAOutput,
                                                      inputFile=smartpcaEvecFile, outputFile=outputF,
                                                      parentJobLs=[smartpcaJob], extraDependentInputLs=None,
                                                      extraOutputLs=None, transferOutput=transferOutput,
                                                      extraArgumentList=None, extraArguments="--inversePCValue",
                                                      key2ObjectForJob=None, job_max_memory=2000)
    self.addDBArgumentsToOneJob(job=appendInfo2SmartPCAOutputJob, objectWithDBArguments=self)
    no_of_jobs += 1

    sys.stderr.write("%s jobs.\n" % (no_of_jobs))
    return smartpcaJob
def run(self):
    """
    2011-9-28
    """
    if self.debug:
        import pdb
        pdb.set_trace()

    db_250k = self.db_250k

    sameCategoryPhenotypeMethodLs = db_250k.getPhenotypeMethodLsGivenBiologyCategoryID(self.biology_category_id,
                                                                                       access=self.access)
    sameCategoryPhenotypeMethodIDLs = [pm.id for pm in sameCategoryPhenotypeMethodLs]
    # merge the two lists of phenotype method ids together
    phenotype_method_id_ls = list(set(self.phenotype_method_id_ls + sameCategoryPhenotypeMethodIDLs))
    phenotype_method_id_ls.sort()

    result_query = db_250k.getResultLs(call_method_id=self.call_method_id,
                                       analysis_method_id_ls=self.analysis_method_id_ls,
                                       phenotype_method_id_ls=phenotype_method_id_ls,
                                       cnv_method_id=self.cnv_method_id)
    result_id_ls = self.result_id_ls
    for result in result_query:
        result_id_ls.append(result.id)

    # make sure the entries with (result_id, self.association_peak_type_id) exist in AssociationPeak
    result_id_ls = db_250k.filterResultIDLsBasedOnAssociationPeak(result_id_ls, self.association_peak_type_id)

    # Create an abstract dag
    workflow = self.initiateWorkflow()

    self.registerExecutables(workflow)
    self.registerCustomExecutables(workflow)

    overlapStatDir = "overlapStat"
    overlapStatDirJob = yh_pegasus.addMkDirJob(workflow, mkdir=workflow.mkdirWrap, outputDir=overlapStatDir)
    overlapPlotDir = "overlapPlot"
    overlapPlotDirJob = yh_pegasus.addMkDirJob(workflow, mkdir=workflow.mkdirWrap, outputDir=overlapPlotDir)

    analysis_method_id_ls = map(str, self.analysis_method_id_ls)
    outputFnamePrefix = os.path.join(overlapPlotDir,
        'callMethod%s_cnvMethod%s_analysisMethod%s_biologyCategory%s_peakType%s_overlapPeak' % (
            self.call_method_id, self.cnv_method_id, '_'.join(analysis_method_id_ls),
            self.biology_category_id, self.association_peak_type_id))
    plotAssociationPeakOverlapJob = self.addPlotPeakOverlapJob(workflow,
        executable=workflow.plotAssociationPeakOverlap,
        outputFnamePrefix=outputFnamePrefix,
        parentJobLs=[overlapPlotDirJob], job_max_memory=100, walltime=60,
        extraDependentInputLs=[],
        transferOutput=True)

    counter = 0
    no_of_input = 0
    for i in xrange(len(result_id_ls)):
        for j in range(i + 1, len(result_id_ls)):
            result1_id = result_id_ls[i]
            result2_id = result_id_ls[j]
            outputFnamePrefix = 'result_%s_vs_%s_peak_type_%s' % (result1_id, result2_id,
                                                                  self.association_peak_type_id)
            outputF = File(os.path.join(overlapStatDir, '%s.tsv' % (outputFnamePrefix)))
            if no_of_input == 0:
                # add one random input, otherwise a replica catalog error occurs
                rm1 = Stock_250kDB.ResultsMethod.get(result1_id)
                inputFile1 = self.registerOneInputFile(workflow, rm1.filename)
                extraDependentInputLs = [inputFile1]
            else:
                extraDependentInputLs = []
            no_of_input += 1

            gwasPeakOverlapJob = self.addGWASPeakOverlapJob(workflow, executable=workflow.twoGWASPeakOverlap,
                result1_id=result1_id, result2_id=result2_id,
                association1_peak_type_id=self.association_peak_type_id,
                association2_peak_type_id=self.association_peak_type_id,
                peak_padding=self.peak_padding,
                outputF=outputF,
                commit=1, results_directory=None, logFile=None,
                parentJobLs=[overlapStatDirJob], job_max_memory=100, walltime=60,
                extraDependentInputLs=extraDependentInputLs,
                transferOutput=True)
            counter += 1
            self.addInputToStatMergeJob(workflow, statMergeJob=plotAssociationPeakOverlapJob, inputF=outputF,
                                        parentJobLs=[gwasPeakOverlapJob])
    sys.stderr.write("%s gwas peak overlap jobs.\n" % (counter))

    # Write the DAX to the output file
    outf = open(self.outputFname, 'w')
    self.writeXML(outf)
def addJobs(self, workflow, inputData=None, min_MAF=None, min_cor=None, chunkSize=None, pegasusFolderName=""):
    """
    2012.3.3
    """
    sys.stderr.write("Adding LD-calculating jobs on %s input datasets ..." % (len(inputData.jobDataLs)))
    returnJobData = PassingData()
    no_of_jobs = 0

    topOutputDir = pegasusFolderName
    topOutputDirJob = yh_pegasus.addMkDirJob(workflow, mkdir=workflow.mkdirWrap, outputDir=topOutputDir)

    finalCorrelationOutputFile = File(os.path.join(topOutputDir, 'correlation.h5'))
    correlationMergeJob = self.addStatMergeJob(workflow, statMergeProgram=workflow.MergeTwoLocusCorrelationHDF5,
                                               outputF=finalCorrelationOutputFile, transferOutput=True,
                                               extraArguments='-d correlation', parentJobLs=[topOutputDirJob])

    inputJobData1, inputJobData2 = inputData.jobDataLs[:2]
    inputFile1 = inputJobData1.output
    inputFile2 = inputJobData2.output

    outputFile = File(os.path.join(topOutputDir, 'input1_in_input2_order.tsv'))
    orderDatasetRowJob1 = self.addOrderDatasetRowJob(workflow, executable=workflow.Order2ndSNPDataRowsSameAs1stSNPData,
                                                     inputFile1=inputFile2, inputFile2=inputFile1,
                                                     outputFile=outputFile,
                                                     parentJobLs=[topOutputDirJob] + inputJobData1.jobLs,
                                                     extraDependentInputLs=[], transferOutput=False,
                                                     extraArguments=None, job_max_memory=1000)

    outputFile = File(os.path.join(topOutputDir, 'input2_in_same_order.tsv'))
    orderDatasetRowJob2 = self.addOrderDatasetRowJob(workflow, executable=workflow.Order2ndSNPDataRowsSameAs1stSNPData,
                                                     inputFile1=orderDatasetRowJob1.output, inputFile2=inputFile2,
                                                     outputFile=outputFile,
                                                     parentJobLs=[orderDatasetRowJob1] + inputJobData2.jobLs,
                                                     extraDependentInputLs=[], transferOutput=False,
                                                     extraArguments=None, job_max_memory=1000)

    outputFile = File(os.path.join(topOutputDir, 'input1.hdf5'))
    convertDataset2HDF5Job1 = self.addConvertSNPData2HDF5Job(workflow, executable=workflow.ConvertSNPData2HDF5,
                                                             inputFile=orderDatasetRowJob1.output,
                                                             outputFile=outputFile, min_MAF=min_MAF,
                                                             parentJobLs=[orderDatasetRowJob1],
                                                             extraDependentInputLs=[], transferOutput=False,
                                                             extraArguments=None, job_max_memory=100)

    outputFile = File(os.path.join(topOutputDir, 'input2.hdf5'))
    convertDataset2HDF5Job2 = self.addConvertSNPData2HDF5Job(workflow, executable=workflow.ConvertSNPData2HDF5,
                                                             inputFile=orderDatasetRowJob2.output,
                                                             outputFile=outputFile, min_MAF=min_MAF,
                                                             parentJobLs=[orderDatasetRowJob2],
                                                             extraDependentInputLs=[], transferOutput=False,
                                                             extraArguments=None, job_max_memory=100)
    no_of_jobs += 5

    no_of_cols_input1 = self.getNoOfLociFromSNPData(inputFile1.abspath)
    no_of_cols_input2 = self.getNoOfLociFromSNPData(inputFile2.abspath)
    for i1_start in range(0, no_of_cols_input1, chunkSize):
        i1_stop = min(i1_start + chunkSize - 1, no_of_cols_input1 - 1)
        for i2_start in range(0, no_of_cols_input2, chunkSize):
            i2_stop = min(i2_start + chunkSize - 1, no_of_cols_input2 - 1)
            outputFile = File(os.path.join(topOutputDir,
                                           'cor_i1_%s_%s_i2_%s_%s.h5' % (i1_start, i1_stop, i2_start, i2_stop)))
            corCalculationJob = self.addCalculateColCorBetweenTwoHDF5Job(workflow,
                executable=workflow.CalculateColCorBetweenTwoHDF5,
                inputFile1=convertDataset2HDF5Job1.output, inputFile2=convertDataset2HDF5Job2.output,
                outputFile=outputFile, i1_start=i1_start, i1_stop=i1_stop, i2_start=i2_start, i2_stop=i2_stop,
                min_cor=min_cor, parentJobLs=[convertDataset2HDF5Job1, convertDataset2HDF5Job2],
                extraDependentInputLs=[], transferOutput=True, extraArguments=None,
                job_max_memory=50)
            no_of_jobs += 1
            self.addInputToStatMergeJob(workflow, statMergeJob=correlationMergeJob,
                                        inputF=corCalculationJob.output,
                                        parentJobLs=[corCalculationJob])
    sys.stderr.write("%s jobs.\n" % (no_of_jobs))
    return correlationMergeJob
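# Standalone sketch, illustration only (not part of the original module): the nested loops above
# hand each CalculateColCorBetweenTwoHDF5 job an inclusive [start, stop] column range. The boundary
# arithmetic on its own looks like this (helper name is hypothetical).
def _chunkBoundaries(no_of_cols, chunkSize):
    """Yield inclusive (start, stop) column-index pairs covering no_of_cols columns."""
    for start in range(0, no_of_cols, chunkSize):
        yield start, min(start + chunkSize - 1, no_of_cols - 1)

# e.g. list(_chunkBoundaries(10, 4)) -> [(0, 3), (4, 7), (8, 9)]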
def addJobs(self, workflow, db_250k=None, inputData=None,
            biologyCategoryID2PhenotypeID2Data=None, pegasusFolderName="",
            genePadding=20000, tax_id=3702, peakPadding=10000, phenotypeFile=None, call_method_id_set=None,
            data_dir=None):
    """
    2012.11.13 change argument results_directory to data_dir
    2012.3.21
    """
    sys.stderr.write("Adding SNPRegion drawing jobs on %s biology categories ..." %
                     (len(biologyCategoryID2PhenotypeID2Data)))
    returnJobData = PassingData(jobDataLs=[])

    topOutputDir = pegasusFolderName
    topOutputDirJob = yh_pegasus.addMkDirJob(workflow, mkdir=workflow.mkdirWrap, outputDir=topOutputDir)

    # add the PickleGenomeRBDict job
    genomeRBDictPickleFile = os.path.join(topOutputDir, 'genomeRBDict_tax%s_padding%s.pickle' % (tax_id, genePadding))
    pickleGenomeRBDictJob = self.addPickleGenomeRBDictJob(workflow, workflow.PickleGenomeRBDict,
                                                          outputF=genomeRBDictPickleFile,
                                                          genePadding=genePadding, tax_id=tax_id,
                                                          parentJobLs=[topOutputDirJob], job_max_memory=200,
                                                          extraDependentInputLs=[], transferOutput=True)

    # add the PickleGeneAnnotation job
    geneAnnotationPickleFile = os.path.join(topOutputDir, 'geneAnnotation_tax%s.pickle' % (tax_id))
    geneAnnotationPickleJob = self.addPickleGeneAnnotationJob(workflow, workflow.GenomeDB,
                                                              outputF=geneAnnotationPickleFile,
                                                              tax_id=tax_id,
                                                              parentJobLs=[topOutputDirJob], job_max_memory=200,
                                                              extraDependentInputLs=[], transferOutput=True)
    no_of_jobs = 3

    # add a PickleSNPInfo job for each call method
    call_method_id2JobData = {}
    for call_method_id in call_method_id_set:
        call_method = Stock_250kDB.CallMethod.get(call_method_id)
        snpMatrixFile = self.registerOneInputFile(workflow,
                                                  inputFname=call_method.getFileAbsPath(oldDataDir=db_250k.data_dir,
                                                                                        newDataDir=data_dir),
                                                  folderName=pegasusFolderName)
        outputF = File(os.path.join(topOutputDir, 'SNPInfo_LocusType%s.pickle' % (call_method.locus_type_id)))
        pickleSNPInfoJob = self.addPickleSNPInfoJob(workflow, workflow.PickleSNPInfo,
                                                    outputF=outputF, call_method_id=call_method_id,
                                                    parentJobLs=[topOutputDirJob], job_max_memory=100,
                                                    extraDependentInputLs=[], transferOutput=True)
        call_method_id2JobData[call_method_id] = PassingData(job=pickleSNPInfoJob, snpMatrixFile=snpMatrixFile)
        no_of_jobs += 1

    # one folder for each biology category
    for biology_category_id, phenotype_id2data in biologyCategoryID2PhenotypeID2Data.iteritems():
        # add a mkdir job
        folderName = os.path.join(topOutputDir, 'biology_category_%s' % (biology_category_id))
        folderJob = yh_pegasus.addMkDirJob(workflow, mkdir=workflow.mkdirWrap, outputDir=folderName,
                                           parentJobLs=[topOutputDirJob])
        no_of_jobs += 1

        biologyCategory = Stock_250kDB.BiologyCategory.get(biology_category_id)
        list_type_id_list = biologyCategory.returnGeneListIDList()
        list_type_id_in_str_list = map(str, list_type_id_list)
        list_type_id_list_str = ','.join(list_type_id_in_str_list)

        for phenotype_id, result_data in phenotype_id2data.iteritems():
            result_peak_type_id_ls = result_data.result_peak_type_id_ls
            call_method_id_set = result_data.call_method_id_set
            result_id_peak_type_id_ls = []
            analysis_method_id_in_str_ls = []
            for result, peak_type_id in result_peak_type_id_ls:
                result_id_peak_type_id_ls.append('%s:%s' % (result.id, peak_type_id))
                analysis_method_id_in_str_ls.append(str(result.analysis_method_id))
            result_id_peak_type_id_ls_str = ','.join(result_id_peak_type_id_ls)

            peakSpanOutputFile = File(os.path.join(folderName,
                'phenotype_%s_result_%s.tsv' % (phenotype_id, result_id_peak_type_id_ls_str)))
            multiPeakSpanJob = self.addOutputMultiGWASOverlapPeakSpanJob(workflow,
                workflow.OutputMultiGWASOverlapPeakSpan,
                outputF=peakSpanOutputFile, peakPadding=peakPadding,
                list_type_id_list=list_type_id_list_str,
                result_id_peak_type_id_ls=result_id_peak_type_id_ls_str,
                genePadding=genePadding, tax_id=tax_id, genomeRBDictPickleFile=genomeRBDictPickleFile,
                parentJobLs=[folderJob, pickleGenomeRBDictJob], job_max_memory=500,
                extraDependentInputLs=[], transferOutput=True)
            no_of_jobs += 1

            for call_method_id in call_method_id_set:
                call_method = Stock_250kDB.CallMethod.get(call_method_id)
                if call_method.locus_type_id == 2:
                    # 2012.3.27 CNV locus type. no need to convert alleles into binary form.
                    # 2012.3.26 these CNV-derived SNP datasets don't need their alleles converted to binary
                    #     form as they are already binary.
                    snp_matrix_data_type = 4
                    # need_convert_alleles2binary = False
                    # useAlleleToDetermineAlpha = False
                else:
                    snp_matrix_data_type = 1
                callMethodJobData = call_method_id2JobData[call_method_id]
                pickleSNPInfoJob = callMethodJobData.job
                snpMatrixFile = callMethodJobData.snpMatrixFile
                output_dir = folderName  # go to the biology category folder
                logFile = File(os.path.join(folderName, 'call_%s_phenotype_%s_result_%s_drawSNPRegion.log' %
                                            (call_method_id, phenotype_id, result_id_peak_type_id_ls_str)))
                analysis_method_id_ls_str = ','.join(analysis_method_id_in_str_ls)
                drawSNPRegionJob = self.addDrawSNPRegionJob(workflow, executable=workflow.DrawSNPRegion,
                    inputF=peakSpanOutputFile, call_method_id=call_method_id, snpMatrixFile=snpMatrixFile,
                    phenotypeFile=phenotypeFile, output_dir=output_dir, results_directory=data_dir,
                    analysis_method_id_ls=analysis_method_id_ls_str,
                    geneAnnotationPickleFile=geneAnnotationPickleJob.output,
                    list_type_id_list=list_type_id_list_str,
                    snp_matrix_data_type=snp_matrix_data_type, exclude_accessions_with_NA_phenotype=0,
                    snpInfoPickleFile=pickleSNPInfoJob.output, label_gene=1, min_MAF=0.1, min_distance=20000,
                    logFile=logFile,
                    parentJobLs=[geneAnnotationPickleJob, pickleSNPInfoJob, multiPeakSpanJob],
                    job_max_memory=3500, extraDependentInputLs=[],
                    transferOutput=True)
                no_of_jobs += 1
    sys.stderr.write("%s jobs.\n" % (no_of_jobs))
    return returnJobData
def run(self):
    """
    2011-10
    """
    if self.debug:
        import pdb
        pdb.set_trace()

    pd = PassingData(min_MAF=self.min_MAF,
                     data_dir=self.data_dir,
                     need_chr_pos_ls=0,)

    result_query = self.db_250k.getResultLs(analysis_method_id_ls=self.analysis_method_id_ls,
                                            phenotype_method_id_ls=self.phenotype_method_id_ls,
                                            call_method_id_ls=self.call_method_id_ls,
                                            cnv_method_id=self.cnv_method_id)
    result_id_ls = self.result_id_ls
    for result in result_query:
        result_id_ls.append(result.id)

    workflow = self.initiateWorkflow()

    self.registerExecutables()
    self.registerCustomExecutables(workflow)

    counter = 0
    topOutputDir = "%sAssociationLandscape" % (self.pegasusFolderName)
    topOutputDirJob = yh_pegasus.addMkDirJob(workflow=self, mkdir=self.mkdirWrap, outputDir=topOutputDir)

    resultLandscapeType = self.db_250k.getResultLandscapeType(min_MAF=self.min_MAF,
                                                              neighbor_distance=self.neighbor_distance,
                                                              max_neighbor_distance=self.max_neighbor_distance)

    for result_id in result_id_ls:
        result = Stock_250kDB.ResultsMethod.get(result_id)
        associationResultFile = self.registerOneInputFile(
            inputFname=result.getFileAbsPath(oldDataDir=self.db_250k.data_dir, newDataDir=self.data_dir),
            folderName=self.pegasusFolderName)

        logFile = File(os.path.join(topOutputDirJob.output,
                                    'Result%s_LandscapeType%s.log' % (result_id, resultLandscapeType.id)))
        landscapeOutputFile = File(os.path.join(topOutputDirJob.output,
                                                'Result%s_LandscapeType%s.h5' % (result_id, resultLandscapeType.id)))
        defineLandscapeJob = self.addDefineLandscapeJob(workflow, executable=workflow.DefineAssociationLandscape,
                                                        result_id=result_id, neighbor_distance=self.neighbor_distance,
                                                        max_neighbor_distance=self.max_neighbor_distance,
                                                        min_MAF=self.min_MAF, tax_id=self.tax_id,
                                                        data_dir=self.data_dir, logFile=logFile,
                                                        landscapeOutputFile=landscapeOutputFile,
                                                        extraDependentInputLs=[associationResultFile],
                                                        parentJobLs=[topOutputDirJob],
                                                        sshDBTunnel=self.needSSHDBTunnel,
                                                        transferOutput=False)

        logFile = File(os.path.join(topOutputDirJob.output,
                                    'Result%s_LandscapeType%s_log.tsv' % (result_id, resultLandscapeType.id)))
        landscape2DBJob = self.addAssociationLandscape2DBJob(executable=self.AssociationLandscape2DB,
                                                             inputFile=defineLandscapeJob.output,
                                                             result_id=result_id,
                                                             data_dir=self.data_dir, logFile=logFile,
                                                             commit=self.commit,
                                                             min_MAF=self.min_MAF,
                                                             neighbor_distance=self.neighbor_distance,
                                                             max_neighbor_distance=self.max_neighbor_distance,
                                                             parentJobLs=[topOutputDirJob, defineLandscapeJob],
                                                             extraDependentInputLs=None, transferOutput=True,
                                                             extraArguments=None, job_max_memory=1000,
                                                             sshDBTunnel=self.needSSHDBTunnel)

        # add a landscape -> peak job; the peak file is named after the landscape file
        outputFnamePrefix = os.path.join(topOutputDirJob.output,
                                         'Result%s_LandscapeType%s' % (result_id, resultLandscapeType.id))
        outputFile = File('%s_peak.h5' % (outputFnamePrefix))
        self.addAssociationLandscape2PeakJob(executable=self.AssociationLandscape2Peak,
                                             inputFile=defineLandscapeJob.output, outputFile=outputFile,
                                             min_score=self.min_score, ground_score=self.ground_score,
                                             data_dir=self.data_dir,
                                             parentJobLs=[defineLandscapeJob], job_max_memory=100, walltime=60,
                                             extraDependentInputLs=None,
                                             transferOutput=False)
        counter += 1
    sys.stderr.write("%s total jobs.\n" % (self.no_of_jobs))

    # Write the DAX to the output file
    outf = open(self.outputFname, 'w')
    self.writeXML(outf)
def run(self):
    """
    2011-9-28
    """
    if self.debug:
        import pdb
        pdb.set_trace()

    workflow = self.initiateWorkflow()

    self.registerJars()
    self.registerExecutables()
    self.registerCustomExecutables(workflow)

    site_handler = self.site_handler
    input_site_handler = self.input_site_handler

    ref_seq_f = self.registerOneInputFile(workflow, self.ref_seq_fname, folderName=self.pegasusFolderName)
    query_seq_f = self.registerOneInputFile(workflow, self.query_seq_fname, folderName=self.pegasusFolderName)

    # Add mkdir jobs
    deltaOutputDir = "delta"
    deltaOutputDirJob = yh_pegasus.addMkDirJob(workflow, mkdir=self.mkdirWrap, outputDir=deltaOutputDir)
    coordsOutputDir = "coords"
    coordsOutputDirJob = yh_pegasus.addMkDirJob(workflow, mkdir=self.mkdirWrap, outputDir=coordsOutputDir)
    filterOutputDir = "filter"
    filterOutputDirJob = yh_pegasus.addMkDirJob(workflow, mkdir=self.mkdirWrap, outputDir=filterOutputDir)
    plotOutputDir = "plot"
    plotOutputDirJob = yh_pegasus.addMkDirJob(workflow, mkdir=self.mkdirWrap, outputDir=plotOutputDir)
    # plotScriptOutputDir = "plotScript"
    # plotScriptOutputDirJob = yh_pegasus.addMkDirJob(workflow, mkdir=self.mkdirWrap, outputDir=plotScriptOutputDir)

    refNameLs = self.getFastaRecordTitleLs(self.ref_seq_fname)
    returnData3 = self.addSplitFastaFileJobs(workflow, ref_seq_f, self.SelectAndSplitFastaRecords, refNameLs,
                                             mkdirWrap=self.mkdirWrap,
                                             site_handler=site_handler, namespace=self.namespace,
                                             version=self.version, fastaOutputDir='refFasta')
    refName2splitFastaJobDataLs = returnData3.refName2jobDataLs

    queryNameLs = self.getFastaRecordTitleLs(self.query_seq_fname)
    returnData3 = self.addSplitFastaFileJobs(workflow, query_seq_f, self.SelectAndSplitFastaRecords, queryNameLs,
                                             mkdirWrap=self.mkdirWrap,
                                             site_handler=site_handler, namespace=self.namespace,
                                             version=self.version, fastaOutputDir='queryFasta')
    queryName2splitFastaJobDataLs = returnData3.refName2jobDataLs

    noOfJobs = len(refName2splitFastaJobDataLs) + len(queryName2splitFastaJobDataLs)

    ref_seq_prefix = os.path.splitext(os.path.basename(ref_seq_f.name))[0]
    for queryName, jobDataLs in queryName2splitFastaJobDataLs.iteritems():
        for refName, refJobDataLs in refName2splitFastaJobDataLs.iteritems():
            refSelectAndSplitFastaJob, refFastaFile = refJobDataLs[:2]
            selectAndSplitFastaJob, fastaFile = jobDataLs[:2]

            nucmerJob = Job(namespace=self.namespace, name=self.nucmer.name, version=self.version)
            outputPrefix = "%s_vs_%s_%s" % (queryName, ref_seq_prefix, refName)
            deltaFnamePrefix = os.path.join(deltaOutputDir, outputPrefix)
            nucmerJob.addArguments("--maxgap=500", "--mincluster=100", "--prefix", deltaFnamePrefix,
                                   refFastaFile, fastaFile)
            nucmerJob.uses(refFastaFile, transfer=False, register=True, link=Link.INPUT)
            nucmerJob.uses(fastaFile, transfer=False, register=True, link=Link.INPUT)
            deltaFname = "%s.delta" % (deltaFnamePrefix)
            deltaF = File(deltaFname)
            nucmerJob.uses(deltaF, transfer=True, register=True, link=Link.OUTPUT)
            # 3000M for one nucmer job with human as ref
            job_max_memory = 5000  # in MB
            yh_pegasus.setJobProperRequirement(nucmerJob, job_max_memory=job_max_memory)
            workflow.addJob(nucmerJob)
            workflow.depends(parent=refSelectAndSplitFastaJob, child=nucmerJob)
            workflow.depends(parent=selectAndSplitFastaJob, child=nucmerJob)
            workflow.depends(parent=deltaOutputDirJob, child=nucmerJob)

            coordsFname = os.path.join(coordsOutputDir, "%s.coords" % (outputPrefix))
            coordsF = File(coordsFname)
            filterFname = os.path.join(filterOutputDir, "%s.filter" % (outputPrefix))
            filterF = File(filterFname)
            plotPrefix = os.path.join(plotOutputDir, "%s_plot" % (outputPrefix))
            png_plotF = File("%s.png" % plotPrefix)
            gp_plotF = File("%s.gp" % plotPrefix)
            fplot_plotF = File("%s.fplot" % plotPrefix)
            rplot_plotF = File("%s.rplot" % plotPrefix)

            postNucJob = Job(namespace=self.namespace, name=self.PostNucmer.name, version=self.version)
            postNucJob.addArguments(deltaF, coordsF, filterF, refFastaFile, fastaFile, plotPrefix)
            postNucJob.uses(deltaF, transfer=True, register=True, link=Link.INPUT)
            postNucJob.uses(refFastaFile, transfer=False, register=True, link=Link.INPUT)
            postNucJob.uses(fastaFile, transfer=False, register=True, link=Link.INPUT)
            postNucJob.uses(coordsF, transfer=True, register=True, link=Link.OUTPUT)
            postNucJob.uses(filterF, transfer=True, register=True, link=Link.OUTPUT)
            postNucJob.uses(png_plotF, transfer=True, register=True, link=Link.OUTPUT)
            # leave the files below behind
            # postNucJob.uses(gp_plotF, transfer=True, register=True, link=Link.OUTPUT)
            # postNucJob.uses(fplot_plotF, transfer=True, register=True, link=Link.OUTPUT)
            # postNucJob.uses(rplot_plotF, transfer=True, register=True, link=Link.OUTPUT)
            yh_pegasus.setJobProperRequirement(postNucJob, job_max_memory=2000)
            workflow.addJob(postNucJob)
            workflow.depends(parent=nucmerJob, child=postNucJob)
            workflow.depends(parent=coordsOutputDirJob, child=postNucJob)
            workflow.depends(parent=filterOutputDirJob, child=postNucJob)
            workflow.depends(parent=plotOutputDirJob, child=postNucJob)
            # workflow.depends(parent=plotScriptOutputDirJob, child=postNucJob)
            noOfJobs += 2
    sys.stderr.write(" %s jobs. \n" % (noOfJobs))

    # Write the DAX to the output file
    outf = open(self.outputFname, 'w')
    workflow.writeXML(outf)
def addAllJobs(self, workflow=None, inputVCFData=None, chr2IntervalDataLs=None,
               GenomeAnalysisTKJar=None, samtools=None,
               CreateSequenceDictionaryJava=None, CreateSequenceDictionaryJar=None,
               BuildBamIndexFilesJava=None, BuildBamIndexJar=None,
               mv=None,
               refFastaFList=None,
               needFastaIndexJob=False, needFastaDictJob=False,
               data_dir=None, no_of_gatk_threads=1,
               intervalSize=3000, intervalOverlapSize=0,
               outputDirPrefix="", transferOutput=True, job_max_memory=2000, **keywords):
    """
    2012.10.15
        architect of the whole map-reduce framework
        call the parent's addAllJobs in a loop
    """
    samplingReturnDataLs = []
    for i in xrange(self.noOfSamplings):
        oneSamplingReturnData = CompareAlleleFrequencyOfTwoPopulationFromOneVCFFolder.addAllJobs(self,
            workflow=workflow, inputVCFData=inputVCFData,
            chr2IntervalDataLs=chr2IntervalDataLs, samtools=samtools,
            GenomeAnalysisTKJar=GenomeAnalysisTKJar,
            CreateSequenceDictionaryJava=CreateSequenceDictionaryJava,
            CreateSequenceDictionaryJar=CreateSequenceDictionaryJar,
            BuildBamIndexFilesJava=BuildBamIndexFilesJava, BuildBamIndexJar=BuildBamIndexJar,
            mv=mv,
            refFastaFList=refFastaFList,
            needFastaIndexJob=needFastaIndexJob, needFastaDictJob=needFastaDictJob,
            data_dir=data_dir, no_of_gatk_threads=1,
            intervalSize=intervalSize, intervalOverlapSize=intervalOverlapSize,
            outputDirPrefix='%s_%s_' % (outputDirPrefix, i), transferOutput=transferOutput,
            job_max_memory=job_max_memory,
            **keywords)
        samplingReturnDataLs.append(oneSamplingReturnData)

    topOutputDir = "%sFinalReduce" % (outputDirPrefix)
    topOutputDirJob = yh_pegasus.addMkDirJob(workflow, mkdir=workflow.mkdirWrap, outputDir=topOutputDir)

    # a ReduceMatrixByAverageColumnsWithSameKey job
    outputFile = File(os.path.join(topOutputDir, 'medianAlleleSharingStatAcrossAllSampling.tsv'))
    medianReduceJob = self.addStatMergeJob(workflow, statMergeProgram=workflow.ReduceMatrixByAverageColumnsWithSameKey,
                                           outputF=outputFile, extraArguments='--keyColumnLs 0 -v 1-8',
                                           parentJobLs=[topOutputDirJob],
                                           extraDependentInputLs=None, transferOutput=True)

    # a MergeSameHeaderTablesIntoOne job
    outputFile = File(os.path.join(topOutputDir, 'alleleSharingStatAcrossAllSampling.tsv'))
    mergeJob = self.addStatMergeJob(workflow, statMergeProgram=workflow.MergeSameHeaderTablesIntoOne,
                                    outputF=outputFile, extraArguments=None, parentJobLs=[topOutputDirJob],
                                    extraDependentInputLs=None, transferOutput=True)

    for oneSamplingReturnData in samplingReturnDataLs:
        self.addInputToStatMergeJob(workflow=workflow, statMergeJob=medianReduceJob,
                                    parentJobLs=[oneSamplingReturnData.estimateOutlierJob])
        self.addInputToStatMergeJob(workflow=workflow, statMergeJob=mergeJob,
                                    parentJobLs=[oneSamplingReturnData.estimateOutlierJob])

    # no spaces or parenthesis or any other shell-vulnerable letters in the x or y axis labels
    # (whichColumnPlotLabel, xColumnPlotLabel)
    outputFile = File(os.path.join(topOutputDirJob.output, 'outlierFraction_Hist.png'))
    self.addDrawHistogramJob(workflow=workflow, executable=workflow.DrawHistogram, inputFileList=[mergeJob.output],
                             outputFile=outputFile,
                             whichColumn=None, whichColumnHeader="outlierFraction",
                             whichColumnPlotLabel="outlierFraction",
                             logY=False, positiveLog=True, logCount=False, valueForNonPositiveYValue=-1,
                             minNoOfTotal=5,
                             figureDPI=100, samplingRate=1,
                             parentJobLs=[topOutputDirJob, mergeJob],
                             extraDependentInputLs=None,
                             extraArguments=None, transferOutput=True, job_max_memory=2000)

    outputFile = File(os.path.join(topOutputDirJob.output, 'AFS_cor_Hist.png'))
    self.addDrawHistogramJob(workflow=workflow, executable=workflow.DrawHistogram, inputFileList=[mergeJob.output],
                             outputFile=outputFile,
                             whichColumn=None, whichColumnHeader="corr", whichColumnPlotLabel="AFSCorrelation",
                             logY=False, positiveLog=True, logCount=False, valueForNonPositiveYValue=-1,
                             minNoOfTotal=5,
                             figureDPI=100, samplingRate=1,
                             parentJobLs=[topOutputDirJob, mergeJob],
                             extraDependentInputLs=None,
                             extraArguments=None, transferOutput=True, job_max_memory=2000)

    sys.stderr.write("%s jobs.\n" % (self.no_of_jobs))
def addJobs(self, workflow=None, alignmentDataLs=None, refName2size=None, inputVCF=None, verifyBamID=None,
            data_dir=None, needPerContigJob=False, needSSHDBTunnel=0, outputDirPrefix="",
            transferOutput=True):
    """
    2012.8.30
    """
    if workflow is None:
        workflow = self
    sys.stderr.write("Adding jobs for %s references and %s alignments..." % (len(refName2size), len(alignmentDataLs)))
    if len(alignmentDataLs) == 0:
        sys.stderr.write("No alignment for verifyBamID. Exit now.\n")
        sys.exit(0)

    no_of_jobs = 0
    returnData = PassingData()
    returnData.jobDataLs = []

    topOutputDir = "%sverifyBAMOutput" % (outputDirPrefix)
    topOutputDirJob = yh_pegasus.addMkDirJob(workflow, mkdir=workflow.mkdirWrap, outputDir=topOutputDir)
    no_of_jobs += 1

    mergedOutputDir = "%smergedOutput" % (outputDirPrefix)
    mergedOutputDirJob = yh_pegasus.addMkDirJob(workflow, mkdir=workflow.mkdirWrap, outputDir=mergedOutputDir)
    no_of_jobs += 1

    plotOutputDir = "%splot" % (outputDirPrefix)
    plotOutputDirJob = yh_pegasus.addMkDirJob(workflow, mkdir=workflow.mkdirWrap, outputDir=plotOutputDir)
    no_of_jobs += 1

    selfSampleMixupMergeFile = File(os.path.join(mergedOutputDir, 'selfSMMerge.tsv'))
    selfSampleMixupMergeJob = self.addStatMergeJob(workflow, statMergeProgram=workflow.mergeSameHeaderTablesIntoOne,
                                                   outputF=selfSampleMixupMergeFile, transferOutput=False)
    returnData.jobDataLs.append(PassingData(jobLs=[selfSampleMixupMergeJob],
                                            fileLs=[selfSampleMixupMergeFile]))
    no_of_jobs += 1

    """
    output of *.selfSM from verifyBamID:

    #SEQ_ID RG CHIP_ID #SNPS #READS AVG_DP FREEMIX FREELK1 FREELK0 FREE_RH FREE_RA CHIPMIX CHIPLK1 CHIPLK0 CHIP_RH CHIP_RA DPREF RDPHET RDPALT
    1968_3017_2005001_GA_vs_524 ALL NA 24414 22222 0.91 0.00003 13245.89 13305.27 0.55489 0.05202 NA NA NA NA NA NA NA NA
    """
    # no spaces or parenthesis or any other shell-vulnerable letters in the x or y axis labels
    # (whichColumnPlotLabel, xColumnPlotLabel); this applies to every plot job below.
    outputFile = File(os.path.join(plotOutputDir, 'freeMix_Hist.png'))
    self.addDrawHistogramJob(workflow=workflow, executable=workflow.DrawHistogram,
                             inputFileList=[selfSampleMixupMergeFile], outputFile=outputFile,
                             whichColumn=None, whichColumnHeader="FREEMIX", whichColumnPlotLabel="mixFraction",
                             logY=False, logCount=True, valueForNonPositiveYValue=-1,
                             minNoOfTotal=10, figureDPI=100, samplingRate=1,
                             parentJobLs=[plotOutputDirJob, selfSampleMixupMergeJob],
                             extraDependentInputLs=None,
                             extraArguments=None, transferOutput=transferOutput, job_max_memory=2000)
    no_of_jobs += 1

    outputFile = File(os.path.join(plotOutputDir, 'refProbGivenHet_Hist.png'))
    self.addDrawHistogramJob(workflow=workflow, executable=workflow.DrawHistogram,
                             inputFileList=[selfSampleMixupMergeFile], outputFile=outputFile,
                             whichColumn=None, whichColumnHeader="FREE_RH",
                             whichColumnPlotLabel="refAlleleProbGivenHet",
                             logY=False, logCount=True, valueForNonPositiveYValue=-1,
                             minNoOfTotal=10, figureDPI=100, samplingRate=1,
                             parentJobLs=[plotOutputDirJob, selfSampleMixupMergeJob],
                             extraDependentInputLs=None,
                             extraArguments=None, transferOutput=transferOutput, job_max_memory=2000)
    no_of_jobs += 1

    outputFile = File(os.path.join(plotOutputDir, 'refProbGivenAlternative_Hist.png'))
    self.addDrawHistogramJob(workflow=workflow, executable=workflow.DrawHistogram,
                             inputFileList=[selfSampleMixupMergeFile], outputFile=outputFile,
                             whichColumn=None, whichColumnHeader="FREE_RA",
                             whichColumnPlotLabel="refAlleleProbGivenAlternativeAllele",
                             logY=False, logCount=True, valueForNonPositiveYValue=-1,
                             minNoOfTotal=10, figureDPI=100, samplingRate=1,
                             parentJobLs=[plotOutputDirJob, selfSampleMixupMergeJob],
                             extraDependentInputLs=None,
                             extraArguments=None, transferOutput=transferOutput, job_max_memory=2000)
    no_of_jobs += 1

    outputFile = File(os.path.join(plotOutputDir, 'freeMix_vs_chipMix.png'))
    self.addAbstractPlotJob(workflow=workflow, executable=workflow.AbstractPlot,
                            inputFileList=[selfSampleMixupMergeJob.output], outputFile=outputFile,
                            whichColumn=None, whichColumnHeader="FREEMIX", whichColumnPlotLabel="mixFractionByHet",
                            logY=False, valueForNonPositiveYValue=50,
                            xColumnHeader="CHIPMIX", xColumnPlotLabel="mixAgainstSelfGenotype",
                            minNoOfTotal=5, figureDPI=150, samplingRate=1,
                            parentJobLs=[plotOutputDirJob, selfSampleMixupMergeJob],
                            extraDependentInputLs=None,
                            extraArguments=None, transferOutput=transferOutput, job_max_memory=2000)
    no_of_jobs += 1

    outputFile = File(os.path.join(plotOutputDir, 'freeMix_vs_AVG_DP.png'))
    self.addAbstractPlotJob(workflow=workflow, executable=workflow.AbstractPlot,
                            inputFileList=[selfSampleMixupMergeJob.output], outputFile=outputFile,
                            whichColumn=None, whichColumnHeader="FREEMIX", whichColumnPlotLabel="mixFractionByHet",
                            logY=False, valueForNonPositiveYValue=50,
                            xColumnHeader="AVG_DP", xColumnPlotLabel="avgDepth",
                            minNoOfTotal=5, figureDPI=150, samplingRate=1,
                            parentJobLs=[plotOutputDirJob, selfSampleMixupMergeJob],
                            extraDependentInputLs=None,
                            extraArguments=None, transferOutput=transferOutput, job_max_memory=2000)
    no_of_jobs += 1

    outputFile = File(os.path.join(plotOutputDir, 'chipMix_vs_AVG_DP.png'))
    self.addAbstractPlotJob(workflow=workflow, executable=workflow.AbstractPlot,
                            inputFileList=[selfSampleMixupMergeJob.output], outputFile=outputFile,
                            whichColumn=None, whichColumnHeader="CHIPMIX",
                            whichColumnPlotLabel="mixAgainstSelfGenotype",
                            logY=False, valueForNonPositiveYValue=50,
                            xColumnHeader="AVG_DP", xColumnPlotLabel="avgDepth",
                            minNoOfTotal=5, figureDPI=150, samplingRate=1,
                            parentJobLs=[plotOutputDirJob, selfSampleMixupMergeJob],
                            extraDependentInputLs=None,
                            extraArguments=None, transferOutput=transferOutput, job_max_memory=2000)
    no_of_jobs += 1

    outputFile = File(os.path.join(plotOutputDir, 'freeMix_vs_refProbGivenHet.png'))
    self.addAbstractPlotJob(workflow=workflow, executable=workflow.AbstractPlot,
                            inputFileList=[selfSampleMixupMergeJob.output], outputFile=outputFile,
                            whichColumn=None, whichColumnHeader="FREEMIX", whichColumnPlotLabel="mixFraction",
                            logY=False, valueForNonPositiveYValue=50,
                            xColumnHeader="FREE_RH", xColumnPlotLabel="refAlleleProbGivenHet",
                            minNoOfTotal=5, figureDPI=150, samplingRate=1,
                            parentJobLs=[plotOutputDirJob, selfSampleMixupMergeJob],
                            extraDependentInputLs=None,
                            extraArguments=None, transferOutput=transferOutput, job_max_memory=2000)
    no_of_jobs += 1

    outputFile = File(os.path.join(plotOutputDir, 'freeMix_vs_refProbGivenAlternativeAllele.png'))
    self.addAbstractPlotJob(workflow=workflow, executable=workflow.AbstractPlot,
                            inputFileList=[selfSampleMixupMergeJob.output], outputFile=outputFile,
                            whichColumn=None, whichColumnHeader="FREEMIX", whichColumnPlotLabel="mixFraction",
                            logY=False, valueForNonPositiveYValue=50,
                            xColumnHeader="FREE_RA", xColumnPlotLabel="refAlleleProbGivenAlternativeAllele",
                            minNoOfTotal=5, figureDPI=150, samplingRate=1,
                            parentJobLs=[plotOutputDirJob, selfSampleMixupMergeJob],
                            extraDependentInputLs=None,
                            extraArguments=None, transferOutput=transferOutput, job_max_memory=2000)
    no_of_jobs += 1

    outputFile = File(os.path.join(plotOutputDir, 'refAlleleProbGivenHet_vs_refProbGivenAlternativeAllele.png'))
    self.addAbstractPlotJob(workflow=workflow, executable=workflow.AbstractPlot,
                            inputFileList=[selfSampleMixupMergeJob.output], outputFile=outputFile,
                            whichColumn=None, whichColumnHeader="FREE_RH",
                            whichColumnPlotLabel="refAlleleProbGivenHet",
                            logY=False, valueForNonPositiveYValue=-1,
                            xColumnHeader="FREE_RA", xColumnPlotLabel="refAlleleProbGivenAlternativeAllele",
                            minNoOfTotal=5, figureDPI=150, samplingRate=1,
                            parentJobLs=[plotOutputDirJob, selfSampleMixupMergeJob],
                            extraDependentInputLs=None,
                            extraArguments=None, transferOutput=transferOutput, job_max_memory=2000)
    no_of_jobs += 1

    # subtract FREELK1 - FREELK0
    likSubstractedSelfSampleMixupMergeFile = File(os.path.join(mergedOutputDir, 'selfSMMerge_likDelta.tsv'))
    likSubstractedSelfSampleMixupMergeJob = self.addStatMergeJob(workflow,
        statMergeProgram=workflow.ReduceMatrixBySumSameKeyColsAndThenDivide,
        outputF=likSubstractedSelfSampleMixupMergeFile, transferOutput=False,
        extraArguments="--operatorType 2 -k 0 -v 7,8,6")
    # column 0 is the sample ID; column 6 is FREEMIX, 7 is FREELK1, 8 is FREELK0
    self.addInputToStatMergeJob(workflow, statMergeJob=likSubstractedSelfSampleMixupMergeJob,
                                inputF=selfSampleMixupMergeJob.output,
                                parentJobLs=[selfSampleMixupMergeJob])
    returnData.jobDataLs.append(PassingData(jobLs=[likSubstractedSelfSampleMixupMergeJob],
                                            fileLs=[likSubstractedSelfSampleMixupMergeFile]))
    no_of_jobs += 1

    outputFile = File(os.path.join(plotOutputDir, 'freeMix_vs_deltaMinusLogLikelihood.png'))
    self.addAbstractPlotJob(workflow=workflow, executable=workflow.AbstractPlot,
                            inputFileList=[likSubstractedSelfSampleMixupMergeJob.output], outputFile=outputFile,
                            whichColumn=None, whichColumnHeader="FREEMIX", whichColumnPlotLabel="mixFraction",
                            logY=False, valueForNonPositiveYValue=-1,
                            xColumnHeader="FREELK1_by_FREELK0", xColumnPlotLabel="deltaMinusLogLikelihood",
                            minNoOfTotal=5, figureDPI=150, samplingRate=1,
                            parentJobLs=[plotOutputDirJob, likSubstractedSelfSampleMixupMergeJob],
                            extraDependentInputLs=None,
                            extraArguments=None, transferOutput=transferOutput, job_max_memory=2000)
    no_of_jobs += 1

    selfRGMixupMergeFile = File(os.path.join(mergedOutputDir, 'selfRGMerge.tsv'))
    selfRGMixupMergeJob = self.addStatMergeJob(workflow, statMergeProgram=workflow.mergeSameHeaderTablesIntoOne,
                                               outputF=selfRGMixupMergeFile, transferOutput=False)
    returnData.jobDataLs.append(PassingData(jobLs=[selfRGMixupMergeJob],
                                            fileLs=[selfRGMixupMergeFile]))
    no_of_jobs += 1

    # alignmentId2RGJobDataLs = returnData.alignmentId2RGJobDataLs
    i = 0
    for alignmentData in alignmentDataLs:
        alignment = alignmentData.alignment
        bamF = alignmentData.bamF
        baiF = alignmentData.baiF
        if i == 0:  # need at least one log file
            transferOutputForThisJob = True
        else:
            transferOutputForThisJob = False
        i += 1
        outputFnamePrefix = os.path.join(topOutputDir, alignment.getReadGroup())
        # pass transferOutput to the job so as to keep the log
        verifyBamIDJob = self.addVerifyBamIDJob(executable=self.verifyBamID, inputVCF=inputVCF, inputBAM=bamF,
                                                outputFnamePrefix=outputFnamePrefix,
                                                doFreeFull=True,
                                                doChipMix=None, doChipFull=None, doChipRefBias=None, doChipNone=None,
                                                minAF=0.01, genoError=1e-03, minCallRate=0.50,
                                                minMapQ=20, maxDepth=int(3 * alignment.median_depth), minQ=13, maxQ=40,
                                                parentJobLs=[topOutputDirJob] + alignmentData.jobLs,
                                                extraDependentInputLs=[baiF],
                                                transferOutput=transferOutputForThisJob,
                                                extraArguments=None, job_max_memory=5000)
        no_of_jobs += 1
        self.addInputToStatMergeJob(workflow, statMergeJob=selfSampleMixupMergeJob, inputF=verifyBamIDJob.selfSMFile,
                                    parentJobLs=[verifyBamIDJob])
        self.addInputToStatMergeJob(workflow, statMergeJob=selfRGMixupMergeJob, inputF=verifyBamIDJob.selfRGFile,
                                    parentJobLs=[verifyBamIDJob])
    sys.stderr.write("%s jobs.\n" % (no_of_jobs))

    # 2012.8.30 gzip the final output
    newReturnData = self.addGzipSubWorkflow(workflow=workflow, inputData=returnData, transferOutput=transferOutput,
                                            outputDirPrefix="%smergedOutputGzip" % (outputDirPrefix))
    return newReturnData
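# Standalone sketch, illustration only (not part of the original module): per the "-k 0 -v 7,8,6"
# configuration and the column comment above, a *.selfSM row keyed by SEQ_ID (column 0) carries
# FREEMIX in column 6, FREELK1 in column 7 and FREELK0 in column 8; the "subtract FREELK1 - FREELK0"
# step described above boils down, per row, to something like this (helper name is hypothetical).
def _freeLKDelta(selfSMRow):
    """Given one *.selfSM data row split into fields, return FREELK1 - FREELK0."""
    return float(selfSMRow[7]) - float(selfSMRow[8])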
def addJobs(self, db_250k=None, callMethodID2Data=None, kinshipFile=None, eigenVectorFile=None,
            phenotype_method_id_ls=[],
            analysis_method_id_ls=[], genotypeFileToGenerateKinship=None, data_dir=None, getPublicPhenotype=False,
            commit=False,
            transferOutput=True, needSSHDBTunnel=False, outputDirPrefix=""):
    """
    2013.1.7 use callMethod.locus_type_id to decide whether noSNPAlleleOrdinalConversion should be toggled or not
    2012.9.28 add argument getPublicPhenotype
    2012.6.5
    """
    sys.stderr.write("Adding association jobs for %s polymorphism data ... " % (len(callMethodID2Data)))
    returnData = PassingData()
    returnData.jobDataLs = []

    topOutputDir = "%sAssociation" % (outputDirPrefix)
    topOutputDirJob = yh_pegasus.addMkDirJob(workflow=self, mkdir=self.mkdirWrap, outputDir=topOutputDir)

    phenotypeFile = File(os.path.join(topOutputDir, 'phenotype.tsv'))
    outputPhenotypeJob = self.addOutputPhenotypeJob(executable=self.OutputPhenotype, outputFile=phenotypeFile,
                                                    getRawPhenotypeData=False, getPublicPhenotype=getPublicPhenotype,
                                                    parentJobLs=[topOutputDirJob], transferOutput=True,
                                                    job_max_memory=2000,
                                                    sshDBTunnel=needSSHDBTunnel)

    locusMapFile = File(os.path.join(topOutputDir, 'locusMap.h5'))
    locusMapJob = self.addStock_250kDBJob(executable=self.Stock_250kDB, outputFile=locusMapFile, run_type=2,
                                          parentJobLs=[topOutputDirJob], extraDependentInputLs=None,
                                          transferOutput=False,
                                          extraArguments=None, job_max_memory=2000, sshDBTunnel=needSSHDBTunnel)

    for analysis_method_id in analysis_method_id_ls:
        analysisMethod = Stock_250kDB.AnalysisMethod.get(analysis_method_id)
        if not analysisMethod:
            sys.stderr.write("Warning: analysis_method_id %s not in db. Skip.\n" % (analysis_method_id))
            continue
        for phenotype_method_id in phenotype_method_id_ls:
            phenotypeMethod = Stock_250kDB.PhenotypeMethod.get(phenotype_method_id)
            if not phenotypeMethod:
                sys.stderr.write("Warning: phenotype_method_id %s not in db. Skip.\n" % (phenotype_method_id))
                continue
            for callMethodID, callMethodData in callMethodID2Data.iteritems():
                test_type = analysisMethod.association_test_type
                if not test_type:
                    sys.stderr.write("Warning: analysis method %s has no association test_type (%s). Skip.\n" %
                                     (analysis_method_id, test_type))
                    continue
                # 2012.9.28 skip if the association result is already in db
                rm = db_250k.checkResultsMethod(call_method_id=callMethodID,
                                                phenotype_method_id=phenotype_method_id,
                                                analysis_method_id=analysis_method_id,
                                                cnv_method_id=None)
                if rm:
                    sys.stderr.write("Warning: skip association for c=%s, p=%s, a=%s.\n" %
                                     (callMethodID, phenotype_method_id, analysis_method_id))
                    continue
                outputFile = File(os.path.join(topOutputDir, '%s_%s_%s.h5' % (callMethodID, phenotype_method_id,
                                                                              analysis_method_id)))
                if callMethodData.db_entry.locus_type_id == 2:
                    # the CNV dataset is already in 0,1 binary format; no conversion needed
                    noSNPAlleleOrdinalConversion = 1
                    inputMissingGenotypeNotationType = 2
                else:
                    noSNPAlleleOrdinalConversion = 0
                    inputMissingGenotypeNotationType = 1
                associationJob = self.addAssociationJob(executable=self.Association,
                                                        datasetFile=callMethodData.datasetFile,
                                                        phenotypeFile=outputPhenotypeJob.output,
                                                        phenotype_method_id=phenotype_method_id,
                                                        outputFile=outputFile, kinshipFile=kinshipFile,
                                                        eigenVectorFile=eigenVectorFile,
                                                        genotypeFileToGenerateKinship=genotypeFileToGenerateKinship,
                                                        locusMapFile=locusMapJob.output,
                                                        test_type=test_type,
                                                        min_data_point=self.min_data_point,
                                                        noSNPAlleleOrdinalConversion=noSNPAlleleOrdinalConversion,
                                                        which_PC_index_ls=self.which_PC_index_ls,
                                                        inputMissingGenotypeNotationType=inputMissingGenotypeNotationType,
                                                        parentJobLs=[outputPhenotypeJob, locusMapJob],
                                                        job_max_memory=3500, walltime=200,
                                                        extraDependentInputLs=None, transferOutput=False)
                logFile = File(os.path.join(topOutputDir, '%s_%s_%s_2DB.log' % (callMethodID, phenotype_method_id,
                                                                                analysis_method_id)))
                result2DBJob = self.addResult2DBJob(executable=self.Results2DB_250k, inputFile=associationJob.output,
                                                    call_method_id=callMethodID,
                                                    phenotype_method_id=phenotype_method_id,
                                                    analysis_method_id=analysis_method_id, data_dir=data_dir,
                                                    results_method_type_id=1,
                                                    logFile=logFile, commit=commit,
                                                    parentJobLs=[associationJob], transferOutput=transferOutput,
                                                    job_max_memory=500, sshDBTunnel=needSSHDBTunnel)
                returnData.jobDataLs.append(PassingData(jobLs=[result2DBJob], file=logFile,
                                                        fileList=[logFile]))
    sys.stderr.write("%s jobs.\n" % (self.no_of_jobs))
    return returnData
def addJobs(self, workflow=None, result_peak_ls=None, inputData=None, datasetName=None, chunkSize=None,
            pegasusFolderName=""):
    """
    2012.3.3
    """
    if workflow is None:
        workflow = self
    returnJobData = PassingData()
    no_of_jobs = 0

    topOutputDir = pegasusFolderName
    topOutputDirJob = yh_pegasus.addMkDirJob(workflow, mkdir=workflow.mkdirWrap, outputDir=topOutputDir)
    no_of_jobs += 1

    biology_category_id2peak_data = {}
    no_of_peaks = 0
    for row in result_peak_ls:
        biology_category_id = row.result.phenotype_method.biology_category_id
        if biology_category_id not in biology_category_id2peak_data:
            biology_category_id2peak_data[biology_category_id] = PassingData(job=None, result_peak_ls=[])
            # add a mkdir job
            folderName = os.path.join(topOutputDir, 'biology_category_%s' % (biology_category_id))
            folderJob = yh_pegasus.addMkDirJob(workflow, mkdir=workflow.mkdirWrap, outputDir=folderName,
                                               parentJobLs=[topOutputDirJob])
            biology_category_id2peak_data[biology_category_id].job = folderJob
            no_of_jobs += 1
        biology_category_id2peak_data[biology_category_id].result_peak_ls.append(row)
        no_of_peaks += 1
    sys.stderr.write("%s peaks. %s biology categories.\n" % (no_of_peaks, len(biology_category_id2peak_data)))

    sys.stderr.write("Finding max LD between one type of loci and peaks on %s input correlation files ... " %
                     (len(inputData.jobDataLs)))
    prevReportedStr = ""
    for biology_category_id, peak_data in biology_category_id2peak_data.iteritems():
        # identify the proper output folder & its creation job
        outputDirJob = peak_data.job
        for peak in peak_data.result_peak_ls:
            outputFile = File(os.path.join(outputDirJob.folder, 'peak_%s_loci.h5' % (peak.id)))
            peakLociOutputJob = self.addOutputLociIDOfResultPeakInHDF5Job(workflow,
                executable=workflow.OutputLociIDOfResultPeakInHDF5,
                peak_id=peak.id, outputFile=outputFile,
                parentJobLs=[outputDirJob], extraDependentInputLs=[],
                transferOutput=False, extraArguments=None,
                job_max_memory=200)
            no_of_jobs += 1

            peakCorrelationFile = File(os.path.join(outputDirJob.folder,
                                                    'maxCorrelationBetweenFirstLociAndPeak%s.h5' % (peak.id)))
            maxLDJob = self.mapMaxLDJobsGivenInputData(workflow, inputData=inputData, datasetName=datasetName,
                peak_id=peak.id,
                peakLociH5File=peakLociOutputJob.output, outputFile=peakCorrelationFile, outputDirJob=outputDirJob,
                chunkSize=chunkSize, parentJobLs=[peakLociOutputJob], extraDependentInputLs=[], transferOutput=True,
                extraArguments=None,
                job_max_memory=200)
            no_of_jobs += maxLDJob.no_of_jobs

            # final output filename is call_method_biology_category_phenotype_analysis_chr_start_stop_peak_id
            outputFnamePrefix = os.path.join(outputDirJob.folder,
                'call_%s_category_%s_phenotype_%s_%s_analysis_%s_chr_%s_%s_%s_%s' % (
                    peak.result.call_method_id, peak.result.phenotype_method.biology_category_id,
                    peak.result.phenotype_method.id,
                    peak.result.phenotype_method.getProperShortName(), peak.result.analysis_method.id,
                    peak.chromosome, peak.start, peak.stop, peak.id))
            outputFile = File("%s.png" % (outputFnamePrefix))
            plotJob = self.addDrawManhattanPlotForLDInHDF5Job(workflow,
                executable=workflow.DrawManhattanPlotForLDInHDF5,
                correlationFile=maxLDJob.output, peak_id=peak.id,
                datasetName=datasetName, outputFile=outputFile,
                outputFnamePrefix=outputFnamePrefix, parentJobLs=[maxLDJob],
                extraDependentInputLs=[], transferOutput=True, extraArguments=None,
                job_max_memory=300)
            no_of_jobs += 1
            if no_of_jobs % 2 == 0:
                sys.stderr.write("%s%s" % ("\x08" * len(prevReportedStr), no_of_jobs))
                prevReportedStr = str(no_of_jobs)
    sys.stderr.write(" %s jobs.\n" % (no_of_jobs))
    return no_of_jobs
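# Standalone sketch, illustration only (not part of the original module): the bucketing above groups
# peaks by the biology category of their phenotype before any per-category folder or LD job is added.
# A plain-dict equivalent of that grouping (helper name is hypothetical):
def _groupPeaksByBiologyCategory(result_peak_ls):
    """Return {biology_category_id: [peak, ...]} mirroring biology_category_id2peak_data."""
    categoryID2peaks = {}
    for peak in result_peak_ls:
        categoryID = peak.result.phenotype_method.biology_category_id
        categoryID2peaks.setdefault(categoryID, []).append(peak)
    return categoryID2peaks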
def run(self): """ 2011-9-28 """ if self.debug: import pdb pdb.set_trace() db_250k = Stock_250kDB.Stock_250kDB(drivername=self.drivername, username=self.db_user, password=self.db_passwd, \ hostname=self.hostname, database=self.dbname) db_250k.setup(create_tables=False) sameCategoryPhenotypeMethodLs = db_250k.getPhenotypeMethodLsGivenBiologyCategoryID(self.biology_category_id, access=self.access) sameCategoryPhenotypeMethodIDLs = [pm.id for pm in sameCategoryPhenotypeMethodLs] #merge the two lists of phenotype method id together phenotype_method_id_ls = list(set(self.phenotype_method_id_ls + sameCategoryPhenotypeMethodIDLs)) result1_query = db_250k.getResultLs(call_method_id=self.call_method1_id, analysis_method_id_ls=[self.analysis_method1_id], \ phenotype_method_id_ls=phenotype_method_id_ls, cnv_method_id=self.cnv_method1_id) result2_query = db_250k.getResultLs(call_method_id=self.call_method2_id, analysis_method_id_ls=[self.analysis_method2_id], \ phenotype_method_id_ls=phenotype_method_id_ls, cnv_method_id=self.cnv_method2_id) result1_id_ls = [] for result in result1_query: result1_id_ls.append(result.id) result2_id_ls = [] for result in result2_query: result2_id_ls.append(result.id) #make sure the entries with (result_id, self.association_peak_type_id) exists in AssociationPeak result1_id_ls = db_250k.filterResultIDLsBasedOnAssociationPeak(result1_id_ls, self.association1_peak_type_id) result2_id_ls = db_250k.filterResultIDLsBasedOnAssociationPeak(result2_id_ls, self.association2_peak_type_id) phenotype_method_id2result1_id = self.getPhenotypeMethodId2ResultID(result1_id_ls) phenotype_method_id2result2_id = self.getPhenotypeMethodId2ResultID(result2_id_ls) phenotype_method_id2result_id_pair = {} for phenotype_method_id , result1_id in phenotype_method_id2result1_id.iteritems(): if phenotype_method_id in phenotype_method_id2result2_id: result2_id = phenotype_method_id2result2_id.get(phenotype_method_id) phenotype_method_id2result_id_pair[phenotype_method_id] = [result1_id, result2_id] # Create a abstract dag workflowName = os.path.splitext(os.path.basename(self.outputFname))[0] workflow = self.initiateWorkflow(workflowName) self.registerExecutables(workflow) self.registerCustomExecutables(workflow) counter = 0 overlapStatDir = "overlapStat" overlapStatDirJob = yh_pegasus.addMkDirJob(workflow, mkdir=workflow.mkdirWrap, outputDir=overlapStatDir) counter += 1 """ overlapPlotDir = "overlapPlot" overlapPlotDirJob = yh_pegasus.addMkDirJob(workflow, mkdir=workflow.mkdirWrap, outputDir=overlapPlotDir) outputFnamePrefix = os.path.join(overlapPlotDir, 'cm%s_cnvM%s_am%s_peakType%s_vs_cm%s_cnvM%s_am%s_peakType%s_biologyCategory%s_overlapPeak'%\ (self.call_method1_id, self.cnv_method1_id, self.analysis_method1_id, self.association1_peak_type_id, \ self.call_method2_id, self.cnv_method2_id, self.analysis_method2_id, self.association2_peak_type_id, \ self.biology_category_id)) plotAssociationPeakOverlapJob = slef.addPlotPeakOverlapJob(workflow, executable=workflow.plotAssociationPeakOverlapJob, \ outputFnamePrefix=outputFnamePrefix, \ parentJobLs=[overlapPlotDirJob], job_max_memory=100, walltime = 60, \ extraDependentInputLs=[], \ transferOutput=True) """ #each contig in each trio gets a summary. 
	peakOverlapStatMergeFile = File('peak_overlap_stat.tsv')
	peakOverlapStatMergeJob = self.addStatMergeJob(workflow, statMergeProgram=workflow.mergeSameHeaderTablesIntoOne, \
					outputF=peakOverlapStatMergeFile, transferOutput=True, parentJobLs=[])
	counter += 1

	no_of_input = 0
	for phenotype_method_id, result_id_pair in phenotype_method_id2result_id_pair.iteritems():
		result1_id = result_id_pair[0]
		result2_id = result_id_pair[1]
		outputFnamePrefix = 'result_%s_vs_%s_peak_type_%s_vs_%s'%(result1_id, result2_id, \
						self.association1_peak_type_id, self.association2_peak_type_id)
		outputF = File(os.path.join(overlapStatDir, '%s.tsv'%(outputFnamePrefix)))
		if no_of_input==0:
			#add one random input, otherwise a replica catalog error occurs
			rm1 = Stock_250kDB.ResultsMethod.get(result1_id)
			inputFile1 = self.registerOneInputFile(workflow, rm1.filename)
			extraDependentInputLs = [inputFile1]
		else:
			extraDependentInputLs = []
		no_of_input += 1
		gwasPeakOverlapJob = self.addGWASPeakOverlapJob(workflow, executable=workflow.twoGWASPeakOverlap, \
					result1_id=result1_id, result2_id=result2_id, association1_peak_type_id=self.association1_peak_type_id, \
					association2_peak_type_id=self.association2_peak_type_id, peak_padding=self.peak_padding, \
					outputF=outputF, \
					commit=1, results_directory=None, logFile=None, \
					parentJobLs=[overlapStatDirJob], job_max_memory=100, walltime=60, \
					extraDependentInputLs=extraDependentInputLs, \
					transferOutput=True)
		counter += 1

		self.addInputToStatMergeJob(workflow, statMergeJob=peakOverlapStatMergeJob, \
					inputF=outputF, parentJobLs=[gwasPeakOverlapJob])
		"""
		self.addInputToStatMergeJob(workflow, statMergeJob=plotAssociationPeakOverlapJob, inputF=outputF, \
					parentJobLs=[gwasPeakOverlapJob])
		"""
	sys.stderr.write("%s jobs.\n"%(counter))

	# Write the DAX to the output file
	outf = open(self.outputFname, 'w')
	workflow.writeXML(outf)
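# Illustrative sketch only (a hypothetical helper, not called by run() above):
# shows the pairing step that run() performs, intersecting two
# {phenotype_method_id: result_id} maps into {phenotype_method_id: [result1_id, result2_id]}
# before any peak-overlap jobs are added.
def _demoPairResultsByPhenotype(phenotype_method_id2result1_id, phenotype_method_id2result2_id):
	"""
	Hypothetical demo of the result-pairing logic in run(); the argument
	names mirror the local variables used there.
	"""
	phenotype_method_id2result_id_pair = {}
	for phenotype_method_id, result1_id in phenotype_method_id2result1_id.iteritems():
		if phenotype_method_id in phenotype_method_id2result2_id:
			result2_id = phenotype_method_id2result2_id.get(phenotype_method_id)
			phenotype_method_id2result_id_pair[phenotype_method_id] = [result1_id, result2_id]
	return phenotype_method_id2result_id_pair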
def run(self): """ 2012.2.8 """ if self.debug: import pdb pdb.set_trace() db_vervet = VervetDB.VervetDB(drivername=self.drivername, db_user=self.db_user, db_passwd=self.db_passwd, hostname=self.hostname, dbname=self.dbname, schema=self.schema) db_vervet.setup(create_tables=False) session = db_vervet.session session.begin() if not self.data_dir: self.data_dir = db_vervet.data_dir if not self.local_data_dir: self.local_data_dir = db_vervet.data_dir workflowName = os.path.splitext(os.path.basename(self.outputFname))[0] workflow = self.initiateWorkflow(workflowName) self.registerJars(workflow) self.registerExecutables(workflow) self.registerCustomExecutables(workflow) isq_ls = self.fetchIndividualSequenceFromDB(db_vervet, self.isq_id_ls) no_of_jobs = 1 individualSequenceID2FilePairLs = db_vervet.getIndividualSequenceID2FilePairLs([isq.id for isq in isq_ls], \ data_dir=self.local_data_dir, checkOldPath=True) for individualSequenceID, FilePairLs in individualSequenceID2FilePairLs.iteritems(): individual_sequence = VervetDB.IndividualSequence.get(individualSequenceID) newISQPath = individual_sequence.constructRelativePathForIndividualSequence() #newISQPath = '%s_split'%(newISQPath) if individual_sequence.path !=newISQPath: individual_sequence.path = newISQPath session.add(individual_sequence) session.flush() sequenceOutputDir = os.path.join(self.data_dir, individual_sequence.path) sequenceOutputDirJob = yh_pegasus.addMkDirJob(workflow, mkdir=workflow.mkdirWrap, outputDir=sequenceOutputDir) for filePair in FilePairLs: for fileRecord in filePair: filename = fileRecord[0] absPath = os.path.join(self.local_data_dir, filename) fastqFile = self.registerOneInputFile(workflow, absPath) library, mate_id = self.parseLibraryMateIDFromFilename(filename)[:2] if library is None: sys.stderr.write("Warning: can't parse library out of file %s of isq %s & skip.\n"%(filename, individualSequenceID)) continue if mate_id: prefix = '%s_%s'%(library, mate_id) else: prefix = library outputFilenamePrefix = '%s_%s'%(individual_sequence.id, prefix) splitOutputDir = outputFilenamePrefix splitOutputDirJob = yh_pegasus.addMkDirJob(workflow, mkdir=workflow.mkdirWrap, outputDir=splitOutputDir) splitFastQFnamePrefix = os.path.join(splitOutputDir, outputFilenamePrefix) logFile = File('%s_%s.split.log'%(individual_sequence.id, prefix)) splitReadFileJob1 = self.addSplitReadFileJob(workflow, executable=workflow.splitReadFile, \ inputF=fastqFile, outputFnamePrefix=splitFastQFnamePrefix, \ outputFnamePrefixTail="", minNoOfReads=self.minNoOfReads, \ logFile=logFile, parentJobLs=[splitOutputDirJob], \ job_max_memory=2000, walltime = 800, \ extraDependentInputLs=[], transferOutput=True) logFile = File('%s_%s.register.log'%(individual_sequence.id, prefix)) registerJob1 = self.addRegisterAndMoveSplitFileJob(workflow, executable=workflow.registerAndMoveSplitSequenceFiles, \ inputDir=splitOutputDir, outputDir=sequenceOutputDir, relativeOutputDir=individual_sequence.path, logFile=logFile,\ individual_sequence_id=individual_sequence.id, bamFile=None, library=library, mate_id=mate_id, \ parentJobLs=[splitReadFileJob1, sequenceOutputDirJob], job_max_memory=100, walltime = 60, \ commit=self.commit, extraDependentInputLs=[], \ transferOutput=True) no_of_jobs += 3 sys.stderr.write("%s jobs.\n"%(no_of_jobs)) # Write the DAX to stdout outf = open(self.outputFname, 'w') workflow.writeXML(outf) if self.commit: session.commit() else: session.rollback()