def preReduce(self, workflow=None, outputDirPrefix="", passingData=None, transferOutput=True, **keywords):
    """
    2013.06.14 moved topOutputDirJob from addAllJobs() to here.
    2012.9.17
    """
    if workflow is None:
        workflow = self
    returnData = PassingData(no_of_jobs=0)
    returnData.jobDataLs = []

    self.topOutputDirJob = self.addMkDirJob(outputDir="%sRun" % (outputDirPrefix))
    passingData.topOutputDirJob = self.topOutputDirJob

    mapDirJob = yh_pegasus.addMkDirJob(workflow, mkdir=workflow.mkdirWrap, \
        outputDir="%sMap" % (outputDirPrefix))
    passingData.mapDirJob = mapDirJob
    returnData.mapDirJob = mapDirJob
    self.mapDirJob = mapDirJob

    reduceOutputDirJob = yh_pegasus.addMkDirJob(workflow, mkdir=workflow.mkdirWrap, \
        outputDir="%sReduce" % (outputDirPrefix))
    passingData.reduceOutputDirJob = reduceOutputDirJob
    returnData.reduceOutputDirJob = reduceOutputDirJob
    self.reduceOutputDirJob = reduceOutputDirJob

    self.plotDirJob = self.addMkDirJob(outputDir="%sPlot" % (outputDirPrefix))
    self.statDirJob = self.addMkDirJob(outputDir="%sStat" % (outputDirPrefix))
    self.reduceStatDirJob = self.addMkDirJob(outputDir="%sReduceStat" % (outputDirPrefix))
    self.reduceEachInputDirJob = self.addMkDirJob(outputDir="%sReduceEachInput" % (outputDirPrefix))
    self.reduceEachChromosomeDirJob = self.addMkDirJob(outputDir="%sReduceEachChromosome" % (outputDirPrefix))
    return returnData
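# Example (a minimal sketch, not from this file): a subclass's addAllJobs()
# would typically call preReduce() first and then hang per-input jobs off the
# directory jobs created above; the addAllJobs/addGenericJob wiring below is
# an assumption for illustration only.
#
#     passingData = PassingData(topOutputDirJob=None, mapDirJob=None, reduceOutputDirJob=None)
#     preReduceReturnData = self.preReduce(outputDirPrefix=outputDirPrefix, \
#         passingData=passingData, transferOutput=False)
#     # per-input map jobs run inside the "...Map" directory:
#     mapJob = self.addGenericJob(..., parentJobLs=[passingData.mapDirJob])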
def addJobs(self, workflow=None, inputData=None, db_vervet=None, genotypeMethodShortName=None, commit=None, \
        data_dir=None, checkEmptyVCFByReading=False, transferOutput=True, \
        maxContigID=None, outputDirPrefix="", needSSHDBTunnel=False):
    """
    2012.5.9
    """
    sys.stderr.write("Adding VCF2DB jobs for %s VCF files ... " % (len(inputData.jobDataLs)))

    topOutputDir = "%sVCF2DB" % (outputDirPrefix)
    topOutputDirJob = yh_pegasus.addMkDirJob(workflow, mkdir=workflow.mkdirWrap, outputDir=topOutputDir)

    firstVCFFile = inputData.jobDataLs[0].vcfFile
    logFile = File(os.path.join(topOutputDir, 'AddGenotypeMethod2DB.log'))
    # the log file lives in topOutputDir, so this job must run after the mkdir job
    addGM2DBJob = self.addAddGenotypeMethod2DBJob(executable=self.AddGenotypeMethod2DB, inputFile=firstVCFFile, \
        genotypeMethodShortName=genotypeMethodShortName, \
        logFile=logFile, data_dir=data_dir, commit=commit, parentJobLs=[topOutputDirJob], \
        extraDependentInputLs=[], transferOutput=True, \
        extraArguments=None, job_max_memory=10, sshDBTunnel=needSSHDBTunnel)

    updateGMlogFile = File(os.path.join(topOutputDir, 'updateGM.log'))
    updateGMNoOfLociJob = self.addUpdateGenotypeMethodNoOfLociJob(executable=self.UpdateGenotypeMethodNoOfLoci, \
        genotypeMethodShortName=genotypeMethodShortName, \
        logFile=updateGMlogFile, data_dir=data_dir, commit=commit, parentJobLs=[topOutputDirJob], \
        extraDependentInputLs=[], transferOutput=True, \
        extraArguments=None, job_max_memory=20, sshDBTunnel=needSSHDBTunnel)

    returnData = PassingData()
    returnData.jobDataLs = []
    for jobData in inputData.jobDataLs:
        inputF = jobData.vcfFile
        if maxContigID:
            contig_id = self.getContigIDFromFname(inputF.name)
            try:
                contig_id = int(contig_id)
                if contig_id > maxContigID:    # skip contigs beyond maxContigID
                    continue
            except:
                sys.stderr.write('Except type: %s\n' % repr(sys.exc_info()))
                import traceback
                traceback.print_exc()
        logFile = File(os.path.join(topOutputDir, 'AddVCFFile2DB_%s.log' % (self.getChrFromFname(inputF.name))))
        addVCFJob = self.addAddVCFFile2DBJob(executable=self.AddVCFFile2DB, inputFile=inputF, \
            genotypeMethodShortName=genotypeMethodShortName, \
            logFile=logFile, format="VCF", data_dir=data_dir, \
            checkEmptyVCFByReading=checkEmptyVCFByReading, commit=commit, \
            parentJobLs=[addGM2DBJob] + jobData.jobLs, extraDependentInputLs=[], transferOutput=True, \
            extraArguments=None, job_max_memory=1000, sshDBTunnel=needSSHDBTunnel)
        # the no-of-loci update must wait until every per-VCF job has finished
        workflow.depends(parent=addVCFJob, child=updateGMNoOfLociJob)
    sys.stderr.write("%s jobs.\n" % (self.no_of_jobs))
    # include the updateGM log file in fileLs so downstream jobs can depend on it
    returnData.jobDataLs.append(PassingData(jobLs=[updateGMNoOfLociJob], file=updateGMlogFile, \
        fileLs=[updateGMlogFile]))
    return returnData
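# Example (a minimal sketch, assumed wiring, not from this file): addJobs()
# expects inputData whose jobDataLs entries each carry a vcfFile and jobLs,
# e.g. from a VCF-folder registration helper; registerAllInputFiles() below
# is an assumed name for such a helper.
#
#     inputData = self.registerAllInputFiles(workflow, inputDir=self.inputDir, \
#         checkEmptyVCFByReading=self.checkEmptyVCFByReading)
#     self.addJobs(workflow=workflow, inputData=inputData, db_vervet=db_vervet, \
#         genotypeMethodShortName=self.genotypeMethodShortName, commit=self.commit, \
#         data_dir=self.data_dir, maxContigID=self.maxContigID, outputDirPrefix="", \
#         needSSHDBTunnel=self.needSSHDBTunnel)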
def run(self):
    """
    2011-7-11
    """
    if self.debug:
        import pdb
        pdb.set_trace()

    db_vervet = self.db_vervet
    session = db_vervet.session
    session.begin()

    if not self.data_dir:
        self.data_dir = db_vervet.data_dir
    if not self.local_data_dir:
        self.local_data_dir = db_vervet.data_dir

    workflow = self.initiateWorkflow()

    self.registerJars(workflow)
    self.registerCustomJars(workflow)
    self.registerExecutables(workflow)
    self.registerCustomExecutables(workflow)

    isq_id2LibrarySplitOrder2FileLs = db_vervet.getISQ_ID2LibrarySplitOrder2FileLs(self.ind_seq_id_ls, \
        data_dir=self.data_dir, filtered=0, ignoreEmptyReadFile=False)    # 2012.6.1 an unfiltered read file shouldn't be empty
    to_work_ind_seq_id_set = set()
    parent_individual_sequence_file_id_set = set()
    for ind_seq_id, LibrarySplitOrder2FileLs in isq_id2LibrarySplitOrder2FileLs.iteritems():
        parent_individual_sequence = VervetDB.IndividualSequence.get(ind_seq_id)
        if parent_individual_sequence is not None and parent_individual_sequence.format == 'fastq':
            """
            Check whether the child individual_sequence already exists in db.
            If it does, what about its files? If not, go add filtering jobs.
            """
            # 2012.6.8
            individual_sequence = db_vervet.copyParentIndividualSequence(parent_individual_sequence=parent_individual_sequence, \
                parent_individual_sequence_id=ind_seq_id, \
                quality_score_format='Standard', filtered=1, data_dir=self.data_dir)
            """
            # 2012.6.8 use db_vervet.copyParentIndividualSequence() instead.
            individual_sequence = db_vervet.getIndividualSequence(individual_id=parent_individual_sequence.individual_id, \
                sequencer=parent_individual_sequence.sequencer, sequence_type=parent_individual_sequence.sequence_type, \
                sequence_format=parent_individual_sequence.format, path_to_original_sequence=None, tissue_name=None, coverage=None, \
                quality_score_format='Standard', filtered=1, \
                parent_individual_sequence_id=parent_individual_sequence.id, data_dir=self.data_dir)
            """
            library_split_order2filtered_db_entry_ls = self.getLibrarySplitOrder2DBEntryLs(individual_sequence)

            sequenceOutputDirJob = None
            filteredReadOutputDirJob = None
            for key, fileObjLs in LibrarySplitOrder2FileLs.iteritems():
                if key in library_split_order2filtered_db_entry_ls:
                    sys.stderr.write("Warning: this pair of filtered individual_sequence_file(s), %s, parent_individual_sequence (id=%s, %s), individual_sequence (id=%s, %s) are already in db. Skip.\n" % \
                        (repr(key), parent_individual_sequence.id, parent_individual_sequence.individual.code, \
                        individual_sequence.id, individual_sequence.individual.code))
                    continue
                else:
                    if sequenceOutputDirJob is None:
                        sequenceOutputDir = os.path.join(self.data_dir, individual_sequence.path)
                        sequenceOutputDirJob = yh_pegasus.addMkDirJob(workflow, mkdir=workflow.mkdirWrap, outputDir=sequenceOutputDir)
                    if filteredReadOutputDirJob is None:
                        filteredReadOutputDir = os.path.basename(individual_sequence.path)    # relative path; lands in the workflow scratch directory
                        filteredReadOutputDirJob = yh_pegasus.addMkDirJob(workflow, mkdir=workflow.mkdirWrap, outputDir=filteredReadOutputDir)

                library, split_order = key[:2]

                # add the filter job for this library/split-order pair
                filterShortRead_job = self.addFilterReadJob(executable=self.FilterReadJava, jar=workflow.FilterReadJar, \
                    parentJobLs=[filteredReadOutputDirJob], job_max_memory=2000, walltime=120, \
                    extraDependentInputLs=None, transferOutput=False)
                for i in xrange(len(fileObjLs)):
                    fileObj = fileObjLs[i]
                    try:    # 2012.7.2
                        inputFile = self.registerOneInputFile(workflow, inputFname=fileObj.path, folderName='inputIndividualSequenceFile')
                    except:
                        import pdb
                        pdb.set_trace()
                    # take the base filename as the output filename; it'll be in scratch/.
                    outputFname = os.path.join(filteredReadOutputDir, os.path.basename(fileObj.path))
                    outputFile = File(outputFname)
                    if i == 0:    # 1st mate; also pass the quality_score_format
                        filterShortRead_job.addArguments('V=%s' % fileObj.db_entry.quality_score_format)
                        filterShortRead_job.addArguments("I=", inputFile, 'O=', outputFile)
                    elif i == 1:    # 2nd mate
                        filterShortRead_job.addArguments("J=", inputFile, 'P=', outputFile)
                    else:
                        sys.stderr.write("Error: mate %s appeared in paired-end data (individualSequenceID=%s); at most 2 mates are expected.\n" % (i + 1, ind_seq_id))
                        sys.exit(4)
                    filterShortRead_job.uses(inputFile, transfer=True, register=True, link=Link.INPUT)
                    filterShortRead_job.uses(outputFile, transfer=False, register=True, link=Link.OUTPUT)

                    logFile = File('%s_%s.register.log' % (individual_sequence.id, fileObj.db_entry.id))
                    addFilteredSequences2DB_job = self.addAddFilteredSequences2DB_job(workflow, \
                        executable=workflow.AddFilteredSequences2DB, \
                        inputFile=outputFile, individual_sequence_id=individual_sequence.id, outputDir=sequenceOutputDir, \
                        logFile=logFile, \
                        parent_individual_sequence_file_id=fileObj.db_entry.id, \
                        parentJobLs=[sequenceOutputDirJob, filterShortRead_job], commit=self.commit, \
                        extraDependentInputLs=None, transferOutput=True, sshDBTunnel=self.needSSHDBTunnel)
                    to_work_ind_seq_id_set.add(ind_seq_id)
                    parent_individual_sequence_file_id_set.add(fileObj.db_entry.id)
    sys.stderr.write("%s jobs, %s individual_sequence entries, %s parent_individual_sequence_file_ids.\n" % \
        (self.no_of_jobs, len(to_work_ind_seq_id_set), len(parent_individual_sequence_file_id_set)))

    # write the DAX to the output file
    outf = open(self.outputFname, 'w')
    workflow.writeXML(outf)

    if self.commit:
        session.commit()
    else:
        session.rollback()
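# Entry-point sketch (an assumption: the class name FilterReadPipeline and the
# ProcessOptions usage below illustrate this codebase's usual pattern for
# invoking run(), and are not confirmed by this file):
#
#     if __name__ == '__main__':
#         main_class = FilterReadPipeline    # hypothetical class name
#         po = ProcessOptions(sys.argv, main_class.option_default_dict, error_doc=main_class.__doc__)
#         instance = main_class(**po.long_option2value)
#         instance.run()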