Example #1
	def run(self):
		
		if self.debug:
			import pdb
			pdb.set_trace()
		
		counter = 0
		no_of_vcf = 0
		real_counter = 0
		for inputFname in self.inputFnameLs:
			counter += 1
			if os.path.isfile(inputFname):
				try:
					if NextGenSeq.isFileNameVCF(inputFname, includeIndelVCF=False):
						no_of_vcf += 1
						if NextGenSeq.isVCFFileEmpty(inputFname, checkContent=self.checkEmptyVCFByReading):
							if self.commit:
								if self.report:
									sys.stderr.write("file %s deleted.\n"%(inputFname))
								commandline = 'rm %s'%(inputFname)
								return_data = runLocalCommand(commandline, report_stderr=True, report_stdout=True)
							real_counter += 1
				except Exception:
					sys.stderr.write('Except type: %s\n'%repr(sys.exc_info()))
					import traceback
					traceback.print_exc()
			if self.report and counter%500==0:
				sys.stderr.write("%s%s\t%s\t%s"%('\x08'*80, counter, no_of_vcf, real_counter))
		sys.stderr.write("%s%s\t%s\t%s\n"%('\x08'*80, counter, no_of_vcf, real_counter))
		sys.stderr.write("%s files in total.\n"%(counter))
		sys.stderr.write("Out of %s VCF files, %s are empty and were deleted.\n"%(no_of_vcf, real_counter))

Example #2
	def run(self):
		"""
		2011-7-11
		"""
		
		if self.debug:
			import pdb
			pdb.set_trace()
		if NextGenSeq.isVCFFileEmpty(self.inputFname, checkContent=True):
			sys.stderr.write("Input %s doesn't exist or no variants in it.\n"%(self.inputFname))
			#make sure some output files will exist for downstream jobs.
			self.openOutputFiles(self.outputFnamePrefix, self.windowSize)
			sys.exit(0)
		
		vcfFile = VCFFile(inputFname=self.inputFname, minDepth=self.minDepth)
		trio_col_index_data = self.findTrioIndex(vcfFile.sample_id2index, self.trio_id)
		father_index = trio_col_index_data.father_index
		mother_index = trio_col_index_data.mother_index
		child_index = trio_col_index_data.child_index
		outputDStruc = self.openOutputFiles(self.outputFnamePrefix, self.windowSize)
		if (father_index==-1 and mother_index!=-1) or (father_index!=-1 and mother_index==-1):
			#exactly one parent is missing, so treat this trio as a duo
			self._calculateForDuo(vcfFile, outputDStruc=outputDStruc, trio_col_index_data=trio_col_index_data)
		else:
			self._calculateForTrio(vcfFile, outputDStruc=outputDStruc, trio_col_index_data=trio_col_index_data)
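
Example #2 looks up the father, mother, and child column indices via vcfFile.sample_id2index and falls back to _calculateForDuo() when exactly one parent is absent. The VCFFile class is not shown; a sketch of how such a sample-to-column map is typically built from the #CHROM header line (assumed behavior, not the actual pymodule implementation):

def build_sample_id2index(header_line):
	#the nine fixed VCF columns (CHROM..FORMAT) come first; samples start at column 9
	fields = header_line.rstrip('\n').split('\t')
	return dict((sample_id, i) for i, sample_id in enumerate(fields[9:]))

header = "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tfather\tmother\tchild"
sample_id2index = build_sample_id2index(header)
father_index = sample_id2index.get('father', -1)	#-1 marks a missing member, as in the duo test above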
		
		"""
	def add2DB(self, db=None, individual_alignment_id=None, inputFname=None, format=None, minDP=None, maxDP=None, minBaseQ=None, minMapQ=None,\
			minRMSMapQ=None, minDistanceToIndel=None, comment=None, data_dir=None, commit=0):
		"""
		2012.11.13
		"""
		session = db.session
		session.begin()
		
		#2012.11.13 check if it's in db already
		db_entry = db.checkIndividualAlignmentConsensusSequence(individual_alignment_id=individual_alignment_id, minDP=minDP, \
									maxDP=maxDP, minBaseQ=minBaseQ, minMapQ=minMapQ,\
									minRMSMapQ=minRMSMapQ, minDistanceToIndel=minDistanceToIndel)
		if db_entry:
			sys.stderr.write("Warning: IndividualAlignmentConsensusSequence of (individual_alignment_id=%s, minDP %s, maxDP %s, etc.) already in db with id=%s.\n"%\
							(individual_alignment_id, minDP, maxDP, db_entry.id))
			sys.exit(3)
		else:
			countData = NextGenSeq.countNoOfChromosomesBasesInFastQFile(inputFname)
			no_of_chromosomes = countData.no_of_chromosomes
			no_of_bases = countData.no_of_bases
			
			db_entry = db.getIndividualAlignmentConsensusSequence(individual_alignment_id=individual_alignment_id, format=format, \
									minDP=minDP, maxDP=maxDP, minBaseQ=minBaseQ, \
									minMapQ=minMapQ, minRMSMapQ=minRMSMapQ, minDistanceToIndel=minDistanceToIndel, \
									no_of_chromosomes=no_of_chromosomes,no_of_bases=no_of_bases, \
									original_path=os.path.abspath(inputFname), data_dir=data_dir)
		
		if commit:
			inputFileBasename = os.path.basename(inputFname)
			#moveFileIntoDBAffiliatedStorage() will also set db_entry.path
			exitCode = db.moveFileIntoDBAffiliatedStorage(db_entry=db_entry, filename=inputFileBasename, \
									inputDir=os.path.split(inputFname)[0], \
									outputDir=data_dir,\
									relativeOutputDir=None, shellCommand='cp -rL', \
									srcFilenameLs=self.srcFilenameLs, dstFilenameLs=self.dstFilenameLs,\
									constructRelativePathFunction=db_entry.constructRelativePath, data_dir=data_dir)
			
			if exitCode!=0:
				sys.stderr.write("Error: moveFileIntoDBAffiliatedStorage() exits with %s code.\n"%(exitCode))
				session.rollback()
				self.cleanUpAndExitOnFailure(exitCode=exitCode)
			
			session.flush()
			session.commit()
		else:	#no commit requested: roll back so a dry run leaves the database untouched
			session.rollback()
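
add2DB() follows a check-then-insert transaction pattern: open a session, exit early if an equivalent entry already exists, otherwise create the entry, move the file into DB-affiliated storage, and commit only when asked. A self-contained sketch of the same pattern in plain SQLAlchemy (hypothetical table and column names with SQLite in memory, not the actual VervetDB schema):

from sqlalchemy import Column, Integer, create_engine
from sqlalchemy.orm import declarative_base, sessionmaker

Base = declarative_base()

class ConsensusSequence(Base):	#hypothetical stand-in table
	__tablename__ = 'individual_alignment_consensus_sequence'
	id = Column(Integer, primary_key=True)
	individual_alignment_id = Column(Integer, unique=True)

def add_consensus_sequence(session, individual_alignment_id, commit=False):
	db_entry = session.query(ConsensusSequence).filter_by(
				individual_alignment_id=individual_alignment_id).first()
	if db_entry:
		return db_entry	#already in db; the example exits with code 3 at this point
	db_entry = ConsensusSequence(individual_alignment_id=individual_alignment_id)
	session.add(db_entry)
	if commit:
		session.flush()	#surface constraint errors before committing
		session.commit()
	else:
		session.rollback()	#dry run: leave the database untouched
	return db_entry

engine = create_engine('sqlite://')
Base.metadata.create_all(engine)
session = sessionmaker(bind=engine)()
add_consensus_sequence(session, individual_alignment_id=1, commit=True)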
Example #4
	def run(self):
		"""
		"""
		
		if self.debug:
			import pdb
			pdb.set_trace()
		
		db_vervet = self.db_vervet
		if not self.data_dir:
			self.data_dir = db_vervet.data_dir
		
		if not self.local_data_dir:
			self.local_data_dir = db_vervet.data_dir
		
		# Create an abstract DAG
		workflowName = os.path.splitext(os.path.basename(self.outputFname))[0]
		workflow = self.initiateWorkflow(workflowName)
		
		self.registerJars(workflow)
		self.registerCommonExecutables(workflow)
		self.registerCustomExecutables(workflow)
		
		refSequence = VervetDB.IndividualSequence.get(self.ref_ind_seq_id)
		
		refFastaFname = os.path.join(self.data_dir, refSequence.path)
		registerReferenceData = yh_pegasus.registerRefFastaFile(workflow, refFastaFname, registerAffiliateFiles=True, \
						input_site_handler=self.input_site_handler,\
						checkAffiliateFileExistence=True)
		refFastaFList = registerReferenceData.refFastaFList
		
		self.outputAlignmentDepthAndOthersForFilter(self.alnStatForFilterFname, ref_ind_seq_id=self.ref_ind_seq_id, \
												foldChange=self.depthFoldChange, minGQ=self.minGQ)
		alnStatForFilterF = self.registerOneInputFile(workflow, self.alnStatForFilterFname)
		
		#names to distinguish vcf1Dir from vcf2Dir
		vcf1Name = self.findProperVCFDirIdentifier(self.vcf1Dir, defaultName='vcf1')
		vcf2Name = self.findProperVCFDirIdentifier(self.vcf2Dir, defaultName='vcf2')
		if vcf2Name==vcf1Name or not vcf2Name:
			vcf2Name = "vcf2"
		
		no_of_jobs = 0
		vcf1DepthFilterDir = "%s_DepthFilter"%(vcf1Name)
		vcf1DepthFilterDirJob = self.addMkDirJob(outputDir=vcf1DepthFilterDir)
		#vcf2DepthFilterDir = "%s_DepthFilter"%(vcf2Name)
		#vcf2DepthFilterDirJob = yh_pegasus.addMkDirJob(workflow, mkdir=workflow.mkdirWrap, outputDir=vcf2DepthFilterDir)
		
		trioInconsistencyDir = "trioInconsistency"
		trioInconsistencyDirJob = self.addMkDirJob(outputDir=trioInconsistencyDir)
		
		
		SNPMismatchStatDir = "SNPMismatchStat"
		SNPMismatchStatDirJob = self.addMkDirJob(outputDir=SNPMismatchStatDir)
		
		input_site_handler = self.input_site_handler
		
		
		#whole genome reduction job.
		wholeGenomeSiteStatFile = File('siteStatAndTrioInconsistency.tsv')
		wholeGenomeSiteStatMergeJob = self.addStatMergeJob(workflow, statMergeProgram=workflow.mergeSameHeaderTablesIntoOne, \
							outputF=wholeGenomeSiteStatFile,transferOutput=False)
		
		wholeGenomeSiteStatBGzipFile = File("%s.gz"%wholeGenomeSiteStatFile.name)
		wholeGenomeSiteStatBGZipTabixJob = self.addBGZIP_tabix_Job(workflow, bgzip_tabix=workflow.bgzip_tabix, \
							parentJob=wholeGenomeSiteStatMergeJob, inputF=wholeGenomeSiteStatFile, \
							outputF=wholeGenomeSiteStatBGzipFile, \
							transferOutput=True, tabixArguments="-s 1 -b 2 -e 2")
		no_of_jobs += 5
		
		#read trioInconsistencyByPosistionFname, figure out which contigs it covers, and add one extraction job per contig
		chrLs = self.getChrListInTrioInconsistencyFile(self.tabixPath, self.trioInconsistencyByPosistionFname)
		chr2tabixRetrieveJob = {}
		trioInconsistencyByPosistionF = self.registerOneInputFile(workflow, self.trioInconsistencyByPosistionFname)
		trioInconsistencyByPosistion_tbi_Fname = '%s.tbi'%(self.trioInconsistencyByPosistionFname)
		trioInconsistencyByPosistion_tbi_F = self.registerOneInputFile(workflow, trioInconsistencyByPosistion_tbi_Fname)
		
		for chr in chrLs:
			outputF = File(os.path.join(trioInconsistencyDir, '%s.trioInconsistency.tsv'%chr))
			tabixRetrieveJob = self.addTabixRetrieveJob(workflow, executable=workflow.tabixRetrieve, tabixPath=self.tabixPath, \
							inputF=trioInconsistencyByPosistionF, outputF=outputF, regionOfInterest=chr, includeHeader=True,\
							parentJobLs=[trioInconsistencyDirJob], job_max_memory=100, extraDependentInputLs=[trioInconsistencyByPosistion_tbi_F], \
							transferOutput=False)
			chr2tabixRetrieveJob[chr] = tabixRetrieveJob
			no_of_jobs += 1
		
		counter = 0
		no_of_vcf = 0
		no_of_good_vcf = 0
		for inputFname in os.listdir(self.vcf1Dir):
			counter += 1
			if counter%500==0:
				sys.stderr.write("%s %s jobs %s good vcf, %s total vcf, %s total files"%('\x08'*180, no_of_jobs, \
														no_of_good_vcf, no_of_vcf, counter))
			
			vcf1AbsPath = os.path.join(os.path.abspath(self.vcf1Dir), inputFname)
			vcf2AbsPath = os.path.join(os.path.abspath(self.vcf2Dir), inputFname)
			if NextGenSeq.isFileNameVCF(inputFname, includeIndelVCF=False) and not NextGenSeq.isVCFFileEmpty(vcf1AbsPath):
				if not NextGenSeq.isVCFFileEmpty(vcf2AbsPath, checkContent=self.checkEmptyVCFByReading):	#make sure the samtools vcf exists
					no_of_vcf += 1
					chr = self.getChrFromFname(inputFname)
					if not chr or chr not in chr2tabixRetrieveJob:
						continue
					no_of_good_vcf += 1
					#find the contig id and the matching tabix job
					commonPrefix = inputFname.split('.')[0]
					vcf1 = File(os.path.join(vcf1Name, inputFname))	#relative path
					vcf1.absPath = vcf1AbsPath
					self.registerVCFAndItsTabixIndex(workflow, vcf1, input_site_handler)
					vcf2 = File(os.path.join(vcf2Name, inputFname))	#relative path
					vcf2.absPath = vcf2AbsPath
					self.registerVCFAndItsTabixIndex(workflow, vcf2, input_site_handler)
					
					outputSiteStatF = File(os.path.join(vcf1DepthFilterDir, '%s.siteStat.tsv'%(commonPrefix)))
					vcf1FilterByDepthJob = self.addFilterVCFByDepthJob(workflow, FilterVCFByDepthJava=workflow.FilterVCFByDepthJava, \
							GenomeAnalysisTKJar=workflow.GenomeAnalysisTKJar, \
							refFastaFList=refFastaFList, inputVCFF=vcf1, outputVCFF=None, outputSiteStatF=outputSiteStatF,\
							parentJobLs=[vcf1DepthFilterDirJob], \
							alnStatForFilterF=alnStatForFilterF, \
							extraDependentInputLs=[vcf1.tbi_F], onlyKeepBiAllelicSNP=self.onlyKeepBiAllelicSNP)
					
					snpMisMatchStatFile = File(os.path.join(SNPMismatchStatDir, '%s_snpMismatchStat.tsv'%(os.path.splitext(commonPrefix)[0])))
					calculateSNPMismatchRateOfTwoVCFJob = self.addCalculateTwoVCFSNPMismatchRateJob(workflow, \
							executable=workflow.CalculateSNPMismatchRateOfTwoVCF, \
							vcf1=vcf1, vcf2=vcf2, snpMisMatchStatFile=snpMisMatchStatFile, \
							maxSNPMismatchRate=1.0, parentJobLs=[SNPMismatchStatDirJob], \
							job_max_memory=1000, extraDependentInputLs=[], \
							transferOutput=False)
					
					#add a ReduceMatrixByMergeColumnsWithSameKey job
					chrMergingStatF = File('%s_variantSiteStatAndTrioInconsistencyRate.tsv'%(chr))
					chrMergingStatJob = self.addStatMergeJob(workflow, \
									statMergeProgram=workflow.ReduceMatrixByMergeColumnsWithSameKey, \
									outputF=chrMergingStatF, extraArguments='-k 0,1', transferOutput=False)
					tabixRetrieveJob = chr2tabixRetrieveJob[chr]
					self.addInputToStatMergeJob(workflow, statMergeJob=chrMergingStatJob, \
								inputF=tabixRetrieveJob.output, \
								parentJobLs=[tabixRetrieveJob])
					
					self.addInputToStatMergeJob(workflow, statMergeJob=chrMergingStatJob, \
								inputF=outputSiteStatF, \
								parentJobLs=[vcf1FilterByDepthJob])
					self.addInputToStatMergeJob(workflow, statMergeJob=chrMergingStatJob, \
								inputF=snpMisMatchStatFile, \
								parentJobLs=[calculateSNPMismatchRateOfTwoVCFJob])
					
					#add to the whole genome reduction job
					self.addInputToStatMergeJob(workflow, statMergeJob=wholeGenomeSiteStatMergeJob, \
								inputF=chrMergingStatJob.output, \
								parentJobLs=[chrMergingStatJob])
					no_of_jobs += 3
		
		sys.stderr.write("%s %s jobs %s good vcf, %s total vcf, %s total files.\n"%('\x08'*180, no_of_jobs, \
														no_of_good_vcf, no_of_vcf, counter))
		
		# Write the DAX to the output file
		outf = open(self.outputFname, 'w')
		workflow.writeXML(outf)
		outf.close()
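
Example #4 assembles a Pegasus DAX: one tabix-retrieval job per chromosome feeds a per-chromosome ReduceMatrixByMergeColumnsWithSameKey job, which in turn feeds a whole-genome merge followed by a bgzip+tabix job (the "-s 1 -b 2 -e 2" arguments tell tabix which columns hold the sequence name and the begin/end coordinates). Outside Pegasus, each retrieval job boils down to a single tabix call; a sketch, assuming the tabix binary is on PATH and the input file is bgzipped and indexed:

import subprocess

def tabix_retrieve(input_gz, chromosome, output_path, include_header=True):
	#extract all rows for one chromosome from a bgzipped, tabix-indexed file
	cmd = ['tabix']
	if include_header:
		cmd.append('-h')	#also emit the header lines
	cmd += [input_gz, chromosome]
	with open(output_path, 'w') as out_f:
		subprocess.check_call(cmd, stdout=out_f)

#e.g. tabix_retrieve('trioInconsistency.tsv.gz', 'Contig0', 'Contig0.trioInconsistency.tsv')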
Example #5
	def run(self):
		"""
		2012.7.13
		"""
		if self.debug:
			import pdb
			pdb.set_trace()
		session = self.db_vervet.session
		
		session.begin()
		if not self.data_dir:
			self.data_dir = self.db_vervet.data_dir
		data_dir = self.data_dir
		
		realPath = os.path.realpath(self.inputFname)
		logMessage = "file %s.\n"%(self.inputFname)
		if NextGenSeq.isFileNameVCF(realPath, includeIndelVCF=True) and \
				not NextGenSeq.isVCFFileEmpty(realPath, checkContent=self.checkEmptyVCFByReading):
			vcfFile = VCFFile(inputFname=self.inputFname)
			
			individualAlignmentLs = self.getAlignmentLsFromVCF(db_vervet=self.db_vervet, vcfFile=vcfFile)
			
			genotypeMethod = self.db_vervet.getGenotypeMethod(short_name=self.genotypeMethodShortName, \
															individualAlignmentLs=individualAlignmentLs,\
															no_of_individuals=len(individualAlignmentLs), no_of_loci=None,\
															data_dir=self.data_dir)
			self.checkIfAlignmentListMatchMethodDBEntry(individualAlignmentLs, genotypeMethod, session)
			
			pdata = self.getNoOfLociFromVCFFile(vcfFile)
			chromosome2noOfLoci = pdata.chromosome2noOfLoci
			no_of_loci = pdata.no_of_loci
			if no_of_loci>0:	#files with zero loci could share an identical md5sum
				try:
					md5sum = utils.get_md5sum(realPath)
				except Exception:
					sys.stderr.write('Except type: %s\n'%repr(sys.exc_info()))
					import traceback
					traceback.print_exc()
					self.cleanUpAndExitOnFailure(exitCode=4)
			else:
				md5sum = None
			"""
			db_entry = VervetDB.GenotypeFile.query.filter_by(md5sum=md5sum).first()
			if db_entry:
				sys.stderr.write("Warning: another file %s with the identical md5sum %s as this file %s is already in db.\n"%\
									(db_entry.path, md5sum, realPath))
				session.rollback()
				#2012.8.3 when the jobs are clustered into one merged job and it failed halfway
				# and retried elsewhere, the redundancy check should not exit with non-zero. otherwise the merged job would fail again. 
				self.cleanUpAndExitOnFailure(exitCode=0)
			"""
			no_of_individuals = len(individualAlignmentLs)
			no_of_chromosomes = len(chromosome2noOfLoci)
			if no_of_chromosomes == 1:	#2012.8.30 use 1st chromosome
				chromosome = chromosome2noOfLoci.keys()[0]
			else:
				chromosome = None
			genotypeFile = self.db_vervet.getGenotypeFile(genotype_method=genotypeMethod,\
										chromosome=chromosome, format=self.format, path=None, file_size=None, md5sum=md5sum,\
										original_path=realPath, no_of_individuals=no_of_individuals, no_of_loci=no_of_loci,\
										data_dir=self.data_dir, no_of_chromosomes=no_of_chromosomes)
			if genotypeFile.id and genotypeFile.path:
				isPathInDB = self.db_vervet.isPathInDBAffiliatedStorage(relativePath=genotypeFile.path, data_dir=self.data_dir)
				if isPathInDB==-1:
					sys.stderr.write("Error while updating genotypeFile.path with the new path, %s.\n"%(genotypeFile.path))
					self.cleanUpAndExitOnFailure(exitCode=isPathInDB)
				elif isPathInDB==1:	#successful exit, entry already in db
					sys.stderr.write("Warning: file %s is already in db.\n"%\
										(genotypeFile.path))
					session.rollback()
					self.cleanUpAndExitOnFailure(exitCode=0)
				else:	#not in db affiliated storage, keep going.
					pass
			#move the file and update the db_entry's path as well
			inputFileBasename = os.path.basename(self.inputFname)
			relativePath = genotypeFile.constructRelativePath(sourceFilename=inputFileBasename)
			exitCode = self.db_vervet.moveFileIntoDBAffiliatedStorage(db_entry=genotypeFile, filename=inputFileBasename, \
									inputDir=os.path.split(self.inputFname)[0], dstFilename=os.path.join(self.data_dir, relativePath), \
									relativeOutputDir=None, shellCommand='cp -rL', \
									srcFilenameLs=self.srcFilenameLs, dstFilenameLs=self.dstFilenameLs,\
									constructRelativePathFunction=genotypeFile.constructRelativePath)
			
			if exitCode!=0:
				sys.stderr.write("Error: moveFileIntoDBAffiliatedStorage() exits with %s code.\n"%(exitCode))
				session.rollback()
				self.cleanUpAndExitOnFailure(exitCode=exitCode)
			
			#copy the tbi (tabix) index file if it exists
			tbiFilename = '%s.tbi'%(realPath)
			if os.path.isfile(tbiFilename):
				srcFilename = tbiFilename
				dstFilename = os.path.join(self.data_dir, '%s.tbi'%(genotypeFile.path))
				utils.copyFile(srcFilename=srcFilename, dstFilename=dstFilename)
				logMessage += "tbi file %s has been copied to %s.\n"%(srcFilename, dstFilename)
			## 2012.7.17 commented out because md5sum is calculated above
			#db_vervet.updateDBEntryMD5SUM(db_entry=genotypeFile, data_dir=data_dir)
			#2012.7.17 record the size of db_entry.path (folder or file)
			self.db_vervet.updateDBEntryPathFileSize(db_entry=genotypeFile, data_dir=self.data_dir)
			
			vcfFile.close()
			logMessage += "%s individuals, %s loci, md5sum=%s.\n"%(no_of_individuals, no_of_loci, md5sum)
		else:
			logMessage += " is empty (no loci) or not VCF file.\n"
		self.outputLogMessage(logMessage)
		
		if self.commit:
			try:
				session.flush()
				session.commit()
			except Exception:
				sys.stderr.write('Except type: %s\n'%repr(sys.exc_info()))
				import traceback
				traceback.print_exc()
				self.cleanUpAndExitOnFailure(exitCode=3)
		else:
			session.rollback()
			#delete all target files but exit gracefully (exit 0)
			self.cleanUpAndExitOnFailure(exitCode=0)
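
Example #5 fingerprints the VCF with utils.get_md5sum() before registering it, which is what lets the (commented-out) duplicate check match files by content. The helper itself is not shown; a streaming version is straightforward with hashlib (a sketch under that assumption):

import hashlib

def get_md5sum(path, chunk_size=1 << 20):
	#hash the file in 1 MiB chunks so multi-gigabyte VCFs never sit in memory
	digest = hashlib.md5()
	with open(path, 'rb') as f:
		for chunk in iter(lambda: f.read(chunk_size), b''):
			digest.update(chunk)
	return digest.hexdigest()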