def addBamFileToDB(self, db_vervet, bamFilePath, library=None, individual_sequence_id=None, mate_id=None):
		"""
		2013.04.03 bamFilePath passed to original_path
		2012.4.30
			add mate_id
		2012.1.27
			1. run md5sum
			2. check if it already exists in db
				if not, add it into db
				if yes, exit the program. no more work.
		"""
		sys.stderr.write("Record the bamfile into db ...")
		md5sum = utils.get_md5sum(bamFilePath)
		db_entry = VervetDB.IndividualSequenceFileRaw.query.filter_by(md5sum=md5sum).first()
		if db_entry:
			sys.stderr.write("Warning: another file %s with the identical md5sum %s (library=%s) as this file %s is already in db.\n"%\
								(db_entry.path, md5sum, library, bamFilePath))
			#sys.exit(3)
		else:
			db_entry = db_vervet.getIndividualSequenceFileRaw(individual_sequence_id, library=library, md5sum=md5sum, \
											original_path=bamFilePath, mate_id=mate_id)
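		#sanity check below: group the split files already in db by mate; paired-end data should yield at most two mates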
		mate_id2split_order_ls = {}
		for individual_sequence_file in db_entry.individual_sequence_file_ls:
			mate_id = individual_sequence_file.mate_id
			if mate_id is None:	#sequence entries without mate_id are just from one mate.
				mate_id = 1
			if mate_id not in mate_id2split_order_ls:
				mate_id2split_order_ls[mate_id] = []
			mate_id2split_order_ls[mate_id].append(individual_sequence_file.split_order)
		if len(mate_id2split_order_ls)>2:
			sys.stderr.write("Error: db sequence files spawned from bam file %s (md5sum=%s) form %s(>2) mates.\
				Unless this bam file contains reads from >2 mates, reads from this bam files should be stored in db already.\n"%\
				(db_entry.path, md5sum, len(mate_id2split_order_ls)))
			sys.exit(4)
		return db_entry
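# The duplicate check above hinges on a whole-file MD5 digest from utils.get_md5sum().
# A minimal standalone sketch of such a helper (an assumption about its behavior, not the
# project's actual implementation), reading in chunks so a large bam never has to fit in memory:
import hashlib

def get_md5sum_sketch(filePath, chunkSize=16*1024*1024):
	"""Return the hex MD5 digest of filePath, streamed chunk by chunk."""
	digest = hashlib.md5()
	with open(filePath, 'rb') as f:
		for chunk in iter(lambda: f.read(chunkSize), b''):
			digest.update(chunk)
	return digest.hexdigest()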
Example #2
	def updateDBEntryMD5SUM(self, db_entry=None, data_dir=None, absPath=None):
		"""
		2012.12.15 moved from VervetDB
		2012.7.13
			if absPath is given, take that , rather than construct it from data_dir and db_entry.path
		"""
		from pymodule import utils
		if data_dir is None:
			data_dir = self.data_dir
		
		if hasattr(db_entry, 'path') and db_entry.path:
			db_entry_path = db_entry.path
		elif hasattr(db_entry, 'filename') and db_entry.filename:
			db_entry_path = db_entry.filename
		else:
			db_entry_path = None
		if not absPath and db_entry_path:
			#supplantFilePathWithNewDataDir() (imported/defined elsewhere in this codebase) swaps the old data_dir prefix for the new one
			absPath = supplantFilePathWithNewDataDir(filePath=db_entry_path, oldDataDir=self.data_dir,\
													newDataDir=data_dir)
		
		if absPath and not os.path.isfile(absPath):
			sys.stderr.write("updateDBEntryMD5SUM() Warning: target file %s doesn't exist. Could not update its md5sum.\n"%(absPath))
			return
		md5sum = utils.get_md5sum(absPath)
		if db_entry.md5sum is not None and db_entry.md5sum!=md5sum:
			sys.stderr.write("WARNING: the new md5sum %s is not the same as the existing md5sum %s.\n"%(md5sum, db_entry.md5sum))
		db_entry.md5sum = md5sum
		self.session.add(db_entry)
		self.session.flush()
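# updateDBEntryMD5SUM() rebuilds the absolute path by swapping the old data_dir prefix for the
# new one via supplantFilePathWithNewDataDir(), which is defined elsewhere in the codebase.
# A hedged, standalone sketch of what such a helper could look like (an illustration only):
import os

def supplantFilePathWithNewDataDir_sketch(filePath=None, oldDataDir=None, newDataDir=None):
	"""Re-root filePath under newDataDir: a relative path is joined onto newDataDir;
	an absolute path has its oldDataDir prefix swapped out, otherwise it is returned unchanged."""
	if not os.path.isabs(filePath):
		return os.path.join(newDataDir, filePath)
	oldDataDir = os.path.normpath(oldDataDir) + os.sep
	if filePath.startswith(oldDataDir):
		return os.path.join(newDataDir, filePath[len(oldDataDir):])
	return filePath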
	def run(self):
		"""
		2013.08.04
		"""
		if self.debug:
			import pdb
			pdb.set_trace()
		session = self.db_vervet.session
		
		session.begin()
		if not self.data_dir:
			self.data_dir = self.db_vervet.data_dir
		data_dir = self.data_dir
		
		inputFileRealPath = os.path.realpath(self.inputFname)
		logMessage = "Adding file %s to db .\n"%(self.inputFname)
		
		if os.path.isfile(inputFileRealPath):
			popGenSimulationType = self.db_vervet.getPopGenSimulationType( short_name=None, r=self.r, rho=self.rho, \
							mu=self.mu, theta=self.theta, n0=self.n0, is_selection=self.is_selection,\
							selection_parameters=self.selection_parameters, indel=None, indel_parameters=None, \
							population_size_parameters=self.population_size_parameters, \
							parent_pop_gen_simulation_type_id=self.parent_pop_gen_simulation_type_id)
			popGenSimulation = self.db_vervet.getPopGenSimulation(pop_gen_simulation_type_id=popGenSimulationType.id, \
						replicate_index=self.replicate_index, no_of_populations=self.no_of_populations,\
						no_of_chromosomes=self.no_of_chromosomes, chromosome_length=self.chromosome_length, \
						sample_size=self.sample_size, \
						no_of_polymorphic_loci=self.no_of_polymorphic_loci, programs=self.simulation_programs,\
						original_path=inputFileRealPath, data_dir=self.data_dir)
			try:
				md5sum = utils.get_md5sum(inputFileRealPath)
			except:
				sys.stderr.write('Except type: %s\n'%repr(sys.exc_info()))
				import traceback
				traceback.print_exc()
				self.cleanUpAndExitOnFailure(exitCode=4)
			
			db_entry = VervetDB.PopGenSimulation.query.filter_by(md5sum=md5sum).first()
			if db_entry and db_entry.id!=popGenSimulation.id and db_entry.path and os.path.isfile(os.path.join(data_dir, db_entry.path)):
				sys.stderr.write("Warning: another file %s with the identical md5sum %s as this file %s, is already in db.\n"%\
								(db_entry.path, md5sum, inputFileRealPath))
				self.sessionRollback(session)
				self.cleanUpAndExitOnFailure(exitCode=3)
			
			
			if popGenSimulation.md5sum is None or popGenSimulation.md5sum!=md5sum:
				popGenSimulation.md5sum = md5sum
				session.add(popGenSimulation)
				session.flush()
			try:
				#move the file and update the db_entry's path as well
				exitCode = self.db_vervet.moveFileIntoDBAffiliatedStorage(db_entry=popGenSimulation, \
							filename=os.path.basename(inputFileRealPath), \
							inputDir=os.path.split(inputFileRealPath)[0], dstFilename=os.path.join(self.data_dir, popGenSimulation.path), \
							relativeOutputDir=None, shellCommand='cp -rL', \
							srcFilenameLs=self.srcFilenameLs, dstFilenameLs=self.dstFilenameLs,\
							constructRelativePathFunction=popGenSimulation.constructRelativePath)
			except:
				sys.stderr.write('Exception while copying %s to db-storage: %s\n'%(inputFileRealPath, repr(sys.exc_info())))
				import traceback
				traceback.print_exc()
				self.sessionRollback(session)
				self.cleanUpAndExitOnFailure(exitCode=5)
			
			if exitCode!=0:
				sys.stderr.write("Error: moveFileIntoDBAffiliatedStorage() exits with code=%s.\n"%(exitCode))
				self.sessionRollback(session)
				self.cleanUpAndExitOnFailure(exitCode=exitCode)
			try:
				#make sure these files are stored in self.dstFilenameLs and self.srcFilenameLs
				#copy further files if there are
				if self.inputFnameLs:
					for inputFname in self.inputFnameLs:
						if inputFname!=self.inputFname:	#2013.3.18 make sure it has not been copied.
							logMessage = self.db_vervet.copyFileWithAnotherFilePrefix(inputFname=inputFname, \
												filenameWithPrefix=popGenSimulation.path, \
												outputDir=self.data_dir,\
												logMessage=logMessage, srcFilenameLs=self.srcFilenameLs, \
												dstFilenameLs=self.dstFilenameLs)
				
				self.db_vervet.updateDBEntryPathFileSize(db_entry=popGenSimulation, data_dir=data_dir)
				
				## 2012.7.17 commented out because md5sum is calculated above
				#db_vervet.updateDBEntryMD5SUM(db_entry=genotypeFile, data_dir=data_dir)
			except:
				sys.stderr.write('Except type: %s\n'%repr(sys.exc_info()))
				import traceback
				traceback.print_exc()
				self.sessionRollback(session)
				self.cleanUpAndExitOnFailure(exitCode=5)
		else:
			logMessage += "%s doesn't exist.\n"%(inputFileRealPath)
		self.outputLogMessage(logMessage)
		
		if self.commit:
			try:
				session.flush()
				session.commit()
			except:
				sys.stderr.write('Except type: %s\n'%repr(sys.exc_info()))
				import traceback
				traceback.print_exc()
				self.cleanUpAndExitOnFailure(exitCode=3)
		else:
			#delete all target files but exit gracefully (exit 0)
			self.sessionRollback(session)
			self.cleanUpAndExitOnFailure(exitCode=0)
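# Every failure path above funnels through self.cleanUpAndExitOnFailure(), so half-copied files
# never linger in db-affiliated storage. A hedged sketch of that pattern, assuming the real
# method (defined in a parent class) simply deletes the recorded destination files and exits:
import os, sys

def cleanUpAndExitOnFailure_sketch(dstFilenameLs, exitCode=1):
	"""Delete every destination file copied so far, then exit with exitCode."""
	for dstFilename in dstFilenameLs:
		if os.path.isfile(dstFilename):
			os.remove(dstFilename)
			sys.stderr.write("Deleted %s.\n"%(dstFilename))
	sys.exit(exitCode)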
Example #4
	def run(self):
		"""
		2012.7.13
		"""
		if self.debug:
			import pdb
			pdb.set_trace()
		session = self.db_vervet.session
		
		session.begin()
		if not self.data_dir:
			self.data_dir = self.db_vervet.data_dir
		data_dir = self.data_dir
		
		realPath = os.path.realpath(self.inputFname)
		logMessage = "file %s.\n"%(self.inputFname)
		if NextGenSeq.isFileNameVCF(realPath, includeIndelVCF=True) and \
				not NextGenSeq.isVCFFileEmpty(realPath, checkContent=self.checkEmptyVCFByReading):
			vcfFile = VCFFile(inputFname=self.inputFname)
			
			individualAlignmentLs = self.getAlignmentLsFromVCF(db_vervet=self.db_vervet, vcfFile=vcfFile)
			
			genotypeMethod = self.db_vervet.getGenotypeMethod(short_name=self.genotypeMethodShortName, \
															individualAlignmentLs=individualAlignmentLs,\
															no_of_individuals=len(individualAlignmentLs), no_of_loci=None,\
															data_dir=self.data_dir)
			self.checkIfAlignmentListMatchMethodDBEntry(individualAlignmentLs, genotypeMethod, session)
			
			pdata = self.getNoOfLociFromVCFFile(vcfFile)
			chromosome2noOfLoci = pdata.chromosome2noOfLoci
			no_of_loci = pdata.no_of_loci
			if no_of_loci>0:	#file with zero loci could have identical md5sum
				try:
					md5sum = utils.get_md5sum(realPath)
				except:
					sys.stderr.write('Except type: %s\n'%repr(sys.exc_info()))
					import traceback
					traceback.print_exc()
					self.cleanUpAndExitOnFailure(exitCode=4)
			else:
				md5sum = None
			"""
			db_entry = VervetDB.GenotypeFile.query.filter_by(md5sum=md5sum).first()
			if db_entry:
				sys.stderr.write("Warning: another file %s with the identical md5sum %s as this file %s is already in db.\n"%\
									(db_entry.path, md5sum, realPath))
				session.rollback()
				#2012.8.3 when the jobs are clustered into one merged job and it failed halfway
				# and retried elsewhere, the redundancy check should not exit with non-zero. otherwise the merged job would fail again. 
				self.cleanUpAndExitOnFailure(exitCode=0)
			"""
			no_of_individuals = len(individualAlignmentLs)
			no_of_chromosomes = len(chromosome2noOfLoci)
			if no_of_chromosomes == 1:	#2012.8.30 use 1st chromosome
				chromosome = chromosome2noOfLoci.keys()[0]
			else:
				chromosome = None
			genotypeFile = self.db_vervet.getGenotypeFile(genotype_method=genotypeMethod,\
										chromosome=chromosome, format=self.format, path=None, file_size=None, md5sum=md5sum,\
										original_path=realPath, no_of_individuals=no_of_individuals, no_of_loci=no_of_loci,\
										data_dir=self.data_dir, no_of_chromosomes=no_of_chromosomes)
			if genotypeFile.id and genotypeFile.path:
				isPathInDB = self.db_vervet.isPathInDBAffiliatedStorage(relativePath=genotypeFile.path, data_dir=self.data_dir)
				if isPathInDB==-1:
					sys.stderr.write("Error while updating genotypeFile.path with the new path, %s.\n"%(genotypeFile.path))
					self.cleanUpAndExitOnFailure(exitCode=isPathInDB)
				elif isPathInDB==1:	#successful exit, entry already in db
					sys.stderr.write("Warning: file %s is already in db.\n"%\
										(genotypeFile.path))
					session.rollback()
					self.cleanUpAndExitOnFailure(exitCode=0)
				else:	#not in db affiliated storage, keep going.
					pass
			#move the file and update the db_entry's path as well
			inputFileBasename = os.path.basename(self.inputFname)
			relativePath = genotypeFile.constructRelativePath(sourceFilename=inputFileBasename)
			exitCode = self.db_vervet.moveFileIntoDBAffiliatedStorage(db_entry=genotypeFile, filename=inputFileBasename, \
									inputDir=os.path.split(self.inputFname)[0], dstFilename=os.path.join(self.data_dir, relativePath), \
									relativeOutputDir=None, shellCommand='cp -rL', \
									srcFilenameLs=self.srcFilenameLs, dstFilenameLs=self.dstFilenameLs,\
									constructRelativePathFunction=genotypeFile.constructRelativePath)
			
			if exitCode!=0:
				sys.stderr.write("Error: moveFileIntoDBAffiliatedStorage() exits with %s code.\n"%(exitCode))
				session.rollback()
				self.cleanUpAndExitOnFailure(exitCode=exitCode)
			
			#copy the tbi (tabix) index file if it exists
			tbiFilename = '%s.tbi'%(realPath)
			if os.path.isfile(tbiFilename):
				srcFilename = tbiFilename
				dstFilename = os.path.join(self.data_dir, '%s.tbi'%(genotypeFile.path))
				utils.copyFile(srcFilename=srcFilename, dstFilename=dstFilename)
				logMessage += "tbi file %s has been copied to %s.\n"%(srcFilename, dstFilename)
			## 2012.7.17 commented out because md5sum is calculated above
			#db_vervet.updateDBEntryMD5SUM(db_entry=genotypeFile, data_dir=data_dir)
			# #2012.7.17 record the size of db_entry.path (folder or file)
			self.db_vervet.updateDBEntryPathFileSize(db_entry=genotypeFile, data_dir=self.data_dir)
			
			vcfFile.close()
			logMessage += "%s individuals, %s loci, md5sum=%s.\n"%(no_of_individuals, no_of_loci, md5sum)
		else:
			logMessage += " is empty (no loci) or not VCF file.\n"
		self.outputLogMessage(logMessage)
		
		if self.commit:
			try:
				session.flush()
				session.commit()
			except:
				sys.stderr.write('Except type: %s\n'%repr(sys.exc_info()))
				import traceback
				traceback.print_exc()
				self.cleanUpAndExitOnFailure(exitCode=3)
		else:
			session.rollback()
			#delete all target files but exit gracefully (exit 0)
			self.cleanUpAndExitOnFailure(exitCode=0)
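# getNoOfLociFromVCFFile() supplies chromosome2noOfLoci and no_of_loci for the db record above.
# A minimal sketch of that tally, assuming it amounts to streaming the VCF body and counting
# data lines per chromosome (uncompressed VCF only; the project's VCFFile class does more):
def countLociPerChromosome_sketch(vcfPath):
	"""Return (chromosome2noOfLoci, no_of_loci) from an uncompressed VCF file."""
	chromosome2noOfLoci = {}
	no_of_loci = 0
	with open(vcfPath) as inf:
		for line in inf:
			if line.startswith('#'):	#header and meta lines carry no loci
				continue
			chromosome = line.split('\t', 1)[0]	#CHROM is the first tab-delimited column
			chromosome2noOfLoci[chromosome] = chromosome2noOfLoci.get(chromosome, 0) + 1
			no_of_loci += 1
	return chromosome2noOfLoci, no_of_loci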
	def run(self):
		"""
		2012.7.13
		"""
		if self.debug:
			import pdb
			pdb.set_trace()
		session = self.db_vervet.session
		
		session.begin()
		if not self.data_dir:
			self.data_dir = self.db_vervet.data_dir
		data_dir = self.data_dir
		
		inputFileRealPath = os.path.realpath(self.inputFname)
		logMessage = "Adding file %s to db .\n"%(self.inputFname)
		
		if os.path.isfile(inputFileRealPath):
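			#obtain the alignment db entry in one of three ways: by id, by copying a parent alignment, or by creating a new one for this individual_sequence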
			if self.individual_alignment_id:
				individual_alignment = VervetDB.IndividualAlignment.get(self.individual_alignment_id)
			elif self.parent_individual_alignment_id:
				individual_alignment = self.db_vervet.copyParentIndividualAlignment(parent_individual_alignment_id=self.parent_individual_alignment_id,\
																	mask_genotype_method_id=self.mask_genotype_method_id,\
																	data_dir=self.data_dir, local_realigned=self.local_realigned)
			else:
				#alignment for this library of the individual_sequence
				individual_sequence = VervetDB.IndividualSequence.get(self.individual_sequence_id)
				
				individual_alignment = self.db_vervet.getAlignment(individual_sequence_id=self.individual_sequence_id,\
										path_to_original_alignment=None, sequencer=individual_sequence.sequencer,\
										sequence_type=individual_sequence.sequence_type, sequence_format=individual_sequence.format, \
										ref_individual_sequence_id=self.ref_sequence_id, \
										alignment_method_id=self.alignment_method_id, alignment_format=self.format,\
										individual_sequence_filtered=individual_sequence.filtered, read_group_added=1,
										data_dir=data_dir, \
										mask_genotype_method_id=self.mask_genotype_method_id, \
										parent_individual_alignment_id=self.parent_individual_alignment_id,\
										individual_sequence_file_raw_id=self.individual_sequence_file_raw_id,\
										local_realigned=self.local_realigned, read_group=self.read_group)
			needSessionFlush = False
			if not individual_alignment.path:
				individual_alignment.path = individual_alignment.constructRelativePath()
				needSessionFlush = True
			
			if self.mask_genotype_method_id and \
					individual_alignment.mask_genotype_method_id!=self.mask_genotype_method_id:
				individual_alignment.mask_genotype_method_id = self.mask_genotype_method_id
				needSessionFlush = True
			if self.individual_sequence_file_raw_id and \
					individual_alignment.individual_sequence_file_raw_id != self.individual_sequence_file_raw_id:
				individual_alignment.individual_sequence_file_raw_id = self.individual_sequence_file_raw_id
				needSessionFlush = True
			
			if needSessionFlush:
				session.add(individual_alignment)
				session.flush()
			
			try:
				md5sum = utils.get_md5sum(inputFileRealPath)
			except:
				sys.stderr.write('Except type: %s\n'%repr(sys.exc_info()))
				import traceback
				traceback.print_exc()
				self.cleanUpAndExitOnFailure(exitCode=4)
			
			db_entry = VervetDB.IndividualAlignment.query.filter_by(md5sum=md5sum).first()
			if db_entry and db_entry.id!=individual_alignment.id and db_entry.path and os.path.isfile(os.path.join(data_dir, db_entry.path)):
				sys.stderr.write("Warning: another file %s with the identical md5sum %s as this file %s, is already in db.\n"%\
								(db_entry.path, md5sum, inputFileRealPath))
				self.sessionRollback(session)
				self.cleanUpAndExitOnFailure(exitCode=3)
			
			
			if individual_alignment.md5sum is None or individual_alignment.md5sum!=md5sum:
				individual_alignment.md5sum = md5sum
				session.add(individual_alignment)
				session.flush()
			try:
				#move the file and update the db_entry's path as well
				exitCode = self.db_vervet.moveFileIntoDBAffiliatedStorage(db_entry=individual_alignment, filename=os.path.basename(inputFileRealPath), \
							inputDir=os.path.split(inputFileRealPath)[0], dstFilename=os.path.join(self.data_dir, individual_alignment.path), \
							relativeOutputDir=None, shellCommand='cp -rL', \
							srcFilenameLs=self.srcFilenameLs, dstFilenameLs=self.dstFilenameLs,\
							constructRelativePathFunction=individual_alignment.constructRelativePath)
			except:
				sys.stderr.write('Exception while copying %s to db-storage: %s\n'%(inputFileRealPath, repr(sys.exc_info())))
				import traceback
				traceback.print_exc()
				self.sessionRollback(session)
				self.cleanUpAndExitOnFailure(exitCode=5)
			
			if exitCode!=0:
				sys.stderr.write("Error: moveFileIntoDBAffiliatedStorage() exits with code=%s.\n"%(exitCode))
				self.sessionRollback(session)
				self.cleanUpAndExitOnFailure(exitCode=exitCode)
			try:
				#make sure these files are stored in self.dstFilenameLs and self.srcFilenameLs
				#copy further files if there are
				if self.inputFnameLs:
					for inputFname in self.inputFnameLs:
						if inputFname!=self.inputFname:	#2013.3.18 make sure it has not been copied.
							logMessage = self.db_vervet.copyFileWithAnotherFilePrefix(inputFname=inputFname, \
												filenameWithPrefix=individual_alignment.path, \
												outputDir=self.data_dir,\
												logMessage=logMessage, srcFilenameLs=self.srcFilenameLs, \
												dstFilenameLs=self.dstFilenameLs)
				
				self.db_vervet.updateDBEntryPathFileSize(db_entry=individual_alignment, data_dir=data_dir)
				
				## 2012.7.17 commented out because md5sum is calculated above
				#db_vervet.updateDBEntryMD5SUM(db_entry=genotypeFile, data_dir=data_dir)
				#the bai index must accompany the bam; abort if it is missing, otherwise copy it as well
				baiFilename = '%s.bai'%(self.inputFname)
				if not os.path.isfile(baiFilename):
					sys.stderr.write("Error: bai index file %s doesn't exist.\n"%(baiFilename))
					self.sessionRollback(session)
					self.cleanUpAndExitOnFailure(exitCode=5)
				if os.path.isfile(baiFilename):
					srcFilename = baiFilename
					dstFilename = os.path.join(self.data_dir, '%s.bai'%(individual_alignment.path))
					utils.copyFile(srcFilename=srcFilename, dstFilename=dstFilename)
					logMessage += "bai file %s has been copied to %s.\n"%(srcFilename, dstFilename)
					self.srcFilenameLs.append(srcFilename)
					self.dstFilenameLs.append(dstFilename)
			except:
				sys.stderr.write('Except type: %s\n'%repr(sys.exc_info()))
				import traceback
				traceback.print_exc()
				self.sessionRollback(session)
				self.cleanUpAndExitOnFailure(exitCode=5)
		else:
			logMessage += "%s doesn't exist.\n"%(inputFileRealPath)
		self.outputLogMessage(logMessage)
		
		if self.commit:
			try:
				session.flush()
				session.commit()
			except:
				sys.stderr.write('Except type: %s\n'%repr(sys.exc_info()))
				import traceback
				traceback.print_exc()
				self.cleanUpAndExitOnFailure(exitCode=3)
		else:
			#delete all target files but exit gracefully (exit 0)
			self.sessionRollback(session)
			self.cleanUpAndExitOnFailure(exitCode=0)
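# All three run() variants hand the central step to moveFileIntoDBAffiliatedStorage(): copy the
# file into db-affiliated storage (shellCommand='cp -rL', i.e. dereferencing symlinks), record
# source and destination for later cleanup, and report a non-zero exit code on failure.
# A hedged sketch of roughly that contract (the real method also updates db_entry.path):
import os, subprocess

def copyIntoDBStorage_sketch(srcFilename, dstFilename, srcFilenameLs=None, dstFilenameLs=None):
	"""Copy srcFilename to dstFilename via 'cp -rL', record both paths, return the exit code."""
	dstDir = os.path.dirname(dstFilename)
	if dstDir and not os.path.isdir(dstDir):
		os.makedirs(dstDir)
	exitCode = subprocess.call(['cp', '-rL', srcFilename, dstFilename])
	if exitCode == 0:
		if srcFilenameLs is not None:
			srcFilenameLs.append(srcFilename)
		if dstFilenameLs is not None:
			dstFilenameLs.append(dstFilename)
	return exitCode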