def run(self):
		"""
		Walk through self.inputFnameLs and remove the VCF files that are empty.
		
		A path is examined only if it is an existing regular file and
			NextGenSeq.isFileNameVCF(inputFname, includeIndelVCF=False) accepts it.
		NextGenSeq.isVCFFileEmpty() then decides whether it is empty;
			self.checkEmptyVCFByReading is handed to it as checkContent.
		Every empty VCF is counted, but it is removed from disk only when self.commit is true.
		
		If self.report is true, a progress line is written to stderr every 500 inputs
			and a "file ... deleted." line after each successful deletion.
		A three-line summary is always written to stderr at the end.
		Returns nothing.
		"""
		if self.debug:
			import pdb
			pdb.set_trace()
		
		counter = 0	#number of input paths visited
		no_of_vcf = 0	#number of paths recognized as VCF
		real_counter = 0	#number of VCF files found to be empty
		for inputFname in self.inputFnameLs:
			counter += 1
			if os.path.isfile(inputFname):
				try:
					if NextGenSeq.isFileNameVCF(inputFname, includeIndelVCF=False):
						no_of_vcf += 1
						if NextGenSeq.isVCFFileEmpty(inputFname, checkContent=self.checkEmptyVCFByReading):
							if self.commit:
								#os.remove() instead of a shell 'rm %s': the path is never parsed by a shell,
								# so blanks or shell metacharacters in the file name cannot break the deletion.
								try:
									os.remove(inputFname)
								except OSError as removeError:
									sys.stderr.write("Error: could not delete %s: %s\n"%(inputFname, removeError))
								else:
									#announce the deletion only once it has really happened
									if self.report:
										sys.stderr.write("file %s deleted.\n"%(inputFname))
							real_counter += 1
				except Exception:
					#A problem with one file must not stop the scan, hence log and carry on.
					#Exception (not a bare except) so that KeyboardInterrupt/SystemExit still abort the run.
					sys.stderr.write('Except type: %s\n'%repr(sys.exc_info()))
					import traceback
					traceback.print_exc()
			if self.report and counter%500==0:
				#the backspaces rewind the cursor so that the progress line is overwritten in place
				sys.stderr.write("%s%s\t%s\t%s"%('\x08'*80, counter, no_of_vcf, real_counter))
		sys.stderr.write("%s%s\t%s\t%s\n"%('\x08'*80, counter, no_of_vcf, real_counter))
		sys.stderr.write("%s files in total.\n"%(counter))
		sys.stderr.write("Out of %s VCF files, %s are empty and were deleted.\n"%(no_of_vcf, real_counter))
	def getChrListInTrioInconsistencyFile(self, tabixPath, trioInconsistencyByPosistionFname=None):
		"""
		2011.12.21
			Run "<tabixPath> -l <trioInconsistencyByPosistionFname>" through runLocalCommand()
			and return its standard output as a list, one whitespace-stripped entry per line.
			Each entry is treated as a chromosome ID.
			A start message and the number of entries found are written to stderr.
		"""
		sys.stderr.write("Getting list of chromosomes out of %s ..."%(trioInconsistencyByPosistionFname))
		tabixListCommand = "%s -l %s"%(tabixPath, trioInconsistencyByPosistionFname)
		tabixResult = runLocalCommand(tabixListCommand, report_stderr=True, report_stdout=False)
		#one chromosome ID per line of output; strip the trailing newline and any padding
		chr_id_ls = [outputLine.strip() for outputLine in tabixResult.output_stdout]
		sys.stderr.write(" %s chromosomes.\n"%(len(chr_id_ls)))
		return chr_id_ls
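# Example #3 -- separator that appears to be left over from the snippet listing this code was collected from
# (the "0" that followed it looks like a counter shown on that page); it is not part of the program.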
	def moveFileIntoDBAffiliatedStorage(self, db_entry=None, filename=None, inputDir=None, outputDir=None, \
									dstFilename=None,\
								relativeOutputDir=None, shellCommand='cp -rL', srcFilenameLs=None, dstFilenameLs=None,\
								constructRelativePathFunction=None, data_dir=None):
		"""
		Copy/move a file (or folder) into the db-affiliated storage and record its path in db_entry.
		
		Arguments:
			db_entry (required): the db object the file belongs to. It must offer getFilePath()/setFilePath()
				and, unless constructRelativePathFunction is given, an "id" attribute.
			filename (required): relative path of input file. It could be a folder.
			inputDir (required): where 'filename' is from
			outputDir: where the output file will be. Required unless dstFilename is given.
			dstFilename: the absolute path of where the output file will be. If given, it overrides outputDir.
				if set to None (usually), then it'll be constructed on the fly from outputDir and the relative path.
			relativeOutputDir: used to construct the relative path if constructRelativePathFunction is not there.
			shellCommand: the command that does the transfer. It is run as "shellCommand source destination".
				-L of cp means "always follow symbolic links in SOURCE".
			srcFilenameLs, dstFilenameLs: optional. two lists to which the absolute path of the input and output file
				is appended as soon as the shell command has been run, so that the caller can undo (rollback) later.
			constructRelativePathFunction: if given, it is called with (db_entry=, sourceFilename=, data_dir=)
				and its return value is the relative path of the output file.
			data_dir: the top-level folder where all the db-affiliated file storage is. Only passed on to
				constructRelativePathFunction.
		
		Relative path of the output file, in order of preference:
			1. constructRelativePathFunction(...)
			2. join(relativeOutputDir, '%s_%s'%(db_entry.id, filename))
			3. '%s_%s'%(db_entry.id, filename)
			If db_entry.getFilePath() differs from it, db_entry's path is set to it and flushed to the db session.
		
		Return value (exit code):
			0: success
			2: the destination file already exists, or updating md5sum/file_size of db_entry raised an exception
			3: the shell command wrote something to stderr (regarded as failure). db_entry is deleted and flushed.
			4: adding/flushing db_entry with its new path into the db session failed
		
		Example:
			inputFileBasename = os.path.basename(self.inputFname)
			exitCode = self.db_vervet.moveFileIntoDBAffiliatedStorage(db_entry=genotypeFile, filename=inputFileBasename,
									inputDir=os.path.split(self.inputFname)[0],
									outputDir=self.data_dir,
									relativeOutputDir=None, shellCommand='cp -rL',
									srcFilenameLs=self.srcFilenameLs, dstFilenameLs=self.dstFilenameLs,
									constructRelativePathFunction=genotypeFile.constructRelativePath, data_dir=self.data_dir)
			if exitCode!=0:
				sys.stderr.write("Error: moveFileIntoDBAffiliatedStorage() exits with %s code.\n"%(exitCode))
				session.rollback()
				self.cleanUpAndExitOnFailure(exitCode=exitCode)
		
		2013.1.31 bugfix: if relativeOutputDir is included in both outputDir and newPath, use newfilename to avoid double usage.
		2012.12.15 moved from VervetDB. create the destination folder if not existent.
		2012.8.30 add argument dstFilename, which if given, overwrites outputDir
		2012.7.13 copied from RegisterAndMoveSplitSequenceFiles.moveNewISQFileIntoDBStorage()
		2012.7.4 add srcFilename and dstFilename into given arguments (srcFilenameLs, dstFilenameLs) for later undo
		2012.6.8 return non-zero if failure in move or destination file already exists
		2012.2.10 this function moves a file to a db-affiliated storage path
		"""
		import traceback
		
		#relative path (newPath) of the output file and its basename (newfilename)
		if constructRelativePathFunction is not None:
			newPath = constructRelativePathFunction(db_entry=db_entry, sourceFilename=filename, data_dir=data_dir)
			newfilename = os.path.basename(newPath)
		else:
			newfilename = '%s_%s'%(db_entry.id, filename)
			if relativeOutputDir:
				newPath = os.path.join(relativeOutputDir, newfilename)
			else:
				newPath = newfilename
		
		#make db_entry point to the new location before anything is copied
		if db_entry.getFilePath()!=newPath:
			db_entry.setFilePath(newPath)
			try:
				self.session.add(db_entry)
				self.session.flush()
			except Exception:	#not a bare except, so that KeyboardInterrupt/SystemExit still propagate
				sys.stderr.write('Except type: %s\n'%repr(sys.exc_info()))
				traceback.print_exc()
				return 4
		
		srcFilename = os.path.join(inputDir, filename)
		if dstFilename is None:	#2012.8.30
			if relativeOutputDir and relativeOutputDir in outputDir and relativeOutputDir in newPath:
				#2013.1.31 bugfix: if relativeOutputDir is included in both outputDir and newPath,
				# use newfilename to avoid double usage.
				dstFilename = os.path.join(outputDir, newfilename)
			else:
				dstFilename = os.path.join(outputDir, newPath)
		
		#NOTE(review): only an existing regular file is refused. An existing destination folder passes this check.
		if os.path.isfile(dstFilename):
			sys.stderr.write("Error: destination %s already exists.\n"%(dstFilename))
			return 2
		
		#2012.12.15 create folder if not existent
		dstFolder = os.path.split(dstFilename)[0]
		#a dstFilename without any folder part yields '', which os.makedirs() would choke on
		if dstFolder and not os.path.isdir(dstFolder):
			os.makedirs(dstFolder)
		
		#move the file
		#NOTE(review): the paths are pasted into the command unquoted; blanks in them would break it.
		commandline = '%s %s %s'%(shellCommand, srcFilename, dstFilename)
		return_data = utils.runLocalCommand(commandline, report_stderr=True, report_stdout=True)
		#register both paths right away (even if the command failed) so that the caller can clean up
		if srcFilenameLs is not None:
			srcFilenameLs.append(srcFilename)
		if dstFilenameLs is not None:
			dstFilenameLs.append(dstFilename)
		
		#Check the outcome of the command BEFORE touching the destination file,
		# so that a failed copy is reported as such (code 3) and no md5sum is taken of a missing/partial file.
		if return_data.stderr_content:
			#something wrong. abort
			sys.stderr.write("commandline %s failed: %s\n"%(commandline, return_data.stderr_content))
			#remove the db entry
			self.session.delete(db_entry)
			self.session.flush()
			return 3
		
		if hasattr(db_entry, 'md5sum'):	#2012.7.14 updated even if the attribute is already set
			try:
				self.updateDBEntryMD5SUM(db_entry=db_entry, absPath=dstFilename)
			except Exception:
				sys.stderr.write('Except type: %s\n'%repr(sys.exc_info()))
				traceback.print_exc()
				self.session.delete(db_entry)
				return 2
		if hasattr(db_entry, 'file_size'):
			try:
				self.updateDBEntryPathFileSize(db_entry=db_entry, absPath=dstFilename)
			except Exception:
				sys.stderr.write('Except type: %s\n'%repr(sys.exc_info()))
				traceback.print_exc()
				self.session.delete(db_entry)
				return 2
		return 0