def addBamFileToDB(self, db_vervet, bamFilePath, library=None, individual_sequence_id=None, mate_id=None):
    """
    Record a bam file as an IndividualSequenceFileRaw entry in db and return that entry.

    2013.04.03 bamFilePath passed to original_path
    2012.4.30 add mate_id
    2012.1.27
        1. run md5sum
        2. check if an entry with this md5sum already exists in db
            if not, get one through db_vervet.getIndividualSequenceFileRaw()
            if yes, warn on stderr and carry on with the existing entry (the exit is disabled).
        3. exit with code 4 if the sequence files attached to the entry belong to >2 mates.

    Arguments:
        db_vervet: db object providing getIndividualSequenceFileRaw()
        bamFilePath: path of the bam file; it is hashed and passed on as original_path
        library, individual_sequence_id, mate_id: passed on to getIndividualSequenceFileRaw()
    Return: the db entry (found by md5sum, or returned by getIndividualSequenceFileRaw())
    """
    sys.stderr.write("Record the bamfile into db ...")
    md5sum = utils.get_md5sum(bamFilePath)
    db_entry = VervetDB.IndividualSequenceFileRaw.query.filter_by(md5sum=md5sum).first()
    if db_entry:
        sys.stderr.write("Warning: another file %s with the identical md5sum %s (library=%s) as this file %s is already in db.\n"%\
            (db_entry.path, md5sum, library, bamFilePath))
        #sys.exit(3)
    else:
        db_entry = db_vervet.getIndividualSequenceFileRaw(individual_sequence_id, library=library, md5sum=md5sum, \
            original_path=bamFilePath, mate_id=mate_id)

    #group the split orders of the already-attached sequence files by mate.
    # A separate local name is used so that the mate_id argument is not overwritten.
    mate_id2split_order_ls = {}
    for individual_sequence_file in db_entry.individual_sequence_file_ls:
        file_mate_id = individual_sequence_file.mate_id
        if file_mate_id is None:    #sequence entries without mate_id are just from one mate.
            file_mate_id = 1
        mate_id2split_order_ls.setdefault(file_mate_id, []).append(individual_sequence_file.split_order)

    if len(mate_id2split_order_ls)>2:
        #the message is assembled from adjacent literals; a backslash continuation inside the
        # literal would leak the continuation characters into the text written to stderr.
        sys.stderr.write(("Error: db sequence files spawned from bam file %s (md5sum=%s) form %s(>2) mates. "
            "Unless this bam file contains reads from >2 mates, reads from this bam files should be stored in db already.\n")%\
            (db_entry.path, md5sum, len(mate_id2split_order_ls)))
        sys.exit(4)
    return db_entry
def updateDBEntryMD5SUM(self, db_entry=None, data_dir=None, absPath=None):
    """
    Compute the md5sum of the file behind db_entry and store it in db_entry.md5sum.

    2012.12.15 moved from VervetDB
    2012.7.13
        if absPath is given, take that, rather than construct it from data_dir and db_entry.path

    Arguments:
        db_entry: db object with an md5sum attribute and a path (or filename) attribute
        data_dir: data directory used to construct the absolute path; defaults to self.data_dir
        absPath: absolute path of the file; overrides the path derived from db_entry
    Return: None. A warning is written to stderr and nothing is updated if no path
        can be determined or if the file does not exist.
    """
    if data_dir is None:
        data_dir = self.data_dir
    #db_entry.path takes precedence over db_entry.filename; empty values count as missing
    db_entry_path = getattr(db_entry, 'path', None) or getattr(db_entry, 'filename', None) or None

    if not absPath and db_entry_path:
        absPath = supplantFilePathWithNewDataDir(filePath=db_entry_path, oldDataDir=self.data_dir,\
            newDataDir=data_dir)

    if not absPath:
        #neither absPath nor a path from db_entry: there is nothing to hash.
        # Without this guard get_md5sum() would be called with an empty path.
        sys.stderr.write("updateDBEntryMD5SUM() Warning: no file path available (absPath is not given and db_entry has no path or filename). Could not update its md5sum.\n")
        return
    if not os.path.isfile(absPath):
        sys.stderr.write("updateDBEntryMD5SUM() Warning: target file %s doesn't exist. Could not update its md5sum.\n"%(absPath))
        return

    #imported here, after the guards, so it is loaded only when there is a file to hash
    from pymodule import utils
    md5sum = utils.get_md5sum(absPath)
    if db_entry.md5sum is not None and db_entry.md5sum!=md5sum:
        sys.stderr.write("WARNING: The new md5sum %s is not same as the existing md5sum %s.\n"%(md5sum, db_entry.md5sum))
    db_entry.md5sum = md5sum
    self.session.add(db_entry)
    self.session.flush()
def run(self):
    """
    2013.08.04
    Add the simulation output file self.inputFname to db as a PopGenSimulation entry
    and copy it (plus the other files in self.inputFnameLs) into self.data_dir.

    Exit codes passed to self.cleanUpAndExitOnFailure():
        4: md5sum of the input file could not be computed
        3: another entry with an identical md5sum has its file in data_dir, or the commit failed
        5: exception while copying files into the db-affiliated storage
        non-zero code returned by moveFileIntoDBAffiliatedStorage()
        0: self.commit is not set (session is rolled back)
    """
    import traceback

    def dumpException(header):
        #write header to stderr, then the traceback of the exception being handled
        sys.stderr.write(header)
        traceback.print_exc()

    if self.debug:
        import pdb
        pdb.set_trace()
    dbSession = self.db_vervet.session
    dbSession.begin()

    if not self.data_dir:
        self.data_dir = self.db_vervet.data_dir
    storageDir = self.data_dir

    sourcePath = os.path.realpath(self.inputFname)
    logMessage = "Adding file %s to db .\n"%(self.inputFname)
    if not os.path.isfile(sourcePath):
        logMessage += "%s doesn't exist.\n"%(sourcePath)
    else:
        simulationType = self.db_vervet.getPopGenSimulationType(
            short_name=None, r=self.r, rho=self.rho,
            mu=self.mu, theta=self.theta, n0=self.n0, is_selection=self.is_selection,
            selection_parameters=self.selection_parameters, indel=None, indel_parameters=None,
            population_size_parameters=self.population_size_parameters,
            parent_pop_gen_simulation_type_id=self.parent_pop_gen_simulation_type_id)
        simulation = self.db_vervet.getPopGenSimulation(
            pop_gen_simulation_type_id=simulationType.id,
            replicate_index=self.replicate_index, no_of_populations=self.no_of_populations,
            no_of_chromosomes=self.no_of_chromosomes, chromosome_length=self.chromosome_length,
            sample_size=self.sample_size,
            no_of_polymorphic_loci=self.no_of_polymorphic_loci, programs=self.simulation_programs,
            original_path=sourcePath, data_dir=self.data_dir)

        try:
            md5sum = utils.get_md5sum(sourcePath)
        except:
            dumpException('Except type: %s\n'%repr(sys.exc_info()))
            self.cleanUpAndExitOnFailure(exitCode=4)

        #a different entry carrying the same md5sum whose file sits in storage => redundant input
        duplicate = VervetDB.PopGenSimulation.query.filter_by(md5sum=md5sum).first()
        if duplicate and duplicate.id!=simulation.id and duplicate.path:
            if os.path.isfile(os.path.join(storageDir, duplicate.path)):
                sys.stderr.write("Warning: another file %s with the identical md5sum %s as this file %s, is already in db.\n"%\
                    (duplicate.path, md5sum, sourcePath))
                self.sessionRollback(dbSession)
                self.cleanUpAndExitOnFailure(exitCode=3)

        recordedMd5sum = simulation.md5sum
        if recordedMd5sum is None or recordedMd5sum!=md5sum:
            simulation.md5sum = md5sum
            dbSession.add(simulation)
            dbSession.flush()

        try:
            #move the file and update the db_entry's path as well
            sourceDir, sourceBasename = os.path.split(sourcePath)
            exitCode = self.db_vervet.moveFileIntoDBAffiliatedStorage(db_entry=simulation,
                filename=sourceBasename,
                inputDir=sourceDir, dstFilename=os.path.join(self.data_dir, simulation.path),
                relativeOutputDir=None, shellCommand='cp -rL',
                srcFilenameLs=self.srcFilenameLs, dstFilenameLs=self.dstFilenameLs,
                constructRelativePathFunction=simulation.constructRelativePath)
        except:
            dumpException('Except in copying %s to db-storage with except info: %s\n'%(sourcePath, repr(sys.exc_info())))
            self.sessionRollback(dbSession)
            self.cleanUpAndExitOnFailure(exitCode=5)

        if exitCode!=0:
            sys.stderr.write("Error: moveFileIntoDBAffiliatedStorage() exits with code=%s.\n"%(exitCode))
            self.sessionRollback(dbSession)
            self.cleanUpAndExitOnFailure(exitCode=exitCode)

        try:
            #make sure these files are stored in self.dstFilenameLs and self.srcFilenameLs
            #copy further files if there are
            for extraFname in (self.inputFnameLs or []):
                if extraFname==self.inputFname:    #2013.3.18 make sure it has not been copied.
                    continue
                logMessage = self.db_vervet.copyFileWithAnotherFilePrefix(inputFname=extraFname,
                    filenameWithPrefix=simulation.path,
                    outputDir=self.data_dir,
                    logMessage=logMessage, srcFilenameLs=self.srcFilenameLs,
                    dstFilenameLs=self.dstFilenameLs)

            #md5sum is calculated above, so only the file size is left to record
            self.db_vervet.updateDBEntryPathFileSize(db_entry=simulation, data_dir=storageDir)
        except:
            dumpException('Except type: %s\n'%repr(sys.exc_info()))
            self.sessionRollback(dbSession)
            self.cleanUpAndExitOnFailure(exitCode=5)

    self.outputLogMessage(logMessage)

    if not self.commit:
        #delete all target files but exit gracefully (exit 0)
        self.sessionRollback(dbSession)
        self.cleanUpAndExitOnFailure(exitCode=0)
    else:
        try:
            dbSession.flush()
            dbSession.commit()
        except:
            dumpException('Except type: %s\n'%repr(sys.exc_info()))
            self.cleanUpAndExitOnFailure(exitCode=3)
def run(self):
    """
    2012.7.13
    Add the VCF file self.inputFname to db as a GenotypeFile entry and copy it
    (plus its .tbi index, if one exists next to it) into self.data_dir.

    A file that is not VCF, or is an empty VCF, is only reported in the log message.

    Exit codes passed to self.cleanUpAndExitOnFailure():
        4: md5sum of the input file could not be computed
        -1: isPathInDBAffiliatedStorage() reported an error
        0: the file is already in the db-affiliated storage, or self.commit is not set
        non-zero code returned by moveFileIntoDBAffiliatedStorage()
        3: the commit failed
    """
    import traceback
    if self.debug:
        import pdb
        pdb.set_trace()
    session = self.db_vervet.session
    session.begin()
    if not self.data_dir:
        self.data_dir = self.db_vervet.data_dir

    realPath = os.path.realpath(self.inputFname)
    logMessage = "file %s.\n"%(self.inputFname)
    if NextGenSeq.isFileNameVCF(realPath, includeIndelVCF=True) and \
            not NextGenSeq.isVCFFileEmpty(realPath, checkContent=self.checkEmptyVCFByReading):
        vcfFile = VCFFile(inputFname=self.inputFname)

        individualAlignmentLs = self.getAlignmentLsFromVCF(db_vervet=self.db_vervet, vcfFile=vcfFile)

        genotypeMethod = self.db_vervet.getGenotypeMethod(short_name=self.genotypeMethodShortName, \
            individualAlignmentLs=individualAlignmentLs,\
            no_of_individuals=len(individualAlignmentLs), no_of_loci=None,\
            data_dir=self.data_dir)
        self.checkIfAlignmentListMatchMethodDBEntry(individualAlignmentLs, genotypeMethod, session)

        pdata = self.getNoOfLociFromVCFFile(vcfFile)
        chromosome2noOfLoci = pdata.chromosome2noOfLoci
        no_of_loci = pdata.no_of_loci
        if no_of_loci>0:    #file with zero loci could have identical md5sum
            try:
                md5sum = utils.get_md5sum(realPath)
            except:
                #catch-all retained so that the cleanup below also runs on
                # non-Exception interruptions (e.g. KeyboardInterrupt)
                sys.stderr.write('Except type: %s\n'%repr(sys.exc_info()))
                traceback.print_exc()
                self.cleanUpAndExitOnFailure(exitCode=4)
        else:
            md5sum = None
        #NOTE: a redundancy check on md5sum (query GenotypeFile by md5sum, warn, roll back and
        # exit 0 if found) used to sit here and was disabled in the original code.
        # 2012.8.3 when the jobs are clustered into one merged job and it failed halfway
        # and retried elsewhere, the redundancy check should not exit with non-zero.
        # otherwise the merged job would fail again.

        no_of_individuals = len(individualAlignmentLs)
        no_of_chromosomes = len(chromosome2noOfLoci)
        if no_of_chromosomes == 1:    #2012.8.30 use 1st chromosome
            #next(iter(...)) instead of .keys()[0]: dict.keys() is not indexable on Python 3
            chromosome = next(iter(chromosome2noOfLoci))
        else:
            chromosome = None
        genotypeFile = self.db_vervet.getGenotypeFile(genotype_method=genotypeMethod,\
            chromosome=chromosome, format=self.format, path=None, file_size=None, md5sum=md5sum,\
            original_path=realPath, no_of_individuals=no_of_individuals, no_of_loci=no_of_loci,\
            data_dir=self.data_dir, no_of_chromosomes=no_of_chromosomes)
        if genotypeFile.id and genotypeFile.path:
            isPathInDB = self.db_vervet.isPathInDBAffiliatedStorage(relativePath=genotypeFile.path, data_dir=self.data_dir)
            if isPathInDB==-1:
                sys.stderr.write("Error while updating genotypeFile.path with the new path, %s.\n"%(genotypeFile.path))
                self.cleanUpAndExitOnFailure(exitCode=isPathInDB)
            elif isPathInDB==1:    #successful exit, entry already in db
                sys.stderr.write("Warning: file %s is already in db.\n"%\
                    (genotypeFile.path))
                session.rollback()
                self.cleanUpAndExitOnFailure(exitCode=0)
            #any other value: not in db affiliated storage, keep going.

        #move the file and update the db_entry's path as well
        inputFileBasename = os.path.basename(self.inputFname)
        relativePath = genotypeFile.constructRelativePath(sourceFilename=inputFileBasename)
        exitCode = self.db_vervet.moveFileIntoDBAffiliatedStorage(db_entry=genotypeFile, filename=inputFileBasename, \
            inputDir=os.path.split(self.inputFname)[0], dstFilename=os.path.join(self.data_dir, relativePath), \
            relativeOutputDir=None, shellCommand='cp -rL', \
            srcFilenameLs=self.srcFilenameLs, dstFilenameLs=self.dstFilenameLs,\
            constructRelativePathFunction=genotypeFile.constructRelativePath)

        if exitCode!=0:
            sys.stderr.write("Error: moveFileIntoDBAffiliatedStorage() exits with %s code.\n"%(exitCode))
            session.rollback()
            self.cleanUpAndExitOnFailure(exitCode=exitCode)

        #copy the tbi (tabix) index file if it exists
        tbiFilename = '%s.tbi'%(realPath)
        if os.path.isfile(tbiFilename):
            srcFilename = tbiFilename
            dstFilename = os.path.join(self.data_dir, '%s.tbi'%(genotypeFile.path))
            utils.copyFile(srcFilename=srcFilename, dstFilename=dstFilename)
            logMessage += "tbi file %s has been copied to %s.\n"%(srcFilename, dstFilename)
            #track the copied index in the same two lists that moveFileIntoDBAffiliatedStorage()
            # was handed for the main file; presumably cleanUpAndExitOnFailure() works off
            # these lists when it removes target files -- TODO confirm
            self.srcFilenameLs.append(srcFilename)
            self.dstFilenameLs.append(dstFilename)

        #md5sum is calculated above, so only the size of db_entry.path (folder or file) is recorded here
        self.db_vervet.updateDBEntryPathFileSize(db_entry=genotypeFile, data_dir=self.data_dir)

        vcfFile.close()
        logMessage += "%s individuals, %s loci, md5sum=%s.\n"%(no_of_individuals, no_of_loci, md5sum)
    else:
        logMessage += " is empty (no loci) or not VCF file.\n"
    self.outputLogMessage(logMessage)

    if self.commit:
        try:
            session.flush()
            session.commit()
        except:
            sys.stderr.write('Except type: %s\n'%repr(sys.exc_info()))
            traceback.print_exc()
            self.cleanUpAndExitOnFailure(exitCode=3)
    else:
        session.rollback()
        #delete all target files but exit gracefully (exit 0)
        self.cleanUpAndExitOnFailure(exitCode=0)
def run(self):
    """
    2012.7.13
    Add the alignment file self.inputFname to db as an IndividualAlignment entry and copy it,
    the other files in self.inputFnameLs and its .bai index into self.data_dir.

    The IndividualAlignment entry comes from (in this order of precedence)
        self.individual_alignment_id, a copy of self.parent_individual_alignment_id,
        or self.db_vervet.getAlignment() for self.individual_sequence_id.

    Exit codes passed to self.cleanUpAndExitOnFailure():
        4: md5sum of the input file could not be computed
        3: another entry with an identical md5sum has its file in data_dir, or the commit failed
        5: exception while copying files, or the .bai index file is missing
        non-zero code returned by moveFileIntoDBAffiliatedStorage()
        0: self.commit is not set (session is rolled back)
    """
    import traceback
    if self.debug:
        import pdb
        pdb.set_trace()
    session = self.db_vervet.session
    session.begin()
    if not self.data_dir:
        self.data_dir = self.db_vervet.data_dir
    data_dir = self.data_dir

    inputFileRealPath = os.path.realpath(self.inputFname)
    logMessage = "Adding file %s to db .\n"%(self.inputFname)
    if os.path.isfile(inputFileRealPath):
        if self.individual_alignment_id:
            individual_alignment = VervetDB.IndividualAlignment.get(self.individual_alignment_id)
        elif self.parent_individual_alignment_id:
            individual_alignment = self.db_vervet.copyParentIndividualAlignment(parent_individual_alignment_id=self.parent_individual_alignment_id,\
                mask_genotype_method_id=self.mask_genotype_method_id,\
                data_dir=self.data_dir, local_realigned=self.local_realigned)
        else:
            #alignment for this library of the individual_sequence
            individual_sequence = VervetDB.IndividualSequence.get(self.individual_sequence_id)
            individual_alignment = self.db_vervet.getAlignment(individual_sequence_id=self.individual_sequence_id,\
                path_to_original_alignment=None, sequencer=individual_sequence.sequencer,\
                sequence_type=individual_sequence.sequence_type, sequence_format=individual_sequence.format, \
                ref_individual_sequence_id=self.ref_sequence_id, \
                alignment_method_id=self.alignment_method_id, alignment_format=self.format,\
                individual_sequence_filtered=individual_sequence.filtered, read_group_added=1, data_dir=data_dir, \
                mask_genotype_method_id=self.mask_genotype_method_id, \
                parent_individual_alignment_id=self.parent_individual_alignment_id,\
                individual_sequence_file_raw_id=self.individual_sequence_file_raw_id,\
                local_realigned=self.local_realigned, read_group=self.read_group)

        #fill in attributes of the entry that are missing or differ from the arguments
        needSessionFlush = False
        if not individual_alignment.path:
            individual_alignment.path = individual_alignment.constructRelativePath()
            needSessionFlush = True

        if self.mask_genotype_method_id and \
                individual_alignment.mask_genotype_method_id!=self.mask_genotype_method_id:
            individual_alignment.mask_genotype_method_id = self.mask_genotype_method_id
            needSessionFlush = True
        if self.individual_sequence_file_raw_id and \
                individual_alignment.individual_sequence_file_raw_id != self.individual_sequence_file_raw_id:
            individual_alignment.individual_sequence_file_raw_id = self.individual_sequence_file_raw_id
            needSessionFlush = True

        if needSessionFlush:
            session.add(individual_alignment)
            session.flush()

        try:
            md5sum = utils.get_md5sum(inputFileRealPath)
        except:
            sys.stderr.write('Except type: %s\n'%repr(sys.exc_info()))
            traceback.print_exc()
            self.cleanUpAndExitOnFailure(exitCode=4)

        #a different entry carrying the same md5sum whose file sits in storage => redundant input
        db_entry = VervetDB.IndividualAlignment.query.filter_by(md5sum=md5sum).first()
        if db_entry and db_entry.id!=individual_alignment.id and db_entry.path and os.path.isfile(os.path.join(data_dir, db_entry.path)):
            sys.stderr.write("Warning: another file %s with the identical md5sum %s as this file %s, is already in db.\n"%\
                (db_entry.path, md5sum, inputFileRealPath))
            self.sessionRollback(session)
            self.cleanUpAndExitOnFailure(exitCode=3)

        if individual_alignment.md5sum is None or individual_alignment.md5sum!=md5sum:
            individual_alignment.md5sum = md5sum
            session.add(individual_alignment)
            session.flush()

        try:
            #move the file and update the db_entry's path as well
            exitCode = self.db_vervet.moveFileIntoDBAffiliatedStorage(db_entry=individual_alignment, filename=os.path.basename(inputFileRealPath), \
                inputDir=os.path.split(inputFileRealPath)[0], dstFilename=os.path.join(self.data_dir, individual_alignment.path), \
                relativeOutputDir=None, shellCommand='cp -rL', \
                srcFilenameLs=self.srcFilenameLs, dstFilenameLs=self.dstFilenameLs,\
                constructRelativePathFunction=individual_alignment.constructRelativePath)
        except:
            sys.stderr.write('Except in copying %s to db-storage with except info: %s\n'%(inputFileRealPath, repr(sys.exc_info())))
            traceback.print_exc()
            self.sessionRollback(session)
            self.cleanUpAndExitOnFailure(exitCode=5)

        if exitCode!=0:
            sys.stderr.write("Error: moveFileIntoDBAffiliatedStorage() exits with code=%s.\n"%(exitCode))
            self.sessionRollback(session)
            self.cleanUpAndExitOnFailure(exitCode=exitCode)

        try:
            #make sure these files are stored in self.dstFilenameLs and self.srcFilenameLs
            #copy further files if there are
            if self.inputFnameLs:
                for inputFname in self.inputFnameLs:
                    if inputFname!=self.inputFname:    #2013.3.18 make sure it has not been copied.
                        logMessage = self.db_vervet.copyFileWithAnotherFilePrefix(inputFname=inputFname, \
                            filenameWithPrefix=individual_alignment.path, \
                            outputDir=self.data_dir,\
                            logMessage=logMessage, srcFilenameLs=self.srcFilenameLs, \
                            dstFilenameLs=self.dstFilenameLs)

            #md5sum is calculated above, so only the file size is left to record
            self.db_vervet.updateDBEntryPathFileSize(db_entry=individual_alignment, data_dir=data_dir)

            #copy the bai index file; its absence is treated as a failure
            baiFilename = '%s.bai'%(self.inputFname)
            if not os.path.isfile(baiFilename):
                sys.stderr.write("Error: bai index file %s doesn't exist.\n"%(baiFilename))
                self.sessionRollback(session)
                self.cleanUpAndExitOnFailure(exitCode=5)
            else:
                srcFilename = baiFilename
                dstFilename = os.path.join(self.data_dir, '%s.bai'%(individual_alignment.path))
                utils.copyFile(srcFilename=srcFilename, dstFilename=dstFilename)
                logMessage += "bai file %s has been copied to %s.\n"%(srcFilename, dstFilename)
                self.srcFilenameLs.append(srcFilename)
                self.dstFilenameLs.append(dstFilename)
        except SystemExit:
            #an exit requested inside the try block (missing bai) must not be handled as a
            # copy failure, which would roll back and clean up a second time.
            # NOTE(review): assumes cleanUpAndExitOnFailure() exits through SystemExit -- confirm
            raise
        except:
            sys.stderr.write('Except type: %s\n'%repr(sys.exc_info()))
            traceback.print_exc()
            self.sessionRollback(session)
            self.cleanUpAndExitOnFailure(exitCode=5)
    else:
        logMessage += "%s doesn't exist.\n"%(inputFileRealPath)
    self.outputLogMessage(logMessage)

    if self.commit:
        try:
            session.flush()
            session.commit()
        except:
            sys.stderr.write('Except type: %s\n'%repr(sys.exc_info()))
            traceback.print_exc()
            self.cleanUpAndExitOnFailure(exitCode=3)
    else:
        #delete all target files but exit gracefully (exit 0)
        self.sessionRollback(session)
        self.cleanUpAndExitOnFailure(exitCode=0)