def multi_db_deplete_bam(inBam, refDbs, deplete_method, outBam, **kwargs):
    """Run deplete_method against each database in refDbs in turn, writing the final BAM to outBam.

    deplete_method is called as ``deplete_method(in_bam, db, out_bam, **kwargs)``.
    The output of each call becomes the input of the next one. A database is
    skipped whenever the current working BAM is reported empty by
    ``SamtoolsTool.isEmpty``. If no call is made at all (no databases, or an
    empty inBam), inBam is copied to outBam unchanged.

    If more than one database is given and every one of them looks like an
    unbuilt fasta file (see ``is_unbuilt_fasta`` below), they are merged into
    one temporary fasta so that deplete_method only has to run once.

    All temporary files created here are removed before returning, including
    when an exception propagates. inBam itself is never deleted.
    """
    archive_markers = ('.tar', '.tgz', '.zip')

    def is_unbuilt_fasta(db):
        # A path that does not exist is taken to be an indexed db prefix,
        # a directory is an indexed db in a directory, and a file whose name
        # contains an archive marker is a packaged indexed db.
        if not os.path.exists(db) or os.path.isdir(db):
            return False
        return not (os.path.isfile(db) and any(marker in db for marker in archive_markers))

    # Every temp file we create is recorded here so the finally-clause can
    # remove whatever is still on disk, on success and on failure alike.
    temp_paths = []
    try:
        if len(refDbs) > 1 and all(is_unbuilt_fasta(db) for db in refDbs):
            # this is a scenario where all refDbs are unbuilt fasta
            # files. we can simplify and speed up execution by
            # concatenating them all and running deplete_method
            # just once
            tmpDb = mkstempfname('.fasta')
            temp_paths.append(tmpDb)
            merge_compressed_files(refDbs, tmpDb, sep='\n')
            refDbs = [tmpDb]

        samtools = tools.samtools.SamtoolsTool()
        tmpBamIn = inBam
        for db in refDbs:
            if not samtools.isEmpty(tmpBamIn):
                tmpBamOut = mkstempfname('.bam')
                temp_paths.append(tmpBamOut)
                deplete_method(tmpBamIn, db, tmpBamOut, **kwargs)
                # free the previous intermediate as soon as it is no longer needed
                if tmpBamIn != inBam:
                    os.unlink(tmpBamIn)
                tmpBamIn = tmpBamOut
        shutil.copyfile(tmpBamIn, outBam)
    finally:
        for path in temp_paths:
            if os.path.exists(path):
                os.unlink(path)
def test_isEmpty(self):
    """SamtoolsTool.isEmpty is true for 'empty.bam' and false for the other fixture BAMs."""
    samtools = tools.samtools.SamtoolsTool()

    def fixture(bam_name):
        # resolve a fixture file name inside the test input directory
        return os.path.join(util.file.get_test_input_path(), bam_name)

    self.assertTrue(samtools.isEmpty(fixture('empty.bam')))
    for bam_name in ('almost-empty.bam', 'G5012.3.subset.bam', 'G5012.3.testreads.bam'):
        self.assertFalse(samtools.isEmpty(fixture(bam_name)))
def multi_db_deplete_bam(inBam, refDbs, deplete_method, outBam, threads=1, JVMmemory=None):
    """Run deplete_method against each database in refDbs in turn, writing the final BAM to outBam.

    deplete_method is called as
    ``deplete_method(in_bam, db, out_bam, threads=threads, JVMmemory=JVMmemory)``.
    The output of each call becomes the input of the next one. A database is
    skipped whenever the current working BAM is reported empty by
    ``SamtoolsTool.isEmpty``. If no call is made at all (no databases, or an
    empty inBam), inBam is copied to outBam unchanged.

    All intermediate BAMs created here are removed before returning, including
    when an exception propagates. inBam itself is never deleted.
    """
    samtools = tools.samtools.SamtoolsTool()

    # Every temp BAM we create is recorded so the finally-clause can remove
    # whatever is still on disk (the last intermediate, or leftovers after an error).
    temp_bams = []
    tmpBamIn = inBam
    try:
        for db in refDbs:
            if not samtools.isEmpty(tmpBamIn):
                tmpBamOut = mkstempfname('.bam')
                temp_bams.append(tmpBamOut)
                deplete_method(tmpBamIn, db, tmpBamOut, threads=threads, JVMmemory=JVMmemory)
                # free the previous intermediate as soon as it is no longer needed
                if tmpBamIn != inBam:
                    os.unlink(tmpBamIn)
                tmpBamIn = tmpBamOut
        shutil.copyfile(tmpBamIn, outBam)
    finally:
        for path in temp_bams:
            if os.path.exists(path):
                os.unlink(path)
def multi_db_deplete_bam(inBam, refDbs, deplete_method, outBam, **kwargs):
    """Run deplete_method against each database in refDbs in turn, writing the final BAM to outBam.

    deplete_method is called as ``deplete_method(in_bam, db, out_bam, **kwargs)``.
    The output of each call becomes the input of the next one. A database is
    skipped whenever the current working BAM is reported empty by
    ``SamtoolsTool.isEmpty``. If no call is made at all (no databases, or an
    empty inBam), inBam is copied to outBam unchanged.

    All intermediate BAMs created here are removed before returning, including
    when an exception propagates. inBam itself is never deleted.
    """
    samtools = tools.samtools.SamtoolsTool()

    # Every temp BAM we create is recorded so the finally-clause can remove
    # whatever is still on disk (the last intermediate, or leftovers after an error).
    temp_bams = []
    tmpBamIn = inBam
    try:
        for db in refDbs:
            if not samtools.isEmpty(tmpBamIn):
                tmpBamOut = mkstempfname('.bam')
                temp_bams.append(tmpBamOut)
                deplete_method(tmpBamIn, db, tmpBamOut, **kwargs)
                # free the previous intermediate as soon as it is no longer needed
                if tmpBamIn != inBam:
                    os.unlink(tmpBamIn)
                tmpBamIn = tmpBamOut
        shutil.copyfile(tmpBamIn, outBam)
    finally:
        for path in temp_bams:
            if os.path.exists(path):
                os.unlink(path)
def align_bam(self, inBam, refDb, outBam, options=None, threads=None, JVMmemory=None):
    """Align every read group of inBam against refDb and write the result to outBam.

    With a single read group the work is delegated directly to
    ``self.align_one_rg``. With several read groups each one is aligned into
    its own temporary BAM, and the non-empty results are merged with Picard
    MergeSamFiles (coordinate-sorted, with an index requested). If every
    per-read-group alignment comes back empty, outBam is produced by sorting
    a file that holds only the header of inBam.

    options is passed through to ``align_one_rg``; threads is first normalized
    with ``util.misc.sanitize_thread_count``. JVMmemory is only used for the
    Picard merge step.

    Raises InvalidBamHeaderError if inBam has no read groups.
    """
    options = options or []

    samtools = tools.samtools.SamtoolsTool()
    threads = util.misc.sanitize_thread_count(threads)

    # fetch list of RGs
    rgs = list(samtools.getReadGroups(inBam).keys())

    if not rgs:
        # Can't do this
        raise InvalidBamHeaderError("{} lacks read groups".format(inBam))

    if len(rgs) == 1:
        # Only one RG, keep it simple
        self.align_one_rg(inBam, refDb, outBam, options=options, threads=threads)
        return

    # Multiple RGs, align one at a time and merge.
    # rg_tmp_bams tracks every per-RG temp file (including the ones that turn
    # out empty) so all of them are removed afterwards, even on failure.
    rg_tmp_bams = []
    try:
        align_bams = []
        for rg in rgs:
            tmp_bam = util.file.mkstempfname('.{}.bam'.format(rg))
            rg_tmp_bams.append(tmp_bam)
            self.align_one_rg(
                inBam,
                refDb,
                tmp_bam,
                rgid=rg,
                options=options,
                threads=threads
            )
            if not samtools.isEmpty(tmp_bam):
                align_bams.append(tmp_bam)
            else:
                log.warning("No alignment output for RG %s in file %s against %s", rg, inBam, refDb)

        if not align_bams:
            # nothing aligned at all: emit a header-only output
            with util.file.tempfname('.empty.sam') as empty_sam:
                samtools.dumpHeader(inBam, empty_sam)
                samtools.sort(empty_sam, outBam)
        else:
            # Merge BAMs, sort, and index
            picardOptions = ['SORT_ORDER=coordinate', 'USE_THREADING=true', 'CREATE_INDEX=true']
            tools.picard.MergeSamFilesTool().execute(
                align_bams,
                outBam,
                picardOptions=picardOptions,
                JVMmemory=JVMmemory
            )
    finally:
        for bam in rg_tmp_bams:
            if os.path.exists(bam):
                os.unlink(bam)
def align_one_rg(self, inBam, refDb, outBam, rgid=None, preset=None, options=None, threads=None, JVMmemory=None):
    """ Performs an alignment of one read group in a bam file to a reference fasta file using minimap2.
        Emits alignments in sorted, index bam files.
        inBam may contain more read groups, but we will subset input to the specified rgid.
        preset may be specified as a valid value for "minimap2 -x" which depends on the type of
        data (short accurate reads vs long noisy reads). If preset is set to None, we will autodetect
        based on the PL (platform) tag in the read group header (e.g. illumina, ont, pacbio)

        options is an optional list of extra aligner arguments; it is copied, so the caller's
        list is never modified. If it already contains '-x', preset handling is skipped.
        rgid may be omitted only when inBam has exactly one read group.
        JVMmemory is accepted but not used by this method.

        Raises InvalidBamHeaderError if inBam has no read groups, has several read groups
        while rgid is not given, or does not contain rgid. Raises Exception if no @RG header
        line can be found or if the minimap2 preset cannot be determined from the PL tag.
    """
    # The default is None, so it must be handled before copying.
    options = list(options) if options else []

    samtools = tools.samtools.SamtoolsTool()

    # Require exactly one RG
    rgs = samtools.getReadGroups(inBam)
    if len(rgs) == 0:
        raise InvalidBamHeaderError("{} lacks read groups".format(inBam))
    elif len(rgs) == 1:
        if not rgid:
            rgid = list(rgs.keys())[0]
    elif not rgid:
        raise InvalidBamHeaderError("{} has {} read groups, but we require exactly one".format(inBam, len(rgs)))
    if rgid not in rgs:
        raise InvalidBamHeaderError("{} has read groups, but not {}".format(inBam, rgid))

    headerFile = util.file.mkstempfname('.{}.header.txt'.format(rgid))

    # one_rg_inBam is inBam itself unless we have to build a single-RG copy;
    # removeInput records whether that copy is ours to delete.
    one_rg_inBam = inBam
    removeInput = False
    try:
        # Strip inBam to just one RG (if necessary)
        if len(rgs) == 1:
            samtools.dumpHeader(one_rg_inBam, headerFile)
        else:
            # strip inBam to one read group
            with util.file.tempfname('.onebam.bam') as tmp_bam:
                samtools.view(['-1', '-r', rgid], inBam, tmp_bam)
                # special exit if this file is empty
                if samtools.isEmpty(tmp_bam):
                    log.warning("No reads present for RG %s in file: %s", rgid, inBam)
                    shutil.copyfile(tmp_bam, outBam)
                    return
                # reduce the BAM header to the single read group being aligned
                one_rg_inBam = util.file.mkstempfname('.{}.in.bam'.format(rgid))
                removeInput = True
                with open(headerFile, 'wt') as outf:
                    for row in samtools.getHeader(inBam):
                        if len(row) > 0 and row[0] == '@RG':
                            if rgid != [x[3:] for x in row if x.startswith('ID:')][0]:
                                # skip all read groups that are not rgid
                                continue
                        outf.write('\t'.join(row) + '\n')
                samtools.reheader(tmp_bam, headerFile, one_rg_inBam)

        # get the read group line to give to mm2
        # (if several @RG lines are present, the last one wins)
        readgroup_line = ""
        with open(headerFile) as inf:
            for line in inf:
                if line.startswith("@RG"):
                    readgroup_line = line.rstrip("\r\n")
        if not readgroup_line:
            raise Exception("no @RG header line found for read group {} in {}".format(rgid, inBam))
        # rather than reheader the alignment bam file later so it has the readgroup information
        # from the original bam file, we'll pass the RG line to minimap2 to write out
        options.extend(('-R', readgroup_line.replace('\t', '\\t')))

        # dynamically determine the mode of operation
        if '-x' not in options:
            if preset is None:
                platform = [x for x in readgroup_line.split('\t') if x.startswith('PL:')]
                if len(platform) != 1:
                    raise Exception("cannot autodetect minimap2 aligner mode when PL: tag is not set in the read group header for {}: {}".format(inBam, readgroup_line))
                platform = platform[0][3:].lower()
                platform_presets = {
                    'illumina': 'sr',
                    'ont': 'map-ont',
                    'pacbio': 'map-pb',
                }
                if platform not in platform_presets:
                    raise Exception("PL: tag {} for read group {} in bam {} refers to a data type we do not know how to map with minimap2".format(platform, rgid, inBam))
                preset = platform_presets[platform]
            options.extend(('-x', preset))

        # perform actual alignment
        if samtools.isEmpty(one_rg_inBam):
            # minimap doesn't like empty inputs, so copy empty bam through
            samtools.sort(one_rg_inBam, outBam)
        else:
            self.align_cmd(one_rg_inBam, refDb, outBam, options=options, threads=threads)
    finally:
        # if there was more than one RG in the input, we had to create a temporary file with
        # the one RG specified and we can safely delete it; if there was only one RG in the
        # input, we used it directly and must not delete it
        if removeInput and os.path.exists(one_rg_inBam):
            os.unlink(one_rg_inBam)
        # the header scratch file is always ours
        if os.path.exists(headerFile):
            os.unlink(headerFile)
def test_isEmpty(self):
    """Check SamtoolsTool.isEmpty on each fixture BAM: only 'empty.bam' should be empty."""
    samtools = tools.samtools.SamtoolsTool()

    # (fixture file name, assertion applied to the isEmpty result), checked in order
    expectations = (
        ('empty.bam', self.assertTrue),
        ('almost-empty.bam', self.assertFalse),
        ('G5012.3.subset.bam', self.assertFalse),
        ('G5012.3.testreads.bam', self.assertFalse),
    )
    for bam_name, check in expectations:
        bam_path = os.path.join(util.file.get_test_input_path(), bam_name)
        check(samtools.isEmpty(bam_path))