def addtags(bam, tagfile, output, sam=False, trim_suffix=True, mode="tag", nproc=1):
    """Add tags to reads from individual cells.

    Copies BAM entries to a new file, adding a read tag to cells matching
    an input table. The BAM file is split into intervals that are processed
    by a pool of worker processes; the temporary files returned by the
    workers are then combined with ``samtools merge`` and removed.

    Parameters
    ----------
    bam : str
        Path to BAM file.
    tagfile : str
        Tab-delimited file containing cell barcode, read tag to be added,
        tag information.
    output : str
        Name for output BAM file.
    sam : bool, optional
        Output SAM format. Default is BAM format.
    trim_suffix : bool, optional
        Remove trailing 2 characters from cell barcode in bam file
        (sometimes needed to match 10x barcodes).
    mode : str, optional
        Either tag (default) or readname. Some BAM files store the cell
        barcode in the readname rather than under a read tag.
    nproc : int, optional
        Number of processors to use. Default is 1.

    Raises
    ------
    Exception
        If samtools merge of temporary BAM files fails (samtools could not
        be started, exited with a non-zero status, or produced no output
        file). Temporary files are left in place in that case.
    """
    nproc = int(nproc)
    tags = _readtags(tagfile)

    # The handle is only needed to compute the intervals; the workers are
    # given the path and open the file themselves.
    with pysam.AlignmentFile(bam, "rb") as input_bam:
        intervals = utils.chunk_bam(input_bam, nproc)

    worker = functools.partial(
        _add_read_tags,
        bam=bam,
        sam=sam,
        output=output,
        cb=tags,
        trim_suffix=trim_suffix,
        mode=mode,
    )
    # The context manager shuts the worker processes down once the results
    # have been collected, so they are not leaked.
    with Pool(nproc) as pool:
        # NOTE(review): the huge timeout is kept from the original code;
        # presumably the usual workaround so that KeyboardInterrupt is
        # delivered while waiting on the pool -- confirm before removing.
        tempfiles = pool.map_async(worker, intervals.values()).get(9999999)

    # Pass an argument list (no shell) so that paths containing spaces or
    # shell metacharacters are handled verbatim.
    merge_cmd = ["samtools", "merge", "-@", str(nproc), output] + list(tempfiles)
    try:
        returncode = call(merge_cmd)
    except OSError as err:
        # e.g. samtools is not installed / not on PATH
        raise Exception("samtools merge failed, temp files not deleted") from err

    # Only discard the temporary files when the merge really succeeded; a
    # pre-existing output file must not mask a failed merge.
    if returncode != 0 or not os.path.exists(output):
        raise Exception("samtools merge failed, temp files not deleted")

    for tmp in tempfiles:
        os.remove(tmp)
def filterbarcodes(cells, bam, output, sam=False, trim_suffix=True, nproc=1, mode="tag"):
    """Filter reads based on input list of cell barcodes.

    Copy BAM entries matching a list of cell barcodes to a new BAM file.
    The BAM file is split into intervals that are processed by a pool of
    worker processes; the temporary files returned by the workers are then
    combined with ``samtools merge`` and removed.

    Parameters
    ----------
    cells : str
        Path to file containing cell barcodes, or comma-separated list of
        cell barcodes. File can be gzip compressed.
    bam : str
        Path to BAM file.
    output : str
        Name for output BAM file.
    sam : bool, optional
        Output SAM format. Default is BAM format.
    trim_suffix : bool, optional
        Remove trailing 2 characters from cell barcode in bam file
        (sometimes needed to match 10x barcodes).
    nproc : int, optional
        Number of processors to use. Default is 1.
    mode : str, optional
        Either tag (default) or readname. Some BAM files store the cell
        barcode in the readname rather than under a read tag.

    Raises
    ------
    Exception
        If samtools merge of temporary BAM files fails (samtools could not
        be started, exited with a non-zero status, or produced no output
        file). Temporary files are left in place in that case.
    """
    nproc = int(nproc)
    cb = utils.read_cells(cells)

    # The handle is only needed to compute the intervals; the workers are
    # given the path and open the file themselves.
    with pysam.AlignmentFile(bam, "rb") as input_bam:
        intervals = utils.chunk_bam(input_bam, nproc)

    worker = functools.partial(
        _iterate_reads,
        bam=bam,
        sam=sam,
        output=output,
        cb=cb,
        trim_suffix=trim_suffix,
        mode=mode,
    )
    # The context manager shuts the worker processes down once the results
    # have been collected, so they are not leaked.
    with Pool(nproc) as pool:
        # NOTE(review): the huge timeout is kept from the original code;
        # presumably the usual workaround so that KeyboardInterrupt is
        # delivered while waiting on the pool -- confirm before removing.
        tempfiles = pool.map_async(worker, intervals.values()).get(9999999)

    # Pass an argument list (no shell) so that paths containing spaces or
    # shell metacharacters are handled verbatim.
    merge_cmd = ["samtools", "merge", "-@", str(nproc), output] + list(tempfiles)
    try:
        returncode = call(merge_cmd)
    except OSError as err:
        # e.g. samtools is not installed / not on PATH
        raise Exception("samtools merge failed, temp files not deleted") from err

    # Only discard the temporary files when the merge really succeeded; a
    # pre-existing output file must not mask a failed merge.
    if returncode != 0 or not os.path.exists(output):
        raise Exception("samtools merge failed, temp files not deleted")

    for tmp in tempfiles:
        os.remove(tmp)
def filterbarcodes(
    cells,
    bam,
    readname_barcode=None,
    cellbarcode="CB",
    sam=False,
    trim_suffix=True,
    nproc=1,
):
    """Filter reads based on input list of cell barcodes.

    Copy BAM entries matching a list of cell barcodes to a new BAM file.
    Output BAM files will be named according to the group name in the file
    provided. The BAM file is split into intervals that are processed by a
    pool of worker processes, and the per-interval results are handed to
    ``mergeAll`` to be combined per group.

    Parameters
    ----------
    cells : str
        Path to file containing cell barcodes and the group associated with
        each barcode. File can be gzip compressed. A separate BAM file will
        be created for each group of cells.
    bam : str
        Path to BAM file.
    readname_barcode : str or None, optional
        A regular expression for matching cell barcode in read name. It is
        compiled here before being passed to the workers. If None
        (default), use the read tags.
    cellbarcode : str, optional
        Tag used for cell barcode. Default is CB (used by cellranger).
    sam : bool, optional
        Accepted for interface compatibility; not used by this function.
    trim_suffix : bool, optional
        Remove trailing 2 characters from cell barcode in bam file
        (sometimes needed to match 10x barcodes).
    nproc : int, optional
        Number of processors to use. Default is 1.

    Raises
    ------
    Exception
        If samtools merge of temporary BAM files fails. NOTE(review): this
        would originate from the ``mergeAll`` step, whose behaviour is
        defined outside this function -- confirm.
    """
    nproc = int(nproc)
    cb = utils.read_cell_barcode_file(cells)
    # Every distinct group that appears for any barcode.
    unique_classes = list(set(chain.from_iterable(cb.values())))

    # The handle is only needed to compute the intervals; the workers are
    # given the path and open the file themselves.
    with pysam.AlignmentFile(bam, "rb") as input_bam:
        intervals = utils.chunk_bam(input_bam, nproc)

    if readname_barcode is not None:
        # Compile once here rather than once per worker call.
        readname_barcode = re.compile(readname_barcode)

    worker = functools.partial(
        _iterate_reads,
        bam=bam,
        cb=cb,
        classes=unique_classes,
        trim_suffix=trim_suffix,
        cellbarcode=cellbarcode,
        readname_barcode=readname_barcode,
    )
    # The context manager shuts the worker processes down once the results
    # have been collected, so they are not leaked.
    with Pool(nproc) as pool:
        # NOTE(review): the huge timeout is kept from the original code;
        # presumably the usual workaround so that KeyboardInterrupt is
        # delivered while waiting on the pool -- confirm before removing.
        idents = pool.map_async(worker, intervals.values()).get(9999999)

    mergeAll(idents=idents, classes=unique_classes, nproc=nproc, remove=True)