def parse_bam(bam_path):
    ''' Returns alignment information for each reference sequence as an OrderedDict '''
    alignments = OrderedDict()
    with open(bam_path, 'r') as bam_fh:
        bam = simplesam.Reader(bam_fh)
        refs_lens = {
            n.replace('SN:', ''): int(l[0].replace('LN:', ''))
            for n, l in bam.header['@SQ'].items()
        }
        refs_records = defaultdict(list)
        for r in bam:
            refs_records[r.rname].append(r)  # one list of records per reference name
        if '*' in refs_records:  # discard unmapped records
            del refs_records['*']
        # assert len(refs_records) <= 1, 'Detected primary mappings to more than one reference'
        # Use samtools view to extract single contig primary mappings
        # Otherwise would make a useful enhancement
        for ref_id, records in refs_records.items():
            alignments[ref_id] = parse_records(ref_id, refs_lens[ref_id], records)
    return alignments
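# Usage sketch for parse_bam, assuming the imports below. parse_records is
# defined elsewhere in the original project, so a hypothetical stand-in that
# just counts each reference's records is used here; 'sample.bam' is
# illustrative.
import simplesam
from collections import OrderedDict, defaultdict

def parse_records(ref_id, ref_len, records):
    # hypothetical stand-in: summarize one reference's alignments
    return {'ref_len': ref_len, 'n_records': len(records)}

for ref_id, summary in parse_bam('sample.bam').items():
    print(ref_id, summary)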
def processRegion(R):
    global fname, simMode
    r = R[0] + ":" + str(R[1]) + "-" + str(R[1] + 1)
    with open(fname, 'rb') as filenameopen:
        samfile = simplesam.Reader(filenameopen, regions=r)
        S = [R[1], [], [], [], [], [], [], []]
        while True:
            try:
                # get next read
                read = samfile.next()
                if read.duplicate or not read.passing or read.secondary or not read.mapped:
                    continue
                # indel at site in question?
                (clip, ind, HP, ASXS) = readFeatures(read)
                gappedSeq = read.gapped('seq')
                pos_in_read = R[1] - read.pos  # offset of the site within the gapped read
                try:
                    S[1].append(gappedSeq[pos_in_read])
                except IndexError:
                    sys.stderr.write(str(R[1]) + "\t" + str(read.pos) + "\n")
                    return None
                S[2].append(ord(read.gapped('qual')[pos_in_read]) - 33)
                S[3].append(min(len(gappedSeq) - pos_in_read, pos_in_read))  # not exact due to indel
                S[4].append(clip)
                S[5].append(ind)
                S[6].append(HP)
                S[7].append(ASXS)
            except StopIteration:  # no more reads
                samfile.close()
                samfile.p.wait()  # Prevent Z-status samtools processes
                break
    return siteFeatures(S, R[2], simMode)
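# Hedged driver sketch for the single-site extractor above: R looks like a
# (chrom, 0-based position, label) tuple, and fname/simMode are module
# globals. readFeatures and siteFeatures are stubbed here; the project's
# real implementations live elsewhere.
import sys
import simplesam

fname = 'sample.bam'   # illustrative path
simMode = False

def readFeatures(read):
    return (0, 0, 0, 0)  # stub: (clip, ind, HP, ASXS)

def siteFeatures(S, label, simMode):
    return [len(S[1]), label]  # stub: feature vector for the site

features = processRegion(('chr1', 10007, 1))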
def parse_bam(bam_path):
    ''' Returns alignment information for each reference sequence as an OrderedDict '''
    alignments = OrderedDict()
    with open(bam_path, 'r') as bam_fh:
        bam = simplesam.Reader(bam_fh)
        refs_lens = {
            n.replace('SN:', ''): int(l[0].replace('LN:', ''))
            for n, l in bam.header['@SQ'].items()
        }
        # NB: each value is a lazy generator over the same underlying reader,
        # and `id` is resolved when the generator runs, not when it is built;
        # a defaultdict-of-lists version (as above) avoids both pitfalls
        refs_records = {
            id: (r for r in bam if r.rname == id)
            for id in refs_lens
        }
        for ref_id, records in refs_records.items():
            alignments[ref_id] = parse_records(ref_id, refs_lens[ref_id], records)
    return alignments
def main():
    parser = ArgumentParser(description="Utility to split a tagged BAM/SAM "
                                        "file into separate SAM files for different barcodes.")
    parser.add_argument("input", help="BAM/SAM tagged file")
    parser.add_argument("outdir", help="Directory where to store the split SAM files")
    parser.add_argument("-t", "--tag-name", default="XC", help="TAG name (default=XC)")
    parser.add_argument("-b", "--barcodes-file",
                        help="Selection of cell barcodes (default: all barcodes in BAM/SAM file)")
    args = parser.parse_args()
    mkdir(args.outdir)

    # user provided a subset of barcodes (create dictionary and set a flag)
    if args.barcodes_file:
        barcodes = dict.fromkeys(open(args.barcodes_file).read().split(), 0)
        subset_barcodes = True
    else:
        barcodes = dict()
        subset_barcodes = False

    in_samfile = open(args.input, "r")
    in_sam = simplesam.Reader(in_samfile)
    print("Selected TAG name: {name}".format(name=args.tag_name), file=stderr)
    info("Analyzing file {file}".format(file=args.input))
    selected_reads = 0
    for tot_reads, read in enumerate(in_sam):
        if (tot_reads + 1) % 100000 == 0:
            info("Reads (selected/total): {s}/{t}".format(s=str(selected_reads),
                                                          t=str(tot_reads + 1)))
        cell_bc = read.tags.get(args.tag_name)
        if (cell_bc and subset_barcodes and (cell_bc in barcodes)) or (cell_bc and not subset_barcodes):
            selected_reads += 1
            out_filename = join_path(args.outdir, cell_bc + ".sam")
            if not barcodes.get(cell_bc):  # NB: both get zero and get null equal false
                # first read for this barcode: write a new SAM file with header
                barcodes[cell_bc] = barcodes.get(cell_bc, 0) + 1
                with open(out_filename, "w") as out_samfile:
                    out_sam = simplesam.Writer(out_samfile, in_sam.header)
                    out_sam.write(read)
            else:
                # barcode already seen: append the bare record
                barcodes[cell_bc] += 1
                with open(out_filename, "a") as out_samfile:
                    print(str(read), end="", file=out_samfile)
    in_sam.close()
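# Hedged scaffold for the helpers main() relies on; the original file's import
# block is not shown, so these are inferred from the calls above ('info' is
# assumed to be a logging helper).
from argparse import ArgumentParser
from sys import stderr
from os import mkdir
from os.path import join as join_path
from logging import info
import simplesam

if __name__ == "__main__":
    main()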
def _get_base_alignment(self, read):
    if self.bwapy_aligner:
        # align in-process with bwapy
        alignments = self.bwapy_aligner.align_seq(''.join(read.sequence))
        if len(alignments) == 0:
            return None
        alignment = alignments[0]
        cigar = alignment.cigar
        is_reverse_complement = alignment.orient == '-'
        mapped_position = alignment.pos
    else:
        # fall back to an external bwa mem call via temporary files
        read_fastq_filename = None
        with tempfile.NamedTemporaryFile(mode='w', delete=False,
                                         prefix='nadavca_tmp', suffix='.fastq') as file:
            read_fastq_filename = file.name
            file.write(read.fastq)
        bwa_output_filename = None
        with tempfile.NamedTemporaryFile(delete=True,
                                         prefix='nadavca_tmp', suffix='.sam') as file:
            bwa_output_filename = file.name
        subprocess.run([self.bwa_executable, 'mem',
                        self.reference_filename,
                        read_fastq_filename,
                        '-o', bwa_output_filename],
                       stderr=subprocess.PIPE,
                       check=True)
        with simplesam.Reader(open(bwa_output_filename, 'r')) as reader:
            sam = reader.next()
            if not sam.mapped:
                return None
            cigar = sam.cigar
            is_reverse_complement = sam.reverse
            mapped_position = sam.pos - 1
        os.remove(read_fastq_filename)
        os.remove(bwa_output_filename)

    oriented_read = Genome.reverse_complement(read.sequence) if is_reverse_complement else read.sequence
    index_in_read = 0
    index_in_reference = mapped_position
    base_mapping = []
    parsed_cigar = self._parse_cigar(cigar)
    for num, operation in parsed_cigar:
        if operation == 'S':
            index_in_read += num
        elif operation == 'M':
            for i in range(num):
                # record only positions where read and reference agree
                if self.reference[index_in_reference] == oriented_read[index_in_read]:
                    base_mapping.append((index_in_read, index_in_reference))
                index_in_read += 1
                index_in_reference += 1
        elif operation == 'D':
            index_in_reference += num
        elif operation == 'I':
            index_in_read += num
        else:
            raise ValueError('Unknown cigar operation: {}'.format(operation))
    if is_reverse_complement:
        for i, val in enumerate(base_mapping):
            base_mapping[i] = (len(read.sequence) - 1 - val[0],
                               len(self.reference) - 1 - val[1])
        base_mapping.reverse()
    return numpy.array(base_mapping, dtype=int), is_reverse_complement
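# _parse_cigar is referenced above but not shown; a minimal sketch that would
# satisfy the loop, assuming `cigar` is a standard SAM CIGAR string such as
# '5S20M1I30M' (the project's real method may differ).
import re

def _parse_cigar(self, cigar):
    # return (length, operation) pairs, e.g. [(5, 'S'), (20, 'M'), (1, 'I'), (30, 'M')]
    return [(int(n), op) for n, op in re.findall(r'(\d+)([MIDNSHP=X])', cigar)]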
def processRegion(R):  # samtools format region string
    global CONTIGS, fname
    chrom = R.split(":")[0]
    if chrom not in CONTIGS:
        return []
    start = int(R.split(":")[1].split("-")[0])
    end = int(R.split(":")[1].split("-")[1])
    predictionArray = []  # list of feature vectors; 0th element in each vector is chrom and 1st is bp position
    SITES = dict()
    last_site_eval = -1
    nreads = 0
    with open(fname, 'rb') as filenameopen:
        samfile = simplesam.Reader(filenameopen, regions=R)
        while True:
            try:
                # get next read
                read = samfile.next()
                if read.duplicate or not read.passing or read.secondary:
                    continue
                nreads += 1
                (clip, clipLen, ind, pair, strands) = readFeatures(read)
                gappedSeq = read.gapped('seq')    # one-time compute
                gappedQual = read.gapped('qual')
                # Features for the whole read
                for (p, refpos) in enumerate(read.coords):  # "genomic coordinates for the gapped alignment"
                    if gappedSeq[p] == "-":
                        continue
                    refpos = p + read.pos
                    if refpos in SITES:
                        S = SITES[refpos]
                    else:
                        S = [refpos, [], [], [], [], [], [], [], []]  # pos, bases, bqs, readpos, clip, ind, pair, strands, clipLen
                        SITES[refpos] = S
                    S[1].append(gappedSeq[p])
                    S[2].append(ord(gappedQual[p]) - 33)
                    S[3].append(min((len(gappedSeq) - p), p))  # not exact due to indel
                    S[4].append(clip)
                    S[5].append(ind)
                    S[6].append(pair)
                    S[7].append(strands)
                    S[8].append(clipLen)
                # batch size for evaluating sites where all reads seen -> memory savings
                if nreads >= READBATCH or len(SITES) > SITEBATCH:
                    for refpos in list(SITES):  # a list of the keys, so SITES can shrink during iteration
                        if refpos >= max(last_site_eval + 1, start) and refpos < min(read.pos, end):
                            S = SITES[refpos]
                        else:
                            continue
                        feature_vector = siteFeatures(S)
                        if feature_vector is not None:
                            predictionArray.append([chrom, refpos] + feature_vector)
                        del SITES[S[0]]
                    last_site_eval = read.pos - 1
                    nreads = 0
            except StopIteration:  # no more reads
                for refpos in list(SITES):  # a list of the keys, so SITES can shrink during iteration
                    if refpos >= max(last_site_eval + 1, start) and refpos < min(read.pos, end):
                        S = SITES[refpos]
                    else:
                        continue
                    feature_vector = siteFeatures(S)
                    if feature_vector is not None:
                        predictionArray.append([chrom, refpos - 1] + feature_vector)
                    del SITES[S[0]]
                break  # all sites classified
        samfile.close()
        samfile.p.wait()  # Prevent Z-status samtools processes
    sys.stderr.write(R + "\n")
    return predictionArray
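# Illustrative call of the batched extractor above, with stand-in values for
# the module globals it expects; the real script defines CONTIGS, READBATCH,
# SITEBATCH, fname, readFeatures, and siteFeatures (see the driver below).
fname = 'sample.bam'                  # illustrative path
CONTIGS = {'chr20'}                   # contigs accepted by processRegion
READBATCH, SITEBATCH = 1000, 100000   # illustrative batching thresholds

rows = processRegion('chr20:1000000-1100000')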
parser = argparse.ArgumentParser(description='python filter.py --bam sample.bam --nproc 8')
parser.add_argument('--bam', help='Input bam file name', required=True)
parser.add_argument('--nproc', help='parallelism', required=False, default=1)
args = parser.parse_args()

NFEAT = 33
NPAR = int(args.nproc)
fname = args.bam

with open(fname, 'rb') as F:
    S = simplesam.Reader(F)
    regions = S.tile_genome(WINDOW_SIZE)
    S.close()
    S.p.wait()

if NPAR > 1:
    import multiprocessing as mp
    # a new process is started for each region, max NPAR active at a time
    pool = mp.Pool(processes=NPAR, maxtasksperchild=1)
    for result in pool.imap_unordered(processRegion, regions):
        p = [str(x) for x in result]  # predictionArray
        if len(p) > 0:
            print("\n".join(p))
else:
    for r in regions:
        p = [str(x) for x in processRegion(r)]  # predictionArray
        if len(p) > 0:
            print("\n".join(p))
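# Hedged sketch: tile_genome() (used above) derives region strings from the
# BAM header, which can be inspected before committing to a full run; the
# window width and path here are illustrative.
with open('sample.bam', 'rb') as F:
    S = simplesam.Reader(F)
    for region in S.tile_genome(100000):
        print(region)  # samtools-style 'chrom:start-end' strings, as parsed by processRegion
        break
    S.close()
    S.p.wait()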
import pyfastx
import simplesam
import os

os.chdir('/research/projects/yu3grp/IO_JY/yu3grp/LVXSCID/patients_scATACseq/multiome_P1')

bam_file = './03_chimeric/P1_scMulti_ATAC_S1_pe.mated.filter.bam'
out_sam_file = './04_match_CB/P1_scMulti_ATAC_S1_pe.mated.filter_wCB.sam'
cellID_file = './04_match_CB/P1_scMulti_ATAC_S1_pe.mated.filter_R2.fastq'

#fa = pyfastx.Fastx('./LVX_SCID_P1_S1_L001_pe.mated.filter2.bam_readbarcode')
fa = pyfastx.Fastx(cellID_file)

# map read name -> cell barcode sequence
barcodes = {}
for name, seq, qual, comment in fa:
    barcodes[name] = seq

barcode_tag = 'CB'
with simplesam.Reader(open(bam_file)) as in_bam:
    with simplesam.Writer(open(out_sam_file, 'w'), in_bam.header) as out_sam:
        for read in in_bam:
            #read[umi_tag] = barcodes[read.qname][0]
            read[barcode_tag] = barcodes[read.qname]  # attach the CB tag to each read
            out_sam.write(read)
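# Note (an assumption about current pyfastx behavior): Fastx() yields
# (name, seq, qual) 3-tuples for FASTQ unless comment=True is passed, so if
# the 4-way unpacking above raises ValueError, this variant should work:
fa = pyfastx.Fastx(cellID_file, comment=True)
barcodes = {name: seq for name, seq, qual, comment in fa}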
def processRegion(R):  # bed interval -> samtools format region string
    global fname, fastafname, V
    chrom = R[0]
    start = int(R[1])
    r = chrom + ":" + str(start + 1) + "-" + str(start + 2)  # 1-indexed
    alignments = []
    alignmentStarts = []
    alignmentEnds = []
    readNames = []
    basesAtSite = []
    pileupStart = None
    pileupEnd = None
    with open(fname, 'rb') as filenameopen:
        samfile = simplesam.Reader(filenameopen, regions=r)
        # Fasta() is apparently not "thread safe" to use as a global, and creating
        # it per-process does not affect performance
        reference = Fasta(fastafname)
        numPhased = 0
        while True:
            try:
                # get next read
                read = samfile.next()
                if read.duplicate or not read.passing or read.secondary:
                    continue
                readNames.append(read.qname)
                try:
                    HP = read["PH"]
                    numPhased += 1
                except KeyError:
                    HP = None
                C = read.cigars
                if C[0][1] == "S":
                    qstart = C[0][0]
                else:
                    qstart = 0
                rstart = read.pos
                if pileupStart is None or rstart < pileupStart:
                    pileupStart = rstart
                if C[-1][1] == "S":
                    qend = C[-1][0]
                else:
                    qend = 0
                rend = rstart + len(read)
                if pileupEnd is None or rend > pileupEnd:
                    pileupEnd = rend
                qSeqAln = read.gapped('seq')  # mismatches and indels
                alignmentStarts.append(rstart)
                alignmentEnds.append(rend)
                refSeq = reference[R[0]][rstart - 1:rend - 1].seq.upper()
                thisReadAln = []
                P = read.coords  # positions in reference for qSeqAln
                addedBase = False
                try:
                    for i in range(len(P)):
                        refPos = P[i]  # position in reference
                        qChar = qSeqAln[i]
                        if refPos == start:
                            # don't include the site of interest - see if they cluster otherwise.
                            if i > 0 and refPos - 1 != P[i - 1] or qChar != refSeq[refPos - rstart]:
                                basesAtSite.append(1)
                            else:
                                basesAtSite.append(0)
                            addedBase = True
                        elif i > 0 and refPos - 1 != P[i - 1]:
                            if len(thisReadAln) > 0:
                                thisReadAln[-1] = 1  # insertion in query - change previous position in ref.
                        elif qChar == "-":
                            thisReadAln.append(1)  # deletion in query
                        elif qChar != refSeq[refPos - rstart]:
                            thisReadAln.append(1)  # mismatch
                        else:
                            thisReadAln.append(0)
                except IndexError:
                    sys.stderr.write(str(len(P)) + "\n")
                    sys.stderr.write(str(len(qSeqAln)) + "\n")
                    sys.stderr.write(str(len(refSeq)) + "\n")
                    sys.stderr.write(str(max(P)) + "\n")
                    sys.stderr.write(str(rstart) + "\n")
                    sys.stderr.write(str(P) + "\n")
                    sys.stderr.write(str(qSeqAln) + "\n")
                    sys.stderr.write(str(refSeq) + "\n")
                    return None
                alignments.append(thisReadAln)
                if not addedBase:
                    basesAtSite.append(1)  # read didn't align to that position
            except StopIteration:  # no more reads
                depth = len(readNames)
                if depth <= 0:
                    predictionString = None
                    sys.stderr.write(str(R) + "\n")
                    break
                fracPhased = 1.0 * numPhased / depth
                totalLen = pileupEnd - pileupStart + 2
                depthAlt = [0 for _ in range(totalLen)]
                depthRef = [0 for _ in range(totalLen)]
                sumRef = [0 for _ in range(totalLen)]
                sumAlt = [0 for _ in range(totalLen)]
                readsAtSite = [None for _ in range(depth)]
                for (i, seq) in enumerate(alignments):
                    L = len(seq)
                    # the 1's surrounding seq mark the read ends - hopefully columns are still in line.
                    readsAtSite[i] = ([0 for _ in range(alignmentStarts[i] - pileupStart)]
                                      + [1] + seq + [1]
                                      + [0 for _ in range(pileupEnd - alignmentStarts[i] - L)])
                    if basesAtSite[i] == 0:
                        for j in range(-1, L + 1):
                            depthRef[j + alignmentStarts[i] - pileupStart + 1] += 1
                        for j in range(totalLen):
                            sumRef[j] += readsAtSite[i][j]  # events on h0
                    else:
                        for j in range(-1, L + 1):
                            depthAlt[j + alignmentStarts[i] - pileupStart + 1] += 1
                        for j in range(totalLen):
                            sumAlt[j] += readsAtSite[i][j]  # events on h1
                min_pval = 1
                min_table = None
                min_start = 0
                for k in range(totalLen):
                    if chrom + ":" + str(pileupStart + k) in V:
                        continue  # called variant from VCF
                    p = pvalue(sumAlt[k], depthAlt[k] - sumAlt[k],
                               sumRef[k], depthRef[k] - sumRef[k])
                    if p.two_tail < min_pval:
                        min_pval = p.two_tail
                if min_pval < MIN_FISHER_PVAL:
                    predictionString = None
                else:
                    predictionString = "\t".join(R + [str(min_pval), str(fracPhased)])
                break
        samfile.close()
        samfile.p.wait()  # Prevent Z-status samtools process
    return predictionString
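# Hedged usage sketch for the phased-pileup test above: R is a BED-style
# [chrom, start] record, pvalue() is assumed to come from the 'fisher'
# package, Fasta from pyfaidx, and V holds 'chrom:pos' strings for known
# variant sites. All values are illustrative.
import sys
import simplesam
from pyfaidx import Fasta
from fisher import pvalue

fname = 'sample.bam'
fastafname = 'reference.fa'
V = set()
MIN_FISHER_PVAL = 0.05

line = processRegion(['chr1', '1000000'])
if line is not None:
    print(line)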