def fetch_count_read(alignment_file, seq_name, start, end):
    """
    Count the number of reads that at least partly overlap a specified
    chromosomal region.
    @param alignment_file Path to a SAM or BAM file
    @param seq_name       Name of the sequence the reads are aligned to
    @param start          Start genomic coordinate of the area of alignment
    @param end            End genomic coordinate of the area of alignment
    """
    # Specific imports
    from pysam import AlignmentFile

    # Init a generator on the sam or bam file with pysam
    if alignment_file[-3:].lower() == "bam":
        al = AlignmentFile(alignment_file, "rb")
    elif alignment_file[-3:].lower() == "sam":
        al = AlignmentFile(alignment_file, "r")
    else:
        raise Exception("Wrong file format (sam or bam)")

    # Count reads aligned at least partly on the specified region
    n = 0
    for i in al.fetch(seq_name, start, end):
        n += 1

    al.close()
    return n
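# Hedged usage sketch (not part of the original example): the file name,
# contig and coordinates below are assumptions. fetch() needs an indexed BAM;
# pysam's own AlignmentFile.count() gives the same tally directly.
n = fetch_count_read("sample.bam", "chr1", 10000, 20000)
print("reads overlapping chr1:10000-20000:", n)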
def subset_bamfile(sam, barcodes):
    """
    Subset a SAM/BAM file, keeping only alignments from the given cellular
    barcodes.
    """
    from pysam import AlignmentFile

    start_time = time.time()

    sam_file = open_bamfile(sam)
    out_file = AlignmentFile("-", "wh", template=sam_file)
    track = sam_file.fetch(until_eof=True)

    # peek at the first alignment to determine the annotations
    queryalignment = next(track)
    annotations = detect_alignment_annotations(queryalignment)
    track = itertools.chain([queryalignment], track)

    re_string = construct_transformed_regex(annotations)
    parser_re = re.compile(re_string)
    barcodes = set(barcode.strip() for barcode in barcodes)

    for count, aln in enumerate(track, start=1):
        if count and not count % 1000000:
            logger.info("Processed %d alignments." % count)
        match = parser_re.match(aln.qname)
        tags = aln.tags
        if "cellular" in annotations:
            cb = match.group('CB')
            if cb in barcodes:
                out_file.write(aln)
def get_counts(args):
    """function to get fragment counts per region"""
    if args.out is None:
        args.out = '.'.join(os.path.basename(args.bed).split('.')[0:-1])
    chunks = ChunkList.read(args.bed)
    mat = np.zeros(len(chunks), dtype=np.int)
    bamHandle = AlignmentFile(args.bam)
    j = 0
    for chunk in chunks:
        for read in bamHandle.fetch(chunk.chrom, max(0, chunk.start - args.upper),
                                    chunk.end + args.upper):
            if read.is_proper_pair and not read.is_reverse:
                if args.atac:
                    # get left position
                    l_pos = read.pos + 4
                    # get insert size
                    # correct by 8 base pairs to be insertion to insertion
                    ilen = abs(read.template_length) - 8
                else:
                    l_pos = read.pos
                    ilen = abs(read.template_length)
                r_pos = l_pos + ilen - 1
                if _between(ilen, args.lower, args.upper) and (_between(l_pos, chunk.start, chunk.end) or
                                                               _between(r_pos, chunk.start, chunk.end)):
                    mat[j] += 1
        j += 1
    bamHandle.close()
    np.savetxt(args.out + ".counts.txt.gz", mat, delimiter="\n", fmt='%i')
def bamtag(sam, umi_only):
    ''' Convert a BAM/SAM with fastqtransformed read names to have UMI and
    cellular barcode tags.
    '''
    from pysam import AlignmentFile

    if umi_only:
        parser_re = re.compile('.*:UMI_(?P<MB>.*)')
    else:
        parser_re = re.compile('.*:CELL_(?P<CB>.*):UMI_(?P<MB>.*)')

    start_time = time.time()

    sam_mode = 'r' if sam.endswith(".sam") else 'rb'
    sam_file = AlignmentFile(sam, mode=sam_mode)
    out_file = AlignmentFile("-", "wh", template=sam_file)
    track = sam_file.fetch(until_eof=True)

    for count, aln in enumerate(track):
        if not count % 100000:
            logger.info("Processed %d alignments." % count)
        match = parser_re.match(aln.qname)
        tags = aln.tags
        if not umi_only:
            aln.tags += [('XC', match.group('CB'))]
        aln.tags += [('XR', match.group('MB'))]
        out_file.write(aln)

    total_time = time.time() - start_time
    logger.info('BAM tag conversion done - {:.3}s, {:,} alns/min'.format(
        total_time, int(60. * count / total_time)))
def umappedq2zero(bamdir):
    """
    Reads in a BAM file, setting the MAPQ value for an alignment segment to
    zero if it is unmapped. Opens up both infile and outfile and outputs these
    modified reads to outfile.
    """
    if not os.path.exists(bamdir):
        sys.stderr.write("Sorry, but the specified directory does not exist.")
        sys.exit(1)

    bamfiles = os.listdir(bamdir)
    bampaths = filter(lambda x: x.endswith(".bam"), bamfiles)
    bampaths = map(lambda x: os.path.join(bamdir, x), bampaths)

    for bam in bampaths:
        inbam = AlignmentFile(bam, "rb")
        # Template is specified to maintain the same header information.
        outbam = AlignmentFile("temp.bam", "wb", template=inbam)
        # Construct reads iterator using fetch.
        reads = inbam.fetch(until_eof=True)
        for read in reads:
            if read.is_unmapped:
                read.mapping_quality = 0
            outbam.write(read)  # Don't omit any reads!
        # Overwrite the original with the new file with MAPQs set to zero.
        os.rename("temp.bam", bam)
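# Hedged usage sketch (not part of the original example): the directory path
# is an assumption. Every *.bam in the directory is rewritten in place with
# unmapped reads forced to MAPQ 0.
umappedq2zero("alignments/")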
def _read_bam_frag(inbam, filter_exclude, all_bins, sections1, sections2,
                   rand_hash, resolution, tmpdir, region, start, end,
                   half=False, sum_columns=False):
    bamfile = AlignmentFile(inbam, 'rb')
    refs = bamfile.references
    bam_start = start - 2
    bam_start = max(0, bam_start)
    try:
        dico = {}
        for r in bamfile.fetch(region=region, start=bam_start, end=end,  # coords start at 0
                               multiple_iterators=True):
            if r.flag & filter_exclude:
                continue
            crm1 = r.reference_name
            pos1 = r.reference_start + 1
            crm2 = refs[r.mrnm]
            pos2 = r.mpos + 1
            try:
                pos1 = sections1[(crm1, pos1 // resolution)]
                pos2 = sections2[(crm2, pos2 // resolution)]
            except KeyError:
                continue  # not in the subset matrix we want
            crm = crm1 * (crm1 == crm2)
            try:
                dico[(crm, pos1, pos2)] += 1
            except KeyError:
                dico[(crm, pos1, pos2)] = 1
            # print '%-50s %5s %9s %5s %9s' % (r.query_name,
            #                                  crm1, r.reference_start + 1,
            #                                  crm2, r.mpos + 1)
        if half:
            # iterate over a copy of the keys so entries can be deleted safely
            for c, i, j in list(dico):
                if i < j:
                    del dico[(c, i, j)]
        out = open(os.path.join(tmpdir, '_tmp_%s' % (rand_hash),
                                '%s:%d-%d.tsv' % (region, start, end)), 'w')
        out.write(''.join('%s\t%d\t%d\t%d\n' % (c, a, b, v)
                          for (c, a, b), v in dico.items()))
        out.close()
        if sum_columns:
            sumcol = {}
            cisprc = {}
            for (c, i, j), v in dico.items():
                # out.write('%d\t%d\t%d\n' % (i, j, v))
                try:
                    sumcol[i] += v
                    cisprc[i][all_bins[i][0] == all_bins[j][0]] += v
                except KeyError:
                    sumcol[i] = v
                    cisprc[i] = [0, 0]
                    cisprc[i][all_bins[i][0] == all_bins[j][0]] += v
            return sumcol, cisprc
    except Exception as e:
        exc_type, exc_obj, exc_tb = exc_info()
        fname = os.path.split(exc_tb.tb_frame.f_code.co_filename)[1]
        print(e)
        print(exc_type, fname, exc_tb.tb_lineno)
# The docstring describes this as a context manager; given the yield-based
# body, it presumably relies on contextlib.contextmanager (assumption).
from contextlib import contextmanager

@contextmanager
def samfile(filename):
    """
    A context manager to open and close a SAM/BAM file.

    @param filename: A C{str} file name to open.
    """
    f = AlignmentFile(filename)
    yield f
    f.close()
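# Hedged usage sketch (file name, contig and coordinates are assumptions): the
# context manager guarantees the AlignmentFile is closed when the block exits.
with samfile("sample.bam") as bam:
    for aln in bam.fetch("chr1", 0, 1000):
        print(aln.query_name)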
def test_downgrade_read_edges_binary(self):
    binary = os.path.join(BAM_BIN_DIR, "downgrade_bam_edge_qual")
    bam_fpath = os.path.join(TEST_DATA_DIR, "sample_rev.bam")
    with NamedTemporaryFile() as out_fhand:
        cmd = [binary, "-o", out_fhand.name, bam_fpath]
        check_call(cmd)
        sam = AlignmentFile(out_fhand.name)
        res = [0, 0]
        read = next(sam)
        assert list(read.query_qualities[:2]) == res
        assert read.get_tag("dl") == "8)5B"
        assert read.get_tag("dr") == "8?>>"
def extract_barcode(sam, barcode):
    parser_re = re.compile(".*:CELL_(?P<CB>.*):UMI_(?P<MB>.*)")
    sam_file = AlignmentFile(sam, mode="r")
    filter_file = AlignmentFile("-", mode="wh", template=sam_file)
    track = sam_file.fetch(until_eof=True)
    for i, aln in enumerate(track):
        if aln.is_unmapped:
            continue
        match = parser_re.match(aln.qname)
        CB = match.group("CB")
        if CB == barcode:
            filter_file.write(aln)
def read_bam_frag_filter(inbam, filter_exclude, all_bins, sections, resolution,
                         outdir, extra_out, region, start, end):
    bamfile = AlignmentFile(inbam, 'rb')
    refs = bamfile.references
    try:
        dico = {}
        for r in bamfile.fetch(region=region,
                               start=start - (1 if start else 0), end=end,  # coords start at 0
                               multiple_iterators=True):
            if r.flag & filter_exclude:
                continue
            crm1 = r.reference_name
            pos1 = r.reference_start + 1
            crm2 = refs[r.mrnm]
            pos2 = r.mpos + 1
            try:
                pos1 = sections[(crm1, pos1 // resolution)]
                pos2 = sections[(crm2, pos2 // resolution)]
            except KeyError:
                continue  # not in the subset matrix we want
            try:
                dico[(pos1, pos2)] += 1
            except KeyError:
                dico[(pos1, pos2)] = 1
        cisprc = {}
        for (i, j), v in dico.items():
            if all_bins[i][0] == all_bins[j][0]:
                try:
                    cisprc[i][0] += v
                    cisprc[i][1] += v
                except KeyError:
                    cisprc[i] = [v, v]
            else:
                try:
                    cisprc[i][1] += v
                except KeyError:
                    cisprc[i] = [0, v]
        # pickled output has to be written in binary mode
        out = open(path.join(outdir, 'tmp_%s:%d-%d_%s.pickle' % (
            region, start, end, extra_out)), 'wb')
        dump(dico, out, HIGHEST_PROTOCOL)
        out.close()
        out = open(path.join(outdir, 'tmp_bins_%s:%d-%d_%s.pickle' % (
            region, start, end, extra_out)), 'wb')
        dump(cisprc, out, HIGHEST_PROTOCOL)
        out.close()
    except Exception as e:
        exc_type, exc_obj, exc_tb = exc_info()
        fname = path.split(exc_tb.tb_frame.f_code.co_filename)[1]
        print(e)
        print(exc_type, fname, exc_tb.tb_lineno)
def test_calmd_bam(self):
    ref_fpath = os.path.join(TEST_DATA_DIR, "CUUC00007_TC01.fasta")
    bam_fpath = os.path.join(TEST_DATA_DIR, "sample.bam")
    orig_qual = next(AlignmentFile(bam_fpath)).qual
    try:
        out_bam = NamedTemporaryFile()
        calmd_bam(bam_fpath, ref_fpath, out_bam.name)
        samfile = AlignmentFile(out_bam.name)
        calmd_qual = next(samfile).qual
        assert orig_qual != calmd_qual
        assert calmd_qual == "HHHHHHBHGGH!!!!!!!!!!!!!!!!!!!!!!!!!!!"
    finally:
        if os.path.exists(out_bam.name):
            out_bam.close()
def __init__(self, filename):
    self.samfile = AlignmentFile(filename)
    # self.referenceInsertions will be keyed by offset into the reference
    # sequence. The inserted bases would need to begin at this offset. The
    # value will be a Counter whose keys are the nucleotides proposed for
    # insertion, with a value indicating how many times the nucleotide was
    # proposed for insertion at that offset.
    self.referenceInsertions = defaultdict(Counter)
def gather_sv_data(options, collection):
    # Read regions of interest BED file
    regions = BedTool(options.region_file)

    # Read BAM file
    bamfile = AlignmentFile(options.bam_file, "rb")

    # Intersect regions
    for reg in regions:
        for read in bamfile.fetch(reg.chrom, reg.start, reg.end):
            # print read
            if read.query_name.endswith("2d"):
                collection[read.query_name] = []
            if read.query_name.startswith("ctg"):
                collection[read.query_name] = []
            # print read.reference_id, read.reference_start, read.reference_end
            # print read.query_name, read.query_alignment_start, read.query_alignment_end

    bamfile.close()
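# Hedged usage sketch (not part of the original example): `options` only needs
# `region_file` and `bam_file` attributes, so a plain namespace stands in for
# the real argument parser; the file names are assumptions and the BAM must be
# indexed for fetch() to work.
from types import SimpleNamespace

opts = SimpleNamespace(region_file="regions.bed", bam_file="sample.bam")
sv_reads = {}
gather_sv_data(opts, sv_reads)
print(len(sv_reads), "candidate read names collected")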
def bamtag(sam):
    ''' Convert a BAM/SAM with fastqtransformed read names to have UMI and
    cellular barcode tags.
    '''
    from pysam import AlignmentFile

    start_time = time.time()

    sam_file = open_bamfile(sam)
    out_file = AlignmentFile("-", "wh", template=sam_file)
    track = sam_file.fetch(until_eof=True)

    # peek at the first alignment to determine the annotations
    if is_python3():
        queryalignment = next(track)
    else:
        queryalignment = track.next()
    annotations = detect_alignment_annotations(queryalignment)
    track = itertools.chain([queryalignment], track)

    re_string = construct_transformed_regex(annotations)
    parser_re = re.compile(re_string)

    for count, aln in enumerate(track, start=1):
        if count and not count % 1000000:
            logger.info("Processed %d alignments." % count)
        match = parser_re.match(aln.qname)
        tags = aln.tags
        if "cellular" in annotations:
            aln.tags += [('XC', match.group('CB'))]
        if "molecular" in annotations:
            aln.tags += [('RX', match.group('MB'))]
        if "sample" in annotations:
            aln.tags += [('XS', match.group('SB'))]
        out_file.write(aln)

    total_time = time.time() - start_time
    logger.info('BAM tag conversion done - {:.3}s, {:,} alns/min'.format(
        total_time, int(60. * count / total_time)))
    logger.info("Processed %d alignments." % count)
def __init__(self, fname, referenceFastaFname=None):
    self.filename = fname = abspath(expanduser(fname))
    self.peer = AlignmentFile(fname, "rb", check_sq=False)
    self._checkFileCompatibility()

    self._loadReferenceInfo()
    self._loadReadGroupInfo()
    self._loadProgramInfo()

    self.referenceFasta = None
    if referenceFastaFname is not None:
        if self.isUnmapped:
            raise ValueError("Unmapped BAM file--reference FASTA should not "
                             "be given as argument to BamReader")
        self._loadReferenceFasta(referenceFastaFname)
    return bigg


if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("-b", "--bamfile", help="the sorted and indexed bam file")
    parser.add_argument("-o", "--out", default="bigg.bed", help="the output file name")
    args = parser.parse_args()

    # make a file using the functions
    samfile = AlignmentFile(args.bamfile)
    fw = open(args.out, "w")

    for n, record in enumerate(samfile):
        try:
            bigg = sam_to_bigGenePred(record)
            fw.write(bigg.to_str())
            fw.write("\n")
        except ValueError:
            pass
        # if n > 100:
        #     break

    fw.close()
    samfile.close()
def __init__(self, output, indexed_sequence_list, index_options):
    header = self.build_header(indexed_sequence_list, index_options)
    ensure_dir_exists(output)
    self.writer = AlignmentFile(output, 'wb', header=header)
    self.lock = Lock()
def test_downngrade_read_edges(self):
    # With softclip
    bam_fpath = os.path.join(TEST_DATA_DIR, "sample.bam")
    sam = AlignmentFile(bam_fpath)
    aligned_read = next(sam)
    _downgrade_edge_qualities(aligned_read, size=4, qual_to_substract=30)
    res = [9, 9, 9, 9, 9, 9, 3, 9, 8, 8, 9, 9, 9, 9, 9, 39, 39, 39, 38, 38,
           36, 33, 36, 38, 36, 38, 38, 38, 38, 39, 39, 38, 38, 38, 9, 9, 9, 9]
    assert list(aligned_read.query_qualities) == res

    # without softclip
    sam = AlignmentFile(os.path.join(TEST_DATA_DIR, "seqs.bam"))
    aligned_read = next(sam)
    _downgrade_edge_qualities(aligned_read, size=4, qual_to_substract=30)
    expected = [11, 13, 11, 11, 37, 43, 43, 46, 46, 57, 57, 48, 57, 57, 42,
                41, 32, 35, 38, 38, 38, 38, 41, 41, 39, 37, 37, 44, 42, 48,
                47, 57, 47, 47, 48, 47, 57, 57, 54, 48, 57, 48, 54, 50, 50,
                50, 50, 50, 57, 59, 54, 54, 54, 57, 57, 59, 57, 52, 52, 52,
                52, 57, 57, 57, 57, 52, 52, 52, 52, 29, 27, 27, 22]
    assert list(aligned_read.query_qualities) == expected

    # reverse
    # rev seqs (the SAM specification stores the alignment query forward
    # (cigar, seq, qual, ...); the reverse orientation is only noted in the flag)
    bam_fpath = os.path.join(TEST_DATA_DIR, "sample_rev.bam")
    sam = AlignmentFile(bam_fpath)
    aligned_read = next(sam)
    aligned_read = next(sam)
    aligned_read = next(sam)
    _downgrade_edge_qualities(aligned_read, size=4, qual_to_substract=30)
    res = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0]
    assert list(aligned_read.query_qualities[:14]) == res
def tagcount(sam, out, genemap, output_evidence_table, positional, minevidence, cb_histogram, cb_cutoff, no_scale_evidence, subsample): ''' Count up evidence for tagged molecules ''' from pysam import AlignmentFile from io import StringIO import pandas as pd from utils import weigh_evidence logger.info('Reading optional files') gene_map = None if genemap: with open(genemap) as fh: try: gene_map = dict(p.strip().split() for p in fh) except ValueError: logger.error('Incorrectly formatted gene_map, need to be tsv.') sys.exit() if positional: tuple_template = '{0},{1},{2},{3}' else: tuple_template = '{0},{1},{3}' if not cb_cutoff: cb_cutoff = 0 if cb_histogram and cb_cutoff == "auto": cb_cutoff = guess_depth_cutoff(cb_histogram) cb_cutoff = int(cb_cutoff) cb_hist = None filter_cb = False if cb_histogram: cb_hist = pd.read_table(cb_histogram, index_col=0, header=-1, squeeze=True) total_num_cbs = cb_hist.shape[0] cb_hist = cb_hist[cb_hist > cb_cutoff] logger.info('Keeping {} out of {} cellular barcodes.'.format(cb_hist.shape[0], total_num_cbs)) filter_cb = True parser_re = re.compile('.*:CELL_(?P<CB>.*):UMI_(?P<MB>.*)') if subsample: logger.info('Creating reservoir of subsampled reads ({} per cell)'.format(subsample)) start_sampling = time.time() reservoir = collections.defaultdict(list) cb_hist_sampled = 0 * cb_hist cb_obs = 0 * cb_hist sam_mode = 'r' if sam.endswith(".sam") else 'rb' sam_file = AlignmentFile(sam, mode=sam_mode) track = sam_file.fetch(until_eof=True) current_read = 'none_observed_yet' for i, aln in enumerate(track): if aln.qname == current_read: continue current_read = aln.qname match = parser_re.match(aln.qname) CB = match.group('CB') if CB not in cb_hist.index: continue cb_obs[CB] += 1 if len(reservoir[CB]) < subsample: reservoir[CB].append(i) cb_hist_sampled[CB] += 1 else: s = pd.np.random.randint(0, cb_obs[CB]) if s < subsample: reservoir[CB][s] = i index_filter = set(itertools.chain.from_iterable(reservoir.values())) sam_file.close() sampling_time = time.time() - start_sampling logger.info('Sampling done - {:.3}s'.format(sampling_time)) evidence = collections.defaultdict(int) logger.info('Tallying evidence') start_tally = time.time() sam_mode = 'r' if sam.endswith(".sam") else 'rb' sam_file = AlignmentFile(sam, mode=sam_mode) track = sam_file.fetch(until_eof=True) count = 0 unmapped = 0 kept = 0 nomatchcb = 0 current_read = 'none_observed_yet' count_this_read = True for i, aln in enumerate(track): count += 1 if not count % 100000: logger.info("Processed %d alignments, kept %d." % (count, kept)) logger.info("%d were filtered for being unmapped." % unmapped) if filter_cb: logger.info("%d were filtered for not matching known barcodes." % nomatchcb) if aln.is_unmapped: unmapped += 1 continue if aln.qname != current_read: current_read = aln.qname if subsample and i not in index_filter: count_this_read = False continue else: count_this_read = True else: if not count_this_read: continue match = parser_re.match(aln.qname) CB = match.group('CB') if filter_cb: if CB not in cb_hist.index: nomatchcb += 1 continue MB = match.group('MB') txid = sam_file.getrname(aln.reference_id) if gene_map: target_name = gene_map[txid] else: target_name = txid e_tuple = tuple_template.format(CB, target_name, aln.pos, MB) # Scale evidence by number of hits if no_scale_evidence: evidence[e_tuple] += 1.0 else: evidence[e_tuple] += weigh_evidence(aln.tags) kept += 1 tally_time = time.time() - start_tally logger.info('Tally done - {:.3}s, {:,} alns/min'.format(tally_time, int(60. 
* count / tally_time))) logger.info('Collapsing evidence') buf = StringIO() for key in evidence: line = '{},{}\n'.format(key, evidence[key]) buf.write(unicode(line), "utf-8") buf.seek(0) evidence_table = pd.read_csv(buf) evidence_query = 'evidence >= %f' % minevidence if positional: evidence_table.columns=['cell', 'gene', 'umi', 'pos', 'evidence'] collapsed = evidence_table.query(evidence_query).groupby(['cell', 'gene'])['umi', 'pos'].size() else: evidence_table.columns=['cell', 'gene', 'umi', 'evidence'] collapsed = evidence_table.query(evidence_query).groupby(['cell', 'gene'])['umi'].size() expanded = collapsed.unstack().T if gene_map: # This Series is just for sorting the index genes = pd.Series(index=set(gene_map.values())) genes = genes.sort_index() # Now genes is assigned to a DataFrame genes = expanded.ix[genes.index] else: genes = expanded genes.replace(pd.np.nan, 0, inplace=True) logger.info('Output results') if subsample: cb_hist_sampled.to_csv('ss_{}_'.format(subsample) + os.path.basename(cb_histogram), sep='\t') if output_evidence_table: import shutil buf.seek(0) with open(output_evidence_table, 'w') as etab_fh: shutil.copyfileobj(buf, etab_fh) genes.to_csv(out)
def fasttagcount(sam, out, genemap, positional, minevidence, cb_histogram, cb_cutoff, subsample, parse_tags, gene_tags, umi_matrix): ''' Count up evidence for tagged molecules, this implementation assumes the alignment file is coordinate sorted ''' from pysam import AlignmentFile from io import StringIO import pandas as pd from utils import weigh_evidence if sam.endswith(".sam"): logger.error("To use the fasttagcount subcommand, the alignment file must be a " "coordinate sorted, indexed BAM file.") sys.exit(1) logger.info('Reading optional files') gene_map = None if genemap: with open(genemap) as fh: try: gene_map = dict(p.strip().split() for p in fh) except ValueError: logger.error('Incorrectly formatted gene_map, need to be tsv.') sys.exit() if positional: tuple_template = '{0},{1},{2},{3}' else: tuple_template = '{0},{1},{3}' if not cb_cutoff: cb_cutoff = 0 if cb_histogram and cb_cutoff == "auto": cb_cutoff = guess_depth_cutoff(cb_histogram) cb_cutoff = int(cb_cutoff) cb_hist = None filter_cb = False if cb_histogram: cb_hist = pd.read_csv(cb_histogram, index_col=0, header=-1, squeeze=True, sep="\t") total_num_cbs = cb_hist.shape[0] cb_hist = cb_hist[cb_hist > cb_cutoff] logger.info('Keeping {} out of {} cellular barcodes.'.format(cb_hist.shape[0], total_num_cbs)) filter_cb = True parser_re = re.compile('.*:CELL_(?P<CB>.*):UMI_(?P<MB>.*)') if subsample: logger.info('Creating reservoir of subsampled reads ({} per cell)'.format(subsample)) start_sampling = time.time() reservoir = collections.defaultdict(list) cb_hist_sampled = 0 * cb_hist cb_obs = 0 * cb_hist track = stream_bamfile(sam) current_read = 'none_observed_yet' for i, aln in enumerate(track): if aln.qname == current_read: continue current_read = aln.qname if parse_tags: CB = aln.get_tag('CR') else: match = parser_re.match(aln.qname) CB = match.group('CB') if CB not in cb_hist.index: continue cb_obs[CB] += 1 if len(reservoir[CB]) < subsample: reservoir[CB].append(i) cb_hist_sampled[CB] += 1 else: s = pd.np.random.randint(0, cb_obs[CB]) if s < subsample: reservoir[CB][s] = i index_filter = set(itertools.chain.from_iterable(reservoir.values())) sam_file.close() sampling_time = time.time() - start_sampling logger.info('Sampling done - {:.3}s'.format(sampling_time)) evidence = collections.defaultdict(lambda: collections.defaultdict(float)) bare_evidence = collections.defaultdict(float) logger.info('Tallying evidence') start_tally = time.time() sam_mode = 'r' if sam.endswith(".sam") else 'rb' sam_file = AlignmentFile(sam, mode=sam_mode) transcript_map = collections.defaultdict(set) sam_transcripts = [x["SN"] for x in sam_file.header["SQ"]] if gene_map: for transcript, gene in gene_map.items(): if transcript in sam_transcripts: transcript_map[gene].add(transcript) else: for transcript in sam_transcripts: transcript_map[transcript].add(transcript) missing_transcripts = set() alignments_processed = 0 unmapped = 0 kept = 0 nomatchcb = 0 current_read = 'none_observed_yet' current_transcript = None count_this_read = True transcripts_processed = 0 genes_processed = 0 cells = list(cb_hist.index) targets_seen = set() if umi_matrix: bare_evidence_handle = open(umi_matrix, "w") bare_evidence_handle.write(",".join(["gene"] + cells) + "\n") with open(out, "w") as out_handle: out_handle.write(",".join(["gene"] + cells) + "\n") for gene, transcripts in transcript_map.items(): for transcript in transcripts: for aln in sam_file.fetch(transcript): alignments_processed += 1 if aln.is_unmapped: unmapped += 1 continue if gene_tags and not 
aln.has_tag('GX'): unmapped += 1 continue if aln.qname != current_read: current_read = aln.qname if subsample and i not in index_filter: count_this_read = False continue else: count_this_read = True else: if not count_this_read: continue if parse_tags: CB = aln.get_tag('CR') else: match = parser_re.match(aln.qname) CB = match.group('CB') if filter_cb: if CB not in cb_hist.index: nomatchcb += 1 continue if parse_tags: MB = aln.get_tag('UM') else: MB = match.group('MB') if gene_tags: target_name = aln.get_tag('GX').split(',')[0] else: txid = sam_file.getrname(aln.reference_id) if gene_map: if txid in gene_map: target_name = gene_map[txid] else: missing_transcripts.add(txid) continue else: target_name = txid targets_seen.add(target_name) # Scale evidence by number of hits evidence[CB][MB] += weigh_evidence(aln.tags) bare_evidence[CB] += weigh_evidence(aln.tags) kept += 1 transcripts_processed += 1 if not transcripts_processed % 1000: logger.info("%d genes processed." % genes_processed) logger.info("%d transcripts processed." % transcripts_processed) logger.info("%d alignments processed." % alignments_processed) earray = [] for cell in cells: umis = [1 for _, v in evidence[cell].items() if v >= minevidence] earray.append(str(sum(umis))) out_handle.write(",".join([gene] + earray) + "\n") earray = [] if umi_matrix: for cell in cells: earray.append(str(int(bare_evidence[cell]))) bare_evidence_handle.write(",".join([gene] + earray) + "\n") evidence = collections.defaultdict(lambda: collections.defaultdict(int)) bare_evidence = collections.defaultdict(int) genes_processed += 1 if umi_matrix: bare_evidence_handle.close() # fill dataframe with missing values, sort and output df = pd.read_csv(out, index_col=0, header=0) targets = pd.Series(index=set(transcript_map.keys())) targets = targets.sort_index() df = df.reindex(targets.index.values, fill_value=0) df = df.sort_index() df.to_csv(out) if umi_matrix: df = pd.read_csv(umi_matrix, index_col=0, header=0) df = df.reindex(targets.index.values, fill_value=0) df = df.sort_index() df.to_csv(umi_matrix)
def samfile_from_args(args):
    return AlignmentFile(args.bam)
def analyzeAlignment(self, alignmentOutputDirectory): print ('\nStep 2.) Parse the alignment and create a new consensus sequence.') # Load up the Alignment Reference file, we'll need it. alignmentReferenceFileName = join(alignmentOutputDirectory,'AlignmentReference.fasta') alignmentRef = list(parse(alignmentReferenceFileName, 'fasta'))[0] # Count the reads in the input file totalReadCount = len(list(parse(self.readInput, self.readInputFormat))) #self.readInputFormat #self.readInput # We generate a new consensus sequence from the alignment results. newConsensusSequence = "" # Open the bam file bamfile = AlignmentFile(join(alignmentOutputDirectory,'alignment.bam'), 'rb') # Open alignment analysis text file alignmentSummaryFile = createOutputFile(join(alignmentOutputDirectory,'AlignmentSummary.csv')) alignmentSummaryFile.write('Ref_Position,Ref_Base,Reference_Adjustment,Aligned_Count,Unaligned_Count,Match_Count,Mismatch_Count,In_Count,Del_Count,A_Count,G_Count,C_Count,T_Count\n') # A smaller log. I will provide human-readable descriptions of the # bases that were adjusted in the new consensus sequence. # TODO: Provide surrounding sequence as well, maybe it's a repeat region.... # Acutally NAH, I want to just put it in the wrangler log. #adjustedBasesSummaryFile = createOutputFile(join(alignmentOutputDirectory,'AdjustedBases.txt')) # Todo: I should keep a more structured array of info for these alignments. # Store this info into an object #class columnStats(): alignmentInfo = AlignmentInfo() # Keep a running total of adjustments made to the reference. # If this total is 0, then theoretically the consensus matches the alignment reference, and we're done. totalSequenceAdjustments = 0 # Iterate the reference sequence column by column. pileupIterator = bamfile.pileup(alignmentRef.id) for pileupColumn in pileupIterator: currentAlignmentColumn = AlignmentColumn() #columnResults = None # columnResults.name='ll' # """referencePosition = 0 referenceBase = '' referenceAdjustment = '?' alignedCount = 0 unalignedCount = 0 matchCount = 0 mismatchCount = 0 inCount = 0 delCount = 0 aCount = 0 gCount = 0 cCount = 0 tCount = 0""" currentAlignmentColumn.referencePosition = pileupColumn.reference_pos currentAlignmentColumn.referenceBase = alignmentRef[pileupColumn.reference_pos].upper() currentAlignmentColumn.alignedCount = pileupColumn.nsegments currentAlignmentColumn.unalignedCount = totalReadCount - currentAlignmentColumn.alignedCount # Iterate the Reads at this position for pileupRead in pileupColumn.pileups: # If this read is a deletion if(pileupRead.is_del == 1): currentAlignmentColumn.delCount += 1 # else if this read is an insertion elif(pileupRead.indel > 0): #print ('INSERTION DETECTED, INDEL=' + str(pileupRead.indel)) currentAlignmentColumn.inCount += 1 # Else if it is a refskip (TODO What does this mean? no read aligned? Count these?) elif(pileupRead.is_refskip): print('This read is a refskip, i dont know what that means:' + pileupRead.alignment.query_name) raise Exception('This read is a refskip, i dont know what that means:' + pileupRead.alignment.query_name) # else this means we have a base aligned at this position for this read. 
else: currentBase = pileupRead.alignment.query_sequence[pileupRead.query_position].upper() #print('Reference,Current:' + referenceBase + ',' + currentBase) #print('Curr') if(currentBase == currentAlignmentColumn.referenceBase): currentAlignmentColumn.matchCount += 1 else: currentAlignmentColumn.mismatchCount += 1 # Count the nucleotide if (currentBase == 'A'): currentAlignmentColumn.aCount += 1 elif (currentBase == 'G'): currentAlignmentColumn.gCount += 1 elif (currentBase == 'C'): currentAlignmentColumn.cCount += 1 elif (currentBase == 'T'): currentAlignmentColumn.tCount += 1 else: print('Unknown Base found in Alignment at position ' + str(currentAlignmentColumn.referencePosition) + ':' + currentBase) raise Exception('Unknown Base in Alignment') # TODO: What if the query insertion sequence is longer than one base? # Maybe I can only adjust one base per iteration, is that okay? Probably for the Best, actually.. # Don't worry bout it for now. # Calculate highest frequency base # I hope this algorithm makes sense, probably there is a smarter way to do it. if(currentAlignmentColumn.aCount >= currentAlignmentColumn.gCount and currentAlignmentColumn.aCount >= currentAlignmentColumn.cCount and currentAlignmentColumn.aCount >= currentAlignmentColumn.tCount): mostFrequentBase = 'A' mostFrequentBaseCount = currentAlignmentColumn.aCount elif(currentAlignmentColumn.gCount >= currentAlignmentColumn.cCount and currentAlignmentColumn.gCount >= currentAlignmentColumn.tCount): mostFrequentBase = 'G' mostFrequentBaseCount = currentAlignmentColumn.gCount elif(currentAlignmentColumn.cCount >= currentAlignmentColumn.tCount): mostFrequentBase = 'C' mostFrequentBaseCount = currentAlignmentColumn.cCount else: mostFrequentBase = 'T' mostFrequentBaseCount = currentAlignmentColumn.tCount # Add the next base to the new consensus sequence if (currentAlignmentColumn.matchCount >= currentAlignmentColumn.mismatchCount and currentAlignmentColumn.matchCount >= currentAlignmentColumn.inCount and currentAlignmentColumn.matchCount >= currentAlignmentColumn.delCount): # Aligned bases match the reference, add reference base to the consensus. referenceAdjustment='-' newConsensusSequence += currentAlignmentColumn.referenceBase elif (currentAlignmentColumn.inCount >= currentAlignmentColumn.mismatchCount and currentAlignmentColumn.inCount >= currentAlignmentColumn.delCount): # Aligned bases show an insertion. # Add the Reference Base and the Insertion Base to the consensus. totalSequenceAdjustments += 1 referenceAdjustment='I' newConsensusSequence += currentAlignmentColumn.referenceBase + mostFrequentBase self.wranglerLog.write(str(currentAlignmentColumn.referencePosition) + ':Insertion' + '\n(' + str(currentAlignmentColumn.inCount) + '/' + str(currentAlignmentColumn.alignedCount) + ') = ' + str((100.0 * currentAlignmentColumn.inCount) / currentAlignmentColumn.alignedCount) + '% of aligned reads' '\n(' + currentAlignmentColumn.referenceBase + ' > ' + currentAlignmentColumn.referenceBase + mostFrequentBase + ')' + '\n') #TODO: I need to insert multiple bases, if that is waht the alignment suggests. elif (currentAlignmentColumn.delCount >= currentAlignmentColumn.mismatchCount): # Reads show a deletion. # Don't add anything to the consensus. 
totalSequenceAdjustments += 1 referenceAdjustment='D' self.wranglerLog.write(str(currentAlignmentColumn.referencePosition) + ':Deletion' + '\n(' + str(currentAlignmentColumn.delCount) + '/' + str(currentAlignmentColumn.alignedCount) + ') = ' + str((100.0 * currentAlignmentColumn.delCount) / currentAlignmentColumn.alignedCount) + '% of aligned reads' '\n(' + currentAlignmentColumn.referenceBase + ' > _)' + '\n') else: # Mismatch base. # Add the highest read count base to the reference. # It might actually be the same base as the reference, # Because this just means there are more mismatches than matches. # Problematic base, at least we'll notice here. # TODO: What to do with highly heterozygous Positions? # I should report those that look particularly heterozygous, somewhere. newConsensusSequence += mostFrequentBase totalSequenceAdjustments += 1 referenceAdjustment='M' self.wranglerLog.write(str(currentAlignmentColumn.referencePosition) + ':Mismatch' + '\n(' + str(mostFrequentBaseCount) + '/' + str(currentAlignmentColumn.alignedCount) + ') = ' + str((100.0 * mostFrequentBaseCount) / currentAlignmentColumn.alignedCount) + '% of aligned reads' '\n(' + currentAlignmentColumn.referenceBase + ' > ' + mostFrequentBase + ')' + '\n') # Write a line to the alignment Summary alignmentSummaryFile.write(str(currentAlignmentColumn.referencePosition) + ',' + str(currentAlignmentColumn.referenceBase) + ',' + str(referenceAdjustment) + ',' + str(currentAlignmentColumn.alignedCount) + ',' + str(currentAlignmentColumn.unalignedCount) + ',' + str(currentAlignmentColumn.matchCount) + ',' + str(currentAlignmentColumn.mismatchCount) + ',' + str(currentAlignmentColumn.inCount) + ',' + str(currentAlignmentColumn.delCount) + ',' + str(currentAlignmentColumn.aCount) + ',' + str(currentAlignmentColumn.gCount) + ',' + str(currentAlignmentColumn.cCount) + ',' + str(currentAlignmentColumn.tCount) + '\n') alignmentInfo.alignmentColumns.append(currentAlignmentColumn) print('\nTotal Sequence Adjustments:' + str(totalSequenceAdjustments) + ' (How many bases the consensus differs from the reference.)\n') # Write the newly constructed consensus sequence. currentConsensusSequenceFileName = join(alignmentOutputDirectory, 'Consensus.fasta') consensusWriter = createOutputFile(currentConsensusSequenceFileName) # TODO: How to i give this a better name? Can I find a gene guess or something? sequenceID = "Consensus_Sequence" write([SeqRecord(Seq(newConsensusSequence, IUPAC.unambiguous_dna), id=sequenceID, description="") ], consensusWriter, 'fasta') consensusWriter.close() self.wranglerLog.write('Total Sequence Adjustments:' + str(totalSequenceAdjustments) + '\n') # Close Summary Files alignmentSummaryFile.close() #adjustedBasesSummaryFile.close() return alignmentInfo
def sparse_count_reads_in_regions(bamfile, regions, storage, flank=0, log=None, template_length=1000, count_both_ends=False): """ This function obtains the counts per bins of equal size across the genome. The function automatically extracts the genome size from the bam file header. If group tags are available, they will be used to extract the indices from. Finally, the function autmatically detects whether the bam-file contains paired-end or single-end reads. Paired-end reads are counted once at the mid-point between the two pairs while single-end reads are counted at the 5' end. For paired-end reads it is optionally possible to count both read ends by setting count_both_ends=True. Parameters ---------- bamfile : str Path to a bamfile. The bamfile must be indexed. regions : str BED or GFF file containing the regions of interest. storage : str Path to the output hdf5 file, which contains the counts per chromsome. flank : int Extension of the regions in base pairs. Default: 0 template_length : int Assumed template length. This is used when counting paired-end reads at the mid-point and the individual reads do not overlap with the given region, but the mid-point does. count_both_ends : bool Indicates whether for paired-end sequences, the ends of both mates should be counted separately. Default: False. """ # Obtain the header information afile = AlignmentFile(bamfile, 'rb') # extract genome lengths if log is not None: f = open(log, 'w') fwrite = f.write else: fwrite = print fwrite('Make countmatrix from region\n') fwrite('bamfile: {}\n'.format(bamfile)) fwrite('bedfile: {}\n\n'.format(regions)) fwrite('get genomesize\n') # extract genome size genomesize = {} for chrom, length in zip(afile.references, afile.lengths): genomesize[chrom] = length fwrite('found {} chromosomes'.format(len(genomesize))) nreg = 0 regfile = BedTool(regions) nreg = len(regfile) fwrite('number of regions to collect counts from: {}'.format(nreg)) if 'RG' in afile.header: use_group = True else: use_group = False # get barcodes from header barcodes = {} if use_group: # extract barcodes for idx, item in enumerate(afile.header['RG']): barcodes[item['ID']] = idx else: barcodes['dummy'] = 0 fwrite('found {} barcodes'.format(len(barcodes))) # barcode string for final table barcode_string = ';'.join([item['ID'] for item in afile.header['RG']]) sdokmat = dok_matrix((nreg, len(barcodes)), dtype='int32') nbarcode_inregions = {key: 0 for key in barcodes} if count_both_ends: # if both ends are counted, template_length is irrelevant tlen = 0 else: tlen = template_length for idx, iv in enumerate(regfile): iv.start -= flank iv.end += flank if iv.chrom not in genomesize: # skip over peaks/ regions from chromosomes # that are not contained in the bam file continue fetchstart = max(iv.start - tlen, 0) fetchend = min(iv.end + tlen, genomesize[iv.chrom]) for aln in afile.fetch(iv.chrom, fetchstart, fetchend): if aln.is_proper_pair and aln.is_read1 and not count_both_ends: pos = min(aln.reference_start, aln.next_reference_start) # count paired end reads at midpoint midpoint = pos + abs(aln.template_length) // 2 if midpoint >= iv.start and midpoint < iv.end: sdokmat[ idx, barcodes[aln.get_tag('RG' ) if use_group else 'dummy']] += 1 nbarcode_inregions[ aln.get_tag('RG') if use_group else 'dummy'] += 1 if not aln.is_paired or count_both_ends: # count single-end reads at 5p end if not aln.is_reverse: if aln.reference_start >= iv.start and aln.reference_start < iv.end: sdokmat[ idx, barcodes[aln. 
get_tag('RG' ) if use_group else 'dummy']] += 1 else: if aln.reference_start + aln.reference_length - 1 >= iv.start and \ aln.reference_start + aln.reference_length - 1 < iv.end: sdokmat[ idx, barcodes[aln. get_tag('RG' ) if use_group else 'dummy']] += 1 nbarcode_inregions[ aln.get_tag('RG') if use_group else 'dummy'] += 1 afile.close() fwrite('sparse matrix shape: {}'.format(sdokmat.shape)) fwrite('density: {}'.format(sdokmat.nnz / np.prod(sdokmat.shape))) # store the results in COO sparse matrix format spcoo = sdokmat.tocoo() # sort lexicographically order_ = np.lexsort((spcoo.col, spcoo.row)) indices = np.asarray([x for x in zip(spcoo.row, spcoo.col)], dtype=np.int64)[order_] values = spcoo.data.astype(np.float32)[order_] cont = {'region': indices[:, 0], 'cell': indices[:, 1], 'count': values} df = pd.DataFrame(cont) with open(storage, 'w') as title: title.write('# ' + barcode_string + '\n') df.to_csv(storage, mode='a', sep='\t', header=True, index=False) #main output file names = [key for key in barcodes] counts = [nbarcode_inregions[key] for key in barcodes] df = pd.DataFrame({'barcodes': names, 'counts': counts}) df.to_csv(storage + '.counts', sep='\t', header=True, index=False) fwrite('total number of tags with barcodes: {}'.format(df.counts.sum())) if log is not None: f.close()
def extract_covariates(bam_path: str, reference_path: str, contig: str, start: int, end: int, start_fetch: int, end_fetch: int, filter_kwargs: dict, covariate_kwargs: dict): """ Count mismatches and matches for similar base-calls Returns: match_mismatch(dict) : dictionary ( covariate_key: [mismatches, matches], .. ) """ # known is a set() containing locations of known variation (snps) # @todo: extend to indels global known # <- Locations, set of (contig, position) tuples to ignore joined = dict() # Filters which select which reads are used to estimate covariates: min_mapping_quality = filter_kwargs.get('min_mapping_quality', 0) deduplicate = filter_kwargs.get('deduplicate', False) filter_qcfailed = filter_kwargs.get('filter_qcfailed', False) variant_blacklist_vcf_files = filter_kwargs.get( 'variant_blacklist_vcf_files', None) # Obtain all variants in the selected range: blacklist = set() if variant_blacklist_vcf_files is not None: for path in variant_blacklist_vcf_files: with pysam.VariantFile(path) as bf: for record in bf.fetch(contig, start_fetch, end_fetch): blacklist.add(record.pos) with AlignmentFile(bam_path) as alignments, FastaFile( reference_path) as fa: reference = CachedFasta(fa) # @todo: prefetch selected region for read in alignments.fetch(contig, start_fetch, end_fetch): if (deduplicate and read.is_duplicate) or \ (read.is_qcfail and filter_qcfailed) or \ (read.mapping_quality < min_mapping_quality): continue for qpos, refpos, refbase in read.get_aligned_pairs( matches_only=True, with_seq=True): if refpos > end or refpos < start: # Prevent the same location to be counted multiple times continue if refpos in blacklist: continue refbase = refbase.upper() if refbase == 'N' or (read.reference_name, refpos) in known: continue key = get_covariate_key(read, qpos, refpos, reference, refbase, **covariate_kwargs) if key is None: continue matched = (refbase == read.query_sequence[qpos]) try: joined[key][matched] += 1 except KeyError: if matched: joined[key] = array('l', [0, 1]) else: joined[key] = array('l', [1, 0]) return joined
def phaseHeterozygousReads(self): # TODO: Should this method accept a cluster count? # That will break some things. What things? # This method is only called from this file, in the summarizeAnalysis method. print('Splitting reads by heterozygous positions') # Get a list of reads for later. parsedReads = list(parse(self.readInput, self.readInputFormat)) heterozygousConsensusDirectory = join(self.outputRootDirectory,'HeterozygousAlignment') # Open the bam file print ('opening final alignment_bamfile') bamfile = AlignmentFile(join(heterozygousConsensusDirectory,'alignment.bam'), 'rb') # Load up the Alignment Reference file, we'll need it. alignmentReferenceFileName = join(heterozygousConsensusDirectory,'AlignmentReference.fasta') alignmentRef = list(parse(alignmentReferenceFileName, 'fasta'))[0] # get list of AlignedReads print ('Making a list of Aligned Reads.') readIDs = [] for read in parsedReads: if not read.id in readIDs: readIDs.append(read.id) readIDs.sort() # Heterozygous base list heterozygousBasesSummaryFile = createOutputFile(join(heterozygousConsensusDirectory, 'HeterozygousBases.txt')) heterozygousBasesSummaryFile.write('List of Heterozygous Bases (0-based):\n') if (self.snps is not None and len(self.snps) > 0): # A string of SNPs was passed in, I don't need to calculate them myself. # TODO: I could write alignment stats here, like I do when i self-calculate the hetero positions. # This is just a simple list of 0-based positions. for snp in self.snps: heterozygousBasesSummaryFile.write(str(snp) + '\n') else: # get list of Heterozygous Positions # TODO: I suppose I don't need to align 100% of reads to determine heterozygosity. # Maybe this would speed up if i use a smaller alignment, or stop the loop after X reads print('Getting a list of Heterozygous Positions:') self.snps = [] pileupIterator = bamfile.pileup(alignmentRef.id) for pileupColumn in pileupIterator: readCount = 0 matchCount = 0 mismatchCount = 0 insCount = 0 delCount = 0 # dictionary of base counts. referenceBase = alignmentRef.seq[pileupColumn.pos].upper() # Iterate the Reads at this position. Each read at each position is either: # ins, Del, match, mismatch. #TODO: is it possible to exclude secondary/supplemetnary in the pileups method? No. for pileupRead in pileupColumn.pileups: #TODO: Important. Filter secondary / supplementary reads. This is causing problems, these secondary reads are FULL of snps. # Difficulty: these parameters are on an aligned segment. alignedSegmentObject = pileupRead.alignment if(False): pass elif(alignedSegmentObject.is_secondary): #print ('Secondary read at Position ' + str(pileupColumn.pos)) pass elif(alignedSegmentObject.is_supplementary): #print ('Supplementary read at Position ' + str(pileupColumn.pos)) pass # Just trying some things, not sure what these mean. #elif (alignedSegmentObject.is_unmapped): # print('UNMAPPED READ!!!!!!!!!!!!!!!!!!! what does that mean?') #elif (alignedSegmentObject.is_qcfail): # print('This read was a QC failure. What does that mean?????????????') else: readCount += 1 # indels if(pileupRead.is_del == 1): delCount += 1 elif(pileupRead.indel > 0): insCount += 1 else: currentBase = pileupRead.alignment.query_sequence[pileupRead.query_position].upper() if(currentBase == referenceBase): matchCount += 1 else: mismatchCount += 1 # This is a cheap way to stop analysis early. I will only analyze the first 250 reads. # Potential problem: are these reads sorted somehow? 
Maybe my numbers are biased by only looking at the # first reads # Todo: This is another parameter that can be tuned. Add to inputs? Maybe. maxAnalyzedReadCounts = 1000 if(readCount > maxAnalyzedReadCounts): break matchProportion = (1.0 * matchCount / readCount) insertionProportion = (1.0 * insCount / readCount) deletionProportion = (1.0 * delCount / readCount) mismatchProportion = (1.0 * mismatchCount / readCount) #print ('Position ' + str(pileupColumn.pos) + ', Coverage ' + str(pileupColumn.n) + ', Match/Mismatch : ' + str(matchCount) + '/' + str(mismatchCount)) #print ('Match Percentage ' + str(matchProportion)) # TODO: Should accepted match proprtion be a commandline parameter? # if > 75% of bases match, this is not a heterzygous position baseCutoff = .70 if(matchProportion > baseCutoff or insertionProportion > baseCutoff or deletionProportion > baseCutoff): pass #print ('Position ' + str(pileupColumn.pos) + ', Coverage ' + str(pileupColumn.n) + ', Deletion/Insertion/Match/Mismatch : ' + str(delCount) + '/' + str(insCount) + '/' + str(matchCount) + '/' + str(mismatchCount)) #print ('This position does not look heterozygous.') # If coverage is very low, we should not use this position # This logic is flawed, i think this is never working. elif ((1.0 * pileupColumn.n / readCount) < .25): pass elif (mismatchProportion > baseCutoff): pass # These are the hardcoded values I used for the DRA analysis. Cheating. # # I want to write a condition where we don't use the position if it's not clearly polymorphic. # #elif (False): # # pass # # If the mismatch proportion is too high, what happens? What if there are 2 different bases that are mismatched, like if both my alleles have a different snp from reference. I'll miss that right now. # # # TEMP, this is very temporary. This is specific to a reference. # # TODO : Fix these hard coded values. # TODO: I don't really need this code, this is to ignore regions of my DRA reference. # Instead, I can pass in a list of 1-based polymorphic positions to sort based on those. A "whitelist" instead of a "blacklist" # # In a perfect world....I could tell what positions are heterozygous, but I can't. # # I can tell if this sequence is a homopolymer though, but looking at the bases around it.....But that's not the correct thing to do. # # I can keep this logic but make it a parameter. Big deletion regions are hard to analyze so I'm just ignoring them for now. # elif(5890 <= pileupColumn.pos <= 5970): # print('WARNING: I am skipping analysis on a region using hardcoded values, check this in allele_wrangler.') # pass # elif (6203 <= pileupColumn.pos <= 6212): # print('WARNING: I am skipping analysis on a region using hardcoded values, check this in allele_wrangler.') # pass # # Big String of A's # elif (774 <= pileupColumn.pos <= 796): # print('WARNING: I am skipping analysis on a region using hardcoded values, check this in allele_wrangler.') # pass # #Known homopolymer positions....this is terrible programming. # # I could at least pass these in ad ignored positions.... 
# elif (pileupColumn.pos in (403,430, 1479, 1510, 1683, # 1991, 1996, 1997, 2003, 2009, 2093, 2100, 2133, 2134, 2191, # 2262, 2289, 2294, 2342, 2449, 2450, 2524, 2647, 2663, 2732, # 2895, 2902, 3113, 3114, 3180, 3197, 3362, 3396, 3453, 3542, # 3551, 3665, 3832, 3903, 3953, 4108, 4109, 4400, 4639, 4698, # 4703, 4769, 4785, 4786, 4828, 4878, 5084, 5301, 5302, 5449, # 5575, 5597, 6155, 6279, 6280, 6314, 6375, 6376, 6712, 6755, # 6790, 7084, 7631, 7718, 7769, 7971, 7978, 8132, 8133, 8134, # 8314, 8315, 8352, 8476, 8477, 8478, 8642, 8650, 8651, 8652, # 8653, 8654, 8655, 8656, 8657, 8698, 8725, 8753, 8759 # )): # print('WARNING: I am skipping analysis on a region using hardcoded values, check this in allele_wrangler.') # pass else: #heterozygousBasesSummaryFile.write (str(pileupColumn.pos) + ', Coverage ' + str(pileupColumn.n) + ', Deletion/Insertion/Match/Mismatch : ' + str(delCount) + '/' + str(insCount) + '/' + str(matchCount) + '/' + str(mismatchCount) + '\n') heterozygousBasesSummaryFile.write(str(pileupColumn.pos) + ', Coverage ' + str( pileupColumn.n) + ', Deletion/Insertion/Match/Mismatch : ' + str(delCount) + '/' + str( insCount) + '/' + str(matchCount) + '/' + str(mismatchCount) + ' : ' + str(round(deletionProportion,2)) + '/' + str(round(insertionProportion, 2)) + '/' + str(round(matchProportion, 2)) + '/' + str(round(mismatchProportion, 2)) + '\n') self.snps.append(pileupColumn.pos) heterozygousBasesSummaryFile.close() #print ('Pileup Column # ' + str(pileupIterator)) print('Calculating read distance arrays:') # I'm making this distance array. In this array, a 0 represents a Match. a 1 represents indels or substitutions. # This way I can calculate "distance" in an arbitrary number of dimensions # Distance is a euclidian way to represent how far away a read is from the consensus, # based on the heterozygous positions. Each heterozygous position is a "dimension" in this space distanceArrays = {} for readID in readIDs: # TODO: A Bug! Initializing this list as 0s will bias the results. # TODO: Pileupcolumn loop is not hitting each read. Only...half sometimes. Some reads are not analyzed. # Why? SPOTTED IT! bamfile.pileup has a default to maximum read depth of 8000 #distanceArrays[readID] = list([999] * len(self.snps)) distanceArrays[readID] = list([0] * len(self.snps)) # I spotted the bug!!! pileup defaults to maximum 8000 read depth. That's bad!. pileupIterator = bamfile.pileup(alignmentRef.id,max_depth=99999999) #pileupIterator = bamfile.pileup(alignmentRef.id) for pileupColumn in pileupIterator: currentColumn = pileupColumn.pos # Only do this if the column number exists in our list of heterozygous positions if currentColumn in self.snps: heterozygousPositionIndex = self.snps.index(currentColumn) currentAnalyzedReadCount = 0 # A debugging variable, i dont think I actually use this count. referenceBase = alignmentRef.seq[currentColumn].upper() for pileupRead in pileupColumn.pileups: currentAnalyzedReadCount += 1 readID = pileupRead.alignment.query_name #print('Pos:' + str(currentColumn) + ', Refbase:' + str(referenceBase) + ', Read:' + str(readID)) # In this model, the distance is either 0 or 1. This was intentional but # Maybe we can tune the algorithm using these distances. # This could actually be tuned to do the heterozygous split using ONLY snps. # TODO: if we're having problems splitting based on homopolymers check this spot. # Maybe, I want to count indels as 0, no distance. # TODO: Something to try: indels are -1. SNPS are 1. Match = 0 # Maybe that would help the sorting? 
# TODO: Newest idea. Default to 0. 1 is match, -1 is indels. -1 is mismatches. I think that's it. if(pileupRead.is_del == 1): distanceArrays[readID][heterozygousPositionIndex] = -1 elif(pileupRead.indel > 0): distanceArrays[readID][heterozygousPositionIndex] = -1 else: currentBase = pileupRead.alignment.query_sequence[pileupRead.query_position].upper() if(currentBase == referenceBase): #print('Assinging Match. Column=' + str(currentColumn) + ', CurrentBase:' + str(currentBase) + ', HeterozygousPosIndex=' + str(heterozygousPositionIndex)) distanceArrays[readID][heterozygousPositionIndex] = 1 else: distanceArrays[readID][heterozygousPositionIndex] = -1 print('At position ' + str(heterozygousPositionIndex + 1) + ' I analyzed ' + str(currentAnalyzedReadCount) + ' reads.') self.printDistanceArrays(distanceArrays, join(self.heterozygousDirectory, 'DistanceArrays.csv')) # TODO: Im making 3 clusters. that worked. I need to make a parameter for cluster count. clusteredReadIDs = self.clusterReads(distanceArrays, 2) # Dictionary of results to return. Key is location of the consensus sequence. # Value is the # of reads represented in this consensus alignment. coverageResults = {} for zeroBasedClusterIndex, readCluster in enumerate(clusteredReadIDs): # I want to call the Strand (1 and 2), not Strand (0 and 1). clusterIndex = zeroBasedClusterIndex + 1 clusteredReadIDs = readCluster.keys() clusterOutputDir = join(self.outputRootDirectory, 'Strand' + str(clusterIndex) + 'ClusteredReads') distanceArrayFileName = join(clusterOutputDir, 'Strand' + str(clusterIndex) + 'DistanceArrays.csv') self.printDistanceArrays(readCluster, distanceArrayFileName) readOutputFileName = join(clusterOutputDir, 'Strand' + str(clusterIndex) + 'Reads.' + self.readInputFormat) readOutputFile = createOutputFile(readOutputFileName) # Loop parsed reads, grab reads belonging to this cluster. # FYI it looks like each input is clustered in the output, i haven't found a missing read yet. I should still check. for readObject in parsedReads: #print ('ReadClusterIndex=' + str(zeroBasedClusterIndex)) #print ('AllReadID=' + str(readObject.id)) for clusteredReadID in clusteredReadIDs: #print ('clusteredReadID=' + str(clusteredReadID)) if (readObject.id == clusteredReadID): write([readObject], readOutputFile, self.readInputFormat) break readOutputFile.close() currentWranglerObject = AlleleWrangler( readOutputFileName , join(self.outputRootDirectory, 'Strand' + str(clusterIndex) + 'Alignment') , join(self.heterozygousDirectory, 'AlignmentReference.fasta') , 6 , self.numberThreads , False , self.snps) currentCoverageResults = currentWranglerObject.analyzeReads() # Merge the dictionaries of coverage values ane return them. for key in currentCoverageResults.keys(): coverageResults[key] = currentCoverageResults[key] print ('Done Phasing Reads.') return coverageResults
#! /usr/bin/env python
import sys
import re
from pysam import AlignmentFile
import pandas as pd

# program to identify reads containing leader sequence and TRS sequences
# from bamfiles aligned to the MHV genome

bamfile = AlignmentFile(sys.argv[1], "rb")
pos = dict()
for read in bamfile.fetch("MHVA59"):
    leader = "TTTAAATCTAA"
    if re.search(leader, read.seq):
        CIGAR = read.cigartuples
        # group CIGAR operation lengths by operation type
        d = dict()
        for t in CIGAR:
            if t[0] in d:
                d[t[0]].append(t[1])
            else:
                d[t[0]] = [t[1]]
        key = read.query_name
def build_ref_clusters(data, idx, iregion): """ Given a chunk of regions this will pull in the reference for each region and then pull in all consens reads matching to that region. It uses cigar info to align the consens reads with the ref. This also merges consens from the same sample that were not merged earlier, which is why we expect no duplicate samples in the output of reference assemblies. """ # prepare i/o for bamfile with mapped reads bamfile = AlignmentFile( os.path.join(data.dirs.across, "{}.cat.sorted.bam".format(data.name)), 'rb') # dict to map chromosome names to integers faidict = chroms2ints(data, False) # prepare i/o for pysam reference indexed reffai = FastaFile(data.params.reference_sequence) # store path to cluster bit outbit = os.path.join(data.tmpdir, "aligned_{}.fa".format(idx)) # get clusters iregions = iter(iregion) clusts = [] while 1: # pull in all consens reads mapping to a bed region try: region = next(iregions) reads = bamfile.fetch(*region) except StopIteration: break # build a dict to reference seqs and cigars by name mstart = 9e12 mend = 0 rdict = {} for read in reads: rstart = read.reference_start rend = rstart + read.qlen mstart = min(mstart, rstart) mend = max(mend, rend) rdict[read.qname] = (read.seq, read.cigar, rstart, rend) keys = sorted(rdict.keys(), key=lambda x: x.rsplit(":", 2)[0]) # pull in the reference for this region (1-indexed) refs = reffai.fetch(region[0], mstart + 1, mend + 1) # make empty array rlen = mend - mstart arr = np.zeros((len(keys) + 1, rlen), dtype=bytes) arr[0] = list(refs.upper()) # fill arr with remaining samples for idx, key in enumerate(keys): seq, cigar, start, end = rdict[key] # how far ahead of ref start and short of ref end is this read fidx = start - mstart eidx = arr.shape[1] - (mend - end) # enter into the array, trim end if longer than pulled ref arr[idx + 1, fidx:eidx] = list(seq)[:eidx - fidx] # mod sequence according to cigar for indels and ambigs # csums is the location of impute on the seq, so it must be # incremented by fidx and not extend past eidx for cidx, cig in enumerate(cigar): if cig[0] == 4: csums = sum(i[1] for i in cigar[:cidx]) csums += eidx if csums < fidx: arr[idx + 1, csums] = arr[idx + 1, csums].lower() if cig[0] == 1: csums = sum(i[1] for i in cigar[:cidx]) csums += eidx if csums < fidx: arr[idx + 1, csums] = b"-" # fill terminal edges with N arr[arr == b""] = b"N" # duplicates merge here (only perfect merge on all Ns) and reshape # the array to match. This will need to be resolved in catgs... # if it does not merge then try: keys, arr = resolve_duplicates(keys, arr) except IPyradError: pass # get consens seq and variant site index clust = [ ">reference_{}:{}:{}-{}\n{}".format( 0, faidict[region[0]] + 1, mstart + 1, mend + 1, # 1-indexed b"".join(arr[0]).decode()) ] for idx, key in enumerate(keys): clust.append(">{}\n{}".format(key, b"".join(arr[idx + 1]).decode())) clusts.append("\n".join(clust)) # dump to temp file until concat in next step. with open(outbit, 'w') as outfile: if clusts: outfile.write("\n//\n//\n".join(clusts) + "\n//\n//\n")
def open_bam_file(file_name):
    try:
        return AlignmentFile(file_name, 'rb')
    except ValueError:
        return open(file_name)
def __init__(self, bam_file):
    from pysam import AlignmentFile
    bam = AlignmentFile(bam_file)
    self.bam_header = bam.header
class Writer(Thread): def __init__(self, mode, iterator, aligner, fd=sys.stdout, duplex=False, ref_fn=None, groups=None, group_key=None): super().__init__() self.fd = fd self.log = [] self.mode = mode self.duplex = duplex self.aligner = aligner self.iterator = iterator self.fastq = mode == 'wfq' self.group_key = group_key self.output = AlignmentFile( fd, 'w' if self.fastq else self.mode, add_sam_header=not self.fastq, reference_filename=ref_fn, header=AlignmentHeader.from_references( reference_names=aligner.seq_names if aligner else [], reference_lengths=[ len(aligner.seq(name)) for name in aligner.seq_names ] if aligner else [], text=sam_header(groups), ) ) def run(self): with CSVLogger(summary_file(), sep='\t') as summary: for read, res in self.iterator: seq = res['sequence'] qstring = res.get('qstring', '*') mean_qscore = res.get('mean_qscore', mean_qscore_from_qstring(qstring)) mapping = res.get('mapping', False) mods_tags = res.get('mods', []) if self.duplex: samples = len(read[0].signal) + len(read[1].signal) read_id = '%s;%s' % (read[0].read_id, read[1].read_id) else: samples = len(read.signal) read_id = read.read_id tags = [ f'RG:Z:{read.run_id}_{self.group_key}', f'qs:i:{round(mean_qscore)}', *read.tagdata(), *mods_tags, ] if len(seq): if self.mode == 'wfq': write_fastq(read_id, seq, qstring, fd=self.fd, tags=tags) else: self.output.write( AlignedSegment.fromstring( sam_record(read_id, seq, qstring, mapping, tags=tags), self.output.header ) ) if self.duplex: summary.append(duplex_summary_row(read[0], read[1], len(seq), mean_qscore, alignment=mapping)) else: summary.append(summary_row(read, len(seq), mean_qscore, alignment=mapping)) self.log.append((read_id, samples)) else: logger.warn("> skipping empty sequence %s", read_id)
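The SAM-to-stdout pattern above builds on pysam's AlignmentHeader and AlignedSegment.fromstring. A stripped-down sketch of that pattern, with a made-up reference name/length and SAM record string:

from pysam import AlignmentFile, AlignmentHeader, AlignedSegment

# Build a header from reference names and lengths (values are illustrative).
header = AlignmentHeader.from_references(
    reference_names=["chr1"], reference_lengths=[248956422]
)
out = AlignmentFile("-", "w", header=header)  # plain SAM on stdout
# Parse a SAM-formatted line into an AlignedSegment bound to this header.
record = "read1\t0\tchr1\t100\t60\t4M\t*\t0\t0\tACGT\tFFFF"
out.write(AlignedSegment.fromstring(record, out.header))
out.close()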
def run_process(opts, mutect2_vcf, mutect2_bam): outputvcf = opts.output # Open VCF, BAM m2vcf = VariantFile(mutect2_vcf) m2bam = AlignmentFile(mutect2_bam, 'rb') old_chrom = '' old_pos = -1 old_ref = '' old_alts = () variants_list = list() # Get Splited Variants for record in m2vcf.fetch(): chrom = record.chrom pos = record.pos ref = record.ref alts = record.alts if chrom == old_chrom and pos == old_pos + 1 and len(old_ref) == 1 and len(ref) == 1 and len(alts) == 1: tmp_dict = { "chrom" : chrom, "start_pos" : old_pos, "end_pos" : pos, "ref" : old_ref + ref, "alt" : old_alts[0] + alts[0] } variants_list.append(tmp_dict) old_chrom = chrom old_pos = pos old_ref = ref old_alts = alts # Get Read Information for v in variants_list: reads = m2bam.fetch(v["chrom"], v["start_pos"] - 1, v["end_pos"]) ref_read_cnt = 0 alt_read_cnt = 0 alt_first_cnt = 0 alt_second_cnt = 0 f1r2_ref_cnt = 0 f2r1_ref_cnt = 0 f1r2_alt_cnt = 0 f2r1_alt_cnt = 0 dp = 0 for read in reads: if not read.is_secondary and not read.is_supplementary and not read.is_unmapped and not read.is_duplicate: query_position_list = read.get_reference_positions() try: q_start_index = query_position_list.index(v["start_pos"]-1) q_end_index = query_position_list.index(v["end_pos"]-1) query_seq = read.query_sequence[q_start_index] + read.query_sequence[q_end_index] if query_seq == v["ref"]: ref_read_cnt += 1 if read.is_read1: f1r2_ref_cnt += 1 elif read.is_read2: f2r1_ref_cnt += 1 elif query_seq == v["alt"]: alt_read_cnt += 1 if read.is_read1: f1r2_alt_cnt += 1 elif read.is_read2: f2r1_alt_cnt += 1 elif query_seq[0] != v["ref"][0] and query_seq[1] == v["ref"][1]: alt_first_cnt += 1 elif query_seq[0] == v["ref"][0] and query_seq[1] != v["ref"][1]: alt_second_cnt += 1 dp += 1 except: continue v["ref_cnt"] = ref_read_cnt v["alt_cnt"] = alt_read_cnt v["alt_first_cnt"] = alt_first_cnt v["alt_second_cnt"] = alt_second_cnt v["f1r2"] = (f1r2_ref_cnt, f1r2_alt_cnt) v["f2r1"] = (f2r1_ref_cnt, f2r1_alt_cnt) v["dp"] = dp # Re-index True:False m2vcf_index = 0 m2vcf_flag = list() second_flag = True for record in m2vcf.fetch(): chrom = record.chrom pos = record.pos if second_flag == True: m2vcf_flag.append(True) else: m2vcf_flag.append(False) second_flag = True for v in variants_list: if v["chrom"] == chrom and v["start_pos"] == pos and v["alt_cnt"] != 0: if v["alt_first_cnt"] == 0: m2vcf_flag[m2vcf_index] = False if v["alt_second_cnt"] == 0: second_flag = False m2vcf_index += 1 # Write Recrod & VCF new_header = m2vcf.header new_header.formats.add("MDV", "1", "Integer", "Merged Di-Allelic Variant : Backed Phased variant that was splited snp before") vcf_out = VariantFile(outputvcf if outputvcf else '-','w',header=new_header) m2vcf_index = 0 for record in m2vcf.fetch(): chrom = record.chrom pos = record.pos if m2vcf_flag[m2vcf_index] == True: vcf_out.write(record) for v in variants_list: if v["chrom"] == chrom and v["start_pos"] == pos and v["alt_cnt"] != 0: record2 = vcf_out.new_record() record2.chrom = v["chrom"] record2.pos = v["start_pos"] record2.ref = v["ref"] record2.alts = (v["alt"],) record2.info["DP"] = v["dp"] if "F1R2" in record2.samples[0]: record2.samples[0]["F1R2"] = v["f1r2"] record2.samples[0]["F2R1"] = v["f2r1"] record2.samples[0]["AD"] = (v["ref_cnt"], v["alt_cnt"]) record2.samples[0]["DP"] = v["dp"] record2.samples[0]["AF"] = float(v["alt_cnt"]) / float(v["dp"]) record2.samples[0]["GT"] = ("0", "0") record2.samples[0]["MDV"] = True vcf_out.write(record2) else: continue m2vcf_index += 1
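The per-read base lookup above indexes query_sequence by the position's index within get_reference_positions(), which only lines up when no soft-clips or insertions precede the site. A more direct lookup via get_aligned_pairs() could look like this (the helper name is ours):

def base_at(read, ref_pos):
    # Return the query base aligned to reference position ref_pos (0-based),
    # or None if that position is not covered by a matched column.
    for qpos, rpos in read.get_aligned_pairs(matches_only=True):
        if rpos == ref_pos:
            return read.query_sequence[qpos]
    return None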
class CTCWriter(Thread): """ CTC writer process that writes output numpy training data. """ def __init__( self, mode, iterator, aligner, fd=sys.stdout, min_coverage=0.90, min_accuracy=0.99, ref_fn=None, groups=None ): super().__init__() self.fd = fd self.log = [] self.mode = mode self.aligner = aligner self.iterator = iterator self.min_coverage = min_coverage self.min_accuracy = min_accuracy self.output = AlignmentFile( fd, 'w' if self.mode == 'wfq' else self.mode, add_sam_header=self.mode != 'wfq', reference_filename=ref_fn, header=AlignmentHeader.from_references( reference_names=aligner.seq_names, reference_lengths=[len(aligner.seq(name)) for name in aligner.seq_names], text=sam_header(groups), ) ) def run(self): chunks = [] targets = [] lengths = [] with CSVLogger(summary_file(), sep='\t') as summary: for read, ctc_data in self.iterator: seq = ctc_data['sequence'] qstring = ctc_data['qstring'] mean_qscore = ctc_data.get('mean_qscore', mean_qscore_from_qstring(qstring)) mapping = ctc_data.get('mapping', False) self.log.append((read.read_id, len(read.signal))) if len(seq) == 0 or mapping is None: continue cov = (mapping.q_en - mapping.q_st) / len(seq) acc = mapping.mlen / mapping.blen refseq = self.aligner.seq(mapping.ctg, mapping.r_st, mapping.r_en) if acc < self.min_accuracy or cov < self.min_coverage or 'N' in refseq: continue self.output.write( AlignedSegment.fromstring( sam_record(read.read_id, seq, qstring, mapping), self.output.header ) ) summary.append(summary_row(read, len(seq), mean_qscore, alignment=mapping)) if mapping.strand == -1: refseq = mappy.revcomp(refseq) target = [int(x) for x in refseq.translate({65: '1', 67: '2', 71: '3', 84: '4'})] targets.append(target) chunks.append(read.signal) lengths.append(len(target)) if len(chunks) == 0: sys.stderr.write("> no suitable ctc data to write\n") return chunks = np.array(chunks, dtype=np.float16) targets_ = np.zeros((chunks.shape[0], max(lengths)), dtype=np.uint8) for idx, target in enumerate(targets): targets_[idx, :len(target)] = target lengths = np.array(lengths, dtype=np.uint16) indices = np.random.permutation(typical_indices(lengths)) chunks = chunks[indices] targets_ = targets_[indices] lengths = lengths[indices] summary = pd.read_csv(summary_file(), sep='\t') summary.iloc[indices].to_csv(summary_file(), sep='\t', index=False) output_directory = '.' if sys.stdout.isatty() else dirname(realpath('/dev/fd/1')) np.save(os.path.join(output_directory, "chunks.npy"), chunks) np.save(os.path.join(output_directory, "references.npy"), targets_) np.save(os.path.join(output_directory, "reference_lengths.npy"), lengths) sys.stderr.write("> written ctc training data\n") sys.stderr.write(" - chunks.npy with shape (%s)\n" % ','.join(map(str, chunks.shape))) sys.stderr.write(" - references.npy with shape (%s)\n" % ','.join(map(str, targets_.shape))) sys.stderr.write(" - reference_lengths.npy shape (%s)\n" % ','.join(map(str, lengths.shape))) def stop(self): self.join()
def __init__(self, bam_file):
    bam = AlignmentFile(bam_file)
    self.bam_header = bam.header
def open_bamfile(sam):
    from pysam import AlignmentFile
    sam_mode = 'r' if sam.endswith(".sam") else 'rb'
    return AlignmentFile(sam, mode=sam_mode)
import sys
from pysam import AlignmentFile
from argparse import ArgumentParser

valid_spliced_reads = 0
problem_reads = 0

parser = ArgumentParser()
parser.add_argument('infile', nargs='?', default='-')
parser.add_argument('outfile', nargs='?', default='-')
args = parser.parse_args()

infile = AlignmentFile(args.infile, 'r')
outfile = AlignmentFile(args.outfile, 'wh', template=infile)

for read in infile:
    splice_len = 0
    min_edge = 1e6
    if read.mapping_quality < 10:
        continue
    if read.cigartuples is None:  # unmapped reads carry no CIGAR
        problem_reads += 1
        continue
    for cig_op, cig_len in read.cigartuples:
        if cig_op == 3:  # N: spliced/skipped region
            splice_len += cig_len
        elif cig_op == 0:  # M: aligned block
            min_edge = min(min_edge, cig_len)
    if splice_len > 50 and min_edge >= 6:
        outfile.write(read)
        valid_spliced_reads += 1
        if valid_spliced_reads % 100000 == 0:
            sys.stderr.write("%d valid, %d problematic spliced reads\n"
                             % (valid_spliced_reads, problem_reads))

sys.stderr.write("%d valid, %d problematic spliced reads\n"
                 % (valid_spliced_reads, problem_reads))
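For reference, a small helper that mirrors the CIGAR bookkeeping in the filter above (operation code 0 = M/aligned block, 3 = N/skipped region); the function name is ours:

def splice_stats(cigartuples):
    # Return (total skipped-region length, shortest aligned block length).
    splice_len = 0
    min_edge = float("inf")
    for op, length in cigartuples or []:
        if op == 3:        # N: intron / skipped region
            splice_len += length
        elif op == 0:      # M: aligned block
            min_edge = min(min_edge, length)
    return splice_len, min_edge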
class _BamReaderBase(ReaderBase): """ The BamReader class provides a high-level interface to PacBio BAM files. If a PacBio BAM index (bam.pbi file) is present and the user instantiates the BamReader using the reference FASTA as the second argument, the BamReader will provide an interface compatible with CmpH5Reader. """ def _loadReferenceInfo(self): refRecords = self.peer.header["SQ"] refNames = [r["SN"] for r in refRecords] refLengths = [r["LN"] for r in refRecords] refMD5s = [r["M5"] for r in refRecords] refIds = map(self.peer.get_tid, refNames) nRefs = len(refRecords) if nRefs > 0: self._referenceInfoTable = np.rec.fromrecords(zip( refIds, refIds, refNames, refNames, refLengths, refMD5s, np.zeros(nRefs, dtype=np.uint32), np.zeros(nRefs, dtype=np.uint32)), dtype=[('ID', '<i8'), ('RefInfoID', '<i8'), ('Name', 'O'), ('FullName', 'O'), ('Length', '<i8'), ('MD5', 'O'), ('StartRow', '<u4'), ('EndRow', '<u4')]) self._referenceDict = {} self._referenceDict.update(zip(refIds, self._referenceInfoTable)) self._referenceDict.update(zip(refNames, self._referenceInfoTable)) else: self._referenceInfoTable = None self._referenceDict = None def _loadReadGroupInfo(self): rgs = self.peer.header["RG"] readGroupTable_ = [] self._featureNameMappings = {} # RGID -> ("abstract feature name" -> actual feature name) for rg in rgs: rgID = rgAsInt(rg["ID"]) rgName = rg["PU"] ds = dict([pair.split("=") for pair in rg["DS"].split(";") if pair != ""]) # spec: we only consider first two components of basecaller version # in "chem" lookup basecallerVersion = ".".join(ds["BASECALLERVERSION"].split(".")[0:2]) triple = ds["BINDINGKIT"], ds["SEQUENCINGKIT"], basecallerVersion rgChem = decodeTriple(*triple) rgReadType = ds["READTYPE"] rgFrameRate = ds["FRAMERATEHZ"] readGroupTable_.append((rgID, rgName, rgReadType, rgChem, rgFrameRate)) # Look for the features manifest entries within the DS tag, # and build an "indirection layer", i.e. to get from # "Ipd" to "Ipd:Frames" # (This is a bit messy. Can we separate the manifest from # the rest of the DS content?) featureNameMapping = { key.split(":")[0] : key for key in ds.keys() if key in PULSE_FEATURE_TAGS } self._featureNameMappings[rgID] = featureNameMapping self._readGroupTable = np.rec.fromrecords( readGroupTable_, dtype=[("ID" , np.int32), ("MovieName" , "O"), ("ReadType" , "O"), ("SequencingChemistry", "O"), ("FrameRate", float)]) assert len(set(self._readGroupTable.ID)) == len(self._readGroupTable), \ "First 8 chars of read group IDs must be unique!" self._readGroupDict = { rg.ID : rg for rg in self._readGroupTable } # The pulse features "available" to clients of this file are the intersection # of pulse features available from each read group. self._pulseFeaturesAvailable = set.intersection( *[set(mapping.keys()) for mapping in self._featureNameMappings.values()]) def _loadProgramInfo(self): pgRecords = [ (pg["ID"], pg.get("VN", None), pg.get("CL", None)) for pg in self.peer.header.get("PG", []) ] if len(pgRecords) > 0: self._programTable = np.rec.fromrecords( pgRecords, dtype=[("ID" , "O"), ("Version", "O"), ("CommandLine", "O")]) else: self._programTable = None def _loadReferenceFasta(self, referenceFastaFname): ft = FastaTable(referenceFastaFname) # Verify that this FASTA is in agreement with the BAM's # reference table---BAM should be a subset. 
fastaIdsAndLens = set((c.id, len(c)) for c in ft) bamIdsAndLens = set((c.Name, c.Length) for c in self.referenceInfoTable) if not bamIdsAndLens.issubset(fastaIdsAndLens): raise ReferenceMismatch, "FASTA file must contain superset of reference contigs in BAM" self.referenceFasta = ft def _checkFileCompatibility(self): # Verify that this is a "pacbio" BAM file of version at least # 3.0.1 try: checkedVersion = self.version if "b" in checkedVersion: raise Exception() else: major, minor, patch = checkedVersion.split('.') assert major >= 3 assert minor >= 0 assert patch >= 1 except: raise IncompatibleFile( "This BAM file is incompatible with this API " + "(only PacBio BAM files version >= 3.0.1 are supported)") def __init__(self, fname, referenceFastaFname=None): self.filename = fname = abspath(expanduser(fname)) self.peer = AlignmentFile(fname, "rb", check_sq=False) self._checkFileCompatibility() self._loadReferenceInfo() self._loadReadGroupInfo() self._loadProgramInfo() self.referenceFasta = None if referenceFastaFname is not None: if self.isUnmapped: raise ValueError, "Unmapped BAM file--reference FASTA should not be given as argument to BamReader" self._loadReferenceFasta(referenceFastaFname) @property def isIndexLoaded(self): return self.index is not None @property def isReferenceLoaded(self): return self.referenceFasta is not None @property def isUnmapped(self): return not(self.isMapped) @property def isMapped(self): return len(self.peer.header["SQ"]) > 0 @property def alignmentIndex(self): raise UnavailableFeature("BAM has no alignment index") @property def movieNames(self): return set([mi.MovieName for mi in self.readGroupTable]) @property def readGroupTable(self): return self._readGroupTable def readGroupInfo(self, readGroupId): return self._readGroupDict[readGroupId] @property def sequencingChemistry(self): """ List of the sequencing chemistries by movie. Order is unspecified. """ return list(self.readGroupTable.SequencingChemistry) @property def referenceInfoTable(self): return self._referenceInfoTable #TODO: standard? how about subread instead? why capitalize ccs? # can we standardize this? is cDNA an additional possibility @property def readType(self): """ Either "standard", "CCS", "mixed", or "unknown", to represent the type of PacBio reads aligned in this BAM file. 
""" readTypes = self.readGroupTable.ReadType if all(readTypes == "SUBREAD"): return "standard" elif all(readTypes == "CCS"): return "CCS" elif all((readTypes == "CCS") | (readTypes == "SUBREAD")): return "mixed" else: return "unknown" @property def version(self): return self.peer.header["HD"]["pb"] def versionAtLeast(self, minimalVersion): raise Unimplemented() def softwareVersion(self, programName): raise Unimplemented() @property def isSorted(self): return self.peer.header["HD"]["SO"] == "coordinate" @property def isBarcoded(self): raise Unimplemented() @property def isEmpty(self): return (len(self) == 0) def referenceInfo(self, key): return self._referenceDict[key] def atOffset(self, offset): self.peer.seek(offset) return BamAlignment(self, next(self.peer)) def hasPulseFeature(self, featureName): return featureName in self._pulseFeaturesAvailable def pulseFeaturesAvailable(self): return self._pulseFeaturesAvailable @property def barcode(self): raise Unimplemented() @property def barcodeName(self): raise Unimplemented() @property def barcodes(self): raise Unimplemented() @requiresBai def __len__(self): return self.peer.mapped + self.peer.unmapped def close(self): if hasattr(self, "file") and self.file is not None: self.file.close() self.file = None def __enter__(self): return self def __exit__(self, exc_type, exc_value, traceback): self.close()
def write_matrix(inbam, resolution, biases, outdir, filter_exclude=(1, 2, 3, 4, 6, 7, 8, 9, 10), normalizations=('decay', ), region1=None, start1=None, end1=None, clean=True, region2=None, start2=None, end2=None, extra='', half_matrix=True, nchunks=100, tmpdir='.', append_to_tar=None, ncpus=8, cooler=False, row_names=False, chr_order=None, verbose=True): """ Writes matrix file from a BAM file containing interacting reads. The matrix will be extracted from the genomic BAM, the genomic coordinates of this matrix will be at the intersection of two regions defined byt the parameters region1, start1, end1 and region2, start2, end2. If the wanted matrix is than the second coodinate can be skipped. :param inbam: path to BAM file (generated byt TADbit) :param resolution: resolution at which we want to write the matrix :param biases: path to a file with biases :param outdir: path to a folder where to write output files :param (1, 2, 3, 4, 6, 7, 8, 9, 10) filter exclude: filters to define the set of valid pair of reads. :param ('decay',) normalization: tuple with normalizations to use, can be 'decay', 'norm' or/and 'raw'. One file per normalization will be created. :param None region1: chromosome name of the first region from which to extract the matrix :param None region1: chromosome name of the first region from which to extract the matrix :param None start1: start coordinate of the first region from which to extract the matrix :param None end1: end coordinate of the first region from which to extract the matrix :param None region2: chromosome name of the second region from which to extract the matrix :param None start2: start coordinate of the second region from which to extract the matrix :param None end2: end coordinate of the second region from which to extract the matrix :param True half_matrix: writes only half of the matrix (and the diagonal) :param '.' tmpdir: where to write temporary files :param None append_to_tar: path to a TAR file were generated matrices will be written directly :param 8 ncpus: number of cpus to use to read the BAM file :param True verbose: speak :param False row_names: Writes geneomic coocrdinates instead of bins. 
WARNING: results in two extra columns :param None chr_order: chromosome order :param 100 nchunks: maximum number of chunks into which to cut the BAM :returns: path to output files """ if start1 is not None and end1: if end1 - start1 < resolution: raise Exception( 'ERROR: region1 should be at least as big as resolution') if start2 is not None and end2: if end2 - start2 < resolution: raise Exception( 'ERROR: region2 should be at least as big as resolution') if isinstance(normalizations, list): normalizations = tuple(normalizations) elif isinstance(normalizations, basestring): normalizations = tuple([normalizations]) if not isinstance(filter_exclude, int): filter_exclude = filters_to_bin(filter_exclude) regions, rand_hash, bin_coords, chunks = read_bam(inbam, filter_exclude, resolution, ncpus=ncpus, region1=region1, start1=start1, end1=end1, region2=region2, start2=start2, end2=end2, tmpdir=tmpdir, nchunks=nchunks, chr_order=chr_order, verbose=verbose) if region1: regions = [region1] if region2: regions.append(region2) bamfile = AlignmentFile(inbam, 'rb') sections = OrderedDict( list(zip(bamfile.references, [x for x in bamfile.lengths]))) start_bin1, end_bin1, start_bin2, end_bin2 = bin_coords section_pos1 = {} section_pos2 = {} totals = OrderedDict() total_num = 0 for c in sections: totals[c] = total_num total_num += sections[c] / resolution + 1 if row_names: if len(regions) in [1, 2]: offset = start_bin1 - totals[regions[0]] section_pos1 = dict((i, (region1, resolution * (i + offset))) for i in range(end_bin1 - start_bin1)) if region2: offset = start_bin2 - totals[regions[1]] section_pos2 = dict((i, (region2, resolution * (i + offset))) for i in range(end_bin2 - start_bin2)) else: section_pos1 = dict((v + i, (c, i)) for c, v in totals.iteritems() for i in range(sections[c] // resolution + 1)) section_pos2 = section_pos1 if biases: bias1, bias2, decay, bads1, bads2 = get_biases_region( biases, bin_coords) elif normalizations != ('raw', ): raise Exception( 'ERROR: should provide path to file with biases (pickle).') else: bads1 = bads2 = {} if verbose: printime(' - Writing matrices') # define output file name name = _generate_name(regions, (start1, start2), (end1, end2), resolution, chr_order) # prepare file header outfiles = [] if cooler: if 'h5py' not in modules: raise Exception( 'ERROR: cooler output is not available. Probably ' + 'you need to install h5py\n') if 'decay' in normalizations or 'raw&decay' in normalizations: raise Exception( 'ERROR: decay and raw&decay matrices cannot be exported ' 'to cooler format. 
Cooler only accepts weights per column/row') fnam = 'raw_%s_%s%s.mcool' % (name, nicer(resolution).replace(' ', ''), ('_' + extra) if extra else '') if os.path.exists(os.path.join(outdir, fnam)): os.remove(os.path.join(outdir, fnam)) out_raw = cooler_file(os.path.join(outdir, fnam), resolution, sections, regions) out_raw.create_bins() out_raw.prepare_matrix(start_bin1, start_bin2) outfiles.append((os.path.join(outdir, fnam), fnam)) else: if 'raw' in normalizations: fnam = 'raw_%s_%s%s.abc' % (name, nicer(resolution).replace( ' ', ''), ('_' + extra) if extra else '') if append_to_tar: out_raw = StringIO() outfiles.append((out_raw, fnam)) else: out_raw = open(os.path.join(outdir, fnam), 'w') outfiles.append((os.path.join(outdir, fnam), fnam)) for reg in regions: out_raw.write('# CRM %s\t%d\n' % (reg, sections[reg])) out_raw.write('# %s resolution:%d\n' % (name, resolution)) if region2: out_raw.write('# BADROWS %s\n' % (','.join([str(b) for b in bads1]))) out_raw.write('# BADCOLS %s\n' % (','.join([str(b) for b in bads2]))) else: out_raw.write('# MASKED %s\n' % (','.join([str(b) for b in bads1]))) # write file header if 'norm' in normalizations: fnam = 'nrm_%s_%s%s.abc' % (name, nicer(resolution).replace( ' ', ''), ('_' + extra) if extra else '') if append_to_tar: out_nrm = StringIO() outfiles.append((out_nrm, fnam)) else: out_nrm = open(os.path.join(outdir, fnam), 'w') outfiles.append((os.path.join(outdir, fnam), fnam)) for reg in regions: out_nrm.write('# CRM %s\t%d\n' % (reg, sections[reg])) out_nrm.write('# %s resolution:%d\n' % (name, resolution)) if region2: out_nrm.write('# BADROWS %s\n' % (','.join([str(b) for b in bads1]))) out_nrm.write('# BADCOLS %s\n' % (','.join([str(b) for b in bads2]))) else: out_nrm.write('# MASKED %s\n' % (','.join([str(b) for b in bads1]))) if 'decay' in normalizations or 'raw&decay' in normalizations: fnam = 'dec_%s_%s%s.abc' % (name, nicer(resolution).replace( ' ', ''), ('_' + extra) if extra else '') if append_to_tar: out_dec = StringIO() outfiles.append((out_dec, fnam)) else: out_dec = open(os.path.join(outdir, fnam), 'w') outfiles.append((os.path.join(outdir, fnam), fnam)) for reg in regions: out_dec.write('# CRM %s\t%d\n' % (reg, sections[reg])) out_dec.write('# %s resolution:%d\n' % (name, resolution)) if region2: out_dec.write('# BADROWS %s\n' % (','.join([str(b) for b in bads1]))) out_dec.write('# BADCOLS %s\n' % (','.join([str(b) for b in bads2]))) else: out_dec.write('# MASKED %s\n' % (','.join([str(b) for b in bads1]))) # functions to write lines of pairwise interactions def write_raw(func=None): def writer2(c, a, b, v): func(c, a, b, v) out_raw.write('{}\t{}\n'.format(get_name(a, b), v)) def writer(_, a, b, v): out_raw.write('{}\t{}\n'.format(get_name(a, b), v)) return writer2 if func else writer def write_bias(func=None): def writer2(c, a, b, v): func(c, a, b, v) out_nrm.write('{}\t{}\n'.format(get_name(a, b), v / bias1[a] / bias2[b])) def writer(_, a, b, v): out_nrm.write('{}\t{}\n'.format(get_name(a, b), v / bias1[a] / bias2[b])) return writer2 if func else writer def write_expc(func=None): def writer2(c, a, b, v): func(c, a, b, v) out_dec.write('{}\t{}\n'.format( get_name(a, b), v / bias1[a] / bias2[b] / decay[c][abs(a - b)])) def writer(c, a, b, v): out_dec.write('{}\t{}\n'.format( get_name(a, b), v / bias1[a] / bias2[b] / decay[c][abs(a - b)])) return writer2 if func else writer def write_expc_2reg(func=None): def writer2(c, a, b, v): func(c, a, b, v) out_dec.write('{}\t{}\n'.format( get_name(a, b), v / bias1[a] / bias2[b] / 
decay[c][abs((a + start_bin1) - (b + start_bin2))])) def writer(c, a, b, v): out_dec.write('{}\t{}\n'.format( get_name(a, b), v / bias1[a] / bias2[b] / decay[c][abs((a + start_bin1) - (b + start_bin2))])) return writer2 if func else writer def write_expc_err(func=None): def writer2(c, a, b, v): func(c, a, b, v) try: out_dec.write('{}\t{}\n'.format( get_name(a, b), v / bias1[a] / bias2[b] / decay[c][abs(a - b)])) except KeyError: # different chromosomes out_dec.write('{}\t{}\n'.format(get_name(a, b), 'nan')) def writer(c, a, b, v): try: out_dec.write('{}\t{}\n'.format( get_name(a, b), v / bias1[a] / bias2[b] / decay[c][abs(a - b)])) except KeyError: # different chromosomes out_dec.write('{}\t{}\n'.format(get_name(a, b), 'nan')) return writer2 if func else writer def write_raw_and_expc(func=None): def writer2(c, a, b, v): func(c, a, b, v) try: out_dec.write('{}\t{}\n'.format( get_name(a, b), v, v / bias1[a] / bias2[b] / decay[c][abs(a - b)])) except KeyError: # different chromosomes out_dec.write('{}\t{}\n'.format(get_name(a, b), v, v / bias1[a] / bias2[b])) def writer(c, a, b, v): try: out_dec.write('{}\t{}\n'.format( get_name(a, b), v, v / bias1[a] / bias2[b] / decay[c][abs(a - b)])) except KeyError: # different chromosomes out_dec.write('{}\t{}\n'.format(get_name(a, b), v, v / bias1[a] / bias2[b])) return writer2 if func else writer def get_row_name(a, b): return '{}\t{}\t{}\t{}\t'.format(*(section_pos1[a] + section_pos2[b])) def get_bin_name(a, b): return '{}\t{}\t'.format(a, b) get_name = get_row_name if row_names else get_bin_name write = None if 'raw' in normalizations: write = write_raw(write) if 'norm' in normalizations and not cooler: write = write_bias(write) if 'decay' in normalizations and not cooler: if len(regions) in [1, 2]: if region2: write = write_expc_2reg(write) else: write = write_expc(write) else: write = write_expc_err(write) if 'raw&decay' in normalizations and not cooler: write = write_raw_and_expc(write) # pull all sub-matrices and write full matrix if region2 is not None: # already half-matrix in this case half_matrix = False if cooler: for ichunk, c, j, k, v in _iter_matrix_frags(chunks, tmpdir, rand_hash, verbose=verbose, clean=clean, include_chunk_count=True): if j > k: continue if j not in bads1 and k not in bads2: out_raw.write_iter(ichunk, j, k, v) out_raw.close() else: if half_matrix: for c, j, k, v in _iter_matrix_frags(chunks, tmpdir, rand_hash, verbose=verbose, clean=clean): if k > j: continue if j not in bads1 and k not in bads2: write(c, j, k, v) else: for c, j, k, v in _iter_matrix_frags(chunks, tmpdir, rand_hash, verbose=verbose, clean=clean): if j not in bads1 and k not in bads2: write(c, j, k, v) fnames = {} if append_to_tar: lock = LockFile(append_to_tar) with lock: archive = taropen(append_to_tar, "a:") for fobj, fnam in outfiles: fobj.seek(0) info = archive.tarinfo(name=fnam) info.size = len(fobj.buf) archive.addfile(tarinfo=info, fileobj=fobj) archive.close() else: if cooler: fnames['RAW'] = out_raw.name if 'norm' in normalizations: fnam = 'nrm_%s_%s%s.mcool' % (name, nicer(resolution).replace( ' ', ''), ('_' + extra) if extra else '') copyfile(outfiles[0][0], os.path.join(outdir, fnam)) out_nrm = cooler_file(os.path.join(outdir, fnam), resolution, sections, regions) bias_data_row = [1. / b if b > 0 else 0 for b in bias1] bias_data_col = [1. 
/ b if b > 0 else 0 for b in bias2] out_nrm.write_weights(bias_data_row, bias_data_col, *bin_coords) outfiles.append((os.path.join(outdir, fnam), fnam)) fnames['NRM'] = os.path.join(outdir, fnam) else: if 'raw' in normalizations: out_raw.close() fnames['RAW'] = out_raw.name if 'norm' in normalizations: out_nrm.close() fnames['NRM'] = out_nrm.name if 'decay' in normalizations: out_dec.close() fnames['DEC'] = out_dec.name if 'raw&decay' in normalizations: out_dec.close() fnames['RAW&DEC'] = out_dec.name # this is the last thing we do in case something goes wrong if clean: os.system('rm -rf %s' % (os.path.join(tmpdir, '_tmp_%s' % (rand_hash)))) return fnames
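Both write_matrix above and read_bam below derive per-chromosome bin counts from the BAM header before doing anything else. A minimal sketch of that setup (the function name is ours):

from collections import OrderedDict
from pysam import AlignmentFile

def bam_sections(bam_path, resolution):
    # Map each reference name to the number of bins it spans at `resolution`.
    with AlignmentFile(bam_path, "rb") as bam:
        lengths = OrderedDict(zip(bam.references, bam.lengths))
    return OrderedDict((crm, ln // resolution + 1) for crm, ln in lengths.items())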
def load_bam(bam_path):
    return AlignmentFile(data_path(bam_path))
def read_bam(inbam, filter_exclude, resolution, ncpus=8, region1=None, start1=None, end1=None, region2=None, start2=None, end2=None, nchunks=100, tmpdir='.', verbose=True, normalize=False, max_size=None, chr_order=None): bamfile = AlignmentFile(inbam, 'rb') bam_refs = bamfile.references bam_lengths = bamfile.lengths if chr_order: bam_refs_idx = [ bam_refs.index(chr_ord) for chr_ord in chr_order if chr_ord in bam_refs ] if not bam_refs_idx: raise Exception('''ERROR: Wrong number of chromosomes in chr_order. Found %s in bam file \n''' % (' '.join(bam_refs))) bam_refs = [ bam_ref for bam_ref in [bam_refs[bam_ref_idx] for bam_ref_idx in bam_refs_idx] ] bam_lengths = [ bam_len for bam_len in [bam_lengths[bam_ref_idx] for bam_ref_idx in bam_refs_idx] ] sections = OrderedDict( list(zip(bam_refs, [x // resolution + 1 for x in bam_lengths]))) # get chromosomes and genome sizes total = 0 section_pos = dict() for crm in sections: section_pos[crm] = (total, total + sections[crm]) total += sections[crm] # define genomic bins bins = [] for crm in sections: len_crm = sections[crm] bins.extend([(crm, i) for i in range(len_crm)]) if not bins: raise Exception('ERROR: Chromosome %s smaller than bin size\n' % (crm)) # define start, end position of region to grab start_bin1 = 0 end_bin1 = len(bins) + 1 regions = bam_refs if region1: regions = [region1] if region2: regions.append(region2) else: total = len(bins) if start1 is not None or end1: raise Exception('ERROR: Cannot use start/end1 without region') if start1 is not None: start_bin1 = section_pos[region1][0] + start1 // resolution else: if region1: start_bin1 = section_pos[region1][0] else: start_bin1 = 0 start1 = 0 if end1 is not None: end_bin1 = section_pos[region1][0] + end1 // resolution else: if region1: end_bin1 = section_pos[region1][1] end1 = sections[region1] * resolution else: end_bin1 = total end1 = total * resolution # define chunks, using at most 100 sub-divisions of region1 total = end_bin1 - start_bin1 regs = [] begs = [] ends = [] njobs = min(total, nchunks) + 1 nbins = total // njobs + 1 for i in range(start_bin1, end_bin1, nbins): if i + nbins > end_bin1: # make sure that we stop at the right place nbins = end_bin1 - i try: (crm1, beg1), (crm2, fin2) = bins[i], bins[i + nbins - 1] except IndexError: (crm1, beg1), (crm2, fin2) = bins[i], bins[-1] if crm1 != crm2: fin1 = sections[crm1] regs.append(crm1) begs.append(beg1 * resolution) ends.append(fin1 * resolution + resolution) # last nt included # be sure we don't miss regions in between the start and end bins start_chunk = i + fin1 - beg1 end_chunk = i + nbins - 1 if i + nbins - 1 < len( bins) else len(bins) - 1 (crm1, beg1) = bins[start_chunk] fin1 = beg1 for j in range(start_chunk, end_chunk + 1): (crm2, beg2) = bins[j] if crm1 == crm2: fin1 = beg2 continue regs.append(crm1) begs.append(beg1 * resolution) fin1 = sections[crm1] ends.append( fin1 * resolution + resolution - 1) # last nt not included (overlap with next window) (crm1, beg1) = (crm2, beg2) regs.append(crm1) begs.append(beg1 * resolution) ends.append(fin1 * resolution + resolution - 1) # last nt not included (overlap with next window) else: regs.append(crm1) begs.append(beg1 * resolution) ends.append(fin2 * resolution + resolution - 1) ends[-1] += 1 # last nucleotide included # reduce dictionaries all_bins = [] seenbins = set() for crm in regions: beg_crm = section_pos[crm][0] if region1: start = start_bin1 - beg_crm end = end_bin1 - beg_crm else: start = 0 end = section_pos[crm][1] - section_pos[crm][0] all_bins.extend([(crm, 
i) for i in range(start, end) if not (crm, i) in seenbins]) seenbins = set(all_bins) del (seenbins) bins_dict1 = dict((j, i) for i, j in enumerate(all_bins)) if region2: if not region2 in section_pos: raise Exception('ERROR: chromosome %s not found' % region2) bins = [] beg_crm = section_pos[region2][0] if start2 is not None: start_bin2 = section_pos[region2][0] + start2 // resolution else: start_bin2 = section_pos[region2][0] start2 = 0 if end2 is not None: end_bin2 = section_pos[region2][0] + end2 // resolution else: end_bin2 = section_pos[region2][1] end2 = sections[region2] * resolution start = start_bin2 - beg_crm end = end_bin2 - beg_crm bins = [(region2, i) for i in range(start, end)] bins_dict2 = dict([(j, i) for i, j in enumerate(bins)]) else: start_bin2 = start_bin1 end_bin2 = end_bin1 bins_dict2 = bins_dict1 size1 = end_bin1 - start_bin1 size2 = end_bin2 - start_bin2 if verbose: printime('\n (Matrix size %dx%d)' % (size1, size2)) if max_size and max_size < size1 * size2: raise Exception(('ERROR: matrix too large ({0}x{1}) should be at most ' '{2}x{2}').format(size1, size2, int(max_size**0.5))) pool = mu.Pool(ncpus) # create random hash associated to the run: rand_hash = "%016x" % getrandbits(64) ## RUN! if verbose: printime('\n - Parsing BAM (%d chunks)' % (len(regs))) mkdir(os.path.join(tmpdir, '_tmp_%s' % (rand_hash))) # empty all_bins array if we are not going to normalize if not normalize: all_bins = [] procs = [] for i, (region, b, e) in enumerate(zip(regs, begs, ends)): if ncpus == 1: _read_bam_frag( inbam, filter_exclude, all_bins, bins_dict1, bins_dict2, rand_hash, resolution, tmpdir, region, b, e, ) else: procs.append( pool.apply_async(_read_bam_frag, args=( inbam, filter_exclude, all_bins, bins_dict1, bins_dict2, rand_hash, resolution, tmpdir, region, b, e, ))) pool.close() if verbose: print_progress(procs) pool.join() bin_coords = start_bin1, end_bin1, start_bin2, end_bin2 chunks = regs, begs, ends return regions, rand_hash, bin_coords, chunks
class _BamReaderBase(ReaderBase): """ The BamReader class provides a high-level interface to PacBio BAM files. If a PacBio BAM index (bam.pbi file) is present and the user instantiates the BamReader using the reference FASTA as the second argument, the BamReader will provide an interface compatible with CmpH5Reader. """ def _loadReferenceInfo(self): refRecords = self.peer.header["SQ"] refNames = [r["SN"] for r in refRecords] refLengths = [r["LN"] for r in refRecords] refMD5s = [r["M5"] for r in refRecords] refIds = map(self.peer.get_tid, refNames) nRefs = len(refRecords) if nRefs > 0: self._referenceInfoTable = np.rec.fromrecords( zip(refIds, refIds, refNames, refNames, refLengths, refMD5s, np.zeros(nRefs, dtype=np.uint32), np.zeros(nRefs, dtype=np.uint32)), dtype=[('ID', '<i8'), ('RefInfoID', '<i8'), ('Name', 'O'), ('FullName', 'O'), ('Length', '<i8'), ('MD5', 'O'), ('StartRow', '<u4'), ('EndRow', '<u4')]) self._referenceDict = {} self._referenceDict.update(zip(refIds, self._referenceInfoTable)) self._referenceDict.update(zip(refNames, self._referenceInfoTable)) else: self._referenceInfoTable = None self._referenceDict = None def _loadReadGroupInfo(self): rgs = self.peer.header["RG"] readGroupTable_ = [] # RGID -> ("abstract feature name" -> actual feature name) self._baseFeatureNameMappings = {} self._pulseFeatureNameMappings = {} for rg in rgs: rgID = rgAsInt(rg["ID"]) rgName = rg["PU"] ds = dict([ pair.split("=") for pair in rg["DS"].split(";") if pair != "" ]) # spec: we only consider first two components of basecaller version # in "chem" lookup basecallerVersion = ".".join( ds["BASECALLERVERSION"].split(".")[0:2]) triple = ds["BINDINGKIT"], ds["SEQUENCINGKIT"], basecallerVersion rgChem = decodeTriple(*triple) rgReadType = ds["READTYPE"] rgFrameRate = ds["FRAMERATEHZ"] # Look for the features manifest entries within the DS tag, # and build an "indirection layer", i.e. to get from # "Ipd" to "Ipd:Frames" # (This is a bit messy. Can we separate the manifest from # the rest of the DS content?) baseFeatureNameMapping = { key.split(":")[0]: key for key in ds.keys() if key in BASE_FEATURE_TAGS } pulseFeatureNameMapping = { key.split(":")[0]: key for key in ds.keys() if key in PULSE_FEATURE_TAGS } self._baseFeatureNameMappings[rgID] = baseFeatureNameMapping self._pulseFeatureNameMappings[rgID] = pulseFeatureNameMapping readGroupTable_.append( (rgID, rgName, rgReadType, rgChem, rgFrameRate, frozenset(baseFeatureNameMapping.iterkeys()))) self._readGroupTable = np.rec.fromrecords(readGroupTable_, dtype=[ ("ID", np.int32), ("MovieName", "O"), ("ReadType", "O"), ("SequencingChemistry", "O"), ("FrameRate", float), ("BaseFeatures", "O") ]) assert len(set(self._readGroupTable.ID)) == len(self._readGroupTable), \ "First 8 chars of read group IDs must be unique!" self._readGroupDict = {rg.ID: rg for rg in self._readGroupTable} # The base/pulse features "available" to clients of this file are the intersection # of features available from each read group. 
self._baseFeaturesAvailable = set.intersection(*[ set(mapping.keys()) for mapping in self._baseFeatureNameMappings.values() ]) self._pulseFeaturesAvailable = set.intersection(*[ set(mapping.keys()) for mapping in self._pulseFeatureNameMappings.values() ]) def _loadProgramInfo(self): pgRecords = [(pg["ID"], pg.get("VN", None), pg.get("CL", None)) for pg in self.peer.header.get("PG", [])] if len(pgRecords) > 0: self._programTable = np.rec.fromrecords(pgRecords, dtype=[("ID", "O"), ("Version", "O"), ("CommandLine", "O") ]) else: self._programTable = None def _loadReferenceFasta(self, referenceFastaFname): ft = FastaTable(referenceFastaFname) # Verify that this FASTA is in agreement with the BAM's # reference table---BAM should be a subset. fastaIdsAndLens = set((c.id, len(c)) for c in ft) bamIdsAndLens = set( (c.Name, c.Length) for c in self.referenceInfoTable) if not bamIdsAndLens.issubset(fastaIdsAndLens): raise ReferenceMismatch, "FASTA file must contain superset of reference contigs in BAM" self.referenceFasta = ft def _checkFileCompatibility(self): # Verify that this is a "pacbio" BAM file of version at least # 3.0.1 badVersionException = IncompatibleFile( "This BAM file is incompatible with this API " + "(only PacBio BAM files version >= 3.0.1 are supported)") checkedVersion = self.version if "b" in checkedVersion: raise badVersionException else: major, minor, patch = checkedVersion.split('.') if not (major, minor, patch) >= (3, 0, 1): raise badVersionException def __init__(self, fname, referenceFastaFname=None): self.filename = fname = abspath(expanduser(fname)) self.peer = AlignmentFile(fname, "rb", check_sq=False) self._checkFileCompatibility() self._loadReferenceInfo() self._loadReadGroupInfo() self._loadProgramInfo() self.referenceFasta = None if referenceFastaFname is not None: if self.isUnmapped: raise ValueError, "Unmapped BAM file--reference FASTA should not be given as argument to BamReader" self._loadReferenceFasta(referenceFastaFname) @property def isIndexLoaded(self): return self.index is not None @property def isReferenceLoaded(self): return self.referenceFasta is not None @property def isUnmapped(self): return not (self.isMapped) @property def isMapped(self): return len(self.peer.header["SQ"]) > 0 @property def alignmentIndex(self): raise UnavailableFeature("BAM has no alignment index") @property def movieNames(self): return set([mi.MovieName for mi in self.readGroupTable]) @property def readGroupTable(self): return self._readGroupTable def readGroupInfo(self, readGroupId): return self._readGroupDict[readGroupId] @property def sequencingChemistry(self): """ List of the sequencing chemistries by movie. Order is unspecified. """ return list(self.readGroupTable.SequencingChemistry) @property def referenceInfoTable(self): return self._referenceInfoTable #TODO: standard? how about subread instead? why capitalize ccs? # can we standardize this? is cDNA an additional possibility @property def readType(self): """ Either "standard", "CCS", "mixed", or "unknown", to represent the type of PacBio reads aligned in this BAM file. 
""" readTypes = self.readGroupTable.ReadType if all(readTypes == "SUBREAD"): return "standard" elif all(readTypes == "CCS"): return "CCS" elif all((readTypes == "CCS") | (readTypes == "SUBREAD")): return "mixed" else: return "unknown" @property def version(self): return self.peer.header["HD"]["pb"] def versionAtLeast(self, minimalVersion): raise Unimplemented() def softwareVersion(self, programName): raise Unimplemented() @property def isSorted(self): return self.peer.header["HD"]["SO"] == "coordinate" @property def isBarcoded(self): raise Unimplemented() @property def isEmpty(self): return (len(self) == 0) def referenceInfo(self, key): return self._referenceDict[key] def atOffset(self, offset): self.peer.seek(offset) return BamAlignment(self, next(self.peer)) def hasBaseFeature(self, featureName): return featureName in self._baseFeaturesAvailable def baseFeaturesAvailable(self): return self._baseFeaturesAvailable def hasPulseFeature(self, featureName): return featureName in self._pulseFeaturesAvailable def pulseFeaturesAvailable(self): return self._pulseFeaturesAvailable def hasPulseFeatures(self): """ Is this BAM file a product of running analysis with the PacBio-internal analysis mode enabled? """ return self.hasPulseFeature("PulseCall") @property def barcode(self): raise Unimplemented() @property def barcodeName(self): raise Unimplemented() @property def barcodes(self): raise Unimplemented() @requiresBai def __len__(self): return self.peer.mapped + self.peer.unmapped def close(self): if hasattr(self, "file") and self.file is not None: self.file.close() self.file = None def __enter__(self): return self def __exit__(self, exc_type, exc_value, traceback): self.close()
def _read_bam_frag(inbam, filter_exclude, all_bins, sections1, sections2, rand_hash, resolution, tmpdir, region, start, end, half=False, sum_columns=False): bamfile = AlignmentFile(inbam, 'rb') refs = bamfile.references bam_start = start - 2 bam_start = max(0, bam_start) try: dico = {} for r in bamfile.fetch( region=region, start=bam_start, end=end, # coords starts at 0 multiple_iterators=True): if r.flag & filter_exclude: continue crm1 = r.reference_name pos1 = r.reference_start + 1 crm2 = refs[r.mrnm] pos2 = r.mpos + 1 try: pos1 = sections1[(crm1, pos1 // resolution)] pos2 = sections2[(crm2, pos2 // resolution)] except KeyError: continue # not in the subset matrix we want crm = crm1 * (crm1 == crm2) try: dico[(crm, pos1, pos2)] += 1 except KeyError: dico[(crm, pos1, pos2)] = 1 # print '%-50s %5s %9s %5s %9s' % (r.query_name, # crm1, r.reference_start + 1, # crm2, r.mpos + 1) if half: for c, i, j in dico: if i < j: del dico[(c, i, j)] out = open( os.path.join(tmpdir, '_tmp_%s' % (rand_hash), '%s:%d-%d.tsv' % (region, start, end)), 'w') out.write(''.join('%s\t%d\t%d\t%d\n' % (c, a, b, v) for (c, a, b), v in dico.items())) out.close() if sum_columns: sumcol = {} cisprc = {} for (c, i, j), v in dico.items(): # out.write('%d\t%d\t%d\n' % (i, j, v)) try: sumcol[i] += v cisprc[i][all_bins[i][0] == all_bins[j][0]] += v except KeyError: sumcol[i] = v cisprc[i] = [0, 0] cisprc[i][all_bins[i][0] == all_bins[j][0]] += v return sumcol, cisprc except Exception as e: exc_type, exc_obj, exc_tb = exc_info() fname = os.path.split(exc_tb.tb_frame.f_code.co_filename)[1] print(e) print(exc_type, fname, exc_tb.tb_lineno)
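A simplified sketch of the pair-binning idea in _read_bam_frag: count read pairs per (bin1, bin2) at a given resolution, without the flag filtering and temporary-file machinery. It assumes an indexed, paired-end BAM; names are illustrative.

from collections import defaultdict
from pysam import AlignmentFile

def bin_pairs(bam_path, region, start, end, resolution):
    counts = defaultdict(int)
    with AlignmentFile(bam_path, "rb") as bam:
        for r in bam.fetch(region, start, end):
            if r.is_unmapped or r.mate_is_unmapped:
                continue
            bin1 = r.reference_start // resolution
            bin2 = r.next_reference_start // resolution
            counts[(r.reference_name, bin1, bin2)] += 1
    return counts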
def load_hic_data_from_bam(fnam, resolution, biases=None, tmpdir='.', ncpus=8, filter_exclude=(1, 2, 3, 4, 6, 7, 8, 9, 10), region=None, verbose=True, clean=True): """ :param fnam: TADbit-generated BAM file with read-ends1 and read-ends2 :param resolution: the resolution of the experiment (size of a bin in bases) :param None biases: path to pickle file where are stored the biases. Keys in this file should be: 'biases', 'badcol', 'decay' and 'resolution' :param '.' tmpdir: path to folder where to create temporary files :param 8 ncpus: :param (1, 2, 3, 4, 6, 7, 8, 9, 10) filter exclude: filters to define the set of valid pair of reads. :param None region: chromosome name, if None, all genome will be loaded :returns: HiC_data object """ bam = AlignmentFile(fnam) genome_seq = OrderedDict((c, l) for c, l in zip( bam.references, [x / resolution + 1 for x in bam.lengths])) bam.close() sections = [] for crm in genome_seq: len_crm = genome_seq[crm] sections.extend([(crm, i) for i in xrange(len_crm)]) size = sum(genome_seq.values()) chromosomes = {region: genome_seq[region]} if region else genome_seq dict_sec = dict([(j, i) for i, j in enumerate(sections)]) imx = HiC_data((), size, chromosomes=chromosomes, dict_sec=dict_sec, resolution=resolution) if biases: if isinstance(biases, basestring): biases = load(open(biases)) if biases['resolution'] != resolution: raise Exception('ERROR: resolution of biases do not match to the ' 'one wanted (%d vs %d)' % (biases['resolution'], resolution)) if region: chrom_start = 0 for crm in genome_seq: if crm == region: break len_crm = genome_seq[crm] chrom_start += len_crm imx.bads = dict((b - chrom_start, biases['badcol'][b]) for b in biases['badcol']) imx.bias = dict((b - chrom_start, biases['biases'][b]) for b in biases['biases']) else: imx.bads = biases['badcol'] imx.bias = biases['biases'] imx.expected = biases['decay'] get_matrix(fnam, resolution, biases=None, filter_exclude=filter_exclude, normalization='raw', tmpdir=tmpdir, clean=clean, ncpus=ncpus, dico=imx, region1=region, verbose=verbose) imx._symmetricize() imx.symmetricized = True return imx
def test_downgrade_read_edges(self): # With softclip bam_fpath = os.path.join(TEST_DATA_DIR, 'sample.bam') sam = AlignmentFile(bam_fpath) aligned_read = sam.next() _downgrade_edge_qualities(aligned_read, size=4, qual_to_substract=30) res = [ 9, 9, 9, 9, 9, 9, 3, 9, 8, 8, 9, 9, 9, 9, 9, 39, 39, 39, 38, 38, 36, 33, 36, 38, 36, 38, 38, 38, 38, 39, 39, 38, 38, 38, 9, 9, 9, 9 ] assert list(aligned_read.query_qualities) == res # without softclip sam = AlignmentFile(os.path.join(TEST_DATA_DIR, 'seqs.bam')) aligned_read = sam.next() _downgrade_edge_qualities(aligned_read, size=4, qual_to_substract=30) expected = [ 11, 13, 11, 11, 37, 43, 43, 46, 46, 57, 57, 48, 57, 57, 42, 41, 32, 35, 38, 38, 38, 38, 41, 41, 39, 37, 37, 44, 42, 48, 47, 57, 47, 47, 48, 47, 57, 57, 54, 48, 57, 48, 54, 50, 50, 50, 50, 50, 57, 59, 54, 54, 54, 57, 57, 59, 57, 52, 52, 52, 52, 57, 57, 57, 57, 52, 52, 52, 52, 29, 27, 27, 22 ] assert list(aligned_read.query_qualities) == expected # reverse # rev seqs (sam specification puts all the alignment query # forward(cigar, seq, qual, ...). Reverse is inly noted in the flag bam_fpath = os.path.join(TEST_DATA_DIR, 'sample_rev.bam') sam = AlignmentFile(bam_fpath) aligned_read = sam.next() aligned_read = sam.next() aligned_read = sam.next() original_qual = aligned_read.query_qualities _downgrade_edge_qualities(aligned_read, size=4, qual_to_substract=30) res = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0] assert list(aligned_read.query_qualities[:14]) == res # check that we can restore the cuals from the tag _restore_qual_from_tag(aligned_read) assert original_qual == aligned_read.query_qualities # only restore left quals aligned_read = sam.next() original_qual = aligned_read.query_qualities _downgrade_edge_qualities(aligned_read, size=4, qual_to_substract=30) aligned_read.set_tag(RIGTH_DOWNGRADED_TAG, None) changed_rquals = aligned_read.query_qualities[-5:] _restore_qual_from_tag(aligned_read) assert aligned_read.query_qualities[-5:] == changed_rquals assert aligned_read.query_qualities[:10] == original_qual[:10] # only restore rigth quals sam = AlignmentFile(os.path.join(TEST_DATA_DIR, 'seqs.bam')) aligned_read = sam.next() original_qual = aligned_read.query_qualities _downgrade_edge_qualities(aligned_read, size=4, qual_to_substract=30) aligned_read.set_tag(LEFT_DOWNGRADED_TAG, None) changed_lquals = aligned_read.query_qualities[:5] _restore_qual_from_tag(aligned_read) assert aligned_read.query_qualities[:5] == changed_lquals assert aligned_read.query_qualities[10:] == original_qual[10:]
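Note that sam.next() in the test above is the Python 2 iterator protocol; with current pysam the first record is pulled with the built-in next() (the path below is a placeholder):

from pysam import AlignmentFile

sam = AlignmentFile("sample.bam")   # placeholder path
aligned_read = next(sam)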
def __init__(self, output, indexed_sequence_list, index_options):
    header = self.build_header(indexed_sequence_list, index_options)
    self.writer = AlignmentFile(output, 'wb', header=header)
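build_header here is project-specific; pysam itself also accepts a plain dictionary header when opening a BAM for writing, along the lines of this sketch (reference name, length and output path are made up):

from pysam import AlignmentFile

header = {
    "HD": {"VN": "1.6", "SO": "unsorted"},
    "SQ": [{"SN": "chr1", "LN": 248956422}],
}
writer = AlignmentFile("out.bam", "wb", header=header)
writer.close()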
def run(opts): check_options(opts) launch_time = time.localtime() param_hash = digest_parameters(opts) if opts.bam: mreads = path.realpath(opts.bam) else: mreads = path.join(opts.workdir, load_parameters_fromdb(opts)) filter_exclude = opts.filter outdir = path.join(opts.workdir, '04_normalization') mkdir(outdir) mappability = gc_content = n_rsites = None if opts.normalization == 'oneD': if not opts.fasta: raise Exception('ERROR: missing path to FASTA for oneD normalization') if not opts.renz: raise Exception('ERROR: missing restriction enzyme name for oneD normalization') if not opts.mappability: raise Exception('ERROR: missing path to mappability for oneD normalization') bamfile = AlignmentFile(mreads, 'rb') refs = bamfile.references bamfile.close() # get genome sequence ~1 min printime(' - parsing FASTA') genome = parse_fasta(opts.fasta, verbose=False) fas = set(genome.keys()) bam = set(refs) if fas - bam: print 'WARNING: %d extra chromosomes in FASTA (removing them)' % (len(fas - bam)) if len(fas - bam) <= 50: print '\n'.join([(' - ' + c) for c in (fas - bam)]) if bam - fas: txt = ('\n'.join([(' - ' + c) for c in (bam - fas)]) if len(bam - fas) <= 50 else '') raise Exception('ERROR: %d extra chromosomes in BAM (remove them):\n%s\n' % ( len(bam - fas), txt)) refs = [crm for crm in refs if crm in genome] if len(refs) == 0: raise Exception("ERROR: chromosomes in FASTA different the ones" " in BAM") # get mappability ~2 min printime(' - Parsing mappability') mappability = parse_mappability_bedGraph( opts.mappability, opts.reso, wanted_chrom=refs[0] if len(refs)==1 else None) # resize chomosomes for c in refs: if not c in mappability: mappability[c] = [float('nan')] * (len(refs) / opts.reso + 1) if len(mappability[c]) < len(refs) / opts.reso + 1: mappability[c] += [float('nan')] * ( (len(refs) / opts.reso + 1) - len(mappability[c])) # concatenates mappability = reduce(lambda x, y: x + y, (mappability.get(c, []) for c in refs)) printime(' - Computing GC content per bin (removing Ns)') gc_content = get_gc_content(genome, opts.reso, chromosomes=refs, n_cpus=opts.cpus) # compute r_sites ~30 sec # TODO: read from DB printime(' - Computing number of RE sites per bin (+/- 200 bp)') n_rsites = [] re_site = RESTRICTION_ENZYMES[opts.renz].replace('|', '') for crm in refs: for pos in xrange(200, len(genome[crm]) + 200, opts.reso): seq = genome[crm][pos-200:pos + opts.reso + 200] n_rsites.append(seq.count(re_site)) ## CHECK TO BE REMOVED # out = open('tmp_mappability.txt', 'w') # i = 0 # for crm in refs: # for pos in xrange(len(genome[crm]) / opts.reso + 1): # out.write('%s\t%d\t%d\t%f\n' % (crm, pos * opts.reso, pos * opts.reso + opts.reso, mappability[i])) # i += 1 # out.close() # compute GC content ~30 sec # TODO: read from DB biases, decay, badcol, raw_cisprc, norm_cisprc = read_bam( mreads, filter_exclude, opts.reso, min_count=opts.min_count, sigma=2, factor=1, outdir=outdir, extra_out=param_hash, ncpus=opts.cpus, normalization=opts.normalization, mappability=mappability, p_fit=opts.p_fit, cg_content=gc_content, n_rsites=n_rsites, min_perc=opts.min_perc, max_perc=opts.max_perc, seed=opts.seed, normalize_only=opts.normalize_only, max_njobs=opts.max_njobs, extra_bads=opts.badcols, biases_path=opts.biases_path) bad_col_image = path.join(outdir, 'filtered_bins_%s_%s.png' % ( nicer(opts.reso).replace(' ', ''), param_hash)) inter_vs_gcoord = path.join(opts.workdir, '04_normalization', 'interactions_vs_genomic-coords.png_%s_%s.png' % ( opts.reso, param_hash)) # get and plot decay if not 
opts.normalize_only: printime(' - Computing interaction decay vs genomic distance') (_, _, _), (a2, _, _), (_, _, _) = plot_distance_vs_interactions( decay, max_diff=10000, resolution=opts.reso, normalized=not opts.filter_only, savefig=inter_vs_gcoord) print (' -> Decay slope 0.7-10 Mb\t%s' % a2) else: a2 = 0. printime(' - Saving biases and badcol columns') # biases bias_file = path.join(outdir, 'biases_%s_%s.pickle' % ( nicer(opts.reso).replace(' ', ''), param_hash)) out = open(bias_file, 'w') dump({'biases' : biases, 'decay' : decay, 'badcol' : badcol, 'resolution': opts.reso}, out, HIGHEST_PROTOCOL) out.close() finish_time = time.localtime() try: save_to_db(opts, bias_file, mreads, bad_col_image, len(badcol), len(biases), raw_cisprc, norm_cisprc, inter_vs_gcoord, a2, opts.filter, launch_time, finish_time) except: # release lock anyway print_exc() try: remove(path.join(opts.workdir, '__lock_db')) except OSError: pass exit(1)
def phase_structural_variants(sv_vcf, long_reads_bam, workdir): sv_vcf_basename = os.path.basename(sv_vcf) if sv_vcf_basename.endswith('.vcf'): offset = -4 elif sv_vcf_basename.endswith('.vcf.gz'): offset = -7 else: return sv_filtered_phased_vcf = workdir + '/' + sv_vcf_basename[:offset] + '.filtered.phased.vcf' vcf_in = VariantFile(sv_vcf) vcf_out = VariantFile(sv_filtered_phased_vcf, 'w', header=vcf_in.header) bam_in = AlignmentFile(long_reads_bam) phasing_stat_f = open(workdir + '/' + 'phasing_stat.txt', 'w') chr_to_include = ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19', '20', '21', '22', 'X', 'Y'] """ chr_to_include = ['chr1', 'chr2', 'chr3', 'chr4', 'chr5', 'chr6', 'chr7', 'chr8', 'chr9', 'chr10', 'chr11', 'chr12', 'chr13', 'chr14', 'chr15', 'chr16', 'chr17', 'chr18', 'chr19', 'chr20', 'chr21', 'chr22', 'chrX', 'chrY'] """ phasing_stat = {'INS' : {'Total':0, 'Phased HOM':0, 'Phased HET':0}, 'DEL' : {'Total':0, 'Phased HOM':0, 'Phased HET':0}, 'INV' : {'Total':0, 'Phased HOM':0, 'Phased HET':0}, 'BND' : {'Total':0, 'Phased HOM':0, 'Phased HET':0}, 'DUP:TANDEM' : {'Total':0, 'Phased HOM':0, 'Phased HET':0}, 'DUP_INT' : {'Total':0, 'Phased HOM':0, 'Phased HET':0}} prev_chrom = '' for rec in vcf_in.fetch(): sv_chrom = rec.chrom if sv_chrom in chr_to_include: if sv_chrom != prev_chrom: logging.info('Processing {0}'.format(sv_chrom)) prev_chrom = sv_chrom if rec.filter.keys()[0] == 'PASS': sv_pos = rec.pos sv_read_ids = rec.info['READS'] sv_support = rec.info['SUPPORT'] sv_type = rec.info['SVTYPE'] phasing_stat[sv_type]['Total'] += 1 begin_pos = sv_pos - 1 if 'END' in rec.info: end_pos = rec.info['END'] else: end_pos = sv_pos hap1_counter = 0 hap2_counter = 0 try: read_iterator = bam_in.fetch(sv_chrom, begin_pos-2000, end_pos+2000) except ValueError: read_iterator = bam_in.fetch(sv_chrom, begin_pos, end_pos) for read in read_iterator: if read.query_name in sv_read_ids: if read.has_tag('HP'): read_hp = read.get_tag('HP') hap1_counter += read_hp == 1 hap2_counter += read_hp == 2 threshold_read_count = max(int(0.85 * sv_support), 5) threshold_het = 0.8 threshold_hom = 0.2 if (hap1_counter + hap2_counter) >= threshold_read_count: allele_frequency_hap1 = hap1_counter / float(hap1_counter + hap2_counter) allele_frequency_hap2 = hap2_counter / float(hap1_counter + hap2_counter) if allele_frequency_hap1 >= threshold_hom and allele_frequency_hap1 < threshold_het: rec.samples[0]['GT'] = (1, 1) rec.samples[0].phased = True phasing_stat[sv_type]['Phased HOM'] += 1 elif allele_frequency_hap1 >= threshold_het: rec.samples[0]['GT'] = (1, 0) rec.samples[0].phased = True phasing_stat[sv_type]['Phased HET'] += 1 elif allele_frequency_hap2 >= threshold_het: rec.samples[0]['GT'] = (0, 1) rec.samples[0].phased = True phasing_stat[sv_type]['Phased HET'] += 1 vcf_out.write(rec) phasing_stat_f.write('\tTotal\tPhased HOM\tPhased HET\n') for sv in phasing_stat: phasing_stat_f.write('{0}:\t{1}\t{2}\t{3}\n'.format(sv, phasing_stat[sv]['Total'], phasing_stat[sv]['Phased HOM'], phasing_stat[sv]['Phased HET'])) phasing_stat_f.close()
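The haplotype tally above boils down to counting HP tags of supporting reads over a window. A minimal sketch of just that step (assumes a haplotagged, indexed BAM; the function name is ours):

from pysam import AlignmentFile

def count_haplotypes(bam_path, chrom, start, end, read_ids=None):
    hap1 = hap2 = 0
    with AlignmentFile(bam_path, "rb") as bam:
        for read in bam.fetch(chrom, start, end):
            if read_ids is not None and read.query_name not in read_ids:
                continue
            if read.has_tag("HP"):
                hp = read.get_tag("HP")
                hap1 += hp == 1
                hap2 += hp == 2
    return hap1, hap2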
args = parser.parse_args() # Logger logging.basicConfig( format= '%(asctime)s -- [%(filename)s][pid:%(process)d][%(levelname)s] -- %(message)s' ) log = logging.getLogger(os.path.basename(__file__)) log.setLevel(logging.INFO) log.info("Command: " + " ".join(sys.argv)) # Process if args.output_reads_2: # Write all reads in a pair of files (R1 and R2) with FastqIO(args.output_reads_2, "w") as writer_r2: with FastqIO(args.output_reads, "w") as writer_r1: with AlignmentFile(args.input_aln, "rb", check_sq=False) as reader: for curr_read in reader.fetch(until_eof=True): if not curr_read.is_secondary and not curr_read.is_supplementary: if args.keep_qc_failed or not curr_read.is_qcfail: barcode = args.reads_barcode if barcode is None and curr_read.has_tag( args.barcode_tag): barcode = curr_read.get_tag( args.barcode_tag).replace("-", "+") description = "{}:{}:0:{} {}={}".format( "1" if curr_read.is_read1 else "2", "Y" if curr_read.is_qcfail else "N", "" if barcode is None else barcode, args.umi_qual_tag, curr_read.get_tag(args.umi_qual_tag)) read = Sequence(
def filter_reads(input_bam, output_bam, whitelist=None, blacklist=None, percentage=None, count=None, seed=None, ignore_metadata=False, relative=None, anonymize=False, use_barcodes=False): if output_bam is None: log.error("Must specify output file") return 1 output_bam = op.abspath(output_bam) if not op.isdir(op.dirname(output_bam)): log.error("Output path '{d}' does not exist.".format( d=op.dirname(output_bam))) return 1 n_specified = 4 - [whitelist, blacklist, percentage, count].count(None) if n_specified != 1: log.error("You must choose one and only one of the following " + "options: --whitelist, --blacklist, --count, --percentage") return 1 if seed is not None: random.seed(seed) if whitelist is None and blacklist is None: if not 0 < percentage < 100 and not count > 0: log.error("No reads selected for output.") return 1 output_ds = None if output_bam.endswith(".xml"): if not input_bam.endswith(".xml"): print "DataSet output only supported for DataSet inputs." return 1 ds_type = output_bam.split(".")[-2] ext2 = { "subreadset": "subreads", "alignmentset": "subreads", "consensusreadset": "ccs", "consensusalignmentset": "ccs" } if not ds_type in ext2: raise ValueError("Invalid dataset type 't'".format(t=ds_type)) output_ds = output_bam output_bam = ".".join( output_ds.split(".")[:-2] + [ext2[ds_type], "bam"]) if output_bam == input_bam: log.error("Input and output files must not be the same path") return 1 elif not output_bam.endswith(".bam"): log.error("Output file name must end in either '.bam' or '.xml'") return 1 n_file_reads = 0 have_zmws = set() scraps_bam = barcode_set = None with openDataFile(input_bam) as ds_in: if not isinstance(ds_in, ReadSet): raise TypeError("{t} is not an allowed dataset type".format( t=type(ds_in).__name__)) # TODO(nechols)(2016-03-11): refactor this to enable propagation of # filtered scraps if not ds_in.isIndexed: log.error("Input BAM must have accompanying .pbi index") return 1 for ext_res in ds_in.externalResources: if ext_res.barcodes is not None: assert barcode_set is None or barcode_set == ext_res.barcodes barcode_set = barcode_set f1 = ds_in.resourceReaders()[0] if percentage is not None or count is not None: whitelist = _create_whitelist(ds_in, percentage, count) # convert these to Python sets _whitelist = _process_zmw_list(whitelist) _blacklist = _process_zmw_list(blacklist) scraps_in = None if output_ds is not None and output_ds.endswith(".subreadset.xml"): for ext_res in ds_in.externalResources: if ext_res.scraps is not None: if use_barcodes: log.warn("Scraps BAM is present but lacks " + "barcodes - will not be propagated " + "to output SubreadSet") else: scraps_in = IndexedBamReader(ext_res.scraps) break with AlignmentFile(output_bam, 'wb', template=f1.peer) as bam_out: for bam_in in ds_in.resourceReaders(): n_records, have_zmws_ = _process_bam_whitelist( bam_in, bam_out, whitelist=_whitelist, blacklist=_blacklist, use_barcodes=use_barcodes, anonymize=anonymize) n_file_reads += n_records have_zmws.update(have_zmws_) if scraps_in is not None: scraps_bam = re.sub("subreads.bam$", "scraps.bam", output_bam) with AlignmentFile(scraps_bam, 'wb', template=scraps_in.peer) as scraps_out: for ext_res in ds_in.externalResources: if ext_res.scraps is not None: scraps_in_ = IndexedBamReader(ext_res.scraps) n_records, have_zmws_ = _process_bam_whitelist( scraps_in_, scraps_out, _whitelist, _blacklist, use_barcodes=use_barcodes, anonymize=anonymize) have_zmws.update(have_zmws_) if n_file_reads == 0: log.error("No reads written") return 1 log.info("{n} records 
from {z} ZMWs written".format(n=n_file_reads, z=len(have_zmws))) def _run_pbindex(bam_file): try: rc = subprocess.call(["pbindex", bam_file]) except OSError as e: if e.errno == 2: log.warn("pbindex not present, will not create .pbi file") else: raise _run_pbindex(output_bam) if output_ds is not None: with openDataSet(input_bam) as ds_in: ds_out = ds_in.__class__(output_bam) if scraps_bam is not None: _run_pbindex(scraps_bam) ds_out.externalResources[0].scraps = scraps_bam # XXX it doesn't pick up the .pbi file - sort of annoying # but since the pbcore API doesn't provide a reader for the # scraps automatically anyway, the impact is minimal if barcode_set is not None: ds_out.externalResources[0].barcodes = barcode_set if not ignore_metadata: ds_out.metadata = ds_in.metadata ds_out.updateCounts() if relative: ds_out.makePathsRelative(op.dirname(output_ds)) ds_out.write(output_ds) log.info("wrote {t} XML to {x}".format(t=ds_out.__class__.__name__, x=output_ds)) return 0
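filter_reads above leans on the pbcore DataSet machinery; a much smaller sketch of the same whitelist idea with plain pysam is given below. It filters on query name rather than ZMW number, the paths and the whitelisted name are placeholders, and none of the .pbi/scraps handling is reproduced.

# Minimal sketch of whitelist filtering with plain pysam: keep only alignments
# whose query name is in a whitelist. No DataSet XML, .pbi index or scraps
# handling; paths and the whitelisted name are placeholders.
from pysam import AlignmentFile

def filter_by_name(input_bam, output_bam, whitelist):
    kept = 0
    with AlignmentFile(input_bam, "rb", check_sq=False) as bam_in:
        with AlignmentFile(output_bam, "wb", template=bam_in) as bam_out:
            for aln in bam_in.fetch(until_eof=True):
                if aln.query_name in whitelist:
                    bam_out.write(aln)
                    kept += 1
    return kept

n_kept = filter_by_name("input.subreads.bam", "whitelisted.bam", {"movie/12345/0_100"})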
class PaddedSAM(object): """ Obtain aligned (padded) queries from a SAM/BAM file. @param filename: The C{str} name of the SAM/BAM file. """ def __init__(self, filename): self.samfile = AlignmentFile(filename) # self.referenceInsertions will be keyed by offset into the reference # sequence. The inserted bases would need to begin at this offset. The # value will be a Counter whose keys are the nucleotides proposed for # insertion, with a value indicating how many times the nucleotide was # proposed for insertion at that offset. self.referenceInsertions = defaultdict(Counter) def close(self): """ Close the opened SAM/BAM file. """ self.samfile.close() def referencesToStr(self, indent=0): """ List the reference names and their lengths. @param indent: An C{int} number of spaces to indent each line. @return: A C{str} describing known reference names and their lengths. """ samfile = self.samfile result = [] indent = ' ' * indent for i in range(samfile.nreferences): result.append('%s%s (length %d)' % ( indent, samfile.get_reference_name(i), samfile.lengths[i])) return '\n'.join(result) def queries(self, referenceName=None, minLength=0, rcSuffix='', dropSecondary=False, dropSupplementary=False, dropDuplicates=False, allowDuplicateIds=False, keepQCFailures=False, rcNeeded=False, padChar='-', queryInsertionChar='N'): """ Produce padded (with gaps) queries according to the CIGAR string and reference sequence length for each matching query sequence. @param referenceName: The C{str} name of the reference sequence to print alignments for. This is only needed if the SAM/BAM alignment was made against multiple references *and* they are not all of the same length. If there is only one reference sequence or if all reference sequences are of the same length, there is no need to provide a reference name (i.e., pass C{None}). @param minLength: Ignore queries shorter than this C{int} value. Note that this refers to the length of the query sequence once it has been aligned to the reference. The alignment may introduce C{queryInsertionChar} characters into the read, and these are counted towards its length because the alignment is assuming the query is missing a base at those locations. @param rcSuffix: A C{str} to add to the end of query names that are reverse complemented. This is added before the /1, /2, etc., that are added for duplicated ids (if there are duplicates and C{allowDuplicateIds} is C{False}. @param dropSecondary: If C{True}, secondary matches will not be yielded. @param dropSupplementary: If C{True}, supplementary matches will not be yielded. @param dropDuplicates: If C{True}, matches flagged as optical or PCR duplicates will not be yielded. @param allowDuplicateIds: If C{True}, repeated query ids (due to secondary or supplemental matches) will not have /1, /2, etc. appended to their ids. So repeated ids may appear in the yielded FASTA. @param keepQCFailures: If C{True}, reads that are marked as quality control failures will be included in the output. @param rcNeeded: If C{True}, queries that are flagged as matching when reverse complemented should have reverse complementing when preparing the output sequences. This must be used if the program that created the SAM/BAM input flags reversed matches but does not also store the reverse complemented query. @param padChar: A C{str} of length one to use to pad queries with to make them the same length as the reference sequence. 
@param queryInsertionChar: A C{str} of length one to use to insert into queries when the CIGAR string indicates that the alignment of a query would cause a deletion in the reference. This character is inserted as a 'missing' query character (i.e., a base that can be assumed to have been lost due to an error) whose existence is necessary for the match to continue. @raises UnequalReferenceLengthError: If C{referenceName} is C{None} and the reference sequence lengths in the SAM/BAM file are not all identical. @raises UnknownReference: If C{referenceName} does not exist. @return: A generator that yields C{Read} instances that are padded with gap characters to align them to the length of the reference sequence. """ samfile = self.samfile if referenceName: referenceId = samfile.get_tid(referenceName) if referenceId == -1: raise UnknownReference( 'Reference %r is not present in the SAM/BAM file.' % referenceName) referenceLength = samfile.lengths[referenceId] else: # No reference given. All references must have the same length. if len(set(samfile.lengths)) != 1: raise UnequalReferenceLengthError( 'Your SAM/BAM file has %d reference sequences, and their ' 'lengths (%s) are not all identical.' % ( samfile.nreferences, ', '.join(map(str, sorted(samfile.lengths))))) referenceId = None referenceLength = samfile.lengths[0] # Hold the count for each id so we can add /1, /2 etc to duplicate # ids (unless --allowDuplicateIds was given). idCount = Counter() MATCH_OPERATIONS = {CMATCH, CEQUAL, CDIFF} for read in samfile.fetch(): query = read.query_sequence if (read.is_unmapped or (read.is_secondary and dropSecondary) or (read.is_supplementary and dropSupplementary) or (read.is_duplicate and dropDuplicates) or (read.is_qcfail and not keepQCFailures) or (referenceId is not None and read.reference_id != referenceId)): continue if read.is_reverse: if rcNeeded: query = DNARead('id', query).reverseComplement().sequence if rcSuffix: read.query_name += rcSuffix referenceStart = read.reference_start atStart = True queryIndex = 0 referenceIndex = referenceStart alignedSequence = '' for operation, length in read.cigartuples: # The operations are tested in the order they appear in # https://samtools.github.io/hts-specs/SAMv1.pdf It would be # more efficient to test them in order of frequency of # occurrence. if operation in MATCH_OPERATIONS: atStart = False alignedSequence += query[queryIndex:queryIndex + length] elif operation == CINS: # Insertion to the reference. This consumes query bases but # we don't output them because the reference cannot be # changed. I.e., these bases in the query would need to be # inserted into the reference. Remove these bases from the # query but record what would have been inserted into the # reference. atStart = False for i in range(length): self.referenceInsertions[referenceIndex + i][ query[queryIndex + i]] += 1 elif operation == CDEL: # Delete from the reference. Some bases from the reference # would need to be deleted to continue the match. So we put # an insertion into the query to compensate. atStart = False alignedSequence += queryInsertionChar * length elif operation == CREF_SKIP: # Skipped reference. Opens a gap in the query. For # mRNA-to-genome alignment, an N operation represents an # intron. For other types of alignments, the # interpretation of N is not defined. So this is unlikely # to occur. atStart = False alignedSequence += queryInsertionChar * length elif operation == CSOFT_CLIP: # Bases in the query that are not part of the match. 
We # remove these from the query if they protrude before the # start or after the end of the reference. According to the # SAM docs, 'S' operations may only have 'H' operations # between them and the ends of the CIGAR string. if atStart: # Don't set atStart=False, in case there's another 'S' # operation. unwantedLeft = length - referenceStart if unwantedLeft > 0: # The query protrudes left. Copy its right part. alignedSequence += query[queryIndex + unwantedLeft: queryIndex + length] referenceStart = 0 else: referenceStart -= length alignedSequence += query[ queryIndex:queryIndex + length] else: unwantedRight = ( (referenceStart + len(alignedSequence) + length) - referenceLength) if unwantedRight > 0: # The query protrudes right. Copy its left part. alignedSequence += query[ queryIndex:queryIndex + length - unwantedRight] else: alignedSequence += query[ queryIndex:queryIndex + length] elif operation == CHARD_CLIP: # Some bases have been completely removed from the query. # This (H) can only be present as the first and/or last # operation. There is nothing to do as the bases are simply # not present in the query string in the SAM/BAM file. pass elif operation == CPAD: # This is "silent deletion from the padded reference", # which consumes neither query nor reference. atStart = False else: raise ValueError('Unknown CIGAR operation:', operation) if operation in _CONSUMES_QUERY: queryIndex += length if operation in _CONSUMES_REFERENCE: referenceIndex += length # Sanity check that we consumed the entire query. assert queryIndex == len(query) # We cannot test we consumed the entire reference. The CIGAR # string applies to (i.e., exhausts) the query and is silent about # the part of the reference that to the right of the aligned query. # Check the length restriction now that we have (possibly) added # queryInsertionChar characters to pad the query out to the length # it requires to match the reference. if len(alignedSequence) < minLength: continue # Put gap characters before and after the aligned sequence so that # it is offset properly and matches the length of the reference. paddedSequence = ( (padChar * referenceStart) + alignedSequence + padChar * (referenceLength - (referenceStart + len(alignedSequence)))) if allowDuplicateIds: suffix = '' else: count = idCount[read.query_name] idCount[read.query_name] += 1 suffix = '' if count == 0 else '/%d' % count yield Read('%s%s' % (read.query_name, suffix), paddedSequence)
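The CIGAR walk in queries() depends on knowing which operations consume query bases and which consume reference bases (the _CONSUMES_QUERY and _CONSUMES_REFERENCE sets it refers to). A short sketch of how such sets can be built from pysam's CIGAR constants, with a toy helper and a hand-written CIGAR, is shown below; the set names and the helper are local to this sketch.

# Sketch of query/reference consumption sets for a CIGAR walk like the one
# above, built from pysam's CIGAR operation constants (SAM spec, section 1.4).
from pysam import (CMATCH, CINS, CDEL, CREF_SKIP, CSOFT_CLIP,
                   CHARD_CLIP, CPAD, CEQUAL, CDIFF)

CONSUMES_QUERY = {CMATCH, CINS, CSOFT_CLIP, CEQUAL, CDIFF}
CONSUMES_REFERENCE = {CMATCH, CDEL, CREF_SKIP, CEQUAL, CDIFF}

def reference_span(cigartuples):
    # Number of reference bases covered by the alignment's CIGAR.
    return sum(length for op, length in cigartuples if op in CONSUMES_REFERENCE)

# Hand-written CIGAR 5S10M2I3D20M: the reference span is 10 + 3 + 20 = 33.
print(reference_span([(CSOFT_CLIP, 5), (CMATCH, 10), (CINS, 2),
                      (CDEL, 3), (CMATCH, 20)]))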
def tagcount(sam, out, genemap, output_evidence_table, positional, minevidence, cb_histogram, cb_cutoff, no_scale_evidence, subsample, sparse, parse_tags, gene_tags): ''' Count up evidence for tagged molecules ''' from pysam import AlignmentFile from io import StringIO import pandas as pd from utils import weigh_evidence logger.info('Reading optional files') gene_map = None if genemap: with open(genemap) as fh: try: gene_map = dict(p.strip().split() for p in fh) except ValueError: logger.error('Incorrectly formatted gene_map, need to be tsv.') sys.exit() if positional: tuple_template = '{0},{1},{2},{3}' else: tuple_template = '{0},{1},{3}' if not cb_cutoff: cb_cutoff = 0 if cb_histogram and cb_cutoff == "auto": cb_cutoff = guess_depth_cutoff(cb_histogram) cb_cutoff = int(cb_cutoff) cb_hist = None filter_cb = False if cb_histogram: cb_hist = pd.read_csv(cb_histogram, index_col=0, header=-1, squeeze=True, sep="\t") total_num_cbs = cb_hist.shape[0] cb_hist = cb_hist[cb_hist > cb_cutoff] logger.info('Keeping {} out of {} cellular barcodes.'.format(cb_hist.shape[0], total_num_cbs)) filter_cb = True parser_re = re.compile('.*:CELL_(?P<CB>.*):UMI_(?P<MB>.*)') if subsample: logger.info('Creating reservoir of subsampled reads ({} per cell)'.format(subsample)) start_sampling = time.time() reservoir = collections.defaultdict(list) cb_hist_sampled = 0 * cb_hist cb_obs = 0 * cb_hist track = stream_bamfile(sam) current_read = 'none_observed_yet' for i, aln in enumerate(track): if aln.qname == current_read: continue current_read = aln.qname if parse_tags: CB = aln.get_tag('CR') else: match = parser_re.match(aln.qname) CB = match.group('CB') if CB not in cb_hist.index: continue cb_obs[CB] += 1 if len(reservoir[CB]) < subsample: reservoir[CB].append(i) cb_hist_sampled[CB] += 1 else: s = pd.np.random.randint(0, cb_obs[CB]) if s < subsample: reservoir[CB][s] = i index_filter = set(itertools.chain.from_iterable(reservoir.values())) sam_file.close() sampling_time = time.time() - start_sampling logger.info('Sampling done - {:.3}s'.format(sampling_time)) evidence = collections.defaultdict(int) logger.info('Tallying evidence') start_tally = time.time() sam_mode = 'r' if sam.endswith(".sam") else 'rb' sam_file = AlignmentFile(sam, mode=sam_mode) targets = [x["SN"] for x in sam_file.header["SQ"]] track = sam_file.fetch(until_eof=True) count = 0 unmapped = 0 kept = 0 nomatchcb = 0 current_read = 'none_observed_yet' count_this_read = True missing_transcripts = set() for i, aln in enumerate(track): if count and not count % 1000000: logger.info("Processed %d alignments, kept %d." % (count, kept)) logger.info("%d were filtered for being unmapped." % unmapped) if filter_cb: logger.info("%d were filtered for not matching known barcodes." 
% nomatchcb) count += 1 if aln.is_unmapped: unmapped += 1 continue if gene_tags and not aln.has_tag('GX'): unmapped += 1 continue if aln.qname != current_read: current_read = aln.qname if subsample and i not in index_filter: count_this_read = False continue else: count_this_read = True else: if not count_this_read: continue if parse_tags: CB = aln.get_tag('CR') else: match = parser_re.match(aln.qname) CB = match.group('CB') if filter_cb: if CB not in cb_hist.index: nomatchcb += 1 continue if parse_tags: MB = aln.get_tag('UM') else: MB = match.group('MB') if gene_tags: target_name = aln.get_tag('GX').split(',')[0] else: txid = sam_file.getrname(aln.reference_id) if gene_map: if txid in gene_map: target_name = gene_map[txid] else: missing_transcripts.add(txid) target_name = txid else: target_name = txid e_tuple = tuple_template.format(CB, target_name, aln.pos, MB) # Scale evidence by number of hits if no_scale_evidence: evidence[e_tuple] += 1.0 else: evidence[e_tuple] += weigh_evidence(aln.tags) kept += 1 tally_time = time.time() - start_tally if missing_transcripts: logger.warn('The following transcripts were missing gene_ids, so we added them as the transcript ids: %s' % str(missing_transcripts)) logger.info('Tally done - {:.3}s, {:,} alns/min'.format(tally_time, int(60. * count / tally_time))) logger.info('Collapsing evidence') logger.info('Writing evidence') with tempfile.NamedTemporaryFile('w+t') as out_handle: for key in evidence: line = '{},{}\n'.format(key, evidence[key]) out_handle.write(line) out_handle.flush() out_handle.seek(0) evidence_table = pd.read_csv(out_handle, header=None) del evidence evidence_query = 'evidence >= %f' % minevidence if positional: evidence_table.columns=['cell', 'gene', 'umi', 'pos', 'evidence'] collapsed = evidence_table.query(evidence_query).groupby(['cell', 'gene'])['umi', 'pos'].size() else: evidence_table.columns=['cell', 'gene', 'umi', 'evidence'] collapsed = evidence_table.query(evidence_query).groupby(['cell', 'gene'])['umi'].size() expanded = collapsed.unstack().T if gene_map: # This Series is just for sorting the index genes = pd.Series(index=set(gene_map.values())) genes = genes.sort_index() # Now genes is assigned to a DataFrame genes = expanded.ix[genes.index] elif gene_tags: expanded.sort_index() genes = expanded else: # make data frame have a complete accounting of transcripts targets = pd.Series(index=set(targets)) targets = targets.sort_index() expanded = expanded.reindex(targets.index.values, fill_value=0) genes = expanded genes.fillna(0, inplace=True) genes = genes.astype(int) genes.index.name = "gene" logger.info('Output results') if subsample: cb_hist_sampled.to_csv('ss_{}_'.format(subsample) + os.path.basename(cb_histogram), sep='\t') if output_evidence_table: import shutil buf.seek(0) with open(output_evidence_table, 'w') as etab_fh: shutil.copyfileobj(buf, etab_fh) if sparse: pd.Series(genes.index).to_csv(out + ".rownames", index=False, header=False) pd.Series(genes.columns.values).to_csv(out + ".colnames", index=False, header=False) with open(out, "w+b") as out_handle: scipy.io.mmwrite(out_handle, scipy.sparse.csr_matrix(genes)) else: genes.to_csv(out)
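The collapse from per-(cell, gene, UMI) evidence to a gene-by-cell table is done with pandas. A tiny, self-contained sketch of that step is shown below; the barcodes, gene name and evidence values are made up.

# Tiny sketch of the collapse step: evidence keyed by "cell,gene,umi" strings
# becomes a gene x cell table of distinct UMIs passing a threshold.
import pandas as pd

evidence = {"CB1,geneA,UMI1": 2.0, "CB1,geneA,UMI2": 0.5, "CB2,geneA,UMI3": 1.0}
minevidence = 1.0

rows = [key.split(",") + [value] for key, value in evidence.items()]
table = pd.DataFrame(rows, columns=["cell", "gene", "umi", "evidence"])
table["evidence"] = table["evidence"].astype(float)
collapsed = (table.query("evidence >= @minevidence")
                  .groupby(["cell", "gene"])["umi"].size())
counts = collapsed.unstack().T.fillna(0).astype(int)
print(counts)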
def species_pileup(species_id, args, tempdir, outputdir, contig_file, contigs_db_stats): # Read in contigs information for current species_id contigs = {} contigs_db_stats[ 'species_counts'] += 1 # not being updated and passed as expected with InputStream(contig_file) as file: for rec in Bio.SeqIO.parse(file, 'fasta'): contigs[rec.id] = { "species_id": species_id, "contig_len": int(len(rec.seq)), "contig_seq": str(rec.seq), } contigs_db_stats['total_length'] += contigs[rec.id]["contig_len"] contigs_db_stats['total_seqs'] += 1 # Summary statistics aln_stats = { "genome_length": 0, "total_depth": 0, "covered_bases": 0, "aligned_reads": 0, "mapped_reads": 0, } def keep_read(x): return keep_read_worker(x, args, aln_stats) header = [ 'ref_id', 'ref_pos', 'ref_allele', 'depth', 'count_a', 'count_c', 'count_g', 'count_t' ] path = f"{outputdir}/{species_id}.snps.lz4" with OutputStream(path) as file: file.write('\t'.join(header) + '\n') zero_rows_allowed = not args.sparse # Loop over alignment for current species's contigs with AlignmentFile(f"{tempdir}/repgenomes.bam") as bamfile: for contig_id in sorted(list(contigs.keys())): # why need to sort? contig = contigs[contig_id] counts = bamfile.count_coverage( contig_id, start=0, end=contig["contig_len"], quality_threshold=args.aln_baseq, read_callback=keep_read) for ref_pos in range(0, contig["contig_len"]): ref_allele = contig["contig_seq"][ref_pos] depth = sum([counts[nt][ref_pos] for nt in range(4)]) count_a = counts[0][ref_pos] count_c = counts[1][ref_pos] count_g = counts[2][ref_pos] count_t = counts[3][ref_pos] values = [ contig_id, ref_pos + 1, ref_allele, depth, count_a, count_c, count_g, count_t ] if depth > 0 or zero_rows_allowed: file.write('\t'.join(str(val) for val in values) + '\n') aln_stats['genome_length'] += 1 aln_stats['total_depth'] += depth if depth > 0: aln_stats['covered_bases'] += 1 tsprint(json.dumps({species_id: aln_stats}, indent=4)) return (species_id, {k: str(v) for k, v in aln_stats.items()})
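species_pileup relies on AlignmentFile.count_coverage() with a read_callback to drop unwanted alignments before the per-base counts are made. A minimal sketch of that call is shown below; the BAM path, contig name and thresholds are placeholders, and an indexed BAM is assumed.

# Minimal sketch of count_coverage() with a read-level filter, as used above.
from pysam import AlignmentFile

def keep_read(read, min_mapq=20):
    # Drop secondary/supplementary alignments and low mapping-quality reads.
    return (not read.is_secondary and not read.is_supplementary
            and read.mapping_quality >= min_mapq)

with AlignmentFile("repgenomes.bam") as bam:
    count_a, count_c, count_g, count_t = bam.count_coverage(
        "contig_1", start=0, end=1000,
        quality_threshold=30, read_callback=keep_read)
    print(count_a[0] + count_c[0] + count_g[0] + count_t[0])  # depth at first base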
def fasttagcount(sam, out, genemap, positional, minevidence, cb_histogram, cb_cutoff, subsample, parse_tags, gene_tags, umi_matrix): ''' Count up evidence for tagged molecules, this implementation assumes the alignment file is coordinate sorted ''' from pysam import AlignmentFile from io import StringIO import pandas as pd from utils import weigh_evidence if sam.endswith(".sam"): logger.error( "To use the fasttagcount subcommand, the alignment file must be a " "coordinate sorted, indexed BAM file.") sys.exit(1) logger.info('Reading optional files') gene_map = None if genemap: with open(genemap) as fh: try: gene_map = dict(p.strip().split() for p in fh) except ValueError: logger.error('Incorrectly formatted gene_map, need to be tsv.') sys.exit() if positional: tuple_template = '{0},{1},{2},{3}' else: tuple_template = '{0},{1},{3}' if not cb_cutoff: cb_cutoff = 0 if cb_histogram and cb_cutoff == "auto": cb_cutoff = guess_depth_cutoff(cb_histogram) cb_cutoff = int(cb_cutoff) cb_hist = None filter_cb = False if cb_histogram: cb_hist = pd.read_table(cb_histogram, index_col=0, header=-1, squeeze=True) total_num_cbs = cb_hist.shape[0] cb_hist = cb_hist[cb_hist > cb_cutoff] logger.info('Keeping {} out of {} cellular barcodes.'.format( cb_hist.shape[0], total_num_cbs)) filter_cb = True parser_re = re.compile('.*:CELL_(?P<CB>.*):UMI_(?P<MB>.*)') if subsample: logger.info( 'Creating reservoir of subsampled reads ({} per cell)'.format( subsample)) start_sampling = time.time() reservoir = collections.defaultdict(list) cb_hist_sampled = 0 * cb_hist cb_obs = 0 * cb_hist track = stream_bamfile(sam) current_read = 'none_observed_yet' for i, aln in enumerate(track): if aln.qname == current_read: continue current_read = aln.qname if parse_tags: CB = aln.get_tag('CR') else: match = parser_re.match(aln.qname) CB = match.group('CB') if CB not in cb_hist.index: continue cb_obs[CB] += 1 if len(reservoir[CB]) < subsample: reservoir[CB].append(i) cb_hist_sampled[CB] += 1 else: s = pd.np.random.randint(0, cb_obs[CB]) if s < subsample: reservoir[CB][s] = i index_filter = set(itertools.chain.from_iterable(reservoir.values())) sam_file.close() sampling_time = time.time() - start_sampling logger.info('Sampling done - {:.3}s'.format(sampling_time)) evidence = collections.defaultdict(lambda: collections.defaultdict(float)) bare_evidence = collections.defaultdict(float) logger.info('Tallying evidence') start_tally = time.time() sam_mode = 'r' if sam.endswith(".sam") else 'rb' sam_file = AlignmentFile(sam, mode=sam_mode) transcript_map = collections.defaultdict(set) sam_transcripts = [x["SN"] for x in sam_file.header["SQ"]] if gene_map: for transcript, gene in gene_map.items(): if transcript in sam_transcripts: transcript_map[gene].add(transcript) else: for transcript in sam_transcripts: transcript_map[transcript].add(transcript) missing_transcripts = set() alignments_processed = 0 unmapped = 0 kept = 0 nomatchcb = 0 current_read = 'none_observed_yet' current_transcript = None count_this_read = True transcripts_processed = 0 genes_processed = 0 cells = list(cb_hist.index) targets_seen = set() if umi_matrix: bare_evidence_handle = open(umi_matrix, "w") bare_evidence_handle.write(",".join(["gene"] + cells) + "\n") with open(out, "w") as out_handle: out_handle.write(",".join(["gene"] + cells) + "\n") for gene, transcripts in transcript_map.items(): for transcript in transcripts: for aln in sam_file.fetch(transcript): alignments_processed += 1 if aln.is_unmapped: unmapped += 1 continue if gene_tags and not aln.has_tag('GX'): 
unmapped += 1 continue if aln.qname != current_read: current_read = aln.qname if subsample and i not in index_filter: count_this_read = False continue else: count_this_read = True else: if not count_this_read: continue if parse_tags: CB = aln.get_tag('CR') else: match = parser_re.match(aln.qname) CB = match.group('CB') if filter_cb: if CB not in cb_hist.index: nomatchcb += 1 continue if parse_tags: MB = aln.get_tag('UM') else: MB = match.group('MB') if gene_tags: target_name = aln.get_tag('GX').split(',')[0] else: txid = sam_file.getrname(aln.reference_id) if gene_map: if txid in gene_map: target_name = gene_map[txid] else: missing_transcripts.add(txid) continue else: target_name = txid targets_seen.add(target_name) # Scale evidence by number of hits evidence[CB][MB] += weigh_evidence(aln.tags) bare_evidence[CB] += weigh_evidence(aln.tags) kept += 1 transcripts_processed += 1 if not transcripts_processed % 1000: logger.info("%d genes processed." % genes_processed) logger.info("%d transcripts processed." % transcripts_processed) logger.info("%d alignments processed." % alignments_processed) earray = [] for cell in cells: umis = [ 1 for _, v in evidence[cell].items() if v >= minevidence ] earray.append(str(sum(umis))) out_handle.write(",".join([gene] + earray) + "\n") earray = [] if umi_matrix: for cell in cells: earray.append(str(int(bare_evidence[cell]))) bare_evidence_handle.write(",".join([gene] + earray) + "\n") evidence = collections.defaultdict( lambda: collections.defaultdict(int)) bare_evidence = collections.defaultdict(int) genes_processed += 1 if umi_matrix: bare_evidence_handle.close() # fill dataframe with missing values, sort and output df = pd.read_csv(out, index_col=0, header=0) targets = pd.Series(index=set(transcript_map.keys())) targets = targets.sort_index() df = df.reindex(targets.index.values, fill_value=0) df = df.sort_index() df.to_csv(out) if umi_matrix: df = pd.read_csv(umi_matrix, index_col=0, header=0) df = df.reindex(targets.index.values, fill_value=0) df = df.sort_index() df.to_csv(umi_matrix)
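The speed of fasttagcount comes from fetching one reference sequence at a time from a coordinate-sorted, indexed BAM instead of streaming the whole file. A small sketch of that per-reference access pattern is shown below; the path is a placeholder.

# Sketch of the per-reference access pattern fasttagcount relies on: with a
# coordinate-sorted, indexed BAM each reference can be fetched on its own.
from pysam import AlignmentFile

with AlignmentFile("aligned.bam", "rb") as bam:
    for reference in bam.references:
        n_alignments = bam.count(reference)  # alignments mapped to this reference
        print(reference, n_alignments)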
from pysam import AlignmentFile
from pyfaidx import Fasta


def has_mismatch_in_interval(reference, bamfile, chrom, start, end):
    """
    Return whether there is a mismatch in the interval (start, end) in any read
    mapping to the given chromosome.

    reference -- a pyfaidx.Fasta object or something that behaves similarly
    """
    for column in bamfile.pileup(chrom, start, end):
        refbase = reference[chrom][column.pos:column.pos + 1]
        for piledup in column.pileups:
            if piledup.indel != 0:
                # Insertion is positive; deletion is negative
                # Ignore indels
                continue
            if piledup.query_position is None:
                # Deletion or reference skip at this column: no query base to compare
                continue
            querybase = piledup.alignment.query_sequence[piledup.query_position]
            if refbase != querybase:
                # Mismatch
                return True
    return False


ref = Fasta('reference.fasta')
bamfile = AlignmentFile('mappedreads.bam')
has_mismatch_in_interval(ref, bamfile, 'scaffold17', 1000, 2000)
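pileup() in the example above needs a coordinate-sorted BAM with an index next to it. A short sketch of creating the index with pysam before calling the function, using the same placeholder path, is shown below.

# pileup() needs a coordinate-sorted, indexed BAM; the index can be created
# with pysam if it is missing.
import os
import pysam

if not os.path.exists('mappedreads.bam.bai'):
    pysam.index('mappedreads.bam')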
def tagcount(sam, out, genemap, output_evidence_table, positional, minevidence): ''' Count up evidence for tagged molecules ''' from pysam import AlignmentFile from cStringIO import StringIO import pandas as pd from utils import weigh_evidence logger.info('Reading optional files') gene_map = None if genemap: with open(genemap) as fh: gene_map = dict(p.strip().split() for p in fh) if positional: tuple_template = '{0},{1},{2},{3}' else: tuple_template = '{0},{1},{3}' parser_re = re.compile('.*:CELL_(?P<CB>.*):UMI_(?P<MB>.*)') logger.info('Tallying evidence') start_tally = time.time() evidence = collections.defaultdict(int) sam_file = AlignmentFile(sam, mode='r') track = sam_file.fetch(until_eof=True) for i, aln in enumerate(track): if aln.is_unmapped: continue match = parser_re.match(aln.qname) CB = match.group('CB') MB = match.group('MB') txid = sam_file.getrname(aln.reference_id) if gene_map: target_name = gene_map[txid] else: target_name = txid e_tuple = tuple_template.format(CB, target_name, aln.pos, MB) # Scale evidence by number of hits evidence[e_tuple] += weigh_evidence(aln.tags) tally_time = time.time() - start_tally logger.info('Tally done - {:.3}s, {:,} alns/min'.format(tally_time, int(60. * i / tally_time))) logger.info('Collapsing evidence') buf = StringIO() for key in evidence: line = '{},{}\n'.format(key, evidence[key]) buf.write(line) buf.seek(0) evidence_table = pd.read_csv(buf) evidence_query = 'evidence >= %f' % minevidence if positional: evidence_table.columns=['cell', 'gene', 'umi', 'pos', 'evidence'] collapsed = evidence_table.query(evidence_query).groupby(['cell', 'gene'])['umi', 'pos'].size() else: evidence_table.columns=['cell', 'gene', 'umi', 'evidence'] collapsed = evidence_table.query(evidence_query).groupby(['cell', 'gene'])['umi'].size() expanded = collapsed.unstack().T if gene_map: # This Series is just for sorting the index genes = pd.Series(index=set(gene_map.values())) genes = genes.sort_index() # Now genes is assigned to a DataFrame genes = expanded.ix[genes.index] else: genes = expanded genes.replace(pd.np.nan, 0, inplace=True) logger.info('Output results') if output_evidence_table: import shutil buf.seek(0) with open(output_evidence_table, 'w') as etab_fh: shutil.copyfileobj(buf, etab_fh) genes.to_csv(out)
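This older tagcount relies on the same read-name convention as the later versions: the cellular barcode and UMI are recovered from names produced by fastqtransform. A short sketch of that parsing on a made-up read name:

import re

parser_re = re.compile('.*:CELL_(?P<CB>.*):UMI_(?P<MB>.*)')
match = parser_re.match('HWI-D00365:SRR123:CELL_ACGTACGT:UMI_TTGCA')
print(match.group('CB'), match.group('MB'))  # prints: ACGTACGT TTGCA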
def load_hic_data_from_bam(fnam, resolution, biases=None, tmpdir='.', ncpus=8, filter_exclude=(1, 2, 3, 4, 6, 7, 8, 9, 10), region=None, verbose=True, clean=True): """ :param fnam: TADbit-generated BAM file with read-ends1 and read-ends2 :param resolution: the resolution of the experiment (size of a bin in bases) :param None biases: path to pickle file where are stored the biases. Keys in this file should be: 'biases', 'badcol', 'decay' and 'resolution' :param '.' tmpdir: path to folder where to create temporary files :param 8 ncpus: :param (1, 2, 3, 4, 6, 7, 8, 9, 10) filter exclude: filters to define the set of valid pair of reads. :param None region: chromosome name, if None, all genome will be loaded :returns: HiC_data object """ bam = AlignmentFile(fnam) genome_seq = OrderedDict((c, l) for c, l in zip(bam.references, [x / resolution + 1 for x in bam.lengths])) bam.close() sections = [] for crm in genome_seq: len_crm = genome_seq[crm] sections.extend([(crm, i) for i in xrange(len_crm)]) size = sum(genome_seq.values()) chromosomes = {region: genome_seq[region]} if region else genome_seq dict_sec = dict([(j, i) for i, j in enumerate(sections)]) imx = HiC_data((), size, chromosomes=chromosomes, dict_sec=dict_sec, resolution=resolution) if biases: if isinstance(biases, basestring): biases = load(open(biases)) if biases['resolution'] != resolution: raise Exception('ERROR: resolution of biases do not match to the ' 'one wanted (%d vs %d)' % ( biases['resolution'], resolution)) if region: chrom_start = 0 for crm in genome_seq: if crm == region: break len_crm = genome_seq[crm] chrom_start += len_crm imx.bads = dict((b - chrom_start, biases['badcol'][b]) for b in biases['badcol']) imx.bias = dict((b - chrom_start, biases['biases'][b]) for b in biases['biases']) else: imx.bads = biases['badcol'] imx.bias = biases['biases'] imx.expected = biases['decay'] get_matrix(fnam, resolution, biases=None, filter_exclude=filter_exclude, normalization='raw', tmpdir=tmpdir, clean=clean, ncpus=ncpus, dico=imx, region1=region, verbose=verbose) imx._symmetricize() imx.symmetricized = True return imx
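The binning in load_hic_data_from_bam is driven entirely by the BAM header: each chromosome contributes length // resolution + 1 bins. A Python 3 sketch of that header-driven binning is shown below (the function itself is written in Python 2 style: xrange, basestring, integer '/'); the path and the 100 kb resolution are placeholders.

# Python 3 sketch of the header-driven binning used above.
from collections import OrderedDict
from pysam import AlignmentFile

resolution = 100000
with AlignmentFile('hic.bam') as bam:
    genome_seq = OrderedDict((chrom, length // resolution + 1)
                             for chrom, length in zip(bam.references, bam.lengths))

sections = [(chrom, i) for chrom, nbins in genome_seq.items() for i in range(nbins)]
dict_sec = {section: index for index, section in enumerate(sections)}
print(sum(genome_seq.values()), 'bins in total')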