def mapping(sample, directory,
            reference="/home2/zhaowen/project/EVTpro/reference/EVTREF_VP.fa",
            threads=8):
    """Align a paired-end sample with bwa and keep only same-reference pairs.

    Steps:
      1. ``bwa mem`` on ``{directory}/cleandata/{sample}_1.fq.gz`` and
         ``_2.fq.gz``, sorted into ``{directory}/bam/{sample}.bam``.
      2. Dump the header and write a ``-F 2052`` filtered copy
         (``{sample}.temp.bam``) plus its index.
      3. Keep only the records whose 7th tab-separated field is ``"="``.
      4. Re-attach the header, convert to ``{sample}.final.bam``, index it and
         remove the intermediate files.

    :param sample: sample name used to build the FASTQ/BAM file names.
    :param directory: working directory containing ``cleandata/``.
    :param reference: bwa-indexed reference FASTA (defaults to the path the
        function previously hard-coded).
    :param threads: thread count passed to ``bwa mem -t`` and
        ``samtools sort -@`` (default 8, the previously hard-coded value).
    """
    # NOTE(review): the commands are run through the shell with the arguments
    # interpolated verbatim -- only call this with trusted sample/directory
    # names that contain no shell metacharacters.
    cmd = """
        mkdir -p {directory}/bam
        bwa mem -t {threads} {reference} \\
            {directory}/cleandata/{sample}_1.fq.gz \\
            {directory}/cleandata/{sample}_2.fq.gz \\
            | samtools view -bSh - | samtools sort -@ {threads} - -o {directory}/bam/{sample}.bam
        samtools view {directory}/bam/{sample}.bam -H > {directory}/bam/{sample}.header
        samtools view {directory}/bam/{sample}.bam -F 2052 -bSh > {directory}/bam/{sample}.temp.bam
        samtools index {directory}/bam/{sample}.temp.bam
    """.format(sample=sample, directory=directory,
               reference=reference, threads=threads)
    os.system(cmd)

    bam_prefix = directory + "/bam/" + sample
    # Context managers guarantee both handles are released (and the SAM text is
    # flushed) even if iteration fails half-way through.
    with bamnostic.AlignmentFile(bam_prefix + ".temp.bam", "rb") as bamFile:
        with open(bam_prefix + ".sam", "w") as finalSam:
            for read in bamFile:
                record = str(read)
                # assumes str(read) renders a tab-separated SAM record, so
                # field 7 is RNEXT -- TODO confirm against bamnostic
                if record.split("\t")[6] == "=":
                    finalSam.write(record + "\n")

    cmd2 = """
        cat {directory}/bam/{sample}.header {directory}/bam/{sample}.sam | samtools view -bSh - > {directory}/bam/{sample}.final.bam
        rm {directory}/bam/{sample}.header {directory}/bam/{sample}.sam {directory}/bam/{sample}.temp.bam*
        samtools index {directory}/bam/{sample}.final.bam
    """.format(sample=sample, directory=directory)
    os.system(cmd2)
def parse_realigned_bam_raw(bam_in, fa_idx_f, min_sup_reads, min_tr_coverage, min_read_coverage):
    """Count UMIs per barcode and transcript from a transcript-aligned BAM.

    Every mapped, non-secondary record is assigned to its reference
    transcript; the UMI parsed from the read name is appended to
    ``bc_tr_count_dict[barcode][transcript]``.

    :param bam_in: path of the BAM file aligned against transcripts.
    :param fa_idx_f: FASTA index (``.fai``-like) file; column 1 is the
        transcript name, column 2 its length.
    :param min_sup_reads: a transcript is kept when strictly more than this
        many reads cover over 90% of it.
    :param min_tr_coverage: unused here; kept for signature compatibility.
    :param min_read_coverage: unused here; kept for signature compatibility.
    :return: ``(bc_tr_count_dict, bc_tr_badcov_count_dict, tr_kept)``; the
        second element is always an empty dict in this function.
    """
    with open(fa_idx_f) as fa_idx_handle:
        fa_idx = dict((it.strip().split()[0], int(it.strip().split()[1]))
                      for it in fa_idx_handle)
    bc_tr_count_dict = {}
    bc_tr_badcov_count_dict = {}
    tr_cov_dict = {}
    cnt_stat = Counter()
    bamfile = bs.AlignmentFile(bam_in, "rb")
    for rec in bamfile.fetch(until_eof=True):
        if rec.is_unmapped or rec.is_secondary:  # or rec.mapping_quality==0:
            cnt_stat["not_counted"] += 1
            continue
        tr = rec.reference_name
        # This check must precede any fa_idx[tr] lookup: previously the lookup
        # came first, so an unannotated transcript raised KeyError and this
        # branch was unreachable.
        if tr not in fa_idx:
            cnt_stat["not_in_annotation"] += 1
            print("{} not in annotation ???".format(tr))
            continue
        tr_cov = float(rec.reference_end - rec.reference_start) / fa_idx[tr]
        tr_cov_dict.setdefault(tr, []).append(tr_cov)
        # assume cleaned barcode: "<barcode>_<umi>#<rest>"
        bc, umi = rec.query_name.split("#")[0].split("_")
        bc_tr_count_dict.setdefault(bc, {}).setdefault(tr, []).append(umi)
        cnt_stat["counted_reads"] += 1
    tr_kept = dict(
        (tr, tr) for tr, covs in tr_cov_dict.items()
        if sum(1 for it in covs if it > 0.9) > min_sup_reads)
    print("\t" + str(cnt_stat))
    return bc_tr_count_dict, bc_tr_badcov_count_dict, tr_kept
def selectBam(self):
    """Ask the user for a BAM file and open it as ``self.bam``.

    The index is expected next to the BAM as ``<bam path>.bai``. The header
    and the result of ``head(n=5)`` are printed after opening. If the dialog
    is cancelled nothing is opened and ``self.bam`` is left untouched.
    """
    bam_file_name = askopenfilename(title="Select Your File")
    if not bam_file_name:
        # A cancelled dialog yields an empty value; os.path.abspath("") would
        # silently turn that into the current directory.
        return
    bam_path = os.path.abspath(bam_file_name)
    bam_index_path = bam_path + ".bai"
    print(bam_path, bam_index_path)
    self.bam = bs.AlignmentFile(bam_path, filepath_index=bam_index_path)
    print(self.bam.header)
    print(self.bam.head(n=5))
def realigned_bam_coverage(bam_in, fa_idx_f, coverage_dir):
    """Summarise read coverage along transcripts from a realigned BAM.

    For every primary mapped read covering at least 30% of its transcript the
    relative start/end positions are collected per barcode and per transcript
    length bin (as returned by ``tr_len_range``). Two kinds of CSV are written
    into ``coverage_dir``:

    * ``transcript_cov_per_cell.<bin>.csv`` - one row per barcode with a
      200-bin histogram of relative alignment start/end positions;
    * ``transcript_cov.csv`` - one row per length bin with a 200-bin histogram
      of the covered fraction of the transcript.

    Soft-clip length and strand statistics are printed to stdout.

    :param bam_in: path of the transcript-aligned BAM file.
    :param fa_idx_f: FASTA index file (name in column 1, length in column 2).
    :param coverage_dir: existing directory receiving the CSV files.
    """
    with open(fa_idx_f) as fa_idx_handle:
        fa_idx = dict((it.strip().split()[0], int(it.strip().split()[1]))
                      for it in fa_idx_handle)
    left_clip_count = Counter()
    right_clip_count = Counter()
    tr_strand = Counter()
    bc_pct = {0: {}, 1: {}, 2: {}, 3: {}, 4: {}}
    gene_pct = {0: [], 1: [], 2: [], 3: [], 4: []}
    bamfile = bs.AlignmentFile(bam_in, "rb")
    for rec in bamfile.fetch(until_eof=True):
        if rec.is_unmapped or rec.is_supplementary or rec.is_secondary:
            continue
        # assume cleaned barcode: "<barcode>_<umi>#<rest>"
        bc, umi = rec.query_name.split("#")[0].split("_")
        map_st = rec.reference_start
        map_en = rec.reference_end
        tr_len = fa_idx[rec.reference_name]
        tr_cov = float(map_en - map_st) / tr_len
        if tr_cov < 0.3:
            continue
        if rec.cigar[0][0] == 4:  # BAM_CSOFT_CLIP
            left_clip_count[rec.cigar[0][1]] += 1
        if rec.cigar[-1][0] == 4:  # BAM_CSOFT_CLIP
            right_clip_count[rec.cigar[-1][1]] += 1
        tr_strand[rec.is_reverse] += 1
        len_bin = tr_len_range(tr_len)
        gene_pct[len_bin].append(tr_cov)
        positions = bc_pct[len_bin].setdefault(bc, [])
        positions.append(float(map_st) / tr_len)
        positions.append(float(map_en) / tr_len)

    print(left_clip_count.most_common(30))
    print(right_clip_count.most_common(30))
    print(tr_strand)
    # Debug preview of one barcode from the first length bin; guarded because
    # that bin can legitimately be empty.
    if bc_pct[0]:
        first_bc = next(iter(bc_pct[0]))
        print(np.histogram(bc_pct[0][first_bc], bins=200, range=(0, 1)))

    for i in bc_pct:
        per_cell_path = os.path.join(
            coverage_dir, "transcript_cov_per_cell.{}.csv".format(i))
        with open(per_cell_path, "w") as coverage_f:
            for bc in bc_pct[i]:
                lhi, _ = np.histogram(bc_pct[i][bc], bins=200, range=(0, 1))
                coverage_f.write("{},".format(bc) +
                                 ",".join(str(it) for it in lhi) + "\n")

    with open(os.path.join(coverage_dir, "transcript_cov.csv"), "w") as tr_cov_f:
        for i in gene_pct:
            lhi, _ = np.histogram(gene_pct[i], bins=200, range=(0, 1))
            tr_cov_f.write("{},".format(i) +
                           ",".join(str(it) for it in lhi) + "\n")
def read_10x_folder(folder):
    """Load the barcode list and open the position-sorted BAM of a 10x folder.

    Reads ``barcodes.tsv`` via ``read_single_column`` and opens
    ``possorted_genome_bam.bam`` from ``folder``.

    :return: ``(barcodes, bam_file)``
    """
    import bamnostic as bs

    barcode_path = os.path.join(folder, 'barcodes.tsv')
    bam_path = os.path.join(folder, 'possorted_genome_bam.bam')

    barcodes = read_single_column(barcode_path)
    bam_file = bs.AlignmentFile(bam_path, mode='rb')
    return barcodes, bam_file
def bam_allele_coverage(bam_in, chr_to_blocks, fa_f, cov_bin_f, vcf_f, cb_seq_dict, min_cnt=100, min_cov=50):
    """Count cell-barcode pairs that share an allele at known variant sites.

    For each block with at least ``min_cnt`` reads, every variant position
    with total A/C/G/T coverage of at least ``min_cov`` is piled up; the
    barcodes seen with each base are collected, and for each pair of bases the
    barcodes exclusive to one base are passed to ``update_corr_cnt``. The
    resulting pair counts are written to ``cov_bin_f`` as
    ``barcode1,barcode2,count`` rows.

    :param bam_in: path of the BAM file.
    :param chr_to_blocks: mapping chromosome -> blocks exposing ``.s``/``.e``.
    :param fa_f: reference FASTA read through ``get_fa``.
    :param cov_bin_f: output CSV path.
    :param vcf_f: VCF path (currently not opened, see the note below).
    :param cb_seq_dict: mapping whose keys are the accepted cell barcodes.
    :param min_cnt: minimum number of reads in a block.
    :param min_cov: minimum base coverage at a variant position.
    """
    fa_dict = {}
    vcf_dict = {}
    cb_seq_set = set(cb_seq_dict.keys())
    for c in get_fa(fa_f):
        fa_dict[c[0]] = c[1]
    bamfile = bs.AlignmentFile(bam_in, "rb")
    # NOTE(review): the VCF reader was disabled in the original
    # (`#vcf_in = bs.VariantFile(vcf_f)`), which left `vcf_in` undefined and
    # every lookup failing inside a bare `except`. It is bound to None here so
    # the failure is explicit; wire in a real reader to enable SNP lookups.
    vcf_in = None
    cb_corr_cnt = Counter()
    for ch in chr_to_blocks:
        print(ch)
        # NOTE(review): hard-coded chromosome filter kept from the original.
        if ch != "chr15":
            continue
        homo_dict = find_homo_regions(fa_dict[ch], chr_to_blocks[ch])  # result currently unused
        for bl in chr_to_blocks[ch]:
            cnt = bamfile.count(ch, bl.s, bl.e)
            try:
                vcf_dict = dict((it.pos - 1, it)
                                for it in vcf_in.fetch(ch[3:], bl.s, bl.e))
            except Exception:
                print("{} not in vcf.  {}".format(ch[3:], ch))
                # do not reuse the variants of a previous block
                vcf_dict = {}
            if cnt < min_cnt:
                continue
            # four array.arrays of the same length in order A C G T
            cov = bamfile.count_coverage(ch, bl.s, bl.e, quality_threshold=0)
            for v_pos in vcf_dict:
                offset = v_pos - bl.s
                if offset >= len(cov[0]):
                    print("SNP position exceed limit. {} {}".format(offset, len(cov[0])))
                    continue
                freq = (cov[0][offset], cov[1][offset], cov[2][offset], cov[3][offset])
                if sum(freq) < min_cov:
                    continue
                tmp_atcg_set = {}
                for pileupcolumn in bamfile.pileup(ch, v_pos, v_pos + 1, truncate=True,
                                                   min_base_quality=0,
                                                   ignore_overlaps=False,
                                                   max_depth=20000):
                    for pileupread in pileupcolumn.pileups:
                        if pileupread.is_del or pileupread.is_refskip:
                            continue
                        cb_seq, umi_seq = pileupread.alignment.query_name.split("#")[0].split("_")
                        if cb_seq in cb_seq_set:
                            base = pileupread.alignment.query_sequence[pileupread.query_position]
                            tmp_atcg_set.setdefault(base, set()).add(cb_seq)
                # Named `bases` rather than `bs`: rebinding `bs` inside the
                # function made the module alias a local and broke
                # `bs.AlignmentFile` above with UnboundLocalError.
                bases = list(tmp_atcg_set)
                for ab in range(len(bases) - 1):
                    for ab1 in range(ab + 1, len(bases)):
                        fst_cells = tmp_atcg_set[bases[ab]]
                        snd_cells = tmp_atcg_set[bases[ab1]]
                        tmp_set = fst_cells - snd_cells  # x not y
                        if len(tmp_set) > 1:
                            update_corr_cnt(list(tmp_set), cb_corr_cnt)
                        tmp_set = snd_cells - fst_cells  # y not x
                        if len(tmp_set) > 1:
                            update_corr_cnt(list(tmp_set), cb_corr_cnt)
    print(cb_corr_cnt.most_common(30))
    with open(cov_bin_f, "w") as cov_bin_out:
        for cbs in cb_corr_cnt:
            cov_bin_out.write("{},{},{}\n".format(cbs[0], cbs[1], cb_corr_cnt[cbs]))
def typingStat(sample, directory, baseCover):
    """Tally reads per mapped reference name in ``{sample}.final.bam``.

    The reference name is the second ``|``-separated token of the third
    tab-separated field of ``str(read)``. Only reads whose
    ``query_alignment_length`` exceeds ``baseCover`` are tallied.

    :return: list of ``(name, count)`` pairs sorted by ascending count.
    """
    bam_path = directory + "/bam/" + sample + ".final.bam"
    bamFile = bamnostic.AlignmentFile(bam_path, "rb")

    tally = Counter()
    for read in bamFile:
        # parsed for every read, before the length filter
        mapName = str(read).split("\t")[2].split("|")[1]
        if read.query_alignment_length > baseCover:
            tally[mapName] += 1

    return sorted(tally.items(), key=lambda pair: pair[1])
def exportIndexes(input_dir): import unique bam_dirs = unique.read_directory(input_dir) print 'Building BAM index files', for file in bam_dirs: if string.lower(file[-4:]) == '.bam': bam_dir = input_dir+'/'+file bamf = pysam.AlignmentFile(bam_dir, "rb" ) ### Is there an indexed .bai for the BAM? Check. try: for entry in bamf.fetch(): codes = map(lambda x: x[0],entry.cigar) break except Exception: ### Make BAM Indexv lciv9df8scivx print '.', bam_dir = str(bam_dir) #On Windows, this indexing step will fail if the __init__ pysam file line 51 is not set to - catch_stdout = False pysam.index(bam_dir)
# encoding:utf-8 # pzw # 20190214 ##### # update log # v0.2 更新了soft clip与map 的合并方式,根据先后顺序输出序列 # v0.3 更新阈值设置 # v0.4 加入正负链提示 # v1.0 读入文件为bam,自动读取质粒名称,自动判断是否比对到反义链 ##### import re import bamnostic import time ################################# bamFile = bamnostic.AlignmentFile("transgene.final.bam", "rb") results = open("results.txt", "w") basesCover = 30 showBases = 20 # 当设置为0时显示所有碱基;最终结果根据完全相同的值进行合并,设置值越大(0除外),结果越多 ################################# ## 提示信息 print "bases Cover set: " + str(basesCover) if showBases == 0: print "show all bases" else: print "show " + str(showBases) + " bases" print "-----------------------------------" start = time.clock() ## 读取头信息
# v1.3 修正过滤逻辑 # v1.2 部分连接位点之间有两边均比对不上的碱基,修复这部分的内容,调整结果呈现方式 # v1.1 改成传参模式,统计总reads数的方法调整 # v1.0 读入文件为bam,自动读取质粒名称,自动判断是否比对到反义链 # v0.4 加入正负链提示 # v0.3 更新阈值设置 # v0.2 更新了soft clip与map 的合并方式,根据先后顺序输出序列 ##### import re import bamnostic import time import sys from collections import Counter ################################# bamFile = bamnostic.AlignmentFile(sys.argv[1], "rb") results = open(sys.argv[2], "w") basesCover = int(sys.argv[3]) showBases = int(sys.argv[4]) # 当设置为0时显示所有碱基;最终结果根据完全相同的值进行合并,设置值越大(0除外),结果越多 maxGap = 10 ################################# ########## test ############### # bamFile = bamnostic.AlignmentFile("H04728D.CED1108.bam", "rb") # results = open("results.txt", "w") # basesCover = 30 # showBases = 10 # maxGap = 10 ################################# ## 提示信息
def parseJunctionEntries(bam_dir,multi=False, Species=None, ReferenceDir=None):
    """Extract splice junctions from a BAM file into '<bam>__junction.bed'.

    Reads whose CIGAR string contains an 'N' operation are treated as junction
    reads. Their junction coordinates (from getSpliceSites) are accumulated in
    junction_db and flushed with writeJunctionBedFile whenever the next
    junction read is on another reference or starts more than 5000 bases after
    the previous junction read.

    bam_dir      -- path of the BAM file (indexed first when no '.bai' exists)
    multi        -- when False, progress messages are printed
    Species      -- stored in the global IndicatedSpecies
    ReferenceDir -- stored in the global ExonReference

    Returns (bam_file, missing_chromosomes). Sets the globals bam_file,
    splicesite_db, IndicatedSpecies and ExonReference.
    """
    global bam_file
    global splicesite_db
    global IndicatedSpecies
    global ExonReference
    IndicatedSpecies = Species
    ExonReference = ReferenceDir
    bam_file = bam_dir
    try: splicesite_db,chromosomes_found, gene_coord_db = retreiveAllKnownSpliceSites()
    except Exception:
        ### Fall back to empty references when the known splice sites cannot be loaded
        print traceback.format_exc()
        splicesite_db={}; chromosomes_found={}

    start = time.time()
    ### Prefer an ordered dictionary; fall back to the ordereddict module, then to a plain dict
    try: import collections; junction_db=collections.OrderedDict()
    except Exception:
        try: import ordereddict; junction_db = ordereddict.OrderedDict()
        except Exception: junction_db={}
    original_junction_db = copy.deepcopy(junction_db) ### Empty template used to reset junction_db after each flush

    bam_index = os.path.isfile(bam_dir+'.bai')
    if bam_index==False:
        if multi == False: print 'Building BAM index file for', bam_dir
        from pysam import index
        index(bam_dir)
    bamf = pysam.AlignmentFile(bam_dir, "rb" )

    chromosome = False
    chromosomes={}
    bam_reads=0
    count=0
    jid = 1
    prior_jc_start=0
    l1 = None; l2=None
    o = open (string.replace(bam_dir,'.bam','__junction.bed'),"w")
    o.write('track name=junctions description="TopHat junctions"\n')
    export_isoform_models = False ### Isoform export is disabled; the guarded branches below never run
    if export_isoform_models:
        io = open (string.replace(bam_dir,'.bam','__isoforms.txt'),"w")
        isoform_junctions = copy.deepcopy(junction_db)
    outlier_start = 0; outlier_end = 0; read_count = 0; c=0
    for entry in bamf:
        bam_reads+=1
        cigarstring = entry.cigarstring
        if cigarstring != None:
            if 'N' in cigarstring: ### Hence a junction
                if prior_jc_start == 0: pass
                elif (entry.pos-prior_jc_start) > 5000 or entry.reference_name != chromosome: ### New chr or far from prior reads
                    writeJunctionBedFile(junction_db,jid,o)
                    #writeIsoformFile(isoform_junctions,io)
                    junction_db = copy.deepcopy(original_junction_db) ### Re-set this object
                    jid+=1

                chromosome = entry.reference_name
                chromosomes[chromosome]=[] ### keep track
                X=entry.reference_start
                #if entry.query_name == 'SRR791044.33673569':
                #print chromosome, entry.pos, entry.reference_length, entry.alen, entry.query_name
                Y=entry.reference_start+entry.reference_length
                prior_jc_start = X

                try: tophat_strand = entry.get_tag('XS') ### TopHat knows which sequences are likely real splice sites so it assigns a real strand to the read
                except Exception:
                    #if multi == False: print 'No TopHat strand information';sys.exit()
                    tophat_strand = None
                coordinates,up_to_intron_dist = getSpliceSites(entry.cigar,X)
                #if count > 100: sys.exit()
                #print entry.query_name,X, Y, entry.cigarstring, entry.cigar, tophat_strand
                for (five_prime_ss,three_prime_ss) in coordinates:
                    jc = five_prime_ss,three_prime_ss
                    #print X, Y, jc, entry.cigarstring, entry.cigar
                    ### Append to an existing junction entry, or create it on first sight
                    try: junction_db[chromosome,jc,tophat_strand].append([X,Y,up_to_intron_dist])
                    except Exception: junction_db[chromosome,jc,tophat_strand] = [[X,Y,up_to_intron_dist]]

                if export_isoform_models:
                    try:
                        mate = bamf.mate(entry) #https://groups.google.com/forum/#!topic/pysam-user-group/9HM6nx_f2CI
                        if 'N' in mate.cigarstring:
                            mate_coordinates,mate_up_to_intron_dist = getSpliceSites(mate.cigar,mate.pos)
                        else: mate_coordinates=[]
                    except Exception: mate_coordinates=[]
                    #print coordinates,mate_coordinates
                    junctions = map(lambda x: tuple(x),coordinates)
                    if len(mate_coordinates)>0:
                        try:
                            isoform_junctions[chromosome,tuple(junctions),tophat_strand].append(mate_coordinates)
                        except Exception:
                            isoform_junctions[chromosome,tuple(junctions),tophat_strand] = [mate_coordinates]
                    else:
                        if (chromosome,tuple(junctions),tophat_strand) not in isoform_junctions:
                            isoform_junctions[chromosome,tuple(junctions),tophat_strand] = []
                count+=1
    writeJunctionBedFile(junction_db,jid,o) ### One last read-out
    if multi == False: print bam_reads, count, time.time()-start, 'seconds required to parse the BAM file'
    o.close()
    bamf.close()

    ### Report reference chromosomes for which no junction read was seen
    ### NOTE(review): the inner test checks chromosomes_found again after stripping 'chr' - possibly meant `chromosomes`; confirm
    missing_chromosomes=[]
    for chr in chromosomes_found:
        if chr not in chromosomes:
            chr = string.replace(chr,'chr','')
            if chr not in chromosomes_found:
                if chr != 'M' and chr != 'MT':
                    missing_chromosomes.append(chr)
    #missing_chromosomes = ['A','B','C','D']
    try: bam_file = export.findFilename(bam_file)
    except Exception: pass
    return bam_file, missing_chromosomes
import sqlite3
import os
import bamnostic as bn

# Look up the first splice junction of each spliced read of hg19test.bam in the
# `splice` table of the sqlite database (stops after the 51st spliced read).

# get connection to the sqlite database
# (raw string: "\s" is not a valid escape sequence in a normal literal)
conn = sqlite3.connect(r"E:\speedSplice" + os.path.sep + 'splice.sqlite', isolation_level=None)
c = conn.cursor()
samfile = bn.AlignmentFile("hg19test.bam", "rb")
i = 0
for read in samfile:
    cigar = read.cigarstring
    # Only spliced reads are of interest. Check this before touching cigar[1]:
    # reads with a single CIGAR operation used to raise IndexError here.
    if cigar is None or "N" not in cigar or len(read.cigar) < 2:
        continue
    # assumes the first two CIGAR operations are the leading match and the
    # intron skip (e.g. 50M1000N50M) -- TODO confirm for clipped reads
    start = read.reference_start + read.cigar[0][1]
    stop = start + read.cigar[1][1]
    i += 1
    # Bound parameters instead of string concatenation: the old code added
    # ints to str (TypeError) and built the SQL text by hand.
    c.execute("SELECT * FROM splice WHERE from_pos=? AND to_pos=?", (start, stop))
    print (cigar, read.cigartuples, start, stop, read.reference_name)
    if i > 50:
        break
def test_check_index():
    """check_index() on a non-existent index path must emit a UserWarning."""
    missing_index = 'not_a_file.bai'
    with bs.AlignmentFile(bs.example_bam) as bam, pytest.warns(UserWarning):
        bam.check_index(missing_index)
def test_get_index():
    """Opening a BAM with a non-existent index_filename must emit a UserWarning."""
    missing_index = 'not_a_file.bai'
    with pytest.warns(UserWarning):
        unindexed_bam = bs.AlignmentFile(bs.example_bam,
                                         index_filename=missing_index)
def parse_bam(cls, r2_bam: str, bed: str) -> tuple:
    """
    Assign R2 reads to genes.

    An R2 read is a valid gene alignment if all of these criteria are met:
    The read aligns uniquely to a transcript sequence in the reference
    (mapping quality >= ``cls.min_mapping_qual``).
    The R2 alignment begins within the first ``cls.priming_window``
    nucleotides. This criterion ensures that the R2 read originates from an
    actual PCR priming event.
    The length of the alignment that can be a match or mismatch in the CIGAR
    string is > ``cls.total_cigar_m``.
    The read overlaps one of the gene intervals of the BED file (this is also
    what keeps reads aligning elsewhere, e.g. phiX174, out of the result).

    :param r2_bam: path of the R2 BAM file
    :param bed: path of a 4-column, tab-separated BED file
        (chromosome, start, end, gene symbol)
    :return: ``(r2_map_passed, r2_map_dropped)`` - a dict read name -> gene
        symbol, and the set of read names that failed any criterion
    """
    r2_map_passed = {}
    r2_map_dropped = set()
    # chromosome -> [(start, end, symbol), ...] in file order. Loaded once, on
    # the first read that needs it, instead of re-reading the file per read.
    genes_by_chrom = None

    def load_bed(path):
        intervals = {}
        with open(path, 'r') as bedf:
            for line in bedf:
                gene_pos, gene_start, gene_end, gene_symbol = line.split('\t')
                intervals.setdefault(gene_pos, []).append(
                    (gene_start, gene_end, gene_symbol.rstrip('\n')))
        return intervals

    def primes_in_window(cigar):
        # The first M operation must start less than priming_window bases into
        # the CIGAR. The former while/for loop accepted any read that had an M
        # operation anywhere, and left `priming` unset when there was none.
        offset = 0
        for operation, length in cigar:
            if operation == 0:
                return offset < cls.priming_window
            offset += length
        return False

    # read bam file
    log.info('R2: Processing BAM file')
    bam = bs.AlignmentFile(r2_bam, 'rb')
    try:
        for read in bam:
            # check if read is uniquely mapped
            if read.mapping_quality < cls.min_mapping_qual:
                r2_map_dropped.add(read.read_name)
                continue

            cigar = read.cigar or []
            # check if priming occurs in the first n nucleotides
            if not primes_in_window(cigar):
                r2_map_dropped.add(read.read_name)
                continue

            # check if the total CIGAR M-operation is > m
            total_m = sum(length for operation, length in cigar if operation == 0)
            if total_m <= cls.total_cigar_m:
                r2_map_dropped.add(read.read_name)
                continue

            if genes_by_chrom is None:
                genes_by_chrom = load_bed(bed)

            # the read is a valid gene alignment if at least 1 nt is overlapping
            # should be using query_length, query_alignment_length or reference_length?
            read_start = read.pos + 1  # 1-based transcription start
            read_end = read.pos + 1 + read.query_length  # 1-based transcription end
            for gene_start, gene_end, gene_symbol in genes_by_chrom.get(read.reference_name, ()):
                gene_start = int(gene_start)
                gene_end = int(gene_end)
                # check in which gene the read aligns
                if (gene_start <= read_start < gene_end) \
                        or (read_start < gene_start <= read_end) \
                        or (read_start <= gene_end < read_end):
                    r2_map_passed[read.read_name] = gene_symbol
                    break  # first matching BED line wins

            # drop good reads not mapping in any of the given genes
            if read.read_name not in r2_map_passed:
                r2_map_dropped.add(read.read_name)
    finally:
        bam.close()
    return r2_map_passed, r2_map_dropped
def get_all_SNV_table(bam_in, chr_to_blocks, transcript_to_exon, fa_f, out_dir, cb_seq_dict, bam_short, known_position_dict, min_cov=100, report_pct=(0.15,0.85)):
    """Scan exons for candidate SNVs and write per-cell allele count tables.

    For every exon (from get_gene_flat) with at least min_cov reads, each
    position (skipping the 5 bases at both exon ends) with coverage above
    min_cov is tested. A position is a candidate when the frequency of its most
    common non-reference base lies strictly inside report_pct, or when
    (chromosome, position) is in known_position_dict. Candidates passing the
    homopolymer / sequence-entropy / short-read filters are piled up to count
    reference and alternative alleles per cell barcode.

    Files written to <out_dir>/mutation: ref_cnt.csv.gz, alt_cnt.csv.gz,
    allele_stat.csv.gz and freq_summary.csv.

    bam_short may be None, in which case the short-read frequency is reported
    as -1. Returns None.
    """
    c2i = {"A":0, "C":1, "G":2, "T":3}  # four array.arrays of the same length in order A C G T
    fa_dict={}
    acc_pct = []  # reference-allele frequency of every sufficiently covered position
    REF_cnt_dict = {}  # (chr, pos) -> list of barcodes observed with the reference base
    ALT_cnt_dict = {}  # (chr, pos) -> list of barcodes observed with the ALT base
    cb_seq_set = set(cb_seq_dict.keys())
    reporting_summary = []
    for c in get_fa(fa_f):
        fa_dict[c[0]] = c[1]
    bamfile = bs.AlignmentFile(bam_in, "rb")
    if bam_short is not None:
        bam_s = bs.AlignmentFile(bam_short, "rb")
    cb_corr_cnt = Counter()
    for ch in chr_to_blocks:
        print ch
        homo_dict = find_homo_regions(fa_dict[ch], chr_to_blocks[ch])
        for ith, bl in enumerate(chr_to_blocks[ch]):
            tmp_bl_flat = get_gene_flat({"NNN":bl.transcript_list}, transcript_to_exon)
            for ex in tmp_bl_flat["NNN"]:
                cnt = bamfile.count(ch, ex[0], ex[1])
                if cnt < min_cov:
                    continue
                cov = bamfile.count_coverage(ch, ex[0], ex[1], quality_threshold=0)  # four array.arrays of the same length in order A C G T
                if len(cov[0])<20:
                    continue  # ignore tiny exons
                for i in range(5, len(cov[0])-5):  # ignore the bases at the beginning and the end (close to splicing site)
                    tot = float(cov[0][i]+cov[1][i]+cov[2][i]+cov[3][i])
                    v_pos = ex[0]+i
                    if tot>min_cov and (fa_dict[ch][v_pos]!="N"):
                        freq = cov[c2i[fa_dict[ch][v_pos]]][i]/tot  # reference-allele frequency
                        acc_pct.append(freq)
                        base_freq = [("A",cov[0][i]),("C",cov[1][i]),("G",cov[2][i]),("T",cov[3][i])]
                        base_freq.sort(key=lambda x:x[1],reverse=True)
                        # NOTE(review): hard-coded position, looks like leftover debugging output
                        if v_pos == 63318364:
                            print base_freq
                        ALT = [it[0] for it in base_freq if it[0] != fa_dict[ch][v_pos]][0]  # the most enriched ALT allele
                        alt_freq = cov[c2i[ALT]][i]/tot
                        if (report_pct[0]< alt_freq < report_pct[1]) or ((ch,v_pos) in known_position_dict):
                            tmp_atcg_set = {}
                            # Reference-allele frequency in the short-read BAM; -1 means "not available"
                            if bam_short is not None:
                                try:
                                    cov_s = bam_s.count_coverage(ch, v_pos, v_pos+1, quality_threshold=20)
                                    s_tot = cov_s[0][0]+cov_s[1][0]+cov_s[2][0]+cov_s[3][0]
                                    if s_tot> (min_cov/2):
                                        s_freq = cov_s[c2i[fa_dict[ch][v_pos]]][0]/float(s_tot)
                                    else:
                                        s_freq = -1
                                except:
                                    s_freq = -1
                            else:
                                s_freq = -1
                            seq_ent = seq_entropy(fa_dict[ch][(v_pos-10):(v_pos+10)])
                            indel_freq = -1
                            # Known positions are always examined; the others must pass all three filters
                            if ((ch,v_pos) in known_position_dict) or ((ex[0]+i not in homo_dict) and (seq_ent > 1) and (s_freq==-1 or (0.05<s_freq<0.95))):
                                for pileupcolumn in bamfile.pileup(ch, v_pos, v_pos+1,truncate=True, min_base_quality=0,ignore_overlaps=False,max_depth=20000):
                                    c_keep = 0  # reads carrying a base at this position
                                    c_del = 0  # reads with a deletion at this position
                                    for pileupread in pileupcolumn.pileups:
                                        if not pileupread.is_del:
                                            if not pileupread.is_refskip:
                                                c_keep += 1
                                                cb_seq, umi_seq = pileupread.alignment.query_name.split("#")[0].split("_")
                                                if cb_seq in cb_seq_set:
                                                    tmp_atcg_set.setdefault(pileupread.alignment.query_sequence[pileupread.query_position],Counter())[cb_seq] += 1
                                                    #tmp_set[cb_seq] += 1
                                                    if pileupread.alignment.query_sequence[pileupread.query_position] == fa_dict[ch][v_pos]:
                                                        REF_cnt_dict.setdefault((ch, v_pos),[]).append(cb_seq)
                                                    if pileupread.alignment.query_sequence[pileupread.query_position] == ALT:
                                                        ALT_cnt_dict.setdefault((ch, v_pos),[]).append(cb_seq)
                                        else:
                                            if not pileupread.is_refskip:
                                                c_del += 1
                                    indel_freq = c_del/float(c_keep+c_del)
                                    tmp_set = set()
                                    # Per base, keep only the barcodes observed at most twice
                                    for b in tmp_atcg_set:
                                        tmp_atcg_set[b] = set(it for it in tmp_atcg_set[b] if tmp_atcg_set[b][it]<=2)
                                    # Hypergeometric probability of the observed barcode overlap between the two most frequent bases
                                    if (base_freq[0][0] in tmp_atcg_set) and (base_freq[1][0] in tmp_atcg_set):
                                        tmp_set.update(tmp_atcg_set[base_freq[0][0]])
                                        tmp_set.update(tmp_atcg_set[base_freq[1][0]])
                                        rv = hypergeom(len(tmp_set), len(tmp_atcg_set[base_freq[0][0]]), len(tmp_atcg_set[base_freq[1][0]]))
                                        hpg_prob = rv.pmf(len(tmp_atcg_set[base_freq[0][0]].intersection(tmp_atcg_set[base_freq[1][0]])))
                                    else:
                                        hpg_prob = 1
                                    reporting_summary.append((ch, v_pos, fa_dict[ch][v_pos], ALT, freq, s_freq, hpg_prob, seq_ent, indel_freq))
    print "number:", len(reporting_summary)
    subfolder_name = "mutation"
    if not os.path.exists(os.path.join(out_dir,subfolder_name)):
        os.makedirs(os.path.join(out_dir,subfolder_name))
    # One row per position, one column per barcode (in cb_seq_dict key order)
    with gzip.open(os.path.join(out_dir,subfolder_name,"ref_cnt.csv.gz"),"wb") as ref_cnt_f:
        ref_cnt_f.write("chr,position,"+",".join(cb_seq_dict.keys())+"\n")  # write header
        for p in REF_cnt_dict:
            tmp_c = Counter(REF_cnt_dict[p])
            ref_cnt_f.write("{},{},".format(p[0],p[1])+",".join( str(tmp_c[it]) for it in cb_seq_dict.keys() )+"\n" )
    with gzip.open(os.path.join(out_dir,subfolder_name,"alt_cnt.csv.gz"),"wb") as alt_cnt_f:
        alt_cnt_f.write("chr,position,"+",".join(cb_seq_dict.keys())+"\n")  # write header
        for p in ALT_cnt_dict:
            tmp_c = Counter(ALT_cnt_dict[p])
            alt_cnt_f.write("{},{},".format(p[0],p[1])+",".join( str(tmp_c[it]) for it in cb_seq_dict.keys() )+"\n" )
    with gzip.open(os.path.join(out_dir,subfolder_name,"allele_stat.csv.gz"),"wb") as al_stat:
        al_stat.write("chr,position,REF,ALT,REF_frequency,REF_frequency_in_short_reads,hypergeom_test_p_value,sequence_entrophy,INDEL_frequency\n")  # write header
        for rec in reporting_summary:
            al_stat.write(",".join( str(it) for it in rec )+"\n" )
    # 500-bin histogram of the reference-allele frequencies
    pct_bin, pt = np.histogram(acc_pct, bins=500, range=(0, 1))
    with open(os.path.join(out_dir,subfolder_name,"freq_summary.csv"),"w") as cov_bin_out:
        for ix in range(500):
            cov_bin_out.write("{},{}\n".format(pt[ix],pct_bin[ix]))
def parse_realigned_bam(bam_in, fa_idx_f, min_sup_reads, min_tr_coverage, min_read_coverage, kwargs):
    """Assign each read to one transcript and collect UMIs per barcode.

    All alignments of a read are gathered, with the preferred alignment kept at
    the front of the list (higher ``AS`` tag first; on an equal score and equal
    aligned read fraction, the higher transcript coverage wins). A transcript
    is kept when strictly more than ``min_sup_reads`` alignments cover over 90%
    of it. Each read is then counted for its best kept transcript, or recorded
    in the "bad coverage" table when it is ambiguous or below the coverage
    thresholds.

    :param bam_in: path of the transcript-aligned BAM file.
    :param fa_idx_f: FASTA index file (name in column 1, length in column 2).
    :param min_sup_reads: minimum number (exclusive) of >90%-coverage
        alignments needed to keep a transcript.
    :param min_tr_coverage: minimum covered fraction of the transcript.
    :param min_read_coverage: minimum aligned fraction of the read.
    :param kwargs: plain dict of options; when it has the key ``"bc_file"``,
        barcodes are translated through ``make_bc_dict(kwargs["bc_file"])``.
    :return: ``(bc_tr_count_dict, bc_tr_badcov_count_dict, tr_kept)``.
    """
    with open(fa_idx_f) as fa_idx_handle:
        fa_idx = dict((it.strip().split()[0], int(it.strip().split()[1]))
                      for it in fa_idx_handle)
    bc_tr_count_dict = {}
    bc_tr_badcov_count_dict = {}
    tr_cov_dict = {}
    read_dict = {}
    cnt_stat = Counter()
    bamfile = bs.AlignmentFile(bam_in, "rb")
    translate_bc = "bc_file" in kwargs
    if translate_bc:
        bc_dict = make_bc_dict(kwargs["bc_file"])

    def _record(table, bc, tr, umi):
        # append umi under table[bc][tr], creating the levels on demand
        table.setdefault(bc, {}).setdefault(tr, []).append(umi)

    for rec in bamfile:
        if rec.is_unmapped or rec.seq == '*':
            cnt_stat["unmapped"] += 1
            continue
        tr = rec.reference_name
        # Must precede any fa_idx[tr] lookup: previously the lookup came first,
        # so an unannotated transcript raised KeyError before this check ran.
        if tr not in fa_idx:
            cnt_stat["not_in_annotation"] += 1
            print("\t{} not in annotation ???".format(tr))
            continue
        tr_cov = float(rec.reference_end - rec.reference_start) / fa_idx[tr]
        tr_cov_dict.setdefault(tr, []).append(tr_cov)
        inferred_read_length = query_len(rec.cigarstring)
        align_score = rec.get_tag("AS")
        read_cov = float(rec.query_alignment_length) / inferred_read_length
        # (transcript, alignment score, transcript coverage, read coverage, MAPQ)
        hit = (tr, align_score, tr_cov, read_cov, rec.mapping_quality)

        hits = read_dict.get(rec.query_name)
        if hits is None:
            read_dict[rec.query_name] = [hit]
            continue
        best = hits[0]
        if align_score > best[1]:
            hits.insert(0, hit)
        elif align_score == best[1] and read_cov == best[3]:  # same aligned sequence
            # choose the one with higher transcript coverage, might be internal TSS
            if tr_cov > best[2]:
                hits.insert(0, hit)
        else:
            hits.append(hit)

    tr_kept = dict(
        (tr, tr) for tr, covs in tr_cov_dict.items()
        if sum(1 for it in covs if it > 0.9) > min_sup_reads)

    for r in read_dict:
        tmp = [it for it in read_dict[r] if it[0] in tr_kept]
        if not tmp:
            cnt_stat["no_good_match"] += 1
            continue
        hit = tmp[0]
        # The read name may contain more than one "_": the UMI is taken to be
        # delimited from the barcode by the last "_".
        split_r = r.split("#")[0].split("_")
        bc, umi = split_r[-2], split_r[-1]
        if translate_bc:
            bc = bc_dict[bc]

        if len(tmp) == 1 and tmp[0][4] > 0:
            _record(bc_tr_count_dict, bc, hit[0], umi)
            cnt_stat["counted_reads"] += 1
        elif len(tmp) > 1 and tmp[0][1] == tmp[1][1] and tmp[0][3] == tmp[1][3]:
            # NOTE(review): hit[1] is the AS alignment score, compared with 0.8
            # here - possibly a leftover from an older tuple layout; confirm.
            if hit[1] > 0.8:
                _record(bc_tr_count_dict, bc, hit[0], umi)
                cnt_stat["counted_reads"] += 1
            else:
                cnt_stat["ambigious_reads"] += 1
                _record(bc_tr_badcov_count_dict, bc, hit[0], umi)
        elif hit[2] < min_tr_coverage or hit[3] < min_read_coverage:
            cnt_stat["not_enough_coverage"] += 1
            _record(bc_tr_badcov_count_dict, bc, hit[0], umi)
        else:
            _record(bc_tr_count_dict, bc, hit[0], umi)
            cnt_stat["counted_reads"] += 1
    print("\t" + str(cnt_stat))
    return bc_tr_count_dict, bc_tr_badcov_count_dict, tr_kept
def get_mito_SNV_table(bam_in, fa_f, out_dir, cb_seq_dict, bam_short, ch="chrM", min_cov=1000, report_pct=(0.15,0.85)):
    """Scan one chromosome (mitochondrial by default) for candidate SNVs.

    Every position with coverage above ``min_cov`` (skipping the first and
    last 5 bases) whose most common non-reference base has a frequency
    strictly inside ``report_pct`` is a candidate. Candidates passing the
    homopolymer / sequence-entropy / short-read filters are piled up to count
    reference and alternative alleles per cell barcode.

    Files written to ``<out_dir>/mutation``: ``MT_ref_cnt.csv``,
    ``MT_alt_cnt.csv``, ``MT_allele_stat.csv`` and ``MT_freq_summary.csv``.

    :param bam_in: path of the long-read BAM file.
    :param fa_f: reference FASTA read through ``get_fa``.
    :param out_dir: output directory (``mutation`` sub-folder is created).
    :param cb_seq_dict: mapping whose keys are the accepted cell barcodes.
    :param bam_short: optional short-read BAM used as a second opinion;
        ``None`` disables it (frequency reported as -1), matching
        ``get_all_SNV_table``.
    :param ch: chromosome name to scan.
    :param min_cov: minimum coverage for a position to be considered.
    :param report_pct: (low, high) bounds on the ALT allele frequency.
    """
    c2i = {"A": 0, "C": 1, "G": 2, "T": 3}  # row order of count_coverage: A C G T
    fa_dict = {}
    acc_pct = []
    REF_cnt_dict = {}
    ALT_cnt_dict = {}
    cb_seq_set = set(cb_seq_dict.keys())
    reporting_summary = []
    for c in get_fa(fa_f):
        fa_dict[c[0]] = c[1]
    ref_seq = fa_dict[ch]
    bl = namedtuple("bl", ["s", "e"])
    tmp_bl = bl(1, len(ref_seq) - 1)
    bamfile = bs.AlignmentFile(bam_in, "rb")
    # bam_short is optional: previously None crashed while opening the file
    bam_s = bs.AlignmentFile(bam_short, "rb") if bam_short is not None else None
    homo_dict = find_homo_regions(ref_seq, [tmp_bl])
    # four array.arrays of the same length in order A C G T
    cov = bamfile.count_coverage(ch, 0, len(ref_seq), quality_threshold=0)
    for i in range(5, len(cov[0]) - 5):  # ignore the bases at the beginning and the end
        tot = float(cov[0][i] + cov[1][i] + cov[2][i] + cov[3][i])
        if not (tot > min_cov and ref_seq[i] != "N"):
            continue
        v_pos = i
        ref_base = ref_seq[v_pos]
        freq = cov[c2i[ref_base]][i] / tot
        acc_pct.append(freq)
        base_freq = [("A", cov[0][i]), ("C", cov[1][i]), ("G", cov[2][i]), ("T", cov[3][i])]
        base_freq.sort(key=lambda x: x[1], reverse=True)
        ALT = [it[0] for it in base_freq if it[0] != ref_base][0]  # the most enriched ALT allele
        alt_freq = cov[c2i[ALT]][i] / tot
        if not (report_pct[0] < alt_freq < report_pct[1]):
            continue

        # Reference-allele frequency in the short reads; -1 means "not available"
        s_freq = -1
        if bam_s is not None:
            try:
                cov_s = bam_s.count_coverage(ch, v_pos, v_pos + 1, quality_threshold=20)
                s_tot = cov_s[0][0] + cov_s[1][0] + cov_s[2][0] + cov_s[3][0]
                if s_tot > min_cov:
                    s_freq = cov_s[c2i[ref_base]][0] / float(s_tot)
            except Exception:
                # best effort: a failing short-read lookup must not abort the scan
                s_freq = -1

        seq_ent = seq_entropy(ref_seq[(v_pos - 10):(v_pos + 10)])
        indel_freq = -1
        if not ((i not in homo_dict) and (seq_ent > 1) and (s_freq == -1 or (0.05 < s_freq < 0.95))):
            continue

        tmp_atcg_set = {}
        for pileupcolumn in bamfile.pileup(ch, v_pos, v_pos + 1, truncate=True,
                                           min_base_quality=0, ignore_overlaps=False,
                                           max_depth=1000000):
            # NOTE(review): this averages *mapping* qualities although the
            # output column is named mean_base_quality - confirm the intent.
            mean_base_q = pileupcolumn.get_mapping_qualities()
            mean_base_q = sum(mean_base_q) / float(len(mean_base_q))
            c_keep = 0  # reads carrying a base at this position
            c_del = 0  # reads with a deletion at this position
            for pileupread in pileupcolumn.pileups:
                if pileupread.is_refskip:
                    continue
                if pileupread.is_del:
                    c_del += 1
                    continue
                c_keep += 1
                cb_seq, umi_seq = pileupread.alignment.query_name.split("#")[0].split("_")
                if cb_seq in cb_seq_set:
                    base = pileupread.alignment.query_sequence[pileupread.query_position]
                    tmp_atcg_set.setdefault(base, Counter())[cb_seq] += 1
                    if base == ref_base:
                        REF_cnt_dict.setdefault((ch, v_pos), []).append(cb_seq)
                    if base == ALT:
                        ALT_cnt_dict.setdefault((ch, v_pos), []).append(cb_seq)
            if c_keep + c_del:
                # guarded: a column made only of reference skips has no denominator
                indel_freq = c_del / float(c_keep + c_del)

            # Per base, keep only the barcodes observed at most twice
            for b in tmp_atcg_set:
                tmp_atcg_set[b] = set(it for it in tmp_atcg_set[b] if tmp_atcg_set[b][it] <= 2)
            top_base, second_base = base_freq[0][0], base_freq[1][0]
            if (top_base in tmp_atcg_set) and (second_base in tmp_atcg_set):
                # Hypergeometric probability of the observed barcode overlap
                tmp_set = set()
                tmp_set.update(tmp_atcg_set[top_base])
                tmp_set.update(tmp_atcg_set[second_base])
                rv = hypergeom(len(tmp_set), len(tmp_atcg_set[top_base]), len(tmp_atcg_set[second_base]))
                hpg_prob = rv.pmf(len(tmp_atcg_set[top_base].intersection(tmp_atcg_set[second_base])))
            else:
                hpg_prob = 1
            reporting_summary.append((ch, v_pos, ref_base, ALT, freq, s_freq,
                                      hpg_prob, seq_ent, indel_freq, mean_base_q))

    mutation_dir = os.path.join(out_dir, "mutation")
    if not os.path.exists(mutation_dir):
        os.makedirs(mutation_dir)
    barcode_header = "chr,position," + ",".join(cb_seq_dict.keys()) + "\n"
    # One row per position, one column per barcode (in cb_seq_dict key order)
    with open(os.path.join(mutation_dir, "MT_ref_cnt.csv"), "w") as ref_cnt_f:
        ref_cnt_f.write(barcode_header)  # write header
        for p in REF_cnt_dict:
            tmp_c = Counter(REF_cnt_dict[p])
            ref_cnt_f.write("{},{},".format(p[0], p[1]) + ",".join(str(tmp_c[it]) for it in cb_seq_dict.keys()) + "\n")
    with open(os.path.join(mutation_dir, "MT_alt_cnt.csv"), "w") as alt_cnt_f:
        alt_cnt_f.write(barcode_header)  # write header
        for p in ALT_cnt_dict:
            tmp_c = Counter(ALT_cnt_dict[p])
            alt_cnt_f.write("{},{},".format(p[0], p[1]) + ",".join(str(tmp_c[it]) for it in cb_seq_dict.keys()) + "\n")
    with open(os.path.join(mutation_dir, "MT_allele_stat.csv"), "w") as al_stat:
        al_stat.write("chr,position,REF,ALT,REF_frequency,REF_frequency_in_short_reads,hypergeom_test_p_value,sequence_entrophy,INDEL_frequency,mean_base_quality\n")  # write header
        for rec in reporting_summary:
            al_stat.write(",".join(str(it) for it in rec) + "\n")
    # 500-bin histogram of the reference-allele frequencies
    pct_bin, pt = np.histogram(acc_pct, bins=500, range=(0, 1))
    with open(os.path.join(mutation_dir, "MT_freq_summary.csv"), "w") as cov_bin_out:
        for ix in range(500):
            cov_bin_out.write("{},{}\n".format(pt[ix], pct_bin[ix]))
def realigned_bam_allele_coverage(bam_in, chr_to_blocks, fa_f, cov_bin_f, cb_seq_dict,
                                  vcf_f=None, min_cnt=150, min_cov=100, report_pct=(0.1, 0.9)):
    """Tally cell barcodes that co-occur at putative heterozygous positions.

    For every block of every chromosome the per-base A/C/G/T coverage is
    scanned for positions whose reference-allele fraction lies strictly
    inside ``report_pct`` while both neighbouring candidate positions are
    above 0.95.  At each such position the reads are piled up, the cell
    barcodes supporting the two most covered bases are collected, and when
    the overlap between the two barcode sets is significantly small
    (hypergeometric pmf < 1e-6 and overlap < 90% of the smaller set) the
    barcode groups are passed to ``update_corr_cnt``.  The accumulated
    counter is finally written to ``cov_bin_f`` as ``key[0],key[1],count``
    lines.

    Parameters
    ----------
    bam_in : path of the BAM file, opened with ``bs.AlignmentFile``.
    chr_to_blocks : mapping of chromosome name to a list of blocks; each
        block must expose ``.s`` and ``.e`` (used as region start / end).
    fa_f : reference FASTA, read through ``get_fa`` which must yield
        ``(name, sequence)`` pairs.
    cov_bin_f : path of the CSV file to write.
    cb_seq_dict : mapping whose keys are the cell barcodes to consider.
    vcf_f : unused (the VCF lookup is disabled).
    min_cnt : blocks with fewer reads than this are skipped.
    min_cov : positions need more than this total coverage.
    report_pct : ``(low, high)`` bounds on the reference-allele fraction.

    Returns
    -------
    None.  Progress and summary information is printed to stdout.
    """
    # count_coverage() returns four arrays in the order A, C, G, T.
    c2i = {"A": 0, "C": 1, "G": 2, "T": 3}
    fa_dict = {}
    # The VCF lookup is disabled, so this stays empty and every reported
    # site is tallied in vcf_not_c.
    vcf_dict = {}
    cb_seq_set = set(cb_seq_dict.keys())
    for c in get_fa(fa_f):
        fa_dict[c[0]] = c[1]
    cb_corr_cnt = Counter()
    vcf_c = 0
    vcf_not_c = 0
    bamfile = bs.AlignmentFile(bam_in, "rb")
    try:
        for ch in chr_to_blocks:
            print(ch)
            homo_dict = find_homo_regions(fa_dict[ch], chr_to_blocks[ch])
            for bl in chr_to_blocks[ch]:
                cnt = bamfile.count(ch, bl.s, bl.e)
                if cnt < min_cnt:
                    continue
                acc_pct_tr = []
                cov = bamfile.count_coverage(ch, bl.s, bl.e, quality_threshold=0)
                # Ignore the bases at the beginning and the end of the block.
                for i in range(10, len(cov[0]) - 10):
                    tot = float(cov[0][i] + cov[1][i] + cov[2][i] + cov[3][i])
                    if (bl.s + i not in homo_dict) and tot > min_cov:
                        acc_pct_tr.append((
                            bl.s + i,
                            cov[c2i[fa_dict[ch][bl.s + i]]][i] / tot,
                            [("A", cov[0][i]), ("C", cov[1][i]),
                             ("G", cov[2][i]), ("T", cov[3][i])],
                        ))
                if len(acc_pct_tr) <= 10:
                    continue
                for ix, pct in enumerate(acc_pct_tr):
                    if not (1 < ix < len(acc_pct_tr) - 1):
                        continue
                    if not (report_pct[0] < pct[1] < report_pct[1]):
                        continue
                    if not (acc_pct_tr[ix - 1][1] > 0.95 and acc_pct_tr[ix + 1][1] > 0.95):
                        continue
                    # Ignore low-complexity (homopolymer-like) flanks.
                    if (seq_entropy(fa_dict[ch][(pct[0] - 10):pct[0]]) < 1
                            or seq_entropy(fa_dict[ch][pct[0]:(pct[0] + 10)]) < 1):
                        continue
                    tmp_atcg_set = {}
                    tmp_set = Counter()
                    # The one-base region with truncate=True is expected to
                    # yield at most one column; all per-position work is done
                    # inside the loop so that an empty pileup is simply
                    # skipped instead of reading undefined/stale counters.
                    for pileupcolumn in bamfile.pileup(ch, pct[0], pct[0] + 1, truncate=True,
                                                       min_base_quality=0, ignore_overlaps=False):
                        c_keep = 0
                        c_del = 0
                        for pileupread in pileupcolumn.pileups:
                            if not pileupread.is_del and not pileupread.is_refskip:
                                c_keep += 1
                                cb_seq, _umi = pileupread.alignment.query_name.split("#")[0].split("_")
                                if cb_seq in cb_seq_set:
                                    base = pileupread.alignment.query_sequence[pileupread.query_position]
                                    tmp_atcg_set.setdefault(base, Counter())[cb_seq] += 1
                                    tmp_set[cb_seq] += 1
                            else:
                                c_del += 1
                        if c_keep + c_del == 0 or c_keep / float(c_keep + c_del) < 0.7:
                            continue
                        # Keep only barcodes seen more than once.  (The
                        # original bound this key list to the name `bs`, which
                        # shadowed the BAM module alias for the whole function.)
                        cb_by_base = {
                            b: set(it for it in cnts if cnts[it] > 1)
                            for b, cnts in tmp_atcg_set.items()
                        }
                        multi_cb = set(it for it in tmp_set if tmp_set[it] > 1)
                        # Only look at the two most covered bases.
                        pct[2].sort(key=lambda x: x[1], reverse=True)
                        # A base may have coverage but no whitelisted barcode.
                        lead_set = cb_by_base.get(pct[2][0][0], set())
                        snd_set = cb_by_base.get(pct[2][1][0], set())
                        if not (len(lead_set) > 10 and len(snd_set) > 10):
                            continue
                        shared = lead_set & snd_set
                        rv = hypergeom(len(multi_cb), len(lead_set), len(snd_set))
                        if (rv.pmf(len(shared)) < 0.000001
                                and len(shared) < 0.9 * min(len(lead_set), len(snd_set))):
                            # x not y, y not x, x and y
                            for group in (lead_set - snd_set, snd_set - lead_set, shared):
                                if len(group) > 1:
                                    update_corr_cnt(list(group), cb_corr_cnt)
                            if pct[0] in vcf_dict:
                                vcf_c += 1
                            else:
                                vcf_not_c += 1
    finally:
        bamfile.close()
    print(cb_corr_cnt.most_common(30))
    print("{} {}".format(vcf_c, vcf_not_c))
    # Keys are indexed as pairs; presumably barcode pairs built by
    # update_corr_cnt -- TODO confirm.
    with open(cov_bin_f, "w") as cov_bin_out:
        for cbs in cb_corr_cnt:
            cov_bin_out.write("{},{},{}\n".format(cbs[0], cbs[1], cb_corr_cnt[cbs]))
import bamnostic as bs
import os.path
from functions import comp,parse_md

# The BAM path is resolved relative to the directory containing this script.
filepath=os.path.join(os.path.dirname(__file__),'../../../T-600-Star data/ru_snorri/gpv213sp1.00.minimap.sorted.bam')

# Open the BAM through its context manager so the handle is closed even if
# reading or parsing the first record fails.
with bs.AlignmentFile(filepath,'rb') as bam:
    # Only the first alignment record is inspected.
    r=next(bam)
    # Rebuild the reference sequence from the MD tag, CIGAR and read sequence,
    # then build the read-vs-reference comparison string.
    ref=parse_md(r.get_tag('MD'),r.cigarstring,r.query_sequence)
    outstr=comp(r.cigarstring,r.query_sequence,ref)

# Debug output (disabled):
#print(len(r.seq))
#print(len(ref))
#print(r.cigarstring+'\n')
#print(outstr)
def test_first_read():
    """Check that the first record of the bundled example BAM has the expected read name."""
    expected_name = 'EAS56_57:6:190:289:82'
    with bs.AlignmentFile(bs.example_bam, 'rb') as handle:
        record = next(handle)
        assert record.read_name == expected_name
##### import re import bamnostic import time import sys ################################# # bamFile = bamnostic.AlignmentFile(sys.argv[1], "rb") # results = open(sys.argv[2], "w") # basesCover = int(sys.argv[3]) # showBases = int(sys.argv[4]) # 当设置为0时显示所有碱基;最终结果根据完全相同的值进行合并,设置值越大(0除外),结果越多 # maxGap = 10 ################################# ########## test ############### bamFile = bamnostic.AlignmentFile("H04728D.CED1108.bam", "rb") results = open("results.txt", "w") basesCover = 30 showBases = 10 maxGap = 10 ################################# ## 提示信息 print "bases Cover set: " + str(basesCover) if showBases == 0: print "show all bases" else: print "show " + str(showBases) + " bases" print "-----------------------------------" start = time.clock()
def test_header():
    """Check that the example BAM header maps reference ids to (name, length)."""
    expected = {0: ('chr1', 1575), 1: ('chr2', 1584)}
    with bs.AlignmentFile(bs.example_bam, 'rb') as handle:
        assert handle.header() == expected
def bld_atac_mtx(list_bam_files, loaded_feat, output_file_name=None,
    path=None, writing_option='a', header=None, mode='rb',
    check_sq=True, chromosomes=HUMAN):
    """
    Build a count matrix one set of features at a time. It is specific to
    ATAC-seq data.
    It currently does not write a sparse matrix: it writes a regular count
    matrix as a tab-separated text file, one row per input BAM file.

    Parameters
    ----------

    list_bam_files: input must be a list of bam file names. One for each cell
        to build the count matrix for

    loaded_feat: the features for which you want to build the count matrix.
        It is indexed by chromosome name (``loaded_feat[chrom]``) and each
        feature is indexed as ``feature[0]`` / ``feature[1]``; these are
        compared against the read position (presumably start and end
        coordinates -- TODO confirm).

    output_file_name: name of the output file, i.e. the count matrix that will
        be written. If this parameter is not specified, the output count
        matrix will be named 'std_output_ct_mtx.txt'

    path: prefix prepended (by plain string concatenation) to every input file
        name AND to the output file name. Defaults to '' (current directory).

    writing_option: mode used to open the output file for the header step,
        'a' or 'w'.
        'a' to append to an already existing matrix. 'w' to overwrite any
        previously existing matrix. The per-file count rows are always
        appended.
        default: 'a'

    header: if you want to write the feature names, specify this argument.
        Input must be a list of strings.

    mode: bamnostic argument 'r' or 'w' for read and write, 'b' and 's' for
        bam or sam. NOTE: this argument is accepted but not forwarded; files
        are always opened with mode="rb".

    check_sq: bamnostic argument (when reading, check if SQ entries are
        present in header). NOTE: this argument is accepted but not
        forwarded; files are always opened with check_sq=False.

    chromosomes: chromosomes of the species you are considering. default value
        is the human genome (not including mitochondrial genome).
        HUMAN = ['1', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19',
                '20', '21', '22', '2', '3', '4', '5', '6', '7', '8', '9','X', 'Y']
        MOUSE = ['1', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19',
                '2', '3', '4', '5', '6', '7', '8', '9','X', 'Y']

    Return
    ------
    It does not return any object. The function writes the desired count
    matrix to a txt file
    """

    if output_file_name==None:
        output_file_name='std_output_ct_mtx.txt'

    if path==None:
        path=''

    # open file to write
    output_file = open(path+output_file_name, writing_option)
    # write header if specified
    if header != None:
        output_file.write('sample_name\t')
        for feature in header:
            output_file.write(feature)
            output_file.write('\t')
        output_file.write('\n')
    # close file to write
    output_file.close()

    # start going through the bam files
    for name_file in list_bam_files[0:]:

        ## important variables for output
        # per-chromosome index of the feature at which the next read's scan starts
        index_feat = {key: 0 for key in chromosomes}
        # per-chromosome list of read counts, one slot per feature
        val_feat = {key: [0 for x in range(len(loaded_feat[key]))] for key in chromosomes}

        ## PART 1 read the bam file
        keep_lines = []
        #samfile = bs.AlignmentFile(path+output_file_name, mode="rb", check_sq=False)
        samfile = bs.AlignmentFile(path+name_file, mode="rb", check_sq=False)
        #for read in samfile.fetch(until_eof=True):
        for read in samfile:
            # The record is parsed from its tab-separated text form; fields 2
            # and 3 are kept (presumably reference name and position -- TODO
            # confirm). The first three characters of the reference name are
            # dropped before matching (presumably a 'chr' prefix).
            line = str(read).split('\t')
            if line[2][3:] in chromosomes:
                keep_lines.append(line[2:4])
            ### print -- output
        print(name_file, len(keep_lines), 'mapped reads')
        samfile.close()

        ## PART2 reads that fall into
        for element in keep_lines:
            ## 2 things per line:
            chrom = element[0][3:]
            read_pos = int(element[1])
            max_value_index = len(loaded_feat[chrom])
            ## I want to check if the read map to a feature in the same chrom
            pointer_feat_pos = index_feat[chrom]
            for feat_pos in loaded_feat[chrom][pointer_feat_pos:]:
                # NOTE(review): the pointer is advanced before it is used as an
                # index below, so the increment lands on the slot AFTER the
                # feature being tested -- confirm this is intended.
                pointer_feat_pos += 1
                # Go through all features that are smaller than the read position
                if read_pos > feat_pos[1]:
                    continue
                # if read_pos fall in a feature
                elif read_pos > feat_pos[0]:
                    # Update the pointer for the next read if the pointer isn't out of range
                    if pointer_feat_pos < max_value_index:
                        index_feat[chrom] = pointer_feat_pos
                        val_feat[chrom][pointer_feat_pos] += 1
                    else:
                        index_feat[chrom] = max_value_index
                    # Check the following features without updating the pointer.
                    break
                else:
                    break

            for feat_pos in loaded_feat[chrom][pointer_feat_pos:]:
                # +1 if reads fall into more than one feature
                if feat_pos[0] < read_pos:
                    val_feat[chrom][pointer_feat_pos] += 1
                    pointer_feat_pos += 1
                # if read_pos > start position of the new feature break
                elif read_pos < feat_pos[0]:
                    break
                else:
                    # reached only when read_pos == feat_pos[0]
                    print('error')
                    break

        # PART 3
        # open
        output_file = open(path+output_file_name, 'a')
        # write down the result of the cell
        output_file.write(name_file)
        output_file.write('\t')
        for chrom in chromosomes:
            output_file.write('\t'.join([str(p) for p in val_feat[chrom]]))
            output_file.write('\t')
        output_file.write('\n')
        #close
        output_file.close()
#This is the script to check bam file for known Y-SNPs import bamnostic as bs import numpy as np import pandas as pd import time bam_file_path = r'D:\Workspace\paleogenetics\project_data\med_isles\I14675.hg19.bam' #bam_file_path = 'project_data/Cheddar_man/SB524A4_lib.merged.markdup.bam' target_snps_path = 'project_data/trunk_and_i_new.csv' output_file_name = 'project_data/med_isles_I14675.csv' bam = bs.AlignmentFile(bam_file_path, 'rb') df = pd.read_csv(target_snps_path) # это - номер хромосомы, 0 - это первая хромосома, 21 - 22-ая, 22 - X, 23 - Y tid_number = 23 df['result_A'] = 0 df['result_T'] = 0 df['result_G'] = 0 df['result_C'] = 0 df['result_A_end'] = 0 df['result_T_end'] = 0 df['result_G_end'] = 0 df['result_C_end'] = 0 build = 'build_37' start = time.time() for bam_reads in bam: if bam_reads.tid == tid_number: first = bam_reads.pos