Example #1
def mapping(sample, directory):
    cmd = """
		mkdir -p {directory}/bam
		bwa mem -t 8 /home2/zhaowen/project/EVTpro/reference/EVTREF_VP.fa \\
			{directory}/cleandata/{sample}_1.fq.gz \\
			{directory}/cleandata/{sample}_2.fq.gz \\
			| samtools view -bSh - | samtools sort -@ 8 - -o {directory}/bam/{sample}.bam
		samtools view {directory}/bam/{sample}.bam -H > {directory}/bam/{sample}.header
		samtools view {directory}/bam/{sample}.bam -F 2052 -bSh > {directory}/bam/{sample}.temp.bam
		samtools index {directory}/bam/{sample}.temp.bam
	""".format(sample=sample, directory=directory)
    os.system(cmd)
    bamFile = bamnostic.AlignmentFile(
        directory + "/bam/" + sample + ".temp.bam", "rb")
    finalSam = open(directory + "/bam/" + sample + ".sam", "w")
    for read in bamFile:
        if str(read).split("\t")[6] == "=":
            finalSam.write(str(read) + "\n")
    finalSam.close()

    cmd2 = """
		cat {directory}/bam/{sample}.header {directory}/bam/{sample}.sam | samtools view -bSh - > {directory}/bam/{sample}.final.bam
		rm {directory}/bam/{sample}.header {directory}/bam/{sample}.sam {directory}/bam/{sample}.temp.bam*
		samtools index {directory}/bam/{sample}.final.bam
	""".format(sample=sample, directory=directory)
    os.system(cmd2)
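A minimal invocation sketch (the sample name and project directory are placeholders; mapping() assumes {directory}/cleandata/{sample}_1.fq.gz and _2.fq.gz exist and that bwa, samtools, os and bamnostic are available):

import os
import bamnostic

# Hypothetical call: produces {directory}/bam/sampleA.final.bam plus its .bai index.
mapping("sampleA", "/path/to/project")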
Example #2
def parse_realigned_bam_raw(bam_in, fa_idx_f, min_sup_reads, min_tr_coverage,
                            min_read_coverage):
    fa_idx = dict((it.strip().split()[0], int(it.strip().split()[1]))
                  for it in open(fa_idx_f))
    bc_tr_count_dict = {}
    bc_tr_badcov_count_dict = {}
    tr_cov_dict = {}
    read_dict = {}
    cnt_stat = Counter()
    bamfile = bs.AlignmentFile(bam_in, "rb")
    for rec in bamfile.fetch(until_eof=True):
        if rec.is_unmapped or rec.is_secondary:  #or rec.mapping_quality==0:
            cnt_stat["not_counted"] += 1
            continue
        map_st = rec.reference_start
        map_en = rec.reference_end
        tr = rec.reference_name
        if tr not in fa_idx:
            # check membership before indexing fa_idx to avoid a KeyError
            cnt_stat["not_in_annotation"] += 1
            print(tr, "not in annotation ???")
            continue
        tr_cov = float(map_en - map_st) / fa_idx[tr]
        tr_cov_dict.setdefault(tr, []).append(tr_cov)
        bc, umi = rec.query_name.split("#")[0].split("_")  # assume cleaned barcode
        if bc not in bc_tr_count_dict:
            bc_tr_count_dict[bc] = {}
        bc_tr_count_dict[bc].setdefault(tr, []).append(umi)
        cnt_stat["counted_reads"] += 1
    tr_kept = dict(
        (tr, tr) for tr in tr_cov_dict
        if len([it for it in tr_cov_dict[tr] if it > 0.9]) > min_sup_reads)
    print("\t" + str(cnt_stat))
    return bc_tr_count_dict, bc_tr_badcov_count_dict, tr_kept
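A hedged usage sketch (paths and thresholds are placeholders; the .fai file is a standard samtools faidx index whose first two columns are sequence name and length):

# Returns per-barcode UMI lists per transcript plus the transcripts with
# enough well-covered (>90%) supporting reads.
bc_counts, bad_counts, tr_kept = parse_realigned_bam_raw(
    "realigned.bam", "transcripts.fa.fai",
    min_sup_reads=5, min_tr_coverage=0.5, min_read_coverage=0.75)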
Example #3
    def selectBam(self):
        # askopenfilename comes from tkinter.filedialog; bs is bamnostic
        bam_file_name = askopenfilename(title="Select Your File")
        bam_path = os.path.abspath(bam_file_name)
        bam_index_path = bam_path + ".bai"
        print(bam_path, bam_index_path)
        self.bam = bs.AlignmentFile(bam_path, filepath_index=bam_index_path)
        print(self.bam.header)
        print(self.bam.head(n=5))
Example #4
def realigned_bam_coverage(bam_in, fa_idx_f, coverage_dir):
    fa_idx = dict((it.strip().split()[0], int(it.strip().split()[1]))
                  for it in open(fa_idx_f))
    left_clip_count = Counter()
    right_clip_count = Counter()
    tr_strand = Counter()
    bc_pct = {0: {}, 1: {}, 2: {}, 3: {}, 4: {}}
    bc_cov_pct = {0: [], 1: [], 2: [], 3: [], 4: []}
    gene_pct = {0: [], 1: [], 2: [], 3: [], 4: []}
    bamfile = bs.AlignmentFile(bam_in, "rb")
    for rec in bamfile.fetch(until_eof=True):
        if rec.is_unmapped or rec.is_supplementary or rec.is_secondary:
            continue
        bc, umi = rec.query_name.split("#")[0].split(
            "_")  # assume cleaned barcode
        map_st = rec.reference_start
        map_en = rec.reference_end
        tr = rec.reference_name
        if float(map_en - map_st) / fa_idx[tr] < 0.3:
            continue
        if rec.cigar[0][0] == 4:  # BAM_CSOFT_CLIP
            left_clip_count[rec.cigar[0][1]] += 1
        if rec.cigar[-1][0] == 4:  # BAM_CSOFT_CLIP
            right_clip_count[rec.cigar[-1][1]] += 1
        tr_strand[rec.is_reverse] += 1
        len_bin = tr_len_range(fa_idx[tr])  # length bin (0-4) for this transcript
        gene_pct[len_bin].append(float(map_en - map_st) / fa_idx[tr])
        bc_pct[len_bin].setdefault(bc, []).append(float(map_st) / fa_idx[tr])
        bc_pct[len_bin].setdefault(bc, []).append(float(map_en) / fa_idx[tr])
        bc_cov_pct[len_bin].append(float(map_en - map_st) / fa_idx[tr])
    print(left_clip_count.most_common(30))
    print(right_clip_count.most_common(30))
    print(tr_strand)
    print(np.histogram(bc_pct[0][next(iter(bc_pct[0]))], bins=200, range=(0, 1)))
    for i in bc_pct:
        coverage_f = open(
            os.path.join(coverage_dir,
                         "transcript_cov_per_cell.{}.csv".format(i)), "w")
        for bc in bc_pct[i]:
            lhi, _ = np.histogram(bc_pct[i][bc], bins=200, range=(0, 1))
            coverage_f.write("{},".format(bc) +
                             ",".join(str(it) for it in lhi) + "\n")
        coverage_f.close()
    tr_cov_f = open(os.path.join(coverage_dir, "transcript_cov.csv"), "w")
    for i in gene_pct:
        lhi, _ = np.histogram(gene_pct[i], bins=200, range=(0, 1))
        tr_cov_f.write("{},".format(i) + ",".join(str(it)
                                                  for it in lhi) + "\n")
    tr_cov_f.close()
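tr_len_range is not shown in this example. Since bc_pct, bc_cov_pct and gene_pct are keyed 0-4, a plausible sketch is a five-way binning of the transcript length (the cutoffs below are illustrative assumptions, not the original values):

def tr_len_range(length):
    # Map a transcript length to one of five bins (0-4); cutoffs are hypothetical.
    for i, cutoff in enumerate((400, 1000, 2000, 5000)):
        if length < cutoff:
            return i
    return 4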
Example #5
def read_10x_folder(folder):
    """Get QC-pass barcodes, genes, and bam file from a 10x folder"""
    import bamnostic as bs

    barcodes = read_single_column(os.path.join(folder, 'barcodes.tsv'))

    bam_file = bs.AlignmentFile(os.path.join(folder,
                                             'possorted_genome_bam.bam'),
                                mode='rb')

    return barcodes, bam_file
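read_single_column is an external helper; a minimal sketch consistent with how it is used on barcodes.tsv (one barcode per line, first column kept):

def read_single_column(path):
    # Hypothetical helper: first whitespace-delimited field of each non-empty line.
    with open(path) as f:
        return [line.split()[0] for line in f if line.strip()]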
Example #6
def bam_allele_coverage(bam_in, chr_to_blocks, fa_f, cov_bin_f, vcf_f, cb_seq_dict, min_cnt=100,min_cov=50 ):
    c2i = {"A":0, "C":1, "G":2, "T":3}  # four array.arrays of the same length in order A C G T
    fa_dict={}
    vcf_dict={}
    acc_pct = []
    cb_seq_set = set(cb_seq_dict.keys())
    for c in get_fa(fa_f):
        fa_dict[c[0]] = c[1]
    bamfile = bs.AlignmentFile(bam_in, "rb")
    # bamnostic has no VCF reader; pysam.VariantFile is assumed here
    # (vcf_in is needed by the fetch below)
    vcf_in = pysam.VariantFile(vcf_f)
    cb_corr_cnt = Counter()
    for ch in chr_to_blocks:
        print(ch)
        if ch != "chr15":  # restrict to chr15 (debugging leftover)
            continue
        homo_dict = find_homo_regions(fa_dict[ch], chr_to_blocks[ch])
        for ith, bl in enumerate(chr_to_blocks[ch]):
            cnt = bamfile.count(ch, bl.s, bl.e)
            try:
                vcf_dict = dict((it.pos-1, it) for it in vcf_in.fetch(ch[3:], bl.s, bl.e))
            except Exception:
                print(ch[3:], "not in vcf.  ", ch)
            if cnt < min_cnt:
                continue
            cov = bamfile.count_coverage(ch, bl.s, bl.e,
            quality_threshold=0)  # four array.arrays of the same length in order A C G T
            for v_pos in vcf_dict:
                if v_pos-bl.s >= len(cov[0]):
                    print("SNP position exceeds limit.", v_pos-bl.s, len(cov[0]))
                    continue
                freq = (cov[0][v_pos-bl.s],cov[1][v_pos-bl.s],cov[2][v_pos-bl.s],cov[3][v_pos-bl.s])
                if sum(freq)<min_cov:
                    continue
                tmp_atcg_set = {}
                for pileupcolumn in bamfile.pileup(ch, v_pos, v_pos+1,truncate=True, min_base_quality=0,ignore_overlaps=False,max_depth=20000):
                    for pileupread in pileupcolumn.pileups:
                        if not pileupread.is_del and not pileupread.is_refskip:
                            cb_seq, umi_seq = pileupread.alignment.query_name.split("#")[0].split("_")
                            if cb_seq in cb_seq_set:
                                tmp_atcg_set.setdefault(pileupread.alignment.query_sequence[pileupread.query_position],set()).add(cb_seq)
                bases = list(tmp_atcg_set)  # avoid shadowing the bamnostic alias "bs"
                for ab in range(len(bases)-1):
                    for ab1 in range(ab+1, len(bases)):
                        tmp_set = tmp_atcg_set[bases[ab]] - tmp_atcg_set[bases[ab1]]  # x not y
                        if len(tmp_set)>1:
                            update_corr_cnt(list(tmp_set), cb_corr_cnt)
                        tmp_set = tmp_atcg_set[bases[ab1]] - tmp_atcg_set[bases[ab]]  # y not x
                        if len(tmp_set)>1:
                            update_corr_cnt(list(tmp_set), cb_corr_cnt)
    print(cb_corr_cnt.most_common(30))
    cov_bin_out = open(cov_bin_f,"w")
    for cbs in cb_corr_cnt:
        cov_bin_out.write("{},{},{}\n".format(cbs[0],cbs[1],cb_corr_cnt[cbs]))
    cov_bin_out.close()
Example #7
def typingStat(sample, directory, baseCover):
    bamFile = bamnostic.AlignmentFile(
        directory + "/bam/" + sample + ".final.bam", "rb")
    counter = []
    for read in bamFile:
        mapName = str(read).split("\t")[2].split("|")[1]
        mapBaseCount = read.query_alignment_length
        if mapBaseCount <= baseCover:
            continue
        else:
            counter.append(mapName)

    outputDict = Counter(counter)
    output = sorted(outputDict.items(), key=lambda d: d[1])
    return output
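A short usage sketch (sample, directory and baseCover are placeholders; mapping() from Example #1 must already have produced {directory}/bam/{sample}.final.bam, and reference names are assumed to contain "|" delimiters):

# Count reads per reference name with more than 50 aligned bases,
# least frequent first.
for name, n_reads in typingStat("sampleA", "/path/to/project", 50):
    print(name, n_reads)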
Example #8
def exportIndexes(input_dir):
    import unique
    bam_dirs = unique.read_directory(input_dir)
    print('Building BAM index files', end='')
    for file in bam_dirs:
        if file[-4:].lower() == '.bam':
            bam_dir = input_dir+'/'+file
            bamf = pysam.AlignmentFile(bam_dir, "rb")
            ### Is there an indexed .bai for the BAM? Check.
            try:
                for entry in bamf.fetch():
                    codes = [x[0] for x in entry.cigar]
                    break
            except Exception:
                ### Make a BAM index
                print('.', end='')
                bam_dir = str(bam_dir)
                # On Windows, this indexing step will fail unless catch_stdout = False is set at line 51 of pysam's __init__ file
                pysam.index(bam_dir)
Example #9
# encoding:utf-8
# pzw
# 20190214
#####
# update log
# v0.2 Updated the merging of soft-clipped and mapped segments; sequences are output in order of occurrence
# v0.3 Updated threshold settings
# v0.4 Added forward/reverse strand indication
# v1.0 Input is a BAM file; plasmid names are read automatically and reverse-strand alignments are detected automatically
#####
import re
import bamnostic
import time

#################################
bamFile = bamnostic.AlignmentFile("transgene.final.bam", "rb")
results = open("results.txt", "w")
basesCover = 30
showBases = 20  # 0 shows all bases; final results are merged on identical values, so larger values (other than 0) yield more results
#################################

## Messages
print("bases Cover set: " + str(basesCover))
if showBases == 0:
    print("show all bases")
else:
    print("show " + str(showBases) + " bases")
print("-----------------------------------")
start = time.time()  # time.clock() was removed in Python 3.8

## Read the header information
Example #10
# v1.3 Fixed the filtering logic
# v1.2 Some junction sites are separated by bases that align to neither side; fixed handling of these and adjusted how results are presented
# v1.1 Switched to command-line arguments; adjusted how the total read count is computed
# v1.0 Input is a BAM file; plasmid names are read automatically and reverse-strand alignments are detected automatically
# v0.4 Added forward/reverse strand indication
# v0.3 Updated threshold settings
# v0.2 Updated the merging of soft-clipped and mapped segments; sequences are output in order of occurrence
#####
import re
import bamnostic
import time
import sys
from collections import Counter

#################################
bamFile = bamnostic.AlignmentFile(sys.argv[1], "rb")
results = open(sys.argv[2], "w")
basesCover = int(sys.argv[3])
showBases = int(sys.argv[4])  # 0 shows all bases; final results are merged on identical values, so larger values (other than 0) yield more results
maxGap = 10
#################################

##########  test  ###############
# bamFile = bamnostic.AlignmentFile("H04728D.CED1108.bam", "rb")
# results = open("results.txt", "w")
# basesCover = 30
# showBases = 10
# maxGap = 10
#################################

## Messages
Example #11
def parseJunctionEntries(bam_dir,multi=False, Species=None, ReferenceDir=None):
    global bam_file
    global splicesite_db
    global IndicatedSpecies
    global ExonReference
    IndicatedSpecies = Species
    ExonReference = ReferenceDir
    bam_file = bam_dir
    try: splicesite_db,chromosomes_found, gene_coord_db = retreiveAllKnownSpliceSites()
    except Exception:
        print(traceback.format_exc())
        splicesite_db={}; chromosomes_found={}

    start = time.time()
    try: import collections; junction_db=collections.OrderedDict()
    except Exception:
        try: import ordereddict; junction_db = ordereddict.OrderedDict()
        except Exception: junction_db={}
    original_junction_db = copy.deepcopy(junction_db)
    
    bam_index = os.path.isfile(bam_dir+'.bai')
    if bam_index==False:
        if multi == False:
            print('Building BAM index file for', bam_dir)
        from pysam import index
        index(bam_dir)
    bamf = pysam.AlignmentFile(bam_dir, "rb" )
    
    chromosome = False
    chromosomes={}
    bam_reads=0
    count=0
    jid = 1
    prior_jc_start=0
    l1 = None; l2=None
    o = open(bam_dir.replace('.bam','__junction.bed'),"w")
    o.write('track name=junctions description="TopHat junctions"\n')
    export_isoform_models = False
    if export_isoform_models:
        io = open(bam_dir.replace('.bam','__isoforms.txt'),"w")
        isoform_junctions = copy.deepcopy(junction_db)
    outlier_start = 0; outlier_end = 0; read_count = 0; c=0
    for entry in bamf:
      bam_reads+=1
      cigarstring = entry.cigarstring

      if cigarstring is not None:
        if 'N' in cigarstring: ### Hence a junction
            if prior_jc_start == 0: pass
            elif (entry.pos-prior_jc_start) > 5000 or entry.reference_name != chromosome: ### New chr or far from prior reads
                writeJunctionBedFile(junction_db,jid,o)
                #writeIsoformFile(isoform_junctions,io)
                junction_db = copy.deepcopy(original_junction_db) ### Re-set this object
                jid+=1
            
            chromosome = entry.reference_name
            chromosomes[chromosome]=[] ### keep track
            X=entry.reference_start
            #if entry.query_name == 'SRR791044.33673569':
            #print chromosome, entry.pos, entry.reference_length, entry.alen, entry.query_name
            Y=entry.reference_start+entry.reference_length
            prior_jc_start = X

            try: tophat_strand = entry.get_tag('XS') ### TopHat knows which sequences are likely real splice sites so it assigns a real strand to the read
            except Exception:
                #if multi == False:  print 'No TopHat strand information';sys.exit()
                tophat_strand = None
            coordinates,up_to_intron_dist = getSpliceSites(entry.cigar,X)
            #if count > 100: sys.exit()
            #print entry.query_name,X, Y, entry.cigarstring, entry.cigar, tophat_strand
            for (five_prime_ss,three_prime_ss) in coordinates:
                jc = five_prime_ss,three_prime_ss
                #print X, Y, jc, entry.cigarstring, entry.cigar
                try: junction_db[chromosome,jc,tophat_strand].append([X,Y,up_to_intron_dist])
                except Exception: junction_db[chromosome,jc,tophat_strand] = [[X,Y,up_to_intron_dist]]
                
            if export_isoform_models:
                try:
                    mate = bamf.mate(entry) #https://groups.google.com/forum/#!topic/pysam-user-group/9HM6nx_f2CI
    
                    if 'N' in mate.cigarstring:
                        mate_coordinates,mate_up_to_intron_dist = getSpliceSites(mate.cigar,mate.pos)
                    else: mate_coordinates=[]
                except Exception: mate_coordinates=[]
                #print coordinates,mate_coordinates
                junctions = [tuple(x) for x in coordinates]  # a list, not a one-shot map iterator: it is consumed more than once below
                if len(mate_coordinates)>0:
                    try:
                        isoform_junctions[chromosome,tuple(junctions),tophat_strand].append(mate_coordinates)
                    except Exception:
                        isoform_junctions[chromosome,tuple(junctions),tophat_strand] = [mate_coordinates]
                else:
                    if (chromosome,tuple(junctions),tophat_strand) not in isoform_junctions:
                        isoform_junctions[chromosome,tuple(junctions),tophat_strand] = []
                
            count+=1
    writeJunctionBedFile(junction_db,jid,o) ### One last read-out
    if multi == False:
        print(bam_reads, count, time.time()-start, 'seconds required to parse the BAM file')
    o.close()
    bamf.close()
    
    missing_chromosomes=[]
    for chr in chromosomes_found:
        if chr not in chromosomes:
            chr = chr.replace('chr','')
            if chr not in chromosomes_found:
                if chr != 'M' and chr != 'MT':
                    missing_chromosomes.append(chr)
    #missing_chromosomes = ['A','B','C','D']
    try: bam_file = export.findFilename(bam_file)
    except Exception: pass
    return bam_file, missing_chromosomes
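getSpliceSites is defined elsewhere in this module. A plausible sketch, assuming it walks the CIGAR tuples and reports the genomic coordinates flanking each N (reference-skip) operation:

def getSpliceSites(cigartuples, start):
    # Hypothetical reimplementation: returns [(donor, acceptor), ...] junction
    # coordinates and the aligned distance before the first intron.
    coordinates = []
    pos = start
    up_to_intron_dist = 0
    seen_intron = False
    for op, length in cigartuples:
        if op in (0, 2):  # M/D consume the reference
            pos += length
            if not seen_intron:
                up_to_intron_dist += length
        elif op == 3:  # N marks an intron
            coordinates.append((pos, pos + length + 1))
            pos += length
            seen_intron = True
    return coordinates, up_to_intron_dist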
Example #12
import sqlite3
import os
import bamnostic as bn

#get connection to the sqlite database
conn = sqlite3.connect(os.path.join(r"E:\speedSplice", 'splice.sqlite'), isolation_level=None)
c = conn.cursor()


samfile = bn.AlignmentFile("hg19test.bam", "rb")

i = 0
for read in samfile:
    cigar = read.cigarstring
    if cigar and "N" in cigar:
        # assumes a leading M op followed by the N (splice) op, e.g. 50M1000N50M
        start = read.reference_start+read.cigar[0][1]
        stop = read.reference_start+read.cigar[0][1]+read.cigar[1][1]
        i += 1
        # parameterized query: avoids concatenating ints into SQL and avoids injection
        c.execute("SELECT * FROM splice WHERE from_pos=? AND to_pos=?", (start, stop))
        print(cigar, read.cigartuples, start, stop, read.reference_name)
    if i > 50:
        break


Example #13
def test_check_index():
    with bs.AlignmentFile(bs.example_bam) as bam:
        with pytest.warns(UserWarning):
            bam.check_index('not_a_file.bai')
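bs.example_bam is a small BAM file bundled with bamnostic, which is why this test needs no fixture file; it is also handy for quick interactive checks:

import bamnostic as bs

# Print the first three reads of the bundled example BAM.
with bs.AlignmentFile(bs.example_bam, 'rb') as bam:
    for i, read in enumerate(bam):
        print(read.read_name, read.pos)
        if i == 2:
            break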
Example #14
def test_get_index():
    with pytest.warns(UserWarning):
        bam_no_bai = bs.AlignmentFile(bs.example_bam,
                                      index_filename='not_a_file.bai')
Example #15
    def parse_bam(cls, r2_bam: str, bed: str) -> tuple:
        """
        An R2 read is a valid gene alignment if all of these criteria are met:
        The read aligns uniquely to a transcript sequence in the reference.
        The R2 alignment begins within the first five nucleotides. This criterion
        ensures that the R2 read originates from an actual PCR priming event.
        The length of the alignment that can be a match or mismatch in the CIGAR
        string is >60.
        The read does not align to phiX174.
        :param r2_bam: path to the R2 BAM file
        :param bed: path to a BED file of gene coordinates
            (min_mapping_qual, priming_window and total_cigar_m are class attributes)
        :return: (r2_map_passed, r2_map_dropped)
        """
        r2_map_passed = {}
        r2_map_dropped = set()

        # read bam file
        log.info('R2: Processing BAM file')
        bam = bs.AlignmentFile(r2_bam, 'rb')

        for read in bam:

            # check if read is uniquely mapped
            if read.mapping_quality >= cls.min_mapping_qual:

                # check if priming occurs in the first n nucleotides:
                # scan CIGAR ops until a match/mismatch (M) begins or the window is spent
                nt = cls.priming_window
                priming = False
                for op, length in read.cigar:
                    if op == 0:  # an M op begins within the priming window
                        priming = True
                        break
                    nt = nt - length
                    if nt <= 0:  # window exhausted before any match op
                        break

                # check if the total CIGAR M-operation is > m
                if priming:
                    cigar_dict = {}
                    for n, m in read.cigar:
                        cigar_dict.setdefault(n, []).append(m)
                    if sum(cigar_dict[0]) > cls.total_cigar_m:

                        # the read is a valid gene alignment if at least 1 nt is overlapping
                        # should be using query_length, query_alignment_length or reference_length?

                        # read bed file
                        with open(bed, 'r') as bedf:
                            read_start = read.pos + 1  # 1-based transcription start
                            read_end = read.pos + 1 + read.query_length  # 1-based transcription end
                            for line in bedf:
                                gene_pos, gene_start, gene_end, gene_symbol = line.split(
                                    '\t')

                                # check if read maps in the chromosome of the current gene coordinates
                                if read.reference_name == gene_pos:

                                    # check in which gene the read aligns
                                    if (int(gene_start) <= read_start < int(gene_end)) \
                                    or (read_start < int(gene_start) <= read_end) \
                                    or (read_start <= int(gene_end) < read_end):
                                        r2_map_passed[read.read_name] = gene_symbol.rstrip('\n')
                                        # write to file the result
                                        #with open('../files/mapping_session_1Mtest.txt', 'a') as mappingf:
                                        #mappingf.write(f'{read.read_name} {r2_map_passed[read.read_name]}\n')
                                        break  # skip remaining lines of bed file
                                    else:
                                        continue  # read next gene coordinates

                                else:
                                    continue  # read next gene coordinates

                            # drop good reads not mapping in any of the given genes
                            if read.read_name not in r2_map_passed:
                                r2_map_dropped.add(read.read_name)

                    else:
                        r2_map_dropped.add(read.read_name)

                else:
                    r2_map_dropped.add(read.read_name)

            else:
                r2_map_dropped.add(read.read_name)

        bam.close()

        return r2_map_passed, r2_map_dropped
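A hedged usage sketch (GeneMapper is a hypothetical host class for this classmethod; file names are placeholders):

# Hypothetical call; min_mapping_qual, priming_window and total_cigar_m
# are read from class attributes.
passed, dropped = GeneMapper.parse_bam("r2_reads.bam", "genes.bed")
print(len(passed), "reads assigned,", len(dropped), "dropped")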
Example #16
def get_all_SNV_table(bam_in, chr_to_blocks, transcript_to_exon, fa_f, out_dir, cb_seq_dict, bam_short, known_position_dict, min_cov=100, report_pct=(0.15,0.85)):
    c2i = {"A":0, "C":1, "G":2, "T":3}  # four array.arrays of the same length in order A C G T
    fa_dict={}
    acc_pct = []
    REF_cnt_dict = {}
    ALT_cnt_dict = {}
    cb_seq_set = set(cb_seq_dict.keys())
    reporting_summary = []
    for c in get_fa(fa_f):
        fa_dict[c[0]] = c[1]
    bamfile = bs.AlignmentFile(bam_in, "rb")
    if bam_short is not None:
        bam_s = bs.AlignmentFile(bam_short, "rb")
    cb_corr_cnt = Counter()
    for ch in chr_to_blocks:
        print(ch)
        homo_dict = find_homo_regions(fa_dict[ch], chr_to_blocks[ch])
        for ith, bl in enumerate(chr_to_blocks[ch]):
            tmp_bl_flat = get_gene_flat({"NNN":bl.transcript_list}, transcript_to_exon)
            for ex in tmp_bl_flat["NNN"]:
                cnt = bamfile.count(ch, ex[0], ex[1])
                if cnt < min_cov:
                    continue
                cov = bamfile.count_coverage(ch, ex[0], ex[1],
                quality_threshold=0)  # four array.arrays of the same length in order A C G T
                if len(cov[0])<20:
                    continue  # ignore tiny exons
                for i in range(5, len(cov[0])-5):  # ignore the bases at the beginning and the end (close to splicing site)
                    tot =  float(cov[0][i]+cov[1][i]+cov[2][i]+cov[3][i])
                    v_pos = ex[0]+i
                    if tot>min_cov and (fa_dict[ch][v_pos]!="N"):
                        freq = cov[c2i[fa_dict[ch][v_pos]]][i]/tot
                        acc_pct.append(freq)
                        base_freq = [("A",cov[0][i]),("C",cov[1][i]),("G",cov[2][i]),("T",cov[3][i])]
                        base_freq.sort(key=lambda x:x[1],reverse=True)
                        if v_pos == 63318364:  # debug trace for this specific position
                            print(base_freq)
                        ALT = [it[0] for it in base_freq if it[0] != fa_dict[ch][v_pos]][0] # the most enriched ALT allele
                        alt_freq = cov[c2i[ALT]][i]/tot
                        if (report_pct[0]< alt_freq < report_pct[1]) or ((ch,v_pos) in known_position_dict):
                            tmp_atcg_set = {}
                            if bam_short is not None:
                                try:
                                    cov_s = bam_s.count_coverage(ch, v_pos, v_pos+1, quality_threshold=20)
                                    s_tot = cov_s[0][0]+cov_s[1][0]+cov_s[2][0]+cov_s[3][0]
                                    if s_tot> (min_cov/2):
                                        s_freq = cov_s[c2i[fa_dict[ch][v_pos]]][0]/float(s_tot)
                                    else:
                                        s_freq = -1
                                except Exception:
                                    s_freq = -1
                            else:
                                s_freq = -1
                            seq_ent = seq_entropy(fa_dict[ch][(v_pos-10):(v_pos+10)])
                            indel_freq = -1
                            if ((ch,v_pos) in known_position_dict) or ((ex[0]+i not in homo_dict) and (seq_ent > 1) and (s_freq==-1 or (0.05<s_freq<0.95))):
                                for pileupcolumn in bamfile.pileup(ch, v_pos, v_pos+1,truncate=True, min_base_quality=0,ignore_overlaps=False,max_depth=20000):
                                    c_keep = 0
                                    c_del = 0
                                    for pileupread in pileupcolumn.pileups:
                                        if not pileupread.is_del:
                                            if not pileupread.is_refskip:
                                                c_keep += 1
                                                cb_seq, umi_seq = pileupread.alignment.query_name.split("#")[0].split("_")
                                                if cb_seq in cb_seq_set:
                                                    tmp_atcg_set.setdefault(pileupread.alignment.query_sequence[pileupread.query_position],Counter())[cb_seq] += 1
                                                    #tmp_set[cb_seq] += 1
                                                    if pileupread.alignment.query_sequence[pileupread.query_position] == fa_dict[ch][v_pos]:
                                                        REF_cnt_dict.setdefault((ch, v_pos),[]).append(cb_seq)
                                                    if pileupread.alignment.query_sequence[pileupread.query_position] == ALT:
                                                        ALT_cnt_dict.setdefault((ch, v_pos),[]).append(cb_seq)
                                        else:
                                            if not pileupread.is_refskip:
                                                c_del += 1
                                indel_freq = c_del/float(c_keep+c_del)
                                tmp_set = set()
                                for b in tmp_atcg_set:
                                    tmp_atcg_set[b] = set(it for it in tmp_atcg_set[b] if tmp_atcg_set[b][it]<=2)
                                if (base_freq[0][0] in tmp_atcg_set) and (base_freq[1][0] in tmp_atcg_set):
                                    tmp_set.update(tmp_atcg_set[base_freq[0][0]])
                                    tmp_set.update(tmp_atcg_set[base_freq[1][0]])
                                    rv = hypergeom(len(tmp_set), len(tmp_atcg_set[base_freq[0][0]]), len(tmp_atcg_set[base_freq[1][0]]))
                                    hpg_prob = rv.pmf(len(tmp_atcg_set[base_freq[0][0]].intersection(tmp_atcg_set[base_freq[1][0]])))
                                else:
                                    hpg_prob = 1
                                reporting_summary.append((ch, v_pos, fa_dict[ch][v_pos], ALT, freq, s_freq, hpg_prob, seq_ent, indel_freq))
    print "number:", len(reporting_summary)
    subfolder_name = "mutation"
    if not os.path.exists(os.path.join(out_dir,subfolder_name)):
        os.makedirs(os.path.join(out_dir,subfolder_name))
    with gzip.open(os.path.join(out_dir,subfolder_name,"ref_cnt.csv.gz"),"wt") as ref_cnt_f:  # text mode: str is written below
        ref_cnt_f.write("chr,position,"+",".join(cb_seq_dict.keys())+"\n")  # write header
        for p in REF_cnt_dict:
            tmp_c = Counter(REF_cnt_dict[p])
            ref_cnt_f.write("{},{},".format(p[0],p[1])+",".join( str(tmp_c[it]) for it in cb_seq_dict.keys() )+"\n" )
    with gzip.open(os.path.join(out_dir,subfolder_name,"alt_cnt.csv.gz"),"wt") as alt_cnt_f:
        alt_cnt_f.write("chr,position,"+",".join(cb_seq_dict.keys())+"\n")  # write header
        for p in ALT_cnt_dict:
            tmp_c = Counter(ALT_cnt_dict[p])
            alt_cnt_f.write("{},{},".format(p[0],p[1])+",".join( str(tmp_c[it]) for it in cb_seq_dict.keys() )+"\n" )
    with gzip.open(os.path.join(out_dir,subfolder_name,"allele_stat.csv.gz"),"wt") as al_stat:
        al_stat.write("chr,position,REF,ALT,REF_frequency,REF_frequency_in_short_reads,hypergeom_test_p_value,sequence_entropy,INDEL_frequency\n")  # write header
        for rec in reporting_summary:
            al_stat.write(",".join( str(it) for it in rec )+"\n" )
    pct_bin, pt = np.histogram(acc_pct, bins=500, range=(0, 1))
    with open(os.path.join(out_dir,subfolder_name,"freq_summary.csv"),"w") as cov_bin_out:
        for ix in range(500):
            cov_bin_out.write("{},{}\n".format(pt[ix],pct_bin[ix]))
Example #17
def parse_realigned_bam(bam_in, fa_idx_f, min_sup_reads, min_tr_coverage,
                        min_read_coverage, kwargs):
    """
    """
    fa_idx = dict((it.strip().split()[0], int(it.strip().split()[1]))
                  for it in open(fa_idx_f))
    bc_tr_count_dict = {}
    bc_tr_badcov_count_dict = {}
    tr_cov_dict = {}
    read_dict = {}
    cnt_stat = Counter()
    bamfile = bs.AlignmentFile(bam_in, "rb")

    if "bc_file" in kwargs.keys():
        bc_dict = make_bc_dict(kwargs["bc_file"])
    for _, rec in enumerate(bamfile):
        if rec.is_unmapped or rec.seq == '*':
            cnt_stat["unmapped"] += 1
            continue
        map_st = rec.reference_start
        map_en = rec.reference_end
        tr = rec.reference_name
        if tr not in fa_idx:
            # check membership before indexing fa_idx to avoid a KeyError
            cnt_stat["not_in_annotation"] += 1
            print("\t" + str(tr), "not in annotation ???")
            continue
        tr_cov = float(map_en - map_st) / fa_idx[tr]
        tr_cov_dict.setdefault(tr, []).append(tr_cov)

        inferred_read_length = query_len(rec.cigarstring)
        if rec.query_name not in read_dict:
            read_dict.setdefault(rec.query_name, []).append(
                (tr, rec.get_tag("AS"), tr_cov,
                 float(rec.query_alignment_length) / inferred_read_length,
                 rec.mapping_quality))
        else:
            if rec.get_tag("AS") > read_dict[rec.query_name][0][1]:
                read_dict[rec.query_name].insert(
                    0,
                    (tr, rec.get_tag("AS"), tr_cov,
                     float(rec.query_alignment_length) / inferred_read_length,
                     rec.mapping_quality))
            elif rec.get_tag("AS") == read_dict[
                    rec.query_name][0][1] and float(
                        rec.query_alignment_length
                    ) / inferred_read_length == read_dict[
                        rec.query_name][0][3]:  # same aligned sequence
                if tr_cov > read_dict[rec.query_name][0][
                        2]:  # choose the one with higher transcript coverage, might be internal TSS
                    read_dict[rec.query_name].insert(
                        0, (tr, rec.get_tag("AS"), tr_cov,
                            float(rec.query_alignment_length) /
                            inferred_read_length, rec.mapping_quality))
            else:
                read_dict[rec.query_name].append(
                    (tr, rec.get_tag("AS"), tr_cov,
                     float(rec.query_alignment_length) / inferred_read_length,
                     rec.mapping_quality))
    tr_kept = dict(
        (tr, tr) for tr in tr_cov_dict
        if len([it for it in tr_cov_dict[tr] if it > 0.9]) > min_sup_reads)
    unique_tr_count = Counter(read_dict[r][0][0] for r in read_dict
                              if read_dict[r][0][2] > 0.9)
    for r in read_dict:
        tmp = read_dict[r]
        tmp = [it for it in tmp if it[0] in tr_kept]
        if len(tmp) > 0:
            hit = tmp[0]  # transcript_id, pct_ref, pct_reads
        else:
            cnt_stat["no_good_match"] += 1
            continue
        # below line creates issue when header line has more than one _.
        # in this case, umi is assumed to be delimited from the barcode by the last _
        #bc, umi = r.split("#")[0].split("_")  # assume cleaned barcode
        split_r = r.split("#")[0].split("_")
        bc, umi = split_r[-2], split_r[-1]
        if "bc_file" in kwargs.keys():
            bc = bc_dict[bc]
        if len(tmp) == 1 and tmp[0][4] > 0:
            if bc not in bc_tr_count_dict:
                bc_tr_count_dict[bc] = {}
            bc_tr_count_dict[bc].setdefault(hit[0], []).append(umi)
            cnt_stat["counted_reads"] += 1
        elif len(
                tmp) > 1 and tmp[0][1] == tmp[1][1] and tmp[0][3] == tmp[1][3]:
            if hit[1] > 0.8:
                if bc not in bc_tr_count_dict:
                    bc_tr_count_dict[bc] = {}
                bc_tr_count_dict[bc].setdefault(hit[0], []).append(umi)
                cnt_stat["counted_reads"] += 1
            else:
                cnt_stat["ambigious_reads"] += 1
                if bc not in bc_tr_badcov_count_dict:
                    bc_tr_badcov_count_dict[bc] = {}
                bc_tr_badcov_count_dict[bc].setdefault(hit[0], []).append(umi)
        elif hit[2] < min_tr_coverage or hit[3] < min_read_coverage:
            cnt_stat["not_enough_coverage"] += 1
            if bc not in bc_tr_badcov_count_dict:
                bc_tr_badcov_count_dict[bc] = {}
            bc_tr_badcov_count_dict[bc].setdefault(hit[0], []).append(umi)
        else:
            if bc not in bc_tr_count_dict:
                bc_tr_count_dict[bc] = {}
            bc_tr_count_dict[bc].setdefault(hit[0], []).append(umi)
            cnt_stat["counted_reads"] += 1
    print("\t" + str(cnt_stat))
    return bc_tr_count_dict, bc_tr_badcov_count_dict, tr_kept
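query_len is an external helper used above to infer the read length from the CIGAR string; a sketch, assuming it sums the operations that consume the query (M, I, S, =, X per the SAM specification):

import re

def query_len(cigar_string):
    # Sum CIGAR operation lengths that consume query bases.
    read_consuming = {"M", "I", "S", "=", "X"}
    return sum(int(n) for n, op in re.findall(r"(\d+)([MIDNSHP=X])", cigar_string)
               if op in read_consuming)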
Example #18
def get_mito_SNV_table(bam_in, fa_f, out_dir, cb_seq_dict, bam_short, ch="chrM", min_cov=1000, report_pct=(0.15,0.85)):
    c2i = {"A":0, "C":1, "G":2, "T":3}  # four array.arrays of the same length in order A C G T
    fa_dict={}
    acc_pct = []
    REF_cnt_dict = {}
    ALT_cnt_dict = {}
    cb_seq_set = set(cb_seq_dict.keys())
    reporting_summary = []
    for c in get_fa(fa_f):
        fa_dict[c[0]] = c[1]
    bl = namedtuple("bl", ["s","e"])
    tmp_bl = bl(1, len(fa_dict[ch])-1)
    bamfile = bs.AlignmentFile(bam_in, "rb")
    bam_s = bs.AlignmentFile(bam_short, "rb")
    cb_corr_cnt = Counter()
    homo_dict = find_homo_regions(fa_dict[ch], [tmp_bl])
    cnt = bamfile.count(ch, 0, len(fa_dict[ch]))
    cov = bamfile.count_coverage(ch, 0, len(fa_dict[ch]),
    quality_threshold=0)  # four array.arrays of the same length in order A C G T
    for i in range(5, len(cov[0])-5):  # ignore the bases at the beginning and the end
        tot =  float(cov[0][i]+cov[1][i]+cov[2][i]+cov[3][i])
        if tot>min_cov and fa_dict[ch][i] != "N":
            v_pos = i
            freq = cov[c2i[fa_dict[ch][v_pos]]][i]/tot
            acc_pct.append(freq)
            base_freq = [("A",cov[0][i]),("C",cov[1][i]),("G",cov[2][i]),("T",cov[3][i])]
            base_freq.sort(key=lambda x:x[1],reverse=True)
            ALT = [it[0] for it in base_freq if it[0] != fa_dict[ch][v_pos]][0] # the most enriched ALT allele
            alt_freq = cov[c2i[ALT]][i]/tot
            if report_pct[0]< alt_freq < report_pct[1]:
                tmp_atcg_set = {}
                try:
                    cov_s = bam_s.count_coverage(ch, v_pos, v_pos+1, quality_threshold=20)
                    s_tot = cov_s[0][0]+cov_s[1][0]+cov_s[2][0]+cov_s[3][0]
                    if s_tot> (min_cov):
                        s_freq = cov_s[c2i[fa_dict[ch][v_pos]]][0]/float(s_tot)
                    else:
                        s_freq = -1
                except Exception:
                    s_freq = -1
                seq_ent = seq_entropy(fa_dict[ch][(v_pos-10):(v_pos+10)])
                indel_freq = -1
                if (0+i not in homo_dict) and (seq_ent > 1) and (s_freq==-1 or (0.05<s_freq<0.95)):
                    mean_base_q = -1  # fallback when the pileup yields no columns
                    c_keep = 0
                    c_del = 0
                    for pileupcolumn in bamfile.pileup(ch, v_pos, v_pos+1,truncate=True, min_base_quality=0,ignore_overlaps=False,max_depth=1000000):
                        mean_base_q = pileupcolumn.get_mapping_qualities()  # mapping qualities, despite the variable name
                        mean_base_q = sum(mean_base_q)/float(len(mean_base_q))
                        c_keep = 0
                        c_del = 0
                        for pileupread in pileupcolumn.pileups:
                            if not pileupread.is_del:
                                if not pileupread.is_refskip:
                                    c_keep += 1
                                    cb_seq, umi_seq = pileupread.alignment.query_name.split("#")[0].split("_")
                                    if cb_seq in cb_seq_set:
                                        tmp_atcg_set.setdefault(pileupread.alignment.query_sequence[pileupread.query_position],Counter())[cb_seq] += 1
                                        #tmp_set[cb_seq] += 1
                                        if pileupread.alignment.query_sequence[pileupread.query_position] == fa_dict[ch][v_pos]:
                                            REF_cnt_dict.setdefault((ch, v_pos),[]).append(cb_seq)
                                        if pileupread.alignment.query_sequence[pileupread.query_position] == ALT:
                                            ALT_cnt_dict.setdefault((ch, v_pos),[]).append(cb_seq)
                            else:
                                if not pileupread.is_refskip:
                                    c_del += 1
                    indel_freq = c_del/float(c_keep+c_del) if (c_keep + c_del) else -1  # guard: the pileup may be empty
                    tmp_set = set()
                    for b in tmp_atcg_set:
                        tmp_atcg_set[b] = set(it for it in tmp_atcg_set[b] if tmp_atcg_set[b][it]<=2)
                    if (base_freq[0][0] in tmp_atcg_set) and (base_freq[1][0] in tmp_atcg_set):
                        tmp_set.update(tmp_atcg_set[base_freq[0][0]])
                        tmp_set.update(tmp_atcg_set[base_freq[1][0]])
                        rv = hypergeom(len(tmp_set), len(tmp_atcg_set[base_freq[0][0]]), len(tmp_atcg_set[base_freq[1][0]]))
                        hpg_prob = rv.pmf(len(tmp_atcg_set[base_freq[0][0]].intersection(tmp_atcg_set[base_freq[1][0]])))
                    else:
                        hpg_prob = 1
                    reporting_summary.append((ch, v_pos, fa_dict[ch][v_pos], ALT, freq, s_freq, hpg_prob, seq_ent, indel_freq, mean_base_q))
    if not os.path.exists(os.path.join(out_dir,"mutation")):
        os.makedirs(os.path.join(out_dir,"mutation"))
    with open(os.path.join(out_dir,"mutation","MT_ref_cnt.csv"),"w") as ref_cnt_f:
        ref_cnt_f.write("chr,position,"+",".join(cb_seq_dict.keys())+"\n")  # write header
        for p in REF_cnt_dict:
            tmp_c = Counter(REF_cnt_dict[p])
            ref_cnt_f.write("{},{},".format(p[0],p[1])+",".join( str(tmp_c[it]) for it in cb_seq_dict.keys() )+"\n" )
    with open(os.path.join(out_dir,"mutation","MT_alt_cnt.csv"),"w") as alt_cnt_f:
        alt_cnt_f.write("chr,position,"+",".join(cb_seq_dict.keys())+"\n")  # write header
        for p in ALT_cnt_dict:
            tmp_c = Counter(ALT_cnt_dict[p])
            alt_cnt_f.write("{},{},".format(p[0],p[1])+",".join( str(tmp_c[it]) for it in cb_seq_dict.keys() )+"\n" )
    with open(os.path.join(out_dir,"mutation","MT_allele_stat.csv"),"w") as al_stat:
        al_stat.write("chr,position,REF,ALT,REF_frequency,REF_frequency_in_short_reads,hypergeom_test_p_value,sequence_entropy,INDEL_frequency,mean_base_quality\n")  # write header
        for rec in reporting_summary:
            al_stat.write(",".join( str(it) for it in rec )+"\n" )
    pct_bin, pt = np.histogram(acc_pct, bins=500, range=(0, 1))
    with open(os.path.join(out_dir,"mutation","MT_freq_summary.csv"),"w") as cov_bin_out:
        for ix in range(500):
            cov_bin_out.write("{},{}\n".format(pt[ix],pct_bin[ix]))
Example #19
def realigned_bam_allele_coverage(bam_in, chr_to_blocks, fa_f, cov_bin_f, cb_seq_dict, vcf_f=None, min_cnt=150,min_cov=100,report_pct=(0.1,0.9) ):
    c2i = {"A":0, "C":1, "G":2, "T":3}  # four array.arrays of the same length in order A C G T
    fa_dict={}
    vcf_dict={}
    acc_pct = []
    cb_seq_set = set(cb_seq_dict.keys())
    for c in get_fa(fa_f):
        fa_dict[c[0]] = c[1]
    bamfile = bs.AlignmentFile(bam_in, "rb")
    #vcf_in = bs.VariantFile(vcf_f)
    cb_corr_cnt = Counter()
    vcf_c = 0
    vcf_not_c = 0
    for ch in chr_to_blocks:
        print(ch)
        #if ch != "chr15":
        #    continue
        homo_dict = find_homo_regions(fa_dict[ch], chr_to_blocks[ch])
        for ith, bl in enumerate(chr_to_blocks[ch]):
            cnt = bamfile.count(ch, bl.s, bl.e)
            #try:
            #    vcf_dict = dict((it.pos-1, it) for it in vcf_in.fetch(ch[3:], bl.s, bl.e))
            #except:
            #    print ch[3:], "not in vcf.  ",ch
            if cnt < min_cnt:
                continue
            acc_pct_tr = []
            cov = bamfile.count_coverage(ch, bl.s, bl.e,
            quality_threshold=0)  # four array.arrays of the same length in order A C G T
            for i in range(10, len(cov[0])-10):  # ignore the bases at the beginning and the end
                tot =  float(cov[0][i]+cov[1][i]+cov[2][i]+cov[3][i])
                if (bl.s+i not in homo_dict) and tot>min_cov:
                    acc_pct_tr.append((bl.s+i, cov[c2i[fa_dict[ch][bl.s+i]]][i]/tot, [("A",cov[0][i]),("C",cov[1][i]),("G",cov[2][i]),("T",cov[3][i])] ))
            if len(acc_pct_tr)>10:
                for ix, pct in enumerate(acc_pct_tr):
                    if ix > 1 and ix < len(acc_pct_tr)-1:
                        if report_pct[0]<pct[1]<report_pct[1]:
                            if acc_pct_tr[ix-1][1]>0.95 and acc_pct_tr[ix+1][1]>0.95:
                                if seq_entropy(fa_dict[ch][(pct[0]-10):pct[0]])<1 or seq_entropy(fa_dict[ch][pct[0]:(pct[0]+10)])<1:  # ignore homopolymer regions
                                    continue
                                tmp_atcg_set = {}
                                tmp_set = Counter()
                                for pileupcolumn in bamfile.pileup(ch, pct[0], pct[0]+1,truncate=True, min_base_quality=0,ignore_overlaps=False):
                                    c_keep = 0
                                    c_del = 0
                                    for pileupread in pileupcolumn.pileups:
                                        if not pileupread.is_del and not pileupread.is_refskip:
                                            c_keep += 1
                                            cb_seq, umi_seq = pileupread.alignment.query_name.split("#")[0].split("_")
                                            if cb_seq in cb_seq_set:
                                                tmp_atcg_set.setdefault(pileupread.alignment.query_sequence[pileupread.query_position],Counter())[cb_seq] += 1
                                                tmp_set[cb_seq] += 1
                                        else:
                                            c_del += 1
                                if c_keep/float(c_keep+c_del)<0.7:
                                    continue
                                bases = list(tmp_atcg_set)  # avoid shadowing the bamnostic alias "bs"
                                for b in bases:
                                    tmp_atcg_set[b] = set(it for it in tmp_atcg_set[b] if tmp_atcg_set[b][it]>1)
                                tmp_set = set(it for it in tmp_set if tmp_set[it]>1)
                                pct[2].sort(key=lambda x:x[1],reverse=True)
                                lead_b = pct[2][0][0]  # only look at most enriched two possibilities
                                snd_b = pct[2][1][0]
                                if not (len(tmp_atcg_set[lead_b])>10 and len(tmp_atcg_set[snd_b])>10):
                                    continue
                                rv = hypergeom(len(tmp_set), len(tmp_atcg_set[lead_b]), len(tmp_atcg_set[snd_b]))
                                if rv.pmf(len(tmp_atcg_set[lead_b].intersection(tmp_atcg_set[snd_b])))<0.000001 and len(tmp_atcg_set[lead_b].intersection(tmp_atcg_set[snd_b]))<0.9*min(len(tmp_atcg_set[lead_b]), len(tmp_atcg_set[snd_b])):
                                    #print ("\ncoverage at base %s = %s" % (pileupcolumn.pos, pileupcolumn.n))
                                    tmp_set = tmp_atcg_set[lead_b] - tmp_atcg_set[snd_b]  # x not y
                                    if len(tmp_set)>1:
                                        update_corr_cnt(list(tmp_set), cb_corr_cnt)
                                    tmp_set = tmp_atcg_set[snd_b] - tmp_atcg_set[lead_b]  # y not x
                                    if len(tmp_set)>1:
                                        update_corr_cnt(list(tmp_set), cb_corr_cnt)
                                    tmp_set = tmp_atcg_set[lead_b] & tmp_atcg_set[snd_b]  # x and y
                                    if len(tmp_set)>1:
                                        update_corr_cnt(list(tmp_set), cb_corr_cnt)
                                    if pct[0] in vcf_dict:
                                        vcf_c += 1
                                    else:
                                        vcf_not_c += 1
                                #print ch, pct[0],pct[1],pct[2]
            acc_pct.extend([it[1] for it in acc_pct_tr])
    print(cb_corr_cnt.most_common(30))
    print(vcf_c, vcf_not_c)
    cov_bin_out = open(cov_bin_f,"w")
    for cbs in cb_corr_cnt:
        cov_bin_out.write("{},{},{}\n".format(cbs[0],cbs[1],cb_corr_cnt[cbs]))
    cov_bin_out.close()
Example #20
import bamnostic as bs
import os.path
from functions import comp,parse_md


filepath=os.path.join(os.path.dirname(__file__),'../../../T-600-Star data/ru_snorri/gpv213sp1.00.minimap.sorted.bam')
bam=bs.AlignmentFile(filepath,'rb')

r=next(bam)

ref=parse_md(r.get_tag('MD'),r.cigarstring,r.query_sequence)

outstr=comp(r.cigarstring,r.query_sequence,ref)

#print(len(r.seq))
#print(len(ref))


#print(r.cigarstring+'\n')
#print(outstr)
Example #21
def test_first_read():
    with bs.AlignmentFile(bs.example_bam, 'rb') as bam:
        first_read = next(bam)
        assert first_read.read_name == 'EAS56_57:6:190:289:82'
Example #22
#####
import re
import bamnostic
import time
import sys

#################################
# bamFile = bamnostic.AlignmentFile(sys.argv[1], "rb")
# results = open(sys.argv[2], "w")
# basesCover = int(sys.argv[3])
# showBases = int(sys.argv[4])  # 0 shows all bases; final results are merged on identical values, so larger values (other than 0) yield more results
# maxGap = 10
#################################

##########  test  ###############
bamFile = bamnostic.AlignmentFile("H04728D.CED1108.bam", "rb")
results = open("results.txt", "w")
basesCover = 30
showBases = 10
maxGap = 10
#################################

## Messages
print("bases Cover set: " + str(basesCover))
if showBases == 0:
    print("show all bases")
else:
    print("show " + str(showBases) + " bases")
print("-----------------------------------")
start = time.time()  # time.clock() was removed in Python 3.8
Example #23
def test_header():
    with bs.AlignmentFile(bs.example_bam, 'rb') as bam:
        expected = {0: ('chr1', 1575), 1: ('chr2', 1584)}
        observed = bam.header()
        assert observed == expected
Example #24
def bld_atac_mtx(list_bam_files, loaded_feat, output_file_name=None,
    path=None, writing_option='a', header=None, mode='rb',
    check_sq=True, chromosomes=HUMAN):
    """
    Build a count matrix one set of features at a time. It is specific to ATAC-seq data.
    It currently does not write a sparse matrix; it writes a regular count matrix
    as a text file.
    
    Parameters
    ----------

    list_bam_files: input must be a list of bam file names. One for each cell to 
        build the count matrix for

    loaded_feat: the features for which you want to build the count matrix
        
    output_file_name: name of the output file. The count matrix will be written
        down in the current directory. If this parameter is not specified,
        the output count matrix will be named 'std_output_ct_mtx.txt'

    path: path where to find the input files. The output file will be written
        in your current directory; it is not affected by this parameter.

    writing_option: standard writing options for the output file: 'a' or 'w'.
        'a' to append to an already existing matrix. 'w' to overwrite any
        previously existing matrix.
        default: 'a'

    header: if you want to write down the feature name specify this argument.
        Input must be a list.

    mode: bamnostic argument. 'r' or 'w' for read and write; 'b' and 's' for bam or sam.
        If only 'r' is specified, bamnostic will try to determine whether the input is
        a bam or a sam file.

    check_sq: bamnostic argument. when reading, check if SQ entries are present in header

    chromosomes: chromosomes of the species you are considering. default value
        is the human genome (not including mitochondrial genome).
        HUMAN = ['1', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19', '20', '21', '22',
                '2', '3', '4', '5', '6', '7', '8', '9','X', 'Y']
        MOUSE = ['1', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19',
                '2', '3', '4', '5', '6', '7', '8', '9','X', 'Y']

    Return
    ------
    It does not return any object. The function writes the desired count
    matrix to a txt file.

    """
        
    if output_file_name is None:
        output_file_name='std_output_ct_mtx.txt'


    if path is None:
        path=''
    
    # open file to write
    output_file = open(path+output_file_name, writing_option)
    # write header if specified
    if header is not None:
        output_file.write('sample_name\t')
        for feature in header:
            output_file.write(feature)
            output_file.write('\t')
        output_file.write('\n')
    # close file to write   
    output_file.close()

    # start going through the bam files
    for name_file in list_bam_files:

        ## important variables for output
        index_feat = {key: 0 for key in chromosomes}    
        val_feat = {key: [0 for x in range(len(loaded_feat[key]))] for key in chromosomes}
    
        ## PART 1 read the bam file
        keep_lines = []
        #samfile = bs.AlignmentFile(path+output_file_name, mode="rb", check_sq=False)
        samfile = bs.AlignmentFile(path+name_file, mode="rb", check_sq=False)
        #for read in samfile.fetch(until_eof=True):
        for read in samfile:
            line = str(read).split('\t')
            if line[2][3:] in chromosomes:
                keep_lines.append(line[2:4])
            ### print -- output
        print(name_file, len(keep_lines), 'mapped reads')
        samfile.close()
        
        ## PART2 reads that fall into 
        for element in keep_lines:
            ## 2 things per line:
            chrom = element[0][3:]
            read_pos = int(element[1])
            max_value_index = len(loaded_feat[chrom])
            ## I want to check if the read map to a feature in the same chrom
            pointer_feat_pos = index_feat[chrom]
            for feat_pos in loaded_feat[chrom][pointer_feat_pos:]:
                pointer_feat_pos += 1
                # Go through all features that are smaller than the read position
                if read_pos > feat_pos[1]:
                    continue
                # if read_pos fall in a feature
                elif read_pos > feat_pos[0]:
                    # Update the pointer for the next read if the pointer isn't out of range
                    if pointer_feat_pos < max_value_index:
                        index_feat[chrom] = pointer_feat_pos
                        val_feat[chrom][pointer_feat_pos] += 1
                    else:
                        index_feat[chrom] = max_value_index
                    # Check the following features without updating the pointer. 
                    break
                else:
                    break
     
            for feat_pos in loaded_feat[chrom][pointer_feat_pos:]:
                # +1 if reads fall into more than one feature
                if feat_pos[0] < read_pos:
                    val_feat[chrom][pointer_feat_pos] += 1
                    pointer_feat_pos += 1
                # if read_pos > start position of the new feature break
                elif read_pos < feat_pos[0]:
                    break
                else:
                    print('error')
                    break
                    
        # PART 3
        # open
        output_file = open(path+output_file_name, 'a')
        # write down the result of the cell
        output_file.write(name_file)
        output_file.write('\t')
        for chrom in chromosomes:
            output_file.write('\t'.join([str(p) for p in val_feat[chrom]]))
            output_file.write('\t')
        output_file.write('\n')
        #close
        output_file.close()
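A minimal call sketch (file names are placeholders; loaded_feat is assumed to map each chromosome name to position-sorted (start, end) feature pairs, and HUMAN is the module-level list shown in the docstring):

# Hypothetical invocation: one output row per BAM/cell, one column per feature.
features = {c: [(1000, 2000), (5000, 6000)] for c in HUMAN}
bld_atac_mtx(["cell1.bam", "cell2.bam"], features,
             output_file_name="ct_mtx.txt", writing_option="w")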
Example #25
#This is the script to check bam file for known Y-SNPs

import bamnostic as bs
import numpy as np
import pandas as pd
import time

bam_file_path = r'D:\Workspace\paleogenetics\project_data\med_isles\I14675.hg19.bam'
#bam_file_path = 'project_data/Cheddar_man/SB524A4_lib.merged.markdup.bam'
target_snps_path = 'project_data/trunk_and_i_new.csv'
output_file_name = 'project_data/med_isles_I14675.csv'

bam = bs.AlignmentFile(bam_file_path, 'rb')
df = pd.read_csv(target_snps_path)
# this is the chromosome number (tid): 0 is chromosome 1, 21 is chromosome 22, 22 is X, 23 is Y
tid_number = 23
df['result_A'] = 0
df['result_T'] = 0
df['result_G'] = 0
df['result_C'] = 0
df['result_A_end'] = 0
df['result_T_end'] = 0
df['result_G_end'] = 0
df['result_C_end'] = 0

build = 'build_37'
start = time.time()

for bam_reads in bam:
    if bam_reads.tid == tid_number:
        first = bam_reads.pos