def compute_gc_binned_metadata(Meta, Bam):
    """Record mean/STD of per-window read counts, grouped by GC bin.

    For every ``(start, end, gc_bin, wlen)`` entry in
    ``Meta.binned_loci[contig]`` the alignments returned by
    ``Bam.bam.fetch`` are scanned, and those whose midpoint (as reported by
    ``alignment_midpoint``) lies within ``[start, end]`` (both ends
    inclusive) are counted.  One count per window is fed into a ``Welford``
    accumulator keyed by ``tuple2string((contig, wlen, gc_bin))``.

    Args:
        Meta: object providing ``user_contigs`` and ``binned_loci``; its
            ``gc_rc`` and ``gc_std`` mappings are written to.
        Bam: object whose ``bam`` attribute exposes
            ``fetch(reference=, start=, end=)`` (presumably a pysam
            alignment file -- confirm against the caller).

    Returns:
        None.  ``Meta.gc_rc[key]`` receives each accumulator's ``mean`` and
        ``Meta.gc_std[key]`` its ``std``.
    """
    gc_welford = {}
    for contig in Meta.user_contigs:
        for (start, end, gc_bin, wlen) in Meta.binned_loci[contig]:
            key = tuple2string((contig, wlen, gc_bin))
            if key not in gc_welford:
                gc_welford[key] = Welford()

            gc_count = 0
            for Aln in Bam.bam.fetch(reference=contig, start=start, end=end):
                # Alignments lacking reference coordinates cannot be placed
                # in a window, so they are ignored.
                if Aln.reference_start is None or Aln.reference_end is None:
                    continue
                if start <= alignment_midpoint(Aln) <= end:
                    gc_count += 1

            # One observation per window, including windows with zero reads.
            gc_welford[key].update(gc_count)

    for key, stats in gc_welford.items():
        Meta.gc_rc[key] = stats.mean
        Meta.gc_std[key] = stats.std
def compute_contig_metadata(Meta, Bam):
    """Estimate read length, depth of coverage and insert size per contig.

    For each contig the intervals in ``Meta.loci[contig]`` are scanned in
    order.  A mapped alignment is counted when its midpoint (from
    ``alignment_midpoint``) lies within the interval, both ends inclusive.
    Scanning of a contig stops as soon as more than ``Meta.n_reads``
    alignments have been counted.

    Args:
        Meta: object providing ``user_contigs``, ``loci``, ``tlen_cap`` and
            ``n_reads``; its ``read_len``, ``doc``, ``tlen`` and
            ``tlen_std`` mappings are written to.
        Bam: object whose ``bam`` attribute exposes
            ``fetch(reference=, start=, end=)`` (presumably a pysam
            alignment file -- confirm against the caller).

    Returns:
        None.  Per contig:

        * ``Meta.read_len`` -- mean ``reference_length`` of counted
          alignments, or the string ``'nan'`` when none were counted.
        * ``Meta.doc`` -- counted aligned bases divided by the scanned
          span, ``0.0`` when a span was scanned but nothing was counted,
          or the string ``'nan'`` when the span is zero.
        * ``Meta.tlen`` / ``Meta.tlen_std`` -- ``mean`` / ``std`` of the
          ``Welford`` accumulator fed with ``abs(template_length)`` values
          not exceeding ``Meta.tlen_cap``.
    """
    tlen_welford = {}
    for contig in Meta.user_contigs:
        if contig not in tlen_welford:
            tlen_welford[contig] = Welford()
        tlen_stats = tlen_welford[contig]

        # Per-contig tallies.
        read_length_sum, read_count, bp_span = 0, 0, 0
        capped = False

        for (start, end) in Meta.loci[contig]:
            for Aln in Bam.bam.fetch(reference=contig, start=start, end=end):
                if Aln.is_unmapped or not isinstance(Aln.reference_length, int):
                    continue
                if not (start <= alignment_midpoint(Aln) <= end):
                    continue

                read_count += 1
                read_length_sum += Aln.reference_length
                template_length = abs(Aln.template_length)
                if template_length <= Meta.tlen_cap:
                    tlen_stats.update(template_length)

                # The cap can only be crossed right after a read is counted;
                # the span then ends at this read rather than at the
                # interval end.
                if read_count > Meta.n_reads:
                    bp_span += Aln.reference_end - start
                    capped = True
                    break

            if capped:
                break
            bp_span += end - start

        if read_count != 0:
            mean_read_len = read_length_sum / float(read_count)
        else:
            mean_read_len = 'nan'
        Meta.read_len[contig] = mean_read_len

        if bp_span == 0:
            Meta.doc[contig] = 'nan'
        elif read_count == 0:
            # Bases were scanned but nothing was counted.  Multiplying the
            # 'nan' string sentinel by the count and dividing would raise
            # TypeError, so the depth is recorded as zero instead.
            Meta.doc[contig] = 0.0
        else:
            Meta.doc[contig] = (mean_read_len * read_count) / float(bp_span)

        Meta.tlen[contig] = tlen_stats.mean
        Meta.tlen_std[contig] = tlen_stats.std
def gcbin_coverage(AlnFile, Meta, contig, start, end, sequence):
    """Return mean and STD of GC-normalised window read counts in a region.

    The region is split into windows by ``create_windows(window_size,
    sequence)``; each window's GC bin is derived from the matching slice of
    ``sequence``.  The number of alignments whose midpoint falls inside the
    window is divided by the reference count stored in ``Meta.gc_rc`` for
    the same ``(contig, window_size, gc_bin)`` key, and the resulting fold
    changes are accumulated in a ``Welford`` instance.

    NOTE(review): this file also contains a later ``gcbin_coverage``
    definition without the ``sequence`` parameter; if both live at module
    level, the later one replaces this one at import time -- confirm which
    is intended.

    Args:
        AlnFile: object whose ``bam`` attribute exposes
            ``fetch(reference=, start=, end=)``.
        Meta: object providing ``window_lengths`` and the ``gc_rc`` mapping.
        contig: reference name passed to ``fetch``.
        start: region start; window offsets are added to it.
        end: region end, passed to ``region_window_size``.
        sequence: sliceable sequence of the region; windows index into it.

    Returns:
        tuple: ``(mean, std)`` of the per-window fold changes.
    """
    # Window size is chosen from the size of the region.
    window_size = region_window_size(Meta.window_lengths, start, end)

    fold_changes = Welford()
    for window in create_windows(window_size, sequence):
        local_start, local_end = window[0], window[1]
        gc_bin = get_gc_bin(seq_gc_perc(sequence[local_start:local_end]))

        # Windows are region-relative; shift them onto contig coordinates.
        # Adjacent windows share a boundary: (0,4),(4,8),(8,12)...
        win_start, win_end = start + local_start, start + local_end

        read_count = 0
        for Aln in AlnFile.bam.fetch(reference=contig, start=win_start, end=win_end):
            # NOTE(review): the original comment says "skip duplicates", yet
            # only reads that are BOTH duplicate and unmapped are skipped.
            # Left unchanged because the reference counts in Meta.gc_rc may
            # have been built without a duplicate filter -- confirm.
            if Aln.is_duplicate and Aln.is_unmapped:
                continue
            # Some alignments have no reference_start, which makes the
            # midpoint helper fail; such reads are skipped (best effort).
            try:
                midpoint = alignment_midpoint(Aln)
            except Exception:
                continue
            # Count against the window, not the whole region: fetch() also
            # yields reads that merely overlap the window, and the count has
            # to be a per-window figure to be comparable with the per-window
            # reference value it is divided by.
            if win_start <= midpoint <= win_end:
                read_count += 1

        # Fold change relative to the reference count for this GC bin.
        key = tuple2string((contig, window_size, gc_bin))
        try:
            baseline = Meta.gc_rc[key]
        except KeyError:
            # No reference value for this bin.
            gc_fc = 0
        else:
            # A reference that truncates to zero is treated as "no signal".
            gc_fc = read_count / baseline if int(baseline) != 0 else 0

        fold_changes.update(gc_fc)

    return (fold_changes.mean, fold_changes.std)
def gcbin_coverage(AlnFile, Meta, contig, start, end):
    """Return mean and STD of GC-normalised window read counts in a region.

    The region ``start``..``end`` is split into windows by
    ``create_windows(window_size, start, end)``.  For each window the GC bin
    is looked up from ``Meta.fasta``, the alignments whose midpoint falls
    inside the window are counted, and the count is divided by the
    reference count in ``Meta.gc_rc`` for the same
    ``(contig, window_size, gc_bin)`` key.  The fold changes are accumulated
    in a ``Welford`` instance.

    NOTE(review): this file also contains an earlier ``gcbin_coverage``
    definition that takes an extra ``sequence`` parameter; if both live at
    module level, this later one is the one bound at import time -- confirm
    which is intended.

    Args:
        AlnFile: object whose ``bam`` attribute exposes
            ``fetch(reference=, start=, end=)``.
        Meta: object providing ``fasta``, ``window_lengths`` (at least two
            entries are read) and the ``gc_rc`` mapping.
        contig: reference name passed to ``fetch``.
        start: region start.
        end: region end.

    Returns:
        tuple: ``(mean, std)`` of the per-window fold changes.
    """
    fasta = Meta.fasta

    # Use the second window length once the region spans at least two such
    # windows; otherwise use the first.
    region_size = end - start
    if region_size >= (2 * Meta.window_lengths[1]):
        window_size = Meta.window_lengths[1]
    else:
        window_size = Meta.window_lengths[0]

    fold_changes = Welford()
    for window in create_windows(window_size, start, end):
        win_start, win_end = window[0], window[1]
        gc_bin = get_gc_bin(
            get_gc_perc(fasta, window_size, contig, win_start, win_end))

        # The counter is reset for every window; a single counter shared by
        # all windows would make each fold change a running total.
        read_count = 0
        for Aln in AlnFile.bam.fetch(reference=contig, start=win_start, end=win_end):
            # Some alignments have no reference_start, which makes the
            # midpoint helper fail; such reads are skipped (best effort).
            try:
                midpoint = alignment_midpoint(Aln)
            except Exception:
                continue
            # Count against the window, not the whole region: fetch() also
            # yields reads that merely overlap the window.
            if win_start <= midpoint <= win_end:
                read_count += 1

        # Fold change relative to the reference count for this GC bin.
        key = tuple2string((contig, window_size, gc_bin))
        try:
            baseline = Meta.gc_rc[key]
        except KeyError:
            # No reference value for this bin.
            gc_fc = 0
        else:
            # A zero reference would otherwise raise ZeroDivisionError.
            gc_fc = read_count / baseline if baseline != 0 else 0

        fold_changes.update(gc_fc)

    return (fold_changes.mean, fold_changes.std)
def doc_coverage(AlnFile, Meta, contig, start, end):
    """Return the depth-of-coverage fold change of a region.

    The ``reference_length`` of every alignment whose midpoint (from
    ``alignment_midpoint``) lies within ``[start, end]`` is summed, divided
    by the region size to obtain the region's depth, and that depth is
    divided by the contig-wide depth in ``Meta.doc[contig]``.

    Args:
        AlnFile: object whose ``bam`` attribute exposes
            ``fetch(reference=, start=, end=)``.
        Meta: object providing the ``doc`` mapping of contig -> depth.  The
            value may be a number or a numeric string such as ``'nan'``.
        contig: reference name passed to ``fetch`` and used as ``doc`` key.
        start: region start.
        end: region end.

    Returns:
        float: region depth divided by contig depth.  NaN is returned when
        the fold change is undefined: a zero-length region, a contig depth
        of zero, or a contig depth that is itself NaN.

    Raises:
        KeyError: if ``contig`` is missing from ``Meta.doc``.
    """
    # float() accepts both numbers and the 'nan' string sentinel, which
    # could not be used as a divisor directly.
    meta_doc = float(Meta.doc[contig])
    region_size = end - start

    read_length_sum = 0
    for Aln in AlnFile.bam.fetch(reference=contig, start=start, end=end):
        # Some alignments have no reference_start, which makes the midpoint
        # helper fail; such reads are skipped (best effort).
        try:
            midpoint = alignment_midpoint(Aln)
        except Exception:
            continue
        if start <= midpoint <= end:
            read_length_sum += Aln.reference_length

    # Without a region size or a baseline depth there is no fold change to
    # report; return NaN rather than leaving the result unbound.
    if region_size == 0 or meta_doc == 0:
        return float('nan')

    # Explicit float division so integer inputs are not truncated.
    sv_doc = read_length_sum / float(region_size)
    return sv_doc / meta_doc