Exemple #1
0
def compute_gc_binned_metadata(Meta,Bam):
	"""Store the mean and standard deviation of per-window read counts on Meta.

	For every ``(start, end, gc_bin, wlen)`` window in
	``Meta.binned_loci[contig]`` the alignments returned by
	``Bam.bam.fetch`` are counted when their midpoint (as computed by
	``alignment_midpoint``) lies within ``[start, end]``.  Each count is fed
	to a ``Welford`` accumulator keyed by
	``tuple2string((contig, wlen, gc_bin))``.

	Args:
		Meta: object providing ``user_contigs`` and ``binned_loci``; its
			``gc_rc`` and ``gc_std`` mappings are written to.
		Bam: object whose ``bam.fetch(reference=, start=, end=)`` yields
			alignments.

	Returns:
		None.  ``Meta.gc_rc[key]`` receives the accumulator's ``mean`` and
		``Meta.gc_std[key]`` its ``std``.
	"""
	gc_welford = {}

	# for each contig
	for contig in Meta.user_contigs:
		# for each window
		for (start,end,gc_bin,wlen) in Meta.binned_loci[contig]:
			key = tuple2string( ( contig, wlen, gc_bin ) )
			if key not in gc_welford: gc_welford[key] = Welford()

			# count the alignments whose midpoint falls inside the window
			gc_count = 0
			for Aln in Bam.bam.fetch(reference=contig, start=start, end=end):
				# alignments lacking reference coordinates cannot be placed
				if Aln.reference_start is None or Aln.reference_end is None: continue
				if start <= alignment_midpoint(Aln) <= end:
					gc_count += 1
			gc_welford[key].update(gc_count)

	# Publish once, after all contigs: this yields the same final values as
	# re-publishing every accumulator at the end of each contig, without the
	# repeated work.
	for key, stats in gc_welford.items():
		Meta.gc_rc[key] = stats.mean
		Meta.gc_std[key] = stats.std
Exemple #2
0
def compute_contig_metadata(Meta,Bam):
	"""Record per-contig read length, depth of coverage and template length.

	For each contig the loci in ``Meta.loci[contig]`` are scanned in order.
	Mapped alignments whose midpoint lies within the locus contribute to the
	read count and to the summed reference length; template lengths whose
	absolute value is at most ``Meta.tlen_cap`` are fed to a ``Welford``
	accumulator.  Scanning of a contig stops as soon as more than
	``Meta.n_reads`` reads have been counted.

	Args:
		Meta: object providing ``user_contigs``, ``loci``, ``tlen_cap`` and
			``n_reads``; its ``read_len``, ``doc``, ``tlen`` and ``tlen_std``
			mappings are written to.
		Bam: object whose ``bam.fetch(reference=, start=, end=)`` yields
			alignments.

	Returns:
		None.  Per contig:
		``Meta.read_len`` is the mean reference length of counted reads, or
		the string ``'nan'`` when no read was counted;
		``Meta.doc`` is summed reference length divided by the scanned span,
		or the string ``'nan'`` when the span is zero;
		``Meta.tlen`` / ``Meta.tlen_std`` are the accumulator's ``mean`` and
		``std``.
	"""
	tlen_welford = {}

	# for each contig
	for contig in Meta.user_contigs:
		## reset these variables for each contig
		if contig not in tlen_welford: tlen_welford[contig] = Welford()
		read_length_sum, read_count, bp_span = 0, 0, 0
		terminator = False
		##
		for (start,end) in Meta.loci[contig]:
			if terminator: break

			for Aln in Bam.bam.fetch(reference=contig,start=start,end=end):
				if Aln.is_unmapped or not isinstance(Aln.reference_length, int): continue

				midpoint = alignment_midpoint(Aln)

				if start <= midpoint <= end:
					read_count += 1
					read_length_sum += Aln.reference_length

					template_length = abs(Aln.template_length)
					if template_length <= Meta.tlen_cap:
						tlen_welford[contig].update(template_length)

				# enough reads sampled: only the part of this locus scanned
				# so far counts towards the span
				if read_count > Meta.n_reads:
					bp_span += Aln.reference_end - start
					terminator = True
					break

			# the locus was scanned completely, so its whole length counts
			if not terminator:
				bp_span += (end-start)

		if read_count != 0:
			Meta.read_len[contig] = read_length_sum / float(read_count)
		else:
			Meta.read_len[contig] = 'nan'

		if bp_span != 0:
			# Use the summed length directly (it equals mean length times
			# read count).  Going through Meta.read_len would multiply the
			# 'nan' string placeholder when no read was counted and raise
			# TypeError on the division.
			Meta.doc[contig] = read_length_sum / float(bp_span)
		else:
			Meta.doc[contig] = 'nan'

		Meta.tlen[contig] = tlen_welford[contig].mean
		Meta.tlen_std[contig] = tlen_welford[contig].std
Exemple #3
0
def gcbin_coverage(AlnFile, Meta, contig, start, end, sequence):
	"""Return the mean and std of per-window read-count fold changes in a region.

	The region is cut into windows by ``create_windows(window_size, sequence)``
	where ``window_size`` comes from ``region_window_size``.  For each window
	the GC bin of its slice of ``sequence`` is looked up, reads are counted,
	and the count is divided by the baseline ``Meta.gc_rc[key]`` for
	``key = tuple2string((contig, window_size, gc_bin))``.  The fold changes
	are accumulated in a ``Welford`` object.

	Args:
		AlnFile: object whose ``bam.fetch(reference=, start=, end=)`` yields
			alignments.
		Meta: object providing ``window_lengths`` and the ``gc_rc`` mapping.
		contig: reference name passed to ``fetch``.
		start: region start; window offsets are added to it.
		end: region end, used as the upper bound of the midpoint test.
		sequence: region sequence; windows are offsets into it.

	Returns:
		Tuple ``(mean, std)`` taken from the ``Welford`` accumulator.
	"""
	# choose window size based on the size of the region
	window_size = region_window_size(Meta.window_lengths, start, end)

	# find the mean and STD binned coverage in the SV region in relation to GC content
	Region = Welford()

	for window in create_windows(window_size, sequence):
		local_start, local_end = window[0], window[1]

		# find the gc bin for the window
		gc_bin = get_gc_bin(seq_gc_perc(sequence[local_start:local_end]))

		# translate window offsets into reference coordinates
		win_start, win_end = start+local_start, start+local_end    # 1 bp overlap in windows ((0,4),(4,8),(8,12)....)

		# count up the number of reads in the window
		read_count = 0
		for Aln in AlnFile.bam.fetch(reference = contig, start = win_start, end = win_end):
			# skip duplicates and unmapped reads; requiring both flags at
			# once (the previous `and`) let plain duplicates through
			if Aln.is_duplicate or Aln.is_unmapped: continue
			# Some Aln.reference_start == None, which causes error
			try:
				midpoint = alignment_midpoint(Aln)
			except Exception:
				continue
			# NOTE(review): the midpoint is tested against the whole region
			# (start/end), not this window (win_start/win_end) - confirm
			# this is intended.
			if start <= midpoint <= end:
				read_count += 1

		# calculate the fold change of read count in window to the count of the null with similar gc content
		key = tuple2string( (contig, window_size, gc_bin) )

		try:
			# int() truncates, so a baseline below 1 is treated as zero
			if int(Meta.gc_rc[key]) != 0:
				gc_fc = read_count / Meta.gc_rc[key]
			else:
				gc_fc = 0
		except KeyError:
			# no baseline recorded for this contig / window size / GC bin
			gc_fc = 0

		# add the gc fold change value to list of all windows in the sv region
		Region.update(gc_fc)

	# return mean and STD for all the windows in the region
	return (Region.mean, Region.std)
Exemple #4
0
def gcbin_coverage(AlnFile, Meta, contig, start, end):
    """Return the mean and std of per-window read-count fold changes in a region.

    The region ``start``..``end`` is cut into windows by
    ``create_windows(window_size, start, end)``.  ``window_size`` is
    ``Meta.window_lengths[0]`` unless the region is at least twice
    ``Meta.window_lengths[1]``, in which case the latter is used.  For each
    window the reads are counted and divided by the baseline
    ``Meta.gc_rc[key]`` for ``key = tuple2string((contig, window_size, gc_bin))``;
    the fold changes are accumulated in a ``Welford`` object.

    Args:
        AlnFile: object whose ``bam.fetch(reference=, start=, end=)`` yields
            alignments.
        Meta: object providing ``fasta``, ``window_lengths`` and ``gc_rc``.
        contig: reference name passed to ``fetch``.
        start: region start.
        end: region end.

    Returns:
        Tuple ``(mean, std)`` taken from the ``Welford`` accumulator.
    """
    fasta = Meta.fasta

    # choose window size based on the size of the region
    region_size = end - start
    window_size = Meta.window_lengths[0]
    if region_size >= (2 * Meta.window_lengths[1]):
        window_size = Meta.window_lengths[1]

    # find the mean and STD binned coverage in the SV region in relation to GC content
    Region = Welford()

    for window in create_windows(window_size, start, end):
        win_start, win_end = window[0], window[1]

        # find the gc bin for the window
        gc_bin = get_gc_bin(
            get_gc_perc(fasta, window_size, contig, win_start, win_end))

        # count up the number of reads in the window; the counter restarts
        # for every window so earlier windows do not inflate later ones
        read_count = 0
        for Aln in AlnFile.bam.fetch(reference=contig,
                                     start=win_start,
                                     end=win_end):
            # Some Aln.reference_start == None, which causes error
            try:
                midpoint = alignment_midpoint(Aln)
            except Exception:
                continue
            # NOTE(review): the midpoint is tested against the whole region
            # (start/end), not this window (win_start/win_end) - confirm
            # this is intended.
            if start <= midpoint <= end:
                read_count += 1

        # calculate the fold change of read count in window to the count of the null with similar gc content
        key = tuple2string((contig, window_size, gc_bin))
        try:
            baseline = Meta.gc_rc[key]
        except KeyError:
            # no baseline recorded for this contig / window size / GC bin
            baseline = 0
        # a missing or zero baseline gives a fold change of 0 instead of
        # raising KeyError / ZeroDivisionError
        gc_fc = read_count / baseline if baseline != 0 else 0

        # add the gc fold change value to list of all windows in the sv region
        Region.update(gc_fc)

    # return mean and STD for all the windows in the region
    return (Region.mean, Region.std)
Exemple #5
0
def doc_coverage(AlnFile, Meta, contig, start, end):
    """Return the depth-of-coverage fold change of a region against its contig.

    The reference lengths of all fetched alignments whose midpoint (as
    computed by ``alignment_midpoint``) lies within ``[start, end]`` are
    summed and divided by the region size ``end - start``; that depth is then
    divided by the contig baseline ``Meta.doc[contig]``.

    Args:
        AlnFile: object whose ``bam.fetch(reference=, start=, end=)`` yields
            alignments.
        Meta: object providing the ``doc`` mapping of per-contig baselines.
        contig: reference name; also the key into ``Meta.doc``.
        start: region start.
        end: region end.

    Returns:
        The fold change as a float.  ``float('nan')`` is returned when the
        ratio is undefined: the region has zero size, or the baseline is
        zero, NaN, or not convertible to a number (for example the string
        ``'nan'``).

    Raises:
        KeyError: if ``contig`` is not present in ``Meta.doc``.
    """
    meta_doc = Meta.doc[contig]
    region_size = end - start
    undefined = float('nan')

    # find the doc fold change for the sv region
    read_length_sum = 0
    for Aln in AlnFile.bam.fetch(reference=contig, start=start, end=end):
        # Some Aln.reference_start == None, which causes error
        try:
            midpoint = alignment_midpoint(Aln)
        except Exception:
            continue
        if start <= midpoint <= end:
            read_length_sum += Aln.reference_length

    # the baseline may be a non-numeric placeholder; float('nan') parses to NaN
    try:
        baseline = float(meta_doc)
    except (TypeError, ValueError):
        return undefined

    # `baseline != baseline` is true only for NaN
    if region_size == 0 or baseline == 0 or baseline != baseline:
        return undefined

    # float() keeps the division exact when both operands are integers
    sv_doc = read_length_sum / float(region_size)
    return sv_doc / baseline