def count_reads(transcripts, bam_iter, number_of_counts=1):
    """
    Count the reads that are compatible with each transcript.

    A read adds one to a transcript's count when both of these hold:

    * the number of (block, exon) containments found for that transcript
      equals the number of aligned blocks of the read, and
    * the set of junction lengths of the read is a subset of the
      transcript's intron lengths.

    :TODO rename
    :TODO change to cython

    Arguments
    ---------
    transcripts : list
        list of transcripts, each a position-sorted list of exons whose
        first two items are the exon start and end
    bam_iter : pysam.BamFileIterator
        gotten after pysam.BamFile.fetch() call
    number_of_counts : int
        not used by this function; kept so existing callers keep working

    Returns
    -------
    out_counts : the accumulator created by ``zeros(len(transcripts))``,
        holding one count per transcript
    """
    n_transcripts = len(transcripts)
    out_counts = zeros(n_transcripts)
    tree = Intersecter()
    # One set of intron lengths per transcript.  Built once here so the
    # subset test in the read loop does not rebuild it for every read.
    intron_length_sets = []
    # Assume exons are position sorted
    for ti, transcript in enumerate(transcripts):
        ex_list = []
        for j, exon in enumerate(transcript):
            # Tag every exon with the index of the transcript it belongs to.
            tree.add_interval(
                Interval(int(exon[0]), int(exon[1]), value={'anno': ti}))
            if j != 0:
                # Previous exon end minus this exon start; the read side
                # below uses the same sign convention.
                ex_list.append(transcript[j - 1][1] - exon[0])
        intron_length_sets.append(set(ex_list))

    # NOTE(review): nesting reconstructed from whitespace-collapsed source;
    # the intron test is assumed to run once per read, after the block
    # loop -- confirm against the original file.
    for read in bam_iter:
        block_counter = zeros((n_transcripts,))
        intron_match = zeros((n_transcripts,))
        blocks = read.get_blocks()

        # Previous block end minus this block start, for consecutive blocks.
        junction_lengths = set()
        for i, block in enumerate(blocks):
            if i != 0:
                junction_lengths.add(blocks[i - 1][1] - block[0])

        for block in blocks:
            overlap = tree.find(block[0], block[1])
            if len(overlap) == 0:
                # A block outside every exon: block_counter can no longer
                # reach len(blocks), so stop looking.
                break
            for s in overlap:
                if (block[0] >= s.start) and (block[1] <= s.end):
                    block_counter[s.value['anno']] += 1

        for ti, intron_lengths in enumerate(intron_length_sets):
            if junction_lengths.issubset(intron_lengths):
                intron_match[ti] = 1

        smatch = nrepeat(len(blocks), n_transcripts)
        # The per-read vectors are no longer stored: the matrix built from
        # them was never used and failed when bam_iter yielded no reads.
        out_counts += logical_and(block_counter == smatch, intron_match)
    return out_counts
def calculate_score(chrom, start, end, ctype="WPS"):
    """Return one score per position of the inclusive region [start, end].

    Reads come from ``readIterator(args, chrom, start, end)``.  A read is
    dropped when it is a duplicate, failed QC, is unmapped or is
    soft-clipped.  For paired reads the mate must be mapped to the same
    reference, the insert size must be non-zero, and only read1 (or a
    read2 for which ``read.pnext + read.qlen < start``) is used, so each
    fragment is handled once.  When ``options.downsample`` is set, a read
    is kept only if ``random.random() < options.downsample``.

    Each remaining read becomes a 1-based, end-inclusive fragment interval.
    Fragments whose length lies within
    ``[options.minLength, options.maxLength]`` are kept.

    Parameters
    ----------
    chrom, start, end :
        region handed to ``readIterator``; ``start`` and ``end`` are
        compared against 1-based coordinates
    ctype : str
        "COV"    - number of kept fragments covering each position
        "STARTS" - number of kept fragment end points (start or end) at
                   each position
        "WPS"    - per position, the number of kept fragments spanning the
                   whole window ``pos +/- options.protection // 2`` minus
                   the number of overlapping fragments that do not span it
        Any other value yields zeros.

    Returns
    -------
    list with ``end - start + 1`` integer scores, in position order
    """
    filteredReads = Intersecter()
    posRange = defaultdict(int)

    def _downsampled_out():
        # Draws a random number only when down-sampling is configured, as
        # the original did.
        return (options.downsample is not None
                and random.random() >= options.downsample)

    def _record_fragment(rstart, rend):
        # Store the fragment if its length passes the filter and update the
        # per-position counts for the COV / STARTS score types.
        # NOTE(review): source was whitespace-collapsed; the counting is
        # assumed to sit inside the length filter -- confirm.
        rlength = rend - rstart + 1
        if not (options.minLength <= rlength <= options.maxLength):
            return
        filteredReads.add_interval(Interval(rstart, rend))
        if ctype == "COV":
            # Visit only the positions inside the region instead of
            # walking the whole fragment and range-checking each position.
            for i in range(max(rstart, start), min(rend, end) + 1):
                posRange[i] += 1
        elif ctype == "STARTS":
            if start <= rstart <= end:
                posRange[rstart] += 1
            if start <= rend <= end:
                posRange[rend] += 1

    for read in readIterator(args, chrom, start, end):
        if read.is_duplicate or read.is_qcfail or read.is_unmapped:
            continue
        if isSoftClipped(read.cigar):
            continue

        if read.is_paired:
            if read.mate_is_unmapped:
                continue
            if read.rnext != read.tid:
                continue
            if not (read.is_read1 or
                    (read.is_read2 and read.pnext + read.qlen < start)):
                continue
            if read.isize == 0:
                continue
            if _downsampled_out():
                continue
            rstart = min(read.pos, read.pnext) + 1  # 1-based
            rend = rstart + abs(read.isize) - 1  # end included
        else:
            if _downsampled_out():
                continue
            rstart = read.pos + 1  # 1-based
            rend = rstart + aln_length(read.cigar) - 1  # end included
        _record_fragment(rstart, rend)

    if ctype == "WPS":
        protection = options.protection // 2
        for pos in xrange(start, end + 1):
            rstart, rend = pos - protection, pos + protection
            gcount, bcount = 0, 0
            for read in filteredReads.find(rstart, rend):
                # A fragment that starts after the window start or ends
                # before the window end does not span the window.
                if (read.start > rstart) or (read.end < rend):
                    bcount += 1
                else:
                    gcount += 1
            posRange[pos] += gcount - bcount

    return [posRange[pos] for pos in xrange(start, end + 1)]
def count_reads(transcripts, bam_iter, number_of_counts=1):
    """
    Count the reads that are compatible with each transcript.

    A read adds one to a transcript's count when both of these hold:

    * the number of (block, exon) containments found for that transcript
      equals the number of aligned blocks of the read, and
    * the set of junction lengths of the read is a subset of the
      transcript's intron lengths.

    :TODO rename
    :TODO change to cython

    Arguments
    ---------
    transcripts : list
        list of transcripts, each a position-sorted list of exons whose
        first two items are the exon start and end
    bam_iter : pysam.BamFileIterator
        gotten after pysam.BamFile.fetch() call
    number_of_counts : int
        not used by this function; kept so existing callers keep working

    Returns
    -------
    out_counts : the accumulator created by ``zeros(len(transcripts))``,
        holding one count per transcript
    """
    n_transcripts = len(transcripts)
    out_counts = zeros(n_transcripts)
    tree = Intersecter()
    # Intron-length sets are loop-invariant with respect to the reads, so
    # they are built once, one per transcript.
    intron_length_sets = []
    # Assume exons are position sorted
    for ti, transcript in enumerate(transcripts):
        ex_list = []
        for j, exon in enumerate(transcript):
            # Tag every exon with the index of the transcript it belongs to.
            tree.add_interval(
                Interval(int(exon[0]), int(exon[1]), value={'anno': ti}))
            if j != 0:
                # Previous exon end minus this exon start; the read side
                # below uses the same sign convention.
                ex_list.append(transcript[j - 1][1] - exon[0])
        intron_length_sets.append(set(ex_list))

    # NOTE(review): nesting reconstructed from whitespace-collapsed source;
    # the intron test is assumed to run once per read, after the block
    # loop -- confirm against the original file.
    for read in bam_iter:
        block_counter = zeros((n_transcripts,))
        intron_match = zeros((n_transcripts,))
        blocks = read.get_blocks()

        # Previous block end minus this block start, for consecutive blocks.
        junction_lengths = set()
        for i, block in enumerate(blocks):
            if i != 0:
                junction_lengths.add(blocks[i - 1][1] - block[0])

        for block in blocks:
            overlap = tree.find(block[0], block[1])
            if len(overlap) == 0:
                # A block outside every exon: block_counter can no longer
                # reach len(blocks), so stop looking.
                break
            for s in overlap:
                if (block[0] >= s.start) and (block[1] <= s.end):
                    block_counter[s.value['anno']] += 1

        for ti, intron_lengths in enumerate(intron_length_sets):
            if junction_lengths.issubset(intron_lengths):
                intron_match[ti] = 1

        smatch = nrepeat(len(blocks), n_transcripts)
        # The per-read vectors are no longer stored: the matrix built from
        # them was never used and failed when bam_iter yielded no reads.
        out_counts += logical_and(block_counter == smatch, intron_match)
    return out_counts
def maps_gene(mapped):
    '''Determine if the mapped alignment falls within a gene.

    An ``Intersecter`` holding every gene of a genome is built from the
    database the first time that genome is seen and is cached in the
    module-level ``intersecters`` dict, keyed by genome.

    Parameters
    ----------
    mapped : dict
        must provide the keys 'genome', 'refStart' and 'refEnd'

    Returns
    -------
    Whatever ``Intersecter.find(refStart, refEnd)`` returns for that
    genome (presumably the uids of the overlapping genes -- confirm
    against the Interval/Intersecter implementation in use).
    '''
    genome = mapped['genome']
    try:
        intersecter = intersecters[genome]
    except KeyError:
        # First request for this genome: load its genes and index them.
        genes = db.feature.find({'genome': genome, 'type': 'gene'})
        intersecter = Intersecter()
        for gene in genes:
            # Interval end is exclusive, need to +1 to line up with actual
            # position
            intersecter.add_interval(
                Interval(gene['start'], gene['end'] + 1, gene['uid']))
        intersecters[genome] = intersecter
    return intersecter.find(mapped['refStart'], mapped['refEnd'])
def maps_gene(mapped):
    """Determine if the mapped alignment falls within a gene.

    An ``Intersecter`` holding every gene of a genome is built from the
    database the first time that genome is seen and is cached in the
    module-level ``intersecters`` dict, keyed by genome.

    Parameters
    ----------
    mapped : dict
        must provide the keys 'genome', 'refStart' and 'refEnd'

    Returns
    -------
    Whatever ``Intersecter.find(refStart, refEnd)`` returns for that
    genome (presumably the uids of the overlapping genes -- confirm
    against the Interval/Intersecter implementation in use).
    """
    genome = mapped['genome']
    try:
        intersecter = intersecters[genome]
    except KeyError:
        # First request for this genome: load its genes and index them.
        genes = db.feature.find({'genome': genome, 'type': 'gene'})
        intersecter = Intersecter()
        for gene in genes:
            # Interval end is exclusive, need to +1 to line up with actual
            # position
            intersecter.add_interval(
                Interval(gene['start'], gene['end'] + 1, gene['uid']))
        intersecters[genome] = intersecter
    return intersecter.find(mapped['refStart'], mapped['refEnd'])
# Score every position of [init_pos, final_pos] and write the scores as a
# fixedStep wiggle track to `outfile`.
# NOTE(review): the enclosing scope is not visible here; `outfile`,
# `start_end_list`, `chrom`, `init_pos`, `final_pos` and `start_time` are
# assumed to be set up earlier -- confirm.
start, end = init_pos, final_pos
window_size = 120  ## define the window size (could be made an option)
prot_region = window_size // 2  ## half-window on each side; definitely a parameter worth messing with

## write the first line of info for the peak calling file
## (no trailing newline: every score below is written with a leading "\n")
outfile.write("fixedStep chrom=chr%s start=%d step=1" % (chrom, start))

## This is where the scoring occurs
for pos in range(start, end + 1):
    ## get the start/end points of the window in current position
    w_start, w_end = pos - prot_region, pos + prot_region  ## define the window at each position in region
    end_points = 0  ## set to 0 each window shift
    intact_reads = 0

    ## get the number of fragments within the window that are fully intact (+1)
    ## or have an endpoint (-1)
    for read in start_end_list.find(w_start, w_end):
        ## a fragment that starts after the window start or ends before the
        ## window end does not span the whole window
        if (read.start > w_start) or (read.end < w_end):
            end_points += 1
        else:
            intact_reads += 1

    ## write the score for each window position to the output wig file
    outfile.write("\n" + str(intact_reads - end_points))

## close the file
outfile.close()

## print total execution time
sys.stderr.write("--- %s seconds ---\n" % (time.time() - start_time))
posRange[rend][1] += 1 elif read.is_reverse: if rend >= regionStart and rend <= regionEnd: posRange[rend][1] += 1 else: if (rstart >= regionStart and rstart <= regionEnd): posRange[rstart][1] += 1 filename = options.outfile % cid outfile = gzip.open(filename, 'w') cov_sites = 0 outLines = [] for pos in range(regionStart, regionEnd + 1): rstart, rend = pos - protection, pos + protection gcount, bcount = 0, 0 for read in filteredReads.find(rstart, rend): if (read.start > rstart) or (read.end < rend): bcount += 1 else: gcount += 1 covCount, startCount = posRange[pos] cov_sites += covCount outLines.append( "%s\t%d\t%d\t%d\t%d\n" % (chrom, pos, covCount, startCount, gcount - bcount)) if strand == "-": outLines = outLines[::-1] for line in outLines: outfile.write(line) outfile.close() if cov_sites == 0 and not options.empty: os.remove(filename)
def compare_two_transcripts(trans1, trans2, transcript_dict, afe=False):
    """
    Returns the splice differences between two transcripts.
    Single exon-comparisons are ignored.

    Parameters
    ----------
    trans1 : string
        transcript of interest
    trans2 : string
        second transcript of interest
    transcript_dict : a dictionary of transcript names with
        values being a list of exons; each exon is unpacked below as
        (start, end, exon number)
    afe : bool
        whether to include alternate start and ends
        (not read anywhere in this function)

    :TODO make a better return
    :TODO maybe include something similar to to_plot

    Returns
    -------
    ([], []) when either transcript has at most one exon, otherwise a
    tuple (matching_exons, skipped_exons):

    matching_exons : list of
        (start, end, (s1 exon number, s2 exon number), (0, 0))
        for exons with identical coordinates in both transcripts
    skipped_exons : EventCollection
        built from the 'skipped_exon' DiffEvents, with the 'AE' / 'AS'
        DiffEvents appended to its ``events``

    Exclusive junctions are collected internally but are not returned.
    """
    # NOTE(review): this block was recovered from whitespace-collapsed
    # source; the nesting below is the most plausible reading -- confirm
    # against the original file before relying on branch boundaries.
    # TODO refactor this
    t1 = transcript_dict[trans1]
    t2 = transcript_dict[trans2]
    tree = Intersecter()
    starts1 = [i[0] for i in t1]
    starts2 = [i[0] for i in t2]
    reverse = False
    # s1 is the transcript whose first exon starts first; s2 is the other.
    if min(starts1) <= min(starts2):
        s1 = t1
        s2 = t2
        s2_beg = min(starts2)  # assigned but not used below
    else:
        s1 = t2
        s2 = t1
        reverse = True
        s2_beg = min(starts1)  # assigned but not used below
    # torder keeps the transcript names in (s1, s2) order.
    if reverse:
        torder = (trans2, trans1)
    else:
        torder = (trans1, trans2)
    # Ignore single-exon stuff
    if len(s1) <= 1 or len(s2) <= 1:
        return ([], [])
    # Index the exons of s1, tagged with their exon number.
    for i in s1:
        tree.add_interval(Interval(int(i[0]), int(i[1]),
                                   value={'anno': i[2]}))
    matching_exons = []
    exclusive_juncs = []
    skipped_exons = []
    altends = []
    exon_match = {}  # s2 exon number -> overlapping s1 exon number
    # NOTE(review): these sorts are in place, so the lists stored in
    # transcript_dict are reordered for the caller as well.
    s1.sort(key=lambda x: x[0])
    s2.sort(key=lambda x: x[0])
    max_exon_1 = s1[-1][2]
    max_exon_2 = s2[-1][2]
    #end_position_s2 = max([i[1] for i in s2])
    s1_end = max([i[1] for i in s1])
    prev_match = None  # last s1 interval that overlapped an s2 exon
    # Direction in which exon numbers change along increasing coordinates:
    # -1 when the right-most exon carries a smaller number than the
    # left-most one.
    if max_exon_1 < s1[0][2]:
        strand = -1
    else:
        strand = 1
    for pcurr in range(len(s2)):
        start, end, exon_n = s2[pcurr]
        overlap = tree.find(int(start), int(end))
        if len(overlap) == 0:
            # This s2 exon has no counterpart in s1.
            if prev_match and (start < s1_end):
                #skipped exons
                cigar = _generate_cigar(s2, pcurr, mskip=1)
                try:
                    # The lookup raises KeyError when the preceding s2 exon
                    # had no overlap either (handled below).
                    if exon_match[exon_n - strand] == prev_match.value['anno']:
                        try:
                            nm = tree.find(*s2[pcurr + 1][0:2])[0]
                            ocigar = [(3, nm.start - prev_match.end)]
                            nexon = nm.value['anno']
                        except IndexError:
                            # No next s2 exon, or it overlaps nothing in
                            # s1: fall back to the s1 exon following the
                            # previous match.
                            nm = s1[_get_by_exonn(
                                prev_match.value['anno'] + strand, s1)]
                            ocigar = [(3, nm[0] - prev_match.end)]
                            nexon = nm[2]
                        skipped_exons.append(
                            DiffEvent('skipped_exon', start, end, torder,
                                      cigar2=cigar, cigar1=ocigar,
                                      exon_num=(None, exon_n),
                                      exon2=(prev_match.value['anno'],
                                             nexon)))
                except KeyError:
                    # Multiple skipped exons
                    ncig = _generate_cigar(s2, pcurr, mskip=1)[1:]
                    skipped_exons[-1]._extend(ncig, cig=2)
            elif start > s1_end:
                # The s2 exon lies beyond the last s1 exon.
                if prev_match:
                    cigar = _generate_cigar(s2, pcurr, mskip=1)
                    pm = tree.find(*s2[pcurr - 1][0:2])[0]
                    pexon = pm.value['anno']
                    ocigar = []
                    for i in range(pexon, max_exon_1 + strand, strand):
                        narg = _get_by_exonn(i, s1)
                        # NOTE(review): 3-item tuple here, while the 'AS'
                        # branch below uses (0, length) -- confirm the
                        # intended cigar shape.
                        ocigar.append((0, s1[narg][0], s1[narg][1]))
                        try:
                            ocigar.append((3, s1[narg][1] - s1[narg + 1][0]))
                        except IndexError:
                            pass
                    #:TODO extend ocigar till end?
                    altends.append(
                        DiffEvent('AE', start, end, torder,
                                  cigar2=cigar, cigar1=ocigar,
                                  exon_num=(None, exon_n)))
                else:
                    pass
            else:
                # Alternate start site that starts in between exons
                # of other transcript
                cigar = _generate_cigar(s2, pcurr, mskip=1)[:-1]
                try:
                    nm = tree.find(*s2[pcurr + 1][0:2])[0]
                except IndexError:
                    # NOTE(review): leftover interactive debugging hook.
                    # After it returns, `nm` is still unbound, so the next
                    # statement raises NameError.
                    from IPython import embed
                    embed()
                nexon = nm.value['anno']
                narg = _get_by_exonn(nexon - strand, s1)
                pmatch = s1[narg]
                ocigar = [(0, pmatch[1] - pmatch[0]),
                          (3, nm.start - pmatch[1])]
                altends.append(
                    DiffEvent('AS', start, end, torder,
                              cigar2=cigar, cigar1=ocigar))
        elif len(overlap) == 1:
            if start == overlap[0].start and end == overlap[0].end:
                # Identical exon in both transcripts.
                s1_exon_n = overlap[0].value['anno']
                matching_exons.append(
                    (start, end, (s1_exon_n, exon_n), (0, 0)))
                if prev_match:
                    if s1_exon_n - prev_match.value['anno'] == strand:
                        pass
                    else:
                        # Difference in exon matches
                        # One or more s1 exons between the previous and
                        # the current match are absent from s2.
                        mskip = abs(s1_exon_n - prev_match.value['anno']) - 1
                        narg = _get_by_exonn(prev_match.value['anno'] + strand,
                                             s1)
                        s_s1 = s1[narg]  # skipped s1
                        cigar = _generate_cigar(s1, narg, mskip=mskip)
                        ocigar = [(3, start - s2[pcurr - 1][1])]
                        # Remove previous one
                        skipped_exons.append(
                            DiffEvent('skipped_exon', s_s1[0], s_s1[1],
                                      torder, cigar2=ocigar, cigar1=cigar,
                                      exon_num=(s_s1[2], None),
                                      exon2=(exon_n - strand, exon_n)))
                prev_match = overlap[0]
            else:
                # Partial overlap: the exon boundaries differ.
                sstart = min(start, overlap[0].start)
                ssend = max(end, overlap[0].end)
                # Ignore 5' or 3' differences
                if (exon_n == max_exon_2 and
                        overlap[0].value['anno'] == max_exon_1):
                    if end == overlap[0].end:
                        prev_match = overlap[0]
                else:
                    # NOTE(review): this `else` is assumed to pair with the
                    # outer `if`; exclusive_juncs is never returned, so the
                    # pairing does not affect the result.
                    exclusive_juncs.append(
                        (sstart, ssend,
                         (overlap[0].value['anno'], exon_n),
                         (overlap[0].start - start, overlap[0].end - end)))
                # Deal with partial matches
                prev_match = overlap[0]
            exon_match[exon_n] = int(overlap[0].value['anno'])
        else:
            # More than one s1 exon overlaps this s2 exon: not handled.
            pass
    skipped_exons = EventCollection(transcript_ids=[s1, s2],
                                    events=skipped_exons)
    skipped_exons.events.extend(altends)
    return (matching_exons, skipped_exons)
def calculate_score(chrom, start, end, ctype="WPS"):
    """Return one score per position of the inclusive region [start, end].

    Reads come from ``readIterator(args, chrom, start, end)``.  A read is
    dropped when it is a duplicate, failed QC, is unmapped or is
    soft-clipped.  For paired reads the mate must be mapped to the same
    reference, the insert size must be non-zero, and only read1 (or a
    read2 for which ``read.pnext + read.qlen < start``) is used, so each
    fragment is handled once.  When ``options.downsample`` is set, a read
    is kept only if ``random.random() < options.downsample``.

    Each remaining read becomes a 1-based, end-inclusive fragment interval.
    Fragments whose length lies within
    ``[options.minLength, options.maxLength]`` are kept.

    Parameters
    ----------
    chrom, start, end :
        region handed to ``readIterator``; ``start`` and ``end`` are
        compared against 1-based coordinates
    ctype : str
        "COV"    - number of kept fragments covering each position
        "STARTS" - number of kept fragment end points (start or end) at
                   each position
        "WPS"    - per position, the number of kept fragments spanning the
                   whole window ``pos +/- options.protection // 2`` minus
                   the number of overlapping fragments that do not span it
        Any other value yields zeros.

    Returns
    -------
    list with ``end - start + 1`` integer scores, in position order
    """
    filteredReads = Intersecter()
    posRange = defaultdict(int)

    def _downsampled_out():
        # Draws a random number only when down-sampling is configured, as
        # the original did.
        return (options.downsample is not None
                and random.random() >= options.downsample)

    def _record_fragment(rstart, rend):
        # Store the fragment if its length passes the filter and update the
        # per-position counts for the COV / STARTS score types.
        # NOTE(review): source was whitespace-collapsed; the counting is
        # assumed to sit inside the length filter -- confirm.
        rlength = rend - rstart + 1
        if not (options.minLength <= rlength <= options.maxLength):
            return
        filteredReads.add_interval(Interval(rstart, rend))
        if ctype == "COV":
            # Visit only the positions inside the region instead of
            # walking the whole fragment and range-checking each position.
            for i in range(max(rstart, start), min(rend, end) + 1):
                posRange[i] += 1
        elif ctype == "STARTS":
            if start <= rstart <= end:
                posRange[rstart] += 1
            if start <= rend <= end:
                posRange[rend] += 1

    for read in readIterator(args, chrom, start, end):
        if read.is_duplicate or read.is_qcfail or read.is_unmapped:
            continue
        if isSoftClipped(read.cigar):
            continue

        if read.is_paired:
            if read.mate_is_unmapped:
                continue
            if read.rnext != read.tid:
                continue
            if not (read.is_read1 or
                    (read.is_read2 and read.pnext + read.qlen < start)):
                continue
            if read.isize == 0:
                continue
            if _downsampled_out():
                continue
            rstart = min(read.pos, read.pnext) + 1  # 1-based
            rend = rstart + abs(read.isize) - 1  # end included
        else:
            if _downsampled_out():
                continue
            rstart = read.pos + 1  # 1-based
            rend = rstart + aln_length(read.cigar) - 1  # end included
        _record_fragment(rstart, rend)

    if ctype == "WPS":
        protection = options.protection // 2
        for pos in xrange(start, end + 1):
            rstart, rend = pos - protection, pos + protection
            gcount, bcount = 0, 0
            for read in filteredReads.find(rstart, rend):
                # A fragment that starts after the window start or ends
                # before the window end does not span the window.
                if (read.start > rstart) or (read.end < rend):
                    bcount += 1
                else:
                    gcount += 1
            posRange[pos] += gcount - bcount

    return [posRange[pos] for pos in xrange(start, end + 1)]
if (rstart >= start and rstart <= end): posRange[rstart][1]+=1 if rend >= start and rend <= end: posRange[rend][1]+=1 elif read.is_reverse: if rend >= start and rend <= end: posRange[rend][1]+=1 else: if (rstart >= start and rstart <= end): posRange[rstart][1]+=1 protection = options.protection//2 for pos in range(start,end+1): rstart,rend = pos-protection,pos+protection gcount,bcount = 0,0 for read in filteredReads.find(rstart,rend): if (read.start > rstart) or (read.end < rend): bcount +=1 else: gcount +=1 posRange[pos][2]+=gcount-bcount if options.coverage != "OFF": output = sys.stdout if options.coverage != "": output = gzip.open(options.coverage,'w') output.write("fixedStep chrom=chr%s start=%d step=1\n"%(outchrom,start)) for pos in range(start,end+1): output.write("%d\n"%(posRange[pos][0])) if options.coverage != "": output.close() else: output.write("\n") if options.starts != "OFF":
The threshold can also be adjusted (thresh_max). """ for line in smooth_list: if line > 0 and not started: started = True start_base = pos + startpos if line < 0 and started: thresh += 1 if thresh >= thresh_max: thresh = 0 started = False end_base = pos + startpos region_len = end_base - start_base if region_len >= nucl_min and region_len <= nucl_max: nucl.add_interval(Interval(start_base, end_base)) count += 1 pos += 1 ## initialize a list for data from find_contig() method contig_list = [] for read in nucl.find(0, end_base + 1): ## for each potential nucleosome interval, call find contig method. find_contig(read, smooth_list, startpos, contig_list) ## lastly, call this method to print all probable nucleosome ranges to a bed file calc_nucl_distance(contig_list, smooth_list, startpos, chrom_num)
def compare_two_transcripts(trans1, trans2, transcript_dict, afe=False):
    """
    Returns the splice differences between two transcripts.
    Single exon-comparisons are ignored.

    Parameters
    ----------
    trans1 : string
        transcript of interest
    trans2 : string
        second transcript of interest
    transcript_dict : a dictionary of transcript names with
        values being a list of exons; each exon is unpacked below as
        (start, end, exon number)
    afe : bool
        whether to include alternate start and ends
        (not read anywhere in this function)

    :TODO make a better return
    :TODO maybe include something similar to to_plot

    Returns
    -------
    ([], []) when either transcript has at most one exon, otherwise a
    tuple (matching_exons, skipped_exons):

    matching_exons : list of
        (start, end, (s1 exon number, s2 exon number), (0, 0))
        for exons with identical coordinates in both transcripts
    skipped_exons : EventCollection
        built from the 'skipped_exon' DiffEvents, with the 'AE' / 'AS'
        DiffEvents appended to its ``events``

    Exclusive junctions are collected internally but are not returned.
    """
    # NOTE(review): this block was recovered from whitespace-collapsed
    # source; the nesting below is the most plausible reading -- confirm
    # against the original file before relying on branch boundaries.
    # TODO refactor this
    t1 = transcript_dict[trans1]
    t2 = transcript_dict[trans2]
    tree = Intersecter()
    starts1 = [i[0] for i in t1]
    starts2 = [i[0] for i in t2]
    reverse = False
    # s1 is the transcript whose first exon starts first; s2 is the other.
    if min(starts1) <= min(starts2):
        s1 = t1
        s2 = t2
        s2_beg = min(starts2)  # assigned but not used below
    else:
        s1 = t2
        s2 = t1
        reverse = True
        s2_beg = min(starts1)  # assigned but not used below
    # torder keeps the transcript names in (s1, s2) order.
    if reverse:
        torder = (trans2, trans1)
    else:
        torder = (trans1, trans2)
    # Ignore single-exon stuff
    if len(s1) <= 1 or len(s2) <= 1:
        return([], [])
    # Index the exons of s1, tagged with their exon number.
    for i in s1:
        tree.add_interval(Interval(int(i[0]), int(i[1]),
                                   value={'anno':i[2]}))
    matching_exons = []
    exclusive_juncs = []
    skipped_exons = []
    altends = []
    exon_match = {}  # s2 exon number -> overlapping s1 exon number
    # NOTE(review): these sorts are in place, so the lists stored in
    # transcript_dict are reordered for the caller as well.
    s1.sort(key=lambda x: x[0])
    s2.sort(key=lambda x: x[0])
    max_exon_1 = s1[-1][2]
    max_exon_2 = s2[-1][2]
    #end_position_s2 = max([i[1] for i in s2])
    s1_end = max([i[1] for i in s1])
    prev_match = None  # last s1 interval that overlapped an s2 exon
    # Direction in which exon numbers change along increasing coordinates:
    # -1 when the right-most exon carries a smaller number than the
    # left-most one.
    if max_exon_1 < s1[0][2]:
        strand = -1
    else:
        strand = 1
    for pcurr in range(len(s2)):
        start, end, exon_n = s2[pcurr]
        overlap = tree.find(int(start), int(end))
        if len(overlap) == 0:
            # This s2 exon has no counterpart in s1.
            if prev_match and (start < s1_end):
                #skipped exons
                cigar = _generate_cigar(s2, pcurr, mskip=1)
                try:
                    # The lookup raises KeyError when the preceding s2 exon
                    # had no overlap either (handled below).
                    if exon_match[exon_n - strand] == prev_match.value['anno']:
                        try:
                            nm = tree.find(*s2[pcurr + 1][0:2])[0]
                            ocigar = [(3, nm.start - prev_match.end)]
                            nexon = nm.value['anno']
                        except IndexError:
                            # No next s2 exon, or it overlaps nothing in
                            # s1: fall back to the s1 exon following the
                            # previous match.
                            nm=s1[_get_by_exonn(prev_match.value['anno']+strand,s1)]
                            ocigar = [(3,nm[0] - prev_match.end)]
                            nexon = nm[2]
                        skipped_exons.append(DiffEvent('skipped_exon',
                                                       start, end, torder,
                                                       cigar2=cigar,
                                                       cigar1 = ocigar,
                                                       exon_num = (None, exon_n),
                                                       exon2=(prev_match.value['anno'],
                                                              nexon))
                                             )
                except KeyError:
                    # Multiple skipped exons
                    ncig = _generate_cigar(s2, pcurr, mskip=1)[1:]
                    skipped_exons[-1]._extend(ncig, cig=2)
            elif start > s1_end:
                # The s2 exon lies beyond the last s1 exon.
                if prev_match:
                    cigar = _generate_cigar(s2, pcurr, mskip=1)
                    pm = tree.find(*s2[pcurr - 1][0:2])[0]
                    pexon = pm.value['anno']
                    ocigar = []
                    for i in range(pexon, max_exon_1+strand, strand):
                        narg = _get_by_exonn(i, s1)
                        # NOTE(review): 3-item tuple here, while the 'AS'
                        # branch below uses (0, length) -- confirm the
                        # intended cigar shape.
                        ocigar.append((0, s1[narg][0], s1[narg][1]))
                        try:
                            ocigar.append((3, s1[narg][1] - s1[narg+1][0]))
                        except IndexError:
                            pass
                    #:TODO extend ocigar till end?
                    altends.append(DiffEvent('AE', start, end, torder,
                                             cigar2=cigar, cigar1=ocigar,
                                             exon_num = (None, exon_n)))
                else:
                    pass
            else:
                # Alternate start site that starts in between exons
                # of other transcript
                cigar = _generate_cigar(s2, pcurr, mskip=1)[:-1]
                try:
                    nm = tree.find(*s2[pcurr + 1][0:2])[0]
                except IndexError:
                    # NOTE(review): leftover interactive debugging hook.
                    # After it returns, `nm` is still unbound, so the next
                    # statement raises NameError.
                    from IPython import embed
                    embed()
                nexon = nm.value['anno']
                narg = _get_by_exonn(nexon - strand, s1)
                pmatch = s1[narg]
                ocigar = [(0, pmatch[1] - pmatch[0]),
                          (3, nm.start - pmatch[1])]
                altends.append(DiffEvent('AS', start, end, torder,
                                         cigar2=cigar, cigar1 = ocigar))
        elif len(overlap) == 1:
            if start == overlap[0].start and end == overlap[0].end:
                # Identical exon in both transcripts.
                s1_exon_n = overlap[0].value['anno']
                matching_exons.append((start, end, (s1_exon_n, exon_n),
                                       (0, 0)))
                if prev_match:
                    if s1_exon_n - prev_match.value['anno'] == strand:
                        pass
                    else:
                        # Difference in exon matches
                        # One or more s1 exons between the previous and
                        # the current match are absent from s2.
                        mskip = abs(s1_exon_n - prev_match.value['anno'] ) - 1
                        narg = _get_by_exonn(prev_match.value['anno']+strand,
                                             s1)
                        s_s1 = s1[narg] # skipped s1
                        cigar = _generate_cigar(s1, narg, mskip=mskip)
                        ocigar = [(3, start - s2[pcurr-1][1])]
                        # Remove previous one
                        skipped_exons.append(
                            DiffEvent('skipped_exon', s_s1[0], s_s1[1],
                                      torder, cigar2 = ocigar,
                                      cigar1 = cigar,
                                      exon_num = (s_s1[2], None),
                                      exon2 = (exon_n-strand, exon_n)))
                prev_match = overlap[0]
            else:
                # Partial overlap: the exon boundaries differ.
                sstart = min(start, overlap[0].start)
                ssend = max(end, overlap[0].end)
                # Ignore 5' or 3' differences
                if (exon_n == max_exon_2 and
                        overlap[0].value['anno'] == max_exon_1):
                    if end == overlap[0].end:
                        prev_match = overlap[0]
                else:
                    # NOTE(review): this `else` is assumed to pair with the
                    # outer `if`; exclusive_juncs is never returned, so the
                    # pairing does not affect the result.
                    exclusive_juncs.append(
                        (sstart, ssend,
                         (overlap[0].value['anno'], exon_n),
                         (overlap[0].start - start, overlap[0].end - end) ))
                # Deal with partial matches
                prev_match = overlap[0]
            exon_match[exon_n] = int(overlap[0].value['anno'])
        else:
            # More than one s1 exon overlaps this s2 exon: not handled.
            pass
    skipped_exons = EventCollection(transcript_ids = [s1, s2],
                                    events=skipped_exons)
    skipped_exons.events.extend(altends)
    return(matching_exons, skipped_exons)
def generate(x):
    """Generate a random interval over a size and span.

    ``x`` is ignored; it exists so the function can be applied to every
    item of a sequence.  Returns ``(lo, hi)`` with ``lo`` drawn from
    [10000, SIZE] and ``hi - lo`` between 1 and at most 10**4.
    """
    lo = randint(10000, SIZE)
    hi = lo + randint(1, randint(1, 10**4))
    return (lo, hi)


def generate_point(x):
    """Generate a random zero-length interval ``(lo, lo)``; ``x`` is ignored."""
    lo = randint(10000, SIZE)
    return (lo, lo)


# use this to force both examples to generate the same data
seed(10)

# generate N random intervals.  Lists are built eagerly so the random
# numbers are drawn in the same order on Python 2 and Python 3 (a lazy
# map() would defer the draws until iteration).
data = [generate(i) for i in range(N)]

# generate the intervals to query over
query = [generate_point(i) for i in range(1000)]

# create the interval tree
tree = Intersecter()

# build an interval tree from the rest of the data
for start, end in data:
    tree.add_interval(Interval(start, end))

# perform the query
for q, _ in query:
    # each query is a point, so both tuple items are equal
    overlap = tree.find(q, q)
    out = [(x.start, x.end) for x in overlap]
    # single-argument parenthesised form prints identically on 2 and 3
    print('(%s) -> %s' % (q, out))