def do_buffer(gpd_lines, fasta, args):
    """Cut fixed-length fragments out of each transcript and pass them to get_sam.

    A ``GPD`` is built from every input line.  Transcripts shorter than
    ``args.length`` are skipped.  For the rest:

    * if ``args.short_reads`` is set, exactly one fragment of ``args.length``
      bases is taken, starting either at 0 or (with probability 0.5) at
      ``transcript_length % args.length``;
    * otherwise ``args.coverage`` passes are made; each pass draws a random
      start in ``[0, args.length)`` and then emits back-to-back fragments of
      ``args.length`` bases for as long as a whole fragment still fits.

    Args:
        gpd_lines: iterable of lines accepted by the ``GPD`` constructor.
        fasta: reference object handed unchanged to ``get_sam``.
        args: options object providing ``length`` (expected to be a positive
            integer), ``coverage`` and ``short_reads``.

    Returns:
        list: the values returned by ``get_sam``, in the order generated.
    """
    results = []
    for gpd_line in gpd_lines:
        gpd = GPD(gpd_line)
        tx_len = gpd.get_length()
        read_len = args.length
        if tx_len < read_len:
            continue

        if args.short_reads:
            # Single fragment: start at 0, or half of the time at the
            # remainder left over after packing whole fragments.
            offset = 0
            if random.random() < 0.5:
                offset = tx_len % read_len
            fragment = gpd.subset(offset, read_len + offset)
            results.append(get_sam(fragment, fasta))
            continue

        for _ in range(args.coverage):
            # The transcript is at least one fragment long here, so the
            # random start is always drawn from [0, read_len).
            first = random.randrange(read_len)
            # Stop bound excludes every start whose fragment would overrun.
            for start in range(first, tx_len - read_len + 1, read_len):
                fragment = gpd.subset(start, start + read_len)
                results.append(get_sam(fragment, fasta))
    return results
def do_buffer(gpd_lines, fasta, args):
    """Cut fixed-length fragments out of each transcript and pass them to get_sam.

    A ``GPD`` is built from every input line.  Transcripts shorter than
    ``args.length`` are skipped.  For the rest:

    * if ``args.short_reads`` is set, exactly one fragment of ``args.length``
      bases is taken, starting either at 0 or (with probability 0.5) at
      ``transcript_length % args.length``;
    * otherwise ``args.coverage`` passes are made; each pass draws a random
      start in ``[0, args.length)`` and then emits back-to-back fragments of
      ``args.length`` bases for as long as a whole fragment still fits.

    Args:
        gpd_lines: iterable of lines accepted by the ``GPD`` constructor.
        fasta: reference object handed unchanged to ``get_sam``.
        args: options object providing ``length`` (expected to be a positive
            integer), ``coverage`` and ``short_reads``.

    Returns:
        list: the values returned by ``get_sam``, in the order generated.
    """
    results = []
    for gpd_line in gpd_lines:
        gpd = GPD(gpd_line)
        tx_len = gpd.get_length()
        read_len = args.length
        if tx_len < read_len:
            continue

        if args.short_reads:
            # Single fragment: start at 0, or half of the time at the
            # remainder left over after packing whole fragments.
            offset = 0
            if random.random() < 0.5:
                offset = tx_len % read_len
            fragment = gpd.subset(offset, read_len + offset)
            results.append(get_sam(fragment, fasta))
            continue

        for _ in range(args.coverage):
            # The transcript is at least one fragment long here, so the
            # random start is always drawn from [0, read_len).
            first = random.randrange(read_len)
            # Stop bound excludes every start whose fragment would overrun.
            for start in range(first, tx_len - read_len + 1, read_len):
                fragment = gpd.subset(start, start + read_len)
                results.append(get_sam(fragment, fasta))
    return results
def main():
    """Write a length-normalised depth profile for each covered transcript.

    All options come from ``do_inputs()``.  The steps are:

    1. Read the reference genepred file and group its entries by gene name.
    2. If ``maximum_isoforms`` is positive, drop genes with more entries than
       that; then drop entries shorter than ``minimum_length``.
    3. Write every remaining exon to ``<tempdir>/gpd.bed`` (sorted) and run
       ``bedtools intersect -wo`` with ``bed_depth`` on stdin against it,
       storing the result in ``<tempdir>/intersect.bed``.
    4. For each transcript whose average depth and covered length pass the
       thresholds, store a depth per exonic position, average the positions
       into at most 100 bins along the transcript, divide by the largest
       bin, reverse the bins for '-' strand transcripts and write
       ``name<TAB>bin0<TAB>bin1...`` to ``output``.
    5. Close ``output`` and remove ``tempdir`` unless ``specific_tempdir``
       is set.
    """
    args = do_inputs()

    sys.stderr.write("Reading reference genepred\n")
    ref = {}        # gene name -> list of GPD entries
    tx_strand = {}  # transcript name -> strand
    z = 0
    with open(args.reference_genepred) as inf:
        for line in inf:
            gpd = GPD(line)
            gname = gpd.get_gene_name()
            tname = gpd.get_transcript_name()
            tx_strand[tname] = gpd.get_strand()
            ref.setdefault(gname, []).append(gpd)
            z += 1
    sys.stderr.write("Read "+str(len(ref))+" genes and "+str(z)+" transcripts\n")

    if args.maximum_isoforms > 0:
        sys.stderr.write("Removing genes with more than "+str(args.maximum_isoforms)+" isoforms.\n")
        # Iterate over a snapshot of the keys: deleting entries while
        # iterating the dict itself raises RuntimeError on Python 3.
        for gname in list(ref):
            if len(ref[gname]) > args.maximum_isoforms:
                del ref[gname]
        sys.stderr.write("Now have "+str(len(ref))+" genes and "+str(sum(len(v) for v in ref.values()))+" transcripts\n")

    sys.stderr.write("Filtering by length "+str(args.minimum_length)+" bp\n")
    for gname in list(ref):  # snapshot again, genes may be deleted below
        passing = [g for g in ref[gname] if not g.get_length() < args.minimum_length]
        if passing:
            ref[gname] = passing
        else:
            del ref[gname]
    sys.stderr.write("Now have "+str(len(ref))+" genes and "+str(sum(len(v) for v in ref.values()))+" transcripts\n")

    sys.stderr.write("Converting gpd into exon bed\n")
    beds = []
    for gname in ref:
        for gpd in ref[gname]:
            tname = gpd.get_transcript_name()
            for i, ex in enumerate(gpd.exons):
                beds.append(ex.get_range().get_bed_array()+[gname, tname, i])
    gpd_bed_path = args.tempdir+'/gpd.bed'
    with open(gpd_bed_path, 'w') as of:
        for bed in sorted(beds, key=lambda x: (x[0], x[1], x[2], x[3], x[4], x[5])):
            of.write("\t".join([str(x) for x in bed])+"\n")

    sys.stderr.write("intersecting with bed depth\n")
    intersect_path = args.tempdir+'/intersect.bed'
    # Argument list instead of splitting a command string, so a tempdir that
    # contains whitespace is still passed to bedtools as one argument.
    cmd = ['bedtools', 'intersect', '-wo', '-a', '-', '-b', gpd_bed_path]
    # The with-block closes the output handle before the file is re-read.
    with open(intersect_path, 'w') as of:
        p = Popen(cmd, stdin=args.bed_depth, stdout=of)
        p.communicate()

    coverage = {}  # gene name -> transcript name -> list of Bed with depth payload
    sys.stderr.write("Reading the intersection\n")
    with open(intersect_path) as inf:
        for line in inf:
            f = line.rstrip().split("\t")
            gname = f[7]
            tname = f[8]
            depth = int(f[3])
            bed1 = Bed(f[0], int(f[1]), int(f[2]))
            bed2 = Bed(f[4], int(f[5]), int(f[6]))
            bed = bed1.union(bed2)
            bed.set_payload(depth)
            coverage.setdefault(gname, {}).setdefault(tname, []).append(bed)

    transcript_depths = {}  # transcript name -> {position: depth}
    for gname in coverage:
        for tname in coverage[gname]:
            covered = coverage[gname][tname]
            ref_gpd = [x for x in ref[gname] if x.get_transcript_name() == tname][0]
            rlen = ref_gpd.get_length()
            bases_covered = sum([x.length() for x in covered])
            bases_area = sum([x.length()*x.get_payload() for x in covered])
            avg_depth = float(bases_area)/float(rlen)
            if avg_depth < args.minimum_average_depth:
                continue
            if bases_covered < args.minimum_length:
                continue
            # Start every exonic position at depth 0, then overwrite the
            # positions that appear in the intersection with their depth.
            total_positions = {}
            for ex in ref_gpd.exons:
                b = ex.get_range().get_bed_array()
                for i in range(b[1], b[2]):
                    total_positions[i] = 0
            for b in covered:
                depth = b.get_payload()
                barr = b.get_bed_array()
                for i in range(barr[1], barr[2]):
                    total_positions[i] = depth
            transcript_depths[tname] = total_positions
    sys.stderr.write("have information needed to plot from "+str(len(transcript_depths))+" transcripts\n")

    for tname in transcript_depths:
        depths = transcript_depths[tname]
        positions = sorted(depths.keys())
        tx_len = len(positions)
        bins = {}  # bin index (0-99) -> depths, later replaced by their average
        for i in range(0, tx_len):
            pct_bin = int(100*float(i)/float(tx_len))
            bins.setdefault(pct_bin, []).append(depths[positions[i]])
        for pct_bin in bins:
            bins[pct_bin] = average(bins[pct_bin])
        # NOTE(review): this division raises ZeroDivisionError if every bin
        # averages to 0 - confirm such transcripts cannot pass the filters.
        biggest = float(max(bins.values()))
        tx_array = [float(bins[x])/biggest for x in sorted(bins.keys())]
        if tx_strand[tname] == '-':
            tx_array.reverse()
        args.output.write(tname+"\t"+"\t".join([str(x) for x in tx_array])+"\n")
    args.output.close()

    # Temporary working directory step 3 of 3 - Cleanup
    if not args.specific_tempdir:
        rmtree(args.tempdir)
def annotate_line(inputs):
    """Pick the best reference transcript for one genepred line.

    ``inputs`` is a ``(line, z, args)`` tuple.  ``line`` is parsed with
    ``GPD``, ``z`` is stored as that entry's payload, and ``args`` is not
    read here.  Reference entries come from the module-level ``txome``
    mapping, keyed by chromosome.

    Candidates are ranked by, in order: full overlap, subset, number of
    consecutive exons, number of matched exons, and the smaller of
    overlap/read-length and overlap/transcript-length (larger is better).

    Returns:
        None if the chromosome is not in ``txome``, nothing overlaps, or no
        overlapping transcript yields a truthy exon overlap.  Otherwise one
        newline-terminated, tab-separated record: payload, read name, gene
        name, transcript name, 'full' or 'partial', matched exons,
        consecutive exons, read exon count, transcript exon count, overlap
        size, read length, transcript length, read range, transcript range
        and transcript payload.
    """
    global txome
    (line, z, args) = inputs
    read = GPD(line)
    read.set_payload(z)
    locus = read.get_range()
    if locus.chr not in txome:
        return None
    overlapping = [entry.get_payload() for entry in txome[locus.chr]
                   if entry.overlaps(locus)]
    if not overlapping:
        return None

    candidates = []
    for tx in overlapping:
        is_full = False
        is_subset = False
        consecutive = 1
        if tx.get_exon_count() == 1 or read.get_exon_count() == 1:
            eo = read.exon_overlap(tx, single_minover=100, single_frac=0.5)
        else:
            eo = read.exon_overlap(tx, multi_minover=10, multi_endfrac=0,
                                   multi_midfrac=0.8, multi_consec=False)
            # NOTE(review): full/subset/consecutive are evaluated only on the
            # multi-exon path here - confirm this nesting against the
            # original layout of the file.
            if eo.is_full_overlap():
                is_full = True
            if eo.is_subset():
                is_subset = True
            if eo:
                consecutive = eo.consecutive_exon_count()
        if not eo:
            continue
        matched = eo.match_exon_count()
        overlap = read.overlap_size(tx)
        candidates.append((is_full, is_subset, matched, consecutive,
                           read.get_exon_count(), tx.get_exon_count(),
                           overlap, read.get_length(), tx.get_length(), tx))
    if not candidates:
        return None

    def rank(c):
        # Negated so that the smallest key is the most preferred candidate.
        reciprocal = min(float(c[6]) / float(c[7]), float(c[6]) / float(c[8]))
        return (-c[0], -c[1], -c[3], -c[2], -reciprocal)

    # min() returns the first of several equal-ranked candidates, the same
    # element a stable sort would place at index 0.
    best = min(candidates, key=rank)
    (is_full, _, exon_count, most_consecutive_exons, read_exon_count,
     tx_exon_count, overlap_size, read_length, tx_length, best_tx) = best
    z = read.get_payload()
    match_type = 'full' if is_full else 'partial'

    fields = [
        str(z),
        read.get_transcript_name(),
        best_tx.get_gene_name(),
        best_tx.get_transcript_name(),
        match_type,
        str(exon_count),
        str(most_consecutive_exons),
        str(read_exon_count),
        str(tx_exon_count),
        str(overlap_size),
        str(read_length),
        str(tx_length),
        read.get_range().get_range_string(),
        best_tx.get_range().get_range_string(),
        str(best_tx.get_payload()),
    ]
    return "\t".join(fields) + "\n"
def annotate_line(inputs):
    """Pick the best reference transcript for one genepred line.

    ``inputs`` is a ``(line, z, args)`` tuple.  ``line`` is parsed with
    ``GPD``, ``z`` is stored as that entry's payload, and ``args`` is not
    read here.  Reference entries come from the module-level ``txome``
    mapping, keyed by chromosome.

    Candidates are ranked by, in order: full overlap, subset, number of
    consecutive exons, number of matched exons, and the smaller of
    overlap/read-length and overlap/transcript-length (larger is better).

    Returns:
        None if the chromosome is not in ``txome``, nothing overlaps, or no
        overlapping transcript yields a truthy exon overlap.  Otherwise one
        newline-terminated, tab-separated record: payload, read name, gene
        name, transcript name, 'full' or 'partial', matched exons,
        consecutive exons, read exon count, transcript exon count, overlap
        size, read length, transcript length, read range, transcript range
        and transcript payload.
    """
    global txome
    (line, z, args) = inputs
    read = GPD(line)
    read.set_payload(z)
    locus = read.get_range()
    if locus.chr not in txome:
        return None
    overlapping = [entry.get_payload() for entry in txome[locus.chr]
                   if entry.overlaps(locus)]
    if not overlapping:
        return None

    candidates = []
    for tx in overlapping:
        is_full = False
        is_subset = False
        consecutive = 1
        if tx.get_exon_count() == 1 or read.get_exon_count() == 1:
            eo = read.exon_overlap(tx, single_minover=100, single_frac=0.5)
        else:
            eo = read.exon_overlap(tx, multi_minover=10, multi_endfrac=0,
                                   multi_midfrac=0.8, multi_consec=False)
            # NOTE(review): full/subset/consecutive are evaluated only on the
            # multi-exon path here - confirm this nesting against the
            # original layout of the file.
            if eo.is_full_overlap():
                is_full = True
            if eo.is_subset():
                is_subset = True
            if eo:
                consecutive = eo.consecutive_exon_count()
        if not eo:
            continue
        matched = eo.match_exon_count()
        overlap = read.overlap_size(tx)
        candidates.append((is_full, is_subset, matched, consecutive,
                           read.get_exon_count(), tx.get_exon_count(),
                           overlap, read.get_length(), tx.get_length(), tx))
    if not candidates:
        return None

    def rank(c):
        # Negated so that the smallest key is the most preferred candidate.
        reciprocal = min(float(c[6]) / float(c[7]), float(c[6]) / float(c[8]))
        return (-c[0], -c[1], -c[3], -c[2], -reciprocal)

    # min() returns the first of several equal-ranked candidates, the same
    # element a stable sort would place at index 0.
    best = min(candidates, key=rank)
    (is_full, _, exon_count, most_consecutive_exons, read_exon_count,
     tx_exon_count, overlap_size, read_length, tx_length, best_tx) = best
    z = read.get_payload()
    match_type = 'full' if is_full else 'partial'

    fields = [
        str(z),
        read.get_transcript_name(),
        best_tx.get_gene_name(),
        best_tx.get_transcript_name(),
        match_type,
        str(exon_count),
        str(most_consecutive_exons),
        str(read_exon_count),
        str(tx_exon_count),
        str(overlap_size),
        str(read_length),
        str(tx_length),
        read.get_range().get_range_string(),
        best_tx.get_range().get_range_string(),
        str(best_tx.get_payload()),
    ]
    return "\t".join(fields) + "\n"