def annotate_line(inputs):
    """Annotate one GenePred read line with its best-matching transcript.

    Args:
        inputs: a 3-item sequence ``(line, z, args)``.  ``line`` is passed to
            ``GPD``; ``z`` is stored as the read's payload and echoed back as
            the first output column; ``args`` is accepted for the caller's
            convenience but is not used here.

    Reads the module-level ``txome`` mapping, indexed by the read's
    chromosome; each element must offer ``overlaps()`` and ``get_payload()``,
    the payload being the transcript that is compared against the read.

    Returns:
        A tab-separated, newline-terminated annotation string, or ``None``
        when the chromosome is not in ``txome``, nothing overlaps the read's
        range, or no overlapping transcript yields a truthy exon overlap.
    """
    line, z, _unused_args = inputs
    gpd = GPD(line)
    gpd.set_payload(z)
    read_range = gpd.get_range()
    if read_range.chr not in txome:
        return None
    possible = [
        entry.get_payload()
        for entry in txome[read_range.chr]
        if entry.overlaps(read_range)
    ]
    if not possible:
        return None

    candidates = []
    for tx in possible:
        full = False
        subset = False
        econsec = 1
        if tx.get_exon_count() == 1 or gpd.get_exon_count() == 1:
            eo = gpd.exon_overlap(tx, single_minover=100, single_frac=0.5)
        else:
            eo = gpd.exon_overlap(tx, multi_minover=10, multi_endfrac=0,
                                  multi_midfrac=0.8, multi_consec=False)
            # NOTE(review): the flattened source does not show whether these
            # three checks belong under this ``else``.  They are nested here
            # because the defaults above would otherwise never be used for
            # the single-exon branch -- confirm against the original layout.
            if eo.is_full_overlap():
                full = True
            if eo.is_subset():
                subset = True
            if eo:
                econsec = eo.consecutive_exon_count()
        if not eo:
            continue
        candidates.append((
            full,
            subset,
            eo.match_exon_count(),
            econsec,
            gpd.get_exon_count(),
            tx.get_exon_count(),
            gpd.overlap_size(tx),
            gpd.get_length(),
            tx.get_length(),
            tx,
        ))
    if not candidates:
        return None

    def rank(candidate):
        # Smaller key wins: full overlap first, then subset, then most
        # consecutive exons, then most matched exons, then the larger of the
        # "worst side" overlap fractions (overlap / read, overlap / tx).
        (is_full, is_subset, matched, consecutive,
         _read_exons, _tx_exons, overlap, read_len, tx_len, _tx) = candidate
        coverage = min(float(overlap) / float(read_len),
                       float(overlap) / float(tx_len))
        return (-is_full, -is_subset, -consecutive, -matched, -coverage)

    # min() returns the first of equally ranked candidates, which is the same
    # element a stable sort would have placed at index 0.
    (is_full, _is_subset, exon_count, most_consecutive_exons,
     read_exon_count, tx_exon_count, overlap_size, read_length,
     tx_length, best_tx) = min(candidates, key=rank)

    match_type = 'full' if is_full else 'partial'
    columns = [
        str(gpd.get_payload()),
        gpd.get_transcript_name(),
        best_tx.get_gene_name(),
        best_tx.get_transcript_name(),
        match_type,
        str(exon_count),
        str(most_consecutive_exons),
        str(read_exon_count),
        str(tx_exon_count),
        str(overlap_size),
        str(read_length),
        str(tx_length),
        gpd.get_range().get_range_string(),
        best_tx.get_range().get_range_string(),
        str(best_tx.get_payload()),
    ]
    return "\t".join(columns) + "\n"
def main(args):
    """Convert GenePred entries to 12-column BED lines.

    Attributes read from ``args``:
        input: path to the GenePred file, or ``'-'`` for standard input; a
            path ending in ``.gz`` is opened with ``gzip``.
        output: optional output path (``.gz`` is written through ``gzip``);
            when falsy the result goes to standard output.
        color: optional colour name (blue, green, orange, purple, red); any
            other value gives black (``0,0,0``).
        noheader: when true, no ``track`` header line is written.
        headername / headerdescription: optional overrides for the track
            name and description in the header.
        minintron: when truthy, each entry is rebuilt from
            ``smooth_gaps(minintron)`` before conversion.
        namefield: ``1`` selects the ``gene_name`` field as the BED name,
            anything else selects ``name``.

    Files opened here are closed before returning, also on error; the
    process-wide standard streams are left open (stdout is flushed).
    """
    # itemRgb triples for the supported colour names.
    named_colors = {
        'blue': '67,162,202',
        'green': '49,163,84',
        'orange': '254,178,76',
        'purple': '136,86,167',
        'red': '240,59,32',
    }
    color = named_colors.get(args.color, '0,0,0')

    of = sys.stdout
    if args.output:
        if args.output[-3:] == '.gz':
            # NOTE(review): under Python 3 gzip mode 'w' is binary and would
            # reject the str written below -- confirm the target interpreter.
            of = gzip.open(args.output, 'w')
        else:
            # A plain output path used to be ignored, sending data to stdout.
            of = open(args.output, 'w')

    gpd_handle = sys.stdin
    try:
        # Set up the header if one is desired.
        # NOTE(review): header write assumed to be conditional on
        # ``not args.noheader``; the flattened source does not show nesting.
        if not args.noheader:
            newname = 'longreads'
            # Default track name: last path component of the input, with
            # whitespace runs replaced by underscores.
            m = re.search(r'([^\/]+)$', args.input)
            if m:
                newname = m.group(1)
            newname = re.sub(r'[\s]+', '_', newname)
            if args.headername:
                newname = args.headername
            elif args.input == '-':
                newname = 'STDIN'
            description = newname + ' GenePred Entries'
            if args.headerdescription:
                description = args.headerdescription
            header = ("track\tname=" + newname + "\t"
                      + 'description="' + description + '"' + "\t"
                      + 'itemRgb="On"')
            of.write(header + "\n")

        if args.input != '-':
            if args.input[-3:] == '.gz':
                gpd_handle = gzip.open(args.input)
            else:
                gpd_handle = open(args.input)

        for gpd in GPDStream(gpd_handle):
            if args.minintron:
                gpd = GPD(gpd.smooth_gaps(args.minintron).get_gpd_line())
            exoncount = gpd.get_exon_count()
            # Looked up once per entry; assumes value() is a plain accessor.
            starts = gpd.value('exonStarts')
            ends = gpd.value('exonEnds')
            span_start = str(starts[0])
            span_end = str(ends[exoncount - 1])
            if args.namefield == 1:
                name = gpd.value('gene_name')
            else:
                name = gpd.value('name')
            # Block sizes and block starts each keep a trailing comma.
            block_sizes = ''.join(
                str(ends[i] - starts[i]) + ',' for i in range(exoncount))
            block_starts = ''.join(
                str(starts[i] - starts[0]) + ',' for i in range(exoncount))
            fields = [
                gpd.value('chrom'),
                span_start,
                span_end,
                name,       # previously fused with the score: tab was missing
                '1000',
                gpd.value('strand'),
                span_start,
                span_end,
                color,
                str(exoncount),
                block_sizes,
                block_starts,
            ]
            of.write("\t".join(fields) + "\n")
    finally:
        # Only close what this function opened.
        if gpd_handle is not sys.stdin:
            gpd_handle.close()
        if of is sys.stdout:
            of.flush()
        else:
            of.close()
def main(args):
    """Convert GenePred entries to 12-column BED lines.

    Attributes read from ``args``:
        input: path to the GenePred file, or ``'-'`` for standard input; a
            path ending in ``.gz`` is opened with ``gzip``.
        output: optional output path (``.gz`` is written through ``gzip``);
            when falsy the result goes to standard output.
        color: optional colour name (blue, green, orange, purple, red); any
            other value gives black (``0,0,0``).
        noheader: when true, no ``track`` header line is written.
        headername / headerdescription: optional overrides for the track
            name and description in the header.
        minintron: when truthy, each entry is rebuilt from
            ``smooth_gaps(minintron)`` before conversion.
        namefield: ``1`` selects the ``gene_name`` field as the BED name,
            anything else selects ``name``.

    Files opened here are closed before returning, also on error; the
    process-wide standard streams are left open (stdout is flushed).
    """
    # itemRgb triples for the supported colour names.
    named_colors = {
        'blue': '67,162,202',
        'green': '49,163,84',
        'orange': '254,178,76',
        'purple': '136,86,167',
        'red': '240,59,32',
    }
    color = named_colors.get(args.color, '0,0,0')

    of = sys.stdout
    if args.output:
        if args.output[-3:] == '.gz':
            # NOTE(review): under Python 3 gzip mode 'w' is binary and would
            # reject the str written below -- confirm the target interpreter.
            of = gzip.open(args.output, 'w')
        else:
            # A plain output path used to be ignored, sending data to stdout.
            of = open(args.output, 'w')

    gpd_handle = sys.stdin
    try:
        # Set up the header if one is desired.
        # NOTE(review): header write assumed to be conditional on
        # ``not args.noheader``; the flattened source does not show nesting.
        if not args.noheader:
            newname = 'longreads'
            # Default track name: last path component of the input, with
            # whitespace runs replaced by underscores.
            m = re.search(r'([^\/]+)$', args.input)
            if m:
                newname = m.group(1)
            newname = re.sub(r'[\s]+', '_', newname)
            if args.headername:
                newname = args.headername
            elif args.input == '-':
                newname = 'STDIN'
            description = newname + ' GenePred Entries'
            if args.headerdescription:
                description = args.headerdescription
            header = ("track\tname=" + newname + "\t"
                      + 'description="' + description + '"' + "\t"
                      + 'itemRgb="On"')
            of.write(header + "\n")

        if args.input != '-':
            if args.input[-3:] == '.gz':
                gpd_handle = gzip.open(args.input)
            else:
                gpd_handle = open(args.input)

        for gpd in GPDStream(gpd_handle):
            if args.minintron:
                gpd = GPD(gpd.smooth_gaps(args.minintron).get_gpd_line())
            exoncount = gpd.get_exon_count()
            # Looked up once per entry; assumes value() is a plain accessor.
            starts = gpd.value('exonStarts')
            ends = gpd.value('exonEnds')
            span_start = str(starts[0])
            span_end = str(ends[exoncount - 1])
            if args.namefield == 1:
                name = gpd.value('gene_name')
            else:
                name = gpd.value('name')
            # Block sizes and block starts each keep a trailing comma.
            block_sizes = ''.join(
                str(ends[i] - starts[i]) + ',' for i in range(exoncount))
            block_starts = ''.join(
                str(starts[i] - starts[0]) + ',' for i in range(exoncount))
            fields = [
                gpd.value('chrom'),
                span_start,
                span_end,
                name,       # previously fused with the score: tab was missing
                '1000',
                gpd.value('strand'),
                span_start,
                span_end,
                color,
                str(exoncount),
                block_sizes,
                block_starts,
            ]
            of.write("\t".join(fields) + "\n")
    finally:
        # Only close what this function opened.
        if gpd_handle is not sys.stdin:
            gpd_handle.close()
        if of is sys.stdout:
            of.flush()
        else:
            of.close()
def annotate_line(inputs):
    """Annotate one GenePred read line with its best-matching transcript.

    Args:
        inputs: a 3-item sequence ``(line, z, args)``.  ``line`` is passed to
            ``GPD``; ``z`` is stored as the read's payload and echoed back as
            the first output column; ``args`` is accepted for the caller's
            convenience but is not used here.

    Reads the module-level ``txome`` mapping, indexed by the read's
    chromosome; each element must offer ``overlaps()`` and ``get_payload()``,
    the payload being the transcript that is compared against the read.

    Returns:
        A tab-separated, newline-terminated annotation string, or ``None``
        when the chromosome is not in ``txome``, nothing overlaps the read's
        range, or no overlapping transcript yields a truthy exon overlap.
    """
    line, z, _unused_args = inputs
    gpd = GPD(line)
    gpd.set_payload(z)
    read_range = gpd.get_range()
    if read_range.chr not in txome:
        return None
    possible = [
        entry.get_payload()
        for entry in txome[read_range.chr]
        if entry.overlaps(read_range)
    ]
    if not possible:
        return None

    candidates = []
    for tx in possible:
        full = False
        subset = False
        econsec = 1
        if tx.get_exon_count() == 1 or gpd.get_exon_count() == 1:
            eo = gpd.exon_overlap(tx, single_minover=100, single_frac=0.5)
        else:
            eo = gpd.exon_overlap(tx, multi_minover=10, multi_endfrac=0,
                                  multi_midfrac=0.8, multi_consec=False)
            # NOTE(review): the flattened source does not show whether these
            # three checks belong under this ``else``.  They are nested here
            # because the defaults above would otherwise never be used for
            # the single-exon branch -- confirm against the original layout.
            if eo.is_full_overlap():
                full = True
            if eo.is_subset():
                subset = True
            if eo:
                econsec = eo.consecutive_exon_count()
        if not eo:
            continue
        candidates.append((
            full,
            subset,
            eo.match_exon_count(),
            econsec,
            gpd.get_exon_count(),
            tx.get_exon_count(),
            gpd.overlap_size(tx),
            gpd.get_length(),
            tx.get_length(),
            tx,
        ))
    if not candidates:
        return None

    def rank(candidate):
        # Smaller key wins: full overlap first, then subset, then most
        # consecutive exons, then most matched exons, then the larger of the
        # "worst side" overlap fractions (overlap / read, overlap / tx).
        (is_full, is_subset, matched, consecutive,
         _read_exons, _tx_exons, overlap, read_len, tx_len, _tx) = candidate
        coverage = min(float(overlap) / float(read_len),
                       float(overlap) / float(tx_len))
        return (-is_full, -is_subset, -consecutive, -matched, -coverage)

    # min() returns the first of equally ranked candidates, which is the same
    # element a stable sort would have placed at index 0.
    (is_full, _is_subset, exon_count, most_consecutive_exons,
     read_exon_count, tx_exon_count, overlap_size, read_length,
     tx_length, best_tx) = min(candidates, key=rank)

    match_type = 'full' if is_full else 'partial'
    columns = [
        str(gpd.get_payload()),
        gpd.get_transcript_name(),
        best_tx.get_gene_name(),
        best_tx.get_transcript_name(),
        match_type,
        str(exon_count),
        str(most_consecutive_exons),
        str(read_exon_count),
        str(tx_exon_count),
        str(overlap_size),
        str(read_length),
        str(tx_length),
        gpd.get_range().get_range_string(),
        best_tx.get_range().get_range_string(),
        str(best_tx.get_payload()),
    ]
    return "\t".join(columns) + "\n"