def main(args):
    """Annotate input transcripts against a reference GenePred file.

    Every line of ``args.reference`` is parsed with ``GPD`` and stored in the
    module-level ``txome`` dict (chromosome name -> list of range objects,
    each carrying its GPD entry as payload).  ``annotate_line`` is then
    mapped over ``generate_tx(inf, args)`` in a pool of ``args.threads``
    processes, and every non-empty result is written to the output.

    Args:
        args: parsed options providing ``output`` (path, or falsy for
            stdout), ``reference`` (path), ``input`` (path, or '-' for
            stdin) and ``threads``.  Paths ending in ``.gz`` are opened
            with gzip.
    """
    # txome is published at module level; presumably the annotate_line
    # workers read it from there -- confirm against annotate_line.
    global txome

    of = sys.stdout
    if args.output:
        # str.endswith replaces re.search('\.gz$', ...), whose non-raw
        # pattern contained an invalid string escape.
        opener = gzip.open if args.output.endswith('.gz') else open
        of = opener(args.output, 'w')

    # read the reference gpd
    opener = gzip.open if args.reference.endswith('.gz') else open
    rinf = opener(args.reference)
    sys.stderr.write("Reading in reference\n")
    txome = {}
    z = 0
    try:
        # populate txome with reference transcripts for each chromosome
        for z, line in enumerate(rinf, 1):
            gpd = GPD(line)
            gpd.set_payload(z)  # payload is the 1-based reference line number
            if z % 100 == 0:
                sys.stderr.write(str(z) + " \r")
            r = gpd.get_range()
            r.set_payload(gpd)
            txome.setdefault(gpd.value('chrom'), []).append(r)
    finally:
        rinf.close()
    sys.stderr.write(str(z) + " \r")
    sys.stderr.write("\n")

    inf = sys.stdin
    if args.input != '-':
        opener = gzip.open if args.input.endswith('.gz') else open
        inf = opener(args.input)

    sys.stderr.write("annotating\n")
    # The pool is created after txome is filled, as in the original ordering.
    p = Pool(processes=args.threads)
    csize = 100
    try:
        results2 = p.imap(func=annotate_line,
                          iterable=generate_tx(inf, args),
                          chunksize=csize)
        for res in results2:
            if not res:
                continue
            of.write(res)
    finally:
        # Previously the pool and the input handle were never released.
        # On the normal path every result has been consumed, so terminate()
        # only tears down idle workers; on error it stops outstanding work.
        p.terminate()
        p.join()
        inf.close()
        of.close()
def main(args):
    """Annotate input transcripts against a reference GenePred file.

    Every line of ``args.reference`` is parsed with ``GPD`` and stored in the
    module-level ``txome`` dict (chromosome name -> list of range objects,
    each carrying its GPD entry as payload).  ``annotate_line`` is then
    mapped over ``generate_tx(inf, args)`` in a pool of ``args.threads``
    processes, and every non-empty result is written to the output.

    Args:
        args: parsed options providing ``output`` (path, or falsy for
            stdout), ``reference`` (path), ``input`` (path, or '-' for
            stdin) and ``threads``.  Paths ending in ``.gz`` are opened
            with gzip.
    """
    # txome is published at module level; presumably the annotate_line
    # workers read it from there -- confirm against annotate_line.
    global txome

    of = sys.stdout
    if args.output:
        # str.endswith replaces re.search('\.gz$', ...), whose non-raw
        # pattern contained an invalid string escape.
        opener = gzip.open if args.output.endswith('.gz') else open
        of = opener(args.output, 'w')

    # read the reference gpd
    opener = gzip.open if args.reference.endswith('.gz') else open
    rinf = opener(args.reference)
    sys.stderr.write("Reading in reference\n")
    txome = {}
    z = 0
    try:
        # populate txome with reference transcripts for each chromosome
        for z, line in enumerate(rinf, 1):
            gpd = GPD(line)
            gpd.set_payload(z)  # payload is the 1-based reference line number
            if z % 100 == 0:
                sys.stderr.write(str(z) + " \r")
            r = gpd.get_range()
            r.set_payload(gpd)
            txome.setdefault(gpd.value('chrom'), []).append(r)
    finally:
        rinf.close()
    sys.stderr.write(str(z) + " \r")
    sys.stderr.write("\n")

    inf = sys.stdin
    if args.input != '-':
        opener = gzip.open if args.input.endswith('.gz') else open
        inf = opener(args.input)

    sys.stderr.write("annotating\n")
    # The pool is created after txome is filled, as in the original ordering.
    p = Pool(processes=args.threads)
    csize = 100
    try:
        results2 = p.imap(func=annotate_line,
                          iterable=generate_tx(inf, args),
                          chunksize=csize)
        for res in results2:
            if not res:
                continue
            of.write(res)
    finally:
        # Previously the pool and the input handle were never released.
        # On the normal path every result has been consumed, so terminate()
        # only tears down idle workers; on error it stops outstanding work.
        p.terminate()
        p.join()
        inf.close()
        of.close()
def main(args):
    """Write GenePred entries as 12-column BED lines with an optional track header.

    Entries are read through ``GPDStream`` from ``args.input`` ('-' means
    stdin, a ``.gz`` suffix means gzip) and one tab-separated line per entry
    is written to ``args.output`` (stdout when not set): chrom, start, end,
    name, the constant score ``1000``, strand, start, end, RGB colour, exon
    count, comma-terminated exon sizes and comma-terminated exon offsets
    relative to the first exon start.

    Args:
        args: parsed options providing ``output``, ``input``, ``color``,
            ``noheader``, ``headername``, ``headerdescription``,
            ``minintron`` and ``namefield``.
    """
    of = sys.stdout
    if args.output:
        # Fix: a non-gzip output path used to be ignored (no else branch),
        # so the result was silently written to stdout instead of the file.
        opener = gzip.open if args.output.endswith('.gz') else open
        of = opener(args.output, 'w')

    # Named colours map to RGB triples; anything else (or no colour) is black.
    palette = {
        'blue': '67,162,202',
        'green': '49,163,84',
        'orange': '254,178,76',
        'purple': '136,86,167',
        'red': '240,59,32',
    }
    color = palette.get(args.color, '0,0,0')

    # set up the header if one is desired
    if not args.noheader:
        newname = 'longreads'
        # Default track name: last path component of the input, with
        # whitespace runs collapsed to underscores.
        m = re.search(r'([^/]+)$', args.input)
        if m:
            newname = m.group(1)
        newname = re.sub(r'\s+', '_', newname)
        if args.headername:
            newname = args.headername
        elif args.input == '-':
            newname = 'STDIN'
        description = args.headerdescription or (newname + ' GenePred Entries')
        header = "\t".join([
            "track",
            "name=" + newname,
            'description="' + description + '"',
            'itemRgb="On"',
        ])
        of.write(header + "\n")

    gpd_handle = sys.stdin
    if args.input != '-':
        opener = gzip.open if args.input.endswith('.gz') else open
        gpd_handle = opener(args.input)

    for gpd in GPDStream(gpd_handle):
        if args.minintron:
            # Rebuild the entry from its gap-smoothed GenePred line.
            gpd = GPD(gpd.smooth_gaps(args.minintron).get_gpd_line())
        exoncount = gpd.get_exon_count()
        starts = gpd.value('exonStarts')
        ends = gpd.value('exonEnds')
        first_start = str(starts[0])
        last_end = str(ends[exoncount - 1])
        if args.namefield == 1:
            name = gpd.value('gene_name')
        else:
            name = gpd.value('name')
        # Each list element keeps its trailing comma, as before.
        sizes = ''.join(str(ends[i] - starts[i]) + ','
                        for i in range(exoncount))
        offsets = ''.join(str(starts[i] - starts[0]) + ','
                          for i in range(exoncount))
        # Fix: joining the columns guarantees the tab between name and score
        # that was missing whenever namefield != 1.
        columns = [
            gpd.value('chrom'),
            first_start,
            last_end,
            name,
            '1000',
            gpd.value('strand'),
            first_start,
            last_end,
            color,
            str(exoncount),
            sizes,
            offsets,
        ]
        of.write("\t".join(columns) + "\n")

    gpd_handle.close()
    of.close()
def main(args):
    """Annotate buffered reads chromosome by chromosome against a reference.

    The reference GenePred file ``args.reference`` is loaded into a dict of
    chromosome name -> list of range objects (each with its GPD entry as
    payload).  Input lines are then grouped by the value of their third
    tab-separated column, and ``do_buffer`` is run once per chromosome that
    also exists in the reference -- through a process pool when
    ``args.threads > 1``, otherwise inline.  ``do_out`` is invoked with each
    chromosome's result, and finally all result lines are written to the
    module-level ``of`` handle.

    Args:
        args: parsed options providing ``output`` (path; when falsy the
            existing module-level ``of`` is used), ``reference`` (path),
            ``input`` (path, or '-' for stdin) and ``threads``.  Paths
            ending in ``.gz`` are opened with gzip.

    Raises:
        ValueError: if an input line has no non-empty third column.
    """
    global of
    global chrtotal

    if args.output:
        # str.endswith replaces re.search('\.gz$', ...), whose non-raw
        # pattern contained an invalid string escape.
        opener = gzip.open if args.output.endswith('.gz') else open
        of = opener(args.output, 'w')

    # read the reference gpd
    txome = {}
    opener = gzip.open if args.reference.endswith('.gz') else open
    rinf = opener(args.reference)
    sys.stderr.write("Reading in reference\n")
    z = 0
    try:
        for z, line in enumerate(rinf, 1):
            gpd = GPD(line)
            gpd.set_payload(z)  # payload is the 1-based reference line number
            if z % 100 == 0:
                sys.stderr.write(str(z) + " \r")
            r = gpd.get_range()
            r.set_payload(gpd)
            txome.setdefault(gpd.value('chrom'), []).append(r)
    finally:
        rinf.close()
    sys.stderr.write(str(z) + " \r")
    sys.stderr.write("\n")

    inf = sys.stdin
    if args.input != '-':
        opener = gzip.open if args.input.endswith('.gz') else open
        inf = opener(args.input)

    # Group input lines by chromosome, remembering each line's 1-based index.
    chrom_column = re.compile('[^\t]*\t[^\t]*\t([^\t]+)')  # hoisted out of the loop
    chroms = {}
    sys.stderr.write("Buffering reads\n")
    for z, line in enumerate(inf, 1):
        m = chrom_column.match(line)
        if m is None:
            # Previously this surfaced as an AttributeError on None.
            raise ValueError(
                "input line %d has no chromosome in its third column" % z)
        if z % 100 == 0:
            sys.stderr.write(str(z) + " \r")
        chroms.setdefault(m.group(1), []).append([line, z])
    sys.stderr.write("\n")
    sys.stderr.write("Finished buffering reads\n")

    use_pool = args.threads > 1
    if use_pool:
        p = Pool(processes=args.threads)
    results = []
    # chrtotal counts every buffered chromosome, including ones skipped below.
    chrtotal = len(chroms)
    for chrom in chroms:
        if chrom not in txome:
            continue
        # Each job only receives the reference ranges for its own chromosome.
        job = (chroms[chrom], {chrom: txome[chrom]}, args)
        if use_pool:
            results.append(p.apply_async(do_buffer, args=job, callback=do_out))
        else:
            v = do_buffer(*job)
            # Queue is expected to expose get() like an AsyncResult so both
            # paths can be drained the same way -- confirm against its definition.
            results.append(Queue(v))
            do_out(v)
    if use_pool:
        p.close()
        p.join()
    sys.stderr.write("\n")

    for pending in results:
        for oline in pending.get():
            of.write(oline)
    inf.close()
    of.close()
def main(args):
    """Write GenePred entries as 12-column BED lines with an optional track header.

    Entries are read through ``GPDStream`` from ``args.input`` ('-' means
    stdin, a ``.gz`` suffix means gzip) and one tab-separated line per entry
    is written to ``args.output`` (stdout when not set): chrom, start, end,
    name, the constant score ``1000``, strand, start, end, RGB colour, exon
    count, comma-terminated exon sizes and comma-terminated exon offsets
    relative to the first exon start.

    Args:
        args: parsed options providing ``output``, ``input``, ``color``,
            ``noheader``, ``headername``, ``headerdescription``,
            ``minintron`` and ``namefield``.
    """
    of = sys.stdout
    if args.output:
        # Fix: a non-gzip output path used to be ignored (no else branch),
        # so the result was silently written to stdout instead of the file.
        opener = gzip.open if args.output.endswith('.gz') else open
        of = opener(args.output, 'w')

    # Named colours map to RGB triples; anything else (or no colour) is black.
    palette = {
        'blue': '67,162,202',
        'green': '49,163,84',
        'orange': '254,178,76',
        'purple': '136,86,167',
        'red': '240,59,32',
    }
    color = palette.get(args.color, '0,0,0')

    # set up the header if one is desired
    if not args.noheader:
        newname = 'longreads'
        # Default track name: last path component of the input, with
        # whitespace runs collapsed to underscores.
        m = re.search(r'([^/]+)$', args.input)
        if m:
            newname = m.group(1)
        newname = re.sub(r'\s+', '_', newname)
        if args.headername:
            newname = args.headername
        elif args.input == '-':
            newname = 'STDIN'
        description = args.headerdescription or (newname + ' GenePred Entries')
        header = "\t".join([
            "track",
            "name=" + newname,
            'description="' + description + '"',
            'itemRgb="On"',
        ])
        of.write(header + "\n")

    gpd_handle = sys.stdin
    if args.input != '-':
        opener = gzip.open if args.input.endswith('.gz') else open
        gpd_handle = opener(args.input)

    for gpd in GPDStream(gpd_handle):
        if args.minintron:
            # Rebuild the entry from its gap-smoothed GenePred line.
            gpd = GPD(gpd.smooth_gaps(args.minintron).get_gpd_line())
        exoncount = gpd.get_exon_count()
        starts = gpd.value('exonStarts')
        ends = gpd.value('exonEnds')
        first_start = str(starts[0])
        last_end = str(ends[exoncount - 1])
        if args.namefield == 1:
            name = gpd.value('gene_name')
        else:
            name = gpd.value('name')
        # Each list element keeps its trailing comma, as before.
        sizes = ''.join(str(ends[i] - starts[i]) + ','
                        for i in range(exoncount))
        offsets = ''.join(str(starts[i] - starts[0]) + ','
                          for i in range(exoncount))
        # Fix: joining the columns guarantees the tab between name and score
        # that was missing whenever namefield != 1.
        columns = [
            gpd.value('chrom'),
            first_start,
            last_end,
            name,
            '1000',
            gpd.value('strand'),
            first_start,
            last_end,
            color,
            str(exoncount),
            sizes,
            offsets,
        ]
        of.write("\t".join(columns) + "\n")

    gpd_handle.close()
    of.close()