def main(args):
    """Write the exon coverage of every locus in a GPD stream as tab-delimited lines.

    For each locus yielded by LocusStream, the ranges of all exons of all
    entries in that locus are pooled and handed to ranges_to_coverage(). Each
    resulting coverage range is written as its bed coordinates followed by its
    payload.

    args.input  -- path to the GPD input ('-' reads STDIN; a name ending in
                   '.gz' is opened with gzip)
    args.output -- path to write to (a name ending in '.gz' is written with
                   gzip); when not set, STDOUT is used
    """
    # Resolve the input handle.
    if args.input == '-':
        inf = sys.stdin
    elif re.search('\.gz$', args.input):
        inf = gzip.open(args.input)
    else:
        inf = open(args.input)

    # Resolve the output handle.
    if not args.output:
        of = sys.stdout
    elif re.search('\.gz$', args.output):
        of = gzip.open(args.output, 'w')
    else:
        of = open(args.output, 'w')

    for locus in LocusStream(GPDStream(inf)):
        # Pool every exon range of every entry that falls in this locus.
        exon_ranges = [
            exon.get_range()
            for entry in locus.get_payload()
            for exon in entry.exons
        ]
        for cov in ranges_to_coverage(exon_ranges):
            coords = "\t".join([str(x) for x in cov.get_bed_coordinates()])
            depth = str(+cov.get_payload())
            of.write(coords + "\t" + depth + "\n")

    of.close()
    inf.close()
def main(args):
    """Compute per-locus output in a worker pool and write it in input order.

    Loci read from the GPD input are turned into work items by generate_gpd()
    and processed by do_locus() in a multiprocessing Pool; every string in
    every result is written to the output as-is.

    args.input   -- path to the GPD input ('-' reads STDIN; '.gz' is gunzipped)
    args.output  -- output path ('.gz' is gzipped); STDOUT when not set
    args.threads -- number of worker processes handed to Pool
    """
    inf = sys.stdin
    if args.input != '-':
        if re.search(r'\.gz$', args.input):
            inf = gzip.open(args.input)
        else:
            inf = open(args.input)
    of = sys.stdout
    if args.output:
        if re.search(r'\.gz$', args.output):
            of = gzip.open(args.output, 'w')
        else:
            of = open(args.output, 'w')

    p = Pool(processes=args.threads)
    finished = False
    try:
        loci = LocusStream(GPDStream(inf))
        csize = 100
        # imap keeps results in submission order, so output order follows
        # the order of the loci in the input.
        results = p.imap(func=do_locus,
                         iterable=generate_gpd(loci),
                         chunksize=csize)
        for covs in results:
            for cov in covs:
                of.write(cov)
        finished = True
    finally:
        # The pool was previously never shut down, leaving worker processes
        # behind. Close it after a clean run; on failure terminate it so that
        # join() cannot block on work that will never be consumed.
        if finished:
            p.close()
        else:
            p.terminate()
        p.join()
    of.close()
    inf.close()
def main(args):
    """Run do_multi_round_locus() on every locus and write the resulting GPD lines.

    Each locus of the input GPD stream is processed either in-line
    (args.threads == 1, wrapped in a MiniQueue so it offers the same .get()
    interface) or asynchronously in a multiprocessing Pool. Every returned
    entry is validated before it is written.

    args.input            -- GPD path ('-' for STDIN, '.gz' is gunzipped)
    args.output           -- output path ('.gz' is gzipped); STDOUT when not set
    args.threads          -- worker count; values above 1 use a Pool
    args.gene_names       -- when set, the first column of each output line is
                             replaced by the entry's gene name
    args.specific_tempdir -- when not set, args.tempdir is removed at the end

    Exits with status 1 if an entry fails validation.
    """
    # do our inputs
    inf = sys.stdin
    if args.input != '-':
        if args.input[-3:] == '.gz':
            inf = gzip.open(args.input)
        else:
            inf = open(args.input)
    of = sys.stdout
    if args.output:
        if args.output[-3:] == '.gz':
            of = gzip.open(args.output, 'w')
        else:
            of = open(args.output, 'w')

    gs = GPDStream(inf)
    ls = LocusStream(gs)
    use_pool = args.threads > 1
    if use_pool:
        p = Pool(processes=args.threads)

    results = []
    for locus_rng in ls:
        # Serial runs log one locus per line; parallel runs overwrite a
        # single progress line.
        if args.threads == 1:
            sys.stderr.write(locus_rng.get_range_string() + "\n")
        else:
            sys.stderr.write(locus_rng.get_range_string() + " \r")
        gpds = locus_rng.get_payload()
        if use_pool:
            results.append(
                p.apply_async(do_multi_round_locus, args=(gpds, args,)))
        else:
            results.append(MiniQueue(do_multi_round_locus(gpds, args)))
    if use_pool:
        p.close()
        p.join()

    for result in results:
        for v in result.get():
            tx = v['tx']
            if not tx.validate():
                sys.stderr.write("ERROR: invalid gpd entry\n")
                sys.stderr.write(tx.get_fake_gpd_line() + "\n")
                # A bare sys.exit() reported success (status 0) on this
                # error path; signal failure to the caller instead.
                sys.exit(1)
            fake_gpd = tx.get_fake_gpd_line()
            if args.gene_names:
                f = fake_gpd.rstrip().split("\t")
                f[0] = tx.get_gene_name()
                fake_gpd = "\t".join(f)
            of.write(fake_gpd + "\n")
    of.close()
    # The input handle was never released before; leave STDIN alone.
    if inf is not sys.stdin:
        inf.close()

    # Temporary working directory step 3 of 3 - Cleanup
    if not args.specific_tempdir:
        rmtree(args.tempdir)
def main(args):
    """Write the length and exon count of every GPD entry, one line per entry.

    args.input  -- GPD path (a name ending in '.gz' is opened with gzip)
    args.output -- output path ('.gz' is gzipped); STDOUT when not set

    Output lines are "<length>\\t<exon count>".
    """
    if re.search(r'\.gz$', args.input):
        inf = gzip.open(args.input)
    else:
        inf = open(args.input)
    try:
        of = sys.stdout
        if args.output:
            if re.search(r'\.gz$', args.output):
                of = gzip.open(args.output, 'w')
            else:
                of = open(args.output, 'w')
        for gpd in GPDStream(inf):
            of.write(str(gpd.get_length()) + "\t" +
                     str(gpd.get_exon_count()) + "\n")
        of.close()
    finally:
        # The input handle used to be left open for the life of the process.
        inf.close()
def main():
    """Print, per shared locus, the locus and its GPD and BAM entry counts.

    Command line:
      input       GPD file, or '-' for STDIN
      sorted_bam  sorted BAM file, read through SamtoolsBAMStream

    For every locus yielded by MultiLocusStream over the two streams, three
    lines are printed: the locus object, the number of GPD entries and the
    number of BAM entries in it.
    """
    parser = argparse.ArgumentParser(
        description=
        "Intersect a bam with a gpd file to give bam coverage of each gpd entry",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('input', help="Use - for STDIN")
    parser.add_argument('sorted_bam', help="sorted bam file")
    args = parser.parse_args()

    if args.input == '-':
        args.input = sys.stdin
    else:
        args.input = open(args.input)
    try:
        bs = SamtoolsBAMStream(args.sorted_bam)
        gs = GPDStream(args.input)
        mls = MultiLocusStream([gs, bs])
        for ml in mls:
            [gpds, bams] = ml.get_payload()
            # Single-argument print(...) emits the same text under both the
            # Python 2 print statement and the Python 3 print function.
            print(ml)
            print(len(gpds))
            print(len(bams))
    finally:
        # Release the file we opened; never close the caller's STDIN.
        if args.input is not sys.stdin:
            args.input.close()
def main():
    """Report, for each GPD entry, its mean coverage by all entries of its locus.

    Command line:
      input        GPD file, or '-' for STDIN
      -o/--output  output file ('.gz' is gzipped); STDOUT when not set

    For every locus, the exon ranges of all its entries are converted to a
    coverage track with ranges_to_coverage(). For every entry, the coverage
    payload times range length is summed over its exons and divided by the
    entry length. Output columns: gene name, exon count, length, mean
    coverage.
    """
    parser = argparse.ArgumentParser(
        description="",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('input', help="Use - for STDIN")
    parser.add_argument('-o', '--output',
                        help="output file or use STDOUT if not set")
    args = parser.parse_args()

    if args.input == '-':
        args.input = sys.stdin
    else:
        args.input = open(args.input)
    try:
        ls = LocusStream(GPDStream(args.input))

        of = sys.stdout
        if args.output:
            if re.search(r'\.gz$', args.output):
                of = gzip.open(args.output, 'w')
            else:
                of = open(args.output, 'w')

        for rng in ls:
            sys.stderr.write(rng.get_range_string() + " \r")
            gpds = rng.get_payload()
            # Flatten the exon ranges of every entry in this locus.
            exs = [y.get_range() for x in gpds for y in x.exons]
            cov = ranges_to_coverage(exs)
            # use our coverage data on each gpd entry now
            for gpd in gpds:
                totcov = 0
                for exon in [x.get_range() for x in gpd.exons]:
                    gcovs = union_range_array(exon, cov, payload=2)
                    totcov += sum(
                        [x.get_payload() * x.length() for x in gcovs])
                of.write(gpd.get_gene_name() + "\t" +
                         str(gpd.get_exon_count()) + "\t" +
                         str(gpd.get_length()) + "\t" +
                         str(float(totcov) / float(gpd.get_length())) + "\n")
        sys.stderr.write("\n")
        of.close()
    finally:
        # The opened input file was previously never closed.
        if args.input is not sys.stdin:
            args.input.close()
def main(args):
    """Pair every read's annotation type with its length and exon count.

    Entries of args.best_gpd are numbered 1..z in file order and their length
    and exon count remembered. Each line of args.best_annotation supplies a
    read number (column 1) and a type (column 5); one output line
    "<type>\\t<length>\\t<exon count>" is written per annotation line. Reads
    that never appear in the annotation file are then written with the type
    'unannotated'.

    args.best_gpd, args.best_annotation -- input paths ('.gz' is gunzipped)
    args.output -- output path ('.gz' is gzipped); STDOUT when not set
    """
    # The suffix test is anchored here, matching the output test below;
    # the unanchored form treated any path merely containing ".gz" as gzip.
    if re.search(r'\.gz$', args.best_gpd):
        inf = gzip.open(args.best_gpd)
    else:
        inf = open(args.best_gpd)
    z = 0
    data = {}
    for gpd in GPDStream(inf):
        z += 1
        data[z] = [gpd.get_length(), gpd.get_exon_count()]
    inf.close()

    if re.search(r'\.gz$', args.best_annotation):
        inf = gzip.open(args.best_annotation)
    else:
        inf = open(args.best_annotation)
    done_reads = set()
    of = sys.stdout
    if args.output:
        if re.search(r'\.gz$', args.output):
            of = gzip.open(args.output, 'w')
        else:
            of = open(args.output, 'w')
    try:
        for line in inf:
            f = line.rstrip().split("\t")
            read_id = int(f[0])
            read_type = f[4]
            done_reads.add(read_id)
            of.write(read_type + "\t" + str(data[read_id][0]) + "\t" +
                     str(data[read_id][1]) + "\n")
    finally:
        # The annotation handle used to be left open.
        inf.close()

    for i in range(1, z + 1):
        if i in done_reads:
            continue
        of.write('unannotated' + "\t" + str(data[i][0]) + "\t" +
                 str(data[i][1]) + "\n")
    of.close()
def main():
    """Bin GPD exon intervals by GC content and write a depth histogram per bin.

    Arguments come from do_inputs(). Each GPD entry (or each fixed-size
    fragment of it) is assigned a GC bin from the GC fraction of its
    sequence; its exons are written as bed lines into that bin's
    `sort | bed_to_bed_depth.py` pipeline. When args.best_X_covered is set,
    the per-bin depth files are additionally intersected with a stratified
    region file before being summarised. The final output has one line per
    (bin, depth) with the number of bases at that depth.

    NOTE(review): the source was whitespace-collapsed; the block nesting below
    is the only syntactically valid reading found, but should be confirmed
    against the upstream file.
    """
    #do our inputs
    args = do_inputs()
    of = sys.stdout
    if args.output:
        of = open(args.output, 'w')
    inf = sys.stdin
    if args.input != '-':
        if args.input[-3:] == '.gz':
            inf = gzip.open(args.input)
        else:
            inf = open(args.input)
    sys.stderr.write("reading in fasta\n")
    # NOTE(review): the reference file handle opened here is never closed.
    f = FastaData(open(args.reference).read())
    sh = GPDStream(inf)
    # NOTE(review): gc_bins is not read anywhere in the visible code.
    gc_bins = range(0, args.number_of_bins)
    bin_handles = []
    # One two-stage pipeline per GC bin: bed lines are written to `sort`,
    # whose output feeds bed_to_bed_depth.py, which writes <tempdir>/<i>.bed.gz.
    # Each bin_handles entry is [sort process, depth process, file name, bin].
    for i in range(0, args.number_of_bins):
        fname = args.tempdir + '/' + str(i) + '.bed.gz'
        cmd2 = 'bed_to_bed_depth.py - -o ' + fname
        p2 = Popen(cmd2.split(), stdin=PIPE, close_fds=True)
        cmd1 = 'sort -k 1,1 -k2,2n -k3,3n -T ' + args.tempdir
        p1 = Popen(cmd1.split(), stdin=PIPE, stdout=p2.stdin, close_fds=True)
        bin_handles.append([p1, p2, fname, i])
    if args.best_X_covered:
        # A third pipeline over ALL bed lines (regardless of bin):
        # sort -> bed_to_bed_depth.py -> bed_depth_to_stratified_coverage.py,
        # producing <tempdir>/combo.bed.gz and <tempdir>/key.
        sys.stderr.write("work out stratified data\n")
        cmd3 = 'bed_depth_to_stratified_coverage.py --minimum_coverage 10 --output_key ' + args.tempdir + '/key' + ' -r ' + args.reference + ' - -o ' + args.tempdir + '/combo.bed.gz'
        pstrat3 = Popen(cmd3.split(), stdin=PIPE, close_fds=True)
        cmd2 = 'bed_to_bed_depth.py -'
        pstrat2 = Popen(cmd2.split(), stdin=PIPE, stdout=pstrat3.stdin, close_fds=True)
        cmd1 = 'sort -k 1,1 -k2,2n -k3,3n -T ' + args.tempdir
        pstrat1 = Popen(cmd1.split(), stdin=PIPE, stdout=pstrat2.stdin, close_fds=True)
    num = 0
    for gpd in sh:
        num += 1
        if (num % 1000 == 0):
            sys.stderr.write(str(num) + " \r")
        # [bed line, gc bin] pairs produced for this entry
        results = []
        # NOTE(review): as written, whole-transcript binning only happens when
        # minimum_sequence_length is set, and nothing is emitted when neither
        # minimum_sequence_length nor fragment is set - confirm this is intended.
        if args.minimum_sequence_length:
            if gpd.get_length() < args.minimum_sequence_length:
                continue
            seq = gpd.get_sequence(f).upper()
            seq_obj = Seq(seq)
            n_count = seq_obj.n_count()
            # skip sequences with too few non-N bases
            if len(seq) - n_count < args.min_non_N:
                continue
            gc = seq_obj.gc_content()
            gc_bin = int(args.number_of_bins * gc)
            # a GC fraction of exactly 1 would index one past the last bin
            if gc_bin == args.number_of_bins:
                gc_bin -= 1
            for exon in gpd.exons:
                bed_bin = [
                    "\t".join([str(x) for x in exon.rng.get_bed_array()]), gc_bin
                ]
                results.append(bed_bin)
        elif args.fragment:
            seqlen = gpd.get_length()
            if seqlen < args.fragment:
                continue
            # number of whole fragments, and the leftover bases
            sfrags = int(float(seqlen) / float(args.fragment))
            sremain = seqlen % args.fragment
            offset = 0
            # randomly place the leftover bases before or after the fragments
            if random.random() < 0.5:
                offset = sremain
            #print '^^^'
            for i in range(0, sfrags):
                gsub = gpd.subset(i * args.fragment + offset, (i + 1) * args.fragment + offset)
                seq = gsub.get_sequence(f).upper()
                seq_obj = Seq(seq)
                n_count = seq_obj.n_count()
                if len(seq) - n_count < args.min_non_N:
                    continue
                gc = seq_obj.gc_content()
                gc_bin = int(args.number_of_bins * gc)
                if gc_bin == args.number_of_bins:
                    gc_bin -= 1
                for exon in gsub.exons:
                    bed_bin = [
                        "\t".join([str(x) for x in exon.rng.get_bed_array()]), gc_bin
                    ]
                    results.append(bed_bin)
        # feed each bed line to its bin's sort process (and the stratified one)
        for val in results:
            [bed, bin] = val
            bin_handles[bin][0].stdin.write(bed + "\n")
            if args.best_X_covered:
                pstrat1.stdin.write(bed + "\n")
        #if not gc: print len(gpd.get_sequence(f))
    sys.stderr.write("\n")
    # Finish the pipelines in stage order: communicate() closes the stdin we
    # hold for each process and waits for it to exit.
    for v in bin_handles:
        v[0].communicate()
        v[1].communicate()
    if args.best_X_covered:
        pstrat1.communicate()
        pstrat2.communicate()
        pstrat3.communicate()
        # If we want stratified data we should do it here
        sys.stderr.write("read the key\n")
        # key file: after a header line, column 1 -> column 2 (both integers)
        d = {}
        with open(args.tempdir + '/key') as inf:
            header = inf.readline()
            for line in inf:
                f = line.rstrip().split("\t")
                d[int(f[0])] = int(f[1])
        if args.best_X_covered not in d:
            sys.stderr.write(
                "ERROR: the number of bases you specified is probably too big you didn't make the digit begin with 1 or 5 and restof the numbers be zero\n"
            )
            # NOTE(review): exits with status 0 on this error path, and leaves
            # the temporary directory in place.
            sys.exit()
        num = d[args.best_X_covered]
        # keep only combo regions whose 4th column reaches the threshold,
        # dropping the last column on output
        ninf = gzip.open(args.tempdir + '/combo.bed.gz')
        nof = gzip.open(args.tempdir + '/strat.bed.gz', 'w')
        for line in ninf:
            f = line.rstrip().split("\t")
            if int(f[3]) >= num:
                nof.write("\t".join(f[:-1]) + "\n")
        nof.close()
        ninf.close()
        # restrict every bin's depth file to the stratified regions:
        # bedtools intersect | gzip > <tempdir>/<bin>.strata.bed.gz
        for i in range(0, len(bin_handles)):
            v = bin_handles[i]
            fname = v[2]
            fname2 = args.tempdir + '/' + str(v[3]) + '.strata.bed.gz'
            gof = open(fname2, 'w')
            cmd2 = 'gzip'
            p2 = Popen(cmd2.split(), stdout=gof, stdin=PIPE)
            cmd1 = 'bedtools intersect -a ' + fname + ' -b ' + args.tempdir + '/strat.bed.gz'
            p1 = Popen(cmd1.split(), stdout=p2.stdin)
            p1.communicate()
            p2.communicate()
            gof.close()
            # lets just replace the name of the file that the final output will read from
            bin_handles[i][2] = fname2
    # Now we have bed depths for each bin
    for v in bin_handles:
        fname = v[2]
        #sys.stderr.write(fname+" ... prosessing\n")
        # depth -> total number of bases seen at that depth
        depths = {}
        bin = v[3]
        inf = gzip.open(fname)
        for line in inf:
            f = line.rstrip().split("\t")
            bases = int(f[2]) - int(f[1])
            depth = int(f[3])
            if depth not in depths:
                depths[depth] = 0
            depths[depth] += bases
        inf.close()
        for depth in sorted(depths.keys()):
            of.write(
                str(bin) + "\t" + str(depth) + "\t" + str(depths[depth]) + "\n")
    of.close()
    # Temporary working directory step 3 of 3 - Cleanup
    if not args.specific_tempdir:
        rmtree(args.tempdir)
def main():
    """Summarise bed-depth coverage over the exons of every GPD entry.

    Command line:
      gpd_input        GPD file ('.gz' is gunzipped)
      bed_depth_input  bed depth file ('.gz' is gunzipped)
      -o/--output      output file ('.gz' is gzipped); STDOUT when not set

    Both inputs are walked locus by locus with MultiLocusStream. For each GPD
    entry one line is written with: gene name, transcript name, exon count,
    length, number of exonic bases with depth > 0, that number divided by the
    length, and the depth-weighted base total divided by the length. Entries
    in a locus without any bed record get zeros in the last three columns.
    """
    parser = argparse.ArgumentParser(
        description="For every gpd entry (sorted) intersect it with bed depth (sorted)",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('gpd_input', help="GPD file")
    # The help text here used to be a copy of the GPD one.
    parser.add_argument('bed_depth_input', help="bed depth file")
    parser.add_argument('-o', '--output', help="output file")
    args = parser.parse_args()

    if re.search(r'\.gz$', args.gpd_input):
        inf1 = gzip.open(args.gpd_input)
    else:
        inf1 = open(args.gpd_input)
    if re.search(r'\.gz$', args.bed_depth_input):
        inf2 = gzip.open(args.bed_depth_input)
    else:
        inf2 = open(args.bed_depth_input)
    gs = GPDStream(inf1)
    bs = BedStream(inf2)
    of = sys.stdout
    if args.output:
        if re.search(r'\.gz$', args.output):
            of = gzip.open(args.output, 'w')
        else:
            of = open(args.output, 'w')

    mls = MultiLocusStream([gs, bs])
    for ml in mls:
        [gpds, beds] = ml.get_payload()
        if len(gpds) == 0:
            continue
        if len(beds) == 0:
            # No depth records in this locus: report zero coverage.
            for gpd in gpds:
                of.write(gpd.get_gene_name() + "\t" +
                         gpd.get_transcript_name() + "\t" +
                         str(gpd.get_exon_count()) + "\t" +
                         str(gpd.get_length()) + "\t0\t0\t0" + "\n")
            continue
        for gpd in gpds:
            covs = []
            for ex in [x.get_range() for x in gpd.exons]:
                covs += union_range_array(ex, beds, payload=2)
            # bases with any coverage, and depth-weighted bases
            clen = sum([x.length() for x in covs if int(x.get_payload()) > 0])
            tot = sum([x.length() * int(x.get_payload()) for x in covs])
            of.write(gpd.get_gene_name() + "\t" +
                     gpd.get_transcript_name() + "\t" +
                     str(gpd.get_exon_count()) + "\t" +
                     str(gpd.get_length()) + "\t" +
                     str(clen) + "\t" +
                     str(float(clen) / float(gpd.get_length())) + "\t" +
                     str(float(tot) / float(gpd.get_length())) + "\n")
    sys.stderr.write("\n")
    of.close()
    inf1.close()
    inf2.close()
def main(args):
    """Convert GPD entries to 12-column BED lines, optionally preceded by a track header.

    args.input             -- GPD path ('-' for STDIN, '.gz' is gunzipped)
    args.output            -- output path ('.gz' is gzipped); STDOUT when not set
    args.color             -- one of blue/green/orange/purple/red; anything else
                              (or unset) gives black
    args.noheader          -- suppress the track line
    args.headername        -- track name override
    args.headerdescription -- track description override
    args.minintron         -- when set, each entry is rebuilt from
                              smooth_gaps(minintron) before conversion
    args.namefield         -- 1 uses the gene name, otherwise the transcript name
    """
    of = sys.stdout
    if args.output:
        if args.output[-3:] == '.gz':
            of = gzip.open(args.output, 'w')
        else:
            # A plain output path used to be ignored, silently sending the
            # result to STDOUT instead of the requested file.
            of = open(args.output, 'w')

    named_colors = {
        'blue': '67,162,202',
        'green': '49,163,84',
        'orange': '254,178,76',
        'purple': '136,86,167',
        'red': '240,59,32',
    }
    color = named_colors.get(args.color, '0,0,0')

    # set up the header if one is desired
    if not args.noheader:
        # Default track name: the last path component of the input.
        newname = 'longreads'
        m = re.search(r'([^/]+)$', args.input)
        if m:
            newname = m.group(1)
        newname = re.sub(r'[\s]+', '_', newname)
        if args.headername:
            newname = args.headername
        elif args.input == '-':
            newname = 'STDIN'
        description = newname + ' GenePred Entries'
        if args.headerdescription:
            description = args.headerdescription
        header = "track\tname=" + newname + "\t"
        header += 'description="' + description + '"' + "\t"
        header += 'itemRgb="On"'
        of.write(header + "\n")

    gpd_handle = sys.stdin
    if args.input != '-':
        if args.input[-3:] == '.gz':
            gpd_handle = gzip.open(args.input)
        else:
            gpd_handle = open(args.input)

    for gpd in GPDStream(gpd_handle):
        if args.minintron:
            gpd = GPD(gpd.smooth_gaps(args.minintron).get_gpd_line())
        exoncount = gpd.get_exon_count()
        starts = gpd.value('exonStarts')
        ends = gpd.value('exonEnds')
        tx_start = str(starts[0])
        tx_end = str(ends[exoncount - 1])
        if args.namefield == 1:
            name = gpd.value('gene_name')
        else:
            name = gpd.value('name')
        # Block sizes and block starts (relative to the first exon start),
        # each written with a trailing comma.
        sizes = ''.join(
            [str(ends[i] - starts[i]) + ',' for i in range(0, exoncount)])
        offsets = ''.join(
            [str(starts[i] - starts[0]) + ',' for i in range(0, exoncount)])
        # Joining the fields guarantees a tab after the name in both name
        # modes; the transcript-name branch used to omit it, fusing the name
        # with the score column.
        fields = [
            gpd.value('chrom'),
            tx_start,
            tx_end,
            name,
            '1000',
            gpd.value('strand'),
            tx_start,
            tx_end,
            color,
            str(exoncount),
            sizes,
            offsets,
        ]
        of.write("\t".join(fields) + "\n")
    gpd_handle.close()
    of.close()
def make_html(args): global g_version #read in our alignment data mydate = time.strftime("%Y-%m-%d") a = {} with open(args.tempdir+'/data/alignment_stats.txt') as inf: for line in inf: (name,numstr)=line.rstrip().split("\t") a[name]=int(numstr) #read in our special read analysis special = {} with open(args.tempdir+'/data/special_report') as inf: for line in inf: f = line.rstrip().split("\t") if f[0] not in special: special[f[0]] = [] special[f[0]].append(f[1:]) #Only have error stats if we are using it e = {} if args.reference: #read in our error data with open(args.tempdir+'/data/error_stats.txt') as inf: for line in inf: (name,numstr)=line.rstrip().split("\t") e[name]=int(numstr) # read in our coverage data coverage_data = {} # this one will be set in annotation on section tx_to_gene = {} coverage_data['genome_total'] = 0 with open(args.tempdir+'/data/chrlens.txt') as inf: for line in inf: f = line.rstrip().split("\t") coverage_data['genome_total']+=int(f[1]) inf = gzip.open(args.tempdir+'/data/depth.sorted.bed.gz') coverage_data['genome_covered'] = 0 bs = BedStream(inf) for rng in bs: f = line.rstrip().split("\t") coverage_data['genome_covered'] += rng.length() inf.close() # The annotation section if args.annotation: inf = open(args.tempdir+'/data/beds/exon.bed') coverage_data['exons_total'] = 0 bs = BedStream(inf) for rng in bs: f = line.rstrip().split("\t") coverage_data['exons_total'] += rng.length() inf.close() inf = open(args.tempdir+'/data/beds/intron.bed') coverage_data['introns_total'] = 0 bs = BedStream(inf) for rng in bs: f = line.rstrip().split("\t") coverage_data['introns_total'] += rng.length() inf.close() inf = open(args.tempdir+'/data/beds/intergenic.bed') coverage_data['intergenic_total'] = 0 bs = BedStream(inf) for rng in bs: f = line.rstrip().split("\t") coverage_data['intergenic_total'] += rng.length() inf.close() inf = gzip.open(args.tempdir+'/data/exondepth.bed.gz') coverage_data['exons_covered'] = 0 bs = BedStream(inf) for rng in bs: f = 
line.rstrip().split("\t") coverage_data['exons_covered'] += rng.length() inf.close() inf = gzip.open(args.tempdir+'/data/introndepth.bed.gz') coverage_data['introns_covered'] = 0 bs = BedStream(inf) for rng in bs: f = line.rstrip().split("\t") coverage_data['introns_covered'] += rng.length() inf.close() inf = gzip.open(args.tempdir+'/data/intergenicdepth.bed.gz') coverage_data['intergenic_covered'] = 0 bs = BedStream(inf) for rng in bs: f = line.rstrip().split("\t") coverage_data['intergenic_covered'] += rng.length() inf.close() # deal with annotations ref_genes = {} ref_transcripts = {} with open(args.annotation) as inf: gs = GPDStream(inf) for gpd in gs: tx_to_gene[gpd.get_transcript_name()] = gpd.get_gene_name() ref_genes[gpd.get_gene_name()] = [0,0] ref_transcripts[gpd.get_transcript_name()] = [0,0] inf = gzip.open(args.tempdir+'/data/annotbest.txt.gz') for line in inf: f = line.rstrip().split("\t") gene = f[2] tx = f[3] if f[4]=='partial': ref_genes[gene][0] += 1 elif f[4]=='full': ref_genes[gene][1] += 1 if f[4]=='partial': ref_transcripts[tx][0] += 1 elif f[4]=='full': ref_transcripts[tx][1] += 1 inf.close() #get our locus count if args.do_loci: inf = gzip.open(args.tempdir+'/data/loci.bed.gz') locuscount = 0 for line in inf: locuscount += 1 inf.close() #get our annotation counts if args.annotation: genefull = 0 geneany = 0 txfull = 0 txany = 0 inf = gzip.open(args.tempdir+'/data/annotbest.txt.gz') genes_f = {} genes_a = {} txs_f = {} txs_a = {} for line in inf: f = line.rstrip().split("\t") g = f[2] t = f[3] if g not in genes_a: genes_a[g] = 0 genes_a[g]+=1 if t not in txs_a: txs_a[t] = 0 txs_a[t]+=1 if f[4] == 'full': if g not in genes_f: genes_f[g] = 0 genes_f[g]+=1 if t not in txs_f: txs_f[t] = 0 txs_f[t]+=1 inf.close() genefull = len(genes_f.keys()) geneany = len(genes_a.keys()) txfull = len(txs_f.keys()) txany = len(txs_a.keys()) # still in args.annotation required #Get evidence counts for bias bias_tx_count = None bias_read_count = None with 
open(args.tempdir+'/data/bias_counts.txt') as inf: for line in inf: f = line.rstrip().split("\t") bias_tx_count = int(f[0]) bias_read_count = int(f[1]) #make our css directory if not os.path.exists(args.tempdir+'/css'): os.makedirs(args.tempdir+'/css') udir = os.path.dirname(os.path.realpath(__file__)) #copy css into that directory copy(udir+'/../data/mystyle.css',args.tempdir+'/css/mystyle.css') of = open(args.tempdir+'/report.xhtml','w') ostr = ''' <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd"> <html xmlns="http://www.w3.org/1999/xhtml"> <head> <meta http-equiv="Content-Type" content="text/html; charset=utf-8" /> <link rel="stylesheet" type="text/css" href="css/mystyle.css" /> <title>Long Read Alignment and Error Report</title> </head> <body> ''' of.write(ostr) ######################################### # 1. TOP BLOCK ostr = ''' <div class="result_block"> <div class="top_block"> <div> Generated on: </div> <div class="input_value"> ''' of.write(ostr) of.write(mydate) ostr = ''' </div> </div> <div class="top_block"> <div> Version: </div> <div class="input_value">''' of.write(ostr) of.write(str(g_version)) ostr = ''' </div> </div> <div class="top_block"> <div>Execution parmeters:</div> <div class="input_value"> <a id="params.txt" href="data/params.txt">params.txt</a> </div> </div> <div class="top_block"> <div>Long read alignment and error report for:</div> <div class="input_value" id="filename">''' of.write(ostr+"\n") of.write(args.input) ostr = ''' </div> </div> <div class="clear"></div> <div class="top_block"> <div> Reference Genome: </div> <div class="input_value">''' of.write(ostr) #if args.reference: of.write(str(args.reference)) #else: # of.write(' '*20) ostr = ''' </div> </div> <div class="top_block"> <div> Reference Annotation: </div> <div class="input_value">''' of.write(ostr) #if args.reference: of.write(str(args.annotation)) #else: # of.write(' '*20) ostr = ''' </div> </div> </div> <div 
class="clear"></div> <hr /> ''' of.write(ostr) ################################## # 2. ALIGNMENT ANALYSIS ## This block should be in every output. Generated from the BAM ostr = ''' <div class="result_block"> <div class="subject_title"> <table><tr><td class="c1">Alignment analysis</td><td class="c2"><span class="highlight">''' of.write(ostr) reads_aligned = perc(a['ALIGNED_READS'],a['TOTAL_READS'],1) of.write(reads_aligned) ostr = ''' </span></td><td class="c2"><span class="highlight2">reads aligned</span></td><td class="c2"><span class="highlight">''' of.write(ostr) bases_aligned = perc(a['ALIGNED_BASES'],a['TOTAL_BASES'],1) of.write(bases_aligned) ostr = ''' </span></td><td class="c2"><span class="highlight2">bases aligned <i>(of aligned reads)</i></span></td></tr></table> </div> <div class="one_third left"> <table class="data_table"> <tr class="rhead"><td colspan="3">Read Stats</td></tr>''' of.write(ostr+"\n") total_read_string = '<tr><td>Total reads</td><td>'+str(addcommas(a['TOTAL_READS']))+'</td><td></td></tr>' of.write(total_read_string+"\n") unaligned_read_string = '<tr><td>- Unaligned reads</td><td>'+str(addcommas(a['UNALIGNED_READS']))+'</td><td>'+perc(a['UNALIGNED_READS'],a['TOTAL_READS'],1)+'</td></tr>' of.write(unaligned_read_string+"\n") aligned_read_string = '<tr><td>- Aligned reads</td><td>'+str(addcommas(a['ALIGNED_READS']))+'</td><td>'+perc(a['ALIGNED_READS'],a['TOTAL_READS'],1)+'</td></tr>' of.write(aligned_read_string+"\n") single_align_read_string = '<tr><td>--- Single-align reads</td><td>'+str(addcommas(a['SINGLE_ALIGN_READS']))+'</td><td>'+perc(a['SINGLE_ALIGN_READS'],a['TOTAL_READS'],1)+'</td></tr>' of.write(single_align_read_string+"\n") gapped_align_read_string = '<tr><td>--- Gapped-align reads</td><td>'+str(addcommas(a['GAPPED_ALIGN_READS']))+'</td><td>'+perc(a['GAPPED_ALIGN_READS'],a['TOTAL_READS'],2)+'</td></tr>' of.write(gapped_align_read_string+"\n") gapped_align_read_string = '<tr><td>--- Chimeric 
reads</td><td>'+str(addcommas(a['CHIMERA_ALIGN_READS']))+'</td><td>'+perc(a['CHIMERA_ALIGN_READS'],a['TOTAL_READS'],2)+'</td></tr>' of.write(gapped_align_read_string+"\n") gapped_align_read_string = '<tr><td>----- Trans-chimeric reads</td><td>'+str(addcommas(a['TRANSCHIMERA_ALIGN_READS']))+'</td><td>'+perc(a['TRANSCHIMERA_ALIGN_READS'],a['TOTAL_READS'],2)+'</td></tr>' of.write(gapped_align_read_string+"\n") gapped_align_read_string = '<tr><td>----- Self-chimeric reads</td><td>'+str(addcommas(a['SELFCHIMERA_ALIGN_READS']))+'</td><td>'+perc(a['SELFCHIMERA_ALIGN_READS'],a['TOTAL_READS'],2)+'</td></tr>' of.write(gapped_align_read_string+"\n") ostr=''' <tr class="rhead"><td colspan="3">Base Stats <i>(of aligned reads)</i></td></tr>''' of.write(ostr+"\n") total_bases_string = '<tr><td>Total bases</td><td>'+str(addcommas(a['TOTAL_BASES']))+'</td><td></td></tr>' of.write(total_bases_string+"\n") unaligned_bases_string = '<tr><td>- Unaligned bases</td><td>'+str(addcommas(a['UNALIGNED_BASES']))+'</td><td>'+perc(a['UNALIGNED_BASES'],a['TOTAL_BASES'],1)+'</td></tr>' of.write(unaligned_bases_string+"\n") aligned_bases_string = '<tr><td>- Aligned bases</td><td>'+str(addcommas(a['ALIGNED_BASES']))+'</td><td>'+perc(a['ALIGNED_BASES'],a['TOTAL_BASES'],1)+'</td></tr>' of.write(aligned_bases_string+"\n") single_align_bases_string = '<tr><td>--- Single-aligned bases</td><td>'+str(addcommas(a['SINGLE_ALIGN_BASES']))+'</td><td>'+perc(a['SINGLE_ALIGN_BASES'],a['TOTAL_BASES'],1)+'</td></tr>' of.write(single_align_bases_string+"\n") gapped_align_bases_string = '<tr><td>--- Other-aligned bases</td><td>'+str(addcommas(a['GAPPED_ALIGN_BASES']))+'</td><td>'+perc(a['GAPPED_ALIGN_BASES'],a['TOTAL_BASES'],2)+'</td></tr>' of.write(gapped_align_bases_string+"\n") ostr = ''' </table> <table class="right"> <tr><td>Unaligned</td><td><div id="unaligned_leg" class="legend_square"></div></td></tr> <tr><td>Trans-chimeric alignment</td><td><div id="chimeric_leg" class="legend_square"></div></td></tr> 
<tr><td>Self-chimeric alignment</td><td><div id="selfchimeric_leg" class="legend_square"></div></td></tr> <tr><td>Gapped alignment</td><td><div id="gapped_leg" class="legend_square"></div></td></tr> <tr><td>Single alignment</td><td><div id="single_leg" class="legend_square"></div></td></tr> </table> </div> <div class="two_thirds right"> <div class="rhead">Summary [<a download="alignments.pdf" href="plots/alignments.pdf">pdf</a>]</div> <img src="plots/alignments.png" alt="alignments_png" /> </div> <div class="clear"></div> <div class="two_thirds right"> <div class="rhead">Exon counts of best alignments [<a download="exon_size_distro.pdf" href="plots/exon_size_distro.pdf">pdf</a>]</div> <img src="plots/exon_size_distro.png" alt="exon_size_distro_png" /> </div> ''' of.write(ostr) if len(special['GN']) > 1: ostr = ''' <div class="one_half left"> <table class="one_half data_table"> <tr class="rhead"><td colspan="5">Long read name information</td></tr> <tr><td>Type</td><td>Sub-type</td><td>Reads</td><td>Aligned</td><td>Fraction</td></tr> ''' of.write(ostr) for f in [x for x in special['GN'] if x[0] != 'Unclassified' or int(x[2])>0]: of.write(' <tr><td>'+f[0]+'</td><td>'+f[1]+'</td><td>'+addcommas(int(f[2]))+'</td><td>'+addcommas(int(f[3]))+'</td><td>'+perc(int(f[3]),int(f[2]),2)+'</td></tr>'+"\n") ostr = ''' </table> ''' of.write(ostr) if 'PB' in special: # We have pacbio specific report pb = {} for f in special['PB']: pb[f[0]]=f[1] if re.search('\.',f[1]): pb[f[0]]=float(f[1]) ostr = ''' <div class="rhead">PacBio SMRT Cells [<a download="pacbio.pdf" href="/plots/pacbio.pdf">pdf</a>]</div> <img src="plots/pacbio.png" alt="pacbio_png" /> <table class="horizontal_legend right"> <tr><td>Aligned</td><td><div class="legend_square pacbio_aligned_leg"></div></td><td>Unaligned</td><td><div class="legend_square pacbio_unaligned_leg"></div></td></tr> </table> <table class="data_table one_half"> <tr class="rhead"><td colspan="4">PacBio Stats</td></tr> ''' of.write(ostr) of.write(' 
<tr><td>Total Cell Count</td><td colspan="3">'+addcommas(int(pb['Cell Count']))+'</td></tr>') of.write(' <tr><td>Total Molecule Count</td><td colspan="3">'+addcommas(int(pb['Molecule Count']))+'</td></tr>') of.write(' <tr><td>Total Molecules Aligned</td><td colspan="3">'+addcommas(int(pb['Aligned Molecule Count']))+' ('+perc(pb['Aligned Molecule Count'],pb['Molecule Count'],2)+')</td></tr>') of.write(' <tr class="rhead"><td>Per-cell Feature</td><td>Min</td><td>Avg</td><td>Max</td></tr>') of.write(' <tr><td>Reads</td><td>'+addcommas(int(pb['Min Reads Per Cell']))+'</td><td>'+addcommas(int(pb['Avg Reads Per Cell']))+'</td><td>'+addcommas(int(pb['Max Reads Per Cell']))+'</td></tr>') of.write(' <tr><td>Molecules</td><td>'+addcommas(int(pb['Min Molecules Per Cell']))+'</td><td>'+addcommas(int(pb['Avg Molecules Per Cell']))+'</td><td>'+addcommas(int(pb['Max Molecules Per Cell']))+'</td></tr>') of.write(' <tr><td>Aligned Molecules</td><td>'+addcommas(int(pb['Min Aligned Molecules Per Cell']))+'</td><td>'+addcommas(int(pb['Avg Aligned Molecules Per Cell']))+'</td><td>'+addcommas(int(pb['Max Aligned Molecules Per Cell']))+'</td></tr>') ostr = ''' </table> ''' of.write(ostr) ostr = ''' </div> ''' of.write(ostr) ostr = ''' </div> <div class="clear"></div> <hr /> ''' of.write(ostr) ################################### # 3. 
ANNOTATION ANALYSIS ### This block should only be done when we have annotations if args.annotation: ostr = ''' <div class="result_block"> <div class="subject_title">Annotation Analysis</div> <div class="one_half left"> <div class="rhead">Distribution of reads among genomic features [<a download="read_genomic_features.pdf" href="plots/read_genomic_features.pdf">pdf</a>]</div> <img src="plots/read_genomic_features.png" alt="read_genomic_features_png" /> <table class="one_half right horizontal_legend"> <tr> <td>Exons</td><td><div class="exon_leg legend_square"></div></td><td></td> <td>Introns</td><td><div class="intron_leg legend_square"></div></td><td></td> <td>Intergenic</td><td><div class="intergenic_leg legend_square"></div></td><td></td> </tr> </table> </div> <div class="one_half right"> <div class="rhead">Distribution of annotated reads [<a download="annot_lengths.pdf" href="plots/annot_lengths.pdf">pdf</a>]</div> <img src="plots/annot_lengths.png" alt="annot_lengths_png" /> <table class="one_half right horizontal_legend"> <tr> <td>Partial annotation</td><td><div class="partial_leg legend_square"></div></td><td></td> <td>Full-length</td><td><div class="full_leg legend_square"></div></td><td></td> <td>Unannotated</td><td><div class="unannotated_leg legend_square"></div></td><td></td> </tr> </table> </div> <div class="clear"></div> <div class="one_half right"> <div class="rhead">Distribution of identified reference transcripts [<a download="transcript_distro.pdf" href="plots/transcript_distro.pdf">pdf</a>]</div> <img src="plots/transcript_distro.png" alt="transcript_distro_png" /> <table class="one_half right horizontal_legend"> <tr> <td>Partial annotation</td><td><div class="partial_leg legend_square"></div></td><td></td> <td>Full-length</td><td><div class="full_leg legend_square"></div></td><td></td> </tr> </table> </div> <div class="one_half left"> <table class="data_table one_half"> <tr class="rhead"><td colspan="5">Annotation Counts</td></tr> 
<tr><td>Feature</td><td>Evidence</td><td>Reference</td><td>Detected</td><td>Percent</td></tr> ''' of.write(ostr) cnt = len([x for x in ref_genes.keys() if sum(ref_genes[x])>0]) of.write(' <tr><td>Genes</td><td>Any match</td><td>'+addcommas(len(ref_genes.keys()))+'</td><td>'+addcommas(cnt)+'</td><td>'+perc(cnt,len(ref_genes.keys()),2)+'</td></tr>'+"\n") cnt = len([x for x in ref_genes.keys() if ref_genes[x][1]>0]) of.write(' <tr><td>Genes</td><td>Full-length</td><td>'+addcommas(len(ref_genes.keys()))+'</td><td>'+addcommas(cnt)+'</td><td>'+perc(cnt,len(ref_genes.keys()),2)+'</td></tr>'+"\n") cnt = len([x for x in ref_transcripts.keys() if sum(ref_transcripts[x])>0]) of.write(' <tr><td>Transcripts</td><td>Any match</td><td>'+addcommas(len(ref_transcripts.keys()))+'</td><td>'+addcommas(cnt)+'</td><td>'+perc(cnt,len(ref_transcripts.keys()),2)+'</td></tr>'+"\n") cnt = len([x for x in ref_transcripts.keys() if ref_transcripts[x][1]>0]) of.write(' <tr><td>Transcripts</td><td>Full-length</td><td>'+addcommas(len(ref_transcripts.keys()))+'</td><td>'+addcommas(cnt)+'</td><td>'+perc(cnt,len(ref_transcripts.keys()),2)+'</td></tr>'+"\n") ostr = ''' </table> <table class="data_table one_half"> <tr class="rhead"><td colspan="4">Top Genes</td></tr> <tr><td>Gene</td><td>Partial</td><td>Full-length</td><td>Total Reads</td></tr> ''' of.write(ostr) # get our top genes vs = reversed(sorted(ref_genes.keys(),key=lambda x: sum(ref_genes[x]))[-5:]) for v in vs: of.write(' <tr><td>'+v+'</td><td>'+addcommas(ref_genes[v][0])+'</td><td>'+addcommas(ref_genes[v][1])+'</td><td>'+addcommas(sum(ref_genes[v]))+'</td></tr>'+"\n") ostr=''' </table> <table class="data_table one_half"> <tr class="rhead"><td colspan="5">Top Transcripts</td></tr> <tr><td>Transcript</td><td>Gene</td><td>Partial</td><td>Full-length</td><td>Total Reads</td></tr> ''' of.write(ostr) vs = reversed(sorted(ref_transcripts.keys(),key=lambda x: sum(ref_transcripts[x]))[-5:]) for v in vs: of.write(' 
<tr><td>'+v+'</td><td>'+tx_to_gene[v]+'</td><td>'+addcommas(ref_transcripts[v][0])+'</td><td>'+addcommas(ref_transcripts[v][1])+'</td><td>'+addcommas(sum(ref_transcripts[v]))+'</td></tr>'+"\n") ostr = ''' </table> </div> <div class="clear"></div> </div> <hr /> ''' of.write(ostr) # still in conditional for if we have annotation ################################## # 4. COVERAGE ANALYSIS ### For Coverage we can do part of it without annotations ostr = ''' <div class="subject_title">Coverage analysis     <span class="highlight">''' of.write(ostr+"\n") of.write(perc(coverage_data['genome_covered'],coverage_data['genome_total'],2)+"\n") ostr = ''' </span> <span class="highlight2">reference sequences covered</span> </div> <div class="result_block"> <div class="one_half left"> <div class="rhead">Coverage of reference sequences [<a download="covgraph.pdf" href="plots/covgraph.pdf">pdf</a>]</div> <img src="plots/covgraph.png" alt="covgraph_png" /> </div> <div class="one_half left"> <div class="rhead">Coverage distribution [<a download="perchrdepth.pdf" href="plots/perchrdepth.pdf">pdf</a>]</div> <img src="plots/perchrdepth.png" alt="perchrdepth_png" /> </div> <div class="clear"></div> ''' of.write(ostr) ### The next part of coverage requires annotations if args.annotation: ostr = ''' <div class="one_half left"> <table class="data_table one_half"> <tr class="rhead"><td colspan="4">Coverage statistics</td></tr> <tr><td>Feature</td><td>Feature (bp)</td><td>Coverage (bp)</td><td>Fraction</td></tr> ''' # still in annotation conditional of.write(ostr) of.write(' <tr><td>Genome</td><td>'+addcommas(coverage_data['genome_total'])+'</td><td>'+addcommas(coverage_data['genome_covered'])+'</td><td>'+perc(coverage_data['genome_covered'],coverage_data['genome_total'],2)+'</td></tr>') of.write(' 
<tr><td>Exons</td><td>'+addcommas(coverage_data['exons_total'])+'</td><td>'+addcommas(coverage_data['exons_covered'])+'</td><td>'+perc(coverage_data['exons_covered'],coverage_data['exons_total'],2)+'</td></tr>') of.write(' <tr><td>Introns</td><td>'+addcommas(coverage_data['introns_total'])+'</td><td>'+addcommas(coverage_data['introns_covered'])+'</td><td>'+perc(coverage_data['introns_covered'],coverage_data['introns_total'],2)+'</td></tr>') of.write(' <tr><td>Intergenic</td><td>'+addcommas(coverage_data['intergenic_total'])+'</td><td>'+addcommas(coverage_data['intergenic_covered'])+'</td><td>'+perc(coverage_data['intergenic_covered'],coverage_data['intergenic_total'],2)+'</td></tr>') ostr = ''' </table> </div> <div class="one_half right"> <div class="rhead">Annotated features coverage [<a download="feature_depth.pdf" href="plots/feature_depth.pdf">pdf</a>]</div> <img src="plots/feature_depth.png" alt="feature_depth_png" /> <table class="one_third right"> <tr><td>Genome</td><td><div class="legend_square genome_cov_leg"></div></td> <td>Exons</td><td><div class="legend_square exon_cov_leg"></div></td> <td>Introns</td><td><div class="legend_square intron_cov_leg"></div></td> <td>Intergenic</td><td><div class="legend_square intergenic_cov_leg"></div></td></tr> </table> </div> <div class="one_half left"> <div class="rhead">Bias in alignment to reference transcripts [<a download="bias.pdf" href="plots/bias.pdf">pdf</a>]</div> <table> ''' # still in conditional for annotation requirement of.write(ostr) of.write('<tr><td colspan="2">Evidence from:</td></tr>') of.write('<tr><td>Total Transcripts</td><td>'+str(addcommas(bias_tx_count))+'</td></tr>'+"\n") of.write('<tr><td>Total reads</td><td>'+str(addcommas(bias_read_count))+'</td></tr>'+"\n") ostr=''' </table> <img src="plots/bias.png" alt="bias_png" /> </div> <div class="clear"></div> ''' # still in annotations check of.write(ostr) # done with annotations check ostr = ''' </div> <hr /> ''' of.write(ostr) 
############################################# # 5. RAREFRACTION ANALYSIS ### Rarefraction analysis block requires do_loci or annotations if args.do_loci or args.annotation: ostr = ''' <div class="subject_title"><table><tr><td class="c1">Rarefraction analysis</td> ''' of.write(ostr) if args.annotation: ostr = ''' <td class="c2"><span class="highlight"> ''' # still in do_loci or annotations conditional of.write(ostr) of.write(str(addcommas(geneany))+"\n") ostr = ''' </span></td><td class="c3"><span class="highlight2">Genes detected</span></td><td class="c4"><span class="highlight"> ''' # still in do_loci or annotations conditional of.write(ostr) of.write(str(addcommas(genefull))+"\n") ostr = ''' </span></td><td class="c5"><span class="highlight2">Full-length genes</span></td> ''' # still in do_loci or annotations conditional of.write(ostr) ostr = ''' </tr></table> </div> <div class="result_block"> <div class="one_half left"> ''' of.write(ostr) if args.annotation: ostr = ''' <div class="rhead">Gene detection rarefraction [<a download="gene_rarefraction.pdf" href="plots/gene_rarefraction.pdf">pdf</a>]</div> <img src="plots/gene_rarefraction.png" alt="gene_rarefraction_png" /> </div> <div class="one_half left"> <div class="rhead">Transcript detection rarefraction [<a download="transcript_rarefraction" href="plots/transcript_rarefraction.pdf">pdf</a>]</div> <img src="plots/transcript_rarefraction.png" alt="transcript_rarefraction_png" /> </div> <div class="clear"></div> ''' # still in args.annotation of.write(ostr) #done with args.anotation ostr = ''' <div class="one_half left"> <table class="data_table one_third"> <tr><td class="rhead" colspan="3">Rarefraction stats</td></tr> <tr class="bold"><td>Feature</td><td>Criteria</td><td>Count</td></tr> ''' # still in do_loci or annotations conditional of.write(ostr+"\n") if args.annotation: of.write('<tr><td>Gene</td><td>full-length</td><td>'+str(addcommas(genefull))+'</td></tr>') of.write('<tr><td>Gene</td><td>any 
match</td><td>'+str(addcommas(geneany))+'</td></tr>') of.write('<tr><td>Transcript</td><td>full-length</td><td>'+str(addcommas(txfull))+'</td></tr>') of.write('<tr><td>Transcript</td><td>any match</td><td>'+str(addcommas(txany))+'</td></tr>') if args.do_loci: of.write('<tr><td>Locus</td><td></td><td>'+str(addcommas(locuscount))+'</td></tr>') ostr=''' </table> <table id="rarefraction_legend"> <tr><td>Any match</td><td><div class="rareany_leg legend_square"></div></td></tr> <tr><td>full-length</td><td><div class="rarefull_leg legend_square"></div></td></tr> <tr><td class="about" colspan="2">vertical line height indicates 5%-95% CI of sampling</td></tr> </table> </div> ''' # still in do_loci or annotations conditional of.write(ostr) if args.do_loci: ostr = ''' <div class="one_half left"> <div class="rhead">Locus detection rarefraction [<a download="locus_rarefraction.pdf" href="plots/locus_rarefraction.pdf">pdf</a>]</div> <img src="plots/locus_rarefraction.png" alt="locus_rarefraction_png" /> </div> ''' # in do_loci condition of.write(ostr) # still in do_loci or annotations conditional ostr = ''' </div> <div class="clear"></div> <hr /> ''' # still in do_loci or annotations conditional of.write(ostr) # Finished do_loci or annotations conditional ################################### # 6. 
ERROR PATTERN # We need a reference in order to do error pattern analysis if args.reference: ostr = ''' <div class="subject_title">Error pattern analysis     <span class="highlight"> ''' #if args.reference of.write(ostr+"\n") error_rate = perc(e['ANY_ERROR'],e['ALIGNMENT_BASES'],3) of.write(error_rate) ostr=''' </span> <span class="highlight2">error rate</span></div> <div class="subject_subtitle">      based on aligned segments</div> <div class="result_block"> <div class="full_length right"> <div class="rhead">Error rates, given a target sequence [<a download="context_plot.pdf" href="plots/context_plot.pdf">pdf</a>]</div> <img src="plots/context_plot.png" alt="context_plot_png" /> </div> <div class="clear"></div> <table class="data_table one_third left"> <tr class="rhead"><td colspan="3">Alignment stats</td></tr> ''' # if args.reference of.write(ostr+"\n") best_alignments_sampled_string = '<tr><td>Best alignments sampled</td><td>'+str(e['ALIGNMENT_COUNT'])+'</td><td></td></tr>' of.write(best_alignments_sampled_string+"\n") ostr = ''' <tr class="rhead"><td colspan="3">Base stats</td></tr> ''' # if args.reference of.write(ostr+"\n") bases_analyzed_string = '<tr><td>Bases analyzed</td><td>'+str(addcommas(e['ALIGNMENT_BASES']))+'</td><td></td></tr>' of.write(bases_analyzed_string+"\n") correctly_aligned_string = '<tr><td>- Correctly aligned bases</td><td>'+str(addcommas(e['ALIGNMENT_BASES']-e['ANY_ERROR']))+'</td><td>'+perc((e['ALIGNMENT_BASES']-e['ANY_ERROR']),e['ALIGNMENT_BASES'],1)+'</td></tr>' of.write(correctly_aligned_string+"\n") total_error_string = '<tr><td>- Total error bases</td><td>'+str(addcommas(e['ANY_ERROR']))+'</td><td>'+perc(e['ANY_ERROR'],e['ALIGNMENT_BASES'],3)+'</td></tr>' of.write(total_error_string+"\n") mismatched_string = '<tr><td>--- Mismatched bases</td><td>'+str(addcommas(e['MISMATCHES']))+'</td><td>'+perc(e['MISMATCHES'],e['ALIGNMENT_BASES'],3)+'</td></tr>' of.write(mismatched_string+"\n") deletion_string = '<tr><td>--- Deletion 
bases</td><td>'+str(addcommas(e['ANY_DELETION']))+'</td><td>'+perc(e['ANY_DELETION'],e['ALIGNMENT_BASES'],3)+'</td></tr>' of.write(deletion_string+"\n") complete_deletion_string = '<tr><td>----- Complete deletion bases</td><td>'+str(addcommas(e['COMPLETE_DELETION']))+'</td><td>'+perc(e['COMPLETE_DELETION'],e['ALIGNMENT_BASES'],3)+'</td></tr>' of.write(complete_deletion_string+"\n") homopolymer_deletion_string = '<tr><td>----- Homopolymer deletion bases</td><td>'+str(addcommas(e['HOMOPOLYMER_DELETION']))+'</td><td>'+perc(e['HOMOPOLYMER_DELETION'],e['ALIGNMENT_BASES'],3)+'</td></tr>' of.write(homopolymer_deletion_string+"\n") insertion_string = '<tr><td>--- Insertion bases</td><td>'+str(addcommas(e['ANY_INSERTION']))+'</td><td>'+perc(e['ANY_INSERTION'],e['ALIGNMENT_BASES'],3)+'</td></tr>' of.write(insertion_string+"\n") complete_insertion_string = '<tr><td>----- Complete insertion bases</td><td>'+str(addcommas(e['COMPLETE_INSERTION']))+'</td><td>'+perc(e['COMPLETE_INSERTION'],e['ALIGNMENT_BASES'],3)+'</td></tr>' of.write(complete_insertion_string+"\n") homopolymer_insertion_string = '<tr><td>----- Homopolymer insertion bases</td><td>'+str(addcommas(e['HOMOPOLYMER_INSERTION']))+'</td><td>'+perc(e['HOMOPOLYMER_INSERTION'],e['ALIGNMENT_BASES'],3)+'</td></tr>' of.write(homopolymer_insertion_string+"\n") ostr = ''' </table> <div class="one_half left"> <div class="rhead">Alignment-based error rates [<a download="alignment_error_plot.pdf" href="plots/alignment_error_plot.pdf">pdf</a>]</div> <img class="square_image" src="plots/alignment_error_plot.png" alt="alignment_error_plot_png" /> </div> </div> <div class="clear"></div> <hr /> ''' #if args.reference of.write(ostr) # finished with args.reference condition ############################## # 8. 
Raw data block ostr = ''' <div id="bed_data"> <table class="header_table"> <tr><td class="rhead" colspan="2">Browser-ready Bed data</td></tr> <tr> <td>Best Alignments:</td> <td class="raw_files"><a download="best.sorted.bed.gz" href="data/best.sorted.bed.gz">best.sorted.bed.gz</a></td> </tr> <tr> <td>Gapped Alignments:</td> <td class="raw_files"><a download="gapped.bed.gz" href="data/gapped.bed.gz">gapped.bed.gz</a></td> </tr> <tr> <td>Trans-chimeric Alignments:</td> <td class="raw_files"><a download="chimera.bed.gz" href="data/chimera.bed.gz">chimera.bed.gz</a></td> </tr> <tr> <td>Self-chimeric Alignments:</td> <td class="raw_files"><a download="technical_chimeras.bed.gz" href="data/technical_chimeras.bed.gz">technical_chimeras.bed.gz</a></td> </tr> <tr> <td>Other-chimeric Alignments:</td> <td class="raw_files"><a download="techinical_atypical_chimeras.bed.gz" href="data/technical_atypical_chimeras.bed.gz">techinical_atypical_chimeras.bed.gz</a></td> </tr> </table> </div> <div id="raw_data"> <table class="header_table"> <tr><td class="rhead" colspan="2">Raw data</td></tr> <tr> <td>Alignments stats raw report:</td> <td class="raw_files"><a id="alignment_stats.txt" href="data/alignment_stats.txt">alignment_stats.txt</a></td> </tr> <tr> <td>Read lengths:</td> <td class="raw_files"><a download="lengths.txt.gz" href="data/lengths.txt.gz">lengths.txt.gz</a></td> </tr> <tr> <td>Reference sequence lengths:</td> <td class="raw_files"><a id="chrlens.txt" href="data/chrlens.txt">chrlens.txt</a></td> </tr> <tr> <td>Coverage bed:</td> <td class="raw_files"><a download="depth.sorted.bed.gz" href="data/depth.sorted.bed.gz">depth.sorted.bed.gz</a></td> </tr> ''' of.write(ostr) if args.do_loci: of.write('<tr> <td>Loci basics bed:</td><td class="raw_files"><a download="loci.bed.gz" href="data/loci.bed.gz">loci.bed.gz</a></td></tr>'+"\n") of.write('<tr><td>Locus read data bed:</td><td class="raw_files"><a download="loci-all.bed.gz" 
href="data/loci-all.bed.gz">loci-all.bed.gz</a></td></tr>'+"\n") of.write('<tr><td>Locus rarefraction:</td><td class="raw_files"><a download="locus_rarefraction.txt" href="data/locus_rarefraction.txt">locus_rarefraction.txt</a></td></tr>'+"\n") if args.annotation: ostr = ''' <tr> <td>Read annotations:</td> <td class="raw_files"><a download="annotbest.txt.gz" href="data/annotbest.txt.gz">annotbest.txt.gz</a></td> </tr> <tr> <td>Read genomic features:</td> <td class="raw_files"><a download="read_genomic_features.txt.gz" href="data/read_genomic_features.txt.gz">read_genomic_features.txt.gz</a></td> </tr> <tr> <td>Annotation status and read lengths:</td> <td class="raw_files"><a download="annot_lengths.txt.gz" href="data/annot_lengths.txt.gz">annot_lengths.txt.gz</a></td> </tr> <tr> <td>Gene any match rarefraction:</td> <td class="raw_files"><a download="gene_rarefraction.txt" href="data/gene_rarefraction.txt">gene_rarefraction.txt</a></td> </tr> <tr> <td>Gene full-length rarefraction:</td> <td class="raw_files"><a download="gene_full_rarefraction.txt" href="data/gene_full_rarefraction.txt">gene_full_rarefraction.txt</a></td> </tr> <tr> <td>Transcript any match rarefraction:</td> <td class="raw_files"><a download="transcript_rarefraction.txt" href="data/transcript_rarefraction.txt">transcript_rarefraction.txt</a></td> </tr> <tr> <td>Transcript full-length rarefraction:</td> <td class="raw_files"><a download="transcript_full_rarefraction.txt" href="data/transcript_full_rarefraction.txt">transcript_full_rarefraction.txt</a></td> </tr> <tr> <td>Bias table:</td> <td class="raw_files"><a download="bias_table.txt.gz" href="data/bias_table.txt.gz">bias_table.txt.gz</a></td> </tr> ''' # if args.annotation of.write(ostr) # done with args.annotation #output data that depends on reference if args.reference: ostr = ''' <tr> <td>Alignment errors data:</td> <td class="raw_files"><a download="error_data.txt" href="data/error_data.txt">error_data.txt</a></td> </tr> <tr> <td>Alignment 
error report:</td> <td class="raw_files"><a download="error_stats.txt" href="data/error_stats.txt">error_stats.txt</a></td> </tr> <tr> <td>Contextual errors data:</td> <td class="raw_files"><a download="context_error_data.txt" href="data/context_error_data.txt">context_error_data.txt</a></td> </tr> ''' # if args.reference of.write(ostr) # back to any condition ostr = ''' </table> </div> </body> </html> ''' of.write(ostr)
def _iter_gzip_lines(*paths):
    """Yield every line of each gzip file in paths, in order.

    Each file is closed as soon as it is exhausted (or if the consumer stops
    early).
    """
    for path in paths:
        inf = gzip.open(path)
        try:
            for line in inf:
                yield line
        finally:
            inf.close()


def _iter_named_locus_lines(path):
    """Yield GPD lines from the gzip file at path with a per-locus gene name.

    Entries are grouped with LocusStream(GPDStream(...)).  Every entry of the
    z-th locus (1-based) has its first column replaced by
    'LOC<z>|<number of entries in the locus>|<locus range string>'.
    """
    inf = gzip.open(path)
    try:
        z = 0
        for rng in LocusStream(GPDStream(inf)):
            z += 1
            gpds = rng.get_payload()
            gene_name = 'LOC' + str(z) + '|' + str(len(gpds)) + '|' + \
                rng.get_range_string()
            for gpd in gpds:
                f = gpd.get_gpd_line().rstrip().split("\t")
                f[0] = gene_name
                yield "\t".join(f) + "\n"
    finally:
        # This handle used to be left open for the rest of the run.
        inf.close()


def _sort_to_gzip(lines, out_path):
    """Pipe lines through `sort -k3,3 -k5,5n -k6,6n | gzip` into out_path.

    lines is any iterable of newline-terminated GPD lines; it is consumed
    only after both subprocesses have been started.
    """
    of = open(out_path, 'w')
    try:
        p2 = Popen(['gzip'], stdout=of, stdin=PIPE)
        p1 = Popen(['sort', '-k3,3', '-k5,5n', '-k6,6n'],
                   stdout=p2.stdin, stdin=PIPE)
        for line in lines:
            p1.stdin.write(line)
        # communicate() closes each stdin and waits, sort first so that gzip
        # sees the complete sorted stream before its own input is closed.
        p1.communicate()
        p2.communicate()
    finally:
        of.close()


def _gpd_to_nr_cmd(args, in_name, out_name, gene_names):
    """Build the gpd_to_nr.py command line for files inside args.tempdir.

    gene_names toggles the '--gene_names' flag.
    """
    cmd = 'gpd_to_nr.py '+args.tempdir+'/'+in_name+' '+\
          ' -j '+str(args.junction_tolerance)+' --threads '+str(args.threads)+\
          ' --minimum_junction_end_support '+str(args.minimum_junction_end_support)+\
          ' --minimum_support '+str(args.minimum_support)
    if gene_names:
        cmd += ' --gene_names '
    return cmd+' -o '+args.tempdir+'/'+out_name


def _write_without_blacklisted(classify_path, gpd_in_path, gpd_out_path):
    """Copy gpd_in_path to gpd_out_path, dropping entries classified as known.

    A classification row whose third column is 'subset' or 'full' blacklists
    the name in its first column; GPD lines whose second column is a
    blacklisted name are skipped.  All three files are gzip files.
    """
    blacklist = set()
    finf = gzip.open(classify_path)
    for line in finf:
        f = line.rstrip().split("\t")
        if f[2] in ('subset', 'full'):
            blacklist.add(f[0])
    finf.close()
    fof = gzip.open(gpd_out_path, 'w')
    finf = gzip.open(gpd_in_path)
    for line in finf:
        f = line.rstrip().split("\t")
        if f[1] in blacklist:
            continue
        fof.write(line)
    finf.close()
    fof.close()


def main():
    """Build non-redundant novel-isoform and novel-locus genePred files.

    Reads its options through do_inputs(), does all intermediate work inside
    args.tempdir, and finally copies two files into the args.output
    directory (created if missing):

      novel_loci_nr_named.sorted.gpd.gz
      novel_isoforms_nr.sorted.gpd.gz

    The temporary directory is removed at the end unless
    args.specific_tempdir is set.
    """
    #do our inputs
    args = do_inputs()
    tmp = args.tempdir

    # first we need to run the classify
    classify_reads.external_cmd('classify_reads.py ' + args.input_annot +
                                ' ' + args.input_gpd + ' -o ' + tmp +
                                '/classify.txt.gz')
    get_novel_sets(tmp + '/classify.txt.gz', args.input_gpd,
                   tmp + '/novel_isoform_reads.gpd.gz',
                   tmp + '/novel_locus_reads.gpd.gz', args)

    # Now we can make a new non-redundant set of genpreds from the novel isoforms
    sys.stderr.write("making NR novel isoforms\n")
    gpd_to_nr.external_cmd(_gpd_to_nr_cmd(
        args, 'novel_isoform_reads.gpd.gz', 'novel_isoforms_nr.gpd.gz', True))

    sys.stderr.write("reannotating novel based on our new gpd\n")
    # Now we reannotate the novel based on the these newly annotated isoforms
    cmd = 'gpd_anntotate.py '+tmp+'/novel_locus_reads.gpd.gz '+\
          ' --threads '+str(1)+' '+\
          ' -r '+tmp+'/novel_isoforms_nr.gpd.gz '+\
          ' -o '+tmp+'/novel_locus_reads.annot.txt.gz'
    gpd_annotate.external_cmd(cmd)

    # now this new annotation should be classified
    # the new isoform will be in novel_isoform_reads.gpd.gz
    cmd = 'classify_reads.py ' + tmp + '/novel_locus_reads.annot.txt.gz ' + \
        tmp + '/novel_locus_reads.gpd.gz -o ' + tmp + '/classify_novel.txt.gz'
    sys.stderr.write(cmd + "\n")
    classify_reads.external_cmd(cmd)
    get_novel_sets(tmp + '/classify_novel.txt.gz',
                   tmp + '/novel_locus_reads.gpd.gz',
                   tmp + '/novel_isoform_reads2.gpd.gz',
                   tmp + '/novel_locus_reads2.gpd.gz', args)

    # now lets combine our novel isoform reads making sure to sort them
    _sort_to_gzip(
        _iter_gzip_lines(tmp + '/novel_isoform_reads.gpd.gz',
                         tmp + '/novel_isoform_reads2.gpd.gz'),
        tmp + '/new_novel_isoform_reads.gpd.gz')

    # Now we can make a new non-redundant set of genpreds from the novel isoforms
    sys.stderr.write("making NR novel isoforms\n")
    gpd_to_nr.external_cmd(_gpd_to_nr_cmd(
        args, 'new_novel_isoform_reads.gpd.gz', 'novel_isoforms_nr2.gpd.gz',
        True))
    # Reannotating the leftover reads against novel_isoforms_nr2 is only
    # needed if we are interested in whats left over, so it is not done here.

    sys.stderr.write("now work on the novel loci\n")
    # Now lets work on the novel locus
    _sort_to_gzip(_iter_gzip_lines(tmp + '/novel_locus_reads2.gpd.gz'),
                  tmp + '/sorted_novel_locus_reads.gpd.gz')

    sys.stderr.write("making NR novel loci\n")
    gpd_to_nr.external_cmd(_gpd_to_nr_cmd(
        args, 'sorted_novel_locus_reads.gpd.gz', 'novel_locus_nr.gpd.gz',
        False))

    sys.stderr.write("sort the novel isoforms\n")
    _sort_to_gzip(_iter_gzip_lines(tmp + '/novel_isoforms_nr2.gpd.gz'),
                  tmp + '/novel_isoforms_nr.sorted.gpd.gz')

    sys.stderr.write("sort the novel loci\n")
    _sort_to_gzip(_iter_gzip_lines(tmp + '/novel_locus_nr.gpd.gz'),
                  tmp + '/novel_loci_nr.sorted.gpd.gz')

    # Now we can rename totally novel genes based on locus overlap
    _sort_to_gzip(
        _iter_named_locus_lines(tmp + '/novel_loci_nr.sorted.gpd.gz'),
        tmp + '/novel_loci_nr_named.sorted.gpd.gz')

    # we are almost done but we need to make sure these genepreds aren't subsets of known genes
    sys.stderr.write("reannotating novel-isoform by reference\n")
    cmd = 'gpd_anntotate.py '+tmp+'/novel_isoforms_nr.sorted.gpd.gz '+\
          ' --threads '+str(1)+' '+\
          ' -r '+args.reference_annotation_gpd+\
          ' -o '+tmp+'/novel_isoforms_nr.annot.txt.gz'
    gpd_annotate.external_cmd(cmd)
    cmd = 'classify_reads.py ' + tmp + '/novel_isoforms_nr.annot.txt.gz ' + \
        tmp + '/novel_isoforms_nr.sorted.gpd.gz -o ' + tmp + \
        '/classify_novel_isoform_ref.txt.gz'
    sys.stderr.write(cmd + "\n")
    classify_reads.external_cmd(cmd)

    # now we can screen to make sure things in the novel isoform file really are novel isoforms
    _write_without_blacklisted(
        tmp + '/classify_novel_isoform_ref.txt.gz',
        tmp + '/novel_isoforms_nr.sorted.gpd.gz',
        tmp + '/novel_isoforms_nr.filtered.sorted.gpd.gz')

    sys.stderr.write("reannotating novel-locus by reference\n")
    cmd = 'gpd_anntotate.py '+tmp+'/novel_loci_nr_named.sorted.gpd.gz '+\
          ' --threads '+str(1)+' '+\
          ' -r '+args.reference_annotation_gpd+\
          ' -o '+tmp+'/novel_loci_nr_named.annot.txt.gz'
    gpd_annotate.external_cmd(cmd)
    cmd = 'classify_reads.py ' + tmp + '/novel_loci_nr_named.annot.txt.gz ' + \
        tmp + '/novel_loci_nr_named.sorted.gpd.gz -o ' + tmp + \
        '/classify_novel_loci.txt.gz'
    sys.stderr.write(cmd + "\n")
    classify_reads.external_cmd(cmd)

    # now we can screen to make sure things in the novel locus file really are novel loci
    _write_without_blacklisted(
        tmp + '/classify_novel_loci.txt.gz',
        tmp + '/novel_loci_nr_named.sorted.gpd.gz',
        tmp + '/novel_loci_nr_named.filtered.sorted.gpd.gz')

    if not os.path.exists(args.output):
        os.makedirs(args.output)
    copy(tmp + '/novel_loci_nr_named.filtered.sorted.gpd.gz',
         args.output + '/novel_loci_nr_named.sorted.gpd.gz')
    copy(tmp + '/novel_isoforms_nr.filtered.sorted.gpd.gz',
         args.output + '/novel_isoforms_nr.sorted.gpd.gz')

    # Temporary working directory step 3 of 3 - Cleanup
    if not args.specific_tempdir:
        rmtree(args.tempdir)
def main():
    """Compare input genePred entries against a reference, locus by locus.

    Options come from do_inputs().  The input (args.input, or stdin when it
    is empty) and args.reference are each sorted into args.tempdir with
    sort_gpd(), then streamed together through MultiLocusStream.  Every
    locus that contains at least one input entry is handed to
    process_locus(), either inline (args.threads <= 1) or through a
    multiprocessing Pool.

    The lines returned by process_locus() are written to args.output
    (gzip-compressed when the name ends in '.gz') or to stdout.  Lines whose
    fourth tab-separated column is a non-zero integer are counted and
    reported on stderr as "Unsupported Transcripts".

    The temporary directory is removed at the end unless
    args.specific_tempdir is set.
    """
    #do our inputs
    args = do_inputs()
    inf = sys.stdin
    if args.input:
        if args.input[-3:] == '.gz':
            inf = gzip.open(args.input)
        else:
            inf = open(args.input)
    of = open(args.tempdir+'/input.gpd.gz', 'w')
    sys.stderr.write("sorting our input\n")
    input_cnt = sort_gpd(inf, of, args)
    of.close()
    inf.close()

    if args.reference[-3:] == '.gz':
        rinf = gzip.open(args.reference)
    else:
        rinf = open(args.reference)
    sys.stderr.write("sorting our reference\n")
    rof = open(args.tempdir+'/ref.gpd.gz', 'w')
    sort_gpd(rinf, rof, args)
    rof.close()
    rinf.close()  # previously leaked

    # Now we can traverse the ordered files by locus
    inf_input = gzip.open(args.tempdir+'/input.gpd.gz')
    inf_ref = gzip.open(args.tempdir+'/ref.gpd.gz')
    gsi = GPDStream(inf_input)
    gsr = GPDStream(inf_ref)
    mls = MultiLocusStream([gsi, gsr])
    z = 0  # loci seen
    y = 0  # input entries dispatched so far
    output = []
    # Only build a pool when it will be used; any thread count below 2
    # (including 0 or negative values) runs inline instead of failing on an
    # unbound pool.
    p = None
    if args.threads > 1:
        p = Pool(args.threads)
    sys.stderr.write("processing "+str(input_cnt)+" inputs\n")
    for rng in mls:
        z += 1
        if z % 10 == 0:
            # +1 keeps the progress estimate defined when input_cnt is 0
            perc = int(100*float(y)/float(input_cnt+1))
            sys.stderr.write(rng.get_range_string()+" "+str(y)+" inputs "+str(perc)+"%    \r")
        (input_entries, reference_entries) = rng.get_payload()
        if len(input_entries) == 0:
            continue
        # Lets convert these back to lines to make the easier to pass through multiprocessing
        igpds = [x.get_gpd_line() for x in input_entries]
        rgpds = [x.get_gpd_line() for x in reference_entries]
        y += len(input_entries)
        if p is None:
            # MiniQueue gives the inline result the same .get() interface as
            # the AsyncResult returned by apply_async.
            output.append(MiniQueue(process_locus(igpds, rgpds, args)))
        else:
            output.append(p.apply_async(process_locus, args=(igpds, rgpds, args,)))
    sys.stderr.write("\n")
    # Everything handed to the workers is plain text lines, so the sorted
    # temporary inputs can be released now (previously leaked).
    inf_input.close()
    inf_ref.close()
    if p is not None:
        p.close()
        p.join()

    of = sys.stdout
    if args.output:
        if args.output[-3:] == '.gz':
            of = gzip.open(args.output, 'w')
        else:
            of = open(args.output, 'w')
    tn_cnt = 0
    for out in output:
        outlines = out.get()
        for line in outlines:
            f = line.rstrip().split("\t")
            if int(f[3]) != 0:
                tn_cnt += 1
            # NOTE(review): the source formatting was collapsed; the write is
            # assumed to apply to every line, not only the counted ones.
            of.write(line+"\n")
    of.close()
    perc = '?'
    if input_cnt > 0:
        perc = int(100*float(tn_cnt)/float(input_cnt))
    sys.stderr.write("Found "+str(tn_cnt)+" "+str(perc)+"% Unsupported Transcripts\n")

    # Temporary working directory step 3 of 3 - Cleanup
    if not args.specific_tempdir:
        rmtree(args.tempdir)
def main(args):
    """Write a per-transcript table of 100 binned values from read annotations.

    Steps, as performed below:

    1. Load args.ref_genepred with GPDStream, keyed by 1-based entry number.
    2. Parse the tab-separated args.annotations file (gzip detected with
       is_gzip).  Every row is recorded under its reference line number;
       rows are additionally kept per read line unless rejected by
       args.full (type must be 'full') or args.minimum_matched_exons.
    3. Load args.read_genepred and group the kept reads by reference line.
    4. Call do_tx_line() for each reference entry that has kept reads.  A
       falsy return skips the transcript; otherwise v[0] is read as a dict
       keyed by the strings '1'..'100' and v[1] is added to the read total.
    5. Write one line per transcript (name plus 100 tab-separated values,
       0 where a bin is absent) to args.output or stdout, optionally write
       "<transcripts>\\t<reads>" to args.output_counts, and report the same
       totals on stderr.
    """
    sys.stderr.write("Reading in reference genePred\n")
    refgpd = {}
    inf = open(args.ref_genepred)
    z = 0
    for gpd in GPDStream(inf):
        z += 1
        refgpd[z] = gpd
    inf.close()

    sys.stderr.write("Reading in read annotations\n")
    if is_gzip(args.annotations):
        inf = gzip.open(args.annotations)
    else:
        inf = open(args.annotations)
    reflocs = {}  # ref_line -> every annotation row for that reference entry
    rline = {}    # read_line -> annotation row that passed the filters
    for line in inf:
        f = line.rstrip().split("\t")
        res = {
            'read_line': int(f[0]),
            'read_name': f[1],
            'gene_name': f[2],
            'tx_name': f[3],
            'type': f[4],
            'matching_exon_count': int(f[5]),
            'consecutive_exons': int(f[6]),
            'read_exons': int(f[7]),
            'tx_exons': int(f[8]),
            'overlap': int(f[9]),
            'read_length': int(f[10]),
            'tx_length': int(f[11]),
            'read_range': GenomicRange(range_string=f[12]),
            'tx_range': GenomicRange(range_string=f[13]),
            'ref_line': int(f[14]),
        }
        # reflocs keeps all rows, including the ones filtered out of rline
        reflocs.setdefault(res['ref_line'], []).append(res)
        if args.full and res['type'] != 'full':
            continue
        if args.minimum_matched_exons > res['matching_exon_count']:
            continue
        rline[res['read_line']] = res
    inf.close()

    sys.stderr.write("reading read genepred\n")
    if is_gzip(args.read_genepred):
        inf = gzip.open(args.read_genepred)
    else:
        inf = open(args.read_genepred)
    z = 0
    originals = {}  # ref_line -> {read_line: read gpd}
    for gpd in GPDStream(inf):
        z += 1
        if z not in rline:
            continue
        originals.setdefault(rline[z]['ref_line'], {})[z] = gpd
    inf.close()

    read_total = 0
    outs = {}
    for tx_line in originals:
        ref_gpd = refgpd[tx_line]
        annots = reflocs[tx_line]
        reads = originals[tx_line].values()
        v = do_tx_line(ref_gpd, annots, reads, args)
        if not v:
            continue
        tname = ref_gpd.get_transcript_name()
        row = [0] * 100
        read_total += v[1]
        for i in range(1, 101):
            key = str(i)
            if key in v[0]:
                row[i - 1] = v[0][key]
        outs[tname] = row

    of = sys.stdout
    if args.output:
        # Match the '.gz' suffix only; the previous unanchored regex search
        # also compressed names that merely contained '.gz'.
        if args.output.endswith('.gz'):
            of = gzip.open(args.output, 'w')
        else:
            of = open(args.output, 'w')
    tot = len(outs)
    for tname in outs:
        of.write(tname + "\t" + "\t".join([str(x) for x in outs[tname]]) + "\n")
    of.close()

    if args.output_counts:
        of = open(args.output_counts, 'w')
        of.write(str(tot) + "\t" + str(read_total) + "\n")
        of.close()
    sys.stderr.write(
        str(tot) + " total transcripts \t" + str(read_total) + " total reads\n")
def main(args):
    """Group genePred entries into loci and write per-transcript coverage rows.

    Input is read from args.input ('-' means stdin, a '.gz' suffix means
    gzip) and streamed through LocusStream(GPDStream(...)).  When
    args.downsample is set, a locus with more entries than that is shuffled
    and truncated to args.downsample entries.  Each locus is passed to
    do_locus(), inline when args.threads <= 1 or through a multiprocessing
    Pool otherwise (with process_output as the completion callback).

    Non-empty results are ordered by (chr, start, end).  For each transcript
    locus a summary row is written to args.output_loci when that option is
    given, and for each transcript in it a row is written to args.output
    (or stdout) holding the locus range, the running locus number, the
    transcript count, the first payload item, the total number of entries
    processed, the gene name and the 'average_coverage',
    'fraction_covered' and 'mindepth' values of the second payload item.
    """
    # Setup inputs
    inf = sys.stdin
    if args.input != '-':
        if re.search(r'\.gz$', args.input):
            inf = gzip.open(args.input)
        else:
            inf = open(args.input)
    # Setup outputs
    of = sys.stdout
    if args.output:
        if re.search(r'\.gz$', args.output):
            of = gzip.open(args.output, 'w')
        else:
            of = open(args.output, 'w')

    mr = TranscriptLociMergeRules('is_any_overlap')
    mr.set_use_junctions(False)
    p = None
    if args.threads > 1:
        p = Pool(processes=args.threads)
    results = []
    z = 0  # entries dispatched so far; also passed to do_locus
    for locus in LocusStream(GPDStream(inf)):
        vals = locus.get_payload()
        if args.downsample:
            if len(vals) > args.downsample:
                shuffle(vals)
                vals = vals[0:args.downsample]
                locus.set_payload(vals)
        if args.threads <= 1:
            # Queue wraps the inline result so it offers .get() like the
            # AsyncResult from apply_async.
            tls = Queue(do_locus(locus, mr, z, args, verbose=True))
        else:
            tls = p.apply_async(do_locus,
                                args=(locus, mr, z, args, False),
                                callback=process_output)
        results.append(tls)
        z += len(locus.get_payload())
    if p is not None:
        p.close()
        p.join()

    sys.stderr.write("\n")
    sys.stderr.write("Outputing results\n")
    # The loci file is optional.  It used to be written unconditionally,
    # which raised NameError whenever args.output_loci was not given.
    ofl = None
    if args.output_loci:
        if re.search(r'\.gz$', args.output_loci):
            ofl = gzip.open(args.output_loci, 'w')
        else:
            ofl = open(args.output_loci, 'w')

    def range_key(item):
        # Sort key for objects that expose their coordinates via get_range()
        r = item.get_range()
        return (r.chr, r.start, r.end)

    lnum = 0
    finished = [y for y in [r.get() for r in results] if y]
    for res in sorted(finished, key=lambda x: (x.chr, x.start, x.end)):
        for tl in sorted(res.get_payload(), key=range_key):
            lnum += 1
            txs = sorted(tl.get_transcripts(), key=range_key)
            tlrng = [str(x) for x in tl.get_range().get_bed_array()]
            prefix = "\t".join(tlrng) + "\t" + str(lnum) + "\t" + str(len(txs))
            if ofl is not None:
                ofl.write(prefix + "\n")
            for tx in txs:
                cov = tx.get_payload()[1]
                of.write(prefix + "\t" + str(tx.get_payload()[0]) + "\t" +
                         str(z) + "\t" + tx.get_gene_name() + "\t" +
                         str(cov['average_coverage']) + "\t" +
                         str(cov['fraction_covered']) + "\t" +
                         str(cov['mindepth']) + "\n")
    if ofl is not None:
        ofl.close()
    inf.close()
    of.close()