예제 #1
0
def main(args):
    """Emit the exon coverage of every locus in a GPD stream.

    Reads GPD entries from ``args.input`` ('-' selects STDIN, a name ending
    in .gz is opened with gzip), groups them with LocusStream, and for each
    locus passes the ranges of all exons to ranges_to_coverage.  Every
    resulting coverage range is written to ``args.output`` (STDOUT when
    unset, gzip when the name ends in .gz) as its BED coordinates followed
    by its payload, tab separated.
    """
    # Choose the reader.
    if args.input == '-':
        source = sys.stdin
    elif re.search('\.gz$', args.input):
        source = gzip.open(args.input)
    else:
        source = open(args.input)

    # Choose the writer.
    if not args.output:
        sink = sys.stdout
    elif re.search('\.gz$', args.output):
        sink = gzip.open(args.output, 'w')
    else:
        sink = open(args.output, 'w')

    for locus in LocusStream(GPDStream(source)):
        # Flatten the exons of every entry in the locus into one range list.
        exon_ranges = [exon.get_range()
                       for entry in locus.get_payload()
                       for exon in entry.exons]
        for cov in ranges_to_coverage(exon_ranges):
            coords = "\t".join([str(x) for x in cov.get_bed_coordinates()])
            depth = str(+cov.get_payload())
            sink.write(coords + "\t" + depth + "\n")

    sink.close()
    source.close()
def main(args):
    """Compute per-locus output lines in a worker pool and write them out.

    Reads GPD entries from ``args.input`` ('-' selects STDIN, a name ending
    in .gz is opened with gzip), groups them with LocusStream and feeds
    ``generate_gpd(loci)`` to ``do_locus`` through ``Pool.imap`` using
    ``args.threads`` worker processes.  Each result is an iterable of
    strings that are written verbatim, in input order, to ``args.output``
    (STDOUT when unset, gzip when the name ends in .gz).

    The pool is always shut down and both handles are closed, even when
    reading, processing or writing fails.
    """
    inf = sys.stdin
    if args.input != '-':
        if re.search(r'\.gz$', args.input):
            inf = gzip.open(args.input)
        else:
            inf = open(args.input)
    of = sys.stdout
    if args.output:
        if re.search(r'\.gz$', args.output):
            of = gzip.open(args.output, 'w')
        else:
            of = open(args.output, 'w')

    # presumably multiprocessing.Pool -- confirm against the file's imports
    p = Pool(processes=args.threads)
    try:
        loci = LocusStream(GPDStream(inf))
        csize = 100
        results = p.imap(func=do_locus,
                         iterable=generate_gpd(loci),
                         chunksize=csize)
        for covs in results:
            for cov in covs:
                of.write(cov)
        # Normal completion: no more work will be submitted.
        p.close()
    except BaseException:
        # Do not wait for queued work when we are already failing.
        p.terminate()
        raise
    finally:
        # join() is valid after either close() or terminate().
        p.join()
        of.close()
        inf.close()
예제 #3
0
def main(args):
  """Run do_multi_round_locus on every locus of a GPD stream and write the results.

  Input comes from ``args.input`` ('-' selects STDIN, a name ending in .gz
  is opened with gzip); output goes to ``args.output`` (STDOUT when unset,
  gzip when the name ends in .gz).  With ``args.threads`` > 1 the loci are
  dispatched to a process pool, otherwise they are processed inline and
  wrapped in MiniQueue so both paths expose ``.get()``.

  Each result item is a mapping whose 'tx' value must pass ``validate()``;
  its ``get_fake_gpd_line()`` is written, with the first column replaced by
  the gene name when ``args.gene_names`` is set.  An invalid entry is
  reported on STDERR and the process exits with status 1.

  Finally ``args.tempdir`` is removed unless ``args.specific_tempdir`` is set.
  """
  inf = sys.stdin
  if args.input != '-':
    if args.input[-3:] == '.gz':
      inf = gzip.open(args.input)
    else:
      inf = open(args.input)

  of = sys.stdout
  if args.output:
    if args.output[-3:] == '.gz':
      of = gzip.open(args.output,'w')
    else:
      of = open(args.output,'w')

  gs = GPDStream(inf)
  ls = LocusStream(gs)
  p = None
  if args.threads > 1:
    p = Pool(processes=args.threads)
  results = []
  for locus_rng in ls:
    # Progress: one line per locus when single threaded, an overwriting
    # status line otherwise.
    if args.threads == 1:
      sys.stderr.write(locus_rng.get_range_string()+"\n")
    else:
      sys.stderr.write(locus_rng.get_range_string()+"                 \r")
    gpds = locus_rng.get_payload()
    if p is not None:
      results.append(p.apply_async(do_multi_round_locus,args=(gpds,args,)))
    else:
      results.append(MiniQueue(do_multi_round_locus(gpds,args)))
  if p is not None:
    p.close()
    p.join()
  # The stream has been fully consumed; release the file if we opened one.
  if inf is not sys.stdin:
    inf.close()

  for result in results:
    for v in result.get():
      tx = v['tx']
      if not tx.validate():
        sys.stderr.write("ERROR: invalid gpd entry\n")
        sys.stderr.write(tx.get_fake_gpd_line()+"\n")
        # A bare sys.exit() would report success (status 0) to the caller.
        sys.exit(1)
      fake_gpd = tx.get_fake_gpd_line()
      if args.gene_names:
        f = fake_gpd.rstrip().split("\t")
        f[0] = tx.get_gene_name()
        fake_gpd = "\t".join(f)
      of.write(fake_gpd+"\n")
  of.close()
  # Temporary working directory step 3 of 3 - Cleanup
  if not args.specific_tempdir:
    rmtree(args.tempdir)
예제 #4
0
def main(args):
    """Write the length and exon count of every GPD entry.

    ``args.input`` is opened with gzip when its name ends in .gz, otherwise
    as a plain file.  One line per entry, ``<length>TAB<exon count>``, is
    written to ``args.output`` (STDOUT when unset, gzip when the name ends
    in .gz).  Both handles are closed before returning, also on error.
    """
    if re.search(r'\.gz$', args.input):
        inf = gzip.open(args.input)
    else:
        inf = open(args.input)
    of = sys.stdout
    if args.output:
        if re.search(r'\.gz$', args.output):
            of = gzip.open(args.output, 'w')
        else:
            of = open(args.output, 'w')

    try:
        for gpd in GPDStream(inf):
            of.write(
                str(gpd.get_length()) + "\t" + str(gpd.get_exon_count()) +
                "\n")
    finally:
        of.close()
        inf.close()
예제 #5
0
def main():
    """Walk a GPD file and a sorted BAM file locus by locus and print counts.

    Command line: ``input`` (GPD file, '-' for STDIN) and ``sorted_bam``.
    For every locus produced by MultiLocusStream the locus object, the
    number of GPD entries and the number of BAM entries are printed to
    STDOUT, one per line.
    """
    parser = argparse.ArgumentParser(
        description=
        "Intersect a bam with a gpd file to give bam coverage of each gpd entry",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('input', help="Use - for STDIN")
    parser.add_argument('sorted_bam', help="sorted bam file")
    args = parser.parse_args()

    if args.input == '-':
        inf = sys.stdin
    else:
        inf = open(args.input)
    try:
        bs = SamtoolsBAMStream(args.sorted_bam)
        gs = GPDStream(inf)
        for ml in MultiLocusStream([gs, bs]):
            gpds, bams = ml.get_payload()
            # print(x) with a single argument behaves the same as the
            # Python 2 statement form and is also valid Python 3.
            print(ml)
            print(len(gpds))
            print(len(bams))
    finally:
        # Only close what this function opened.
        if inf is not sys.stdin:
            inf.close()
예제 #6
0
def main():
    """Report, for each GPD entry, its mean coverage by all entries of its locus.

    Command line: ``input`` (GPD file, '-' for STDIN) and optional
    ``-o/--output`` (STDOUT when unset, gzip when the name ends in .gz).
    For every locus the exon ranges of all entries are turned into a
    coverage track with ranges_to_coverage; each entry's exons are then
    intersected with that track and the payload-weighted length is divided
    by the entry length.  Output columns: gene name, exon count, length,
    mean coverage.  Progress is shown on STDERR.
    """
    cli = argparse.ArgumentParser(
        description="", formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    cli.add_argument('input', help="Use - for STDIN")
    cli.add_argument('-o',
                     '--output',
                     help="output file or use STDOUT if not set")
    args = cli.parse_args()

    source = sys.stdin if args.input == '-' else open(args.input)
    loci = LocusStream(GPDStream(source))

    if not args.output:
        sink = sys.stdout
    elif re.search('\.gz$', args.output):
        sink = gzip.open(args.output, 'w')
    else:
        sink = open(args.output, 'w')

    for locus in loci:
        sys.stderr.write(locus.get_range_string() + "    \r")
        entries = locus.get_payload()
        # Every exon range of every entry in this locus, in entry order.
        all_exons = [exon.get_range()
                     for entry in entries
                     for exon in entry.exons]
        track = ranges_to_coverage(all_exons)
        # Use the locus-wide coverage track on each entry in turn.
        for entry in entries:
            covered = 0
            own_exons = [exon.get_range() for exon in entry.exons]
            for exon_range in own_exons:
                pieces = union_range_array(exon_range, track, payload=2)
                covered += sum([piece.get_payload() * piece.length()
                                for piece in pieces])
            columns = [
                entry.get_gene_name(),
                str(entry.get_exon_count()),
                str(entry.get_length()),
                str(float(covered) / float(entry.get_length())),
            ]
            sink.write("\t".join(columns) + "\n")
    sys.stderr.write("\n")
    sink.close()
예제 #7
0
def main(args):
  """Pair each read's annotation type with its length and exon count.

  ``args.best_gpd`` is read first; entries are numbered from 1 in file
  order and their length and exon count remembered.  ``args.best_annotation``
  is then read as tab separated lines whose first column is that 1-based
  entry number and whose fifth column is the annotation type.  For every
  annotation line ``<type>TAB<length>TAB<exon count>`` is written; entries
  that never appear in the annotation file are written afterwards with the
  type 'unannotated'.

  Inputs whose name ends in .gz are opened with gzip.  Output goes to
  ``args.output`` (STDOUT when unset, gzip when the name ends in .gz).

  Raises KeyError when an annotation line refers to an entry number that
  is not present in ``args.best_gpd``.
  """
  # Pass 1: length and exon count per 1-based entry number.
  if re.search(r'\.gz$',args.best_gpd):
    inf = gzip.open(args.best_gpd)
  else:
    inf = open(args.best_gpd)
  z = 0
  data = {}
  for gpd in GPDStream(inf):
    z += 1
    data[z] = [gpd.get_length(),gpd.get_exon_count()]
  inf.close()

  # Pass 2: annotation lines.
  if re.search(r'\.gz$',args.best_annotation):
    inf = gzip.open(args.best_annotation)
  else:
    inf = open(args.best_annotation)
  of = sys.stdout
  if args.output:
    if re.search(r'\.gz$',args.output):
      of = gzip.open(args.output,'w')
    else:
      of = open(args.output,'w')
  done_reads = set()
  for line in inf:
    f = line.rstrip().split("\t")
    read_id = int(f[0])
    match_type = f[4]
    done_reads.add(read_id)
    of.write(match_type+"\t"+str(data[read_id][0])+"\t"+str(data[read_id][1])+"\n")
  inf.close()

  # Everything that was never annotated.
  for i in range(1,z+1):
    if i in done_reads:
      continue
    of.write('unannotated'+"\t"+str(data[i][0])+"\t"+str(data[i][1])+"\n")
  of.close()
def main():
    """Bin GPD exons by GC content and report a depth histogram per bin.

    Options are obtained from do_inputs() (not shown in this block).  The
    fields read here are: output, input, reference, number_of_bins, tempdir,
    best_X_covered, minimum_sequence_length, min_non_N, fragment and
    specific_tempdir.

    Steps:
      1. Load the reference FASTA and start one ``sort | bed_to_bed_depth.py``
         pipeline per GC bin, each writing <tempdir>/<bin>.bed.gz.
      2. For every GPD entry (or every fixed-size fragment of it) compute a
         GC bin and send its exons, as BED lines, to that bin's pipeline.
      3. Optionally (best_X_covered) build a stratified coverage BED and
         intersect every bin's depth file with it.
      4. For each bin, sum the bases observed at each depth and write
         ``<bin>TAB<depth>TAB<bases>`` lines to the output.
      5. Remove the temporary directory unless specific_tempdir is set.

    Note that the output file is opened as plain text even when its name
    ends in .gz.
    """
    # Parse the command line.
    args = do_inputs()

    of = sys.stdout
    if args.output: of = open(args.output, 'w')

    inf = sys.stdin
    if args.input != '-':
        if args.input[-3:] == '.gz':
            inf = gzip.open(args.input)
        else:
            inf = open(args.input)

    sys.stderr.write("reading in fasta\n")
    # The whole reference is read into memory; the file object opened here
    # is never explicitly closed.
    f = FastaData(open(args.reference).read())
    sh = GPDStream(inf)
    # NOTE(review): gc_bins is not used again in this function.
    gc_bins = range(0, args.number_of_bins)
    # One entry per GC bin: [sort process, depth process, output file, bin].
    bin_handles = []
    for i in range(0, args.number_of_bins):
        fname = args.tempdir + '/' + str(i) + '.bed.gz'
        # The consumer is started first so that sort can write into its stdin.
        cmd2 = 'bed_to_bed_depth.py - -o ' + fname
        p2 = Popen(cmd2.split(), stdin=PIPE, close_fds=True)
        cmd1 = 'sort -k 1,1 -k2,2n -k3,3n -T ' + args.tempdir
        p1 = Popen(cmd1.split(), stdin=PIPE, stdout=p2.stdin, close_fds=True)
        bin_handles.append([p1, p2, fname, i])

    if args.best_X_covered:
        # A second, three stage pipeline that receives every BED line
        # regardless of bin:
        #   sort | bed_to_bed_depth.py - | bed_depth_to_stratified_coverage.py
        # The last stage writes <tempdir>/key and <tempdir>/combo.bed.gz.
        sys.stderr.write("work out stratified data\n")
        cmd3 = 'bed_depth_to_stratified_coverage.py --minimum_coverage 10 --output_key ' + args.tempdir + '/key' + ' -r ' + args.reference + ' - -o ' + args.tempdir + '/combo.bed.gz'
        pstrat3 = Popen(cmd3.split(), stdin=PIPE, close_fds=True)
        cmd2 = 'bed_to_bed_depth.py -'
        pstrat2 = Popen(cmd2.split(),
                        stdin=PIPE,
                        stdout=pstrat3.stdin,
                        close_fds=True)
        cmd1 = 'sort -k 1,1 -k2,2n -k3,3n -T ' + args.tempdir
        pstrat1 = Popen(cmd1.split(),
                        stdin=PIPE,
                        stdout=pstrat2.stdin,
                        close_fds=True)

    num = 0
    for gpd in sh:
        num += 1
        # Progress counter on STDERR every 1000 entries.
        if (num % 1000 == 0): sys.stderr.write(str(num) + "     \r")
        # [bed line, gc bin] pairs produced by this entry.
        results = []
        # NOTE(review): whole-transcript binning only runs when
        # minimum_sequence_length is truthy; when neither it nor fragment is
        # set, nothing is emitted for the entry.
        if args.minimum_sequence_length:
            if gpd.get_length() < args.minimum_sequence_length: continue
            seq = gpd.get_sequence(f).upper()
            seq_obj = Seq(seq)
            n_count = seq_obj.n_count()
            # Skip sequences with too few non-N bases.
            if len(seq) - n_count < args.min_non_N: continue
            gc = seq_obj.gc_content()
            gc_bin = int(args.number_of_bins * gc)
            # Fold the top edge into the last bin (presumably reached when
            # gc_content() returns 1.0 -- confirm its range).
            if gc_bin == args.number_of_bins: gc_bin -= 1
            for exon in gpd.exons:
                bed_bin = [
                    "\t".join([str(x) for x in exon.rng.get_bed_array()]),
                    gc_bin
                ]
                results.append(bed_bin)
        elif args.fragment:
            seqlen = gpd.get_length()
            if seqlen < args.fragment: continue
            # Number of whole fragments and the leftover that does not fit.
            sfrags = int(float(seqlen) / float(args.fragment))
            sremain = seqlen % args.fragment
            # With probability one half shift every fragment by the leftover
            # instead of starting at position 0.
            offset = 0
            if random.random() < 0.5: offset = sremain
            for i in range(0, sfrags):
                gsub = gpd.subset(i * args.fragment + offset,
                                  (i + 1) * args.fragment + offset)
                seq = gsub.get_sequence(f).upper()
                seq_obj = Seq(seq)
                n_count = seq_obj.n_count()
                # Skip fragments with too few non-N bases.
                if len(seq) - n_count < args.min_non_N: continue
                gc = seq_obj.gc_content()
                gc_bin = int(args.number_of_bins * gc)
                if gc_bin == args.number_of_bins: gc_bin -= 1
                for exon in gsub.exons:
                    bed_bin = [
                        "\t".join([str(x) for x in exon.rng.get_bed_array()]),
                        gc_bin
                    ]
                    results.append(bed_bin)

        # Route every BED line to its bin's sort process and, when enabled,
        # to the stratified pipeline as well.
        for val in results:
            [bed, bin] = val
            bin_handles[bin][0].stdin.write(bed + "\n")
            if args.best_X_covered:
                pstrat1.stdin.write(bed + "\n")

    sys.stderr.write("\n")
    # Finish each pipeline front to back (presumably subprocess.Popen, whose
    # communicate() closes stdin and waits for the process to exit).
    for v in bin_handles:
        v[0].communicate()
        v[1].communicate()
    if args.best_X_covered:
        pstrat1.communicate()
        pstrat2.communicate()
        pstrat3.communicate()
        # If we want stratified data we should do it here
        sys.stderr.write("read the key\n")
        # Key file: first line is read and ignored, then two integer
        # columns mapping column 1 to column 2.
        d = {}
        with open(args.tempdir + '/key') as inf:
            header = inf.readline()
            for line in inf:
                # f is rebound to a list of fields from here on; the
                # FastaData it held is not used after this point.
                f = line.rstrip().split("\t")
                d[int(f[0])] = int(f[1])
        if args.best_X_covered not in d:
            sys.stderr.write(
                "ERROR: the number of bases you specified is probably too big you didn't make the digit begin with 1 or 5 and restof the numbers be zero\n"
            )
            # NOTE(review): sys.exit() without an argument exits with
            # status 0 even though this is an error path.
            sys.exit()
        num = d[args.best_X_covered]
        # Keep the combo lines whose 4th column is at least the looked-up
        # value, writing every column except the last.
        ninf = gzip.open(args.tempdir + '/combo.bed.gz')
        nof = gzip.open(args.tempdir + '/strat.bed.gz', 'w')
        for line in ninf:
            f = line.rstrip().split("\t")
            if int(f[3]) >= num:
                nof.write("\t".join(f[:-1]) + "\n")
        nof.close()
        ninf.close()
        # Intersect every bin's depth file with the stratified regions:
        #   bedtools intersect -a <bin file> -b strat.bed.gz | gzip
        for i in range(0, len(bin_handles)):
            v = bin_handles[i]
            fname = v[2]
            fname2 = args.tempdir + '/' + str(v[3]) + '.strata.bed.gz'
            gof = open(fname2, 'w')
            cmd2 = 'gzip'
            p2 = Popen(cmd2.split(), stdout=gof, stdin=PIPE)
            cmd1 = 'bedtools intersect -a ' + fname + ' -b ' + args.tempdir + '/strat.bed.gz'
            p1 = Popen(cmd1.split(), stdout=p2.stdin)
            p1.communicate()
            p2.communicate()
            gof.close()
            # lets just replace the name of the file that the final output will read from
            bin_handles[i][2] = fname2
    # Now we have bed depths for each bin
    for v in bin_handles:
        fname = v[2]
        # depth -> total number of bases seen at that depth in this bin
        depths = {}
        bin = v[3]
        inf = gzip.open(fname)
        for line in inf:
            f = line.rstrip().split("\t")
            # Columns used: start (2nd), end (3rd) and depth (4th).
            bases = int(f[2]) - int(f[1])
            depth = int(f[3])
            if depth not in depths: depths[depth] = 0
            depths[depth] += bases
        inf.close()
        for depth in sorted(depths.keys()):
            of.write(
                str(bin) + "\t" + str(depth) + "\t" + str(depths[depth]) +
                "\n")
    of.close()
    # Temporary working directory step 3 of 3 - Cleanup
    if not args.specific_tempdir:
        rmtree(args.tempdir)
def main():
  """Report how well each GPD entry is covered by a bed depth file.

  Command line: ``gpd_input`` and ``bed_depth_input`` (either may end in
  .gz) plus optional ``-o/--output`` (STDOUT when unset, gzip when the
  name ends in .gz).  Both inputs are walked together with
  MultiLocusStream; progress is shown on STDERR.

  Output columns per GPD entry: gene name, transcript name, exon count,
  length, covered bases, covered fraction, mean depth.  Entries in a locus
  without any bed record get zeros for the last three columns; loci
  without GPD entries are skipped.
  """
  parser = argparse.ArgumentParser(description="For every gpd entry (sorted) intersect it with bed depth (sorted)",formatter_class=argparse.ArgumentDefaultsHelpFormatter)
  parser.add_argument('gpd_input',help="GPD file")
  parser.add_argument('bed_depth_input',help="sorted bed depth file")
  parser.add_argument('-o','--output',help="output file")
  args = parser.parse_args()

  if re.search(r'\.gz$',args.gpd_input):
    inf1 = gzip.open(args.gpd_input)
  else:
    inf1 = open(args.gpd_input)
  if re.search(r'\.gz$',args.bed_depth_input):
    inf2 = gzip.open(args.bed_depth_input)
  else:
    inf2 = open(args.bed_depth_input)
  gs = GPDStream(inf1)
  bs = BedStream(inf2)
  of = sys.stdout
  if args.output:
    if re.search(r'\.gz$',args.output):
      of = gzip.open(args.output,'w')
    else:
      of = open(args.output,'w')

  try:
    for ml in MultiLocusStream([gs,bs]):
      sys.stderr.write(ml.get_range_string()+"       \r")
      gpds, beds = ml.get_payload()
      if len(gpds) == 0:
        continue
      if len(beds) == 0:
        # Nothing overlaps this locus: report zero coverage.
        for gpd in gpds:
          of.write(gpd.get_gene_name()+"\t"+gpd.get_transcript_name()+"\t"+str(gpd.get_exon_count())+"\t"+str(gpd.get_length())+"\t0\t0\t0"+"\n")
        continue
      for gpd in gpds:
        # Pieces of every exon annotated with the bed payload (the depth).
        covs = []
        for ex in [x.get_range() for x in gpd.exons]:
          covs += union_range_array(ex,beds,payload=2)
        clen = sum([x.length() for x in covs if int(x.get_payload())>0])
        tot = sum([x.length()*int(x.get_payload()) for x in covs])
        length = gpd.get_length()
        of.write(gpd.get_gene_name()+"\t"+gpd.get_transcript_name()+"\t"+str(gpd.get_exon_count())+"\t"+str(length)+"\t"+str(clen)+"\t"+str(float(clen)/float(length))+"\t"+str(float(tot)/float(length))+"\n")
    sys.stderr.write("\n")
  finally:
    of.close()
    inf1.close()
    inf2.close()
예제 #10
0
def main(args):
    """Convert GPD entries into 12-column BED lines with an optional track header.

    Input comes from ``args.input`` ('-' selects STDIN, a name ending in
    .gz is opened with gzip).  Output goes to ``args.output``: STDOUT when
    unset, gzip when the name ends in .gz, a plain file otherwise.

    Other options read from ``args``:
      color             one of blue/green/orange/purple/red; anything else
                        (or unset) gives black, '0,0,0'
      noheader          suppress the ``track`` header line
      headername        track name; defaults to the input's base name with
                        whitespace replaced by '_', or 'STDIN'
      headerdescription track description; defaults to
                        '<name> GenePred Entries'
      minintron         when set, entries are rebuilt from
                        ``smooth_gaps(minintron)`` before conversion
      namefield         1 uses the gene name, anything else the entry name

    Each BED line holds: chrom, start, end, name, score (always 1000),
    strand, thick start, thick end, colour, exon count, exon sizes and
    exon starts relative to the first exon (both comma terminated lists).
    """
    of = sys.stdout
    if args.output:
        if args.output[-3:] == '.gz':
            of = gzip.open(args.output, 'w')
        else:
            # A plain output path used to be ignored, sending data to STDOUT.
            of = open(args.output, 'w')

    named_colors = {
        'blue': '67,162,202',
        'green': '49,163,84',
        'orange': '254,178,76',
        'purple': '136,86,167',
        'red': '240,59,32',
    }
    color = named_colors.get(args.color, '0,0,0')

    # set up the header if one is desired
    if not args.noheader:
        newname = 'longreads'
        m = re.search(r'([^/]+)$', args.input)
        if m:
            newname = m.group(1)
        newname = re.sub(r'[\s]+', '_', newname)
        if args.headername:
            newname = args.headername
        elif args.input == '-':
            newname = 'STDIN'
        description = newname + ' GenePred Entries'
        if args.headerdescription:
            description = args.headerdescription
        header = "track\tname=" + newname + "\t"
        header += 'description="' + description + '"' + "\t"
        header += 'itemRgb="On"'
        of.write(header + "\n")

    gpd_handle = sys.stdin
    if args.input != '-':
        if args.input[-3:] == '.gz':
            gpd_handle = gzip.open(args.input)
        else:
            gpd_handle = open(args.input)

    for gpd in GPDStream(gpd_handle):
        if args.minintron:
            gpd = GPD(gpd.smooth_gaps(args.minintron).get_gpd_line())
        exoncount = gpd.get_exon_count()
        starts = gpd.value('exonStarts')
        ends = gpd.value('exonEnds')
        first = str(starts[0])
        last = str(ends[exoncount - 1])
        if args.namefield == 1:
            name = gpd.value('gene_name')
        else:
            name = gpd.value('name')
        sizes = ''.join(
            [str(ends[i] - starts[i]) + ',' for i in range(0, exoncount)])
        offsets = ''.join(
            [str(starts[i] - starts[0]) + ',' for i in range(0, exoncount)])
        # Joining on tabs guarantees a separator after the name in both
        # namefield modes; previously the 'name' branch ran straight into
        # the score column.
        fields = [
            gpd.value('chrom'), first, last, name, '1000',
            gpd.value('strand'), first, last, color,
            str(exoncount), sizes, offsets
        ]
        of.write("\t".join(fields) + "\n")
    gpd_handle.close()
    of.close()
예제 #11
0
def make_html(args):
  global g_version
  #read in our alignment data
  mydate = time.strftime("%Y-%m-%d")
  a = {}
  with open(args.tempdir+'/data/alignment_stats.txt') as inf:
    for line in inf:
      (name,numstr)=line.rstrip().split("\t")
      a[name]=int(numstr)

  #read in our special read analysis
  special = {}
  with open(args.tempdir+'/data/special_report') as inf:
    for line in inf:
      f = line.rstrip().split("\t")
      if f[0] not in special: special[f[0]] = []
      special[f[0]].append(f[1:])

  #Only have error stats if we are using it
  e = {}
  if args.reference:
    #read in our error data
    with open(args.tempdir+'/data/error_stats.txt') as inf:
      for line in inf:
        (name,numstr)=line.rstrip().split("\t")
        e[name]=int(numstr)

  # read in our coverage data
  coverage_data = {}
  # this one will be set in annotation on section
  tx_to_gene = {}
  coverage_data['genome_total'] = 0
  with open(args.tempdir+'/data/chrlens.txt') as inf:
    for line in inf:
      f = line.rstrip().split("\t")
      coverage_data['genome_total']+=int(f[1])
  inf = gzip.open(args.tempdir+'/data/depth.sorted.bed.gz')
  coverage_data['genome_covered'] = 0
  bs = BedStream(inf)
  for rng in bs:
    f = line.rstrip().split("\t")
    coverage_data['genome_covered'] += rng.length()
  inf.close()

  # The annotation section
  if args.annotation:
    inf = open(args.tempdir+'/data/beds/exon.bed')
    coverage_data['exons_total'] = 0
    bs = BedStream(inf)
    for rng in bs:
      f = line.rstrip().split("\t")
      coverage_data['exons_total'] += rng.length()
    inf.close()
    inf = open(args.tempdir+'/data/beds/intron.bed')
    coverage_data['introns_total'] = 0
    bs = BedStream(inf)
    for rng in bs:
      f = line.rstrip().split("\t")
      coverage_data['introns_total'] += rng.length()
    inf.close()
    inf = open(args.tempdir+'/data/beds/intergenic.bed')
    coverage_data['intergenic_total'] = 0
    bs = BedStream(inf)
    for rng in bs:
      f = line.rstrip().split("\t")
      coverage_data['intergenic_total'] += rng.length()
    inf.close()
    inf = gzip.open(args.tempdir+'/data/exondepth.bed.gz')
    coverage_data['exons_covered'] = 0
    bs = BedStream(inf)
    for rng in bs:
      f = line.rstrip().split("\t")
      coverage_data['exons_covered'] += rng.length()
    inf.close()
    inf = gzip.open(args.tempdir+'/data/introndepth.bed.gz')
    coverage_data['introns_covered'] = 0
    bs = BedStream(inf)
    for rng in bs:
      f = line.rstrip().split("\t")
      coverage_data['introns_covered'] += rng.length()
    inf.close()
    inf = gzip.open(args.tempdir+'/data/intergenicdepth.bed.gz')
    coverage_data['intergenic_covered'] = 0
    bs = BedStream(inf)
    for rng in bs:
      f = line.rstrip().split("\t")
      coverage_data['intergenic_covered'] += rng.length()
    inf.close()

    # deal with annotations
    ref_genes = {}
    ref_transcripts = {}
    with open(args.annotation) as inf:
      gs = GPDStream(inf)  
      for gpd in gs:
        tx_to_gene[gpd.get_transcript_name()] = gpd.get_gene_name()
        ref_genes[gpd.get_gene_name()] = [0,0]
        ref_transcripts[gpd.get_transcript_name()] = [0,0]
    inf = gzip.open(args.tempdir+'/data/annotbest.txt.gz')
    for line in inf:
      f = line.rstrip().split("\t")
      gene = f[2]
      tx = f[3]
      if f[4]=='partial': ref_genes[gene][0] += 1
      elif f[4]=='full': ref_genes[gene][1] += 1
      if f[4]=='partial': ref_transcripts[tx][0] += 1
      elif f[4]=='full': ref_transcripts[tx][1] += 1
    inf.close()

  #get our locus count
  if args.do_loci:
    inf = gzip.open(args.tempdir+'/data/loci.bed.gz')
    locuscount = 0
    for line in inf:
      locuscount += 1
    inf.close()

  #get our annotation counts
  if args.annotation:
    genefull = 0
    geneany = 0
    txfull = 0
    txany = 0
    inf = gzip.open(args.tempdir+'/data/annotbest.txt.gz')
    genes_f = {}
    genes_a = {}
    txs_f = {}
    txs_a = {}
    for line in inf:
      f = line.rstrip().split("\t")
      g = f[2]
      t = f[3]
      if g not in genes_a: genes_a[g] = 0
      genes_a[g]+=1
      if t not in txs_a: txs_a[t] = 0
      txs_a[t]+=1
      if f[4] == 'full':
        if g not in genes_f: genes_f[g] = 0
        genes_f[g]+=1
        if t not in txs_f: txs_f[t] = 0
        txs_f[t]+=1
    inf.close()
    genefull = len(genes_f.keys())
    geneany = len(genes_a.keys())
    txfull = len(txs_f.keys())
    txany = len(txs_a.keys())
    # still in args.annotation required
    #Get evidence counts for bias
    bias_tx_count = None
    bias_read_count = None
    with open(args.tempdir+'/data/bias_counts.txt') as inf:
      for line in inf:
        f = line.rstrip().split("\t")
        bias_tx_count = int(f[0])
        bias_read_count = int(f[1])

  #make our css directory
  if not os.path.exists(args.tempdir+'/css'):
    os.makedirs(args.tempdir+'/css')
  udir = os.path.dirname(os.path.realpath(__file__))
  #copy css into that directory
  copy(udir+'/../data/mystyle.css',args.tempdir+'/css/mystyle.css')
  of = open(args.tempdir+'/report.xhtml','w')
  ostr = '''
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
<link rel="stylesheet" type="text/css" href="css/mystyle.css" />
<title>Long Read Alignment and Error Report</title>
</head>
<body>
'''
  of.write(ostr)
  #########################################
  # 1.  TOP BLOCK
  ostr = '''
<div class="result_block">
  <div class="top_block">
    <div>
    Generated on:
    </div>
    <div class="input_value">
'''
  of.write(ostr)
  of.write(mydate)
  ostr = '''
    </div>
  </div>
  <div class="top_block">
    <div>
    Version:
    </div>
    <div class="input_value">'''
  of.write(ostr)
  of.write(str(g_version))
  ostr = '''
    </div>
  </div>
  <div class="top_block">
    <div>Execution parmeters:</div>
    <div class="input_value">
    <a id="params.txt" href="data/params.txt">params.txt</a>
    </div>
  </div>
  <div class="top_block">
    <div>Long read alignment and error report for:</div>
    <div class="input_value" id="filename">'''
  of.write(ostr+"\n")
  of.write(args.input)
  ostr = '''
    </div>  
  </div>
  <div class="clear"></div>
  <div class="top_block">
    <div>
    Reference Genome:
    </div>
    <div class="input_value">'''
  of.write(ostr)
  #if args.reference:
  of.write(str(args.reference))
  #else:
  #  of.write('&#xA0;'*20)
  ostr = '''
    </div>
  </div>
  <div class="top_block">
    <div>
    Reference Annotation:
    </div>
    <div class="input_value">'''
  of.write(ostr)
  #if args.reference:
  of.write(str(args.annotation))
  #else:
  #  of.write('&#xA0;'*20)
  ostr = '''
    </div>
  </div>
</div>
<div class="clear"></div>
<hr />
'''
  of.write(ostr)
  ##################################
  # 2. ALIGNMENT ANALYSIS

  ## This block should be in every output.  Generated from the BAM
  ostr = '''
<div class="result_block">
  <div class="subject_title">
    <table><tr><td class="c1">Alignment analysis</td><td class="c2"><span class="highlight">'''
  of.write(ostr)
  reads_aligned = perc(a['ALIGNED_READS'],a['TOTAL_READS'],1)
  of.write(reads_aligned)
  ostr = '''
  </span></td><td class="c2"><span class="highlight2">reads aligned</span></td><td class="c2"><span class="highlight">'''
  of.write(ostr)
  bases_aligned = perc(a['ALIGNED_BASES'],a['TOTAL_BASES'],1)
  of.write(bases_aligned)
  ostr = '''
  </span></td><td class="c2"><span class="highlight2">bases aligned <i>(of aligned reads)</i></span></td></tr></table>
  </div>
  <div class="one_third left">
    <table class="data_table">
        <tr class="rhead"><td colspan="3">Read Stats</td></tr>'''
  of.write(ostr+"\n")
  total_read_string = '<tr><td>Total reads</td><td>'+str(addcommas(a['TOTAL_READS']))+'</td><td></td></tr>'
  of.write(total_read_string+"\n")
  unaligned_read_string = '<tr><td>- Unaligned reads</td><td>'+str(addcommas(a['UNALIGNED_READS']))+'</td><td>'+perc(a['UNALIGNED_READS'],a['TOTAL_READS'],1)+'</td></tr>'
  of.write(unaligned_read_string+"\n")
  aligned_read_string = '<tr><td>- Aligned reads</td><td>'+str(addcommas(a['ALIGNED_READS']))+'</td><td>'+perc(a['ALIGNED_READS'],a['TOTAL_READS'],1)+'</td></tr>'
  of.write(aligned_read_string+"\n")
  single_align_read_string = '<tr><td>--- Single-align reads</td><td>'+str(addcommas(a['SINGLE_ALIGN_READS']))+'</td><td>'+perc(a['SINGLE_ALIGN_READS'],a['TOTAL_READS'],1)+'</td></tr>'
  of.write(single_align_read_string+"\n")
  gapped_align_read_string = '<tr><td>--- Gapped-align reads</td><td>'+str(addcommas(a['GAPPED_ALIGN_READS']))+'</td><td>'+perc(a['GAPPED_ALIGN_READS'],a['TOTAL_READS'],2)+'</td></tr>'
  of.write(gapped_align_read_string+"\n")
  gapped_align_read_string = '<tr><td>--- Chimeric reads</td><td>'+str(addcommas(a['CHIMERA_ALIGN_READS']))+'</td><td>'+perc(a['CHIMERA_ALIGN_READS'],a['TOTAL_READS'],2)+'</td></tr>'
  of.write(gapped_align_read_string+"\n")
  gapped_align_read_string = '<tr><td>----- Trans-chimeric reads</td><td>'+str(addcommas(a['TRANSCHIMERA_ALIGN_READS']))+'</td><td>'+perc(a['TRANSCHIMERA_ALIGN_READS'],a['TOTAL_READS'],2)+'</td></tr>'
  of.write(gapped_align_read_string+"\n")
  gapped_align_read_string = '<tr><td>----- Self-chimeric reads</td><td>'+str(addcommas(a['SELFCHIMERA_ALIGN_READS']))+'</td><td>'+perc(a['SELFCHIMERA_ALIGN_READS'],a['TOTAL_READS'],2)+'</td></tr>'
  of.write(gapped_align_read_string+"\n")
  ostr='''
        <tr class="rhead"><td colspan="3">Base Stats <i>(of aligned reads)</i></td></tr>'''
  of.write(ostr+"\n")
  total_bases_string = '<tr><td>Total bases</td><td>'+str(addcommas(a['TOTAL_BASES']))+'</td><td></td></tr>'
  of.write(total_bases_string+"\n")
  unaligned_bases_string = '<tr><td>- Unaligned bases</td><td>'+str(addcommas(a['UNALIGNED_BASES']))+'</td><td>'+perc(a['UNALIGNED_BASES'],a['TOTAL_BASES'],1)+'</td></tr>'
  of.write(unaligned_bases_string+"\n")
  aligned_bases_string = '<tr><td>- Aligned bases</td><td>'+str(addcommas(a['ALIGNED_BASES']))+'</td><td>'+perc(a['ALIGNED_BASES'],a['TOTAL_BASES'],1)+'</td></tr>'
  of.write(aligned_bases_string+"\n")
  single_align_bases_string = '<tr><td>--- Single-aligned bases</td><td>'+str(addcommas(a['SINGLE_ALIGN_BASES']))+'</td><td>'+perc(a['SINGLE_ALIGN_BASES'],a['TOTAL_BASES'],1)+'</td></tr>'
  of.write(single_align_bases_string+"\n")
  gapped_align_bases_string = '<tr><td>--- Other-aligned bases</td><td>'+str(addcommas(a['GAPPED_ALIGN_BASES']))+'</td><td>'+perc(a['GAPPED_ALIGN_BASES'],a['TOTAL_BASES'],2)+'</td></tr>'
  of.write(gapped_align_bases_string+"\n")
  ostr = '''
    </table>
    <table class="right">
          <tr><td>Unaligned</td><td><div id="unaligned_leg" class="legend_square"></div></td></tr>
          <tr><td>Trans-chimeric alignment</td><td><div id="chimeric_leg" class="legend_square"></div></td></tr>
          <tr><td>Self-chimeric alignment</td><td><div id="selfchimeric_leg" class="legend_square"></div></td></tr>
          <tr><td>Gapped alignment</td><td><div id="gapped_leg" class="legend_square"></div></td></tr>
          <tr><td>Single alignment</td><td><div id="single_leg" class="legend_square"></div></td></tr>
    </table>
  </div>
  <div class="two_thirds right">
    <div class="rhead">Summary [<a download="alignments.pdf" href="plots/alignments.pdf">pdf</a>]</div>
    <img src="plots/alignments.png" alt="alignments_png" />
  </div>   
  <div class="clear"></div>
  <div class="two_thirds right">
    <div class="rhead">Exon counts of best alignments [<a download="exon_size_distro.pdf" href="plots/exon_size_distro.pdf">pdf</a>]</div>
    <img src="plots/exon_size_distro.png" alt="exon_size_distro_png" />
  </div>
'''
  of.write(ostr)
  if len(special['GN']) > 1:
    ostr = '''
  <div class="one_half left">
    <table class="one_half data_table">
      <tr class="rhead"><td colspan="5">Long read name information</td></tr>
      <tr><td>Type</td><td>Sub-type</td><td>Reads</td><td>Aligned</td><td>Fraction</td></tr>
'''
    of.write(ostr)
    for f in [x for x in special['GN'] if x[0] != 'Unclassified' or int(x[2])>0]:
      of.write('      <tr><td>'+f[0]+'</td><td>'+f[1]+'</td><td>'+addcommas(int(f[2]))+'</td><td>'+addcommas(int(f[3]))+'</td><td>'+perc(int(f[3]),int(f[2]),2)+'</td></tr>'+"\n")
    ostr = '''
    </table>
'''
    of.write(ostr)
    if 'PB' in special:
      # We have pacbio specific report
      pb = {}
      for f in special['PB']: 
        pb[f[0]]=f[1]
        if re.search('\.',f[1]): pb[f[0]]=float(f[1])
      ostr = '''
      <div class="rhead">PacBio SMRT Cells [<a download="pacbio.pdf" href="/plots/pacbio.pdf">pdf</a>]</div>
      <img src="plots/pacbio.png" alt="pacbio_png" />
      <table class="horizontal_legend right">
        <tr><td>Aligned</td><td><div class="legend_square pacbio_aligned_leg"></div></td><td>Unaligned</td><td><div class="legend_square pacbio_unaligned_leg"></div></td></tr>
      </table>
      <table class="data_table one_half">
        <tr class="rhead"><td colspan="4">PacBio Stats</td></tr>
'''
      of.write(ostr)
      of.write('      <tr><td>Total Cell Count</td><td colspan="3">'+addcommas(int(pb['Cell Count']))+'</td></tr>')
      of.write('      <tr><td>Total Molecule Count</td><td colspan="3">'+addcommas(int(pb['Molecule Count']))+'</td></tr>')
      of.write('      <tr><td>Total Molecules Aligned</td><td colspan="3">'+addcommas(int(pb['Aligned Molecule Count']))+' ('+perc(pb['Aligned Molecule Count'],pb['Molecule Count'],2)+')</td></tr>')
      of.write('      <tr class="rhead"><td>Per-cell Feature</td><td>Min</td><td>Avg</td><td>Max</td></tr>')
      of.write('      <tr><td>Reads</td><td>'+addcommas(int(pb['Min Reads Per Cell']))+'</td><td>'+addcommas(int(pb['Avg Reads Per Cell']))+'</td><td>'+addcommas(int(pb['Max Reads Per Cell']))+'</td></tr>')
      of.write('      <tr><td>Molecules</td><td>'+addcommas(int(pb['Min Molecules Per Cell']))+'</td><td>'+addcommas(int(pb['Avg Molecules Per Cell']))+'</td><td>'+addcommas(int(pb['Max Molecules Per Cell']))+'</td></tr>')
      of.write('      <tr><td>Aligned Molecules</td><td>'+addcommas(int(pb['Min Aligned Molecules Per Cell']))+'</td><td>'+addcommas(int(pb['Avg Aligned Molecules Per Cell']))+'</td><td>'+addcommas(int(pb['Max Aligned Molecules Per Cell']))+'</td></tr>')
      ostr = '''        
      </table>
'''
      of.write(ostr)
    ostr = '''
  </div>
'''
    of.write(ostr)
  ostr = '''
</div>
<div class="clear"></div>
<hr />
'''
  of.write(ostr)
  ###################################
  # 3. ANNOTATION ANALYSIS

  ### This block should only be done when we have annotations
  if args.annotation:
    ostr = '''
<div class="result_block">
  <div class="subject_title">Annotation Analysis</div>
  <div class="one_half left">
    <div class="rhead">Distribution of reads among genomic features [<a download="read_genomic_features.pdf" href="plots/read_genomic_features.pdf">pdf</a>]</div>
    <img src="plots/read_genomic_features.png" alt="read_genomic_features_png" />
    <table class="one_half right horizontal_legend">
      <tr>
      <td>Exons</td><td><div class="exon_leg legend_square"></div></td><td></td>
      <td>Introns</td><td><div class="intron_leg legend_square"></div></td><td></td>
      <td>Intergenic</td><td><div class="intergenic_leg legend_square"></div></td><td></td>
      </tr>
    </table>
  </div>
  <div class="one_half right">
    <div class="rhead">Distribution of annotated reads [<a download="annot_lengths.pdf" href="plots/annot_lengths.pdf">pdf</a>]</div>
    <img src="plots/annot_lengths.png" alt="annot_lengths_png" />
    <table class="one_half right horizontal_legend">
      <tr>
      <td>Partial annotation</td><td><div class="partial_leg legend_square"></div></td><td></td>
      <td>Full-length</td><td><div class="full_leg legend_square"></div></td><td></td>
      <td>Unannotated</td><td><div class="unannotated_leg legend_square"></div></td><td></td>
      </tr>
    </table>
  </div>
  <div class="clear"></div>
  <div class="one_half right">
    <div class="rhead">Distribution of identified reference transcripts [<a download="transcript_distro.pdf" href="plots/transcript_distro.pdf">pdf</a>]</div>
    <img src="plots/transcript_distro.png" alt="transcript_distro_png" />
    <table class="one_half right horizontal_legend">
      <tr>
      <td>Partial annotation</td><td><div class="partial_leg legend_square"></div></td><td></td>
      <td>Full-length</td><td><div class="full_leg legend_square"></div></td><td></td>
      </tr>
    </table>
  </div>
  <div class="one_half left">
    <table class="data_table one_half">
      <tr class="rhead"><td colspan="5">Annotation Counts</td></tr>
      <tr><td>Feature</td><td>Evidence</td><td>Reference</td><td>Detected</td><td>Percent</td></tr>
'''
    of.write(ostr)
    cnt = len([x for x in ref_genes.keys() if sum(ref_genes[x])>0])
    of.write('      <tr><td>Genes</td><td>Any match</td><td>'+addcommas(len(ref_genes.keys()))+'</td><td>'+addcommas(cnt)+'</td><td>'+perc(cnt,len(ref_genes.keys()),2)+'</td></tr>'+"\n")
    cnt = len([x for x in ref_genes.keys() if ref_genes[x][1]>0])
    of.write('      <tr><td>Genes</td><td>Full-length</td><td>'+addcommas(len(ref_genes.keys()))+'</td><td>'+addcommas(cnt)+'</td><td>'+perc(cnt,len(ref_genes.keys()),2)+'</td></tr>'+"\n")
    cnt = len([x for x in ref_transcripts.keys() if sum(ref_transcripts[x])>0])
    of.write('      <tr><td>Transcripts</td><td>Any match</td><td>'+addcommas(len(ref_transcripts.keys()))+'</td><td>'+addcommas(cnt)+'</td><td>'+perc(cnt,len(ref_transcripts.keys()),2)+'</td></tr>'+"\n")
    cnt = len([x for x in ref_transcripts.keys() if ref_transcripts[x][1]>0])
    of.write('      <tr><td>Transcripts</td><td>Full-length</td><td>'+addcommas(len(ref_transcripts.keys()))+'</td><td>'+addcommas(cnt)+'</td><td>'+perc(cnt,len(ref_transcripts.keys()),2)+'</td></tr>'+"\n")
    ostr = '''
    </table>
    <table class="data_table one_half">
      <tr class="rhead"><td colspan="4">Top Genes</td></tr>
      <tr><td>Gene</td><td>Partial</td><td>Full-length</td><td>Total Reads</td></tr>
'''
    of.write(ostr)
    # get our top genes
    vs = reversed(sorted(ref_genes.keys(),key=lambda x: sum(ref_genes[x]))[-5:])
    for v in vs:
      of.write('      <tr><td>'+v+'</td><td>'+addcommas(ref_genes[v][0])+'</td><td>'+addcommas(ref_genes[v][1])+'</td><td>'+addcommas(sum(ref_genes[v]))+'</td></tr>'+"\n")
    ostr='''
    </table>
    <table class="data_table one_half">
      <tr class="rhead"><td colspan="5">Top Transcripts</td></tr>
      <tr><td>Transcript</td><td>Gene</td><td>Partial</td><td>Full-length</td><td>Total Reads</td></tr>
'''
    of.write(ostr)
    vs = reversed(sorted(ref_transcripts.keys(),key=lambda x: sum(ref_transcripts[x]))[-5:])
    for v in vs:
      of.write('      <tr><td>'+v+'</td><td>'+tx_to_gene[v]+'</td><td>'+addcommas(ref_transcripts[v][0])+'</td><td>'+addcommas(ref_transcripts[v][1])+'</td><td>'+addcommas(sum(ref_transcripts[v]))+'</td></tr>'+"\n")  
    ostr = '''
    </table>
  </div>
  <div class="clear"></div>
</div>
<hr />
'''
    of.write(ostr) # still in conditional for if we have annotation

  ##################################
  # 4. COVERAGE ANALYSIS

  ### For Coverage we can do part of it without annotations
  ostr = '''
<div class="subject_title">Coverage analysis &#xA0;&#xA0;&#xA0;&#xA0;<span class="highlight">'''
  of.write(ostr+"\n")
  of.write(perc(coverage_data['genome_covered'],coverage_data['genome_total'],2)+"\n")
  ostr = '''
  </span> <span class="highlight2">reference sequences covered</span>
</div>
<div class="result_block">
  <div class="one_half left">
    <div class="rhead">Coverage of reference sequences [<a download="covgraph.pdf" href="plots/covgraph.pdf">pdf</a>]</div>
    <img src="plots/covgraph.png" alt="covgraph_png" />
  </div>
  <div class="one_half left">
    <div class="rhead">Coverage distribution [<a download="perchrdepth.pdf" href="plots/perchrdepth.pdf">pdf</a>]</div>
    <img src="plots/perchrdepth.png" alt="perchrdepth_png" />
  </div>
  <div class="clear"></div>
'''
  of.write(ostr)
  ### The next part of coverage requires annotations
  if args.annotation:
    ostr = '''
  <div class="one_half left">
    <table class="data_table one_half">
      <tr class="rhead"><td colspan="4">Coverage statistics</td></tr>
      <tr><td>Feature</td><td>Feature (bp)</td><td>Coverage (bp)</td><td>Fraction</td></tr>
'''
    # still in annotation conditional
    of.write(ostr)
    of.write('    <tr><td>Genome</td><td>'+addcommas(coverage_data['genome_total'])+'</td><td>'+addcommas(coverage_data['genome_covered'])+'</td><td>'+perc(coverage_data['genome_covered'],coverage_data['genome_total'],2)+'</td></tr>')
    of.write('    <tr><td>Exons</td><td>'+addcommas(coverage_data['exons_total'])+'</td><td>'+addcommas(coverage_data['exons_covered'])+'</td><td>'+perc(coverage_data['exons_covered'],coverage_data['exons_total'],2)+'</td></tr>')
    of.write('    <tr><td>Introns</td><td>'+addcommas(coverage_data['introns_total'])+'</td><td>'+addcommas(coverage_data['introns_covered'])+'</td><td>'+perc(coverage_data['introns_covered'],coverage_data['introns_total'],2)+'</td></tr>')
    of.write('    <tr><td>Intergenic</td><td>'+addcommas(coverage_data['intergenic_total'])+'</td><td>'+addcommas(coverage_data['intergenic_covered'])+'</td><td>'+perc(coverage_data['intergenic_covered'],coverage_data['intergenic_total'],2)+'</td></tr>')
    ostr = '''
    </table>
  </div>
  <div class="one_half right">
    <div class="rhead">Annotated features coverage [<a download="feature_depth.pdf" href="plots/feature_depth.pdf">pdf</a>]</div>
    <img src="plots/feature_depth.png" alt="feature_depth_png" />
    <table class="one_third right">
      <tr><td>Genome</td><td><div class="legend_square genome_cov_leg"></div></td>
          <td>Exons</td><td><div class="legend_square exon_cov_leg"></div></td>
          <td>Introns</td><td><div class="legend_square intron_cov_leg"></div></td>
          <td>Intergenic</td><td><div class="legend_square intergenic_cov_leg"></div></td></tr>
    </table>
  </div>
  <div class="one_half left">
    <div class="rhead">Bias in alignment to reference transcripts [<a download="bias.pdf" href="plots/bias.pdf">pdf</a>]</div>
    <table>
  '''
    # still in conditional for annotation requirement
    of.write(ostr)
    of.write('<tr><td colspan="2">Evidence from:</td></tr>')
    of.write('<tr><td>Total Transcripts</td><td>'+str(addcommas(bias_tx_count))+'</td></tr>'+"\n")
    of.write('<tr><td>Total reads</td><td>'+str(addcommas(bias_read_count))+'</td></tr>'+"\n")
    ostr='''
    </table>
    <img src="plots/bias.png" alt="bias_png" />
  </div>
  <div class="clear"></div>
'''
    # still in annotations check
    of.write(ostr)  
  # done with annotations check
  ostr = '''
</div>
<hr />
'''
  of.write(ostr)
  #############################################
  # 5. RAREFRACTION ANALYSIS

  ### Rarefraction analysis block requires do_loci or annotations
  if args.do_loci or args.annotation:
    ostr = '''
<div class="subject_title"><table><tr><td class="c1">Rarefraction analysis</td>
'''
    of.write(ostr)
    if args.annotation:
      ostr = '''
<td class="c2"><span class="highlight">
'''
      # still in do_loci or annotations conditional
      of.write(ostr)
      of.write(str(addcommas(geneany))+"\n")
      ostr = '''
  </span></td><td class="c3"><span class="highlight2">Genes detected</span></td><td class="c4"><span class="highlight">
'''
      # still in do_loci or annotations conditional
      of.write(ostr)
      of.write(str(addcommas(genefull))+"\n")
      ostr = '''
  </span></td><td class="c5"><span class="highlight2">Full-length genes</span></td>
'''
      # still in do_loci or annotations conditional
      of.write(ostr)
    ostr = '''
  </tr></table>
</div>
<div class="result_block">
  <div class="one_half left">
'''
    of.write(ostr)
    if args.annotation:
      ostr = '''
    <div class="rhead">Gene detection rarefraction [<a download="gene_rarefraction.pdf" href="plots/gene_rarefraction.pdf">pdf</a>]</div>
    <img src="plots/gene_rarefraction.png" alt="gene_rarefraction_png" />
  </div>
  <div class="one_half left">
    <div class="rhead">Transcript detection rarefraction [<a download="transcript_rarefraction" href="plots/transcript_rarefraction.pdf">pdf</a>]</div>
    <img src="plots/transcript_rarefraction.png" alt="transcript_rarefraction_png" />
  </div>
  <div class="clear"></div>
'''
      # still in args.annotation
      of.write(ostr)
    #done with args.anotation 
    ostr = '''
  <div class="one_half left">
    <table class="data_table one_third">
      <tr><td class="rhead" colspan="3">Rarefraction stats</td></tr>
      <tr class="bold"><td>Feature</td><td>Criteria</td><td>Count</td></tr>
'''
    # still in do_loci or annotations conditional
    of.write(ostr+"\n")
    if args.annotation:
      of.write('<tr><td>Gene</td><td>full-length</td><td>'+str(addcommas(genefull))+'</td></tr>')
      of.write('<tr><td>Gene</td><td>any match</td><td>'+str(addcommas(geneany))+'</td></tr>')
      of.write('<tr><td>Transcript</td><td>full-length</td><td>'+str(addcommas(txfull))+'</td></tr>')
      of.write('<tr><td>Transcript</td><td>any match</td><td>'+str(addcommas(txany))+'</td></tr>')
    if args.do_loci:  of.write('<tr><td>Locus</td><td></td><td>'+str(addcommas(locuscount))+'</td></tr>')
    ostr='''
    </table>
    <table id="rarefraction_legend">
      <tr><td>Any match</td><td><div class="rareany_leg legend_square"></div></td></tr>
      <tr><td>full-length</td><td><div class="rarefull_leg legend_square"></div></td></tr>
      <tr><td class="about" colspan="2">vertical line height indicates 5%-95% CI of sampling</td></tr>
    </table>
  </div>
'''
    # still in do_loci or annotations conditional
    of.write(ostr)
    if args.do_loci: 
      ostr = '''
  <div class="one_half left">
    <div class="rhead">Locus detection rarefraction [<a download="locus_rarefraction.pdf" href="plots/locus_rarefraction.pdf">pdf</a>]</div>
    <img src="plots/locus_rarefraction.png" alt="locus_rarefraction_png" />
  </div>
'''
      # in do_loci condition
      of.write(ostr)
    # still in do_loci or annotations conditional
    ostr = '''
</div>
<div class="clear"></div>
<hr />
'''
    # still in do_loci or annotations conditional
    of.write(ostr)
  # Finished do_loci or annotations conditional

  ###################################
  # 6. ERROR PATTERN

  # We need a reference in order to do error pattern analysis
  if args.reference:
    ostr = '''
<div class="subject_title">Error pattern analysis &#xA0;&#xA0;&#xA0;&#xA0;<span class="highlight">
'''
    #if args.reference
    of.write(ostr+"\n")
    error_rate = perc(e['ANY_ERROR'],e['ALIGNMENT_BASES'],3)
    of.write(error_rate)
    ostr='''
  </span> <span class="highlight2">error rate</span></div>
<div class="subject_subtitle">&#xA0; &#xA0; &#xA0; based on aligned segments</div>
<div class="result_block">
  <div class="full_length right">
    <div class="rhead">Error rates, given a target sequence [<a download="context_plot.pdf" href="plots/context_plot.pdf">pdf</a>]</div>
    <img src="plots/context_plot.png" alt="context_plot_png" />
  </div>
  <div class="clear"></div>
  <table class="data_table one_third left">
      <tr class="rhead"><td colspan="3">Alignment stats</td></tr>
'''
    # if args.reference
    of.write(ostr+"\n")
    best_alignments_sampled_string = '<tr><td>Best alignments sampled</td><td>'+str(e['ALIGNMENT_COUNT'])+'</td><td></td></tr>'
    of.write(best_alignments_sampled_string+"\n")
    ostr = '''
      <tr class="rhead"><td colspan="3">Base stats</td></tr>
'''
    # if args.reference
    of.write(ostr+"\n")
    bases_analyzed_string = '<tr><td>Bases analyzed</td><td>'+str(addcommas(e['ALIGNMENT_BASES']))+'</td><td></td></tr>'
    of.write(bases_analyzed_string+"\n")
    correctly_aligned_string = '<tr><td>- Correctly aligned bases</td><td>'+str(addcommas(e['ALIGNMENT_BASES']-e['ANY_ERROR']))+'</td><td>'+perc((e['ALIGNMENT_BASES']-e['ANY_ERROR']),e['ALIGNMENT_BASES'],1)+'</td></tr>'
    of.write(correctly_aligned_string+"\n")
    total_error_string = '<tr><td>- Total error bases</td><td>'+str(addcommas(e['ANY_ERROR']))+'</td><td>'+perc(e['ANY_ERROR'],e['ALIGNMENT_BASES'],3)+'</td></tr>'
    of.write(total_error_string+"\n")
    mismatched_string = '<tr><td>--- Mismatched bases</td><td>'+str(addcommas(e['MISMATCHES']))+'</td><td>'+perc(e['MISMATCHES'],e['ALIGNMENT_BASES'],3)+'</td></tr>'
    of.write(mismatched_string+"\n")
    deletion_string = '<tr><td>--- Deletion bases</td><td>'+str(addcommas(e['ANY_DELETION']))+'</td><td>'+perc(e['ANY_DELETION'],e['ALIGNMENT_BASES'],3)+'</td></tr>'
    of.write(deletion_string+"\n")
    complete_deletion_string = '<tr><td>----- Complete deletion bases</td><td>'+str(addcommas(e['COMPLETE_DELETION']))+'</td><td>'+perc(e['COMPLETE_DELETION'],e['ALIGNMENT_BASES'],3)+'</td></tr>'
    of.write(complete_deletion_string+"\n")
    homopolymer_deletion_string = '<tr><td>----- Homopolymer deletion bases</td><td>'+str(addcommas(e['HOMOPOLYMER_DELETION']))+'</td><td>'+perc(e['HOMOPOLYMER_DELETION'],e['ALIGNMENT_BASES'],3)+'</td></tr>'
    of.write(homopolymer_deletion_string+"\n")
    insertion_string = '<tr><td>--- Insertion bases</td><td>'+str(addcommas(e['ANY_INSERTION']))+'</td><td>'+perc(e['ANY_INSERTION'],e['ALIGNMENT_BASES'],3)+'</td></tr>'
    of.write(insertion_string+"\n")
    complete_insertion_string = '<tr><td>----- Complete insertion bases</td><td>'+str(addcommas(e['COMPLETE_INSERTION']))+'</td><td>'+perc(e['COMPLETE_INSERTION'],e['ALIGNMENT_BASES'],3)+'</td></tr>'
    of.write(complete_insertion_string+"\n")
    homopolymer_insertion_string = '<tr><td>----- Homopolymer insertion bases</td><td>'+str(addcommas(e['HOMOPOLYMER_INSERTION']))+'</td><td>'+perc(e['HOMOPOLYMER_INSERTION'],e['ALIGNMENT_BASES'],3)+'</td></tr>'
    of.write(homopolymer_insertion_string+"\n")
    ostr = '''
  </table>
  <div class="one_half left">
    <div class="rhead">Alignment-based error rates [<a download="alignment_error_plot.pdf" href="plots/alignment_error_plot.pdf">pdf</a>]</div>
    <img class="square_image" src="plots/alignment_error_plot.png" alt="alignment_error_plot_png" />
  </div>
</div>
<div class="clear"></div>
<hr />
'''
    #if args.reference
    of.write(ostr)
  # finished with args.reference condition
 
  ##############################
  # 8. Raw data block
  ostr = '''
<div id="bed_data">
<table class="header_table">
  <tr><td class="rhead" colspan="2">Browser-ready Bed data</td></tr>
  <tr>
    <td>Best Alignments:</td>
    <td class="raw_files"><a download="best.sorted.bed.gz" href="data/best.sorted.bed.gz">best.sorted.bed.gz</a></td>
  </tr>
  <tr>
    <td>Gapped Alignments:</td>
    <td class="raw_files"><a download="gapped.bed.gz" href="data/gapped.bed.gz">gapped.bed.gz</a></td>
  </tr>
  <tr>
    <td>Trans-chimeric Alignments:</td>
    <td class="raw_files"><a download="chimera.bed.gz" href="data/chimera.bed.gz">chimera.bed.gz</a></td>
  </tr>
  <tr>
    <td>Self-chimeric Alignments:</td>
    <td class="raw_files"><a download="technical_chimeras.bed.gz" href="data/technical_chimeras.bed.gz">technical_chimeras.bed.gz</a></td>
  </tr>
  <tr>
    <td>Other-chimeric Alignments:</td>
    <td class="raw_files"><a download="techinical_atypical_chimeras.bed.gz" href="data/technical_atypical_chimeras.bed.gz">techinical_atypical_chimeras.bed.gz</a></td>
  </tr>
</table>
</div>
<div id="raw_data">
<table class="header_table">
  <tr><td class="rhead" colspan="2">Raw data</td></tr>
  <tr>
    <td>Alignments stats raw report:</td>
    <td class="raw_files"><a id="alignment_stats.txt" href="data/alignment_stats.txt">alignment_stats.txt</a></td>
  </tr>
  <tr>
    <td>Read lengths:</td>
    <td class="raw_files"><a download="lengths.txt.gz" href="data/lengths.txt.gz">lengths.txt.gz</a></td>
  </tr>
  <tr>
    <td>Reference sequence lengths:</td>
    <td class="raw_files"><a id="chrlens.txt" href="data/chrlens.txt">chrlens.txt</a></td>
  </tr>
  <tr>
    <td>Coverage bed:</td>
    <td class="raw_files"><a download="depth.sorted.bed.gz" href="data/depth.sorted.bed.gz">depth.sorted.bed.gz</a></td>
  </tr>
'''
  of.write(ostr)
  if args.do_loci:
    of.write('<tr> <td>Loci basics bed:</td><td class="raw_files"><a download="loci.bed.gz" href="data/loci.bed.gz">loci.bed.gz</a></td></tr>'+"\n")
    of.write('<tr><td>Locus read data bed:</td><td class="raw_files"><a download="loci-all.bed.gz" href="data/loci-all.bed.gz">loci-all.bed.gz</a></td></tr>'+"\n")
    of.write('<tr><td>Locus rarefraction:</td><td class="raw_files"><a download="locus_rarefraction.txt" href="data/locus_rarefraction.txt">locus_rarefraction.txt</a></td></tr>'+"\n")
  if args.annotation:
    ostr = '''
  <tr>
    <td>Read annotations:</td>
    <td class="raw_files"><a download="annotbest.txt.gz" href="data/annotbest.txt.gz">annotbest.txt.gz</a></td>
  </tr>
  <tr>
    <td>Read genomic features:</td>
    <td class="raw_files"><a download="read_genomic_features.txt.gz" href="data/read_genomic_features.txt.gz">read_genomic_features.txt.gz</a></td>
  </tr>
  <tr>
    <td>Annotation status and read lengths:</td>
    <td class="raw_files"><a download="annot_lengths.txt.gz" href="data/annot_lengths.txt.gz">annot_lengths.txt.gz</a></td>
  </tr>
  <tr>
    <td>Gene any match rarefraction:</td>
    <td class="raw_files"><a download="gene_rarefraction.txt" href="data/gene_rarefraction.txt">gene_rarefraction.txt</a></td>
  </tr>
  <tr>
    <td>Gene full-length rarefraction:</td>
    <td class="raw_files"><a download="gene_full_rarefraction.txt" href="data/gene_full_rarefraction.txt">gene_full_rarefraction.txt</a></td>
  </tr>
  <tr>
    <td>Transcript any match rarefraction:</td>
    <td class="raw_files"><a download="transcript_rarefraction.txt" href="data/transcript_rarefraction.txt">transcript_rarefraction.txt</a></td>
  </tr>
  <tr>
    <td>Transcript full-length rarefraction:</td>
    <td class="raw_files"><a download="transcript_full_rarefraction.txt" href="data/transcript_full_rarefraction.txt">transcript_full_rarefraction.txt</a></td>
  </tr>
  <tr>
    <td>Bias table:</td>
    <td class="raw_files"><a download="bias_table.txt.gz" href="data/bias_table.txt.gz">bias_table.txt.gz</a></td>
  </tr>
'''
    # if args.annotation
    of.write(ostr)
  # done with args.annotation
  #output data that depends on reference
  if args.reference: 
    ostr = '''
  <tr>
    <td>Alignment errors data:</td>
    <td class="raw_files"><a download="error_data.txt" href="data/error_data.txt">error_data.txt</a></td>
  </tr>
  <tr>
    <td>Alignment error report:</td>
    <td class="raw_files"><a download="error_stats.txt" href="data/error_stats.txt">error_stats.txt</a></td>
  </tr>
  <tr>
    <td>Contextual errors data:</td>
    <td class="raw_files"><a download="context_error_data.txt" href="data/context_error_data.txt">context_error_data.txt</a></td>
  </tr>
'''
    # if args.reference
    of.write(ostr)
  # back to any condition
  ostr = '''
</table>
</div>
</body>
</html>
  '''
  of.write(ostr)
예제 #12
0
def _iter_gz_lines(paths):
    """Yield every line of each gzip file in *paths*, in the order given.

    A file is opened only when iteration reaches it and is closed as soon
    as it is exhausted (or when the consumer abandons the generator).
    """
    for path in paths:
        handle = gzip.open(path)
        try:
            for line in handle:
                yield line
        finally:
            handle.close()


def _iter_renamed_loci(path):
    """Yield GPD lines from the gzip file *path* with field 0 renamed per locus.

    Entries are grouped with LocusStream(GPDStream(...)); every entry in the
    n-th locus gets the name 'LOC<n>|<entries in locus>|<locus range string>'.
    """
    handle = gzip.open(path)
    try:
        locus_number = 0
        for rng in LocusStream(GPDStream(handle)):
            locus_number += 1
            rng_string = rng.get_range_string()
            gpds = rng.get_payload()
            # The name is identical for every entry of the locus, build it once.
            gene_name = 'LOC' + str(locus_number) + '|' + str(
                len(gpds)) + '|' + rng_string
            for gpd in gpds:
                fields = gpd.get_gpd_line().rstrip().split("\t")
                fields[0] = gene_name
                yield "\t".join(fields) + "\n"
    finally:
        # The original never closed this handle.
        handle.close()


def _sort_and_gzip(lines, out_path):
    """Pipe *lines* through ``sort -k3,3 -k5,5n -k6,6n | gzip`` into *out_path*.

    *lines* may be any iterable of newline-terminated strings; it is not
    consumed until both child processes have been started.
    """
    out_handle = open(out_path, 'w')
    try:
        gzip_proc = Popen(['gzip'], stdout=out_handle, stdin=PIPE)
        sort_proc = Popen(['sort', '-k3,3', '-k5,5n', '-k6,6n'],
                          stdout=gzip_proc.stdin, stdin=PIPE)
        for line in lines:
            sort_proc.stdin.write(line)
        # communicate() closes each stdin and waits; sort must finish
        # before gzip can see end-of-input.
        sort_proc.communicate()
        gzip_proc.communicate()
    finally:
        out_handle.close()


def _run_gpd_to_nr(in_path, out_path, args, gene_names):
    """Run gpd_to_nr on *in_path*, writing the result to *out_path*.

    Junction tolerance, thread count and the two support thresholds are
    taken from *args*; ``--gene_names`` is passed only when *gene_names*
    is true.
    """
    cmd = 'gpd_to_nr.py ' + in_path + ' ' + \
          ' -j ' + str(args.junction_tolerance) + \
          ' --threads ' + str(args.threads) + \
          ' --minimum_junction_end_support ' + \
          str(args.minimum_junction_end_support) + \
          ' --minimum_support ' + str(args.minimum_support)
    if gene_names:
        cmd += ' --gene_names '
    cmd += ' -o ' + out_path
    gpd_to_nr.external_cmd(cmd)


def _annotate_and_classify(gpd_path, reference_path, annot_path,
                           classify_path):
    """Annotate *gpd_path* against *reference_path*, then classify the result.

    The annotation goes to *annot_path* and the classification to
    *classify_path*.  The annotation step is always run with one thread,
    as in the original code.  The classify command is echoed to stderr.
    """
    # Program name corrected from 'gpd_anntotate.py' to match the
    # gpd_annotate module that is actually invoked.
    cmd = 'gpd_annotate.py ' + gpd_path + ' ' + \
          ' --threads ' + str(1) + ' ' + \
          ' -r ' + reference_path + \
          ' -o ' + annot_path
    gpd_annotate.external_cmd(cmd)
    cmd = 'classify_reads.py ' + annot_path + ' ' + gpd_path + \
          ' -o ' + classify_path
    sys.stderr.write(cmd + "\n")
    classify_reads.external_cmd(cmd)


def _write_unblacklisted(classify_path, gpd_path, out_path):
    """Copy *gpd_path* to *out_path* (both gzip), dropping blacklisted entries.

    A name is blacklisted when its row in *classify_path* has 'subset' or
    'full' in the third column; GPD lines are dropped when their second
    column is a blacklisted name.
    """
    blacklist = set()
    classify_handle = gzip.open(classify_path)
    try:
        for line in classify_handle:
            fields = line.rstrip().split("\t")
            if fields[2] in ('subset', 'full'):
                blacklist.add(fields[0])
    finally:
        classify_handle.close()

    out_handle = gzip.open(out_path, 'w')
    try:
        gpd_handle = gzip.open(gpd_path)
        try:
            for line in gpd_handle:
                if line.rstrip().split("\t")[1] in blacklist:
                    continue
                out_handle.write(line)
        finally:
            gpd_handle.close()
    finally:
        out_handle.close()


def main():
    """Build non-redundant novel-isoform and novel-locus GPD sets.

    Reads its options through do_inputs(), works inside args.tempdir, and
    finally copies two files into args.output:
    'novel_loci_nr_named.sorted.gpd.gz' and 'novel_isoforms_nr.sorted.gpd.gz'.
    The temporary directory is removed at the end unless
    args.specific_tempdir is set.
    """
    args = do_inputs()

    def tmp(name):
        # Path of a working file inside the temporary directory.
        return args.tempdir + '/' + name

    # Classify the input reads, then split them with get_novel_sets
    # (presumably into novel-isoform and novel-locus reads -- see that
    # function for the exact criteria).
    classify_reads.external_cmd('classify_reads.py ' + args.input_annot + ' ' +
                                args.input_gpd + ' -o ' +
                                tmp('classify.txt.gz'))
    get_novel_sets(tmp('classify.txt.gz'), args.input_gpd,
                   tmp('novel_isoform_reads.gpd.gz'),
                   tmp('novel_locus_reads.gpd.gz'), args)

    # First non-redundant set from the novel isoform reads.
    sys.stderr.write("making NR novel isoforms\n")
    _run_gpd_to_nr(tmp('novel_isoform_reads.gpd.gz'),
                   tmp('novel_isoforms_nr.gpd.gz'), args, True)

    # Re-annotate the novel-locus reads against those new isoforms and
    # split them a second time.
    sys.stderr.write("reannotating novel based on our new gpd\n")
    _annotate_and_classify(tmp('novel_locus_reads.gpd.gz'),
                           tmp('novel_isoforms_nr.gpd.gz'),
                           tmp('novel_locus_reads.annot.txt.gz'),
                           tmp('classify_novel.txt.gz'))
    get_novel_sets(tmp('classify_novel.txt.gz'),
                   tmp('novel_locus_reads.gpd.gz'),
                   tmp('novel_isoform_reads2.gpd.gz'),
                   tmp('novel_locus_reads2.gpd.gz'), args)

    # Combine both rounds of novel isoform reads, sorted.
    _sort_and_gzip(_iter_gz_lines([tmp('novel_isoform_reads.gpd.gz'),
                                   tmp('novel_isoform_reads2.gpd.gz')]),
                   tmp('new_novel_isoform_reads.gpd.gz'))

    # Second non-redundant isoform set from the combined reads.
    sys.stderr.write("making NR novel isoforms\n")
    _run_gpd_to_nr(tmp('new_novel_isoform_reads.gpd.gz'),
                   tmp('novel_isoforms_nr2.gpd.gz'), args, True)

    # NOTE: re-annotating the remaining novel-locus reads against
    # novel_isoforms_nr2 is deliberately skipped; it is only needed if the
    # leftover reads are of interest.

    sys.stderr.write("now work on the novel loci\n")
    _sort_and_gzip(_iter_gz_lines([tmp('novel_locus_reads2.gpd.gz')]),
                   tmp('sorted_novel_locus_reads.gpd.gz'))

    sys.stderr.write("making NR novel loci\n")
    _run_gpd_to_nr(tmp('sorted_novel_locus_reads.gpd.gz'),
                   tmp('novel_locus_nr.gpd.gz'), args, False)

    sys.stderr.write("sort the novel isoforms\n")
    _sort_and_gzip(_iter_gz_lines([tmp('novel_isoforms_nr2.gpd.gz')]),
                   tmp('novel_isoforms_nr.sorted.gpd.gz'))

    sys.stderr.write("sort the novel loci\n")
    _sort_and_gzip(_iter_gz_lines([tmp('novel_locus_nr.gpd.gz')]),
                   tmp('novel_loci_nr.sorted.gpd.gz'))

    # Rename totally novel genes based on locus overlap.
    _sort_and_gzip(_iter_renamed_loci(tmp('novel_loci_nr.sorted.gpd.gz')),
                   tmp('novel_loci_nr_named.sorted.gpd.gz'))

    # Make sure the novel isoforms are not subsets of (or identical to)
    # entries in the reference annotation.
    sys.stderr.write("reannotating novel-isoform by reference\n")
    _annotate_and_classify(tmp('novel_isoforms_nr.sorted.gpd.gz'),
                           args.reference_annotation_gpd,
                           tmp('novel_isoforms_nr.annot.txt.gz'),
                           tmp('classify_novel_isoform_ref.txt.gz'))
    _write_unblacklisted(tmp('classify_novel_isoform_ref.txt.gz'),
                         tmp('novel_isoforms_nr.sorted.gpd.gz'),
                         tmp('novel_isoforms_nr.filtered.sorted.gpd.gz'))

    # Same screen for the named novel loci.
    sys.stderr.write("reannotating novel-locus by reference\n")
    _annotate_and_classify(tmp('novel_loci_nr_named.sorted.gpd.gz'),
                           args.reference_annotation_gpd,
                           tmp('novel_loci_nr_named.annot.txt.gz'),
                           tmp('classify_novel_loci.txt.gz'))
    _write_unblacklisted(tmp('classify_novel_loci.txt.gz'),
                         tmp('novel_loci_nr_named.sorted.gpd.gz'),
                         tmp('novel_loci_nr_named.filtered.sorted.gpd.gz'))

    if not os.path.exists(args.output):
        os.makedirs(args.output)

    copy(tmp('novel_loci_nr_named.filtered.sorted.gpd.gz'),
         args.output + '/novel_loci_nr_named.sorted.gpd.gz')
    copy(tmp('novel_isoforms_nr.filtered.sorted.gpd.gz'),
         args.output + '/novel_isoforms_nr.sorted.gpd.gz')

    # Temporary working directory step 3 of 3 - Cleanup
    if not args.specific_tempdir:
        rmtree(args.tempdir)
예제 #13
0
def main():
  """Process sorted input transcripts against a sorted reference, per locus.

  Arguments come from do_inputs().  The input (args.input or stdin) and the
  reference (args.reference) are each passed through sort_gpd() into files
  under args.tempdir, then read back together with MultiLocusStream.  For
  every locus that has at least one input entry, process_locus() is called
  with the input and reference GPD lines, either inline or through a
  multiprocessing Pool when args.threads > 1.

  Every line returned by process_locus() is written to args.output (gzip
  when the name ends in '.gz') or to stdout.  Lines whose fourth
  tab-separated field is a non-zero integer are counted and reported on
  stderr as "Unsupported Transcripts".

  The temporary directory is removed at the end unless
  args.specific_tempdir is set.
  """
  #do our inputs
  args = do_inputs()

  inf = sys.stdin
  if args.input:
    if args.input[-3:] == '.gz':
      inf = gzip.open(args.input)
    else:
      inf = open(args.input)

  # NOTE(review): this handle is opened uncompressed but the file is read
  # back with gzip.open below; presumably sort_gpd writes gzip data into
  # it -- confirm against sort_gpd.
  of = open(args.tempdir+'/input.gpd.gz','w')
  sys.stderr.write("sorting our input\n")
  input_cnt = sort_gpd(inf,of,args)
  of.close()
  inf.close()

  if args.reference[-3:] == '.gz':
    rinf = gzip.open(args.reference)
  else:
    rinf = open(args.reference)
  sys.stderr.write("sorting our reference\n")
  rof = open(args.tempdir+'/ref.gpd.gz','w')
  sort_gpd(rinf,rof,args)
  rof.close()
  # Release the reference handle as soon as sorting is done.
  rinf.close()

  # Now we can traverse the ordered files by locus
  inf_input = gzip.open(args.tempdir+'/input.gpd.gz')
  inf_ref = gzip.open(args.tempdir+'/ref.gpd.gz')

  gsi = GPDStream(inf_input)
  gsr = GPDStream(inf_ref)
  mls = MultiLocusStream([gsi,gsr])
  z = 0  # loci seen so far (drives the progress display)
  y = 0  # input entries dispatched so far
  output = []
  # A pool only exists for threads > 1; every other value runs inline so
  # the dispatch below can never reach for a pool that was not created.
  use_pool = args.threads > 1
  p = Pool(args.threads) if use_pool else None
  sys.stderr.write("processing "+str(input_cnt)+" inputs\n")
  for rng in mls:
    z+=1
    if z%10 == 0:
      # +1 in the denominator avoids dividing by zero on empty input
      perc = int(100*float(y)/float(input_cnt+1))
      sys.stderr.write(rng.get_range_string()+" "+str(y)+" inputs "+str(perc)+"%                      \r")
    (input_entries,reference_entries) = rng.get_payload()
    if len(input_entries)==0: continue
    # Lets convert these back to lines to make them easier to pass through multiprocessing
    igpds = [x.get_gpd_line() for x in input_entries]
    rgpds = [x.get_gpd_line() for x in reference_entries]
    y += len(input_entries)
    if use_pool:
      output.append(p.apply_async(process_locus,args=(igpds,rgpds,args,)))
    else:
      # MiniQueue wraps the inline result so both paths expose .get()
      output.append(MiniQueue(process_locus(igpds,rgpds,args)))
  sys.stderr.write("\n")

  # Both sorted streams are fully consumed at this point.
  inf_input.close()
  inf_ref.close()

  if use_pool:
    p.close()
    p.join()

  of = sys.stdout
  if args.output:
    if args.output[-3:] == '.gz':
      of = gzip.open(args.output,'w')
    else:
      of = open(args.output,'w')
  tn_cnt = 0
  for out in output:
    outlines = out.get()
    for line in outlines:
      f = line.rstrip().split("\t")
      if int(f[3]) != 0: tn_cnt+=1
      of.write(line+"\n")
  of.close()
  perc = '?'
  if input_cnt > 0:
    perc = int(100*float(tn_cnt)/float(input_cnt))
  sys.stderr.write("Found "+str(tn_cnt)+" "+str(perc)+"% Unsupported Transcripts\n")
  # Temporary working directory step 3 of 3 - Cleanup
  if not args.specific_tempdir:
    rmtree(args.tempdir)
예제 #14
0
def main(args):
    """Write one 100-column row per reference transcript from annotated reads.

    Steps, as visible in this function:

    1. Load the reference genePred (args.ref_genepred) and index entries by
       their 1-based position in the file.
    2. Parse the tab-separated annotation table (args.annotations, gzip
       detected via is_gzip).  Every row is grouped by its reference line
       number; rows are additionally kept per read line unless filtered out
       by args.full (type must be 'full') or args.minimum_matched_exons.
    3. Load the read genePred (args.read_genepred) and keep only the reads
       retained in step 2, grouped by the reference line they map to.
    4. For each such reference line call do_tx_line(); a falsy return is
       skipped, otherwise it is used as a pair whose first item is a dict
       keyed by the strings '1'..'100' and whose second item is added to
       the read total.
    5. Write "<transcript name>\\t<100 values>" rows to args.output (gzip
       when the name ends in '.gz') or stdout, optionally write
       "<transcripts>\\t<reads>" to args.output_counts, and report both
       totals on stderr.
    """
    sys.stderr.write("Reading in reference genePred\n")
    refgpd = {}
    inf = open(args.ref_genepred)
    # Key by 1-based line number to match the 'ref_line' annotation column.
    for z, gpd in enumerate(GPDStream(inf), 1):
        refgpd[z] = gpd
    inf.close()

    sys.stderr.write("Reading in read annotations\n")
    if is_gzip(args.annotations):
        inf = gzip.open(args.annotations)
    else:
        inf = open(args.annotations)
    reflocs = {}  # ref line number -> every annotation row for it
    rline = {}    # read line number -> annotation row that passed filters
    for line in inf:
        f = line.rstrip().split("\t")
        res = {
            'read_line': int(f[0]),
            'read_name': f[1],
            'gene_name': f[2],
            'tx_name': f[3],
            'type': f[4],
            'matching_exon_count': int(f[5]),
            'consecutive_exons': int(f[6]),
            'read_exons': int(f[7]),
            'tx_exons': int(f[8]),
            'overlap': int(f[9]),
            'read_length': int(f[10]),
            'tx_length': int(f[11]),
            'read_range': GenomicRange(range_string=f[12]),
            'tx_range': GenomicRange(range_string=f[13]),
            'ref_line': int(f[14]),
        }
        # Unfiltered rows are still recorded per reference line.
        reflocs.setdefault(res['ref_line'], []).append(res)
        if args.full and res['type'] != 'full':
            continue
        if args.minimum_matched_exons > res['matching_exon_count']:
            continue
        rline[res['read_line']] = res
    inf.close()

    sys.stderr.write("reading read genepred\n")
    if is_gzip(args.read_genepred):
        inf = gzip.open(args.read_genepred)
    else:
        inf = open(args.read_genepred)
    originals = {}  # ref line number -> {read line number: read gpd}
    for z, gpd in enumerate(GPDStream(inf), 1):
        if z not in rline:
            continue
        originals.setdefault(rline[z]['ref_line'], {})[z] = gpd
    inf.close()

    read_total = 0
    outs = {}
    for tx_line in originals:
        ref_gpd = refgpd[tx_line]
        annots = reflocs[tx_line]
        reads = originals[tx_line].values()
        v = do_tx_line(ref_gpd, annots, reads, args)
        if not v:
            continue
        tname = ref_gpd.get_transcript_name()
        row = [0 for x in range(1, 101)]
        outs[tname] = row
        read_total += v[1]
        # Bins absent from v[0] stay at zero.
        for i in range(1, 101):
            if str(i) in v[0]:
                row[i - 1] = v[0][str(i)]

    of = sys.stdout
    if args.output:
        # Anchor the suffix so only names that END in .gz are compressed.
        if re.search(r'\.gz$', args.output):
            of = gzip.open(args.output, 'w')
        else:
            of = open(args.output, 'w')
    tot = len(outs)
    for tname in outs:
        of.write(tname + "\t" + "\t".join([str(x)
                                           for x in outs[tname]]) + "\n")
    of.close()
    if args.output_counts:
        of = open(args.output_counts, 'w')
        of.write(str(tot) + "\t" + str(read_total) + "\n")
        of.close()
    sys.stderr.write(
        str(tot) + " total transcripts \t" + str(read_total) +
        " total reads\n")
예제 #15
0
def main(args):
    """Group transcripts into loci and write per-transcript coverage rows.

    Reads GPD entries from args.input ('-' means stdin, gzip when the name
    ends in '.gz'), walks them locus by locus, optionally downsamples each
    locus to args.downsample randomly chosen entries, and runs do_locus()
    on it -- inline when args.threads <= 1, otherwise through a
    multiprocessing Pool.

    Non-empty results are sorted by (chr, start, end).  For each transcript
    locus inside a result a running locus number is assigned; when
    args.output_loci is given, one summary row per transcript locus is
    written there.  One row per transcript is always written to
    args.output (or stdout).
    """

    def _range_key(item):
        # Order by chromosome, then start, then end of the item's range.
        item_range = item.get_range()
        return (item_range.chr, item_range.start, item_range.end)

    # Setup inputs
    inf = sys.stdin
    if args.input != '-':
        if re.search(r'\.gz$', args.input):
            inf = gzip.open(args.input)
        else:
            inf = open(args.input)
    of = sys.stdout
    # Setup outputs
    if args.output:
        if re.search(r'\.gz$', args.output):
            of = gzip.open(args.output, 'w')
        else:
            of = open(args.output, 'w')

    mr = TranscriptLociMergeRules('is_any_overlap')
    mr.set_use_junctions(False)
    p = None
    if args.threads > 1:
        p = Pool(processes=args.threads)
    results = []
    z = 0  # running count of entries handed to do_locus
    for locus in LocusStream(GPDStream(inf)):
        vals = locus.get_payload()
        if args.downsample and len(vals) > args.downsample:
            shuffle(vals)
            locus.set_payload(vals[0:args.downsample])
        if p is None:
            # Queue wraps the inline result so both paths expose .get()
            tls = Queue(do_locus(locus, mr, z, args, verbose=True))
        else:
            tls = p.apply_async(do_locus,
                                args=(locus, mr, z, args, False),
                                callback=process_output)
        results.append(tls)
        z += len(locus.get_payload())
    if p is not None:
        p.close()
        p.join()
    sys.stderr.write("\n")
    sys.stderr.write("Outputing results\n")

    # The loci summary file is optional; ofl stays None when not requested
    # so the write below can be skipped instead of failing on an unbound
    # name.
    ofl = None
    if args.output_loci:
        if re.search(r'\.gz$', args.output_loci):
            ofl = gzip.open(args.output_loci, 'w')
        else:
            ofl = open(args.output_loci, 'w')

    lnum = 0
    finished = [y for y in [r.get() for r in results] if y]
    for res in sorted(finished, key=lambda x: (x.chr, x.start, x.end)):
        for tl in sorted(res.get_payload(), key=_range_key):
            lnum += 1
            txs = sorted(tl.get_transcripts(), key=_range_key)
            tlrng = [str(x) for x in tl.get_range().get_bed_array()]
            if ofl is not None:
                ofl.write("\t".join(tlrng) + "\t" + str(lnum) + "\t" +
                          str(len(txs)) + "\n")
            for tx in txs:
                cov = tx.get_payload()[1]
                # z here is the final total of entries processed above.
                of.write("\t".join(tlrng) + "\t" + str(lnum) + "\t" +
                         str(len(txs)) + "\t" + str(tx.get_payload()[0]) +
                         "\t" + str(z) + "\t" + tx.get_gene_name() + "\t" +
                         str(cov['average_coverage']) + "\t" +
                         str(cov['fraction_covered']) + "\t" +
                         str(cov['mindepth']) + "\n")
    if ofl is not None:
        ofl.close()
    inf.close()
    of.close()