def main():
    """Turn IDP isoform expression estimates into a coverage file.

    Reads ``isoform.gpd`` and ``isoform.exp`` from the IDP output folder named
    on the command line. The exon ranges of every transcript listed in
    ``isoform.exp`` are repeated ``int(expression * mult + offset)`` times,
    the pooled ranges are passed to ``ranges_to_coverage``, and each returned
    range is written as a tab-separated line (chr, start - 1, end, payload)
    to ``--output`` (gzip when the name ends in ``.gz``) or to STDOUT.
    """
    parser = argparse.ArgumentParser(
        description="",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('input', help="IDP output folder")
    parser.add_argument('--offset', type=int, default=1,
                        help="add this much to all expressions")
    parser.add_argument('--mult', type=int, default=10,
                        help="multiply all expressions by this much")
    parser.add_argument('-o', '--output',
                        help="OUTPUT file or nothing for STDOUT")
    args = parser.parse_args()

    args.input = args.input.rstrip('/')

    # Transcript name -> ranges of all exons seen under that name.
    gpd_handle = open(args.input + '/isoform.gpd')
    sys.stderr.write("Reading isoform.gpd\n")
    exon_ranges_by_tx = {}
    for gpd_line in gpd_handle:
        entry = GPD(gpd_line)
        name = entry.get_transcript_name()
        exon_ranges_by_tx.setdefault(name, []).extend(
            [exon.get_range() for exon in entry.exons])
    gpd_handle.close()

    sys.stderr.write("Reading isoform.exp file\n")
    exp_handle = open(args.input + '/isoform.exp')
    pooled = []
    for exp_line in exp_handle:
        fields = exp_line.rstrip().split("\t")
        copies = int((float(fields[1]) * args.mult) + args.offset)
        tx_exons = exon_ranges_by_tx[fields[0]]
        # List repetition contributes nothing for a non-positive count,
        # exactly like looping over an empty range.
        pooled.extend(tx_exons * copies)
    exp_handle.close()

    sys.stderr.write("Generating coverage file " + str(len(pooled)) + "\n")
    if not args.output:
        of = sys.stdout
    elif args.output[-3:] == '.gz':
        of = gzip.open(args.output, 'w')
    else:
        of = open(args.output, 'w')
    for cov in ranges_to_coverage(pooled):
        columns = [cov.chr, str(cov.start - 1), str(cov.end),
                   str(cov.get_payload())]
        of.write("\t".join(columns) + "\n")
    of.close()
Esempio n. 2
0
def do_multi_round_locus(gpds,args):
    """Merge a locus with process_locus repeatedly until the count is stable.

    After the first merge, every merged entry is expanded back into GPD
    objects (as many copies as its 'evidence', capped at one more than the
    larger of the two minimum-support settings) and merged again. The loop
    stops once a round returns the same number of entries as the round
    before it. Progress is written to STDERR only when args.threads == 1.

    Returns the entries produced by the last call to process_locus.
    """
    if args.threads == 1:
        sys.stderr.write("processing "+str(len(gpds))+" gpds\n")
    merged = process_locus(gpds,args)
    if args.threads == 1:
        sys.stderr.write("merged to "+str(len(merged))+" gpds\n")
    previous_count = -1
    round_number = 1
    while previous_count != len(merged):
        round_number += 1
        previous_count = len(merged)
        expanded = []
        for entry in merged:
            copies = min(entry['evidence'],max(args.minimum_support+1,args.minimum_junction_end_support+1))
            for _ in range(0,copies):
                candidate = GPD(entry['tx'].get_fake_gpd_line())
                if not candidate.validate():
                    if args.threads == 1:
                        sys.stderr.write("WARNING: 1. failed to make valid gpd. losing candidate\n")
                    continue
                rebuilt = GPD(candidate.get_fake_gpd_line())
                # replace the gene name if we know it
                if args.gene_names:
                    rebuilt.set_gene_name(entry['tx'].get_gene_name())
                expanded.append(rebuilt)
        merged = process_locus(expanded,args)
        if args.threads == 1:
            sys.stderr.write("round "+str(round_number)+" merged to "+str(len(merged))+" gpds\n")
    return merged
Esempio n. 3
0
def do_buffer(gpd_lines, fasta, args):
    """Cut GenePred transcripts into fixed-length pieces and convert each one.

    Transcripts shorter than ``args.length`` are skipped. With
    ``args.short_reads`` a single piece of ``args.length`` bases is taken per
    transcript, starting either at 0 or at ``length % args.length`` (chosen
    with probability one half). Otherwise ``args.coverage`` passes are made;
    each pass picks a random starting position and then takes consecutive
    non-overlapping pieces until the next one would run past the end.

    Returns the list of ``get_sam(piece, fasta)`` values in generation order.
    """
    sams = []
    for raw in gpd_lines:
        transcript = GPD(raw)
        total = transcript.get_length()
        if total < args.length:
            continue
        whole_pieces = int(float(total) / float(args.length))
        leftover = total % args.length

        if args.short_reads:
            shift = leftover if random.random() < 0.5 else 0
            piece = transcript.subset(shift, args.length + shift)
            sams.append(get_sam(piece, fasta))
            continue

        # not short reads
        for _ in range(0, args.coverage):
            if whole_pieces == 0 and leftover > 0:
                first = random.choice(range(0, leftover))
            elif whole_pieces > 0:
                first = random.choice(range(0, args.length))
            else:
                first = 0
            for begin in range(first, total, args.length):
                if begin + args.length > total:
                    break
                piece = transcript.subset(begin, begin + args.length)
                sams.append(get_sam(piece, fasta))
    return sams
Esempio n. 4
0
def do_buffer(gpd_lines,fasta,args):
  results = []
  for gpd_line in gpd_lines:
    gpd = GPD(gpd_line)
    l = gpd.get_length()
    if l < args.length: continue
    num = int(float(l)/float(args.length))
    rem = l % args.length
    #print 'rem : '+str(rem)
    extra = 0
    offset = 0
    #if space > 1: # we have room to make multiple passes
    #  #print '---'
    #  #print 'length: '+str(l)
    #  #print 'strand: '+gpd.get_strand()
    #  if random.random() < 0.5: extra = rem
    #  offset = int(float(args.length)/float(args.coverage))
    #else:
    #  offset = int(float(rem)/float(args.coverage)) 

    if args.short_reads:
      offset = 0
      if random.random() < 0.5: offset = rem
      gsub = gpd.subset(offset,args.length+offset)
      #print gsub.get_gpd_line()
      val = get_sam(gsub,fasta)
      results.append(val)
      #continue
    else:# not short reads
      for i in range(0,args.coverage):
        init = 0
        if num == 0 and rem > 0:
          init = random.choice(range(0,rem))
        elif num > 0:
          init = random.choice(range(0,args.length))
        #start = (i*offset+extra) % args.length
        #while start+args.length <= l:
        for j in range(init,l,args.length):
          if j + args.length > l: break
          #print str(start)+" "+str(start+args.length)
          gsub = gpd.subset(j,j+args.length)
          val = get_sam(gsub,fasta)
          results.append(val)
          #print gsub.get_sequence(fasta)
          #start += args.length
          #print gsub.get_strand()
    #print space
    #print rem
    #print gpd
  return results
def main():
  """Command-line entry point: IDP output folder -> coverage lines.

  isoform.gpd supplies the exon ranges per transcript name; isoform.exp
  supplies an expression value per transcript. Each transcript's ranges are
  added int(expression*mult+offset) times to one list, which is converted
  with ranges_to_coverage and written as "chr, start-1, end, payload" lines
  to the -o/--output file (gzip if it ends in .gz) or STDOUT.
  """
  parser = argparse.ArgumentParser(description="",formatter_class=argparse.ArgumentDefaultsHelpFormatter)
  parser.add_argument('input',help="IDP output folder")
  parser.add_argument('--offset',type=int,default=1,help="add this much to all expressions")
  parser.add_argument('--mult',type=int,default=10,help="multiply all expressions by this much")
  parser.add_argument('-o','--output',help="OUTPUT file or nothing for STDOUT")
  args = parser.parse_args()

  args.input = args.input.rstrip('/')
  gpd_file = open(args.input+'/isoform.gpd')
  sys.stderr.write("Reading isoform.gpd\n")
  ranges_for = {}
  for record in gpd_file:
    parsed = GPD(record)
    key = parsed.get_transcript_name()
    collected = ranges_for.setdefault(key,[])
    for exon in parsed.exons:
      collected.append(exon.get_range())
  gpd_file.close()

  sys.stderr.write("Reading isoform.exp file\n")
  exp_file = open(args.input+'/isoform.exp')
  all_ranges = []
  for record in exp_file:
    columns = record.rstrip().split("\t")
    repeat = int((float(columns[1])*args.mult)+args.offset)
    tx_ranges = ranges_for[columns[0]]
    for _ in range(0,repeat):
      all_ranges.extend(tx_ranges)
  exp_file.close()

  sys.stderr.write("Generating coverage file "+str(len(all_ranges))+"\n")
  of = sys.stdout
  if args.output:
    opener = gzip.open if args.output[-3:]=='.gz' else open
    of = opener(args.output,'w')
  for block in ranges_to_coverage(all_ranges):
    of.write("\t".join([block.chr,str(block.start-1),str(block.end),str(block.get_payload())])+"\n")
  of.close()
def main(args):
    """Annotate GenePred input lines against a reference GenePred file.

    The reference is loaded into the module-level ``txome`` dict, keyed by
    chromosome, where each value is a list of ranges whose payload is the
    reference GPD entry (itself carrying its 1-based line number as payload).
    ``annotate_line`` is then mapped over ``generate_tx(inf, args)`` with a
    process pool, and every non-empty result is written to ``args.output``
    (gzip when the name ends in ``.gz``) or STDOUT.
    """
    global txome

    def _open_by_suffix(path, *mode):
        # gzip.open for names ending in .gz, the builtin open otherwise.
        if re.search('\.gz$', path):
            return gzip.open(path, *mode)
        return open(path, *mode)

    of = _open_by_suffix(args.output, 'w') if args.output else sys.stdout

    # read the reference gpd
    txome = {}
    rinf = _open_by_suffix(args.reference)
    sys.stderr.write("Reading in reference\n")
    z = 0
    # populate txome with reference transcripts for each chromosome
    for z, line in enumerate(rinf, 1):
        gpd = GPD(line)
        gpd.set_payload(z)
        if z % 100 == 0:
            sys.stderr.write(str(z) + "          \r")
        bucket = txome.setdefault(gpd.value('chrom'), [])
        rng = gpd.get_range()
        rng.set_payload(gpd)
        bucket.append(rng)
    rinf.close()
    sys.stderr.write(str(z) + "          \r")
    sys.stderr.write("\n")

    inf = sys.stdin if args.input == '-' else _open_by_suffix(args.input)

    sys.stderr.write("annotating\n")
    pool = Pool(processes=args.threads)
    annotated = pool.imap(func=annotate_line,
                          iterable=generate_tx(inf, args),
                          chunksize=100)
    for text in annotated:
        if text:
            of.write(text)
    of.close()
 def __init__(self, gpd_file=None, ref_fasta=None):
     """Optionally load transcripts from a GenePred file.

     gpd_file  -- path to a GenePred file; every line becomes one GPD entry
                  appended to self.transcripts.
     ref_fasta -- when given, get_sequence(ref_fasta) is called on every
                  loaded transcript.
     """
     self.transcripts = []
     if gpd_file:
         from Bio.Format.GPD import GPD
         with open(gpd_file) as handle:
             self.transcripts.extend([GPD(row) for row in handle])
     if ref_fasta:
         for transcript in self.transcripts:
             transcript.get_sequence(ref_fasta)
Esempio n. 8
0
def do_buffer(buffer, txome, args):
    """Annotate a batch of (line, index) pairs and format the matches.

    Each item of ``buffer`` holds a GenePred line at position 0 and its
    index at position 1. Items for which ``annotate_line`` returns a falsy
    value are dropped. Every remaining item becomes one tab-separated,
    newline-terminated row built from the read and the matched reference
    entry (position 9 of the annotation).

    Returns the list of formatted rows.
    """
    rows = []
    for item in buffer:
        read_index = item[1]
        read = GPD(item[0])
        best = annotate_line(read, txome, args)
        if not best:
            continue
        match_type = 'full' if best[0] else 'partial'
        reference = best[9]
        columns = [
            str(read_index),
            read.get_gene_name(),
            reference.get_gene_name(),
            reference.get_transcript_name(),
            match_type,
            str(best[2]),  # exon count
            str(best[3]),  # most consecutive exons
            str(best[4]),  # read exon count
            str(best[5]),  # reference transcript exon count
            str(best[6]),  # overlap size
            str(best[7]),  # read length
            str(best[8]),  # reference transcript length
            read.get_range().get_range_string(),
            reference.get_range().get_range_string(),
            str(reference.get_payload()),
        ]
        rows.append("\t".join(columns) + "\n")
    return rows
Esempio n. 9
0
def main(args):
  """Serialize a transcriptome plus its weighting scheme to a single line.

  Reads the reference FASTA and the reference GenePred file, attaches the
  genome to every GPD entry and adds it to a Transcriptome. The output is a
  dict with keys 'txome' (the serialized transcriptome), 'weight_type'
  ('expression_table', 'exponential_distribution' or the default
  'uniform_distribution') and 'weights' (transcript name -> float, filled
  only from args.expression_table). The dict is pickled, zlib-compressed,
  base64-encoded and written to args.output or STDOUT, followed by a newline.

  Finally the temporary directory args.tempdir is removed unless
  args.specific_tempdir is set.
  """
  sys.stderr.write("Reading reference fasta\n")
  # Context manager so the FASTA handle is closed instead of leaked.
  with open(args.reference_fasta,'rb') as fasta_handle:
    ref_genome = FastaData(fasta_handle.read())
  sys.stderr.write("Reading in transcriptome\n")
  output = {}
  txome = Transcriptome()
  z = 0
  with open(args.reference_gpd) as inf:
    for line in inf:
      z+=1
      if z%1000==0:  sys.stderr.write(str(z)+"       \r")
      gpd = GPD(line)
      gpd.set_sequence(ref_genome)
      txome.add_transcript(gpd)
  sys.stderr.write("\n")
  sys.stderr.write("Serializing transcriptome\n")
  output['txome'] = txome.dump_serialized()
  txweights = {}
  weight_type = 'uniform_distribution' #default
  if args.expression_table:
    weight_type = 'expression_table'
    if args.expression_table[-3:]=='.gz':
      inf = gzip.open(args.expression_table)
    else:
      inf = open(args.expression_table)
    # The table handle was previously never closed; close it even on error.
    try:
      for line in inf:
        f = line.rstrip().split("\t")
        txweights[f[0]] = float(f[1])
    finally:
      inf.close()
  elif args.exponential_distribution: weight_type = 'exponential_distribution'
  output['weight_type'] = weight_type
  output['weights'] = txweights #only matters for expression based
  of = sys.stdout
  if args.output: of = open(args.output,'w')
  of.write(base64.b64encode(zlib.compress(pickle.dumps(output)))+"\n")
  of.close()


  # Temporary working directory step 3 of 3 - Cleanup
  if not args.specific_tempdir:
    rmtree(args.tempdir)
Esempio n. 10
0
def main(args):
  """Annotate reads in GenePred format against reference transcripts.

  Builds the global txome (chromosome -> list of reference ranges, each with
  its GPD entry as payload), then runs annotate_line over the items yielded
  by generate_tx(inf,args) in a multiprocessing pool and writes every
  non-empty result to the chosen output.
  """
  global txome

  if not args.output:
    of = sys.stdout
  elif re.search('\.gz$',args.output):
    of = gzip.open(args.output,'w')
  else:
    of = open(args.output,'w')

  #read the reference gpd
  txome = {}
  reference_opener = gzip.open if re.search('\.gz$',args.reference) else open
  rinf = reference_opener(args.reference)
  sys.stderr.write("Reading in reference\n")
  count = 0
  # populate txome with reference transcripts for each chromosome
  for entry_line in rinf:
    count += 1
    entry = GPD(entry_line)
    entry.set_payload(count)
    if count%100 == 0:  sys.stderr.write(str(count)+"          \r")
    chrom = entry.value('chrom')
    if chrom not in txome: txome[chrom] = []
    span = entry.get_range()
    span.set_payload(entry)
    txome[chrom].append(span)
  rinf.close()
  sys.stderr.write(str(count)+"          \r")
  sys.stderr.write("\n")

  if args.input == '-':
    inf = sys.stdin
  elif re.search('\.gz$',args.input):
    inf = gzip.open(args.input)
  else:
    inf = open(args.input)

  sys.stderr.write("annotating\n")
  workers = Pool(processes=args.threads)
  chunk = 100
  for outcome in workers.imap(func=annotate_line,iterable=generate_tx(inf,args),chunksize=chunk):
    if outcome:
      of.write(outcome)
  of.close()
Esempio n. 11
0
def process_locus(igpds,rgpds,args):
  """Report, per input transcript, junction pairs the reference lacks.

  igpds and rgpds are GenePred lines for the input and the reference.
  All reference junctions returned by get_consecutive_junctions are pooled,
  and each input entry's junctions are checked with junction_match.

  Returns one tab-separated string per input entry: gene name, transcript
  name, number of junctions, number of unsupported pairs, and the
  unsupported pairs joined by ';' with the two members separated by '~~'.
  """
  input_entries = [GPD(x) for x in igpds]
  reference_entries = [GPD(x) for x in rgpds]
  input_junctions = get_consecutive_junctions(input_entries,args)
  reference_junctions = get_consecutive_junctions(reference_entries,args)
  # consolidate reference junctions
  pooled_reference = []
  for _entry, entry_junctions in reference_junctions:
    pooled_reference.extend(entry_junctions)
  report = []
  # one gpd at a time
  for entry, junctions in input_junctions:
    missing = junction_match(junctions,pooled_reference,args)
    pair_text = ";".join([pair[0].get_string()+"~~"+pair[1].get_string() for pair in missing])
    report.append("\t".join([
      entry.get_gene_name(),
      entry.get_transcript_name(),
      str(len(junctions)),
      str(len(missing)),
      pair_text,
    ]))
  return report
Esempio n. 12
0
def main():
  """Write a normalized depth profile for each sufficiently covered transcript.

  Steps, as implemented below:
    1. Load the reference GenePred, grouped by gene name.
    2. Optionally drop genes with more than args.maximum_isoforms isoforms,
       then drop transcripts shorter than args.minimum_length.
    3. Write the remaining exons as a sorted BED file in args.tempdir and
       run `bedtools intersect -wo` with args.bed_depth on STDIN.
    4. For every transcript that passes the average-depth and covered-length
       thresholds, bin its per-position depth into int(100*i/length) bins,
       average each bin, divide by the largest bin, reverse the profile for
       '-' strand transcripts, and write one tab-separated line to
       args.output.

  The temporary directory is removed at the end unless
  args.specific_tempdir is set.
  """
  #do our inputs
  args = do_inputs()

  sys.stderr.write("Reading reference genepred\n")
  ref = {}
  tx_strand = {}
  z = 0
  with open(args.reference_genepred) as inf:
    for line in inf:
      gpd = GPD(line)
      gname = gpd.get_gene_name()
      tname = gpd.get_transcript_name()
      tx_strand[tname] = gpd.get_strand()
      if gname not in ref: ref[gname] = []
      ref[gname].append(gpd)
      z += 1
  sys.stderr.write("Read "+str(len(ref.keys()))+" genes and "+str(z)+" transcripts\n")

  if args.maximum_isoforms > 0:
    sys.stderr.write("Removing genes with more than "+str(args.maximum_isoforms)+" isoforms.\n")
    # Iterate over a snapshot of the keys: deleting from a dict while
    # iterating its live key view raises RuntimeError on Python 3.
    for gname in list(ref.keys()):
      if len(ref[gname]) > args.maximum_isoforms: del ref[gname]
    sys.stderr.write("Now have "+str(len(ref.keys()))+" genes and "+str(sum([len(ref[x]) for x in ref.keys()]))+" transcripts\n")

  sys.stderr.write("Filtering by length "+str(args.minimum_length)+" bp\n")
  for gname in list(ref.keys()):  # snapshot, entries may be deleted below
    passing = [gpd for gpd in ref[gname] if not gpd.get_length() < args.minimum_length]
    if len(passing) == 0: del ref[gname]
    else: ref[gname] = passing
  sys.stderr.write("Now have "+str(len(ref.keys()))+" genes and "+str(sum([len(ref[x]) for x in ref.keys()]))+" transcripts\n")

  sys.stderr.write("Converting gpd into exon bed\n")
  beds = []
  for gname in ref.keys():
    for gpd in ref[gname]:
      tname = gpd.get_transcript_name()
      for i in range(0,len(gpd.exons)):
        ex = gpd.exons[i]
        beds.append(ex.get_range().get_bed_array()+[gname,tname,i])
  with open(args.tempdir+'/gpd.bed','w') as of:
    for bed in sorted(beds,key=lambda x: (x[0],x[1],x[2],x[3],x[4],x[5])):
      of.write("\t".join([str(x) for x in bed])+"\n")
  sys.stderr.write("intersecting with bed depth\n")
  # Build the argument list directly so a tempdir containing whitespace is
  # passed as one argument, and close the output handle when bedtools is done.
  cmd = ['bedtools','intersect','-wo','-a','-','-b',args.tempdir+'/gpd.bed']
  with open(args.tempdir+'/intersect.bed','w') as of:
    p = Popen(cmd,stdin=args.bed_depth,stdout=of)
    p.communicate()
  coverage = {}
  sys.stderr.write("Reading the intersection\n")
  with open(args.tempdir+'/intersect.bed') as inf:
    for line in inf:
        f = line.rstrip().split("\t")
        # Columns 0-3 come from the depth input (chrom, start, end, depth),
        # columns 4-6 are the exon interval, 7 and 8 the gene and transcript.
        gname = f[7]
        tname = f[8]
        depth = int(f[3])
        bed1 = Bed(f[0],int(f[1]),int(f[2]))
        bed2 = Bed(f[4],int(f[5]),int(f[6]))
        # NOTE(review): relies on Bed.union for the interval that the depth
        # applies to -- confirm its semantics against the Bed class.
        bed = bed1.union(bed2)
        bed.set_payload(depth)
        if gname not in coverage:
          coverage[gname] = {}
        if tname not in coverage[gname]:
          coverage[gname][tname] = []
        coverage[gname][tname].append(bed)
  transcript_depths = {}
  for gname in coverage:
    for tname in coverage[gname]:
      ref_gpd = [x for x in ref[gname] if x.get_transcript_name()==tname][0]
      rlen = ref_gpd.get_length()
      bases_covered = sum([x.length() for x in coverage[gname][tname]])
      bases_area = sum([x.length()*x.get_payload() for x in coverage[gname][tname]])
      avg_depth = float(bases_area)/float(rlen)
      if avg_depth < args.minimum_average_depth: continue
      if bases_covered < args.minimum_length: continue
      # Start every exonic position at depth zero, then overwrite the
      # positions that the intersection reported.
      total_positions = {}
      for ex in ref_gpd.exons:
        b = ex.get_range().get_bed_array()
        for i in range(b[1],b[2]):
          total_positions[i] = 0 # zero indexed
      for b in coverage[gname][tname]:
        depth = b.get_payload()
        barr = b.get_bed_array()
        for i in range(barr[1],barr[2]):
          total_positions[i] = depth
      transcript_depths[tname] = total_positions
  sys.stderr.write("have information needed to plot from "+str(len(transcript_depths.keys()))+" transcripts\n")
  for tname in transcript_depths:
    depths = transcript_depths[tname]
    positions = sorted(depths.keys())
    tx_len = len(positions)
    bins = {}
    for i in range(0,tx_len):
      bin_index = int(100*float(i)/float(tx_len))
      if bin_index not in bins: bins[bin_index] = []
      bins[bin_index].append(depths[positions[i]])
    for bin_index in bins:
      bins[bin_index] = average(bins[bin_index])
    biggest = float(max(bins.values()))
    tx_array = [float(bins[x])/biggest for x in sorted(bins.keys())]
    if tx_strand[tname] == '-':
      tx_array.reverse()
    args.output.write(tname+"\t"+"\t".join([str(x) for x in tx_array])+"\n")

  args.output.close()

  # Temporary working directory step 3 of 3 - Cleanup
  if not args.specific_tempdir:
    rmtree(args.tempdir)
Esempio n. 13
0
def main(args):
    """Convert GenePred entries into 12-column BED lines with a track header.

    Input comes from ``args.input`` (``-`` for STDIN, gzip when the name ends
    in ``.gz``) and is read through ``GPDStream``. Output goes to
    ``args.output`` (gzip when the name ends in ``.gz``) or STDOUT.

    Unless ``args.noheader`` is set, a ``track`` header line is written first;
    its name defaults to the input's basename (whitespace replaced by ``_``),
    ``STDIN`` for ``-``, or ``args.headername`` when given. ``args.color``
    selects one of five named RGB colours (black otherwise), ``args.minintron``
    smooths gaps before conversion, and ``args.namefield == 1`` uses the gene
    name rather than the entry name in the name column.
    """
    of = sys.stdout
    if args.output:
        if args.output[-3:] == '.gz':
            of = gzip.open(args.output, 'w')
        else:
            # Fix: a plain output path was previously ignored and the data
            # silently went to STDOUT.
            of = open(args.output, 'w')

    named_colors = {
        'blue': '67,162,202',
        'green': '49,163,84',
        'orange': '254,178,76',
        'purple': '136,86,167',
        'red': '240,59,32',
    }
    # Unknown or missing colour names fall back to black.
    color = named_colors.get(args.color, '0,0,0')

    # set up the header if one is desired
    if not args.noheader:
        newname = 'longreads'
        m = re.search(r'([^\/]+)$', args.input)
        if m:
            newname = m.group(1)
        newname = re.sub(r'[\s]+', '_', newname)
        if args.headername:
            newname = args.headername
        elif args.input == '-':
            newname = 'STDIN'
        description = newname + ' GenePred Entries'
        if args.headerdescription:
            description = args.headerdescription
        header = "track\tname=" + newname + "\t"
        header += 'description="' + description + '"' + "\t"
        header += 'itemRgb="On"'
        of.write(header + "\n")

    gpd_handle = sys.stdin
    if args.input != '-':
        if args.input[-3:] == '.gz':
            gpd_handle = gzip.open(args.input)
        else:
            gpd_handle = open(args.input)
    for gpd in GPDStream(gpd_handle):
        if args.minintron:
            gpd = GPD(gpd.smooth_gaps(args.minintron).get_gpd_line())
        exoncount = gpd.get_exon_count()
        starts = gpd.value('exonStarts')
        ends = gpd.value('exonEnds')
        if args.namefield == 1:
            name = gpd.value('gene_name')
        else:
            name = gpd.value('name')
        tx_start = str(starts[0])
        tx_end = str(ends[exoncount - 1])
        # Block sizes and block starts are comma-terminated lists.
        block_sizes = ''.join(
            [str(ends[i] - starts[i]) + ',' for i in range(0, exoncount)])
        block_starts = ''.join(
            [str(starts[i] - starts[0]) + ',' for i in range(0, exoncount)])
        # Fix: joining the columns guarantees a tab after the name; the
        # 'name' branch used to omit it, fusing the name with the score.
        columns = [
            gpd.value('chrom'),
            tx_start,
            tx_end,
            name,
            '1000',
            gpd.value('strand'),
            tx_start,
            tx_end,
            color,
            str(exoncount),
            block_sizes,
            block_starts,
        ]
        of.write("\t".join(columns) + "\n")
    gpd_handle.close()
    of.close()
Esempio n. 14
0
def main(args):
    """Annotate buffered reads chromosome by chromosome and write the rows.

    The reference GenePred file is loaded into a dict of chromosome -> list
    of ranges (payload: the reference GPD entry). All input lines are read
    and grouped by the value captured from their third tab-separated column.
    For every chromosome present in the reference, ``do_buffer`` is run
    either through a process pool (``args.threads > 1``) with ``do_out`` as
    callback, or inline followed by a direct ``do_out`` call. Once all work
    is done, the collected rows are written to the module-level ``of``.

    ``of`` is rebound here only when ``args.output`` is set; ``chrtotal`` is
    set to the number of buffered chromosomes.
    """
    global of
    global chrtotal

    if args.output:
        output_opener = gzip.open if re.search('\.gz$', args.output) else open
        of = output_opener(args.output, 'w')

    # read the reference gpd
    txome = {}
    reference_opener = gzip.open if re.search('\.gz$', args.reference) else open
    rinf = reference_opener(args.reference)
    sys.stderr.write("Reading in reference\n")
    z = 0
    for z, line in enumerate(rinf, 1):
        gpd = GPD(line)
        gpd.set_payload(z)
        if z % 100 == 0:
            sys.stderr.write(str(z) + "          \r")
        bucket = txome.setdefault(gpd.value('chrom'), [])
        rng = gpd.get_range()
        rng.set_payload(gpd)
        bucket.append(rng)
    rinf.close()
    sys.stderr.write(str(z) + "          \r")
    sys.stderr.write("\n")

    if args.input == '-':
        inf = sys.stdin
    elif re.search('\.gz$', args.input):
        inf = gzip.open(args.input)
    else:
        inf = open(args.input)

    chroms = {}
    sys.stderr.write("Buffering reads\n")
    for number, line in enumerate(inf, 1):
        chrom = re.match('[^\t]*\t[^\t]*\t([^\t]+)', line).group(1)
        if number % 100 == 0:
            sys.stderr.write(str(number) + "      \r")
        chroms.setdefault(chrom, []).append([line, number])
    sys.stderr.write("\n")
    sys.stderr.write("Finished buffering reads\n")

    if args.threads > 1:
        p = Pool(processes=args.threads)
    results = []
    chrtotal = len(chroms)
    for chrom in chroms:
        if chrom not in txome:
            continue
        reads = chroms[chrom]
        sub_txome = {chrom: txome[chrom]}
        if args.threads > 1:
            results.append(
                p.apply_async(do_buffer,
                              args=(reads, sub_txome, args),
                              callback=do_out))
        else:
            rows = do_buffer(reads, sub_txome, args)
            # NOTE(review): Queue presumably wraps the rows so that .get()
            # works the same as on an async result -- confirm its definition.
            results.append(Queue(rows))
            do_out(rows)
    if args.threads > 1:
        p.close()
        p.join()
    sys.stderr.write("\n")
    # Collect every result before writing anything.
    collected = [pending.get() for pending in results]
    for rows in collected:
        for oline in rows:
            of.write(oline)
    inf.close()
    of.close()
Esempio n. 15
0
def main(args):
  """Write GenePred entries as 12-column BED lines, optionally with a header.

  Reads GPD entries from args.input ('-' for STDIN, gzip if it ends in .gz)
  via GPDStream and writes to args.output (gzip if it ends in .gz) or STDOUT.
  A 'track' header line is written unless args.noheader is set. args.color
  picks one of five named RGB colours (black otherwise); args.minintron
  smooths gaps first; args.namefield == 1 puts the gene name, rather than the
  entry name, in the name column.
  """
  of = sys.stdout
  if args.output:
    if args.output[-3:]=='.gz':
      of = gzip.open(args.output,'w')
    else:
      # Fix: without this branch a non-gzip output path was ignored and
      # everything was written to STDOUT.
      of = open(args.output,'w')
  color = '0,0,0'

  if args.color:
    if args.color == 'blue':
      color = '67,162,202'
    elif args.color == 'green':
      color = '49,163,84'
    elif args.color == 'orange':
      color = '254,178,76'
    elif args.color == 'purple':
      color = '136,86,167'
    elif args.color == 'red':
      color = '240,59,32'

  # set up the header if one is desired
  header = ''
  if not args.noheader:
    newname = 'longreads'
    m = re.search(r'([^\/]+)$',args.input)
    if m:
      newname = m.group(1)
    newname = re.sub(r'[\s]+','_',newname)
    if args.headername:
      newname = args.headername
    elif args.input == '-':
      newname = 'STDIN'
    header += "track\tname="+newname+"\t"
    description = newname+' GenePred Entries'
    if args.headerdescription:
       description = args.headerdescription
    header += 'description="'+description + '"'+"\t"
    header += 'itemRgb="On"'
    of.write(header+"\n")

  gpd_handle = sys.stdin
  if args.input != '-':
    if args.input[-3:]=='.gz':
      gpd_handle = gzip.open(args.input)
    else:
      gpd_handle = open(args.input)
  gs = GPDStream(gpd_handle)
  for gpd in gs:
      if args.minintron:
        gpd = GPD(gpd.smooth_gaps(args.minintron).get_gpd_line())
      exoncount = gpd.get_exon_count()
      ostring  = gpd.value('chrom') + "\t"
      ostring += str(gpd.value('exonStarts')[0]) + "\t"
      ostring += str(gpd.value('exonEnds')[exoncount-1]) + "\t"
      if args.namefield == 1:
        ostring += gpd.value('gene_name') + "\t"
      else:
        # Fix: the tab was missing here, which fused the name with the
        # score column that follows.
        ostring += gpd.value('name') + "\t"
      ostring += '1000' + "\t"
      ostring += gpd.value('strand') + "\t"
      ostring += str(gpd.value('exonStarts')[0]) + "\t"
      ostring += str(gpd.value('exonEnds')[exoncount-1]) + "\t"
      ostring += color+"\t"
      ostring += str(exoncount) + "\t"
      # block sizes, comma-terminated
      for i in range(0,exoncount):
        ostring += str(gpd.value('exonEnds')[i]-gpd.value('exonStarts')[i]) + ','
      ostring += "\t"
      # block starts relative to the first exon start, comma-terminated
      for i in range(0,exoncount):
        ostring += str(gpd.value('exonStarts')[i]-gpd.value('exonStarts')[0])+','
      of.write(ostring+"\n")
  gpd_handle.close()
  of.close()
def main():
    """Build a coverage file from an annotation table and a GenePred file.

    The annotation table (``input``, ``-`` for STDIN, gzip when the name ends
    in ``.gz``) is tab-separated; column 3 is read as the gene, column 4 as
    the transcript and column 5 as the match type. Matches are counted per
    transcript (only 'full' ones with ``--full``). Each counted transcript
    with at least ``--min_exons`` exons contributes its exon ranges, taken
    from the GenePred file, once per counted match. The pooled ranges go
    through ``ranges_to_coverage`` and are written as tab-separated lines
    (chr, start - 1, end, payload) to ``--output`` or STDOUT.
    """
    parser = argparse.ArgumentParser(
        description="",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('input', help="Use - for STDIN")
    parser.add_argument('genepred', help="the genepred used for this alignqc")
    parser.add_argument('--min_exons', type=int, default=1,
                        help="At least this number of exons")
    parser.add_argument('--full', action='store_true',
                        help="only use full matches")
    parser.add_argument('-o', '--output',
                        help="OUTPUT file or nothing for STDOUT")
    args = parser.parse_args()

    if args.input == '-':
        inf = sys.stdin
    elif args.input[-3:] == '.gz':
        inf = gzip.open(args.input)
    else:
        inf = open(args.input)

    genes = {}
    sys.stderr.write("Reading annotation file\n")
    for row in inf:
        fields = row.rstrip().split("\t")
        gene, tx, match_type = fields[2], fields[3], fields[4]
        if args.full and match_type != 'full':
            continue
        record = genes.setdefault(gene, {'transcripts': {}, 'cnt': 0})
        record['cnt'] += 1
        record['transcripts'][tx] = record['transcripts'].get(tx, 0) + 1
    inf.close()

    txs = {}
    sys.stderr.write("Reading genepred file\n")
    with open(args.genepred) as gpd_file:
        for number, row in enumerate(gpd_file, 1):
            if number % 1000 == 0:
                sys.stderr.write(str(number) + "   \r")
            entry = GPD(row)
            exon_ranges = [exon.get_range() for exon in entry.exons]
            txs[entry.get_transcript_name()] = exon_ranges
    sys.stderr.write("\n")

    pooled = []
    sys.stderr.write("Traversing annotation file\n")
    for gene in genes:
        for tx, hits in genes[gene]['transcripts'].items():
            exon_ranges = txs[tx]
            if len(exon_ranges) < args.min_exons:
                continue
            # one copy of the exon ranges per counted match
            pooled.extend(exon_ranges * hits)

    sys.stderr.write("Generating coverage file " + str(len(pooled)) + "\n")
    if not args.output:
        of = sys.stdout
    elif args.output[-3:] == '.gz':
        of = gzip.open(args.output, 'w')
    else:
        of = open(args.output, 'w')
    for cov in ranges_to_coverage(pooled):
        of.write("\t".join([cov.chr, str(cov.start - 1), str(cov.end),
                            str(cov.get_payload())]) + "\n")
    of.close()
def annotate_line(inputs):
    """Pick the best-matching reference transcript for one GenePred line.

    ``inputs`` is a ``(line, z, args)`` triple; ``z`` is stored as the read's
    payload and echoed in the first output column. Reference candidates are
    the payloads of the ranges in the global ``txome`` (same chromosome) that
    overlap the read. Candidates are ranked by full overlap, subset status,
    consecutive exon count, matched exon count, and finally the smaller of
    overlap/read-length and overlap/transcript-length, all descending.

    Returns one tab-separated, newline-terminated row describing the best
    candidate, or None when nothing on the chromosome qualifies.
    """
    global txome
    (line, z, args) = inputs
    read = GPD(line)
    read.set_payload(z)
    read_range = read.get_range()
    if read_range.chr not in txome:
        return None
    overlapping = [
        ref_range.get_payload() for ref_range in txome[read_range.chr]
        if ref_range.overlaps(read_range)
    ]
    if not overlapping:
        return None

    scored = []
    for tx in overlapping:
        is_full = False
        is_subset = False
        consecutive = 1
        if tx.get_exon_count() == 1 or read.get_exon_count() == 1:
            overlap = read.exon_overlap(tx, single_minover=100, single_frac=0.5)
        else:
            overlap = read.exon_overlap(tx,
                                        multi_minover=10,
                                        multi_endfrac=0,
                                        multi_midfrac=0.8,
                                        multi_consec=False)
            if overlap.is_full_overlap():
                is_full = True
            if overlap.is_subset():
                is_subset = True
            if overlap:
                consecutive = overlap.consecutive_exon_count()
        if not overlap:
            continue
        matched = overlap.match_exon_count()
        shared = read.overlap_size(tx)
        scored.append([
            is_full, is_subset, matched, consecutive,
            read.get_exon_count(),
            tx.get_exon_count(), shared,
            read.get_length(),
            tx.get_length(), tx
        ])
    if not scored:
        return None

    def rank(entry):
        # Negated so that the smallest key is the strongest candidate.
        coverage = min(float(entry[6]) / float(entry[7]),
                       float(entry[6]) / float(entry[8]))
        return (-entry[0], -entry[1], -entry[3], -entry[2], -coverage)

    # min() returns the first of equally ranked candidates, matching a
    # stable sort followed by taking the first element.
    best = min(scored, key=rank)
    ### we have the annotation
    index = read.get_payload()
    reference = best[9]
    columns = [
        str(index),
        read.get_transcript_name(),
        reference.get_gene_name(),
        reference.get_transcript_name(),
        'full' if best[0] else 'partial',
        str(best[2]),  # matched exon count
        str(best[3]),  # most consecutive exons
        str(best[4]),  # read exon count
        str(best[5]),  # reference exon count
        str(best[6]),  # overlap size
        str(best[7]),  # read length
        str(best[8]),  # reference length
        read.get_range().get_range_string(),
        reference.get_range().get_range_string(),
        str(reference.get_payload()),
    ]
    return "\t".join(columns) + "\n"
Esempio n. 18
0
def annotate_line(inputs):
    """Annotate one read with the reference transcript that matches it best.

    Args:
        inputs: a 3-tuple ``(line, z, args)``. ``line`` is passed to ``GPD``
            to build the read, ``z`` is stored as the read's payload and
            echoed as the first output column, and ``args`` is accepted but
            not used by this function.

    Returns:
        None when the read's chromosome is absent from the module-level
        ``txome``, when no ``txome`` entry overlaps the read, or when no
        overlapping transcript passes the exon-overlap test. Otherwise a
        single tab-separated, newline-terminated string with the columns:
        payload, read name, gene name, transcript name, match type
        ('full' or 'partial'), matched exon count, consecutive exon count,
        read exon count, transcript exon count, overlap size, read length,
        transcript length, read range, transcript range, transcript payload.
    """
    global txome
    (line, z, args) = inputs
    gpd = GPD(line)
    gpd.set_payload(z)
    read_range = gpd.get_range()
    if read_range.chr not in txome:
        return None
    possible = [x.get_payload() for x in txome[read_range.chr]
                if x.overlaps(read_range)]
    if not possible:
        return None

    read_exons = gpd.get_exon_count()
    candidates = []
    for tx in possible:
        # Single-exon reads/transcripts use the looser "single" overlap
        # criteria; everything else uses the multi-exon criteria.
        single = tx.get_exon_count() == 1 or read_exons == 1
        if single:
            eo = gpd.exon_overlap(tx, single_minover=100, single_frac=0.5)
        else:
            eo = gpd.exon_overlap(tx,
                                  multi_minover=10,
                                  multi_endfrac=0,
                                  multi_midfrac=0.8,
                                  multi_consec=False)
        # Reject non-matches before touching the overlap object, so a
        # None/empty result can never be dereferenced.
        if not eo:
            continue
        full = False
        subset = False
        econsec = 1
        if not single:
            full = bool(eo.is_full_overlap())
            subset = bool(eo.is_subset())
            econsec = eo.consecutive_exon_count()
        candidates.append((full, subset,
                           eo.match_exon_count(), econsec,
                           read_exons, tx.get_exon_count(),
                           gpd.overlap_size(tx),
                           gpd.get_length(), tx.get_length(),
                           tx))
    if not candidates:
        return None

    def rank(c):
        # Preference order: full match, subset, most consecutive exons,
        # most matched exons, then the larger of the worst-case overlap
        # fractions (overlap relative to read length and transcript length).
        worst_frac = min(float(c[6]) / float(c[7]), float(c[6]) / float(c[8]))
        return (-c[0], -c[1], -c[3], -c[2], -worst_frac)

    # min() returns the first minimal element, which is the same candidate a
    # stable sort on this key would place first.
    (full, _subset, exon_count, most_consecutive_exons, read_exon_count,
     tx_exon_count, overlap_size, read_length, tx_length,
     best_tx) = min(candidates, key=rank)

    match_type = 'full' if full else 'partial'
    fields = [
        str(gpd.get_payload()),
        gpd.get_transcript_name(),
        best_tx.get_gene_name(),
        best_tx.get_transcript_name(),
        match_type,
        str(exon_count),
        str(most_consecutive_exons),
        str(read_exon_count),
        str(tx_exon_count),
        str(overlap_size),
        str(read_length),
        str(tx_length),
        gpd.get_range().get_range_string(),
        best_tx.get_range().get_range_string(),
        str(best_tx.get_payload()),
    ]
    return "\t".join(fields) + "\n"
def main():
    """Build a coverage table from a read annotation file and a genepred.

    Reads the tab-separated annotation file (``input``, or STDIN for ``-``),
    counting how many lines name each transcript (column 4) under each gene
    (column 3); with ``--full`` only lines whose column 5 equals ``full`` are
    counted. The genepred file supplies each transcript's exon ranges. Every
    transcript with at least ``--min_exons`` exons contributes its exon
    ranges once per counted line, and the combined ranges are passed to
    ``ranges_to_coverage``. Each resulting range is written as
    ``chr<TAB>start-1<TAB>end<TAB>payload`` to ``--output`` or STDOUT.

    Raises:
        IndexError: an annotation line has fewer than five columns.
        KeyError: an annotated transcript is missing from the genepred.
    """
    parser = argparse.ArgumentParser(
        description="", formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('input', help="Use - for STDIN")
    parser.add_argument('genepred', help="the genepred used for this alignqc")
    parser.add_argument('--min_exons',
                        type=int,
                        default=1,
                        help="At least this number of exons")
    parser.add_argument('--full',
                        action='store_true',
                        help="only use full matches")
    parser.add_argument('-o',
                        '--output',
                        help="OUTPUT file or nothing for STDOUT")
    args = parser.parse_args()

    # NOTE(review): under Python 3, gzip.open defaults to binary mode, so the
    # .gz branches below would yield/expect bytes rather than str -- confirm
    # which interpreter this script targets before changing the modes.
    inf = sys.stdin
    if args.input != '-':
        if args.input[-3:] == '.gz':
            inf = gzip.open(args.input)
        else:
            inf = open(args.input)

    # gene name -> {transcript name -> number of annotation lines}
    genes = {}
    sys.stderr.write("Reading annotation file\n")
    try:
        for line in inf:
            f = line.rstrip().split("\t")
            gene = f[2]
            tx = f[3]
            match_type = f[4]
            if args.full and match_type != 'full':
                continue
            tx_counts = genes.setdefault(gene, {})
            tx_counts[tx] = tx_counts.get(tx, 0) + 1
    finally:
        # Close the input even if a malformed line aborts the loop.
        inf.close()

    # transcript name -> list of exon ranges
    txs = {}
    sys.stderr.write("Reading genepred file\n")
    with open(args.genepred) as gpd_file:
        for z, line in enumerate(gpd_file, 1):
            if z % 1000 == 0:
                sys.stderr.write(str(z) + "   \r")
            gpd = GPD(line)
            txs[gpd.get_transcript_name()] = [ex.range for ex in gpd.exons]
    sys.stderr.write("\n")

    vals = []
    sys.stderr.write("Traversing annotation file\n")
    for gene in genes:
        for tx in genes[gene]:
            count = genes[gene][tx]
            exons = txs[tx]
            if len(exons) < args.min_exons:
                continue
            # One copy of the transcript's exon ranges per annotated read.
            vals.extend(exons * count)

    sys.stderr.write("Generating coverage file " + str(len(vals)) + "\n")
    of = sys.stdout
    if args.output:
        if args.output[-3:] == '.gz':
            of = gzip.open(args.output, 'w')
        else:
            of = open(args.output, 'w')
    try:
        covs = ranges_to_coverage(vals)
        for v in covs:
            of.write(v.chr + "\t" + str(v.start - 1) + "\t" + str(v.end) +
                     "\t" + str(v.get_payload()) + "\n")
    finally:
        # Flush and release the output handle even if writing fails.
        of.close()