def annotate_line(inputs):
  """Pick the best-overlapping reference transcript for one read.

  Args:
    inputs: ``(line, z, args)`` tuple. ``line`` is handed to ``GPD`` to
      build the read, ``z`` is stored as the read's payload and echoed as
      the first output column, and ``args`` is accepted but unused here.

  Returns:
    One tab-delimited, newline-terminated record describing the read and
    its best transcript, or ``None`` when the read's chromosome is not a
    key of the module-level ``txome``, nothing stored under that key
    overlaps the read's range, or no overlapping transcript yields a
    truthy exon overlap.
  """
  (line, z, args) = inputs
  gpd = GPD(line)
  gpd.set_payload(z)
  read_range = gpd.get_range()
  if read_range.chr not in txome:
    return None
  possible = [x.get_payload() for x in txome[read_range.chr]
              if x.overlaps(read_range)]
  if not possible:
    return None

  candidates = []
  for tx in possible:
    # Multi-exon rules apply only when both the read and the transcript
    # have more than one exon.
    multi = not (tx.get_exon_count() == 1 or gpd.get_exon_count() == 1)
    if multi:
      eo = gpd.exon_overlap(tx, multi_minover=10, multi_endfrac=0,
                            multi_midfrac=0.8, multi_consec=False)
    else:
      eo = gpd.exon_overlap(tx, single_minover=100, single_frac=0.5)
    # Check the overlap's truthiness before querying it any further.
    if not eo:
      continue
    full = False
    subset = False
    econsec = 1
    if multi:
      full = bool(eo.is_full_overlap())
      subset = bool(eo.is_subset())
      econsec = eo.consecutive_exon_count()
    ecnt = eo.match_exon_count()
    osize = gpd.overlap_size(tx)
    candidates.append((full, subset, ecnt, econsec,
                       gpd.get_exon_count(), tx.get_exon_count(),
                       osize, gpd.get_length(), tx.get_length(), tx))
  if not candidates:
    return None

  def rank(cand):
    # Smaller is better: full overlap, then subset, then most consecutive
    # exons, then most matched exons, then the larger of the worst-case
    # overlap fractions (overlap / read length, overlap / tx length).
    (c_full, c_subset, c_ecnt, c_econsec, _, _, c_osize, c_rlen, c_tlen, _) = cand
    return (-c_full, -c_subset, -c_econsec, -c_ecnt,
            -min(float(c_osize) / float(c_rlen),
                 float(c_osize) / float(c_tlen)))

  # min() returns the first minimal element, the same one a stable sort
  # would place first.
  best = min(candidates, key=rank)
  (full, _subset, exon_count, most_consecutive_exons, read_exon_count,
   tx_exon_count, overlap_size, read_length, tx_length, best_tx) = best
  match_type = 'full' if full else 'partial'
  fields = [
    str(gpd.get_payload()),
    gpd.get_transcript_name(),
    best_tx.get_gene_name(),
    best_tx.get_transcript_name(),
    match_type,
    str(exon_count),
    str(most_consecutive_exons),
    str(read_exon_count),
    str(tx_exon_count),
    str(overlap_size),
    str(read_length),
    str(tx_length),
    gpd.get_range().get_range_string(),
    best_tx.get_range().get_range_string(),
    str(best_tx.get_payload()),
  ]
  return "\t".join(fields) + "\n"
# Example #2
# 0
def main(args):
    """Write GenePred entries as tab-delimited BED12-style track lines.

    Entries are read through ``GPDStream`` and each one is written as a
    line of twelve tab-separated columns: chrom, start, end, name, the
    constant score ``1000``, strand, start, end, RGB color, exon count,
    comma-terminated exon sizes, and comma-terminated exon starts relative
    to the first exon start.

    Args:
        args: Parsed options. Attributes read here:
            input: Path to read, or ``'-'`` for stdin. A path ending in
                ``.gz`` is opened with ``gzip``.
            output: Optional path to write. A path ending in ``.gz`` is
                opened with ``gzip``; when falsy, stdout is used.
            color: Optional color name (blue, green, orange, purple, red).
                Any other value leaves the default ``0,0,0``.
            noheader: When truthy, the track header line is skipped.
            headername: Optional override for the track name.
            headerdescription: Optional override for the description.
            minintron: When truthy, passed to ``smooth_gaps`` and the
                entry is rebuilt from the resulting line before output.
            namefield: ``1`` selects ``gene_name`` for the name column;
                any other value selects ``name``.
    """
    palette = {
        'blue': '67,162,202',
        'green': '49,163,84',
        'orange': '254,178,76',
        'purple': '136,86,167',
        'red': '240,59,32',
    }
    color = '0,0,0'
    if args.color:
        color = palette.get(args.color, color)

    of = sys.stdout
    if args.output:
        if args.output[-3:] == '.gz':
            # NOTE(review): mode 'w' is binary for gzip on Python 3, where
            # writing str would fail -- confirm the target interpreter.
            of = gzip.open(args.output, 'w')
        else:
            # Previously a non-.gz output path was ignored and the data
            # went to stdout.
            of = open(args.output, 'w')

    gpd_handle = sys.stdin
    try:
        # Set up the header if one is desired.
        if not args.noheader:
            newname = 'longreads'
            m = re.search(r'([^\/]+)$', args.input)
            if m:
                newname = m.group(1)
            newname = re.sub(r'[\s]+', '_', newname)
            if args.headername:
                newname = args.headername
            elif args.input == '-':
                newname = 'STDIN'
            description = newname + ' GenePred Entries'
            if args.headerdescription:
                description = args.headerdescription
            header = "track\tname=" + newname + "\t"
            header += 'description="' + description + '"' + "\t"
            header += 'itemRgb="On"'
            of.write(header + "\n")

        if args.input != '-':
            if args.input[-3:] == '.gz':
                gpd_handle = gzip.open(args.input)
            else:
                gpd_handle = open(args.input)

        for gpd in GPDStream(gpd_handle):
            if args.minintron:
                gpd = GPD(gpd.smooth_gaps(args.minintron).get_gpd_line())
            exoncount = gpd.get_exon_count()
            starts = gpd.value('exonStarts')
            ends = gpd.value('exonEnds')
            first_start = str(starts[0])
            last_end = str(ends[exoncount - 1])
            if args.namefield == 1:
                name = gpd.value('gene_name')
            else:
                name = gpd.value('name')
            sizes = ''.join(
                str(ends[i] - starts[i]) + ',' for i in range(exoncount))
            offsets = ''.join(
                str(starts[i] - starts[0]) + ',' for i in range(exoncount))
            # Joining the columns guarantees a tab after the name column in
            # both namefield branches (the 'name' branch used to omit it).
            fields = [
                gpd.value('chrom'),
                first_start,
                last_end,
                name,
                '1000',
                gpd.value('strand'),
                first_start,
                last_end,
                color,
                str(exoncount),
                sizes,
                offsets,
            ]
            of.write("\t".join(fields) + "\n")
    finally:
        # Only close handles opened here; leave the process-wide standard
        # streams open for the caller.
        if gpd_handle is not sys.stdin:
            gpd_handle.close()
        if of is not sys.stdout:
            of.close()
def main(args):
  """Convert GenePred entries into tab-delimited BED12-style track lines.

  Each entry yielded by ``GPDStream`` becomes one line of twelve
  tab-separated columns: chrom, start, end, name, the constant score
  ``1000``, strand, start, end, RGB color, exon count, comma-terminated
  exon sizes, and comma-terminated exon starts relative to the first
  exon start.

  Args:
    args: Parsed options. Attributes read here:
      input: Path to read, or ``'-'`` for stdin. A path ending in ``.gz``
        is opened with ``gzip``.
      output: Optional path to write. A path ending in ``.gz`` is opened
        with ``gzip``; when falsy, stdout is used.
      color: Optional color name (blue, green, orange, purple, red). Any
        other value leaves the default ``0,0,0``.
      noheader: When truthy, the track header line is skipped.
      headername: Optional override for the track name.
      headerdescription: Optional override for the description.
      minintron: When truthy, passed to ``smooth_gaps`` and the entry is
        rebuilt from the resulting line before output.
      namefield: ``1`` selects ``gene_name`` for the name column; any
        other value selects ``name``.
  """
  rgb_by_name = {
    'blue': '67,162,202',
    'green': '49,163,84',
    'orange': '254,178,76',
    'purple': '136,86,167',
    'red': '240,59,32',
  }
  color = '0,0,0'
  if args.color:
    color = rgb_by_name.get(args.color, color)

  of = sys.stdout
  if args.output:
    if args.output[-3:] == '.gz':
      # NOTE(review): mode 'w' is binary for gzip on Python 3, where
      # writing str would fail -- confirm the target interpreter.
      of = gzip.open(args.output, 'w')
    else:
      # A plain output path used to be ignored, sending data to stdout.
      of = open(args.output, 'w')

  gpd_handle = sys.stdin
  try:
    # Set up the header if one is desired.
    if not args.noheader:
      newname = 'longreads'
      m = re.search(r'([^\/]+)$', args.input)
      if m:
        newname = m.group(1)
      newname = re.sub(r'[\s]+', '_', newname)
      if args.headername:
        newname = args.headername
      elif args.input == '-':
        newname = 'STDIN'
      description = newname + ' GenePred Entries'
      if args.headerdescription:
        description = args.headerdescription
      header_parts = [
        "track",
        "name=" + newname,
        'description="' + description + '"',
        'itemRgb="On"',
      ]
      of.write("\t".join(header_parts) + "\n")

    if args.input != '-':
      if args.input[-3:] == '.gz':
        gpd_handle = gzip.open(args.input)
      else:
        gpd_handle = open(args.input)

    for gpd in GPDStream(gpd_handle):
      if args.minintron:
        gpd = GPD(gpd.smooth_gaps(args.minintron).get_gpd_line())
      exoncount = gpd.get_exon_count()
      starts = gpd.value('exonStarts')
      ends = gpd.value('exonEnds')
      span_start = str(starts[0])
      span_end = str(ends[exoncount - 1])
      name_key = 'gene_name' if args.namefield == 1 else 'name'
      block_sizes = ''.join(
        str(ends[i] - starts[i]) + ',' for i in range(exoncount))
      block_starts = ''.join(
        str(starts[i] - starts[0]) + ',' for i in range(exoncount))
      # Joining the columns puts a tab after the name in both namefield
      # branches; the 'name' branch used to run into the score column.
      columns = [
        gpd.value('chrom'),
        span_start,
        span_end,
        gpd.value(name_key),
        '1000',
        gpd.value('strand'),
        span_start,
        span_end,
        color,
        str(exoncount),
        block_sizes,
        block_starts,
      ]
      of.write("\t".join(columns) + "\n")
  finally:
    # Close only what was opened here; the standard streams belong to the
    # process, not to this function.
    if gpd_handle is not sys.stdin:
      gpd_handle.close()
    if of is not sys.stdout:
      of.close()
def annotate_line(inputs):
    """Annotate one read with its best-matching reference transcript.

    Args:
        inputs: ``(line, z, args)`` tuple. ``line`` is handed to ``GPD``
            to build the read, ``z`` is stored as the read's payload and
            echoed as the first output column, and ``args`` is accepted
            but unused here.

    Returns:
        One tab-delimited, newline-terminated record describing the read
        and its best transcript, or ``None`` when the read's chromosome is
        not a key of the module-level ``txome``, nothing stored under that
        key overlaps the read's range, or no overlapping transcript yields
        a truthy exon overlap.
    """
    (line, z, args) = inputs
    gpd = GPD(line)
    gpd.set_payload(z)
    read_range = gpd.get_range()
    if read_range.chr not in txome:
        return None
    possible = [
        x.get_payload() for x in txome[read_range.chr]
        if x.overlaps(read_range)
    ]
    if not possible:
        return None

    candidates = []
    for tx in possible:
        # Single-exon rules apply as soon as either side has one exon.
        single = tx.get_exon_count() == 1 or gpd.get_exon_count() == 1
        if single:
            eo = gpd.exon_overlap(tx, single_minover=100, single_frac=0.5)
        else:
            eo = gpd.exon_overlap(tx,
                                  multi_minover=10,
                                  multi_endfrac=0,
                                  multi_midfrac=0.8,
                                  multi_consec=False)
        # Test the overlap's truthiness before asking it anything else.
        if not eo:
            continue
        full = False
        subset = False
        econsec = 1
        if not single:
            full = bool(eo.is_full_overlap())
            subset = bool(eo.is_subset())
            econsec = eo.consecutive_exon_count()
        ecnt = eo.match_exon_count()
        osize = gpd.overlap_size(tx)
        candidates.append((
            full, subset, ecnt, econsec,
            gpd.get_exon_count(),
            tx.get_exon_count(), osize,
            gpd.get_length(),
            tx.get_length(), tx,
        ))
    if not candidates:
        return None

    def sort_key(cand):
        # Smaller is better: full overlap, then subset, then most
        # consecutive exons, then most matched exons, then the larger of
        # the worst-case overlap fractions (vs. read and vs. transcript).
        (is_full, is_subset, n_match, n_consec, _, _, n_over, r_len,
         t_len, _) = cand
        return (-is_full, -is_subset, -n_consec, -n_match,
                -min(float(n_over) / float(r_len),
                     float(n_over) / float(t_len)))

    # min() yields the first minimal element, i.e. the element a stable
    # sort would have put first.
    best = min(candidates, key=sort_key)
    (full, _subset, exon_count, most_consecutive_exons, read_exon_count,
     tx_exon_count, overlap_size, read_length, tx_length, best_tx) = best
    match_type = 'full' if full else 'partial'
    fields = [
        str(gpd.get_payload()),
        gpd.get_transcript_name(),
        best_tx.get_gene_name(),
        best_tx.get_transcript_name(),
        match_type,
        str(exon_count),
        str(most_consecutive_exons),
        str(read_exon_count),
        str(tx_exon_count),
        str(overlap_size),
        str(read_length),
        str(tx_length),
        gpd.get_range().get_range_string(),
        best_tx.get_range().get_range_string(),
        str(best_tx.get_payload()),
    ]
    return "\t".join(fields) + "\n"