コード例 #1
0
def annotate_line(inputs):
  """Annotate one GPD read line with its best-matching reference transcript.

  `inputs` is a `(line, z, args)` tuple: the raw GPD text line, the value
  stored as the read's payload (echoed back as the first output column), and
  the parsed arguments (unpacked but not used here).

  Reference transcripts come from the module-level `txome` mapping, looked up
  by the chromosome of the read's range.  Every transcript whose range
  overlaps the read is scored with `exon_overlap`; candidates are preferred
  in this order: full overlap, subset, more consecutive exons, more matched
  exons, then the larger of min(overlap/read length, overlap/transcript
  length).

  Returns one tab-separated, newline-terminated string with the columns
  payload, read transcript name, reference gene name, reference transcript
  name, 'full' or 'partial', matched exon count, consecutive exon count,
  read exon count, transcript exon count, overlap size, read length,
  transcript length, read range, transcript range, reference payload.
  Returns None when the chromosome is not in `txome`, when nothing overlaps,
  or when no overlapping transcript produces a truthy exon overlap.
  """
  global txome
  (line, z, args) = inputs
  gpd = GPD(line)
  gpd.set_payload(z)
  read_range = gpd.get_range()
  if read_range.chr not in txome:
    return None
  overlapping = [x.get_payload() for x in txome[read_range.chr] if x.overlaps(read_range)]

  candidates = []
  for tx in overlapping:
    full = False
    subset = False
    econsec = 1
    if tx.get_exon_count() == 1 or gpd.get_exon_count() == 1:
      # single-exon comparison: flags keep their defaults
      eo = gpd.exon_overlap(tx, single_minover=100, single_frac=0.5)
      if not eo:
        continue
    else:
      eo = gpd.exon_overlap(tx, multi_minover=10, multi_endfrac=0, multi_midfrac=0.8, multi_consec=False)
      # Test the result before querying it; a falsy overlap is discarded
      # anyway, so there is no reason to call its methods first.
      if not eo:
        continue
      full = bool(eo.is_full_overlap())
      subset = bool(eo.is_subset())
      econsec = eo.consecutive_exon_count()
    ecnt = eo.match_exon_count()
    osize = gpd.overlap_size(tx)
    candidates.append((full, subset, ecnt, econsec, gpd.get_exon_count(), tx.get_exon_count(), osize, gpd.get_length(), tx.get_length(), tx))
  if not candidates:
    return None

  def rank(c):
    # c[6] is the overlap size, c[7] the read length, c[8] the transcript length
    covered = min(float(c[6]) / float(c[7]), float(c[6]) / float(c[8]))
    return (-c[0], -c[1], -c[3], -c[2], -covered)

  # min() returns the first minimal element, i.e. the same candidate a stable
  # sort would place first, without ordering the rest of the list.
  best = min(candidates, key=rank)
  (full, subset, exon_count, most_consecutive_exons, read_exon_count,
   tx_exon_count, overlap_size, read_length, tx_length, best_tx) = best
  match_type = 'partial'
  if full:
    match_type = 'full'
  fields = [
    str(gpd.get_payload()),
    gpd.get_transcript_name(),
    best_tx.get_gene_name(),
    best_tx.get_transcript_name(),
    match_type,
    str(exon_count),
    str(most_consecutive_exons),
    str(read_exon_count),
    str(tx_exon_count),
    str(overlap_size),
    str(read_length),
    str(tx_length),
    gpd.get_range().get_range_string(),
    best_tx.get_range().get_range_string(),
    str(best_tx.get_payload()),
  ]
  return "\t".join(fields) + "\n"
コード例 #2
0
def main(args):
    """Annotate GPD reads against a reference GPD file and write the results.

    Steps:
      1. Open `args.output` for writing (gzip when the name ends in ".gz");
         stdout is used when `args.output` is empty.
      2. Load `args.reference` into the module-level `txome` dict, which maps
         each chromosome name to a list of range objects whose payload is the
         reference GPD entry (itself carrying its 1-based line number).
      3. Read `args.input` ("-" means stdin) through `generate_tx` and run
         `annotate_line` over it in a pool of `args.threads` processes.
      4. Write every non-empty result in input order.
    """
    # Raw string: '\.' in a plain literal is an invalid escape sequence.
    gz_suffix = re.compile(r'\.gz$')

    of = sys.stdout
    if args.output:
        if gz_suffix.search(args.output):
            of = gzip.open(args.output, 'w')
        else:
            of = open(args.output, 'w')

    # read the reference gpd
    global txome
    txome = {}
    if gz_suffix.search(args.reference):
        rinf = gzip.open(args.reference)
    else:
        rinf = open(args.reference)
    sys.stderr.write("Reading in reference\n")
    z = 0
    # populate txome with reference transcripts for each chromosome
    try:
        for line in rinf:
            z += 1
            gpd = GPD(line)
            gpd.set_payload(z)
            if z % 100 == 0:
                sys.stderr.write(str(z) + "          \r")
            r = gpd.get_range()
            r.set_payload(gpd)
            txome.setdefault(gpd.value('chrom'), []).append(r)
    finally:
        # release the handle even when a reference line fails to parse
        rinf.close()
    sys.stderr.write(str(z) + "          \r")
    sys.stderr.write("\n")

    inf = sys.stdin
    if args.input != '-':
        if gz_suffix.search(args.input):
            inf = gzip.open(args.input)
        else:
            inf = open(args.input)

    sys.stderr.write("annotating\n")
    # The pool is created only after txome is filled, as in the original order.
    p = Pool(processes=args.threads)
    csize = 100
    results2 = p.imap(func=annotate_line,
                      iterable=generate_tx(inf, args),
                      chunksize=csize)
    for res in results2:
        if not res:
            continue
        of.write(res)
    # All results have been consumed; shut the workers down cleanly.
    p.close()
    p.join()

    if inf is not sys.stdin:
        inf.close()
    # Only close a handle this function opened; stdout is just flushed.
    if of is sys.stdout:
        of.flush()
    else:
        of.close()
コード例 #3
0
def main(args):
  """Annotate GPD reads against a reference GPD file and write the results.

  The output goes to `args.output` (gzip when the name ends in ".gz") or to
  stdout when `args.output` is empty.  `args.reference` is loaded into the
  module-level `txome` dict: chromosome name -> list of range objects whose
  payload is the reference GPD entry, which in turn carries its 1-based line
  number as payload.  `args.input` ("-" means stdin) is fed through
  `generate_tx` to `annotate_line` in a pool of `args.threads` processes, and
  each non-empty result is written in input order.
  """
  # Raw string: '\.' in a plain literal is an invalid escape sequence.
  gz_suffix = re.compile(r'\.gz$')

  of = sys.stdout
  if args.output:
    if gz_suffix.search(args.output):
      of = gzip.open(args.output,'w')
    else:
      of = open(args.output,'w')

  #read the reference gpd
  global txome
  txome = {}
  if gz_suffix.search(args.reference):
    rinf = gzip.open(args.reference)
  else:
    rinf = open(args.reference)
  sys.stderr.write("Reading in reference\n")
  z = 0
  # populate txome with reference transcripts for each chromosome
  try:
    for line in rinf:
      z += 1
      gpd = GPD(line)
      gpd.set_payload(z)
      if z%100 == 0:
        sys.stderr.write(str(z)+"          \r")
      r = gpd.get_range()
      r.set_payload(gpd)
      txome.setdefault(gpd.value('chrom'), []).append(r)
  finally:
    # release the handle even when a reference line fails to parse
    rinf.close()
  sys.stderr.write(str(z)+"          \r")
  sys.stderr.write("\n")

  inf = sys.stdin
  if args.input != '-':
    if gz_suffix.search(args.input):
      inf = gzip.open(args.input)
    else:
      inf = open(args.input)

  sys.stderr.write("annotating\n")
  # The pool is created only after txome is filled, as in the original order.
  p = Pool(processes=args.threads)
  csize = 100
  results2 = p.imap(func=annotate_line,iterable=generate_tx(inf,args),chunksize=csize)
  for res in results2:
    if not res:
      continue
    of.write(res)
  # All results have been consumed; shut the workers down cleanly.
  p.close()
  p.join()

  if inf is not sys.stdin:
    inf.close()
  # Only close a handle this function opened; stdout is just flushed.
  if of is sys.stdout:
    of.flush()
  else:
    of.close()
コード例 #4
0
def do_buffer(buffer, txome, args):
    """Annotate a batch of buffered reads and return the formatted lines.

    `buffer` is a sequence of `[line, z]` pairs: a GPD text line and the
    number to report for it in the first output column.  Each line is parsed
    with `GPD` and handed to `annotate_line(gpd, txome, args)`; entries with
    a falsy annotation are skipped.  The annotation is indexed 0-9 here, with
    index 9 being the matched reference transcript object.

    Returns a list of tab-separated, newline-terminated strings, one per
    annotated read, in buffer order.
    """
    formatted = []
    for entry in buffer:
        line, z = entry[0], entry[1]
        gpd = GPD(line)
        hit = annotate_line(gpd, txome, args)
        if not hit:
            continue
        # hit[0] flags a full match; anything else is reported as partial
        match_kind = 'full' if hit[0] else 'partial'
        exon_count = hit[2]
        most_consecutive_exons = hit[3]
        read_exon_count = hit[4]
        tx_exon_count = hit[5]
        overlap_size = hit[6]
        read_length = hit[7]
        tx_length = hit[8]
        ref_tx = hit[9]
        columns = [
            str(z),
            gpd.get_gene_name(),
            ref_tx.get_gene_name(),
            ref_tx.get_transcript_name(),
            match_kind,
            str(exon_count),
            str(most_consecutive_exons),
            str(read_exon_count),
            str(tx_exon_count),
            str(overlap_size),
            str(read_length),
            str(tx_length),
            gpd.get_range().get_range_string(),
            ref_tx.get_range().get_range_string(),
            str(ref_tx.get_payload()),
        ]
        formatted.append("\t".join(columns) + "\n")
    return formatted
コード例 #5
0
def main(args):
    """Annotate GPD reads per chromosome and write the results.

    The module-level `of` handle is replaced by a file opened on
    `args.output` (gzip when the name ends in ".gz") when an output path is
    given; otherwise whatever `of` already refers to is used.
    `args.reference` is loaded into a local `txome` dict: chromosome name ->
    list of range objects whose payload is the reference GPD entry.  Input
    lines from `args.input` ("-" means stdin) are buffered in memory, grouped
    by their third tab-separated column (used as the chromosome name), and
    each group whose chromosome is in the reference is passed to `do_buffer`
    together with only that chromosome's reference entries.  With
    `args.threads > 1` the groups run through a process pool with `do_out` as
    the completion callback; otherwise they run inline.  All returned lines
    are written to `of` at the end.
    """
    # Raw string: '\.' in a plain literal is an invalid escape sequence.
    gz_suffix = re.compile(r'\.gz$')

    global of
    if args.output:
        if gz_suffix.search(args.output):
            of = gzip.open(args.output, 'w')
        else:
            of = open(args.output, 'w')

    #read the reference gpd
    txome = {}
    if gz_suffix.search(args.reference):
        rinf = gzip.open(args.reference)
    else:
        rinf = open(args.reference)
    sys.stderr.write("Reading in reference\n")
    z = 0
    try:
        for line in rinf:
            z += 1
            gpd = GPD(line)
            gpd.set_payload(z)
            if z % 100 == 0:
                sys.stderr.write(str(z) + "          \r")
            r = gpd.get_range()
            r.set_payload(gpd)
            txome.setdefault(gpd.value('chrom'), []).append(r)
    finally:
        # release the handle even when a reference line fails to parse
        rinf.close()
    sys.stderr.write(str(z) + "          \r")
    sys.stderr.write("\n")

    inf = sys.stdin
    if args.input != '-':
        if gz_suffix.search(args.input):
            inf = gzip.open(args.input)
        else:
            inf = open(args.input)

    z = 0
    chroms = {}
    # Compiled once instead of being looked up for every input line.
    third_column = re.compile('[^\t]*\t[^\t]*\t([^\t]+)')
    sys.stderr.write("Buffering reads\n")
    for line in inf:
        z += 1
        # NOTE(review): a line that does not match the three-column pattern
        # leaves m as None and the next statement raises AttributeError.
        m = third_column.match(line)
        chrom = m.group(1)
        if z % 100 == 0:
            sys.stderr.write(str(z) + "      \r")
        chroms.setdefault(chrom, []).append([line, z])
    sys.stderr.write("\n")
    sys.stderr.write("Finished buffering reads\n")

    if args.threads > 1:
        p = Pool(processes=args.threads)
    results = []
    global chrtotal
    chrtotal = len(chroms)
    for chrom in chroms:
        if chrom not in txome:
            continue
        # Hand each job only the reference entries for its own chromosome.
        chrom_txome = {chrom: txome[chrom]}
        if args.threads > 1:
            v = p.apply_async(do_buffer,
                              args=(chroms[chrom], chrom_txome, args),
                              callback=do_out)
            results.append(v)
        else:
            v = do_buffer(chroms[chrom], chrom_txome, args)
            # presumably Queue is a file-local wrapper exposing .get() like an
            # async result -- TODO confirm against its definition
            results.append(Queue(v))
            do_out(v)
    if args.threads > 1:
        p.close()
        p.join()
    sys.stderr.write("\n")
    # Collect every result before writing, as the original did.
    for res in [x.get() for x in results]:
        for oline in res:
            of.write(oline)
    inf.close()
    of.close()
コード例 #6
0
def annotate_line(inputs):
    """Annotate one GPD read line with its best-matching reference transcript.

    `inputs` is a `(line, z, args)` tuple: the raw GPD text line, the value
    stored as the read's payload (echoed back as the first output column),
    and the parsed arguments (unpacked but not used here).

    Reference transcripts come from the module-level `txome` mapping, looked
    up by the chromosome of the read's range.  Every transcript whose range
    overlaps the read is scored with `exon_overlap`; candidates are preferred
    in this order: full overlap, subset, more consecutive exons, more matched
    exons, then the larger of min(overlap/read length, overlap/transcript
    length).

    Returns one tab-separated, newline-terminated string with the columns
    payload, read transcript name, reference gene name, reference transcript
    name, 'full' or 'partial', matched exon count, consecutive exon count,
    read exon count, transcript exon count, overlap size, read length,
    transcript length, read range, transcript range, reference payload.
    Returns None when the chromosome is not in `txome`, when nothing
    overlaps, or when no overlapping transcript produces a truthy exon
    overlap.
    """
    global txome
    (line, z, args) = inputs
    gpd = GPD(line)
    gpd.set_payload(z)
    read_range = gpd.get_range()
    if read_range.chr not in txome:
        return None
    overlapping = [
        x.get_payload() for x in txome[read_range.chr]
        if x.overlaps(read_range)
    ]

    candidates = []
    for tx in overlapping:
        full = False
        subset = False
        econsec = 1
        if tx.get_exon_count() == 1 or gpd.get_exon_count() == 1:
            # single-exon comparison: flags keep their defaults
            eo = gpd.exon_overlap(tx, single_minover=100, single_frac=0.5)
            if not eo:
                continue
        else:
            eo = gpd.exon_overlap(tx,
                                  multi_minover=10,
                                  multi_endfrac=0,
                                  multi_midfrac=0.8,
                                  multi_consec=False)
            # Test the result before querying it; a falsy overlap is
            # discarded anyway, so there is no reason to call its methods.
            if not eo:
                continue
            full = bool(eo.is_full_overlap())
            subset = bool(eo.is_subset())
            econsec = eo.consecutive_exon_count()
        ecnt = eo.match_exon_count()
        osize = gpd.overlap_size(tx)
        candidates.append((
            full, subset, ecnt, econsec,
            gpd.get_exon_count(),
            tx.get_exon_count(), osize,
            gpd.get_length(),
            tx.get_length(), tx,
        ))
    if not candidates:
        return None

    def rank(c):
        # c[6] is the overlap size, c[7] the read length, c[8] the
        # transcript length
        covered = min(float(c[6]) / float(c[7]), float(c[6]) / float(c[8]))
        return (-c[0], -c[1], -c[3], -c[2], -covered)

    # min() returns the first minimal element, i.e. the same candidate a
    # stable sort would place first, without ordering the rest of the list.
    best = min(candidates, key=rank)
    (full, subset, exon_count, most_consecutive_exons, read_exon_count,
     tx_exon_count, overlap_size, read_length, tx_length, best_tx) = best
    match_type = 'partial'
    if full:
        match_type = 'full'
    fields = [
        str(gpd.get_payload()),
        gpd.get_transcript_name(),
        best_tx.get_gene_name(),
        best_tx.get_transcript_name(),
        match_type,
        str(exon_count),
        str(most_consecutive_exons),
        str(read_exon_count),
        str(tx_exon_count),
        str(overlap_size),
        str(read_length),
        str(tx_length),
        gpd.get_range().get_range_string(),
        best_tx.get_range().get_range_string(),
        str(best_tx.get_payload()),
    ]
    return "\t".join(fields) + "\n"