Exemple #1
0
def read_next(inf,strata):
  """Return the next record of inf whose fourth column is not below strata.

  Lines are read one at a time with inf.readline() and split on tabs.
  The fourth column is parsed as an integer; lines whose value is lower
  than strata are skipped.  For the first line that passes, a Bed is built
  from column one and the integer values of columns two and three, the
  parsed fourth column is attached with set_payload(), and the Bed is
  returned.

  Returns False once readline() yields an empty result (end of input).
  """
  line = inf.readline()
  while line:
    fields = line.rstrip().split("\t")
    depth = int(fields[3])
    if not (depth < strata):
      entry = Bed(fields[0],int(fields[1]),int(fields[2]))
      entry.set_payload(depth)
      return entry
    line = inf.readline()
  return False
def get_junctions(sams,args):
  """Collect a Junction for every sufficiently long 'N' cigar operation.

  sams is iterated in order and each entry is numbered from 1.  For each
  entry, the elements of value('cigar_array') whose 'op' is one of
  M, D, N, X or = are kept.  Every kept 'N' element whose 'val' is at least
  args.minimum_intron_size produces a Junction built from two Bed objects
  on value('rname'), positioned from value('pos') plus the summed 'val' of
  the kept elements that precede it.

  Returns a two-element list: [list of [Junction, entry number] pairs,
  dict mapping entry number -> the entry itself].
  """
  op_filter = re.compile('^[MDNX=]$')
  sam_by_index = {}
  found = []
  for index, sam in enumerate(sams, 1):
    sam_by_index[index] = sam
    kept = [c for c in sam.value('cigar_array') if op_filter.match(c['op'])]
    consumed = 0  # summed 'val' of the kept elements seen so far
    for c in kept:
      if c['op'] == 'N' and c['val'] >= args.minimum_intron_size:
        left = consumed + sam.value('pos')
        right = left + c['val']
        chrom = sam.value('rname')
        left_bed = Bed(chrom,left-2,left-1)
        right_bed = Bed(chrom,right-1,right)
        found.append([Junction(left_bed,right_bed),index])
      consumed += c['val']
  return [found,sam_by_index]
Exemple #3
0
def main():
  """Write a max-normalized, binned depth profile for each transcript that passes the filters.

  Steps, in order:
    1. Read args.reference_genepred, grouping GPD entries by gene name and
       recording the strand of every transcript.
    2. If args.maximum_isoforms > 0, drop genes with more entries than that.
       Then drop transcripts shorter than args.minimum_length, and genes
       left with no transcripts.
    3. Write every remaining exon to <tempdir>/gpd.bed and run
       `bedtools intersect -wo` with args.bed_depth as stdin against it,
       capturing the output in <tempdir>/intersect.bed.
    4. Read the intersection back, collecting depth-carrying Bed records per
       gene and transcript.
    5. For transcripts whose average depth and covered bases reach
       args.minimum_average_depth / args.minimum_length, build a
       position -> depth table over the exon positions.
    6. Split each table into bins int(100*i/length), average every bin,
       divide by the largest bin, reverse the list for '-' strand
       transcripts and write one tab-separated line per transcript to
       args.output.

  The temporary directory is removed at the end unless
  args.specific_tempdir is set.
  """
  #do our inputs
  args = do_inputs()

  sys.stderr.write("Reading reference genepred\n")
  ref = {}
  tx_strand = {}
  z = 0
  with open(args.reference_genepred) as inf:
    for line in inf:
      gpd = GPD(line)
      gname = gpd.get_gene_name()
      tname = gpd.get_transcript_name()
      tx_strand[tname] = gpd.get_strand()
      if gname not in ref: ref[gname] = []
      ref[gname].append(gpd)
      z += 1
  sys.stderr.write("Read "+str(len(ref))+" genes and "+str(z)+" transcripts\n")

  if args.maximum_isoforms > 0:
    sys.stderr.write("Removing genes with more than "+str(args.maximum_isoforms)+" isoforms.\n")
    # Iterate over a snapshot of the keys: deleting from a dict while
    # iterating its live key view raises RuntimeError on Python 3.
    for gname in list(ref.keys()):
      if len(ref[gname]) > args.maximum_isoforms: del ref[gname]
    sys.stderr.write("Now have "+str(len(ref))+" genes and "+str(sum(len(v) for v in ref.values()))+" transcripts\n")

  sys.stderr.write("Filtering by length "+str(args.minimum_length)+" bp\n")
  # Same snapshot as above, because genes may be deleted inside the loop.
  for gname in list(ref.keys()):
    passing = [gpd for gpd in ref[gname] if not gpd.get_length() < args.minimum_length]
    if len(passing) == 0: del ref[gname]
    else: ref[gname] = passing
  sys.stderr.write("Now have "+str(len(ref))+" genes and "+str(sum(len(v) for v in ref.values()))+" transcripts\n")

  sys.stderr.write("Converting gpd into exon bed\n")
  gpd_bed_path = args.tempdir+'/gpd.bed'
  intersect_path = args.tempdir+'/intersect.bed'
  beds = []
  for gname in ref:
    for gpd in ref[gname]:
      tname = gpd.get_transcript_name()
      for i, ex in enumerate(gpd.exons):
        # bed array of the exon followed by gene name, transcript name, exon index
        beds.append(ex.get_range().get_bed_array()+[gname,tname,i])
  with open(gpd_bed_path,'w') as of:
    for bed in sorted(beds,key=lambda x: (x[0],x[1],x[2],x[3],x[4],x[5])):
      of.write("\t".join([str(x) for x in bed])+"\n")
  sys.stderr.write("intersecting with bed depth\n")
  # Build the argument list directly rather than splitting a string, so a
  # temp directory containing whitespace is still passed as one argument.
  cmd = ['bedtools','intersect','-wo','-a','-','-b',gpd_bed_path]
  # The with-block closes the output handle before the file is read back.
  with open(intersect_path,'w') as of:
    p = Popen(cmd,stdin=args.bed_depth,stdout=of)
    p.communicate()
  coverage = {}
  sys.stderr.write("Reading the intersection\n")
  with open(intersect_path) as inf:
    for line in inf:
      f = line.rstrip().split("\t")
      # Columns 0-3 come from the depth input (-a), columns 4-8 from gpd.bed (-b).
      gname = f[7]
      tname = f[8]
      depth = int(f[3])
      bed1 = Bed(f[0],int(f[1]),int(f[2]))
      bed2 = Bed(f[4],int(f[5]),int(f[6]))
      # NOTE(review): the stored range is bed1.union(bed2); confirm union()
      # yields the span the per-base depth is meant to apply to.
      bed = bed1.union(bed2)
      bed.set_payload(depth)
      if gname not in coverage:
        coverage[gname] = {}
      if tname not in coverage[gname]:
        coverage[gname][tname] = []
      coverage[gname][tname].append(bed)
  transcript_depths = {}
  for gname in coverage:
    for tname in coverage[gname]:
      # first reference entry of this gene carrying the transcript name
      ref_gpd = [x for x in ref[gname] if x.get_transcript_name()==tname][0]
      rlen = ref_gpd.get_length()
      bases_covered = sum([x.length() for x in coverage[gname][tname]])
      bases_area = sum([x.length()*x.get_payload() for x in coverage[gname][tname]])
      avg_depth = float(bases_area)/float(rlen)
      if avg_depth < args.minimum_average_depth: continue
      if bases_covered < args.minimum_length: continue
      # Start every exon position at depth zero, then overwrite the
      # positions that have a coverage record.
      total_positions = {}
      for ex in ref_gpd.exons:
        b = ex.get_range().get_bed_array()
        for i in range(b[1],b[2]):
          total_positions[i] = 0 # zero indexed
      for b in coverage[gname][tname]:
        depth = b.get_payload()
        barr = b.get_bed_array()
        for i in range(barr[1],barr[2]):
          total_positions[i] = depth
      transcript_depths[tname] = total_positions
  sys.stderr.write("have information needed to plot from "+str(len(transcript_depths))+" transcripts\n")
  for tname in transcript_depths:
    depths = transcript_depths[tname]
    positions = sorted(depths.keys())
    tx_len = len(positions)
    bins = {}
    for i in range(0,tx_len):
      # map the i-th sorted position onto a bin index in 0..99
      bin_index = int(100*float(i)/float(tx_len))
      if bin_index not in bins: bins[bin_index] = []
      bins[bin_index].append(depths[positions[i]])
    for bin_index in bins:
      bins[bin_index] = average(bins[bin_index])
    biggest = float(max(bins.values()))
    tx_array = [float(bins[x])/biggest for x in sorted(bins.keys())]
    if tx_strand[tname] == '-':
      tx_array.reverse()
    args.output.write(tname+"\t"+"\t".join([str(x) for x in tx_array])+"\n")

  args.output.close()

  # Temporary working directory step 3 of 3 - Cleanup
  if not args.specific_tempdir:
    rmtree(args.tempdir)
Exemple #4
0
def get_overlap(fileA,fileB,min_A,min_B):
  """Accumulate the overlap between the records of two inputs.

  Both files are opened with do_open() and read through read_next(), which
  only yields records whose depth reaches min_A / min_B respectively.  The
  two streams are advanced in step according to Bed.cmp(): on 0 the overlap
  size is added to the total and each side either continues with the
  remainder that reaches its own end or moves to its next record; on -1
  side A moves on, otherwise side B moves on.
  NOTE(review): this presumably requires both inputs to be sorted in the
  order Bed.cmp() defines - confirm against the callers.

  After one side is exhausted the other is read to the end so that its
  total length is complete.

  Returns [min_A, min_B, summed length of A records, summed length of B
  records, summed overlap].
  """
  def _pull(inf,strata):
    # Next qualifying record plus the length it adds (0 at end of input).
    nxt = read_next(inf,strata)
    return nxt, (nxt.length() if nxt else 0)

  def _carry_tail(current,other):
    # The last piece of current minus other, when it reaches current's
    # end, rebuilt as a Bed carrying current's payload; otherwise None.
    pieces = current.subtract(other)
    if len(pieces) > 0 and pieces[-1].end == current.end:
      payload = current.get_payload()
      tail = Bed(pieces[-1].chr,pieces[-1].start-1,pieces[-1].end)
      tail.set_payload(payload)
      return tail
    return None

  infA = do_open(fileA)
  infB = do_open(fileB)
  bufA = read_next(infA,min_A)
  bufB = read_next(infB,min_B)
  tot = 0
  sizeA = bufA.length() if bufA else 0
  sizeB = bufB.length() if bufB else 0
  while bufA and bufB:
    order = bufA.cmp(bufB)
    if order == 0:
      tot += bufA.overlap_size(bufB)
      previousA = bufA  # B must be compared against A as it was before A advances
      tailA = _carry_tail(bufA,bufB)
      if tailA is not None:
        bufA = tailA
      else:
        bufA, grown = _pull(infA,min_A)
        sizeA += grown

      tailB = _carry_tail(bufB,previousA)
      if tailB is not None:
        bufB = tailB
      else:
        bufB, grown = _pull(infB,min_B)
        sizeB += grown
    elif order == -1:
      bufA, grown = _pull(infA,min_A)
      sizeA += grown
    else:
      bufB, grown = _pull(infB,min_B)
      sizeB += grown
  # Drain whichever side still has records so its size total is complete.
  while bufA:
    bufA, grown = _pull(infA,min_A)
    sizeA += grown
  while bufB:
    bufB, grown = _pull(infB,min_B)
    sizeB += grown
  infA.close()
  infB.close()
  return [min_A,min_B,sizeA,sizeB,tot]
def main():
    """Turn a GTF with per-transcript TPM values into a coverage table.

    Command line:
      input        GTF path, or '-' for STDIN; a '.gz' suffix is read via gzip.
      --offset     integer added to each scaled TPM.
      --mult       integer each TPM is multiplied by before the offset.
      --min_exons  transcripts with fewer exons are skipped.
      -o/--output  output path ('.gz' suffix is written via gzip); STDOUT
                   when omitted.

    'transcript' lines set the transcript weight to
    int(TPM * mult + offset); 'exon' lines add a Bed(chrom, start-1, end)
    to the transcript.  Each transcript's exon list is then repeated
    "weight" times, the combined list is passed to ranges_to_coverage(),
    and every returned range is written as
    chr <TAB> start-1 <TAB> end <TAB> payload.
    """
    parser = argparse.ArgumentParser(
        description="", formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('input', help="Use - for STDIN")
    parser.add_argument('--offset',
                        type=int,
                        default=0,
                        help="add this much to transcript tpms")
    parser.add_argument('--mult',
                        type=int,
                        default=10,
                        help="multiply this much to tpms")
    parser.add_argument('--min_exons',
                        type=int,
                        default=1,
                        help="require at least this many exons")
    parser.add_argument('-o',
                        '--output',
                        help="OUTPUT file or nothing for STDOUT")
    args = parser.parse_args()

    inf = sys.stdin
    if args.input != '-':
        if args.input[-3:] == '.gz':
            # Text mode: gzip.open defaults to binary, which yields bytes on
            # Python 3 and breaks the str regexes and split("\t") below.
            inf = gzip.open(args.input, 'rt')
        else:
            inf = open(args.input)
    # Raw strings so that \s is a regex class, not an invalid string escape;
    # compiled once because they are applied to every matching line.
    transcript_id_re = re.compile(r'transcript_id\s+"([^"]+)"')
    tpm_re = re.compile(r'TPM\s+"([^"]+)"')
    sys.stderr.write("Reading gtf file\n")
    txs = {}
    for line in inf:
        if line.startswith('#'): continue
        f = line.rstrip().split("\t")
        feature = f[2]
        if feature != 'exon' and feature != 'transcript': continue
        tx = transcript_id_re.search(f[8]).group(1)
        if tx not in txs:
            txs[tx] = {'tpm': 0, 'exons': []}
        if feature == 'transcript':
            tpm = float(tpm_re.search(f[8]).group(1))
            txs[tx]['tpm'] = int((tpm * float(args.mult)) + args.offset)
        else:
            chrom = f[0]
            start = int(f[3]) - 1
            end = int(f[4])
            txs[tx]['exons'].append(Bed(chrom, start, end))
    inf.close()
    vals = []
    sys.stderr.write("Traversing annotation file\n")
    for tx in txs:
        exons = txs[tx]['exons']
        weight = txs[tx]['tpm']
        if len(exons) < args.min_exons: continue
        # Repeat the exon list once per unit of weight (nothing for weight <= 0).
        vals.extend(exons * weight)
    sys.stderr.write("Generating coverage file " + str(len(vals)) + "\n")
    of = sys.stdout
    if args.output:
        if args.output[-3:] == '.gz':
            # Text mode for the same reason as the input: str is written below.
            of = gzip.open(args.output, 'wt')
        else:
            of = open(args.output, 'w')
    covs = ranges_to_coverage(vals)
    for v in covs:
        of.write(v.chr + "\t" + str(v.start - 1) + "\t" + str(v.end) + "\t" +
                 str(v.get_payload()) + "\n")
    of.close()