def get_lengths(args, tdir):
    """Sum merged exon lengths per gene using bedtools.

    Every exon of every non-comment line in ``args.transcriptome_reference``
    is written as ``chrom<TAB>start<TAB>end<TAB>gene_name`` to the stdin of
    ``bedtools sort | bedtools merge -c 4 -o collapse``; the pipeline output
    is stored in ``<tdir>/ref.bed``.  That file is then read back and, for
    each merged interval, ``end - start`` is added to every gene listed in
    its comma-separated fourth column.

    Args:
        args: object exposing ``transcriptome_reference`` (path of the
            reference file, parsed line by line with
            ``genepred_line_to_entry``) and ``max_exon_length`` (when
            greater than 0, exons longer than this are skipped).
        tdir: directory in which ``ref.bed`` is written.

    Returns:
        dict mapping gene name to the summed length of the merged
        intervals that list that gene.
    """
    ref_bed = tdir + '/ref.bed'
    cmd = 'bedtools sort -i - | bedtools merge -i - -c 4 -o collapse'
    max_len = args.max_exon_length

    # The context manager guarantees the output handle is released even if
    # parsing the reference raises part-way through.
    with open(ref_bed, 'w') as of:
        # universal_newlines=True gives a text-mode stdin pipe so str can be
        # written on Python 3 as well; Python 2 pipes already accept str.
        p = subprocess.Popen(cmd, shell=True, stdin=subprocess.PIPE,
                             stdout=of, universal_newlines=True)
        try:
            with open(args.transcriptome_reference) as inf:
                for line in inf:
                    if line.startswith('#'):
                        continue
                    e = genepred_line_to_entry(line)
                    for i, start in enumerate(e['exonStarts']):
                        end = e['exonEnds'][i]
                        # Don't consider exons that are too long.
                        if max_len > 0 and end - start > max_len:
                            continue
                        p.stdin.write("\t".join(
                            [e['chrom'], str(start), str(end),
                             e['gene_name']]) + "\n")
        finally:
            # Closes the pipe and waits for the pipeline, so the child is
            # always reaped.  NOTE: the exit status is not inspected.
            p.communicate()

    lengths = {}
    with open(ref_bed) as inf:
        for line in inf:
            f = line.rstrip().split("\t")
            elen = int(f[2]) - int(f[1])
            # Column 4 holds the collapsed (comma-joined) gene names.
            for gene in f[3].split(','):
                lengths[gene] = lengths.get(gene, 0) + elen
    return lengths
def transcriptome_to_exons(fname, tdir):
    """Write exon and locus BED files for a transcriptome and merge them.

    Each non-comment line of ``fname`` is parsed with
    ``genepred_line_to_entry``.  One ``chrom<TAB>start<TAB>end`` line per
    exon goes to ``<tdir>/all_exons.bed`` and one
    ``chrom<TAB>txStart<TAB>txEnd`` line per entry goes to
    ``<tdir>/all_loci.bed``.  Both files are then run through
    ``bedtools sort`` and ``bedtools merge``, producing
    ``all_exons.sorted.bed``, ``merged_exons.bed``, ``all_loci.sorted.bed``
    and ``merged_loci.bed`` in ``tdir``.

    Args:
        fname: path of the input transcriptome file.
        tdir: working directory that receives all output files.

    Returns:
        dict mapping chromosome name to ``[lowest exon start, highest exon
        end]`` over all exons seen on that chromosome.
    """
    bounds = {}
    # Context managers make sure both outputs are flushed and closed even if
    # parsing or writing raises part-way through the input.
    with open(tdir + '/all_exons.bed', 'w') as of1, \
            open(tdir + '/all_loci.bed', 'w') as of2, \
            open(fname) as inf:
        for line in inf:
            if line.startswith('#'):
                continue
            e = genepred_line_to_entry(line)
            chrom = e['chrom']
            for i, start in enumerate(e['exonStarts']):
                end = e['exonEnds'][i]
                if chrom not in bounds:
                    # Large sentinel so the first exon start always wins.
                    bounds[chrom] = [100000000000, 0]
                span = bounds[chrom]
                if start < span[0]:
                    span[0] = start
                if end > span[1]:
                    span[1] = end
                of1.write(chrom + "\t" + str(start) + "\t" + str(end) + "\n")
            of2.write(chrom + "\t" + str(e['txStart']) + "\t" +
                      str(e['txEnd']) + "\n")

    # Get the compressed exons and loci: sort first, then merge the sorted
    # file.  NOTE: tdir is interpolated into the shell command unquoted and
    # the bedtools exit status is not inspected.
    steps = (
        ('sort', 'all_exons.bed', 'all_exons.sorted.bed'),
        ('merge', 'all_exons.sorted.bed', 'merged_exons.bed'),
        ('sort', 'all_loci.bed', 'all_loci.sorted.bed'),
        ('merge', 'all_loci.sorted.bed', 'merged_loci.bed'),
    )
    for action, src, dst in steps:
        cmd = ('bedtools ' + action + ' -i ' + tdir + '/' + src + ' > ' +
               tdir + '/' + dst)
        subprocess.call(cmd, shell=True)
    return bounds
def transcriptome_to_exons(fname,tdir):
  """Dump exons and transcript loci to BED files, then sort and merge them.

  Non-comment lines of ``fname`` are parsed with ``genepred_line_to_entry``.
  Exons are written to ``<tdir>/all_exons.bed`` (chrom, start, end) and each
  entry's ``txStart``/``txEnd`` to ``<tdir>/all_loci.bed``.  ``bedtools sort``
  followed by ``bedtools merge`` is then run on each file, writing
  ``all_exons.sorted.bed``, ``merged_exons.bed``, ``all_loci.sorted.bed`` and
  ``merged_loci.bed`` into ``tdir``.

  Args:
    fname: path of the input transcriptome file.
    tdir: directory that receives every output file.

  Returns:
    dict of chromosome name -> ``[smallest exon start, largest exon end]``.
  """
  exon_path = tdir+'/all_exons.bed'
  loci_path = tdir+'/all_loci.bed'
  bounds = {}
  # 'with' closes both outputs even when a line fails to parse or write.
  with open(exon_path,'w') as of1, open(loci_path,'w') as of2:
    with open(fname) as inf:
      for line in inf:
        if line.startswith('#'): continue
        e = genepred_line_to_entry(line)
        chrom = e['chrom']
        for i, exon_start in enumerate(e['exonStarts']):
          exon_end = e['exonEnds'][i]
          # Huge initial minimum so any real start replaces it.
          limits = bounds.setdefault(chrom,[100000000000,0])
          if exon_start < limits[0]:
            limits[0] = exon_start
          if exon_end > limits[1]:
            limits[1] = exon_end
          of1.write("\t".join([chrom,str(exon_start),str(exon_end)])+"\n")
        of2.write(chrom+"\t"+str(e['txStart'])+"\t"+str(e['txEnd'])+"\n")
  # Get the compressed exons and loci.  Order matters: each merge consumes
  # the file produced by the sort before it.  NOTE: tdir is placed in the
  # shell command unquoted and bedtools' exit status is not checked.
  for stem in ('exons','loci'):
    unsorted_bed = tdir+'/all_'+stem+'.bed'
    sorted_bed = tdir+'/all_'+stem+'.sorted.bed'
    merged_bed = tdir+'/merged_'+stem+'.bed'
    subprocess.call("bedtools sort -i "+unsorted_bed+' > '+sorted_bed,shell=True)
    subprocess.call("bedtools merge -i "+sorted_bed+' > '+merged_bed,shell=True)
  return bounds
def main():
  """Print one tab-separated line per exon of the input file.

  Reads the file named by the positional ``input`` argument (``-`` means
  STDIN), skips lines starting with ``#``, parses the rest with
  ``genepred_line_to_entry`` and prints, for every exon:
  chrom, exon start, exon end, gene_name, name and the exon's index.
  """
  parser = argparse.ArgumentParser()
  parser.add_argument('input',help='FILENAME input genepred, use - for STDIN')
  args = parser.parse_args()
  use_stdin = args.input == '-'
  inf = sys.stdin if use_stdin else open(args.input)
  try:
    for line in inf:
      if line.startswith('#'): continue
      e = genepred_line_to_entry(line)
      for i, start in enumerate(e['exonStarts']):
        fields = [e['chrom'],str(start),str(e['exonEnds'][i]),e['gene_name'],e['name'],str(i)]
        # Single parenthesised argument: same output under the Python 2
        # print statement and the Python 3 print function.
        print("\t".join(fields))
  finally:
    # Only close what this function opened; STDIN is left to the caller.
    if not use_stdin:
      inf.close()
# Example #5
# 0
def main():
    """Emit every exon of the input as a tab-separated line on STDOUT.

    The positional ``input`` argument names the file to read; ``-`` selects
    STDIN.  Lines beginning with ``#`` are ignored.  Each remaining line is
    parsed with ``genepred_line_to_entry`` and, for each exon, the columns
    chrom, exon start, exon end, gene_name, name and exon index are printed.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('input',
                        help='FILENAME input genepred, use - for STDIN')
    args = parser.parse_args()

    opened_here = args.input != '-'
    inf = open(args.input) if opened_here else sys.stdin
    try:
        for line in inf:
            if line.startswith('#'):
                continue
            e = genepred_line_to_entry(line)
            for i, exon_start in enumerate(e['exonStarts']):
                row = "\t".join([
                    e['chrom'],
                    str(exon_start),
                    str(e['exonEnds'][i]),
                    e['gene_name'],
                    e['name'],
                    str(i),
                ])
                # One parenthesised argument behaves identically as a
                # Python 2 print statement and a Python 3 print() call.
                print(row)
    finally:
        # Close the handle on every exit path, but never close STDIN,
        # which this function did not open.
        if opened_here:
            inf.close()
def get_lengths(args, tdir):
    """Compute per-gene lengths from bedtools-merged exons.

    Exons from ``args.transcriptome_reference`` (comment lines starting with
    ``#`` are skipped; other lines are parsed by ``genepred_line_to_entry``)
    are piped as ``chrom<TAB>start<TAB>end<TAB>gene_name`` through
    ``bedtools sort | bedtools merge -c 4 -o collapse`` into
    ``<tdir>/ref.bed``.  The merged intervals are then read back and each
    interval's ``end - start`` is credited to every gene named in its
    comma-separated fourth column.

    Args:
        args: object with ``transcriptome_reference`` (input path) and
            ``max_exon_length`` (exons longer than this are dropped when the
            value is greater than 0).
        tdir: directory where ``ref.bed`` is created.

    Returns:
        dict of gene name -> total length of merged intervals naming it.
    """
    bed_path = tdir + "/ref.bed"
    cmd = "bedtools sort -i - | bedtools merge -i - -c 4 -o collapse"
    limit = args.max_exon_length

    # Opening the output under 'with' releases the handle on every exit path.
    with open(bed_path, "w") as of:
        # Text-mode pipe (universal_newlines) so str writes also work on
        # Python 3; it is a no-op for stdin on Python 2.
        p = subprocess.Popen(
            cmd,
            shell=True,
            stdin=subprocess.PIPE,
            stdout=of,
            universal_newlines=True,
        )
        try:
            with open(args.transcriptome_reference) as inf:
                for line in inf:
                    if line.startswith("#"):
                        continue
                    e = genepred_line_to_entry(line)
                    for i, exon_start in enumerate(e["exonStarts"]):
                        exon_end = e["exonEnds"][i]
                        # Don't consider exons that are too long.
                        too_long = limit > 0 and exon_end - exon_start > limit
                        if too_long:
                            continue
                        record = [e["chrom"], str(exon_start), str(exon_end), e["gene_name"]]
                        p.stdin.write("\t".join(record) + "\n")
        finally:
            # Always close the pipe and reap the child, even after an error.
            # NOTE: the pipeline's exit status is not checked.
            p.communicate()

    lengths = {}
    with open(bed_path) as inf:
        for line in inf:
            f = line.rstrip().split("\t")
            elen = int(f[2]) - int(f[1])
            # The fourth column is the comma-joined list of collapsed genes.
            for gene in f[3].split(","):
                lengths[gene] = lengths.get(gene, 0) + elen
    return lengths