def get_lengths(args, tdir):
    """Compute a per-gene length from the merged exons of the reference.

    Every exon of every non-comment line in ``args.transcriptome_reference``
    (each line is parsed with ``genepred_line_to_entry``) is piped as a
    4-column BED record (chrom, start, end, gene name) through
    ``bedtools sort | bedtools merge -c 4 -o collapse``.  The merged
    intervals are written to ``<tdir>/ref.bed`` and then read back; each
    merged interval adds its length once to every distinct gene named in
    its fourth column.

    Args:
        args: object providing ``transcriptome_reference`` (path of the
            reference file to read) and ``max_exon_length`` (exons longer
            than this are skipped; a value <= 0 disables the filter).
        tdir: directory in which ``ref.bed`` is created; it must already
            exist.

    Returns:
        dict mapping gene name to the summed length of the merged intervals
        that contain at least one exon of that gene.

    Note:
        The exit status of the bedtools pipeline is not checked.
    """
    ref_bed = tdir + '/ref.bed'
    cmd = 'bedtools sort -i - | bedtools merge -i - -c 4 -o collapse'
    max_len = args.max_exon_length

    with open(ref_bed, 'w') as of:
        # universal_newlines=True makes stdin a text-mode pipe, so the str
        # records written below are accepted on Python 3 as well as Python 2.
        p = subprocess.Popen(cmd, shell=True, stdin=subprocess.PIPE,
                             stdout=of, universal_newlines=True)
        try:
            with open(args.transcriptome_reference) as inf:
                for line in inf:
                    if line.startswith('#'):
                        continue
                    e = genepred_line_to_entry(line)
                    for start, end in zip(e['exonStarts'], e['exonEnds']):
                        # Don't consider exons that are too long.
                        if max_len > 0 and end - start > max_len:
                            continue
                        p.stdin.write('\t'.join(
                            [e['chrom'], str(start), str(end),
                             e['gene_name']]) + '\n')
        finally:
            # Always close the pipe and reap the child, even when reading or
            # parsing the reference fails part-way through.
            p.communicate()

    lengths = {}
    with open(ref_bed) as inf:
        for line in inf:
            f = line.rstrip().split('\t')
            elen = int(f[2]) - int(f[1])
            # "-o collapse" repeats a gene name once per overlapping exon
            # record, so de-duplicate to count each merged interval only once
            # per gene (order-preserving, to keep the result deterministic).
            seen = set()
            for gene in f[3].split(','):
                if gene in seen:
                    continue
                seen.add(gene)
                lengths[gene] = lengths.get(gene, 0) + elen
    return lengths
def transcriptome_to_exons(fname, tdir):
    """Write exon and locus BED files for a reference and merge them.

    Each non-comment line of ``fname`` is parsed with
    ``genepred_line_to_entry``.  Every exon is written to
    ``<tdir>/all_exons.bed`` and every entry's (txStart, txEnd) span to
    ``<tdir>/all_loci.bed``.  Both files are then run through
    ``bedtools sort`` and ``bedtools merge``, producing
    ``all_exons.sorted.bed``, ``merged_exons.bed``, ``all_loci.sorted.bed``
    and ``merged_loci.bed`` in ``tdir``.

    Args:
        fname: path of the reference file to read.
        tdir: directory receiving all output files; it must already exist.

    Returns:
        dict mapping chromosome name to ``[min exon start, max exon end]``
        over all exons seen on that chromosome.

    Raises:
        OSError: if the ``bedtools`` executable cannot be launched.

    Note:
        The exit status of the bedtools commands is not checked.
    """
    bounds = {}
    with open(tdir + '/all_exons.bed', 'w') as of1, \
            open(tdir + '/all_loci.bed', 'w') as of2, \
            open(fname) as inf:
        for line in inf:
            if line.startswith('#'):
                continue
            e = genepred_line_to_entry(line)
            chrom = e['chrom']
            for start, end in zip(e['exonStarts'], e['exonEnds']):
                if chrom not in bounds:
                    # Seed the bounds from the first exon on this chromosome.
                    bounds[chrom] = [start, end]
                else:
                    span = bounds[chrom]
                    span[0] = min(span[0], start)
                    span[1] = max(span[1], end)
                of1.write(chrom + '\t' + str(start) + '\t' + str(end) + '\n')
            of2.write(chrom + '\t' + str(e['txStart']) + '\t'
                      + str(e['txEnd']) + '\n')

    # Get the compressed exons, then the compressed loci.  Commands are
    # passed as argument lists (no shell), so a tdir containing spaces or
    # shell metacharacters is handled correctly.
    for raw, merged in (('all_exons', 'merged_exons'),
                        ('all_loci', 'merged_loci')):
        unsorted_bed = tdir + '/' + raw + '.bed'
        sorted_bed = tdir + '/' + raw + '.sorted.bed'
        merged_bed = tdir + '/' + merged + '.bed'
        with open(sorted_bed, 'w') as out:
            subprocess.call(['bedtools', 'sort', '-i', unsorted_bed],
                            stdout=out)
        with open(merged_bed, 'w') as out:
            subprocess.call(['bedtools', 'merge', '-i', sorted_bed],
                            stdout=out)
    return bounds
def transcriptome_to_exons(fname, tdir):
    """Dump exons and loci of a reference to BED files and merge overlaps.

    Non-comment lines of ``fname`` are parsed with
    ``genepred_line_to_entry``.  Exons go to ``<tdir>/all_exons.bed`` and
    each entry's (txStart, txEnd) span goes to ``<tdir>/all_loci.bed``.
    ``bedtools sort`` followed by ``bedtools merge`` is then run on each
    file, writing ``all_exons.sorted.bed``, ``merged_exons.bed``,
    ``all_loci.sorted.bed`` and ``merged_loci.bed`` into ``tdir``.

    Args:
        fname: path of the reference file to read.
        tdir: directory receiving all output files; it must already exist.

    Returns:
        dict keyed by chromosome name whose values are two-element lists
        ``[smallest exon start, largest exon end]`` for that chromosome.

    Raises:
        OSError: if the ``bedtools`` executable cannot be launched.

    Note:
        The exit status of the bedtools commands is not checked.
    """
    exons_path = tdir + '/all_exons.bed'
    loci_path = tdir + '/all_loci.bed'
    bounds = {}

    # Context managers guarantee both outputs are closed (and flushed) even
    # if parsing a line raises.
    with open(exons_path, 'w') as of1, open(loci_path, 'w') as of2:
        with open(fname) as inf:
            for line in inf:
                if line.startswith('#'):
                    continue
                e = genepred_line_to_entry(line)
                chrom = e['chrom']
                for start, end in zip(e['exonStarts'], e['exonEnds']):
                    limits = bounds.get(chrom)
                    if limits is None:
                        # First exon seen on this chromosome sets both bounds.
                        bounds[chrom] = [start, end]
                    else:
                        if start < limits[0]:
                            limits[0] = start
                        if end > limits[1]:
                            limits[1] = end
                    of1.write('\t'.join([chrom, str(start), str(end)]) + '\n')
                of2.write('\t'.join(
                    [chrom, str(e['txStart']), str(e['txEnd'])]) + '\n')

    # Get the compressed exons and loci.  Each step is (bedtools subcommand,
    # input file, output file); argument lists avoid building shell strings
    # from tdir, which broke on paths with spaces or shell metacharacters.
    steps = [
        ('sort', exons_path, tdir + '/all_exons.sorted.bed'),
        ('merge', tdir + '/all_exons.sorted.bed', tdir + '/merged_exons.bed'),
        ('sort', loci_path, tdir + '/all_loci.sorted.bed'),
        ('merge', tdir + '/all_loci.sorted.bed', tdir + '/merged_loci.bed'),
    ]
    for action, src, dst in steps:
        with open(dst, 'w') as out:
            subprocess.call(['bedtools', action, '-i', src], stdout=out)
    return bounds
def main():
    """Print one tab-separated record per exon of the input file.

    The single positional command-line argument names the input file, or
    ``-`` to read standard input.  Lines starting with ``#`` are skipped;
    every other line is parsed with ``genepred_line_to_entry`` and, for each
    exon, a line with chrom, exon start, exon end, gene name, entry name and
    the zero-based exon index is written to standard output.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('input', help='FILENAME input genepred, use - for STDIN')
    args = parser.parse_args()

    inf = sys.stdin if args.input == '-' else open(args.input)
    try:
        for line in inf:
            if line.startswith('#'):
                continue
            e = genepred_line_to_entry(line)
            exons = zip(e['exonStarts'], e['exonEnds'])
            for i, (start, end) in enumerate(exons):
                # print() with a single argument behaves the same on
                # Python 2 and Python 3.
                print('\t'.join([e['chrom'], str(start), str(end),
                                 e['gene_name'], e['name'], str(i)]))
    finally:
        # Close the input even if parsing fails part-way through.
        inf.close()
def main():
    """Emit a tab-separated line for every exon found in the input.

    Takes one positional command-line argument: the input file name, or
    ``-`` for standard input.  Lines beginning with ``#`` are ignored.  Each
    remaining line is parsed with ``genepred_line_to_entry``; for every exon
    the chrom, exon start, exon end, gene name, entry name and zero-based
    exon index are printed to standard output, separated by tabs.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        'input', help='FILENAME input genepred, use - for STDIN')
    args = parser.parse_args()

    if args.input != '-':
        inf = open(args.input)
    else:
        inf = sys.stdin

    try:
        for line in inf:
            if line.startswith('#'):
                continue
            e = genepred_line_to_entry(line)
            for i, (start, end) in enumerate(
                    zip(e['exonStarts'], e['exonEnds'])):
                fields = [e['chrom'], str(start), str(end),
                          e['gene_name'], e['name'], str(i)]
                # A one-argument print() call is valid on both Python 2 and
                # Python 3, unlike the bare print statement.
                print('\t'.join(fields))
    finally:
        # Runs on errors too, so the input handle is never leaked.
        inf.close()
def get_lengths(args, tdir):
    """Return the total merged-exon length attributed to each gene.

    Exons from every non-comment line of ``args.transcriptome_reference``
    (parsed with ``genepred_line_to_entry``) are streamed as 4-column BED
    records (chrom, start, end, gene name) into
    ``bedtools sort | bedtools merge -c 4 -o collapse``, whose output is
    stored in ``<tdir>/ref.bed``.  That file is then read back and the
    length of each merged interval is added once to every distinct gene
    listed in its fourth column.

    Args:
        args: object providing ``transcriptome_reference`` (path of the
            reference file to read) and ``max_exon_length`` (exons longer
            than this are skipped; a value <= 0 disables the filter).
        tdir: directory in which ``ref.bed`` is created; it must already
            exist.

    Returns:
        dict mapping gene name to the summed length of the merged intervals
        that contain at least one exon of that gene.

    Note:
        The exit status of the bedtools pipeline is not checked.
    """
    ref_bed = tdir + "/ref.bed"
    cmd = "bedtools sort -i - | bedtools merge -i - -c 4 -o collapse"
    max_len = args.max_exon_length
    filter_long_exons = max_len > 0  # loop-invariant, evaluated once

    with open(ref_bed, "w") as of:
        # Text-mode stdin (universal_newlines=True) lets the str records be
        # written on Python 3, where the default pipe only accepts bytes.
        p = subprocess.Popen(
            cmd,
            shell=True,
            stdin=subprocess.PIPE,
            stdout=of,
            universal_newlines=True,
        )
        try:
            with open(args.transcriptome_reference) as inf:
                for line in inf:
                    if line.startswith("#"):
                        continue
                    e = genepred_line_to_entry(line)
                    for start, end in zip(e["exonStarts"], e["exonEnds"]):
                        # Don't consider exons that are too long.
                        if filter_long_exons and end - start > max_len:
                            continue
                        record = [e["chrom"], str(start), str(end), e["gene_name"]]
                        p.stdin.write("\t".join(record) + "\n")
        finally:
            # Close the pipe and wait for the child on every path so neither
            # the process nor the output handle is leaked on error.
            p.communicate()

    lengths = {}
    with open(ref_bed) as inf:
        for line in inf:
            f = line.rstrip().split("\t")
            elen = int(f[2]) - int(f[1])
            # "-o collapse" lists a gene once per overlapping exon record;
            # count each merged interval only once per gene.  The manual
            # de-duplication keeps first-seen order for a deterministic dict.
            counted = set()
            for gene in f[3].split(","):
                if gene in counted:
                    continue
                counted.add(gene)
                lengths[gene] = lengths.get(gene, 0) + elen
    return lengths