def models_to_compare():
    from pita.io import read_bed_transcripts

    f1 = open("tests/data/annotation1.bed")
    f2 = open("tests/data/annotation2.bed")
    t1 = read_bed_transcripts(f1)
    t2 = read_bed_transcripts(f2)

    result = {}
    f = open("tests/data/ann1_vs_ann2.txt")
    f.readline()  # skip header
    for line in f.readlines():
        vals = line.strip().split("\t")
        result.setdefault(vals[0], {})
        result[vals[0]][vals[1]] = [float(x) for x in vals[2:]]
    return t1, t2, result
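# A minimal sketch (not from the original source) of consuming this fixture's
# output. It assumes the three floats in vals[2:] are the (sn, sp, ac) triple
# that compare_annotation_files() below unpacks; the loop body is illustrative.
t1, t2, result = models_to_compare()
for name2, matches in result.items():
    for name1, (sn, sp, ac) in matches.items():
        print("{}\t{}\t{:.3f}".format(name1, name2, ac))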
def test_long_exon_filter(db, t1, t2):
    from pita.dbcollection import DbCollection
    from pita.io import read_bed_transcripts

    for tname, source, exons in read_bed_transcripts(open(t1)):
        db.add_transcript("{0}{1}{2}".format("t1", "|", tname), source, exons)
    for tname, source, exons in read_bed_transcripts(open(t2)):
        db.add_transcript("{0}{1}{2}".format("t2", "|", tname), source, exons)

    c = DbCollection(db, [], chrom="chr1")
    c.filter_long(l=500, evidence=1)

    models = []
    for cluster in c.get_best_variants([]):
        models.append(cluster)

    assert [3, 5] == sorted([len(m) for m in models])
def test_long_exon_filter(db, t1, t2):
    from pita.dbcollection import DbCollection
    from pita.io import read_bed_transcripts

    for tname, source, exons in read_bed_transcripts(open(t1)):
        db.add_transcript("{0}{1}{2}".format("t1", "|", tname), source, exons)
    for tname, source, exons in read_bed_transcripts(open(t2)):
        db.add_transcript("{0}{1}{2}".format("t2", "|", tname), source, exons)

    c = DbCollection(db, chrom="chr1")
    c.filter_long(evidence=1)

    models = []
    for cluster in c.get_connected_models():
        for m in cluster:
            models.append(m)

    assert [1, 3, 5] == sorted([len(m) for m in models])
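# Both versions of this test rely on a `db` fixture that is not shown in this
# section. A hypothetical sketch of such a fixture, mirroring the
# AnnotationDb(conn=..., new=...) call in load_chrom_data() below; the
# in-memory SQLite URL is an assumption, not the project's actual setup.
import pytest

@pytest.fixture
def db():
    return AnnotationDb(conn="sqlite:///:memory:", new=True)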
def collection(db):
    from pita.dbcollection import DbCollection
    from pita.io import read_bed_transcripts

    bed = "tests/data/scaffold_54_genes.bed"
    for tname, source, exons in read_bed_transcripts(open(bed), "test", 0):
        db.add_transcript("{0}{1}{2}".format("test", ":::", tname), source, exons)

    mc = DbCollection(db)
    return mc
def test_get_overlapping_models(db, bedfile):
    from pita.io import read_bed_transcripts
    from pita.util import get_overlapping_models

    for tname, source, exons in read_bed_transcripts(open(bedfile), "test", 0):
        db.add_transcript("{0}{1}{2}".format("test", ":::", tname), source, exons)

    exons = db.get_exons("JGIv7b.000000226")
    assert 72 == len(get_overlapping_models(exons))
def collection(db):
    from pita.dbcollection import DbCollection
    from pita.io import read_bed_transcripts

    bed = "tests/data/scaffold_54_genes.bed"
    for tname, source, exons in read_bed_transcripts(open(bed), "test", 0):
        db.add_transcript("{0}{1}{2}".format("test", ":::", tname), source, exons)

    mc = DbCollection(db, [])
    return mc
def load_chrom_data(conn, new, chrom, anno_files, data, index=None):
    logger = logging.getLogger("pita")
    try:
        # Read annotation files
        db = AnnotationDb(index=index, conn=conn, new=new)
        logger.debug("%s %s", chrom, id(db.session))
        logger.info("Reading annotation for %s", chrom)
        for name, fname, tabix_file, ftype, min_exons in anno_files:
            logger.info("Reading annotation from %s", fname)
            tabixfile = pysam.Tabixfile(tabix_file)
            if chrom in tabixfile.contigs:
                fobj = TabixIteratorAsFile(tabixfile.fetch(chrom))
                if ftype == "bed":
                    it = read_bed_transcripts(fobj, fname,
                                              min_exons=min_exons, merge=10)
                elif ftype in ["gff", "gtf", "gff3"]:
                    it = read_gff_transcripts(fobj, fname,
                                              min_exons=min_exons, merge=10)
                for tname, source, exons in it:
                    db.add_transcript("{0}{1}{2}".format(name, SEP, tname),
                                      source, exons)
                del fobj
            tabixfile.close()
            del tabixfile

        logger.info("Loading data for %s", chrom)
        for name, fname, span, extend in data:
            if span == "splice":
                logger.info("Reading splice data %s from %s", name, fname)
                db.get_splice_statistics(chrom, fname, name)
            else:
                logger.info("Reading BAM data %s from %s", name, fname)
                db.get_read_statistics(chrom, fname, name=name, span=span,
                                       extend=extend, nreads=None)
    except Exception:
        logger.exception("Error on %s", chrom)
        raise
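# A sketch of how load_chrom_data() might be invoked. All file names, the
# connection string, and the "all" span value are placeholder assumptions;
# the tuple layouts simply mirror the unpacking inside the function.
anno_files = [
    # (name, fname, tabix_file, ftype, min_exons)
    ("ensembl", "ensembl.bed", "ensembl.bed.gz", "bed", 1),
]
data = [
    # (name, fname, span, extend)
    ("rnaseq", "sample.bam", "all", 0),
    ("splicing", "sample.splice.bed", "splice", 0),
]
load_chrom_data("sqlite:///pita.db", True, "chr1", anno_files, data)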
def test_variants(db, variant_track):
    from pita.dbcollection import DbCollection
    from pita.io import read_bed_transcripts
    from pita.util import model_to_bed

    for tname, source, exons in read_bed_transcripts(open(variant_track)):
        db.add_transcript("{0}{1}{2}".format("t1", "|", tname), source, exons)

    c = DbCollection(db)
    best_model = [m for m in c.get_connected_models()][0][0]
    cuts = [str(e) for e in c.get_node_cuts(best_model)]
    assert ["chr1:800+900", "chr1:1400+1500"] == cuts

    best_variant = c.get_best_variant(
        best_model, [{"weight": 1, "type": "length", "name": "length"}])
    s = [str(s) for s in best_variant]
    assert [
        "chr1:100+200", "chr1:400+700", "chr1:800+900", "chr1:1000+1300",
        "chr1:1400+1500", "chr1:1600+1900", "chr1:2000+2100",
    ] == s
def compare_annotation_files(fname1, fname2):
    file1 = open(fname1)
    file2 = open(fname2)
    data1 = read_bed_transcripts(file1)
    data2 = read_bed_transcripts(file2)
    result = compare_annotation(data1, data2)

    # Transcripts in annotation 2 with no overlapping transcript in annotation 1
    for t2, x, exons in data2:
        if t2 not in result or result[t2] == {}:
            print "{0}\t{1}\t{2}\t{3}".format(exons[0][0], exons[0][1],
                                              exons[-1][2], 1)

    # Cluster annotation-1 transcripts that are hit by the same
    # annotation-2 transcripts
    cluster = {}
    t2_cluster = {}
    for transcript2, d in result.items():
        for transcript1 in d.keys():
            if transcript1 not in cluster:
                cluster[transcript1] = d.keys()
            else:
                for t in d.keys():
                    if t not in cluster[transcript1]:
                        cluster[transcript1].append(t)
            for t1 in cluster[transcript1]:
                t2_cluster.setdefault(t1, []).append(transcript2)

    clusters = set([tuple(x) for x in cluster.values()])

    loc = {}
    for name, x, exons in data1:
        loc[name] = [exons[0][0], exons[0][1], exons[-1][2]]

    # Report one line per cluster: the mean over its transcripts of the
    # minimum (1 - sp) against all matching annotation-2 transcripts
    for cluster in clusters:
        t2s = set(t2_cluster[cluster[0]])
        min_acs = []
        for transcript1 in cluster:
            acs = []
            for transcript2 in t2s:
                if transcript1 in result[transcript2]:
                    sn, sp, ac = result[transcript2][transcript1]
                    acs.append(1 - sp)
            min_acs.append(np.min(acs))
        chrom = loc[cluster[0]][0]
        start = np.min([loc[t][1] for t in cluster])
        end = np.max([loc[t][2] for t in cluster])
        print "{0}\t{1}\t{2}\t{3}".format(chrom, start, end, np.mean(min_acs))

    # NOTE: execution stops here; the per-transcript report below is unreachable
    sys.exit()

    for k, v in result.items():
        if len(v.keys()) > 0:
            print "{0}\t{1}\t{2}".format(loc[k], k, 1 - np.max(v.values()))
        else:
            print "{0}\t{1}\t{2}".format(loc[k], k, 1.0)
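# A possible command-line entry point for this function (an assumption, not
# part of the original source); output goes to stdout as in the function body.
if __name__ == "__main__":
    import sys
    # hypothetical usage: python pita_compare.py annotation1.bed annotation2.bed
    compare_annotation_files(sys.argv[1], sys.argv[2])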
def call_utr(inbed, bamfiles, utr5=False, utr3=True):
    """
    Call 3' UTR for all genes in a BED12 file based on RNA-seq reads
    in BAM files.
    """
    # Load genes in BED file
    transcripts = read_bed_transcripts(open(inbed))

    # No genes
    if len(transcripts) == 0:
        return

    td = dict([(t[0].split("|")[1] + "_", t[2]) for t in transcripts])
    # NOTE: an earlier commented-out workaround for scaffold-style transcript
    # names ("trying to fix the scaffold struggles") was removed here.

    # Create a BED6 file with exons, used to determine UTR boundaries
    sys.stderr.write("Preparing temporary BED files\n")
    exonbed = NamedTemporaryFile(prefix="pita.", suffix=".bed")
    bed2exonbed(inbed, exonbed.name)

    # Determine boundaries using bedtools
    genes = pybedtools.BedTool(inbed)
    exons = pybedtools.BedTool(exonbed.name)

    tmp = NamedTemporaryFile(prefix="pita.", suffix=".bed")
    EXTEND = 10000
    sys.stderr.write("Determining gene boundaries by closest gene\n")
    for x in genes.closest(exons, D="a", io=True, iu=True):
        transcript = td[x[3]]

        # Extend to closest exon or EXTEND, whichever is closer
        extend = EXTEND
        if (int(x[-1]) >= 0) and (int(x[-1]) < extend):
            extend = int(x[-1])

        if transcript[0][-1] == "+":
            first = transcript[-1]
            first[2] += extend
        else:
            first = transcript[0]
            first[1] -= extend
            if first[1] < 0:
                first[1] = 0

        tmp.write("{}\t{}\t{}\t{}\t{}\t{}\n".format(
            first[0], first[1], first[2], x[3], 0, first[3]))
    tmp.flush()

    tmpsam = NamedTemporaryFile(prefix="pita.", suffix=".sam")
    tmpbam = NamedTemporaryFile(prefix="pita.")

    # Retrieve header from first BAM file
    sp.call("samtools view -H {} > {}".format(bamfiles[0], tmpsam.name),
            shell=True)

    # Filter all BAM files for the specific regions. This runs much faster
    # than running bedtools coverage on all individual BAM files.
    tmp_check = NamedTemporaryFile(prefix="pita.", suffix=".bam")
    cmd = "samtools view -L {} {} > {}"
    sys.stderr.write("Merging bam files\n")
    for bamfile in bamfiles:
        try:
            sp.check_call(cmd.format(tmp.name, bamfile, tmp_check.name),
                          shell=True)
            sp.call("cat {} >> {}".format(tmp_check.name, tmpsam.name),
                    shell=True)
        except sp.CalledProcessError as e:
            sys.stderr.write("Error in file {}, skipping:\n".format(bamfile))
            sys.stderr.write("{}\n".format(e))
    tmp_check.close()

    # Create sorted and indexed BAM
    cmd = "samtools view -Sb {} | samtools sort -m 4G - {}"
    sp.call(cmd.format(tmpsam.name, tmpbam.name), shell=True)
    sp.call("samtools index {}.bam".format(tmpbam.name), shell=True)

    # Close and remove temporary SAM file
    tmpsam.close()

    sys.stderr.write("Calculating coverage\n")
    cmd = "bedtools coverage -abam {} -b {} -d -split "
    p = sp.Popen(cmd.format(tmpbam.name + ".bam", tmp.name),
                 shell=True, stdout=sp.PIPE, bufsize=1)

    sys.stderr.write("Calling UTRs\n")
    data = []
    current = [None]
    utr = {}
    for line in iter(p.stdout.readline, b''):
        vals = line.strip().split("\t")
        if vals[3] != current[0]:
            if len(data) > 0:
                result = call_cpt(current[1], current[2], current[3],
                                  data, len(bamfiles))
                if result:
                    utr[current[0]] = result
            data = []
            current = [vals[3], int(vals[1]), int(vals[2]), vals[5]]
        data.append(int(vals[7]))

    if current[0]:
        result = call_cpt(current[1], current[2], current[3],
                          data, len(bamfiles))
        if result:
            utr[current[0]] = result

    for fname in [tmpbam.name + ".bam", tmpbam.name + ".bam.bai"]:
        if os.path.exists(fname):
            os.unlink(fname)
    tmpbam.close()
    tmp.close()

    return utr
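# A sketch of driving call_utr(); the input paths are placeholders, and the
# shape of the values in the returned dict depends on call_cpt(), which is
# not shown in this section.
utr = call_utr("genes.bed12", ["rep1.bam", "rep2.bam"])
if utr:
    for name, result in sorted(utr.items()):
        print("{}\t{}".format(name, result))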