def test_get_overlapping_models(db, bedfile):
    """Check how many overlapping exon pairs are reported for the test data.

    Every transcript read from the ``bedfile`` fixture is added to the ``db``
    fixture under the name ``test:::<transcript name>``. The exons that
    ``db.get_exons("JGIv7b.000000226")`` returns are then passed to
    ``get_overlapping_models``, which is expected to yield 72 entries.
    """
    from pita.io import read_bed_transcripts
    from pita.util import get_overlapping_models

    # Keep the file open for the whole loop (the reader may be lazy) and make
    # sure the handle is closed afterwards instead of leaking it.
    with open(bedfile) as handle:
        for tname, source, tx_exons in read_bed_transcripts(handle, "test", 0):
            db.add_transcript("test:::{0}".format(tname), source, tx_exons)

    exons = db.get_exons("JGIv7b.000000226")
    assert 72 == len(get_overlapping_models(exons))
def get_chrom_models(conn, chrom, weight, repeats=None, prune=None, keep=None,
                     filter_ev=None, experimental=None):
    """Call the best transcript models on one chromosome, optionally pruned.

    Args:
        conn: Connection handed to ``AnnotationDb(conn=conn)``.
        chrom: Chromosome name; used for filtering, model selection and
            log messages.
        weight: Weight specification passed to ``DbCollection`` and to
            ``get_best_variants``.
        repeats: Optional iterable; each item is passed to
            ``db.filter_repeats(chrom, item)``.
        prune: Optional pruning configuration. When truthy it must provide
            ``prune["overlap"]["fraction"]`` and
            ``prune["overlap"]["weights"]``; it is also forwarded to
            ``DbCollection``.
        keep: Accepted for interface compatibility; not used here.
        filter_ev: Optional iterable; each item is passed to
            ``db.filter_evidence(chrom, item, experimental)``.
        experimental: Forwarded to ``db.filter_evidence``; defaults to ``[]``.

    Returns:
        A list of ``[name, flat_exons]`` pairs, one per model that was not
        discarded, where ``flat_exons`` holds ``exon.to_flat_exon()`` for
        every exon of the model. If anything fails, the error is logged and
        an empty list is returned.
    """
    if filter_ev is None:
        filter_ev = []
    if experimental is None:
        experimental = []

    logger = logging.getLogger("pita")
    logger.debug(str(weight))
    try:
        db = AnnotationDb(conn=conn)

        # Filter repeats
        if repeats:
            for x in repeats:
                db.filter_repeats(chrom, x)

        for ev in filter_ev:
            db.filter_evidence(chrom, ev, experimental)

        mc = DbCollection(db, weight, prune=prune, chrom=chrom)

        models = {}  # gene name -> [gene name, model]
        exons = {}   # str(exon) -> [exon, gene name]
        logger.info("Calling transcripts for %s", chrom)
        for model in mc.get_best_variants(weight):
            genename = "{0}:{1}-{2}_".format(
                model[0].chrom,
                model[0].start,
                model[-1].end,
            )
            logger.info("Best model: %s with %s exons", genename, len(model))
            models[genename] = [genename, model]
            for exon in model:
                exons[str(exon)] = [exon, genename]

        discard = set()
        if prune:
            logger.debug("Prune: %s", prune)
            overlap = get_overlapping_models([x[0] for x in exons.values()])
            if len(overlap) > 1:
                logger.info("%s overlapping exons", len(overlap))

            prune_overlap = prune["overlap"]["fraction"]
            for e1, e2 in overlap:
                gene1 = exons[str(e1)][1]
                gene2 = exons[str(e2)][1]
                if gene1 in discard or gene2 in discard:
                    continue

                m1 = models[gene1][1]
                m2 = models[gene2][1]

                # Order the two models by the start of their first exon.
                # A key function (instead of the Python 2-only ``cmp``
                # argument) gives the same stable order on Python 2 and 3.
                loc1, loc2 = sorted([m1, m2], key=lambda m: m[0].start)
                l1 = float(loc1[-1].end - loc1[0].start)
                l2 = float(loc2[-1].end - loc2[0].start)
                if loc2[-1].end > loc1[-1].end:
                    # Partial overlap: from the start of loc2 to the end of
                    # loc1.
                    overlap_len = float(loc1[-1].end - loc2[0].start)
                else:
                    # loc2 ends within loc1, so its full span counts.
                    overlap_len = l2

                if (overlap_len / l1 < prune_overlap
                        and overlap_len / l2 < prune_overlap):
                    logger.debug(
                        "Not pruning because fraction of overlap is too small!"
                    )
                    continue

                w1 = 0.0
                w2 = 0.0
                for d in prune["overlap"]["weights"]:
                    logger.debug("Pruning overlap: %s", d)
                    tmp_w1 = -mc.get_weight(m1)
                    tmp_w2 = -mc.get_weight(m2)
                    m = max((tmp_w1, tmp_w2))
                    # Only normalise when the larger value is positive.
                    if m > 0:
                        w1 += tmp_w1 / m
                        w2 += tmp_w2 / m

                # Ties are resolved in favour of gene1.
                if w1 >= w2:
                    logger.info("Discarding %s", gene2)
                    discard.add(gene2)
                else:
                    logger.info("Discarding %s", gene1)
                    discard.add(gene1)

        logger.info("Done calling transcripts for %s", chrom)

        result = [v for name, v in models.items() if name not in discard]
        return [
            [name, [e.to_flat_exon() for e in model]]
            for name, model in result
        ]
    except Exception:
        # Best effort per chromosome: log the failure and report no models.
        # KeyboardInterrupt and SystemExit are deliberately not swallowed.
        logger.exception("Error on %s", chrom)

    return []
def get_chrom_models(conn, chrom, weight, repeats=None, prune=None, keep=None,
                     filter_ev=None, experimental=None):
    """Call the best transcript models on one chromosome, optionally pruned.

    Args:
        conn: Connection handed to ``AnnotationDb(conn=conn)``.
        chrom: Chromosome name; used for filtering, model selection and
            log messages.
        weight: Weight specification passed to ``DbCollection`` and to
            ``get_best_variants``.
        repeats: Optional iterable; each item is passed to
            ``db.filter_repeats(chrom, item)``.
        prune: Optional pruning configuration. When truthy it must provide
            ``prune["overlap"]["fraction"]`` and
            ``prune["overlap"]["weights"]``; it is also forwarded to
            ``DbCollection``.
        keep: Accepted for interface compatibility; not used here.
        filter_ev: Optional iterable; each item is passed to
            ``db.filter_evidence(chrom, item, experimental)``.
        experimental: Forwarded to ``db.filter_evidence``; defaults to ``[]``.

    Returns:
        A list of ``[name, flat_exons]`` pairs, one per model that was not
        discarded, where ``flat_exons`` holds ``exon.to_flat_exon()`` for
        every exon of the model. If anything fails, the error is logged and
        an empty list is returned.
    """
    if filter_ev is None:
        filter_ev = []
    if experimental is None:
        experimental = []

    logger = logging.getLogger("pita")
    logger.debug(str(weight))
    try:
        db = AnnotationDb(conn=conn)

        # Filter repeats
        if repeats:
            for x in repeats:
                db.filter_repeats(chrom, x)

        for ev in filter_ev:
            db.filter_evidence(chrom, ev, experimental)

        mc = DbCollection(db, weight, prune=prune, chrom=chrom)

        models = {}  # gene name -> [gene name, model]
        exons = {}   # str(exon) -> [exon, gene name]
        logger.info("Calling transcripts for %s", chrom)
        for model in mc.get_best_variants(weight):
            genename = "{0}:{1}-{2}_".format(
                model[0].chrom,
                model[0].start,
                model[-1].end,
            )
            logger.info("Best model: %s with %s exons", genename, len(model))
            models[genename] = [genename, model]
            for exon in model:
                exons[str(exon)] = [exon, genename]

        discard = set()
        if prune:
            logger.debug("Prune: %s", prune)
            overlap = get_overlapping_models([x[0] for x in exons.values()])
            if len(overlap) > 1:
                logger.info("%s overlapping exons", len(overlap))

            prune_overlap = prune["overlap"]["fraction"]
            for e1, e2 in overlap:
                gene1 = exons[str(e1)][1]
                gene2 = exons[str(e2)][1]
                if gene1 in discard or gene2 in discard:
                    continue

                m1 = models[gene1][1]
                m2 = models[gene2][1]

                # Order the two models by the start of their first exon.
                # A key function (instead of the Python 2-only ``cmp``
                # argument) gives the same stable order on Python 2 and 3.
                loc1, loc2 = sorted([m1, m2], key=lambda m: m[0].start)
                l1 = float(loc1[-1].end - loc1[0].start)
                l2 = float(loc2[-1].end - loc2[0].start)
                if loc2[-1].end > loc1[-1].end:
                    # Partial overlap: from the start of loc2 to the end of
                    # loc1.
                    overlap_len = float(loc1[-1].end - loc2[0].start)
                else:
                    # loc2 ends within loc1, so its full span counts.
                    overlap_len = l2

                if (overlap_len / l1 < prune_overlap
                        and overlap_len / l2 < prune_overlap):
                    logger.debug(
                        "Not pruning because fraction of overlap is too small!"
                    )
                    continue

                w1 = 0.0
                w2 = 0.0
                for d in prune["overlap"]["weights"]:
                    logger.debug("Pruning overlap: %s", d)
                    tmp_w1 = -mc.get_weight(m1)
                    tmp_w2 = -mc.get_weight(m2)
                    m = max((tmp_w1, tmp_w2))
                    # Only normalise when the larger value is positive.
                    if m > 0:
                        w1 += tmp_w1 / m
                        w2 += tmp_w2 / m

                # Ties are resolved in favour of gene1.
                if w1 >= w2:
                    logger.info("Discarding %s", gene2)
                    discard.add(gene2)
                else:
                    logger.info("Discarding %s", gene1)
                    discard.add(gene1)

        logger.info("Done calling transcripts for %s", chrom)

        result = [v for name, v in models.items() if name not in discard]
        return [
            [name, [e.to_flat_exon() for e in model]]
            for name, model in result
        ]
    except Exception:
        # Best effort per chromosome: log the failure and report no models.
        # KeyboardInterrupt and SystemExit are deliberately not swallowed.
        logger.exception("Error on %s", chrom)

    return []