def test_get_overlapping_models(db, bedfile):
    """Check the number of overlapping exon pairs found on one scaffold.

    Loads every transcript from ``bedfile`` into ``db`` under the name
    ``test:::<transcript name>``, fetches the exons stored for
    ``JGIv7b.000000226`` and asserts that ``get_overlapping_models``
    returns 72 items for them.

    ``db`` and ``bedfile`` are presumably pytest fixtures (an annotation
    database and a path to a BED file) -- confirm against the conftest.
    """
    from pita.io import read_bed_transcripts
    from pita.util import get_overlapping_models

    # Keep the loop inside the ``with`` block: the reader may consume the
    # handle lazily, so the file has to stay open until iteration is done.
    with open(bedfile) as handle:
        for tname, source, exons in read_bed_transcripts(handle, "test", 0):
            db.add_transcript("test:::{0}".format(tname), source, exons)

    exons = db.get_exons("JGIv7b.000000226")

    assert 72 == len(get_overlapping_models(exons))
def test_get_overlapping_models(db, bedfile):
    """Verify that 72 overlapping exon pairs are reported for the test data.

    Every transcript read from ``bedfile`` is added to ``db`` with the name
    ``test:::<transcript name>``; the exons of ``JGIv7b.000000226`` are then
    passed to ``get_overlapping_models`` and the length of its result is
    compared with 72.

    NOTE(review): ``db`` and ``bedfile`` look like pytest fixtures defined
    elsewhere -- confirm.
    """
    from pita.io import read_bed_transcripts
    from pita.util import get_overlapping_models

    # The BED file is closed deterministically once all transcripts have
    # been loaded, instead of relying on garbage collection.
    with open(bedfile) as bed_handle:
        transcripts = read_bed_transcripts(bed_handle, "test", 0)
        for tname, source, exons in transcripts:
            db.add_transcript("test:::{0}".format(tname), source, exons)

    exons = db.get_exons("JGIv7b.000000226")

    assert 72 == len(get_overlapping_models(exons))
Example #3
0
def get_chrom_models(conn,
                     chrom,
                     weight,
                     repeats=None,
                     prune=None,
                     keep=None,
                     filter_ev=None,
                     experimental=None):
    """Call the best transcript models for one chromosome.

    Opens an ``AnnotationDb`` on ``conn``, applies the requested repeat and
    evidence filters, builds a ``DbCollection`` and collects the models
    yielded by ``get_best_variants(weight)``.  When ``prune`` is truthy,
    models whose exons overlap are compared pairwise and one model of each
    sufficiently overlapping pair is discarded.

    Args:
        conn: Passed to ``AnnotationDb(conn=conn)``.
        chrom: Chromosome name handed to the filters and the collection.
        weight: Passed to ``DbCollection`` and ``get_best_variants``.
        repeats: Optional iterable; each item is passed to
            ``db.filter_repeats(chrom, item)``.
        prune: Optional pruning configuration.  When truthy and an
            overlapping pair is evaluated, ``prune["overlap"]["fraction"]``
            and ``prune["overlap"]["weights"]`` are read.
        keep: Accepted for interface compatibility; not used here.
        filter_ev: Optional iterable of evidence names passed to
            ``db.filter_evidence``.
        experimental: Forwarded to ``db.filter_evidence``.

    Returns:
        A list of ``[name, flat_exons]`` pairs, where ``name`` has the form
        ``"<chrom>:<start>-<end>_"`` and ``flat_exons`` holds the result of
        ``to_flat_exon()`` for every exon of the model.  If any exception
        is raised it is logged and an empty list is returned.
    """
    if filter_ev is None:
        filter_ev = []
    if experimental is None:
        experimental = []

    logger = logging.getLogger("pita")
    logger.debug(str(weight))
    try:
        db = AnnotationDb(conn=conn)

        # Filter repeats
        if repeats:
            for x in repeats:
                db.filter_repeats(chrom, x)

        for ev in filter_ev:
            db.filter_evidence(chrom, ev, experimental)

        mc = DbCollection(db, weight, prune=prune, chrom=chrom)

        models = {}  # gene name -> [gene name, model]
        exons = {}   # str(exon) -> [exon, gene name]
        logger.info("Calling transcripts for %s", chrom)
        for model in mc.get_best_variants(weight):
            genename = "{0}:{1}-{2}_".format(
                model[0].chrom,
                model[0].start,
                model[-1].end,
            )

            logger.info("Best model: %s with %s exons", genename, len(model))
            models[genename] = [genename, model]

            for exon in model:
                exons[str(exon)] = [exon, genename]

        discard = set()
        if prune:
            logger.debug("Prune: %s", prune)
            overlapping = get_overlapping_models(
                [x[0] for x in exons.values()])
            if len(overlapping) > 1:
                logger.info("%s overlapping exons", len(overlapping))

            for e1, e2 in overlapping:
                gene1 = exons[str(e1)][1]
                gene2 = exons[str(e2)][1]
                # A pair is only evaluated while both genes are still kept.
                if gene1 in discard or gene2 in discard:
                    continue

                m1 = models[gene1][1]
                m2 = models[gene2][1]

                # Order the two models by the start of their first exon.
                # ``key=`` works on Python 2 and 3; the former ``cmp=``
                # argument raises TypeError on Python 3.
                loc1, loc2 = sorted([m1, m2], key=lambda mdl: mdl[0].start)
                l1 = float(loc1[-1].end - loc1[0].start)
                l2 = float(loc2[-1].end - loc2[0].start)
                if loc2[-1].end > loc1[-1].end:
                    # Partial overlap: from the start of the second model
                    # to the end of the first.
                    shared = float(loc1[-1].end - loc2[0].start)
                else:
                    # The second model ends within the span of the first.
                    shared = l2

                prune_overlap = prune["overlap"]["fraction"]
                if shared / l1 < prune_overlap and shared / l2 < prune_overlap:
                    logger.debug(
                        "Not pruning because fraction of overlap is too small!"
                    )
                    continue

                w1 = 0.0
                w2 = 0.0
                # NOTE(review): ``d`` is only logged; the weights computed
                # below do not depend on it, so every iteration adds the
                # same contribution.  Confirm whether ``d`` was meant to be
                # passed to ``get_weight``.
                for d in prune["overlap"]["weights"]:
                    logger.debug("Pruning overlap: %s", d)
                    tmp_w1 = -mc.get_weight(m1)
                    tmp_w2 = -mc.get_weight(m2)
                    m = max((tmp_w1, tmp_w2))
                    if m > 0:
                        w1 += tmp_w1 / m
                        w2 += tmp_w2 / m

                # Ties go to the first gene of the pair.
                if w1 >= w2:
                    logger.info("Discarding %s", gene2)
                    discard.add(gene2)
                else:
                    logger.info("Discarding %s", gene1)
                    discard.add(gene1)

        logger.info("Done calling transcripts for %s", chrom)
        return [[name, [e.to_flat_exon() for e in model]]
                for name, model in models.values()
                if name not in discard]

    except Exception:
        # Best effort per chromosome: log the failure and return nothing,
        # but let KeyboardInterrupt / SystemExit propagate.
        logger.exception("Error on %s", chrom)

    return []
Example #4
0
def get_chrom_models(conn, chrom, weight, repeats=None, prune=None, keep=None, filter_ev=None, experimental=None):
    """Return the best transcript models for ``chrom`` as flat exon lists.

    An ``AnnotationDb`` is created from ``conn``; repeat and evidence
    filters are applied to it and a ``DbCollection`` is built on top.  Each
    model from ``get_best_variants(weight)`` is named
    ``"<chrom>:<start>-<end>_"``.  If ``prune`` is truthy, pairs of models
    with overlapping exons are compared and one model per sufficiently
    overlapping pair is dropped.

    Args:
        conn: Passed to ``AnnotationDb(conn=conn)``.
        chrom: Chromosome name used for filtering and model calling.
        weight: Passed to ``DbCollection`` and ``get_best_variants``.
        repeats: Optional iterable of items for ``db.filter_repeats``.
        prune: Optional pruning configuration; ``prune["overlap"]`` must
            provide ``"fraction"`` and ``"weights"`` once an overlapping
            pair is evaluated.
        keep: Unused; retained so existing callers keep working.
        filter_ev: Optional iterable of evidence names for
            ``db.filter_evidence``.
        experimental: Forwarded to ``db.filter_evidence``.

    Returns:
        A list of ``[name, [flat_exon, ...]]`` pairs for the models that
        were not discarded, or ``[]`` if an exception was raised (the
        exception is logged on the ``"pita"`` logger).
    """
    if filter_ev is None:
        filter_ev = []
    if experimental is None:
        experimental = []

    logger = logging.getLogger("pita")
    logger.debug(str(weight))
    try:
        db = AnnotationDb(conn=conn)

        # Filter repeats
        if repeats:
            for x in repeats:
                db.filter_repeats(chrom, x)

        for ev in filter_ev:
            db.filter_evidence(chrom, ev, experimental)

        mc = DbCollection(db, weight, prune=prune, chrom=chrom)

        models = {}  # gene name -> [gene name, model]
        exons = {}   # str(exon) -> [exon, gene name]
        logger.info("Calling transcripts for %s", chrom)
        for model in mc.get_best_variants(weight):
            genename = "{0}:{1}-{2}_".format(
                model[0].chrom,
                model[0].start,
                model[-1].end,
            )

            logger.info("Best model: %s with %s exons", genename, len(model))
            models[genename] = [genename, model]

            for exon in model:
                exons[str(exon)] = [exon, genename]

        discard = set()
        if prune:
            logger.debug("Prune: %s", prune)
            exon_pairs = get_overlapping_models([x[0] for x in exons.values()])
            if len(exon_pairs) > 1:
                logger.info("%s overlapping exons", len(exon_pairs))

            for e1, e2 in exon_pairs:
                gene1 = exons[str(e1)][1]
                gene2 = exons[str(e2)][1]
                # Skip pairs where one side has already been discarded.
                if gene1 in discard or gene2 in discard:
                    continue

                m1 = models[gene1][1]
                m2 = models[gene2][1]

                # Sort by the start of the first exon.  ``cmp=`` no longer
                # exists on Python 3, ``key=`` is portable and keeps the
                # same (stable) ordering.
                loc1, loc2 = sorted([m1, m2], key=lambda mdl: mdl[0].start)
                l1 = float(loc1[-1].end - loc1[0].start)
                l2 = float(loc2[-1].end - loc2[0].start)
                if loc2[-1].end > loc1[-1].end:
                    # Second model extends past the first: shared span runs
                    # from the second start to the first end.
                    overlap_len = float(loc1[-1].end - loc2[0].start)
                else:
                    # Second model ends within the span of the first.
                    overlap_len = l2

                prune_overlap = prune["overlap"]["fraction"]
                if overlap_len / l1 < prune_overlap and overlap_len / l2 < prune_overlap:
                    logger.debug("Not pruning because fraction of overlap is too small!")
                    continue

                w1 = 0.0
                w2 = 0.0
                # NOTE(review): the loop variable ``d`` is logged but does
                # not influence the weights, so each iteration contributes
                # the same amount -- confirm this is intended.
                for d in prune["overlap"]["weights"]:
                    logger.debug("Pruning overlap: %s", d)
                    tmp_w1 = -mc.get_weight(m1)
                    tmp_w2 = -mc.get_weight(m2)
                    m = max((tmp_w1, tmp_w2))
                    if m > 0:
                        w1 += tmp_w1 / m
                        w2 += tmp_w2 / m

                # On a tie the second gene of the pair is discarded.
                if w1 >= w2:
                    logger.info("Discarding %s", gene2)
                    discard.add(gene2)
                else:
                    logger.info("Discarding %s", gene1)
                    discard.add(gene1)

        logger.info("Done calling transcripts for %s", chrom)
        return [[name, [e.to_flat_exon() for e in model]]
                for name, model in models.values() if name not in discard]

    except Exception:
        # Failures are logged and reported as "no models" for this
        # chromosome; KeyboardInterrupt / SystemExit are not swallowed.
        logger.exception("Error on %s", chrom)

    return []