Exemple #1
0
    def test_nearest(self):
        """Check knearest, upstream and downstream against the refGene table.

        Uses the genome handle stored on ``self.dba``.  All checks go through
        unittest assertion methods so they still run under ``python -O`` and
        do not rely on the deprecated ``assert_`` alias.
        """
        from cruzdb.models import Feature
        f = Feature()
        f.chrom = "chr1"
        f.txStart = 10
        f.txEnd = 61
        db = self.dba
        self.assertIsNotNone(db.refGene.first())
        self.assertIsNotNone(db.refGene)

        # the table can be passed as an object ...
        res = db.knearest(db.refGene, f, k=2)
        self.assertGreaterEqual(len(res), 2)

        f = db.refGene.first()
        key = (f.chrom, f.start, f.end, f.name)

        # ... or by name; a feature taken from the table must be found among
        # its own nearest neighbours.
        for k in (2, 4, 6):
            res = db.knearest("refGene", f, k=k)
            self.assertGreaterEqual(len(res), k)
            self.assertIn(
                key,
                [(n.chrom, n.start, n.end, n.name) for n in res],
                (res, f))

        f = db.refGene.order_by(
            db.refGene.txStart).filter(db.refGene.c.strand == "+").first()
        self.assertIn(f, db.upstream(db.refGene, f))

        down = db.downstream(db.refGene, f, k=10)
        self.assertGreaterEqual(len(down), 10)

        # f is on the "+" strand, so nothing downstream may start before it
        self.assertTrue(all(d.start >= f.start for d in down))
Exemple #2
0
    def test_nearest(self):
        """Check knearest, upstream and downstream against the refGene table.

        Uses the genome handle stored on ``self.dba``.  Bare ``assert``
        statements (dropped under ``python -O``) and the deprecated
        ``assert_`` alias are replaced by specific unittest assertions.
        """
        from cruzdb.models import Feature
        f = Feature()
        f.chrom = "chr1"
        f.txStart = 10
        f.txEnd = 61
        db = self.dba
        self.assertIsNotNone(db.refGene.first())
        self.assertIsNotNone(db.refGene)

        # query with the table object
        res = db.knearest(db.refGene, f, k=2)
        self.assertGreaterEqual(len(res), 2)

        f = db.refGene.first()
        key = (f.chrom, f.start, f.end, f.name)

        # query with the table name; the feature itself must be among the hits
        for k in (2, 4, 6):
            res = db.knearest("refGene", f, k=k)
            self.assertGreaterEqual(len(res), k)
            self.assertIn(
                key,
                [(n.chrom, n.start, n.end, n.name) for n in res],
                (res, f))

        f = db.refGene.order_by(db.refGene.txStart).filter(
            db.refGene.c.strand == "+").first()
        self.assertIn(f, db.upstream(db.refGene, f))

        down = db.downstream(db.refGene, f, k=10)
        self.assertGreaterEqual(len(down), 10)

        # f is on the "+" strand, so nothing downstream may start before it
        self.assertTrue(all(d.start >= f.start for d in down))
Exemple #3
0
    def setUp(self):
        """Create the Feature fixture ``self.f`` and record its strand."""
        from cruzdb.models import Feature

        feat = Feature()
        self.f = feat

        # transcript span
        feat.chrom = "chr1"
        feat.txStart = 10
        feat.txEnd = 61

        # coding span
        feat.cdsStart = 29
        feat.cdsEnd = 59

        # Layout of the fixture:
        #
        # + exon
        # | coding-exon
        # _ UTR
        # - intron
        #
        # 10        20    26 29   34   39      47   52     59 61
        # ++++++++++______+++|||||-----||||||||-----|||||||+++
        #
        # introns = [(20, 26), (34, 39), (47, 52)]
        #
        # coding introns = [(34, 39), (47, 52)]
        feat.exonStarts = "10,26,39,52,"
        feat.exonEnds = "20,34,47,61,"

        self.strand = '+'
        feat.strand = '+'
Exemple #4
0
def annotate(g, fname, tables, feature_strand=False, in_memory=False,
        header=None, out=sys.stdout, _chrom=None, parallel=False):
    """
    annotate bed file in fname with tables.

    For every interval read from `fname`, the nearest feature (k=1) of each
    entry in `tables` is looked up and its name and distance column(s) are
    appended to the row; rows are written tab-delimited to `out`.

    distances are integers for distance. and intron/exon/utr5 etc for gene-pred
    tables. if the annotation features have a strand, the distance reported is
    negative if the annotation feature is upstream of the feature in question
    if feature_strand is True, then the distance is negative if the feature
    in question is upstream of the annotation feature instead.

    Arguments:
        g: a Genome-like object, or a string that is handed to Genome().
        fname: input passed to reader(); columns 1-3 are chrom, start, end.
        tables: table names (attributes of `g`) or query objects.
        feature_strand: see above.
        in_memory: load each table into an Intersecter and query that
            instead of calling g.knearest for every row.
        header: list of input column names; when empty, the first row is
            used as the header if its 2nd and 3rd columns are not digits.
        out: open file-like object, or a path which is opened for writing
            and closed before returning.
        _chrom: restrict the in-memory tables to this chromosome (set by
            the parallel branch for its per-chromosome calls).
        parallel: split the input with _split_chroms and annotate the
            pieces in a multiprocessing pool.

    Returns:
        out.name
    """
    close = False
    if isinstance(out, basestring):
        out = nopen(out, "w")
        close = True


    if parallel:
        import multiprocessing
        import signal
        p = multiprocessing.Pool(initializer=lambda:
                                signal.signal(signal.SIGINT, signal.SIG_IGN))
        chroms = _split_chroms(fname)

        # `written` is a deliberately shared mutable default: it remembers
        # across callbacks whether the header line has been emitted.
        def write_result(fanno, written=[False]):
            for i, d in enumerate(reader(fanno, header="ordered")):
                if i == 0 and not written[0]:
                    print >>out, "\t".join(d.keys())
                    written[0] = True
                print >>out, "\t".join(x if x else "NA" for x in d.values())
            os.unlink(fanno)
            os.unlink(fanno.replace(".anno", ""))

        # A string `g` has not been turned into a Genome yet, so it has no
        # `.db`; hand it to the workers unchanged (the recursive call
        # converts strings itself).
        db = g if isinstance(g, basestring) else g.db
        for fchrom, (fout, fanno) in chroms:
            p.apply_async(annotate, args=(db, fout.name, tables, feature_strand, True,
                                 header, fanno, fchrom),
                                 callback=write_result)
        p.close()
        p.join()
        # all callbacks have run once the pool is joined; release the handle
        # if this call opened it.
        if close:
            out.close()
        return out.name

    if isinstance(g, basestring):
        from . import Genome
        g = Genome(g)
    if in_memory:
        from . intersecter import Intersecter
        intersecters = [] # 1 per table.
        for t in tables:
            q = getattr(g, t) if isinstance(t, basestring) else t
            if _chrom is not None:
                q = q.filter_by(chrom=_chrom)
            table_iter = q #page_query(q, g.session)
            intersecters.append(Intersecter(table_iter))

    elif isinstance(fname, basestring) and os.path.exists(fname) \
            and sum(1 for _ in nopen(fname)) > 25000:
        print >>sys.stderr, "annotating many intervals, may be faster using in_memory=True"
    if header is None:
        header = []
    extra_header = []
    # number of annotation columns each table adds to the header; used to
    # pad rows for which a table has no neighbour.
    anno_widths = []
    for j, toks in enumerate(reader(fname, header=False)):
        if j == 0 and not header:
            if not (toks[1] + toks[2]).isdigit():
                header = toks
        if j == 0:
            for t in tables:
                annos = (getattr(g, t) if isinstance(t, basestring) else t).first().anno_cols
                h = t if isinstance(t, basestring) else t._table.name if hasattr(t, "_table") else t.first()._table.name
                cols = ["%s_%s" % (h, a) for a in annos]
                anno_widths.append(len(cols))
                extra_header += cols

            if 0 != len(header):
                if not header[0].startswith("#"):
                    header[0] = "#" + header[0]
                print >>out, "\t".join(header + extra_header)
            if header == toks: continue

        if not isinstance(toks, ABase):
            f = Feature()
            f.chrom = toks[0]
            f.txStart = int(toks[1])
            f.txEnd = int(toks[2])
            try:
                f.strand = toks[header.index('strand')]
            except ValueError:
                # no 'strand' column in the header
                pass
        else:
            f = toks
            # for now, use the objects str to get the columns
            # might want to use getattr on the original cols

            toks = f.bed(*header).split("\t")
        sep = "^*^"
        for ti, tbl in enumerate(tables):
            if in_memory:
                objs = intersecters[ti].knearest(int(toks[1]), int(toks[2]), chrom=toks[0], k = 1)
            else:
                objs = g.knearest(tbl, toks[0], int(toks[1]), int(toks[2]), k=1)
            if len(objs) == 0:
                # Pad this table's columns and carry on.  Printing here as
                # well as after the loop would emit the row twice.
                toks.extend([""] * anno_widths[ti])
                continue

            gp = hasattr(objs[0], "exonStarts")
            names = [o.gene_name for o in objs]
            if feature_strand:
                strands = [-1 if f.is_upstream_of(o) else 1 for o in objs]
            else:
                strands = [-1 if o.is_upstream_of(f) else 1 for o in objs]

            # dists can be a list of tuples where the 2nd item is something
            # like 'island' or 'shore'
            dists = [o.distance(f, features=gp) for o in objs]
            pure_dists = [d[0] if isinstance(d, (tuple, list)) else d for d in dists]

            # convert to negative if the feature is upstream of the query
            for i, s in enumerate(strands):
                if s == 1: continue
                if isinstance(pure_dists[i], basestring): continue
                pure_dists[i] *= -1

            # Write the signed distance back for every entry.  Scalar
            # distances used to be skipped here, which discarded the sign
            # computed above.
            for i, (pd, d) in enumerate(zip(pure_dists, dists)):
                if isinstance(d, (tuple, list)) and len(d) > 1:
                    dists[i] = "%s%s%s" % (pd, sep, sep.join(d[1:]))
                else:
                    dists[i] = pd
            # keep uniqe name, dist combinations (occurs because of
            # transcripts)
            name_dists = set(["%s%s%s" % (n, sep, d) \
                            for (n, d) in zip(names, dists)])
            name_dists = [nd.split(sep) for nd in name_dists]

            # just take the first gene name if they are all the same
            if len(set(nd[0] for nd in name_dists)) == 1:
                toks.append(name_dists[0][0])
            else:
                toks.append(";".join(nd[0] for nd in name_dists))

            # iterate over the feat type, dist cols
            for i in range(1, len(name_dists[0])):

                toks.append(";".join(nd[i] for nd in name_dists))
        print >>out, "\t".join(toks)

    if close:
        out.close()
    return out.name
Exemple #5
0
def annotate(g,
             fname,
             tables,
             feature_strand=False,
             in_memory=False,
             header=None,
             out=sys.stdout,
             _chrom=None,
             parallel=False):
    """
    annotate bed file in fname with tables.

    For every interval read from `fname`, the nearest feature (k=1) of each
    entry in `tables` is looked up and its name and distance column(s) are
    appended to the row; rows are written tab-delimited to `out`.

    distances are integers for distance. and intron/exon/utr5 etc for gene-pred
    tables. if the annotation features have a strand, the distance reported is
    negative if the annotation feature is upstream of the feature in question
    if feature_strand is True, then the distance is negative if the feature
    in question is upstream of the annotation feature instead.

    Arguments:
        g: a Genome-like object, or a string that is handed to Genome().
        fname: input passed to reader(); columns 1-3 are chrom, start, end.
        tables: table names (attributes of `g`) or query objects.
        feature_strand: see above.
        in_memory: load each table into an Intersecter and query that
            instead of calling g.knearest for every row.
        header: list of input column names; when empty, the first row is
            used as the header if its 2nd and 3rd columns are not digits.
        out: open file-like object, or a path which is opened for writing
            and closed before returning.
        _chrom: restrict the in-memory tables to this chromosome (set by
            the parallel branch for its per-chromosome calls).
        parallel: split the input with _split_chroms and annotate the
            pieces in a multiprocessing pool.

    Returns:
        out.name
    """
    close = False
    if isinstance(out, basestring):
        out = nopen(out, "w")
        close = True

    if parallel:
        import multiprocessing
        import signal
        p = multiprocessing.Pool(
            initializer=lambda: signal.signal(signal.SIGINT, signal.SIG_IGN))
        chroms = _split_chroms(fname)

        # `written` is a deliberately shared mutable default: it remembers
        # across callbacks whether the header line has been emitted.
        def write_result(fanno, written=[False]):
            for i, d in enumerate(reader(fanno, header="ordered")):
                if i == 0 and not written[0]:
                    print >> out, "\t".join(d.keys())
                    written[0] = True
                print >> out, "\t".join(d.values())
            os.unlink(fanno)
            os.unlink(fanno.replace(".anno", ""))

        # A string `g` has not been turned into a Genome yet, so it has no
        # `.db`; hand it to the workers unchanged (the recursive call
        # converts strings itself).
        db = g if isinstance(g, basestring) else g.db
        for fchrom, (fout, fanno) in chroms:
            p.apply_async(annotate,
                          args=(db, fout.name, tables, feature_strand, True,
                                header, fanno, fchrom),
                          callback=write_result)
        p.close()
        p.join()
        # all callbacks have run once the pool is joined; release the handle
        # if this call opened it.
        if close:
            out.close()
        return out.name

    if isinstance(g, basestring):
        from . import Genome
        g = Genome(g)
    if in_memory:
        from .intersecter import Intersecter
        intersecters = []  # 1 per table.
        for t in tables:
            q = getattr(g, t) if isinstance(t, basestring) else t
            if _chrom is not None:
                q = q.filter_by(chrom=_chrom)
            table_iter = q  #page_query(q, g.session)
            intersecters.append(Intersecter(table_iter))

    elif isinstance(fname, basestring) and os.path.exists(fname) \
            and sum(1 for _ in nopen(fname)) > 25000:
        print >> sys.stderr, "annotating many intervals, may be faster using in_memory=True"
    if header is None:
        header = []
    extra_header = []
    # number of annotation columns each table adds to the header; used to
    # pad rows for which a table has no neighbour.
    anno_widths = []
    for j, toks in enumerate(reader(fname, header=False)):
        if j == 0 and not header:
            if not (toks[1] + toks[2]).isdigit():
                header = toks
        if j == 0:
            for t in tables:
                annos = (getattr(g, t)
                         if isinstance(t, basestring) else t).first().anno_cols
                if isinstance(t, basestring):
                    h = t
                elif hasattr(t, "_table"):
                    h = t._table.name
                else:
                    h = t.first()._table.name
                cols = ["%s_%s" % (h, a) for a in annos]
                anno_widths.append(len(cols))
                extra_header += cols

            if 0 != len(header):
                if not header[0].startswith("#"):
                    header[0] = "#" + header[0]
                print >> out, "\t".join(header + extra_header)
            if header == toks: continue

        if not isinstance(toks, ABase):
            f = Feature()
            f.chrom = toks[0]
            f.txStart = int(toks[1])
            f.txEnd = int(toks[2])
            try:
                f.strand = toks[header.index('strand')]
            except ValueError:
                # no 'strand' column in the header
                pass
        else:
            f = toks
            # for now, use the objects str to get the columns
            # might want to use getattr on the original cols

            toks = f.bed(*header).split("\t")
        sep = "^*^"
        for ti, tbl in enumerate(tables):
            if in_memory:
                objs = intersecters[ti].knearest(int(toks[1]),
                                                 int(toks[2]),
                                                 chrom=toks[0],
                                                 k=1)
            else:
                objs = g.knearest(tbl,
                                  toks[0],
                                  int(toks[1]),
                                  int(toks[2]),
                                  k=1)
            if len(objs) == 0:
                # Pad this table's columns and carry on.  Printing here as
                # well as after the loop would emit the row twice.
                toks.extend([""] * anno_widths[ti])
                continue

            gp = hasattr(objs[0], "exonStarts")
            names = [o.gene_name for o in objs]
            if feature_strand:
                strands = [-1 if f.is_upstream_of(o) else 1 for o in objs]
            else:
                strands = [-1 if o.is_upstream_of(f) else 1 for o in objs]

            # dists can be a list of tuples where the 2nd item is something
            # like 'island' or 'shore'
            dists = [o.distance(f, features=gp) for o in objs]
            pure_dists = [
                d[0] if isinstance(d, (tuple, list)) else d for d in dists
            ]

            # convert to negative if the feature is upstream of the query
            for i, s in enumerate(strands):
                if s == 1: continue
                if isinstance(pure_dists[i], basestring): continue
                pure_dists[i] *= -1

            # Write the signed distance back for every entry.  Scalar
            # distances used to be skipped here, which discarded the sign
            # computed above.
            for i, (pd, d) in enumerate(zip(pure_dists, dists)):
                if isinstance(d, (tuple, list)) and len(d) > 1:
                    dists[i] = "%s%s%s" % (pd, sep, sep.join(d[1:]))
                else:
                    dists[i] = pd
            # keep uniqe name, dist combinations (occurs because of
            # transcripts)
            name_dists = set(["%s%s%s" % (n, sep, d) \
                            for (n, d) in zip(names, dists)])
            name_dists = [nd.split(sep) for nd in name_dists]

            # iterate over the name, dist, feature cols
            for i in range(len(name_dists[0])):

                toks.append(";".join(nd[i] for nd in name_dists))
        print >> out, "\t".join(toks)

    if close:
        out.close()
    return out.name