Esempio n. 1
0
def find_synteny_region(query, sbed, data, window, cutoff, colinear=False):
    """
    Get all synteny blocks for a query, algorithm is single linkage
    anchors are a window centered on query

    Two categories of syntenic regions depending on what query is:
    (Syntelog): syntenic region is denoted by the syntelog
    (Gray gene): syntenic region is marked by the closest flanker

    Parameters:
    query    -- value compared against the flanker returned by get_flanker()
    sbed     -- container indexed by y position; entries expose `.seqid`
    data     -- iterable of (x, y) anchor pairs
    window   -- adjacent anchors (in y order) are linked when their y distance
                is strictly below this value and they share a seqid
    cutoff   -- regions scoring below this value are discarded
    colinear -- when True, keep only the longer of the longest increasing /
                longest decreasing chain of each group

    Returns a list of tuples
    (syntelog, far_syntelog, left, right, gray, orientation, score)
    sorted by decreasing score. `gray` is "S" when the flanker is the query
    itself, "F" when the query is flanked, "G" otherwise. `orientation` is
    "+" or "-"; it is "+" when `colinear` is False.
    """
    regions = []
    ysorted = sorted(data, key=lambda x: x[1])
    g = Grouper()

    # Single linkage: walk consecutive pairs of y-sorted anchors
    a, b = tee(ysorted)
    next(b, None)
    for ia, ib in zip(a, b):
        pos1, pos2 = ia[1], ib[1]
        if pos2 - pos1 < window and sbed[pos1].seqid == sbed[pos2].seqid:
            g.join(ia, ib)

    for group in sorted(g):
        (qflanker, syntelog), (far_flanker, far_syntelog), flanked = \
            get_flanker(group, query)

        # Default orientation; without it the non-colinear path would hit an
        # unbound local when building the region tuple below.
        orientation = "+"

        # run a mini-dagchainer here, take the direction that gives us most anchors
        if colinear:
            y_indexed_group = [(y, i) for i, (x, y) in enumerate(group)]
            lis = longest_increasing_subsequence(y_indexed_group)
            lds = longest_decreasing_subsequence(y_indexed_group)

            if len(lis) >= len(lds):
                track = lis
            else:
                track = lds
                orientation = "-"

            group = [group[i] for (y, i) in track]

        # score is the number of distinct positions on the sparser axis
        xpos, ypos = zip(*group)
        score = min(len(set(xpos)), len(set(ypos)))

        if qflanker == query:
            gray = "S"
        else:
            gray = "G" if not flanked else "F"
            score -= 1  # slight penalty for not finding syntelog

        if score < cutoff:
            continue

        # y-boundary of the block
        left, right = group[0][1], group[-1][1]
        # this characterizes a syntenic region (left, right).
        # syntelog is -1 if it's a gray gene
        syn_region = (syntelog, far_syntelog, left, right, gray, orientation,
                      score)
        regions.append(syn_region)

    return sorted(regions, key=lambda x: -x[-1])  # decreasing synteny score
Esempio n. 2
0
def find_synteny_region(query, sbed, data, window, cutoff, colinear=False):
    """
    Get all synteny blocks for a query, algorithm is single linkage
    anchors are a window centered on query

    Two categories of syntenic regions depending on what query is:
    (Syntelog): syntenic region is denoted by the syntelog
    (Gray gene): syntenic region is marked by the closest flanker

    Parameters:
    query    -- value compared against the flanker returned by get_flanker()
    sbed     -- container indexed by y position; entries expose `.seqid`
    data     -- iterable of (x, y) anchor pairs
    window   -- adjacent anchors (in y order) are linked when their y distance
                is strictly below this value and they share a seqid
    cutoff   -- regions scoring below this value are discarded
    colinear -- when True, keep only the longer of the longest increasing /
                longest decreasing chain of each group

    Returns a list of tuples
    (syntelog, far_syntelog, left, right, gray, orientation, score)
    sorted by decreasing score. `gray` is "S" when the flanker is the query
    itself, "F" when the query is flanked, "G" otherwise. `orientation` is
    "+" or "-"; it is "+" when `colinear` is False.
    """
    regions = []
    ysorted = sorted(data, key=lambda x: x[1])
    g = Grouper()

    # Single linkage: walk consecutive pairs of y-sorted anchors.
    # The builtin zip is used so this runs on both Python 2 and Python 3.
    a, b = tee(ysorted)
    next(b, None)
    for ia, ib in zip(a, b):
        pos1, pos2 = ia[1], ib[1]
        if pos2 - pos1 < window and sbed[pos1].seqid == sbed[pos2].seqid:
            g.join(ia, ib)

    for group in sorted(g):
        (qflanker, syntelog), (far_flanker, far_syntelog), flanked = \
            get_flanker(group, query)

        # Default orientation; without it the non-colinear path would hit an
        # unbound local when building the region tuple below.
        orientation = "+"

        # run a mini-dagchainer here, take the direction that gives us most anchors
        if colinear:
            y_indexed_group = [(y, i) for i, (x, y) in enumerate(group)]
            lis = longest_increasing_subsequence(y_indexed_group)
            lds = longest_decreasing_subsequence(y_indexed_group)

            if len(lis) >= len(lds):
                track = lis
            else:
                track = lds
                orientation = "-"

            group = [group[i] for (y, i) in track]

        # score is the number of distinct positions on the sparser axis
        xpos, ypos = zip(*group)
        score = min(len(set(xpos)), len(set(ypos)))

        if qflanker == query:
            gray = "S"
        else:
            gray = "G" if not flanked else "F"
            score -= 1  # slight penalty for not finding syntelog

        if score < cutoff:
            continue

        # y-boundary of the block
        left, right = group[0][1], group[-1][1]
        # this characterizes a syntenic region (left, right).
        # syntelog is -1 if it's a gray gene
        syn_region = (syntelog, far_syntelog, left,
                      right, gray, orientation, score)
        regions.append(syn_region)

    return sorted(regions, key=lambda x: -x[-1])  # decreasing synteny score
Esempio n. 3
0
def main(arg):
    """
    Print the longest increasing and longest decreasing subsequences of the
    integer sequence stored in the file `arg`.

    The first line of the file holds the element count and the second line
    holds the whitespace-separated integers. Each result is printed on its
    own line with elements separated by single spaces.

    Raises AssertionError when the declared count does not match the number
    of integers read, and ValueError when a token is not an integer.
    """
    # The context manager closes the handle even if parsing fails.
    with open(arg) as fp:
        N = int(fp.readline().strip())
        a = [int(x) for x in fp.readline().split()]
    assert N == len(a)

    lis = longest_increasing_subsequence(a)
    lds = longest_decreasing_subsequence(a)
    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print(" ".join(str(x) for x in lis))
    print(" ".join(str(x) for x in lds))
Esempio n. 4
0
def renumber(args):
    """
    %prog renumber Mt35.consolidated.bed > tagged.bed

    Renumber genes for annotation updates.
    """
    # NOTE: the docstring above doubles as the OptionParser usage text, so it
    # is kept verbatim; implementation notes live in comments instead.
    #
    # Outline:
    #   1. Derive <prefix>.a.bed / <prefix>.b.bed from the input name and
    #      rebuild them through prepare() when need_update() says so.
    #   2. Group the ";"-separated accessions found in the .b.bed file.
    #   3. Per chromosome of the .a.bed file, tag every feature as FRAME,
    #      RETAIN or NEW, then print each feature with its tag, or with
    #      (best, OVERLAP) when another member of its group has priority.
    from jcvi.algorithms.lis import longest_increasing_subsequence
    from jcvi.utils.grouper import Grouper

    p = OptionParser(renumber.__doc__)
    p.set_annot_reformat_opts()

    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    (bedfile,) = args

    pf = bedfile.rsplit(".", 1)[0]
    abedfile = pf + ".a.bed"
    bbedfile = pf + ".b.bed"
    if need_update(bedfile, (abedfile, bbedfile)):
        prepare(bedfile)

    # Accessions joined by ";" in one .b.bed record belong to the same group
    mbed = Bed(bbedfile)
    g = Grouper()
    for s in mbed:
        g.join(*s.accn.split(";"))

    bed = Bed(abedfile)
    # `seqid` rather than `chr`, which would shadow the builtin
    for seqid, sbed in bed.sub_beds():
        current_chr = chr_number(seqid)
        if not current_chr:
            continue

        # Collect the ranks of accessions named after the current chromosome
        ranks = []
        gg = set()
        for s in sbed:
            accn = s.accn
            achr, arank = atg_name(accn)
            if achr != current_chr:
                continue
            ranks.append(arank)
            gg.add(accn)

        lranks = longest_increasing_subsequence(ranks)
        # Space-separated progress line on stderr; write() is used because it
        # works unchanged on Python 2 and Python 3.
        progress = (current_chr, len(sbed), "==>", len(ranks),
                    "==>", len(lranks))
        sys.stderr.write(" ".join(str(x) for x in progress) + "\n")

        # Names of the in-order ranks, with the default separator and "te"
        granks = set(gene_name(current_chr, x, prefix=opts.prefix,
                               pad0=opts.pad0, uc=opts.uc)
                     for x in lranks) | \
                 set(gene_name(current_chr, x, prefix=opts.prefix,
                               pad0=opts.pad0, sep="te", uc=opts.uc)
                     for x in lranks)

        tagstore = {}
        for s in sbed:
            accn = s.accn
            if accn in granks:
                tag = (accn, FRAME)
            elif accn in gg:
                tag = (accn, RETAIN)
            else:
                tag = (".", NEW)

            tagstore[accn] = tag

        # Find cases where genes overlap
        for s in sbed:
            accn = s.accn
            gaccn = g[accn]
            # Group members without a stored tag are treated as NEW
            tags = [((tagstore[x][-1] if x in tagstore else NEW), x)
                    for x in gaccn]
            # Lowest PRIORITY index wins; ties fall back to accession order
            group = [(PRIORITY.index(tag), x) for tag, x in tags]
            best = min(group)[-1]

            if accn != best:
                tag = (best, OVERLAP)
            else:
                tag = tagstore[accn]

            print("\t".join((str(s), "|".join(tag))))
Esempio n. 5
0
def renumber(args):
    """
    %prog renumber Mt35.consolidated.bed > tagged.bed

    Renumber genes for annotation updates.
    """
    # NOTE: the docstring above doubles as the OptionParser usage text, so it
    # is kept verbatim; implementation notes live in comments instead.
    #
    # Outline:
    #   1. Derive <prefix>.a.bed / <prefix>.b.bed from the input name and
    #      rebuild them through prepare() when need_update() says so.
    #   2. Group the ";"-separated accessions found in the .b.bed file.
    #   3. Per chromosome of the .a.bed file, tag every feature as FRAME,
    #      RETAIN or NEW, then print each feature with its tag, or with
    #      (best, OVERLAP) when another member of its group has priority.
    from jcvi.algorithms.lis import longest_increasing_subsequence
    from jcvi.utils.grouper import Grouper

    p = OptionParser(renumber.__doc__)
    p.set_annot_reformat_opts()

    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    (bedfile,) = args

    pf = bedfile.rsplit(".", 1)[0]
    abedfile = pf + ".a.bed"
    bbedfile = pf + ".b.bed"
    if need_update(bedfile, (abedfile, bbedfile)):
        prepare(bedfile)

    # Accessions joined by ";" in one .b.bed record belong to the same group
    mbed = Bed(bbedfile)
    g = Grouper()
    for s in mbed:
        g.join(*s.accn.split(";"))

    bed = Bed(abedfile)
    # `seqid` rather than `chr`, which would shadow the builtin
    for seqid, sbed in bed.sub_beds():
        current_chr = chr_number(seqid)
        if not current_chr:
            continue

        # Collect the ranks of accessions named after the current chromosome
        ranks = []
        gg = set()
        for s in sbed:
            accn = s.accn
            achr, arank = atg_name(accn)
            if achr != current_chr:
                continue
            ranks.append(arank)
            gg.add(accn)

        lranks = longest_increasing_subsequence(ranks)
        # Space-separated progress line on stderr; write() is used because it
        # works unchanged on Python 2 and Python 3.
        progress = (current_chr, len(sbed), "==>", len(ranks),
                    "==>", len(lranks))
        sys.stderr.write(" ".join(str(x) for x in progress) + "\n")

        # Names of the in-order ranks, with the default separator and "te"
        granks = set(gene_name(current_chr, x, prefix=opts.prefix,
                               pad0=opts.pad0, uc=opts.uc)
                     for x in lranks) | \
                 set(gene_name(current_chr, x, prefix=opts.prefix,
                               pad0=opts.pad0, sep="te", uc=opts.uc)
                     for x in lranks)

        tagstore = {}
        for s in sbed:
            accn = s.accn
            if accn in granks:
                tag = (accn, FRAME)
            elif accn in gg:
                tag = (accn, RETAIN)
            else:
                tag = (".", NEW)

            tagstore[accn] = tag

        # Find cases where genes overlap
        for s in sbed:
            accn = s.accn
            gaccn = g[accn]
            # Group members without a stored tag are treated as NEW
            tags = [((tagstore[x][-1] if x in tagstore else NEW), x)
                    for x in gaccn]
            # Lowest PRIORITY index wins; ties fall back to accession order
            group = [(PRIORITY.index(tag), x) for tag, x in tags]
            best = min(group)[-1]

            if accn != best:
                tag = (best, OVERLAP)
            else:
                tag = tagstore[accn]

            print("\t".join((str(s), "|".join(tag))))
Esempio n. 6
0
def test_longest_increasing_subsequence(input_array, expected):
    """
    Check that longest_increasing_subsequence(input_array) equals `expected`.

    NOTE(review): the arguments are presumably supplied by a parametrization
    that is not visible in this excerpt -- confirm against the full test file.
    """
    from jcvi.algorithms.lis import longest_increasing_subsequence

    result = longest_increasing_subsequence(input_array)
    assert result == expected