Example 1
def group(args):
    """
    %prog group anchorfiles

    Group the anchors into ortho-groups. Can input multiple anchor files.
    """
    p = OptionParser(group.__doc__)
    p.set_outfile()

    opts, args = p.parse_args(args)

    if len(args) < 1:
        sys.exit(not p.print_help())

    anchorfiles = args
    groups = Grouper()
    for anchorfile in anchorfiles:
        ac = AnchorFile(anchorfile)
        for a, b, idx in ac.iter_pairs():
            groups.join(a, b)

    logging.debug("Created {0} groups with {1} members.".\
                  format(len(groups), groups.num_members))

    outfile = opts.outfile
    fw = must_open(outfile, "w")
    for g in groups:
        print >> fw, ",".join(sorted(g))
    fw.close()

    return outfile
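
All of these snippets revolve around jcvi.utils.grouper.Grouper, a disjoint-set (union-find) container. As a reading aid, here is a minimal sketch of the behavior the examples rely on; the method names (join, joined, keys, num_members, iteration over groups) mirror the usage on this page, but this is an illustration, not the actual jcvi implementation.

# A minimal union-find sketch of the Grouper behavior assumed by the
# examples on this page (illustration only, not the jcvi implementation).
class MiniGrouper(object):
    def __init__(self):
        self.mapping = {}  # item -> its group, a shared list

    def join(self, a, *args):
        group = self.mapping.setdefault(a, [a])
        for b in args:
            other = self.mapping.get(b)
            if other is None:
                group.append(b)
                self.mapping[b] = group
            elif other is not group:
                # merge the smaller group into the larger one
                if len(other) > len(group):
                    group, other = other, group
                group.extend(other)
                for item in other:
                    self.mapping[item] = group

    def joined(self, a, b):
        # True if a and b have been placed in the same group
        return self.mapping.get(a) is self.mapping.get(b) is not None

    def keys(self):
        return self.mapping.keys()  # all members seen so far

    def __getitem__(self, a):
        return self.mapping[a]  # the group containing a

    def __iter__(self):
        seen = set()
        for group in self.mapping.values():
            if id(group) not in seen:
                seen.add(id(group))
                yield group

    def __len__(self):
        return sum(1 for _ in self)  # number of groups

    @property
    def num_members(self):
        return len(self.mapping)

With this model in mind, group() above simply unions every anchor pair and prints each ortho-group as one comma-separated line.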
Example 2
def get_2D_overlap(chain, eclusters):
    """
    Implements a sweep line algorithm that runs faster than the naive
    O(n^2) pairwise comparison; each block has x_ends and y_ends for its bounds.

    1. sort x_ends, and take a sweep line to scan the x_ends
    2. if left end, test y-axis intersection of current block with `active` set;
       also put this block in the `active` set
    3. if right end, remove block from the `active` set
    """
    mergeables = Grouper()
    active = set()

    x_ends = []
    for i, (range_x, range_y, score) in enumerate(eclusters):
        chr, left, right = range_x
        x_ends.append((chr, left, 0, i))  # 0/1 for left/right-ness
        x_ends.append((chr, right, 1, i))
    x_ends.sort()

    chr_last = ""
    for chr, pos, left_right, i in x_ends:
        if chr != chr_last:
            active.clear()
        if left_right == 0:
            active.add(i)
            for x in active:
                # check y-overlap
                if range_overlap(eclusters[x][1], eclusters[i][1]):
                    mergeables.join(x, i)
        else:  # right end
            active.remove(i)

        chr_last = chr

    return mergeables
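
For intuition, a toy run of the sweep line above. range_overlap is a stand-in with assumed semantics (same chromosome and intersecting intervals), and the unused chain argument is passed as None.

def range_overlap(a, b):
    # stand-in with assumed semantics; each range is (chr, start, end)
    a_chr, a_min, a_max = a
    b_chr, b_min, b_max = b
    return a_chr == b_chr and a_min <= b_max and b_min <= a_max

# (range_x, range_y, score) per block; blocks 0 and 1 overlap on both
# axes, block 2 is far away on the x-axis
eclusters = [(("chr1", 100, 200), ("chr5", 1000, 1100), 50),
             (("chr1", 150, 250), ("chr5", 1050, 1150), 40),
             (("chr1", 300, 400), ("chr5", 2000, 2100), 30)]
mergeables = get_2D_overlap(None, eclusters)
# blocks 0 and 1 end up in one group; block 2 forms its own group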
Example 3
def find_synteny_region(query, sbed, data, window, cutoff, colinear=False):
    """
    Get all synteny blocks for a query. The algorithm is single linkage;
    anchors are a window centered on the query.

    Two categories of syntenic regions depending on what the query is:
    (Syntelog): syntenic region is denoted by the syntelog
    (Gray gene): syntenic region is marked by the closest flanker
    """
    regions = []
    ysorted = sorted(data, key=lambda x: x[1])
    g = Grouper()

    a, b = tee(ysorted)
    next(b, None)
    for ia, ib in izip(a, b):
        pos1, pos2 = ia[1], ib[1]
        if pos2 - pos1 < window and sbed[pos1].seqid == sbed[pos2].seqid:
            g.join(ia, ib)

    for group in sorted(g):
        (qflanker, syntelog), (far_flanker, far_syntelog), flanked = \
            get_flanker(group, query)

        # run a mini-dagchainer here, take the direction that gives us the most anchors
        orientation = "+"  # default; avoids a NameError when colinear=False
        if colinear:
            y_indexed_group = [(y, i) for i, (x, y) in enumerate(group)]
            lis = longest_increasing_subsequence(y_indexed_group)
            lds = longest_decreasing_subsequence(y_indexed_group)

            if len(lis) >= len(lds):
                track = lis
                orientation = "+"
            else:
                track = lds
                orientation = "-"

            group = [group[i] for (y, i) in track]

        xpos, ypos = zip(*group)
        score = min(len(set(xpos)), len(set(ypos)))

        if qflanker == query:
            gray = "S"
        else:
            gray = "G" if not flanked else "F"
            score -= 1  # slight penalty for not finding syntelog

        if score < cutoff:
            continue

        # y-boundary of the block
        left, right = group[0][1], group[-1][1]
        # this characterizes a syntenic region (left, right).
        # syntelog is -1 if it's a gray gene
        syn_region = (syntelog, far_syntelog, left,
                      right, gray, orientation, score)
        regions.append(syn_region)

    return sorted(regions, key=lambda x: -x[-1])  # decreasing synteny score
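
The tee()/next() dance at the top of find_synteny_region is the standard itertools pairwise recipe; a self-contained equivalent:

from itertools import izip, tee  # on Python 3, use the builtin zip

def pairwise(iterable):
    "s -> (s0, s1), (s1, s2), (s2, s3), ..."
    a, b = tee(iterable)
    next(b, None)  # advance the second iterator by one
    return izip(a, b)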
Example 4
File: quota.py Project: rrane/jcvi
def get_2D_overlap(chain, eclusters):
    """
    Implements a sweep line algorithm that runs faster than the naive
    O(n^2) pairwise comparison; each block has x_ends and y_ends for its bounds.

    1. sort x_ends, and take a sweep line to scan the x_ends
    2. if left end, test y-axis intersection of current block with `active` set;
       also put this block in the `active` set
    3. if right end, remove block from the `active` set
    """
    mergeables = Grouper()
    active = set()

    x_ends = []
    for i, (range_x, range_y, score) in enumerate(eclusters):
        chr, left, right = range_x
        x_ends.append((chr, left, 0, i))  # 0/1 for left/right-ness
        x_ends.append((chr, right, 1, i))
    x_ends.sort()

    chr_last = ""
    for chr, pos, left_right, i in x_ends:
        if chr != chr_last:
            active.clear()
        if left_right == 0:
            active.add(i)
            for x in active:
                # check y-overlap
                if range_overlap(eclusters[x][1], eclusters[i][1]):
                    mergeables.join(x, i)
        else: # right end
            active.remove(i)

        chr_last = chr

    return mergeables
Example 5
def group(args):
    """
    %prog group anchorfiles

    Group the anchors into ortho-groups. Can input multiple anchor files.
    """
    p = OptionParser(group.__doc__)
    opts, args = p.parse_args(args)

    if len(args) < 1:
        sys.exit(not p.print_help())

    anchorfiles = args
    groups = Grouper()
    for anchorfile in anchorfiles:
        ac = AnchorFile(anchorfile)
        for a, b in ac.iter_pairs():
            groups.join(a, b)

    ngroups = len(groups)
    nmembers = sum(len(x) for x in groups)
    logging.debug("Created {0} groups with {1} members.".\
                  format(ngroups, nmembers))

    for g in groups:
        print ",".join(sorted(g))
Example 6
def synteny_scan(points, xdist, ydist, N):
    """
    This is the core single linkage algorithm, which behaves in O(n) in
    practice: iterate through the sorted pairs; for each pair, look back
    on the adjacent pairs to find links (the look-back breaks early once
    xdist is exceeded).
    """
    clusters = Grouper()
    n = len(points)
    points.sort()
    for i in xrange(n):
        for j in xrange(i - 1, -1, -1):
            # x-axis distance
            del_x = points[i][0] - points[j][0]
            if del_x > xdist:
                break
            # y-axis distance
            del_y = points[i][1] - points[j][1]
            if abs(del_y) > ydist:
                continue
            # otherwise join
            clusters.join(points[i], points[j])

    # select clusters with score >= N
    clusters = [sorted(cluster) for cluster in list(clusters) \
            if _score(cluster) >= N]

    return clusters
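
A toy call to illustrate the look-back chaining; _score is not shown above, so a stand-in (number of distinct anchors per axis) is defined here:

def _score(cluster):
    # stand-in: min of distinct x and y coordinates in the cluster
    x, y = zip(*cluster)
    return min(len(set(x)), len(set(y)))

points = [(1, 1), (2, 3), (4, 2), (50, 60)]
clusters = synteny_scan(points, xdist=5, ydist=5, N=2)
# (1,1), (2,3) and (4,2) chain into one cluster, each lying within 5 of
# an earlier point on both axes; (50,60) never links to anything, so it
# does not appear in the output at all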
Example 7
File: cdt.py Project: Hensonmw/jcvi
    def iter_partitions(self, cutoff=.3, gtr=True):
        from jcvi.utils.grouper import Grouper

        if gtr:
            names = self.gnames
            fp = open(self.gtrfile)
        else:
            names = self.anames
            fp = open(self.atrfile)

        reader = csv.reader(fp, delimiter="\t")
        grouper = Grouper()
        for g in map(GTRLine._make, reader):
            d = float(g.dist)
            if d < cutoff:
                continue

            grouper.join(g.parent, g.left_child, g.right_child)

        parents = {}
        for i, group in enumerate(grouper):
            for g in group:
                parents[g] = i

        partitions = [[parents.get(a, x), x] for a, x in names]
        for key, parts in groupby(partitions, key=lambda x: x[0]):
            yield list(x[1] for x in parts)
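
GTRLine is not shown here; judging from the tab-delimited *.gtr/*.atr rows it is read from, it is presumably a namedtuple over the four columns:

from collections import namedtuple

# presumed layout: parent node, its two children, and the joining distance
GTRLine = namedtuple("GTRLine", "parent left_child right_child dist")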
Example 8
File: bed.py Project: radaniba/jcvi
def pile(args):
    """
    %prog pile abedfile bbedfile > piles

    Call intersectBed on two bedfiles.
    """
    from jcvi.utils.grouper import Grouper

    p = OptionParser(pile.__doc__)
    p.add_option("--minOverlap",
                 default=0,
                 type="int",
                 help="Minimum overlap required [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    abedfile, bbedfile = args
    iw = intersectBed_wao(abedfile, bbedfile, minOverlap=opts.minOverlap)
    groups = Grouper()
    for a, b in iw:
        groups.join(a.accn, b.accn)

    ngroups = 0
    for group in groups:
        if len(group) > 1:
            ngroups += 1
            print "|".join(group)

    logging.debug("A total of {0} piles (>= 2 members)".format(ngroups))
Example 9
def find_synteny_region(query, sbed, data, window, cutoff, colinear=False):
    """
    Get all synteny blocks for a query. The algorithm is single linkage;
    anchors are a window centered on the query.

    Two categories of syntenic regions depending on what the query is:
    (Syntelog): syntenic region is denoted by the syntelog
    (Gray gene): syntenic region is marked by the closest flanker
    """
    regions = []
    ysorted = sorted(data, key=lambda x: x[1])
    g = Grouper()

    a, b = tee(ysorted)
    next(b, None)
    for ia, ib in zip(a, b):
        pos1, pos2 = ia[1], ib[1]
        if pos2 - pos1 < window and sbed[pos1].seqid == sbed[pos2].seqid:
            g.join(ia, ib)

    for group in sorted(g):
        (qflanker, syntelog), (far_flanker, far_syntelog), flanked = \
            get_flanker(group, query)

        # run a mini-dagchainer here, take the direction that gives us the most anchors
        orientation = "+"  # default; avoids a NameError when colinear=False
        if colinear:
            y_indexed_group = [(y, i) for i, (x, y) in enumerate(group)]
            lis = longest_increasing_subsequence(y_indexed_group)
            lds = longest_decreasing_subsequence(y_indexed_group)

            if len(lis) >= len(lds):
                track = lis
                orientation = "+"
            else:
                track = lds
                orientation = "-"

            group = [group[i] for (y, i) in track]

        xpos, ypos = zip(*group)
        score = min(len(set(xpos)), len(set(ypos)))

        if qflanker == query:
            gray = "S"
        else:
            gray = "G" if not flanked else "F"
            score -= 1  # slight penalty for not finding syntelog

        if score < cutoff:
            continue

        # y-boundary of the block
        left, right = group[0][1], group[-1][1]
        # this characterizes a syntenic region (left, right).
        # syntelog is -1 if it's a gray gene
        syn_region = (syntelog, far_syntelog, left, right, gray, orientation,
                      score)
        regions.append(syn_region)

    return sorted(regions, key=lambda x: -x[-1])  # decreasing synteny score
Example 10
def pile(args):
    """
    %prog pile abedfile bbedfile > piles

    Call intersectBed on two bedfiles.
    """
    from jcvi.utils.grouper import Grouper

    p = OptionParser(pile.__doc__)
    p.add_option("--minOverlap", default=0, type="int",
                 help="Minimum overlap required [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    abedfile, bbedfile = args
    iw = intersectBed_wao(abedfile, bbedfile, minOverlap=opts.minOverlap)
    groups = Grouper()
    for a, b in iw:
        groups.join(a.accn, b.accn)

    ngroups = 0
    for group in groups:
        if len(group) > 1:
            ngroups += 1
            print "|".join(group)

    logging.debug("A total of {0} piles (>= 2 members)".format(ngroups))
Example 11
def fuse(args):
    """
    %prog fuse *.bed *.anchors

    Fuse gene orders based on anchors file.
    """
    from jcvi.algorithms.graph import BiGraph

    p = OptionParser(fuse.__doc__)
    opts, args = p.parse_args(args)

    if len(args) < 1:
        sys.exit(not p.print_help())

    bedfiles = [x for x in args if x.endswith(".bed")]
    anchorfiles = [x for x in args if x.endswith(".anchors")]

    # TODO: Use Markov clustering to sparsify the edges
    families = Grouper()
    for anchorfile in anchorfiles:
        af = AnchorFile(anchorfile)
        for a, b, block_id in af.iter_pairs():
            families.join(a, b)

    allowed = set(families.keys())
    logging.debug("Total families: {}, Gene members: {}"
                  .format(len(families), len(allowed)))

    # TODO: Use C++ implementation of BiGraph() when available
    # For now just serialize this to the disk
    G = BiGraph()
    for bedfile in bedfiles:
        bed = Bed(bedfile, include=allowed)
        #add_bed_to_graph(G, bed, families)
        print_edges(G, bed, families)
Example 12
def fuse(args):
    """
    %prog fuse *.bed *.anchors

    Fuse gene orders based on anchors file.
    """
    from jcvi.algorithms.graph import BiGraph

    p = OptionParser(fuse.__doc__)
    opts, args = p.parse_args(args)

    if len(args) < 1:
        sys.exit(not p.print_help())

    bedfiles = [x for x in args if x.endswith(".bed")]
    anchorfiles = [x for x in args if x.endswith(".anchors")]

    # TODO: Use Markov clustering to sparsify the edges
    families = Grouper()
    for anchorfile in anchorfiles:
        af = AnchorFile(anchorfile)
        for a, b, block_id in af.iter_pairs():
            families.join(a, b)

    allowed = set(families.keys())
    logging.debug("Total families: {}, Gene members: {}".format(
        len(families), len(allowed)))

    # TODO: Use C++ implementation of BiGraph() when available
    # For now just serialize this to the disk
    for bedfile in bedfiles:
        bed = Bed(bedfile, include=allowed)
        print_edges(bed, families)
Example 13
def athalianatruth(args):
    """
    %prog athalianatruth J_a.txt J_bc.txt

    Prepare pairs data for At alpha/beta/gamma.
    """
    p = OptionParser(athalianatruth.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    atxt, bctxt = args
    g = Grouper()
    pairs = set()
    for txt in (atxt, bctxt):
        extract_groups(g, pairs, txt)

    fw = open("pairs", "w")
    for pair in sorted(pairs):
        print("\t".join(pair), file=fw)
    fw.close()

    fw = open("groups", "w")
    for group in list(g):
        print(",".join(group), file=fw)
    fw.close()
Example 14
def main(blast_file, cds_file, bed_file, N=3):

    # get the sizes for the CDS first
    f = Fasta(cds_file)
    sizes = dict(f.itersizes())

    # retrieve the locations
    bed = Bed(bed_file).order

    # filter the blast file
    g = Grouper()
    fp = open(blast_file)
    for row in fp:
        b = BlastLine(row)
        query_len = sizes[b.query]
        subject_len = sizes[b.subject]
        if b.hitlen < min(query_len, subject_len) / 2:
            continue

        query, subject = gene_name(b.query), gene_name(b.subject)
        qi, q = bed[query]
        si, s = bed[subject]

        if q.seqid == s.seqid and abs(qi - si) <= N:
            g.join(query, subject)

    # dump the grouper
    ngenes, nfamilies = 0, 0
    families = []
    for group in sorted(g):
        if len(group) >= 2:
            print ",".join(sorted(group))
            ngenes += len(group)
            nfamilies += 1
            families.append(sorted(group))

    longest_family = max(families, key=lambda x: len(x))

    # generate reports
    print >> sys.stderr, "Proximal paralogues (dist=%d):" % N
    print >> sys.stderr, "Total %d genes in %d families" % (ngenes, nfamilies)
    print >> sys.stderr, "Longest families (%d): %s" % (
        len(longest_family), ",".join(longest_family))
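
BlastLine wraps one row of tabular BLAST output (-m8 / outfmt 6); a minimal stand-in with only the fields these examples touch:

class BlastLine(object):
    # columns: qseqid sseqid pident length mismatch gapopen
    #          qstart qend sstart send evalue bitscore
    def __init__(self, row):
        atoms = row.rstrip("\n").split("\t")
        self.query, self.subject = atoms[0], atoms[1]
        self.hitlen = int(atoms[3])  # alignment length
        self.qstart, self.qstop = int(atoms[6]), int(atoms[7])
        self.sstart, self.sstop = int(atoms[8]), int(atoms[9])
        self.evalue, self.score = float(atoms[10]), float(atoms[11])

gene_name() (also not shown) presumably strips the transcript suffix, e.g. Os01g0100100.1 -> Os01g0100100, so that isoforms collapse onto one gene.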
Example 15
def tandem_grouper(bed, blast_list, tandem_Nmax=10, flip=True):
    if not flip:
        simple_blast = [(b.query, (b.sseqid, b.si)) for b in blast_list if b.evalue < 1e-10]
    else:
        simple_blast = [(b.subject, (b.qseqid, b.qi)) for b in blast_list if b.evalue < 1e-10]

    simple_blast.sort()

    standems = Grouper()
    for name, hits in groupby(simple_blast, key=lambda x: x[0]):
        # these are already sorted.
        hits = [x[1] for x in hits]
        for ia, a in enumerate(hits[:-1]):
            b = hits[ia + 1]
            # on the same chr and rank difference no larger than tandem_Nmax
            if b[1] - a[1] <= tandem_Nmax and b[0] == a[0]:
                standems.join(a[1], b[1])

    return standems
Example 16
def main(blast_file, cds_file, bed_file, N=3):

    # get the sizes for the CDS first
    f = Fasta(cds_file)
    sizes = dict(f.itersizes())

    # retrieve the locations
    bed = Bed(bed_file).order

    # filter the blast file
    g = Grouper()
    fp = open(blast_file)
    for row in fp:
        b = BlastLine(row)
        query_len = sizes[b.query]
        subject_len = sizes[b.subject]
        if b.hitlen < min(query_len, subject_len) / 2:
            continue

        query, subject = gene_name(b.query), gene_name(b.subject)
        qi, q = bed[query]
        si, s = bed[subject]

        if q.seqid == s.seqid and abs(qi - si) <= N:
            g.join(query, subject)

    # dump the grouper
    ngenes, nfamilies = 0, 0
    families = []
    for group in sorted(g):
        if len(group) >= 2:
            print ",".join(sorted(group))
            ngenes += len(group)
            nfamilies += 1
            families.append(sorted(group))

    longest_family = max(families, key=lambda x: len(x))

    # generate reports
    print >>sys.stderr, "Proximal paralogues (dist=%d):" % N
    print >>sys.stderr, "Total %d genes in %d families" % (ngenes, nfamilies)
    print >>sys.stderr, "Longest families (%d): %s" % (len(longest_family),
        ",".join(longest_family))
Example 17
def tandem_grouper(bed, blast_list, tandem_Nmax=10, flip=True):
    if not flip:
        simple_blast = [(b.query, (b.sseqid, b.si)) \
                for b in blast_list if b.evalue < 1e-10]
    else:
        simple_blast = [(b.subject, (b.qseqid, b.qi)) \
                for b in blast_list if b.evalue < 1e-10]

    simple_blast.sort()

    standems = Grouper()
    for name, hits in groupby(simple_blast, key=lambda x: x[0]):
        # these are already sorted.
        hits = [x[1] for x in hits]
        for ia, a in enumerate(hits[:-1]):
            b = hits[ia + 1]
            # on the same chr and rank difference no larger than tandem_Nmax
            if b[1] - a[1] <= tandem_Nmax and b[0] == a[0]:
                standems.join(a[1], b[1])

    return standems
Example 18
def chain_HSPs(blastlines, xdist=100, ydist=100):
    """
    Take a list of BlastLines (or a BlastSlow instance), and return a list
    of chained BlastLines.
    """
    key = lambda x: (x.query, x.subject)
    blastlines.sort(key=key)

    clusters = Grouper()
    for qs, points in groupby(blastlines, key=key):
        points = sorted(list(points), \
                key=lambda x: (x.qstart, x.qstop, x.sstart, x.sstop))

        n = len(points)
        for i in xrange(n):
            a = points[i]
            clusters.join(a)
            for j in xrange(i + 1, n):
                b = points[j]
                if a.orientation != b.orientation:
                    continue

                # x-axis distance
                del_x = get_distance(a, b)
                if del_x > xdist:
                    continue
                # y-axis distance
                del_y = get_distance(a, b, xaxis=False)
                if del_y > ydist:
                    continue
                # otherwise join
                clusters.join(a, b)

    chained_hsps = []
    for c in clusters:
        chained_hsps.append(combine_HSPs(c))
    chained_hsps = sorted(chained_hsps, key=lambda x: -x.score)

    return chained_hsps
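
get_distance is not shown; a plausible sketch of the assumed behavior, measuring the gap between two HSPs along one axis (0 when they touch or overlap):

def get_distance(a, b, xaxis=True):
    if xaxis:
        start1, stop1, start2, stop2 = a.qstart, a.qstop, b.qstart, b.qstop
    else:
        start1, stop1, start2, stop2 = a.sstart, a.sstop, b.sstart, b.sstop
    # gap between the two intervals; 0 if they touch or overlap
    return max(start1 - stop2, start2 - stop1, 0)

Note the lone clusters.join(a) above: it registers every HSP as its own group first, so HSPs that never chain with anything still come out as singletons.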
Example 19
def fuse(args):
    """
    %prog fuse *.bed *.anchors

    Fuse gene orders based on anchors file.
    """
    p = OptionParser(fuse.__doc__)
    opts, args = p.parse_args(args)

    if len(args) < 1:
        sys.exit(not p.print_help())

    bedfiles = [x for x in args if x.endswith(".bed")]
    anchorfiles = [x for x in args if x.endswith(".anchors")]
    aligned_genes = Grouper()
    for anchorfile in anchorfiles:
        af = AnchorFile(anchorfile)
        for a, b, block_id in af.iter_pairs():
            aligned_genes.join(a, b)

    print list(aligned_genes)
    logging.debug("Total aligned genes: {}".format(len(aligned_genes)))
Example 20
def chain_HSPs(blast, xdist=100, ydist=100):
    """
    Take a list of BlastLines (or a BlastSlow instance), and return a list
    of chained BlastLines.
    """
    key = lambda x: (x.query, x.subject)
    blast.sort(key=key)

    clusters = Grouper()
    for qs, points in groupby(blast, key=key):
        points = sorted(list(points), \
                key=lambda x: (x.qstart, x.qstop, x.sstart, x.sstop))

        n = len(points)
        for i in xrange(n):
            a = points[i]
            clusters.join(a)
            for j in xrange(i + 1, n):
                b = points[j]

                # x-axis distance
                del_x = get_distance(a, b)
                if del_x > xdist:
                    break
                # y-axis distance
                del_y = get_distance(a, b, xaxis=False)
                if del_y > ydist:
                    continue
                # otherwise join
                clusters.join(a, b)

    chained_hsps = [combine_HSPs(x) for x in clusters]
    key = lambda x: (x.query, -x.score if x.has_score else 0)
    chained_hsps = sorted(chained_hsps, key=key)

    return chained_hsps
Example 21
def chain_HSPs(blast, xdist=100, ydist=100):
    """
    Take a list of BlastLines (or a BlastSlow instance), and return a list
    of chained BlastLines.
    """
    key = lambda x: (x.query, x.subject)
    blast.sort(key=key)

    clusters = Grouper()
    for qs, points in groupby(blast, key=key):
        points = sorted(
            list(points), key=lambda x: (x.qstart, x.qstop, x.sstart, x.sstop)
        )

        n = len(points)
        for i in range(n):
            a = points[i]
            clusters.join(a)
            for j in range(i + 1, n):
                b = points[j]

                # x-axis distance
                del_x = get_distance(a, b)
                if del_x > xdist:
                    break
                # y-axis distance
                del_y = get_distance(a, b, xaxis=False)
                if del_y > ydist:
                    continue
                # otherwise join
                clusters.join(a, b)

    chained_hsps = [combine_HSPs(x) for x in clusters]
    key = lambda x: (x.query, -x.score if x.has_score else 0)
    chained_hsps = sorted(chained_hsps, key=key)

    return chained_hsps
Example 22
def napus(args):
    """
    %prog napus napus.bed brapa.boleracea.i1.blocks diploid.napus.fractionation

    Extract napus gene loss vs diploid ancestors. We are looking specifically
    for anything that has the pattern:

        BR - BO    or     BR - BO
        |                       |
        AN                     CN

    Step 1: extract BR - BO syntenic pairs
    Step 2: get diploid gene retention patterns from BR or BO as query
    Step 3: check whether AN or CN is NS (non-syntenic) or NF (not found);
    specifically with NS, the NS location is actually the homeologous site.
    Step 4: categorize gene losses into singleton, or segmental (defined as
    consecutive losses with a maximum skip of 1)
    """
    from jcvi.utils.cbook import SummaryStats

    p = OptionParser(napus.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    napusbed, brbo, dpnp = args
    retention = {}
    fp = open(dpnp)
    for row in fp:
        seqid, query, hit = row.split()
        retention[query] = hit

    order = Bed(napusbed).order

    quartetsfile = "quartets"
    fp = open(brbo)
    fw = open(quartetsfile, "w")
    AL = "AN LOST"
    CL = "CN LOST"
    for row in fp:
        br, bo = row.split()
        if '.' in (br, bo):
            continue
        an, cn = retention[br], retention[bo]
        row = "\t".join((br, bo, an, cn))
        if '.' in (an, cn):
            #print row
            continue

        # label loss candidates
        antag, anrange = get_tag(an, order)
        cntag, cnrange = get_tag(cn, order)

        if range_overlap(anrange, cnrange):
            if (antag, cntag) == ("NS", None):
                row = row + "\t{0}|{1}".format(AL, br)
            if (antag, cntag) == (None, "NS"):
                row = row + "\t{0}|{1}".format(CL, bo)

        print >> fw, row
    fw.close()

    logging.debug("Quartets and gene losses written to `{0}`.".\
                    format(quartetsfile))

    # Parse the quartets file to extract singletons vs. segmental losses
    fp = open(quartetsfile)
    fw = open(quartetsfile + ".summary", "w")
    data = [x.rstrip().split("\t") for x in fp]
    skip = 1  # max distance between losses

    g = Grouper()
    losses = [(len(x) == 5) for x in data]
    for i, d in enumerate(losses):
        if not d:
            continue
        g.join(i, i)
        itag = data[i][-1].split("|")[0]
        for j in xrange(i + 1, i + skip + 1):
            if j >= len(losses) or not losses[j]:
                continue  # bounds check must come before indexing data[j]
            jtag = data[j][-1].split("|")[0]
            if itag == jtag:
                g.join(i, j)

    losses = list(g)
    singletons = [x for x in losses if len(x) == 1]
    segments = [x for x in losses if len(x) > 1]
    ns, nm = len(singletons), len(segments)
    assert len(losses) == ns + nm

    grab_tag = lambda pool, tag: \
            [x for x in pool if all(data[z][-1].startswith(tag) for z in x)]

    an_loss_singletons = grab_tag(singletons, AL)
    cn_loss_singletons = grab_tag(singletons, CL)
    als, cls = len(an_loss_singletons), len(cn_loss_singletons)

    an_loss_segments = grab_tag(segments, AL)
    cn_loss_segments = grab_tag(segments, CL)
    alm, clm = len(an_loss_segments), len(cn_loss_segments)
    mixed = len(segments) - alm - clm
    assert mixed == 0

    logging.debug("Singletons: {0} (AN LOSS: {1}, CN LOSS: {2})".\
                        format(ns, als, cls))
    logging.debug("Segments: {0} (AN LOSS: {1}, CN LOSS: {2})".\
                        format(nm, alm, clm))
    print >> sys.stderr, SummaryStats([len(x) for x in losses])

    for x in singletons + segments:
        print >> fw, "### LENGTH =", len(x)
        for i in x:
            print >> fw, "\t".join(data[i])
    fw.close()
Example 23
def segment(args):
    """
    %prog segment loss.ids bedfile

    Merge adjacent gene loss into segmental loss.

    Then based on the segmental loss, estimate amount of DNA loss in base pairs.
    Two estimates can be given:
    - conservative: just within the start and end of a single gene
    - aggressive: extend the deletion track to the next gene

    The real deletion size is within these estimates.
    """
    from jcvi.formats.base import SetFile

    p = OptionParser(segment.__doc__)
    p.add_option("--chain", default=1, type="int",
                 help="Allow next N genes to be chained [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    idsfile, bedfile = args
    bed = Bed(bedfile)
    order = bed.order
    ids = SetFile(idsfile)
    losses = Grouper()
    skip = opts.chain
    for i, a in enumerate(bed):
        a = a.accn
        for j in xrange(i + 1, i + 1 + skip):
            if j >= len(bed):
                break
            b = bed[j].accn
            if a in ids:
                losses.join(a, a)
            if a in ids and b in ids:
                losses.join(a, b)

    losses = list(losses)
    singletons = [x for x in losses if len(x) == 1]
    segments = [x for x in losses if len(x) > 1]
    ns, nm, nt = len(singletons), len(segments), len(losses)
    assert ns + nm == nt

    # Summary for all segments
    for x in sorted(singletons) + sorted(segments):
        print "\t".join(str(x) for x in ("|".join(sorted(x)), len(x),
                        estimate_size(x, bed, order)))

    # Find longest segment stretch
    if segments:
        mx, maxsegment = max([(len(x), x) for x in segments])
        print >> sys.stderr, "Longest stretch: run of {0} genes".format(mx)
        print >> sys.stderr, "  {0}".format("|".join(sorted(maxsegment)))
        seg_asize = sum(estimate_size(x, bed, order) for x in segments)
        seg_bsize = sum(estimate_size(x, bed, order, conservative=False) \
                             for x in segments)
    else:
        seg_asize = seg_bsize = 0

    sing_asize = sum(estimate_size(x, bed, order) for x in singletons)
    sing_bsize = sum(estimate_size(x, bed, order, conservative=False) \
                           for x in singletons)
    total_asize = sing_asize + seg_asize
    total_bsize = sing_bsize + seg_bsize
    print >> sys.stderr, "Singleton ({0}): {1} - {2} bp".\
                         format(ns, sing_asize, sing_bsize)
    print >> sys.stderr, "Segment ({0}): {1} - {2} bp".\
                         format(nm, seg_asize, seg_bsize)
    print >> sys.stderr, "Total ({0}): {1} - {2} bp".\
                         format(nt, total_asize, total_bsize)
    print >> sys.stderr, "Average ({0}): {1} bp".\
                         format(nt, (total_asize + total_bsize) / 2)
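
estimate_size is not shown; a sketch of the two bounds the docstring describes, assuming order maps an accession to an (index, bedline) pair as elsewhere on this page:

def estimate_size(accns, bed, order, conservative=True):
    # rank range spanned by the lost genes
    ii = sorted(order[x][0] for x in accns)
    lo, hi = ii[0], ii[-1]
    if conservative:
        # just within the start and end of the lost genes themselves
        left, right = bed[lo].start, bed[hi].end
    else:
        # aggressive: extend the deletion track to the flanking genes
        left = bed[lo - 1].end if lo > 0 else bed[lo].start
        right = bed[hi + 1].start if hi + 1 < len(bed) else bed[hi].end
    return max(right - left + 1, 0)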
Example 24
def path(args):
    """
    %prog path input.bed scaffolds.fasta

    Construct golden path given a set of genetic maps. The respective weight for
    each map is given in file `weights.txt`. The map with the highest weight is
    considered the pivot map. The final output is an AGP file that contains
    ordered scaffolds.
    """
    oargs = args
    p = OptionParser(path.__doc__)
    p.add_option("-w",
                 "--weightsfile",
                 default="weights.txt",
                 help="Use weights from file")
    p.add_option("--distance",
                 default="rank",
                 choices=distance_choices,
                 help="Distance function when building initial consensus")
    p.add_option("--linkage",
                 default="double",
                 choices=linkage_choices,
                 help="Linkage function when building initial consensus")
    p.add_option("--gapsize",
                 default=100,
                 type="int",
                 help="Insert gaps of size between scaffolds")
    p.add_option("--ngen",
                 default=500,
                 type="int",
                 help="Iterations in GA, more ~ slower")
    p.add_option("--npop",
                 default=100,
                 type="int",
                 help="Population size in GA, more ~ slower")
    p.add_option("--seqid", help="Only run partition with this seqid")
    p.add_option("--links",
                 default=10,
                 type="int",
                 help="Only plot matchings more than")
    p.add_option("--noplot",
                 default=False,
                 action="store_true",
                 help="Do not visualize the alignments")
    p.set_cpus(cpus=16)
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    inputbed, fastafile = args
    pf = inputbed.rsplit(".", 1)[0]
    bedfile = pf + ".bed"
    weightsfile = opts.weightsfile
    gapsize = opts.gapsize
    ngen = opts.ngen
    npop = opts.npop
    cpus = opts.cpus
    if sys.version_info[:2] < (2, 7):
        logging.debug("Python version: {0}. CPUs set to 1.".\
                        format(sys.version.splitlines()[0].strip()))
        cpus = 1

    function = get_function(opts.distance)
    cc = Map(bedfile, function)
    mapnames = cc.mapnames
    allseqids = cc.seqids
    weights = Weights(weightsfile, mapnames)
    pivot = weights.pivot
    ref = weights.ref
    linkage = opts.linkage
    oseqid = opts.seqid
    logging.debug("Linkage function: {0}-linkage".format(linkage))
    linkage = {
        "single": min,
        "double": double_linkage,
        "complete": max,
        "average": np.mean,
        "median": np.median
    }[linkage]

    # Partition the linkage groups into consensus clusters
    C = Grouper()
    # Initialize the partitions
    for mlg in cc.mlgs:
        C.join(mlg)

    logging.debug("Partition LGs based on {0}".format(ref))
    for mapname in mapnames:
        if mapname == ref:
            continue
        # Compute co-occurrence between LG pairs
        G = defaultdict(int)
        for s in allseqids:
            s = Scaffold(s, cc)
            s.add_LG_pairs(G, (ref, mapname))
        # Convert edge list to adj list
        nodes = defaultdict(list)
        for (a, b), w in G.items():
            nodes[a].append((b, w))
        # Find the best ref LG every non-ref LG matches to
        for n, neighbors in nodes.items():
            if n.split("-")[0] == ref:
                continue
            neighbors = dict(neighbors)
            best_neighbor, best_value = best_no_ambiguous(neighbors, n)
            if best_neighbor is None:
                continue
            C.join(n, best_neighbor)

    partitions = defaultdict(list)
    # Partition the scaffolds and assign them to one consensus
    for s in allseqids:
        s = Scaffold(s, cc)
        seqid = s.seqid
        counts = {}
        for mlg, count in s.mlg_counts.items():
            consensus = C[mlg]
            mapname = mlg.split("-")[0]
            mw = weights[mapname]
            if consensus not in counts:
                counts[consensus] = 0
            counts[consensus] += count * mw
        best_consensus, best_value = best_no_ambiguous(counts, seqid)
        if best_consensus is None:
            continue
        partitions[best_consensus].append(seqid)

    # Perform OO within each partition
    agpfile = pf + ".chr.agp"
    tourfile = pf + ".tour"
    sizes = Sizes(fastafile).mapping
    fwagp = must_open(agpfile, "w")
    fwtour = must_open(tourfile, "w")
    solutions = []
    for lgs, scaffolds in sorted(partitions.items()):
        if oseqid and oseqid not in lgs:
            continue
        tag = "|".join(lgs)
        lgs_maps = set(x.split("-")[0] for x in lgs)
        if pivot not in lgs_maps:
            logging.debug("Skipping {0} ...".format(tag))
            continue
        logging.debug("Working on {0} ...".format(tag))
        s = ScaffoldOO(lgs,
                       scaffolds,
                       cc,
                       pivot,
                       weights,
                       sizes,
                       function=function,
                       linkage=linkage,
                       ngen=ngen,
                       npop=npop,
                       cpus=cpus)

        for fw in (sys.stderr, fwtour):
            print >> fw, ">{0} ({1})".format(s.object, tag)
            print >> fw, " ".join("".join(x) for x in s.tour)
        solutions.append(s)
    fwtour.close()

    # meta-data about the run parameters
    command = "# COMMAND: python -m jcvi.assembly.allmaps path {0}".\
                     format(" ".join(oargs))
    comment = "Generated by ALLMAPS v{0} ({1})\n{2}".\
                     format(version, get_today(), command)
    AGP.print_header(fwagp, comment=comment)

    for s in sorted(solutions, key=lambda x: x.object):
        order_to_agp(s.object,
                     s.tour,
                     sizes,
                     fwagp,
                     gapsize=gapsize,
                     gaptype="map")
    fwagp.close()

    logging.debug("AGP file written to `{0}`.".format(agpfile))
    logging.debug("Tour file written to `{0}`.".format(tourfile))

    build([inputbed, fastafile])

    summaryfile = pf + ".summary.txt"
    summary([inputbed, fastafile, "--outfile={0}".format(summaryfile)])

    if not opts.noplot:
        plotall([inputbed, "--links={0}".format(opts.links)])
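
best_no_ambiguous is not shown; judging from its use here, it picks the highest-scoring key and refuses ties. A sketch of the assumed behavior:

def best_no_ambiguous(d, seqid):
    if not d:
        return None, None
    best_value = max(d.values())
    best = [k for k, v in d.items() if v == best_value]
    if len(best) > 1:
        return None, None  # tie -> ambiguous assignment, skip
    return best[0], best_value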
Example 25
def tandem_main(blast_file, cds_file, bed_file, N=3, P=50, is_self=True, \
    evalue=.01, strip_name=".", ofile=sys.stderr, genefam=False):

    if genefam:
        N = 1e5

    # get the sizes for the CDS first
    f = Fasta(cds_file)
    sizes = dict(f.itersizes())

    # retrieve the locations
    bed = Bed(bed_file)
    order = bed.order

    if is_self:
        # filter the blast file
        g = Grouper()
        fp = open(blast_file)
        for row in fp:
            b = BlastLine(row)
            query_len = sizes[b.query]
            subject_len = sizes[b.subject]
            if b.hitlen < min(query_len, subject_len) * P / 100.:
                continue

            query = gene_name(b.query, strip_name)
            subject = gene_name(b.subject, strip_name)
            qi, q = order[query]
            si, s = order[subject]

            if abs(qi - si) <= N and b.evalue <= evalue:
                if genefam:
                    g.join(query, subject)
                elif q.seqid == s.seqid:
                    g.join(query, subject)

    else:
        homologs = Grouper()
        fp = open(blast_file)
        for row in fp:
            b = BlastLine(row)
            query_len = sizes[b.query]
            subject_len = sizes[b.subject]
            if b.hitlen < min(query_len, subject_len) * P / 100.:
                continue
            if b.evalue > evalue:
                continue

            query = gene_name(b.query, strip_name)
            subject = gene_name(b.subject, strip_name)
            homologs.join(query, subject)

        if genefam:
            g = homologs
        else:
            g = Grouper()
            for i, atom in enumerate(bed):
                for x in range(1, N + 1):
                    if all([i-x >= 0, bed[i-x].seqid == atom.seqid, \
                        homologs.joined(bed[i-x].accn, atom.accn)]):
                        leni = sizes[bed[i].accn]
                        lenx = sizes[bed[i - x].accn]
                        if abs(leni - lenx) > max(leni, lenx) * (1 - P / 100.):
                            continue
                        g.join(bed[i - x].accn, atom.accn)

    # dump the grouper
    fw = must_open(ofile, "w")
    ngenes, nfamilies = 0, 0
    families = []
    for group in sorted(g):
        if len(group) >= 2:
            print >> fw, ",".join(sorted(group))
            ngenes += len(group)
            nfamilies += 1
            families.append(sorted(group))

    longest_family = max(families, key=lambda x: len(x))

    # generate reports
    print >> sys.stderr, "Proximal paralogues (dist=%d):" % N
    print >> sys.stderr, "Total %d genes in %d families" % (ngenes, nfamilies)
    print >> sys.stderr, "Longest families (%d): %s" % (
        len(longest_family), ",".join(longest_family))

    return families
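
A typical invocation, assuming a self-BLAST of the CDS set against itself (the file names here are hypothetical):

families = tandem_main("self.blast", "cds.fasta", "genes.bed",
                       N=3, P=50, is_self=True, ofile="tandems.out")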
Example 26
File: pasa.py Project: BrokeW/jcvi
def consolidate(args):
    """
    %prog consolidate gffile1 gffile2 ... > consolidated.out

    Given 2 or more gff files generated by pasa annotation comparison,
    iterate through every gene locus and identify all cases of same and
    different isoforms across the different input datasets.
    """
    from jcvi.formats.base import longest_unique_prefix
    from jcvi.formats.gff import make_index
    from jcvi.utils.cbook import AutoVivification
    from jcvi.utils.grouper import Grouper
    from itertools import combinations, product

    p = OptionParser(consolidate.__doc__)
    p.add_option("--slop", default=False, action="store_true",
            help="allow minor variation in terminal 5'/3' UTR" + \
                 " start/stop position [default: %default]")
    p.set_outfile()

    opts, args = p.parse_args(args)
    slop = opts.slop

    if len(args) < 2:
        sys.exit(not p.print_help())

    gffdbx = {}
    gene_coords = {}
    mrna = AutoVivification()
    for gffile in args:
        dbn = longest_unique_prefix(gffile, args)
        gffdbx[dbn] = make_index(gffile)
        for gene in gffdbx[dbn].features_of_type('gene',
                                                 order_by=('seqid', 'start')):
            if gene.id not in gene_coords:
                gene_coords[gene.id] = []
            gene_coords[gene.id].extend([gene.start, gene.stop])

            c = list(gffdbx[dbn].children(gene,
                                          featuretype='mRNA',
                                          order_by='start'))
            if len(c) > 0:
                mrna[gene.id][dbn] = c

    fw = must_open(opts.outfile, "w")
    print >> fw, "##gff-version	3"
    summary = ["id"]
    summary.extend(gffdbx.keys())
    print >> sys.stderr, "\t".join(str(x) for x in summary)
    for gene in mrna:
        g = Grouper()
        dbns = list(combinations(mrna[gene], 2))
        if len(dbns) > 0:
            for dbn1, dbn2 in dbns:
                for mrna1, mrna2 in product(mrna[gene][dbn1],
                                            mrna[gene][dbn2]):
                    g.join((dbn1, mrna1.id))
                    g.join((dbn2, mrna2.id))

                    fUTR, tUTR = None, None
                    if match_subfeats(mrna1, mrna2, gffdbx[dbn1],
                                      gffdbx[dbn2]):
                        fUTR = match_subfeats(mrna1, mrna2, gffdbx[dbn1], gffdbx[dbn2], \
                                featuretype='five_prime_UTR', slop=slop)
                        tUTR = match_subfeats(mrna1, mrna2, gffdbx[dbn1], gffdbx[dbn2], \
                                featuretype='three_prime_UTR', slop=slop)

                    if fUTR and tUTR:
                        g.join((dbn1, mrna1.id), (dbn2, mrna2.id))
        else:
            for dbn1 in mrna[gene]:
                for mrna1 in mrna[gene][dbn1]:
                    g.join((dbn1, mrna1.id))

        dbn = mrna[gene].keys()[0]
        gene_coords[gene].sort()
        _gene = gffdbx[dbn][gene]
        _gene.start, _gene.stop = gene_coords[gene][0], gene_coords[gene][-1]
        print >> fw, _gene

        logging.debug(list(g))
        for group in g:
            dbs, mrnas = [el[0] for el in group], [el[1] for el in group]
            d, m = dbs[0], mrnas[0]
            if slop:
                mlen = 0
                for D, M in zip(dbs, mrnas):
                    _mrna = gffdbx[D][M]
                    _mlen = (_mrna.stop - _mrna.start) + 1
                    if _mlen > mlen:
                        d, m, mlen = D, M, _mlen

            dbid = "".join(str(x) for x in set(dbs))
            _mrnaid = []
            for x in mrnas:
                if x not in _mrnaid:  # dedupe while preserving order
                    _mrnaid.append(x)
            mrnaid = "{0}:{1}".format(dbid, "-".join(_mrnaid))

            _mrna = gffdbx[d][m]
            _mrna.attributes['ID'] = [mrnaid]
            children = gffdbx[d].children(m, order_by='start')
            print >> fw, _mrna
            for child in children:
                child.attributes['ID'] = ["{0}:{1}".format(dbid, child.id)]
                child.attributes['Parent'] = [mrnaid]
                print >> fw, child

            summary = [mrnaid]
            summary.extend(['Y' if db in set(dbs) else 'N' for db in gffdbx])
            print >> sys.stderr, "\t".join(str(x) for x in summary)

    fw.close()
Example 27
def renumber(args):
    """
    %prog renumber Mt35.consolidated.bed > tagged.bed

    Renumber genes for annotation updates.
    """
    from jcvi.algorithms.lis import longest_increasing_subsequence
    from jcvi.utils.grouper import Grouper

    p = OptionParser(renumber.__doc__)
    p.set_annot_reformat_opts()

    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    bedfile, = args

    pf = bedfile.rsplit(".", 1)[0]
    abedfile = pf + ".a.bed"
    bbedfile = pf + ".b.bed"
    if need_update(bedfile, (abedfile, bbedfile)):
        prepare(bedfile)

    mbed = Bed(bbedfile)
    g = Grouper()
    for s in mbed:
        accn = s.accn
        g.join(*accn.split(";"))

    bed = Bed(abedfile)
    for chr, sbed in bed.sub_beds():
        current_chr = chr_number(chr)
        if not current_chr:
            continue

        ranks = []
        gg = set()
        for s in sbed:
            accn = s.accn
            achr, arank = atg_name(accn)
            if achr != current_chr:
                continue
            ranks.append(arank)
            gg.add(accn)

        lranks = longest_increasing_subsequence(ranks)
        print >> sys.stderr, current_chr, len(sbed), "==>", len(ranks), \
                    "==>", len(lranks)

        granks = set(gene_name(current_chr, x, prefix=opts.prefix, \
                     pad0=opts.pad0, uc=opts.uc) for x in lranks) | \
                 set(gene_name(current_chr, x, prefix=opts.prefix, \
                     pad0=opts.pad0, sep="te", uc=opts.uc) for x in lranks)

        tagstore = {}
        for s in sbed:
            achr, arank = atg_name(s.accn)
            accn = s.accn
            if accn in granks:
                tag = (accn, FRAME)
            elif accn in gg:
                tag = (accn, RETAIN)
            else:
                tag = (".", NEW)

            tagstore[accn] = tag

        # Find cases where genes overlap
        for s in sbed:
            accn = s.accn
            gaccn = g[accn]
            tags = [((tagstore[x][-1] if x in tagstore else NEW), x)
                    for x in gaccn]
            group = [(PRIORITY.index(tag), x) for tag, x in tags]
            best = min(group)[-1]

            if accn != best:
                tag = (best, OVERLAP)
            else:
                tag = tagstore[accn]

            print "\t".join((str(s), "|".join(tag)))
Example 28
def annotate(args):
    """
    %prog annotate new.bed old.bed 2> log

    Annotate the `new.bed` with features from `old.bed` for the purpose of
    gene numbering.

    Ambiguity in ID assignment can be resolved by either of the following two methods:
    - `alignment`: make use of global sequence alignment score (calculated by `needle`)
    - `overlap`: make use of overlap length (calculated by `intersectBed`)

    Transfer over as many identifiers as possible while following guidelines:
    http://www.arabidopsis.org/portals/nomenclature/guidelines.jsp#editing

    Note: The following RegExp pattern describes the structure of the identifier
    assigned to features in the `new.bed` file.

    new_id_pat = re.compile(r"^\d+\.[cemtx]+\S+")

    Examples: 23231.m312389, 23231.t004898, 23231.tRNA.144
    Adjust the value of `new_id_pat` manually as per your ID naming conventions.
    """
    from jcvi.utils.grouper import Grouper

    valid_resolve_choices = ["alignment", "overlap"]

    p = OptionParser(annotate.__doc__)
    p.add_option("--resolve", default="alignment", choices=valid_resolve_choices,
                 help="Resolve ID assignment based on a certain metric" \
                        + " [default: %default]")
    p.add_option("--atg_name", default=False, action="store_true",
                help="Specify is locus IDs in `new.bed` file follow ATG nomenclature" \
                        + " [default: %default]")

    g1 = OptionGroup(p, "Optional parameters (alignment):\n" \
            + "Use if resolving ambiguities based on sequence `alignment`")
    g1.add_option("--pid", dest="pid", default=35., type="float",
            help="Percent identity cutoff [default: %default]")
    g1.add_option("--score", dest="score", default=250., type="float",
            help="Alignment score cutoff [default: %default]")
    p.add_option_group(g1)

    g2 = OptionGroup(p, "Optional parameters (overlap):\n" \
            + "Use if resolving ambiguities based on `overlap` length\n" \
            + "Parameters equivalent to `intersectBed`")
    g2.add_option("-f", dest="f", default=0.5, type="float",
            help="Minimum overlap fraction (0.0 - 1.0) [default: %default]")
    g2.add_option("-r", dest="r", default=False, action="store_true",
            help="Require fraction overlap to be reciprocal [default: %default]")
    g2.add_option("-s", dest="s", default=True, action="store_true",
            help="Require same strandedness [default: %default]")
    p.add_option_group(g2)

    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    nbedfile, obedfile = args
    npf, opf = nbedfile.rsplit(".", 1)[0], obedfile.rsplit(".", 1)[0]

    # Make consolidated.bed
    cbedfile = "consolidated.bed"
    if not os.path.isfile(cbedfile):
        consolidate(nbedfile, obedfile, cbedfile)
    else:
        logging.warning("`{0}` already exists. Skipping step".format(cbedfile))

    logging.warning("Resolving ID assignment ambiguity based on `{0}`".\
            format(opts.resolve))

    if opts.resolve == "alignment":
        # Get pairs and prompt to run needle
        pairsfile = "nw.pairs"
        scoresfile = "nw.scores"
        if not os.path.isfile(pairsfile):
            get_pairs(cbedfile, pairsfile)
        else:
            logging.warning("`{0}` already exists. Checking for needle output".\
                    format(pairsfile))

        # If needle scores do not exist, prompt user to run needle
        if not os.path.isfile(scoresfile):
            logging.error("`{0}` does not exist. Please process {1} using `needle`".\
                    format(scoresfile, pairsfile))
            sys.exit()
    else:
        scoresfile = "ovl.scores"
        # Calculate overlap length using intersectBed
        calculate_ovl(nbedfile, obedfile, opts, scoresfile)

    logging.warning("`{0}' exists. Storing scores in memory".\
            format(scoresfile))
    scores = read_scores(scoresfile, opts)

    # Iterate through consolidated bed and
    # filter piles based on score
    abedline = {}

    cbed = Bed(cbedfile)
    g = Grouper()
    for c in cbed:
        accn = c.accn
        g.join(*accn.split(";"))

    nbedline = {}
    nbed = Bed(nbedfile)
    for line in nbed: nbedline[line.accn] = line

    splits = set()
    for chr, chrbed in nbed.sub_beds():
        abedline, splits = annotate_chr(chr, chrbed, g, scores, nbedline, abedline, opts, splits)

    if splits is not None:
        abedline = process_splits(splits, scores, nbedline, abedline)

    abedfile = npf + ".annotated.bed"
    afh = open(abedfile, "w")
    for accn in abedline:
        print >> afh, abedline[accn]
    afh.close()

    sort([abedfile, "-i"])
Example 29
def renumber(args):
    """
    %prog renumber Mt35.consolidated.bed > tagged.bed

    Renumber genes for annotation updates.
    """
    from jcvi.algorithms.lis import longest_increasing_subsequence
    from jcvi.utils.grouper import Grouper

    p = OptionParser(renumber.__doc__)
    p.set_annot_reformat_opts()

    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    bedfile, = args

    pf = bedfile.rsplit(".", 1)[0]
    abedfile = pf + ".a.bed"
    bbedfile = pf + ".b.bed"
    if need_update(bedfile, (abedfile, bbedfile)):
        prepare(bedfile)

    mbed = Bed(bbedfile)
    g = Grouper()
    for s in mbed:
        accn = s.accn
        g.join(*accn.split(";"))

    bed = Bed(abedfile)
    for chr, sbed in bed.sub_beds():
        current_chr = chr_number(chr)
        if not current_chr:
            continue

        ranks = []
        gg = set()
        for s in sbed:
            accn = s.accn
            achr, arank = atg_name(accn)
            if achr != current_chr:
                continue
            ranks.append(arank)
            gg.add(accn)

        lranks = longest_increasing_subsequence(ranks)
        print >> sys.stderr, current_chr, len(sbed), "==>", len(ranks), \
                    "==>", len(lranks)

        granks = set(gene_name(current_chr, x, prefix=opts.prefix, \
                     pad0=opts.pad0, uc=opts.uc) for x in lranks) | \
                 set(gene_name(current_chr, x, prefix=opts.prefix, \
                     pad0=opts.pad0, sep="te", uc=opts.uc) for x in lranks)

        tagstore = {}
        for s in sbed:
            achr, arank = atg_name(s.accn)
            accn = s.accn
            if accn in granks:
                tag = (accn, FRAME)
            elif accn in gg:
                tag = (accn, RETAIN)
            else:
                tag = (".", NEW)

            tagstore[accn] = tag

        # Find cases where genes overlap
        for s in sbed:
            accn = s.accn
            gaccn = g[accn]
            tags = [((tagstore[x][-1] if x in tagstore else NEW), x) for x in gaccn]
            group = [(PRIORITY.index(tag), x) for tag, x in tags]
            best = min(group)[-1]

            if accn != best:
                tag = (best, OVERLAP)
            else:
                tag = tagstore[accn]

            print "\t".join((str(s), "|".join(tag)))
Example no. 35
def path(args):
    """
    %prog path input.bed scaffolds.fasta

    Construct golden path given a set of genetic maps. The respective weight for
    each map is given in file `weights.txt`. The map with the highest weight is
    considered the pivot map. The final output is an AGP file that contains
    ordered scaffolds.
    """
    oargs = args
    p = OptionParser(path.__doc__)
    p.add_option("-b", "--bedfile", help=SUPPRESS_HELP)
    p.add_option("-s", "--fastafile", help=SUPPRESS_HELP)
    p.add_option("-w", "--weightsfile", default="weights.txt",
                 help="Use weights from file")
    p.add_option("--distance", default="rank", choices=distance_choices,
                 help="Distance function when building initial consensus")
    p.add_option("--linkage", default="double", choices=linkage_choices,
                 help="Linkage function when building initial consensus")
    p.add_option("--gapsize", default=100, type="int",
                 help="Insert gaps of size between scaffolds")
    p.add_option("--ngen", default=500, type="int",
                 help="Iterations in GA, more ~ slower")
    p.add_option("--npop", default=100, type="int",
                 help="Population size in GA, more ~ slower")
    p.add_option("--seqid", help="Only run partition with this seqid")
    p.add_option("--links", default=10, type="int",
                 help="Only plot matchings more than")
    p.add_option("--noplot", default=False, action="store_true",
                 help="Do not visualize the alignments")
    p.set_cpus(cpus=16)
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    inputbed, fastafile = args
    inputbed = opts.bedfile or inputbed
    fastafile = opts.fastafile or fastafile
    pf = inputbed.rsplit(".", 1)[0]
    bedfile = pf + ".bed"
    weightsfile = opts.weightsfile
    gapsize = opts.gapsize
    ngen = opts.ngen
    npop = opts.npop
    cpus = opts.cpus
    if sys.version_info[:2] < (2, 7):
        logging.debug("Python version: {0}. CPUs set to 1.".\
                        format(sys.version.splitlines()[0].strip()))
        cpus = 1

    function = get_function(opts.distance)
    cc = Map(bedfile, function)
    mapnames = cc.mapnames
    allseqids = cc.seqids
    weights = Weights(weightsfile, mapnames)
    pivot = weights.pivot
    ref = weights.ref
    linkage = opts.linkage
    oseqid = opts.seqid
    logging.debug("Linkage function: {0}-linkage".format(linkage))
    linkage = {"single": min, "double": double_linkage, "complete": max,
               "average": np.mean, "median": np.median}[linkage]

    # Partition the linkage groups into consensus clusters
    C = Grouper()
    # Initialize the partitions
    for mlg in cc.mlgs:
        C.join(mlg)

    logging.debug("Partition LGs based on {0}".format(ref))
    for mapname in mapnames:
        if mapname == ref:
            continue
        # Compute co-occurrence between LG pairs
        G = defaultdict(int)
        for s in allseqids:
            s = Scaffold(s, cc)
            s.add_LG_pairs(G, (ref, mapname))
        # Convert edge list to adj list
        nodes = defaultdict(list)
        for (a, b), w in G.items():
            nodes[a].append((b, w))
        # Find the best ref LG every non-ref LG matches to
        for n, neighbors in nodes.items():
            if n.split("-")[0] == ref:
                continue
            neighbors = dict(neighbors)
            best_neighbor, best_value = best_no_ambiguous(neighbors, n)
            if best_neighbor is None:
                continue
            C.join(n, best_neighbor)

    partitions = defaultdict(list)
    # Partition the scaffolds and assign them to one consensus
    for s in allseqids:
        s = Scaffold(s, cc)
        seqid = s.seqid
        counts = {}
        for mlg, count in s.mlg_counts.items():
            consensus = C[mlg]
            mapname = mlg.split("-")[0]
            mw = weights[mapname]
            if consensus not in counts:
                counts[consensus] = 0
            counts[consensus] += count * mw
        best_consensus, best_value = best_no_ambiguous(counts, seqid)
        if best_consensus is None:
            continue
        partitions[best_consensus].append(seqid)

    # Perform OO within each partition
    agpfile = pf + ".chr.agp"
    tourfile = pf + ".tour"
    sizes = Sizes(fastafile).mapping
    fwagp = must_open(agpfile, "w")
    fwtour = must_open(tourfile, "w")
    solutions = []
    for lgs, scaffolds in sorted(partitions.items()):
        if oseqid and oseqid not in lgs:
            continue
        tag = "|".join(lgs)
        lgs_maps = set(x.split("-")[0] for x in lgs)
        if pivot not in lgs_maps:
            logging.debug("Skipping {0} ...".format(tag))
            continue
        logging.debug("Working on {0} ...".format(tag))
        s = ScaffoldOO(lgs, scaffolds, cc, pivot, weights, sizes,
                       function=function, linkage=linkage,
                       ngen=ngen, npop=npop, cpus=cpus)

        for fw in (sys.stderr, fwtour):
            print >> fw, ">{0} ({1})".format(s.object, tag)
            print >> fw, " ".join("".join(x) for x in s.tour)
        solutions.append(s)
    fwtour.close()

    # meta-data about the run parameters
    command = "# COMMAND: python -m jcvi.assembly.allmaps path {0}".\
                     format(" ".join(oargs))
    comment = "Generated by ALLMAPS v{0} ({1})\n{2}".\
                     format(version, get_today(), command)
    AGP.print_header(fwagp, comment=comment)

    for s in sorted(solutions, key=lambda x: x.object):
        order_to_agp(s.object, s.tour, sizes, fwagp, gapsize=gapsize,
                     gaptype="map")
    fwagp.close()

    logging.debug("AGP file written to `{0}`.".format(agpfile))
    logging.debug("Tour file written to `{0}`.".format(tourfile))

    build([inputbed, fastafile])

    summaryfile = pf + ".summary.txt"
    summary([inputbed, fastafile, "--outfile={0}".format(summaryfile)])

    if not opts.noplot:
        plotall([inputbed, "--links={0}".format(opts.links)])
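
Both partition passes above lean on best_no_ambiguous, which picks a winner out of a {candidate: score} mapping but refuses to choose when the top score is tied. A hedged sketch of that contract (the real helper in jcvi.assembly.allmaps may log or break ties differently):

def best_no_ambiguous(scores, label):
    """Return (best_key, best_value), or (None, None) on a tie."""
    if not scores:
        return None, None
    best_value = max(scores.values())
    winners = [k for k, v in scores.items() if v == best_value]
    if len(winners) > 1:  # ambiguous: several top-scoring candidates
        return None, None
    return winners[0], best_value

print(best_no_ambiguous({"LG1": 5, "LG2": 3}, "scaffold_1"))  # ('LG1', 5)
print(best_no_ambiguous({"LG1": 5, "LG2": 5}, "scaffold_2"))  # (None, None)

Under that contract, a scaffold whose weighted MLG counts tie between two consensus clusters is skipped rather than mis-assigned.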
Example no. 36
def enrich(args):
    """
    %prog enrich omgfile groups ntaxa > enriched.omg

    Enrich OMG output by pulling in genes missed by OMG.
    """
    p = OptionParser(enrich.__doc__)
    p.add_option("--ghost", default=False, action="store_true",
                 help="Add ghost homologs already used [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    omgfile, groupsfile, ntaxa = args
    ntaxa = int(ntaxa)
    ghost = opts.ghost

    # Get gene pair => weight mapping
    weights = get_edges()
    info = get_info()
    # Get gene => taxon mapping
    info = dict((k, v.split()[5]) for k, v in info.items())

    groups = Grouper()

    fp = open(groupsfile)
    for row in fp:
        members = row.strip().split(",")
        groups.join(*members)

    logging.debug("Imported {0} families with {1} members.".\
                    format(len(groups), groups.num_members))

    seen = set()
    omggroups = Grouper()
    fp = open(omgfile)
    for row in fp:
        genes, idxs = row.split()
        genes = genes.split(",")
        seen.update(genes)
        omggroups.join(*genes)

    nmembers = omggroups.num_members
    logging.debug("Imported {0} OMG families with {1} members.".\
                    format(len(omggroups), nmembers))
    assert nmembers == len(seen)

    alltaxa = set(str(x) for x in range(ntaxa))
    recruited = []
    fp = open(omgfile)
    for row in fp:
        genes, idxs = row.split()
        genes = genes.split(",")
        a = genes[0]

        idxs = set(idxs.split(","))
        missing_taxa = alltaxa - idxs
        if not missing_taxa:
            print row.rstrip()
            continue

        leftover = groups[a]
        if not ghost:
            leftover = set(leftover) - seen

        if not leftover:
            print row.rstrip()
            continue

        leftover_sorted_by_taxa = dict((k, \
                             [x for x in leftover if info[x] == k]) \
                                for k in missing_taxa)

        #print genes, leftover
        #print leftover_sorted_by_taxa
        solutions = []
        for solution in product(*leftover_sorted_by_taxa.values()):
            score = sum(weights.get((a, b), 0) for a in solution for b in genes)
            if score == 0:
                continue
            score += sum(weights.get((a, b), 0) for a, b in combinations(solution, 2))
            solutions.append((score, solution))
            #print solution, score

        best_solution = max(solutions) if solutions else None
        if best_solution is None:
            print row.rstrip()
            continue

        #print "best ==>", best_solution
        best_score, best_addition = best_solution
        genes.extend(best_addition)
        recruited.extend(best_addition)

        genes = sorted([(info[x], x) for x in genes])
        idxs, genes = zip(*genes)

        if ghost:  # decorate additions so it's clear that they were added
            pgenes = []
            for g in genes:
                if g in recruited and g in seen:
                    pgenes.append("|{0}|".format(g))
                else:
                    pgenes.append(g)
            genes = pgenes

        print "\t".join((",".join(genes), ",".join(idxs)))
        if not ghost:
            seen.update(best_addition)

    logging.debug("Recruited {0} new genes.".format(len(recruited)))
Example no. 37
def napus(args):
    """
    %prog napus napus.bed brapa.boleracea.i1.blocks diploid.napus.fractionation

    Extract napus gene loss vs diploid ancestors. We are looking specifically
    for anything that has the pattern:

        BR - BO    or     BR - BO
        |                       |
        AN                     CN

    Step 1: extract BR - BO syntenic pairs
    Step 2: get diploid gene retention patterns from BR or BO as query
    Step 3: check whether AN or CN is NS (non-syntenic) or NF (not found);
    specifically with NS, the NS location is actually the homeologous site.
    Step 4: categorize gene losses as singleton or segmental (defined as
    consecutive losses with a maximum skip of 1)
    """
    from jcvi.utils.cbook import SummaryStats

    p = OptionParser(napus.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    napusbed, brbo, dpnp = args
    retention = {}
    fp = open(dpnp)
    for row in fp:
        seqid, query, hit = row.split()
        retention[query] = hit

    order = Bed(napusbed).order

    quartetsfile = "quartets"
    fp = open(brbo)
    fw = open(quartetsfile, "w")
    AL = "AN LOST"
    CL = "CN LOST"
    for row in fp:
        br, bo = row.split()
        if '.' in (br, bo):
            continue
        an, cn = retention[br], retention[bo]
        row = "\t".join((br, bo, an, cn))
        if '.' in (an, cn):
            #print row
            continue

        # label loss candidates
        antag, anrange = get_tag(an, order)
        cntag, cnrange = get_tag(cn, order)

        if range_overlap(anrange, cnrange):
            if (antag, cntag) == ("NS", None):
                row = row + "\t{0}|{1}".format(AL, br)
            if (antag, cntag) == (None, "NS"):
                row = row + "\t{0}|{1}".format(CL, bo)

        print >> fw, row
    fw.close()

    logging.debug("Quartets and gene losses written to `{0}`.".\
                    format(quartetsfile))

    # Parse the quartets file to extract singleton vs. segmental losses
    fp = open(quartetsfile)
    fw = open(quartetsfile + ".summary", "w")
    data = [x.rstrip().split("\t") for x in fp]
    skip = 1  # max distance between losses

    g = Grouper()
    losses = [(len(x) == 5) for x in data]
    for i, d in enumerate(losses):
        if not d:
            continue
        g.join(i, i)
        itag = data[i][-1].split("|")[0]
        for j in xrange(i + 1, i + skip + 1):
            if j >= len(losses) or not losses[j]:
                continue
            jtag = data[j][-1].split("|")[0]
            if itag == jtag:
                g.join(i, j)

    losses = list(g)
    singletons = [x for x in losses if len(x) == 1]
    segments = [x for x in losses if len(x) > 1]
    ns, nm = len(singletons), len(segments)
    assert len(losses) == ns + nm

    grab_tag = lambda pool, tag: \
            [x for x in pool if all(data[z][-1].startswith(tag) for z in x)]

    an_loss_singletons = grab_tag(singletons, AL)
    cn_loss_singletons = grab_tag(singletons, CL)
    als, cls = len(an_loss_singletons), len(cn_loss_singletons)

    an_loss_segments = grab_tag(segments, AL)
    cn_loss_segments = grab_tag(segments, CL)
    alm, clm = len(an_loss_segments), len(cn_loss_segments)
    mixed = len(segments) - alm - clm
    assert mixed == 0

    logging.debug("Singletons: {0} (AN LOSS: {1}, CN LOSS: {2})".\
                        format(ns, als, cls))
    logging.debug("Segments: {0} (AN LOSS: {1}, CN LOSS: {2})".\
                        format(nm, alm, clm))
    print >> sys.stderr, SummaryStats([len(x) for x in losses])

    for x in singletons + segments:
        print >> fw, "### LENGTH =", len(x)
        for i in x:
            print >> fw, "\t".join(data[i])
    fw.close()
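
The Grouper-based pass above chains loss rows that are at most `skip` rows apart and carry the same AN/CN tag. Here is the same segment/singleton split written as a greedy sweep over a toy table, just to make the outcome concrete (the source uses union-find instead):

# Rows 0 and 1 are adjacent AN losses; row 3 is an isolated CN loss.
losses = [True, True, False, True]
tags = ["AN LOST", "AN LOST", None, "CN LOST"]
skip = 1  # max allowed gap between chained losses

segments, current = [], []
for i, lost in enumerate(losses):
    if not lost:
        continue
    if current and i - current[-1] <= skip and tags[i] == tags[current[-1]]:
        current.append(i)   # chain onto the running segment
    else:
        if current:
            segments.append(current)
        current = [i]       # start a new segment
if current:
    segments.append(current)

print(segments)  # [[0, 1], [3]] -> one segmental loss, one singleton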
Example no. 38
def tandem_main(blast_file, cds_file, bed_file, N=3, P=50, is_self=True, \
    evalue=.01, strip_name=".", ofile=sys.stderr, genefam=False):

    if genefam:
        N = 1e5

    # get the sizes for the CDS first
    f = Fasta(cds_file)
    sizes = dict(f.itersizes())

    # retrieve the locations
    bed = Bed(bed_file)
    order = bed.order

    if is_self:
        # filter the blast file
        g = Grouper()
        fp = open(blast_file)
        for row in fp:
            b = BlastLine(row)
            query_len = sizes[b.query]
            subject_len = sizes[b.subject]
            if b.hitlen < min(query_len, subject_len)*P/100.:
                continue

            query = gene_name(b.query, strip_name)
            subject = gene_name(b.subject, strip_name)
            qi, q = order[query]
            si, s = order[subject]

            if abs(qi - si) <= N and b.evalue <= evalue:
                if genefam:
                    g.join(query, subject)
                elif q.seqid == s.seqid:
                    g.join(query, subject)

    else:
        homologs = Grouper()
        fp = open(blast_file)
        for row in fp:
            b = BlastLine(row)
            query_len = sizes[b.query]
            subject_len = sizes[b.subject]
            if b.hitlen < min(query_len, subject_len)*P/100.:
                continue
            if b.evalue > evalue:
                continue

            query = gene_name(b.query, strip_name)
            subject = gene_name(b.subject, strip_name)
            homologs.join(query, subject)

        if genefam:
            g = homologs
        else:
            g = Grouper()
            for i, atom in enumerate(bed):
                for x in range(1, N+1):
                    if all([i-x >= 0, bed[i-x].seqid == atom.seqid, \
                        homologs.joined(bed[i-x].accn, atom.accn)]):
                        leni = sizes[bed[i].accn]
                        lenx = sizes[bed[i-x].accn]
                        if abs(leni - lenx) > max(leni, lenx)*(1-P/100.):
                            continue
                        g.join(bed[i-x].accn, atom.accn)

    # dump the grouper
    fw = must_open(ofile, "w")
    ngenes, nfamilies = 0, 0
    families = []
    for group in sorted(g):
        if len(group) >= 2:
            print >>fw, ",".join(sorted(group))
            ngenes += len(group)
            nfamilies += 1
            families.append(sorted(group))

    longest_family = max(families, key=len) if families else []

    # generate reports
    print >>sys.stderr, "Proximal paralogues (dist=%d):" % N
    print >>sys.stderr, "Total %d genes in %d families" % (ngenes, nfamilies)
    print >>sys.stderr, "Longest families (%d): %s" % (len(longest_family),
        ",".join(longest_family))

    return families
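
Condensed into a single predicate, the self-BLAST filter above accepts a pair only when the alignment covers at least P% of the shorter CDS, the hit is significant, and the two genes sit within N positions of each other on the same sequence. A sketch with stand-in namedtuples (the real BlastLine and Bed objects carry more fields):

from collections import namedtuple

Hit = namedtuple("Hit", "query subject hitlen evalue")
Line = namedtuple("Line", "seqid")

def is_tandem_pair(b, order, sizes, N=3, P=50, evalue_cutoff=.01):
    """Mirror of the per-row filter in tandem_main's is_self branch."""
    shorter = min(sizes[b.query], sizes[b.subject])
    if b.hitlen < shorter * P / 100.:  # alignment covers too little
        return False
    if b.evalue > evalue_cutoff:       # hit not significant
        return False
    qi, q = order[b.query]             # order: accession -> (rank, bed line)
    si, s = order[b.subject]
    return abs(qi - si) <= N and q.seqid == s.seqid

sizes = {"geneA": 300, "geneB": 330}
order = {"geneA": (10, Line("chr1")), "geneB": (12, Line("chr1"))}
print(is_tandem_pair(Hit("geneA", "geneB", 200, 1e-20), order, sizes))  # True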
Example no. 40
def annotate(args):
    """
    %prog annotate new.bed old.bed 2> log

    Annotate the `new.bed` with features from `old.bed` for the purpose of
    gene numbering.

    Ambiguity in ID assignment can be resolved by either of the following 2 methods:
    - `alignment`: make use of global sequence alignment score (calculated by `needle`)
    - `overlap`: make use of overlap length (calculated by `intersectBed`)

    Transfer over as many identifiers as possible while following guidelines:
    http://www.arabidopsis.org/portals/nomenclature/guidelines.jsp#editing

    Note: The following RegExp pattern describes the structure of the identifier
    assigned to features in the `new.bed` file.

    new_id_pat = re.compile(r"^\d+\.[cemtx]+\S+")

    Examples: 23231.m312389, 23231.t004898, 23231.tRNA.144
    Adjust the value of `new_id_pat` manually as per your ID naming conventions.
    """
    from jcvi.utils.grouper import Grouper

    valid_resolve_choices = ["alignment", "overlap"]

    p = OptionParser(annotate.__doc__)
    p.add_option("--resolve", default="alignment", choices=valid_resolve_choices,
                 help="Resolve ID assignment based on a certain metric" \
                        + " [default: %default]")
    p.add_option("--atg_name", default=False, action="store_true",
                help="Specify is locus IDs in `new.bed` file follow ATG nomenclature" \
                        + " [default: %default]")

    g1 = OptionGroup(p, "Optional parameters (alignment):\n" \
            + "Use if resolving ambiguities based on sequence `alignment`")
    g1.add_option("--pid",
                  dest="pid",
                  default=35.,
                  type="float",
                  help="Percent identity cutoff [default: %default]")
    g1.add_option("--score",
                  dest="score",
                  default=250.,
                  type="float",
                  help="Alignment score cutoff [default: %default]")
    p.add_option_group(g1)

    g2 = OptionGroup(p, "Optional parameters (overlap):\n" \
            + "Use if resolving ambiguities based on `overlap` length\n" \
            + "Parameters equivalent to `intersectBed`")
    g2.add_option(
        "-f",
        dest="f",
        default=0.5,
        type="float",
        help="Minimum overlap fraction (0.0 - 1.0) [default: %default]")
    g2.add_option(
        "-r",
        dest="r",
        default=False,
        action="store_true",
        help="Require fraction overlap to be reciprocal [default: %default]")
    g2.add_option("-s",
                  dest="s",
                  default=True,
                  action="store_true",
                  help="Require same strandedness [default: %default]")
    p.add_option_group(g2)

    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    nbedfile, obedfile = args
    npf, opf = nbedfile.rsplit(".", 1)[0], obedfile.rsplit(".", 1)[0]

    # Make consolidated.bed
    cbedfile = "consolidated.bed"
    if not os.path.isfile(cbedfile):
        consolidate(nbedfile, obedfile, cbedfile)
    else:
        logging.warning("`{0}` already exists. Skipping step".format(cbedfile))

    logging.warning("Resolving ID assignment ambiguity based on `{0}`".\
            format(opts.resolve))

    if opts.resolve == "alignment":
        # Get pairs and prompt to run needle
        pairsfile = "nw.pairs"
        scoresfile = "nw.scores"
        if not os.path.isfile(pairsfile):
            get_pairs(cbedfile, pairsfile)
        else:
            logging.warning("`{0}` already exists. Checking for needle output".\
                    format(pairsfile))

        # If needle scores do not exist, prompt user to run needle
        if not os.path.isfile(scoresfile):
            logging.error("`{0}` does not exist. Please process {1} using `needle`".\
                    format(scoresfile, pairsfile))
            sys.exit()
    else:
        scoresfile = "ovl.scores"
        # Calculate overlap length using intersectBed
        calculate_ovl(nbedfile, obedfile, opts, scoresfile)

    logging.warning("`{0}' exists. Storing scores in memory".\
            format(scoresfile))
    scores = read_scores(scoresfile, opts)

    # Iterate through consolidated bed and
    # filter piles based on score
    abedline = {}

    cbed = Bed(cbedfile)
    g = Grouper()
    for c in cbed:
        accn = c.accn
        g.join(*accn.split(";"))

    nbedline = {}
    nbed = Bed(nbedfile)
    for line in nbed:
        nbedline[line.accn] = line

    splits = set()
    for chr, chrbed in nbed.sub_beds():
        abedline, splits = annotate_chr(chr, chrbed, g, scores, nbedline,
                                        abedline, opts, splits)

    if splits is not None:
        abedline = process_splits(splits, scores, nbedline, abedline)

    abedfile = npf + ".annotated.bed"
    afh = open(abedfile, "w")
    for accn in abedline:
        print >> afh, abedline[accn]
    afh.close()

    sort([abedfile, "-i"])
Example no. 41
def segment(args):
    """
    %prog segment loss.ids bedfile

    Merge adjacent gene loss into segmental loss.

    Then based on the segmental loss, estimate amount of DNA loss in base pairs.
    Two estimates can be given:
    - conservative: just within the start and end of a single gene
    - aggressive: extend the deletion track to the next gene

    The real deletion size is within these estimates.
    """
    from jcvi.formats.base import SetFile

    p = OptionParser(segment.__doc__)
    p.add_option("--chain",
                 default=1,
                 type="int",
                 help="Allow next N genes to be chained [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    idsfile, bedfile = args
    bed = Bed(bedfile)
    order = bed.order
    ids = SetFile(idsfile)
    losses = Grouper()
    skip = opts.chain
    for i, a in enumerate(bed):
        a = a.accn
        if a not in ids:
            continue
        losses.join(a, a)
        for j in xrange(i + 1, i + 1 + skip):
            if j >= len(bed):
                break
            b = bed[j].accn
            if b in ids:
                losses.join(a, b)

    losses = list(losses)
    singletons = [x for x in losses if len(x) == 1]
    segments = [x for x in losses if len(x) > 1]
    ns, nm, nt = len(singletons), len(segments), len(losses)
    assert ns + nm == nt

    # Summary for all segments
    for x in sorted(singletons) + sorted(segments):
        print "\t".join(
            str(x) for x in ("|".join(sorted(x)), len(x),
                             estimate_size(x, bed, order)))

    # Find longest segment stretch
    if segments:
        mx, maxsegment = max([(len(x), x) for x in segments])
        print >> sys.stderr, "Longest stretch: run of {0} genes".format(mx)
        print >> sys.stderr, "  {0}".format("|".join(sorted(maxsegment)))
        seg_asize = sum(estimate_size(x, bed, order) for x in segments)
        seg_bsize = sum(estimate_size(x, bed, order, conservative=False) \
                             for x in segments)
    else:
        seg_asize = seg_bsize = 0

    sing_asize = sum(estimate_size(x, bed, order) for x in singletons)
    sing_bsize = sum(estimate_size(x, bed, order, conservative=False) \
                           for x in singletons)
    total_asize = sing_asize + seg_asize
    total_bsize = sing_bsize + seg_bsize
    print >> sys.stderr, "Singleton ({0}): {1} - {2} bp".\
                         format(ns, sing_asize, sing_bsize)
    print >> sys.stderr, "Segment ({0}): {1} - {2} bp".\
                         format(nm, seg_asize, seg_bsize)
    print >> sys.stderr, "Total ({0}): {1} - {2} bp".\
                         format(nt, total_asize, total_bsize)
    print >> sys.stderr, "Average ({0}): {1} bp".\
                         format(nt, (total_asize + total_bsize) / 2)
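
estimate_size is defined elsewhere in this module; the sketch below captures the two bounds the docstring describes, under the assumption that `order` maps an accession to (index, line) and that bed lines expose start/end. Treat it as an illustration, not the actual implementation.

from collections import namedtuple

Gene = namedtuple("Gene", "accn start end")

def estimate_size_sketch(accns, bed, order, conservative=True):
    """Conservative: span of the lost genes themselves.
    Aggressive: stretch outward to the flanking retained genes."""
    idxs = sorted(order[a][0] for a in accns)
    lo, hi = idxs[0], idxs[-1]
    if conservative:
        start, end = bed[lo].start, bed[hi].end
    else:
        start = bed[lo - 1].end if lo > 0 else bed[lo].start
        end = bed[hi + 1].start if hi + 1 < len(bed) else bed[hi].end
    return abs(end - start) + 1

bed = [Gene("g0", 100, 200), Gene("g1", 500, 700), Gene("g2", 1000, 1200)]
order = dict((g.accn, (i, g)) for i, g in enumerate(bed))
print(estimate_size_sketch(["g1"], bed, order))                      # 201
print(estimate_size_sketch(["g1"], bed, order, conservative=False))  # 801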
Example no. 42
def group(args):
    """
    %prog group tabfile > tabfile.grouped

    Given a tab-delimited file, either group all elements within the file or
    group the elements in the value column(s) based on the key (groupby) column

    For example, convert this | into this
    ---------------------------------------
    a   2    3    4           | a,2,3,4,5,6
    a   5    6                | b,7,8
    b   7    8                | c,9,10,11
    c   9                     |
    c  10   11                |

    If grouping by a particular column,
    convert this              | into this:
    ---------------------------------------------
    a   2    3    4           | a   2,5   3,6   4
    a   5    6                | b   7     8
    b   7    8                | c   9,10  11
    c   9                     |
    c  10   11                |

    By default, all the grouped elements are uniquified (deduplicated)
    """
    from jcvi.utils.cbook import AutoVivification
    from jcvi.utils.grouper import Grouper

    p = OptionParser(group.__doc__)
    p.set_sep()
    p.add_option("--groupby",
                 default=None,
                 type="int",
                 help="Default column to groupby")
    p.add_option("--groupsep",
                 default=",",
                 help="Separator to join the grouped elements")
    p.add_option(
        "--nouniq",
        default=False,
        action="store_true",
        help="Do not uniqify the grouped elements",
    )
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    (tabfile, ) = args
    sep = opts.sep
    groupby = opts.groupby
    groupsep = opts.groupsep

    cols = []
    grouper = AutoVivification() if groupby is not None else Grouper()
    fp = must_open(tabfile)
    for row in fp:
        row = row.rstrip()
        atoms = row.split(sep)
        if groupby is not None:
            if len(cols) < len(atoms):
                cols = [x for x in range(len(atoms))]
            if groupby not in cols:
                logging.error(
                    "groupby col index `{0}` is out of range".format(groupby))
                sys.exit()

            key = atoms[groupby]
            for col in cols:
                if col == groupby:
                    continue
                if not grouper[key][col]:
                    grouper[key][col] = [] if opts.nouniq else set()
                if col < len(atoms):
                    if groupsep in atoms[col]:
                        for atom in atoms[col].split(groupsep):
                            if opts.nouniq:
                                grouper[key][col].append(atom)
                            else:
                                grouper[key][col].add(atom)
                    else:
                        if opts.nouniq:
                            grouper[key][col].append(atoms[col])
                        else:
                            grouper[key][col].add(atoms[col])
        else:
            grouper.join(*atoms)

    for key in grouper:
        if groupby is not None:
            line = []
            for col in cols:
                if col == groupby:
                    line.append(key)
                elif col in grouper[key].keys():
                    line.append(groupsep.join(grouper[key][col]))
                else:
                    line.append("na")
            print(sep.join(line))
        else:
            print(groupsep.join(key))
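
When --groupby is set, the nested grouper[key][col] lookups above work because AutoVivification creates missing levels on first access. A minimal version of that classic recipe (jcvi.utils.cbook follows the same pattern, though details may vary):

class AutoVivification(dict):
    """dict that grows nested dicts on demand: d['a'][2] just works."""
    def __getitem__(self, key):
        try:
            return dict.__getitem__(self, key)
        except KeyError:
            value = self[key] = type(self)()
            return value

d = AutoVivification()
if not d["a"][2]:  # first access auto-creates d["a"] and d["a"][2]
    d["a"][2] = set()
d["a"][2].add("x")
print(d)  # {'a': {2: {'x'}}}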
Example no. 43
def consolidate(args):
    """
    %prog consolidate gffile1 gffile2 ... > consolidated.out

    Given 2 or more gff files generated by pasa annotation comparison,
    iterate through each locus (shared locus name or overlapping CDS)
    and identify same/different isoforms (shared splicing structure)
    across the input datasets.

    If `slop` is enabled, consolidation will collapse any variation
    in terminal UTR lengths, keeping the longest as representative.
    """
    from jcvi.formats.base import longest_unique_prefix
    from jcvi.formats.gff import make_index, match_subfeats
    from jcvi.utils.cbook import AutoVivification
    from jcvi.utils.grouper import Grouper
    from itertools import combinations, product

    supported_modes = ["name", "coords"]
    p = OptionParser(consolidate.__doc__)
    p.add_option("--slop", default=False, action="store_true",
            help="allow minor variation in terminal 5'/3' UTR" + \
                 " start/stop position [default: %default]")
    p.add_option("--inferUTR", default=False, action="store_true",
            help="infer presence of UTRs from exon coordinates")
    p.add_option("--mode", default="name", choices=supported_modes,
            help="method used to determine overlapping loci")
    p.add_option("--summary", default=False, action="store_true",
            help="Generate summary table of consolidation process")
    p.add_option("--clusters", default=False, action="store_true",
            help="Generate table of cluster members after consolidation")
    p.set_outfile()

    opts, args = p.parse_args(args)
    slop = opts.slop
    inferUTR = opts.inferUTR
    mode = opts.mode

    if len(args) < 2:
        sys.exit(not p.print_help())

    gffdbx = {}
    for gffile in args:
        dbn = longest_unique_prefix(gffile, args)
        gffdbx[dbn] = make_index(gffile)

    loci = Grouper()
    for dbn in gffdbx:
        odbns = [odbn for odbn in gffdbx if dbn != odbn]
        for gene in gffdbx[dbn].features_of_type('gene', order_by=('seqid', 'start')):
            if mode == "name":
                loci.join(gene.id, (gene.id, dbn))
            else:
                if (gene.id, dbn) not in loci:
                    loci.join((gene.id, dbn))
                    gene_cds = list(gffdbx[dbn].children(gene, \
                        featuretype='CDS', order_by=('start')))
                    gene_cds_start, gene_cds_stop = gene_cds[0].start, \
                        gene_cds[-1].stop
                    for odbn in odbns:
                        for ogene_cds in gffdbx[odbn].region(seqid=gene.seqid, \
                                start=gene_cds_start, end=gene_cds_stop, \
                                strand=gene.strand, featuretype='CDS'):
                            for ogene in gffdbx[odbn].parents(ogene_cds, featuretype='gene'):
                                loci.join((gene.id, dbn), (ogene.id, odbn))

    gfeats = {}
    mrna = AutoVivification()
    for i, locus in enumerate(loci):
        gene = "gene_{0:0{pad}}".format(i, pad=6) \
                if mode == "coords" else None

        for elem in locus:
            if type(elem) == tuple:
                _gene, dbn = elem
                if gene is None: gene = _gene

                g = gffdbx[dbn][_gene]
                if gene not in gfeats:
                    gfeats[gene] = g
                    gfeats[gene].attributes['ID'] = [gene]
                else:
                    if g.start < gfeats[gene].start:
                        gfeats[gene].start = g.start
                    if g.stop > gfeats[gene].stop:
                        gfeats[gene].stop = g.stop

                c = list(gffdbx[dbn].children(_gene, featuretype='mRNA', order_by='start'))
                if len(c) > 0:
                    mrna[gene][dbn] = c

    fw = must_open(opts.outfile, "w")
    print("##gff-version	3", file=fw)
    seen = {}
    if opts.summary:
        summaryfile = "{0}.summary.txt".format(opts.outfile.rsplit(".")[0])
        sfw = must_open(summaryfile, "w")
        summary = ["id"]
        summary.extend(gffdbx.keys())
        print("\t".join(str(x) for x in summary), file=sfw)
    if opts.clusters:
        clustersfile = "{0}.clusters.txt".format(opts.outfile.rsplit(".")[0])
        cfw = must_open(clustersfile, "w")
        clusters = ["id", "dbns", "members", "trlens"]
        print("\t".join(str(x) for x in clusters), file=cfw)
    for gene in mrna:
        g = Grouper()
        dbns = list(combinations(mrna[gene], 2))
        if len(dbns) > 0:
            for dbn1, dbn2 in dbns:
                dbx1, dbx2 = gffdbx[dbn1], gffdbx[dbn2]
                for mrna1, mrna2 in product(mrna[gene][dbn1], mrna[gene][dbn2]):
                    mrna1s, mrna2s = mrna1.stop - mrna1.start + 1, \
                            mrna2.stop - mrna2.start + 1
                    g.join((dbn1, mrna1.id, mrna1s))
                    g.join((dbn2, mrna2.id, mrna2s))

                    if match_subfeats(mrna1, mrna2, dbx1, dbx2, featuretype='CDS'):
                        res = []
                        ftypes = ['exon'] if inferUTR else ['five_prime_UTR', 'three_prime_UTR']
                        for ftype in ftypes:
                            res.append(match_subfeats(mrna1, mrna2, dbx1, dbx2, featuretype=ftype, slop=slop))

                        if all(r == True for r in res):
                            g.join((dbn1, mrna1.id, mrna1s), (dbn2, mrna2.id, mrna2s))
        else:
            for dbn1 in mrna[gene]:
                for mrna1 in mrna[gene][dbn1]:
                    g.join((dbn1, mrna1.id, mrna1.stop - mrna1.start + 1))

        print(gfeats[gene], file=fw)

        for group in g:
            group.sort(key=lambda x: x[2], reverse=True)
            dbs, mrnas = [el[0] for el in group], [el[1] for el in group]
            d, m = dbs[0], mrnas[0]

            dbid, _mrnaid = "|".join(str(x) for x in set(dbs)), []
            for x in mrnas:
                if x not in _mrnaid: _mrnaid.append(x)
            mrnaid = "{0}|{1}".format(dbid, "-".join(_mrnaid))
            if mrnaid not in seen:
                seen[mrnaid] = 0
            else:
                seen[mrnaid] += 1
                mrnaid = "{0}-{1}".format(mrnaid, seen[mrnaid])

            _mrna = gffdbx[d][m]
            _mrna.attributes['ID'] = [mrnaid]
            _mrna.attributes['Parent'] = [gene]
            children = gffdbx[d].children(m, order_by='start')
            print(_mrna, file=fw)
            for child in children:
                child.attributes['ID'] = ["{0}|{1}".format(dbid, child.id)]
                child.attributes['Parent'] = [mrnaid]
                print(child, file=fw)

            if opts.summary:
                summary = [mrnaid]
                summary.extend(['Y' if db in set(dbs) else 'N' for db in gffdbx])
                print("\t".join(str(x) for x in summary), file=sfw)

            if opts.clusters:
                clusters = [mrnaid]
                clusters.append(",".join(str(el[0]) for el in group))
                clusters.append(",".join(str(el[1]) for el in group))
                clusters.append(",".join(str(el[2]) for el in group))
                print("\t".join(str(x) for x in clusters), file=cfw)

    fw.close()
    if opts.summary: sfw.close()
    if opts.clusters: cfw.close()
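
match_subfeats (from jcvi.formats.gff) performs the real isoform comparison above. As a simplified stand-in for the idea, "shared splicing structure with slop" amounts to identical internal exon boundaries, with tolerance only at the two outermost ends; the sketch below is illustrative, not the jcvi implementation.

def same_splicing(exons1, exons2, slop=0):
    """Compare two exon-coordinate lists [(start, end), ...]: internal
    boundaries must match exactly, outermost ends may differ by `slop` bp
    (collapsing terminal UTR variation, as --slop does)."""
    if len(exons1) != len(exons2):
        return False
    b1 = [x for exon in sorted(exons1) for x in exon]  # flatten boundaries
    b2 = [x for exon in sorted(exons2) for x in exon]
    if abs(b1[0] - b2[0]) > slop or abs(b1[-1] - b2[-1]) > slop:
        return False
    return b1[1:-1] == b2[1:-1]

print(same_splicing([(1, 100), (200, 300)],
                    [(5, 100), (200, 305)], slop=10))  # True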