Example #1
0
def gaps(args):
    """
    %prog gaps OM.bed fastafile

    Create patches around OM gaps.
    """
    from jcvi.formats.bed import uniq

    p = OptionParser(gaps.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    ombed, fastafile = args
    ombed = uniq([ombed])
    bed = Bed(ombed)

    for a, b in pairwise(bed):
        om_a = (a.seqid, a.start, a.end, "+")
        om_b = (b.seqid, b.start, b.end, "+")
        ch_a = range_parse(a.accn)
        ch_b = range_parse(b.accn)
        ch_a = (ch_a.seqid, ch_a.start, ch_a.end, "+")
        ch_b = (ch_b.seqid, ch_b.start, ch_b.end, "+")

        om_dist, x = range_distance(om_a, om_b, distmode="ee")
        ch_dist, x = range_distance(ch_a, ch_b, distmode="ee")

        if om_dist <= 0 and ch_dist <= 0:
            continue

        print(a)
        print(b)
        print(om_dist, ch_dist)
Example #2
0
File: bed.py Project: radaniba/jcvi
def distance(args):
    """
    %prog distance bedfile

    Calculate distance between bed features. The output file is a list of
    distances, which can be used to plot histogram, etc.
    """
    from jcvi.utils.iter import pairwise

    p = OptionParser(distance.__doc__)
    p.add_option("--distmode", default="ss", choices=("ss", "ee"),
            help="Distance mode between paired reads. ss is outer distance, " \
                 "ee is inner distance [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    bedfile, = args
    sortedbedfile = sort([bedfile])
    valid = total = 0
    fp = open(sortedbedfile)
    for a, b in pairwise(fp):
        a = BedLine(a)
        b = BedLine(b)
        ar = (a.seqid, a.start, a.end, "+")
        br = (b.seqid, b.start, b.end, "+")
        dist, oo = range_distance(ar, br, distmode=opts.distmode)
        total += 1
        if dist > 0:
            print dist
            valid += 1

    logging.debug("Total valid (> 0) distances: {0}.".\
                  format(percentage(valid, total)))
Example #3
0
File: bed.py Project: yangjl/jcvi
def distance(args):
    """
    %prog distance bedfile

    Calculate distance between bed features. The output file is a list of
    distances, which can be used to plot histogram, etc.
    """
    from jcvi.utils.iter import pairwise

    p = OptionParser(distance.__doc__)
    p.add_option("--distmode", default="ss", choices=("ss", "ee"),
            help="Distance mode between paired reads. ss is outer distance, " \
                 "ee is inner distance [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    bedfile, = args
    sortedbedfile = sort([bedfile])
    valid = total = 0
    fp = open(sortedbedfile)
    for a, b in pairwise(fp):
        a = BedLine(a)
        b = BedLine(b)
        ar = (a.seqid, a.start, a.end, "+")
        br = (b.seqid, b.start, b.end, "+")
        dist, oo = range_distance(ar, br, distmode=opts.distmode)
        total += 1
        if dist > 0:
            print dist
            valid += 1

    logging.debug("Total valid (> 0) distances: {0}.".\
                  format(percentage(valid, total)))
Example #4
0
def fromblast(args):
    """
    %prog fromblast blastfile subject.fasta

    Generate path from BLAST file. If multiple subjects map to the same query,
    an edge is constructed between them (with the link provided by the query).

    The BLAST file MUST be filtered, chained, supermapped.
    """
    from jcvi.formats.blast import sort
    from jcvi.utils.range import range_distance

    p = OptionParser(fromblast.__doc__)
    p.add_option(
        "--clique",
        default=False,
        action="store_true",
        help="Populate clique instead of linear path",
    )
    p.add_option(
        "--maxdist",
        default=100000,
        type="int",
        help="Create edge within certain distance",
    )
    p.set_verbose(help="Print verbose reports to stdout")
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    blastfile, subjectfasta = args
    clique = opts.clique
    maxdist = opts.maxdist
    sort([blastfile, "--query"])
    blast = BlastSlow(blastfile, sorted=True)
    g = BiGraph()
    for query, blines in groupby(blast, key=lambda x: x.query):
        blines = list(blines)
        iterator = combinations(blines, 2) if clique else pairwise(blines)
        for a, b in iterator:
            asub, bsub = a.subject, b.subject
            if asub == bsub:
                continue

            arange = (a.query, a.qstart, a.qstop, "+")
            brange = (b.query, b.qstart, b.qstop, "+")
            dist, oo = range_distance(arange, brange, distmode="ee")
            if dist > maxdist:
                continue

            atag = ">" if a.orientation == "+" else "<"
            btag = ">" if b.orientation == "+" else "<"
            g.add_edge(asub, bsub, atag, btag)

    graph_to_agp(g, blastfile, subjectfasta, verbose=opts.verbose)
Example #5
0
def fromblast(args):
    """
    %prog fromblast blastfile subject.fasta

    Generate path from BLAST file. If multiple subjects map to the same query,
    an edge is constructed between them (with the link provided by the query).

    The BLAST file MUST be filtered, chained, supermapped.
    """
    from jcvi.formats.blast import sort
    from jcvi.utils.range import range_distance

    p = OptionParser(fromblast.__doc__)
    p.add_option("--clique", default=False, action="store_true",
                 help="Populate clique instead of linear path [default: %default]")
    p.add_option("--maxdist", default=100000, type="int",
                 help="Create edge within certain distance [default: %default]")
    p.add_option("--verbose", default=False, action="store_true",
                 help="Print verbose reports to stdout [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    blastfile, subjectfasta = args
    clique = opts.clique
    maxdist = opts.maxdist
    sort([blastfile, "--query"])
    blast = BlastSlow(blastfile, sorted=True)
    g = BiGraph()
    for query, blines in groupby(blast, key=lambda x: x.query):
        blines = list(blines)
        iterator = combinations(blines, 2) if clique else pairwise(blines)
        for a, b in iterator:
            asub, bsub = a.subject, b.subject
            if asub == bsub:
                continue

            arange = (a.query, a.qstart, a.qstop, "+")
            brange = (b.query, b.qstart, b.qstop, "+")
            dist, oo = range_distance(arange, brange, distmode="ee")
            if dist > maxdist:
                continue

            atag = ">" if a.orientation == "+" else "<"
            btag = ">" if b.orientation == "+" else "<"
            g.add_edge(BiEdge(asub, bsub, atag, btag))

    graph_to_agp(g, blastfile, subjectfasta, verbose=opts.verbose)
Example #6
0
def get_distance(a, b, xaxis=True):
    """
    Returns the distance between two blast HSPs.
    """
    if xaxis:
        arange = ("0", a.qstart, a.qstop, a.orientation)  # 0 is the dummy chromosome
        brange = ("0", b.qstart, b.qstop, b.orientation)
    else:
        arange = ("0", a.sstart, a.sstop, a.orientation)
        brange = ("0", b.sstart, b.sstop, b.orientation)

    dist, oo = range_distance(arange, brange, distmode="ee")
    dist = abs(dist)

    return dist
Example #7
0
def get_distance(a, b, xaxis=True):
    """
    Returns the distance between two blast HSPs.
    """
    if xaxis:
        arange = ("0", a.qstart, a.qstop, a.orientation)  # 0 is the dummy chromosome
        brange = ("0", b.qstart, b.qstop, b.orientation)
    else:
        arange = ("0", a.sstart, a.sstop, a.orientation)
        brange = ("0", b.sstart, b.sstop, b.orientation)

    dist, oo = range_distance(arange, brange, distmode="ee")
    dist = abs(dist)

    return dist
Example #8
0
def gaps(args):
    """
    %prog gaps OM.bed fastafile

    Create patches around OM gaps.
    """
    from jcvi.formats.bed import uniq
    from jcvi.utils.iter import pairwise

    p = OptionParser(gaps.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    ombed, fastafile = args
    ombed = uniq([ombed])
    bed = Bed(ombed)

    for a, b in pairwise(bed):
        om_a = (a.seqid, a.start, a.end, "+")
        om_b = (b.seqid, b.start, b.end, "+")
        ch_a = range_parse(a.accn)
        ch_b = range_parse(b.accn)
        ch_a = (ch_a.seqid, ch_a.start, ch_a.end, "+")
        ch_b = (ch_b.seqid, ch_b.start, ch_b.end, "+")

        om_dist, x = range_distance(om_a, om_b, distmode="ee")
        ch_dist, x = range_distance(ch_a, ch_b, distmode="ee")

        if om_dist <= 0 and ch_dist <= 0:
            continue

        print a
        print b
        print om_dist, ch_dist
Example #9
0
def estimate_size(accns, bed, order, conservative=True):
    """
    Estimate the bp length for the deletion tracks, indicated by the gene accns.
    True different levels of estimates vary on conservativeness.
    """
    accns = [order[x] for x in accns]
    ii, bb = zip(*accns)
    mini, maxi = min(ii), max(ii)
    if not conservative: # extend one gene
        mini -= 1
        maxi += 1
    minb = bed[mini]
    maxb = bed[maxi]
    assert minb.seqid == maxb.seqid
    distmode = "ss" if conservative else "ee"
    ra = (minb.seqid, minb.start, minb.end, "+")
    rb = (maxb.seqid, maxb.start, maxb.end, "+")

    dist, orientation = range_distance(ra, rb, distmode=distmode)
    assert dist != -1
    return dist
Example #10
0
def estimate_size(accns, bed, order, conservative=True):
    """
    Estimate the bp length for the deletion tracks, indicated by the gene accns.
    True different levels of estimates vary on conservativeness.
    """
    accns = [order[x] for x in accns]
    ii, bb = zip(*accns)
    mini, maxi = min(ii), max(ii)
    if not conservative:  # extend one gene
        mini -= 1
        maxi += 1
    minb = bed[mini]
    maxb = bed[maxi]
    assert minb.seqid == maxb.seqid
    distmode = "ss" if conservative else "ee"
    ra = (minb.seqid, minb.start, minb.end, "+")
    rb = (maxb.seqid, maxb.start, maxb.end, "+")

    dist, orientation = range_distance(ra, rb, distmode=distmode)
    assert dist != -1
    return dist
Example #11
0
def fastpairs(args):
    """
    %prog fastpairs castabfile

    Assuming paired reads are adjacent in the castabfile. Print pair distance
    and orientations.
    """
    from jcvi.utils.range import range_distance
    from jcvi.assembly.base import orientationlabels

    p = OptionParser(fastpairs.__doc__)

    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(p.print_help())

    castabfile, = args
    fp = open(castabfile)
    arow = fp.readline()
    while arow:
        brow = fp.readline()
        a, b = CasTabLine(arow), CasTabLine(brow)
        asubject, astart, astop = a.refnum, a.refstart, a.refstop
        bsubject, bstart, bstop = b.refnum, b.refstart, b.refstop
        if -1 not in (astart, bstart):
            aquery, bquery = a.readname, b.readname
            astrand, bstrand = a.strand, b.strand
            dist, orientation = range_distance(\
                (asubject, astart, astop, astrand),
                (bsubject, bstart, bstop, bstrand)
                    )
            orientation = orientationlabels[orientation]
            if dist != -1:
                print "\t".join(
                    str(x) for x in (aquery, bquery, dist, orientation))
        arow = fp.readline()
Example #12
0
File: cas.py Project: bennyyu/jcvi
def fastpairs(args):
    """
    %prog fastpairs castabfile

    Assuming paired reads are adjacent in the castabfile. Print pair distance
    and orientations.
    """
    from jcvi.utils.range import range_distance
    from jcvi.assembly.base import orientationlabels

    p = OptionParser(fastpairs.__doc__)

    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(p.print_help())

    castabfile, = args
    fp = open(castabfile)
    arow = fp.readline()
    while arow:
        brow = fp.readline()
        a, b = CasTabLine(arow), CasTabLine(brow)
        asubject, astart, astop = a.refnum, a.refstart, a.refstop
        bsubject, bstart, bstop = b.refnum, b.refstart, b.refstop
        if -1 not in (astart, bstart):
            aquery, bquery = a.readname, b.readname
            astrand, bstrand = a.strand, b.strand
            dist, orientation = range_distance(\
                (asubject, astart, astop, astrand),
                (bsubject, bstart, bstop, bstrand)
                    )
            orientation = orientationlabels[orientation]
            if dist != -1:
                print "\t".join(str(x) for x in (aquery, bquery, dist, orientation))
        arow = fp.readline()
Example #13
0
def pastegenes(args):
    """
    %prog pastegenes coverage.list old.genes.bed new.genes.bed old.assembly

    Paste in zero or low coverage genes.  For a set of neighboring genes
    missing, add the whole cassette as unplaced scaffolds. For singletons the
    program will try to make a patch.
    """
    from jcvi.formats.base import DictFile
    from jcvi.utils.cbook import gene_name

    p = OptionParser(pastegenes.__doc__)
    p.add_option("--cutoff", default=90, type="int",
                 help="Coverage cutoff to call gene missing [default: %default]")
    p.add_option("--flank", default=2000, type="int",
                 help="Get the seq of size on two ends [default: %default]")
    p.add_option("--maxsize", default=50000, type="int",
            help="Maximum size of patchers to be replaced [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 4:
        sys.exit(not p.print_help())

    coveragefile, oldbed, newbed, oldassembly = args
    cutoff = opts.cutoff
    flank = opts.flank
    maxsize = opts.maxsize

    coverage = DictFile(coveragefile, valuepos=2, cast=float)

    obed = Bed(oldbed)
    order = obed.order
    bed = [x for x in obed if x.accn in coverage]
    key = lambda x: coverage[x.accn] >= cutoff

    extrabed = "extra.bed"
    extendbed = "extend.bed"
    pastebed = "paste.bed"

    fw = open(extrabed, "w")
    fwe = open(extendbed, "w")
    fwp = open(pastebed, "w")
    fw_ids = open(extendbed + ".ids", "w")

    singletons, large, large_genes = 0, 0, 0
    for chr, chrbed in groupby(bed, key=lambda x: x.seqid):
        chrbed = list(chrbed)
        for good, beds in groupby(chrbed, key=key):
            if good:
                continue

            beds = list(beds)
            blocksize = len(set([gene_name(x.accn) for x in beds]))
            if blocksize == 1:
                singletons += 1
                accn = beds[0].accn
                gi, gb = order[accn]
                leftb = obed[gi - 1]
                rightb = obed[gi + 1]
                leftr = leftb.range
                rightr = rightb.range
                cur = gb.range
                distance_to_left, oo = range_distance(leftr, cur)
                distance_to_right, oo = range_distance(cur, rightr)
                span, oo = range_distance(leftr, rightr)

                if distance_to_left <= distance_to_right and \
                   distance_to_left > 0:
                    label = "LEFT"
                else:
                    label = "RIGHT"

                if 0 < span <= maxsize:
                    print >> fwp, "\t".join(str(x) for x in \
                                    (chr, leftb.start, rightb.end, gb.accn))

                print >> fwe, leftb
                print >> fwe, gb
                print >> fwe, rightb
                print >> fwe, "L:{0} R:{1} [{2}]".format(distance_to_left, \
                            distance_to_right, label)
                print >> fw_ids, gb.accn
                continue

            large += 1
            large_genes += blocksize

            ranges = [(x.start, x.end) for x in beds]
            rmin, rmax = range_minmax(ranges)
            rmin -= flank
            rmax += flank

            name = "-".join((beds[0].accn, beds[-1].accn))
            print >> fw, "\t".join(str(x) for x in (chr, rmin - 1, rmax, name))

    fw.close()
    fwe.close()

    extrabed = mergeBed(extrabed, d=flank, nms=True)
    fastaFromBed(extrabed, oldassembly, name=True)
    summary([extrabed])

    logging.debug("Singleton blocks : {0}".format(singletons))
    logging.debug("Large blocks : {0} ({1} genes)".format(large, large_genes))
Example #14
0
def test_range_distance(a, b, distmode, expected):
    from jcvi.utils.range import range_distance

    assert range_distance(a, b, distmode) == expected
Example #15
0
File: bed.py Project: radaniba/jcvi
def report_pairs(data,
                 cutoff=0,
                 mateorientation=None,
                 pairsfile=None,
                 insertsfile=None,
                 rclip=1,
                 ascii=False,
                 bins=20,
                 distmode="ss",
                 mpcutoff=1000):
    """
    This subroutine is used by the pairs function in blast.py and cas.py.
    Reports number of fragments and pairs as well as linked pairs
    """
    allowed_mateorientations = ("++", "--", "+-", "-+")

    if mateorientation:
        assert mateorientation in allowed_mateorientations

    num_fragments, num_pairs = 0, 0

    all_dist = []
    linked_dist = []
    # +- (forward-backward) is `innie`, -+ (backward-forward) is `outie`
    orientations = defaultdict(int)

    # clip how many chars from end of the read name to get pair name
    key = (lambda x: x.accn[:-rclip]) if rclip else (lambda x: x.accn)
    data.sort(key=key)

    if pairsfile:
        pairsfw = open(pairsfile, "w")
    if insertsfile:
        insertsfw = open(insertsfile, "w")

    for pe, lines in groupby(data, key=key):
        lines = list(lines)
        if len(lines) != 2:
            num_fragments += len(lines)
            continue

        num_pairs += 1
        a, b = lines

        asubject, astart, astop = a.seqid, a.start, a.end
        bsubject, bstart, bstop = b.seqid, b.start, b.end

        aquery, bquery = a.accn, b.accn
        astrand, bstrand = a.strand, b.strand

        dist, orientation = range_distance(\
                (asubject, astart, astop, astrand),
                (bsubject, bstart, bstop, bstrand),
                distmode=distmode)

        if dist >= 0:
            all_dist.append((dist, orientation, aquery, bquery))

    # select only pairs with certain orientations - e.g. innies, outies, etc.
    if mateorientation:
        all_dist = [x for x in all_dist if x[1] == mateorientation]

    # try to infer cutoff as twice the median until convergence
    if cutoff <= 0:
        dists = np.array([x[0] for x in all_dist], dtype="int")
        p0 = analyze_dists(dists, cutoff=mpcutoff)
        cutoff = int(2 * p0)  # initial estimate
        cutoff = int(math.ceil(cutoff / bins)) * bins
        logging.debug("Insert size cutoff set to {0}, ".format(cutoff) +
                      "use '--cutoff' to override")

    for dist, orientation, aquery, bquery in all_dist:
        if dist > cutoff:
            continue
        if cutoff > 2 * mpcutoff and dist < mpcutoff:
            continue

        linked_dist.append(dist)
        if pairsfile:
            print >> pairsfw, "{0}\t{1}\t{2}".format(aquery, bquery, dist)
        orientations[orientation] += 1

    print >>sys.stderr, "{0} fragments, {1} pairs ({2} total)".\
                format(num_fragments, num_pairs, num_fragments + num_pairs * 2)

    s = SummaryStats(linked_dist, dtype="int")
    num_links = s.size

    meandist, stdev = s.mean, s.sd
    p0, p1, p2 = s.median, s.p1, s.p2

    print >>sys.stderr, "%d pairs (%.1f%%) are linked (cutoff=%d)" % \
            (num_links, num_links * 100. / num_pairs, cutoff)
    print >>sys.stderr, "mean distance between mates: {0} +/- {1}".\
            format(meandist, stdev)
    print >> sys.stderr, "median distance between mates: {0}".format(p0)
    print >> sys.stderr, "95% distance range: {0} - {1}".format(p1, p2)
    print >> sys.stderr, "\nOrientations:"

    orientation_summary = []
    for orientation, count in sorted(orientations.items()):
        o = "{0}:{1}".format(orientation, \
                percentage(count, num_links, mode=1))
        orientation_summary.append(o.split()[0])
        print >> sys.stderr, o

    if insertsfile:
        from jcvi.graphics.histogram import histogram

        print >> insertsfw, "\n".join(str(x) for x in linked_dist)
        insertsfw.close()
        prefix = insertsfile.rsplit(".", 1)[0]
        if prefix > 10:
            prefix = prefix.split("-")[0]
        osummary = " ".join(orientation_summary)
        title = "{0} ({1}; median:{2} bp)".format(prefix, osummary, p0)
        histogram(insertsfile,
                  vmin=0,
                  vmax=cutoff,
                  bins=bins,
                  xlabel="Insertsize",
                  title=title,
                  ascii=ascii)
        if op.exists(insertsfile):
            os.remove(insertsfile)

    return s
Example #16
0
def fromblast(args):
    """
    %prog fromblast blastfile subject.fasta

    Generate path from BLAST file. If multiple subjects map to the same query,
    an edge is constructed between them (with the link provided by the query).

    The BLAST file MUST be filtered, chained, supermapped.
    """
    from jcvi.formats.blast import sort
    from jcvi.utils.range import range_distance

    p = OptionParser(fromblast.__doc__)
    p.add_option("--clique", default=False, action="store_true",
                 help="Populate clique instead of linear path [default: %default]")
    p.add_option("--maxdist", default=100000, type="int",
                 help="Create edge within certain distance [default: %default]")
    p.add_option("--verbose", default=False, action="store_true",
                 help="Print verbose reports to stdout [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    blastfile, subjectfasta = args
    clique = opts.clique
    maxdist = opts.maxdist
    sort([blastfile, "--query"])
    blast = BlastSlow(blastfile, sorted=True)
    g = BiGraph()
    for query, blines in groupby(blast, key=lambda x: x.query):
        blines = list(blines)
        iterator = combinations(blines, 2) if clique else pairwise(blines)
        for a, b in iterator:
            asub, bsub = a.subject, b.subject
            if asub == bsub:
                continue

            arange = (a.query, a.qstart, a.qstop, "+")
            brange = (b.query, b.qstart, b.qstop, "+")
            dist, oo = range_distance(arange, brange, distmode="ee")
            if dist > maxdist:
                continue

            atag = ">" if a.orientation == "+" else "<"
            btag = ">" if b.orientation == "+" else "<"
            g.add_edge(BiEdge(asub, bsub, atag, btag))

    g.write("graph.txt")
    #g.draw("graph.pdf")

    logging.debug(str(g))
    paths = []
    for path in g.iter_paths():
        m, oo = g.path(path)
        if len(oo) == 1:  # Singleton path
            continue
        paths.append(oo)
        if opts.verbose:
            print m
            print oo

    npaths = len(paths)
    ntigs = sum(len(x) for x in paths)
    logging.debug("Graph decomposed to {0} paths with {1} components.".\
                  format(npaths, ntigs))

    agpfile = blastfile + ".agp"
    sizes = Sizes(subjectfasta)
    fwagp = open(agpfile, "w")
    scaffolded = set()
    for i, oo in enumerate(paths):
        ctgorder = [(str(ctg), ("+" if strand else "-")) \
                     for ctg, strand in oo]
        scaffolded |= set(ctg for ctg, strand in ctgorder)
        object = "pmol_{0:04d}".format(i)
        order_to_agp(object, ctgorder, sizes.mapping, fwagp)

    # Get the singletons as well
    nsingletons = 0
    for ctg, size in sizes.iter_sizes():
        if ctg in scaffolded:
            continue

        ctgorder = [(ctg, "+")]
        object = ctg
        order_to_agp(object, ctgorder, sizes.mapping, fwagp)
        nsingletons += 1
    logging.debug("Written {0} unscaffolded singletons.".format(nsingletons))

    fwagp.close()
    logging.debug("AGP file written to `{0}`.".format(agpfile))
Example #17
0
def fromblast(args):
    """
    %prog fromblast blastfile subject.fasta

    Generate path from BLAST file. If multiple subjects map to the same query,
    an edge is constructed between them (with the link provided by the query).

    The BLAST file MUST be filtered, chained, supermapped.
    """
    from jcvi.formats.blast import sort
    from jcvi.utils.range import range_distance

    p = OptionParser(fromblast.__doc__)
    p.add_option(
        "--clique",
        default=False,
        action="store_true",
        help="Populate clique instead of linear path [default: %default]")
    p.add_option(
        "--maxdist",
        default=100000,
        type="int",
        help="Create edge within certain distance [default: %default]")
    p.add_option("--verbose",
                 default=False,
                 action="store_true",
                 help="Print verbose reports to stdout [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    blastfile, subjectfasta = args
    clique = opts.clique
    maxdist = opts.maxdist
    sort([blastfile, "--query"])
    blast = BlastSlow(blastfile, sorted=True)
    g = BiGraph()
    for query, blines in groupby(blast, key=lambda x: x.query):
        blines = list(blines)
        iterator = combinations(blines, 2) if clique else pairwise(blines)
        for a, b in iterator:
            asub, bsub = a.subject, b.subject
            if asub == bsub:
                continue

            arange = (a.query, a.qstart, a.qstop, "+")
            brange = (b.query, b.qstart, b.qstop, "+")
            dist, oo = range_distance(arange, brange, distmode="ee")
            if dist > maxdist:
                continue

            atag = ">" if a.orientation == "+" else "<"
            btag = ">" if b.orientation == "+" else "<"
            g.add_edge(BiEdge(asub, bsub, atag, btag))

    g.write("graph.txt")
    #g.draw("graph.pdf")

    logging.debug(str(g))
    paths = []
    for path in g.iter_paths():
        m, oo = g.path(path)
        if len(oo) == 1:  # Singleton path
            continue
        paths.append(oo)
        if opts.verbose:
            print m
            print oo

    npaths = len(paths)
    ntigs = sum(len(x) for x in paths)
    logging.debug("Graph decomposed to {0} paths with {1} components.".\
                  format(npaths, ntigs))

    agpfile = blastfile + ".agp"
    sizes = Sizes(subjectfasta)
    fwagp = open(agpfile, "w")
    scaffolded = set()
    for i, oo in enumerate(paths):
        ctgorder = [(str(ctg), ("+" if strand else "-")) \
                     for ctg, strand in oo]
        scaffolded |= set(ctg for ctg, strand in ctgorder)
        object = "pmol_{0:04d}".format(i)
        order_to_agp(object, ctgorder, sizes.mapping, fwagp)

    # Get the singletons as well
    nsingletons = 0
    for ctg, size in sizes.iter_sizes():
        if ctg in scaffolded:
            continue

        ctgorder = [(ctg, "+")]
        object = ctg
        order_to_agp(object, ctgorder, sizes.mapping, fwagp)
        nsingletons += 1
    logging.debug("Written {0} unscaffolded singletons.".format(nsingletons))

    fwagp.close()
    logging.debug("AGP file written to `{0}`.".format(agpfile))
Example #18
0
def pastegenes(args):
    """
    %prog pastegenes coverage.list old.genes.bed new.genes.bed old.assembly

    Paste in zero or low coverage genes.  For a set of neighboring genes
    missing, add the whole cassette as unplaced scaffolds. For singletons the
    program will try to make a patch.
    """
    from jcvi.formats.base import DictFile
    from jcvi.utils.cbook import gene_name

    p = OptionParser(pastegenes.__doc__)
    p.add_option(
        "--cutoff",
        default=90,
        type="int",
        help="Coverage cutoff to call gene missing",
    )
    p.add_option(
        "--flank",
        default=2000,
        type="int",
        help="Get the seq of size on two ends",
    )
    p.add_option(
        "--maxsize",
        default=50000,
        type="int",
        help="Maximum size of patchers to be replaced",
    )
    opts, args = p.parse_args(args)

    if len(args) != 4:
        sys.exit(not p.print_help())

    coveragefile, oldbed, newbed, oldassembly = args
    cutoff = opts.cutoff
    flank = opts.flank
    maxsize = opts.maxsize

    coverage = DictFile(coveragefile, valuepos=2, cast=float)

    obed = Bed(oldbed)
    order = obed.order
    bed = [x for x in obed if x.accn in coverage]
    key = lambda x: coverage[x.accn] >= cutoff

    extrabed = "extra.bed"
    extendbed = "extend.bed"
    pastebed = "paste.bed"

    fw = open(extrabed, "w")
    fwe = open(extendbed, "w")
    fwp = open(pastebed, "w")
    fw_ids = open(extendbed + ".ids", "w")

    singletons, large, large_genes = 0, 0, 0
    for chr, chrbed in groupby(bed, key=lambda x: x.seqid):
        chrbed = list(chrbed)
        for good, beds in groupby(chrbed, key=key):
            if good:
                continue

            beds = list(beds)
            blocksize = len(set([gene_name(x.accn) for x in beds]))
            if blocksize == 1:
                singletons += 1
                accn = beds[0].accn
                gi, gb = order[accn]
                leftb = obed[gi - 1]
                rightb = obed[gi + 1]
                leftr = leftb.range
                rightr = rightb.range
                cur = gb.range
                distance_to_left, oo = range_distance(leftr, cur)
                distance_to_right, oo = range_distance(cur, rightr)
                span, oo = range_distance(leftr, rightr)

                if distance_to_left <= distance_to_right and distance_to_left > 0:
                    label = "LEFT"
                else:
                    label = "RIGHT"

                if 0 < span <= maxsize:
                    print(
                        "\t".join(
                            str(x) for x in (chr, leftb.start, rightb.end, gb.accn)
                        ),
                        file=fwp,
                    )

                print(leftb, file=fwe)
                print(gb, file=fwe)
                print(rightb, file=fwe)
                print(
                    "L:{0} R:{1} [{2}]".format(
                        distance_to_left, distance_to_right, label
                    ),
                    file=fwe,
                )
                print(gb.accn, file=fw_ids)
                continue

            large += 1
            large_genes += blocksize

            ranges = [(x.start, x.end) for x in beds]
            rmin, rmax = range_minmax(ranges)
            rmin -= flank
            rmax += flank

            name = "-".join((beds[0].accn, beds[-1].accn))
            print("\t".join(str(x) for x in (chr, rmin - 1, rmax, name)), file=fw)

    fw.close()
    fwe.close()

    extrabed = mergeBed(extrabed, d=flank, nms=True)
    fastaFromBed(extrabed, oldassembly, name=True)
    summary([extrabed])

    logging.debug("Singleton blocks : {0}".format(singletons))
    logging.debug("Large blocks : {0} ({1} genes)".format(large, large_genes))
Example #19
0
def report_pairs(data, cutoff=0, mateorientation=None,
        pairsfile=None, insertsfile=None, rclip=1, ascii=False, bins=20,
        distmode="ss"):
    """
    This subroutine is used by the pairs function in blast.py and cas.py.
    Reports number of fragments and pairs as well as linked pairs
    """
    from jcvi.utils.cbook import percentage

    allowed_mateorientations = ("++", "--", "+-", "-+")

    if mateorientation:
        assert mateorientation in allowed_mateorientations

    num_fragments, num_pairs = 0, 0

    all_dist = []
    linked_dist = []
    # +- (forward-backward) is `innie`, -+ (backward-forward) is `outie`
    orientations = defaultdict(int)

    # clip how many chars from end of the read name to get pair name
    key = (lambda x: x.accn[:-rclip]) if rclip else (lambda x: x.accn)
    data.sort(key=key)

    if pairsfile:
        pairsfw = open(pairsfile, "w")
    if insertsfile:
        insertsfw = open(insertsfile, "w")

    for pe, lines in groupby(data, key=key):
        lines = list(lines)
        if len(lines) != 2:
            num_fragments += len(lines)
            continue

        num_pairs += 1
        a, b = lines

        asubject, astart, astop = a.seqid, a.start, a.end
        bsubject, bstart, bstop = b.seqid, b.start, b.end

        aquery, bquery = a.accn, b.accn
        astrand, bstrand = a.strand, b.strand

        dist, orientation = range_distance(\
                (asubject, astart, astop, astrand),
                (bsubject, bstart, bstop, bstrand),
                distmode=distmode)

        if dist >= 0:
            all_dist.append((dist, orientation, aquery, bquery))

    # select only pairs with certain orientations - e.g. innies, outies, etc.
    if mateorientation:
        all_dist = [x for x in all_dist if x[1] == mateorientation]

    # try to infer cutoff as twice the median until convergence
    if cutoff <= 0:
        dists = np.array([x[0] for x in all_dist], dtype="int")
        p0 = np.median(dists)
        cutoff = int(2 * p0)  # initial estimate
        cutoff = int(math.ceil(cutoff / bins)) * bins
        logging.debug("Insert size cutoff set to {0}, ".format(cutoff) +
            "use '--cutoff' to override")

    for dist, orientation, aquery, bquery in all_dist:
        if dist > cutoff:
            continue

        linked_dist.append(dist)
        if pairsfile:
            print >> pairsfw, "{0}\t{1}\t{2}".format(aquery, bquery, dist)
        orientations[orientation] += 1

    print >>sys.stderr, "%d fragments, %d pairs" % (num_fragments, num_pairs)
    num_links = len(linked_dist)

    linked_dist = np.array(linked_dist, dtype="int")
    linked_dist = np.sort(linked_dist)

    meandist = np.mean(linked_dist)
    stdev = np.std(linked_dist)

    p0 = np.median(linked_dist)
    p1 = linked_dist[int(num_links * .025)]
    p2 = linked_dist[int(num_links * .975)]

    meandist, stdev = int(meandist), int(stdev)
    p0 = int(p0)

    print >>sys.stderr, "%d pairs (%.1f%%) are linked (cutoff=%d)" % \
            (num_links, num_links * 100. / num_pairs, cutoff)

    print >>sys.stderr, "mean distance between mates: {0} +/- {1}".\
            format(meandist, stdev)
    print >>sys.stderr, "median distance between mates: {0}".format(p0)
    print >>sys.stderr, "95% distance range: {0} - {1}".format(p1, p2)
    print >>sys.stderr, "\nOrientations:"

    orientation_summary = []
    for orientation, count in sorted(orientations.items()):
        o = "{0}:{1}".format(orientation, \
                percentage(count, num_links, denominator=False))
        orientation_summary.append(o.split()[0])
        print >>sys.stderr, o

    if insertsfile:
        from jcvi.graphics.histogram import histogram

        print >>insertsfw, "\n".join(str(x) for x in linked_dist)
        insertsfw.close()
        prefix = insertsfile.rsplit(".", 1)[0]
        osummary = " ".join(orientation_summary)
        title="{0} ({1}; median dist:{2})".format(prefix, osummary, p0)
        histogram(insertsfile, vmin=0, vmax=cutoff, bins=bins,
                xlabel="Insertsize", title=title, ascii=ascii)
        if op.exists(insertsfile):
            os.remove(insertsfile)

    return meandist, stdev, p0, p1, p2