Ejemplo n.º 1
0
Archivo: gaps.py Proyecto: zjwang6/jcvi
def estimate(args):
    """
    %prog estimate gaps.bed all.spans.bed all.mates

    Estimate gap sizes based on mate positions and library insert sizes.
    """
    from collections import defaultdict
    from jcvi.formats.bed import intersectBed_wao
    from jcvi.formats.posmap import MatesFile

    p = OptionParser(estimate.__doc__)
    p.add_option("--minlinks",
                 default=3,
                 type="int",
                 help="Minimum number of links to place [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    gapsbed, spansbed, matesfile = args
    mf = MatesFile(matesfile)
    bed = Bed(gapsbed)
    order = bed.order

    gap2mate = defaultdict(set)
    mate2gap = defaultdict(set)

    for a, b in intersectBed_wao(gapsbed, spansbed):
        gapsize = a.span
        if gapsize != 100:
            continue

        gapname = a.accn

        if b is None:
            gap2mate[gapname] = set()
            continue

        matename = b.accn
        gap2mate[gapname].add(matename)
        mate2gap[matename].add(gapname)

    omgapsbed = "gaps.linkage.bed"
    fw = open(omgapsbed, "w")
    for gapname, mates in sorted(gap2mate.items()):
        i, b = order[gapname]
        nmates = len(mates)
        if nmates < opts.minlinks:
            print("{0}\t{1}".format(b, nmates), file=fw)
            continue

        print(gapname, mates)

    fw.close()
Ejemplo n.º 2
0
def bambus(args):
    """
    %prog bambus bambus.bed bambus.mates total.fasta

    Insert unplaced scaffolds based on mates.
    """
    from jcvi.formats.bed import BedLine
    from jcvi.formats.posmap import MatesFile

    p = OptionParser(bambus.__doc__)
    p.add_option(
        "--prefix",
        default="scaffold",
        help="Prefix of the unplaced scaffolds",
    )
    p.add_option(
        "--minlinks",
        default=3,
        type="int",
        help="Minimum number of links to place",
    )
    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    bedfile, matesfile, fastafile = args
    pf = matesfile.rsplit(".", 1)[0]
    logfile = pf + ".log"
    log = open(logfile, "w")

    mf = MatesFile(matesfile)
    maxdist = max(x.max for x in mf.libraries.values())
    logging.debug("Max separation: {0}".format(maxdist))

    prefix = opts.prefix
    minlinks = opts.minlinks

    is_unplaced = lambda x: x.startswith(prefix)
    bed = Bed(bedfile, sorted=False)
    beds = []
    unplaced = defaultdict(list)

    for a, b in pairwise(bed):
        aname, bname = a.accn, b.accn
        aseqid, bseqid = a.seqid, b.seqid

        if aname not in mf:
            continue

        pa, la = mf[aname]
        if pa != bname:
            continue

        ia = is_unplaced(aseqid)
        ib = is_unplaced(bseqid)
        if ia == ib:
            continue

        if ia:
            a, b = b, a

        unplaced[b.seqid].append((a, b))
        beds.extend([a, b])

    sizes = Sizes(fastafile)
    candidatebed = Bed()
    cbeds = []
    # For each unplaced scaffold, find most likely placement and orientation
    for scf, beds in sorted(unplaced.items()):
        print(file=log)
        ranges = []
        for a, b in beds:
            aname, astrand = a.accn, a.strand
            bname, bstrand = b.accn, b.strand
            aseqid, bseqid = a.seqid, b.seqid
            pa, lib = mf[aname]

            print(a, file=log)
            print(b, file=log)

            flip_b = astrand == bstrand
            fbstrand = "-" if flip_b else "+"
            if flip_b:
                b.reverse_complement(sizes)

            lmin, lmax = lib.min, lib.max

            L = sizes.get_size(scf)
            assert astrand in ("+", "-")
            if astrand == "+":
                offset = a.start - b.end
                sstart, sstop = offset + lmin, offset + lmax
            else:
                offset = a.end - b.start + L
                sstart, sstop = offset - lmax, offset - lmin

            # Prevent out of range error
            size = sizes.get_size(aseqid)
            sstart = max(0, sstart)
            sstop = max(0, sstop)
            sstart = min(size - 1, sstart)
            sstop = min(size - 1, sstop)

            start_range = (aseqid, sstart, sstop, scf, 1, fbstrand)
            print("*" + "\t".join(str(x) for x in start_range), file=log)
            ranges.append(start_range)

        mranges = [x[:3] for x in ranges]
        # Determine placement by finding the interval with the most support
        rd = ranges_depth(mranges, sizes.mapping, verbose=False)
        alldepths = []
        for depth in rd:
            alldepths.extend(depth)
        print(alldepths, file=log)

        maxdepth = max(alldepths, key=lambda x: x[-1])[-1]
        if maxdepth < minlinks:
            print("Insufficient links ({0} < {1})".format(maxdepth, minlinks), file=log)
            continue

        candidates = [x for x in alldepths if x[-1] == maxdepth]
        nseqids = len(set(x[0] for x in candidates))
        if nseqids != 1:
            msg = "Multiple conflicting candidates found"
            print(msg, file=log)
            continue

        seqid, mmin, mmax, depth = candidates[0]
        mmin, mmax = range_minmax([x[1:3] for x in candidates])

        if mmin >= mmax:
            msg = "Invalid (min, max) range"
            print("Invalid (min, max) range", file=log)
            continue

        if (mmax - mmin) > maxdist:
            msg = "(min, max) distance greater than library maxdist"
            print(msg, file=log)
            continue

        # Determine orientation by voting
        nplus, nminus = 0, 0
        arange = (seqid, mmin, mmax)
        for sid, start, end, sf, sc, fbstrand in ranges:
            brange = (sid, start, end)
            if range_overlap(arange, brange):
                if fbstrand == "+":
                    nplus += 1
                else:
                    nminus += 1

        fbstrand = "+" if nplus >= nminus else "-"

        candidate = (seqid, mmin, mmax, scf, depth, fbstrand)
        bedline = BedLine("\t".join((str(x) for x in candidate)))
        cbeds.append(bedline)
        print("Plus: {0}, Minus: {1}".format(nplus, nminus), file=log)
        print(candidate, file=log)

    candidatebed.extend(cbeds)
    logging.debug("A total of {0} scaffolds can be placed.".format(len(candidatebed)))
    log.close()

    candidatebedfile = pf + ".candidate.bed"
    candidatebed.print_to_file(candidatebedfile, sorted=True)