Example #1
0
def pasteprepare(args):
    """
    %prog pasteprepare bacs.fasta

    Prepare sequences for paste.
    """
    p = OptionParser(pasteprepare.__doc__)
    p.add_option("--flank", default=5000, type="int",
                 help="Get the seq of size on two ends [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    goodfasta, = args
    flank = opts.flank
    pf = goodfasta.rsplit(".", 1)[0]
    extbed = pf + ".ext.bed"

    sizes = Sizes(goodfasta)
    fw = open(extbed, "w")
    for bac, size in sizes.iter_sizes():
        print >> fw, "\t".join(str(x) for x in \
                               (bac, 0, min(flank, size), bac + "L"))
        print >> fw, "\t".join(str(x) for x in \
                               (bac, max(size - flank, 0), size, bac + "R"))
    fw.close()

    fastaFromBed(extbed, goodfasta, name=True)
Example #2
0
def pasteprepare(args):
    """
    %prog pasteprepare bacs.fasta

    Prepare sequences for paste.
    """
    p = OptionParser(pasteprepare.__doc__)
    p.add_option(
        "--flank",
        default=5000,
        type="int",
        help="Get the seq of size on two ends",
    )
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    (goodfasta,) = args
    flank = opts.flank
    pf = goodfasta.rsplit(".", 1)[0]
    extbed = pf + ".ext.bed"

    sizes = Sizes(goodfasta)
    fw = open(extbed, "w")
    for bac, size in sizes.iter_sizes():
        print("\t".join(str(x) for x in (bac, 0, min(flank, size), bac + "L")), file=fw)
        print(
            "\t".join(str(x) for x in (bac, max(size - flank, 0), size, bac + "R")),
            file=fw,
        )
    fw.close()

    fastaFromBed(extbed, goodfasta, name=True)
Example #3
0
def nucmer(args):
    """
    %prog nucmer mappings.bed MTR.fasta assembly.fasta chr1 3

    Select specific chromosome region based on MTR mapping. The above command
    will extract chr1:2,000,001-3,000,000.
    """
    p = OptionParser(nucmer.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 5:
        sys.exit(not p.print_help())

    mapbed, mtrfasta, asmfasta, chr, idx = args
    idx = int(idx)
    m1 = 1000000
    bedfile = "sample.bed"
    bed = Bed()
    bed.add("\t".join(str(x) for x in (chr, (idx - 1) * m1, idx * m1)))
    bed.print_to_file(bedfile)

    cmd = "intersectBed -a {0} -b {1} -nonamecheck -sorted | cut -f4".format(mapbed, bedfile)
    idsfile = "query.ids"
    sh(cmd, outfile=idsfile)

    sfasta = fastaFromBed(bedfile, mtrfasta)
    qfasta = "query.fasta"
    cmd = "faSomeRecords {0} {1} {2}".format(asmfasta, idsfile, qfasta)
    sh(cmd)

    cmd = "nucmer {0} {1}".format(sfasta, qfasta)
    sh(cmd)

    mummerplot_main(["out.delta", "--refcov=0"])
    sh("mv out.pdf {0}.{1}.pdf".format(chr, idx))
Example #4
0
def fasta(args):
    """
    %prog fasta map.out scaffolds.fasta

    Extract marker sequences based on map.
    """
    from jcvi.formats.sizes import Sizes

    p = OptionParser(fasta.__doc__)
    p.add_option(
        "--extend",
        default=1000,
        type="int",
        help="Extend seq flanking the gaps",
    )
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    mapout, sfasta = args
    Flank = opts.extend
    pf = mapout.split(".")[0]
    mapbed = pf + ".bed"
    bm = BinMap(mapout)
    bm.print_to_bed(mapbed)

    bed = Bed(mapbed, sorted=False)
    markersbed = pf + ".markers.bed"
    fw = open(markersbed, "w")
    sizes = Sizes(sfasta).mapping
    for b in bed:
        accn = b.accn
        scf, pos = accn.split(".")
        pos = int(pos)
        start = max(0, pos - Flank)
        end = min(pos + Flank, sizes[scf])
        print("\t".join(str(x) for x in (scf, start, end, accn)), file=fw)

    fw.close()

    fastaFromBed(markersbed, sfasta, name=True)
Example #5
0
def flanks(args):
    """
    %prog flanks gaps.bed fastafile

    Create sequences flanking the gaps.
    """
    p = OptionParser(flanks.__doc__)
    p.add_option(
        "--extend",
        default=2000,
        type="int",
        help="Extend seq flanking the gaps",
    )
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    gapsbed, fastafile = args
    Ext = opts.extend
    sizes = Sizes(fastafile).mapping

    bed = Bed(gapsbed)
    pf = gapsbed.rsplit(".", 1)[0]
    extbed = pf + ".ext.bed"
    fw = open(extbed, "w")
    for i, b in enumerate(bed):
        seqid = b.seqid
        gapname = b.accn
        size = sizes[seqid]

        prev_b = bed[i - 1] if i > 0 else None
        next_b = bed[i + 1] if i + 1 < len(bed) else None
        if prev_b and prev_b.seqid != seqid:
            prev_b = None
        if next_b and next_b.seqid != seqid:
            next_b = None

        start = prev_b.end + 1 if prev_b else 1
        start, end = max(start, b.start - Ext), b.start - 1
        print("\t".join(
            str(x) for x in (b.seqid, start - 1, end, gapname + "L")),
              file=fw)

        end = next_b.start - 1 if next_b else size
        start, end = b.end + 1, min(end, b.end + Ext)
        print("\t".join(
            str(x) for x in (b.seqid, start - 1, end, gapname + "R")),
              file=fw)
    fw.close()

    extfasta = fastaFromBed(extbed, fastafile, name=True)
    return extbed, extfasta
Example #6
0
def fill(args):
    """
    %prog fill gaps.bed bad.fasta

    Perform gap filling of one assembly (bad) using sequences from another.
    """
    p = OptionParser(fill.__doc__)
    p.add_option(
        "--extend",
        default=2000,
        type="int",
        help="Extend seq flanking the gaps",
    )
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    gapsbed, badfasta = args
    Ext = opts.extend

    gapdist = 2 * Ext + 1  # This is to prevent to replacement ranges intersect
    gapsbed = mergeBed(gapsbed, d=gapdist, nms=True)

    bed = Bed(gapsbed)
    sizes = Sizes(badfasta).mapping
    pf = gapsbed.rsplit(".", 1)[0]
    extbed = pf + ".ext.bed"
    fw = open(extbed, "w")
    for b in bed:
        gapname = b.accn
        start, end = max(0, b.start - Ext - 1), b.start - 1
        print("\t".join(str(x) for x in (b.seqid, start, end, gapname + "L")),
              file=fw)
        start, end = b.end, min(sizes[b.seqid], b.end + Ext)
        print("\t".join(str(x) for x in (b.seqid, start, end, gapname + "R")),
              file=fw)
    fw.close()

    fastaFromBed(extbed, badfasta, name=True)
Example #7
0
def fasta(args):
    """
    %prog fasta map.out scaffolds.fasta

    Extract marker sequences based on map.
    """
    from jcvi.formats.sizes import Sizes

    p = OptionParser(fasta.__doc__)
    p.add_option("--extend", default=1000, type="int",
                 help="Extend seq flanking the gaps [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    mapout, sfasta = args
    Flank = opts.extend
    pf = mapout.split(".")[0]
    mapbed = pf + ".bed"
    bm = BinMap(mapout)
    bm.print_to_bed(mapbed)

    bed = Bed(mapbed, sorted=False)
    markersbed = pf + ".markers.bed"
    fw = open(markersbed, "w")
    sizes = Sizes(sfasta).mapping
    for b in bed:
        accn = b.accn
        scf, pos = accn.split(".")
        pos = int(pos)
        start = max(0, pos - Flank)
        end = min(pos + Flank, sizes[scf])
        print >> fw, "\t".join(str(x) for x in \
                    (scf, start, end, accn))

    fw.close()

    fastaFromBed(markersbed, sfasta, name=True)
Example #8
0
def fill(args):
    """
    %prog fill gaps.bed bad.fasta

    Perform gap filling of one assembly (bad) using sequences from another.
    """
    p = OptionParser(fill.__doc__)
    p.add_option("--extend", default=2000, type="int",
                 help="Extend seq flanking the gaps [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    gapsbed, badfasta = args
    Ext = opts.extend

    gapdist = 2 * Ext + 1  # This is to prevent to replacement ranges intersect
    gapsbed = mergeBed(gapsbed, d=gapdist, nms=True)

    bed = Bed(gapsbed)
    sizes = Sizes(badfasta).mapping
    pf = gapsbed.rsplit(".", 1)[0]
    extbed = pf + ".ext.bed"
    fw = open(extbed, "w")
    for b in bed:
        gapname = b.accn
        start, end = max(0, b.start - Ext - 1), b.start - 1
        print >> fw, "\t".join(str(x) for x in \
                             (b.seqid, start, end, gapname + "L"))
        start, end = b.end, min(sizes[b.seqid], b.end + Ext)
        print >> fw, "\t".join(str(x) for x in \
                             (b.seqid, start, end, gapname + "R"))
    fw.close()

    fastaFromBed(extbed, badfasta, name=True)
Example #9
0
def flanks(args):
    """
    %prog flanks gaps.bed fastafile

    Create sequences flanking the gaps.
    """
    p = OptionParser(flanks.__doc__)
    p.add_option("--extend", default=2000, type="int",
                 help="Extend seq flanking the gaps [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    gapsbed, fastafile = args
    Ext = opts.extend
    sizes = Sizes(fastafile).mapping

    bed = Bed(gapsbed)
    pf = gapsbed.rsplit(".", 1)[0]
    extbed = pf + ".ext.bed"
    fw = open(extbed, "w")
    for i, b in enumerate(bed):
        seqid = b.seqid
        gapname = b.accn
        size = sizes[seqid]

        prev_b = bed[i - 1] if i > 0 else None
        next_b = bed[i + 1] if i + 1 < len(bed) else None
        if prev_b and prev_b.seqid != seqid:
            prev_b = None
        if next_b and next_b.seqid != seqid:
            next_b = None

        start = prev_b.end + 1 if prev_b else 1
        start, end = max(start, b.start - Ext), b.start - 1
        print >> fw, "\t".join(str(x) for x in \
                             (b.seqid, start - 1, end, gapname + "L"))

        end = next_b.start - 1 if next_b else size
        start, end = b.end + 1, min(end, b.end + Ext)
        print >> fw, "\t".join(str(x) for x in \
                             (b.seqid, start - 1, end, gapname + "R"))
    fw.close()

    extfasta = fastaFromBed(extbed, fastafile, name=True)
    return extbed, extfasta
Example #10
0
def nucmer(args):
    """
    %prog nucmer mappings.bed MTR.fasta assembly.fasta chr1 3

    Select specific chromosome region based on MTR mapping. The above command
    will extract chr1:2,000,001-3,000,000.
    """
    p = OptionParser(nucmer.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 5:
        sys.exit(not p.print_help())

    mapbed, mtrfasta, asmfasta, chr, idx = args
    idx = int(idx)
    m1 = 1000000
    bedfile = "sample.bed"
    bed = Bed()
    bed.add("\t".join(str(x) for x in (chr, (idx - 1) * m1, idx * m1)))
    bed.print_to_file(bedfile)

    cmd = "intersectBed -a {0} -b {1} -nonamecheck -sorted | cut -f4".format(
        mapbed, bedfile)
    idsfile = "query.ids"
    sh(cmd, outfile=idsfile)

    sfasta = fastaFromBed(bedfile, mtrfasta)
    qfasta = "query.fasta"
    cmd = "faSomeRecords {0} {1} {2}".format(asmfasta, idsfile, qfasta)
    sh(cmd)

    cmd = "nucmer {0} {1}".format(sfasta, qfasta)
    sh(cmd)

    mummerplot_main(["out.delta", "--refcov=0"])
    sh("mv out.pdf {0}.{1}.pdf".format(chr, idx))
Example #11
0
def install(args):
    """
    %prog install patchers.bed patchers.fasta backbone.fasta alt.fasta

    Install patches into backbone, using sequences from alternative assembly.
    The patches sequences are generated via jcvi.assembly.patch.fill().

    The output is a bedfile that can be converted to AGP using
    jcvi.formats.agp.frombed().
    """
    from jcvi.apps.align import blast
    from jcvi.formats.fasta import SeqIO

    p = OptionParser(install.__doc__)
    p.set_rclip(rclip=1)
    p.add_option(
        "--maxsize",
        default=300000,
        type="int",
        help="Maximum size of patchers to be replaced",
    )
    p.add_option("--prefix", help="Prefix of the new object")
    p.add_option(
        "--strict",
        default=False,
        action="store_true",
        help="Only update if replacement has no gaps",
    )
    opts, args = p.parse_args(args)

    if len(args) != 4:
        sys.exit(not p.print_help())

    pbed, pfasta, bbfasta, altfasta = args
    maxsize = opts.maxsize  # Max DNA size to replace gap
    rclip = opts.rclip

    blastfile = blast([altfasta, pfasta, "--wordsize=100", "--pctid=99"])
    order = Bed(pbed).order
    beforebed, afterbed = blast_to_twobeds(
        blastfile, order, rclip=rclip, maxsize=maxsize
    )

    beforefasta = fastaFromBed(beforebed, bbfasta, name=True, stranded=True)
    afterfasta = fastaFromBed(afterbed, altfasta, name=True, stranded=True)

    # Exclude the replacements that contain more Ns than before
    ah = SeqIO.parse(beforefasta, "fasta")
    bh = SeqIO.parse(afterfasta, "fasta")
    count_Ns = lambda x: x.seq.count("n") + x.seq.count("N")
    exclude = set()
    for arec, brec in zip(ah, bh):
        an = count_Ns(arec)
        bn = count_Ns(brec)
        if opts.strict:
            if bn == 0:
                continue

        elif bn < an:
            continue

        id = arec.id
        exclude.add(id)

    logging.debug(
        "Ignore {0} updates because of decreasing quality.".format(len(exclude))
    )

    abed = Bed(beforebed, sorted=False)
    bbed = Bed(afterbed, sorted=False)
    abed = [x for x in abed if x.accn not in exclude]
    bbed = [x for x in bbed if x.accn not in exclude]

    abedfile = "before.filtered.bed"
    bbedfile = "after.filtered.bed"
    afbed = Bed()
    afbed.extend(abed)
    bfbed = Bed()
    bfbed.extend(bbed)

    afbed.print_to_file(abedfile)
    bfbed.print_to_file(bbedfile)

    shuffle_twobeds(afbed, bfbed, bbfasta, prefix=opts.prefix)
Example #12
0
def pastegenes(args):
    """
    %prog pastegenes coverage.list old.genes.bed new.genes.bed old.assembly

    Paste in zero or low coverage genes.  For a set of neighboring genes
    missing, add the whole cassette as unplaced scaffolds. For singletons the
    program will try to make a patch.
    """
    from jcvi.formats.base import DictFile
    from jcvi.utils.cbook import gene_name

    p = OptionParser(pastegenes.__doc__)
    p.add_option(
        "--cutoff",
        default=90,
        type="int",
        help="Coverage cutoff to call gene missing",
    )
    p.add_option(
        "--flank",
        default=2000,
        type="int",
        help="Get the seq of size on two ends",
    )
    p.add_option(
        "--maxsize",
        default=50000,
        type="int",
        help="Maximum size of patchers to be replaced",
    )
    opts, args = p.parse_args(args)

    if len(args) != 4:
        sys.exit(not p.print_help())

    coveragefile, oldbed, newbed, oldassembly = args
    cutoff = opts.cutoff
    flank = opts.flank
    maxsize = opts.maxsize

    coverage = DictFile(coveragefile, valuepos=2, cast=float)

    obed = Bed(oldbed)
    order = obed.order
    bed = [x for x in obed if x.accn in coverage]
    key = lambda x: coverage[x.accn] >= cutoff

    extrabed = "extra.bed"
    extendbed = "extend.bed"
    pastebed = "paste.bed"

    fw = open(extrabed, "w")
    fwe = open(extendbed, "w")
    fwp = open(pastebed, "w")
    fw_ids = open(extendbed + ".ids", "w")

    singletons, large, large_genes = 0, 0, 0
    for chr, chrbed in groupby(bed, key=lambda x: x.seqid):
        chrbed = list(chrbed)
        for good, beds in groupby(chrbed, key=key):
            if good:
                continue

            beds = list(beds)
            blocksize = len(set([gene_name(x.accn) for x in beds]))
            if blocksize == 1:
                singletons += 1
                accn = beds[0].accn
                gi, gb = order[accn]
                leftb = obed[gi - 1]
                rightb = obed[gi + 1]
                leftr = leftb.range
                rightr = rightb.range
                cur = gb.range
                distance_to_left, oo = range_distance(leftr, cur)
                distance_to_right, oo = range_distance(cur, rightr)
                span, oo = range_distance(leftr, rightr)

                if distance_to_left <= distance_to_right and distance_to_left > 0:
                    label = "LEFT"
                else:
                    label = "RIGHT"

                if 0 < span <= maxsize:
                    print(
                        "\t".join(
                            str(x) for x in (chr, leftb.start, rightb.end, gb.accn)
                        ),
                        file=fwp,
                    )

                print(leftb, file=fwe)
                print(gb, file=fwe)
                print(rightb, file=fwe)
                print(
                    "L:{0} R:{1} [{2}]".format(
                        distance_to_left, distance_to_right, label
                    ),
                    file=fwe,
                )
                print(gb.accn, file=fw_ids)
                continue

            large += 1
            large_genes += blocksize

            ranges = [(x.start, x.end) for x in beds]
            rmin, rmax = range_minmax(ranges)
            rmin -= flank
            rmax += flank

            name = "-".join((beds[0].accn, beds[-1].accn))
            print("\t".join(str(x) for x in (chr, rmin - 1, rmax, name)), file=fw)

    fw.close()
    fwe.close()

    extrabed = mergeBed(extrabed, d=flank, nms=True)
    fastaFromBed(extrabed, oldassembly, name=True)
    summary([extrabed])

    logging.debug("Singleton blocks : {0}".format(singletons))
    logging.debug("Large blocks : {0} ({1} genes)".format(large, large_genes))
Example #13
0
def install(args):
    """
    %prog install patchers.bed patchers.fasta backbone.fasta alt.fasta

    Install patches into backbone, using sequences from alternative assembly.
    The patches sequences are generated via jcvi.assembly.patch.fill().

    The output is a bedfile that can be converted to AGP using
    jcvi.formats.agp.frombed().
    """
    from jcvi.apps.align import blast
    from jcvi.formats.fasta import SeqIO

    p = OptionParser(install.__doc__)
    p.set_rclip(rclip=1)
    p.add_option("--maxsize", default=300000, type="int",
            help="Maximum size of patchers to be replaced [default: %default]")
    p.add_option("--prefix", help="Prefix of the new object [default: %default]")
    p.add_option("--strict", default=False, action="store_true",
            help="Only update if replacement has no gaps [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 4:
        sys.exit(not p.print_help())

    pbed, pfasta, bbfasta, altfasta = args
    maxsize = opts.maxsize  # Max DNA size to replace gap
    rclip = opts.rclip

    blastfile = blast([altfasta, pfasta,"--wordsize=100", "--pctid=99"])
    order = Bed(pbed).order
    beforebed, afterbed = blast_to_twobeds(blastfile, order, rclip=rclip,
                                           maxsize=maxsize)

    beforefasta = fastaFromBed(beforebed, bbfasta, name=True, stranded=True)
    afterfasta = fastaFromBed(afterbed, altfasta, name=True, stranded=True)

    # Exclude the replacements that contain more Ns than before
    ah = SeqIO.parse(beforefasta, "fasta")
    bh = SeqIO.parse(afterfasta, "fasta")
    count_Ns = lambda x: x.seq.count('n') + x.seq.count('N')
    exclude = set()
    for arec, brec in zip(ah, bh):
        an = count_Ns(arec)
        bn = count_Ns(brec)
        if opts.strict:
            if bn == 0:
                continue

        elif bn < an:
            continue

        id = arec.id
        exclude.add(id)

    logging.debug("Ignore {0} updates because of decreasing quality."\
                    .format(len(exclude)))


    abed = Bed(beforebed, sorted=False)
    bbed = Bed(afterbed, sorted=False)
    abed = [x for x in abed if x.accn not in exclude]
    bbed = [x for x in bbed if x.accn not in exclude]

    abedfile = "before.filtered.bed"
    bbedfile = "after.filtered.bed"
    afbed = Bed()
    afbed.extend(abed)
    bfbed = Bed()
    bfbed.extend(bbed)

    afbed.print_to_file(abedfile)
    bfbed.print_to_file(bbedfile)

    shuffle_twobeds(afbed, bfbed, bbfasta, prefix=opts.prefix)
Example #14
0
def pastegenes(args):
    """
    %prog pastegenes coverage.list old.genes.bed new.genes.bed old.assembly

    Paste in zero or low coverage genes.  For a set of neighboring genes
    missing, add the whole cassette as unplaced scaffolds. For singletons the
    program will try to make a patch.
    """
    from jcvi.formats.base import DictFile
    from jcvi.utils.cbook import gene_name

    p = OptionParser(pastegenes.__doc__)
    p.add_option("--cutoff", default=90, type="int",
                 help="Coverage cutoff to call gene missing [default: %default]")
    p.add_option("--flank", default=2000, type="int",
                 help="Get the seq of size on two ends [default: %default]")
    p.add_option("--maxsize", default=50000, type="int",
            help="Maximum size of patchers to be replaced [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 4:
        sys.exit(not p.print_help())

    coveragefile, oldbed, newbed, oldassembly = args
    cutoff = opts.cutoff
    flank = opts.flank
    maxsize = opts.maxsize

    coverage = DictFile(coveragefile, valuepos=2, cast=float)

    obed = Bed(oldbed)
    order = obed.order
    bed = [x for x in obed if x.accn in coverage]
    key = lambda x: coverage[x.accn] >= cutoff

    extrabed = "extra.bed"
    extendbed = "extend.bed"
    pastebed = "paste.bed"

    fw = open(extrabed, "w")
    fwe = open(extendbed, "w")
    fwp = open(pastebed, "w")
    fw_ids = open(extendbed + ".ids", "w")

    singletons, large, large_genes = 0, 0, 0
    for chr, chrbed in groupby(bed, key=lambda x: x.seqid):
        chrbed = list(chrbed)
        for good, beds in groupby(chrbed, key=key):
            if good:
                continue

            beds = list(beds)
            blocksize = len(set([gene_name(x.accn) for x in beds]))
            if blocksize == 1:
                singletons += 1
                accn = beds[0].accn
                gi, gb = order[accn]
                leftb = obed[gi - 1]
                rightb = obed[gi + 1]
                leftr = leftb.range
                rightr = rightb.range
                cur = gb.range
                distance_to_left, oo = range_distance(leftr, cur)
                distance_to_right, oo = range_distance(cur, rightr)
                span, oo = range_distance(leftr, rightr)

                if distance_to_left <= distance_to_right and \
                   distance_to_left > 0:
                    label = "LEFT"
                else:
                    label = "RIGHT"

                if 0 < span <= maxsize:
                    print >> fwp, "\t".join(str(x) for x in \
                                    (chr, leftb.start, rightb.end, gb.accn))

                print >> fwe, leftb
                print >> fwe, gb
                print >> fwe, rightb
                print >> fwe, "L:{0} R:{1} [{2}]".format(distance_to_left, \
                            distance_to_right, label)
                print >> fw_ids, gb.accn
                continue

            large += 1
            large_genes += blocksize

            ranges = [(x.start, x.end) for x in beds]
            rmin, rmax = range_minmax(ranges)
            rmin -= flank
            rmax += flank

            name = "-".join((beds[0].accn, beds[-1].accn))
            print >> fw, "\t".join(str(x) for x in (chr, rmin - 1, rmax, name))

    fw.close()
    fwe.close()

    extrabed = mergeBed(extrabed, d=flank, nms=True)
    fastaFromBed(extrabed, oldassembly, name=True)
    summary([extrabed])

    logging.debug("Singleton blocks : {0}".format(singletons))
    logging.debug("Large blocks : {0} ({1} genes)".format(large, large_genes))
Example #15
0
def annotate(args):
    """
    %prog annotate agpfile gaps.linkage.bed assembly.fasta

    Annotate AGP file with linkage info of `paired-end` or `map`.
    File `gaps.linkage.bed` is generated by assembly.gaps.estimate().
    """
    from jcvi.formats.agp import AGP, bed, tidy

    p = OptionParser(annotate.__doc__)
    p.add_option("--minsize", default=200,
                 help="Smallest component size [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    agpfile, linkagebed, assemblyfasta = args
    linkagebed = Bed(linkagebed)
    spannedgaps = set()
    for b in linkagebed:
        score = int(b.score)
        if score == 0:
            spannedgaps.add((b.accn, b.start, b.end))

    agp = AGP(agpfile)
    newagpfile = agpfile.rsplit(".", 1)[0] + ".linkage.agp"
    newagp = open(newagpfile, "w")
    contig_id = 0
    minsize = opts.minsize
    for a in agp:
        if not a.is_gap:
            cs = a.component_span
            if cs < minsize:
                a.is_gap = True
                a.component_type = "N"
                a.gap_length = cs
                a.gap_type = "scaffold"
                a.linkage = "yes"
                a.linkage_evidence = []
            else:
                contig_id += 1
                a.component_id = "contig{0:04d}".format(contig_id)
                a.component_beg = 1
                a.component_end = cs
                a.component_type = "W"

            print >> newagp, a
            continue

        gapinfo = (a.object, a.object_beg, a.object_end)
        gaplen = a.gap_length

        if gaplen == 100 and gapinfo not in spannedgaps:
            a.component_type = "U"
            tag = "map"
        else:
            tag = "paired-ends"

        a.linkage_evidence.append(tag)
        print >> newagp, a

    newagp.close()
    logging.debug("Annotated AGP written to `{0}`.".format(newagpfile))

    contigbed = assemblyfasta.rsplit(".", 1)[0] + ".contigs.bed"
    bedfile = bed([newagpfile, "--nogaps", "--outfile=" + contigbed])

    contigfasta = fastaFromBed(bedfile, assemblyfasta, name=True, stranded=True)

    tidy([newagpfile, contigfasta])
Example #16
0
def install(args):
    """
    %prog install patchers.bed patchers.fasta backbone.fasta alt.fasta

    Install patches into backbone, using sequences from alternative assembly.
    The patches sequences are generated via jcvi.assembly.patch.fill().

    The output is a bedfile that can be converted to AGP using
    jcvi.formats.agp.frombed().
    """
    from jcvi.apps.base import blast
    from jcvi.formats.blast import BlastSlow
    from jcvi.formats.fasta import SeqIO
    from jcvi.utils.iter import roundrobin

    p = OptionParser(install.__doc__)
    p.add_option(
        "--rclip",
        default=1,
        type="int",
        help="Pair ID is derived from rstrip N chars [default: %default]")
    p.add_option(
        "--maxsize",
        default=1000000,
        type="int",
        help="Maximum size of patchers to be replaced [default: %default]")
    p.add_option("--prefix",
                 help="Prefix of the new object [default: %default]")
    p.add_option(
        "--strict",
        default=False,
        action="store_true",
        help="Only update if replacement has no gaps [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 4:
        sys.exit(not p.print_help())

    pbed, pfasta, bbfasta, altfasta = args
    Max = opts.maxsize  # Max DNA size to replace gap
    rclip = opts.rclip
    prefix = opts.prefix

    blastfile = blast([altfasta, pfasta, "--wordsize=100", "--pctid=99"])
    order = Bed(pbed).order

    beforebed, afterbed = "before.bed", "after.bed"
    fwa = open(beforebed, "w")
    fwb = open(afterbed, "w")

    key1 = lambda x: x.query
    key2 = lambda x: x.query[:-rclip] if rclip else key1
    data = BlastSlow(blastfile)

    for pe, lines in groupby(data, key=key2):
        lines = list(lines)
        if len(lines) != 2:
            continue

        a, b = lines

        aquery, bquery = a.query, b.query
        asubject, bsubject = a.subject, b.subject
        if asubject != bsubject:
            continue

        astrand, bstrand = a.orientation, b.orientation
        assert aquery[-1] == 'L' and bquery[-1] == 'R', str((aquery, bquery))

        ai, ax = order[aquery]
        bi, bx = order[bquery]
        qstart, qstop = ax.start + a.qstart - 1, bx.start + b.qstop - 1

        if astrand == '+' and bstrand == '+':
            sstart, sstop = a.sstart, b.sstop

        elif astrand == '-' and bstrand == '-':
            sstart, sstop = b.sstart, a.sstop

        else:
            continue

        if sstart > sstop:
            continue

        if sstop > sstart + Max:
            continue

        name = aquery[:-1] + "LR"
        print >> fwa, "\t".join(str(x) for x in \
                    (ax.seqid, qstart - 1, qstop, name, 1000, "+"))
        print >> fwb, "\t".join(str(x) for x in \
                    (asubject, sstart - 1, sstop, name, 1000, astrand))

    fwa.close()
    fwb.close()

    beforefasta = fastaFromBed(beforebed, bbfasta, name=True, stranded=True)
    afterfasta = fastaFromBed(afterbed, altfasta, name=True, stranded=True)

    # Exclude the replacements that contain more Ns than before
    ah = SeqIO.parse(beforefasta, "fasta")
    bh = SeqIO.parse(afterfasta, "fasta")
    count_Ns = lambda x: x.seq.count('n') + x.seq.count('N')
    exclude = set()
    for arec, brec in zip(ah, bh):
        an = count_Ns(arec)
        bn = count_Ns(brec)
        if opts.strict:
            if bn == 0:
                continue

        elif bn < an:
            continue

        id = arec.id
        exclude.add(id)

    logging.debug("Ignore {0} updates because of decreasing quality."\
                    .format(len(exclude)))

    abed = Bed(beforebed, sorted=False)
    bbed = Bed(afterbed, sorted=False)
    abed = [x for x in abed if x.accn not in exclude]
    bbed = [x for x in bbed if x.accn not in exclude]

    abedfile = "before.filtered.bed"
    bbedfile = "after.filtered.bed"
    afbed = Bed()
    afbed.extend(abed)
    bfbed = Bed()
    bfbed.extend(bbed)

    afbed.print_to_file(abedfile)
    bfbed.print_to_file(bbedfile)

    # Shuffle the two bedfiles together
    sz = Sizes(bbfasta)
    sizes = sz.mapping
    shuffled = "shuffled.bed"
    border = bfbed.order

    all = []
    afbed.sort(key=afbed.nullkey)
    totalids = len(sizes)
    import math
    pad = int(math.log10(totalids)) + 1
    cj = 0
    seen = set()
    accn = lambda x: "{0}{1:0{2}d}".format(prefix, x, pad)

    for seqid, aa in afbed.sub_beds():
        cj += 1
        abeds, bbeds, beds = [], [], []
        size = sizes[seqid]
        ranges = [(x.seqid, x.start, x.end) for x in aa]
        cranges = range_interleave(ranges, sizes={seqid: size})
        for seqid, start, end in cranges:
            bedline = "\t".join(str(x) for x in (seqid, start - 1, end))
            abeds.append(BedLine(bedline))

        for a in aa:
            gapid = a.accn
            bi, b = border[gapid]
            bbeds.append(b)

        a = abeds[0] if abeds else []
        assert abs(len(abeds) - len(bbeds)) <= 1
        if (not a) or a.start > 1:
            abeds, bbeds = bbeds, abeds

        beds = list(roundrobin(abeds, bbeds))
        if prefix:
            for b in beds:
                b.accn = accn(cj)

        all.extend(beds)
        seen.add(seqid)

    # Singletons
    for seqid, size in sz.iter_sizes():
        if seqid in seen:
            continue

        bedline = "\t".join(str(x) for x in (seqid, 0, size, accn(cj)))
        b = BedLine(bedline)

        cj += 1
        if prefix:
            b.accn = accn(cj)

        all.append(b)

    shuffledbed = Bed()
    shuffledbed.extend(all)
    shuffledbed.print_to_file(shuffled)