Example #1
0
def phase(accession):
    gbdir = "gb"
    gbfile = op.join(gbdir, accession + ".gb")
    if not op.exists(gbfile):
        entrez([accession, "--skipcheck", "--outdir=" + gbdir, "--format=gb"])
    rec = SeqIO.parse(gbfile, "gb").next()
    ph, keywords = get_phase(rec)
    return ph, len(rec)
Example #2
0
def phase(accession):
    gbdir = "gb"
    gbfile = op.join(gbdir, accession + ".gb")
    if not op.exists(gbfile):
        entrez([accession, "--skipcheck", "--outdir=" + gbdir, "--format=gb"])
    rec = next(SeqIO.parse(gbfile, "gb"))
    ph, keywords = get_phase(rec)
    return ph, len(rec)
Example #3
0
def install(args):
    """
    %prog install patchers.bed patchers.fasta backbone.fasta alt.fasta

    Install patches into backbone, using sequences from alternative assembly.
    The patches sequences are generated via jcvi.assembly.patch.fill().

    The output is a bedfile that can be converted to AGP using
    jcvi.formats.agp.frombed().
    """
    from jcvi.apps.align import blast
    from jcvi.formats.fasta import SeqIO

    p = OptionParser(install.__doc__)
    p.set_rclip(rclip=1)
    p.add_option(
        "--maxsize",
        default=300000,
        type="int",
        help="Maximum size of patchers to be replaced",
    )
    p.add_option("--prefix", help="Prefix of the new object")
    p.add_option(
        "--strict",
        default=False,
        action="store_true",
        help="Only update if replacement has no gaps",
    )
    opts, args = p.parse_args(args)

    if len(args) != 4:
        sys.exit(not p.print_help())

    pbed, pfasta, bbfasta, altfasta = args
    maxsize = opts.maxsize  # Max DNA size to replace gap
    rclip = opts.rclip

    blastfile = blast([altfasta, pfasta, "--wordsize=100", "--pctid=99"])
    order = Bed(pbed).order
    beforebed, afterbed = blast_to_twobeds(
        blastfile, order, rclip=rclip, maxsize=maxsize
    )

    beforefasta = fastaFromBed(beforebed, bbfasta, name=True, stranded=True)
    afterfasta = fastaFromBed(afterbed, altfasta, name=True, stranded=True)

    # Exclude the replacements that contain more Ns than before
    ah = SeqIO.parse(beforefasta, "fasta")
    bh = SeqIO.parse(afterfasta, "fasta")
    count_Ns = lambda x: x.seq.count("n") + x.seq.count("N")
    exclude = set()
    for arec, brec in zip(ah, bh):
        an = count_Ns(arec)
        bn = count_Ns(brec)
        if opts.strict:
            if bn == 0:
                continue

        elif bn < an:
            continue

        id = arec.id
        exclude.add(id)

    logging.debug(
        "Ignore {0} updates because of decreasing quality.".format(len(exclude))
    )

    abed = Bed(beforebed, sorted=False)
    bbed = Bed(afterbed, sorted=False)
    abed = [x for x in abed if x.accn not in exclude]
    bbed = [x for x in bbed if x.accn not in exclude]

    abedfile = "before.filtered.bed"
    bbedfile = "after.filtered.bed"
    afbed = Bed()
    afbed.extend(abed)
    bfbed = Bed()
    bfbed.extend(bbed)

    afbed.print_to_file(abedfile)
    bfbed.print_to_file(bbedfile)

    shuffle_twobeds(afbed, bfbed, bbfasta, prefix=opts.prefix)
Example #4
0
def install(args):
    """
    %prog install patchers.bed patchers.fasta backbone.fasta alt.fasta

    Install patches into backbone, using sequences from alternative assembly.
    The patches sequences are generated via jcvi.assembly.patch.fill().

    The output is a bedfile that can be converted to AGP using
    jcvi.formats.agp.frombed().
    """
    from jcvi.apps.align import blast
    from jcvi.formats.fasta import SeqIO

    p = OptionParser(install.__doc__)
    p.set_rclip(rclip=1)
    p.add_option("--maxsize", default=300000, type="int",
            help="Maximum size of patchers to be replaced [default: %default]")
    p.add_option("--prefix", help="Prefix of the new object [default: %default]")
    p.add_option("--strict", default=False, action="store_true",
            help="Only update if replacement has no gaps [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 4:
        sys.exit(not p.print_help())

    pbed, pfasta, bbfasta, altfasta = args
    maxsize = opts.maxsize  # Max DNA size to replace gap
    rclip = opts.rclip

    blastfile = blast([altfasta, pfasta,"--wordsize=100", "--pctid=99"])
    order = Bed(pbed).order
    beforebed, afterbed = blast_to_twobeds(blastfile, order, rclip=rclip,
                                           maxsize=maxsize)

    beforefasta = fastaFromBed(beforebed, bbfasta, name=True, stranded=True)
    afterfasta = fastaFromBed(afterbed, altfasta, name=True, stranded=True)

    # Exclude the replacements that contain more Ns than before
    ah = SeqIO.parse(beforefasta, "fasta")
    bh = SeqIO.parse(afterfasta, "fasta")
    count_Ns = lambda x: x.seq.count('n') + x.seq.count('N')
    exclude = set()
    for arec, brec in zip(ah, bh):
        an = count_Ns(arec)
        bn = count_Ns(brec)
        if opts.strict:
            if bn == 0:
                continue

        elif bn < an:
            continue

        id = arec.id
        exclude.add(id)

    logging.debug("Ignore {0} updates because of decreasing quality."\
                    .format(len(exclude)))


    abed = Bed(beforebed, sorted=False)
    bbed = Bed(afterbed, sorted=False)
    abed = [x for x in abed if x.accn not in exclude]
    bbed = [x for x in bbed if x.accn not in exclude]

    abedfile = "before.filtered.bed"
    bbedfile = "after.filtered.bed"
    afbed = Bed()
    afbed.extend(abed)
    bfbed = Bed()
    bfbed.extend(bbed)

    afbed.print_to_file(abedfile)
    bfbed.print_to_file(bbedfile)

    shuffle_twobeds(afbed, bfbed, bbfasta, prefix=opts.prefix)
Example #5
0
def weblogo(args):
    """
    %prog weblogo [fastafile|fastqfile]

    Extract base composition for reads
    """
    import numpy as np
    from jcvi.utils.progressbar import ProgressBar, Percentage, Bar, ETA

    p = OptionParser(weblogo.__doc__)
    p.add_option("-N", default=10, type="int",
                 help="Count the first and last N bases")
    p.add_option("--nreads", default=1000000, type="int",
                 help="Parse first N reads")
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    fastqfile, = args
    N = opts.N
    nreads = opts.nreads

    pat = "ATCG"
    L = np.zeros((4, N), dtype="int32")
    R = np.zeros((4, N), dtype="int32")
    p = dict((a, i) for (i, a) in enumerate(pat))
    L4, R3 = Counter(), Counter()
    widgets = ['Parse reads: ', Percentage(), ' ',
               Bar(marker='>', left='[', right=']'), ' ', ETA()]
    pr = ProgressBar(maxval=nreads, term_width=60, widgets=widgets).start()

    k = 0
    fw_L = open("L.fasta", "w")
    fw_R = open("R.fasta", "w")
    fastq = fastqfile.endswith(".fastq")
    it = iter_fastq(fastqfile) if fastq else \
           SeqIO.parse(must_open(fastqfile), "fasta")
    for rec in it:
        k += 1
        if k % 1000 == 0:
            pr.update(k)
        if k > nreads:
            break
        if rec is None:
            break
        s = str(rec.seq)
        for i, a in enumerate(s[:N]):
            if a in p:
                a = p[a]
                L[a][i] += 1
        for j, a in enumerate(s[-N:][::-1]):
            if a in p:
                a = p[a]
                R[a][N - 1 - j] += 1
        l4, r3 = s[:4], s[-3:]
        L4[l4] += 1
        R3[r3] += 1
        print >> fw_L, ">{0}\n{1}".format(k, s[:N])
        print >> fw_R, ">{0}\n{1}".format(k, s[-N:])

    fw_L.close()
    fw_R.close()

    cmd = "weblogo -F png -s large -f {0}.fasta -o {0}.png"
    cmd += " --color-scheme classic --composition none -U probability"
    cmd += " --title {1}"
    sh(cmd.format('L', "First_10_bases"))
    sh(cmd.format('R', "Last_10_bases"))

    np.savetxt("L.{0}.csv".format(pat), L, delimiter=',', fmt="%d")
    np.savetxt("R.{0}.csv".format(pat), R, delimiter=',', fmt="%d")

    fw = open("L4.common", "w")
    for p, c in L4.most_common(N):
        print >> fw, "\t".join((p, str(c)))
    fw.close()

    fw = open("R3.common", "w")
    for p, c in R3.most_common(N):
        print >> fw, "\t".join((p, str(c)))
    fw.close()
Example #6
0
def install(args):
    """
    %prog install patchers.bed patchers.fasta backbone.fasta alt.fasta

    Install patches into backbone, using sequences from alternative assembly.
    The patches sequences are generated via jcvi.assembly.patch.fill().

    The output is a bedfile that can be converted to AGP using
    jcvi.formats.agp.frombed().
    """
    from jcvi.apps.base import blast
    from jcvi.formats.blast import BlastSlow
    from jcvi.formats.fasta import SeqIO
    from jcvi.utils.iter import roundrobin

    p = OptionParser(install.__doc__)
    p.add_option(
        "--rclip",
        default=1,
        type="int",
        help="Pair ID is derived from rstrip N chars [default: %default]")
    p.add_option(
        "--maxsize",
        default=1000000,
        type="int",
        help="Maximum size of patchers to be replaced [default: %default]")
    p.add_option("--prefix",
                 help="Prefix of the new object [default: %default]")
    p.add_option(
        "--strict",
        default=False,
        action="store_true",
        help="Only update if replacement has no gaps [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 4:
        sys.exit(not p.print_help())

    pbed, pfasta, bbfasta, altfasta = args
    Max = opts.maxsize  # Max DNA size to replace gap
    rclip = opts.rclip
    prefix = opts.prefix

    blastfile = blast([altfasta, pfasta, "--wordsize=100", "--pctid=99"])
    order = Bed(pbed).order

    beforebed, afterbed = "before.bed", "after.bed"
    fwa = open(beforebed, "w")
    fwb = open(afterbed, "w")

    key1 = lambda x: x.query
    key2 = lambda x: x.query[:-rclip] if rclip else key1
    data = BlastSlow(blastfile)

    for pe, lines in groupby(data, key=key2):
        lines = list(lines)
        if len(lines) != 2:
            continue

        a, b = lines

        aquery, bquery = a.query, b.query
        asubject, bsubject = a.subject, b.subject
        if asubject != bsubject:
            continue

        astrand, bstrand = a.orientation, b.orientation
        assert aquery[-1] == 'L' and bquery[-1] == 'R', str((aquery, bquery))

        ai, ax = order[aquery]
        bi, bx = order[bquery]
        qstart, qstop = ax.start + a.qstart - 1, bx.start + b.qstop - 1

        if astrand == '+' and bstrand == '+':
            sstart, sstop = a.sstart, b.sstop

        elif astrand == '-' and bstrand == '-':
            sstart, sstop = b.sstart, a.sstop

        else:
            continue

        if sstart > sstop:
            continue

        if sstop > sstart + Max:
            continue

        name = aquery[:-1] + "LR"
        print >> fwa, "\t".join(str(x) for x in \
                    (ax.seqid, qstart - 1, qstop, name, 1000, "+"))
        print >> fwb, "\t".join(str(x) for x in \
                    (asubject, sstart - 1, sstop, name, 1000, astrand))

    fwa.close()
    fwb.close()

    beforefasta = fastaFromBed(beforebed, bbfasta, name=True, stranded=True)
    afterfasta = fastaFromBed(afterbed, altfasta, name=True, stranded=True)

    # Exclude the replacements that contain more Ns than before
    ah = SeqIO.parse(beforefasta, "fasta")
    bh = SeqIO.parse(afterfasta, "fasta")
    count_Ns = lambda x: x.seq.count('n') + x.seq.count('N')
    exclude = set()
    for arec, brec in zip(ah, bh):
        an = count_Ns(arec)
        bn = count_Ns(brec)
        if opts.strict:
            if bn == 0:
                continue

        elif bn < an:
            continue

        id = arec.id
        exclude.add(id)

    logging.debug("Ignore {0} updates because of decreasing quality."\
                    .format(len(exclude)))

    abed = Bed(beforebed, sorted=False)
    bbed = Bed(afterbed, sorted=False)
    abed = [x for x in abed if x.accn not in exclude]
    bbed = [x for x in bbed if x.accn not in exclude]

    abedfile = "before.filtered.bed"
    bbedfile = "after.filtered.bed"
    afbed = Bed()
    afbed.extend(abed)
    bfbed = Bed()
    bfbed.extend(bbed)

    afbed.print_to_file(abedfile)
    bfbed.print_to_file(bbedfile)

    # Shuffle the two bedfiles together
    sz = Sizes(bbfasta)
    sizes = sz.mapping
    shuffled = "shuffled.bed"
    border = bfbed.order

    all = []
    afbed.sort(key=afbed.nullkey)
    totalids = len(sizes)
    import math
    pad = int(math.log10(totalids)) + 1
    cj = 0
    seen = set()
    accn = lambda x: "{0}{1:0{2}d}".format(prefix, x, pad)

    for seqid, aa in afbed.sub_beds():
        cj += 1
        abeds, bbeds, beds = [], [], []
        size = sizes[seqid]
        ranges = [(x.seqid, x.start, x.end) for x in aa]
        cranges = range_interleave(ranges, sizes={seqid: size})
        for seqid, start, end in cranges:
            bedline = "\t".join(str(x) for x in (seqid, start - 1, end))
            abeds.append(BedLine(bedline))

        for a in aa:
            gapid = a.accn
            bi, b = border[gapid]
            bbeds.append(b)

        a = abeds[0] if abeds else []
        assert abs(len(abeds) - len(bbeds)) <= 1
        if (not a) or a.start > 1:
            abeds, bbeds = bbeds, abeds

        beds = list(roundrobin(abeds, bbeds))
        if prefix:
            for b in beds:
                b.accn = accn(cj)

        all.extend(beds)
        seen.add(seqid)

    # Singletons
    for seqid, size in sz.iter_sizes():
        if seqid in seen:
            continue

        bedline = "\t".join(str(x) for x in (seqid, 0, size, accn(cj)))
        b = BedLine(bedline)

        cj += 1
        if prefix:
            b.accn = accn(cj)

        all.append(b)

    shuffledbed = Bed()
    shuffledbed.extend(all)
    shuffledbed.print_to_file(shuffled)
Example #7
0
def weblogo(args):
    """
    %prog weblogo [fastafile|fastqfile]

    Extract base composition for reads
    """
    import numpy as np
    from jcvi.utils.progressbar import ProgressBar, Percentage, Bar, ETA

    p = OptionParser(weblogo.__doc__)
    p.add_option("-N", default=10, type="int",
                 help="Count the first and last N bases")
    p.add_option("--nreads", default=1000000, type="int",
                 help="Parse first N reads")
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    fastqfile, = args
    N = opts.N
    nreads = opts.nreads

    pat = "ATCG"
    L = np.zeros((4, N), dtype="int32")
    R = np.zeros((4, N), dtype="int32")
    p = dict((a, i) for (i, a) in enumerate(pat))
    L4, R3 = Counter(), Counter()
    widgets = ['Parse reads: ', Percentage(), ' ',
               Bar(marker='>', left='[', right=']'), ' ', ETA()]
    pr = ProgressBar(maxval=nreads, term_width=60, widgets=widgets).start()

    k = 0
    fw_L = open("L.fasta", "w")
    fw_R = open("R.fasta", "w")
    fastq = fastqfile.endswith(".fastq")
    it = iter_fastq(fastqfile) if fastq else \
           SeqIO.parse(must_open(fastqfile), "fasta")
    for rec in it:
        k += 1
        if k % 1000 == 0:
            pr.update(k)
        if k > nreads:
            break
        if rec is None:
            break
        s = str(rec.seq)
        for i, a in enumerate(s[:N]):
            if a in p:
                a = p[a]
                L[a][i] += 1
        for j, a in enumerate(s[-N:][::-1]):
            if a in p:
                a = p[a]
                R[a][N - 1 - j] += 1
        l4, r3 = s[:4], s[-3:]
        L4[l4] += 1
        R3[r3] += 1
        print >> fw_L, ">{0}\n{1}".format(k, s[:N])
        print >> fw_R, ">{0}\n{1}".format(k, s[-N:])

    fw_L.close()
    fw_R.close()

    cmd = "weblogo -F png -s large -f {0}.fasta -o {0}.png"
    cmd += " --color-scheme classic --composition none -U probability"
    cmd += " --title {1}"
    sh(cmd.format('L', "First_10_bases"))
    sh(cmd.format('R', "Last_10_bases"))

    np.savetxt("L.{0}.csv".format(pat), L, delimiter=',', fmt="%d")
    np.savetxt("R.{0}.csv".format(pat), R, delimiter=',', fmt="%d")

    fw = open("L4.common", "w")
    for p, c in L4.most_common(N):
        print >> fw, "\t".join((p, str(c)))
    fw.close()

    fw = open("R3.common", "w")
    for p, c in R3.most_common(N):
        print >> fw, "\t".join((p, str(c)))
    fw.close()
Example #8
0
def weblogo(args):
    """
    %prog weblogo [fastafile|fastqfile]

    Extract base composition for reads
    """
    import numpy as np
    from rich.progress import Progress

    p = OptionParser(weblogo.__doc__)
    p.add_option("-N", default=10, type="int", help="Count the first and last N bases")
    p.add_option("--nreads", default=1000000, type="int", help="Parse first N reads")
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    (fastqfile,) = args
    N = opts.N
    nreads = opts.nreads

    pat = "ATCG"
    L = np.zeros((4, N), dtype="int32")
    R = np.zeros((4, N), dtype="int32")
    p = dict((a, i) for (i, a) in enumerate(pat))
    L4, R3 = Counter(), Counter()

    k = 0
    fw_L = open("L.fasta", "w")
    fw_R = open("R.fasta", "w")
    fastq = fastqfile.endswith(".fastq")
    it = iter_fastq(fastqfile) if fastq else SeqIO.parse(must_open(fastqfile), "fasta")

    with Progress() as progress:
        progress.add_task("[green] Processing ...", start=False, total=nreads)
        for rec in it:
            k += 1
            if k > nreads:
                break
            if rec is None:
                break
            s = str(rec.seq)
            for i, a in enumerate(s[:N]):
                if a in p:
                    a = p[a]
                    L[a][i] += 1
            for j, a in enumerate(s[-N:][::-1]):
                if a in p:
                    a = p[a]
                    R[a][N - 1 - j] += 1
            l4, r3 = s[:4], s[-3:]
            L4[l4] += 1
            R3[r3] += 1
            print(">{0}\n{1}".format(k, s[:N]), file=fw_L)
            print(">{0}\n{1}".format(k, s[-N:]), file=fw_R)

    fw_L.close()
    fw_R.close()

    cmd = "weblogo -F png -s large -f {0}.fasta -o {0}.png"
    cmd += " --color-scheme classic --composition none -U probability"
    cmd += " --title {1}"
    sh(cmd.format("L", "First_10_bases"))
    sh(cmd.format("R", "Last_10_bases"))

    np.savetxt("L.{0}.csv".format(pat), L, delimiter=",", fmt="%d")
    np.savetxt("R.{0}.csv".format(pat), R, delimiter=",", fmt="%d")

    fw = open("L4.common", "w")
    for p, c in L4.most_common(N):
        print("\t".join((p, str(c))), file=fw)
    fw.close()

    fw = open("R3.common", "w")
    for p, c in R3.most_common(N):
        print("\t".join((p, str(c))), file=fw)
    fw.close()