Example #1
0
def flip(args):
    """
    %prog flip fastafile

    Go through each FASTA record, check against Genbank file and determines
    whether or not to flip the sequence. This is useful before updates of the
    sequences to make sure the same orientation is used.
    """
    p = OptionParser(flip.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    fastafile, = args
    # Output sits next to the input: "x.fasta" -> "x.flipped.fasta".
    outfastafile = fastafile.rsplit(".", 1)[0] + ".flipped.fasta"
    # Scratch file holding one record at a time; overlap() is given file
    # names / record names rather than in-memory records.
    tmpfasta = "a.fasta"
    f = Fasta(fastafile, lazy=True)
    # The context manager guarantees the output is flushed and closed, even
    # when a record fails half-way through.
    with open(outfastafile, "w") as fo:
        for name, rec in f.iteritems_ordered():
            with open(tmpfasta, "w") as fw:
                SeqIO.write([rec], fw, "fasta")

            try:
                o = overlap([tmpfasta, name])
                needs_flip = o.orientation == '-'
            finally:
                # Never leave the scratch file behind if overlap() raises.
                os.remove(tmpfasta)

            if needs_flip:
                rec.seq = rec.seq.reverse_complement()

            SeqIO.write([rec], fo, "fasta")
Example #2
0
def filter(args):
    """
    %prog filter consensus.fasta

    Filter consensus sequence with min cluster size.
    """
    from jcvi.formats.fasta import Fasta, SeqIO

    p = OptionParser(filter.__doc__)
    p.add_option("--minsize", default=10, type="int",
                 help="Minimum cluster size")
    p.set_outfile()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    # Exactly one positional argument: the consensus FASTA to filter.
    consensus_fasta, = args
    threshold = opts.minsize
    records = Fasta(consensus_fasta, lazy=True)
    out = must_open(opts.outfile, "w")
    for header, record in records.iterdescriptions_ordered():
        # Singleton entries carry no cluster size and are always dropped.
        if not header.startswith("singleton"):
            # consensus_for_cluster_0 with 63 sequences
            _cluster, keyword, count, _unit = header.split()
            assert keyword == "with"
            # Keep only clusters with at least `threshold` member sequences.
            if int(count) >= threshold:
                SeqIO.write(record, out, "fasta")
Example #3
0
def filter(args):
    """
    %prog filter consensus.fasta

    Filter consensus sequence with min cluster size.
    """
    from jcvi.formats.fasta import Fasta, SeqIO

    p = OptionParser(filter.__doc__)
    p.add_option(
        "--minsize",
        default=10,
        type="int",
        help="Minimum cluster size",
    )
    p.set_outfile()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    # The single positional argument is the consensus FASTA file.
    source, = args
    cutoff = opts.minsize
    reader = Fasta(source, lazy=True)
    sink = must_open(opts.outfile, "w")
    for description, entry in reader.iterdescriptions_ordered():
        if description.startswith("singleton"):
            # Singletons have no "with N sequences" suffix; never written.
            continue
        # consensus_for_cluster_0 with 63 sequences
        tokens = description.split()
        _cluster, joiner, nseqs, _unit = tokens
        assert joiner == "with"
        nseqs = int(nseqs)
        # Clusters smaller than the cutoff are skipped.
        if nseqs >= cutoff:
            SeqIO.write(entry, sink, "fasta")
Example #4
0
def flip(args):
    """
    %prog flip fastafile

    Go through each FASTA record, check against Genbank file and determines
    whether or not to flip the sequence. This is useful before updates of the
    sequences to make sure the same orientation is used.
    """
    p = OptionParser(flip.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    fastafile, = args
    # "x.fasta" -> "x.flipped.fasta", written alongside the input.
    outfastafile = fastafile.rsplit(".", 1)[0] + ".flipped.fasta"
    # One record at a time is dumped to this scratch file for overlap().
    tmpfasta = "a.fasta"
    f = Fasta(fastafile, lazy=True)
    # `with` closes (and therefore flushes) the output on every exit path;
    # previously the handle was never closed.
    with open(outfastafile, "w") as fo:
        for name, rec in f.iteritems_ordered():
            with open(tmpfasta, "w") as fw:
                SeqIO.write([rec], fw, "fasta")

            try:
                o = overlap([tmpfasta, name])
                needs_flip = o.orientation == '-'
            finally:
                # Clean up the scratch file even when overlap() fails.
                os.remove(tmpfasta)

            if needs_flip:
                rec.seq = rec.seq.reverse_complement()

            SeqIO.write([rec], fo, "fasta")
Example #5
0
def extract_ends(rec, sites, flank, fw, maxfragsize=800):
    """
    Write the flanking sequence on either side of each site to `fw` as FASTA.

    For every site `s`, up to `flank` bases to the left (id suffix "L") and to
    the right (id suffix "R") are sliced from `rec.seq`, stripped of leading
    and trailing N/n, and written out. A side is only emitted when the
    distance from `s` to its neighbour on that side -- the previous/next site,
    or the contig start/end for the first/last site -- is at most
    `maxfragsize`.
    """
    last = len(sites) - 1
    contig_len = len(rec)
    for idx, site in enumerate(sites):
        base_id = "{0}:{1}".format(rec.name, site)
        batch = []

        is_first = idx == 0
        if is_first or site - sites[idx - 1] <= maxfragsize:
            upstream = rec.seq[max(site - flank, 0):site].strip("Nn")
            left_rec = SeqRecord(upstream, id=base_id + "L", description="")
            # The first site is measured against the contig start (offset 0).
            if not (is_first and site > maxfragsize):
                batch.append(left_rec)

        is_last = idx == last
        if is_last or sites[idx + 1] - site <= maxfragsize:
            downstream = rec.seq[site:min(site + flank, contig_len)].strip("Nn")
            right_rec = SeqRecord(downstream, id=base_id + "R", description="")
            # The last site is measured against the contig end.
            if not (is_last and contig_len - site > maxfragsize):
                batch.append(right_rec)

        SeqIO.write(batch, fw, "fasta")
Example #6
0
def extract_ends(rec, sites, flank, fw, maxfragsize=800):
    """
    Emit left ("L") and right ("R") flanks of each site as FASTA records.

    A flank spans at most `flank` bases next to the site, clipped to the
    bounds of `rec`, with leading/trailing N/n removed. The left flank is
    written only if the gap to the previous site (or to the contig start for
    the first site) does not exceed `maxfragsize`; the right flank likewise
    uses the gap to the next site (or to the contig end for the last site).
    Records for one site are written to `fw` together.
    """
    final_idx = len(sites) - 1
    total = len(rec)
    for pos, cut in enumerate(sites):
        stem = "{0}:{1}".format(rec.name, cut)
        pending = []

        first = pos == 0
        if first or cut - sites[pos - 1] <= maxfragsize:
            lo = max(cut - flank, 0)
            left_piece = SeqRecord(rec.seq[lo:cut].strip("Nn"),
                                   id=stem + "L", description="")
            # Contig L-end: the gap for the first site is the offset itself.
            if not first or cut <= maxfragsize:
                pending.append(left_piece)

        final = pos == final_idx
        if final or sites[pos + 1] - cut <= maxfragsize:
            hi = min(cut + flank, total)
            right_piece = SeqRecord(rec.seq[cut:hi].strip("Nn"),
                                    id=stem + "R", description="")
            # Contig R-end: the gap for the last site runs to the contig end.
            if not final or total - cut <= maxfragsize:
                pending.append(right_piece)

        SeqIO.write(pending, fw, "fasta")
Example #7
0
def extract(args):
    """
    %prog extract gffile

    --contigs: Extract particular contig(s) from the gff file. If multiple contigs are
    involved, use "," to separate, e.g. "contig_12,contig_150"
    --names: Provide a file with IDs, one each line
    """
    p = OptionParser(extract.__doc__)
    p.add_option("--contigs",
                help="Extract features from certain contigs [default: %default]")
    p.add_option("--names",
                help="Extract features with certain names [default: %default]")
    p.add_option("--fasta", default=False, action="store_true",
                help="Write FASTA if available [default: %default]")
    set_outfile(p)

    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    gffile, = args

    # For both filters, None means "no filter": everything passes.
    contigID = set(opts.contigs.split(",")) if opts.contigs else None
    names = None
    if opts.names:
        with open(opts.names) as namesfp:
            names = set(x.strip() for x in namesfp)

    fw = must_open(opts.outfile, "w")
    with open(gffile) as fp:
        for row in fp:
            atoms = row.split()
            if not atoms:
                continue
            tag = atoms[0]
            if row[0] == "#":
                # Comments/pragmas pass through, except region pragmas that
                # name a contig outside the requested set.
                if not (tag == RegionTag and contigID
                        and atoms[1] not in contigID):
                    fw.write(row.rstrip() + "\n")
                # Feature lines end where the embedded FASTA section starts.
                if tag == FastaTag:
                    break
                continue

            b = GffLine(row)
            is_right_contig = (not contigID) or tag in contigID
            is_right_names = (not names) or b.attributes["Name"][0] in names

            if is_right_contig and is_right_names:
                fw.write(row.rstrip() + "\n")

    if not opts.fasta:
        return

    f = Fasta(gffile)
    if contigID is None:
        # No contig filter: previously this iterated over None and raised
        # TypeError. Mirror the feature filter and write every sequence.
        for key, rec in f.iteritems_ordered():
            SeqIO.write([rec], fw, "fasta")
    else:
        for s in contigID:
            if s in f:
                SeqIO.write([f[s]], fw, "fasta")
Example #8
0
def extract_full(rec, sites, flank, fw):
    """
    Write the window of `flank` bases on both sides of every site as FASTA.

    Each window is clipped to the bounds of `rec`, stripped of leading and
    trailing N/n, and written to `fw` under the id "<rec.name>:<site>".
    """
    for site in sites:
        lo = max(site - flank, 0)
        hi = min(site + flank, len(rec))
        window = SeqRecord(
            rec.seq[lo:hi].strip("Nn"),
            id="{0}:{1}".format(rec.name, site),
            description="",
        )
        SeqIO.write([window], fw, "fasta")
Example #9
0
def extract_full(rec, sites, flank, fw):
    """
    Extract the sequence surrounding each site and write it to `fw`.

    For a site `s` the slice covers `s - flank` to `s + flank`, limited to
    the record's extent; N/n at either end of the slice is trimmed. Every
    slice becomes its own FASTA record with id "<rec.name>:<s>" and an empty
    description.
    """
    for center in sites:
        record_id = "{0}:{1}".format(rec.name, center)
        start = max(center - flank, 0)
        stop = min(center + flank, len(rec))
        trimmed = rec.seq[start:stop].strip("Nn")
        SeqIO.write(
            [SeqRecord(trimmed, id=record_id, description="")], fw, "fasta")
Example #10
0
def filter(args):
    """
    %prog filter *.consensus.fasta

    Filter consensus sequence with min cluster size.
    """
    from jcvi.formats.fasta import Fasta, SeqIO

    p = OptionParser(filter.__doc__)
    p.add_option("--minsize",
                 default=2,
                 type="int",
                 help="Minimum cluster size")
    p.set_outfile()
    opts, args = p.parse_args(args)

    if len(args) < 1:
        sys.exit(not p.print_help())

    fastafiles = args
    minsize = opts.minsize
    totalreads = totalassembled = 0
    fw = must_open(opts.outfile, "w")
    for i, fastafile in enumerate(fastafiles):
        f = Fasta(fastafile, lazy=True)
        # Per-file prefix (s000, s001, ...) keeps record ids unique when
        # several inputs are pooled into one output.
        pf = "s{0:03d}".format(i)
        nreads = nsingletons = nclusters = 0
        for desc, rec in f.iterdescriptions_ordered():
            nclusters += 1
            if desc.startswith("singleton"):
                # A singleton counts as one cluster holding one read.
                nsingletons += 1
                nreads += 1
                continue
            # consensus_for_cluster_0 with 63 sequences
            name, w, size, seqs = desc.split()
            assert w == "with"
            size = int(size)
            nreads += size
            if size < minsize:
                continue
            # Drop the leading token of the description and prefix the id.
            rec.description = rec.description.split(None, 1)[-1]
            rec.id = pf + "_" + rec.id
            SeqIO.write(rec, fw, "fasta")
        logging.debug("Scanned {0} clusters with {1} reads ..".format(
            nclusters, nreads))
        # NOTE(review): these tallies cover every non-singleton cluster,
        # including those below minsize that were not written -- confirm.
        cclusters, creads = nclusters - nsingletons, nreads - nsingletons
        # A file with only singletons (or no records) has no clusters;
        # report an average of 0 instead of raising ZeroDivisionError.
        avg = creads / cclusters if cclusters else 0
        logging.debug(
            "Saved {0} clusters (min={1}) with {2} reads (avg:{3}) [{4}]".
            format(cclusters, minsize, creads, avg, pf))
        totalreads += nreads
        totalassembled += nreads - nsingletons
    logging.debug("Total assembled: {0}".format(
        percentage(totalassembled, totalreads)))
Example #11
0
def merge(args):
    """
    %prog merge gffiles

    Merge several gff files into one. When only one file is given, it is assumed
    to be a file with a list of gff files.
    """
    p = OptionParser(merge.__doc__)
    set_outfile(p)

    opts, args = p.parse_args(args)

    nargs = len(args)
    if nargs < 1:
        sys.exit(not p.print_help())

    if nargs == 1:
        listfile, = args
        with open(listfile) as fp:
            # Ignore blank lines so they are not treated as file names.
            gffiles = [x.strip() for x in fp if x.strip()]
    else:
        gffiles = args

    outfile = opts.outfile

    deflines = set()
    fw = must_open(outfile, "w")
    # Records in first-seen order, plus the keys already collected.
    fastarecs = []
    seen = set()
    for gffile in gffiles:
        with open(gffile) as fp:
            for row in fp:
                row = row.rstrip()
                if not row:
                    # Blank lines used to raise IndexError on row[0].
                    continue
                if row[0] == '#':
                    # Stop at the embedded FASTA section; sequences are
                    # gathered separately below.
                    if row == FastaTag:
                        break
                    # Emit each distinct comment/pragma line only once.
                    if row in deflines:
                        continue
                    deflines.add(row)

                fw.write(row + "\n")

        f = Fasta(gffile, lazy=True)
        for key, rec in f.iteritems_ordered():
            if key in seen:
                continue
            seen.add(key)
            fastarecs.append(rec)

    fw.write(FastaTag + "\n")
    SeqIO.write(fastarecs, fw, "fasta")
Example #12
0
def merge(args):
    """
    %prog merge gffiles

    Merge several gff files into one. When only one file is given, it is assumed
    to be a file with a list of gff files.
    """
    p = OptionParser(merge.__doc__)
    set_outfile(p)

    opts, args = p.parse_args(args)

    nargs = len(args)
    if nargs < 1:
        sys.exit(not p.print_help())

    if nargs == 1:
        listfile, = args
        with open(listfile) as fp:
            # Blank lines in the list are skipped, not opened as "" paths.
            gffiles = [x.strip() for x in fp if x.strip()]
    else:
        gffiles = args

    outfile = opts.outfile

    deflines = set()
    fw = must_open(outfile, "w")
    # Keep sequences in the order first encountered; `seen` de-duplicates.
    fastarecs = []
    seen = set()
    for gffile in gffiles:
        with open(gffile) as fp:
            for row in fp:
                row = row.rstrip()
                if not row:
                    # Guard: indexing row[0] on an empty line raised
                    # IndexError before.
                    continue
                if row[0] == '#':
                    # The FASTA tag ends the feature section of this file.
                    if row == FastaTag:
                        break
                    # Comment/pragma lines are written once across inputs.
                    if row in deflines:
                        continue
                    deflines.add(row)

                fw.write(row + "\n")

        f = Fasta(gffile, lazy=True)
        for key, rec in f.iteritems_ordered():
            if key in seen:
                continue
            seen.add(key)
            fastarecs.append(rec)

    fw.write(FastaTag + "\n")
    SeqIO.write(fastarecs, fw, "fasta")
Example #13
0
def filter(args):
    """
    %prog filter *.consensus.fasta

    Filter consensus sequence with min cluster size.
    """
    from jcvi.formats.fasta import Fasta, SeqIO

    p = OptionParser(filter.__doc__)
    p.add_option("--minsize", default=2, type="int",
                 help="Minimum cluster size")
    p.set_outfile()
    opts, args = p.parse_args(args)

    if len(args) < 1:
        sys.exit(not p.print_help())

    fastafiles = args
    minsize = opts.minsize
    totalreads = totalassembled = 0
    fw = must_open(opts.outfile, "w")
    for i, fastafile in enumerate(fastafiles):
        f = Fasta(fastafile, lazy=True)
        # s000, s001, ...: prefix that keeps ids distinct across inputs.
        pf = "s{0:03d}".format(i)
        nreads = nsingletons = nclusters = 0
        for desc, rec in f.iterdescriptions_ordered():
            nclusters += 1
            if desc.startswith("singleton"):
                # Singletons contribute one cluster and one read each.
                nsingletons += 1
                nreads += 1
                continue
            # consensus_for_cluster_0 with 63 sequences
            name, w, size, seqs = desc.split()
            assert w == "with"
            size = int(size)
            nreads += size
            if size < minsize:
                continue
            # Strip the first token from the description, prefix the id.
            rec.description = rec.description.split(None, 1)[-1]
            rec.id = pf + "_" + rec.id
            SeqIO.write(rec, fw, "fasta")
        logging.debug("Scanned {0} clusters with {1} reads ..".format(
            nclusters, nreads))
        # NOTE(review): counts below include clusters under minsize that
        # were skipped above -- confirm that is the intended report.
        cclusters, creads = nclusters - nsingletons, nreads - nsingletons
        # Avoid ZeroDivisionError for inputs without any multi-read cluster.
        avg = creads / cclusters if cclusters else 0
        logging.debug(
            "Saved {0} clusters (min={1}) with {2} reads (avg:{3}) [{4}]".format(
                cclusters, minsize, creads, avg, pf))
        totalreads += nreads
        totalassembled += nreads - nsingletons
    logging.debug("Total assembled: {0}".format(
        percentage(totalassembled, totalreads)))
Example #14
0
def circular(args):
    """
    %prog circular fastafile startpos

    Make circular genome, startpos is the place to start the sequence. This can
    be determined by mapping to a reference. Self overlaps are then resolved.
    Startpos is 1-based.
    """
    from jcvi.assembly.goldenpath import overlap

    p = OptionParser(circular.__doc__)
    p.add_option(
        "--flip",
        default=False,
        action="store_true",
        help="Reverse complement the sequence",
    )
    p.set_outfile()
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    fastafile, startpos = args
    startpos = int(startpos)
    # Only the first record of the FASTA file is used.
    key, seq = next(parse_fasta(fastafile))
    # NOTE(review): the usage text calls startpos 1-based, yet it is applied
    # directly as a 0-based slice offset -- confirm which is intended.
    aseq = seq[startpos:]
    bseq = seq[:startpos]
    aseqfile, bseqfile = "a.seq", "b.seq"

    try:
        for f, s in zip((aseqfile, bseqfile), (aseq, bseq)):
            fw = must_open(f, "w")
            fw.write(">{0}\n{1}\n".format(f, s))
            fw.close()

        # Overlap between the rotated halves marks where each one is cut.
        o = overlap([aseqfile, bseqfile])
        qstop, sstop = o.qstop, o.sstop
    finally:
        # Remove the scratch files even when overlap() fails.
        for f in (aseqfile, bseqfile):
            if os.path.exists(f):
                os.remove(f)

    seq = Seq(aseq[:qstop] + bseq[sstop:])

    if opts.flip:
        seq = seq.reverse_complement()

    fw = must_open(opts.outfile, "w")
    rec = SeqRecord(seq, id=key, description="")
    SeqIO.write([rec], fw, "fasta")
    fw.close()
Example #15
0
def needle(args):
    """
    %prog needle nw.pairs a.pep.fasta b.pep.fasta

    Take protein pairs and needle them
    Automatically writes output file `nw.scores`
    """
    from os.path import splitext

    from jcvi.formats.fasta import Fasta, SeqIO

    p = OptionParser(needle.__doc__)

    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    # Workers append their result lines to a manager-backed shared list.
    manager = mp.Manager()
    results = manager.list()
    needle_pool = mp.Pool(processes=mp.cpu_count())

    pairsfile, apep, bpep = args
    afasta, bfasta = Fasta(apep), Fasta(bpep)
    fp = must_open(pairsfile)
    for i, row in enumerate(fp):
        if not row.strip():
            # Blank rows would otherwise fail the two-way unpack below.
            continue
        a, b = row.split()
        a, b = afasta[a], bfasta[b]
        # Per-pair file names carry the row index so jobs do not collide.
        fa = must_open("{0}_{1}_a.fasta".format(pairsfile, i), "w")
        fb = must_open("{0}_{1}_b.fasta".format(pairsfile, i), "w")
        SeqIO.write([a], fa, "fasta")
        SeqIO.write([b], fb, "fasta")
        fa.close()
        fb.close()

        needlefile = "{0}_{1}_ab.needle".format(pairsfile, i)
        needle_pool.apply_async(
            _needle, (fa.name, fb.name, needlefile, a.id, b.id, results))

    needle_pool.close()
    needle_pool.join()

    fp.close()

    # splitext strips only the final extension. The earlier
    # rsplit(".")[0] cut at the FIRST dot, turning "./nw.pairs" into
    # ".scores" and "a.b.pairs" into "a.scores".
    scoresfile = "{0}.scores".format(splitext(pairsfile)[0])
    fw = must_open(scoresfile, "w")
    for result in results:
        fw.write("{0}\n".format(result))
    fw.close()
Example #16
0
def needle(args):
    """
    %prog needle nw.pairs a.pep.fasta b.pep.fasta

    Take protein pairs and needle them
    Automatically writes output file `nw.scores`
    """
    from os.path import splitext

    from jcvi.formats.fasta import Fasta, SeqIO

    p = OptionParser(needle.__doc__)

    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    # Shared list (via the manager) that worker processes append to.
    manager = mp.Manager()
    results = manager.list()
    needle_pool = mp.Pool(processes=mp.cpu_count())

    pairsfile, apep, bpep = args
    afasta, bfasta = Fasta(apep), Fasta(bpep)
    fp = must_open(pairsfile)
    for i, row in enumerate(fp):
        if not row.strip():
            # Skip blank rows rather than crash on the unpack below.
            continue
        a, b = row.split()
        a, b = afasta[a], bfasta[b]
        # The row index in each file name keeps concurrent jobs apart.
        fa = must_open("{0}_{1}_a.fasta".format(pairsfile, i), "w")
        fb = must_open("{0}_{1}_b.fasta".format(pairsfile, i), "w")
        SeqIO.write([a], fa, "fasta")
        SeqIO.write([b], fb, "fasta")
        fa.close()
        fb.close()

        needlefile = "{0}_{1}_ab.needle".format(pairsfile, i)
        needle_pool.apply_async(
            _needle, (fa.name, fb.name, needlefile, a.id, b.id, results))

    needle_pool.close()
    needle_pool.join()

    fp.close()

    # Strip only the last extension: rsplit(".")[0] split on every dot and
    # kept the text before the first one ("./nw.pairs" -> "").
    scoresfile = "{0}.scores".format(splitext(pairsfile)[0])
    fw = must_open(scoresfile, "w")
    for result in results:
        fw.write("{0}\n".format(result))
    fw.close()
Example #17
0
def phase(accession):
    """
    Return `(phase, length)` for the first record of an accession's Genbank file.

    The file is looked up as `gb/<accession>.gb`; when it does not exist yet,
    `entrez()` is invoked with `--outdir=gb --format=gb` before parsing.
    The phase is the first element returned by `get_phase()`; the length is
    `len()` of the parsed record.
    """
    gbdir = "gb"
    gbfile = op.join(gbdir, accession + ".gb")
    if not op.exists(gbfile):
        entrez([accession, "--skipcheck", "--outdir=" + gbdir, "--format=gb"])
    # The builtin next() works on Python 2.6+ and 3; the iterator's .next()
    # method exists on Python 2 only.
    rec = next(SeqIO.parse(gbfile, "gb"))
    ph, _keywords = get_phase(rec)
    return ph, len(rec)
Example #18
0
def phase(accession):
    """
    Look up the phase and record length for `accession`.

    Reads the first record of `gb/<accession>.gb`, calling `entrez()` first
    if that file is not present, and returns the phase reported by
    `get_phase()` together with `len()` of the record.
    """
    cache_dir = "gb"
    local_copy = op.join(cache_dir, accession + ".gb")
    if not op.exists(local_copy):
        fetch_args = [
            accession,
            "--skipcheck",
            "--outdir=" + cache_dir,
            "--format=gb",
        ]
        entrez(fetch_args)
    records = SeqIO.parse(local_copy, "gb")
    first = next(records)
    # get_phase() yields a pair; only its first element is returned.
    ph, _ = get_phase(first)
    return ph, len(first)
Example #19
0
def circular(args):
    """
    %prog circular fastafile startpos

    Make circular genome, startpos is the place to start the sequence. This can
    be determined by mapping to a reference. Self overlaps are then resolved.
    Startpos is 1-based.
    """
    from jcvi.assembly.goldenpath import overlap

    p = OptionParser(circular.__doc__)
    p.add_option("--flip", default=False, action="store_true",
                 help="Reverse complement the sequence")
    p.set_outfile()
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    fastafile, startpos = args
    startpos = int(startpos)
    # Builtin next() replaces the Python 2-only .next() method; only the
    # first record of the FASTA file is used.
    key, seq = next(parse_fasta(fastafile))
    # NOTE(review): the usage text says startpos is 1-based, but it is used
    # as a 0-based slice offset here -- confirm which is intended.
    aseq = seq[startpos:]
    bseq = seq[:startpos]
    aseqfile, bseqfile = "a.seq", "b.seq"

    try:
        for f, s in zip((aseqfile, bseqfile), (aseq, bseq)):
            fw = must_open(f, "w")
            # write() behaves the same on Python 2 and 3, unlike `print >>`.
            fw.write(">{0}\n{1}\n".format(f, s))
            fw.close()

        # The overlap of the two rotated halves gives the trim points.
        o = overlap([aseqfile, bseqfile])
        qstop, sstop = o.qstop, o.sstop
    finally:
        # Scratch files are removed on failure as well as on success.
        for f in (aseqfile, bseqfile):
            if os.path.exists(f):
                os.remove(f)

    seq = Seq(aseq[:qstop] + bseq[sstop:])

    if opts.flip:
        seq = seq.reverse_complement()

    fw = must_open(opts.outfile, "w")
    rec = SeqRecord(seq, id=key, description="")
    SeqIO.write([rec], fw, "fasta")
    fw.close()
Example #20
0
def needle(args):
    """
    %prog needle pairs a.pep.fasta b.pep.fasta

    Take protein pairs and needle them.
    """
    from Bio.Emboss.Applications import NeedleCommandline

    from jcvi.formats.fasta import Fasta, SeqIO
    from jcvi.formats.base import FileShredder

    p = OptionParser(needle.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    pairsfile, apep, bpep = args
    afasta = Fasta(apep)
    bfasta = Fasta(bpep)
    # Scratch files, rewritten for each pair and shredded afterwards.
    afile = pairsfile + "_a.fasta"
    bfile = pairsfile + "_b.fasta"
    needlefile = pairsfile + "_ab.needle"
    with open(pairsfile) as fp:
        for row in fp:
            if not row.strip():
                # Blank rows would fail the two-way unpack below.
                continue
            a, b = row.split()
            a = afasta[a]
            b = bfasta[b]
            with open(afile, "w") as fa:
                SeqIO.write([a], fa, "fasta")
            with open(bfile, "w") as fb:
                SeqIO.write([b], fb, "fasta")

            needle_cline = NeedleCommandline(asequence=afile,
                                             bsequence=bfile,
                                             gapopen=10,
                                             gapextend=0.5,
                                             outfile=needlefile)
            stdout, stderr = needle_cline()
            # Explicit stream writes replace the Python 2 print statements,
            # which are a SyntaxError on Python 3.
            sys.stderr.write(stdout + stderr + "\n")
            nh = NeedleHeader(needlefile)
            sys.stdout.write(
                "\t".join((a.id, b.id, nh.identity, nh.score)) + "\n")
            FileShredder([afile, bfile, needlefile])
Example #21
0
File: emboss.py Project: rrane/jcvi
def needle(args):
    """
    %prog needle pairs a.pep.fasta b.pep.fasta

    Take protein pairs and needle them.
    """
    from Bio.Emboss.Applications import NeedleCommandline

    from jcvi.formats.fasta import Fasta, SeqIO
    from jcvi.formats.base import FileShredder

    p = OptionParser(needle.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    pairsfile, apep, bpep = args
    afasta = Fasta(apep)
    bfasta = Fasta(bpep)
    # The same three scratch paths are reused for every pair.
    afile = pairsfile + "_a.fasta"
    bfile = pairsfile + "_b.fasta"
    needlefile = pairsfile + "_ab.needle"
    with open(pairsfile) as fp:
        for row in fp:
            if not row.strip():
                # Skip blank rows instead of crashing on the unpack.
                continue
            a, b = row.split()
            a = afasta[a]
            b = bfasta[b]
            with open(afile, "w") as fa:
                SeqIO.write([a], fa, "fasta")
            with open(bfile, "w") as fb:
                SeqIO.write([b], fb, "fasta")

            needle_cline = NeedleCommandline(asequence=afile,
                                             bsequence=bfile,
                                             gapopen=10, gapextend=0.5,
                                             outfile=needlefile)
            stdout, stderr = needle_cline()
            # Stream writes are valid on both Python 2 and 3; the former
            # `print "..."` statement does not parse on Python 3.
            sys.stderr.write(stdout + stderr + "\n")
            nh = NeedleHeader(needlefile)
            sys.stdout.write(
                "\t".join((a.id, b.id, nh.identity, nh.score)) + "\n")
            FileShredder([afile, bfile, needlefile])
Example #22
0
def overlapbatch(args):
    """
    %prog overlapbatch ctgfasta poolfasta

    Fish out the sequences in `poolfasta` that overlap with `ctgfasta`.
    Mix and combine using `minimus2`.
    """
    # Use this command's own usage text; it previously showed overlap's.
    p = OptionParser(overlapbatch.__doc__)
    opts, args = p.parse_args(args)
    if len(args) != 2:
        sys.exit(not p.print_help())

    ctgfasta, poolfasta = args
    f = Fasta(ctgfasta)
    for k, rec in f.iteritems_ordered():
        # Each contig goes to its own "<name>.fasta" before the overlap run.
        fastafile = k + ".fasta"
        with open(fastafile, "w") as fw:
            SeqIO.write([rec], fw, "fasta")

        overlap([fastafile, poolfasta])
Example #23
0
def overlapbatch(args):
    """
    %prog overlapbatch ctgfasta poolfasta

    Fish out the sequences in `poolfasta` that overlap with `ctgfasta`.
    Mix and combine using `minimus2`.
    """
    # The parser must carry overlapbatch's usage text, not overlap's
    # (copy-paste slip that made --help describe the wrong command).
    p = OptionParser(overlapbatch.__doc__)
    opts, args = p.parse_args(args)
    if len(args) != 2:
        sys.exit(not p.print_help())

    ctgfasta, poolfasta = args
    f = Fasta(ctgfasta)
    for k, rec in f.iteritems_ordered():
        # One single-record FASTA per contig, named after the contig.
        fastafile = k + ".fasta"
        with open(fastafile, "w") as fw:
            SeqIO.write([rec], fw, "fasta")

        overlap([fastafile, poolfasta])
Example #24
0
def weblogo(args):
    """
    %prog weblogo [fastafile|fastqfile]

    Extract base composition for reads
    """
    import numpy as np
    from rich.progress import Progress

    p = OptionParser(weblogo.__doc__)
    p.add_option("-N", default=10, type="int", help="Count the first and last N bases")
    p.add_option("--nreads", default=1000000, type="int", help="Parse first N reads")
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    (fastqfile,) = args
    N = opts.N
    nreads = opts.nreads

    pat = "ATCG"
    # Count matrices: one row per base in `pat`, one column per position.
    L = np.zeros((4, N), dtype="int32")
    R = np.zeros((4, N), dtype="int32")
    # Base -> row index. Own name, so the option parser `p` is not rebound.
    base_row = dict((a, i) for (i, a) in enumerate(pat))
    L4, R3 = Counter(), Counter()

    # `with` closes both FASTA outputs even if parsing fails midway.
    with open("L.fasta", "w") as fw_L, open("R.fasta", "w") as fw_R:
        fastq = fastqfile.endswith(".fastq")
        it = (
            iter_fastq(fastqfile)
            if fastq
            else SeqIO.parse(must_open(fastqfile), "fasta")
        )

        with Progress() as progress:
            # NOTE(review): the task is registered but never advanced in
            # this function -- confirm whether that is intentional.
            progress.add_task("[green] Processing ...", start=False, total=nreads)
            for k, rec in enumerate(it, 1):
                if k > nreads or rec is None:
                    break
                s = str(rec.seq)
                for i, a in enumerate(s[:N]):
                    if a in base_row:
                        L[base_row[a]][i] += 1
                # Walk the tail backwards so column N-1 is the last base.
                for j, a in enumerate(s[-N:][::-1]):
                    if a in base_row:
                        R[base_row[a]][N - 1 - j] += 1
                L4[s[:4]] += 1
                R3[s[-3:]] += 1
                print(">{0}\n{1}".format(k, s[:N]), file=fw_L)
                print(">{0}\n{1}".format(k, s[-N:]), file=fw_R)

    cmd = "weblogo -F png -s large -f {0}.fasta -o {0}.png"
    cmd += " --color-scheme classic --composition none -U probability"
    cmd += " --title {1}"
    # Titles follow the -N option; they were hard-coded to 10 before.
    sh(cmd.format("L", "First_{0}_bases".format(N)))
    sh(cmd.format("R", "Last_{0}_bases".format(N)))

    np.savetxt("L.{0}.csv".format(pat), L, delimiter=",", fmt="%d")
    np.savetxt("R.{0}.csv".format(pat), R, delimiter=",", fmt="%d")

    # Most common 4-base prefixes and 3-base suffixes, top N of each.
    with open("L4.common", "w") as fw:
        for kmer, c in L4.most_common(N):
            print("\t".join((kmer, str(c))), file=fw)

    with open("R3.common", "w") as fw:
        for kmer, c in R3.most_common(N):
            print("\t".join((kmer, str(c))), file=fw)
Example #25
0
def load(args):
    '''
    %prog load gff_file fasta_file [--options]

    Parses the selected features out of GFF, with subfeatures concatenated.
    For example, to get the CDS sequences, do this::

    $ %prog load athaliana.gff athaliana.fa --parents mRNA --children CDS
    '''
    from jcvi.formats.fasta import Seq, SeqRecord

    p = OptionParser(load.__doc__)
    p.add_option(
        "--parents",
        dest="parents",
        default="mRNA",
        help="list of features to extract, use comma to separate (e.g."
        "'gene,mRNA') [default: %default]")
    p.add_option(
        "--children",
        dest="children",
        default="CDS",
        help="list of features to extract, use comma to separate (e.g."
        "'five_prime_UTR,CDS,three_prime_UTR') [default: %default]")
    p.add_option("--attribute",
                 help="The attribute field to extract [default: %default]")
    set_outfile(p)

    opts, args = p.parse_args(args)

    if len(args) != 2:
        # `not p.print_help()` gives a non-zero exit status on a usage
        # error; passing print_help()'s None to sys.exit() reported success.
        sys.exit(not p.print_help())

    gff_file, fasta_file = args

    g = make_index(gff_file)
    f = Fasta(fasta_file, index=False)
    fw = must_open(opts.outfile, "w")

    parents = set(opts.parents.split(','))
    children_list = set(opts.children.split(','))
    attr = opts.attribute

    for feat in get_parents(gff_file, parents):

        # Collect (sequence, feature) for each child of a requested type.
        children = []
        for c in g.children(feat.id, 1):

            if c.featuretype not in children_list:
                continue
            child = f.sequence(
                dict(chr=c.chrom, start=c.start, stop=c.stop, strand=c.strand))
            children.append((child, c))

        if not children:
            # write() works on Python 2 and 3, unlike `print >>sys.stderr`.
            sys.stderr.write("[warning] %s has no children with type %s\n"
                             % (feat.id, ','.join(children_list)))
            continue
        # sort children in incremental position
        children.sort(key=lambda x: x[1].start)
        # reverse children if negative strand
        if feat.strand == '-':
            children.reverse()
        feat_seq = ''.join(x[0] for x in children)

        # Optional description from the chosen attribute, quotes removed.
        description = ",".join(feat.attributes[attr]) \
                if attr and attr in feat.attributes else ""
        description = description.replace("\"", "")

        rec = SeqRecord(Seq(feat_seq), id=feat.id, description=description)
        SeqIO.write([rec], fw, "fasta")
        fw.flush()
Example #26
0
def extract(args):
    """
    %prog extract gffile

    --contigs: Extract particular contig(s) from the gff file. If multiple contigs are
    involved, use "," to separate, e.g. "contig_12,contig_150"
    --names: Provide a file with IDs, one each line
    """
    p = OptionParser(extract.__doc__)
    p.add_option(
        "--contigs",
        help="Extract features from certain contigs [default: %default]")
    p.add_option(
        "--names",
        help="Extract features with certain names [default: %default]")
    p.add_option("--fasta",
                 default=False,
                 action="store_true",
                 help="Write FASTA if available [default: %default]")
    set_outfile(p)

    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    gffile, = args

    # A filter left as None is disabled: every contig / name is accepted.
    contigID = set(opts.contigs.split(",")) if opts.contigs else None
    names = None
    if opts.names:
        with open(opts.names) as namesfp:
            names = set(x.strip() for x in namesfp)

    fw = must_open(opts.outfile, "w")
    with open(gffile) as fp:
        for row in fp:
            atoms = row.split()
            if not atoms:
                continue
            tag = atoms[0]
            if row[0] == "#":
                # Keep comment/pragma lines, dropping only region pragmas
                # for contigs that were not requested.
                if not (tag == RegionTag and contigID
                        and atoms[1] not in contigID):
                    fw.write(row.rstrip() + "\n")
                # The FASTA tag marks the end of the feature lines.
                if tag == FastaTag:
                    break
                continue

            b = GffLine(row)
            is_right_contig = (not contigID) or tag in contigID
            is_right_names = (not names) or b.attributes["Name"][0] in names

            if is_right_contig and is_right_names:
                fw.write(row.rstrip() + "\n")

    if not opts.fasta:
        return

    f = Fasta(gffile)
    if contigID is None:
        # --fasta without --contigs used to iterate over None (TypeError);
        # with no contig filter, every sequence is written instead.
        for key, rec in f.iteritems_ordered():
            SeqIO.write([rec], fw, "fasta")
    else:
        for s in contigID:
            if s in f:
                SeqIO.write([f[s]], fw, "fasta")
Example #27
0
def load(args):
    '''
    %prog load gff_file fasta_file [--options]

    Parses the selected features out of GFF, with subfeatures concatenated.
    For example, to get the CDS sequences, do this::

    $ %prog load athaliana.gff athaliana.fa --parents mRNA --children CDS
    '''
    # NOTE: the docstring above doubles as the command-line usage text, so
    # implementation notes are kept in comments instead.
    #
    # For every parent feature, the sequences of its matching children are
    # fetched from the FASTA file, ordered along the parent's strand, joined
    # and written as one FASTA record. Nothing is returned.
    from jcvi.formats.fasta import Seq, SeqRecord

    p = OptionParser(load.__doc__)
    p.add_option("--parents", dest="parents", default="mRNA",
            help="list of features to extract, use comma to separate (e.g."
            "'gene,mRNA') [default: %default]")
    p.add_option("--children", dest="children", default="CDS",
            help="list of features to extract, use comma to separate (e.g."
            "'five_prime_UTR,CDS,three_prime_UTR') [default: %default]")
    p.add_option("--attribute",
            help="The attribute field to extract [default: %default]")
    set_outfile(p)

    opts, args = p.parse_args(args)

    if len(args) != 2:
        # print_help() returns None, so `not ...` yields a non-zero exit
        # status for a usage error, as the sibling commands do.
        sys.exit(not p.print_help())

    gff_file, fasta_file = args

    g = make_index(gff_file)
    f = Fasta(fasta_file, index=False)
    fw = must_open(opts.outfile, "w")

    parents = set(opts.parents.split(','))
    children_list = set(opts.children.split(','))
    attr = opts.attribute

    for feat in get_parents(gff_file, parents):

        children = []
        for c in g.children(feat.id, 1):

            if c.featuretype not in children_list:
                continue
            child = f.sequence(dict(chr=c.chrom, start=c.start, stop=c.stop,
                strand=c.strand))
            children.append((child, c))

        if not children:
            sys.stderr.write("[warning] %s has no children with type %s\n"
                             % (feat.id, ','.join(children_list)))
            continue
        # sort children in incremental position
        children.sort(key=lambda x: x[1].start)
        # reverse children if negative strand
        if feat.strand == '-':
            children.reverse()
        feat_seq = ''.join(x[0] for x in children)

        # Optional description: the requested attribute's values, comma
        # joined, with double quotes stripped.
        description = ",".join(feat.attributes[attr]) \
                if attr and attr in feat.attributes else ""
        description = description.replace("\"", "")

        rec = SeqRecord(Seq(feat_seq), id=feat.id, description=description)
        SeqIO.write([rec], fw, "fasta")
        fw.flush()
Example #28
0
def install(args):
    """
    %prog install patchers.bed patchers.fasta backbone.fasta alt.fasta

    Install patches into backbone, using sequences from alternative assembly.
    The patches sequences are generated via jcvi.assembly.patch.fill().

    The output is a bedfile that can be converted to AGP using
    jcvi.formats.agp.frombed().
    """
    # The docstring above is also the usage text shown by the option parser.
    from jcvi.apps.align import blast
    from jcvi.formats.fasta import SeqIO

    p = OptionParser(install.__doc__)
    p.set_rclip(rclip=1)
    p.add_option("--maxsize", default=300000, type="int",
                 help="Maximum size of patchers to be replaced")
    p.add_option("--prefix", help="Prefix of the new object")
    p.add_option("--strict", default=False, action="store_true",
                 help="Only update if replacement has no gaps")
    opts, args = p.parse_args(args)

    if len(args) != 4:
        sys.exit(not p.print_help())

    pbed, pfasta, bbfasta, altfasta = args

    # Align the patchers to the alternative assembly, then derive the
    # "before" (backbone) and "after" (replacement) intervals.
    blastfile = blast([altfasta, pfasta, "--wordsize=100", "--pctid=99"])
    beforebed, afterbed = blast_to_twobeds(
        blastfile,
        Bed(pbed).order,
        rclip=opts.rclip,
        maxsize=opts.maxsize,  # Max DNA size to replace gap
    )

    beforefasta = fastaFromBed(beforebed, bbfasta, name=True, stranded=True)
    afterfasta = fastaFromBed(afterbed, altfasta, name=True, stranded=True)

    def gap_bases(record):
        # Number of N bases, either case.
        return record.seq.count("n") + record.seq.count("N")

    # Collect the replacements that do not reduce the number of Ns
    # (or, in strict mode, that still contain any N).
    rejected = set()
    record_pairs = zip(SeqIO.parse(beforefasta, "fasta"),
                       SeqIO.parse(afterfasta, "fasta"))
    for old_rec, new_rec in record_pairs:
        old_gaps = gap_bases(old_rec)
        new_gaps = gap_bases(new_rec)
        if opts.strict:
            acceptable = new_gaps == 0
        else:
            acceptable = new_gaps < old_gaps
        if not acceptable:
            rejected.add(old_rec.id)

    logging.debug(
        "Ignore {0} updates because of decreasing quality.".format(len(rejected))
    )

    kept_before = [x for x in Bed(beforebed, sorted=False)
                   if x.accn not in rejected]
    kept_after = [x for x in Bed(afterbed, sorted=False)
                  if x.accn not in rejected]

    afbed = Bed()
    afbed.extend(kept_before)
    bfbed = Bed()
    bfbed.extend(kept_after)

    afbed.print_to_file("before.filtered.bed")
    bfbed.print_to_file("after.filtered.bed")

    shuffle_twobeds(afbed, bfbed, bbfasta, prefix=opts.prefix)
Example #29
0
def prepare(args):
    """
    %prog prepare --rearray_lib=<rearraylibrary> --orig_lib_file=<origlibfile>

    Inferred file names
    ---------------------------------------------
    `lookuptblfile` : rearraylibrary.lookup
    `rearraylibfile`: rearraylibrary.fasta

    Pick sequences from the original library file and the rearrayed library file
    based on the mapping information provided in the `lookuptblfile`.

    # lookuptblfile format: column number (index)
    # 1 (0)          2 (1)          3 (2)         4 (3)        5 (4)        6 (5)
    # source_clone   source_plate   source_well   dest_clone   dest_plate   dest_well

    The 1st and 4th column in the `lookuptblfile` form the pair of clones which
    constitute the elements used for the per-clone assembly.
    """
    # NOTE: the docstring above doubles as the command-line usage text, so
    # implementation notes are kept in comments instead.
    #
    # For every row of the lookup table, one FASTA file is written into the
    # output folder holding the reads whose IDs start with the source clone
    # name (original library) or the destination clone name (rearrayed
    # library). The written paths are listed in `<rearray_lib>.log`.
    from operator import itemgetter
    from jcvi.formats.fasta import Fasta, SeqIO

    p = OptionParser(prepare.__doc__)
    p.add_option("--rearray_lib",
                 default=None,
                 help="name of the rearrayed library [default: %default]")
    p.add_option(
        "--orig_lib_file",
        help=
        "fasta file containing reads from the original libraries [default: %default]"
    )

    g = OptionGroup(p, "Optional parameters")
    g.add_option(
        "--output_folder",
        default="to_assemble",
        help="output folder to write the FASTA files to [default: %default]")
    p.add_option_group(g)

    opts, args = p.parse_args(args)

    if not opts.rearray_lib or not opts.orig_lib_file:
        logging.error("Please specify the required parameters")
        sys.exit(not p.print_help())

    rearraylib, origlibfile = opts.rearray_lib, opts.orig_lib_file

    # A missing input is an error, so exit with a non-zero status.
    if not op.isfile(origlibfile):
        logging.error(
            "Original library reads file `{0}` does not exist!".format(
                origlibfile))
        sys.exit(1)

    lookuptblfile = rearraylib + '.lookup'
    logging.debug(lookuptblfile)
    if not op.isfile(lookuptblfile):
        logging.error(
            "Lookup table file `{0}` does not exist!".format(lookuptblfile))
        sys.exit(1)

    rearraylibfile = rearraylib + '.fasta'
    logging.debug(rearraylibfile)
    if not op.isfile(rearraylibfile):
        logging.error(
            "Rearrayed library reads file `{0}` does not exist!".format(
                rearraylibfile))
        sys.exit(1)

    origlibFasta = Fasta(origlibfile)
    rearraylibFasta = Fasta(rearraylibfile)

    origlibids = list(origlibFasta.iterkeys_ordered())
    rearraylibids = list(rearraylibFasta.iterkeys_ordered())

    if not op.isdir(opts.output_folder):
        logging.warning(
            "Output directory `{0}` missing. Creating it now...".format(
                opts.output_folder))
        os.makedirs(opts.output_folder)

    logfile = rearraylib + '.log'
    clone_pair = itemgetter(0, 3)  # source_clone, dest_clone columns

    with open(lookuptblfile, 'r') as fp, open(logfile, 'w') as log:
        for row in fp:
            # Drop the line terminator first: with a 4-column table it would
            # otherwise end up inside the destination clone name.
            row = row.rstrip('\r\n')
            if not row.strip():
                continue
            origprefix, rearrayprefix = clone_pair(row.split('\t'))
            libpair = origprefix + '_' + rearrayprefix
            outfile = op.join(opts.output_folder, libpair + '.fasta')

            with open(outfile, 'w') as ofp:
                # Clone names are literal prefixes, not regular expressions.
                for o in origlibids:
                    if o.startswith(origprefix):
                        SeqIO.write(origlibFasta[o], ofp, 'fasta')

                for r in rearraylibids:
                    if r.startswith(rearrayprefix):
                        SeqIO.write(rearraylibFasta[r], ofp, 'fasta')

            print(outfile, file=log)

    logging.debug('Wrote log file `{0}`'.format(logfile))
Example #30
0
def install(args):
    """
    %prog install patchers.bed patchers.fasta backbone.fasta alt.fasta

    Install patches into backbone, using sequences from alternative assembly.
    The patches sequences are generated via jcvi.assembly.patch.fill().

    The output is a bedfile that can be converted to AGP using
    jcvi.formats.agp.frombed().
    """
    # The docstring above is also the usage text shown by the option parser.
    from jcvi.apps.align import blast
    from jcvi.formats.fasta import SeqIO

    p = OptionParser(install.__doc__)
    p.set_rclip(rclip=1)
    p.add_option(
        "--maxsize",
        default=300000,
        type="int",
        help="Maximum size of patchers to be replaced [default: %default]",
    )
    p.add_option(
        "--prefix",
        help="Prefix of the new object [default: %default]",
    )
    p.add_option(
        "--strict",
        default=False,
        action="store_true",
        help="Only update if replacement has no gaps [default: %default]",
    )
    opts, args = p.parse_args(args)

    if len(args) != 4:
        sys.exit(not p.print_help())

    pbed, pfasta, bbfasta, altfasta = args
    size_limit = opts.maxsize  # Max DNA size to replace gap
    clip = opts.rclip

    # Align patchers against the alternative assembly and translate the hits
    # into paired "before" / "after" intervals.
    blastfile = blast([altfasta, pfasta, "--wordsize=100", "--pctid=99"])
    patch_order = Bed(pbed).order
    beforebed, afterbed = blast_to_twobeds(
        blastfile, patch_order, rclip=clip, maxsize=size_limit
    )

    beforefasta = fastaFromBed(beforebed, bbfasta, name=True, stranded=True)
    afterfasta = fastaFromBed(afterbed, altfasta, name=True, stranded=True)

    def n_bases(record):
        # Count of N bases regardless of case.
        return record.seq.count('n') + record.seq.count('N')

    # A replacement is kept only when it has fewer Ns than the original, or
    # no Ns at all in strict mode; everything else is excluded.
    exclude = set()
    before_records = SeqIO.parse(beforefasta, "fasta")
    after_records = SeqIO.parse(afterfasta, "fasta")
    for before_rec, after_rec in zip(before_records, after_records):
        n_before = n_bases(before_rec)
        n_after = n_bases(after_rec)
        keep = (n_after == 0) if opts.strict else (n_after < n_before)
        if keep:
            continue
        exclude.add(before_rec.id)

    logging.debug(
        "Ignore {0} updates because of decreasing quality.".format(len(exclude))
    )

    before_lines = [x for x in Bed(beforebed, sorted=False)
                    if x.accn not in exclude]
    after_lines = [x for x in Bed(afterbed, sorted=False)
                   if x.accn not in exclude]

    afbed = Bed()
    afbed.extend(before_lines)
    bfbed = Bed()
    bfbed.extend(after_lines)

    afbed.print_to_file("before.filtered.bed")
    bfbed.print_to_file("after.filtered.bed")

    shuffle_twobeds(afbed, bfbed, bbfasta, prefix=opts.prefix)
Example #31
0
def expand(args):
    """
    %prog expand bes.fasta reads.fastq

    Expand sequences using short reads. Useful, for example for getting BAC-end
    sequences. The template to use, in `bes.fasta` may just contain the junction
    sequences, then align the reads to get the 'flanks' for such sequences.
    """
    # The docstring above is also the usage text shown by the option parser.
    # Returns the name of the annotated FASTA file that is written.
    import math

    from jcvi.formats.fasta import Fasta, SeqIO
    from jcvi.formats.fastq import readlen, first, fasta
    from jcvi.formats.blast import Blast
    from jcvi.formats.base import FileShredder
    from jcvi.apps.bowtie import align, get_samfile
    from jcvi.apps.align import blast

    p = OptionParser(expand.__doc__)
    p.set_depth(depth=200)
    p.set_firstN()
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    bes, reads = args

    # Number of reads needed to reach the requested depth over the template
    # plus one read length of flank on each side, rounded up to a multiple
    # of 1000.
    template_size = Fasta(bes).totalsize
    rl = readlen([reads])
    nreads = (template_size + 2 * rl) * opts.depth / rl
    nreads = int(math.ceil(nreads / 1000.)) * 1000

    # Attract reads
    align_args = [bes, reads, "--reorder", "--mapped"]
    align_args.append("--firstN={0}".format(opts.firstN))
    samfile, logfile = align(align_args)

    samfile, mapped, _ = get_samfile(reads, bes, bowtie=True, mapped=True)
    logging.debug("Extract first {0} reads from `{1}`.".format(nreads, mapped))

    pf = mapped.split(".")[0].split("-")[0]
    bespf = bes.split(".")[0]
    reads = pf + ".expand.fastq"
    first([str(nreads), mapped, "-o", reads])

    # Perform mini-assembly
    fastafile = reads.rsplit(".", 1)[0] + ".fasta"
    qualfile = ""
    if need_update(reads, fastafile):
        fastafile, qualfile = fasta([reads])

    contigs = op.join(pf, "454LargeContigs.fna")
    if need_update(fastafile, contigs):
        sh("runAssembly -o {0} -cpu 8 {1}".format(pf, fastafile))
    assert op.exists(contigs)

    # Annotate contigs: remember the best template hit of every contig
    blastfile = blast([bes, contigs])
    best_hit = {}
    for query, hit in Blast(blastfile).iter_best_hit():
        best_hit[query] = hit

    f = Fasta(contigs, lazy=True)
    annotatedfasta = ".".join((pf, bespf, "fasta"))
    fw = open(annotatedfasta, "w")
    keys = list(Fasta(bes).iterkeys_ordered())  # keep an ordered list
    ranked = []
    for _, contig in f.iteritems_ordered():
        contig_id = contig.id
        if contig_id not in best_hit:
            continue
        hit = best_hit[contig_id]
        template = hit.subject
        # Orient the contig the same way as the template it matched.
        if hit.orientation == '-':
            rec = contig.reverse_complement()
        else:
            rec = contig
        new_id = "_".join((pf, contig_id, template))
        rec.id = new_id
        rec.description = ""
        ranked.append((keys.index(template), new_id, rec))

    # Output follows the template order, then the new record ID.
    ranked = sorted(ranked)
    recs = [entry[-1] for entry in ranked]
    SeqIO.write(recs, fw, "fasta")
    fw.close()

    leftovers = [samfile, logfile, mapped, reads,
                 fastafile, qualfile, blastfile, pf]
    FileShredder(leftovers)
    logging.debug(
        "Annotated seqs (n={0}) written to `{1}`.".format(
            len(recs), annotatedfasta))

    return annotatedfasta
Example #32
0
def install(args):
    """
    %prog install patchers.bed patchers.fasta backbone.fasta alt.fasta

    Install patches into backbone, using sequences from alternative assembly.
    The patches sequences are generated via jcvi.assembly.patch.fill().

    The output is a bedfile that can be converted to AGP using
    jcvi.formats.agp.frombed().
    """
    # NOTE: the docstring above doubles as the command-line usage text, so
    # implementation notes are kept in comments instead.
    #
    # Files written to the working directory: before.bed, after.bed,
    # before.filtered.bed, after.filtered.bed and shuffled.bed.
    import math

    from jcvi.apps.base import blast
    from jcvi.formats.blast import BlastSlow
    from jcvi.formats.fasta import SeqIO
    from jcvi.utils.iter import roundrobin

    p = OptionParser(install.__doc__)
    p.add_option(
        "--rclip",
        default=1,
        type="int",
        help="Pair ID is derived from rstrip N chars [default: %default]")
    p.add_option(
        "--maxsize",
        default=1000000,
        type="int",
        help="Maximum size of patchers to be replaced [default: %default]")
    p.add_option("--prefix",
                 help="Prefix of the new object [default: %default]")
    p.add_option(
        "--strict",
        default=False,
        action="store_true",
        help="Only update if replacement has no gaps [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 4:
        sys.exit(not p.print_help())

    pbed, pfasta, bbfasta, altfasta = args
    Max = opts.maxsize  # Max DNA size to replace gap
    rclip = opts.rclip
    prefix = opts.prefix

    blastfile = blast([altfasta, pfasta, "--wordsize=100", "--pctid=99"])
    order = Bed(pbed).order

    beforebed, afterbed = "before.bed", "after.bed"

    def pair_key(x):
        # Pair ID: the query name without its last `rclip` characters. With
        # rclip == 0 the full query name is the key (the previous lambda
        # returned a function object here, lumping all hits together).
        return x.query[:-rclip] if rclip else x.query

    data = BlastSlow(blastfile)

    with open(beforebed, "w") as fwa, open(afterbed, "w") as fwb:
        for pe, lines in groupby(data, key=pair_key):
            lines = list(lines)
            # Only pairs with exactly one hit per end are usable.
            if len(lines) != 2:
                continue

            a, b = lines

            aquery, bquery = a.query, b.query
            asubject, bsubject = a.subject, b.subject
            if asubject != bsubject:
                continue

            astrand, bstrand = a.orientation, b.orientation
            assert aquery[-1] == 'L' and bquery[-1] == 'R', \
                str((aquery, bquery))

            ai, ax = order[aquery]
            bi, bx = order[bquery]
            qstart, qstop = ax.start + a.qstart - 1, bx.start + b.qstop - 1

            # Both ends must hit the same strand of the subject.
            if astrand == '+' and bstrand == '+':
                sstart, sstop = a.sstart, b.sstop

            elif astrand == '-' and bstrand == '-':
                sstart, sstop = b.sstart, a.sstop

            else:
                continue

            if sstart > sstop:
                continue

            if sstop > sstart + Max:
                continue

            name = aquery[:-1] + "LR"
            fwa.write("\t".join(str(x) for x in
                      (ax.seqid, qstart - 1, qstop, name, 1000, "+")) + "\n")
            fwb.write("\t".join(str(x) for x in
                      (asubject, sstart - 1, sstop, name, 1000, astrand))
                      + "\n")

    beforefasta = fastaFromBed(beforebed, bbfasta, name=True, stranded=True)
    afterfasta = fastaFromBed(afterbed, altfasta, name=True, stranded=True)

    # Exclude the replacements that do not reduce the number of Ns (in strict
    # mode: that still contain any N)
    ah = SeqIO.parse(beforefasta, "fasta")
    bh = SeqIO.parse(afterfasta, "fasta")
    count_Ns = lambda x: x.seq.count('n') + x.seq.count('N')
    exclude = set()
    for arec, brec in zip(ah, bh):
        an = count_Ns(arec)
        bn = count_Ns(brec)
        if opts.strict:
            if bn == 0:
                continue

        elif bn < an:
            continue

        exclude.add(arec.id)

    logging.debug("Ignore {0} updates because of decreasing quality."\
                    .format(len(exclude)))

    abed = Bed(beforebed, sorted=False)
    bbed = Bed(afterbed, sorted=False)
    abed = [x for x in abed if x.accn not in exclude]
    bbed = [x for x in bbed if x.accn not in exclude]

    abedfile = "before.filtered.bed"
    bbedfile = "after.filtered.bed"
    afbed = Bed()
    afbed.extend(abed)
    bfbed = Bed()
    bfbed.extend(bbed)

    afbed.print_to_file(abedfile)
    bfbed.print_to_file(bbedfile)

    # Shuffle the two bedfiles together
    sz = Sizes(bbfasta)
    sizes = sz.mapping
    shuffled = "shuffled.bed"
    border = bfbed.order

    allbeds = []
    afbed.sort(key=afbed.nullkey)
    totalids = len(sizes)
    # Zero-padding width for the generated object names; log10(0) is
    # undefined, so an empty size table falls back to a width of 1.
    pad = int(math.log10(totalids)) + 1 if totalids else 1
    cj = 0
    seen = set()
    accn = lambda x: "{0}{1:0{2}d}".format(prefix, x, pad)

    for seqid, aa in afbed.sub_beds():
        cj += 1
        abeds, bbeds, beds = [], [], []
        size = sizes[seqid]
        ranges = [(x.seqid, x.start, x.end) for x in aa]
        # Backbone pieces that remain between the replaced intervals.
        cranges = range_interleave(ranges, sizes={seqid: size})
        for cseqid, start, end in cranges:
            bedline = "\t".join(str(x) for x in (cseqid, start - 1, end))
            abeds.append(BedLine(bedline))

        for a in aa:
            gapid = a.accn
            bi, b = border[gapid]
            bbeds.append(b)

        a = abeds[0] if abeds else []
        assert abs(len(abeds) - len(bbeds)) <= 1
        # roundrobin() starts with its first argument: begin with the patch
        # when the backbone piece does not start at position 1.
        if (not a) or a.start > 1:
            abeds, bbeds = bbeds, abeds

        beds = list(roundrobin(abeds, bbeds))
        if prefix:
            for b in beds:
                b.accn = accn(cj)

        allbeds.extend(beds)
        seen.add(seqid)

    # Singletons
    for seqid, size in sz.iter_sizes():
        if seqid in seen:
            continue

        # NOTE(review): without --prefix the name produced here starts with
        # the literal text "None" and reuses the previous counter value;
        # behaviour kept as-is, confirm whether that is intended.
        bedline = "\t".join(str(x) for x in (seqid, 0, size, accn(cj)))
        b = BedLine(bedline)

        cj += 1
        if prefix:
            b.accn = accn(cj)

        allbeds.append(b)

    shuffledbed = Bed()
    shuffledbed.extend(allbeds)
    shuffledbed.print_to_file(shuffled)
Example #33
0
def prepare(args):
    """
    %prog prepare --rearray_lib=<rearraylibrary> --orig_lib_file=<origlibfile>

    Inferred file names
    ---------------------------------------------
    `lookuptblfile` : rearraylibrary.lookup
    `rearraylibfile`: rearraylibrary.fasta

    Pick sequences from the original library file and the rearrayed library file
    based on the mapping information provided in the `lookuptblfile`.

    # lookuptblfile format: column number (index)
    # 1 (0)          2 (1)          3 (2)         4 (3)        5 (4)        6 (5)
    # source_clone   source_plate   source_well   dest_clone   dest_plate   dest_well

    The 1st and 4th column in the `lookuptblfile` form the pair of clones which
    constitute the elements used for the per-clone assembly.
    """
    # NOTE: the docstring above doubles as the command-line usage text, so
    # implementation notes are kept in comments instead.
    #
    # For every row of the lookup table, one FASTA file is written into the
    # output folder holding the reads whose IDs start with the source clone
    # name (original library) or the destination clone name (rearrayed
    # library). The written paths are listed in `<rearray_lib>.log`.
    from operator import itemgetter
    from jcvi.formats.fasta import Fasta, SeqIO

    p = OptionParser(prepare.__doc__)
    p.add_option("--rearray_lib", default=None,
            help="name of the rearrayed library [default: %default]")
    p.add_option("--orig_lib_file",
            help="fasta file containing reads from the original libraries [default: %default]")

    g = OptionGroup(p, "Optional parameters")
    g.add_option("--output_folder", default="to_assemble",
            help="output folder to write the FASTA files to [default: %default]")
    p.add_option_group(g)

    opts, args = p.parse_args(args)

    if not opts.rearray_lib or not opts.orig_lib_file:
        logging.error("Please specify the required parameters")
        sys.exit(not p.print_help())

    rearraylib, origlibfile = opts.rearray_lib, opts.orig_lib_file

    # A missing input is an error, so exit with a non-zero status.
    if not op.isfile(origlibfile):
        logging.error("Original library reads file `{0}` does not exist!".format(origlibfile))
        sys.exit(1)

    lookuptblfile = rearraylib + '.lookup'
    logging.debug(lookuptblfile)
    if not op.isfile(lookuptblfile):
        logging.error("Lookup table file `{0}` does not exist!".format(lookuptblfile))
        sys.exit(1)

    rearraylibfile = rearraylib + '.fasta'
    logging.debug(rearraylibfile)
    if not op.isfile(rearraylibfile):
        logging.error("Rearrayed library reads file `{0}` does not exist!".format(rearraylibfile))
        sys.exit(1)

    origlibFasta = Fasta(origlibfile)
    rearraylibFasta = Fasta(rearraylibfile)

    origlibids = list(origlibFasta.iterkeys_ordered())
    rearraylibids = list(rearraylibFasta.iterkeys_ordered())

    if not op.isdir(opts.output_folder):
        logging.warning("Output directory `{0}` missing. Creating it now...".format(opts.output_folder))
        os.makedirs(opts.output_folder)

    logfile = rearraylib + '.log'
    clone_pair = itemgetter(0, 3)  # source_clone, dest_clone columns

    with open(lookuptblfile, 'r') as fp, open(logfile, 'w') as log:
        for row in fp:
            # Drop the line terminator first: with a 4-column table it would
            # otherwise end up inside the destination clone name.
            row = row.rstrip('\r\n')
            if not row.strip():
                continue
            origprefix, rearrayprefix = clone_pair(row.split('\t'))
            libpair = origprefix + '_' + rearrayprefix
            outfile = op.join(opts.output_folder, libpair + '.fasta')

            with open(outfile, 'w') as ofp:
                # Clone names are literal prefixes, not regular expressions.
                for o in origlibids:
                    if o.startswith(origprefix):
                        SeqIO.write(origlibFasta[o], ofp, 'fasta')

                for r in rearraylibids:
                    if r.startswith(rearrayprefix):
                        SeqIO.write(rearraylibFasta[r], ofp, 'fasta')

            log.write(outfile + "\n")

    logging.debug('Wrote log file `{0}`'.format(logfile))
Example #34
0
def longest(args):
    """
    %prog longest pasa.fasta output.subclusters.out

    Find the longest PASA assembly and label it as full-length. Also removes
    transcripts shorter than half the length of the longest, or shorter than
    200bp. The assemblies for the same locus is found in
    `output.subclusters.out`. In particular the lines that look like:

    sub-cluster: asmbl_25 asmbl_26 asmbl_27
    """
    # NOTE: the docstring above doubles as the command-line usage text, so
    # implementation notes are kept in comments instead.
    #
    # Writes `<prefix>.fl.ids` (one renamed ID per sub-cluster, its longest
    # member) and `<prefix>.clean.fasta` (the retained, renamed records),
    # where <prefix> is the FASTA file name without its last extension.
    from jcvi.formats.fasta import Fasta, SeqIO
    from jcvi.formats.sizes import Sizes

    p = OptionParser(longest.__doc__)
    p.add_option("--prefix", default="pasa",
                 help="Replace asmbl_ with prefix [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    fastafile, subclusters = args
    prefix = fastafile.rsplit(".", 1)[0]

    idsfile = prefix + ".fl.ids"
    sizes = Sizes(fastafile).mapping

    name_convert = lambda x: x.replace("asmbl", opts.prefix)

    keep = set()  # IDs to write
    nrecs = 0
    with open(subclusters) as fp, open(idsfile, "w") as fw:
        for row in fp:
            if not row.startswith("sub-cluster:"):
                continue
            asmbls = row.split()[1:]
            if not asmbls:
                # A sub-cluster line without members has no longest
                # assembly; max() would raise on the empty list.
                continue
            longest_asmbl = max(asmbls, key=lambda x: sizes[x])
            longest_size = sizes[longest_asmbl]
            print(name_convert(longest_asmbl), file=fw)
            nrecs += 1
            # Keep members at least half as long as the longest one and
            # never shorter than 200bp.
            cutoff = max(longest_size / 2, 200)
            keep.update(x for x in asmbls if sizes[x] >= cutoff)

    logging.debug("{0} fl-cDNA records written to `{1}`.".format(nrecs, idsfile))

    f = Fasta(fastafile, lazy=True)
    newfastafile = prefix + ".clean.fasta"
    nrecs = 0
    with open(newfastafile, "w") as fw:
        for name, rec in f.iteritems_ordered():
            if name not in keep:
                continue

            rec.id = name_convert(name)
            rec.description = ""
            SeqIO.write([rec], fw, "fasta")
            nrecs += 1

    logging.debug("{0} valid records written to `{1}`.".format(nrecs, newfastafile))
Example #35
0
def overlap(args):
    """
    %prog overlap ctgfasta poolfasta

    Fish out the sequences in `poolfasta` that overlap with `ctgfasta`.
    Mix and combine using `minimus2`.
    """
    # The docstring above is also the usage text shown by the option parser.
    p = OptionParser(overlap.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    ctgfasta, poolfasta = args
    prefix = ctgfasta.split(".")[0]
    record_ids = list(Fasta(ctgfasta).iterkeys())
    assert len(record_ids) == 1, "Use overlapbatch() to improve multi-FASTA file"
    rid = record_ids[0]

    # From here on `ctgfasta` is whatever the gap-splitting step returns.
    ctgfasta = run_gapsplit(
        infile=ctgfasta,
        outfile=ctgfasta.rsplit(".", 1)[0] + ".split.fasta",
    )

    # Run BLAST
    blastfile = ctgfasta + ".blast"
    run_megablast(infile=ctgfasta, outfile=blastfile, db=poolfasta)

    closuredir = prefix + ".closure"

    def in_closure(suffix):
        # Path of `<prefix><suffix>` inside the closure directory.
        return op.join(closuredir, prefix + suffix)

    # Extract contigs and merge using minimus2
    if need_update(blastfile, closuredir):
        mkdir(closuredir, overwrite=True)

        idsfile = in_closure(".ids")
        sh("cut -f2 {0} | sort -u".format(blastfile), outfile=idsfile)

        idsfastafile = in_closure(".ids.fasta")
        sh("faSomeRecords {0} {1} {2}".format(poolfasta, idsfile, idsfastafile))

        # This step is a hack to weight the bases from original sequences more
        # than the pulled sequences, by literally adding another copy to be used
        # in consensus calls.
        redundantfastafile = in_closure(".redundant.fasta")
        format([ctgfasta, redundantfastafile, "--prefix=RED."])

        mergedfastafile = in_closure(".merged.fasta")
        sh("cat {0} {1} {2}".format(ctgfasta, redundantfastafile, idsfastafile),
           outfile=mergedfastafile)

        afgfile = in_closure(".afg")
        sh("toAmos -s {0} -o {1}".format(mergedfastafile, afgfile))

        # minimus2 is run from inside the closure directory.
        startdir = os.getcwd()
        os.chdir(closuredir)
        minimus_cmd = "minimus2 {0} -D REFCOUNT=0".format(prefix) \
            + " -D OVERLAP=100 -D MINID=98"
        sh(minimus_cmd)
        os.chdir(startdir)

    # Analyze output, make sure that:
    # + Get the singletons of the original set back
    # + Drop any contig that is comprised entirely of pulled set
    originalIDs = set(Fasta(ctgfasta).iterkeys())
    contigfile = ContigFile(in_closure(".contig"))
    excludecontigs = set()
    for contig in contigfile.iter_records():
        read_ids = set(x.id for x in contig.reads)
        if read_ids.isdisjoint(originalIDs):
            excludecontigs.add(contig.id)

    logging.debug(
        "Exclude contigs: {0}".format(", ".join(sorted(excludecontigs))))

    # Written under a temporary name (trailing underscore) first; the final
    # file is produced by the renaming step further down.
    tmpfasta = prefix + ".improved.fasta_"
    fw = open(tmpfasta, "w")
    merged = Fasta(in_closure(".fasta"))
    for name, rec in merged.iteritems_ordered():
        if name not in excludecontigs:
            SeqIO.write([rec], fw, "fasta")

    singletonfile = in_closure(".singletons")
    singletons = set(x.strip() for x in open(singletonfile))
    leftovers = singletons & originalIDs

    logging.debug(
        "Pull leftover singletons: {0}".format(", ".join(sorted(leftovers))))

    originals = Fasta(ctgfasta)
    for name, rec in originals.iteritems_ordered():
        if name in leftovers:
            SeqIO.write([rec], fw, "fasta")

    fw.close()

    finalfasta = tmpfasta.rstrip("_")
    format([tmpfasta, finalfasta, "--sequential", "--pad0=3",
            "--prefix={0}_".format(rid)])

    logging.debug("Improved FASTA written to `{0}`.".format(finalfasta))

    n50([ctgfasta])
    n50([finalfasta])

    # Remove intermediate files that are no longer needed.
    for leftover in (tmpfasta, blastfile, "error.log"):
        if op.exists(leftover):
            os.remove(leftover)
Example #36
0
def lcn(args):
    """
    %prog lcn Orthogroups/Orthogroups.tsv Orthogroup_Sequences/ lcn/
    """
    # NOTE: the docstring above doubles as the command-line usage text, so
    # implementation notes are kept in comments instead.
    #
    # Selects orthogroups by their share of single-copy and zero-copy species,
    # then writes one FASTA per selected orthogroup into the output directory
    # with one sequence per species (the longest copy when there are several).
    p = OptionParser(lcn.__doc__)
    p.add_option("--min-single-ratio",
                 default=0.9,
                 help="Single copy ratio must be > ")
    p.add_option("--max-zero-ratio",
                 default=0,
                 help="Zero copy ratio must be < ")
    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    (groups_tsv, sequence_dir, lcn_dir) = args

    # The options carry no declared type, so values given on the command line
    # arrive as strings; coerce once before comparing against ratios.
    min_single_ratio = float(opts.min_single_ratio)
    max_zero_ratio = float(opts.max_zero_ratio)

    selected = []
    species_names = []
    # Read in the orthogroup definition and selected based on counts
    with open(groups_tsv) as fp:
        reader = csv.reader(fp, delimiter="\t")
        header = next(reader, None)
        if header is not None:
            species_names = header[1:]
        for row in reader:
            counts = [
                len(x.split(", ")) if x.strip() != "" else 0 for x in row[1:]
            ]
            if not counts:
                # No species columns: ratios are undefined for this row.
                continue
            single_ratio = sum([x == 1 for x in counts]) / len(counts)
            zero_ratio = sum([x == 0 for x in counts]) / len(counts)
            if single_ratio < min_single_ratio:
                continue
            if zero_ratio > max_zero_ratio:
                continue
            print(row[0], single_ratio, zero_ratio, counts, file=sys.stderr)
            selected.append(row)

    logging.debug("A total of {} orthogroups selected".format(len(selected)))

    # Collect the FASTA sequences now
    mkdir(lcn_dir)
    for row in selected:
        orthogroup = row[0]
        orthogroup_fasta = "{}.fa".format(orthogroup)
        input_fasta = op.join(sequence_dir, orthogroup_fasta)
        fasta = Fasta(input_fasta)
        selected_seqs = []
        for cell, species_name in zip(row[1:], species_names):
            if cell.strip() == "":
                # Zero copies in this species: nothing to pick. Such rows
                # only get here when --max-zero-ratio allows them.
                continue
            gene_names = cell.split(", ")
            if len(gene_names) == 1:
                best_gene, = gene_names
            else:
                # Longest copy wins; ties are broken by the gene name.
                _, best_gene = max((len(fasta[x]), x) for x in gene_names)
            selected_seq = fasta[best_gene]
            # Set gene name to species name so we can later combine them in supermatrix
            selected_seq.id = species_name
            selected_seq.name = species_name
            selected_seq.description = ""
            selected_seqs.append(selected_seq)

        output_fasta = op.join(lcn_dir, orthogroup_fasta)
        with open(output_fasta, "w") as fw:
            SeqIO.write(selected_seqs, fw, "fasta")
        print(
            "{}: {} => {} ({})".format(orthogroup, len(fasta),
                                       len(selected_seqs), output_fasta),
            file=sys.stderr,
        )
Example #37
0
def overlap(args):
    """
    %prog overlap ctgfasta poolfasta

    Fish out the sequences in `poolfasta` that overlap with `ctgfasta`.
    Mix and combine using `minimus2`.
    """
    # NOTE: the docstring above doubles as the usage text handed to
    # OptionParser, so implementation notes are kept in comments.
    #
    # args: argv-style list holding exactly two items, [ctgfasta, poolfasta].
    # `ctgfasta` must contain a single record (asserted below).
    # Output: `<prefix>.improved.fasta` in the current directory, plus a
    # `<prefix>.closure` working directory. Nothing is returned.
    p = OptionParser(overlap.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    ctgfasta, poolfasta = args
    prefix = ctgfasta.split(".")[0]
    rid = list(Fasta(ctgfasta).iterkeys())
    assert len(rid) == 1, "Use overlapbatch() to improve multi-FASTA file"

    rid = rid[0]
    splitctgfasta = ctgfasta.rsplit(".", 1)[0] + ".split.fasta"
    # From here on `ctgfasta` refers to whatever run_gapsplit() returns.
    ctgfasta = run_gapsplit(infile=ctgfasta, outfile=splitctgfasta)

    # Run BLAST
    blastfile = ctgfasta + ".blast"
    run_megablast(infile=ctgfasta, outfile=blastfile, db=poolfasta)

    # Extract contigs and merge using minimus2
    closuredir = prefix + ".closure"
    if need_update(blastfile, closuredir):
        mkdir(closuredir, overwrite=True)

        # Unique values of the second BLAST column go into the ids file.
        idsfile = op.join(closuredir, prefix + ".ids")
        cmd = "cut -f2 {0} | sort -u".format(blastfile)
        sh(cmd, outfile=idsfile)

        idsfastafile = op.join(closuredir, prefix + ".ids.fasta")
        cmd = "faSomeRecords {0} {1} {2}".format(poolfasta, idsfile,
                                                 idsfastafile)
        sh(cmd)

        # This step is a hack to weight the bases from original sequences more
        # than the pulled sequences, by literally adding another copy to be used
        # in consensus calls.
        # NOTE(review): `format` is called with an argv-style list, so it is
        # presumably the module's FASTA `format` command shadowing the
        # builtin -- confirm against the file's imports.
        redundantfastafile = op.join(closuredir, prefix + ".redundant.fasta")
        format([ctgfasta, redundantfastafile, "--prefix=RED."])

        mergedfastafile = op.join(closuredir, prefix + ".merged.fasta")
        cmd = "cat {0} {1} {2}".format(ctgfasta, redundantfastafile,
                                       idsfastafile)
        sh(cmd, outfile=mergedfastafile)

        afgfile = op.join(closuredir, prefix + ".afg")
        cmd = "toAmos -s {0} -o {1}".format(mergedfastafile, afgfile)
        sh(cmd)

        # minimus2 is run from inside the closure directory; always restore
        # the working directory, even if the command fails.
        cwd = os.getcwd()
        os.chdir(closuredir)
        try:
            cmd = "minimus2 {0} -D REFCOUNT=0".format(prefix)
            cmd += " -D OVERLAP=100 -D MINID=98"
            sh(cmd)
        finally:
            os.chdir(cwd)

    # Analyze output, make sure that:
    # + Get the singletons of the original set back
    # + Drop any contig that is comprised entirely of pulled set
    originalIDs = set(Fasta(ctgfasta).iterkeys())
    minimuscontig = op.join(closuredir, prefix + ".contig")
    c = ContigFile(minimuscontig)
    excludecontigs = set()
    for rec in c.iter_records():
        reads = set(x.id for x in rec.reads)
        if reads.isdisjoint(originalIDs):
            excludecontigs.add(rec.id)

    logging.debug("Exclude contigs: {0}".\
            format(", ".join(sorted(excludecontigs))))

    # Written under a temporary name (trailing underscore) first; the
    # renumbered final file is produced from it further down.
    finalfasta = prefix + ".improved.fasta_"
    minimusfasta = op.join(closuredir, prefix + ".fasta")
    singletonfile = op.join(closuredir, prefix + ".singletons")
    with open(finalfasta, "w") as fw:
        for seqid, rec in Fasta(minimusfasta).iteritems_ordered():
            if seqid in excludecontigs:
                continue
            SeqIO.write([rec], fw, "fasta")

        with open(singletonfile) as fp:
            singletons = set(x.strip() for x in fp)
        leftovers = singletons & originalIDs

        logging.debug("Pull leftover singletons: {0}".\
                format(", ".join(sorted(leftovers))))

        for seqid, rec in Fasta(ctgfasta).iteritems_ordered():
            if seqid not in leftovers:
                continue
            SeqIO.write([rec], fw, "fasta")

    fastafile = finalfasta
    finalfasta = fastafile.rstrip("_")
    format([
        fastafile, finalfasta, "--sequential", "--pad0=3",
        "--prefix={0}_".format(rid)
    ])

    logging.debug("Improved FASTA written to `{0}`.".format(finalfasta))

    n50([ctgfasta])
    n50([finalfasta])

    # Remove intermediate files if they exist.
    errlog = "error.log"
    for path in (fastafile, blastfile, errlog):
        if op.exists(path):
            os.remove(path)
Example #38
0
def weblogo(args):
    """
    %prog weblogo [fastafile|fastqfile]

    Extract base composition for reads
    """
    import numpy as np
    from jcvi.utils.progressbar import ProgressBar, Percentage, Bar, ETA

    # NOTE: the docstring above doubles as the usage text handed to
    # OptionParser, so implementation notes are kept in comments.
    #
    # args: argv-style list with one input path; a name ending in ".fastq"
    # is read with iter_fastq(), anything else is parsed as FASTA.
    # Files written to the current directory: L.fasta, R.fasta, the
    # L.ATCG.csv / R.ATCG.csv count matrices, L4.common and R3.common; the
    # external `weblogo` command is invoked on L.fasta and R.fasta.
    p = OptionParser(weblogo.__doc__)
    p.add_option("-N", default=10, type="int",
                 help="Count the first and last N bases")
    p.add_option("--nreads", default=1000000, type="int",
                 help="Parse first N reads")
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    fastqfile, = args
    N = opts.N
    nreads = opts.nreads

    # L and R are 4 x N count matrices; row order follows `pat`.
    pat = "ATCG"
    L = np.zeros((4, N), dtype="int32")
    R = np.zeros((4, N), dtype="int32")
    base_row = dict((a, i) for (i, a) in enumerate(pat))
    L4, R3 = Counter(), Counter()
    widgets = ['Parse reads: ', Percentage(), ' ',
               Bar(marker='>', left='[', right=']'), ' ', ETA()]
    pr = ProgressBar(maxval=nreads, term_width=60, widgets=widgets).start()

    k = 0
    with open("L.fasta", "w") as fw_L, open("R.fasta", "w") as fw_R:
        fastq = fastqfile.endswith(".fastq")
        it = iter_fastq(fastqfile) if fastq else \
               SeqIO.parse(must_open(fastqfile), "fasta")
        for rec in it:
            k += 1
            # Stop before touching the progress bar so it is never updated
            # past its maxval; a None record is treated as end of input.
            if k > nreads or rec is None:
                break
            if k % 1000 == 0:
                pr.update(k)
            s = str(rec.seq)
            for i, a in enumerate(s[:N]):
                if a in base_row:
                    L[base_row[a]][i] += 1
            # Walk the tail backwards so the last base lands in column N - 1
            # even when the read is shorter than N.
            for j, a in enumerate(s[-N:][::-1]):
                if a in base_row:
                    R[base_row[a]][N - 1 - j] += 1
            L4[s[:4]] += 1
            R3[s[-3:]] += 1
            # file.write() instead of the Python-2-only `print >>` chevron,
            # producing the same bytes on Python 2 and 3.
            fw_L.write(">{0}\n{1}\n".format(k, s[:N]))
            fw_R.write(">{0}\n{1}\n".format(k, s[-N:]))

    cmd = "weblogo -F png -s large -f {0}.fasta -o {0}.png"
    cmd += " --color-scheme classic --composition none -U probability"
    cmd += " --title {1}"
    # Titles follow the actual -N value (identical to before for N=10).
    sh(cmd.format('L', "First_{0}_bases".format(N)))
    sh(cmd.format('R', "Last_{0}_bases".format(N)))

    np.savetxt("L.{0}.csv".format(pat), L, delimiter=',', fmt="%d")
    np.savetxt("R.{0}.csv".format(pat), R, delimiter=',', fmt="%d")

    # N most common 4-base prefixes and 3-base suffixes, tab-separated.
    with open("L4.common", "w") as fw:
        for kmer, count in L4.most_common(N):
            fw.write("\t".join((kmer, str(count))) + "\n")

    with open("R3.common", "w") as fw:
        for kmer, count in R3.most_common(N):
            fw.write("\t".join((kmer, str(count))) + "\n")
Example #39
0
def expand(args):
    """
    %prog expand bes.fasta reads.fastq

    Expand sequences using short reads. Useful, for example for getting BAC-end
    sequences. The template to use, in `bes.fasta` may just contain the junction
    sequences, then align the reads to get the 'flanks' for such sequences.
    """
    import math

    from jcvi.formats.fasta import Fasta, SeqIO
    from jcvi.formats.fastq import readlen, first, fasta
    from jcvi.formats.blast import Blast
    from jcvi.formats.base import FileShredder
    from jcvi.apps.bowtie import align, get_samfile
    from jcvi.apps.align import blast

    # NOTE: the docstring above doubles as the usage text handed to
    # OptionParser, so implementation notes are kept in comments.
    #
    # args: argv-style list holding exactly [bes.fasta, reads.fastq].
    # Returns the path of the annotated FASTA file that was written.
    p = OptionParser(expand.__doc__)
    p.set_depth(depth=200)
    p.set_firstN()
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    bes, reads = args
    size = Fasta(bes).totalsize
    rl = readlen([reads])
    # Template plus one read length of flank on each side, at the requested
    # depth, rounded up to the next multiple of 1000 reads.
    expected_size = size + 2 * rl
    nreads = expected_size * opts.depth / rl
    nreads = int(math.ceil(nreads / 1000.)) * 1000

    # Attract reads
    samfile, logfile = align([bes, reads, "--reorder", "--mapped",
           "--firstN={0}".format(opts.firstN)])

    samfile, mapped, _ = get_samfile(reads, bes, bowtie=True, mapped=True)
    logging.debug("Extract first {0} reads from `{1}`.".format(nreads, mapped))

    pf = mapped.split(".")[0]
    pf = pf.split("-")[0]
    bespf = bes.split(".")[0]
    reads = pf + ".expand.fastq"
    first([str(nreads), mapped, "-o", reads])

    # Perform mini-assembly
    fastafile = reads.rsplit(".", 1)[0] + ".fasta"
    qualfile = ""
    if need_update(reads, fastafile):
        fastafile, qualfile = fasta([reads])

    contigs = op.join(pf, "454LargeContigs.fna")
    if need_update(fastafile, contigs):
        cmd = "runAssembly -o {0} -cpu 8 {1}".format(pf, fastafile)
        sh(cmd)
    assert op.exists(contigs)

    # Annotate contigs
    blastfile = blast([bes, contigs])
    mapping = {}
    for query, b in Blast(blastfile).iter_best_hit():
        mapping[query] = b

    # Rank of each template id in `bes` order (first occurrence wins), so the
    # output can be sorted without a linear list scan per contig.
    rank = {}
    for i, key in enumerate(Fasta(bes).iterkeys_ordered()):
        rank.setdefault(key, i)

    annotatedfasta = ".".join((pf, bespf, "fasta"))
    recs = []
    for _, v in Fasta(contigs, lazy=True).iteritems_ordered():
        vid = v.id
        if vid not in mapping:
            continue
        b = mapping[vid]
        subject = b.subject
        # Orient every contig the same way as its best-hit template.
        rec = v.reverse_complement() if b.orientation == '-' else v
        rec.id = rid = "_".join((pf, vid, subject))
        rec.description = ""
        recs.append((rank[subject], rid, rec))

    # Sort on (template rank, record id) only; the record objects themselves
    # are never compared.
    recs.sort(key=lambda x: x[:2])
    recs = [x[-1] for x in recs]
    # Open the output only once there is something to write, and make sure it
    # is closed even if writing fails.
    with open(annotatedfasta, "w") as fw:
        SeqIO.write(recs, fw, "fasta")

    FileShredder([samfile, logfile, mapped, reads, fastafile, qualfile, blastfile, pf])
    logging.debug("Annotated seqs (n={0}) written to `{1}`.".\
                    format(len(recs), annotatedfasta))

    return annotatedfasta
Example #40
0
def weblogo(args):
    """
    %prog weblogo [fastafile|fastqfile]

    Extract base composition for reads
    """
    import numpy as np
    from jcvi.utils.progressbar import ProgressBar, Percentage, Bar, ETA

    # NOTE: the docstring above doubles as the usage text handed to
    # OptionParser, so implementation notes are kept in comments.
    #
    # args: argv-style list with one input path; a name ending in ".fastq"
    # is read with iter_fastq(), anything else is parsed as FASTA.
    # Files written to the current directory: L.fasta, R.fasta, the
    # L.ATCG.csv / R.ATCG.csv count matrices, L4.common and R3.common; the
    # external `weblogo` command is invoked on L.fasta and R.fasta.
    p = OptionParser(weblogo.__doc__)
    p.add_option("-N", default=10, type="int",
                 help="Count the first and last N bases")
    p.add_option("--nreads", default=1000000, type="int",
                 help="Parse first N reads")
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    fastqfile, = args
    N = opts.N
    nreads = opts.nreads

    # L and R are 4 x N count matrices; row order follows `pat`.
    pat = "ATCG"
    L = np.zeros((4, N), dtype="int32")
    R = np.zeros((4, N), dtype="int32")
    base_row = dict((a, i) for (i, a) in enumerate(pat))
    L4, R3 = Counter(), Counter()
    widgets = ['Parse reads: ', Percentage(), ' ',
               Bar(marker='>', left='[', right=']'), ' ', ETA()]
    pr = ProgressBar(maxval=nreads, term_width=60, widgets=widgets).start()

    k = 0
    with open("L.fasta", "w") as fw_L, open("R.fasta", "w") as fw_R:
        fastq = fastqfile.endswith(".fastq")
        it = iter_fastq(fastqfile) if fastq else \
               SeqIO.parse(must_open(fastqfile), "fasta")
        for rec in it:
            k += 1
            # Stop before touching the progress bar so it is never updated
            # past its maxval; a None record is treated as end of input.
            if k > nreads or rec is None:
                break
            if k % 1000 == 0:
                pr.update(k)
            s = str(rec.seq)
            for i, a in enumerate(s[:N]):
                if a in base_row:
                    L[base_row[a]][i] += 1
            # Walk the tail backwards so the last base lands in column N - 1
            # even when the read is shorter than N.
            for j, a in enumerate(s[-N:][::-1]):
                if a in base_row:
                    R[base_row[a]][N - 1 - j] += 1
            L4[s[:4]] += 1
            R3[s[-3:]] += 1
            # file.write() instead of the Python-2-only `print >>` chevron,
            # producing the same bytes on Python 2 and 3.
            fw_L.write(">{0}\n{1}\n".format(k, s[:N]))
            fw_R.write(">{0}\n{1}\n".format(k, s[-N:]))

    cmd = "weblogo -F png -s large -f {0}.fasta -o {0}.png"
    cmd += " --color-scheme classic --composition none -U probability"
    cmd += " --title {1}"
    # Titles follow the actual -N value (identical to before for N=10).
    sh(cmd.format('L', "First_{0}_bases".format(N)))
    sh(cmd.format('R', "Last_{0}_bases".format(N)))

    np.savetxt("L.{0}.csv".format(pat), L, delimiter=',', fmt="%d")
    np.savetxt("R.{0}.csv".format(pat), R, delimiter=',', fmt="%d")

    # N most common 4-base prefixes and 3-base suffixes, tab-separated.
    with open("L4.common", "w") as fw:
        for kmer, count in L4.most_common(N):
            fw.write("\t".join((kmer, str(count))) + "\n")

    with open("R3.common", "w") as fw:
        for kmer, count in R3.most_common(N):
            fw.write("\t".join((kmer, str(count))) + "\n")