Code example #1
File: cdhit.py Project: biologyguy/jcvi
def filter(args):
    """
    %prog filter consensus.fasta

    Filter consensus sequence with min cluster size.
    """
    from jcvi.formats.fasta import Fasta, SeqIO

    p = OptionParser(filter.__doc__)
    p.add_option("--minsize",
                 default=10,
                 type="int",
                 help="Minimum cluster size")
    p.set_outfile()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    fastafile, = args
    minsize = opts.minsize
    f = Fasta(fastafile, lazy=True)
    fw = must_open(opts.outfile, "w")
    for desc, rec in f.iterdescriptions_ordered():
        if desc.startswith("singleton"):
            continue
        # consensus_for_cluster_0 with 63 sequences
        name, w, size, seqs = desc.split()
        assert w == "with"
        size = int(size)
        if size < minsize:
            continue
        SeqIO.write(rec, fw, "fasta")
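These jcvi subcommands are dispatched from each module's command line via %prog. A hypothetical invocation of the filter above (module path assumed), keeping only clusters of at least 20 reads:

python -m jcvi.apps.cdhit filter consensus.fasta --minsize 20 --outfile filtered.fasta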
Code example #2
File: goldenpath.py Project: Hensonmw/jcvi
def flip(args):
    """
    %prog flip fastafile

    Go through each FASTA record, check it against the GenBank file, and
    determine whether or not to flip the sequence. This is useful before
    updating the sequences, to make sure the same orientation is used.
    """
    p = OptionParser(flip.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    fastafile, = args
    outfastafile = fastafile.rsplit(".", 1)[0] + ".flipped.fasta"
    fo = open(outfastafile, "w")
    f = Fasta(fastafile, lazy=True)
    for name, rec in f.iteritems_ordered():
        tmpfasta = "a.fasta"
        fw = open(tmpfasta, "w")
        SeqIO.write([rec], fw, "fasta")
        fw.close()

        o = overlap([tmpfasta, name])
        if o.orientation == '-':
            rec.seq = rec.seq.reverse_complement()

        SeqIO.write([rec], fo, "fasta")
        os.remove(tmpfasta)
Code example #3
File: kmer.py Project: biologyguy/jcvi
def dump(args):
    """
    %prog dump fastafile

    Convert FASTA sequences to list of K-mers.
    """
    p = OptionParser(dump.__doc__)
    p.add_option("-K",
                 default=23,
                 type="int",
                 help="K-mer size [default: %default]")
    p.set_outfile()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    fastafile, = args
    K = opts.K
    fw = must_open(opts.outfile, "w")
    f = Fasta(fastafile, lazy=True)
    for name, rec in f.iteritems_ordered():
        kmers = list(make_kmers(rec.seq, K))
        print >> fw, "\n".join(kmers)
    fw.close()
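make_kmers is defined elsewhere in kmer.py; a minimal sliding-window sketch consistent with how it is called above (name and return type assumed):

def make_kmers(seq, K):
    # Slide a window of width K across the sequence, yielding each
    # K-mer as a plain string.
    seq = str(seq)
    for i in range(len(seq) - K + 1):
        yield seq[i:i + K]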
Code example #4
def flip(args):
    """
    %prog flip fastafile

    Go through each FASTA record, check it against the GenBank file, and
    determine whether or not to flip the sequence. This is useful before
    updating the sequences, to make sure the same orientation is used.
    """
    p = OptionParser(flip.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    fastafile, = args
    outfastafile = fastafile.rsplit(".", 1)[0] + ".flipped.fasta"
    fo = open(outfastafile, "w")
    f = Fasta(fastafile, lazy=True)
    for name, rec in f.iteritems_ordered():
        tmpfasta = "a.fasta"
        fw = open(tmpfasta, "w")
        SeqIO.write([rec], fw, "fasta")
        fw.close()

        o = overlap([tmpfasta, name])
        if o.orientation == '-':
            rec.seq = rec.seq.reverse_complement()

        SeqIO.write([rec], fo, "fasta")
        os.remove(tmpfasta)
Code example #5
File: sam.py Project: arvin580/jcvi
def fpkm(args):
    """
    %prog fpkm fastafile *.bam

    Calculate FPKM values from BAM file.
    """
    p = OptionParser(fpkm.__doc__)
    opts, args = p.parse_args(args)

    if len(args) < 2:
        sys.exit(not p.print_help())

    fastafile = args[0]
    bamfiles = args[1:]
    # Create a DUMMY gff file for cuffdiff
    gffile = fastafile.rsplit(".", 1)[0] + ".gff"
    if need_update(fastafile, gffile):
        fw = open(gffile, "w")
        f = Fasta(fastafile, lazy=True)
        for key, size in f.itersizes_ordered():
            print >> fw, "\t".join(str(x) for x in (key, "dummy", "transcript",\
                1, size, ".", ".", ".", "ID=" + key))
        fw.close()
        logging.debug("Dummy GFF created: {0}".format(gffile))

    cmd = "cuffdiff {0} {1}".format(gffile, " ".join(bamfiles))
    sh(cmd)
Code example #6
def score(args):
    """
    %prog score blastfile query.fasta A.ids

    Add up the scores for each query sequence: go through the BLAST lines and,
    for each query, sum the scores of hits whose subject is in the set A.ids.
    """
    from jcvi.formats.base import SetFile
    from jcvi.formats.fasta import Fasta

    p = OptionParser(score.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    blastfile, fastafile, idsfile = args
    ids = SetFile(idsfile)

    blast = Blast(blastfile)
    scores = defaultdict(int)
    for b in blast:
        query = b.query
        subject = b.subject
        if subject not in ids:
            continue
        scores[query] += b.score

    logging.debug("A total of {0} ids loaded.".format(len(ids)))

    f = Fasta(fastafile)
    for s in f.iterkeys_ordered():
        sc = scores.get(s, 0)
        print "\t".join((s, str(sc)))
Code example #7
File: tgbs.py Project: zachary-zzc/jcvi
def count(args):
    """
    %prog count cdhit.consensus.fasta

    Scan the headers for the consensus clusters and count the number of reads.
    """
    from jcvi.formats.fasta import Fasta
    from jcvi.graphics.histogram import stem_leaf_plot
    from jcvi.utils.cbook import SummaryStats

    p = OptionParser(count.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    fastafile, = args
    f = Fasta(fastafile, lazy=True)
    sizes = []
    for desc, rec in f.iterdescriptions_ordered():
        if desc.startswith("singleton"):
            sizes.append(1)
            continue
        # consensus_for_cluster_0 with 63 sequences
        name, w, size, seqs = desc.split()
        assert w == "with"
        sizes.append(int(size))

    s = SummaryStats(sizes)
    print >> sys.stderr, s
    stem_leaf_plot(s.data, 0, 100, 20, title="Cluster size")
Code example #8
File: sam.py Project: zhaotao1987/jcvi
def fpkm(args):
    """
    %prog fpkm fastafile *.bam

    Calculate FPKM values from BAM file.
    """
    p = OptionParser(fpkm.__doc__)
    opts, args = p.parse_args(args)

    if len(args) < 2:
        sys.exit(not p.print_help())

    fastafile = args[0]
    bamfiles = args[1:]
    # Create a DUMMY gff file for cuffdiff
    gffile = fastafile.rsplit(".", 1)[0] + ".gff"
    if need_update(fastafile, gffile):
        fw = open(gffile, "w")
        f = Fasta(fastafile, lazy=True)
        for key, size in f.itersizes_ordered():
            print >> fw, "\t".join(str(x) for x in (key, "dummy", "transcript",\
                1, size, ".", ".", ".", "ID=" + key))
        fw.close()
        logging.debug("Dummy GFF created: {0}".format(gffile))

    cmd = "cuffdiff {0} {1}".format(gffile, " ".join(bamfiles))
    sh(cmd)
Code example #9
File: blast.py Project: ascendo/jcvi
def score(args):
    """
    %prog score blastfile query.fasta A.ids

    Add up the scores for each query sequence: go through the BLAST lines and,
    for each query, sum the scores of hits whose subject is in the set A.ids.
    """
    from jcvi.formats.base import SetFile
    from jcvi.formats.fasta import Fasta

    p = OptionParser(score.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    blastfile, fastafile, idsfile = args
    ids = SetFile(idsfile)

    blast = Blast(blastfile)
    scores = defaultdict(int)
    for b in blast:
        query = b.query
        subject = b.subject
        if subject not in ids:
            continue
        scores[query] += b.score

    logging.debug("A total of {0} ids loaded.".format(len(ids)))

    f = Fasta(fastafile)
    for s in f.iterkeys_ordered():
        sc = scores.get(s, 0)
        print "\t".join((s, str(sc)))
Code example #10
File: restriction.py Project: zjwang6/jcvi
def digest(args):
    """
    %prog digest fastafile NspI,BfuCI

    Digest fasta sequences to map restriction site positions.
    """
    p = OptionParser(digest.__doc__)
    p.set_outfile()
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    fastafile, enzymes = args
    enzymes = enzymes.split(",")
    enzymes = [x for x in AllEnzymes if str(x) in enzymes]
    f = Fasta(fastafile, lazy=True)
    fw = must_open(opts.outfile, "w")

    header = ["Contig", "Length"] + [str(x) for x in enzymes]
    print("\t".join(header), file=fw)
    for name, rec in f.iteritems_ordered():
        row = [name, len(rec)]
        for e in enzymes:
            pos = e.search(rec.seq)
            pos = "na" if not pos else "|".join(str(x) for x in pos)
            row.append(pos)
        print("\t".join(str(x) for x in row), file=fw)
Code example #11
File: cdhit.py Project: kvefimov/jcvi_062915
def filter(args):
    """
    %prog filter consensus.fasta

    Filter consensus sequence with min cluster size.
    """
    from jcvi.formats.fasta import Fasta, SeqIO

    p = OptionParser(filter.__doc__)
    p.add_option("--minsize", default=10, type="int",
                 help="Minimum cluster size")
    p.set_outfile()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    fastafile, = args
    minsize = opts.minsize
    f = Fasta(fastafile, lazy=True)
    fw = must_open(opts.outfile, "w")
    for desc, rec in f.iterdescriptions_ordered():
        if desc.startswith("singleton"):
            continue
        # consensus_for_cluster_0 with 63 sequences
        name, w, size, seqs = desc.split()
        assert w == "with"
        size = int(size)
        if size < minsize:
            continue
        SeqIO.write(rec, fw, "fasta")
Code example #12
File: tgbs.py Project: biologyguy/jcvi
def count(args):
    """
    %prog count cdhit.consensus.fasta

    Scan the headers for the consensus clusters and count the number of reads.
    """
    from jcvi.graphics.histogram import stem_leaf_plot
    from jcvi.utils.cbook import SummaryStats

    p = OptionParser(count.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    fastafile, = args
    f = Fasta(fastafile, lazy=True)
    sizes = []
    for desc, rec in f.iterdescriptions_ordered():
        if desc.startswith("singleton"):
            sizes.append(1)
            continue
        # consensus_for_cluster_0 with 63 sequences
        name, w, size, seqs = desc.split()
        assert w == "with"
        sizes.append(int(size))

    s = SummaryStats(sizes)
    print >> sys.stderr, s
    stem_leaf_plot(s.data, 0, 100, 20, title="Cluster size")
Code example #13
def fragment(args):
    """
    %prog fragment fastafile enzyme

    Cut the fastafile using the specified enzyme, and grab upstream and
    downstream nucleotide sequence along with the cut site. In this case, the
    sequences extracted are:

                |- PstI
    ============|===========
            (-------)

    Sometimes we need to limit the size of the restriction fragments; for
    example, the GBS protocol does not allow fragments larger than 800bp.

           |-PstI        |- PstI              |- PstI
    ~~~====|=============|==========~~~~~~~===|============
           (---)     (---)

    In this case, the second fragment is longer than 800bp, so its two ends
    are NOT extracted, unlike those of the first fragment.
    """
    p = OptionParser(fragment.__doc__)
    p.add_option(
        "--flank",
        default=150,
        type="int",
        help="Extract flanking bases of the cut sites",
    )
    p.add_option(
        "--full",
        default=False,
        action="store_true",
        help="The full extraction mode",
    )
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    fastafile, enzyme = args
    flank = opts.flank
    assert flank > 0
    extract = extract_full if opts.full else extract_ends
    tag = "full" if opts.full else "ends"

    assert enzyme in set(str(x) for x in AllEnzymes)
    fragfastafile = fastafile.split(".")[0] + ".{0}.flank{1}.{2}.fasta".format(
        enzyme, flank, tag
    )
    enzyme = [x for x in AllEnzymes if str(x) == enzyme][0]

    f = Fasta(fastafile, lazy=True)
    fw = open(fragfastafile, "w")
    for name, rec in f.iteritems_ordered():
        a = Analysis([enzyme], rec.seq)
        sites = a.full()[enzyme]
        extract(rec, sites, flank, fw)

    logging.debug("Fragments written to `{0}`.".format(fragfastafile))
Code example #14
File: discont.py Project: tanghaibao/online-judge
def main(arg):
    f = Fasta(arg)
    s = [str(x.seq) for k, x in f.iteritems_ordered()]
    m = s[0]
    for z in s[1:]:
        m = m.replace(z, "")
    print Seq(m).translate().strip("*")
Code example #15
def main(arg):
    f = Fasta(arg)
    G = {}
    iG = set()
    for a in f.keys():
        for b in f.keys():
            if a == b:
                continue

            ov = get_overlap(a, b, f)
            if not ov:
                continue
            a, b, i = ov
            G[a] = (a, b, i)
            iG.add(b)

    # linearize graph
    start = set(f.keys()) - iG
    assert len(start) == 1
    z = list(start)[0]
    seq = str(f[z].seq)

    while z in G:
        a, b, i = G[z]
        seq = seq[:-i] + str(f[b].seq)
        z = b
    print seq
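get_overlap is not included in this excerpt. A minimal suffix-prefix overlap check consistent with its use above, assuming the Rosalind convention that a true overlap exceeds half the read length:

def get_overlap(a, b, f, min_overlap=None):
    # Return (a, b, i) if the last i bases of record a equal the first
    # i bases of record b, for the largest qualifying i; None otherwise.
    sa, sb = str(f[a].seq), str(f[b].seq)
    if min_overlap is None:
        min_overlap = len(sa) // 2 + 1
    for i in range(min(len(sa), len(sb)), min_overlap - 1, -1):
        if sa[-i:] == sb[:i]:
            return a, b, i
    return None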
Code example #16
def main(arg):
    f = Fasta(arg)
    for a in f.keys():
        for b in f.keys():
            if a == b:
                continue
            if check_overlap(a, b, f):
                print a, b
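check_overlap is likewise defined elsewhere; for the classic Rosalind overlap-graph problem it reduces to a fixed-length suffix-prefix test (k=3 assumed):

def check_overlap(a, b, f, k=3):
    # True if the last k bases of record a match the first k bases of
    # record b, i.e. a directed edge a -> b in the O_k overlap graph.
    return str(f[a].seq)[-k:] == str(f[b].seq)[:k]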
Code example #17
def n50(args):
    """
    %prog n50 filename

    Given a file with a list of numbers denoting contig lengths, calculate N50.
    The input file can be either FASTA or a list of sizes.
    """
    from jcvi.graphics.histogram import loghistogram

    p = OptionParser(n50.__doc__)
    p.add_option(
        "--print0",
        default=False,
        action="store_true",
        help="Print size and L50 to stdout",
    )

    opts, args = p.parse_args(args)

    if len(args) < 1:
        sys.exit(not p.print_help())

    ctgsizes = []

    # Guess file format
    probe = open(args[0]).readline()[0]
    isFasta = probe == ">"
    if isFasta:
        for filename in args:
            f = Fasta(filename)
            ctgsizes += list(b for a, b in f.itersizes())

    else:
        for row in must_open(args):
            try:
                ctgsize = int(float(row.split()[-1]))
            except ValueError:
                continue
            ctgsizes.append(ctgsize)

    a50, l50, nn50 = calculate_A50(ctgsizes)
    sumsize = sum(ctgsizes)
    minsize = min(ctgsizes)
    maxsize = max(ctgsizes)
    n = len(ctgsizes)
    print(", ".join(args), file=sys.stderr)

    summary = (sumsize, l50, nn50, minsize, maxsize, n)
    print(
        " ".join("{0}={1}".format(a, b) for a, b in zip(header, summary)),
        file=sys.stderr,
    )
    loghistogram(ctgsizes)

    if opts.print0:
        print("\t".join(str(x) for x in (",".join(args), sumsize, l50)))

    return zip(header, summary)
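calculate_A50 is imported from elsewhere in jcvi; a minimal sketch matching the (a50, l50, n50) call sites above:

def calculate_A50(ctgsizes, cutoff=0):
    # Sort contig sizes in decreasing order and build the cumulative
    # curve (A50). L50 is the number of contigs needed to cover half
    # of the total assembly size; N50 is the contig size at that point.
    sizes = sorted((s for s in ctgsizes if s >= cutoff), reverse=True)
    total = sum(sizes)
    a50, running, l50, n50 = [], 0, 0, 0
    for i, s in enumerate(sizes, 1):
        running += s
        a50.append(running)
        if not l50 and running * 2 >= total:
            l50, n50 = i, s
    return a50, l50, n50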
Code example #18
File: gc.py Project: tanghaibao/online-judge
def main(filename):
    f = Fasta(filename)
    gc_store = []
    for key, rec in f.iteritems():
        gc = sum(rec.seq.count(x) for x in 'GCgc') * 100. / len(rec.seq)
        gc_store.append((gc, key))
    gc, key = max(gc_store)
    print key
    print gc
Code example #19
def frombed(args):
    """
    %prog frombed bedfile contigfasta readfasta

    Convert read placement to contig format. This is useful before running BAMBUS.
    """
    from jcvi.formats.fasta import Fasta, SeqIO
    from jcvi.formats.bed import Bed
    from jcvi.utils.cbook import fill

    p = OptionParser(frombed.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    bedfile, contigfasta, readfasta = args
    prefix = bedfile.rsplit(".", 1)[0]
    contigfile = prefix + ".contig"
    idsfile = prefix + ".ids"

    contigfasta = Fasta(contigfasta)
    readfasta = Fasta(readfasta)

    bed = Bed(bedfile)
    checksum = "00000000 checksum."
    fw_ids = open(idsfile, "w")
    fw = open(contigfile, "w")

    for ctg, reads in bed.sub_beds():
        ctgseq = contigfasta[ctg]
        ctgline = "##{0} {1} {2} bases, {3}".format(\
                ctg, len(reads), len(ctgseq), checksum)

        print >> fw_ids, ctg
        print >> fw, ctgline
        print >> fw, fill(ctgseq.seq)

        for b in reads:
            read = b.accn
            strand = b.strand
            readseq = readfasta[read]
            rc = " [RC]" if strand == "-" else ""
            readlen = len(readseq)
            rstart, rend = 1, readlen
            if strand == "-":
                rstart, rend = rend, rstart

            readrange = "{{{0} {1}}}".format(rstart, rend)
            conrange = "<{0} {1}>".format(b.start, b.end)
            readline = "#{0}(0){1} {2} bases, {3} {4} {5}".format(\
                    read, rc, readlen, checksum, readrange, conrange)
            print >> fw, readline
            print >> fw, fill(readseq.seq)

    logging.debug("Mapped contigs written to `{0}`.".format(contigfile))
    logging.debug("Contig IDs written to `{0}`.".format(idsfile))
Code example #20
def A50(args):
    """
    %prog A50 contigs_A.fasta contigs_B.fasta ...

    Plots A50 graphics, see blog post (http://blog.malde.org/index.php/a50/)
    """
    p = OptionParser(A50.__doc__)
    p.add_option("--overwrite", default=False, action="store_true",
            help="overwrite .rplot file if exists [default: %default]")
    p.add_option("--cutoff", default=0, type="int", dest="cutoff",
            help="use contigs above certain size [default: %default]")
    p.add_option("--stepsize", default=10, type="int", dest="stepsize",
            help="stepsize for the distribution [default: %default]")
    opts, args = p.parse_args(args)

    if not args:
        sys.exit(p.print_help())

    import numpy as np
    from jcvi.utils.table import loadtable

    stepsize = opts.stepsize  # use stepsize to speed up drawing
    rplot = "A50.rplot"
    if not op.exists(rplot) or opts.overwrite:
        fw = open(rplot, "w")
        header = "\t".join(("index", "cumsize", "fasta"))
        statsheader = ("Fasta", "L50", "N50", "Min", "Max", "Average", "Sum",
                "Counts")
        statsrows = []
        print >>fw, header
        for fastafile in args:
            f = Fasta(fastafile, index=False)
            ctgsizes = [length for k, length in f.itersizes()]
            ctgsizes = np.array(ctgsizes)

            a50, l50, n50 = calculate_A50(ctgsizes, cutoff=opts.cutoff)
            cmin, cmax, cmean = min(ctgsizes), max(ctgsizes), np.mean(ctgsizes)
            csum, counts = np.sum(ctgsizes), len(ctgsizes)
            cmean = int(round(cmean))
            statsrows.append((fastafile, l50, n50, cmin, cmax, cmean, csum,
                counts))

            logging.debug("`{0}` ctgsizes: {1}".format(fastafile, ctgsizes))

            tag = "{0} (L50={1})".format(\
                    op.basename(fastafile).rsplit(".", 1)[0], l50)
            logging.debug(tag)

            for i, s in zip(xrange(0, len(a50), stepsize), a50[::stepsize]):
                print >> fw, "\t".join((str(i), str(s / 1000000.), tag))
        fw.close()

        table = loadtable(statsheader, statsrows)
        print >> sys.stderr, table

    generate_plot(rplot)
Code example #21
File: ks.py Project: LongZhao1992/jcvi
def prepare(args):
    """
    %prog prepare pairsfile cdsfile [pepfile] -o paired.cds.fasta

    Pick sequences from cdsfile to form pairs, ready for Ks calculation. The
    pairsfile can be generated from formats.blast.cscore(); its first two
    columns contain the pair.
    """
    from jcvi.formats.fasta import Fasta, SeqIO

    p = OptionParser(prepare.__doc__)
    p.set_outfile()

    opts, args = p.parse_args(args)
    outfile = opts.outfile

    if len(args) == 2:
        pairsfile, cdsfile = args
        pepfile = None
    elif len(args) == 3:
        pairsfile, cdsfile, pepfile = args
    else:
        sys.exit(not p.print_help())

    f = Fasta(cdsfile)
    fp = open(pairsfile)
    fw = must_open(outfile, "w")
    if pepfile:
        assert outfile != "stdout", "Please specify outfile name."
        f2 = Fasta(pepfile)
        fw2 = must_open(outfile + ".pep", "w")
    for row in fp:
        if row[0] == '#':
            continue
        a, b = row.split()[:2]
        if a == b:
            logging.debug("Self pairs found: {0} - {1}. Ignored".format(a, b))
            continue

        if a not in f:
            a = find_first_isoform(a, f)
            assert a, a
        if b not in f:
            b = find_first_isoform(b, f)
            assert b, b

        acds = f[a]
        bcds = f[b]
        SeqIO.write((acds, bcds), fw, "fasta")
        if pepfile:
            apep = f2[a]
            bpep = f2[b]
            SeqIO.write((apep, bpep), fw2, "fasta")
    fw.close()
    if pepfile:
        fw2.close()
Code example #22
File: stats.py Project: zhaotao1987/jcvi
def summary(args):
    """
    %prog summary gffile fastafile

    Print summary stats, including:
    - Gene/Exon/Intron
    - Number
    - Average size (bp)
    - Median size (bp)
    - Total length (Mb)
    - % of genome
    - % GC
    """
    p = OptionParser(summary.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    gff_file, ref = args
    s = Fasta(ref)
    g = make_index(gff_file)
    geneseqs, exonseqs, intronseqs = [], [], []  # Calc % GC
    for f in g.features_of_type("gene"):
        fid = f.id
        fseq = s.sequence({'chr': f.chrom, 'start': f.start, 'stop': f.stop})
        geneseqs.append(fseq)
        exons = set((c.chrom, c.start, c.stop) for c in g.children(fid, 2) \
                    if c.featuretype == "exon")
        exons = list(exons)
        for chrom, start, stop in exons:
            fseq = s.sequence({'chr': chrom, 'start': start, 'stop': stop})
            exonseqs.append(fseq)
        introns = range_interleave(exons)
        for chrom, start, stop in introns:
            fseq = s.sequence({'chr': chrom, 'start': start, 'stop': stop})
            intronseqs.append(fseq)

    r = {}  # Report
    for t, tseqs in zip(("Gene", "Exon", "Intron"),
                        (geneseqs, exonseqs, intronseqs)):
        tsizes = [len(x) for x in tseqs]
        tsummary = SummaryStats(tsizes, dtype="int")
        r[t, "Number"] = tsummary.size
        r[t, "Average size (bp)"] = tsummary.mean
        r[t, "Median size (bp)"] = tsummary.median
        r[t, "Total length (Mb)"] = human_size(tsummary.sum,
                                               precision=0,
                                               target="Mb")
        r[t, "% of genome"] = percentage(tsummary.sum,
                                         s.totalsize,
                                         precision=0,
                                         mode=-1)
        r[t, "% GC"] = gc(tseqs)

    print >> sys.stderr, tabulate(r)
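range_interleave comes from jcvi.utils.range; a simplified sketch of how introns are derived as the gaps between exon ranges:

def range_interleave(ranges):
    # Given (seqid, start, stop) ranges, return the gaps between
    # consecutive ranges on the same seqid -- here, the introns
    # between sorted exons.
    ranges = sorted(ranges)
    gaps = []
    for (achrom, _, astop), (bchrom, bstart, _) in zip(ranges, ranges[1:]):
        if achrom == bchrom and bstart - astop > 1:
            gaps.append((achrom, astop + 1, bstart - 1))
    return gaps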
Code example #23
def minimap(args):
    """
    %prog minimap ref.fasta query.fasta

    Wrap the minimap2 aligner, mapping query against ref. When query and ref
    are the same, we are in "self-scan" mode (useful e.g. for finding internal
    duplications resulting from mis-assemblies).
    """
    from jcvi.apps.grid import MakeManager
    from jcvi.formats.fasta import Fasta

    p = OptionParser(minimap.__doc__)
    p.add_option(
        "--chunks",
        type="int",
        default=2000000,
        help="Split ref.fasta into chunks of size in self-scan mode",
    )
    p.set_outdir(outdir="outdir")
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    ref, query = args
    chunks = opts.chunks
    outdir = opts.outdir
    if ref != query:
        raise NotImplementedError

    # "self-scan" mode
    # build faidx (otherwise, parallel make may complain)
    sh("samtools faidx {}".format(ref))
    f = Fasta(ref)
    mkdir(outdir)
    mm = MakeManager()
    for name, size in f.itersizes():
        start = 0
        for end in range(chunks, size, chunks):
            fafile = op.join(outdir,
                             "{}_{}_{}.fa".format(name, start + 1, end))
            cmd = "samtools faidx {} {}:{}-{} -o {}".format(
                ref, name, start + 1, end, fafile)
            mm.add(ref, fafile, cmd)

            paffile = fafile.rsplit(".", 1)[0] + ".paf"
            cmd = "minimap2 -P {} {} > {}".format(fafile, fafile, paffile)
            mm.add(fafile, paffile, cmd)

            epsfile = fafile.rsplit(".", 1)[0] + ".eps"
            cmd = "minidot {} > {}".format(paffile, epsfile)
            mm.add(paffile, epsfile, cmd)
            start += chunks

    mm.write()
Code example #24
File: assembly.py Project: JinfengChen/jcvi
def A50(args):
    """
    %prog A50 contigs_A.fasta contigs_B.fasta ...

    Plots A50 graphics, see blog post (http://blog.malde.org/index.php/a50/)
    """
    p = OptionParser(A50.__doc__)
    p.add_option("--overwrite", default=False, action="store_true",
            help="overwrite .rplot file if exists [default: %default]")
    p.add_option("--cutoff", default=0, type="int", dest="cutoff",
            help="use contigs above certain size [default: %default]")
    p.add_option("--stepsize", default=10, type="int", dest="stepsize",
            help="stepsize for the distribution [default: %default]")
    opts, args = p.parse_args(args)

    if not args:
        sys.exit(p.print_help())

    import numpy as np
    from jcvi.utils.table import loadtable

    stepsize = opts.stepsize  # use stepsize to speed up drawing
    rplot = "A50.rplot"
    if not op.exists(rplot) or opts.overwrite:
        fw = open(rplot, "w")
        header = "\t".join(("index", "cumsize", "fasta"))
        statsheader = ("Fasta", "L50", "N50", "Min", "Max", "Average", "Sum",
                "Counts")
        statsrows = []
        print >>fw, header
        for fastafile in args:
            f = Fasta(fastafile, index=False)
            ctgsizes = [length for k, length in f.itersizes()]
            ctgsizes = np.array(ctgsizes)

            a50, l50, n50 = calculate_A50(ctgsizes, cutoff=opts.cutoff)
            cmin, cmax, cmean = min(ctgsizes), max(ctgsizes), np.mean(ctgsizes)
            csum, counts = np.sum(ctgsizes), len(ctgsizes)
            cmean = int(round(cmean))
            statsrows.append((fastafile, l50, n50, cmin, cmax, cmean, csum,
                counts))

            logging.debug("`{0}` ctgsizes: {1}".format(fastafile, ctgsizes))

            tag = "{0} (L50={1})".format(\
                    op.basename(fastafile).rsplit(".", 1)[0], l50)
            logging.debug(tag)

            for i, s in zip(xrange(0, len(a50), stepsize), a50[::stepsize]):
                print >> fw, "\t".join((str(i), str(s / 1000000.), tag))
        fw.close()

        table = loadtable(statsheader, statsrows)
        print >> sys.stderr, table

    generate_plot(rplot)
Code example #25
File: revpalind.py Project: tanghaibao/online-judge
def main(arg):
    f = Fasta(arg)
    key, rev = f.iteritems().next()
    s = rev.seq
    for i in xrange(len(s)):
        for l in xrange(4, 13):
            if i + l > len(s):
                continue
            ns = s[i:i+l]
            if str(ns) == str(ns.reverse_complement()):
                print i + 1, l
Code example #26
File: ks.py Project: bennyyu/jcvi
def get_GC3(cdsfile):
    from jcvi.formats.fasta import Fasta

    f = Fasta(cdsfile, lazy=True)
    GC3 = {}
    for name, rec in f.iteritems_ordered():
        positions = rec.seq[2::3].upper()
        gc_counts = sum(1 for x in positions if x in "GC")
        gc_ratio = gc_counts * 1. / len(positions)
        GC3[name] = gc_ratio

    return GC3
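The slice rec.seq[2::3] picks every third base starting at the third position, i.e. the wobble position of each codon. A quick check on a made-up sequence:

from Bio.Seq import Seq

# Codons ATG|GCC|TTA: third positions are G, C, A, so GC3 = 2/3
seq = Seq("ATGGCCTTA")
assert str(seq[2::3]) == "GCA"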
Code example #27
File: ks.py Project: LongZhao1992/jcvi
def get_GC3(cdsfile):
    from jcvi.formats.fasta import Fasta

    f = Fasta(cdsfile, lazy=True)
    GC3 = {}
    for name, rec in f.iteritems_ordered():
        positions = rec.seq[2::3].upper()
        gc_counts = sum(1 for x in positions if x in "GC")
        gc_ratio = gc_counts * 1. / len(positions)
        GC3[name] = gc_ratio

    return GC3
Code example #28
def filter(args):
    """
    %prog filter *.consensus.fasta

    Filter consensus sequence with min cluster size.
    """
    from jcvi.formats.fasta import Fasta, SeqIO

    p = OptionParser(filter.__doc__)
    p.add_option("--minsize",
                 default=2,
                 type="int",
                 help="Minimum cluster size")
    p.set_outfile()
    opts, args = p.parse_args(args)

    if len(args) < 1:
        sys.exit(not p.print_help())

    fastafiles = args
    minsize = opts.minsize
    totalreads = totalassembled = 0
    fw = must_open(opts.outfile, "w")
    for i, fastafile in enumerate(fastafiles):
        f = Fasta(fastafile, lazy=True)
        pf = "s{0:03d}".format(i)
        nreads = nsingletons = nclusters = 0
        for desc, rec in f.iterdescriptions_ordered():
            nclusters += 1
            if desc.startswith("singleton"):
                nsingletons += 1
                nreads += 1
                continue
            # consensus_for_cluster_0 with 63 sequences
            name, w, size, seqs = desc.split()
            assert w == "with"
            size = int(size)
            nreads += size
            if size < minsize:
                continue
            rec.description = rec.description.split(None, 1)[-1]
            rec.id = pf + "_" + rec.id
            SeqIO.write(rec, fw, "fasta")
        logging.debug("Scanned {0} clusters with {1} reads ..".format(
            nclusters, nreads))
        cclusters, creads = nclusters - nsingletons, nreads - nsingletons
        logging.debug(
            "Saved {0} clusters (min={1}) with {2} reads (avg:{3}) [{4}]".
            format(cclusters, minsize, creads, creads / cclusters, pf))
        totalreads += nreads
        totalassembled += nreads - nsingletons
    logging.debug("Total assembled: {0}".format(
        percentage(totalassembled, totalreads)))
Code example #29
File: kmercount.py Project: tanghaibao/online-judge
def main(arg):
    f = Fasta(arg)
    key, rec = f.iteritems().next()
    s = rec.seq
    store = defaultdict(int)
    for i in xrange(len(s) - 3):
        kmer = s[i:i+4]
        assert len(kmer) == 4
        store[kmer] += 1

    counts = [store.get("".join(x), 0) for x in product("ACGT", repeat=4)]
    print " ".join(str(x) for x in counts)
Code example #30
File: base.py Project: rrane/jcvi
def n50(args):
    """
    %prog n50 filename

    Given a file with a list of numbers denoting contig lengths, calculate N50.
    The input file can be either FASTA or a list of sizes.
    """
    from jcvi.graphics.histogram import loghistogram

    p = OptionParser(n50.__doc__)
    p.add_option(
        "--print0", default=False, action="store_true", help="Print size and L50 to stdout [default: %default]"
    )

    opts, args = p.parse_args(args)

    if len(args) < 1:
        sys.exit(not p.print_help())

    ctgsizes = []

    # Guess file format
    probe = open(args[0]).readline()[0]
    isFasta = probe == ">"
    if isFasta:
        for filename in args:
            f = Fasta(filename)
            ctgsizes += list(b for a, b in f.itersizes())

    else:
        for row in must_open(args):
            try:
                ctgsize = int(row.split()[-1])
            except ValueError:
                continue
            ctgsizes.append(ctgsize)

    a50, l50, nn50 = calculate_A50(ctgsizes)
    sumsize = sum(ctgsizes)
    minsize = min(ctgsizes)
    maxsize = max(ctgsizes)
    n = len(ctgsizes)
    print >> sys.stderr, ", ".join(args)

    summary = (sumsize, l50, nn50, minsize, maxsize, n)
    print >> sys.stderr, " ".join("{0}={1}".format(a, b) for a, b in zip(header, summary))
    loghistogram(ctgsizes)

    if opts.print0:
        print "\t".join(str(x) for x in (",".join(args), sumsize, l50))

    return zip(header, summary)
Code example #31
File: restriction.py Project: rrane/jcvi
def fragment(args):
    """
    %prog fragment fastafile enzyme

    Cut the fastafile using the specified enzyme, and grab upstream and
    downstream nucleotide sequence along with the cut site. In this case, the
    sequences extracted are:

                |- PstI
    ============|===========
            (-------)

    Sometimes we need to limit the size of the restriction fragments; for
    example, the GBS protocol does not allow fragments larger than 800bp.

           |-PstI        |- PstI              |- PstI
    ~~~====|=============|==========~~~~~~~===|============
           (---)     (---)

    In this case, the second fragment is longer than 800bp, so its two ends
    are NOT extracted, unlike those of the first fragment.
    """
    p = OptionParser(fragment.__doc__)
    p.add_option("--flank", default=150, type="int",
            help="Extract flanking bases of the cut sites [default: %default]")
    p.add_option("--full", default=False, action="store_true",
            help="The full extraction mode [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    fastafile, enzyme = args
    flank = opts.flank
    assert flank > 0
    extract = extract_full if opts.full else extract_ends
    tag = "full" if opts.full else "ends"

    assert enzyme in set(str(x) for x in AllEnzymes)
    fragfastafile = fastafile.split(".")[0] + \
        ".{0}.flank{1}.{2}.fasta".format(enzyme, flank, tag)
    enzyme = [x for x in AllEnzymes if str(x) == enzyme][0]

    f = Fasta(fastafile, lazy=True)
    fw = open(fragfastafile, "w")
    for name, rec in f.iteritems_ordered():
        a = Analysis([enzyme], rec.seq)
        sites = a.full()[enzyme]
        extract(rec, sites, flank, fw)

    logging.debug("Fragments written to `{0}`.".format(fragfastafile))
Code example #32
def merge(args):
    """
    %prog merge gffiles

    Merge several gff files into one. When only one file is given, it is assumed
    to be a file with a list of gff files.
    """
    p = OptionParser(merge.__doc__)
    set_outfile(p)

    opts, args = p.parse_args(args)

    nargs = len(args)
    if nargs < 1:
        sys.exit(not p.print_help())

    if nargs == 1:
        listfile, = args
        fp = open(listfile)
        gffiles = [x.strip() for x in fp]
    else:
        gffiles = args

    outfile = opts.outfile

    deflines = set()
    fw = must_open(outfile, "w")
    fastarecs = {}
    for gffile in gffiles:
        fp = open(gffile)
        for row in fp:
            row = row.rstrip()
            if row[0] == '#':
                if row == FastaTag:
                    break
                if row in deflines:
                    continue
                else:
                    deflines.add(row)

            print >> fw, row

        f = Fasta(gffile, lazy=True)
        for key, rec in f.iteritems_ordered():
            if key in fastarecs.keys():
                continue
            fastarecs[key] = rec

    print >> fw, FastaTag
    SeqIO.write(fastarecs.values(), fw, "fasta")
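FastaTag is a module-level constant for the GFF3 ##FASTA directive, which separates the feature section from the embedded FASTA sequences; the Fasta(gffile) call above reads those embedded records back.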
Code example #33
File: stats.py Project: tanghaibao/jcvi
def summary(args):
    """
    %prog summary gffile fastafile

    Print summary stats, including:
    - Gene/Exon/Intron
    - Number
    - Average size (bp)
    - Median size (bp)
    - Total length (Mb)
    - % of genome
    - % GC
    """
    p = OptionParser(summary.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    gff_file, ref = args
    s = Fasta(ref)
    g = make_index(gff_file)
    geneseqs, exonseqs, intronseqs = [], [], []  # Calc % GC
    for f in g.features_of_type("gene"):
        fid = f.id
        fseq = s.sequence({'chr': f.chrom, 'start': f.start, 'stop': f.stop})
        geneseqs.append(fseq)
        exons = set((c.chrom, c.start, c.stop) for c in g.children(fid, 2) \
                    if c.featuretype == "exon")
        exons = list(exons)
        for chrom, start, stop in exons:
            fseq = s.sequence({'chr': chrom, 'start': start, 'stop': stop})
            exonseqs.append(fseq)
        introns = range_interleave(exons)
        for chrom, start, stop in introns:
            fseq = s.sequence({'chr': chrom, 'start': start, 'stop': stop})
            intronseqs.append(fseq)

    r = {}  # Report
    for t, tseqs in zip(("Gene", "Exon", "Intron"), (geneseqs, exonseqs, intronseqs)):
        tsizes = [len(x) for x in tseqs]
        tsummary = SummaryStats(tsizes, dtype="int")
        r[t, "Number"] = tsummary.size
        r[t, "Average size (bp)"] = tsummary.mean
        r[t, "Median size (bp)"] = tsummary.median
        r[t, "Total length (Mb)"] = human_size(tsummary.sum, precision=0, target="Mb")
        r[t, "% of genome"] = percentage(tsummary.sum, s.totalsize, precision=0, mode=-1)
        r[t, "% GC"] = gc(tseqs)

    print(tabulate(r), file=sys.stderr)
Code example #34
File: gff.py Project: linlifeng/jcvi
def merge(args):
    """
    %prog merge gffiles

    Merge several gff files into one. When only one file is given, it is assumed
    to be a file with a list of gff files.
    """
    p = OptionParser(merge.__doc__)
    set_outfile(p)

    opts, args = p.parse_args(args)

    nargs = len(args)
    if nargs < 1:
        sys.exit(not p.print_help())

    if nargs == 1:
        listfile, = args
        fp = open(listfile)
        gffiles = [x.strip() for x in fp]
    else:
        gffiles = args

    outfile = opts.outfile

    deflines = set()
    fw = must_open(outfile, "w")
    fastarecs = {}
    for gffile in gffiles:
        fp = open(gffile)
        for row in fp:
            row = row.rstrip()
            if row[0] == '#':
                if row == FastaTag:
                    break
                if row in deflines:
                    continue
                else:
                    deflines.add(row)

            print >> fw, row

        f = Fasta(gffile, lazy=True)
        for key, rec in f.iteritems_ordered():
            if key in fastarecs.keys():
                continue
            fastarecs[key] = rec

    print >> fw, FastaTag
    SeqIO.write(fastarecs.values(), fw, "fasta")
Code example #35
File: cdhit.py Project: tanghaibao/jcvi
def filter(args):
    """
    %prog filter *.consensus.fasta

    Filter consensus sequence with min cluster size.
    """
    from jcvi.formats.fasta import Fasta, SeqIO

    p = OptionParser(filter.__doc__)
    p.add_option("--minsize", default=2, type="int",
                 help="Minimum cluster size")
    p.set_outfile()
    opts, args = p.parse_args(args)

    if len(args) < 1:
        sys.exit(not p.print_help())

    fastafiles = args
    minsize = opts.minsize
    totalreads = totalassembled = 0
    fw = must_open(opts.outfile, "w")
    for i, fastafile in enumerate(fastafiles):
        f = Fasta(fastafile, lazy=True)
        pf = "s{0:03d}".format(i)
        nreads = nsingletons = nclusters = 0
        for desc, rec in f.iterdescriptions_ordered():
            nclusters += 1
            if desc.startswith("singleton"):
                nsingletons += 1
                nreads += 1
                continue
            # consensus_for_cluster_0 with 63 sequences
            name, w, size, seqs = desc.split()
            assert w == "with"
            size = int(size)
            nreads += size
            if size < minsize:
                continue
            rec.description = rec.description.split(None, 1)[-1]
            rec.id = pf + "_" + rec.id
            SeqIO.write(rec, fw, "fasta")
        logging.debug("Scanned {0} clusters with {1} reads ..".\
                       format(nclusters, nreads))
        cclusters, creads = nclusters - nsingletons, nreads - nsingletons
        logging.debug("Saved {0} clusters (min={1}) with {2} reads (avg:{3}) [{4}]".\
                       format(cclusters, minsize, creads, creads / cclusters, pf))
        totalreads += nreads
        totalassembled += nreads - nsingletons
    logging.debug("Total assembled: {0}".\
                  format(percentage(totalassembled, totalreads)))
Code example #36
def count(args):
    """
    %prog count fastafile jf.db

    Run dump - jellyfish - bin - bincount in serial.
    """
    from bitarray import bitarray

    p = OptionParser(count.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    fastafile, jfdb = args
    K = get_K(jfdb)
    cmd = "jellyfish query {0} -C | cut -d' ' -f 2".format(jfdb)
    t = must_open("tmp", "w")
    proc = Popen(cmd, stdin=PIPE, stdout=t)
    t.flush()

    f = Fasta(fastafile, lazy=True)
    for name, rec in f.iteritems_ordered():
        kmers = list(make_kmers(rec.seq, K))
        print("\n".join(kmers), file=proc.stdin)
    proc.stdin.close()
    logging.debug(cmd)
    proc.wait()

    a = bitarray()
    binfile = ".".join((fastafile, jfdb, "bin"))
    fw = open(binfile, "w")
    t.seek(0)
    for row in t:
        c = row.strip()
        a.append(int(c))
    a.tofile(fw)
    logging.debug("Serialize {0} bits to `{1}`.".format(len(a), binfile))
    fw.close()
    sh("rm {0}".format(t.name))

    logging.debug(
        "Shared K-mers (K={0}) between `{1}` and `{2}` written to `{3}`.".format(
            K, fastafile, jfdb, binfile
        )
    )
    cntfile = ".".join((fastafile, jfdb, "cnt"))
    bincount([fastafile, binfile, "-o", cntfile, "-K {0}".format(K)])
    logging.debug("Shared K-mer counts written to `{0}`.".format(cntfile))
Code example #37
def needle(args):
    """
    %prog needle nw.pairs a.pep.fasta b.pep.fasta

    Take protein pairs and align them with EMBOSS needle.
    Automatically writes the output file `nw.scores`.
    """
    from jcvi.formats.fasta import Fasta, SeqIO

    p = OptionParser(needle.__doc__)

    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    manager = mp.Manager()
    results = manager.list()
    needle_pool = mp.Pool(processes=mp.cpu_count())

    pairsfile, apep, bpep = args
    afasta, bfasta = Fasta(apep), Fasta(bpep)
    fp = must_open(pairsfile)
    for i, row in enumerate(fp):
        a, b = row.split()
        a, b = afasta[a], bfasta[b]
        fa = must_open("{0}_{1}_a.fasta".format(pairsfile, i), "w")
        fb = must_open("{0}_{1}_b.fasta".format(pairsfile, i), "w")
        SeqIO.write([a], fa, "fasta")
        SeqIO.write([b], fb, "fasta")
        fa.close()
        fb.close()

        needlefile = "{0}_{1}_ab.needle".format(pairsfile, i)
        needle_pool.apply_async(
            _needle, (fa.name, fb.name, needlefile, a.id, b.id, results))

    needle_pool.close()
    needle_pool.join()

    fp.close()

    scoresfile = "{0}.scores".format(pairsfile.rsplit(".", 1)[0])
    fw = must_open(scoresfile, "w")
    for result in results:
        print(result, file=fw)
    fw.close()
Code example #38
File: orf.py Project: tanghaibao/online-judge
def main(arg):
    f = Fasta(arg)
    key, s = f.iteritems().next()
    s = s.seq
    res = set()
    for z in (s, s.reverse_complement()):
        for frame in xrange(3):
            p = z[frame:].translate()
            for i in xrange(len(p)):
                if p[i] != 'M':
                    continue
                for j in xrange(i + 1, len(p)):
                    if p[j] == '*':
                        res.add(p[i:j])
                        break
    print "\n".join(str(x) for x in res)
Code example #39
def prepare(args):
    """
    %prog prepare pairsfile cdsfile > paired.cds.fasta

    Pick sequences from cdsfile to form pairs, ready for Ks calculation. The
    pairsfile can be generated from formats.blast.cscore(); its first two
    columns contain the pair.
    """
    from jcvi.formats.fasta import Fasta, SeqIO

    p = OptionParser(prepare.__doc__)

    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    pairsfile, cdsfile = args

    f = Fasta(cdsfile)
    fp = open(pairsfile)
    fw = sys.stdout
    for row in fp:
        a, b = row.split()[:2]
        arec = f[a]
        brec = f[b]
        SeqIO.write((arec, brec), fw, "fasta")
Code example #40
File: snp.py Project: zhaotao1987/jcvi
def freebayes(args):
    """
    %prog freebayes prefix ref.fa *.bam

    Call SNPs using freebayes.
    """
    p = OptionParser(freebayes.__doc__)
    p.add_option("--mindepth", default=3, type="int",
                 help="Minimum depth [default: %default]")
    p.add_option("--minqual", default=20, type="int",
                 help="Minimum quality [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) < 2:
        sys.exit(not p.print_help())

    prefix, ref = args[0:2]
    bams = args[2:]
    cmd = "bamaddrg -R {0}"
    cmd += " " + " ".join("-b {0}".format(x) for x in bams)
    fmd = "freebayes --stdin -C {0} -f {1}".format(opts.mindepth, ref)
    seqids = list(Fasta(ref).iterkeys_ordered())
    for s in seqids:
        outfile = prefix + ".{0}.vcf".format(s)
        print cmd.format(s), "|", fmd + " -r {0} -v {1}".format(s, outfile)
Code example #41
def wgsim(args):
    """
    %prog wgsim fastafile

    Run dwgsim on fastafile.
    """
    p = OptionParser(wgsim.__doc__)
    p.add_option(
        "--erate",
        default=0.01,
        type="float",
        help="Base error rate of the read",
    )
    p.add_option(
        "--noerrors",
        default=False,
        action="store_true",
        help="Simulate reads with no errors",
    )
    p.add_option(
        "--genomesize",
        type="int",
        help="Genome size in Mb [default: estimate from data]",
    )
    add_sim_options(p)
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    (fastafile,) = args
    pf = op.basename(fastafile).split(".")[0]

    genomesize = opts.genomesize
    size = genomesize * 1000000 if genomesize else Fasta(fastafile).totalsize
    depth = opts.depth
    readlen = opts.readlen
    readnum = int(math.ceil(size * depth / (2 * readlen)))

    distance = opts.distance
    stdev = distance / 10

    outpf = opts.outfile or "{0}.{1}bp.{2}x".format(pf, distance, depth)

    logging.debug("Total genome size: {0} bp".format(size))
    logging.debug("Target depth: {0}x".format(depth))
    logging.debug("Number of read pairs (2x{0}): {1}".format(readlen, readnum))

    if opts.noerrors:
        opts.erate = 0

    cmd = "dwgsim -e {0} -E {0}".format(opts.erate)
    if opts.noerrors:
        cmd += " -r 0 -R 0 -X 0 -y 0"

    cmd += " -d {0} -s {1}".format(distance, stdev)
    cmd += " -N {0} -1 {1} -2 {1}".format(readnum, readlen)
    cmd += " {0} {1}".format(fastafile, outpf)
    sh(cmd)
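For a concrete sense of the read-pair arithmetic: a 10 Mb genome at 50x depth with 100 bp reads gives readnum = ceil(10,000,000 * 50 / (2 * 100)) = 2,500,000 pairs.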
Code example #42
    def build_all(self, componentfasta, targetfasta, newagp=None):
        f = Fasta(componentfasta, index=False)
        fw = open(targetfasta, "w")

        for ob, lines_with_same_ob in groupby(self, key=lambda x: x.object):

            lines = list(lines_with_same_ob)
            self.build_one(ob, lines, f, fw, newagp=newagp)
Code example #43
File: tgbs.py Project: zjwang6/jcvi
def count(args):
    """
    %prog count cdhit.consensus.fasta

    Scan the headers for the consensus clusters and count the number of reads.
    """
    from jcvi.graphics.histogram import stem_leaf_plot
    from jcvi.utils.cbook import SummaryStats

    p = OptionParser(count.__doc__)
    p.add_option("--csv", help="Write depth per contig to file")
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    (fastafile, ) = args
    csv = open(opts.csv, "w") if opts.csv else None

    f = Fasta(fastafile, lazy=True)
    sizes = []
    for desc, rec in f.iterdescriptions_ordered():
        if desc.startswith("singleton"):
            sizes.append(1)
            continue

        # consensus_for_cluster_0 with 63 sequences
        if "with" in desc:
            name, w, size, seqs = desc.split()
            if csv:
                print("\t".join(str(x) for x in (name, size, len(rec))),
                      file=csv)
            assert w == "with"
            sizes.append(int(size))
        # MRD85:00603:02472;size=167;
        else:
            name, size, tail = desc.split(";")
            sizes.append(int(size.replace("size=", "")))

    if csv:
        csv.close()
        logging.debug("File written to `%s`.", opts.csv)

    s = SummaryStats(sizes)
    print(s, file=sys.stderr)
    stem_leaf_plot(s.data, 0, 100, 20, title="Cluster size")
Code example #44
File: kmer.py Project: zachary-zzc/jcvi
def count(args):
    """
    %prog count fastafile jf.db

    Run dump - jellyfish - bin - bincount in serial.
    """
    from bitarray import bitarray

    p = OptionParser(count.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    fastafile, jfdb = args
    K = get_K(jfdb)
    cmd = "jellyfish query {0} -C | cut -d' ' -f 2".format(jfdb)
    t = must_open("tmp", "w")
    proc = Popen(cmd, stdin=PIPE, stdout=t)
    t.flush()

    f = Fasta(fastafile, lazy=True)
    for name, rec in f.iteritems_ordered():
        kmers = list(make_kmers(rec.seq, K))
        print >> proc.stdin, "\n".join(kmers)
    proc.stdin.close()
    logging.debug(cmd)
    proc.wait()

    a = bitarray()
    binfile = ".".join((fastafile, jfdb, "bin"))
    fw = open(binfile, "w")
    t.seek(0)
    for row in t:
        c = row.strip()
        a.append(int(c))
    a.tofile(fw)
    logging.debug("Serialize {0} bits to `{1}`.".format(len(a), binfile))
    fw.close()
    sh("rm {0}".format(t.name))

    logging.debug("Shared K-mers (K={0}) between `{1}` and `{2}` written to `{3}`.".\
                    format(K, fastafile, jfdb, binfile))
    cntfile = ".".join((fastafile, jfdb, "cnt"))
    bincount([fastafile, binfile, "-o", cntfile, "-K {0}".format(K)])
    logging.debug("Shared K-mer counts written to `{0}`.".format(cntfile))
Code example #45
File: tgbs.py Project: tanghaibao/jcvi
def count(args):
    """
    %prog count cdhit.consensus.fasta

    Scan the headers for the consensus clusters and count the number of reads.
    """
    from jcvi.graphics.histogram import stem_leaf_plot
    from jcvi.utils.cbook import SummaryStats

    p = OptionParser(count.__doc__)
    p.add_option("--csv", help="Write depth per contig to file")
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    fastafile, = args
    csv = open(opts.csv, "w") if opts.csv else None

    f = Fasta(fastafile, lazy=True)
    sizes = []
    for desc, rec in f.iterdescriptions_ordered():
        if desc.startswith("singleton"):
            sizes.append(1)
            continue

        # consensus_for_cluster_0 with 63 sequences
        if "with" in desc:
            name, w, size, seqs = desc.split()
            if csv:
                print("\t".join(str(x)
                                for x in (name, size, len(rec))), file=csv)
            assert w == "with"
            sizes.append(int(size))
        # MRD85:00603:02472;size=167;
        else:
            name, size, tail = desc.split(";")
            sizes.append(int(size.replace("size=", "")))

    if csv:
        csv.close()
        logging.debug("File written to `{0}`".format(opts.csv))

    s = SummaryStats(sizes)
    print(s, file=sys.stderr)
    stem_leaf_plot(s.data, 0, 100, 20, title="Cluster size")
Code example #46
File: sizes.py Project: wroldwiedbwe/jcvi
    def __init__(self, filename, select=None):
        assert op.exists(filename), "File `{0}` not found".format(filename)

        # filename can be either a .sizes file or a FASTA-formatted file
        sizesname = filename

        if not filename.endswith(".sizes"):
            sizesname = filename + ".sizes"
            filename = get_abs_path(filename)
            if need_update(filename, sizesname):
                cmd = "faSize"
                if which(cmd):
                    cmd += " -detailed {0}".format(filename)
                    sh(cmd, outfile=sizesname)
                else:
                    from jcvi.formats.fasta import Fasta

                    f = Fasta(filename)
                    fw = open(sizesname, "w")
                    for k, size in f.itersizes_ordered():
                        print("\t".join((k, str(size))), file=fw)
                    fw.close()

            filename = sizesname

        assert filename.endswith(".sizes")

        super(Sizes, self).__init__(filename)
        self.fp = open(filename)
        self.filename = filename

        # get sizes for individual contigs, both in list and dict
        # this is to preserve the input order in the sizes file
        sizes = list(self.iter_sizes())
        if select:
            assert select > 0
            sizes = [x for x in sizes if x[1] >= select]
        self.sizes_mapping = dict(sizes)

        # get cumulative sizes, both in list and dict
        ctgs, sizes = zip(*sizes)
        self.sizes = sizes
        cumsizes = np.cumsum([0] + list(sizes))
        self.ctgs = ctgs
        self.cumsizes = cumsizes
        self.cumsizes_mapping = dict(zip(ctgs, cumsizes))
Code example #47
File: sizes.py Project: xuanblo/jcvi
    def __init__(self, filename, select=None):
        assert op.exists(filename), "File `{0}` not found".format(filename)

        # filename can be either a .sizes file or a FASTA-formatted file
        sizesname = filename

        if not filename.endswith(".sizes"):
            sizesname = filename + ".sizes"
            filename = get_abs_path(filename)
            if need_update(filename, sizesname):
                cmd = "faSize"
                if which(cmd):
                    cmd += " -detailed {0}".format(filename)
                    sh(cmd, outfile=sizesname)
                else:
                    from jcvi.formats.fasta import Fasta

                    f = Fasta(filename)
                    fw = open(sizesname, "w")
                    for k, size in f.itersizes_ordered():
                        print >> fw, "\t".join((k, str(size)))
                    fw.close()

            filename = sizesname

        assert filename.endswith(".sizes")

        super(Sizes, self).__init__(filename)
        self.fp = open(filename)
        self.filename = filename

        # get sizes for individual contigs, both in list and dict
        # this is to preserve the input order in the sizes file
        sizes = list(self.iter_sizes())
        if select:
            assert select > 0
            sizes = [x for x in sizes if x[1] >= select]
        self.sizes_mapping = dict(sizes)

        # get cumulative sizes, both in list and dict
        ctgs, sizes = zip(*sizes)
        self.sizes = sizes
        cumsizes = np.cumsum([0] + list(sizes))
        self.ctgs = ctgs
        self.cumsizes = cumsizes
        self.cumsizes_mapping = dict(zip(ctgs, cumsizes))
Code example #48
File: base.py Project: linlifeng/jcvi
def n50(args):
    """
    %prog n50 filename

    Given a file with a list of numbers denoting contig lengths, calculate N50.
    The input file can be either FASTA or a list of sizes.
    """
    p = OptionParser(n50.__doc__)

    opts, args = p.parse_args(args)

    if len(args) < 1:
        sys.exit(not p.print_help())

    ctgsizes = []

    # Guess file format
    probe = open(args[0]).readline()[0]
    isFasta = (probe == '>')
    if isFasta:
        for filename in args:
            f = Fasta(filename)
            ctgsizes += list(b for a, b in f.itersizes())

    else:
        for row in must_open(args):
            try:
                ctgsize = int(row.split()[-1])
            except ValueError:
                continue
            ctgsizes.append(ctgsize)

    a50, l50, nn50 = calculate_A50(ctgsizes)
    sumsize = sum(ctgsizes)
    minsize = min(ctgsizes)
    maxsize = max(ctgsizes)
    n = len(ctgsizes)
    print >> sys.stderr, ", ".join(args)

    summary = (sumsize, l50, nn50, minsize, maxsize, n)
    print >> sys.stderr, " ".join("{0}={1}".format(a, b) for a, b in \
                        zip(header, summary))
    loghistogram(ctgsizes, summary=False)

    return zip(header, summary)
Code example #49
File: correction.py Project: tanghaibao/online-judge
def main(arg):
    f = Fasta(arg)
    store = defaultdict(int)
    for key, rec in f.iteritems():
        store[str(rec.seq)] += 1
        store[str(rec.seq.reverse_complement())] += 1

    f = Fasta(arg)
    for key, rec in f.iteritems():
        s = str(rec.seq)
        for t, v in store.items():
            if v < 2:
                continue
            if s == t:
                continue
            if hamming(s, t) > 1:
                continue
            print "{0}->{1}".format(s, t)
Code example #50
File: kmp.py Project: tanghaibao/online-judge
def main(arg):
    f = Fasta(arg)
    k, s = next(f.iteritems())  # single-record FASTA
    s = str(s.seq)
    P = [0] * len(s)
    i = 1
    j = 0
    while i < len(s):
        if s[i] == s[j]:
            P[i] = j + 1
            i += 1
            j += 1
        elif j != 0:
            j = P[j - 1]
        else:
            P[i] = 0
            i += 1
    print " ".join(str(x) for x in P)
Code example #51
File: tandem.py Project: linlifeng/jcvi
def main(blast_file, cds_file, bed_file, N=3):

    # get the sizes for the CDS first
    f = Fasta(cds_file)
    sizes = dict(f.itersizes())

    # retrieve the locations
    bed = Bed(bed_file).order

    # filter the blast file
    g = Grouper()
    fp = open(blast_file)
    for row in fp:
        b = BlastLine(row)
        query_len = sizes[b.query]
        subject_len = sizes[b.subject]
        if b.hitlen < min(query_len, subject_len) / 2:
            continue

        query, subject = gene_name(b.query), gene_name(b.subject)
        qi, q = bed[query]
        si, s = bed[subject]

        if q.seqid == s.seqid and abs(qi - si) <= N:
            g.join(query, subject)

    # dump the grouper
    ngenes, nfamilies = 0, 0
    families = []
    for group in sorted(g):
        if len(group) >= 2:
            print ",".join(sorted(group))
            ngenes += len(group)
            nfamilies += 1
            families.append(sorted(group))

    longest_family = max(families, key=len)

    # generate reports
    print("Proximal paralogues (dist=%d):" % N, file=sys.stderr)
    print("Total %d genes in %d families" % (ngenes, nfamilies), file=sys.stderr)
    print("Longest family (%d): %s" % (len(longest_family),
                                       ",".join(longest_family)), file=sys.stderr)
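Grouper is jcvi's disjoint-set (union-find) helper. A minimal stand-in with the same join/iteration surface, for illustration only (not the actual jcvi implementation):

from collections import defaultdict

class SimpleGrouper:
    def __init__(self):
        self.parent = {}

    def find(self, x):
        self.parent.setdefault(x, x)
        root = x
        while self.parent[root] != root:
            root = self.parent[root]
        while self.parent[x] != root:  # path compression
            self.parent[x], x = root, self.parent[x]
        return root

    def join(self, a, b):
        # merge the groups containing a and b
        self.parent[self.find(a)] = self.find(b)

    def __iter__(self):
        # yield each connected group as a list of members
        groups = defaultdict(list)
        for x in self.parent:
            groups[self.find(x)].append(x)
        return iter(groups.values())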
Code example #53
def summary(args):
    """
    %prog summary fastafile

    Report the number of bases and sequences masked.
    """
    p = OptionParser(summary.__doc__)

    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    (fastafile, ) = args
    f = Fasta(fastafile, index=False)

    halfmaskedseqs = set()
    allmasked = 0
    allbases = 0
    cutoff = 50
    for key, seq in f.iteritems():
        masked = 0
        for base in seq:
            # anything other than uppercase A/C/G/T counts as masked
            # (lowercase soft-masking, N/X hard-masking, ambiguity codes)
            if base not in "AGCT":
                masked += 1
        seqlen = len(seq)
        if masked * 100.0 / seqlen > cutoff:
            halfmaskedseqs.add(key)
        allmasked += masked
        allbases += seqlen

    seqnum = len(f)
    maskedseqnum = len(halfmaskedseqs)

    print(
        "Total masked bases: {0}".format(percentage(allmasked, allbases)),
        file=sys.stderr,
    )
    print(
        "Total masked sequences (contain > {0}% masked): {1}".format(
            cutoff, percentage(maskedseqnum, seqnum)),
        file=sys.stderr,
    )
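percentage comes from jcvi.utils.cbook; a functionally similar helper, shown only to make the output format concrete (the real one may differ in details):

def percentage(a, b, precision=1):
    # e.g. percentage(50, 200) -> '50 of 200 (25.0%)'
    pct = a * 100.0 / b if b else 0.0
    return "{0} of {1} ({2:.{3}f}%)".format(a, b, pct, precision)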
Code example #54
File: emboss.py Project: radaniba/jcvi
def needle(args):
    """
    %prog needle pairs a.pep.fasta b.pep.fasta

    Take protein pairs and needle them.
    """
    from Bio.Emboss.Applications import NeedleCommandline

    from jcvi.formats.fasta import Fasta, SeqIO
    from jcvi.formats.base import FileShredder

    p = OptionParser(needle.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    pairsfile, apep, bpep = args
    afasta = Fasta(apep)
    bfasta = Fasta(bpep)
    fp = open(pairsfile)
    for row in fp:
        fa = open(pairsfile + "_a.fasta", "w")
        fb = open(pairsfile + "_b.fasta", "w")
        a, b = row.split()
        a = afasta[a]
        b = bfasta[b]
        SeqIO.write([a], fa, "fasta")
        SeqIO.write([b], fb, "fasta")
        fa.close()
        fb.close()
        needlefile = pairsfile + "_ab.needle"
        needle_cline = NeedleCommandline(asequence=fa.name,
                                         bsequence=fb.name,
                                         gapopen=10,
                                         gapextend=0.5,
                                         outfile=needlefile)
        stdout, stderr = needle_cline()
        print(stdout + stderr, file=sys.stderr)
        #align = AlignIO.read(needlefile, "emboss")
        nh = NeedleHeader(needlefile)
        print "\t".join((a.id, b.id, nh.identity, nh.score))
        FileShredder([fa.name, fb.name, needlefile])
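The pairs file consumed by the loop above is assumed to hold two sequence IDs per line, whitespace-separated (the IDs below are made up):

geneA_001    geneB_017
geneA_002    geneB_042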
Code example #55
File: base.py Project: sophy7074/jcvi
def wgsim(args):
    """
    %prog wgsim fastafile

    Run dwgsim on fastafile.
    """
    p = OptionParser(wgsim.__doc__)
    p.add_option("--erate", default=.02, type="float",
                 help="Base error rate of the read [default: %default]")
    p.add_option("--distance", default=500, type="int",
                 help="Outer distance between the two ends [default: %default]")
    p.add_option("--genomesize", type="int",
                 help="Genome size in Mb [default: estimate from data]")
    p.add_option("--readlen", default=100, type="int",
                 help="Length of the read [default: %default]")
    p.add_option("--noerrors", default=False, action="store_true",
                 help="Simulate reads with no errors [default: %default]")
    p.set_depth(depth=10)
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    fastafile, = args
    pf = fastafile.split(".")[0]

    genomesize = opts.genomesize
    size = genomesize * 1000000 if genomesize else Fasta(fastafile).totalsize
    depth = opts.depth
    readlen = opts.readlen
    readnum = size * depth // (2 * readlen)  # integer number of read pairs

    distance = opts.distance
    stdev = distance // 5  # insert size standard deviation, 1/5 of the distance

    outpf = "{0}.{1}bp.{2}x".format(pf, distance, depth)
    distance -= 2 * readlen  # Outer distance => Inner distance
    assert distance >= 0, "Outer distance must be >= 2 * readlen"

    logging.debug("Total genome size: {0} bp".format(size))
    logging.debug("Target depth: {0}x".format(depth))
    logging.debug("Number of read pairs (2x{0}): {1}".format(readlen, readnum))

    if opts.noerrors:
        opts.erate = 0

    cmd = "dwgsim -e {0} -E {0}".format(opts.erate)
    if opts.noerrors:
        cmd += " -r 0 -R 0 -X 0 -y 0"

    cmd += " -d {0} -s {1}".format(distance, stdev)
    cmd += " -N {0} -1 {1} -2 {1}".format(readnum, readlen)
    cmd += " {0} {1}".format(fastafile, outpf)
    sh(cmd)
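For concreteness, the read-pair arithmetic above with hypothetical numbers:

# size = 1,000,000 bp, depth = 10x, readlen = 100 bp
# readnum = 1,000,000 * 10 // (2 * 100) = 50,000 read pairs
# each pair contributes 2 * 100 sequenced bases, so 50,000 pairs give 10x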
Code example #56
File: ca.py Project: rrane/jcvi
def clr(args):
    """
    %prog clr blastfile fastafiles

    Calculate the vector clear range file based on BLAST hits to the vectors.
    """
    p = OptionParser(clr.__doc__)
    opts, args = p.parse_args(args)

    if len(args) < 2:
        sys.exit(not p.print_help())

    blastfile = args[0]
    fastafiles = args[1:]

    sizes = {}
    for fa in fastafiles:
        f = Fasta(fa)
        sizes.update(f.itersizes())

    b = Blast(blastfile)
    seen = set()
    for query, hits in b.iter_hits():

        qsize = sizes[query]
        vectors = list((x.qstart, x.qstop) for x in hits)
        vmin, vmax = range_minmax(vectors)

        left_size = vmin - 1
        right_size = qsize - vmax

        if left_size > right_size:
            clr_start, clr_end = 0, vmin
        else:
            clr_start, clr_end = vmax, qsize

        print "\t".join(str(x) for x in (query, clr_start, clr_end))
        del sizes[query]

    # queries with no vector hits are clear over their full length
    for q, size in sorted(sizes.items()):
        print("\t".join(str(x) for x in (q, 0, size)))
Code example #57
File: postprocess.py Project: Nicholas-NVS/jcvi
def overlapbatch(args):
    """
    %prog overlapbatch ctgfasta poolfasta

    Fish out the sequences in `poolfasta` that overlap with `ctgfasta`.
    Mix and combine using `minimus2`.
    """
    p = OptionParser(overlapbatch.__doc__)
    opts, args = p.parse_args(args)
    if len(args) != 2:
        sys.exit(not p.print_help())

    ctgfasta, poolfasta = args
    f = Fasta(ctgfasta)
    for k, rec in f.iteritems_ordered():
        fastafile = k + ".fasta"
        fw = open(fastafile, "w")
        SeqIO.write([rec], fw, "fasta")
        fw.close()

        overlap([fastafile, poolfasta])
Code example #58
File: mask.py Project: bennyyu/jcvi
def summary(args):
    """
    %prog summary fastafile

    Report the number of bases and sequences masked.
    """
    p = OptionParser(summary.__doc__)

    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    fastafile, = args
    f = Fasta(fastafile, index=False)

    halfmaskedseqs = set()
    allmasked = 0
    allbases = 0
    cutoff = 50
    for key, seq in f.iteritems():
        masked = 0
        for base in seq:
            if base not in "AGCT":
                masked += 1
        seqlen = len(seq)
        if masked * 100.0 / seqlen > cutoff:
            halfmaskedseqs.add(key)
        allmasked += masked
        allbases += seqlen

    seqnum = len(f)
    maskedseqnum = len(halfmaskedseqs)

    print("Total masked bases: {0}".format(percentage(allmasked, allbases)),
          file=sys.stderr)
    print("Total masked sequences (contain > {0}% masked): {1}".format(
        cutoff, percentage(maskedseqnum, seqnum)), file=sys.stderr)