Code example #1
File: discont.py  Project: tanghaibao/online-judge
def main(arg):
    f = Fasta(arg)
    s = [str(x.seq) for k, x in f.iteritems_ordered()]
    m = s[0]
    for z in s[1:]:
        m = m.replace(z, "")
    print Seq(m).translate().strip("*")
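
For context, `main` removes each later record's sequence from the first record, then translates what remains. A toy re-run of the same string logic (hypothetical data, using Biopython's Seq):

from Bio.Seq import Seq

s = ["ATGGTTTAACCC", "CCC"]        # first record plus one fragment to excise
m = s[0]
for z in s[1:]:
    m = m.replace(z, "")           # drop each later sequence verbatim
print(str(Seq(m).translate()).strip("*"))   # "ATGGTTTAA" translates to "MV"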
Code example #2
File: kmer.py  Project: biologyguy/jcvi
def dump(args):
    """
    %prog dump fastafile

    Convert FASTA sequences to a list of K-mers.
    """
    p = OptionParser(dump.__doc__)
    p.add_option("-K",
                 default=23,
                 type="int",
                 help="K-mer size [default: %default]")
    p.set_outfile()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    fastafile, = args
    K = opts.K
    fw = must_open(opts.outfile, "w")
    f = Fasta(fastafile, lazy=True)
    for name, rec in f.iteritems_ordered():
        kmers = list(make_kmers(rec.seq, K))
        print >> fw, "\n".join(kmers)
    fw.close()
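
`make_kmers` is defined elsewhere in kmer.py; a minimal equivalent consistent with its use here (an assumption, not the project's exact code) would be:

def make_kmers(seq, K):
    # Yield every overlapping K-mer of `seq` as a plain string.
    seq = str(seq)
    for i in range(len(seq) - K + 1):
        yield seq[i:i + K]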
Code example #3
def fragment(args):
    """
    %prog fragment fastafile enzyme

    Cut the fastafile using the specified enzyme, and grab upstream and
    downstream nucleotide sequence along with the cut site. In this case, the
    sequences extracted are:

                |- PstI
    ============|===========
            (-------)

    Sometimes we need to limit the size of the restriction fragments, for
    example the GBS protocol does not allow fragments larger than 800bp.

           |-PstI        |- PstI              |- PstI
    ~~~====|=============|==========~~~~~~~===|============
           (---)     (---)

    Here the second fragment is longer than 800bp, so its two ends are NOT
    extracted, unlike the first fragment.
    """
    p = OptionParser(fragment.__doc__)
    p.add_option(
        "--flank",
        default=150,
        type="int",
        help="Extract flanking bases of the cut sites",
    )
    p.add_option(
        "--full",
        default=False,
        action="store_true",
        help="The full extraction mode",
    )
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    fastafile, enzyme = args
    flank = opts.flank
    assert flank > 0
    extract = extract_full if opts.full else extract_ends
    tag = "full" if opts.full else "ends"

    assert enzyme in set(str(x) for x in AllEnzymes)
    fragfastafile = fastafile.split(".")[0] + ".{0}.flank{1}.{2}.fasta".format(
        enzyme, flank, tag
    )
    enzyme = [x for x in AllEnzymes if str(x) == enzyme][0]

    f = Fasta(fastafile, lazy=True)
    fw = open(fragfastafile, "w")
    for name, rec in f.iteritems_ordered():
        a = Analysis([enzyme], rec.seq)
        sites = a.full()[enzyme]
        extract(rec, sites, flank, fw)

    logging.debug("Fragments written to `{0}`.".format(fragfastafile))
Code example #4
File: restriction.py  Project: zjwang6/jcvi
def digest(args):
    """
    %prog digest fastafile NspI,BfuCI

    Digest fasta sequences to map restriction site positions.
    """
    p = OptionParser(digest.__doc__)
    p.set_outfile()
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    fastafile, enzymes = args
    enzymes = enzymes.split(",")
    enzymes = [x for x in AllEnzymes if str(x) in enzymes]
    f = Fasta(fastafile, lazy=True)
    fw = must_open(opts.outfile, "w")

    header = ["Contig", "Length"] + [str(x) for x in enzymes]
    print("\t".join(header), file=fw)
    for name, rec in f.iteritems_ordered():
        row = [name, len(rec)]
        for e in enzymes:
            pos = e.search(rec.seq)
            pos = "na" if not pos else "|".join(str(x) for x in pos)
            row.append(pos)
        print("\t".join(str(x) for x in row), file=fw)
Code example #5
File: goldenpath.py  Project: Hensonmw/jcvi
def flip(args):
    """
    %prog flip fastafile

    Go through each FASTA record, check against the GenBank file, and determine
    whether or not to flip the sequence. This is useful before sequence updates,
    to make sure the same orientation is used.
    """
    p = OptionParser(flip.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    fastafile, = args
    outfastafile = fastafile.rsplit(".", 1)[0] + ".flipped.fasta"
    fo = open(outfastafile, "w")
    f = Fasta(fastafile, lazy=True)
    for name, rec in f.iteritems_ordered():
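        # Dump this record to a throwaway FASTA so overlap() can align it
        # against the corresponding GenBank entry of the same name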
        tmpfasta = "a.fasta"
        fw = open(tmpfasta, "w")
        SeqIO.write([rec], fw, "fasta")
        fw.close()

        o = overlap([tmpfasta, name])
        if o.orientation == '-':
            rec.seq = rec.seq.reverse_complement()

        SeqIO.write([rec], fo, "fasta")
        os.remove(tmpfasta)
Code example #6
File: ks.py  Project: LongZhao1992/jcvi
def get_GC3(cdsfile):
    from jcvi.formats.fasta import Fasta

    f = Fasta(cdsfile, lazy=True)
    GC3 = {}
    for name, rec in f.iteritems_ordered():
        positions = rec.seq[2::3].upper()
        gc_counts = sum(1 for x in positions if x in "GC")
        gc_ratio = gc_counts * 1. / len(positions)
        GC3[name] = gc_ratio

    return GC3
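
GC3 is the G+C fraction at third codon positions; `rec.seq[2::3]` takes every third base starting from index 2. A quick sanity check in plain Python:

seq = "ATGGCCAAT"        # codons ATG GCC AAT; third bases are G, C, T
third = seq[2::3]        # -> "GCT"
print(sum(1 for x in third if x in "GC") / float(len(third)))   # -> 0.666...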
Code example #7
File: restriction.py  Project: rrane/jcvi
def fragment(args):
    """
    %prog fragment fastafile enzyme

    Cut the fastafile using the specified enzyme, and grab upstream and
    downstream nucleotide sequence along with the cut site. In this case, the
    sequences extracted are:

                |- PstI
    ============|===========
            (-------)

    Sometimes we need to limit the size of the restriction fragments, for
    example the GBS protocol does not allow fragments larger than 800bp.

           |-PstI        |- PstI              |- PstI
    ~~~====|=============|==========~~~~~~~===|============
           (---)     (---)

    Here the second fragment is longer than 800bp, so its two ends are NOT
    extracted, unlike the first fragment.
    """
    p = OptionParser(fragment.__doc__)
    p.add_option("--flank", default=150, type="int",
            help="Extract flanking bases of the cut sites [default: %default]")
    p.add_option("--full", default=False, action="store_true",
            help="The full extraction mode [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    fastafile, enzyme = args
    flank = opts.flank
    assert flank > 0
    extract = extract_full if opts.full else extract_ends
    tag = "full" if opts.full else "ends"

    assert enzyme in set(str(x) for x in AllEnzymes)
    fragfastafile = fastafile.split(".")[0] + \
        ".{0}.flank{1}.{2}.fasta".format(enzyme, flank, tag)
    enzyme = [x for x in AllEnzymes if str(x) == enzyme][0]

    f = Fasta(fastafile, lazy=True)
    fw = open(fragfastafile, "w")
    for name, rec in f.iteritems_ordered():
        a = Analysis([enzyme], rec.seq)
        sites = a.full()[enzyme]
        extract(rec, sites, flank, fw)

    logging.debug("Fragments written to `{0}`.".format(fragfastafile))
Code example #8
def merge(args):
    """
    %prog merge gffiles

    Merge several gff files into one. When only one file is given, it is assumed
    to be a file with a list of gff files.
    """
    p = OptionParser(merge.__doc__)
    set_outfile(p)

    opts, args = p.parse_args(args)

    nargs = len(args)
    if nargs < 1:
        sys.exit(not p.print_help())

    if nargs == 1:
        listfile, = args
        fp = open(listfile)
        gffiles = [x.strip() for x in fp]
    else:
        gffiles = args

    outfile = opts.outfile

    deflines = set()
    fw = must_open(outfile, "w")
    fastarecs = {}
    for gffile in gffiles:
        fp = open(gffile)
        for row in fp:
            row = row.rstrip()
            if row.startswith('#'):
                if row == FastaTag:
                    break
                if row in deflines:
                    continue
                else:
                    deflines.add(row)

            print >> fw, row

        f = Fasta(gffile, lazy=True)
        for key, rec in f.iteritems_ordered():
            if key in fastarecs:
                continue
            fastarecs[key] = rec

    print >> fw, FastaTag
    SeqIO.write(fastarecs.values(), fw, "fasta")
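
`FastaTag` is presumably the GFF3 "##FASTA" directive that separates the feature section from embedded sequences. Each comment/directive line is kept once, feature rows are copied through, and FASTA records are de-duplicated by ID, so the merged file has a layout like (illustrative):

##gff-version 3
chr1    example gene    1       5000    .       +       .       ID=gene1
##FASTA
>chr1
ACGTACGT...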
Code example #9
def count(args):
    """
    %prog count fastafile jf.db

    Run dump -> jellyfish -> bin -> bincount in series.
    """
    from bitarray import bitarray

    p = OptionParser(count.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    fastafile, jfdb = args
    K = get_K(jfdb)
    cmd = "jellyfish query {0} -C | cut -d' ' -f 2".format(jfdb)
    t = must_open("tmp", "w")
    proc = Popen(cmd, stdin=PIPE, stdout=t)
    t.flush()

    f = Fasta(fastafile, lazy=True)
    for name, rec in f.iteritems_ordered():
        kmers = list(make_kmers(rec.seq, K))
        print("\n".join(kmers), file=proc.stdin)
    proc.stdin.close()
    logging.debug(cmd)
    proc.wait()

    a = bitarray()
    binfile = ".".join((fastafile, jfdb, "bin"))
    fw = open(binfile, "w")
    t.seek(0)
    for row in t:
        c = row.strip()
        a.append(int(c))
    a.tofile(fw)
    logging.debug("Serialize {0} bits to `{1}`.".format(len(a), binfile))
    fw.close()
    sh("rm {0}".format(t.name))

    logging.debug(
        "Shared K-mers (K={0}) between `{1}` and `{2}` written to `{3}`.".format(
            K, fastafile, jfdb, binfile
        )
    )
    cntfile = ".".join((fastafile, jfdb, "cnt"))
    bincount([fastafile, binfile, "-o", cntfile, "-K {0}".format(K)])
    logging.debug("Shared K-mer counts written to `{0}`.".format(cntfile))
Code example #10
File: kmer.py  Project: zachary-zzc/jcvi
def count(args):
    """
    %prog count fastafile jf.db

    Run dump -> jellyfish -> bin -> bincount in series.
    """
    from bitarray import bitarray

    p = OptionParser(count.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    fastafile, jfdb = args
    K = get_K(jfdb)
    cmd = "jellyfish query {0} -C | cut -d' ' -f 2".format(jfdb)
    t = must_open("tmp", "w")
    proc = Popen(cmd, stdin=PIPE, stdout=t)
    t.flush()

    f = Fasta(fastafile, lazy=True)
    for name, rec in f.iteritems_ordered():
        kmers = list(make_kmers(rec.seq, K))
        print >> proc.stdin, "\n".join(kmers)
    proc.stdin.close()
    logging.debug(cmd)
    proc.wait()

    a = bitarray()
    binfile = ".".join((fastafile, jfdb, "bin"))
    fw = open(binfile, "w")
    t.seek(0)
    for row in t:
        c = row.strip()
        a.append(int(c))
    a.tofile(fw)
    logging.debug("Serialize {0} bits to `{1}`.".format(len(a), binfile))
    fw.close()
    sh("rm {0}".format(t.name))

    logging.debug("Shared K-mers (K={0}) between `{1}` and `{2}` written to `{3}`.".\
                    format(K, fastafile, jfdb, binfile))
    cntfile = ".".join((fastafile, jfdb, "cnt"))
    bincount([fastafile, binfile, "-o", cntfile, "-K {0}".format(K)])
    logging.debug("Shared K-mer counts written to `{0}`.".format(cntfile))
Code example #11
File: postprocess.py  Project: radaniba/jcvi
def overlapbatch(args):
    """
    %prog overlapbatch ctgfasta poolfasta

    Fish out the sequences in `poolfasta` that overlap with `ctgfasta`.
    Mix and combine using `minimus2`.
    """
    p = OptionParser(overlapbatch.__doc__)
    opts, args = p.parse_args(args)
    if len(args) != 2:
        sys.exit(not p.print_help())

    ctgfasta, poolfasta = args
    f = Fasta(ctgfasta)
    for k, rec in f.iteritems_ordered():
        fastafile = k + ".fasta"
        fw = open(fastafile, "w")
        SeqIO.write([rec], fw, "fasta")
        fw.close()

        overlap([fastafile, poolfasta])
Code example #12
File: kmer.py  Project: arvin580/jcvi
def dump(args):
    """
    %prog dump fastafile

    Convert FASTA sequences to a list of K-mers.
    """
    p = OptionParser(dump.__doc__)
    p.add_option("-K", default=23, type="int", help="K-mer size [default: %default]")
    p.set_outfile()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    fastafile, = args
    K = opts.K
    fw = must_open(opts.outfile, "w")
    f = Fasta(fastafile, lazy=True)
    for name, rec in f.iteritems_ordered():
        kmers = list(make_kmers(rec.seq, K))
        print >> fw, "\n".join(kmers)
    fw.close()
Code example #13
File: ca.py  Project: arvin580/jcvi
def shred(args):
    """
    %prog shred fastafile

    Similar to the `shredContig` method in the runCA script. The contigs are
    shredded into pseudo-reads of a given length and depth.
    """
    p = OptionParser(shred.__doc__)
    p.set_depth(depth=2)
    p.add_option("--readlen", default=1000, type="int", help="Desired length of the reads [default: %default]")
    p.add_option("--minctglen", default=0, type="int", help="Ignore contig sequence less than [default: %default]")
    p.add_option("--shift", default=50, type="int", help="Overlap between reads must be at least [default: %default]")
    p.add_option(
        "--fasta",
        default=False,
        action="store_true",
        help="Output shredded reads as FASTA sequences [default: %default]",
    )
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    fastafile, = args
    libID = fastafile.split(".")[0]
    depth = opts.depth
    readlen = opts.readlen
    shift = opts.shift

    outfile = libID + ".depth{0}".format(depth)
    if opts.fasta:
        outfile += ".fasta"
    else:
        outfile += ".frg"
    f = Fasta(fastafile, lazy=True)

    fw = must_open(outfile, "w", checkexists=True)
    if not opts.fasta:
        print >> fw, headerTemplate.format(libID=libID)

    """
    Taken from runCA:

                    |*********|
                    |###################|
    |--------------------------------------------------|
     ---------------1---------------
               ---------------2---------------
                         ---------------3---------------
    *** - center_increments
    ### - center_range_width
    """
    for ctgID, (name, rec) in enumerate(f.iteritems_ordered()):
        seq = rec.seq
        seqlen = len(seq)
        if seqlen < opts.minctglen:
            continue

        shredlen = min(seqlen - shift, readlen)
        numreads = max(seqlen * depth / shredlen, 1)
        center_range_width = seqlen - shredlen

        ranges = []
        if depth == 1:
            if seqlen < readlen:
                ranges.append((0, seqlen))
            else:
                for begin in xrange(0, seqlen, readlen - shift):
                    end = min(seqlen, begin + readlen)
                    ranges.append((begin, end))
        else:
            if numreads == 1:
                ranges.append((0, shredlen))
            else:
                prev_begin = -1
                center_increments = center_range_width * 1.0 / (numreads - 1)
                for i in xrange(numreads):
                    begin = center_increments * i
                    end = begin + shredlen
                    begin, end = int(begin), int(end)

                    if begin == prev_begin:
                        continue

                    ranges.append((begin, end))
                    prev_begin = begin

        for shredID, (begin, end) in enumerate(ranges):
            shredded_seq = seq[begin:end]
            fragID = "{0}.{1}.frag{2}.{3}-{4}".format(libID, ctgID, shredID, begin, end)
            emitFragment(fw, fragID, libID, shredded_seq, fasta=opts.fasta)

    fw.close()
    logging.debug("Shredded reads are written to `{0}`.".format(outfile))
    return outfile
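
For depth > 1 the read starts are spread evenly so their centers tile the contig; a toy run of the same arithmetic (seqlen=100, readlen=40, shift=5, depth=2):

seqlen, readlen, shift, depth = 100, 40, 5, 2
shredlen = min(seqlen - shift, readlen)             # 40
numreads = max(seqlen * depth // shredlen, 1)       # 5
step = (seqlen - shredlen) * 1.0 / (numreads - 1)   # 15.0
print([(int(i * step), int(i * step) + shredlen) for i in range(numreads)])
# [(0, 40), (15, 55), (30, 70), (45, 85), (60, 100)]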
Code example #14
File: postprocess.py  Project: radaniba/jcvi
def overlap(args):
    """
    %prog overlap ctgfasta poolfasta

    Fish out the sequences in `poolfasta` that overlap with `ctgfasta`.
    Mix and combine using `minimus2`.
    """
    p = OptionParser(overlap.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    ctgfasta, poolfasta = args
    prefix = ctgfasta.split(".")[0]
    rid = list(Fasta(ctgfasta).iterkeys())
    assert len(rid) == 1, "Use overlapbatch() to improve a multi-FASTA file"

    rid = rid[0]
    splitctgfasta = ctgfasta.rsplit(".", 1)[0] + ".split.fasta"
    ctgfasta = run_gapsplit(infile=ctgfasta, outfile=splitctgfasta)

    # Run BLAST
    blastfile = ctgfasta + ".blast"
    run_megablast(infile=ctgfasta, outfile=blastfile, db=poolfasta)

    # Extract contigs and merge using minimus2
    closuredir = prefix + ".closure"
    closure = False
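    # need_update() presumably compares timestamps: redo the closure step only
    # when the BLAST output is newer than the existing closure directory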
    if need_update(blastfile, closuredir):
        mkdir(closuredir, overwrite=True)
        closure = True

    if closure:
        idsfile = op.join(closuredir, prefix + ".ids")
        cmd = "cut -f2 {0} | sort -u".format(blastfile)
        sh(cmd, outfile=idsfile)

        idsfastafile = op.join(closuredir, prefix + ".ids.fasta")
        cmd = "faSomeRecords {0} {1} {2}".format(poolfasta, idsfile,
                                                 idsfastafile)
        sh(cmd)

        # This step is a hack to weight the bases from original sequences more
        # than the pulled sequences, by literally adding another copy to be used
        # in consensus calls.
        redundantfastafile = op.join(closuredir, prefix + ".redundant.fasta")
        format([ctgfasta, redundantfastafile, "--prefix=RED."])

        mergedfastafile = op.join(closuredir, prefix + ".merged.fasta")
        cmd = "cat {0} {1} {2}".format(ctgfasta, redundantfastafile,
                                       idsfastafile)
        sh(cmd, outfile=mergedfastafile)

        afgfile = op.join(closuredir, prefix + ".afg")
        cmd = "toAmos -s {0} -o {1}".format(mergedfastafile, afgfile)
        sh(cmd)

        cwd = os.getcwd()
        os.chdir(closuredir)
        cmd = "minimus2 {0} -D REFCOUNT=0".format(prefix)
        cmd += " -D OVERLAP=100 -D MINID=98"
        sh(cmd)
        os.chdir(cwd)

    # Analyze output, make sure that:
    # + Get the singletons of the original set back
    # + Drop any contig that is comprised entirely of pulled set
    originalIDs = set(Fasta(ctgfasta).iterkeys())
    minimuscontig = op.join(closuredir, prefix + ".contig")
    c = ContigFile(minimuscontig)
    excludecontigs = set()
    for rec in c.iter_records():
        reads = set(x.id for x in rec.reads)
        if reads.isdisjoint(originalIDs):
            excludecontigs.add(rec.id)

    logging.debug("Exclude contigs: {0}".\
            format(", ".join(sorted(excludecontigs))))

    finalfasta = prefix + ".improved.fasta_"
    fw = open(finalfasta, "w")
    minimusfasta = op.join(closuredir, prefix + ".fasta")
    f = Fasta(minimusfasta)
    for id, rec in f.iteritems_ordered():
        if id in excludecontigs:
            continue
        SeqIO.write([rec], fw, "fasta")

    singletonfile = op.join(closuredir, prefix + ".singletons")
    singletons = set(x.strip() for x in open(singletonfile))
    leftovers = singletons & originalIDs

    logging.debug("Pull leftover singletons: {0}".\
            format(", ".join(sorted(leftovers))))

    f = Fasta(ctgfasta)
    for id, rec in f.iteritems_ordered():
        if id not in leftovers:
            continue
        SeqIO.write([rec], fw, "fasta")

    fw.close()

    fastafile = finalfasta
    finalfasta = fastafile.rstrip("_")
    format([
        fastafile, finalfasta, "--sequential", "--pad0=3",
        "--prefix={0}_".format(rid)
    ])

    logging.debug("Improved FASTA written to `{0}`.".format(finalfasta))

    n50([ctgfasta])
    n50([finalfasta])

    errlog = "error.log"
    for f in (fastafile, blastfile, errlog):
        if op.exists(f):
            os.remove(f)
Code example #15
File: preprocess.py  Project: Nicholas-NVS/jcvi
def expand(args):
    """
    %prog expand bes.fasta reads.fastq

    Expand sequences using short reads. Useful, for example, for getting BAC-end
    sequences. The template in `bes.fasta` may contain just the junction
    sequences; the reads are then aligned to recover the 'flanks' of such sequences.
    """
    import math

    from jcvi.formats.fasta import Fasta, SeqIO
    from jcvi.formats.fastq import readlen, first, fasta
    from jcvi.formats.blast import Blast
    from jcvi.formats.base import FileShredder
    from jcvi.apps.bowtie import align, get_samfile
    from jcvi.apps.align import blast

    p = OptionParser(expand.__doc__)
    p.set_depth(depth=200)
    p.set_firstN()
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    bes, reads = args
    size = Fasta(bes).totalsize
    rl = readlen([reads])
    expected_size = size + 2 * rl
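    # Read budget: cover the template plus one read length of flank on each
    # side at the requested depth, rounded up to the nearest thousand reads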
    nreads = expected_size * opts.depth / rl
    nreads = int(math.ceil(nreads / 1000.)) * 1000

    # Attract reads
    samfile, logfile = align([bes, reads, "--reorder", "--mapped",
           "--firstN={0}".format(opts.firstN)])

    samfile, mapped, _ = get_samfile(reads, bes, bowtie=True, mapped=True)
    logging.debug("Extract first {0} reads from `{1}`.".format(nreads, mapped))

    pf = mapped.split(".")[0]
    pf = pf.split("-")[0]
    bespf = bes.split(".")[0]
    reads = pf + ".expand.fastq"
    first([str(nreads), mapped, "-o", reads])

    # Perform mini-assembly
    fastafile = reads.rsplit(".", 1)[0] + ".fasta"
    qualfile = ""
    if need_update(reads, fastafile):
        fastafile, qualfile = fasta([reads])

    contigs = op.join(pf, "454LargeContigs.fna")
    if need_update(fastafile, contigs):
        cmd = "runAssembly -o {0} -cpu 8 {1}".format(pf, fastafile)
        sh(cmd)
    assert op.exists(contigs)

    # Annotate contigs
    blastfile = blast([bes, contigs])
    mapping = {}
    for query, b in Blast(blastfile).iter_best_hit():
        mapping[query] = b

    f = Fasta(contigs, lazy=True)
    annotatedfasta = ".".join((pf, bespf, "fasta"))
    fw = open(annotatedfasta, "w")
    keys = list(Fasta(bes).iterkeys_ordered())  # keep an ordered list
    recs = []
    for key, v in f.iteritems_ordered():
        vid = v.id
        if vid not in mapping:
            continue
        b = mapping[vid]
        subject = b.subject
        rec = v.reverse_complement() if b.orientation == '-' else v
        rec.id = rid = "_".join((pf, vid, subject))
        rec.description = ""
        recs.append((keys.index(subject), rid, rec))

    recs = [x[-1] for x in sorted(recs)]
    SeqIO.write(recs, fw, "fasta")
    fw.close()

    FileShredder([samfile, logfile, mapped, reads, fastafile, qualfile, blastfile, pf])
    logging.debug("Annotated seqs (n={0}) written to `{1}`.".\
                    format(len(recs), annotatedfasta))

    return annotatedfasta
Code example #16
File: postprocess.py  Project: Nicholas-NVS/jcvi
def overlap(args):
    """
    %prog overlap ctgfasta poolfasta

    Fish out the sequences in `poolfasta` that overlap with `ctgfasta`.
    Mix and combine using `minimus2`.
    """
    p = OptionParser(overlap.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    ctgfasta, poolfasta = args
    prefix = ctgfasta.split(".")[0]
    rid = list(Fasta(ctgfasta).iterkeys())
    assert len(rid) == 1, "Use overlapbatch() to improve a multi-FASTA file"

    rid = rid[0]
    splitctgfasta = ctgfasta.rsplit(".", 1)[0] + ".split.fasta"
    ctgfasta = run_gapsplit(infile=ctgfasta, outfile=splitctgfasta)

    # Run BLAST
    blastfile = ctgfasta + ".blast"
    run_megablast(infile=ctgfasta, outfile=blastfile, db=poolfasta)

    # Extract contigs and merge using minimus2
    closuredir = prefix + ".closure"
    closure = False
    if need_update(blastfile, closuredir):
        mkdir(closuredir, overwrite=True)
        closure = True

    if closure:
        idsfile = op.join(closuredir, prefix + ".ids")
        cmd = "cut -f2 {0} | sort -u".format(blastfile)
        sh(cmd, outfile=idsfile)

        idsfastafile = op.join(closuredir, prefix + ".ids.fasta")
        cmd = "faSomeRecords {0} {1} {2}".format(poolfasta, idsfile, idsfastafile)
        sh(cmd)

        # This step is a hack to weight the bases from original sequences more
        # than the pulled sequences, by literally adding another copy to be used
        # in consensus calls.
        redundantfastafile = op.join(closuredir, prefix + ".redundant.fasta")
        format([ctgfasta, redundantfastafile, "--prefix=RED."])

        mergedfastafile = op.join(closuredir, prefix + ".merged.fasta")
        cmd = "cat {0} {1} {2}".format(ctgfasta, redundantfastafile, idsfastafile)
        sh(cmd, outfile=mergedfastafile)

        afgfile = op.join(closuredir, prefix + ".afg")
        cmd = "toAmos -s {0} -o {1}".format(mergedfastafile, afgfile)
        sh(cmd)

        cwd = os.getcwd()
        os.chdir(closuredir)
        cmd = "minimus2 {0} -D REFCOUNT=0".format(prefix)
        cmd += " -D OVERLAP=100 -D MINID=98"
        sh(cmd)
        os.chdir(cwd)

    # Analyze output, make sure that:
    # + Get the singletons of the original set back
    # + Drop any contig that is comprised entirely of pulled set
    originalIDs = set(Fasta(ctgfasta).iterkeys())
    minimuscontig = op.join(closuredir, prefix + ".contig")
    c = ContigFile(minimuscontig)
    excludecontigs = set()
    for rec in c.iter_records():
        reads = set(x.id for x in rec.reads)
        if reads.isdisjoint(originalIDs):
            excludecontigs.add(rec.id)

    logging.debug("Exclude contigs: {0}".\
            format(", ".join(sorted(excludecontigs))))

    finalfasta = prefix + ".improved.fasta_"
    fw = open(finalfasta, "w")
    minimusfasta = op.join(closuredir, prefix + ".fasta")
    f = Fasta(minimusfasta)
    for id, rec in f.iteritems_ordered():
        if id in excludecontigs:
            continue
        SeqIO.write([rec], fw, "fasta")

    singletonfile = op.join(closuredir, prefix + ".singletons")
    singletons = set(x.strip() for x in open(singletonfile))
    leftovers = singletons & originalIDs

    logging.debug("Pull leftover singletons: {0}".\
            format(", ".join(sorted(leftovers))))

    f = Fasta(ctgfasta)
    for id, rec in f.iteritems_ordered():
        if id not in leftovers:
            continue
        SeqIO.write([rec], fw, "fasta")

    fw.close()

    fastafile = finalfasta
    finalfasta = fastafile.rstrip("_")
    format([fastafile, finalfasta, "--sequential", "--pad0=3",
        "--prefix={0}_".format(rid)])

    logging.debug("Improved FASTA written to `{0}`.".format(finalfasta))

    n50([ctgfasta])
    n50([finalfasta])

    errlog = "error.log"
    for f in (fastafile, blastfile, errlog):
        if op.exists(f):
            os.remove(f)
Code example #17
def longest(args):
    """
    %prog longest pasa.fasta output.subclusters.out

    Find the longest PASA assembly and label it as full-length. Also remove
    transcripts shorter than half the length of the longest, or shorter than
    200bp. The assemblies for the same locus are found in
    `output.subclusters.out`, in lines that look like:

    sub-cluster: asmbl_25 asmbl_26 asmbl_27
    """
    from jcvi.formats.fasta import Fasta, SeqIO
    from jcvi.formats.sizes import Sizes

    p = OptionParser(longest.__doc__)
    p.add_option("--prefix", default="pasa",
                 help="Replace asmbl_ with prefix [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    fastafile, subclusters = args
    prefix = fastafile.rsplit(".", 1)[0]

    idsfile = prefix + ".fl.ids"
    fw = open(idsfile, "w")
    sizes = Sizes(fastafile).mapping

    name_convert = lambda x: x.replace("asmbl", opts.prefix)

    keep = set()  # List of IDs to write
    fp = open(subclusters)
    nrecs = 0
    for row in fp:
        if not row.startswith("sub-cluster:"):
            continue
        asmbls = row.split()[1:]
        longest_asmbl = max(asmbls, key=lambda x: sizes[x])
        longest_size = sizes[longest_asmbl]
        print(name_convert(longest_asmbl), file=fw)
        nrecs += 1
        cutoff = max(longest_size / 2, 200)
        keep.update(set(x for x in asmbls if sizes[x] >= cutoff))

    fw.close()
    logging.debug("{0} fl-cDNA records written to `{1}`.".format(nrecs, idsfile))

    f = Fasta(fastafile, lazy=True)
    newfastafile = prefix + ".clean.fasta"
    fw = open(newfastafile, "w")
    nrecs = 0
    for name, rec in f.iteritems_ordered():
        if name not in keep:
            continue

        rec.id = name_convert(name)
        rec.description = ""
        SeqIO.write([rec], fw, "fasta")
        nrecs += 1

    fw.close()
    logging.debug("{0} valid records written to `{1}`.".format(nrecs, newfastafile))
Code example #18
def shred(args):
    """
    %prog shred fastafile

    Similar to the `shredContig` method in the runCA script. The contigs are
    shredded into pseudo-reads of a given length and depth.
    """
    p = OptionParser(shred.__doc__)
    p.set_depth(depth=2)
    p.add_option("--readlen",
                 default=1000,
                 type="int",
                 help="Desired length of the reads [default: %default]")
    p.add_option("--minctglen",
                 default=0,
                 type="int",
                 help="Ignore contig sequence less than [default: %default]")
    p.add_option(
        "--shift",
        default=50,
        type="int",
        help="Overlap between reads must be at least [default: %default]")
    p.add_option(
        "--fasta",
        default=False,
        action="store_true",
        help="Output shredded reads as FASTA sequences [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    fastafile, = args
    libID = fastafile.split(".")[0]
    depth = opts.depth
    readlen = opts.readlen
    shift = opts.shift

    outfile = libID + ".depth{0}".format(depth)
    if opts.fasta:
        outfile += ".fasta"
    else:
        outfile += ".frg"
    f = Fasta(fastafile, lazy=True)

    fw = must_open(outfile, "w", checkexists=True)
    if not opts.fasta:
        print >> fw, headerTemplate.format(libID=libID)
    """
    Taken from runCA:

                    |*********|
                    |###################|
    |--------------------------------------------------|
     ---------------1---------------
               ---------------2---------------
                         ---------------3---------------
    *** - center_increments
    ### - center_range_width
    """
    for ctgID, (name, rec) in enumerate(f.iteritems_ordered()):
        seq = rec.seq
        seqlen = len(seq)
        if seqlen < opts.minctglen:
            continue

        shredlen = min(seqlen - shift, readlen)
        numreads = max(seqlen * depth / shredlen, 1)
        center_range_width = seqlen - shredlen

        ranges = []
        if depth == 1:
            if seqlen < readlen:
                ranges.append((0, seqlen))
            else:
                for begin in xrange(0, seqlen, readlen - shift):
                    end = min(seqlen, begin + readlen)
                    ranges.append((begin, end))
        else:
            if numreads == 1:
                ranges.append((0, shredlen))
            else:
                prev_begin = -1
                center_increments = center_range_width * 1. / (numreads - 1)
                for i in xrange(numreads):
                    begin = center_increments * i
                    end = begin + shredlen
                    begin, end = int(begin), int(end)

                    if begin == prev_begin:
                        continue

                    ranges.append((begin, end))
                    prev_begin = begin

        for shredID, (begin, end) in enumerate(ranges):
            shredded_seq = seq[begin:end]
            fragID = "{0}.{1}.frag{2}.{3}-{4}".format(libID, ctgID, shredID,
                                                      begin, end)
            emitFragment(fw, fragID, libID, shredded_seq, fasta=opts.fasta)

    fw.close()
    logging.debug("Shredded reads are written to `{0}`.".format(outfile))
    return outfile