def flip(args):
    """
    %prog flip fastafile

    Go through each FASTA record, check against Genbank file and determines
    whether or not to flip the sequence. This is useful before updates of the
    sequences to make sure the same orientation is used.
    """
    # The docstring above doubles as the usage text handed to the parser.
    p = OptionParser(flip.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    fastafile, = args
    # Output sits next to the input: <stem>.flipped.fasta
    outfastafile = fastafile.rsplit(".", 1)[0] + ".flipped.fasta"
    # Scratch file in the current directory, rewritten for every record.
    tmpfasta = "a.fasta"

    f = Fasta(fastafile, lazy=True)
    # `with` guarantees the output handle is flushed and closed, even when a
    # record fails half way through.
    with open(outfastafile, "w") as fo:
        for name, rec in f.iteritems_ordered():
            with open(tmpfasta, "w") as fw:
                SeqIO.write([rec], fw, "fasta")

            try:
                # overlap() receives the single-record file and the record
                # name; its result exposes the relative `orientation`.
                o = overlap([tmpfasta, name])
                if o.orientation == '-':
                    rec.seq = rec.seq.reverse_complement()

                SeqIO.write([rec], fo, "fasta")
            finally:
                # Never leave the scratch file behind, even on failure.
                os.remove(tmpfasta)
def filter(args):
    """
    %prog filter consensus.fasta

    Filter consensus sequence with min cluster size.
    """
    from jcvi.formats.fasta import Fasta, SeqIO

    # The docstring above is reused as the command-line usage text.
    parser = OptionParser(filter.__doc__)
    parser.add_option("--minsize", default=10, type="int",
                      help="Minimum cluster size")
    parser.set_outfile()
    opts, args = parser.parse_args(args)

    if len(args) != 1:
        sys.exit(not parser.print_help())

    fastafile = args[0]
    threshold = opts.minsize

    records = Fasta(fastafile, lazy=True)
    out = must_open(opts.outfile, "w")
    for desc, rec in records.iterdescriptions_ordered():
        # Singleton entries are dropped; everything else is expected to be
        # described as: consensus_for_cluster_0 with 63 sequences
        if not desc.startswith("singleton"):
            name, w, size, seqs = desc.split()
            assert w == "with"
            # Keep only clusters that reach the requested size.
            if int(size) >= threshold:
                SeqIO.write(rec, out, "fasta")
def extract_ends(rec, sites, flank, fw, maxfragsize=800):
    """
    Write the flanks next to each site of `rec` to `fw` in FASTA format.

    For every position in `sites` (indexed in the given order) up to two
    records are written, named "<rec.name>:<site>L" and "<rec.name>:<site>R":

    - the left flank (`flank` bases before the site) when the site is the
      first one or lies within `maxfragsize` of the previous site;
    - the right flank (`flank` bases after the site) when the site is the
      last one or lies within `maxfragsize` of the next site.

    A flank facing the start/end of the sequence is dropped when that end is
    more than `maxfragsize` away. Leading/trailing N's are stripped from each
    flank before it is written.
    """
    last = len(sites) - 1
    seqlen = len(rec)

    for idx, site in enumerate(sites):
        base_id = "{0}:{1}".format(rec.name, site)
        selected = []

        # Left flank
        if idx == 0 or site - sites[idx - 1] <= maxfragsize:
            start = max(site - flank, 0)
            left_rec = SeqRecord(rec.seq[start:site].strip("Nn"),
                                 id=base_id + "L", description="")
            # The first site is skipped when the contig L-end is too far away
            if not (idx == 0 and site > maxfragsize):
                selected.append(left_rec)

        # Right flank
        if idx == last or sites[idx + 1] - site <= maxfragsize:
            stop = min(site + flank, seqlen)
            right_rec = SeqRecord(rec.seq[site:stop].strip("Nn"),
                                  id=base_id + "R", description="")
            # The last site is skipped when the contig R-end is too far away
            if not (idx == last and seqlen - site > maxfragsize):
                selected.append(right_rec)

        SeqIO.write(selected, fw, "fasta")
def extract(args):
    """
    %prog extract gffile

    --contigs: Extract particular contig(s) from the gff file. If multiple contigs are
    involved, use "," to separate, e.g. "contig_12,contig_150"
    --names: Provide a file with IDs, one each line
    """
    # The docstring above doubles as the usage text handed to the parser.
    p = OptionParser(extract.__doc__)
    p.add_option("--contigs",
                 help="Extract features from certain contigs [default: %default]")
    p.add_option("--names",
                 help="Extract features with certain names [default: %default]")
    p.add_option("--fasta", default=False, action="store_true",
                 help="Write FASTA if available [default: %default]")
    set_outfile(p)

    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    gffile, = args
    # None means "no restriction" for both filters.
    contigID = set(opts.contigs.split(",")) if opts.contigs else None
    names = None
    if opts.names:
        with open(opts.names) as namesfp:
            names = set(x.strip() for x in namesfp)

    fw = must_open(opts.outfile, "w")
    with open(gffile) as fp:
        for row in fp:
            atoms = row.split()
            if not atoms:
                continue
            tag = atoms[0]
            if row[0] == "#":
                # Directives pass through, except region tags that belong to
                # a contig that was not asked for.
                if not (tag == RegionTag and contigID
                        and atoms[1] not in contigID):
                    fw.write(row.rstrip() + "\n")
                # Feature lines end where the embedded FASTA section starts.
                if tag == FastaTag:
                    break
                continue

            b = GffLine(row)
            is_right_contig = not contigID or tag in contigID
            is_right_names = not names or b.attributes["Name"][0] in names

            if is_right_contig and is_right_names:
                fw.write(row.rstrip() + "\n")

    if not opts.fasta:
        return

    f = Fasta(gffile)
    if not contigID:
        # Without --contigs nothing restricts the features, so nothing
        # restricts the sequences either (this used to raise TypeError by
        # iterating over None).
        for key, rec in f.iteritems_ordered():
            SeqIO.write([rec], fw, "fasta")
        return

    for s in contigID:
        if s in f:
            SeqIO.write([f[s]], fw, "fasta")
def extract_full(rec, sites, flank, fw):
    """
    Write the sequence surrounding every site of `rec` to `fw` as FASTA.

    Each site yields one record named "<rec.name>:<site>" that spans up to
    `flank` bases on either side of the site, clipped to the sequence bounds
    and with leading/trailing N's removed.
    """
    seqlen = len(rec)
    for site in sites:
        lo = max(site - flank, 0)
        hi = min(site + flank, seqlen)
        window = rec.seq[lo:hi].strip("Nn")
        flank_rec = SeqRecord(window,
                              id="{0}:{1}".format(rec.name, site),
                              description="")
        SeqIO.write([flank_rec], fw, "fasta")
def filter(args):
    """
    %prog filter *.consensus.fasta

    Filter consensus sequence with min cluster size.
    """
    from jcvi.formats.fasta import Fasta, SeqIO

    # The docstring above doubles as the usage text handed to the parser.
    p = OptionParser(filter.__doc__)
    p.add_option("--minsize", default=2, type="int",
                 help="Minimum cluster size")
    p.set_outfile()
    opts, args = p.parse_args(args)

    if len(args) < 1:
        sys.exit(not p.print_help())

    fastafiles = args
    minsize = opts.minsize
    totalreads = totalassembled = 0
    fw = must_open(opts.outfile, "w")
    for i, fastafile in enumerate(fastafiles):
        f = Fasta(fastafile, lazy=True)
        # Per-file prefix (s000, s001, ...) keeps record ids unique once the
        # inputs are merged into one output.
        pf = "s{0:03d}".format(i)
        nreads = nsingletons = nclusters = 0
        for desc, rec in f.iterdescriptions_ordered():
            nclusters += 1
            if desc.startswith("singleton"):
                nsingletons += 1
                nreads += 1
                continue
            # consensus_for_cluster_0 with 63 sequences
            name, w, size, seqs = desc.split()
            assert w == "with"
            size = int(size)
            nreads += size
            if size < minsize:
                continue
            # Drop the leading token of the description and tag the id with
            # the per-file prefix.
            rec.description = rec.description.split(None, 1)[-1]
            rec.id = pf + "_" + rec.id
            SeqIO.write(rec, fw, "fasta")

        logging.debug("Scanned {0} clusters with {1} reads ..".format(
            nclusters, nreads))
        cclusters, creads = nclusters - nsingletons, nreads - nsingletons
        # A file with only singletons (or no records at all) has no
        # non-singleton cluster; report 0 instead of dividing by zero.
        avg = creads / cclusters if cclusters else 0
        logging.debug(
            "Saved {0} clusters (min={1}) with {2} reads (avg:{3}) [{4}]".
            format(cclusters, minsize, creads, avg, pf))
        totalreads += nreads
        totalassembled += nreads - nsingletons

    logging.debug("Total assembled: {0}".format(
        percentage(totalassembled, totalreads)))
def merge(args):
    """
    %prog merge gffiles

    Merge several gff files into one. When only one file is given, it is assumed
    to be a file with a list of gff files.
    """
    # The docstring above doubles as the usage text handed to the parser.
    p = OptionParser(merge.__doc__)
    set_outfile(p)

    opts, args = p.parse_args(args)

    nargs = len(args)
    if nargs < 1:
        sys.exit(not p.print_help())

    if nargs == 1:
        # A single argument is a file listing one gff path per line; blank
        # lines are ignored so they do not turn into empty file names.
        listfile, = args
        with open(listfile) as fp:
            gffiles = [x.strip() for x in fp if x.strip()]
    else:
        gffiles = args

    outfile = opts.outfile

    deflines = set()    # '#' lines already written, to avoid duplicates
    fw = must_open(outfile, "w")
    fastarecs = {}      # first sequence seen for each key across all inputs
    for gffile in gffiles:
        with open(gffile) as fp:
            for row in fp:
                row = row.rstrip()
                if not row:
                    # Blank lines used to raise IndexError on row[0].
                    continue
                if row[0] == '#':
                    # The feature section ends at the FASTA tag.
                    if row == FastaTag:
                        break
                    if row in deflines:
                        continue
                    deflines.add(row)

                fw.write(row + "\n")

        f = Fasta(gffile, lazy=True)
        for key, rec in f.iteritems_ordered():
            # Direct dict membership: no key list is built per lookup.
            if key in fastarecs:
                continue
            fastarecs[key] = rec

    # A single FASTA section, holding every collected sequence, ends the file.
    fw.write(FastaTag + "\n")
    SeqIO.write(fastarecs.values(), fw, "fasta")
def filter(args):
    """
    %prog filter *.consensus.fasta

    Filter consensus sequence with min cluster size.
    """
    from jcvi.formats.fasta import Fasta, SeqIO

    # The docstring above doubles as the usage text handed to the parser.
    p = OptionParser(filter.__doc__)
    p.add_option("--minsize", default=2, type="int",
                 help="Minimum cluster size")
    p.set_outfile()
    opts, args = p.parse_args(args)

    if len(args) < 1:
        sys.exit(not p.print_help())

    fastafiles = args
    minsize = opts.minsize
    totalreads = totalassembled = 0
    fw = must_open(opts.outfile, "w")
    for i, fastafile in enumerate(fastafiles):
        f = Fasta(fastafile, lazy=True)
        # s000, s001, ...: prefix that identifies the input file in the
        # merged output.
        pf = "s{0:03d}".format(i)
        nreads = nsingletons = nclusters = 0
        for desc, rec in f.iterdescriptions_ordered():
            nclusters += 1
            if desc.startswith("singleton"):
                nsingletons += 1
                nreads += 1
                continue

            # consensus_for_cluster_0 with 63 sequences
            name, w, size, seqs = desc.split()
            assert w == "with"
            size = int(size)
            nreads += size
            if size < minsize:
                continue

            # Strip the first token from the description and prefix the id.
            rec.description = rec.description.split(None, 1)[-1]
            rec.id = pf + "_" + rec.id
            SeqIO.write(rec, fw, "fasta")

        logging.debug("Scanned {0} clusters with {1} reads ..".format(
            nclusters, nreads))

        cclusters = nclusters - nsingletons
        creads = nreads - nsingletons
        # Guard the average: an input made only of singletons (or an empty
        # input) would otherwise raise ZeroDivisionError here.
        average = creads / cclusters if cclusters else 0
        logging.debug(
            "Saved {0} clusters (min={1}) with {2} reads (avg:{3}) [{4}]".format(
                cclusters, minsize, creads, average, pf))

        totalreads += nreads
        totalassembled += nreads - nsingletons

    logging.debug("Total assembled: {0}".format(
        percentage(totalassembled, totalreads)))
def circular(args):
    """
    %prog circular fastafile startpos

    Make circular genome, startpos is the place to start the sequence. This can
    be determined by mapping to a reference. Self overlaps are then resolved.
    Startpos is 1-based.
    """
    from jcvi.assembly.goldenpath import overlap

    # The docstring above is reused as the command-line usage text.
    parser = OptionParser(circular.__doc__)
    parser.add_option(
        "--flip",
        default=False,
        action="store_true",
        help="Reverse complement the sequence",
    )
    parser.set_outfile()

    opts, args = parser.parse_args(args)
    if len(args) != 2:
        sys.exit(not parser.print_help())

    fastafile, startpos = args
    startpos = int(startpos)
    # Only the first record of the FASTA file is used.
    key, seq = next(parse_fasta(fastafile))

    # NOTE(review): the usage text says startpos is 1-based, but it is used
    # directly as a (0-based) slice index here -- confirm which is intended.
    aseq, bseq = seq[startpos:], seq[:startpos]

    # Rotated halves go to scratch files so they can be aligned to each other.
    pieces = (("a.seq", aseq), ("b.seq", bseq))
    for fname, piece in pieces:
        fw = must_open(fname, "w")
        print(">{0}\n{1}".format(fname, piece), file=fw)
        fw.close()

    o = overlap([pieces[0][0], pieces[1][0]])
    # Join the halves, cutting each one at the overlap coordinates reported
    # by overlap().
    seq = Seq(aseq[:o.qstop] + bseq[o.sstop:])

    if opts.flip:
        seq = seq.reverse_complement()

    for fname, _ in pieces:
        os.remove(fname)

    fw = must_open(opts.outfile, "w")
    SeqIO.write([SeqRecord(seq, id=key, description="")], fw, "fasta")
    fw.close()
def needle(args):
    """
    %prog needle nw.pairs a.pep.fasta b.pep.fasta

    Take protein pairs and needle them
    Automatically writes output file `nw.scores`
    """
    from jcvi.formats.fasta import Fasta, SeqIO

    # The docstring above doubles as the usage text handed to the parser.
    p = OptionParser(needle.__doc__)

    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    # Workers append their result lines to a manager-backed list that is
    # shared across processes.
    manager = mp.Manager()
    results = manager.list()
    needle_pool = mp.Pool(processes=mp.cpu_count())

    pairsfile, apep, bpep = args
    afasta, bfasta = Fasta(apep), Fasta(bpep)
    fp = must_open(pairsfile)
    for i, row in enumerate(fp):
        a, b = row.split()
        a, b = afasta[a], bfasta[b]

        # One pair of single-record FASTA files per row, numbered by row.
        fa = must_open("{0}_{1}_a.fasta".format(pairsfile, i), "w")
        fb = must_open("{0}_{1}_b.fasta".format(pairsfile, i), "w")
        SeqIO.write([a], fa, "fasta")
        SeqIO.write([b], fb, "fasta")
        fa.close()
        fb.close()

        needlefile = "{0}_{1}_ab.needle".format(pairsfile, i)
        # NOTE(review): the AsyncResult is discarded, so an exception raised
        # inside _needle is never reported back here.
        needle_pool.apply_async(
            _needle, (fa.name, fb.name, needlefile, a.id, b.id, results))

    needle_pool.close()
    needle_pool.join()
    fp.close()

    # Strip only the last extension. Splitting on every dot turned
    # "./nw.pairs" into ".scores" and "a.b.pairs" into "a.scores".
    scoresfile = "{0}.scores".format(pairsfile.rsplit(".", 1)[0])
    fw = must_open(scoresfile, "w")
    for result in results:
        print(result, file=fw)
    fw.close()
def needle(args):
    """
    %prog needle nw.pairs a.pep.fasta b.pep.fasta

    Take protein pairs and needle them
    Automatically writes output file `nw.scores`
    """
    from jcvi.formats.fasta import Fasta, SeqIO

    # The docstring above doubles as the usage text handed to the parser.
    p = OptionParser(needle.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    # Result lines are collected in a manager-backed list shared with the
    # worker processes.
    manager = mp.Manager()
    results = manager.list()
    needle_pool = mp.Pool(processes=mp.cpu_count())

    pairsfile, apep, bpep = args
    afasta, bfasta = Fasta(apep), Fasta(bpep)
    fp = must_open(pairsfile)
    for i, row in enumerate(fp):
        a, b = row.split()
        a, b = afasta[a], bfasta[b]

        # Row-numbered scratch files: one record each for the two proteins.
        fa = must_open("{0}_{1}_a.fasta".format(pairsfile, i), "w")
        fb = must_open("{0}_{1}_b.fasta".format(pairsfile, i), "w")
        SeqIO.write([a], fa, "fasta")
        SeqIO.write([b], fb, "fasta")
        fa.close()
        fb.close()

        needlefile = "{0}_{1}_ab.needle".format(pairsfile, i)
        # NOTE(review): failures inside _needle are silent because the
        # AsyncResult returned here is never inspected.
        needle_pool.apply_async(
            _needle, (fa.name, fb.name, needlefile, a.id, b.id, results))

    needle_pool.close()
    needle_pool.join()
    fp.close()

    # Only the final extension is removed. Without maxsplit=1, rsplit cut at
    # every dot: "./nw.pairs" gave ".scores", "a.b.pairs" gave "a.scores".
    stem = pairsfile.rsplit(".", 1)[0]
    scoresfile = "{0}.scores".format(stem)
    fw = must_open(scoresfile, "w")
    for result in results:
        print(result, file=fw)
    fw.close()
def circular(args):
    """
    %prog circular fastafile startpos

    Make circular genome, startpos is the place to start the sequence. This can
    be determined by mapping to a reference. Self overlaps are then resolved.
    Startpos is 1-based.
    """
    from jcvi.assembly.goldenpath import overlap

    # The docstring above doubles as the usage text handed to the parser.
    p = OptionParser(circular.__doc__)
    p.add_option("--flip", default=False, action="store_true",
                 help="Reverse complement the sequence")
    p.set_outfile()

    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    fastafile, startpos = args
    startpos = int(startpos)
    # next() works on both Python 2.6+ and Python 3; the `.next()` method
    # exists on Python 2 iterators only. Only the first record is used.
    key, seq = next(parse_fasta(fastafile))
    # NOTE(review): the usage text says startpos is 1-based, but it is used
    # directly as a (0-based) slice index here -- confirm which is intended.
    aseq = seq[startpos:]
    bseq = seq[:startpos]
    aseqfile, bseqfile = "a.seq", "b.seq"

    # Rotated halves go to scratch files so they can be aligned to each other.
    for f, s in zip((aseqfile, bseqfile), (aseq, bseq)):
        fw = must_open(f, "w")
        # write() replaces the Python 2-only `print >> fw` statement.
        fw.write(">{0}\n{1}\n".format(f, s))
        fw.close()

    o = overlap([aseqfile, bseqfile])
    # Join the halves, cutting each one at the overlap coordinates reported
    # by overlap().
    seq = aseq[:o.qstop] + bseq[o.sstop:]
    seq = Seq(seq)

    if opts.flip:
        seq = seq.reverse_complement()

    for f in (aseqfile, bseqfile):
        os.remove(f)

    fw = must_open(opts.outfile, "w")
    rec = SeqRecord(seq, id=key, description="")
    SeqIO.write([rec], fw, "fasta")
    fw.close()
def needle(args):
    """
    %prog needle pairs a.pep.fasta b.pep.fasta

    Take protein pairs and needle them.
    """
    from Bio.Emboss.Applications import NeedleCommandline

    from jcvi.formats.fasta import Fasta, SeqIO
    from jcvi.formats.base import FileShredder

    # The docstring above doubles as the usage text handed to the parser.
    p = OptionParser(needle.__doc__)

    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    pairsfile, apep, bpep = args
    afasta = Fasta(apep)
    bfasta = Fasta(bpep)

    # The scratch file names do not depend on the row, so they are computed
    # once and reused for every pair.
    afile = pairsfile + "_a.fasta"
    bfile = pairsfile + "_b.fasta"
    needlefile = pairsfile + "_ab.needle"

    # `with` closes the pairs file, which used to stay open.
    with open(pairsfile) as fp:
        for row in fp:
            a, b = row.split()
            a = afasta[a]
            b = bfasta[b]

            with open(afile, "w") as fa:
                SeqIO.write([a], fa, "fasta")
            with open(bfile, "w") as fb:
                SeqIO.write([b], fb, "fasta")

            needle_cline = NeedleCommandline(asequence=afile, bsequence=bfile,
                                             gapopen=10, gapextend=0.5,
                                             outfile=needlefile)
            stdout, stderr = needle_cline()
            # Explicit writes behave the same on Python 2 and 3, unlike the
            # `print` statement used before.
            sys.stderr.write(stdout + stderr + "\n")

            nh = NeedleHeader(needlefile)
            sys.stdout.write(
                "\t".join((a.id, b.id, nh.identity, nh.score)) + "\n")

            # Hand the per-pair scratch files to FileShredder (presumably
            # removes them -- defined in jcvi.formats.base).
            FileShredder([afile, bfile, needlefile])
def overlapbatch(args):
    """
    %prog overlapbatch ctgfasta poolfasta

    Fish out the sequences in `poolfasta` that overlap with `ctgfasta`.
    Mix and combine using `minimus2`.
    """
    # Use this function's own docstring as the usage text; it used to borrow
    # overlap.__doc__, so the help advertised the wrong sub-command.
    p = OptionParser(overlapbatch.__doc__)

    opts, args = p.parse_args(args)
    if len(args) != 2:
        sys.exit(not p.print_help())

    ctgfasta, poolfasta = args
    f = Fasta(ctgfasta)
    for k, rec in f.iteritems_ordered():
        # overlap() is called with one record at a time, so every record is
        # written to its own <key>.fasta before being handed over.
        fastafile = k + ".fasta"
        with open(fastafile, "w") as fw:
            SeqIO.write([rec], fw, "fasta")

        overlap([fastafile, poolfasta])
def expand(args):
    """
    %prog expand bes.fasta reads.fastq

    Expand sequences using short reads. Useful, for example for getting BAC-end
    sequences. The template to use, in `bes.fasta` may just contain the junction
    sequences, then align the reads to get the 'flanks' for such sequences.
    """
    import math

    from jcvi.formats.fasta import Fasta, SeqIO
    from jcvi.formats.fastq import readlen, first, fasta
    from jcvi.formats.blast import Blast
    from jcvi.formats.base import FileShredder
    from jcvi.apps.bowtie import align, get_samfile
    from jcvi.apps.align import blast

    # The docstring above is reused as the command-line usage text.
    parser = OptionParser(expand.__doc__)
    parser.set_depth(depth=200)
    parser.set_firstN()
    opts, args = parser.parse_args(args)

    if len(args) != 2:
        sys.exit(not parser.print_help())

    bes, reads = args

    # Number of reads needed to cover the template plus one read length on
    # each side at the requested depth, rounded up to the next thousand.
    size = Fasta(bes).totalsize
    rl = readlen([reads])
    expected_size = size + 2 * rl
    nreads = expected_size * opts.depth / rl
    nreads = int(math.ceil(nreads / 1000.)) * 1000

    # Attract reads
    align_args = [bes, reads, "--reorder", "--mapped",
                  "--firstN={0}".format(opts.firstN)]
    samfile, logfile = align(align_args)
    samfile, mapped, _ = get_samfile(reads, bes, bowtie=True, mapped=True)
    logging.debug("Extract first {0} reads from `{1}`.".format(nreads, mapped))

    pf = mapped.split(".")[0].split("-")[0]
    bespf = bes.split(".")[0]
    reads = pf + ".expand.fastq"
    first([str(nreads), mapped, "-o", reads])

    # Perform mini-assembly
    fastafile = reads.rsplit(".", 1)[0] + ".fasta"
    qualfile = ""
    if need_update(reads, fastafile):
        fastafile, qualfile = fasta([reads])

    contigs = op.join(pf, "454LargeContigs.fna")
    if need_update(fastafile, contigs):
        sh("runAssembly -o {0} -cpu 8 {1}".format(pf, fastafile))
    assert op.exists(contigs)

    # Annotate contigs
    blastfile = blast([bes, contigs])
    best_hits = {query: hit
                 for query, hit in Blast(blastfile).iter_best_hit()}

    contig_records = Fasta(contigs, lazy=True)
    annotatedfasta = ".".join((pf, bespf, "fasta"))
    fw = open(annotatedfasta, "w")
    template_order = list(Fasta(bes).iterkeys_ordered())  # keep an ordered list

    ranked = []
    for _key, contig in contig_records.iteritems_ordered():
        vid = contig.id
        if vid not in best_hits:
            continue
        hit = best_hits[vid]
        subject = hit.subject
        # Contigs on the minus strand of their best hit are
        # reverse-complemented.
        if hit.orientation == '-':
            rec = contig.reverse_complement()
        else:
            rec = contig
        rid = "_".join((pf, vid, subject))
        rec.id = rid
        rec.description = ""
        # Sort key: position of the template in `bes`, then the new id.
        ranked.append((template_order.index(subject), rid, rec))

    ranked.sort()
    recs = [entry[-1] for entry in ranked]
    SeqIO.write(recs, fw, "fasta")
    fw.close()

    # Hand the intermediate files to FileShredder (presumably deletes them).
    FileShredder([samfile, logfile, mapped, reads, fastafile, qualfile,
                  blastfile, pf])
    logging.debug("Annotated seqs (n={0}) written to `{1}`.".format(
        len(recs), annotatedfasta))

    return annotatedfasta
def load(args):
    '''
    %prog load gff_file fasta_file [--options]

    Parses the selected features out of GFF, with subfeatures concatenated.
    For example, to get the CDS sequences, do this::

    $ %prog load athaliana.gff athaliana.fa --parents mRNA --children CDS
    '''
    from jcvi.formats.fasta import Seq, SeqRecord

    # The docstring above doubles as the usage text handed to the parser.
    p = OptionParser(load.__doc__)
    p.add_option(
        "--parents", dest="parents", default="mRNA",
        help="list of features to extract, use comma to separate (e.g."
        "'gene,mRNA') [default: %default]")
    p.add_option(
        "--children", dest="children", default="CDS",
        help="list of features to extract, use comma to separate (e.g."
        "'five_prime_UTR,CDS,three_prime_UTR') [default: %default]")
    p.add_option("--attribute",
                 help="The attribute field to extract [default: %default]")
    set_outfile(p)

    opts, args = p.parse_args(args)

    if len(args) != 2:
        # print_help() returns None, so `not` turns it into a truthy exit
        # status; passing None straight to sys.exit() reported success (0)
        # on misuse, unlike the other commands.
        sys.exit(not p.print_help())

    gff_file, fasta_file = args

    g = make_index(gff_file)
    f = Fasta(fasta_file, index=False)
    fw = must_open(opts.outfile, "w")

    parents = set(opts.parents.split(','))
    children_list = set(opts.children.split(','))
    attr = opts.attribute

    for feat in get_parents(gff_file, parents):

        # Collect (sequence, feature) for every child of a requested type.
        children = []
        for c in g.children(feat.id, 1):
            if c.featuretype not in children_list:
                continue
            child = f.sequence(
                dict(chr=c.chrom, start=c.start, stop=c.stop, strand=c.strand))
            children.append((child, c))

        if not children:
            # Explicit write works on Python 2 and 3, unlike `print >>`.
            sys.stderr.write("[warning] %s has no children with type %s\n"
                             % (feat.id, ','.join(children_list)))
            continue
        # sort children in incremental position
        children.sort(key=lambda x: x[1].start)
        # reverse children if negative strand
        if feat.strand == '-':
            children.reverse()
        feat_seq = ''.join(x[0] for x in children)

        # Optional description taken from the requested attribute, with
        # double quotes removed.
        description = ",".join(feat.attributes[attr]) \
            if attr and attr in feat.attributes else ""
        description = description.replace("\"", "")

        rec = SeqRecord(Seq(feat_seq), id=feat.id, description=description)
        SeqIO.write([rec], fw, "fasta")
        fw.flush()
def overlap(args):
    """
    %prog overlap ctgfasta poolfasta

    Fish out the sequences in `poolfasta` that overlap with `ctgfasta`.
    Mix and combine using `minimus2`.
    """
    # The docstring above doubles as the usage text handed to the parser.
    p = OptionParser(overlap.__doc__)

    opts, args = p.parse_args(args)
    if len(args) != 2:
        sys.exit(not p.print_help())

    ctgfasta, poolfasta = args
    prefix = ctgfasta.split(".")[0]
    rid = list(Fasta(ctgfasta).iterkeys())
    assert len(rid) == 1, "Use overlapbatch() to improve multi-FASTA file"

    rid = rid[0]
    splitctgfasta = ctgfasta.rsplit(".", 1)[0] + ".split.fasta"
    # From here on `ctgfasta` is whatever run_gapsplit() returns.
    ctgfasta = run_gapsplit(infile=ctgfasta, outfile=splitctgfasta)

    # Run BLAST
    blastfile = ctgfasta + ".blast"
    run_megablast(infile=ctgfasta, outfile=blastfile, db=poolfasta)

    # Extract contigs and merge using minimus2
    closuredir = prefix + ".closure"
    closure = False
    if need_update(blastfile, closuredir):
        mkdir(closuredir, overwrite=True)
        closure = True

    if closure:
        idsfile = op.join(closuredir, prefix + ".ids")
        cmd = "cut -f2 {0} | sort -u".format(blastfile)
        sh(cmd, outfile=idsfile)

        idsfastafile = op.join(closuredir, prefix + ".ids.fasta")
        cmd = "faSomeRecords {0} {1} {2}".format(poolfasta, idsfile,
                                                 idsfastafile)
        sh(cmd)

        # This step is a hack to weight the bases from original sequences more
        # than the pulled sequences, by literally adding another copy to be
        # used in consensus calls.
        redundantfastafile = op.join(closuredir, prefix + ".redundant.fasta")
        format([ctgfasta, redundantfastafile, "--prefix=RED."])

        mergedfastafile = op.join(closuredir, prefix + ".merged.fasta")
        cmd = "cat {0} {1} {2}".format(ctgfasta, redundantfastafile,
                                       idsfastafile)
        sh(cmd, outfile=mergedfastafile)

        afgfile = op.join(closuredir, prefix + ".afg")
        cmd = "toAmos -s {0} -o {1}".format(mergedfastafile, afgfile)
        sh(cmd)

        cwd = os.getcwd()
        os.chdir(closuredir)
        try:
            cmd = "minimus2 {0} -D REFCOUNT=0".format(prefix)
            cmd += " -D OVERLAP=100 -D MINID=98"
            sh(cmd)
        finally:
            # Always return to the starting directory, even if minimus2
            # fails; every later path is relative to it.
            os.chdir(cwd)

    # Analyze output, make sure that:
    # + Get the singletons of the original set back
    # + Drop any contig that is comprised entirely of pulled set
    originalIDs = set(Fasta(ctgfasta).iterkeys())
    minimuscontig = op.join(closuredir, prefix + ".contig")
    c = ContigFile(minimuscontig)
    excludecontigs = set()
    for rec in c.iter_records():
        reads = set(x.id for x in rec.reads)
        if reads.isdisjoint(originalIDs):
            excludecontigs.add(rec.id)

    logging.debug("Exclude contigs: {0}".format(
        ", ".join(sorted(excludecontigs))))

    singletonfile = op.join(closuredir, prefix + ".singletons")
    with open(singletonfile) as fp:
        singletons = set(x.strip() for x in fp)
    leftovers = singletons & originalIDs

    # The trailing underscore marks an intermediate file; the final one is
    # produced by format() below.
    finalfasta = prefix + ".improved.fasta_"
    minimusfasta = op.join(closuredir, prefix + ".fasta")
    with open(finalfasta, "w") as fw:
        f = Fasta(minimusfasta)
        for key, rec in f.iteritems_ordered():
            if key in excludecontigs:
                continue
            SeqIO.write([rec], fw, "fasta")

        logging.debug("Pull leftover singletons: {0}".format(
            ", ".join(sorted(leftovers))))

        f = Fasta(ctgfasta)
        for key, rec in f.iteritems_ordered():
            if key not in leftovers:
                continue
            SeqIO.write([rec], fw, "fasta")

    fastafile = finalfasta
    finalfasta = fastafile.rstrip("_")
    format([
        fastafile, finalfasta, "--sequential", "--pad0=3",
        "--prefix={0}_".format(rid)
    ])

    logging.debug("Improved FASTA written to `{0}`.".format(finalfasta))

    n50([ctgfasta])
    n50([finalfasta])

    # Remove the intermediates that are no longer needed.
    errlog = "error.log"
    for f in (fastafile, blastfile, errlog):
        if op.exists(f):
            os.remove(f)
def extract(args):
    """
    %prog extract gffile

    --contigs: Extract particular contig(s) from the gff file. If multiple contigs are
    involved, use "," to separate, e.g. "contig_12,contig_150"
    --names: Provide a file with IDs, one each line
    """
    # The docstring above doubles as the usage text handed to the parser.
    p = OptionParser(extract.__doc__)
    p.add_option(
        "--contigs",
        help="Extract features from certain contigs [default: %default]")
    p.add_option(
        "--names",
        help="Extract features with certain names [default: %default]")
    p.add_option("--fasta", default=False, action="store_true",
                 help="Write FASTA if available [default: %default]")
    set_outfile(p)

    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    gffile, = args
    contigID = opts.contigs
    namesfile = opts.names

    # Both filters are optional; None means "accept everything".
    contigID = set(contigID.split(",")) if contigID else None
    if namesfile:
        with open(namesfile) as namesfp:
            names = set(x.strip() for x in namesfp)
    else:
        names = None

    outfile = opts.outfile
    fw = must_open(outfile, "w")
    with open(gffile) as fp:
        for row in fp:
            atoms = row.split()
            if not atoms:
                continue
            tag = atoms[0]
            if row[0] == "#":
                # Keep every directive except region tags of contigs that
                # were filtered out.
                skip_region = (tag == RegionTag and contigID
                               and atoms[1] not in contigID)
                if not skip_region:
                    fw.write(row.rstrip() + "\n")
                # Stop reading features where the FASTA section begins.
                if tag == FastaTag:
                    break
                continue

            b = GffLine(row)
            is_right_contig = not contigID or tag in contigID
            is_right_names = not names or b.attributes["Name"][0] in names

            if is_right_contig and is_right_names:
                fw.write(row.rstrip() + "\n")

    if not opts.fasta:
        return

    f = Fasta(gffile)
    if contigID:
        for s in contigID:
            if s in f:
                SeqIO.write([f[s]], fw, "fasta")
    else:
        # No --contigs given: features were not restricted, so neither are
        # the sequences. Iterating over None raised TypeError before.
        for key, rec in f.iteritems_ordered():
            SeqIO.write([rec], fw, "fasta")
def load(args):
    '''
    %prog load gff_file fasta_file [--options]

    Parses the selected features out of GFF, with subfeatures concatenated.
    For example, to get the CDS sequences, do this::

    $ %prog load athaliana.gff athaliana.fa --parents mRNA --children CDS
    '''
    from jcvi.formats.fasta import Seq, SeqRecord

    # The docstring above doubles as the usage text handed to the parser.
    p = OptionParser(load.__doc__)
    p.add_option("--parents", dest="parents", default="mRNA",
                 help="list of features to extract, use comma to separate (e.g."
                 "'gene,mRNA') [default: %default]")
    p.add_option("--children", dest="children", default="CDS",
                 help="list of features to extract, use comma to separate (e.g."
                 "'five_prime_UTR,CDS,three_prime_UTR') [default: %default]")
    p.add_option("--attribute",
                 help="The attribute field to extract [default: %default]")
    set_outfile(p)

    opts, args = p.parse_args(args)

    if len(args) != 2:
        # Exit with a truthy (failure) status on misuse, like the sibling
        # commands; sys.exit(p.print_help()) exited with status 0.
        sys.exit(not p.print_help())

    gff_file, fasta_file = args

    g = make_index(gff_file)
    f = Fasta(fasta_file, index=False)
    fw = must_open(opts.outfile, "w")

    parents = set(opts.parents.split(','))
    children_list = set(opts.children.split(','))
    attr = opts.attribute

    for feat in get_parents(gff_file, parents):

        # Sequence of each child feature whose type was requested.
        children = []
        for c in g.children(feat.id, 1):
            if c.featuretype not in children_list:
                continue
            region = dict(chr=c.chrom, start=c.start, stop=c.stop,
                          strand=c.strand)
            children.append((f.sequence(region), c))

        if not children:
            # sys.stderr.write is valid on both Python 2 and 3.
            sys.stderr.write("[warning] %s has no children with type %s\n"
                             % (feat.id, ','.join(children_list)))
            continue

        # sort children in incremental position
        children.sort(key=lambda x: x[1].start)
        # reverse children if negative strand
        if feat.strand == '-':
            children.reverse()
        feat_seq = ''.join(x[0] for x in children)

        # Description comes from the requested attribute when present;
        # double quotes are removed.
        if attr and attr in feat.attributes:
            description = ",".join(feat.attributes[attr])
        else:
            description = ""
        description = description.replace("\"", "")

        rec = SeqRecord(Seq(feat_seq), id=feat.id, description=description)
        SeqIO.write([rec], fw, "fasta")
        fw.flush()
def prepare(args):
    """
    %prog prepare --rearray_lib=<rearraylibrary> --orig_lib_file=<origlibfile>

    Inferred file names
    ---------------------------------------------
    `lookuptblfile` : rearraylibrary.lookup
    `rearraylibfile`: rearraylibrary.fasta

    Pick sequences from the original library file and the rearrayed library file
    based on the mapping information provided in the `lookuptblfile`.

    # lookuptblfile format: column number (index)
    # 1 (0)          2 (1)          3 (2)         4 (3)        5 (4)        6 (5)
    # source_clone   source_plate   source_well   dest_clone   dest_plate   dest_well

    The 1st and 4th column in the `lookuptblfile` form the pair of clones which
    constitute the elements used for the per-clone assembly.
    """
    from operator import itemgetter

    from jcvi.formats.fasta import Fasta, SeqIO

    # The docstring above doubles as the usage text handed to the parser.
    p = OptionParser(prepare.__doc__)
    p.add_option("--rearray_lib", default=None,
                 help="name of the rearrayed library [default: %default]")
    p.add_option(
        "--orig_lib_file",
        help=
        "fasta file containing reads from the original libraries [default: %default]"
    )

    g = OptionGroup(p, "Optional parameters")
    g.add_option(
        "--output_folder", default="to_assemble",
        help="output folder to write the FASTA files to [default: %default]")
    p.add_option_group(g)

    opts, args = p.parse_args(args)

    if not opts.rearray_lib or not opts.orig_lib_file:
        logging.error("Please specify the required parameters")
        sys.exit(not p.print_help())

    rearraylib, origlibfile = opts.rearray_lib, opts.orig_lib_file

    # Missing inputs are fatal: exit with a non-zero status so callers can
    # detect the failure (a bare sys.exit() reports success).
    if not op.isfile(origlibfile):
        logging.error(
            "Original library reads file `{0}` does not exist!".format(
                origlibfile))
        sys.exit(1)

    lookuptblfile = rearraylib + '.lookup'
    logging.debug(lookuptblfile)
    if not op.isfile(lookuptblfile):
        logging.error(
            "Lookup table file `{0}` does not exist!".format(lookuptblfile))
        sys.exit(1)

    rearraylibfile = rearraylib + '.fasta'
    logging.debug(rearraylibfile)
    if not op.isfile(rearraylibfile):
        logging.error(
            "Rearrayed library reads file `{0}` does not exist!".format(
                rearraylibfile))
        sys.exit(1)

    origlibFasta = Fasta(origlibfile)
    rearraylibFasta = Fasta(rearraylibfile)

    origlibids = list(origlibFasta.iterkeys_ordered())
    rearraylibids = list(rearraylibFasta.iterkeys_ordered())

    if not op.isdir(opts.output_folder):
        logging.warning(
            "Output directory `{0}` missing. Creating it now...".format(
                opts.output_folder))
        os.makedirs(opts.output_folder)

    logfile = rearraylib + '.log'
    # Both handles are closed by the context manager, including on errors.
    with open(logfile, 'w') as log, open(lookuptblfile, 'r') as fp:
        for row in fp:
            if not row.strip():
                # Tolerate blank lines in the lookup table.
                continue
            # Columns 1 and 4 hold the source and destination clone names.
            origprefix, rearrayprefix = itemgetter(0, 3)(row.split('\t'))
            libpair = origprefix + '_' + rearrayprefix
            outfile = opts.output_folder + '/' + libpair + '.fasta'
            with open(outfile, 'w') as ofp:
                # Plain prefix test: clone names are literal text, not
                # regular expressions, so re.match() could mis-match names
                # containing characters such as '.' or '+'.
                for o in origlibids:
                    if o.startswith(origprefix):
                        SeqIO.write(origlibFasta[o], ofp, 'fasta')

                for r in rearraylibids:
                    if r.startswith(rearrayprefix):
                        SeqIO.write(rearraylibFasta[r], ofp, 'fasta')

            # The log lists every per-clone FASTA file written.
            print(outfile, file=log)

    logging.debug('Wrote log file `{0}`'.format(logfile))
def overlap(args):
    """
    %prog overlap ctgfasta poolfasta

    Fish out the sequences in `poolfasta` that overlap with `ctgfasta`.
    Mix and combine using `minimus2`.
    """
    # The docstring above doubles as the usage text handed to the parser.
    p = OptionParser(overlap.__doc__)

    opts, args = p.parse_args(args)
    if len(args) != 2:
        sys.exit(not p.print_help())

    ctgfasta, poolfasta = args
    prefix = ctgfasta.split(".")[0]
    rid = list(Fasta(ctgfasta).iterkeys())
    assert len(rid) == 1, "Use overlapbatch() to improve multi-FASTA file"

    rid = rid[0]
    splitctgfasta = ctgfasta.rsplit(".", 1)[0] + ".split.fasta"
    # `ctgfasta` now names the file returned by run_gapsplit().
    ctgfasta = run_gapsplit(infile=ctgfasta, outfile=splitctgfasta)

    # Run BLAST
    blastfile = ctgfasta + ".blast"
    run_megablast(infile=ctgfasta, outfile=blastfile, db=poolfasta)

    # Extract contigs and merge using minimus2
    closuredir = prefix + ".closure"
    closure = False
    if need_update(blastfile, closuredir):
        mkdir(closuredir, overwrite=True)
        closure = True

    if closure:
        idsfile = op.join(closuredir, prefix + ".ids")
        cmd = "cut -f2 {0} | sort -u".format(blastfile)
        sh(cmd, outfile=idsfile)

        idsfastafile = op.join(closuredir, prefix + ".ids.fasta")
        cmd = "faSomeRecords {0} {1} {2}".format(poolfasta, idsfile,
                                                 idsfastafile)
        sh(cmd)

        # This step is a hack to weight the bases from original sequences more
        # than the pulled sequences, by literally adding another copy to be
        # used in consensus calls.
        redundantfastafile = op.join(closuredir, prefix + ".redundant.fasta")
        format([ctgfasta, redundantfastafile, "--prefix=RED."])

        mergedfastafile = op.join(closuredir, prefix + ".merged.fasta")
        cmd = "cat {0} {1} {2}".format(ctgfasta, redundantfastafile,
                                       idsfastafile)
        sh(cmd, outfile=mergedfastafile)

        afgfile = op.join(closuredir, prefix + ".afg")
        cmd = "toAmos -s {0} -o {1}".format(mergedfastafile, afgfile)
        sh(cmd)

        # minimus2 is run from inside the closure directory; the original
        # working directory is restored no matter how the command ends.
        cwd = os.getcwd()
        os.chdir(closuredir)
        try:
            cmd = "minimus2 {0} -D REFCOUNT=0".format(prefix)
            cmd += " -D OVERLAP=100 -D MINID=98"
            sh(cmd)
        finally:
            os.chdir(cwd)

    # Analyze output, make sure that:
    # + Get the singletons of the original set back
    # + Drop any contig that is comprised entirely of pulled set
    originalIDs = set(Fasta(ctgfasta).iterkeys())
    minimuscontig = op.join(closuredir, prefix + ".contig")
    c = ContigFile(minimuscontig)
    excludecontigs = set()
    for rec in c.iter_records():
        reads = set(x.id for x in rec.reads)
        if reads.isdisjoint(originalIDs):
            excludecontigs.add(rec.id)

    logging.debug("Exclude contigs: {0}".format(
        ", ".join(sorted(excludecontigs))))

    # Read the singleton list with a context manager so the handle is closed.
    singletonfile = op.join(closuredir, prefix + ".singletons")
    with open(singletonfile) as singletonfp:
        singletons = set(x.strip() for x in singletonfp)
    leftovers = singletons & originalIDs

    # Intermediate file (note the trailing underscore); format() below
    # produces the final, renamed records.
    finalfasta = prefix + ".improved.fasta_"
    minimusfasta = op.join(closuredir, prefix + ".fasta")
    with open(finalfasta, "w") as fw:
        # Merged contigs first, minus those built purely from pool reads.
        for seqid, rec in Fasta(minimusfasta).iteritems_ordered():
            if seqid in excludecontigs:
                continue
            SeqIO.write([rec], fw, "fasta")

        logging.debug("Pull leftover singletons: {0}".format(
            ", ".join(sorted(leftovers))))

        # Then the original sequences that minimus2 left as singletons.
        for seqid, rec in Fasta(ctgfasta).iteritems_ordered():
            if seqid not in leftovers:
                continue
            SeqIO.write([rec], fw, "fasta")

    fastafile = finalfasta
    finalfasta = fastafile.rstrip("_")
    format([fastafile, finalfasta, "--sequential", "--pad0=3",
            "--prefix={0}_".format(rid)])

    logging.debug("Improved FASTA written to `{0}`.".format(finalfasta))

    n50([ctgfasta])
    n50([finalfasta])

    # Clean up intermediates that are no longer needed.
    errlog = "error.log"
    for f in (fastafile, blastfile, errlog):
        if op.exists(f):
            os.remove(f)
def prepare(args):
    """
    %prog prepare --rearray_lib=<rearraylibrary> --orig_lib_file=<origlibfile>

    Inferred file names
    ---------------------------------------------
    `lookuptblfile` : rearraylibrary.lookup
    `rearraylibfile`: rearraylibrary.fasta

    Pick sequences from the original library file and the rearrayed library file
    based on the mapping information provided in the `lookuptblfile`.

    # lookuptblfile format: column number (index)
    # 1 (0)          2 (1)          3 (2)         4 (3)        5 (4)        6 (5)
    # source_clone   source_plate   source_well   dest_clone   dest_plate   dest_well

    The 1st and 4th column in the `lookuptblfile` form the pair of clones which
    constitute the elements used for the per-clone assembly.
    """
    from operator import itemgetter
    from jcvi.formats.fasta import Fasta, SeqIO

    p = OptionParser(prepare.__doc__)
    p.add_option("--rearray_lib", default=None,
            help="name of the rearrayed library [default: %default]")
    p.add_option("--orig_lib_file",
            help="fasta file containing reads from the original libraries [default: %default]")

    g = OptionGroup(p, "Optional parameters")
    g.add_option("--output_folder", default="to_assemble",
            help="output folder to write the FASTA files to [default: %default]")
    p.add_option_group(g)

    opts, args = p.parse_args(args)

    if not opts.rearray_lib or not opts.orig_lib_file:
        logging.error("Please specify the required parameters")
        sys.exit(not p.print_help())

    rearraylib, origlibfile = opts.rearray_lib, opts.orig_lib_file

    # Missing inputs are errors: exit non-zero, like the usage-error path above
    # (`not p.print_help()` evaluates to True, i.e. exit status 1).
    if not op.isfile(origlibfile):
        logging.error("Original library reads file `{0}` does not exist!".format(origlibfile))
        sys.exit(1)

    lookuptblfile = rearraylib + '.lookup'
    logging.debug(lookuptblfile)
    if not op.isfile(lookuptblfile):
        logging.error("Lookup table file `{0}` does not exist!".format(lookuptblfile))
        sys.exit(1)

    rearraylibfile = rearraylib + '.fasta'
    logging.debug(rearraylibfile)
    if not op.isfile(rearraylibfile):
        logging.error("Rearrayed library reads file `{0}` does not exist!".format(rearraylibfile))
        sys.exit(1)

    origlibFasta = Fasta(origlibfile)
    rearraylibFasta = Fasta(rearraylibfile)

    # Materialize the ordered IDs once; they are rescanned for every lookup row.
    origlibids = list(origlibFasta.iterkeys_ordered())
    rearraylibids = list(rearraylibFasta.iterkeys_ordered())

    if not op.isdir(opts.output_folder):
        logging.warning("Output directory `{0}` missing. Creating it now...".format(opts.output_folder))
        os.makedirs(opts.output_folder)

    logfile = rearraylib + '.log'
    pick_clones = itemgetter(0, 3)  # source_clone, dest_clone
    with open(lookuptblfile, 'r') as fp, open(logfile, 'w') as log:
        for row in fp:
            # Drop the line terminator so it cannot leak into the last column
            # (and from there into the prefix / output file name).
            row = row.rstrip('\r\n')
            if not row:
                continue
            origprefix, rearrayprefix = pick_clones(row.split('\t'))
            libpair = origprefix + '_' + rearrayprefix
            outfile = op.join(opts.output_folder, libpair + '.fasta')
            with open(outfile, 'w') as ofp:
                # NOTE: the clone names are used as regex patterns anchored at
                # the start of the read ID (re.match), i.e. a prefix match.
                for o in origlibids:
                    if re.match(origprefix, o):
                        SeqIO.write(origlibFasta[o], ofp, 'fasta')
                for r in rearraylibids:
                    if re.match(rearrayprefix, r):
                        SeqIO.write(rearraylibFasta[r], ofp, 'fasta')
            # One output path per line, so the log lists every per-clone FASTA.
            print(outfile, file=log)

    logging.debug('Wrote log file `{0}`'.format(logfile))
def longest(args):
    """
    %prog longest pasa.fasta output.subclusters.out

    Find the longest PASA assembly and label it as full-length. Also removes
    transcripts shorter than half the length of the longest, or shorter than
    200bp. The assemblies for the same locus is found in
    `output.subclusters.out`. In particular the lines that look like:

    sub-cluster: asmbl_25 asmbl_26 asmbl_27
    """
    from jcvi.formats.fasta import Fasta, SeqIO
    from jcvi.formats.sizes import Sizes

    p = OptionParser(longest.__doc__)
    p.add_option("--prefix", default="pasa",
            help="Replace asmbl_ with prefix [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    fastafile, subclusters = args
    prefix = fastafile.rsplit(".", 1)[0]

    idsfile = prefix + ".fl.ids"
    sizes = Sizes(fastafile).mapping

    def name_convert(x):
        # Rename assembly IDs, e.g. asmbl_25 -> <prefix>_25
        return x.replace("asmbl", opts.prefix)

    keep = set()  # List of IDs to write
    nrecs = 0
    # Context managers guarantee both handles are closed, even on errors.
    with open(subclusters) as fp, open(idsfile, "w") as fw:
        for row in fp:
            if not row.startswith("sub-cluster:"):
                continue
            asmbls = row.split()[1:]
            if not asmbls:
                # A sub-cluster line without members has no longest assembly.
                continue

            longest_asmbl = max(asmbls, key=lambda x: sizes[x])
            longest_size = sizes[longest_asmbl]
            print(name_convert(longest_asmbl), file=fw)
            nrecs += 1

            # Keep members at least half as long as the longest, and >= 200bp.
            cutoff = max(longest_size / 2, 200)
            keep.update(x for x in asmbls if sizes[x] >= cutoff)

    logging.debug("{0} fl-cDNA records written to `{1}`.".format(nrecs, idsfile))

    f = Fasta(fastafile, lazy=True)
    newfastafile = prefix + ".clean.fasta"
    nrecs = 0
    with open(newfastafile, "w") as fw:
        for name, rec in f.iteritems_ordered():
            if name not in keep:
                continue

            rec.id = name_convert(name)
            rec.description = ""
            SeqIO.write([rec], fw, "fasta")
            nrecs += 1

    logging.debug("{0} valid records written to `{1}`.".format(nrecs, newfastafile))
def lcn(args):
    """
    %prog lcn Orthogroups/Orthogroups.tsv Orthogroup_Sequences/ lcn/

    Select orthogroups by their single-copy and zero-copy ratios across
    species, then write one sequence per species (the longest one when a
    species has several genes) for each selected orthogroup into `lcn/`.
    """
    p = OptionParser(lcn.__doc__)
    # type="float" is required: without it a value given on the command line
    # stays a string and the ratio comparisons below fail.
    p.add_option("--min-single-ratio", default=0.9, type="float",
            help="Single copy ratio must be >= this value [default: %default]")
    p.add_option("--max-zero-ratio", default=0.0, type="float",
            help="Zero copy ratio must be <= this value [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    (groups_tsv, sequence_dir, lcn_dir) = args
    selected = []
    # Read in the orthogroup definition and selected based on counts
    with open(groups_tsv) as fp:
        reader = csv.reader(fp, delimiter="\t")
        header = next(reader, None)
        if header is None:
            logging.error("Orthogroup file `{}` is empty".format(groups_tsv))
            sys.exit(1)
        species_names = header[1:]
        for row in reader:
            # Number of genes per species; an empty cell means zero copies
            counts = [
                len(x.split(", ")) if x.strip() != "" else 0 for x in row[1:]
            ]
            if not counts:
                # Row without any species column: ratios are undefined
                continue
            single_ratio = sum(x == 1 for x in counts) / len(counts)
            zero_ratio = sum(x == 0 for x in counts) / len(counts)
            if single_ratio < opts.min_single_ratio:
                continue
            if zero_ratio > opts.max_zero_ratio:
                continue
            print(row[0], single_ratio, zero_ratio, counts, file=sys.stderr)
            selected.append(row)

    logging.debug("A total of {} orthogroups selected".format(len(selected)))

    # Collect the FASTA sequences now
    mkdir(lcn_dir)
    for row in selected:
        orthogroup = row[0]
        orthogroup_fasta = "{}.fa".format(orthogroup)
        input_fasta = op.join(sequence_dir, orthogroup_fasta)
        fasta = Fasta(input_fasta)
        selected_seqs = []
        for gene_names, species_name in zip(row[1:], species_names):
            if gene_names.strip() == "":
                # Zero-copy species (possible when --max-zero-ratio > 0):
                # there is no gene to pick for this species.
                continue
            gene_names = gene_names.split(", ")
            if len(gene_names) == 1:
                (best_gene,) = gene_names
            else:
                # Longest sequence wins; ties are broken by gene name
                _, best_gene = max((len(fasta[x]), x) for x in gene_names)
            selected_seq = fasta[best_gene]
            # Set gene name to species name so we can later combine them in supermatrix
            selected_seq.id = species_name
            selected_seq.name = species_name
            selected_seq.description = ""
            selected_seqs.append(selected_seq)

        output_fasta = op.join(lcn_dir, orthogroup_fasta)
        with open(output_fasta, "w") as fw:
            SeqIO.write(selected_seqs, fw, "fasta")
        print(
            "{}: {} => {} ({})".format(
                orthogroup, len(fasta), len(selected_seqs), output_fasta
            ),
            file=sys.stderr,
        )