def extract_ends(rec, sites, flank, fw, maxfragsize=800):
    """
    Write the flanking sequence on either side of each site to `fw` as FASTA.

    For every position in `sites`, up to `flank` bases on the left and on the
    right are considered separately:

    * the left flank (id ``<rec.name>:<site>L``) is considered when the site
      is the first one or lies within `maxfragsize` of the previous site; it
      is dropped when the site is the first one and sits more than
      `maxfragsize` from the start of the record;
    * the right flank (id ``<rec.name>:<site>R``) is considered when the site
      is the last one or the next site lies within `maxfragsize`; it is
      dropped when the site is the last one and sits more than `maxfragsize`
      from the end of the record.

    Leading/trailing ``N``/``n`` characters are stripped from each flank.
    Nothing is returned; records are written per site.
    """
    last_idx = len(sites) - 1
    total = len(rec)

    for idx, pos in enumerate(sites):
        base_id = "{0}:{1}".format(rec.name, pos)
        batch = []

        is_first = idx == 0
        if is_first or pos - sites[idx - 1] <= maxfragsize:
            upstream = rec.seq[max(pos - flank, 0):pos].strip("Nn")
            left_rec = SeqRecord(upstream, id=base_id + "L", description="")
            # Contig L-end: skipped when it is longer than maxfragsize
            if not (is_first and pos > maxfragsize):
                batch.append(left_rec)

        is_last = idx == last_idx
        if is_last or sites[idx + 1] - pos <= maxfragsize:
            downstream = rec.seq[pos:min(pos + flank, total)].strip("Nn")
            right_rec = SeqRecord(downstream, id=base_id + "R", description="")
            # Contig R-end: skipped when it is longer than maxfragsize
            if not (is_last and total - pos > maxfragsize):
                batch.append(right_rec)

        SeqIO.write(batch, fw, "fasta")
def emitFragment(fw, fragID, libID, shredded_seq, clr=None, qvchar="l", fasta=False):
    """
    Write a single shredded fragment to `fw`.

    When `fasta` is true the fragment is written as one FASTA record with id
    `fragID` and nothing else happens. Otherwise the fragment is rendered
    through `frgTemplate`, with a quality string made of `qvchar` repeated
    once per base and a clear range taken from `clr` (a two-item
    ``(begin, end)`` pair), defaulting to the whole sequence.
    """
    if fasta:
        SeqIO.write(
            [SeqRecord(shredded_seq, id=fragID, description="")], fw, "fasta"
        )
        return

    bases = str(shredded_seq)
    nbases = len(bases)
    # Shredded reads all get the same (default low) quality value
    quals = qvchar * nbases

    # No explicit clear range means the full read is clear
    clr_beg, clr_end = (0, nbases) if clr is None else clr

    message = frgTemplate.format(
        fragID=fragID,
        libID=libID,
        seq=bases,
        qvs=quals,
        clr_beg=clr_beg,
        clr_end=clr_end,
    )
    print(message, file=fw)
def extract_full(rec, sites, flank, fw):
    """
    Write the window surrounding each site to `fw` as FASTA.

    For every position in `sites`, the slice of `rec.seq` reaching `flank`
    bases to either side (clipped to the record bounds) is taken, stripped of
    leading/trailing ``N``/``n`` characters, and written as a record with id
    ``<rec.name>:<site>``.
    """
    for pos in sites:
        label = "{0}:{1}".format(rec.name, pos)
        lo = max(pos - flank, 0)
        hi = min(pos + flank, len(rec))
        window = rec.seq[lo:hi].strip("Nn")
        SeqIO.write([SeqRecord(window, id=label, description="")], fw, "fasta")
def circular(args):
    """
    %prog circular fastafile startpos

    Make circular genome, startpos is the place to start the sequence. This can
    be determined by mapping to a reference. Self overlaps are then resolved.
    Startpos is 1-based.
    """
    # The docstring above doubles as the usage text handed to OptionParser,
    # so implementation notes live in comments rather than in the docstring.
    from jcvi.assembly.goldenpath import overlap

    p = OptionParser(circular.__doc__)
    p.add_option(
        "--flip",
        default=False,
        action="store_true",
        help="Reverse complement the sequence",
    )
    p.set_outfile()
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    fastafile, startpos = args
    startpos = int(startpos)
    # Only the first item produced by parse_fasta() is used.
    key, seq = next(parse_fasta(fastafile))
    # NOTE(review): startpos is applied directly as a slice index, so the
    # rotated sequence begins at the base *after* 1-based position `startpos`
    # -- confirm this is what "Startpos is 1-based" in the usage intends.
    aseq = seq[startpos:]
    bseq = seq[:startpos]
    aseqfile, bseqfile = "a.seq", "b.seq"

    # The two halves are written to scratch files in the working directory so
    # that overlap() can compare them. try/finally guarantees the scratch files
    # are removed (and handles closed) even when writing or overlap() fails.
    try:
        for f, s in zip((aseqfile, bseqfile), (aseq, bseq)):
            fw = must_open(f, "w")
            try:
                print(">{0}\n{1}".format(f, s), file=fw)
            finally:
                fw.close()

        o = overlap([aseqfile, bseqfile])
        # Join the halves, trimming at the overlap coordinates reported by
        # overlap() (qstop on the first file, sstop on the second).
        seq = aseq[: o.qstop] + bseq[o.sstop :]
    finally:
        for f in (aseqfile, bseqfile):
            # A scratch file may not exist if the failure happened early;
            # don't mask the original exception with a removal error.
            if os.path.exists(f):
                os.remove(f)

    seq = Seq(seq)
    if opts.flip:
        seq = seq.reverse_complement()

    fw = must_open(opts.outfile, "w")
    try:
        rec = SeqRecord(seq, id=key, description="")
        SeqIO.write([rec], fw, "fasta")
    finally:
        fw.close()
def emitFragment(fw, fragID, libID, shredded_seq, fasta=False):
    """
    Print out the shredded sequence.

    With `fasta` set, the sequence is written to `fw` as a single FASTA record
    whose id is `fragID`, and the function returns. Otherwise a fragment
    message is rendered from `frgTemplate` and printed to `fw`; every base is
    given the `DEFAULTQV` quality character, and both the sequence and the
    quality string are passed through `fill()` before formatting
    (presumably line wrapping -- confirm against the helper's definition).
    """
    if fasta:
        s = SeqRecord(shredded_seq, id=fragID, description="")
        SeqIO.write([s], fw, "fasta")
        return

    seq = str(shredded_seq)
    slen = len(seq)
    qvs = DEFAULTQV * slen  # shredded reads have default low qv

    # Use the print *function* with file=. The former `print >> fw, ...` is
    # Python 2 statement syntax; where print is a function it evaluates
    # `print >> fw` and raises TypeError instead of writing anything.
    print(
        frgTemplate.format(
            fragID=fragID,
            libID=libID,
            seq=fill(seq),
            qvs=fill(qvs),
            slen=slen,
        ),
        file=fw,
    )
def load(args):
    '''
    %prog load gff_file fasta_file [--options]

    Parses the selected features out of GFF, with subfeatures concatenated.
    For example, to get the CDS sequences, do this::

        $ %prog load athaliana.gff athaliana.fa --parents mRNA --children CDS
    '''
    # The docstring above doubles as the usage text handed to OptionParser,
    # so implementation notes live in comments rather than in the docstring.
    from jcvi.formats.fasta import Seq, SeqRecord

    p = OptionParser(load.__doc__)
    p.add_option(
        "--parents", dest="parents", default="mRNA",
        help="list of features to extract, use comma to separate (e.g."
             "'gene,mRNA') [default: %default]")
    p.add_option(
        "--children", dest="children", default="CDS",
        help="list of features to extract, use comma to separate (e.g."
             "'five_prime_UTR,CDS,three_prime_UTR') [default: %default]")
    p.add_option("--attribute",
                 help="The attribute field to extract [default: %default]")
    set_outfile(p)

    opts, args = p.parse_args(args)

    if len(args) != 2:
        # Exit with a non-zero status on bad usage. print_help() is expected
        # to return None (as optparse's does), so passing its result straight
        # to sys.exit() would report success.
        sys.exit(not p.print_help())

    gff_file, fasta_file = args

    g = make_index(gff_file)
    f = Fasta(fasta_file, index=False)
    fw = must_open(opts.outfile, "w")

    parents = set(opts.parents.split(','))
    children_list = set(opts.children.split(','))
    attr = opts.attribute

    for feat in get_parents(gff_file, parents):

        # Collect (sequence, feature) pairs for the wanted child types.
        children = []
        # NOTE(review): the second argument to g.children() is presumably the
        # nesting level (direct children only) -- confirm against the index API.
        for c in g.children(feat.id, 1):
            if c.featuretype not in children_list:
                continue
            child = f.sequence(
                dict(chr=c.chrom, start=c.start, stop=c.stop, strand=c.strand))
            children.append((child, c))

        if not children:
            # Written with the print function: the old `print >>sys.stderr`
            # statement form raises TypeError where print is a function,
            # turning this warning into a crash.
            print("[warning] %s has no children with type %s"
                  % (feat.id, ','.join(children_list)), file=sys.stderr)
            continue

        # sort children in incremental position
        children.sort(key=lambda x: x[1].start)
        # reverse children if negative strand
        if feat.strand == '-':
            children.reverse()
        feat_seq = ''.join(x[0] for x in children)

        # Optional description: the requested attribute's values joined by
        # commas, with double quotes removed; empty when the attribute is not
        # requested or not present on the feature.
        description = ",".join(feat.attributes[attr]) \
            if attr and attr in feat.attributes else ""
        description = description.replace("\"", "")

        rec = SeqRecord(Seq(feat_seq), id=feat.id, description=description)
        SeqIO.write([rec], fw, "fasta")
        fw.flush()