def summary(args):
    """
    %prog summary gffile fastafile

    Print summary stats, including:
    - Gene/Exon/Intron
    - Number
    - Average size (bp)
    - Median size (bp)
    - Total length (Mb)
    - % of genome
    - % GC
    """
    # NOTE: the docstring above is handed to OptionParser as the usage text,
    # so it stays in usage form; the contract is described here instead.
    #
    # args: list of command-line tokens; exactly two positionals are required
    #       (the GFF file and the reference FASTA file).
    # Exits through sys.exit() after printing the help when the count is wrong.
    # Returns None; the finished table is written to stderr.
    p = OptionParser(summary.__doc__)
    _, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    gff_file, ref = args
    s = Fasta(ref)
    g = make_index(gff_file)

    def fetch(chrom, start, stop):
        # Look up one interval in the reference FASTA.
        return s.sequence({'chr': chrom, 'start': start, 'stop': stop})

    # Sequences are collected per category; they feed both the size
    # statistics and the % GC column below.
    geneseqs, exonseqs, intronseqs = [], [], []
    for f in g.features_of_type("gene"):
        geneseqs.append(fetch(f.chrom, f.start, f.stop))

        # The set collapses exon intervals that occur more than once among
        # the gene's children (presumably exons shared between transcripts).
        exons = list(set((c.chrom, c.start, c.stop)
                         for c in g.children(f.id, 2)
                         if c.featuretype == "exon"))
        for chrom, start, stop in exons:
            exonseqs.append(fetch(chrom, start, stop))

        # NOTE(review): range_interleave is assumed to return the gaps
        # between the exon intervals, i.e. the introns - confirm.
        for chrom, start, stop in range_interleave(exons):
            intronseqs.append(fetch(chrom, start, stop))

    r = {}  # Report, keyed by (category, statistic name)
    for t, tseqs in zip(("Gene", "Exon", "Intron"),
                        (geneseqs, exonseqs, intronseqs)):
        tsizes = [len(x) for x in tseqs]
        tsummary = SummaryStats(tsizes, dtype="int")
        r[t, "Number"] = tsummary.size
        r[t, "Average size (bp)"] = tsummary.mean
        r[t, "Median size (bp)"] = tsummary.median
        r[t, "Total length (Mb)"] = human_size(tsummary.sum, precision=0,
                                               target="Mb")
        r[t, "% of genome"] = percentage(tsummary.sum, s.totalsize,
                                         precision=0, mode=-1)
        r[t, "% GC"] = gc(tseqs)

    # Plain write() instead of the Python-2-only `print >> stream` statement,
    # so the report is emitted identically whichever print flavour the module
    # is compiled with.
    sys.stderr.write("{0}\n".format(tabulate(r)))
def summary(args):
    """
    %prog summary gffile fastafile

    Print summary stats, including:
    - Gene/Exon/Intron
    - Number
    - Average size (bp)
    - Median size (bp)
    - Total length (Mb)
    - % of genome
    - % GC
    """
    # The docstring doubles as the usage text given to OptionParser.
    # Expects two positional arguments (GFF file, reference FASTA); prints the
    # help and exits otherwise. The report table goes to stderr.
    parser = OptionParser(summary.__doc__)
    opts, args = parser.parse_args(args)
    if len(args) != 2:
        sys.exit(not parser.print_help())

    gff_file, ref = args
    genome = Fasta(ref)
    index = make_index(gff_file)

    def fetch(chrom, start, stop):
        # One interval of the reference, as returned by Fasta.sequence().
        return genome.sequence({'chr': chrom, 'start': start, 'stop': stop})

    gene_seqs = []
    exon_seqs = []
    intron_seqs = []
    for gene in index.features_of_type("gene"):
        gene_seqs.append(fetch(gene.chrom, gene.start, gene.stop))

        # De-duplicate exon intervals found among the gene's children.
        exon_spans = list(set(
            (child.chrom, child.start, child.stop)
            for child in index.children(gene.id, 2)
            if child.featuretype == "exon"
        ))
        exon_seqs.extend(
            fetch(chrom, start, stop) for chrom, start, stop in exon_spans
        )

        intron_spans = range_interleave(exon_spans)
        intron_seqs.extend(
            fetch(chrom, start, stop) for chrom, start, stop in intron_spans
        )

    # One group of rows per feature category, keyed by (category, statistic).
    report = {}
    categories = (
        ("Gene", gene_seqs),
        ("Exon", exon_seqs),
        ("Intron", intron_seqs),
    )
    for label, seqs in categories:
        stats = SummaryStats([len(seq) for seq in seqs], dtype="int")
        report[label, "Number"] = stats.size
        report[label, "Average size (bp)"] = stats.mean
        report[label, "Median size (bp)"] = stats.median
        report[label, "Total length (Mb)"] = human_size(
            stats.sum, precision=0, target="Mb")
        report[label, "% of genome"] = percentage(
            stats.sum, genome.totalsize, precision=0, mode=-1)
        report[label, "% GC"] = gc(seqs)

    print(tabulate(report), file=sys.stderr)
opts, args = p.parse_args(args) try: agpfile, componentfasta, targetfasta = args except Exception, e: sys.exit(p.print_help()) agp = AGP(agpfile) build = Fasta(targetfasta) bacs = Fasta(componentfasta, index=False) # go through this line by line for aline in agp: try: build_seq = build.sequence(dict(chr=aline.object, start=aline.object_beg, stop=aline.object_end)) if aline.is_gap: assert build_seq.upper() == aline.gap_length * 'N', \ "gap mismatch: %s" % aline else: bac_seq = bacs.sequence(dict(chr=aline.component_id, start=aline.component_beg, stop=aline.component_end, strand=aline.orientation)) assert build_seq.upper() == bac_seq.upper(), \ "sequence mismatch: %s" % aline logging.debug("%s:%d-%d verified" % (aline.object, aline.object_beg, aline.object_end))
def frommaf(args):
    """
    %prog frommaf maffile

    Convert to four-column tabular format from MAF.
    """
    # NOTE: the docstring above is handed to OptionParser as the usage text.
    #
    # args: list of command-line tokens; exactly one positional (the MAF file)
    #       plus the optional --validate FASTA path.
    # Output: a VCF v4.0 file written next to the input, with the input's last
    #         extension replaced by ".vcf". Only rows whose first column parses
    #         as an integer chromosome are converted; every record gets the
    #         fixed ID ".", QUAL 20, FILTER "PASS" and INFO "DP=20".
    # Raises: AssertionError when --validate is given and a reference base in
    #         the FASTA disagrees with the REF column.
    # Returns None.
    p = OptionParser(frommaf.__doc__)
    p.add_option("--validate",
                 help="Validate coordinates against FASTA [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    maf, = args
    snpfile = maf.rsplit(".", 1)[0] + ".vcf"

    # Constant columns shared by every record (named so that the builtins
    # `id` and `filter` are not shadowed).
    snp_id = "."
    qual = 20
    snp_filter = "PASS"
    info = "DP=20"
    header = "#CHROM POS ID REF ALT QUAL FILTER INFO".replace(" ", "\t")

    total = 0
    # Both handles are closed on every exit path, including errors.
    with open(maf) as fp, open(snpfile, "w") as fw:
        fw.write("##fileformat=VCFv4.0\n")
        fw.write(header + "\n")
        for row in fp:
            atoms = row.split()
            if len(atoms) < 4:
                # Blank or truncated rows cannot hold chrom/pos/ref/alt.
                continue
            c, pos, ref, alt = atoms[:4]
            try:
                c = int(c)
            except ValueError:
                # Header rows and non-numeric chromosomes are skipped.
                continue
            c = "chr{0:02d}".format(c)
            pos = int(pos)
            record = (c, pos, snp_id, ref, alt, qual, snp_filter, info)
            fw.write("\t".join(str(x) for x in record) + "\n")
            total += 1

    validate = opts.validate
    if not validate:
        return

    from jcvi.utils.cbook import percentage

    # Re-read the VCF just written and compare each REF base with the FASTA.
    f = Fasta(validate)
    nsnps = 0
    with open(snpfile) as fp:
        for row in fp:
            if row[0] == '#':
                continue
            c, pos, _, ref = row.split("\t")[:4]
            pos = int(pos)
            feat = dict(chr=c, start=pos, stop=pos)
            s = str(f.sequence(feat))
            if s != ref:
                # Raised explicitly (not via `assert`) so that the check is
                # still performed when Python runs with -O.
                raise AssertionError(
                    "Validation error: {0} is {1} (expect: {2})".
                    format(feat, s, ref))
            nsnps += 1
            if nsnps % 50000 == 0:
                logging.debug("SNPs parsed: {0}".format(
                    percentage(nsnps, total)))

    logging.debug("A total of {0} SNPs validated and written to `{1}`.".
                  format(nsnps, snpfile))
def load(args):
    '''
    %prog load gff_file fasta_file [--options]

    Parses the selected features out of GFF, with subfeatures concatenated.
    For example, to get the CDS sequences, do this::

        $ %prog load athaliana.gff athaliana.fa --parents mRNA --children CDS
    '''
    # NOTE: the docstring above is handed to OptionParser as the usage text.
    #
    # args: list of command-line tokens; two positionals (GFF file, FASTA
    #       file) plus --parents / --children / --attribute and the output
    #       option registered by set_outfile().
    # Output: one FASTA record per parent feature, written to opts.outfile.
    # Returns None.
    from jcvi.formats.fasta import Seq, SeqRecord

    p = OptionParser(load.__doc__)
    p.add_option("--parents", dest="parents", default="mRNA",
                 help="list of features to extract, use comma to separate (e.g."
                 "'gene,mRNA') [default: %default]")
    p.add_option("--children", dest="children", default="CDS",
                 help="list of features to extract, use comma to separate (e.g."
                 "'five_prime_UTR,CDS,three_prime_UTR') [default: %default]")
    p.add_option("--attribute",
                 help="The attribute field to extract [default: %default]")
    set_outfile(p)

    opts, args = p.parse_args(args)

    if len(args) != 2:
        # print_help() returns None, so `not ...` yields a non-zero exit
        # status for the usage error instead of reporting success.
        sys.exit(not p.print_help())

    gff_file, fasta_file = args

    g = make_index(gff_file)
    f = Fasta(fasta_file, index=False)
    # Not closed here: flushed after every record instead.
    # NOTE(review): presumably must_open() may hand back sys.stdout - confirm.
    fw = must_open(opts.outfile, "w")

    parents = set(opts.parents.split(','))
    children_list = set(opts.children.split(','))
    attr = opts.attribute

    for feat in get_parents(gff_file, parents):

        children = []
        for c in g.children(feat.id, 1):

            if c.featuretype not in children_list:
                continue
            child = f.sequence(dict(chr=c.chrom, start=c.start, stop=c.stop,
                                    strand=c.strand))
            children.append((child, c))

        if not children:
            # sorted() keeps the message stable; a set has no defined order.
            sys.stderr.write("[warning] %s has no children with type %s\n"
                             % (feat.id, ','.join(sorted(children_list))))
            continue
        # sort children in incremental position
        children.sort(key=lambda x: x[1].start)
        # reverse children if negative strand
        if feat.strand == '-':
            children.reverse()
        feat_seq = ''.join(x[0] for x in children)

        # The requested attribute (when present on the feature) becomes the
        # FASTA description, with double quotes stripped.
        description = ",".join(feat.attributes[attr]) \
            if attr and attr in feat.attributes else ""
        description = description.replace("\"", "")

        rec = SeqRecord(Seq(feat_seq), id=feat.id, description=description)
        SeqIO.write([rec], fw, "fasta")
        fw.flush()
opts, args = p.parse_args(args) try: agpfile, componentfasta, targetfasta = args except Exception, e: sys.exit(p.print_help()) agp = AGP(agpfile) build = Fasta(targetfasta) bacs = Fasta(componentfasta, index=False) # go through this line by line for aline in agp: try: build_seq = build.sequence( dict(chr=aline.object, start=aline.object_beg, stop=aline.object_end)) if aline.is_gap: assert build_seq.upper() == aline.gap_length * 'N', \ "gap mismatch: %s" % aline else: bac_seq = bacs.sequence( dict(chr=aline.component_id, start=aline.component_beg, stop=aline.component_end, strand=aline.orientation)) assert build_seq.upper() == bac_seq.upper(), \ "sequence mismatch: %s" % aline
def load(args):
    '''
    %prog load gff_file fasta_file [--options]

    Parses the selected features out of GFF, with subfeatures concatenated.
    For example, to get the CDS sequences, do this::

        $ %prog load athaliana.gff athaliana.fa --parents mRNA --children CDS
    '''
    # NOTE: the docstring above is handed to OptionParser as the usage text.
    #
    # args: list of command-line tokens; two positionals (GFF file, FASTA
    #       file) plus --parents / --children / --attribute and the output
    #       option registered by set_outfile().
    # Output: one FASTA record per parent feature, written to opts.outfile.
    # Returns None.
    from jcvi.formats.fasta import Seq, SeqRecord

    p = OptionParser(load.__doc__)
    p.add_option(
        "--parents", dest="parents", default="mRNA",
        help="list of features to extract, use comma to separate (e.g."
        "'gene,mRNA') [default: %default]")
    p.add_option(
        "--children", dest="children", default="CDS",
        help="list of features to extract, use comma to separate (e.g."
        "'five_prime_UTR,CDS,three_prime_UTR') [default: %default]")
    p.add_option("--attribute",
                 help="The attribute field to extract [default: %default]")
    set_outfile(p)

    opts, args = p.parse_args(args)

    if len(args) != 2:
        # print_help() returns None; negating it makes a usage error exit
        # with a non-zero status rather than 0.
        sys.exit(not p.print_help())

    gff_file, fasta_file = args

    g = make_index(gff_file)
    f = Fasta(fasta_file, index=False)
    fw = must_open(opts.outfile, "w")

    parents = set(opts.parents.split(','))
    children_list = set(opts.children.split(','))
    attr = opts.attribute

    for feat in get_parents(gff_file, parents):
        pieces = _ordered_child_seqs(g, f, feat, children_list)
        if not pieces:
            # sorted() keeps the message stable; a set has no defined order.
            sys.stderr.write("[warning] %s has no children with type %s\n"
                             % (feat.id, ','.join(sorted(children_list))))
            continue

        # The requested attribute (when present on the feature) becomes the
        # FASTA description, with double quotes stripped.
        if attr and attr in feat.attributes:
            description = ",".join(feat.attributes[attr])
        else:
            description = ""
        description = description.replace("\"", "")

        rec = SeqRecord(Seq(''.join(pieces)), id=feat.id,
                        description=description)
        SeqIO.write([rec], fw, "fasta")
        fw.flush()


def _ordered_child_seqs(index, fasta, feat, child_types):
    # Return the sequences of feat's direct children whose type is in
    # child_types, ordered by start and reversed for '-' strand parents.
    hits = [c for c in index.children(feat.id, 1)
            if c.featuretype in child_types]
    # sort children in incremental position
    hits.sort(key=lambda c: c.start)
    # reverse children if negative strand
    if feat.strand == '-':
        hits.reverse()
    return [fasta.sequence(dict(chr=c.chrom, start=c.start, stop=c.stop,
                                strand=c.strand))
            for c in hits]