def iterator_over_file_from_extension(filename):
    """Return a record iterator chosen from the extensions of *filename*.

    The extensions are the dot-separated parts of the file's base name after
    the first one, so neither a directory nor a base name that happens to be
    called "fa" or "fq" selects a parser.

    * "fa" / "fasta" among the extensions -> ``jbio.fasta.record_iterator``
    * "fq" / "fastq" among the extensions -> ``jbio.fastq.record_iterator``
    * a final "gz" part -> the file is opened with ``gzip.open``; anything
      else uses the builtin ``open``.

    Raises:
        ValueError: if no FASTA/FASTQ extension is present.  This is checked
            before the file iterator is built, so an unsupported name never
            reaches ``iterator_over_file``.
    """
    import gzip
    import os

    openers = {"gz": gzip.open}
    parts = os.path.basename(filename).split(".")
    extensions = parts[1:]

    if "fa" in extensions or "fasta" in extensions:
        from jbio.fasta import record_iterator
    elif "fq" in extensions or "fastq" in extensions:
        from jbio.fastq import record_iterator
    else:
        raise ValueError("Unknown File Extension \'%s\'" % parts[-1])

    # Only the last part decides whether the file is compressed.
    opener = openers.get(parts[-1], open)
    return record_iterator(iterator_over_file(filename, opener))
if len(sys.argv) != 3:
    # sys.exit(message) writes to stderr and exits with status 1, so the
    # usage text can never end up inside redirected FASTA output.
    sys.exit("gene_fasta.py input.fa input.gff")

# Attribute keys echoed in every header as "[key=value]" tags.
#FIELDS = ["ID","Alias","orf_classification","gene","Note"]
FIELDS = ["ID", "Note"]

fa_fn, gff_fn = sys.argv[1:3]


def fasta_clean_getter(fasta_entry):
    """Return ``(name, seq)`` using the first whitespace-delimited token of
    the entry's name."""
    name = fasta_entry.name.split()[0]
    return (name, fasta_entry.seq)


def _parse_gff_attributes(attribute):
    """Turn a ``key=value;key=value`` attribute string into a dict.

    Only the first "=" of a pair separates key from value, so values that
    contain "=" are kept whole.  Empty pairs (for example the one produced
    by a trailing ";") are skipped, and a pair without "=" maps its text to
    an empty string.
    """
    parsed = {}
    for pair in attribute.split(";"):
        if not pair.strip():
            continue
        key, _, value = pair.partition("=")
        parsed[key] = value
    return parsed


# Read every FASTA record into memory, keyed by the short name.
fasta_records = dict(
    fasta_clean_getter(entry)
    for entry in fasta_iterator(iterator_over_file(fa_fn)))

# Lazily keep only the GFF rows whose feature is "gene".
gene_entries = (
    record for record in gff_iterator(iterator_over_file(gff_fn))
    if record.feature == "gene")

for gene_record in gene_entries:
    attrs = _parse_gff_attributes(gene_record.attribute)

    # A gene without a "Name" attribute raises KeyError here.
    header = ">" + attrs["Name"]

    tags = []
    for field in FIELDS:
        value = attrs.get(field, "None")
        if field == "Note":
            # Only the Note value is passed through unquote().
            value = unquote(value)
        tags.append("[%s=%s]" % (field, value))
    header += " " + " ".join(tags)

    # start/end are shifted down by one and the slice includes `end`,
    # i.e. the record's coordinates are treated as 1-based inclusive.
    start, end = gene_record.start - 1, gene_record.end - 1
    seq = fasta_records[gene_record.seqname][start:end + 1]
    print(header)
#!/usr/bin/env python
"""Print each FASTA record of the input file as one ``name<TAB>sequence`` line."""
import sys

from jbio.io.file import iterator_over_file
from jbio.fasta import record_iterator as fasta_iterator

# Exactly one argument (the FASTA path) is expected after the script name.
if len(sys.argv) != 2:
    sys.exit("fasta_to_line.py in.fa")

fasta_path = sys.argv[1]
for entry in fasta_iterator(iterator_over_file(fasta_path)):
    columns = (entry.name, entry.seq)
    print("\t".join(columns))
sys.exit(1)  # NOTE(review): tail of a guard whose header precedes this excerpt -- keep it nested under that guard

# Attribute keys echoed in every header as "[key=value]" tags.
#FIELDS = ["ID","Alias","orf_classification","gene","Note"]
FIELDS = ["ID", "Note"]

fa_fn, gff_fn = sys.argv[1:3]


#read fasta records into memory
def fasta_clean_getter(fasta_entry):
    """Return ``(name, seq)`` using the first whitespace-delimited token of
    the entry's name."""
    name = fasta_entry.name.split()[0]
    return (name, fasta_entry.seq)


fasta_records = dict(
    fasta_clean_getter(entry)
    for entry in fasta_iterator(iterator_over_file(fa_fn)))

# Lazily keep only the GFF rows whose feature is "gene".
gene_entries = (
    record for record in gff_iterator(iterator_over_file(gff_fn))
    if record.feature == "gene")

for gene_record in gene_entries:
    # Parse "key=value;key=value".  Splitting on the first "=" only keeps
    # values containing "=" intact, and empty pairs (e.g. after a trailing
    # ";") are skipped instead of making the dict construction fail.
    attrs = {}
    for pair in gene_record.attribute.split(";"):
        if not pair.strip():
            continue
        key, _, value = pair.partition("=")
        attrs[key] = value

    # A gene without a "Name" attribute raises KeyError here.
    header = ">" + attrs["Name"]
    fields = FIELDS

    def field_getter_func(field):
        """Look up one attribute ("None" if absent); only Note is unquoted."""
        value = attrs.get(field, "None")
        return unquote(value) if field == "Note" else value

    header += " " + " ".join(
        ["[%s=%s]" % (field, field_getter_func(field)) for field in fields])

    # Shift both coordinates down by one for 0-based indexing.
    start, end = gene_record.start - 1, gene_record.end - 1
#!/usr/bin/env python
"""Print every k-mer of every record in a FASTA file, one per line.

Usage: kmer.py k-size in.fa
"""
import sys
from itertools import imap

from jbio.io.file import iterator_over_file
from jbio.fasta import record_iterator as fasta_iterator

##Create Kmers
if len(sys.argv) != 3:
    sys.exit("Usage: kmer.py k-size in.fa\n")

# Validate k up front: a non-integer used to end in a traceback, and a zero
# or negative k silently printed empty or truncated slices.
try:
    ksize = int(sys.argv[1])
except ValueError:
    sys.exit("k-size must be an integer, got %r\n" % sys.argv[1])
if ksize < 1:
    sys.exit("k-size must be at least 1, got %d\n" % ksize)

fn = sys.argv[2]

for record in fasta_iterator(iterator_over_file(fn)):
    seq = record.seq
    # xrange (this is a Python 2 script) yields the start offsets lazily
    # instead of building a list with one entry per position of the
    # sequence.  Sequences shorter than k give an empty range.
    starts = xrange(len(seq) - ksize + 1)
    kmers = imap(lambda start: seq[start:start + ksize], starts)
    for kmer in kmers:
        print(kmer)