def calc_cai(sequence, genbank, cai_freqs=CAI_FREQS):
    """Compute the codon adaptation index over every CDS feature of a record.

    For each group of codons in ``SynonymousCodons``, the frequencies looked
    up in ``cai_freqs`` are divided by the largest frequency in that group;
    the resulting per-codon weights are installed as the index of a
    ``CodonAdaptationIndex``. Every feature of ``genbank`` whose type is
    "cds" (compared case-insensitively) is extracted from ``sequence``,
    upper-cased, has "U" replaced by "T", is passed through
    ``proc_sequence`` and appended to one running sequence.

    Returns whatever ``cai_for_gene`` yields for that concatenated sequence.
    """
    # Per-codon weight: frequency relative to the best codon of its group.
    weights = {}
    for family in SynonymousCodons.values():
        family = list(family)
        freqs = np.array([cai_freqs[codon] for codon in family])
        scaled = freqs / max(freqs)
        for codon, weight in zip(family, scaled):
            weights[codon] = weight

    scorer = CodonAdaptationIndex()
    scorer.set_cai_index(weights)

    # Stitch the coding sequences together in feature order.
    cds_features = [
        feature for feature in genbank.features
        if feature.type.lower() == "cds"
    ]
    coding_seq = ""
    for feature in cds_features:
        extracted = str(feature.extract(sequence)).upper().replace("U", "T")
        coding_seq += proc_sequence(extracted)

    return scorer.cai_for_gene(coding_seq)
Pro CCG 15778 5.40 0.12 Pro CCA 51993 17.79 0.41 Pro CCT 39685 13.58 0.31 Pro CCC 20139 6.89 0.16'''
# The line above closes a triple-quoted table whose opening lies outside this
# view; the loop below reads that table through the name `cai_table`.
import csv

# Build a lookup from the table: every non-blank line is parsed as exactly
# five tab-separated fields, and the fourth field (n2) is stored as a float
# under the second field (cn). A line with a different number of fields
# fails the tuple unpacking with ValueError.
# NOTE(review): the visible rows look like amino acid, codon, a count and two
# numeric columns -- confirm that the fourth column (rather than the fifth)
# is the value intended for the CAI index.
index = {}
for aa, cn, n1,n2, f in csv.reader([x for x in cai_table.splitlines() if x.strip()], delimiter='\t'):
    index[cn] = float(n2)

from Bio.SeqUtils.CodonUsage import CodonAdaptationIndex
from Bio.SeqUtils.CodonUsageIndices import SharpEcoliIndex
from pprint import pprint

# Install the parsed table as the per-codon index of a CodonAdaptationIndex.
cai = CodonAdaptationIndex()
cai.set_cai_index(index)

from Bio import SeqIO
# Parse every record of genes.fasta into a list.
genes = list(SeqIO.parse("genes.fasta", "fasta"))

from Bio.SeqUtils import GC
# Print each record's id followed by the CAI computed for its sequence.
# NOTE(review): `print` is used as a statement here, so this only parses
# under Python 2.
for g in genes:
    print g.id, " ",cai.cai_for_gene( g.seq.tostring() )#, GC(g.seq.tostring())
'CGT': 0.08, 'CGC': 0.18, 'CGA': 0.11, 'CGG': 0.20, 'AGT': 0.15, 'AGC': 0.24, 'AGA': 0.21, 'AGG': 0.21, 'GGT': 0.16, 'GGC': 0.34, 'GGA': 0.25, 'GGG': 0.25 } CAI = CodonAdaptationIndex() CAI.set_cai_index(Codon_Index) def is_it_an_orf(s): is_it = True is_it = is_it and len(s) % 3 == 0 is_it = is_it and s.upper().startswith("ATG") is_it = is_it and (s.upper().endswith("TGA") or s.upper().endswith("TAG") or s.upper().endswith("TAA")) return (is_it) stopz = {"TGA": "opal", "TAG": "amber", "TAA": "ochre"} parser = argparse.ArgumentParser() parser.add_argument("-i",