예제 #1
0
 def setUpClass(cls):
     """Prepare the shared fixtures: two test sequences and a CAI index.

     Stores ``str_light_chain_one``, ``str_light_chain_two`` and ``X`` (a
     CodonAdaptationIndex built from ``CodonUsage/HighlyExpressedGenes.txt``)
     on the class.
     """
     # CRC64 collision example from Sebastian Bassi, using the immunoglobulin
     # lambda light chain variable region from Homo sapiens. Both sequences
     # share the same CRC64 checksum: 44CAAD88706CC153
     cls.str_light_chain_one = (
         "QSALTQPASVSGSPGQSITISCTGTSSDVGSYNLVSWYQQHPGKAPKLMIYEGSKRPSGV"
         "SNRFSGSKSGNTASLTISGLQAEDEADYYCSSYAGSSTLVFGGGTKLTVL"
     )
     cls.str_light_chain_two = (
         "QSALTQPASVSGSPGQSITISCTGTSSDVGSYNLVSWYQQHPGKAPKLMIYEGSKRPSGV"
         "SNRFSGSKSGNTASLTISGLQAEDEADYYCCSYAGSSTWVFGGGTKLTVL"
     )
     # Build the codon adaptation index once for the whole class.
     codon_index = CodonAdaptationIndex()
     codon_index.generate_index(
         os.path.join("CodonUsage", "HighlyExpressedGenes.txt")
     )
     cls.X = codon_index
예제 #2
0
    def test_codon_usage_custom(self):
        """Test Codon Adaptation Index (CAI) using FASTA file for background.

        Single-part CDS features are sliced out of the GenBank record
        (reverse-complemented on the minus strand), checked against their
        annotated translation, and written to a temporary FASTA file from
        which the index is generated. The temporary file is removed even
        when a later step fails.
        """
        # We need a FASTA file of CDS sequences to count the codon usage...
        dna_fasta_filename = "fasta.tmp"
        dna_genbank_filename = "GenBank/NC_005816.gb"
        record = SeqIO.read(dna_genbank_filename, "genbank")
        records = []
        for feature in record.features:
            if feature.type == "CDS" and len(feature.location.parts) == 1:
                start = feature.location.start.position
                end = feature.location.end.position
                table = int(feature.qualifiers["transl_table"][0])
                if feature.strand == -1:
                    seq = record.seq[start:end].reverse_complement()
                else:
                    seq = record.seq[start:end]
                # Double check we have the CDS sequence expected
                # TODO - Use any cds_start option if/when added to deal with the met
                # The first codon is skipped and "M" substituted for it before
                # comparing with the annotated translation.
                a = "M" + str(seq[3:].translate(table))
                b = feature.qualifiers["translation"][0] + "*"
                self.assertEqual(a, b, "%r vs %r" % (a, b))
                records.append(
                    SeqRecord(
                        seq,
                        id=feature.qualifiers["protein_id"][0],
                        description=feature.qualifiers["product"][0],
                    )
                )

        try:
            with open(dna_fasta_filename, "w") as handle:
                SeqIO.write(records, handle, "fasta")

            CAI = CodonAdaptationIndex()
            # Note - this needs a FASTA file which contains non-ambiguous DNA
            # coding sequences - which should each be a whole number of codons.
            CAI.generate_index(dna_fasta_filename)
            # Now check codon usage index (CAI) using this species
            self.assertEqual(
                record.annotations["source"],
                "Yersinia pestis biovar Microtus str. 91001",
            )
            value = CAI.cai_for_gene("ATGCGTATCGATCGCGATACGATTAGGCGGATG")
            self.assertAlmostEqual(value, 0.67213, places=5)
        finally:
            # Always clean up, so a failing assertion above does not leave
            # the temporary FASTA file behind.
            if os.path.exists(dna_fasta_filename):
                os.remove(dna_fasta_filename)
예제 #3
0
def calc_cai(sequence, genbank, cai_freqs=CAI_FREQS):
    """Return the CAI computed over every CDS feature of a genome.

    Args:
        sequence: Parent sequence handed to each feature's ``extract``.
        genbank: Object exposing ``features``; only features whose ``type``
            equals "cds" (case-insensitively) are used.
        cai_freqs: Mapping from codon to frequency; it is looked up for
            every codon listed in ``SynonymousCodons``.

    Returns:
        The result of ``CodonAdaptationIndex.cai_for_gene`` for the
        concatenation of all processed CDS sequences.
    """
    # Scale each codon's frequency by the highest frequency found within
    # its group of synonymous codons.
    scaled_index = {}
    for synonymous in SynonymousCodons.values():
        group = list(synonymous)
        freqs = np.array([cai_freqs[codon] for codon in group])
        scaled = freqs / max(freqs)
        scaled_index.update(zip(group, scaled))

    cai_table = CodonAdaptationIndex()
    cai_table.set_cai_index(scaled_index)

    # Join the coding regions into one sequence, upper-cased and with U
    # replaced by T before being passed through proc_sequence.
    coding_features = [
        feature for feature in genbank.features
        if feature.type.lower() == "cds"
    ]
    cds_seq = ""
    for feature in coding_features:
        extracted = str(feature.extract(sequence))
        cds_seq += proc_sequence(extracted.upper().replace("U", "T"))

    return cai_table.cai_for_gene(cds_seq)
예제 #4
0
    def test_codon_usage_custom(self):
        """Test Codon Adaptation Index (CAI) using FASTA file for background.

        CDS features without sub-features are sliced out of the GenBank
        record (reverse-complemented on the minus strand), checked against
        their annotated translation, and written to a temporary FASTA file
        from which the index is generated. The temporary file is removed
        even when a later step fails.
        """
        # We need a FASTA file of CDS sequences to count the codon usage...
        dna_fasta_filename = "fasta.tmp"
        dna_genbank_filename = "GenBank/NC_005816.gb"
        record = SeqIO.read(dna_genbank_filename, "genbank")
        records = []
        for feature in record.features:
            if feature.type == "CDS" and not feature.sub_features:
                start = feature.location.start.position
                end = feature.location.end.position
                table = int(feature.qualifiers["transl_table"][0])
                if feature.strand == -1:
                    seq = record.seq[start:end].reverse_complement()
                else:
                    seq = record.seq[start:end]
                # Double check we have the CDS sequence expected
                # TODO - Use any cds_start option if/when added to deal with the met
                # The first codon is skipped and "M" substituted for it before
                # comparing with the annotated translation.
                a = "M" + str(seq[3:].translate(table))
                b = feature.qualifiers["translation"][0] + "*"
                self.assertEqual(a, b, "%r vs %r" % (a, b))
                records.append(SeqRecord(seq, id=feature.qualifiers["protein_id"][0],
                                         description=feature.qualifiers["product"][0]))

        try:
            with open(dna_fasta_filename, "w") as handle:
                SeqIO.write(records, handle, "fasta")

            CAI = CodonAdaptationIndex()
            # Note - this needs a FASTA file which contains non-ambiguous DNA
            # coding sequences - which should each be a whole number of codons.
            CAI.generate_index(dna_fasta_filename)
            # Now check codon usage index (CAI) using this species
            self.assertEqual(record.annotations["source"],
                             "Yersinia pestis biovar Microtus str. 91001")
            self.assertEqual("%0.5f" % CAI.cai_for_gene("ATGCGTATCGATCGCGATACGATTAGGCGGATG"),
                             "0.67213")
        finally:
            # Always clean up, so a failing assertion above does not leave
            # the temporary FASTA file behind.
            if os.path.exists(dna_fasta_filename):
                os.remove(dna_fasta_filename)
예제 #5
0
# Cross-check quick_FASTA_reader against SeqIO.parse on the same FASTA file:
# both must yield three records, and each tuple must equal the matching
# record's (description, sequence string) pair.
# NOTE(review): this fragment uses Python 2 print statements, and the handles
# returned by the bare open() calls are never explicitly closed.
tuple_records = quick_FASTA_reader(dna_fasta_filename)
assert len(tuple_records)==3
seq_records = list(SeqIO.parse(open(dna_fasta_filename),"fasta"))
assert len(seq_records)==3
for tuple_record, seq_record in zip(tuple_records, seq_records):
    assert tuple_record == (seq_record.description, seq_record.seq.tostring())
    print "%s has GC%% of %0.1f" % (seq_record.name, GC(seq_record.seq))

##############
# CodonUsage #
##############

print
print "Codon Adaption Index (CAI)"
# No index is supplied here; the message printed below labels the result as
# the E. coli default.
CAI = CodonAdaptationIndex()
# Note - this needs a whole number of codons, and a DNA seq AS A STRING.
print "Example CAI %0.5f using E. coli (default)" \
      % CAI.cai_for_gene("ATGCGTATCGATCGCGATACGATTAGGCGGATG")

# We need a FASTA file of CDS sequences to count the codon usage...
dna_fasta_filename = "fasta.tmp"
dna_genbank_filename = "GenBank/NC_005816.gb"
record = SeqIO.read(open(dna_genbank_filename), "genbank")
# Filled by the feature loop that follows.
records = []
for feature in record.features:
    if feature.type == "CDS" \
    and not feature.sub_features:
        start = feature.location.start.position
        end = feature.location.end.position
        table = int(feature.qualifiers["transl_table"][0])
예제 #6
0
	
Pro	CCG	15778	5.40	0.12
Pro	CCA	51993	17.79	0.41
Pro	CCT	39685	13.58	0.31
Pro	CCC	20139	6.89	0.16'''
import csv

# Build the codon -> value table from the non-blank, tab-separated lines of
# cai_table. Every row must have exactly five fields: the second field (the
# codon) is the key and the fourth field, parsed as a float, is the value.
index = {}
for aa, cn, n1,n2, f in csv.reader([x for x in cai_table.splitlines() if x.strip()], delimiter='\t'):
    index[cn] = float(n2)

from Bio.SeqUtils.CodonUsage import CodonAdaptationIndex
from Bio.SeqUtils.CodonUsageIndices import SharpEcoliIndex
from pprint import pprint


# Use the table built above as the CAI index.
cai = CodonAdaptationIndex()

cai.set_cai_index(index)

from Bio import SeqIO

genes = list(SeqIO.parse("genes.fasta", "fasta"))

from Bio.SeqUtils import GC

# Print each record id next to its CAI value (Python 2 print statement).
for g in genes:
    print g.id, "         ",cai.cai_for_gene( g.seq.tostring() )#, GC(g.seq.tostring())


예제 #7
0
# Cross-check quick_FASTA_reader against SeqIO.parse on the same FASTA file:
# both must yield three records, and each tuple must equal the matching
# record's (description, sequence string) pair.
# NOTE(review): this fragment uses Python 2 print statements, and the handles
# returned by the bare open() calls are never explicitly closed.
tuple_records = quick_FASTA_reader(dna_fasta_filename)
assert len(tuple_records)==3
seq_records = list(SeqIO.parse(open(dna_fasta_filename),"fasta"))
assert len(seq_records)==3
for tuple_record, seq_record in zip(tuple_records, seq_records) :
    assert tuple_record == (seq_record.description, seq_record.seq.tostring())
    print "%s has GC%% of %0.1f" % (seq_record.name, GC(seq_record.seq))

##############
# CodonUsage #
##############

print
print "Codon Adaption Index (CAI)"
# No index is supplied here; the message printed below labels the result as
# the E. coli default.
CAI = CodonAdaptationIndex()
# Note - this needs a whole number of codons, and a DNA seq AS A STRING.
print "Example CAI %0.5f using E. coli (default)" \
      % CAI.cai_for_gene("ATGCGTATCGATCGCGATACGATTAGGCGGATG")

# We need a FASTA file of CDS sequences to count the codon usage...
dna_fasta_filename = "fasta.tmp"
dna_genbank_filename = "GenBank/NC_005816.gb"
record = SeqIO.read(open(dna_genbank_filename), "genbank")
# Filled by the feature loop that follows.
records = []
for feature in record.features :
    if feature.type == "CDS" \
    and not feature.sub_features :
        start = feature.location.start.position
        end = feature.location.end.position
        table = int(feature.qualifiers["transl_table"][0])
예제 #8
0
 def test_codon_usage_ecoli(self):
     """Test Codon Adaptation Index (CAI) using default E. coli data.

     Scores a fixed 33-base sequence with a freshly constructed index and
     compares the result after formatting it to five decimal places.
     """
     ecoli_index = CodonAdaptationIndex()
     score = ecoli_index.cai_for_gene("ATGCGTATCGATCGCGATACGATTAGGCGGATG")
     formatted = "%0.5f" % score
     self.assertEqual(formatted, "0.09978")
예제 #9
0
 def test_codon_usage_ecoli(self):
     """Test Codon Adaptation Index (CAI) using default E. coli data.

     The score for a fixed sequence is rendered with five decimal places
     and compared as text.
     """
     gene = "ATGCGTATCGATCGCGATACGATTAGGCGGATG"
     rendered = "%0.5f" % CodonAdaptationIndex().cai_for_gene(gene)
     self.assertEqual(rendered, "0.09978")
예제 #10
0
 def test_codon_usage_ecoli(self):
     """Test Codon Adaptation Index (CAI) using default E. coli data.

     The score for a fixed sequence must match 0.09978 to five decimal
     places.
     """
     self.assertAlmostEqual(
         CodonAdaptationIndex().cai_for_gene("ATGCGTATCGATCGCGATACGATTAGGCGGATG"),
         0.09978,
         places=5,
     )
예제 #11
0
 def __init__(self, verbose=False):
     """Initialise the CodonAdaptationIndex base, then record ``verbose``.

     ``verbose`` is stored unchanged as ``self.verbose``; how it is used is
     not visible from this method.
     """
     # Explicit base-class call, made before the attribute is set.
     CodonAdaptationIndex.__init__(self)
     self.verbose = verbose
예제 #12
0
'''

# Build the codon -> value table by text-searching cai_table.
index={}

for codon in CodonsDict.keys():
    # The search key swaps T for U, so despite the variable name it is the
    # RNA spelling of the codon.
    DNA_codon = codon.replace("T","U")
    i = cai_table.find( DNA_codon )    
    # The four characters starting 4 positions past the match are parsed as a
    # number and divided by 100.
    # NOTE(review): assuming cai_table is a str, find() returns -1 for a
    # missing codon and this line would then silently parse cai_table[3:7];
    # confirm that every codon occurs in the table.
    index[codon] = float(cai_table[i+4:i+8])/100
  

from Bio.SeqUtils.CodonUsage import CodonAdaptationIndex
from Bio.SeqUtils.CodonUsageIndices import SharpEcoliIndex
from pprint import pprint


# Use the table built above as the CAI index.
cai = CodonAdaptationIndex()

cai.set_cai_index(index)

# Print the CAI of one hard-coded coding sequence (Python 2 print statement).
print cai.cai_for_gene("ATGACTGAATTCAAGGCCGGTTCTGCTAAGAAAGGTGCTACACTTTTCAAGACTAGATGTCTACAATGCCACACCGTGGAAAAGGGTGGCCCACATAAGGTTGGTCCAAACTTGCATGGTATCTTTGGCAGACACTCTGGTCAAGCTGAAGGGTATTCGTACACAGATGCCAATATCAAGAAAAACGTGTTGTGGGACGAAAATAACATGTCAGAGTACTTGACTAACCCAAAGAAATATATTCCTGGTACCAAGATGGCCTTTGGTGGGTTGAAGAAGGAAAAAGACAGAAACGACTTAATTACCTACTTGAAAAAAGCCTGTGAGTAA")



# http://www.genscript.com/cgi-bin/tools/rare_codon_analysis
# cai = 0.79


'''
UUU 26.1(170666)  UCU 23.5(153557)  UAU 18.8(122728)  UGU  8.1( 52903)
UUC 18.4(120510)  UCC 14.2( 92923)  UAC 14.8( 96596)  UGC  4.8( 31095)
UUA 26.2(170884)  UCA 18.7(122028)  UAA  1.1(  6913)  UGA  0.7(  4447)
예제 #13
0
####
import os
import sys
from Bio import SeqIO
from Bio import Entrez
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio import SeqUtils
from Bio.SeqUtils import CodonUsage
from Bio.SeqUtils.CodonUsage import CodonAdaptationIndex, SynonymousCodons, CodonsDict
from types import MethodType

# Contact address set on the Entrez module.
Entrez.email = "*****@*****.**"

# Create a CodonAdaptationIndex instance (this does not define a new class).
CAI = CodonAdaptationIndex()


def _count_codons2(self, fasta_file):
    with open(fasta_file, 'r') as handle:

        # make the codon dictionary local
        self.codon_count = CodonsDict.copy()

        # iterate over sequence and count all the codons in the FastaFile.
        for cur_record in SeqIO.parse(handle, "fasta"):
            # make sure the sequence is lower case
            if str(cur_record.seq).islower():
                dna_sequence = str(cur_record.seq).upper()
            else:
                dna_sequence = str(cur_record.seq)
    'TGG': 1.00,
    'CGT': 0.08,
    'CGC': 0.18,
    'CGA': 0.11,
    'CGG': 0.20,
    'AGT': 0.15,
    'AGC': 0.24,
    'AGA': 0.21,
    'AGG': 0.21,
    'GGT': 0.16,
    'GGC': 0.34,
    'GGA': 0.25,
    'GGG': 0.25
}

# Create an index object and load the Codon_Index table into it.
CAI = CodonAdaptationIndex()
CAI.set_cai_index(Codon_Index)


def is_it_an_orf(s):
    """Return True when *s* has the outward shape of an open reading frame.

    The check is case-insensitive and requires all of: a length that is a
    multiple of three, a leading "ATG", and a trailing "TGA", "TAG" or
    "TAA". Codons in between are not inspected.
    """
    if len(s) % 3 != 0:
        return False
    bases = s.upper()
    return bases.startswith("ATG") and bases.endswith(("TGA", "TAG", "TAA"))


# The three stop codons mapped to the names opal, amber and ochre.
stopz = {"TGA": "opal", "TAG": "amber", "TAA": "ochre"}

# Command-line argument parser.
parser = argparse.ArgumentParser()