Example #1
0
 def optimize(self, codon_table):
     """Rebuild the generated vaccine codons from a DNA Chisel optimization run.

     The codon list is first seeded through ``optimize_frequent``. The seeded
     codons are converted to a strand with ``get_strand`` and optimized with
     the "use_best_codon" objective, under two constraints: translation is
     enforced (standard genetic table, ``ATG`` start codon) and GC content is
     kept between 0.54 and 0.9 over a window of 120.

     Arguments:
         codon_table  Identifier passed to ``optimize_frequent`` and to
                      ``pct.get_codons_table`` to select the usage table.
     Return: None; the resulting codons are stored on the instance.
     """
     self.optimize_frequent(codon_table)

     # Work on a snapshot of the seeded codons; the live list is emptied in place.
     seeded_codons = self.__vaccine_codons_gen.copy()
     self.__vaccine_codons_gen.clear()
     strand = self.get_strand(seeded_codons)

     codon_table = pct.get_codons_table(codon_table)
     keep_translation = EnforceTranslation(genetic_table='Standard',
                                           start_codon='ATG')
     keep_gc_content = EnforceGCContent(mini=0.54, maxi=0.9, window=120)
     prefer_best_codon = CodonOptimize(method="use_best_codon",
                                       codon_usage_table=codon_table)
     problem = DnaOptimizationProblem(
         sequence=strand,
         constraints=[keep_translation, keep_gc_content],
         objectives=[prefer_best_codon],
     )
     problem.resolve_constraints()
     problem.optimize()

     # Cut the optimized sequence into complete triplets; a trailing partial
     # codon (one or two leftover nucleotides) is discarded.
     optimized = problem.sequence
     self.__vaccine_codons_gen = [
         optimized[start:start + 3]
         for start in range(0, len(optimized) - 2, 3)
     ]
     return
Example #2
0
    def __init__(self, species=None, location=None, mode='best_codon',
                 codon_usage_table=None, boost=1.0):
        """Store the objective settings and precompute codon frequency lookups.

        Arguments:
            species            Name used to look up a codon usage table via
                               ``get_codons_table``. When given, it takes
                               precedence over ``codon_usage_table``.
            location           Stored as-is, except that a tuple is converted
                               with ``Location.from_tuple`` (default strand +1).
            mode               Stored on the instance as ``mode``.
            codon_usage_table  Mapping of amino acid -> {codon: frequency},
                               used when ``species`` is None.
            boost              Stored on the instance as ``boost``.

        Raises:
            ValueError  If neither ``species`` nor ``codon_usage_table``
                        is provided.

        Note: the derived entries ``'codons_frequencies'`` and
        ``'best_frequencies'`` are added to the table object itself (not to a
        copy), so a table passed in by the caller gains those two keys.
        """
        self.mode = mode
        self.boost = boost
        if isinstance(location, tuple):
            location = Location.from_tuple(location, default_strand=+1)
        self.location = location
        self.species = species
        if species is not None:
            codon_usage_table = get_codons_table(self.species)
        if codon_usage_table is None:
            raise ValueError("Provide either an species name or a codon "
                             "usage table")

        # PRECOMPUTE SOME TABLES (WILL BE PASSED ON TO LOCALIZED PROBLEMS)
        self.codon_usage_table = codon_usage_table
        # Keys holding derived data rather than per-amino-acid frequencies.
        # They must be skipped when walking the table, otherwise an existing
        # derived entry would be mistaken for an amino acid.
        derived_keys = ('codons_frequencies', 'best_frequencies')
        if 'codons_frequencies' not in codon_usage_table:
            # Flat {codon: frequency} lookup across all amino acids.
            codon_usage_table['codons_frequencies'] = {
                codon: frequency
                for aa, aa_data in codon_usage_table.items()
                if aa not in derived_keys
                for codon, frequency in aa_data.items()
            }
        if 'best_frequencies' not in codon_usage_table:
            # Highest codon frequency available for each amino acid.
            codon_usage_table['best_frequencies'] = {
                aa: max(aa_data.values())
                for aa, aa_data in codon_usage_table.items()
                if aa not in derived_keys
            }
Example #3
0
 def get_codons_table(species, codon_usage_table):
     """Return the codon usage table to use, looking it up by species if needed.

     An explicitly provided ``codon_usage_table`` is returned unchanged.
     Otherwise the table is obtained by calling ``get_codons_table`` with the
     species alone.

     NOTE(review): the one-argument call presumably resolves to a different,
     module-level ``get_codons_table`` than this two-argument definition;
     confirm against the enclosing scope.

     Raises:
         ValueError  If both ``species`` and ``codon_usage_table`` are None.
     """
     if codon_usage_table is not None:
         return codon_usage_table
     if species is None:
         raise ValueError(
             "Provide either an species name or a codon usage table")
     return get_codons_table(species)
Example #4
0
    def get_table(self, identifier: str) -> CodonFrequencyTable:
        """Build a codon frequency table for an organism identifier.

        The raw table is fetched with ``get_codons_table`` and wrapped,
        together with the identifier, in a ``CodonFrequencyTable``.

        Arguments:
            identifier  Identifier used to select the codon table
        Return: `CodonFrequencyTable`
        """
        return CodonFrequencyTable(identifier, get_codons_table(identifier))
def test_basics():
    """Check known frequencies in one named table and in the full collection."""
    # One table, selected by its full name.
    b_subtilis = pct.get_codons_table("b_subtilis_1423")
    for amino_acid, codon, frequency in (('T', 'ACA', 0.4), ('*', 'TAA', 0.61)):
        assert b_subtilis[amino_acid][codon] == frequency

    # Every available table, keyed by table name.
    every_table = pct.get_all_available_codons_tables()
    c_elegans = every_table['c_elegans_6239']
    assert c_elegans['L']['CTA'] == 0.09
Example #6
0
def test_basics():
    """Spot-check frequencies after loading a single table and all tables."""
    # Single table lookup by name.
    named_table = pct.get_codons_table("b_subtilis_1423")
    expected_frequencies = (("T", "ACA", 0.4), ("*", "TAA", 0.61))
    for residue, codon, expected in expected_frequencies:
        assert named_table[residue][codon] == expected

    # Bulk load of all available tables.
    tables_by_name = pct.get_all_available_codons_tables()
    assert tables_by_name["c_elegans_6239"]["L"]["CTA"] == 0.09
def test_codon_optimize_with_custom_table():
    """Codon optimization with an explicit usage table reaches a score of 0.

    A seeded 1200-nucleotide random sequence starts with an objective score
    below -10; after optimizing under an enforced translation the summed
    objective score must be exactly 0.
    """
    usage_table = get_codons_table('b_subtilis')
    start_sequence = random_dna_sequence(1200, seed=123)
    keep_translation = EnforceTranslation()
    codon_objective = CodonOptimize(codon_usage_table=usage_table)
    problem = DnaOptimizationProblem(
        sequence=start_sequence,
        constraints=[keep_translation],
        objectives=[codon_objective],
    )

    score_before = problem.objective_scores_sum()
    assert score_before < -10
    problem.optimize()
    score_after = problem.objective_scores_sum()
    assert score_after == 0
Example #8
0
def optimize_remap_pct(vir):
    """Map each item of ``vir`` to the most frequent codon of its translation.

    Every item is translated with ``biotools.translate``; the result selects
    an entry in the codon table of the module-level ``species``, and the codon
    with the highest frequency in that entry is chosen. On equal frequencies
    the codon listed first in the table wins, because the sort is stable.

    Returns a list with one codon per item of ``vir``, in the same order.
    """
    usage = pct.get_codons_table(species)

    def best_codon(codon):
        # Rank the synonymous codons from most to least frequent.
        ranked = sorted(usage[biotools.translate(codon)].items(),
                        key=lambda pair: pair[1], reverse=True)
        return ranked[0][0]

    return [best_codon(codon) for codon in vir]
def test_readme_example():
    """Replay the README walkthrough: named table, all tables, download."""
    # One table selected by name.
    local_table = pct.get_codons_table("b_subtilis_1423")
    for amino_acid, codon, frequency in (('T', 'ACA', 0.4), ('*', 'TAA', 0.61)):
        assert local_table[amino_acid][codon] == frequency

    # LOAD ALL TABLES AT ONCE
    all_tables = pct.get_all_available_codons_tables()
    assert all_tables['c_elegans_6239']['L']['CTA'] == 0.09

    # GET A TABLE DIRECTLY FROM THE INTERNET
    # (the stop codon is looked up as 'UGA' in the downloaded table)
    downloaded = pct.download_codons_table(taxid=316407)
    stop_codons = downloaded['*']
    assert stop_codons['UGA'] == 0.29
Example #10
0
def test_readme_example():
    """Exercise the three README usages and check one known value for each."""
    # Table selected by name.
    by_name = pct.get_codons_table("b_subtilis_1423")
    known_values = (("T", "ACA", 0.4), ("*", "TAA", 0.61))
    for residue, codon, expected in known_values:
        assert by_name[residue][codon] == expected

    # All tables loaded in one call, keyed by table name.
    tables = pct.get_all_available_codons_tables()
    leucine = tables["c_elegans_6239"]["L"]
    assert leucine["CTA"] == 0.09

    # Table downloaded by taxonomy id; note the 'UGA' (not 'TGA') key.
    remote = pct.download_codons_table(taxid=316407)
    assert remote["*"]["UGA"] == 0.29
Example #11
0
 def optimize_frequent(self, codon_table):
     """Refill the generated vaccine codons with the most frequent synonyms.

     The generated list is emptied in place. Each virus codon is then mapped
     to its amino acid, and the highest-frequency codon for that amino acid in
     the selected usage table is appended. The amino-acid code "s" is looked
     up in the table under "*". On equal frequencies the codon listed first in
     the table is kept, because the sort is stable.

     Arguments:
         codon_table  Identifier passed to ``pct.get_codons_table``.
     Return: None
     """
     usage = pct.get_codons_table(codon_table)
     self.__vaccine_codons_gen.clear()
     for virus_codon in self.__virus_codons:
         residue = self.__amino_acids[virus_codon]
         # The usage table stores this entry under "*" rather than "s".
         lookup_key = "*" if residue == "s" else residue
         ranked = sorted(usage[lookup_key].items(),
                         key=lambda entry: entry[1], reverse=True)
         self.__vaccine_codons_gen.append(ranked[0][0])
     return
Example #12
0
def most_freq(species):
    """Map every codon to the most frequent codon of its own table entry.

    For each entry of the species' codon table, the codons are visited in
    sorted order and the first one with the strictly highest frequency is
    selected. Every codon of that entry then maps to the selected codon. If
    no frequency in an entry is greater than 0, its codons map to None.

    Returns a dict of codon -> most frequent codon of the same entry.
    """
    replacement = {}
    for freq_by_codon in pct.get_codons_table(species).values():
        ordered_codons = sorted(freq_by_codon)
        best_codon, best_freq = None, 0
        for candidate in ordered_codons:
            # Strict comparison: an equal frequency never displaces an
            # earlier (alphabetically smaller) codon.
            if freq_by_codon[candidate] > best_freq:
                best_codon, best_freq = candidate, freq_by_codon[candidate]
        replacement.update(dict.fromkeys(ordered_codons, best_codon))
    return replacement
Example #13
0
File: ESO.py Project: NivAmi/iGEM
def EFM_optimizer(seq, miniGC=0.3, maxiGC=0.7, window_size_GC=None, method='use_best_codon',
                  organism_name="not_specified",
                  df_recombination=pd.DataFrame(), df_slippage=pd.DataFrame(), df_methylation=pd.DataFrame(),
                  curr_output_path=None,
                  filename='Optimization_report_staubility.zip', with_report=False, indexes=None):
    """
    Description
    ----------
    This function optimizes the input sequence (string or SeqRecord) based on the identified suspected areas for
    recombination, slippage and methylation, that are given as inputs (df_recombination, df_slippage, df_methylation).
    The identification of those areas is based on the principles that are described in the EFM calculator web tool and
    the article cited below.
    Those areas contain patterns that are likely to undergo recombination, slippage or methylation, thus avoiding those
    patterns should increase the stability of the input gene.
    The translation of the gene is kept (no changes to the amino acid sequence).
    Citation: Benjamin R. Jack, Sean P. Leonard, Dennis M. Mishler, Brian A. Renda, Dacia Leon, Gabriel A. Suárez, and
    Jeffrey E Barrick (2015). Predicting the genetic stability of engineered DNA sequences with the EFM Calculator.
    ACS Synthetic Biology. Just Accepted Manuscript. DOI: 10.1021/acssynbio.5b00068

    Other aspects that this optimizer takes into consideration are:
    a. If the sequence is divisible by 3, the optimization parameter is "Codon usage fraction":
    the relative frequency of the codon in the host genome (specified by the organism_name input).
    Meaning, each codon is optimized based on its frequency in the host, in one of the methods below.
    The frequency data is from kazusa database (http://www.kazusa.or.jp/codon/readme_codon.html).
    Citation: Codon usage tabulated from the international DNA sequence databases, status for the year 2000.,
    Nakamura, Y., Gojobori, T. and Ikemura, T. (2000) Nucl. Acids Res. 28, 292.
    b. Codon Optimization method (for sequences that are divisible by 3):
        - For method = "use_best_codon", every codon will be replaced by the "best" (i.e. most frequent) synonymous codon
          in the target organism. This is equivalent to Codon Adaptation Index (CAI) optimization.
        - For method = "match_codon_usage", the final sequence's codon usage will match as much as possible the codon usage
          profile of the target species (this method is used throughout the literature,
          see for instance Hale and Thomson 1998).
        - For method = "harmonize_rca", each codon will be replaced by a synonymous codon whose usage in the target organism
          matches the usage of the original codon in its host organism (as per Claassens 2017).
        Those methods are provided through the use of the DNAchisel package for the optimization task.
    c. GC content (optional) - the requested range (minimum and maximum) of the fraction of G/C bases in the
       sequence. The algorithm will split the sequence into windows of a specified size and optimize each window.
       Basically, the lower the GC content, the more stable is the sequence, so one should take that into consideration.

    Parameters
    ----------
    seq: string of ACGT alphabet (copy-paste of DNA sequence), or SeqRecord (originated from a fasta file)
    miniGC and maxiGC: a numerical value from 0 to 1, specifies the range of GC content (defaults: 0.3 and 0.7).
    window_size_GC: numerical. The window size (number of nucleotides) in which the requested GC content is to
       be maintained. When None, it defaults to round(len(seq) / 50).
    method: optimization method.
       This is a string from the following: {"use_best_codon", "match_codon_usage", "harmonize_rca"}.
    organism_name: the name of the host of the gene. The codon optimization is done according to the host codon's frequency.
        This is a string from the following: {'b_subtilis', 'c_elegans', 'd_melanogaster', 'e_coli',
        'g_gallus', 'h_sapiens', 'm_musculus', 'm_musculus_domesticus', 's_cerevisiae','not_specified'}.
        One can access this list (aside from 'not_specified') via "python_codon_tables.available_codon_tables_names".
        If the organism is 'not_specified' then the codon optimization objective will not be defined.
    df_recombination, df_slippage, df_methylation: dataframes, each containing a list of patterns and their locations that
                                                may influence the genetic stability of the gene and thus should be avoided.
                                                These dataframes will be saved as csv. files in a different function,
                                                as the output of the EFM calculator.
                                                Only the first 10 rows of each dataframe are used.
    indexes: a tuple that specifies the ORF indices. For the sub-sequence in those indices, the optimizer enforces
        translation (keeps amino-acid sequence), and optimizes the codon usage. The rest of the sequence is optimized by means
        of GC content and EFM constraints (if provided). When None, translation is enforced on (0, len(seq)).
    curr_output_path: a path in which the output report will be saved (see "returns" below).
        Required when with_report is true.
    filename: the file name (of the output report), with a ".zip" suffix.
    with_report: a flag that indicates if the function will output a report or just optimize the sequence.

    Returns
    ----------
    final_sequence: the optimized sequence (only the sequence is returned; no record object is produced).
    When with_report is true, a report is additionally written to join(curr_output_path, filename).
        As described by the original authors, this file contains a report of the changes in annotated genbank
        format, in a pdf format and csv lists, all including a detailed description of the changes from the
        constraints and objectives.

    Raises
    ----------
    NoSolutionError: if the hard constraints still cannot be resolved after 29 attempts, each attempt dropping the
        constraint reported as failing by the previous one.
    """

    # set a default value for the window size as 1/50 of the sequence length.
    if window_size_GC is None:
        window_size_GC = round(len(seq) / 50)

    # Match the weight table for the organism (when the organism is specified):
    if organism_name == 'not_specified':
        obj = []
    else:
        # One shallow copy keeps the library's table object from being handed to the objective directly.
        codon_usage_table = pct.get_codons_table(organism_name).copy()
        # objective function (note: `indexes` is passed exactly as given, i.e. possibly None):
        obj = [CodonOptimize(species=organism_name, location=indexes, codon_usage_table=codon_usage_table,
                             method=method)]

    ## Define area for codon optimization while keeping amino-acid translation:
    if indexes is None:
        indexes = (0, len(seq))

    # DEFINE THE CONSTRAINTS:
    cnst = [
        # enforce GC content between miniGC and maxiGC (30% to 70% by default)
        EnforceGCContent(mini=miniGC, maxi=maxiGC, window=window_size_GC),
        EnforceTranslation(location=indexes)  # Enforce a specific amino-acid sequence translation.
    ]

    ### EFM constraints (if given):
    # recombination:
    if not df_recombination.empty:
        # change column names to the same name in each df and then send to a function that produces the patterns dictionary:
        df_rec = df_recombination[0:10].copy()[['start_1', 'end_1', 'sequence']].rename(
            columns={'start_1': 'start', 'end_1': 'end'})
        # convert df to a list of constraints to be used as input for the optimization problem:
        cnst_rec = convert_df_to_constraints(df_rec)
        # add to the constraints:
        cnst.extend(cnst_rec)

    # slippage:
    if not df_slippage.empty:
        # convert the slippage dataframe to the basic repeated units for the constraints:
        df_slip = df_slippage[0:10].copy()
        df_slip = df_slip.loc[df_slip.log10_prob_slippage_ecoli > -9]  # only 'severe' constraints.
        df_slip = modify_df_slippage(df_slip)
        # convert df to a list of constraints to be used as input for the optimization problem:
        cnst_slip = convert_df_to_constraints(df_slip)
        # add to the constraints:
        cnst.extend(cnst_slip)

    # methylation:
    if not df_methylation.empty:
        # change column names to the same name in each df and then send to a function that produces the patterns dictionary:
        df_meth = df_methylation[0:10].copy()[['start_index', 'end_index', 'actual_site']].rename(
            columns={'start_index': 'start', 'end_index': 'end', 'actual_site': 'sequence'})
        df_meth.loc[:, 'start'] = df_meth['start'].astype(int)
        df_meth.loc[:, 'end'] = df_meth['end'].astype(int)

        # convert df to a list of constraints to be used as input for the optimization problem:
        cnst_meth = convert_df_to_constraints(df_meth)

        # add to the constraints:
        cnst.extend(cnst_meth)

    # DEFINE THE OPTIMIZATION PROBLEM
    # `flag` counts the attempts (1..29). Each failed attempt removes the constraint that could not be satisfied
    # and rebuilds the problem; the `else` clause only runs when the loop ends without a successful `break`.
    flag = 1
    while flag < 30:
        problem = DnaOptimizationProblem(
            sequence=seq,
            constraints=cnst,
            objectives=obj
        )

        # SOLVE THE CONSTRAINTS
        try:
            problem.resolve_constraints()
            break  # all remaining constraints passed.
        except NoSolutionError as e:
            cnst.remove(e.constraint)  # remove the problematic constraint
            flag = flag + 1
    else:
        raise NoSolutionError(
            "Unfortunately, more than 30 hard constraints were not satistied." + str(flag),
            problem=problem
        )

    ## OPTIMIZE OBJECTIVE FUNCTION:
    if with_report:
        target = join(curr_output_path, filename)  # define the exported file name (and path)
        try:
            reports.optimization_reports.write_optimization_report(target=target, problem=problem,
                                                                   project_name="staubility_EFM_optimizer",
                                                                   plot_figure=True)
        except FileNotFoundError:
            # The sequence is too long or perhaps there are too many changes to be displayed in an annotated figure,
            # so the report will be produced without it.
            # The annotated sequence after changes can be obtained from the exported genbank file.
            reports.optimization_reports.write_optimization_report(target=target, problem=problem,
                                                                   project_name="staubility_EFM_optimizer",
                                                                   plot_figure=False)
    else:
        problem.optimize()  # without a report.

    # GET THE FINAL SEQUENCE (AS STRING)
    final_sequence = problem.sequence
    if with_report:
        plot_sequenticons(seq, final_sequence, curr_output_path, target)  # plot the sequenticons

    return final_sequence
Example #14
0
import python_codon_tables

from dnachisel import *

# Subbed in `CCTCCT` for `AAAGTT` to account for proline substitution
virus = 'ATGTTTGTTTTTCTTGTTTTATTGCCACTAGTCTCTAGTCAGTGTGTTAATCTTACAACCAGAACTCAATTACCCCCTGCATACACTAATTCTTTCACACGTGGTGTTTATTACCCTGACAAAGTTTTCAGATCCTCAGTTTTACATTCAACTCAGGACTTGTTCTTACCTTTCTTTTCCAATGTTACTTGGTTCCATGCTATACATGTCTCTGGGACCAATGGTACTAAGAGGTTTGATAACCCTGTCCTACCATTTAATGATGGTGTTTATTTTGCTTCCACTGAGAAGTCTAACATAATAAGAGGCTGGATTTTTGGTACTACTTTAGATTCGAAGACCCAGTCCCTACTTATTGTTAATAACGCTACTAATGTTGTTATTAAAGTCTGTGAATTTCAATTTTGTAATGATCCATTTTTGGGTGTTTATTACCACAAAAACAACAAAAGTTGGATGGAAAGTGAGTTCAGAGTTTATTCTAGTGCGAATAATTGCACTTTTGAATATGTCTCTCAGCCTTTTCTTATGGACCTTGAAGGAAAACAGGGTAATTTCAAAAATCTTAGGGAATTTGTGTTTAAGAATATTGATGGTTATTTTAAAATATATTCTAAGCACACGCCTATTAATTTAGTGCGTGATCTCCCTCAGGGTTTTTCGGCTTTAGAACCATTGGTAGATTTGCCAATAGGTATTAACATCACTAGGTTTCAAACTTTACTTGCTTTACATAGAAGTTATTTGACTCCTGGTGATTCTTCTTCAGGTTGGACAGCTGGTGCTGCAGCTTATTATGTGGGTTATCTTCAACCTAGGACTTTTCTATTAAAATATAATGAAAATGGAACCATTACAGATGCTGTAGACTGTGCACTTGACCCTCTCTCAGAAACAAAGTGTACGTTGAAATCCTTCACTGTAGAAAAAGGAATCTATCAAACTTCTAACTTTAGAGTCCAACCAACAGAATCTATTGTTAGATTTCCTAATATTACAAACTTGTGCCCTTTTGGTGAAGTTTTTAACGCCACCAGATTTGCATCTGTTTATGCTTGGAACAGGAAGAGAATCAGCAACTGTGTTGCTGATTATTCTGTCCTATATAATTCCGCATCATTTTCCACTTTTAAGTGTTATGGAGTGTCTCCTACTAAATTAAATGATCTCTGCTTTACTAATGTCTATGCAGATTCATTTGTAATTAGAGGTGATGAAGTCAGACAAATCGCTCCAGGGCAAACTGGAAAGATTGCTGATTATAATTATAAATTACCAGATGATTTTACAGGCTGCGTTATAGCTTGGAATTCTAACAATCTTGATTCTAAGGTTGGTGGTAATTATAATTACCTGTATAGATTGTTTAGGAAGTCTAATCTCAAACCTTTTGAGAGAGATATTTCAACTGAAATCTATCAGGCCGGTAGCACACCTTGTAATGGTGTTGAAGGTTTTAATTGTTACTTTCCTTTACAATCATATGGTTTCCAACCCACTAATGGTGTTGGTTACCAACCATACAGAGTAGTAGTACTTTCTTTTGAACTTCTACATGCACCAGCAACTGTTTGTGGACCTAAAAAGTCTACTAATTTGGTTAAAAACAAATGTGTCAATTTCAACTTCAATGGTTTAACAGGCACAGGTGTTCTTACTGAGTCTAACAAAAAGTTTCTGCCTTTCCAACAATTTGGCAGAGACATTGCTGACACTACTGATGCTGTCCGTGATCCACAGACACTTGAGATTCTTGACATTACACCATGTTCTTTTGGTGGTGTCAGTGTTATAACACCAGGAACAAATACTTCTAACCAGGTTGCTGTTCTTTATCAGGATGTTAACTGCACAGAAGTCCCTGTTGCTATTCATGCAGATCAACTTACTCCTACTTGGCGTGTTTATTCTACAGGTTCTAATGTTTTTCAAACACGTGCAGGCTGTTTAATAGGGGCTGAACATGTCAACAACTCATATGAGTGTGACAT
ACCCATTGGTGCAGGTATATGCGCTAGTTATCAGACTCAGACTAATTCTCCTCGGCGGGCACGTAGTGTAGCTAGTCAATCCATCATTGCCTACACTATGTCACTTGGTGCAGAAAATTCAGTTGCTTACTCTAATAACTCTATTGCCATACCCACAAATTTTACTATTAGTGTTACCACAGAAATTCTACCAGTGTCTATGACCAAGACATCAGTAGATTGTACAATGTACATTTGTGGTGATTCAACTGAATGCAGCAATCTTTTGTTGCAATATGGCAGTTTTTGTACACAATTAAACCGTGCTTTAACTGGAATAGCTGTTGAACAAGACAAAAACACCCAAGAAGTTTTTGCACAAGTCAAACAAATTTACAAAACACCACCAATTAAAGATTTTGGTGGTTTTAATTTTTCACAAATATTACCAGATCCATCAAAACCAAGCAAGAGGTCATTTATTGAAGATCTACTTTTCAACAAAGTGACACTTGCAGATGCTGGCTTCATCAAACAATATGGTGATTGCCTTGGTGATATTGCTGCTAGAGACCTCATTTGTGCACAAAAGTTTAACGGCCTTACTGTTTTGCCACCTTTGCTCACAGATGAAATGATTGCTCAATACACTTCTGCACTGTTAGCGGGTACAATCACTTCTGGTTGGACCTTTGGTGCAGGTGCTGCATTACAAATACCATTTGCTATGCAAATGGCTTATAGGTTTAATGGTATTGGAGTTACACAGAATGTTCTCTATGAGAACCAAAAATTGATTGCCAACCAATTTAATAGTGCTATTGGCAAAATTCAAGACTCACTTTCTTCCACAGCAAGTGCACTTGGAAAACTTCAAGATGTGGTCAACCAAAATGCACAAGCTTTAAACACGCTTGTTAAACAACTTAGCTCCAATTTTGGTGCAATTTCAAGTGTTTTAAATGATATCCTTTCACGTCTTGACCCTCCTGAGGCTGAAGTGCAAATTGATAGGTTGATCACAGGCAGACTTCAAAGTTTGCAGACATATGTGACTCAACAATTAATTAGAGCTGCAGAAATCAGAGCTTCTGCTAATCTTGCTGCTACTAAAATGTCAGAGTGTGTACTTGGACAATCAAAAAGAGTTGATTTTTGTGGAAAGGGCTATCATCTTATGTCCTTCCCTCAGTCAGCACCTCATGGTGTAGTCTTCTTGCATGTGACTTATGTCCCTGCACAAGAAAAGAACTTCACAACTGCTCCTGCCATTTGTCATGATGGAAAAGCACACTTTCCTCGTGAAGGTGTCTTTGTTTCAAATGGCACACACTGGTTTGTAACACAAAGGAATTTTTATGAACCACAAATCATTACTACAGACAACACATTTGTGTCTGGTAACTGTGATGTTGTAATAGGAATTGTCAACAACACAGTTTATGATCCTTTGCAACCTGAATTAGACTCATTCAAGGAGGAGTTAGATAAATATTTTAAGAATCATACATCACCAGATGTTGATTTAGGTGACATCTCTGGCATTAATGCTTCAGTTGTAAACATTCAAAAAGAAATTGACCGCCTCAATGAGGTTGCCAAGAATTTAAATGAATCTCTCATCGATCTCCAAGAACTTGGAAAGTATGAGCAGTATATAAAATGGCCATGGTACATTTGGCTAGGTTTTATAGCTGGCTTGATTGCCATAGTAATGGTGACAATTATGCTTTGCTGTATGACCAGTTGCTGTAGTTGTCTCAAGGGCTGTTGTTCTTGTGGATCCTGCTGCAAATTTGATGAAGACGACTCTGAGCCAGTGCTCAAAGGAGTCAAATTACATTACACATAA'
vaccine = 'ATGTTCGTGTTCCTGGTGCTGCTGCCTCTGGTGTCCAGCCAGTGTGTGAACCTGACCACCAGAACACAGCTGCCTCCAGCCTACACCAACAGCTTTACCAGAGGCGTGTACTACCCCGACAAGGTGTTCAGATCCAGCGTGCTGCACTCTACCCAGGACCTGTTCCTGCCTTTCTTCAGCAACGTGACCTGGTTCCACGCCATCCACGTGTCCGGCACCAATGGCACCAAGAGATTCGACAACCCCGTGCTGCCCTTCAACGACGGGGTGTACTTTGCCAGCACCGAGAAGTCCAACATCATCAGAGGCTGGATCTTCGGCACCACACTGGACAGCAAGACCCAGAGCCTGCTGATCGTGAACAACGCCACCAACGTGGTCATCAAAGTGTGCGAGTTCCAGTTCTGCAACGACCCCTTCCTGGGCGTCTACTACCACAAGAACAACAAGAGCTGGATGGAAAGCGAGTTCCGGGTGTACAGCAGCGCCAACAACTGCACCTTCGAGTACGTGTCCCAGCCTTTCCTGATGGACCTGGAAGGCAAGCAGGGCAACTTCAAGAACCTGCGCGAGTTCGTGTTTAAGAACATCGACGGCTACTTCAAGATCTACAGCAAGCACACCCCTATCAACCTCGTGCGGGATCTGCCTCAGGGCTTCTCTGCTCTGGAACCCCTGGTGGATCTGCCCATCGGCATCAACATCACCCGGTTTCAGACACTGCTGGCCCTGCACAGAAGCTACCTGACACCTGGCGATAGCAGCAGCGGATGGACAGCTGGTGCCGCCGCTTACTATGTGGGCTACCTGCAGCCTAGAACCTTCCTGCTGAAGTACAACGAGAACGGCACCATCACCGACGCCGTGGATTGTGCTCTGGATCCTCTGAGCGAGACAAAGTGCACCCTGAAGTCCTTCACCGTGGAAAAGGGCATCTACCAGACCAGCAACTTCCGGGTGCAGCCCACCGAATCCATCGTGCGGTTCCCCAATATCACCAATCTGTGCCCCTTCGGCGAGGTGTTCAATGCCACCAGATTCGCCTCTGTGTACGCCTGGAACCGGAAGCGGATCAGCAATTGCGTGGCCGACTACTCCGTGCTGTACAACTCCGCCAGCTTCAGCACCTTCAAGTGCTACGGCGTGTCCCCTACCAAGCTGAACGACCTGTGCTTCACAAACGTGTACGCCGACAGCTTCGTGATCCGGGGAGATGAAGTGCGGCAGATTGCCCCTGGACAGACAGGCAAGATCGCCGACTACAACTACAAGCTGCCCGACGACTTCACCGGCTGTGTGATTGCCTGGAACAGCAACAACCTGGACTCCAAAGTCGGCGGCAACTACAATTACCTGTACCGGCTGTTCCGGAAGTCCAATCTGAAGCCCTTCGAGCGGGACATCTCCACCGAGATCTATCAGGCCGGCAGCACCCCTTGTAACGGCGTGGAAGGCTTCAACTGCTACTTCCCACTGCAGTCCTACGGCTTTCAGCCCACAAATGGCGTGGGCTATCAGCCCTACAGAGTGGTGGTGCTGAGCTTCGAACTGCTGCATGCCCCTGCCACAGTGTGCGGCCCTAAGAAAAGCACCAATCTCGTGAAGAACAAATGCGTGAACTTCAACTTCAACGGCCTGACCGGCACCGGCGTGCTGACAGAGAGCAACAAGAAGTTCCTGCCATTCCAGCAGTTTGGCCGGGATATCGCCGATACCACAGACGCCGTTAGAGATCCCCAGACACTGGAAATCCTGGACATCACCCCTTGCAGCTTCGGCGGAGTGTCTGTGATCACCCCTGGCACCAACACCAGCAATCAGGTGGCAGTGCTGTACCAGGACGTGAACTGTACCGAAGTGCCCGTGGCCATTCACGCCGATCAGCTGACACCTACATGGCGGGTGTACTCCACCGGCAGCAATGTGTTTCAGACCAGAGCCGGCTGTCTGATCGGAGCCGAGCACGTGAACAATAGCTACGAGTGCGAC
ATCCCCATCGGCGCTGGAATCTGCGCCAGCTACCAGACACAGACAAACAGCCCTCGGAGAGCCAGAAGCGTGGCCAGCCAGAGCATCATTGCCTACACAATGTCTCTGGGCGCCGAGAACAGCGTGGCCTACTCCAACAACTCTATCGCTATCCCCACCAACTTCACCATCAGCGTGACCACAGAGATCCTGCCTGTGTCCATGACCAAGACCAGCGTGGACTGCACCATGTACATCTGCGGCGATTCCACCGAGTGCTCCAACCTGCTGCTGCAGTACGGCAGCTTCTGCACCCAGCTGAATAGAGCCCTGACAGGGATCGCCGTGGAACAGGACAAGAACACCCAAGAGGTGTTCGCCCAAGTGAAGCAGATCTACAAGACCCCTCCTATCAAGGACTTCGGCGGCTTCAATTTCAGCCAGATTCTGCCCGATCCTAGCAAGCCCAGCAAGCGGAGCTTCATCGAGGACCTGCTGTTCAACAAAGTGACACTGGCCGACGCCGGCTTCATCAAGCAGTATGGCGATTGTCTGGGCGACATTGCCGCCAGGGATCTGATTTGCGCCCAGAAGTTTAACGGACTGACAGTGCTGCCTCCTCTGCTGACCGATGAGATGATCGCCCAGTACACATCTGCCCTGCTGGCCGGCACAATCACAAGCGGCTGGACATTTGGAGCAGGCGCCGCTCTGCAGATCCCCTTTGCTATGCAGATGGCCTACCGGTTCAACGGCATCGGAGTGACCCAGAATGTGCTGTACGAGAACCAGAAGCTGATCGCCAACCAGTTCAACAGCGCCATCGGCAAGATCCAGGACAGCCTGAGCAGCACAGCAAGCGCCCTGGGAAAGCTGCAGGACGTGGTCAACCAGAATGCCCAGGCACTGAACACCCTGGTCAAGCAGCTGTCCTCCAACTTCGGCGCCATCAGCTCTGTGCTGAACGATATCCTGAGCAGACTGGACCCTCCTGAGGCCGAGGTGCAGATCGACAGACTGATCACAGGCAGACTGCAGAGCCTCCAGACATACGTGACCCAGCAGCTGATCAGAGCCGCCGAGATTAGAGCCTCTGCCAATCTGGCCGCCACCAAGATGTCTGAGTGTGTGCTGGGCCAGAGCAAGAGAGTGGACTTTTGCGGCAAGGGCTACCACCTGATGAGCTTCCCTCAGTCTGCCCCTCACGGCGTGGTGTTTCTGCACGTGACATATGTGCCCGCTCAAGAGAAGAATTTCACCACCGCTCCAGCCATCTGCCACGACGGCAAAGCCCACTTTCCTAGAGAAGGCGTGTTCGTGTCCAACGGCACCCATTGGTTCGTGACACAGCGGAACTTCTACGAGCCCCAGATCATCACCACCGACAACACCTTCGTGTCTGGCAACTGCGACGTCGTGATCGGCATTGTGAACAATACCGTGTACGACCCTCTGCAGCCCGAGCTGGACAGCTTCAAAGAGGAACTGGACAAGTACTTTAAGAACCACACAAGCCCCGACGTGGACCTGGGCGATATCAGCGGAATCAATGCCAGCGTCGTGAACATCCAGAAAGAGATCGACCGGCTGAACGAGGTGGCCAAGAATCTGAACGAGAGCCTGATCGACCTGCAAGAACTGGGGAAGTACGAGCAGTACATCAAGTGGCCCTGGTACATCTGGCTGGGCTTTATCGCCGGACTGATTGCCATCGTGATGGTCACAATCATGCTGTGTTGCATGACCAGCTGCTGTAGCTGCCTGAAGGGCTGTTGTAGCTGTGGCAGCTGCTGCAAGTTCGACGAGGACGATTCTGAGCCCGTGCTGAAGGGCGTGAAACTGCACTACACATGA'

# Codon usage table selected by the name "h_sapiens_9606"; loaded once at
# import time and read by optimize_virus() as the CodonOptimize table.
codon_usage_table = python_codon_tables.get_codons_table("h_sapiens_9606")


def optimize_virus():
    """Codon-optimize the module-level ``virus`` sequence and return the result.

    Constraints: GC content between 0.4 and 0.7 over a window of 100, an
    enforced translation, and hairpin avoidance. The objective is codon
    optimization against the module-level ``codon_usage_table``.
    """
    # AvoidStopCodons() and EnforceMeltingTemperature() were tried by the
    # original author and left disabled.
    hard_constraints = [
        EnforceGCContent(mini=0.4, maxi=0.7, window=100),
        EnforceTranslation(),
        AvoidHairpins(),
    ]
    codon_objective = CodonOptimize(codon_usage_table=codon_usage_table)
    problem = DnaOptimizationProblem(
        sequence=virus,
        constraints=hard_constraints,
        objectives=[codon_objective],
    )
    problem.resolve_constraints()
    problem.optimize()
    return problem.sequence

if __name__ == "__main__":
    # Run the optimization only when executed as a script. The result is
    # bound to `sequence` but is neither printed nor saved here.
    sequence = optimize_virus()
def test_get_codons_table():
    """Each accepted identifier form yields the expected b_subtilis values.

    The identifier is given as an int, as a numeric string, as a short name
    and as a full table name.
    """
    identifiers = (1423, "1423", "b_subtilis", "b_subtilis_1423")
    for identifier in identifiers:
        loaded = pct.get_codons_table(identifier)
        assert loaded['T']['ACA'] == 0.4
        assert loaded['*']['TAA'] == 0.61
Example #16
0
from dnachisel import *
import python_codon_tables
import mmap

# Codon usage table looked up by the numeric identifier 57486.
codon_usage_table = python_codon_tables.get_codons_table(57486)
species = 'h_sapiens'
# TaxIDs: 10090, 57486 (mus musculus); 9544 (macaca mulatta); 9606 (homo sapiens)
# NOTE(review): per the list above, 57486 is a mus musculus TaxID while
# `species` names h_sapiens -- confirm which organism is intended.


def mmap_io(filename):
    """Read a UTF-8 text file through a read-only memory map and return its lines.

    The content is split on "\\n". A single trailing empty piece (produced by
    a final newline) is dropped, so files with and without a terminating
    newline both yield all of their lines. An empty file yields an empty
    list, since an empty file cannot be memory-mapped.

    Arguments:
        filename  Path of the file to read.
    Return: list of str, one item per line, without the "\\n" separators.
    """
    # Binary mode is enough: only the file descriptor is handed to mmap and
    # the decoding is done explicitly below.
    with open(filename, mode="rb") as file_obj:
        # seek(0, 2) jumps to the end of the file and returns its size in bytes.
        if file_obj.seek(0, 2) == 0:
            return []
        with mmap.mmap(file_obj.fileno(), length=0,
                       access=mmap.ACCESS_READ) as mmap_obj:
            text = mmap_obj.read().decode("UTF-8")

    lines = text.split("\n")
    # Drop only the empty piece that follows a terminating newline. The last
    # line of a file without a trailing newline is real data and is kept.
    if lines[-1] == "":
        lines.pop()
    return lines


# Load the lines of "codon.txt" at import time: the first line is bound to
# `virus` and the second to `vaccine`.
# NOTE(review): the name `input` shadows the builtin of the same name for the
# rest of this module.
input = mmap_io("codon.txt")

virus, vaccine = input[0], input[1]


def compute_match(one, two):
    """Return the fraction of positions of ``one`` that match ``two``.

    The sequences are compared pairwise up to the length of the shorter one,
    and the mismatch count is divided by ``len(one)``. Positions of ``one``
    beyond the end of ``two`` therefore count as matches. An empty ``one``
    raises ZeroDivisionError.
    """
    mismatches = sum(1 for left, right in zip(one, two) if left != right)
    return 1 - float(mismatches) / float(len(one))


def optimize_virus():
    problem = DnaOptimizationProblem(
Example #17
0
import python_codon_tables as pct

# PRINT THE LIST OF NAMES OF ALL AVAILABLE TABLES
print('Available tables:', pct.available_codon_tables_names)

# LOAD ONE TABLE BY NAME
# NOTE(review): the codons below are indexed with the RNA alphabet ('UAA',
# 'CUA'). Whether tables are keyed with U or T by default depends on the
# installed python_codon_tables version (see its `replace_U_by_T` option) --
# confirm before relying on these keys.
table = pct.get_codons_table("b_subtilis_1423")
print(table['T']['ACA'])  # expected to print 0.4
print(table['*']['UAA'])  # expected to print 0.61

# LOAD ALL TABLES AT ONCE
codon_tables = pct.get_all_available_codons_tables()
print(codon_tables['c_elegans_6239']['L']['CUA'])  # expected to print 0.09
def test_replace_U_by_T():
    """With ``replace_U_by_T=False`` the stop codon is keyed as 'UAA'."""
    rna_keyed = pct.get_codons_table("b_subtilis_1423", replace_U_by_T=False)
    stop_codons = rna_keyed['*']
    assert stop_codons['UAA'] == 0.61
Example #19
0
def test_get_codons_table():
    """Look the table up by int id, string id, short name and full name."""
    expected = (("T", "ACA", 0.4), ("*", "TAA", 0.61))
    for table_id in (1423, "1423", "b_subtilis", "b_subtilis_1423"):
        found = pct.get_codons_table(table_id)
        for residue, codon, frequency in expected:
            assert found[residue][codon] == frequency