def test_AvoidPattern_on_strands():
    """Check that ``AvoidPattern`` honours the strand of its location.

    ``ATG`` is the reverse complement of ``CAT``. When the constraint is
    restricted to strand -1, only ``ATG`` is expected to disappear from the
    sequence; with no location given, both ``CAT`` and ``ATG`` are expected
    to disappear.
    """
    # NOTE(review): the negative-strand scenario used to appear twice,
    # verbatim. A forward-strand-only scenario (strand +1) looks like what
    # was intended for one of the copies -- confirm and add it if so.
    sequence = "CATGCTATGC"

    # Negative strand only
    problem = DnaOptimizationProblem(
        sequence,
        constraints=[AvoidPattern("CAT", location=(0, 10, -1))],
        logger=None,
    )
    problem.resolve_constraints()
    assert "CAT" in problem.sequence
    assert "ATG" not in problem.sequence

    # Both strands (no location given)
    problem = DnaOptimizationProblem(
        sequence,
        constraints=[AvoidPattern("CAT")],
        logger=None,
    )
    problem.resolve_constraints()
    assert "CAT" not in problem.sequence
    assert "ATG" not in problem.sequence
Beispiel #2
0
def test_avoid_repeated_small_kmers():
    """A ``RepeatedKmerPattern(3, 3)`` breach is fixed by resolution."""
    repeat_rich_sequence = (
        "AGAAGAAGAAGAAGAAGATTTTTTTTTTTTTGGAGGAGGAGGACCCCCCCCCCCCGAGG"
    )
    no_repeated_kmers = AvoidPattern(RepeatedKmerPattern(3, 3))
    problem = DnaOptimizationProblem(
        sequence=repeat_rich_sequence,
        constraints=[no_repeated_kmers],
    )

    # The constraint must fail on the raw sequence and pass once resolved.
    assert not problem.all_constraints_pass()
    problem.resolve_constraints()
    assert problem.all_constraints_pass()
Beispiel #3
0
def test_no_solution_error_frozen_region():
    """Resolution must fail when the breach lies in an unmodifiable region.

    ``AvoidChanges()`` is combined with an ``AvoidPattern`` that the
    sequence violates, and the resulting ``NoSolutionError`` message is
    checked.
    """
    frozen_constraints = [AvoidChanges(), AvoidPattern('BsmBI_site')]
    problem = DnaOptimizationProblem(
        sequence="AAAAATCGTCTCTTTT",
        constraints=frozen_constraints,
    )

    with pytest.raises(NoSolutionError) as excinfo:
        problem.resolve_constraints()

    error_message = str(excinfo.value)
    assert 'region that cannot be mutated' in error_message
Beispiel #4
0
def test_no_solution_error_frozen_region():
    """Resolution must fail when the breach lies in a frozen region.

    ``AvoidChanges()`` is combined with an ``AvoidPattern`` that the
    sequence violates, and the resulting ``NoSolutionError`` message is
    checked.
    """
    frozen_constraints = [AvoidChanges(), AvoidPattern(enzyme='BsmBI')]
    problem = DnaOptimizationProblem(
        sequence="AAAAATCGTCTCTTTT",
        constraints=frozen_constraints,
    )

    with pytest.raises(NoSolutionError) as excinfo:
        problem.resolve_constraints()

    error_message = str(excinfo.value)
    assert 'Constraint breach in frozen region' in error_message
def test_avoid_pattern_basics():
    """BsaI sites in a seeded random 10 kb sequence can be removed."""
    numpy.random.seed(123)
    starting_sequence = random_dna_sequence(10000, seed=123)
    no_bsai = AvoidPattern(enzyme="BsaI")
    problem = DnaOptimizationProblem(
        sequence=starting_sequence,
        constraints=[no_bsai],
    )

    # Fails on the raw sequence, passes after resolution.
    assert not problem.all_constraints_pass()
    problem.resolve_constraints()
    assert problem.all_constraints_pass()
Beispiel #6
0
def test_constraints_text_summary():
    """The text summary reports the single failing constraint evaluation."""
    gc_content_in_range = EnforceGCContent(mini=0.4, maxi=0.6)
    no_att = AvoidPattern('ATT')
    problem = DnaOptimizationProblem(
        sequence="ATTGCCATATGCGC",
        constraints=[gc_content_in_range, no_att],
    )

    summary = problem.constraints_text_summary()
    assert 'FAILURE: 1 constraints evaluations failed' in summary
def test_avoid_pattern_overlapping_locations():
    """Resolving ``AvoidPattern("NAN")`` on an AG repeat removes inner A's."""
    numpy.random.seed(123)
    alternating_ag = "AGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAG"
    problem = DnaOptimizationProblem(
        sequence=alternating_ag,
        constraints=[AvoidPattern("NAN")],
    )

    assert not problem.all_constraints_pass()
    problem.resolve_constraints()
    assert problem.all_constraints_pass()

    # Only the first and last positions are exempt from the check.
    # NOTE(review): presumably because "N" matches any nucleotide, so an
    # "A" needs a neighbour on both sides to form a match -- confirm.
    interior = problem.sequence[1:-1]
    assert "A" not in interior
Beispiel #8
0
def test_random_compatible_dna_sequence():
    """A generated "compatible" sequence satisfies the given constraints."""
    windowed_gc = EnforceGCContent(mini=0.4, maxi=0.6, window=50)
    no_atc = AvoidPattern('ATC')
    constraints = [windowed_gc, no_atc]

    compatible_sequence = random_compatible_dna_sequence(
        1000, constraints=constraints
    )
    problem = DnaOptimizationProblem(
        sequence=compatible_sequence,
        constraints=constraints,
    )

    assert "ATC" not in compatible_sequence
    assert problem.all_constraints_pass()
def test_EnforceTranlation():
    """``AvoidPattern("AAA")`` resolves alongside ``EnforceTranslation``."""
    numpy.random.seed(1234)
    protein = random_protein_sequence(50, seed=123)
    coding_sequence = reverse_translate(protein)
    constraints = [AvoidPattern("AAA"), EnforceTranslation()]
    problem = DnaOptimizationProblem(
        sequence=coding_sequence,
        constraints=constraints,
    )

    assert not problem.all_constraints_pass()
    problem.resolve_constraints()
    assert problem.all_constraints_pass()
Beispiel #10
0
def test_pattern_and_reverse():
    """Removing many BsmBI sites takes fewer than 70 edits.

    The sequence is built from 65 copies of ``CGTCTC`` and ``GAGACG``
    (its reverse complement), arranged in four runs.
    """
    site_fwd = "CGTCTC"
    site_rev = "GAGACG"
    runs = [
        10 * site_fwd,
        25 * site_rev,
        15 * site_fwd,
        15 * site_rev,
    ]
    problem = DnaOptimizationProblem(
        "".join(runs),
        constraints=[AvoidPattern('BsmBI_site')],
        objectives=[AvoidChanges()],
    )

    problem.resolve_constraints()
    problem.optimize()

    n_edits = sum(problem.sequence_edits_as_array())
    assert n_edits < 70
 def compute_forbidden_patterns_locations(self, record):
     """Return an array where ``arr[i] == 1`` means that i is surrounded by
     a user-forbidden pattern.

     The forbidden patterns are hard-coded: ``homopolymer_pattern(c, 5)``
     for each nucleotide ``c``, and ``repeated_kmers(k, n)`` for
     ``(k, n)`` in ``(4, 2)``, ``(3, 3)``, ``(2, 4)``. The locations of
     every failing constraint evaluation are merged with
     ``group_overlapping_segments`` and converted with
     ``segments_to_array`` into an array sized by ``len(record)``.
     """
     pattern_constraints = [
         AvoidPattern(homopolymer_pattern(c, 5)) for c in 'ATGC'
     ]
     kmer_constraints = [
         AvoidPattern(repeated_kmers(k, n))
         for k, n in [(4, 2), (3, 3), (2, 4)]
     ]
     problem = DnaOptimizationProblem(
         sequence=record,
         constraints=pattern_constraints + kmer_constraints,
     )
     # Filter on ``ev.passes`` before building features, so passing
     # evaluations are skipped outright instead of being expanded and then
     # discarded one feature at a time.
     breach_segments = [
         (f.location.start, f.location.end)
         for ev in problem.constraints_evaluations()
         if not ev.passes
         for f in ev.locations_to_biopython_features()
     ]
     constraints_breaches = group_overlapping_segments(breach_segments)
     return segments_to_array(constraints_breaches, len(record))
def test_optimize_with_report(tmpdir):
    """``optimize_with_report`` succeeds and writes into the target folder."""
    starting_sequence = random_dna_sequence(10000, seed=123)
    problem = DnaOptimizationProblem(
        sequence=starting_sequence,
        constraints=[AvoidPattern('BsmBI_site')],
    )

    target = os.path.join(str(tmpdir), 'with_solution')
    os.mkdir(target)
    assert not os.listdir(target)  # freshly created, so still empty

    success, _message, _data = problem.optimize_with_report(target)
    assert success
    assert os.listdir(target)  # the report left at least one entry
def test_optimization_with_report_no_solution(tmpdir):
    """An unsolvable problem reports failure but still writes a report."""
    starting_sequence = random_dna_sequence(10000, seed=123)
    conflicting_constraints = [AvoidPattern(enzyme='BsmBI'), AvoidChanges()]
    problem = DnaOptimizationProblem(
        sequence=starting_sequence,
        constraints=conflicting_constraints,
    )

    target = os.path.join(str(tmpdir), 'no_solution')
    os.mkdir(target)
    assert not os.listdir(target)  # freshly created, so still empty

    success, _message, _data = optimization_with_report(target, problem)
    assert not success
    assert os.listdir(target)  # the report left at least one entry
Beispiel #14
0
def test_basics():
    """BsaI avoidance and terminal GC content resolve together."""
    numpy.random.seed(123)
    nucleotide_probas = {'A': 0.2, 'T': 0.2, 'G': 0.3, 'C': 0.3}
    starting_sequence = random_dna_sequence(
        10000, probas=nucleotide_probas, seed=123
    )
    no_bsai = AvoidPattern(enzyme="BsaI")
    terminal_gc = EnforceTerminalGCContent(mini=0.2, maxi=0.4, window_size=50)
    problem = DnaOptimizationProblem(
        sequence=starting_sequence,
        constraints=[no_bsai, terminal_gc],
    )

    assert not problem.all_constraints_pass()
    problem.resolve_constraints()
    assert problem.all_constraints_pass()
def test_AvoidPattern_with_jaspar_motifs():
    """Motif patterns loaded from JASPAR content can be avoided.

    One ``AvoidPattern`` is created per motif; the starting sequence must
    show exactly two breach locations, all of which are then resolved.
    """
    jaspar_stream = StringIO(JASPAR_CONTENT)
    motif_patterns = MotifPssmPattern.list_from_file(
        jaspar_stream,
        file_format="jaspar",
        relative_threshold=0.9,
    )
    motif_constraints = [AvoidPattern(p) for p in motif_patterns]
    problem = DnaOptimizationProblem(
        sequence="GGGGGGGGGGTGCGTGATTAAAGGGGG",
        constraints=motif_constraints,
    )

    breach_locations = problem.constraints_evaluations().all_locations()
    assert 2 == len(breach_locations)
    problem.resolve_constraints()
    assert problem.all_constraints_pass()
Beispiel #16
0
def test_optimize_with_report_no_solution(tmpdir):
    """An unsolvable problem reports failure but still writes a report."""
    starting_sequence = random_dna_sequence(10000, seed=123)
    conflicting_constraints = [AvoidPattern("BsmBI_site"), AvoidChanges()]
    problem = DnaOptimizationProblem(
        sequence=starting_sequence,
        constraints=conflicting_constraints,
        logger=None,
    )

    target = os.path.join(str(tmpdir), "no_solution")
    os.mkdir(target)
    assert not os.listdir(target)  # freshly created, so still empty

    success, _message, _data = problem.optimize_with_report(target)
    assert not success
    assert os.listdir(target)  # the report left at least one entry
Beispiel #17
0
def load_user_options(args, location):
    """Build the DNA Chisel objectives and constraints selected by the user.

    Codon optimization (an objective) and translation enforcement (a
    constraint) are always added. Every other specification is added only
    when the matching ``args`` flag is truthy.

    Args:
        args: Options object. Each ``args.<flag>`` tested below switches one
            specification on; its companion attributes (thresholds, boosts,
            windows) are only read when that flag is truthy.
        location: ``Location`` passed to every location-aware specification.

    Returns:
        A ``(objectives, constraints)`` tuple of lists.

    Raises:
        AssertionError: If ``location`` is not a ``Location`` instance.
    """
    # Kept as an assert so callers see the same exception type as before.
    assert isinstance(location, Location)

    opt_mode = 'harmonized' if args.harmonized else 'best_codon'
    objectives = [
        CodonOptimize(species=args.species, location=location, mode=opt_mode)
    ]
    # Enforce translation over the whole location.
    constraints = [EnforceTranslation(location=location)]

    if args.avoid_homopolymers:
        constraints += [
            AvoidPattern(
                HomopolymerPattern(nucleotide, args.avoid_homopolymers),
                location=location,
            )
            for nucleotide in "ATGC"
        ]

    if args.avoid_hairpins:
        constraints += [AvoidHairpins(location=location)]

    if args.avoid_patterns:
        constraints += [
            AvoidPattern(pattern, location=location)
            for pattern in args.avoid_patterns
        ]

    # NOTE! Printing this to a template is broken
    if args.avoid_restriction_sites:
        constraints += [
            AvoidPattern(EnzymeSitePattern(enzy), location=location)
            for enzy in args.avoid_restriction_sites
        ]

    if args.constrain_global_GC_content:
        constraints += [
            EnforceGCContent(
                mini=args.global_GC_content_min,
                maxi=args.global_GC_content_max,
                location=location,
            )
        ]

    if args.constrain_local_GC_content:
        # The local constraint used the *global* maximum, which looks like a
        # copy-paste slip. Prefer the local maximum when the options define
        # one, and fall back to the previous behaviour otherwise.
        local_gc_max = getattr(
            args, 'local_GC_content_max', args.global_GC_content_max
        )
        constraints += [
            EnforceGCContent(
                mini=args.local_GC_content_min,
                maxi=local_gc_max,
                window=args.local_GC_content_window,
                location=location,
            )
        ]

    if args.constrain_terminal_GC_content:
        constraints += [
            EnforceTerminalGCContent(
                mini=args.terminal_GC_content_min,
                maxi=args.terminal_GC_content_max,
                window_size=8,
                location=location,
            )
        ]

    if args.constrain_CAI:
        constraints += [
            ConstrainCAI(
                species=args.species,
                minimum=args.constrain_CAI_minimum,
                location=location,
            )
        ]

    if args.optimize_dicodon_frequency:
        # This is the only specification created without ``location``.
        objectives += [MaximizeDicodonAdaptiveIndex()]

    if args.kmers:
        objectives += [
            MinimizeKmerScore(
                k=args.kmers,
                boost=args.avoid_kmers_boost,
                location=location,
            )
        ]

    if args.avoid_secondary_structure:
        objectives += [
            MinimizeSecondaryStructure(
                max_energy=args.avoid_secondary_structure_max_e,
                location=location,
                boost=args.avoid_secondary_structure_boost,
            )
        ]

    if args.avoid_initiator_secondary_structure:
        objectives += [
            MinimizeSecondaryStructure(
                max_energy=args.avoid_initiator_secondary_structure_max_e,
                location=location,
                optimize_initiator=True,
                boost=args.avoid_initiator_secondary_structure_boost,
            )
        ]

    return objectives, constraints
def test_AvoidPattern_with_regular_expression():
    """A regular-expression pattern resolves under ``EnforceTranslation``."""
    coding_sequence = (
        "ATGGTGAGCAAGGGCGAGGAGCTGTTCACCGGGGTGGTGCCCATCCTG"
        "GTCGAGCTGGACGGCGACGTAAACGGCCACAAGTTCAGCGTGCGCGGC"
        "GAGGGCGAGGGCGATGCCACCAACGGCAAGCTGACCCTGAAGTTCATC"
    )
    constraints = [EnforceTranslation(), AvoidPattern(r"GGT(.*)GAT")]
    problem = DnaOptimizationProblem(
        sequence=coding_sequence,
        constraints=constraints,
        logger=None,
    )

    assert not problem.all_constraints_pass()
    problem.resolve_constraints()
    assert problem.all_constraints_pass()
Beispiel #19
0
def test_EnforceSequence():
    """Exercise ``EnforceSequence`` and a strand -1 pattern occurrence.

    Part one: for each symbol ``W`` and ``S``, a 15-nucleotide stretch of
    that symbol is enforced at positions 50-65 next to three
    ``AvoidPattern`` constraints. Every resolved nucleotide in the stretch
    must belong to the paired set (``AT`` for ``W``, ``GC`` for ``S``).

    Part two: an ``EnforcePatternOccurence`` on strand -1 fails on the
    starting sequence and passes after resolution.
    """
    numpy.random.seed(1234)
    n_nucleotides = 15
    start = 50
    end = start + n_nucleotides
    for symbol, nucleotides in (("W", "AT"), ("S", "GC")):
        constraints = [
            AvoidPattern("ATGC"),
            AvoidPattern("AAA"),
            AvoidPattern("GGG"),
            EnforceSequence(n_nucleotides * symbol, location=(start, end)),
        ]
        problem = DnaOptimizationProblem(
            sequence=25 * "ATGC",
            constraints=constraints,
        )
        problem.max_random_iters = 10000
        problem.resolve_constraints()
        enforced_region = problem.sequence[start:end]
        assert all(n in nucleotides for n in enforced_region)

    # Test -1 strand:
    seq = "".join(["ATG", "CAG", "AGCAAGGTGCTGCT"])
    two_ctg_on_minus_strand = EnforcePatternOccurence(
        pattern="CTG",  # CAG on strand +1
        occurences=2,
        strand=-1,
        location=Location(start=0, end=50),
    )
    problem = DnaOptimizationProblem(
        sequence=seq,
        constraints=[two_ctg_on_minus_strand],
    )
    assert not problem.all_constraints_pass()
    problem.resolve_constraints()
    assert problem.all_constraints_pass()
def test_EnforceSequence():
    """``EnforceSequence`` with symbols ``W`` and ``S`` is honoured.

    For each symbol, a 15-nucleotide stretch of it is enforced at positions
    50-65 next to three ``AvoidPattern`` constraints. Every resolved
    nucleotide in the stretch must belong to the paired set (``AT`` for
    ``W``, ``GC`` for ``S``).
    """
    numpy.random.seed(1234)
    n_nucleotides = 15
    start = 50
    end = start + n_nucleotides
    for symbol, nucleotides in (('W', 'AT'), ('S', 'GC')):
        constraints = [
            AvoidPattern("ATGC"),
            AvoidPattern("AAA"),
            AvoidPattern("GGG"),
            EnforceSequence(n_nucleotides * symbol, location=(start, end)),
        ]
        problem = DnaOptimizationProblem(
            sequence=25 * "ATGC",
            constraints=constraints,
        )
        problem.max_random_iters = 10000
        problem.resolve_constraints()
        enforced_region = problem.sequence[start:end]
        assert all(n in nucleotides for n in enforced_region)
def test_EnforceGCContents():
    """BsaI avoidance and windowed GC content resolve together.

    A GC-content objective (``target=0.4``) is attached to the problem, but
    only constraint resolution is exercised here.
    """
    numpy.random.seed(123)
    starting_sequence = random_dna_sequence(10000, seed=123)
    constraints = [
        AvoidPattern(enzyme="BsaI"),
        EnforceGCContent(mini=0.3, maxi=0.7, window=50),
    ]
    gc_objective = EnforceGCContent(target=0.4)
    problem = DnaOptimizationProblem(
        sequence=starting_sequence,
        constraints=constraints,
        objectives=[gc_objective],
    )

    assert not problem.all_constraints_pass()
    problem.resolve_constraints()
    assert problem.all_constraints_pass()
def test_EnforceTranlationReversed():
    """``EnforceTranslation`` on strand -1 resolves with ``AvoidPattern``.

    The problem sequence is the reverse complement of a back-translated
    random protein, and translation is enforced over its full length on
    strand -1.
    """
    numpy.random.seed(1234)
    protein = random_protein_sequence(50, seed=123)
    sequence = reverse_translate(protein)
    rev_sequence = reverse_complement(sequence)
    minus_strand_location = (0, len(sequence), -1)
    constraints = [
        AvoidPattern("AGC"),
        EnforceTranslation(location=minus_strand_location),
    ]
    problem = DnaOptimizationProblem(
        sequence=rev_sequence,
        constraints=constraints,
    )

    assert not problem.all_constraints_pass()
    problem.resolve_constraints()
    assert problem.all_constraints_pass()
Beispiel #23
0
def test_basics():
    """BsaI-site avoidance and terminal GC content resolve together."""
    numpy.random.seed(123)
    nucleotide_probas = {"A": 0.2, "T": 0.2, "G": 0.3, "C": 0.3}
    starting_sequence = random_dna_sequence(
        10000, probas=nucleotide_probas, seed=123
    )
    no_bsai_site = AvoidPattern("BsaI_site")
    terminal_gc = EnforceTerminalGCContent(mini=0.2, maxi=0.4, window_size=50)
    problem = DnaOptimizationProblem(
        sequence=starting_sequence,
        constraints=[no_bsai_site, terminal_gc],
        logger=None,
    )

    assert not problem.all_constraints_pass()
    problem.resolve_constraints()
    assert problem.all_constraints_pass()
Beispiel #24
0
def test_AvoidChanges_with_max_edits():
    """Constraints resolve under ``AvoidChanges(max_edits=2)``.

    ``ATATA`` must be avoided while the sequence keeps exactly six ``A``
    and four ``T`` occurrences on the forward strand of positions 0-11.
    """
    numpy.random.seed(1)
    whole_sequence_forward = (0, 11, 1)
    constraints = [
        AvoidChanges(max_edits=2),
        AvoidPattern("ATATA"),
        EnforcePatternOccurence(
            "A", occurences=6, location=whole_sequence_forward
        ),
        EnforcePatternOccurence(
            "T", occurences=4, location=whole_sequence_forward
        ),
    ]
    problem = DnaOptimizationProblem(
        sequence="ATATATATATA",
        constraints=constraints,
        logger=None,
    )

    assert not problem.all_constraints_pass()
    problem.resolve_constraints()
    assert problem.all_constraints_pass()
def test_EnforceSequence_as_objective():
    """``EnforceSequence`` used as an objective can reach a score of 0.

    A 15-nucleotide stretch of ``W`` is requested at positions 50-65 as an
    objective, with ``AvoidPattern("ATGC")`` as the only constraint. The
    objective score starts below zero and must be exactly zero after
    resolution and optimization.
    """
    numpy.random.seed(1234)
    n_nucleotides = 15
    start = 50
    w_stretch = EnforceSequence(
        "W" * n_nucleotides,
        location=(start, start + n_nucleotides),
    )
    problem = DnaOptimizationProblem(
        sequence=25 * "ATGC",
        constraints=[AvoidPattern("ATGC")],
        objectives=[w_stretch],
    )

    assert problem.objective_scores_sum() < 0
    problem.resolve_constraints()
    problem.optimize()
    assert problem.objective_scores_sum() == 0
 def __init__(
         self,
         left_overhang,
         right_overhang,
         left_addition="",
         right_addition="",
         enzyme="BsmBI",
         extra_avoided_sites=(),
         description="Golden Gate domesticator",
         name="unnamed_domesticator",
         cds_by_default=False,
         constraints=(),
         objectives=(),
 ):
     """Build the flanks and site-avoidance constraints, then delegate to
     ``PartDomesticator.__init__``.

     The left flank is ``enzyme site + "A" + left_overhang +
     left_addition``; the right flank is ``right_addition + right_overhang
     + reverse complement of (enzyme site + "A")``.

     Parameters
     ----------
     left_overhang, right_overhang
       Overhang sequences. The raw values are stored on ``self``; annotated
       record versions are used to build the flanks.
     left_addition, right_addition
       Extra sequence placed between each overhang and the part.
     enzyme
       Name of the enzyme, looked up in ``Restriction.__dict__`` (an
       unknown name raises ``KeyError``).
     extra_avoided_sites
       Names of additional enzymes whose sites are avoided in the part,
       on top of ``enzyme``.
     description, name, cds_by_default, objectives
       Passed unchanged to ``PartDomesticator.__init__``.
     constraints
       Extra constraints, placed before the site-avoidance constraints
       created here.
     """
     self.enzyme = enzyme
     self.left_overhang = left_overhang
     left_overhang = sequence_to_biopython_record(left_overhang)
     self.right_overhang = right_overhang
     right_overhang = sequence_to_biopython_record(right_overhang)
     for seq in [left_overhang, right_overhang]:
         annotate_record(seq, label=str(seq.seq))
     enzyme_seq = Restriction.__dict__[enzyme].site
     enzyme_seq = sequence_to_biopython_record(enzyme_seq)
     annotate_record(enzyme_seq, label=enzyme)
     self.enzyme_seq = enzyme_seq
     left_flank = self.enzyme_seq + "A" + left_overhang + left_addition
     right_flank = (right_addition + right_overhang +
                    (self.enzyme_seq + "A").reverse_complement())
     self.extra_avoided_sites = extra_avoided_sites

     def avoid_site_in_part(enz):
         """Return a constraint factory avoiding ``enz`` sites in the part."""
         # ``enz`` is bound through this factory call, so each constraint
         # keeps its own enzyme. The previous lambda referenced ``enzyme``
         # and therefore never avoided the extra sites.
         return lambda seq: AvoidPattern(
             EnzymeSitePattern(enz),
             location=Location(len(left_flank),
                               len(left_flank) + len(seq)),
         )

     constraints = list(constraints) + [
         avoid_site_in_part(enz)
         for enz in [enzyme] + list(extra_avoided_sites)
     ]
     PartDomesticator.__init__(
         self,
         left_flank=left_flank,
         right_flank=right_flank,
         constraints=constraints,
         objectives=objectives,
         description=description,
         name=name,
         cds_by_default=cds_by_default,
     )
Beispiel #27
0
    def create_new_sequence(
        self,
        naive_target_sequence: str,
        codon_usage_table: Optional[str],
        existing_sequences: List[str]
    ) -> str:
        """Run DNAChisel to create a new codon-optimized DNA sequence.

        The translation of ``naive_target_sequence`` is enforced, every
        entry of ``existing_sequences`` is avoided as a pattern, and codon
        usage is matched to the hard-coded species ``"s_cerevisiae"``.

        NOTE(review): ``codon_usage_table`` is accepted but never read in
        this method -- confirm whether it should drive the objective.

        Returns the sequence held by the problem after constraint
        resolution and optimization.
        """
        # A windowed GC-content constraint (0.4-0.6 over 60 nt) was
        # previously listed here but is disabled.
        specifications = [EnforceTranslation()]
        for known_sequence in existing_sequences:
            specifications.append(AvoidPattern(known_sequence))

        codon_usage_objective = MatchTargetCodonUsage(species="s_cerevisiae")
        problem = DnaOptimizationProblem(
            sequence=naive_target_sequence,
            constraints=specifications,
            objectives=[codon_usage_objective],
        )

        problem.resolve_constraints(final_check=True)
        problem.optimize()
        return problem.sequence
from dnachisel import DnaOptimizationProblem, AvoidPattern, random_dna_sequence
from urllib import request

# DOWNLOAD THE LIST OF TF BINDING SITES
url = "http://regulondb.ccg.unam.mx/menu/download/datasets/files/PSSMSet.txt"
# Use the response as a context manager so the connection is closed once
# the payload has been read.
with request.urlopen(url) as response:
    data = response.read().decode('utf-8')

# PARSE THE DATA LINE BY LINE TO OBTAIN A LIST OF TF BINDING SEQUENCES
# Keep only non-empty lines made exclusively of the letters A, T, G, C.
tf_binding_sequences = [
    line for line in data.splitlines() if set() < set(line) <= set("ATGC")
]

# DEFINE AND SOLVE THE OPTIMIZATION PROBLEM
problem = DnaOptimizationProblem(
    sequence=random_dna_sequence(50000),
    constraints=[AvoidPattern(pattern) for pattern in tf_binding_sequences])
problem.resolve_constraints()
problem.to_record("sequence_without_tf_binding_sites.gb")
Beispiel #29
0
# Download a GenBank record from NCBI and load it.
url = (
    "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?"
    + "db=nucleotide&id=48994873&rettype=gb&retmode=txt"
)
# Use the response as a context manager so the connection is closed once
# the payload has been read.
with request.urlopen(url) as response:
    genbank_data = response.read().decode("utf-8")
genbank_record = load_record(StringIO(genbank_data), file_format="genbank")

print("INITIALIZING THE PROBLEM WITH CONSTRAINTS FOR EACH GENE...")

constraints = []
for feature in genbank_record.features:
    # Only single-part "gene" features are considered.
    if feature.type == "gene" and len(feature.location.parts) == 1:
        location = Location.from_biopython_location(feature.location)
        # Keep genes longer than 100 nt whose length is a multiple of 3.
        if (len(location) % 3 == 0) and len(location) > 100:
            gene_constraints = [
                EnforceTranslation(location=location),
                AvoidPattern("BsmBI_site", location),
                EnforceGCContent(
                    mini=0.40, maxi=0.60, window=150, location=location
                ),
            ]
            constraints.extend(gene_constraints)
problem = DnaOptimizationProblem(genbank_record, constraints)

print("RESOLVING THE CONSTRAINTS...")

problem.logger.ignore_bars_under = 50
problem.resolve_constraints()
problem.to_record("ecoli_genes_optimization.gb")
Beispiel #30
0
"""In this script we create a random sequence rid of the 6pb enzyme restriction
sites listed in Biopython."""

from dnachisel import DnaOptimizationProblem, AvoidPattern, random_dna_sequence
from Bio.Restriction import AllEnzymes

# CREATE AN AvoidPattern CONSTRAINT FOR EACH ENZYME SITE OF LENGTH 6

constraints = []
for enzyme in AllEnzymes:
    if enzyme.size == 6:
        constraints.append(AvoidPattern("%s_site" % enzyme))

# CREATE AND RESOLVE THE PROBLEM:

starting_sequence = random_dna_sequence(5000)
problem = DnaOptimizationProblem(
    sequence=starting_sequence,
    constraints=constraints,
    logger=None,
)
problem.resolve_constraints()

print("Final sequence:", problem.sequence)