def test_AvoidPattern_on_strands():
    """Check that AvoidPattern honours the strand given in its location.

    The start sequence contains both "CAT" and its reverse complement "ATG",
    so each scenario can tell which strand(s) the constraint acted on.
    """
    sequence = "CATGCTATGC"

    # Positive strand only: "CAT" must disappear, while "ATG" (i.e. "CAT" read
    # on the negative strand) is not targeted and survives at positions 6-9.
    problem = DnaOptimizationProblem(
        sequence,
        constraints=[AvoidPattern("CAT", location=(0, 10, 1))],
        logger=None,
    )
    problem.resolve_constraints()
    assert "CAT" not in problem.sequence
    assert "ATG" in problem.sequence

    # Negative strand only: "ATG" must disappear, "CAT" is left alone.
    problem = DnaOptimizationProblem(
        sequence,
        constraints=[AvoidPattern("CAT", location=(0, 10, -1))],
        logger=None,
    )
    problem.resolve_constraints()
    assert "CAT" in problem.sequence
    assert "ATG" not in problem.sequence

    # Both strands (no location given): neither form may remain.
    problem = DnaOptimizationProblem(
        sequence,
        constraints=[AvoidPattern("CAT")],
        logger=None,
    )
    problem.resolve_constraints()
    assert "CAT" not in problem.sequence
    assert "ATG" not in problem.sequence
def test_avoid_repeated_small_kmers():
    """A failing RepeatedKmerPattern(3, 3) avoidance passes once resolved."""
    repeat_rich_sequence = (
        "AGAAGAAGAAGAAGAAGATTTTTTTTTTTTTGGAGGAGGAGGACCCCCCCCCCCCGAGG"
    )
    no_repeats = AvoidPattern(RepeatedKmerPattern(3, 3))
    problem = DnaOptimizationProblem(
        sequence=repeat_rich_sequence,
        constraints=[no_repeats],
    )

    # The raw sequence must violate the constraint for the test to be useful.
    assert not problem.all_constraints_pass()

    problem.resolve_constraints()
    assert problem.all_constraints_pass()
def test_no_solution_error_frozen_region():
    """A forbidden site in an unchangeable sequence raises NoSolutionError."""
    # The sequence embeds CGTCTC while AvoidChanges() forbids any edit.
    frozen_sequence = "AAAAATCGTCTCTTTT"
    problem = DnaOptimizationProblem(
        sequence=frozen_sequence,
        constraints=[AvoidChanges(), AvoidPattern('BsmBI_site')],
    )

    with pytest.raises(NoSolutionError) as err:
        problem.resolve_constraints()

    error_message = str(err.value)
    assert 'region that cannot be mutated' in error_message
def test_no_solution_error_frozen_region():
    """A forbidden site in an unchangeable sequence raises NoSolutionError."""
    # The sequence embeds CGTCTC while AvoidChanges() forbids any edit.
    frozen_sequence = "AAAAATCGTCTCTTTT"
    problem = DnaOptimizationProblem(
        sequence=frozen_sequence,
        constraints=[AvoidChanges(), AvoidPattern(enzyme='BsmBI')],
    )

    with pytest.raises(NoSolutionError) as err:
        problem.resolve_constraints()

    error_message = str(err.value)
    assert 'Constraint breach in frozen region' in error_message
def test_avoid_pattern_basics():
    """BsaI avoidance fails on a seeded random sequence, then gets resolved."""
    numpy.random.seed(123)
    starting_sequence = random_dna_sequence(10000, seed=123)
    problem = DnaOptimizationProblem(
        sequence=starting_sequence,
        constraints=[AvoidPattern(enzyme="BsaI")],
    )

    assert not problem.all_constraints_pass()

    problem.resolve_constraints()
    assert problem.all_constraints_pass()
def test_constraints_text_summary():
    """The constraints text summary reports exactly one failed evaluation."""
    specifications = [
        EnforceGCContent(mini=0.4, maxi=0.6),
        AvoidPattern('ATT'),
    ]
    problem = DnaOptimizationProblem(
        sequence="ATTGCCATATGCGC",
        constraints=specifications,
    )

    summary = problem.constraints_text_summary()
    assert 'FAILURE: 1 constraints evaluations failed' in summary
def test_avoid_pattern_overlapping_locations():
    """Overlapping "NAN" matches are all resolved in an AG-repeat sequence."""
    numpy.random.seed(123)
    ag_repeats = "AGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAG"
    problem = DnaOptimizationProblem(
        sequence=ag_repeats,
        constraints=[AvoidPattern("NAN")],
    )

    assert not problem.all_constraints_pass()

    problem.resolve_constraints()
    assert problem.all_constraints_pass()

    # Only the first and last positions may still hold an "A".
    inner_region = problem.sequence[1:-1]
    assert "A" not in inner_region
def test_random_compatible_dna_sequence():
    """random_compatible_dna_sequence yields a sequence passing its constraints."""
    gc_window = EnforceGCContent(mini=0.4, maxi=0.6, window=50)
    no_atc = AvoidPattern('ATC')
    constraints = [gc_window, no_atc]

    seq = random_compatible_dna_sequence(1000, constraints=constraints)
    problem = DnaOptimizationProblem(sequence=seq, constraints=constraints)

    assert "ATC" not in seq
    assert problem.all_constraints_pass()
def test_EnforceTranlation():
    """"AAA" can be avoided while EnforceTranslation() is also enforced."""
    numpy.random.seed(1234)
    protein = random_protein_sequence(50, seed=123)
    coding_sequence = reverse_translate(protein)
    problem = DnaOptimizationProblem(
        sequence=coding_sequence,
        constraints=[AvoidPattern("AAA"), EnforceTranslation()],
    )

    assert not problem.all_constraints_pass()

    problem.resolve_constraints()
    assert problem.all_constraints_pass()
def test_pattern_and_reverse():
    """Removing tandem BsmBI sites in both orientations needs under 70 edits."""
    forward_site = "CGTCTC"
    reverse_site = "GAGACG"
    # Alternating runs of the site and its reverse complement.
    site_runs = [
        10 * forward_site,
        25 * reverse_site,
        15 * forward_site,
        15 * reverse_site,
    ]
    sequence = "".join(site_runs)

    problem = DnaOptimizationProblem(
        sequence,
        constraints=[AvoidPattern('BsmBI_site')],
        objectives=[AvoidChanges()],
    )
    problem.resolve_constraints()
    problem.optimize()

    edit_count = sum(problem.sequence_edits_as_array())
    assert edit_count < 70
def compute_forbidden_patterns_locations(self, record):
    """Return an array marking the positions covered by forbidden patterns.

    ``arr[i] == 1`` means that position ``i`` is surrounded by a
    user-forbidden pattern. The forbidden patterns are
    ``homopolymer_pattern(c, 5)`` for each nucleotide of ``'ATGC'`` and
    ``repeated_kmers(k, n)`` for ``(k, n)`` in ``(4, 2)``, ``(3, 3)``,
    ``(2, 4)``.

    Parameters
    ----------
    record
      Sequence (or record) handed to ``DnaOptimizationProblem``; its length
      sets the length of the returned array.
    """
    homopolymer_rules = [
        AvoidPattern(homopolymer_pattern(nucleotide, 5))
        for nucleotide in 'ATGC'
    ]
    repeat_rules = [
        AvoidPattern(repeated_kmers(kmer_size, repeats))
        for kmer_size, repeats in ((4, 2), (3, 3), (2, 4))
    ]
    problem = DnaOptimizationProblem(
        sequence=record,
        constraints=homopolymer_rules + repeat_rules,
    )

    # Collect the (start, end) span of every location reported by a failing
    # constraint evaluation.
    breach_spans = []
    for evaluation in problem.constraints_evaluations():
        for feature in evaluation.locations_to_biopython_features():
            if not evaluation.passes:
                span = (feature.location.start, feature.location.end)
                breach_spans.append(span)

    merged_spans = group_overlapping_segments(breach_spans)
    return segments_to_array(merged_spans, len(record))
def test_optimize_with_report(tmpdir):
    """optimize_with_report succeeds and writes into the target folder."""
    starting_sequence = random_dna_sequence(10000, seed=123)
    problem = DnaOptimizationProblem(
        sequence=starting_sequence,
        constraints=[AvoidPattern('BsmBI_site')],
    )

    report_dir = os.path.join(str(tmpdir), 'with_solution')
    os.mkdir(report_dir)
    assert os.listdir(report_dir) == []

    success, _message, _data = problem.optimize_with_report(report_dir)

    assert success
    assert os.listdir(report_dir) != []
def test_optimization_with_report_no_solution(tmpdir):
    """An unsolvable problem still produces a report but flags the failure."""
    starting_sequence = random_dna_sequence(10000, seed=123)
    # AvoidChanges() makes the BsmBI avoidance impossible to satisfy.
    problem = DnaOptimizationProblem(
        sequence=starting_sequence,
        constraints=[AvoidPattern(enzyme='BsmBI'), AvoidChanges()],
    )

    report_dir = os.path.join(str(tmpdir), 'no_solution')
    os.mkdir(report_dir)
    assert os.listdir(report_dir) == []

    success, _message, _data = optimization_with_report(report_dir, problem)

    assert not success
    assert os.listdir(report_dir) != []
def test_basics():
    """Terminal GC content and BsaI avoidance are resolvable together."""
    numpy.random.seed(123)
    nucleotide_probas = {'A': 0.2, 'T': 0.2, 'G': 0.3, 'C': 0.3}
    starting_sequence = random_dna_sequence(
        10000, probas=nucleotide_probas, seed=123
    )
    problem = DnaOptimizationProblem(
        sequence=starting_sequence,
        constraints=[
            AvoidPattern(enzyme="BsaI"),
            EnforceTerminalGCContent(mini=0.2, maxi=0.4, window_size=50),
        ],
    )

    assert not problem.all_constraints_pass()

    problem.resolve_constraints()
    assert problem.all_constraints_pass()
def test_AvoidPattern_with_jaspar_motifs():
    """Motifs loaded from JASPAR content can be used as avoided patterns."""
    jaspar_handle = StringIO(JASPAR_CONTENT)
    motif_patterns = MotifPssmPattern.list_from_file(
        jaspar_handle,
        file_format="jaspar",
        relative_threshold=0.9,
    )
    motif_constraints = [AvoidPattern(motif) for motif in motif_patterns]
    problem = DnaOptimizationProblem(
        sequence="GGGGGGGGGGTGCGTGATTAAAGGGGG",
        constraints=motif_constraints,
    )

    # Two breach locations are expected before any optimization.
    breach_locations = problem.constraints_evaluations().all_locations()
    assert len(breach_locations) == 2

    problem.resolve_constraints()
    assert problem.all_constraints_pass()
def test_optimize_with_report_no_solution(tmpdir):
    """An unsolvable problem still produces a report but flags the failure."""
    starting_sequence = random_dna_sequence(10000, seed=123)
    # AvoidChanges() makes the BsmBI avoidance impossible to satisfy.
    problem = DnaOptimizationProblem(
        sequence=starting_sequence,
        constraints=[AvoidPattern("BsmBI_site"), AvoidChanges()],
        logger=None,
    )

    report_dir = os.path.join(str(tmpdir), "no_solution")
    os.mkdir(report_dir)
    assert os.listdir(report_dir) == []

    success, _message, _data = problem.optimize_with_report(report_dir)

    assert not success
    assert os.listdir(report_dir) != []
def load_user_options(args, location):
    """Build the objectives and constraints selected by the user options.

    Parameters
    ----------
    args
      Options object whose attributes are read below (``harmonized``,
      ``species``, ``avoid_homopolymers``, ...) — presumably an
      ``argparse.Namespace``; confirm against the caller.
    location
      ``Location`` to which the location-aware specifications are applied.

    Returns
    -------
    tuple
      ``(objectives, constraints)``, two lists of specifications.
    """
    assert isinstance(location, Location)

    constraints = []
    objectives = []

    # Codon optimization and translation conservation are always requested.
    opt_mode = 'harmonized' if args.harmonized else 'best_codon'
    objectives += [
        CodonOptimize(species=args.species, location=location, mode=opt_mode)
    ]
    constraints += [EnforceTranslation(location=location)]

    if args.avoid_homopolymers:
        constraints += [
            AvoidPattern(
                HomopolymerPattern(nucleotide, args.avoid_homopolymers),
                location=location,
            )
            for nucleotide in "ATGC"
        ]

    if args.avoid_hairpins:
        constraints += [AvoidHairpins(location=location)]

    if args.avoid_patterns:
        constraints += [
            AvoidPattern(pattern, location=location)
            for pattern in args.avoid_patterns
        ]

    # NOTE! Printing this to a template is broken
    if args.avoid_restriction_sites:
        constraints += [
            AvoidPattern(EnzymeSitePattern(enzy), location=location)
            for enzy in args.avoid_restriction_sites
        ]

    if args.constrain_global_GC_content:
        constraints += [
            EnforceGCContent(
                mini=args.global_GC_content_min,
                maxi=args.global_GC_content_max,
                location=location,
            )
        ]

    if args.constrain_local_GC_content:
        # The windowed constraint should use the local upper bound, not the
        # global one. Fall back to the global maximum when the options object
        # does not define ``local_GC_content_max``.
        local_gc_max = getattr(
            args, "local_GC_content_max", args.global_GC_content_max
        )
        constraints += [
            EnforceGCContent(
                mini=args.local_GC_content_min,
                maxi=local_gc_max,
                window=args.local_GC_content_window,
                location=location,
            )
        ]

    if args.constrain_terminal_GC_content:
        constraints += [
            EnforceTerminalGCContent(
                mini=args.terminal_GC_content_min,
                maxi=args.terminal_GC_content_max,
                window_size=8,
                location=location,
            )
        ]

    if args.constrain_CAI:
        constraints += [
            ConstrainCAI(
                species=args.species,
                minimum=args.constrain_CAI_minimum,
                location=location,
            )
        ]

    if args.optimize_dicodon_frequency:
        objectives += [MaximizeDicodonAdaptiveIndex()]

    if args.kmers:
        objectives += [
            MinimizeKmerScore(
                k=args.kmers,
                boost=args.avoid_kmers_boost,
                location=location,
            )
        ]

    if args.avoid_secondary_structure:
        objectives += [
            MinimizeSecondaryStructure(
                max_energy=args.avoid_secondary_structure_max_e,
                location=location,
                boost=args.avoid_secondary_structure_boost,
            )
        ]

    if args.avoid_initiator_secondary_structure:
        objectives += [
            MinimizeSecondaryStructure(
                max_energy=args.avoid_initiator_secondary_structure_max_e,
                location=location,
                optimize_initiator=True,
                boost=args.avoid_initiator_secondary_structure_boost,
            )
        ]

    return objectives, constraints
def test_AvoidPattern_with_regular_expression():
    """A regular-expression pattern is avoidable under EnforceTranslation()."""
    sequence_chunks = [
        "ATGGTGAGCAAGGGCGAGGAGCTGTTCACCGGGGTGGTGCCCATCCTG",
        "GTCGAGCTGGACGGCGACGTAAACGGCCACAAGTTCAGCGTGCGCGGC",
        "GAGGGCGAGGGCGATGCCACCAACGGCAAGCTGACCCTGAAGTTCATC",
    ]
    sequence = "".join(sequence_chunks)

    problem = DnaOptimizationProblem(
        sequence=sequence,
        constraints=[EnforceTranslation(), AvoidPattern(r"GGT(.*)GAT")],
        logger=None,
    )

    assert not problem.all_constraints_pass()

    problem.resolve_constraints()
    assert problem.all_constraints_pass()
def test_EnforceSequence():
    """EnforceSequence fills a window with the requested degenerate symbol.

    A second scenario checks EnforcePatternOccurence on strand -1.
    """
    numpy.random.seed(1234)

    n_nucleotides = 15
    start = 50
    end = start + n_nucleotides

    # For each degenerate symbol, every nucleotide of the enforced window
    # must belong to the associated set of allowed nucleotides.
    for symbol, nucleotides in (("W", "AT"), ("S", "GC")):
        problem = DnaOptimizationProblem(
            sequence=25 * "ATGC",
            constraints=[
                AvoidPattern("ATGC"),
                AvoidPattern("AAA"),
                AvoidPattern("GGG"),
                EnforceSequence(n_nucleotides * symbol, location=(start, end)),
            ],
        )
        problem.max_random_iters = 10000
        problem.resolve_constraints()

        enforced_window = problem.sequence[start:end]
        assert all(n in nucleotides for n in enforced_window)

    # Test -1 strand:
    seq = "ATG" + "CAG" + "AGCAAGGTGCTGCT"
    occurrence_constraint = EnforcePatternOccurence(
        pattern="CTG",  # CAG on strand +1
        occurences=2,
        strand=-1,
        location=Location(start=0, end=50),
    )
    problem = DnaOptimizationProblem(
        sequence=seq,
        constraints=[occurrence_constraint],
    )

    assert not problem.all_constraints_pass()

    problem.resolve_constraints()
    assert problem.all_constraints_pass()
def test_EnforceSequence():
    """EnforceSequence fills a window with the requested degenerate symbol."""
    numpy.random.seed(1234)

    n_nucleotides = 15
    start = 50
    end = start + n_nucleotides

    # For each degenerate symbol, every nucleotide of the enforced window
    # must belong to the associated set of allowed nucleotides.
    for symbol, nucleotides in (('W', 'AT'), ('S', 'GC')):
        problem = DnaOptimizationProblem(
            sequence=25 * "ATGC",
            constraints=[
                AvoidPattern("ATGC"),
                AvoidPattern("AAA"),
                AvoidPattern("GGG"),
                EnforceSequence(n_nucleotides * symbol, location=(start, end)),
            ],
        )
        problem.max_random_iters = 10000
        problem.resolve_constraints()

        enforced_window = problem.sequence[start:end]
        assert all(n in nucleotides for n in enforced_window)
def test_EnforceGCContents():
    """Windowed GC bounds and BsaI avoidance resolve on a random sequence."""
    numpy.random.seed(123)
    starting_sequence = random_dna_sequence(10000, seed=123)
    problem = DnaOptimizationProblem(
        sequence=starting_sequence,
        constraints=[
            AvoidPattern(enzyme="BsaI"),
            EnforceGCContent(mini=0.3, maxi=0.7, window=50),
        ],
        objectives=[EnforceGCContent(target=0.4)],
    )

    assert not problem.all_constraints_pass()

    problem.resolve_constraints()
    assert problem.all_constraints_pass()
def test_EnforceTranlationReversed():
    """EnforceTranslation with a strand -1 location on a reversed CDS."""
    numpy.random.seed(1234)
    protein = random_protein_sequence(50, seed=123)
    sequence = reverse_translate(protein)
    rev_sequence = reverse_complement(sequence)

    whole_sequence_minus_strand = (0, len(sequence), -1)
    problem = DnaOptimizationProblem(
        sequence=rev_sequence,
        constraints=[
            AvoidPattern("AGC"),
            EnforceTranslation(location=whole_sequence_minus_strand),
        ],
    )

    assert not problem.all_constraints_pass()

    problem.resolve_constraints()
    assert problem.all_constraints_pass()
def test_basics():
    """Terminal GC content and BsaI avoidance are resolvable together."""
    numpy.random.seed(123)
    nucleotide_probas = {"A": 0.2, "T": 0.2, "G": 0.3, "C": 0.3}
    starting_sequence = random_dna_sequence(
        10000, probas=nucleotide_probas, seed=123
    )
    problem = DnaOptimizationProblem(
        sequence=starting_sequence,
        constraints=[
            AvoidPattern("BsaI_site"),
            EnforceTerminalGCContent(mini=0.2, maxi=0.4, window_size=50),
        ],
        logger=None,
    )

    assert not problem.all_constraints_pass()

    problem.resolve_constraints()
    assert problem.all_constraints_pass()
def test_AvoidChanges_with_max_edits():
    """Constraints are resolvable under AvoidChanges(max_edits=2)."""
    numpy.random.seed(1)
    whole_sequence_plus_strand = (0, 11, 1)
    problem = DnaOptimizationProblem(
        sequence="ATATATATATA",
        constraints=[
            AvoidChanges(max_edits=2),
            AvoidPattern("ATATA"),
            EnforcePatternOccurence(
                "A", occurences=6, location=whole_sequence_plus_strand
            ),
            EnforcePatternOccurence(
                "T", occurences=4, location=whole_sequence_plus_strand
            ),
        ],
        logger=None,
    )

    assert not problem.all_constraints_pass()

    problem.resolve_constraints()
    assert problem.all_constraints_pass()
def test_EnforceSequence_as_objective():
    """EnforceSequence used as an objective reaches a total score of 0."""
    numpy.random.seed(1234)

    n_nucleotides = 15
    start = 50
    window = (start, start + n_nucleotides)

    problem = DnaOptimizationProblem(
        sequence=25 * "ATGC",
        constraints=[AvoidPattern("ATGC")],
        objectives=[EnforceSequence("W" * n_nucleotides, location=window)],
    )

    # The starting sequence does not satisfy the objective yet.
    assert problem.objective_scores_sum() < 0

    problem.resolve_constraints()
    problem.optimize()
    assert problem.objective_scores_sum() == 0
def __init__(
    self,
    left_overhang,
    right_overhang,
    left_addition="",
    right_addition="",
    enzyme="BsmBI",
    extra_avoided_sites=(),
    description="Golden Gate domesticator",
    name="unnamed_domesticator",
    cds_by_default=False,
    constraints=(),
    objectives=(),
):
    """Set up the flanks and site-avoidance constraints of the domesticator.

    Parameters
    ----------
    left_overhang, right_overhang
      Overhang sequences; stored as given on ``self`` and converted to
      annotated Biopython records to build the flanks.
    left_addition, right_addition
      Sequences inserted between the overhangs and the part.
    enzyme
      Name of the enzyme, looked up in ``Bio.Restriction``; its site is
      placed in both flanks and avoided inside the part.
    extra_avoided_sites
      Names of additional enzymes whose sites must be avoided in the part.
    description, name, cds_by_default, objectives
      Passed unchanged to ``PartDomesticator.__init__``.
    constraints
      Extra constraints, placed before the site-avoidance constraints.
    """
    self.enzyme = enzyme
    self.left_overhang = left_overhang
    left_overhang = sequence_to_biopython_record(left_overhang)
    self.right_overhang = right_overhang
    right_overhang = sequence_to_biopython_record(right_overhang)
    for seq in [left_overhang, right_overhang]:
        annotate_record(seq, label=str(seq.seq))

    enzyme_seq = Restriction.__dict__[enzyme].site
    enzyme_seq = sequence_to_biopython_record(enzyme_seq)
    annotate_record(enzyme_seq, label=enzyme)
    self.enzyme_seq = enzyme_seq

    left_flank = self.enzyme_seq + "A" + left_overhang + left_addition
    right_flank = (
        right_addition
        + right_overhang
        + (self.enzyme_seq + "A").reverse_complement()
    )
    self.extra_avoided_sites = extra_avoided_sites

    part_start = len(left_flank)

    def avoid_site_in_part(enzyme_name):
        """Return a constraint factory avoiding ``enzyme_name`` in the part."""
        # A factory binds ``enzyme_name`` per enzyme. The previous lambda
        # always used ``enzyme``, so the extra sites were never avoided.
        def constraint_for(seq):
            return AvoidPattern(
                EnzymeSitePattern(enzyme_name),
                location=Location(part_start, part_start + len(seq)),
            )

        return constraint_for

    avoided_enzymes = [enzyme] + list(extra_avoided_sites)
    constraints = list(constraints) + [
        avoid_site_in_part(enz) for enz in avoided_enzymes
    ]

    PartDomesticator.__init__(
        self,
        left_flank=left_flank,
        right_flank=right_flank,
        constraints=constraints,
        objectives=objectives,
        description=description,
        name=name,
        cds_by_default=cds_by_default,
    )
def create_new_sequence(
    self,
    naive_target_sequence: str,
    codon_usage_table: Optional[str],
    existing_sequences: List[str],
) -> str:
    """Run DNAChisel to create a new codon optimized DNA sequence.

    The translation of ``naive_target_sequence`` is enforced, every entry
    of ``existing_sequences`` is used as an avoided pattern, and codon usage
    is matched against the ``"s_cerevisiae"`` species.

    Returns the sequence held by the problem after constraint resolution
    and optimization.
    """
    # NOTE(review): ``codon_usage_table`` is accepted but never used here;
    # the species is hard-coded below — confirm whether that is intended.
    translation_constraint = [EnforceTranslation()]
    avoid_existing = [AvoidPattern(existing) for existing in existing_sequences]

    problem = DnaOptimizationProblem(
        sequence=naive_target_sequence,
        constraints=translation_constraint + avoid_existing,
        objectives=[MatchTargetCodonUsage(species="s_cerevisiae")],
    )
    problem.resolve_constraints(final_check=True)
    problem.optimize()
    return problem.sequence
from dnachisel import DnaOptimizationProblem, AvoidPattern, random_dna_sequence
from urllib import request

# DOWNLOAD THE LIST OF TF BINDING SITES
# The response is used as a context manager so the connection gets closed.

url = "http://regulondb.ccg.unam.mx/menu/download/datasets/files/PSSMSet.txt"
with request.urlopen(url) as response:
    data = response.read().decode('utf-8')

# PARSE THE DATA LINE BY LINE TO OBTAIN A LIST OF TF BINDING SEQUENCES
# Keep only the non-empty lines made exclusively of the characters A, T, G, C.

tf_binding_sequences = [
    line
    for line in data.splitlines()
    if set() < set(line) <= set("ATGC")
]

# DEFINE AND SOLVE THE OPTIMIZATION PROBLEM

problem = DnaOptimizationProblem(
    sequence=random_dna_sequence(50000),
    constraints=[AvoidPattern(pattern) for pattern in tf_binding_sequences],
)
problem.resolve_constraints()
problem.to_record("sequence_without_tf_binding_sites.gb")
# DOWNLOAD THE GENBANK RECORD FROM NCBI
# The response is used as a context manager so the connection gets closed.
url = (
    "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?"
    + "db=nucleotide&id=48994873&rettype=gb&retmode=txt"
)
with request.urlopen(url) as response:
    genbank_data = response.read().decode("utf-8")
genbank_record = load_record(StringIO(genbank_data), file_format="genbank")

print("INITIALIZING THE PROBLEM WITH CONSTRAINTS FOR EACH GENE...")

constraints = []
for feature in genbank_record.features:
    # Only single-part "gene" features are considered.
    if feature.type != "gene" or len(feature.location.parts) != 1:
        continue
    location = Location.from_biopython_location(feature.location)
    # Skip genes whose length is not a multiple of 3 or is 100 or less.
    if len(location) % 3 != 0 or len(location) <= 100:
        continue
    constraints.extend(
        [
            EnforceTranslation(location=location),
            AvoidPattern("BsmBI_site", location),
            EnforceGCContent(
                mini=0.40, maxi=0.60, window=150, location=location
            ),
        ]
    )
problem = DnaOptimizationProblem(genbank_record, constraints)

print("RESOLVING THE CONSTRAINTS...")

problem.logger.ignore_bars_under = 50
problem.resolve_constraints()
problem.to_record("ecoli_genes_optimization.gb")
"""In this script we create a random sequence rid of the 6pb enzyme restriction sites listed in Biopython.""" from dnachisel import DnaOptimizationProblem, AvoidPattern, random_dna_sequence from Bio.Restriction import AllEnzymes # CREATE AN AvoidPattern CONSTRAINT FOR EACH ENZYME SITE OF LENGTH 6 constraints = [ AvoidPattern("%s_site" % enzyme) for enzyme in AllEnzymes if enzyme.size == 6 ] # CREATE AN RESOLVE THE PROBLEM: problem = DnaOptimizationProblem( sequence=random_dna_sequence(5000), constraints=constraints, logger=None ) problem.resolve_constraints() print ("Final sequence:", problem.sequence)