def load_user_options(args, location):
    """Translate parsed user options into optimization objectives and constraints.

    Parameters
    ----------
    args
        Options object (attribute access) holding the user's choices, e.g.
        ``harmonized``, ``species``, ``avoid_homopolymers``,
        ``avoid_hairpins``, ``avoid_patterns``, ``avoid_restriction_sites``,
        the ``*_GC_content_*`` settings, ``constrain_CAI``, ``kmers`` and the
        secondary-structure settings.
    location
        ``Location`` every location-aware specification is attached to.

    Returns
    -------
    tuple
        ``(objectives, constraints)``, two lists of specifications.
    """
    assert(isinstance(location, Location))

    if args.harmonized:
        opt_mode = 'harmonized'
    else:
        opt_mode = 'best_codon'

    objectives = [
        CodonOptimize(species=args.species, location=location, mode=opt_mode)
    ]
    # set enforce translation to the whole thing
    constraints = [EnforceTranslation(location=location)]

    if args.avoid_homopolymers:
        # One constraint per nucleotide, in the order A, T, G, C.
        constraints += [
            AvoidPattern(
                HomopolymerPattern(base, args.avoid_homopolymers),
                location=location,
            )
            for base in "ATGC"
        ]

    if args.avoid_hairpins:
        constraints += [AvoidHairpins(location=location)]

    if args.avoid_patterns:
        constraints += [
            AvoidPattern(pattern, location=location)
            for pattern in args.avoid_patterns
        ]

    # NOTE! Printing this to a template is broken
    if args.avoid_restriction_sites:
        constraints += [
            AvoidPattern(EnzymeSitePattern(enzy), location=location)
            for enzy in args.avoid_restriction_sites
        ]

    if args.constrain_global_GC_content:
        constraints += [
            EnforceGCContent(
                mini=args.global_GC_content_min,
                maxi=args.global_GC_content_max,
                location=location,
            )
        ]

    if args.constrain_local_GC_content:
        # The windowed constraint should be bounded by the *local* maximum.
        # Fall back to the global maximum (the previous behaviour) when the
        # options object does not provide a local one.
        local_gc_max = getattr(args, "local_GC_content_max", None)
        if local_gc_max is None:
            local_gc_max = args.global_GC_content_max
        constraints += [
            EnforceGCContent(
                mini=args.local_GC_content_min,
                maxi=local_gc_max,
                window=args.local_GC_content_window,
                location=location,
            )
        ]

    if args.constrain_terminal_GC_content:
        constraints += [
            EnforceTerminalGCContent(
                mini=args.terminal_GC_content_min,
                maxi=args.terminal_GC_content_max,
                window_size=8,
                location=location,
            )
        ]

    if args.constrain_CAI:
        constraints += [
            ConstrainCAI(
                species=args.species,
                minimum=args.constrain_CAI_minimum,
                location=location,
            )
        ]

    if args.optimize_dicodon_frequency:
        objectives += [MaximizeDicodonAdaptiveIndex()]

    if args.kmers:
        objectives += [
            MinimizeKmerScore(
                k=args.kmers, boost=args.avoid_kmers_boost, location=location
            )
        ]

    if args.avoid_secondary_structure:
        objectives += [
            MinimizeSecondaryStructure(
                max_energy=args.avoid_secondary_structure_max_e,
                location=location,
                boost=args.avoid_secondary_structure_boost,
            )
        ]

    if args.avoid_initiator_secondary_structure:
        objectives += [
            MinimizeSecondaryStructure(
                max_energy=args.avoid_initiator_secondary_structure_max_e,
                location=location,
                optimize_initiator=True,
                boost=args.avoid_initiator_secondary_structure_boost,
            )
        ]

    return objectives, constraints
def test_EnforceGCContents():
    """A random 10 kb sequence initially fails, then passes after resolution."""
    numpy.random.seed(123)
    start_sequence = random_dna_sequence(10000, seed=123)
    hard_constraints = [
        AvoidPattern(enzyme="BsaI"),
        EnforceGCContent(mini=0.3, maxi=0.7, window=50),
    ]
    gc_target = EnforceGCContent(target=0.4)
    problem = DnaOptimizationProblem(
        sequence=start_sequence,
        constraints=hard_constraints,
        objectives=[gc_target],
    )

    # The untouched random sequence is expected to break a constraint.
    assert not problem.all_constraints_pass()

    problem.resolve_constraints()
    assert problem.all_constraints_pass()
def optimize(self, codon_table):
    """Codon-optimize the stored vaccine codons and store the result.

    Runs ``optimize_frequent`` first, then builds a DNA optimization problem
    from the resulting strand (translation enforced, windowed GC content
    between 0.54 and 0.9) and replaces the stored codon list with the
    optimized sequence split into triplets.

    Parameters
    ----------
    codon_table
        Passed to ``optimize_frequent`` and to ``pct.get_codons_table``.
    """
    self.optimize_frequent(codon_table)

    # Snapshot the codons, then empty the existing list in place.
    opt_codons = self.__vaccine_codons_gen.copy()
    self.__vaccine_codons_gen.clear()
    vac_strand = self.get_strand(opt_codons)

    codon_table = pct.get_codons_table(codon_table)
    translation = EnforceTranslation(genetic_table='Standard', start_codon='ATG')
    gc_window = EnforceGCContent(mini=0.54, maxi=0.9, window=120)
    best_codon = CodonOptimize(
        method="use_best_codon", codon_usage_table=codon_table
    )
    problem = DnaOptimizationProblem(
        sequence=vac_strand,
        constraints=[translation, gc_window],
        objectives=[best_codon],
    )
    problem.resolve_constraints()
    problem.optimize()

    # Group the optimized sequence three bases at a time; zipping one shared
    # iterator with itself drops a trailing incomplete triplet.
    bases = iter(problem.sequence)
    self.__vaccine_codons_gen = [
        first + second + third
        for first, second, third in zip(bases, bases, bases)
    ]
def test_EnforceRegionsCompatibility():
    """Four 4-bp regions must pairwise differ by at least 2 nucleotides.

    The problem combines the pairwise-compatibility constraint with a
    windowed GC-content constraint. Both are expected to fail on the initial
    random sequence and to pass after constraint resolution.
    """
    numpy.random.seed(123)

    def compatibility_condition(location1, location2, problem):
        """Return True when the two regions differ by at least 2 bp."""
        seq1 = location1.extract_sequence(problem.sequence)
        seq2 = location2.extract_sequence(problem.sequence)
        return sequences_differences(seq1, seq2) >= 2

    locations = [(0, 4), (50, 54), (100, 104), (150, 154)]
    problem = DnaOptimizationProblem(
        sequence=random_dna_sequence(200, seed=123),
        constraints=[
            EnforceRegionsCompatibility(
                locations=locations,
                compatibility_condition=compatibility_condition,
                condition_label="2bp difference",
            ),
            EnforceGCContent(mini=0.4, maxi=0.6, window=40),
        ],
        logger=None,
    )
    assert not any(e.passes for e in problem.constraints_evaluations())

    problem.resolve_constraints()
    assert problem.all_constraints_pass()

    seq = problem.sequence
    # `all(...)` is required here: asserting the bare list of booleans would
    # always succeed because a non-empty list is truthy.
    assert all(
        sequences_differences(seq[s1:e1], seq[s2:e2]) >= 2
        for (s1, e1), (s2, e2) in itertools.combinations(locations, 2)
    )
def test_optimization_2():
    """Optimizing for manufacturability lowers the Gibson assembly quote.

    The first 5500 bp of the test sequence are quoted before and after an
    optimization run restricted to a short random search.
    """
    fasta_path = os.path.join(
        "tests", "data", "test_optimization_sequence_2.fa"
    )
    sequence = str(load_record(fasta_path).seq)[:5500]

    # Two suppliers: one expensive and length-limited, one cheap but picky
    # about BsaI sites and local GC content.
    deluxe_dna = CommercialDnaOffer(
        name="DeluxeDNA.com",
        sequence_constraints=[SequenceLengthConstraint(max_length=4000)],
        pricing=PerBasepairPricing(0.20),
        lead_time=10,
    )
    cheap_constraints = [
        NoPatternConstraint(enzyme="BsaI"),
        EnforceGCContent(0.3, 0.7, window=60),
    ]
    cheap_dna = CommercialDnaOffer(
        name="CheapDNA.com",
        sequence_constraints=cheap_constraints,
        pricing=PerBasepairPricing(0.10),
        lead_time=15,
    )

    # BLOCKS TO CHUNKS ASSEMBLY
    gibson_method = GibsonAssemblyMethod(
        overhang_selector=FixedSizeSegmentSelector(10),
        min_segment_length=1000,
        max_segment_length=6000,
        duration=8,
        cost=16,
    )
    gibson_blocks_assembly_station = DnaAssemblyStation(
        name="Gibson Blocks Assembly",
        assembly_method=gibson_method,
        supplier=[deluxe_dna, cheap_dna],
        coarse_grain=30,
        fine_grain=False,
        memoize=True,
    )

    quote_before = gibson_blocks_assembly_station.get_quote(sequence)
    assert quote_before.price > 850

    manufacturability = OptimizeManufacturability(
        gibson_blocks_assembly_station
    )
    problem = DnaOptimizationProblem(
        sequence=sequence,
        constraints=[EnforceTranslation(location=(0, 4998))],
        objectives=[manufacturability],
    )
    problem.randomization_threshold = 0  # Forces "random search" mode
    problem.max_random_iters = 5
    problem.optimize()

    print("OPTIMIZATION DONE, GENERATING REPORT")
    quote_after = gibson_blocks_assembly_station.get_quote(problem.sequence)
    assert quote_after.price < 580
def test_constraints_text_summary():
    """The text summary reports exactly one failing constraint evaluation."""
    gc_bounds = EnforceGCContent(mini=0.4, maxi=0.6)
    no_att = AvoidPattern('ATT')
    problem = DnaOptimizationProblem(
        sequence="ATTGCCATATGCGC",
        constraints=[gc_bounds, no_att],
    )
    summary = problem.constraints_text_summary()
    assert 'FAILURE: 1 constraints evaluations failed' in summary
def test_no_solution_error_random_search():
    """An unsatisfiable 28-bp problem fails with the random-search message."""
    frozen_region = AvoidChanges((0, 10))
    high_gc = EnforceGCContent(mini=0.8)
    problem = DnaOptimizationProblem(
        sequence="TTTTTTTTTTTTTTTTTTTTTTTTTTTT",
        constraints=[frozen_region, high_gc],
    )
    with pytest.raises(NoSolutionError) as err:
        problem.resolve_constraints()
    # Checked after the context manager exits, once the error is captured.
    message = str(err.value)
    assert 'Random search did not' in message
def test_no_solution_error_exhaustive_search():
    """An unsatisfiable 7-bp problem fails with the exhaustive-search message."""
    frozen_region = AvoidChanges((0, 4))
    high_gc = EnforceGCContent(mini=0.8)
    problem = DnaOptimizationProblem(
        sequence="TTTTTTT",
        constraints=[frozen_region, high_gc],
    )
    with pytest.raises(NoSolutionError) as err:
        problem.resolve_constraints()
    # Checked after the context manager exits, once the error is captured.
    message = str(err.value)
    assert 'Exhaustive search failed' in message
def test_EnforceChoice():
    """The enzyme site inserted by EnforceChoice follows the GC constraint.

    Two enzymes are offered: BsmBI (CGTCTC) is GC-rich and EcoRI (GAATTC) is
    GC-poor. Which site ends up in the sequence depends on the GC-content
    constraint that accompanies the choice.
    """
    numpy.random.seed(123)
    spec = EnforceChoice(choices=['BsmBI_site', 'EcoRI_site'], location=(2, 8))

    # (GC-content bounds, site expected in the resolved sequence)
    scenarios = [
        ({"maxi": 0.3}, 'GAATTC'),
        ({"mini": 0.7}, 'CGTCTC'),
    ]
    for gc_bounds, expected_site in scenarios:
        problem = DnaOptimizationProblem(
            sequence="AGCCCCCCGT",
            constraints=[spec, EnforceGCContent(**gc_bounds)],
        )
        problem.resolve_constraints()
        assert expected_site in problem.sequence
def test_random_compatible_dna_sequence():
    """A generated 1000-bp sequence satisfies the constraints it was built for."""
    gc_window = EnforceGCContent(mini=0.4, maxi=0.6, window=50)
    no_atc = AvoidPattern('ATC')
    constraints = [gc_window, no_atc]

    seq = random_compatible_dna_sequence(1000, constraints=constraints)
    problem = DnaOptimizationProblem(sequence=seq, constraints=constraints)

    assert "ATC" not in seq
    assert problem.all_constraints_pass()
def test_parameters_from_string():
    """GC-content shorthand strings parse to (mini, maxi, target, window)."""
    cases = [
        ("35%", (None, None, 0.35, None)),
        ("35%/20bp", (None, None, 0.35, 20)),
        ("5-55%", (0.05, 0.55, None, None)),
        ("5-55%/400bp", (0.05, 0.55, None, 400)),
    ]
    for pattern, expected in cases:
        parsed = EnforceGCContent.string_to_parameters(pattern)
        mini, maxi, target, window = parsed
        assert (mini, maxi, target, window) == expected
def test_avoid_change_as_objectives_basics():
    """A larger AvoidChanges boost yields strictly fewer sequence edits.

    The same random sequence is optimized four times with increasing boost;
    the number of modified nucleotides must decrease each time, down to zero
    for the largest boost.
    """
    numpy.random.seed(123)
    boosts = (0, 0.1, 0.2, 1)
    results = []
    for boost in boosts:
        sequence = random_dna_sequence(1000, seed=123)
        gc_objective = EnforceGCContent(
            mini=0.45, maxi=0.55, window=80
        ).copy_with_changes(locations_span=300)
        keep_sequence = AvoidChanges(boost=boost).as_passive_objective()
        problem = DnaOptimizationProblem(
            sequence=sequence,
            objectives=[gc_objective, keep_sequence],
        )
        problem.optimize()
        results.append(
            sequences_differences(problem.sequence, problem.sequence_before)
        )

    no_boost, low_boost, mid_boost, full_boost = results
    assert no_boost > 40
    assert no_boost > low_boost > mid_boost > full_boost
    assert results[-1] == 0
# Download a GenBank record from NCBI, constrain each suitable gene, resolve
# the constraints and write the result to "ecoli_genes_optimization.gb".
url = (
    "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?"
    + "db=nucleotide&id=48994873&rettype=gb&retmode=txt"
)
# Use a context manager so the HTTP response is closed once it has been read.
with request.urlopen(url) as response:
    genbank_data = response.read().decode("utf-8")
genbank_record = load_record(StringIO(genbank_data), file_format="genbank")

print("INITIALIZING THE PROBLEM WITH CONSTRAINTS FOR EACH GENE...")
constraints = []
for feature in genbank_record.features:
    # Only single-part "gene" features are considered.
    if feature.type == "gene" and len(feature.location.parts) == 1:
        location = Location.from_biopython_location(feature.location)
        # Keep genes made of whole codons and longer than 100 bp.
        if (len(location) % 3 == 0) and len(location) > 100:
            gene_constraints = [
                EnforceTranslation(location=location),
                AvoidPattern("BsmBI_site", location),
                EnforceGCContent(
                    mini=0.40, maxi=0.60, window=150, location=location
                ),
            ]
            constraints.extend(gene_constraints)
problem = DnaOptimizationProblem(genbank_record, constraints)

print("RESOLVING THE CONSTRAINTS...")
problem.logger.ignore_bars_under = 50
problem.resolve_constraints()
problem.to_record("ecoli_genes_optimization.gb")
random_protein_sequence, reverse_translate, CodonOptimize, EnforceTranslation, AvoidPattern, EnforceGCContent, ) protein = random_protein_sequence(1000, seed=123) sequence = reverse_translate(protein) problem = DnaOptimizationProblem( sequence=sequence, constraints=[ EnforceTranslation(), AvoidPattern("BsmBI_site"), EnforceGCContent(mini=0.4, maxi=0.6, window=60), ], objectives=[CodonOptimize(species="s_cerevisiae")], ) print("\nBefore optimization:\n") print(problem.constraints_text_summary()) print(problem.objectives_text_summary()) problem.resolve_constraints(final_check=True) problem.optimize() print("\nAfter optimization:\n") print(problem.constraints_text_summary()) print(problem.objectives_text_summary())
def from_specs(
    n_barcodes=96,
    barcode_length=20,
    spacer="AA",
    forbidden_enzymes=("BsaI", "BsmBI", "BbsI"),
    barcode_tmin=55,
    barcode_tmax=70,
    other_primer_sequences=(),
    heterodim_tmax=5,
    max_homology_length=10,
    include_spacers=True,
    names_template="B_%03d",
):
    """Return a BarcodesCollection object with compatible barcodes.

    All barcodes are designed together as one long sequence of
    ``barcode + spacer`` units, which is then cut back into units.

    Parameters
    ----------
    n_barcodes
      Number of barcodes to design.

    barcode_length
      Length of each barcode.

    spacer
      Spacer to place between each barcode during the optimization, ideally
      the same spacer that will be used when adding the barcode to a part.

    include_spacers
      Whether the spacers should be part of the final sequence of the
      barcodes (they still won't be considered part of the annealing primer
      and won't be used for melting temperature computations).

    forbidden_enzymes
      Name of enzymes whose sites should not be in the barcodes.

    barcode_tmin, barcode_tmax
      Interval of acceptable values for the melting temperature.

    other_primer_sequences
      External sequences with which the primers should not anneal.

    heterodim_tmax
      Max acceptable melting temperature for the annealing of a barcode and
      one of the other_primer_sequences.

    max_homology_length
      Maximal homology between any two barcodes in the sequence.

    names_template
      The template used to name barcode number "i".
    """
    spacer_length = len(spacer)
    unit_length = barcode_length + spacer_length
    seq_len = n_barcodes * unit_length
    units_coordinates = [
        (i, i + unit_length) for i in range(0, seq_len, unit_length)
    ]

    # These two parameters were previously accepted but never used. An empty
    # collection maps to None, the value that was hard-coded before.
    external_primers = list(other_primer_sequences) or None

    constraints = [
        AvoidPattern(EnzymeSitePattern(enzyme))
        for enzyme in forbidden_enzymes
    ]
    for start, end in units_coordinates:
        barcode_location = (start, end - spacer_length)
        constraints += [
            AllowPrimer(
                tmin=barcode_tmin,
                tmax=barcode_tmax,
                max_homology_length=max_homology_length,
                avoid_heterodim_with=external_primers,
                max_heterodim_tm=heterodim_tmax,
                location=barcode_location,
            ),
            EnforceSequence(spacer, location=(end - spacer_length, end)),
            EnforceGCContent(mini=0.4, maxi=0.6, location=barcode_location),
        ]

    problem = DnaOptimizationProblem(
        sequence=random_dna_sequence(seq_len), constraints=constraints
    )
    problem.logger.ignored_bars.add("location")
    problem.resolve_constraints()

    barcodes = [
        problem.sequence[start:end] for (start, end) in units_coordinates
    ]
    if not include_spacers:
        # Slice by barcode length: `b[:-len(spacer)]` would yield an empty
        # string for an empty spacer, because `b[:-0]` is `b[:0]`.
        barcodes = [b[:barcode_length] for b in barcodes]
    names = [(names_template % (i + 1)) for i in range(len(barcodes))]
    return BarcodesCollection(zip(names, barcodes))
"""Example of use of the AvoidChanges as an objective to minimize modifications of a sequence.""" from dnachisel import (DnaOptimizationProblem, random_dna_sequence, AvoidPattern, AvoidChanges, sequences_differences, EnforceGCContent) # Note: we are not providing a location for AvoidChanges: it applies globally for boost in (0, 0.1, 1, 10.0): sequence = random_dna_sequence(1000, seed=123) problem = DnaOptimizationProblem( sequence=sequence, objectives=[ EnforceGCContent(mini=0.45, maxi=0.55, window=80), AvoidChanges(boost=boost).as_passive_objective() ]) problem.optimize() differences = sequences_differences(problem.sequence, problem.sequence_before) print("%d nucleotides modified for boost=%.1f" % (differences, boost))
for k2, v2 in v.items(): if k2 in RSCU_list: codon_table_11[k][k2] = RSCU_list[k2] print("\nOptimizing codons for input gene list") #Read gene fasta sequence and initiate optimizer problem = DnaOptimizationProblem( sequence=gene, constraints=[ EnforceTranslation(), AvoidPattern("BsmBI_site", "BamHI"), EnforceTranslation(), EnforceGCContent(mini=0.35, maxi=0.65, window=50), #TWIST: 25% and 65% GC ], objectives=[CodonOptimize(codon_usage_table=codon_table_11)], ) if taxid and not input_path: print("\nOptimizing codons for taxonomic ID: " + taxid) #Read gene fasta sequence and initiate optimizer if not protein_flag: problem = DnaOptimizationProblem( sequence=gene, constraints=[ #EnforceSequence(sequence = "ATG", location=(0, 2)), AvoidChanges(location=(0, 2)), AvoidPattern("BsmBI_site", "BamHI"),
DnaOptimizationProblem, random_dna_sequence, EnforceGCContent, AvoidPattern, EnforceGCContent, ) import numpy # We setup the randomizer to always get the same sequence numpy.random.seed(123) problem = DnaOptimizationProblem( sequence=random_dna_sequence(10000), constraints=[ AvoidPattern("BsaI_site"), EnforceGCContent(mini=0.3, maxi=0.7, window=50), ], objectives=[EnforceGCContent(target=0.4)], ) print("\n\n=== Status before optimization ===") print(problem.constraints_text_summary()) print(problem.objectives_text_summary()) print("Now solving constraints...") problem.resolve_constraints() print("Done. Now optimizing objectives...") problem.max_random_iters = 10000 problem.optimize() print("\n\n=== Status after optimization ===\n")
def from_specs(
    n_barcodes=384,
    barcode_length=20,
    spacer="",
    forbidden_enzymes=("BsaI",),
    include_spacers=True,
    names_template="B_%03d",
):
    """Return a CustomBarcodesCollection object with compatible barcodes.

    All barcodes are designed together as one long sequence of
    ``barcode + spacer`` units, which is then cut back into units.

    **Parameters**

    **n_barcodes**
    > Number of barcodes to design.

    **barcode_length**
    > Length of each barcode.

    **spacer**
    > Spacer to place between each barcode during the optimization, ideally
    the same spacer that will be used when adding the barcode to a part.

    **include_spacers**
    > Whether the spacers should be part of the final sequence of the
    barcodes (they still won't be considered part of the annealing primer
    and won't be used for melting temperature computations).

    **forbidden_enzymes**
    > Name of enzymes whose sites should not be in the barcodes.

    **names_template**
    > The template used to name barcode number "i".
    """
    spacer_length = len(spacer)
    unit_length = barcode_length + spacer_length
    seq_len = n_barcodes * unit_length
    units_coordinates = [
        (i, i + unit_length) for i in range(0, seq_len, unit_length)
    ]

    constraints = [
        AvoidPattern(EnzymeSitePattern(enzyme))
        for enzyme in forbidden_enzymes
    ]
    constraints += [AvoidPattern(RepeatedKmerPattern(4, 1))]
    for start, end in units_coordinates:
        constraints += [
            # NOTE(review): this location covers the spacer region, which is
            # empty for the default spacer - confirm it should not be
            # (start, end - len(spacer)) like the GC constraint below.
            UniquifyAllKmers(
                barcode_length,
                reference=None,
                location=(end - spacer_length, end),
            ),
            EnforceGCContent(
                mini=0.3, maxi=0.7, location=(start, end - spacer_length)
            ),
        ]

    problem = DnaOptimizationProblem(
        sequence=random_dna_sequence(seq_len), constraints=constraints
    )
    problem.logger.ignored_bars.add("location")
    problem.resolve_constraints()

    barcodes = [
        problem.sequence[start:end] for (start, end) in units_coordinates
    ]
    if not include_spacers:
        # Slice by barcode length: `b[:-len(spacer)]` would yield an empty
        # string for the default empty spacer, because `b[:-0]` is `b[:0]`.
        barcodes = [b[:barcode_length] for b in barcodes]
    names = [(names_template % (i + 1)) for i in range(len(barcodes))]
    return CustomBarcodesCollection(zip(names, barcodes))
# Load the GenBank record and collect (start, end, strand) for each CDS.
record = load_record(record_file, fmt="genbank")
CDS_list = [
    (
        int(feature.location.start),
        int(feature.location.end),
        int(feature.location.strand),
    )
    for feature in record.features
    if feature.type == "CDS"
]

# DEFINE CONSTRAINTS

# Sequence-wide restrictions: enzyme sites, homopolymers, GC content.
dna_provider_constraints = [
    AvoidPattern("BsaI_site"),
    AvoidPattern("AarI_site"),
    AvoidPattern("9xA"),
    AvoidPattern("9xT"),
    AvoidPattern(HomopolymerPattern("6xG")),
    AvoidPattern(HomopolymerPattern("6xC")),
    EnforceGCContent(0.4, 0.65),
    EnforceGCContent(0.25, 0.80, window=50),
]

CDS_constraints = []
for (start, end, strand) in CDS_list:
    # The 30-bp-wide region next to the CDS start is frozen: it lies before
    # `start` on strand 1 and after `end` otherwise.
    promoter_region = (
        (start - 30, start - 1) if strand == 1 else (end + 1, end + 30)
    )
    CDS_constraints.extend(
        [
            AvoidChanges(promoter_region),
            EnforceTranslation((start, end, strand)),
        ]
    )

# DEFINE OBJECTIVES