Beispiel #1
0
def make_restriction_part(part_length, left_overhang, right_overhang,
                          enzyme, forbidden_enzymes, assembly_enzyme='BsmBI'):
    """Build an annotated record: flank + overhang/core/overhang + flank.

    The core is a random sequence of ``part_length`` nucleotides placed
    between the two overhangs. Constraints are then resolved so that the
    overhangs stay untouched, a site for ``enzyme`` occurs inside the core,
    and no site for any of ``forbidden_enzymes`` or for ``assembly_enzyme``
    is present.

    Parameters
    ----------
    part_length
        Number of random nucleotides between the two overhangs.
    left_overhang, right_overhang
        Sequences placed before / after the random core and kept unchanged.
    enzyme
        Name of the enzyme whose site is enforced in the core.
    forbidden_enzymes
        List of enzyme names whose sites are avoided (it is concatenated
        with a list, so it must itself be a list).
    assembly_enzyme
        Name of the enzyme whose site forms the flanks; its site is also
        avoided in the overhang/core/overhang segment.

    Returns
    -------
    The record ``flank + core + reverse_complement(flank)`` with 'overhang',
    enzyme-site and 'flank' annotations.
    """
    core_start = len(left_overhang)
    core_end = core_start + part_length
    overhang_spans = [
        (0, core_start),
        (core_end, core_end + len(right_overhang)),
    ]

    initial_sequence = (
        left_overhang + dc.random_dna_sequence(part_length) + right_overhang
    )
    enforce_enzyme = dc.EnforcePatternOccurence(
        enzyme=enzyme, location=(core_start, core_end))

    constraints = [dc.AvoidChanges(span) for span in overhang_spans]
    constraints.append(enforce_enzyme)
    for avoided_name in forbidden_enzymes + [assembly_enzyme]:
        constraints.append(dc.AvoidPattern(enzyme=avoided_name))

    problem = dc.DnaOptimizationProblem(
        sequence=initial_sequence, constraints=constraints)
    problem.resolve_constraints()

    core_record = dc.sequence_to_biopython_record(problem.sequence)
    for span in overhang_spans:
        dc.annotate_record(core_record, span, 'overhang')

    # Annotate the first match reported by the enforced-site constraint.
    first_match = enforce_enzyme.evaluate(problem).data['matches'][0]
    dc.annotate_record(core_record, first_match.to_tuple(), enzyme)

    # Each flank is the assembly enzyme's site followed by a single 'A'.
    site = vars(Restriction)[assembly_enzyme].site
    flank = dc.sequence_to_biopython_record(site + 'A')
    dc.annotate_record(flank, label='flank')
    return flank + core_record + flank.reverse_complement()
def test_circular_sequence_basic():
    """Resolve constraints on four random 212-nt circular sequences.

    Each sequence starts with "CTC" and ends with "CGT", so joining the two
    ends yields "CGTCTC" across the origin; the same motif is also placed
    between the two random arms. The GC-content location (150, 250) extends
    past the end of the 212-nt sequence.
    """
    np.random.seed(123)
    # The feature is still young, so exercise it on several sequences.
    for _ in range(4):
        first_arm = dc.random_dna_sequence(100)
        second_arm = dc.random_dna_sequence(100)
        dna_sequence = "".join(
            ["CTC", first_arm, "CGTCTC", second_arm, "CGT"]
        )
        specs = [
            dc.AvoidPattern("BsmBI_site"),
            dc.EnforceGCContent(
                mini=0.4, maxi=0.6, location=(150, 250), window=50
            ),
            dc.UniquifyAllKmers(k=9, location=(10, 100)),
        ]
        problem = dc.CircularDnaOptimizationProblem(
            sequence=dna_sequence, constraints=specs, logger=None
        )
        # The constraints must fail before resolution and pass after it.
        assert not problem.all_constraints_pass()
        problem.resolve_constraints()
        assert problem.all_constraints_pass()
def test_circular_sequence_optimize_with_report(tmpdir):
    """Check ``optimize_with_report`` on a CircularDnaOptimizationProblem.

    A single seeded random circular sequence is optimized with a report
    written to a fresh, empty directory under ``tmpdir``; afterwards all
    constraints must pass and the exported record must differ from the
    input sequence.
    """
    np.random.seed(123)
    first_arm = dc.random_dna_sequence(100)
    second_arm = dc.random_dna_sequence(100)
    dna_sequence = "".join(["CTC", first_arm, "CGTCTC", second_arm, "CGT"])

    specs = [
        dc.AvoidPattern("BsmBI_site"),
        dc.EnforceGCContent(
            mini=0.4, maxi=0.6, location=(150, 250), window=50
        ),
        dc.UniquifyAllKmers(k=9, location=(10, 100)),
    ]
    problem = dc.CircularDnaOptimizationProblem(
        sequence=dna_sequence, constraints=specs, logger=None
    )

    # The report goes to a newly created (hence empty) directory.
    target = os.path.join(str(tmpdir), "circular_with_solution")
    os.mkdir(target)
    assert not os.listdir(target)

    assert not problem.all_constraints_pass()
    # The three returned values are unpacked but not inspected further.
    _success, _message, _data = problem.optimize_with_report(target)
    assert problem.all_constraints_pass()

    optimized = str(problem.to_record().seq)
    assert optimized != dna_sequence
Beispiel #4
0
def test_whole_sequence_change_objective_20_going_down():
    """Edits drop from at least 24 to exactly 20 once the objective is optimized.

    The 40-nt sequence "ATAT...AT" must lose every "ATA"; resolving that
    constraint is expected to leave at least 24 edits, and optimizing the
    ``EnforceChanges(amount=20)`` objective must then bring the count to 20.
    """
    np.random.seed(123)
    avoid_ata = dc.AvoidPattern("ATA")
    change_objective = dc.EnforceChanges(amount=20)
    problem = dc.DnaOptimizationProblem(
        sequence="AT" * 20,
        constraints=[avoid_ata],
        objectives=[change_objective],
    )
    problem.mutations_per_iteration = 2

    problem.resolve_constraints()
    assert problem.number_of_edits() >= 24

    problem.optimize()
    assert problem.number_of_edits() == 20
def test_constraints_reports():
    """Run the constraint-breach report over the bundled genbank records.

    Loads every file of ``tests/data/10_emma_genbanks``, evaluates a fixed
    set of constraints on them, and checks the total number of features on
    the annotated records as well as the size of the generated PDF data.
    """
    genbank_dir = os.path.join("tests", "data", "10_emma_genbanks")
    records = []
    for filename in os.listdir(genbank_dir):
        path = os.path.join(genbank_dir, filename)
        records.append(dc.load_record(path, name=filename))

    # Constraints checked on each record.
    avoided_patterns = (
        "BsaI_site",
        "BsmBI_site",
        "BbsI_site",
        "8x1mer",
        "5x3mer",
        "9x2mer",
    )
    constraints = [dc.AvoidPattern(name) for name in avoided_patterns]
    constraints.append(dc.AvoidHairpins(stem_size=20, hairpin_window=200))
    constraints.append(dc.EnforceGCContent(mini=0.3, maxi=0.7, window=100))

    # Breach table -> annotated records -> PDF data.
    breach_table = cr.constraints_breaches_dataframe(constraints, records)
    annotated = cr.records_from_breaches_dataframe(breach_table, records)
    assert sum(len(record.features) for record in annotated) == 157

    pdf_data = cr.breaches_records_to_pdf(annotated)
    assert 70000 < len(pdf_data) < 72000
def test_insert_and_erase_pattern():
    """Drive the number of "ATGC" occurrences from 0 to 5, then from 5 to 2.

    The translation of the starting sequence is enforced throughout, so the
    pattern has to be inserted and removed without changing the protein.
    """
    numpy.random.seed(123)
    protein = dc.random_protein_sequence(100)
    pattern = "ATGC"

    # Start from a sequence encoding the protein with no pattern occurrence.
    sequence = dc.random_compatible_dna_sequence(
        sequence_length=300,
        constraints=[
            dc.EnforceTranslation(translation=protein),
            dc.AvoidPattern(pattern),
        ],
        logger=None,
    )

    # Each phase: (target number of occurrences, expected initial score).
    # First 0 -> 5 (score -5), then 5 -> 2 (score -3).
    for target_count, initial_score in [(5, -5), (2, -3)]:
        problem = dc.DnaOptimizationProblem(
            sequence=sequence,
            constraints=[
                dc.EnforcePatternOccurence(pattern, occurences=target_count),
                dc.EnforceTranslation(),
            ],
            logger=None,
        )
        occurrence_constraint = problem.constraints[0]
        assert occurrence_constraint.evaluate(problem).score == initial_score
        problem.resolve_constraints()
        assert problem.all_constraints_pass()
        # The next phase starts from this phase's solution.
        sequence = problem.sequence
Beispiel #7
0
    def work(self):
        """Evaluate manufacturability constraints and package the reports.

        Reads ``self.data.files`` to obtain the records, evaluates a fixed
        list of constraints on them, and produces three artifacts: a PDF of
        the breaches, a zip archive of annotated records (filled only when
        ``self.data.include_genbanks`` is truthy) and an Excel spreadsheet
        of the breaches dataframe.

        Returns
        -------
        dict
            Keys ``"pdf_report"``, ``"records"`` and ``"spreadsheet"``, each
            mapping to a dict with ``"data"`` (the output of
            ``data_to_html_data``), ``"name"`` and ``"mimetype"``.
        """
        data = self.data

        self.logger(message="Generating report...")
        records = records_from_data_files(data.files)
        constraints = [
            dc.AvoidPattern("BsaI_site"),
            dc.AvoidPattern("BsmBI_site"),
            dc.AvoidPattern("BbsI_site"),
            dc.AvoidPattern("SapI_site"),
            dc.AvoidPattern("8x1mer"),
            dc.AvoidPattern("5x3mer"),
            dc.AvoidPattern("9x2mer"),
            dc.AvoidHairpins(stem_size=20, hairpin_window=200),
            dc.EnforceGCContent(mini=0.3, maxi=0.7, window=100),
            dc.EnforceGCContent(mini=0.1, maxi=0.9, window=100),
            dc.UniquifyAllKmers(k=15),
        ]

        # Spreadsheet: the breaches dataframe, written to an in-memory buffer.
        dataframe = cr.constraints_breaches_dataframe(constraints, records)
        spreadsheet_io = BytesIO()
        dataframe.to_excel(spreadsheet_io)

        # From here on, ``records`` are the ones rebuilt from the dataframe.
        records = cr.records_from_breaches_dataframe(dataframe, records)

        # Zip archive: one "<record id>.gb" file per record, only on request.
        # The archive is created and returned even when it stays empty.
        zipped_records = flametree.file_tree("@memory")
        if data.include_genbanks:
            for record in records:
                target = zipped_records._file("%s.gb" % record.id)
                write_record(record, target)

        # PDF: breach plots written to an in-memory buffer.
        pdf_io = BytesIO()
        cr.breaches_records_to_pdf(records, pdf_io, logger=self.logger)

        return {
            "pdf_report": {
                "data": data_to_html_data(
                    pdf_io.getvalue(),
                    "pdf",
                    filename="manufacturability_report.pdf",
                ),
                "name": "manufacturability_report.pdf",
                "mimetype": "application/pdf",
            },
            "records": {
                "data": data_to_html_data(
                    zipped_records._close(),
                    "zip",
                    filename="manufacturability_annotated_records.zip",
                ),
                "name": "manufacturability_annotated_records.zip",
                "mimetype": "application/zip",
            },
            "spreadsheet": {
                "data": data_to_html_data(
                    spreadsheet_io.getvalue(),
                    "xlsx",
                    filename="manufacturability_report.xlsx",
                ),
                "name": "manufacturability_report.xlsx",
                # Full media type, including the "application/" prefix that
                # the pdf and zip entries already carry.
                "mimetype": (
                    "application/"
                    "vnd.openxmlformats-officedocument.spreadsheetml.sheet"
                ),
            },
        }
problem:

- The sequence is designed to have a cross-origin BsmBI site that will need
  to be removed, because the location-less specification ``AvoidPattern``
  is interpreted as applying to the full circle.

- The specification ``EnforceGCContent`` is cross-origin since its location is
  1500-2500, and the sequence is ~2000bp long.

"""

import dnachisel as dc

# Two random 1000-nt arms around a central "CGTCTC". The sequence also starts
# with "CTC" and ends with "CGT", so the two ends joined across the origin
# read "CGTCTC" as well.
dna_sequence = "".join(
    (
        "CTC",
        dc.random_dna_sequence(1000),
        "CGTCTC",
        dc.random_dna_sequence(1000),
        "CGT",
    )
)

# The GC-content location (1500, 2500) runs past the end of the sequence.
constraints = [
    dc.AvoidPattern("BsmBI_site"),
    dc.EnforceGCContent(
        mini=0.4,
        maxi=0.6,
        location=(1500, 2500),
        window=50,
    ),
    dc.UniquifyAllKmers(k=9, location=(10, 1000)),
]

problem = dc.CircularDnaOptimizationProblem(
    sequence=dna_sequence,
    constraints=constraints,
)

# Print the constraints summary before and after resolution.
print("BEFORE OPTIMIZATION:\n\n", problem.constraints_text_summary())
problem.resolve_constraints()
print("AFTER OPTIMIZATION:\n\n", problem.constraints_text_summary())
from copy import deepcopy
from collections import OrderedDict
import dnachisel as dc
import pandas
import seaborn as sns

# Apply seaborn's default settings.
sns.set()

# DEFINE HOW OPTIMIZATION PROBLEMS ARE CREATED

# Template specification for each label. The keys are the labels used to
# look specifications up by name.
specifications = {
    "~keep": dc.AvoidChanges(),
    "~no(CG)": dc.AvoidPattern("CG"),
    "~codon_optimize": dc.CodonOptimize(species="e_coli"),
    "~unique_kmers": dc.UniquifyAllKmers(20),
    "~gc(39%)": dc.EnforceGCContent(target=0.39, window=200),
}

# Reverse lookup: specification class -> label.
class_to_label = {
    template.__class__: name for name, template in specifications.items()
}

# Record loaded from "record.gb".
sequence = dc.load_record("record.gb")


def create_problem(boost_profile):
    location = dc.Location(1000, 9247)
    objectives = []
    for spec_name, boost in boost_profile.items():
        spec = specifications[spec_name]
        spec = spec.copy_with_changes(boost=boost, location=location)
        objectives.append(spec)
    return dc.DnaOptimizationProblem(
Beispiel #10
0
import dnachisel as dc
import dnachisel.reports.constraints_reports as cr
import os

# LOAD EVERY RECORD FOUND IN THE genbanks/ FOLDER, NAMED AFTER ITS FILE

records = [
    dc.load_record(os.path.join("genbanks", genbank_file), name=genbank_file)
    for genbank_file in os.listdir("genbanks")
]

# CONSTRAINTS EVALUATED ON EACH RECORD

constraints = [
    dc.AvoidPattern(pattern_name)
    for pattern_name in (
        "BsaI_site",
        "BsmBI_site",
        "BbsI_site",
        "8x1mer",
        "5x3mer",
        "9x2mer",
    )
] + [
    dc.AvoidHairpins(stem_size=20, hairpin_window=200),
    dc.EnforceGCContent(mini=0.3, maxi=0.7, window=100),
]

# WRITE THE BREACHES TO A SPREADSHEET, THEN PLOT THEM TO A PDF

dataframe = cr.constraints_breaches_dataframe(constraints, records)
dataframe.to_excel("breaches.xlsx")
# The records are replaced by the ones rebuilt from the breaches dataframe.
records = cr.records_from_breaches_dataframe(dataframe, records)
cr.breaches_records_to_pdf(records, "breaches_plots.pdf")
# Literal i-motif sequence: the expansion of the regex below. It is defined
# here because the visible snippet uses `i_motif` without assigning it.
# NOTE(review): if an earlier, unseen part of the file already defines
# `i_motif`, confirm it has this same value.
i_motif = "CCCTTT" * 3 + "CCC"
regex = "(CCCTTT){3}C{3}"  # optimal pattern for i-motif formation

# Embed the motif between two random 50-nt stretches.
query_seq = (
    dnachisel.random_dna_sequence(length=50)
    + i_motif
    + dnachisel.random_dna_sequence(length=50)
)
print(query_seq)
seq = Bio.Seq.Seq(query_seq)

# Find first occurrence:
print(seq.find(i_motif))

# Find all:
matches = [
    (m.start(), m.end()) for m in re.finditer(i_motif, str(seq))
]  # list of (start, end) tuples
print(seq[matches[0][0] : matches[0][1]])

# Find regex with DNA Chisel:
problem = dnachisel.DnaOptimizationProblem(
    sequence=query_seq, constraints=[dnachisel.AvoidPattern(pattern=regex)]
)
print(problem.constraints_text_summary())


compact_regex = "(C{3}T{3}){3}C{3}"  # variant of the same regex
problem = dnachisel.DnaOptimizationProblem(
    sequence=query_seq, constraints=[dnachisel.AvoidPattern(pattern=compact_regex)]
)
print(problem.constraints_text_summary())