def test_highly_complex_design(span_cost, design_class):
    """Run the design pipeline on a goal that contains a repeated 30 bp record.

    The circular goal is ``backbone[1000:] + insert + backbone[:1000]``, where
    the insert holds the same random 30 bp record twice, separated by a random
    200 bp record and followed by a random 1000 bp record. Two overlapping
    backbone slices are supplied as linear templates. The test makes no
    assertions; it only prints the resulting tables and assemblies.

    :param span_cost: span cost object handed to ``design_class``.
    :param design_class: callable that builds the design from ``span_cost``.
    """
    backbone = random_record(3000)
    repeat = random_record(30)
    spacer = random_record(200)
    tail = random_record(1000)
    complex_sequence = repeat + spacer + repeat + tail
    goal = backbone[1000:] + complex_sequence + backbone[:1000]

    # The two templates overlap on backbone[1900:2000].
    left_template = backbone[:2000]
    right_template = backbone[1900:2500]
    make_linear([left_template, right_template])
    make_circular([goal])

    design = design_class(span_cost)
    design.n_jobs = 1
    design.add_materials(
        primers=[],
        templates=[left_template, right_template],
        queries=[goal],
        fragments=[],
    )
    design.compile()
    design.optimize()

    print(design.to_df()[1])
    first_result = list(design.results.values())[0]
    print(first_result.assemblies)
    print(first_result.assemblies[0]._nodes)
    print(first_result)
    print(design.out())
def to_record(seq, linear=True):
    """Wrap ``seq`` in a ``SeqRecord`` and tag its topology.

    :param seq: value passed to ``Seq`` to build the record's sequence.
    :param linear: when truthy the record is passed to ``make_linear``,
        otherwise to ``make_circular``.
    :return: the newly created record.
    """
    record = SeqRecord(Seq(seq))
    set_topology = make_linear if linear else make_circular
    set_topology([record])
    return record
def iter_fake_random_record(n_seqs: int, size_int: Tuple[int, int], cyclic: bool) -> List[SeqRecord]:
    """Yield ``n_seqs`` random, randomly annotated records one at a time.

    Note that this is a generator: records are produced lazily even though
    the return annotation reads ``List[SeqRecord]``.

    :param n_seqs: number of records to generate.
    :param size_int: ``(low, high)`` bounds passed to ``random.randint`` to
        pick each record's length.
    :param cyclic: when truthy each record is passed to ``make_circular``,
        otherwise to ``make_linear``.
    """
    set_topology = make_circular if cyclic else make_linear
    for _ in range(n_seqs):
        length = random.randint(*size_int)
        # A UUID in the name keeps every generated record distinguishable.
        name = "<random record {}".format(uuid4())
        record = biopython.random_record(length, name=name, auto_annotate=True)
        record.id = record.name
        biopython.randomly_annotate(record, (100, 1000))
        set_topology([record])
        yield record
def test_num_groups_vs_endpoints(here, paths, query, span_cost):
    """Print the squared group count next to the product of distinct endpoints.

    After blasting a single query, the lone alignment container is expanded
    and two numbers are printed for comparison: ``len(groups) ** 2`` and the
    number of distinct ``query_region.a`` values times the number of distinct
    ``query_region.b`` values. The only assertion is that exactly one
    container exists.
    """
    primers = make_linear(load_fasta_glob(paths["primers"]))
    templates = load_genbank_glob(paths["templates"])
    query_path = join(here, "data/test_data/genbank/designs", query)
    queries = make_circular(load_genbank_glob(query_path))

    design = Design(span_cost)
    design.add_materials(primers=primers, templates=templates, queries=queries)
    design._blast()

    containers = design.container_list
    assert len(containers) == 1
    container = containers[0]
    container.expand()

    groups = container.groups()
    print(len(groups) ** 2)

    endpoints = [(g.query_region.a, g.query_region.b) for g in groups]
    starts = {a for a, _ in endpoints}
    ends = {b for _, b in endpoints}
    print(len(starts) * len(ends))
def test_add_with_transform():
    """A record added with a transformation is retrievable in both forms.

    The database must hand back the original (origin) record as well as the
    transformed copy, under distinct keys.
    """
    from uuid import uuid4

    from pyblast.constants import Constants as C

    db = SeqRecordDB()

    def pseudocircularize(r):
        # Concatenate the record with itself and give the copy a fresh id.
        doubled = r + r
        doubled.name = C.PSEUDOCIRCULAR + "__" + r.name
        doubled.id = str(uuid4())
        return doubled

    record = SeqRecord(
        Seq("ACGTTCGTGATTGTGCTGTGTGTATGGTATGATTATAGTGATGTAGTGATGATGTAGTAGTATA")
    )
    records = make_circular([record])
    keys = db.add_many_with_transformations(
        records, pseudocircularize, C.PSEUDOCIRCULAR
    )

    key = keys[0]
    origin_key = db.get_origin_key(key)
    origin = db.get_origin(key)
    transformed = db.get(key)

    assert origin is record
    assert origin is not transformed
    assert origin_key is not key
    assert len(transformed) == 2 * len(record)
def test_add_multiple_transformations():
    """Adding two distinct records with a transformation stores four entries.

    Two copies of the same sequence with different ids are added; we expect
    one key per record and a database length of four.
    """
    from uuid import uuid4

    from pyblast.constants import Constants as C

    db = SeqRecordDB()

    def pseudocircularize(r):
        # Concatenate the record with itself and give the copy a fresh id.
        doubled = r + r
        doubled.name = C.PSEUDOCIRCULAR + "__" + r.name
        doubled.id = str(uuid4())
        return doubled

    record = SeqRecord(
        Seq("ACGTTCGTGATTGTGCTGTGTGTATGGTATGATTATAGTGATGTAGTGATGATGTAGTAGTATA")
    )
    first = deepcopy(record)
    second = deepcopy(record)
    first.id = "record1"
    second.id = "record2"

    records = make_circular([first, second])
    keys = db.add_many_with_transformations(
        records, pseudocircularize, C.PSEUDOCIRCULAR
    )

    assert len(keys) == 2
    assert len(db) == 4
def run(self, n_jobs: int = 10):
    """Run a design job.

    Loads primers, templates, fragments and goals from the paths stored on
    the instance, compiles and optimizes the design, then writes
    ``summary.csv`` and ``sequence_design.csv`` to the current working
    directory and ``sequences.gb`` into ``self._directory``.

    :param n_jobs: number of parallel jobs to run. (default: 10)
    :return: None
    """
    import warnings

    # Silence runtime and Biopython parser warnings from here on.
    for category in (RuntimeWarning, BiopythonParserWarning):
        warnings.simplefilter(action="ignore", category=category)

    log = self._logger

    log.info("Loading sequence files")
    materials = {
        "primers": make_linear(load_fasta_glob(self._primers)),
        "templates": make_circular(load_genbank_glob(self._templates)),
        "fragments": make_linear(load_genbank_glob(self._fragments)),
        "queries": make_circular(load_genbank_glob(self._goals)),
    }

    design = Design()
    design.n_jobs = n_jobs
    design.add_materials(**materials)

    log.info("Getting span cost model")
    design.span_cost = self._get_span_cost()

    log.info("Compiling possible molecular assemblies")
    design.compile()

    log.info("Optimizing molecular assemblies")
    design.optimize()

    log.info("Designing assembly primers and fragments")
    df, adf, _ = design.to_df()
    adf.to_csv("summary.csv")
    df.to_csv("sequence_design.csv")

    # Only the first assembly of each result contributes sequences.
    records = [
        molecule.sequence
        for result in design.results.values()
        if result.assemblies
        for _, _, molecule in result.assemblies[0].molecules
    ]
    SeqIO.write(records, os.path.join(self._directory, "sequences.gb"), "genbank")
def test_circular_over_subject(self):
    """Blast a linear query against a circular subject spanning the origin.

    The subject is ``record[200:300] + ns(500) + record[100:200]``, so the
    contiguous query region 100..300 is split across the subject's ends.
    The first result is checked with ``compare_result``.
    """
    record = rand_record(1000)
    wrapped_subject = record[200:300] + ns(500) + record[100:200]

    queries = make_linear([record])
    subjects = make_circular([wrapped_subject])

    results = BioBlast(subjects, queries).blastn()
    compare_result(results[0], 101, 300, 601, 100)
def blast_factory(paths) -> BioBlastFactory:
    """Build a ``BioBlastFactory`` loaded with primers, templates and queries.

    Primers are loaded from FASTA and made linear, templates are loaded from
    ``paths[REGISTRY]`` as-is, and queries are loaded and made circular. Each
    group is registered under its own name (``PRIMERS``, ``TEMPLATES``,
    ``QUERIES``).

    :param paths: mapping with entries for ``PRIMERS``, ``REGISTRY`` and
        ``QUERIES``.
    :return: the populated factory.
    """
    factory = BioBlastFactory()
    record_groups = (
        (make_linear(load_fasta_glob(paths[PRIMERS])), PRIMERS),
        (load_genbank_glob(paths[REGISTRY]), TEMPLATES),
        (make_circular(load_genbank_glob(paths[QUERIES])), QUERIES),
    )
    for records, group_name in record_groups:
        factory.add_records(records, group_name)
    return factory
def test_library_design_to_df_2(paths, here, span_cost):
    """Run a ``LibraryDesign`` on the ``test_data_sd2`` data set and dump it.

    Loads primers, plasmid templates, fragments and designs, compiles and
    optimizes the library design, prints the first assembly of each result,
    and writes ``library_design.csv``, ``library_summary.csv`` and
    ``designs.json`` to the current working directory. There are no
    assertions; the test passes if nothing raises.
    """
    data_dir = join(here, "data/test_data_sd2")
    primers_path = join(data_dir, "primers.fasta")
    fragments_path = join(data_dir, "fragments", "*.gb")
    plasmids_path = join(data_dir, "plasmids", "*.gb")
    designs_path = join(data_dir, "designs", "*.gb")

    primers = make_linear(load_fasta_glob(primers_path))
    templates = load_genbank_glob(plasmids_path)
    fragments = load_genbank_glob(fragments_path)
    print(fragments_path)
    queries = make_circular(load_genbank_glob(designs_path))

    design = LibraryDesign(span_cost=span_cost)
    design.n_jobs = 1
    design.add_materials(
        primers=primers,
        templates=make_circular(templates),
        queries=queries,
        fragments=make_linear(fragments),
    )
    design.logger.set_level("DEBUG")
    design.compile()
    results = design.optimize()

    for result in results.values():
        first_assembly = result.assemblies[0]
        print(first_assembly.to_df())

    design_df, summary_df, design_json = design.to_df()
    design_df.to_csv("library_design.csv")
    summary_df.to_csv("library_summary.csv")
    with open("designs.json", "w") as f:
        json.dump(design_json, f)

    for output in (design_df, summary_df, design_json):
        print(output)
def test_benchmark_blast(benchmark, here, paths, query):
    """Benchmark ``Design._blast`` for a single query design.

    ``_blast`` is invoked once directly before being handed to the
    ``benchmark`` callable.
    """
    query_path = join(here, "data/test_data/genbank/designs", query)

    design = Design()
    design.add_materials(
        primers=make_linear(load_fasta_glob(paths["primers"])),
        templates=load_genbank_glob(paths["templates"]),
        queries=make_circular(load_genbank_glob(query_path)),
    )

    design._blast()
    benchmark(design._blast)
def make_blast():
    """Create a ``BioBlast`` of all template files against one circular design.

    Subjects are every GenBank file under the templates directory; the single
    query is the ``pmodkan-ho-pact1-z4-er-vpr`` design, made circular. Both
    are loaded with ``force_unique_ids=True``.

    :return: a ``BioBlast`` instance built from the subjects and queries.
    """
    templates_glob = join(here, "data/test_data/genbank/templates/*.gb")
    design_path = join(
        here, "data/test_data/genbank/designs/pmodkan-ho-pact1-z4-er-vpr.gb"
    )

    subjects = load_genbank_glob(templates_glob, force_unique_ids=True)
    queries = make_circular(load_genbank_glob(design_path, force_unique_ids=True))
    assert is_circular(queries[0])

    return BioBlast(subjects, queries)
def test_circular_complete_subject(self):
    """A linear subject that crosses the circular query's origin aligns fully.

    The subject is ``record[500:] + record[:400]`` (900 bases). The result
    must cover subject positions 1..900 and query positions 501..400.
    """
    record = rand_record(1000)
    origin_spanning = record[500:] + record[:400]

    queries = make_circular([record])
    subjects = make_linear([origin_spanning])

    result = BioBlast(subjects, queries).blastn()[0]
    print(result)

    subject_hit = result["subject"]
    assert subject_hit["start"] == 1
    assert subject_hit["end"] == 900

    query_hit = result["query"]
    assert query_hit["start"] == 501
    assert query_hit["end"] == 400
def test_circular_complete_query_1(self):
    """The subject aligns to the whole circular query, starting at index 500.

    The subject is the query rotated by 500 bases (0-based), flanked on both
    sides by ``ns(100)``. Note that pyblast results are 1-based.
    """
    record = rand_record(1000)
    rotated = ns(100) + record[500:] + record[:500] + ns(100)

    queries = make_circular([record])
    subjects = make_linear([rotated])

    result = BioBlast(subjects, queries).blastn()[0]

    query_hit = result["query"]
    assert query_hit["start"] == 501
    assert query_hit["raw_end"] == 1500

    subject_hit = result["subject"]
    assert subject_hit["start"] == 101
    assert subject_hit["end"] == 1100
def _get_results_func(n_jobs):
    """Run the design pipeline on the test designs and return the outcome.

    At most ``LIM_NUM_DESIGNS`` designs are loaded. With ``n_jobs > 1`` the
    design is run through ``_run_with_pool(n_jobs, 1)``; otherwise ``run()``
    is used.

    :param n_jobs: number of jobs; values above 1 select the pooled run.
    :return: tuple of ``(design, design.results)``.
    """
    print("PROCESSING!")
    primers = make_linear(load_fasta_glob(paths["primers"]))
    templates = load_genbank_glob(paths["templates"])
    query_path = join(here, "data/test_data/genbank/designs/*.gb")
    queries = make_circular(load_genbank_glob(query_path))[:LIM_NUM_DESIGNS]

    design = Design(span_cost=cached_span_cost)
    design.add_materials(primers=primers, templates=templates, queries=queries)

    if n_jobs > 1:
        design._run_with_pool(n_jobs, 1)
    else:
        design.run()
    return design, design.results
def test_circular_over_query(self):
    """A linear subject spanning the circular query's origin is fully aligned.

    The subject is the last 100 bases of the query followed by its first 100.
    The query sequence reconstructed from the reported start and end must
    equal the subject sequence.
    """
    record = rand_record(1000)
    origin_spanning = record[-100:] + record[:100]

    queries = make_circular([record])
    subjects = make_linear([origin_spanning])

    results = BioBlast(subjects, queries).blastn()
    result = results[0]

    # Rebuild the aligned query: 1-based start to the end, then wrap around.
    head = record[result["query"]["start"] - 1:]
    tail = record[:result["query"]["end"]]
    result_seq = str((head + tail).seq)
    expected_seq = str(subjects[0].seq)
    assert result_seq == expected_seq

    compare_result(results[0], 1000 - 100 + 1, 100, 1, 200)
def test_example2():
    """Usage example: blast a linear subject against a circular query.

    The subject is the last 20 bases of the query followed by its first 30.
    The results are printed as indented JSON; nothing is asserted.
    """
    import json

    from Bio.Seq import Seq
    from Bio.SeqRecord import SeqRecord
    from pyblast import BioBlast
    from pyblast.utils import make_circular, make_linear

    seq = "ACGTTGTAGTGTAGTTGATGATGATGTCTGTGTCGTGTGATGTGCTAGGGGTTGATGTGAGTAGTTAGTGGTAGTGTTTAGGGGCGGCGCGGAGTATGCTG"
    query_record = SeqRecord(Seq(seq))
    subject_record = SeqRecord(Seq(seq[-20:] + seq[:30]))

    # pyblast requires a 'topology' annotation on the SeqRecords;
    # `make_linear` and `make_circular` set it for a list of records.
    subjects = make_linear([subject_record])
    queries = make_circular([query_record])

    results = BioBlast(subjects, queries).blastn()
    print(json.dumps(results, indent=2))
def test_circular_complete_query_4(self):
    """The subject wraps around the circular query with 10 extra bases per side.

    The subject is ``record[490:] + record[:510]`` flanked by ``ns(100)`` on
    both sides, so it overlaps itself by 10 bases at each end. Note that
    pyblast results are 1-based.
    """
    record = rand_record(1000)
    wrapped = ns(100) + record[-10 + 500:] + record[:500 + 10] + ns(100)

    queries = make_circular([record])
    subjects = make_linear([wrapped])

    result = BioBlast(subjects, queries).blastn()[0]

    query_hit = result["query"]
    assert query_hit["start"] == 491
    assert query_hit["raw_end"] == 1510

    subject_hit = result["subject"]
    assert subject_hit["start"] == 101
    assert subject_hit["end"] == 1120
def test_circular_complete_query_parametrized_rc(self, extra_right, extra_left):
    """Reverse-complemented subject wrapping a circular query by extra bases.

    The subject is built like the forward case (the query rotated by 500,
    extended by ``extra_left``/``extra_right`` bases and flanked by
    ``ns(100)``) and then reverse-complemented. Both the raw 1-based result
    and the 0-based spans from ``parse_result_to_span`` are checked.

    :param extra_left: extra bases included before index 500.
    :param extra_right: extra bases included after index 500.
    """
    record = rand_record(1000)
    forward = (
        ns(100)
        + record[(500 - extra_left):]
        + record[:(500 + extra_right)]
        + ns(100)
    )

    queries = make_circular([record])
    subjects = make_linear([forward.reverse_complement()])

    bioblast = BioBlast(subjects, queries)
    result = bioblast.blastn()[0]
    print(json.dumps(result, indent=2))

    # On the reverse strand the subject coordinates run from high to low.
    subject_far_end = 1100 + extra_right + extra_left
    assert result["query"]["start"] == 501 - extra_left
    assert result["query"]["raw_end"] == 1500 + extra_right
    assert result["subject"]["start"] == subject_far_end
    assert result["subject"]["end"] == 101

    # to spans
    query_span = bioblast.parse_result_to_span(result["query"], output_index=0)
    subject_span = bioblast.parse_result_to_span(result["subject"], output_index=0)

    expected_length = 1000 + extra_right + extra_left
    assert len(subject_span) == len(query_span) == expected_length
    assert query_span.a == 500 - extra_left
    assert query_span.b == 500 + extra_right
    assert subject_span.a == 100
    assert subject_span.b == subject_far_end
def test_add_same_transformation():
    """Adding the same record twice with a transformation stores it only once.

    Both returned keys must be identical and the database must contain two
    entries in total.
    """
    from uuid import uuid4

    from pyblast.constants import Constants as C

    db = SeqRecordDB()

    def pseudocircularize(r):
        # Concatenate the record with itself and give the copy a fresh id.
        doubled = r + r
        doubled.name = C.PSEUDOCIRCULAR + "__" + r.name
        doubled.id = str(uuid4())
        return doubled

    record = SeqRecord(
        Seq("ACGTTCGTGATTGTGCTGTGTGTATGGTATGATTATAGTGATGTAGTGATGATGTAGTAGTATA")
    )
    # The very same object appears twice in the input list.
    records = make_circular([record, record])
    keys = db.add_many_with_transformations(
        records, pseudocircularize, C.PSEUDOCIRCULAR
    )

    assert len(set(keys)) == 1
    assert len(db) == 2
def test_library_design_to_df(paths, here, span_cost):
    """Run a ``LibraryDesign`` on the library designs and dump its outputs.

    Compiles and optimizes the design, then writes ``library_design.csv``,
    ``library_summary.csv`` and ``designs.json`` to the current working
    directory and prints each output. There are no assertions; the test
    passes if nothing raises.
    """
    primers = make_linear(load_fasta_glob(paths["primers"]))
    templates = load_genbank_glob(paths["templates"])
    query_path = join(here, "data/test_data/genbank/library_designs/*.gb")
    queries = make_circular(load_genbank_glob(query_path))

    design = LibraryDesign(span_cost=span_cost)
    design.n_jobs = 1
    design.add_materials(primers=primers, templates=templates, queries=queries)
    design.logger.set_level("DEBUG")
    design.compile()
    results = design.optimize()
    print(results)

    design_df, summary_df, design_json = design.to_df()
    design_df.to_csv("library_design.csv")
    summary_df.to_csv("library_summary.csv")
    with open("designs.json", "w") as f:
        json.dump(design_json, f)

    print(design_df)
    print(summary_df)
    print(design_json)
def generate_fake_designs(
    n_designs: int,
    circular: bool,
    n_cyclic_seqs: int,
    n_linear_seqs: int,
    n_primers: int,
    n_primers_from_templates: int,
    design_sequence_similarity_length: int = 0,
    cyclic_size_int: Tuple[int, int] = (3000, 10000),
    linear_size_int: Tuple[int, int] = (100, 4000),
    primer_size_int: Tuple[int, int] = (15, 60),
    plasmid_size_interval: Tuple[int, int] = (5000, 10000),
    chunk_size_interval: Tuple[int, int] = (100, 3000),
    random_chunk_prob_int: Tuple[float, float] = (0, 0.5),
    random_chunk_size_int: Tuple[int, int] = (100, 1000),
):
    """Generate a fake sequence library plus designs derived from it.

    A library of cyclic, linear and short (primer) records is generated
    first. Additional primers are then sampled from the templates (cyclic
    plus linear records) and appended to the short records. Finally
    ``n_designs`` design records are assembled from the templates.

    :param n_designs: number of design records to generate.
    :param circular: whether designs are generated and tagged as circular
        (otherwise linear).
    :param n_cyclic_seqs: number of cyclic library sequences.
    :param n_linear_seqs: number of linear library sequences.
    :param n_primers: number of primers requested from the library generator.
    :param n_primers_from_templates: number of extra primers sampled from
        the templates.
    :param design_sequence_similarity_length: when non-zero, the designs are
        passed through ``_add_shared_sequence`` with this length.
    :param cyclic_size_int: size interval forwarded to the library generator
        for cyclic sequences.
    :param linear_size_int: size interval forwarded to the library generator
        for linear sequences.
    :param primer_size_int: size interval forwarded to the library generator
        for primers.
    :param plasmid_size_interval: size interval of each design.
    :param chunk_size_interval: chunk size interval used for each design.
    :param random_chunk_prob_int: random chunk probability interval used for
        each design.
    :param random_chunk_size_int: random chunk size interval used for each
        design.
    :return: dict with keys ``"design"``, ``"cyclic"``, ``"linear"`` and
        ``"short"``.
    """
    library = generate_fake_library(
        n_cyclic_seqs=n_cyclic_seqs,
        n_linear_seqs=n_linear_seqs,
        n_primers=n_primers,
        cyclic_size_int=cyclic_size_int,
        linear_size_int=linear_size_int,
        primer_size_int=primer_size_int,
    )
    linear_seqs = library["linear"]
    cyclic_seqs = library["cyclic"]
    short_seqs = library["short"]
    templates = cyclic_seqs + linear_seqs

    # Primers taken from the templates: one chunk each, no random chunks.
    for _ in range(n_primers_from_templates):
        short_seqs.append(
            biopython.random_record_from_library(
                templates,
                circular=False,
                size_interval=(15, 100),
                max_chunks=1,
                chunk_size_interval=(15, 60),
                random_chunk_prob_int=(0, 0),
                random_chunk_size_int=(0, 0),
            )
        )

    # generate designs from templates or random sequence
    designs = [
        biopython.random_record_from_library(
            templates,
            circular=circular,
            size_interval=plasmid_size_interval,
            chunk_size_interval=chunk_size_interval,
            random_chunk_prob_int=random_chunk_prob_int,
            random_chunk_size_int=random_chunk_size_int,
        )
        for _ in range(n_designs)
    ]

    if design_sequence_similarity_length:
        designs = _add_shared_sequence(designs, design_sequence_similarity_length)

    set_topology = make_circular if circular else make_linear
    set_topology(designs)

    return {
        "design": designs,
        "cyclic": cyclic_seqs,
        "linear": linear_seqs,
        "short": short_seqs,
    }
def make_circular_and_id(rlist):
    """Make every record in ``rlist`` circular and give each a fresh UUID id.

    The records are modified in place; nothing is returned.

    :param rlist: list of records to update.
    """
    make_circular(rlist)
    for record in rlist:
        # str(uuid4()) yields a new random identifier per record.
        record.id = str(uuid4())