def test_highly_complex_design(span_cost, design_class):
    """Smoke-test a design whose goal contains a repeated element.

    The goal is a rotated backbone with an insert that carries the same
    ``repeat`` record twice. Only two overlapping backbone slices are offered
    as templates. The results are printed for inspection; nothing is asserted.
    """
    backbone = random_record(3000)
    repeat = random_record(30)
    spacer = random_record(200)
    tail = random_record(1000)
    complex_sequence = repeat + spacer + repeat + tail
    goal = backbone[1000:] + complex_sequence + backbone[:1000]

    left_template = backbone[:2000]
    right_template = backbone[1900:2500]
    templates = [left_template, right_template]
    make_linear(templates)
    make_circular([goal])

    design = design_class(span_cost)
    design.n_jobs = 1
    design.add_materials(
        primers=[],
        templates=templates,
        queries=[goal],
        fragments=[],
    )
    design.compile()
    design.optimize()

    print(design.to_df()[1])
    first_result = list(design.results.values())[0]
    print(first_result.assemblies)
    print(first_result.assemblies[0]._nodes)
    print(first_result)
    print(design.out())
def test_library(span_cost):
    """Check that a library design reports three clusters in its notes.

    Three circular goals that share sequence are designed from five linear
    templates. For every result, the first assembly's dataframe must mention
    ``'n_clusters': 3`` in its ``notes`` column.
    """
    goal1 = random_record(4000)
    goal2 = random_record(2000) + goal1[2000:3000] + random_record(2000)
    goal3 = goal2 + random_record(100)
    goals = [goal1, goal2, goal3]
    make_circular_and_id(goals)

    templates = [
        goal1[:2000],
        goal1[3000:4000],
        goal2[-2000:],
        goal2[:2000],
        goal1[:2100],
    ]
    make_linear(templates)

    design = LibraryDesign(span_cost)
    design.n_jobs = 1
    design.add_materials(
        primers=[],
        templates=templates,
        queries=goals,
        fragments=[],
    )
    design.compile()
    results = design.optimize()

    for result in results.values():
        first_assembly = result.assemblies[0]
        print(first_assembly.compute_cost())
        df = first_assembly.to_df()
        print(df)
        notes = list(df["notes"])
        assert "'n_clusters': 3" in str(notes)
def to_record(seq, linear=True):
    """Wrap ``seq`` in a ``SeqRecord`` and mark its topology.

    :param seq: sequence passed to ``Seq``.
    :param linear: if truthy the record is passed to ``make_linear``,
        otherwise to ``make_circular``.
    :return: the new record.
    """
    record = SeqRecord(Seq(seq))
    set_topology = make_linear if linear else make_circular
    set_topology([record])
    return record
def test_simple_alignment():
    """Align a record against a copy of itself trimmed by 10 on each end."""
    record = rand_record(1000)
    query_records = [record[:]]
    subject_records = [record[10:-10]]
    query_records = make_linear(query_records)
    subject_records = make_linear(subject_records)

    results = BioBlast(subject_records, query_records).blastn()

    assert len(results) == 1
    query_end = len(record) - 10
    subject_end = len(record) - 10 - 10
    compare_result(results[0], 11, query_end, 1, subject_end)
def test_align_Ns():
    """Blast a record against itself flanked by 500 ``N`` on both sides.

    The results are only printed; nothing is asserted.
    """
    record = rand_record(1000)
    n_flank = SeqRecord(Seq("N" * 500))
    query_records = [record[:]]
    subject_records = [n_flank + record + n_flank]
    query_records = make_linear(query_records)
    subject_records = make_linear(subject_records)

    blaster = BioBlast(subject_records, query_records)
    print(blaster.blastn())
def test_partial_alignment(left_spacer, ij):
    """Align a record against one of its slices prefixed by a spacer.

    :param left_spacer: size passed to ``ns`` to build the subject's prefix.
    :param ij: pair of slice bounds taken from the record.
    """
    start, stop = ij[0], ij[1]
    record = rand_record(1000)
    query_records = [record[:]]
    subject_records = [ns(left_spacer) + record[start:stop]]
    query_records = make_linear(query_records)
    subject_records = make_linear(subject_records)

    results = BioBlast(subject_records, query_records).blastn()

    assert len(results) == 1
    compare_result(
        results[0],
        start + 1,
        stop,
        1 + left_spacer,
        stop - start + left_spacer,
    )
def iter_fake_random_record(n_seqs: int, size_int: Tuple[int, int],
                            cyclic: bool) -> List[SeqRecord]:
    """Lazily yield ``n_seqs`` randomly generated, annotated records.

    Note that this is a generator: records are yielded one at a time rather
    than returned as a list, despite the return annotation.

    :param n_seqs: number of records to produce.
    :param size_int: bounds passed to ``random.randint`` to pick each length.
    :param cyclic: if truthy each record is passed to ``make_circular``,
        otherwise to ``make_linear``.
    """
    set_topology = make_circular if cyclic else make_linear
    for _ in range(n_seqs):
        seq_length = random.randint(*size_int)
        record_name = "<random record {}".format(uuid4())
        record = biopython.random_record(
            seq_length, name=record_name, auto_annotate=True)
        record.id = record.name
        biopython.randomly_annotate(record, (100, 1000))
        set_topology([record])
        yield record
def test_partial_alignment_reverse_complement(left_spacer, ij):
    """Align a record against the reverse complement of one of its slices.

    The subject is a spacer followed by the reverse-complemented slice.

    :param left_spacer: size passed to ``ns`` to build the subject's prefix.
    :param ij: pair of slice bounds taken from the record.
    """
    start, stop = ij[0], ij[1]
    record = rand_record(1000)
    query_records = [record[:]]
    subject_records = [record[start:stop]]
    subject_records[0] = (
        ns(left_spacer) + subject_records[0].reverse_complement())
    query_records = make_linear(query_records)
    subject_records = make_linear(subject_records)

    results = BioBlast(subject_records, query_records).blastn()

    assert len(results) == 1
    compare_result(
        results[0],
        start + 1,
        stop,
        len(subject_records[0].seq),
        left_spacer + 1,
    )
def test_num_groups_vs_endpoints(here, paths, query, span_cost):
    """Print the squared group count next to the product of endpoint counts.

    Blasts a single design, expands its only container, and prints
    ``len(groups) ** 2`` and the number of distinct ``a`` endpoints times the
    number of distinct ``b`` endpoints of the groups' query regions.
    """
    primers = make_linear(load_fasta_glob(paths["primers"]))
    templates = load_genbank_glob(paths["templates"])
    design_path = join(here, "data/test_data/genbank/designs", query)
    queries = make_circular(load_genbank_glob(design_path))

    design = Design(span_cost)
    design.add_materials(primers=primers, templates=templates,
                         queries=queries)
    design._blast()

    containers = design.container_list
    assert len(containers) == 1
    only_container = containers[0]
    only_container.expand()
    groups = only_container.groups()
    print(len(groups)**2)

    starts = set()
    ends = set()
    for group in groups:
        starts.add(group.query_region.a)
        ends.add(group.query_region.b)
    print(len(starts) * len(ends))
def test_unnamed_queries_raises_duplicate_error(here):
    """Building a ``BioBlast`` from two unnamed queries must raise.

    The queries are bare ``SeqRecord`` objects created without an id or name,
    and ``BioBlast`` is expected to raise ``PyBlastException`` for them.
    """
    template_glob = join(here, "data/test_data/genbank/templates/*.gb")
    subjects = load_genbank_glob(template_glob, force_unique_ids=True)

    first_seq = str(subjects[0].seq)[:1000]
    second_seq = str(subjects[1].seq)[:1000]
    queries = [SeqRecord(Seq(first_seq)), SeqRecord(Seq(second_seq))]
    make_linear(queries)

    with pytest.raises(PyBlastException):
        BioBlast(subjects, queries)
def run(self, n_jobs: int = 10):
    """Run a design job.

    Loads primers, templates, fragments and goals from the glob patterns
    stored on the instance, compiles and optimizes a ``Design``, then writes
    ``summary.csv`` and ``sequence_design.csv`` to the current working
    directory and ``sequences.gb`` to ``self._directory``.

    ``RuntimeWarning`` and ``BiopythonParserWarning`` are silenced only while
    the job runs; the caller's warning filters are restored afterwards.

    :param n_jobs: number of parallel jobs to run. (default: 10)
    :return: None
    """
    import warnings

    # Scope the filters so that running a job does not permanently change the
    # process-wide warning configuration.
    with warnings.catch_warnings():
        warnings.simplefilter(action="ignore", category=RuntimeWarning)
        warnings.simplefilter(action="ignore",
                              category=BiopythonParserWarning)

        self._logger.info("Loading sequence files")
        primers = make_linear(load_fasta_glob(self._primers))
        templates = make_circular(load_genbank_glob(self._templates))
        fragments = make_linear(load_genbank_glob(self._fragments))
        goals = make_circular(load_genbank_glob(self._goals))

        design = Design()
        design.n_jobs = n_jobs
        design.add_materials(primers=primers,
                             templates=templates,
                             fragments=fragments,
                             queries=goals)

        self._logger.info("Getting span cost model")
        design.span_cost = self._get_span_cost()

        self._logger.info("Compiling possible molecular assemblies")
        design.compile()

        self._logger.info("Optimizing molecular assemblies")
        design.optimize()

        self._logger.info("Designing assembly primers and fragments")
        df, adf, design_json = design.to_df()
        # NOTE(review): the CSV files go to the current working directory
        # while sequences.gb goes to self._directory -- confirm this is
        # intended before changing either location.
        adf.to_csv("summary.csv")
        df.to_csv("sequence_design.csv")

        records = []
        for result in design.results.values():
            if not result.assemblies:
                continue
            # Only the first assembly of each result is exported.
            first_assembly = result.assemblies[0]
            records.extend(
                molecule.sequence
                for _, _, molecule in first_assembly.molecules)

        SeqIO.write(records, os.path.join(self._directory, "sequences.gb"),
                    "genbank")
def test_reverse_alignment_simple():
    """Align a record against the reverse complement of its 10..990 slice."""
    record = rand_record(1000)
    rc_subject = record[10:990].reverse_complement()
    subjects = make_linear([rc_subject])
    queries = make_linear([record])

    bioblast = BioBlast(subjects, queries)
    results = bioblast.blastn()

    for key, stored in bioblast.seq_db.records.items():
        print(key)
        print(stored)
    print(json.dumps(results, indent=2))

    top_hit = results[0]
    assert top_hit["query"]["start"] == 10 + 1
    assert top_hit["query"]["end"] == 990
    assert top_hit["subject"]["start"] == 980
    assert top_hit["subject"]["end"] == 1
def test(paths):
    """Blast the first template against its own reverse complement.

    The top hit must report the subject on strand -1, starting at the
    template's length.
    """
    factory = BioBlastFactory()
    loaded = load_genbank_glob(paths[TEMPLATES])
    subject = loaded[0]
    rc_query = subject.reverse_complement()

    templates = make_linear([subject])
    queries = make_linear([rc_query])
    factory.add_records(templates, TEMPLATES)
    factory.add_records(queries, QUERIES)

    results = factory(TEMPLATES, QUERIES).blastn()

    subject_hit = results[0]["subject"]
    assert subject_hit["strand"] == -1
    assert subject_hit["start"] == len(templates[0])
def test_validate_rc(here):
    """Check strand and start when the query is the template's reverse complement."""
    design_glob = join(here, "data/test_data/genbank/designs/*.gb")
    loaded = load_genbank_glob(design_glob, force_unique_ids=True)
    templates = make_linear(loaded[:1])
    queries = make_linear([templates[0].reverse_complement()])

    factory = BioBlastFactory()
    factory.add_records(queries, "queries")
    factory.add_records(templates, "templates")

    results = factory("templates", "queries").blastn()

    top_hit = results[0]
    assert top_hit["subject"]["strand"] == -1
    assert top_hit["query"]["strand"] == 1
    assert top_hit["subject"]["start"] == len(queries[0])
def test_basic_run_reverse_complement():
    """Every alignment of a reverse-complemented fragment must be on strand -1.

    The subject embeds the fragment between two junk sequences; the query is
    the fragment's reverse complement.
    """
    junk1 = "atgctatgctgatgctgctgtgctgatgctgatgtgtattgctgtatcgcgcgagttagc"
    junk2 = "g" * 30
    frag = "aaacttcccaccccataccctattaccactgccaattacctagtggtttcatttactctaaacctgtgattcctctgaattattttcatttta"

    forward = SeqRecord(seq=Seq(frag), annotations={"circular": False})
    query = forward.reverse_complement()
    subject = SeqRecord(seq=Seq(junk1 + frag + junk2),
                        annotations={"circular": False})
    make_linear([query])

    blaster = BioBlast([subject], [query])
    blaster.blastn()

    for alignment in blaster.results:
        print(json.dumps(alignment, indent=2))
        assert alignment["subject"]["strand"] == -1
def test_circular_over_subject(self):
    """Align a linear query across the origin of a circular subject.

    The subject is ``record[200:300]``, then an ``ns(500)`` spacer, then
    ``record[100:200]``, so the matching region wraps around the subject's
    end.
    """
    record = rand_record(1000)
    query_records = [record]
    subject_records = [record[200:300] + ns(500) + record[100:200]]
    query_records = make_linear(query_records)
    subject_records = make_circular(subject_records)

    results = BioBlast(subject_records, query_records).blastn()

    compare_result(results[0], 101, 300, 601, 100)
def blast_factory(paths) -> BioBlastFactory:
    """Build a ``BioBlastFactory`` loaded with primers, templates and queries.

    Primers are made linear and queries circular; templates are added exactly
    as loaded from ``paths[REGISTRY]``.
    """
    factory = BioBlastFactory()
    primers = make_linear(load_fasta_glob(paths[PRIMERS]))
    templates = load_genbank_glob(paths[REGISTRY])
    queries = make_circular(load_genbank_glob(paths[QUERIES]))

    keyed_records = (
        (primers, PRIMERS),
        (templates, TEMPLATES),
        (queries, QUERIES),
    )
    for records, key in keyed_records:
        factory.add_records(records, key)
    return factory
def test_library_design_to_df_2(paths, here, span_cost):
    """Run a library design on the ``test_data_sd2`` set and export it.

    Prints each result's first assembly, then writes ``library_design.csv``,
    ``library_summary.csv`` and ``designs.json`` to the current working
    directory and prints the three exported objects.
    """
    data_dir = join(here, "data/test_data_sd2")
    primers_path = join(data_dir, "primers.fasta")
    fragments_path = join(data_dir, "fragments", "*.gb")
    plasmids_path = join(data_dir, "plasmids", "*.gb")
    designs_path = join(data_dir, "designs", "*.gb")

    primers = make_linear(load_fasta_glob(primers_path))
    templates = load_genbank_glob(plasmids_path)
    fragments = load_genbank_glob(fragments_path)
    print(fragments_path)
    queries = make_circular(load_genbank_glob(designs_path))

    design = LibraryDesign(span_cost=span_cost)
    design.n_jobs = 1
    design.add_materials(
        primers=primers,
        templates=make_circular(templates),
        queries=queries,
        fragments=make_linear(fragments),
    )
    design.logger.set_level("DEBUG")
    design.compile()
    results = design.optimize()

    for result in results.values():
        print(result.assemblies[0].to_df())

    design_df, summary_df, design_json = design.to_df()
    design_df.to_csv("library_design.csv")
    summary_df.to_csv("library_summary.csv")
    with open("designs.json", "w") as handle:
        json.dump(design_json, handle)

    print(design_df)
    print(summary_df)
    print(design_json)
def test_self_blast(here):
    """Blasting a single record against itself must give no results."""
    template_glob = join(here, "data/test_data/genbank/templates/*.gb")
    subjects = load_genbank_glob(template_glob, force_unique_ids=True)

    head_seq = str(subjects[0][:1000].seq)
    queries = [SeqRecord(Seq(head_seq))]
    force_unique_record_ids(make_linear(queries))

    results = BioBlast(queries, queries).blastn()
    assert not results
def test_benchmark_blast(benchmark, here, paths, query):
    """Benchmark ``Design._blast`` for one design file.

    ``_blast`` is called once directly before being handed to the
    ``benchmark`` fixture.
    """
    primers = make_linear(load_fasta_glob(paths["primers"]))
    templates = load_genbank_glob(paths["templates"])
    design_path = join(here, "data/test_data/genbank/designs", query)
    queries = make_circular(load_genbank_glob(design_path))

    design = Design()
    design.add_materials(primers=primers, templates=templates,
                         queries=queries)
    design._blast()
    benchmark(design._blast)
def make_blast():
    """Return a ``BioBlast`` of the test primers against one design plasmid.

    The primers are loaded from FASTA and made linear; the design record is
    used exactly as loaded from GenBank.
    """
    primers_path = join(here, "data/test_data/primers/primers.fasta")
    subjects = make_linear(
        load_fasta_glob(primers_path, force_unique_ids=True))

    design_path = join(
        here, "data/test_data/genbank/designs/pmodkan-ho-pact1-z4-er-vpr.gb")
    queries = load_genbank_glob(design_path, force_unique_ids=True)
    return BioBlast(subjects, queries)
def test_example1():
    """Minimal end-to-end pyblast example: blast one query against one subject."""
    from pyblast import BioBlast
    from pyblast.utils import make_linear
    from Bio.SeqRecord import SeqRecord
    from Bio.Seq import Seq
    import json

    query_seq = "ACGTGATTCGTCGTGTAGTTGAGTGTTACGTTGCATGTCGTACGTGTGTAGTGTCGTGTAGTGCTGATGCTACGTGATCG"
    subject_seq = "ACGTGATTCGTCGTGTAGTTGAGTGTTACGTTGCATGTCGTTACGTGATCG"
    queries = [SeqRecord(Seq(query_seq))]
    subjects = [SeqRecord(Seq(subject_seq))]

    # pyblast needs a 'topology' annotation on each SeqRecord; `make_linear`
    # and `make_circular` set it.
    subjects = make_linear(subjects)
    queries = make_linear(queries)

    results = BioBlast(subjects, queries).blastn()
    print(json.dumps(results, indent=2))
def test_reverse_alignment_simple2():
    """Factory-based variant of the simple reverse-complement alignment test."""
    record = rand_record(1000)
    rc_subject = record[10:-10].reverse_complement()
    subjects = make_linear([rc_subject])
    queries = make_linear([record])

    factory = BioBlastFactory()
    factory.add_records(subjects, "subjects")
    factory.add_records(queries, "queries")
    bioblast = factory("subjects", "queries")
    results = bioblast.blastn()

    for key, stored in bioblast.seq_db.records.items():
        print(key)
        print(stored)
    print(json.dumps(results, indent=2))

    top_hit = results[0]
    assert top_hit["subject"]["strand"] == -1
    assert top_hit["subject"]["start"] == 980
    assert top_hit["subject"]["end"] == 1
    assert top_hit["query"]["start"] == 11
    assert top_hit["query"]["end"] == 990
def test_circular_complete_subject(self):
    """Align a linear subject that spans the origin of a circular query.

    The subject is ``record[500:] + record[:400]``, so the query coordinates
    wrap around (start 501, end 400).
    """
    record = rand_record(1000)
    query_records = [record]
    subject_records = [record[500:] + record[:400]]
    query_records = make_circular(query_records)
    subject_records = make_linear(subject_records)

    results = BioBlast(subject_records, query_records).blastn()
    top_hit = results[0]
    print(top_hit)

    assert top_hit["subject"]["start"] == 1
    assert top_hit["subject"]["end"] == 900
    assert top_hit["query"]["start"] == 501
    assert top_hit["query"]["end"] == 400
def _get_results_func(n_jobs):
    """Run a ``Design`` over the test designs and return it with its results.

    :param n_jobs: when greater than 1 the design is run through
        ``_run_with_pool(n_jobs, 1)``; otherwise ``run()`` is called.
    :return: tuple of ``(design, design.results)``.
    """
    print("PROCESSING!")
    primers = make_linear(load_fasta_glob(paths["primers"]))
    templates = load_genbank_glob(paths["templates"])

    design_glob = join(here, "data/test_data/genbank/designs/*.gb")
    all_queries = make_circular(load_genbank_glob(design_glob))
    queries = all_queries[:LIM_NUM_DESIGNS]

    design = Design(span_cost=cached_span_cost)
    design.add_materials(primers=primers, templates=templates,
                         queries=queries)

    use_pool = n_jobs > 1
    if use_pool:
        design._run_with_pool(n_jobs, 1)
    else:
        design.run()
    return design, design.results
def test_circular_over_query(self):
    """Align a linear subject that straddles the origin of a circular query.

    The subject is the last 100 plus the first 100 positions of the record.
    The query slice rebuilt from the reported start and end must equal the
    subject sequence.
    """
    record = rand_record(1000)
    query_records = [record]
    subject_records = [record[-100:] + record[:100]]
    query_records = make_circular(query_records)
    subject_records = make_linear(subject_records)

    results = BioBlast(subject_records, query_records).blastn()
    top_hit = results[0]

    # Rebuild the wrapped query region from the reported 1-based start/end.
    wrapped = (record[top_hit["query"]["start"] - 1:] +
               record[:top_hit["query"]["end"]])
    result_seq = str(wrapped.seq)
    expected_seq = str(subject_records[0].seq)
    assert result_seq == expected_seq

    compare_result(results[0], 1000 - 100 + 1, 100, 1, 200)
def test_circular_complete_query_1(self):
    """Align a subject that covers the whole circular query from index 500.

    The subject is the query rotated to start at 0-based index 500, padded
    with ``ns(100)`` on both sides. pyblast results are 1-based, hence the
    expected query start of 501.
    """
    record = rand_record(1000)
    query_records = [record]
    rotated = ns(100) + record[500:] + record[:500] + ns(100)
    subject_records = [rotated]
    query_records = make_circular(query_records)
    subject_records = make_linear(subject_records)

    results = BioBlast(subject_records, query_records).blastn()
    top_hit = results[0]

    assert top_hit["query"]["start"] == 501
    assert top_hit["query"]["raw_end"] == 1500
    assert top_hit["subject"]["start"] == 101
    assert top_hit["subject"]["end"] == 1100
def test_interaction_network():
    """Self alignments must be absent when records are blasted against themselves.

    Four records that share slices are used as both subjects and queries.
    Results must be non-empty, and no result may pair a record with itself.
    """
    first = rand_record(500)
    second = rand_record(100) + first[:-100] + rand_record(1000)
    third = rand_record(200) + second[:700] + rand_record(500)
    fourth = third[-500:] + rand_record(500)
    records = [first, second, third, fourth]

    force_unique_record_ids(records)
    queries = make_linear(records)

    results = BioBlast(queries, queries).blastn()
    assert results

    for hit in results:
        query_key = hit["query"]["origin_key"]
        subject_key = hit["subject"]["origin_key"]
        print(query_key, subject_key)
        assert not query_key == subject_key
def test_circular_complete_query_4(self):
    """Align a subject that wraps the circular query with 10 extra bases per side.

    The subject starts 10 bases before index 500 of the query, runs once
    around it, and continues 10 bases past index 500. It is padded with
    ``ns(100)`` on both ends. pyblast results are 1-based.
    """
    record = rand_record(1000)
    query_records = [record]
    wrapped = ns(100) + record[-10 + 500:] + record[:500 + 10] + ns(100)
    subject_records = [wrapped]
    query_records = make_circular(query_records)
    subject_records = make_linear(subject_records)

    results = BioBlast(subject_records, query_records).blastn()
    top_hit = results[0]

    assert top_hit["query"]["start"] == 491
    assert top_hit["query"]["raw_end"] == 1510
    assert top_hit["subject"]["start"] == 101
    assert top_hit["subject"]["end"] == 1120
def test_circular_complete_query_parametrized_rc(self, extra_right,
                                                 extra_left):
    """Align a reverse-complemented subject that wraps the circular query.

    The subject wraps the query from ``500 - extra_left`` to
    ``500 + extra_right``, padded with ``ns(100)`` on both sides, and is then
    reverse complemented. Both the raw result coordinates and the spans from
    ``parse_result_to_span`` are checked.
    """
    record = rand_record(1000)
    query_records = [record]
    wrapped = (ns(100) + record[(500 - extra_left):] +
               record[:(500 + extra_right)] + ns(100))
    subject_records = [wrapped]
    subject_records = [subject_records[0].reverse_complement()]
    query_records = make_circular(query_records)
    subject_records = make_linear(subject_records)

    bioblast = BioBlast(subject_records, query_records)
    results = bioblast.blastn()
    top_hit = results[0]
    print(json.dumps(top_hit, indent=2))

    assert top_hit["query"]["start"] == 501 - extra_left
    assert top_hit["query"]["raw_end"] == 1500 + extra_right
    assert top_hit["subject"]["start"] == 1100 + extra_right + extra_left
    assert top_hit["subject"]["end"] == 101

    # Convert both sides of the hit to 0-based spans.
    query_span = bioblast.parse_result_to_span(top_hit["query"],
                                               output_index=0)
    subject_span = bioblast.parse_result_to_span(top_hit["subject"],
                                                 output_index=0)
    assert len(subject_span) == len(
        query_span) == 1000 + extra_right + extra_left
    assert query_span.a == 500 - extra_left
    assert query_span.b == 500 + extra_right
    assert subject_span.a == 100
    assert subject_span.b == 1100 + extra_right + extra_left