def test_highly_complex_design(span_cost, design_class):
    """Run the design pipeline on a goal that contains a repeated element.

    The circular goal is an insert flanked by ``backbone[1000:]`` and
    ``backbone[:1000]``; the insert contains the same ``random_record(30)``
    twice. Two overlapping backbone slices are supplied as linear templates.
    The test only prints its outcome; it makes no assertions.
    """
    vector = random_record(3000)
    repeated_unit = random_record(30)
    insert = (repeated_unit + random_record(200) + repeated_unit +
              random_record(1000))
    target = vector[1000:] + insert + vector[:1000]

    # The two templates overlap on vector[1900:2000].
    left_template = vector[:2000]
    overlap_template = vector[1900:2500]

    make_linear([left_template, overlap_template])
    make_circular([target])

    design = design_class(span_cost)
    design.n_jobs = 1
    design.add_materials(
        primers=[],
        templates=[left_template, overlap_template],
        queries=[target],
        fragments=[],
    )

    design.compile()
    design.optimize()

    # Only the second element of the to_df() output is shown.
    print(design.to_df()[1])

    first_result = list(design.results.values())[0]
    print(first_result.assemblies)
    print(first_result.assemblies[0]._nodes)
    print(first_result)

    print(design.out())
def test_library(span_cost):
    """Check that a three-goal library design reports three clusters.

    ``goal2`` shares ``goal1[2000:3000]`` with ``goal1`` and ``goal3`` is
    ``goal2`` plus an extra ``random_record(100)``. For every result, the
    first assembly's ``notes`` column must mention ``'n_clusters': 3``.
    """
    plasmid_a = random_record(4000)
    plasmid_b = (random_record(2000) + plasmid_a[2000:3000] +
                 random_record(2000))
    plasmid_c = plasmid_b + random_record(100)
    make_circular_and_id([plasmid_a, plasmid_b, plasmid_c])

    # Template slices are taken after the goals were made circular.
    a_head = plasmid_a[:2000]
    a_tail = plasmid_a[3000:4000]
    b_tail = plasmid_b[-2000:]
    b_head = plasmid_b[:2000]
    a_head_long = plasmid_a[:2100]

    make_linear([a_head, a_tail, b_tail, b_head, a_head_long])

    design = LibraryDesign(span_cost)
    design.n_jobs = 1
    design.add_materials(
        primers=[],
        templates=[a_head, a_tail, b_tail, b_head, a_head_long],
        queries=[plasmid_a, plasmid_b, plasmid_c],
        fragments=[],
    )

    design.compile()
    optimized = design.optimize()

    for outcome in optimized.values():
        print(outcome.assemblies[0].compute_cost())
        frame = outcome.assemblies[0].to_df()
        print(frame)
        note_text = str(list(frame["notes"]))
        assert "'n_clusters': 3" in note_text
Ejemplo n.º 3
0
def to_record(seq, linear=True):
    """Wrap ``seq`` in a ``SeqRecord`` and mark its topology.

    :param seq: value passed to ``Seq`` to build the record
    :param linear: if truthy the record is passed to ``make_linear``,
        otherwise to ``make_circular``
    :return: the new record
    """
    wrapped = SeqRecord(Seq(seq))
    mark_topology = make_linear if linear else make_circular
    mark_topology([wrapped])
    return wrapped
Ejemplo n.º 4
0
def test_simple_alignment():
    """Align a record against a copy of itself trimmed by 10 on each side.

    Exactly one result is expected; its coordinates are checked with
    ``compare_result`` (presumably query start/end followed by subject
    start/end, 1-based -- confirm against the helper).
    """
    template = rand_record(1000)
    queries, subjects = [template[:]], [template[10:-10]]
    queries, subjects = make_linear(queries), make_linear(subjects)

    blaster = BioBlast(subjects, queries)
    hits = blaster.blastn()
    assert len(hits) == 1

    size = len(template)
    compare_result(hits[0], 11, size - 10, 1, size - 10 - 10)
Ejemplo n.º 5
0
def test_align_Ns():
    """Blast a record against a subject that pads it with 500 ``N`` per side.

    The results are only printed; nothing is asserted.
    """
    template = rand_record(1000)
    n_padding = SeqRecord(Seq("N" * 500))
    queries, subjects = [template[:]], [n_padding + template + n_padding]
    queries, subjects = make_linear(queries), make_linear(subjects)

    blaster = BioBlast(subjects, queries)
    hits = blaster.blastn()
    print(hits)
Ejemplo n.º 6
0
def test_partial_alignment(left_spacer, ij):
    """Align a slice of the query that is prefixed by a spacer in the subject.

    :param left_spacer: size handed to ``ns`` to build the subject's prefix
    :param ij: indexable pair; ``ij[0]:ij[1]`` is the slice of the record
        placed in the subject
    """
    lo, hi = ij[0], ij[1]
    template = rand_record(1000)
    queries = [template[:]]
    subjects = [ns(left_spacer) + template[lo:hi]]

    queries, subjects = make_linear(queries), make_linear(subjects)

    blaster = BioBlast(subjects, queries)
    hits = blaster.blastn()
    assert len(hits) == 1

    # Expected values are offset by one relative to the Python slice bounds.
    compare_result(
        hits[0],
        lo + 1,
        hi,
        1 + left_spacer,
        hi - lo + left_spacer,
    )
Ejemplo n.º 7
0
def iter_fake_random_record(n_seqs: int, size_int: Tuple[int, int],
                            cyclic: bool) -> List[SeqRecord]:
    """Lazily yield ``n_seqs`` randomly generated, annotated records.

    NOTE(review): this is a generator function, so callers receive an
    iterator rather than the ``List[SeqRecord]`` the annotation advertises.

    :param n_seqs: number of records to yield
    :param size_int: bounds unpacked into ``random.randint`` to pick the
        length of each record
    :param cyclic: if truthy each record is passed to ``make_circular``,
        otherwise to ``make_linear``
    :return: yields one record at a time
    """
    mark_topology = make_circular if cyclic else make_linear
    for _ in range(n_seqs):
        record_length = random.randint(*size_int)
        record = biopython.random_record(
            record_length,
            name="<random record {}".format(uuid4()),
            auto_annotate=True,
        )
        # Keep id and name identical.
        record.id = record.name
        biopython.randomly_annotate(record, (100, 1000))
        mark_topology([record])
        yield record
Ejemplo n.º 8
0
def test_partial_alignment_reverse_complement(left_spacer, ij):
    """Align a reverse-complemented slice that is prefixed by a spacer.

    :param left_spacer: size handed to ``ns`` to build the subject's prefix
    :param ij: indexable pair; ``ij[0]:ij[1]`` is the slice of the record
        that is reverse-complemented into the subject
    """
    lo, hi = ij[0], ij[1]
    template = rand_record(1000)
    queries = [template[:]]
    fragment = template[lo:hi]
    subjects = [ns(left_spacer) + fragment.reverse_complement()]

    queries, subjects = make_linear(queries), make_linear(subjects)

    blaster = BioBlast(subjects, queries)
    hits = blaster.blastn()
    assert len(hits) == 1

    # The expected subject coordinates run from the subject's last position
    # down to the first position after the spacer.
    compare_result(
        hits[0],
        lo + 1,
        hi,
        len(subjects[0].seq),
        left_spacer + 1,
    )
Ejemplo n.º 9
0
def test_num_groups_vs_endpoints(here, paths, query, span_cost):
    """Print the squared group count next to the product of unique endpoints.

    After blasting, the design must hold exactly one container. The test
    expands it and prints ``len(groups) ** 2`` followed by the number of
    distinct ``query_region.a`` values times the number of distinct
    ``query_region.b`` values.
    """
    primers = make_linear(load_fasta_glob(paths["primers"]))
    templates = load_genbank_glob(paths["templates"])

    design_file = join(here, "data/test_data/genbank/designs", query)
    queries = make_circular(load_genbank_glob(design_file))

    design = Design(span_cost)
    design.add_materials(primers=primers, templates=templates, queries=queries)
    design._blast()

    all_containers = design.container_list
    assert len(all_containers) == 1
    only_container = all_containers[0]
    only_container.expand()
    groups = only_container.groups()
    print(len(groups)**2)

    starts = {group.query_region.a for group in groups}
    ends = {group.query_region.b for group in groups}

    print(len(starts) * len(ends))
Ejemplo n.º 10
0
def test_unnamed_queries_raises_duplicate_error(here):
    """Expect ``PyBlastException`` when queries are built without names.

    The two queries are bare ``SeqRecord`` objects made from the first
    1000 characters of the first two template sequences.
    """
    template_glob = join(here, "data/test_data/genbank/templates/*.gb")
    subjects = load_genbank_glob(template_glob, force_unique_ids=True)

    queries = [
        SeqRecord(Seq(str(subjects[index].seq)[:1000])) for index in (0, 1)
    ]
    make_linear(queries)

    with pytest.raises(PyBlastException):
        BioBlast(subjects, queries)
Ejemplo n.º 11
0
    def run(self, n_jobs: int = 10):
        """Run a design job.

        Loads primers, templates, fragments and goals, compiles and
        optimizes a ``Design``, then writes ``summary.csv`` and
        ``sequence_design.csv`` plus a ``sequences.gb`` file inside
        ``self._directory``.

        NOTE(review): the two CSV files use bare relative paths while the
        GenBank file is written under ``self._directory`` -- confirm that
        this difference is intended.

        :param n_jobs: number of parallel jobs to run. (default: 10)
        :return: None
        """
        import warnings

        # Silences these categories for the whole process, not just this call.
        for ignored_category in (RuntimeWarning, BiopythonParserWarning):
            warnings.simplefilter(action="ignore", category=ignored_category)

        log = self._logger

        log.info("Loading sequence files")
        materials = {
            "primers": make_linear(load_fasta_glob(self._primers)),
            "templates": make_circular(load_genbank_glob(self._templates)),
            "fragments": make_linear(load_genbank_glob(self._fragments)),
            "queries": make_circular(load_genbank_glob(self._goals)),
        }
        design = Design()
        design.n_jobs = n_jobs
        design.add_materials(**materials)

        log.info("Getting span cost model")
        design.span_cost = self._get_span_cost()

        log.info("Compiling possible molecular assemblies")
        design.compile()

        log.info("Optimizing molecular assemblies")
        design.optimize()

        log.info("Designing assembly primers and fragments")
        sequence_df, summary_df, _ = design.to_df()
        summary_df.to_csv("summary.csv")
        sequence_df.to_csv("sequence_design.csv")

        # Collect the molecule sequences of the first assembly of each
        # result that has at least one assembly.
        records = [
            molecule.sequence
            for result in design.results.values()
            if result.assemblies
            for _, _, molecule in result.assemblies[0].molecules
        ]

        genbank_path = os.path.join(self._directory, "sequences.gb")
        SeqIO.write(records, genbank_path, "genbank")
Ejemplo n.º 12
0
def test_reverse_alignment_simple():
    """Align a record against the reverse complement of ``record[10:990]``.

    The first result must report the query from 11 to 990 and the subject
    from 980 down to 1.
    """
    template = rand_record(1000)
    flipped = template[10:990].reverse_complement()

    subjects = make_linear([flipped])
    queries = make_linear([template])

    blaster = BioBlast(subjects, queries)
    hits = blaster.blastn()
    for key, stored in blaster.seq_db.records.items():
        print(key, stored, sep="\n")
    print(json.dumps(hits, indent=2))

    top = hits[0]
    assert top["query"]["start"] == 10 + 1
    assert top["query"]["end"] == 990
    assert top["subject"]["start"] == 980
    assert top["subject"]["end"] == 1
def test(paths):
    """Blast the first template against its own reverse complement.

    The first result must be on the subject's ``-1`` strand and start at
    the template's length.
    """
    factory = BioBlastFactory()

    loaded = load_genbank_glob(paths[TEMPLATES])
    forward = loaded[0]
    flipped = forward.reverse_complement()

    templates = make_linear([forward])
    queries = make_linear([flipped])

    factory.add_records(templates, TEMPLATES)
    factory.add_records(queries, QUERIES)

    blaster = factory(TEMPLATES, QUERIES)
    hits = blaster.blastn()

    subject_hit = hits[0]["subject"]
    assert subject_hit["strand"] == -1
    assert hits[0]["subject"]["start"] == len(templates[0])
Ejemplo n.º 14
0
def test_validate_rc(here):
    """Blast the first design against its own reverse complement.

    The first result must be on the subject's ``-1`` strand and the
    query's ``1`` strand, with the subject starting at the query length.
    """
    design_glob = join(here, "data/test_data/genbank/designs/*.gb")
    loaded = load_genbank_glob(design_glob, force_unique_ids=True)

    templates = make_linear(loaded[:1])
    queries = make_linear([templates[0].reverse_complement()])

    factory = BioBlastFactory()
    # Queries are registered before templates, as in the original test.
    factory.add_records(queries, "queries")
    factory.add_records(templates, "templates")

    blaster = factory("templates", "queries")
    hits = blaster.blastn()

    top = hits[0]
    assert top["subject"]["strand"] == -1
    assert top["query"]["strand"] == 1
    assert top["subject"]["start"] == len(queries[0])
Ejemplo n.º 15
0
def test_basic_run_reverse_complement():
    """Blast a reverse-complemented fragment against a subject containing it.

    The subject is ``junk1 + frag + junk2`` and the query is the reverse
    complement of ``frag``. At least one alignment must be reported, and
    every alignment must lie on the subject's ``-1`` strand.
    """
    junk1 = "atgctatgctgatgctgctgtgctgatgctgatgtgtattgctgtatcgcgcgagttagc"
    junk2 = "g" * 30
    frag = "aaacttcccaccccataccctattaccactgccaattacctagtggtttcatttactctaaacctgtgattcctctgaattattttcatttta"

    query = SeqRecord(seq=Seq(frag), annotations={
        "circular": False
    }).reverse_complement()
    subject = SeqRecord(seq=Seq(junk1 + frag + junk2),
                        annotations={"circular": False})

    # NOTE(review): reverse_complement() returns a new record; presumably its
    # annotations are not carried over, hence the explicit make_linear --
    # confirm against the Biopython version in use.
    make_linear([query])
    blaster = BioBlast([subject], [query])
    blaster.blastn()
    alignments = blaster.results

    # Without this guard the loop below passes vacuously when BLAST returns
    # no alignments at all, hiding a complete failure to find the fragment.
    assert alignments, "expected at least one alignment for the fragment"
    for a in alignments:
        print(json.dumps(a, indent=2))
        assert a["subject"]["strand"] == -1
Ejemplo n.º 16
0
    def test_circular_over_subject(self):
        """Align a linear query to a circular subject across its origin.

        The subject is ``record[200:300] + ns(500) + record[100:200]``, so
        the matching region only becomes contiguous when the subject is
        treated as circular.
        """
        template = rand_record(1000)
        queries = [template]
        subjects = [template[200:300] + ns(500) + template[100:200]]

        queries, subjects = make_linear(queries), make_circular(subjects)

        blaster = BioBlast(subjects, queries)
        hits = blaster.blastn()

        compare_result(hits[0], 101, 300, 601, 100)
Ejemplo n.º 17
0
def blast_factory(paths) -> BioBlastFactory:
    """Build a ``BioBlastFactory`` preloaded with primers, templates, queries.

    Templates are read from ``paths[REGISTRY]`` but registered under the
    ``TEMPLATES`` key; they are added exactly as loaded, with no
    ``make_linear`` / ``make_circular`` call.

    :param paths: mapping indexed with ``PRIMERS``, ``REGISTRY`` and
        ``QUERIES``
    :return: the populated factory
    """
    factory = BioBlastFactory()

    primer_records = make_linear(load_fasta_glob(paths[PRIMERS]))
    template_records = load_genbank_glob(paths[REGISTRY])
    query_records = make_circular(load_genbank_glob(paths[QUERIES]))

    registrations = (
        (primer_records, PRIMERS),
        (template_records, TEMPLATES),
        (query_records, QUERIES),
    )
    for record_group, group_key in registrations:
        factory.add_records(record_group, group_key)

    return factory
Ejemplo n.º 18
0
def test_library_design_to_df_2(paths, here, span_cost):
    """Run a library design on the ``test_data_sd2`` set and dump its output.

    Writes ``library_design.csv``, ``library_summary.csv`` and
    ``designs.json`` using relative paths and prints the same data. No
    assertions are made.
    """
    primers_file = join(here, "data/test_data_sd2", "primers.fasta")
    fragments_glob = join(here, "data/test_data_sd2", "fragments", "*.gb")
    plasmids_glob = join(here, "data/test_data_sd2", "plasmids", "*.gb")
    designs_glob = join(here, "data/test_data_sd2", "designs", "*.gb")

    primers = make_linear(load_fasta_glob(primers_file))
    templates = load_genbank_glob(plasmids_glob)
    fragments = load_genbank_glob(fragments_glob)
    print(fragments_glob)
    queries = make_circular(load_genbank_glob(designs_glob))

    design = LibraryDesign(span_cost=span_cost)
    design.n_jobs = 1
    circular_templates = make_circular(templates)
    linear_fragments = make_linear(fragments)
    design.add_materials(
        primers=primers,
        templates=circular_templates,
        queries=queries,
        fragments=linear_fragments,
    )

    design.logger.set_level("DEBUG")
    design.compile()

    optimized = design.optimize()

    for outcome in optimized.values():
        print(outcome.assemblies[0].to_df())

    design_df, summary_df, design_json = design.to_df()
    design_df.to_csv("library_design.csv")
    summary_df.to_csv("library_summary.csv")
    with open("designs.json", "w") as handle:
        json.dump(design_json, handle)
    for item in (design_df, summary_df, design_json):
        print(item)
Ejemplo n.º 19
0
def test_self_blast(here):
    """Blasting a single record against itself must yield no results."""
    template_glob = join(here, "data/test_data/genbank/templates/*.gb")
    subjects = load_genbank_glob(template_glob, force_unique_ids=True)

    prefix = subjects[0][:1000]
    queries = [SeqRecord(Seq(str(prefix.seq)))]
    linear_queries = make_linear(queries)
    force_unique_record_ids(linear_queries)

    # The same list serves as both subjects and queries.
    blaster = BioBlast(queries, queries)
    hits = blaster.blastn()
    assert not hits
Ejemplo n.º 20
0
def test_benchmark_blast(benchmark, here, paths, query):
    """Benchmark ``Design._blast`` for one design file.

    ``_blast`` is called once directly before it is handed to the
    ``benchmark`` callable.
    """
    primers = make_linear(load_fasta_glob(paths["primers"]))
    templates = load_genbank_glob(paths["templates"])

    design_file = join(here, "data/test_data/genbank/designs", query)
    queries = make_circular(load_genbank_glob(design_file))

    design = Design()
    design.add_materials(primers=primers, templates=templates, queries=queries)

    # One untimed call first, then the benchmarked calls.
    design._blast()
    benchmark(design._blast)
Ejemplo n.º 21
0
    def make_blast():
        """Build a ``BioBlast`` with the primers as subjects and one design
        file as the query.

        The primers are made linear; the query records are passed on exactly
        as loaded.
        """
        primer_file = join(here, "data/test_data/primers/primers.fasta")
        subjects = make_linear(
            load_fasta_glob(primer_file, force_unique_ids=True))

        design_file = join(
            here,
            "data/test_data/genbank/designs/pmodkan-ho-pact1-z4-er-vpr.gb")
        queries = load_genbank_glob(design_file, force_unique_ids=True)

        return BioBlast(subjects, queries)
Ejemplo n.º 22
0
def test_example1():
    """Minimal usage example: blast one short query against one subject.

    The results are printed as JSON; nothing is asserted.
    """
    from pyblast import BioBlast
    from pyblast.utils import make_linear
    from Bio.SeqRecord import SeqRecord
    from Bio.Seq import Seq
    import json

    query_sequence = "ACGTGATTCGTCGTGTAGTTGAGTGTTACGTTGCATGTCGTACGTGTGTAGTGTCGTGTAGTGCTGATGCTACGTGATCG"
    subject_sequence = "ACGTGATTCGTCGTGTAGTTGAGTGTTACGTTGCATGTCGTTACGTGATCG"

    queries = [SeqRecord(Seq(query_sequence))]
    subjects = [SeqRecord(Seq(subject_sequence))]

    # pyblast requires a 'topology' annotation on the SeqRecords.
    # Records can be marked circular or linear with `make_circular` or
    # `make_linear`.
    subjects = make_linear(subjects)
    queries = make_linear(queries)

    blaster = BioBlast(subjects, queries)
    hits = blaster.blastn()
    print(json.dumps(hits, indent=2))
Ejemplo n.º 23
0
def test_reverse_alignment_simple2():
    """Factory-based variant of the simple reverse-complement alignment test.

    The subject is the reverse complement of ``record[10:-10]``. The first
    result must be on the subject's ``-1`` strand, running from 980 down
    to 1, while the query runs from 11 to 990.
    """
    template = rand_record(1000)
    flipped = template[10:-10].reverse_complement()

    subjects = make_linear([flipped])
    queries = make_linear([template])

    factory = BioBlastFactory()
    factory.add_records(subjects, "subjects")
    factory.add_records(queries, "queries")

    blaster = factory("subjects", "queries")
    hits = blaster.blastn()

    for key, stored in blaster.seq_db.records.items():
        print(key, stored, sep="\n")
    print(json.dumps(hits, indent=2))

    top = hits[0]
    assert top["subject"]["strand"] == -1
    assert top["subject"]["start"] == 980
    assert top["subject"]["end"] == 1
    assert top["query"]["start"] == 11
    assert top["query"]["end"] == 990
Ejemplo n.º 24
0
    def test_circular_complete_subject(self):
        """Align a linear subject that spans the origin of a circular query.

        The subject is ``record[500:] + record[:400]``; the first result
        must cover subject 1..900 and query 501..400.
        """
        template = rand_record(1000)
        queries = [template]
        subjects = [template[500:] + template[:400]]

        queries, subjects = make_circular(queries), make_linear(subjects)

        blaster = BioBlast(subjects, queries)
        hits = blaster.blastn()

        top = hits[0]
        print(top)
        subject_hit = top["subject"]
        assert subject_hit["start"] == 1
        assert subject_hit["end"] == 900
        query_hit = top["query"]
        assert query_hit["start"] == 501
        assert query_hit["end"] == 400
Ejemplo n.º 25
0
    def _get_results_func(n_jobs):
        """Build and run a ``Design`` over the first ``LIM_NUM_DESIGNS`` queries.

        :param n_jobs: when greater than 1 the design runs through
            ``_run_with_pool(n_jobs, 1)``; otherwise through ``run()``
        :return: tuple of the design and its ``results``
        """
        print("PROCESSING!")
        primers = make_linear(load_fasta_glob(paths["primers"]))
        templates = load_genbank_glob(paths["templates"])

        design_glob = join(here, "data/test_data/genbank/designs/*.gb")
        all_queries = make_circular(load_genbank_glob(design_glob))
        queries = all_queries[:LIM_NUM_DESIGNS]

        design = Design(span_cost=cached_span_cost)
        design.add_materials(primers=primers,
                             templates=templates,
                             queries=queries)
        if n_jobs > 1:
            design._run_with_pool(n_jobs, 1)
        else:
            design.run()
        return design, design.results
Ejemplo n.º 26
0
    def test_circular_over_query(self):
        """Align a subject that straddles the origin of a circular query.

        The subject is the last 100 plus the first 100 positions of the
        record. The sequence rebuilt from the reported query start/end must
        equal the subject sequence.
        """
        template = rand_record(1000)
        queries = [template]
        subjects = [template[-100:] + template[:100]]

        queries, subjects = make_circular(queries), make_linear(subjects)

        blaster = BioBlast(subjects, queries)
        hits = blaster.blastn()

        query_hit = hits[0]["query"]

        # Reported positions are 1-based, hence the -1 on the start.
        wrapped = (template[query_hit["start"] - 1:] +
                   template[:query_hit["end"]])
        assert str(wrapped.seq) == str(subjects[0].seq)

        compare_result(hits[0], 1000 - 100 + 1, 100, 1, 200)
Ejemplo n.º 27
0
    def test_circular_complete_query_1(self):
        """The subject aligns completely with a circular query, starting at
        index 500 of the query (0-based).

        Note that pyblast results start at index 1.
        """
        template = rand_record(1000)
        queries = [template]
        subjects = [ns(100) + template[500:] + template[:500] + ns(100)]

        queries, subjects = make_circular(queries), make_linear(subjects)

        blaster = BioBlast(subjects, queries)
        hits = blaster.blastn()
        top = hits[0]

        query_hit = top["query"]
        assert query_hit["start"] == 501
        assert query_hit["raw_end"] == 1500
        subject_hit = top["subject"]
        assert subject_hit["start"] == 101
        assert subject_hit["end"] == 1100
Ejemplo n.º 28
0
def test_interaction_network():
    """We expect self alignments to be removed from the results.

    Four records that share pieces with one another are blasted against
    themselves; no result may pair a record with itself.
    """
    first = rand_record(500)
    second = rand_record(100) + first[:-100] + rand_record(1000)
    third = rand_record(200) + second[:700] + rand_record(500)
    fourth = third[-500:] + rand_record(500)
    records = [first, second, third, fourth]

    force_unique_record_ids(records)

    queries = make_linear(records)

    blaster = BioBlast(queries, queries)
    hits = blaster.blastn()
    assert hits
    for hit in hits:
        query_key = hit["query"]["origin_key"]
        subject_key = hit["subject"]["origin_key"]
        print(query_key, subject_key)
        assert query_key != subject_key
Ejemplo n.º 29
0
    def test_circular_complete_query_4(self):
        """The subject wraps around the circular query with 10 extra bases
        on both the left and the right side.

        Note that pyblast results start at index 1.
        """
        template = rand_record(1000)
        queries = [template]
        subjects = [
            ns(100) + template[-10 + 500:] + template[:500 + 10] + ns(100)
        ]

        queries, subjects = make_circular(queries), make_linear(subjects)

        blaster = BioBlast(subjects, queries)
        hits = blaster.blastn()
        top = hits[0]

        query_hit = top["query"]
        assert query_hit["start"] == 491
        assert query_hit["raw_end"] == 1510
        subject_hit = top["subject"]
        assert subject_hit["start"] == 101
        assert subject_hit["end"] == 1120
Ejemplo n.º 30
0
    def test_circular_complete_query_parametrized_rc(self, extra_right,
                                                     extra_left):
        """Reverse-complement variant of the wrap-around alignment test.

        The subject wraps around the circular query with ``extra_left`` and
        ``extra_right`` additional bases and is then reverse-complemented.
        Both the raw result positions and the spans produced by
        ``parse_result_to_span`` are checked.
        """
        template = rand_record(1000)
        queries = [template]
        forward_subject = (ns(100) + template[(500 - extra_left):] +
                           template[:(500 + extra_right)] + ns(100))
        subjects = [forward_subject.reverse_complement()]

        queries, subjects = make_circular(queries), make_linear(subjects)

        blaster = BioBlast(subjects, queries)
        hits = blaster.blastn()

        top = hits[0]
        print(json.dumps(top, indent=2))

        query_hit = top["query"]
        assert query_hit["start"] == 501 - extra_left
        assert query_hit["raw_end"] == 1500 + extra_right
        subject_hit = top["subject"]
        assert subject_hit["start"] == 1100 + extra_right + extra_left
        assert subject_hit["end"] == 101

        # Convert both sides to spans, requesting output_index=0.
        query_span = blaster.parse_result_to_span(top["query"],
                                                  output_index=0)
        subject_span = blaster.parse_result_to_span(top["subject"],
                                                    output_index=0)

        assert len(subject_span) == len(query_span)
        assert len(query_span) == 1000 + extra_right + extra_left
        assert query_span.a == 500 - extra_left
        assert query_span.b == 500 + extra_right

        assert subject_span.a == 100
        assert subject_span.b == 1100 + extra_right + extra_left