Ejemplo n.º 1
0
def test_perfect_match():
    """Blast a sequence against an identical copy of itself.

    The first result must cover the entire sequence on both the query and
    the subject side, i.e. start at 1 and end at the sequence length.
    """
    sequence = "aaacttcccaccccataccctattaccactgccaattacctagtggtttcatttactctaaacctgtgattcctctgaattattttcatttta"
    # Build two independent but identical linear records (subject, then query).
    subject, query = (
        SeqRecord(seq=Seq(sequence), annotations={"topology": "linear"})
        for _ in range(2)
    )

    blaster = BioBlast([subject], [query])
    blaster.blastn()
    first_hit = blaster.results[0]

    for side in ("query", "subject"):
        assert first_hit[side]["start"] == 1
        assert first_hit[side]["end"] == len(sequence)
Ejemplo n.º 2
0
def test_simple_alignment():
    """Align a random record with a copy trimmed by 10 bases at each end.

    Exactly one result is expected; its coordinates are checked through
    ``compare_result`` (presumably query start/end followed by subject
    start/end -- confirm against the helper's definition).
    """
    template = rand_record(1000)
    query_records = [template[:]]
    subject_records = [template[10:-10]]

    query_records = make_linear(query_records)
    subject_records = make_linear(subject_records)

    blaster = BioBlast(subject_records, query_records)
    hits = blaster.blastn()
    assert len(hits) == 1

    full_length = len(template)
    compare_result(hits[0], 11, full_length - 10, 1, full_length - 10 - 10)
Ejemplo n.º 3
0
    def test_circular_over_subject(self):
        """Align a linear query to a circular subject whose match wraps around.

        The subject holds bases 200-300 of the record at its start and bases
        100-200 at its end, separated by a 500-long ``ns`` spacer, so the
        contiguous query region 100-300 only lines up across the subject's
        origin.
        """
        template = rand_record(1000)
        wrapped = template[200:300] + ns(500) + template[100:200]

        query_records = make_linear([template])
        subject_records = make_circular([wrapped])

        blaster = BioBlast(subject_records, query_records)
        hits = blaster.blastn()

        compare_result(hits[0], 101, 300, 601, 100)
Ejemplo n.º 4
0
def test_align_Ns():
    """Smoke test: blast a record against itself flanked by 500 N's per side.

    No assertions are made; the results are only printed.
    """
    template = rand_record(1000)
    n_flank = SeqRecord(Seq("N" * 500))

    query_records = [template[:]]
    subject_records = [n_flank + template + n_flank]

    query_records = make_linear(query_records)
    subject_records = make_linear(subject_records)

    blaster = BioBlast(subject_records, query_records)
    print(blaster.blastn())
Ejemplo n.º 5
0
def test_self_blast(here):
    """Blast a single record against itself and expect no results.

    The query is the first 1000 bases of the first template GenBank record,
    wrapped in a fresh, unnamed ``SeqRecord``.
    """
    template_glob = join(here, "data/test_data/genbank/templates/*.gb")
    templates = load_genbank_glob(template_glob, force_unique_ids=True)

    head_sequence = str(templates[0][:1000].seq)
    queries = [SeqRecord(Seq(head_sequence))]
    force_unique_record_ids(make_linear(queries))

    # The same list serves as both subjects and queries.
    blaster = BioBlast(queries, queries)
    hits = blaster.blastn()
    assert not hits
Ejemplo n.º 6
0
def test_run_bioblast_twice():
    """Smoke test: call ``blastn`` twice on the same ``BioBlast`` instance.

    The subject embeds the query fragment between two junk stretches. No
    assertions are made; the final results are only printed.
    """
    leading_junk = "atgctatgctgatgctgctgtgctgatgctgatgtgtattgctgtatcgcgcgagttagc"
    trailing_junk = "g" * 30
    fragment = "aaacttcccaccccataccctattaccactgccaattacctagtggtttcatttactctaaacctgtgattcctctgaattattttcatttta"

    query = SeqRecord(seq=Seq(fragment), annotations={"circular": False})
    subject = SeqRecord(
        seq=Seq(leading_junk + fragment + trailing_junk),
        annotations={"circular": False},
    )

    blaster = BioBlast([subject], [query])
    for _ in range(2):
        blaster.blastn()
    print(blaster.results)
Ejemplo n.º 7
0
def test_partial_alignment(left_spacer, ij):
    """Align a full record with a slice of itself placed after a spacer.

    ``ij`` supplies the slice bounds (``ij[0]``, ``ij[1]``) and
    ``left_spacer`` the length of the ``ns`` filler prepended to the subject.
    Exactly one result is expected, with 1-based coordinates shifted by the
    spacer on the subject side.
    """
    lower, upper = ij[0], ij[1]
    template = rand_record(1000)

    query_records = [template[:]]
    subject_records = [ns(left_spacer) + template[lower:upper]]

    query_records = make_linear(query_records)
    subject_records = make_linear(subject_records)

    blaster = BioBlast(subject_records, query_records)
    hits = blaster.blastn()
    assert len(hits) == 1

    compare_result(
        hits[0],
        lower + 1,
        upper,
        1 + left_spacer,
        upper - lower + left_spacer,
    )
Ejemplo n.º 8
0
def test_partial_alignment_reverse_complement(left_spacer, ij):
    """Align a record with the reverse complement of one of its slices.

    The subject is an ``ns`` spacer of length ``left_spacer`` followed by the
    reverse complement of ``record[ij[0]:ij[1]]``. The expected subject
    coordinates run from the subject's last base back to the first base
    after the spacer.
    """
    lower, upper = ij[0], ij[1]
    template = rand_record(1000)

    query_records = [template[:]]
    fragment = template[lower:upper]
    subject_records = [ns(left_spacer) + fragment.reverse_complement()]

    query_records = make_linear(query_records)
    subject_records = make_linear(subject_records)

    blaster = BioBlast(subject_records, query_records)
    hits = blaster.blastn()
    assert len(hits) == 1

    subject_length = len(subject_records[0].seq)
    compare_result(hits[0], lower + 1, upper, subject_length, left_spacer + 1)
Ejemplo n.º 9
0
    def test_circular_complete_subject(self):
        """Align a linear subject fully onto a circular query across its origin.

        The subject is bases 500-1000 followed by bases 0-400 of the query,
        so the whole 900-base subject should match, with the query hit
        starting at 501 and wrapping around to end at 400.
        """
        template = rand_record(1000)
        rotated = template[500:] + template[:400]

        query_records = make_circular([template])
        subject_records = make_linear([rotated])

        blaster = BioBlast(subject_records, query_records)
        hits = blaster.blastn()

        hit = hits[0]
        print(hit)

        expectations = (
            ("subject", "start", 1),
            ("subject", "end", 900),
            ("query", "start", 501),
            ("query", "end", 400),
        )
        for side, field, expected in expectations:
            assert hit[side][field] == expected
Ejemplo n.º 10
0
def test_raises_pyblast_when_not_unique(here):
    """Expect ``PyBlastException`` when records are loaded without unique ids.

    Templates and designs are loaded without ``force_unique_ids``;
    constructing ``BioBlast`` from them must raise.
    """
    templates_glob = join(here, "data/test_data/genbank/templates/*.gb")
    designs_glob = join(here, "data/test_data/genbank/designs/*.gb")

    subjects = load_genbank_glob(templates_glob)
    queries = load_genbank_glob(designs_glob)

    for label, records in (("n_queres", queries), ("n_subjects", subjects)):
        print("{}: {}".format(label, len(records)))

    with pytest.raises(PyBlastException):
        BioBlast(subjects, queries)
Ejemplo n.º 11
0
def test_reverse_alignment_simple():
    """Align a record with the reverse complement of its bases 10-990.

    The query hit is expected at 11..990 and the subject hit runs backwards
    from 980 to 1.
    """
    template = rand_record(1000)
    reversed_core = template[10:990].reverse_complement()

    subject_records = make_linear([reversed_core])
    query_records = make_linear([template])

    blaster = BioBlast(subject_records, query_records)
    hits = blaster.blastn()

    # Debug output: dump the records held by the blaster and the raw results.
    for key, stored in blaster.seq_db.records.items():
        print(key)
        print(stored)
    print(json.dumps(hits, indent=2))

    top_hit = hits[0]
    assert top_hit["query"]["start"] == 10 + 1
    assert top_hit["query"]["end"] == 990
    assert top_hit["subject"]["start"] == 980
    assert top_hit["subject"]["end"] == 1
Ejemplo n.º 12
0
def test_not_raise_pyblast_when_unique(here):
    """Constructing ``BioBlast`` must succeed once record ids are made unique.

    Templates and designs are loaded without ``force_unique_ids`` and then
    passed together through ``force_unique_record_ids``.
    """
    templates_glob = join(here, "data/test_data/genbank/templates/*.gb")
    designs_glob = join(here, "data/test_data/genbank/designs/*.gb")

    subjects = load_genbank_glob(templates_glob)
    queries = load_genbank_glob(designs_glob)

    all_records = subjects + queries
    force_unique_record_ids(all_records)

    print("n_queres: {}".format(len(queries)))
    BioBlast(subjects, queries)
Ejemplo n.º 13
0
def test_multiquery_blast(here):
    """Blast all design records against all templates in one run.

    Every query must appear in the results: the number of distinct
    ``origin_record_id`` values on the query side has to equal the number of
    queries.
    """
    templates_glob = join(here, "data/test_data/genbank/templates/*.gb")
    designs_glob = join(here, "data/test_data/genbank/designs/*.gb")

    subjects = load_genbank_glob(templates_glob, force_unique_ids=True)
    queries = load_genbank_glob(designs_glob, force_unique_ids=True)

    print("n_queres: {}".format(len(queries)))
    print("n_subjects: {}".format(len(subjects)))

    blaster = BioBlast(subjects, queries)
    hits = blaster.blastn()

    seen_query_ids = {hit["query"]["origin_record_id"] for hit in hits}
    print("n_records: {}".format(len(hits)))
    assert len(seen_query_ids) == len(queries)
Ejemplo n.º 14
0
    def test_circular_complete_query_1(self):
        """Align a rotated full copy of a circular query inside a linear subject.

        The subject contains the whole query, rotated to begin at 0-based
        index 500, between two 100-long ``ns`` flanks. Result positions are
        1-based, so the query hit starts at 501 and its ``raw_end`` runs past
        the query length to 1500.
        """
        template = rand_record(1000)
        rotated = ns(100) + template[500:] + template[:500] + ns(100)

        query_records = make_circular([template])
        subject_records = make_linear([rotated])

        blaster = BioBlast(subject_records, query_records)
        hit = blaster.blastn()[0]

        assert hit["query"]["start"] == 501
        assert hit["query"]["raw_end"] == 1500
        assert hit["subject"]["start"] == 101
        assert hit["subject"]["end"] == 1100
Ejemplo n.º 15
0
    def test_circular_over_query(self):
        """Align a subject that spans the origin of a circular query.

        The subject is the last 100 bases of the query followed by its first
        100 bases. The sequence rebuilt from the reported query start/end
        must equal the subject, and the coordinates are then checked with
        ``compare_result``.
        """
        template = rand_record(1000)
        junction = template[-100:] + template[:100]

        query_records = make_circular([template])
        subject_records = make_linear([junction])

        blaster = BioBlast(subject_records, query_records)
        hits = blaster.blastn()
        hit = hits[0]

        # Rebuild the wrapped query region: tail from start (1-based) + head.
        tail = template[hit["query"]["start"] - 1:]
        head = template[:hit["query"]["end"]]
        recovered = str((tail + head).seq)
        expected = str(subject_records[0].seq)
        assert recovered == expected

        compare_result(hits[0], 1000 - 100 + 1, 100, 1, 200)
Ejemplo n.º 16
0
def test_example2():
    """Run the documentation example: a linear subject against a circular query.

    The subject joins the last 20 and first 30 bases of the query sequence.
    The results are printed as indented JSON; nothing is asserted.
    """
    from pyblast import BioBlast
    from pyblast.utils import make_linear, make_circular
    from Bio.SeqRecord import SeqRecord
    from Bio.Seq import Seq
    import json

    sequence = "ACGTTGTAGTGTAGTTGATGATGATGTCTGTGTCGTGTGATGTGCTAGGGGTTGATGTGAGTAGTTAGTGGTAGTGTTTAGGGGCGGCGCGGAGTATGCTG"
    query_records = [SeqRecord(Seq(sequence))]
    junction = sequence[-20:] + sequence[:30]
    subject_records = [SeqRecord(Seq(junction))]

    # pyblast needs a 'topology' annotation on each SeqRecord; `make_linear`
    # and `make_circular` set the records up as linear or circular.
    subject_records = make_linear(subject_records)
    query_records = make_circular(query_records)

    blaster = BioBlast(subject_records, query_records)
    hits = blaster.blastn()
    print(json.dumps(hits, indent=2))
Ejemplo n.º 17
0
def test_basic_run_reverse_complement():
    """Blast a reverse-complemented fragment against a subject containing it.

    Every reported alignment must place the subject on strand -1.
    """
    leading_junk = "atgctatgctgatgctgctgtgctgatgctgatgtgtattgctgtatcgcgcgagttagc"
    trailing_junk = "g" * 30
    fragment = "aaacttcccaccccataccctattaccactgccaattacctagtggtttcatttactctaaacctgtgattcctctgaattattttcatttta"

    forward = SeqRecord(seq=Seq(fragment), annotations={"circular": False})
    query = forward.reverse_complement()
    subject = SeqRecord(
        seq=Seq(leading_junk + fragment + trailing_junk),
        annotations={"circular": False},
    )

    # The return value is discarded here, as in the original call.
    make_linear([query])

    blaster = BioBlast([subject], [query])
    blaster.blastn()
    for alignment in blaster.results:
        print(json.dumps(alignment, indent=2))
        assert alignment["subject"]["strand"] == -1
Ejemplo n.º 18
0
    def test_circular_complete_query_4(self):
        """Align a subject that wraps around a circular query with overlap.

        The subject covers the whole query, rotated to start near index 500,
        plus 10 extra bases on both the left and the right side, between two
        100-long ``ns`` flanks.

        Note that pyblast results start at index 1.
        """
        overhang = 10
        template = rand_record(1000)
        wrapped = (ns(100) + template[500 - overhang:] +
                   template[:500 + overhang] + ns(100))

        query_records = make_circular([template])
        subject_records = make_linear([wrapped])

        blaster = BioBlast(subject_records, query_records)
        hit = blaster.blastn()[0]

        assert hit["query"]["start"] == 491
        assert hit["query"]["raw_end"] == 1510
        assert hit["subject"]["start"] == 101
        assert hit["subject"]["end"] == 1120
Ejemplo n.º 19
0
def test_interaction_network():
    """Blast four overlapping records against each other.

    Self alignments are expected to be removed: results must be non-empty and
    no result may have the same ``origin_key`` on the query and subject side.
    """
    # Each record reuses part of the previous one so that they overlap.
    first = rand_record(500)
    second = rand_record(100) + first[:-100] + rand_record(1000)
    third = rand_record(200) + second[:700] + rand_record(500)
    fourth = third[-500:] + rand_record(500)
    records = [first, second, third, fourth]

    force_unique_record_ids(records)
    linear_records = make_linear(records)

    blaster = BioBlast(linear_records, linear_records)
    hits = blaster.blastn()
    assert hits

    for hit in hits:
        query_key = hit["query"]["origin_key"]
        subject_key = hit["subject"]["origin_key"]
        print(query_key, subject_key)
        assert query_key != subject_key
Ejemplo n.º 20
0
    def test_circular_complete_query_parametrized_rc(self, extra_right,
                                                     extra_left):
        """Align a reverse-complemented wrapping subject to a circular query.

        The subject is built like a rotated copy of the query (starting
        ``extra_left`` bases before index 500 and ending ``extra_right`` bases
        after it) between two 100-long ``ns`` flanks, and is then reverse
        complemented. Both the raw 1-based result coordinates and the
        0-based spans from ``parse_result_to_span`` are checked.
        """
        template = rand_record(1000)
        forward = (ns(100) + template[(500 - extra_left):] +
                   template[:(500 + extra_right)] + ns(100))
        reverse_subject = forward.reverse_complement()

        query_records = make_circular([template])
        subject_records = make_linear([reverse_subject])

        blaster = BioBlast(subject_records, query_records)
        hits = blaster.blastn()

        hit = hits[0]
        print(json.dumps(hit, indent=2))

        # Raw coordinates: the subject hit runs backwards (start > end).
        assert hit["query"]["start"] == 501 - extra_left
        assert hit["query"]["raw_end"] == 1500 + extra_right
        assert hit["subject"]["start"] == 1100 + extra_right + extra_left
        assert hit["subject"]["end"] == 101

        # Convert both sides of the result to spans.
        query_span, subject_span = (
            blaster.parse_result_to_span(hit[side], output_index=0)
            for side in ("query", "subject")
        )

        assert len(subject_span) == len(
            query_span) == 1000 + extra_right + extra_left
        assert query_span.a == 500 - extra_left
        assert query_span.b == 500 + extra_right

        assert subject_span.a == 100
        assert subject_span.b == 1100 + extra_right + extra_left
Ejemplo n.º 21
0
def test_unnamed_queries(here):
    """Blast two unnamed query records after giving them unique ids.

    The queries are the first 1000 bases of the first two templates, wrapped
    in fresh ``SeqRecord`` objects. Both must show up in the results as
    distinct ``origin_record_id`` values.
    """
    templates_glob = join(here, "data/test_data/genbank/templates/*.gb")
    subjects = load_genbank_glob(templates_glob, force_unique_ids=True)

    heads = [str(subjects[index].seq)[:1000] for index in (0, 1)]
    queries = [SeqRecord(Seq(head)) for head in heads]
    force_unique_record_ids(make_linear(queries))

    blaster = BioBlast(subjects, queries)
    hits = blaster.blastn()

    seen_query_ids = {hit["query"]["origin_record_id"] for hit in hits}
    print("n_records: {}".format(len(hits)))
    assert len(seen_query_ids) == len(queries)
Ejemplo n.º 22
0
    def make_blast():
        """Return a ``BioBlast`` of the primer FASTA records against one design.

        Subjects come from ``primers.fasta`` and are made linear; the single
        GenBank design file supplies the queries. ``here`` is taken from the
        enclosing scope.
        """
        primers_path = join(here, "data/test_data/primers/primers.fasta")
        design_path = join(
            here,
            "data/test_data/genbank/designs/pmodkan-ho-pact1-z4-er-vpr.gb",
        )

        primers = load_fasta_glob(primers_path, force_unique_ids=True)
        primers = make_linear(primers)
        designs = load_genbank_glob(design_path, force_unique_ids=True)
        return BioBlast(primers, designs)
Ejemplo n.º 23
0
    def make_blast():
        """Return a ``BioBlast`` of all templates against one circular design.

        Subjects are every template GenBank file; the single design file is
        loaded as the query and made circular, which is verified with
        ``is_circular`` before construction. ``here`` is taken from the
        enclosing scope.
        """
        templates_glob = join(here, "data/test_data/genbank/templates/*.gb")
        design_path = join(
            here,
            "data/test_data/genbank/designs/pmodkan-ho-pact1-z4-er-vpr.gb",
        )

        templates = load_genbank_glob(templates_glob, force_unique_ids=True)
        designs = load_genbank_glob(design_path, force_unique_ids=True)
        designs = make_circular(designs)
        assert is_circular(designs[0])
        return BioBlast(templates, designs)
Ejemplo n.º 24
0
def test_unnamed_queries_raises_duplicate_error(here):
    """Expect ``PyBlastException`` for unnamed queries without unique ids.

    Two fresh ``SeqRecord`` objects are made linear but are not passed
    through ``force_unique_record_ids``; constructing ``BioBlast`` must raise.
    """
    templates_glob = join(here, "data/test_data/genbank/templates/*.gb")
    subjects = load_genbank_glob(templates_glob, force_unique_ids=True)

    heads = [str(subjects[index].seq)[:1000] for index in (0, 1)]
    queries = [SeqRecord(Seq(head)) for head in heads]
    make_linear(queries)

    with pytest.raises(PyBlastException):
        BioBlast(subjects, queries)
Ejemplo n.º 25
0
def test_ungapped():
    """Smoke test: run blastn with the ``ungapped`` config option set.

    The subject is the query fragment with a 27-base insert placed at
    position 400. No assertions are made; the results are only printed.
    """
    fragment = "GtctaaaggtgaagaattattcactggtgttgtcccaattttggttgaattagatggtgatgttaatggtcacaaattttctgtctccggtgaaggtgaaggtgatgctacttacggtaaattgaccttaaaatttatttgtactactggtaaattgccagttccatggccaaccttagtcactactttcggttatggtgttcaatgttttgcgagatacccagatcatatgaaacaacatgactttttcaagtctgccatgccagaaggttatgttcaagaaagaactatttttttcaaagatgacggtaactacaagaccagagctgaagtcaagtttgaaggtgataccttagttaatagaatcgaattaaaaggtattgattttaaagaagatggtaacattttaggtcacaaattggaatacaactataactctcacaatgtttacatcatggctgacaaacaaaagaatggtatcaaagttaacttcaaaattagacacaacattgaagatggttctgttcaattagctgaccattatcaacaaaatactccaattggtgatggtccagtcttgttaccagacaaccattacttatccactcaatctgccttatccaaagatccaaacgaaaagagagaccacatggtcttgttagaatttgttactgctgctggtattacccatggtatggatgaattgtacaaaTAGTGATACCGTCGACCTCGAGTCAattagttatgtcacgcttacattcacgccctccccccacatccgctctaaccgaaaaggaaggagttagacaacctgaagtctaggtccctatttatttttttatagttatgttagtattaagaacgttatttatatttcaaatttttcttt"
    insert = "atgctatgctgatgctgctgtgctgat"

    query = SeqRecord(seq=Seq(fragment), annotations={"circular": False})
    interrupted = fragment[:400] + insert + fragment[400:]
    subject = SeqRecord(seq=Seq(interrupted), annotations={"circular": False})

    blaster = BioBlast([subject], [query])
    blaster.update_config({"ungapped": None})
    blaster.blastn()
    print(blaster.results)
Ejemplo n.º 26
0
            ))
]
# Two short subject records ("aa1", "aa2") whose sequences differ by a few
# bases near the end.
subjects = [
    SeqRecord(Seq("TCGTGTAGTTGAGTGTTACGTTGCATGTCGTTACGTGATCG"), id="aa1"),
    SeqRecord(Seq("TCGTGTAGTTGAGTGTTACGTTGCATGTCGGGGACGTGATCG"), id="aa2")
]

# pyblast requires a 'topology' annotation on the SeqRecords.
# we can make records circular or linear using `make_linear` or `make_circular` methods

# Start of the timed section: it covers the topology setup, BioBlast
# construction and the blastn run.
t0 = time.time()

subjects = make_linear(subjects)
# NOTE(review): `queries` is defined above the visible part of this script.
queries = make_linear(queries)

blast = BioBlast(subjects, queries)
results = blast.blastn()

# Elapsed wall-clock time in seconds for the section above.
t1 = time.time() - t0
#print(t1)
#print(results)

# Second stage: rebuild the query list from a FASTA file. The path is
# relative, so it depends on the current working directory.
fa_file = "../samples/P21333.fasta"
sequences = SeqIO.parse(fa_file, "fasta")
# Both lists are rebound to new empty lists; the earlier records are dropped.
queries = []
subjects = []
for record in sequences:
    queries.append(record)

#sequences = SeqIO.parse("viral/viral_classification/sample_genomes/HIV.B.fasta", "fasta")
# `sequences` is rebound to a parser over H1.txt (read as FASTA); how it is
# consumed is not visible in this part of the script.
sequences = SeqIO.parse("../samples/H1.txt", "fasta")