Beispiel #1
0
def test_bulk_index_kmers(dna_kmer_index):
    """Check kmer bulk-indexing on FASTA load, and kmer scoring.

    Two sequences sharing the prefix 'ATG' are written to an in-memory FASTA
    stream and loaded through the index's database. The test then verifies
    the kmer count, the total indexed length, the hit list for 'ATG', and
    that scoring gives 'ATG' a positive score no smaller than any other.

    :param dna_kmer_index: kmer index under test; it must expose ``db``,
        ``num_kmers``, ``total_length_indexed``, ``kmer_as_int``, ``kmers``
        and ``score_kmers`` (presumably a pytest fixture -- defined outside
        this snippet).
    """
    A = dna_kmer_index.db.alphabet
    dna_kmer_index.db.initialize()

    S = A.parse('ATGCA', name='foo')
    T = A.parse('ATGCC', name='bar')
    fasta = StringIO()
    write_fasta(fasta, [S, T])
    # rewind so that load_fasta reads from the beginning of the stream
    fasta.seek(0)

    S_rec, T_rec = dna_kmer_index.db.load_fasta(fasta)
    assert S_rec is not None and T_rec is not None
    assert dna_kmer_index.num_kmers() == 4
    assert dna_kmer_index.total_length_indexed() == len(S) + len(T)

    # find the occurrences of 'ATG'
    S_id, T_id = S_rec.id, T_rec.id
    atg_int = dna_kmer_index.kmer_as_int((0, 3, 2))
    atg_hits = [(hits, score) for kmer, hits, score in dna_kmer_index.kmers()
                if kmer == atg_int]
    assert len(atg_hits) == 1 and atg_hits[0][0] == [(S_id, 0), (T_id, 0)]

    dna_kmer_index.score_kmers()
    atg_hits = [(hits, score) for kmer, hits, score in dna_kmer_index.kmers()
                if kmer == atg_int]
    atg_score = atg_hits[0][1]
    # Check the ATG score itself: the comprehension variable ``score`` is not
    # defined here on Python 3, and on Python 2 it would be the score of
    # whichever kmer happened to be yielded last.
    assert atg_score is not None and atg_score > 0

    # ATG is the most common kmer, it must have the highest score:
    assert all(atg_score >= score for _, _, score in dna_kmer_index.kmers())

    # shouldn't do anything
    dna_kmer_index.score_kmers(only_missing=True)
Beispiel #2
0
def test_database_populate_fasta():
    """Check that loading a FASTA stream creates retrievable records.

    Two sequences are written to a named in-memory stream and loaded without
    reverse complements. The resulting records must be ``Record`` instances,
    remember the stream's name as their source file, and allow the original
    sequences to be loaded back -- both with an explicit file handle and with
    the handle obtained through a patched ``open``.
    """
    alphabet = Alphabet('ACGT')
    seq_s = alphabet.parse('AACT', name='S')
    seq_t = alphabet.parse('GCAT', name='T')

    database = DB(':memory:', alphabet)
    database.initialize()

    stream = StringIO()
    stream.name = '/x.fasta'
    write_fasta(stream, [seq_s, seq_t])
    stream.seek(0)

    records = database.load_fasta(stream, rc=False)
    assert len(records) == 2
    assert all(isinstance(entry, Record) for entry in records)

    sources_match = all(entry.source_file == stream.name for entry in records)
    assert sources_match, 'source file of sequence records must be set'

    reloaded = [database.load_from_record(entry, stream) for entry in records]
    assert reloaded == [seq_s, seq_t], \
        'should be able to retrieve sequences by position in source'

    # Without a handle, load_from_record is expected to open the source file
    # itself; route that open() call to the in-memory stream.
    with patch('biseqt.database.open', create=True) as fake_open:
        fake_open.return_value = MagicMock(spec=file, wraps=stream)
        first_loaded = database.load_from_record(records[0])
        assert first_loaded == seq_s, \
            'load_from_record should work without an open file handle'
Beispiel #3
0
def test_write_fasta():
    """Check ``write_fasta`` output, round-tripping, and line width.

    Covers: reading back what was written to a temporary file, the exact text
    produced on a ``StringIO``, rejection of duplicate sequence names, and
    wrapping of sequence content at a custom ``width``.
    """
    alphabet = Alphabet('ACGT')
    foo = alphabet.parse('AAA', name='foo')
    bar = alphabet.parse('TTT', name='bar')

    # round trip through a real (temporary) file
    with NamedTemporaryFile() as tmp:
        write_fasta(tmp, [foo, bar])
        tmp.seek(0)
        round_trip = [seq for seq, _ in read_fasta(tmp, alphabet)]
        assert round_trip == [foo, bar], \
            'read_fasta(write_fasta()) should be identity'

    # exact output on an in-memory stream
    buf = StringIO('')
    write_fasta(buf, [foo, bar])
    buf.seek(0)
    written = buf.read()
    assert written == '>foo\nAAA\n>bar\nTTT\n', 'should work on StringIO'

    # duplicate names not allowed
    buf = StringIO('')
    with pytest.raises(AssertionError):
        write_fasta(buf, [foo, foo])

    # a 6-letter sequence at width 3: one header line plus two content lines
    buf = StringIO('')
    foo = alphabet.parse('AAATTT', name='foo')
    write_fasta(buf, [foo], width=3)
    buf.seek(0)
    line_count = len(buf.readlines())
    assert line_count == 3, 'FASTA width should be modifiable'
Beispiel #4
0
def seed_index():
    """Build and return a seed index over an in-memory database.

    An in-memory database on the alphabet 'ACGT' is wrapped in a kmer index
    of word length 5, which in turn backs the seed index. Three random
    sequences of length 100 (named '#0', '#1', '#2') are written to an
    in-memory FASTA stream and loaded into the database, after which seeds
    are indexed.
    """
    alphabet = Alphabet('ACGT')
    word_length, seq_count, seq_length = 5, 3, 100

    database = DB(':memory:', alphabet)
    index = SeedIndex(KmerIndex(database, word_length))
    index.db.initialize()

    def named_random_seqs():
        # lazily yields one freshly generated, named random sequence at a time
        for i in range(seq_count):
            yield rand_seq(alphabet, seq_length).to_named('#%d' % i)

    stream = StringIO()
    write_fasta(stream, named_random_seqs())
    stream.seek(0)

    database.load_fasta(stream)
    index.index_seeds()
    return index
Beispiel #5
0
def test_database_populate_fasta_rc():
    """Check that loading FASTA with ``rc=True`` also stores reverse complements.

    Loading two sequences must yield four records; the records carrying an
    ``rc_of`` attribute must point at the content ids of the originals, in
    order, and the reverse complement of the second sequence must be
    loadable from its record.
    """
    alphabet = Alphabet('ACGT')
    seq_s = alphabet.parse('AACT', name='S')
    seq_t = alphabet.parse('GCAT', name='T')

    database = DB(':memory:', alphabet)
    database.initialize()
    stream = StringIO()
    write_fasta(stream, [seq_s, seq_t])
    stream.seek(0)
    records = database.load_fasta(stream, rc=True)

    assert len(records) == 4
    origins = [rec.attrs['rc_of'] for rec in records if 'rc_of' in rec.attrs]
    assert origins == [seq_s.content_id, seq_t.content_id], \
        'reverse complements should know what their origin is'

    def is_rc_of_t(record):
        # true only for the record marked as the reverse complement of seq_t
        return record.attrs.get('rc_of', None) == seq_t.content_id

    rc_record = next(database.find(condition=is_rc_of_t))
    # expected reverse complement: reverse, then swap A<->T and C<->G
    reversed_t = seq_t.reverse()
    expected_rc = reversed_t.transform(['AT', 'CG'], name='(rc) ' + seq_t.name)
    assert database.load_from_record(rc_record, stream) == expected_rc, \
        'reverse complements should load properly from a record'