def test_bulk_index_kmers(dna_kmer_index):
    """Check kmer hits and scores after bulk-loading two sequences.

    Two sequences, 'ATGCA' and 'ATGCC', are written to an in-memory FASTA
    buffer and loaded through the database attached to ``dna_kmer_index``.
    The test then verifies the number of kmers, the total indexed length,
    the recorded hits for the shared kmer 'ATG', and that after scoring
    'ATG' has a positive score that is at least as large as every other
    kmer's score.

    :param dna_kmer_index: kmer index under test; it is used here through
        its ``db``, ``num_kmers``, ``total_length_indexed``, ``kmer_as_int``,
        ``kmers`` and ``score_kmers`` members.
        NOTE(review): the expected count of 4 kmers presumably relies on a
        word length of 3 -- confirm against the fixture definition.
    """
    A = dna_kmer_index.db.alphabet
    dna_kmer_index.db.initialize()
    S = A.parse('ATGCA', name='foo')
    T = A.parse('ATGCC', name='bar')

    fasta = StringIO()
    write_fasta(fasta, [S, T])
    fasta.seek(0)
    S_rec, T_rec = dna_kmer_index.db.load_fasta(fasta)
    assert S_rec is not None and T_rec is not None
    assert dna_kmer_index.num_kmers() == 4
    assert dna_kmer_index.total_length_indexed() == len(S) + len(T)

    # find the occurrences of 'ATG'
    # NOTE(review): (0, 3, 2) is presumably the letter indices of A, T, G
    # in the fixture's alphabet -- confirm.
    S_id, T_id = S_rec.id, T_rec.id
    atg_int = dna_kmer_index.kmer_as_int((0, 3, 2))
    atg_hits = [(hits, score)
                for kmer, hits, score in dna_kmer_index.kmers()
                if kmer == atg_int]
    assert len(atg_hits) == 1 and atg_hits[0][0] == [(S_id, 0), (T_id, 0)]

    dna_kmer_index.score_kmers()
    atg_hits = [(hits, score)
                for kmer, hits, score in dna_kmer_index.kmers()
                if kmer == atg_int]
    atg_score = atg_hits[0][1]
    # Compare the score of ATG itself; a bare `score` here would refer to a
    # comprehension loop variable rather than to the value under test.
    assert atg_score is not None and atg_score > 0

    # ATG is the most common kmer, it must have the highest score:
    assert all(atg_score >= score for _, _, score in dna_kmer_index.kmers())

    # shouldn't do anything
    dna_kmer_index.score_kmers(only_missing=True)
def test_database_populate_fasta():
    """Load a FASTA buffer into an in-memory database without reverse
    complements and check the resulting records.

    Verifies that one ``Record`` is returned per sequence, that each record
    carries the name of the source buffer, and that sequences can be read
    back from a record both with an explicit file handle and without one
    (in which case ``biseqt.database.open`` is patched to hand back the
    same buffer).
    """
    alphabet = Alphabet('ACGT')
    first = alphabet.parse('AACT', name='S')
    second = alphabet.parse('GCAT', name='T')

    database = DB(':memory:', alphabet)
    database.initialize()

    # The buffer is given a name so records have a source file to point at.
    handle = StringIO()
    handle.name = '/x.fasta'
    write_fasta(handle, [first, second])
    handle.seek(0)

    records = database.load_fasta(handle, rc=False)
    assert len(records) == 2
    for record in records:
        assert isinstance(record, Record)

    sources_match = all(
        record.source_file == handle.name for record in records
    )
    assert sources_match, \
        'source file of sequence records must be set'

    reloaded = [
        database.load_from_record(record, handle) for record in records
    ]
    assert reloaded == [first, second], \
        'should be able to retrieve sequences by position in source'

    # Without a handle the database has to open the source itself; the
    # patched `open` returns a mock wrapping the in-memory buffer.
    with patch('biseqt.database.open', create=True) as fake_open:
        fake_open.return_value = MagicMock(spec=file, wraps=handle)
        head = records[0]
        assert database.load_from_record(head) == first, \
            'load_from_record should work without an open file handle'
def test_write_fasta():
    """Exercise ``write_fasta`` on a temporary file and on string buffers.

    Covers: round-tripping through ``read_fasta``, the exact text written
    to a ``StringIO``, rejection of duplicate sequence names with an
    ``AssertionError``, and the ``width`` argument controlling line count.
    """
    alphabet = Alphabet('ACGT')
    foo = alphabet.parse('AAA', name='foo')
    bar = alphabet.parse('TTT', name='bar')

    # Round trip through a real temporary file.
    with NamedTemporaryFile() as tmp:
        write_fasta(tmp, [foo, bar])
        tmp.seek(0)
        parsed = [seq for seq, _ in read_fasta(tmp, alphabet)]
        assert parsed == [foo, bar], \
            'read_fasta(write_fasta()) should be identity'

    # Exact output on an in-memory buffer.
    plain = StringIO('')
    write_fasta(plain, [foo, bar])
    plain.seek(0)
    written = plain.read()
    assert written == '>foo\nAAA\n>bar\nTTT\n', 'should work on StringIO'

    # duplicate names not allowed
    duplicates = StringIO('')
    with pytest.raises(AssertionError):
        write_fasta(duplicates, [foo, foo])

    # A 6-letter sequence at width 3: header plus two sequence lines.
    wrapped = StringIO('')
    longer = alphabet.parse('AAATTT', name='foo')
    write_fasta(wrapped, [longer], width=3)
    wrapped.seek(0)
    line_count = 0
    for _ in wrapped:
        line_count += 1
    assert line_count == 3, 'FASTA width should be modifiable'
def seed_index():
    """Build and return an in-memory seed index over random sequences.

    An in-memory database over the alphabet 'ACGT' is created together
    with a kmer index of word length 5 and a seed index on top of it. Three
    random sequences of length 100, named '#0', '#1' and '#2', are written
    to a FASTA buffer and loaded into the database, after which seeds are
    indexed. The seed index is returned.
    """
    wordlen = 5
    seq_len = 100
    num_seqs = 3
    alphabet = Alphabet('ACGT')

    database = DB(':memory:', alphabet)
    index = SeedIndex(KmerIndex(database, wordlen))
    index.db.initialize()

    def named_random_seqs():
        # Lazily yields the random sequences, named by their position.
        for i in range(num_seqs):
            yield rand_seq(alphabet, seq_len).to_named('#%d' % i)

    handle = StringIO()
    write_fasta(handle, named_random_seqs())
    handle.seek(0)

    database.load_fasta(handle)
    index.index_seeds()
    return index
def test_database_populate_fasta_rc():
    """Load a FASTA buffer with ``rc=True`` and check the extra records.

    Two sequences are loaded, so four records are expected. The records
    carrying an ``'rc_of'`` attribute must reference the content ids of
    the original sequences in order, and the record for the reverse
    complement of the second sequence must load as that sequence reversed
    and transformed with the pairs 'AT' and 'CG'.
    """
    alphabet = Alphabet('ACGT')
    first = alphabet.parse('AACT', name='S')
    second = alphabet.parse('GCAT', name='T')

    database = DB(':memory:', alphabet)
    database.initialize()

    handle = StringIO()
    write_fasta(handle, [first, second])
    handle.seek(0)

    records = database.load_fasta(handle, rc=True)
    assert len(records) == 4

    origins = []
    for record in records:
        if 'rc_of' in record.attrs:
            origins.append(record.attrs['rc_of'])
    assert origins == [first.content_id, second.content_id], \
        'reverse complements should know what their origin is'

    def is_rc_of_second(record):
        return record.attrs.get('rc_of', None) == second.content_id

    matches = database.find(condition=is_rc_of_second)
    rc_record = next(matches)

    rc_name = '(rc) ' + second.name
    expected_rc = second.reverse().transform(['AT', 'CG'], name=rc_name)
    assert database.load_from_record(rc_record, handle) == expected_rc, \
        'reverse complements should load properly from a record'