def test_write_fasta():
    """write_fasta output can be read back, rejects duplicate names, and
    wraps sequence lines at the requested width."""
    alphabet = Alphabet('ACGT')
    foo = alphabet.parse('AAA', name='foo')
    bar = alphabet.parse('TTT', name='bar')
    records = [foo, bar]

    # round trip through a temporary file
    with NamedTemporaryFile() as handle:
        write_fasta(handle, records)
        handle.seek(0)
        recovered = [seq for seq, _ in read_fasta(handle, alphabet)]
        assert recovered == records, \
            'read_fasta(write_fasta()) should be identity'

    # in-memory buffer
    buf = StringIO('')
    write_fasta(buf, records)
    buf.seek(0)
    contents = buf.read()
    assert contents == '>foo\nAAA\n>bar\nTTT\n', 'should work on StringIO'

    # duplicate names not allowed
    buf = StringIO('')
    with pytest.raises(AssertionError):
        write_fasta(buf, [foo, foo])

    # a 6-letter sequence at width 3 should take 3 lines in total
    buf = StringIO('')
    long_foo = alphabet.parse('AAATTT', name='foo')
    write_fasta(buf, [long_foo], width=3)
    buf.seek(0)
    num_lines = len(buf.readlines())
    assert num_lines == 3, 'FASTA width should be modifiable'
def test_lossless_reads():
    """Exercise ``rand_read``: argument validation, number of samples,
    noise-free reads, randomized read lengths, and alphabets whose letters
    are longer than one character."""
    A = Alphabet('ACGT')
    S = rand_seq(A, 100)

    with pytest.raises(AssertionError):
        next(rand_read(S, len_mean=200, num=1))  # len_mean must be < len(S)

    with pytest.raises(AssertionError):
        # at most one of num or expected_coverage given
        next(rand_read(S, len_mean=50, num=1, expected_coverage=1))

    assert sum(1 for _ in rand_read(S, len_mean=50, num=10)) == 10, \
        'The number of sampled reads should be controllable'
    assert sum(1 for _ in rand_read(S, len_mean=50)) == 1, \
        'If neither num or expected coverage is given only one sample is read'

    # there should be no noise added
    read, pos = next(rand_read(S, len_mean=40, num=1))
    assert S[pos:pos + len(read)] == read

    S = A.parse('ACT' * 100)
    reads = list(rand_read(S, len_mean=100, len_sd=0.01, num=100))
    read_lens = [len(read) for read, _ in reads]
    # Count the *distinct* lengths. Comparing the set object itself against
    # an int (as this test used to do) never checks how many lengths occur.
    assert len(set(read_lens)) > 1, \
        'Read lengths should be randomly chosen'
    len_mean = sum(read_lens) / 100.
    assert 50 < len_mean < 150, \
        'Normal distribution of read lengths works'

    # index edge cases
    A = Alphabet(['00', '01'])
    S = A.parse('01' * 10)
    _bak = np.random.normal
    np.random.normal = mock.Mock(return_value=[1])
    try:
        assert next(rand_read(S, len_mean=1, num=1))[0] == A.parse('01'), \
            'sequences in alphabets with > 1 long letters can be sampled too'
    finally:
        # always undo the monkeypatch so a failure here cannot leak the mock
        # into tests that run afterwards
        np.random.normal = _bak
def test_alignment_std_local(err):
    """Standard-mode local alignment of a sequence against a padded mutant.

    ``err`` is used as the substitution, gap-open and gap-extend probability
    of the mutation process that generates the mutant.
    """
    alphabet = Alphabet('ACGT')
    process = MutationProcess(alphabet, subst_probs=err, go_prob=err,
                              ge_prob=err)
    subst_scores, (go_score, ge_score) = process.log_odds_scores()
    score_args = (subst_scores, go_score, ge_score)

    origin = rand_seq(alphabet, 100)
    mutant, transcript = process.mutate(origin)
    # pad the mutant on both sides so only a local alignment can match it
    left_pad = alphabet.parse('A' * 100)
    right_pad = alphabet.parse('G' * 100)
    mutant = left_pad + mutant + right_pad

    known_aln = Alignment(origin, mutant, transcript)
    known_score = known_aln.calculate_score(*score_args)

    aligner = Aligner(origin, mutant, subst_scores=subst_scores,
                      go_score=go_score, ge_score=ge_score,
                      alnmode=STD_MODE, alntype=LOCAL)
    with aligner:
        reported_score = aligner.solve()
        assert round(reported_score, 3) >= round(known_score, 3), \
            'optimal alignment scores better than the known transcript'

        found_aln = aligner.traceback()
        recomputed = found_aln.calculate_score(*score_args)
        assert round(recomputed, 3) == round(reported_score, 3), \
            'The alignment score should be calculated correctly'

        found_tx = found_aln.transcript
        ori_len = Alignment.projected_len(found_tx, on='origin')
        mut_len = Alignment.projected_len(found_tx, on='mutant')
        assert ori_len <= len(origin) and mut_len < len(mutant), \
            'Local alignments do not cover the entirety of both sequences'
def test_database_insert():
    """Inserting sequences yields populated records and distinct ids, and the
    content id stored in the ``sequence`` table matches the sequence."""
    A = Alphabet('ACGT')
    S = A.parse('AACT', name='foo')
    db = DB(':memory:', A)
    db.initialize()

    def stored_content_ids(rec_id):
        """Return all ``(content_id,)`` rows stored for the given id."""
        with db.connection() as conn:
            cursor = conn.cursor()
            cursor.execute('SELECT content_id FROM sequence WHERE id = ?',
                           (rec_id,))
            # NOTE for some reason if we just say next(cursor) == ...
            # the cursor remains open after the context is over (which should
            # not happen as per docs). This leads to BusyError further down.
            # Exhausting the cursor with fetchall() avoids that.
            return cursor.fetchall()

    attrs = {'key': 'value'}
    rec = db.insert(S, source_file='source.fa', source_pos=10, attrs=attrs)
    assert isinstance(rec.id, int)
    assert rec.content_id == S.content_id
    assert rec.source_pos == 10
    assert rec.source_file == 'source.fa'
    assert 'key' in rec.attrs and rec.attrs['key'] == 'value', \
        'attributes must be populated correctly'
    assert stored_content_ids(rec.id) == [(S.content_id,)], \
        'content identifier is properly populated'

    # add a second sequence
    T = A.parse('GCTG', name='bar')
    new_rec = db.insert(T)
    assert new_rec.id != rec.id, 'new ids are assigned to new sequences'
    assert stored_content_ids(new_rec.id) == [(T.content_id,)], \
        'correct id must be populated'
def test_sequence_parsing():
    """Parsing with an alphabet of two-character letters."""
    letters = ['00', '01', '10', '11']
    alphabet = Alphabet(letters)

    # a string that does not split into whole letters is rejected
    with pytest.raises(AssertionError):
        alphabet.parse('000')

    parsed = alphabet.parse('001011')
    assert len(parsed) == 3 and parsed == Sequence(alphabet, [0, 2, 3]), \
        'alphabets with > 1 long letters should be able to parse strings'
def test_named_sequence_tranforms():
    """reverse() and transform() on a named sequence give named sequences."""
    alphabet = Alphabet('ACGT')
    original = alphabet.parse('AACT', name='foo')
    complement_map = ['AT', 'CG']

    reversed_seq = original.reverse(name='bar')
    assert reversed_seq == alphabet.parse('TCAA', name='bar'), \
        'reverse of named sequences should be a named sequence'

    complement = original.transform(mappings=complement_map, name='bar')
    assert complement == alphabet.parse('TTGA', name='bar'), \
        'result of transforming a named sequence is a named sequence'

    # without an explicit name the result's name mentions the transformation
    unnamed_result = original.transform(mappings=complement_map)
    assert 'transformed' in unnamed_result.name
def test_named_sequence():
    """Basic properties of sequences parsed with a name."""
    alphabet = Alphabet('ACGT')
    seq = alphabet.parse('AACT', name='foo')
    raw = str(seq)

    assert isinstance(seq, NamedSequence)
    # eval() is applied only to the repr of an object built in this test
    rebuilt = eval(repr(seq))
    assert rebuilt == seq, 'repr() should provide eval-able string'
    assert seq.name == 'foo'

    renamed = alphabet.parse(raw, name='bar')
    assert seq.content_id == renamed.content_id, \
        'content id should only depend on the contents of the sequence'

    same = alphabet.parse(raw, name='foo')
    assert seq == same, \
        'equality should work'
def test_pw_render_basic():
    """Terminal rendering of alignments: color toggle, input validation,
    margins, deletions and insertions."""
    alphabet = Alphabet('ACGT')
    seq = alphabet.parse('AACT')
    n = len(seq)
    doubled = seq + seq

    aln = Alignment(seq, seq, 'M' * n)
    plain = aln.render_term(colored=False)
    assert plain.count('\033') == 0, \
        'colored output should allow being turned off'
    colored = aln.render_term(colored=True)
    assert colored.count('\033') > 0, \
        'colored output should allow being turned on'

    # validate input
    with pytest.raises(AssertionError):
        aln.render_term(margin=-1)
    with pytest.raises(AssertionError):
        aln.render_term(term_width=5)

    aln = Alignment(doubled, doubled, 'M' * n, origin_start=n)
    no_margin = aln.render_term(margin=0, colored=False)
    assert '[%d]' % n in no_margin, 'margin should allow being turned off'

    with_margin = aln.render_term(margin=1, colored=False)
    assert '[%d]' % (n - 1) in with_margin, \
        'margin should allow being turned on'

    # shouldn't choke on too large margins
    full_margin = aln.render_term(margin=30, colored=False)
    assert str(seq) + '.' * n in full_margin, 'overhanging margins work'
    widths = set(len(row) for row in full_margin.rstrip().split('\n'))
    assert len(widths) == 1, \
        'both lines of the output should have the same length'

    # deletion:
    #   AACT
    #   AG-T
    aln = Alignment(doubled, alphabet.parse('AGT'), 'MSDM', origin_start=n)
    with_del = aln.render_term(colored=False)
    assert 'AG-T' in with_del, 'deletions are represented by - in mutant'
    origin_row, mutant_row = with_del.rstrip().split('\n')[:2]
    assert origin_row.index('C') == mutant_row.index('-'), \
        'deleted content and - should be aligned'
    # shouldn't crash when printing deletions with color
    aln.render_term(colored=True)

    # insertion:
    #   AAC-T
    #   AACGT
    aln = Alignment(doubled, alphabet.parse('AACGT'), 'MMMIM', origin_start=n)
    with_ins = aln.render_term(colored=False)
    assert 'AAC-T' in with_ins, 'insertions are represented by - in origin'
    origin_row, mutant_row = with_ins.rstrip().split('\n')[:2]
    assert origin_row.index('-') == mutant_row.index('G'), \
        'inserted content and - should be aligned'
    # shouldn't crash when printing with color
    aln.render_term(colored=True)
def test_database_populate_fasta():
    """Loading a FASTA stream into the database (no reverse complements) and
    retrieving the sequences back from the resulting records."""
    alphabet = Alphabet('ACGT')
    seq_s = alphabet.parse('AACT', name='S')
    seq_t = alphabet.parse('GCAT', name='T')

    db = DB(':memory:', alphabet)
    db.initialize()

    # an in-memory stream posing as a named file
    fasta = StringIO()
    fasta.name = '/x.fasta'
    write_fasta(fasta, [seq_s, seq_t])
    fasta.seek(0)

    records = db.load_fasta(fasta, rc=False)
    assert len(records) == 2
    assert all(isinstance(record, Record) for record in records)
    assert all(record.source_file == fasta.name for record in records), \
        'source file of sequence records must be set'

    loaded = [db.load_from_record(record, fasta) for record in records]
    assert loaded == [seq_s, seq_t], \
        'should be able to retrieve sequences by position in source'

    # with no handle given, serve the stream through a patched open()
    with patch('biseqt.database.open', create=True) as open_mock:
        open_mock.return_value = MagicMock(spec=file, wraps=fasta)
        assert db.load_from_record(records[0]) == seq_s, \
            'load_from_record should work without an open file handle'
def test_sequence_magic():
    """Special-method behavior of Sequence: str, len, ==, truthiness,
    indexing and concatenation."""
    alphabet = Alphabet('HT')
    contents = [0, 1, 0, 1]
    seq = Sequence(alphabet, contents)
    empty = Sequence(alphabet, [])
    extended = alphabet.parse('HTHTTT')

    assert str(seq) == 'HTHT'
    assert len(seq) == 4, 'len() should work'
    assert seq == Sequence(alphabet, contents), \
        'equals if same contents and alphabet'
    assert seq and not empty, 'truthy iff not empty'

    assert seq[0] == 0, 'indexing by int should give an int'
    head = seq[0:1]
    assert isinstance(head, Sequence) and str(head) == 'H', \
        'indexing by a slice should give another sequence object'

    assert seq + alphabet.parse('TT') == extended, 'add by appending'
    assert seq + 'TT' == extended, 'add by appending raw sequences'
def test_lossy_reads():
    """A noisy read's transcript contains substitutions and indels."""
    alphabet = Alphabet('ACGT')
    genome = alphabet.parse('ACT' * 100)
    process = MutationProcess(alphabet, subst_probs=0.3,
                              go_prob=0.2, ge_prob=0.3)

    _read, _pos, transcript = next(
        process.noisy_read(genome, len_mean=50, num=1)
    )
    assert transcript.count('S') > 0 \
        and transcript.count('I') + transcript.count('D') > 0, \
        'Random mutations should be performed to get lossy reads'
def test_pw_render_width():
    """Rendered alignment lines respect, and fully use, the requested
    terminal width when the alignment is longer than one line."""
    A = Alphabet('ACGT')
    N = 100
    S = A.parse('A' * (2 * N))
    tx = 'M' * N
    # Floor division keeps the width an int regardless of whether ``/`` means
    # integer or true division in this module.
    term_width = N // 2
    aln = Alignment(S, S, tx, origin_start=N)
    render = aln.render_term(margin=2 * N, colored=False,
                             term_width=term_width)
    line_lens = [len(line) for line in render.rstrip().split('\n')]
    assert all(length <= term_width for length in line_lens), \
        'terminal width should be adjustable'
    assert any(length == term_width for length in line_lens), \
        'terminal width should be fully used'
    assert len(set(line_lens)) <= 2, \
        'alignments longer than terminal width should work'
def test_database_overwrite():
    """Re-inserting a sequence whose content id is already stored keeps the
    first record."""
    alphabet = Alphabet('ACGT')
    seq = alphabet.parse('AACT', name='foo')
    db = DB(':memory:', alphabet)
    db.initialize()

    for source in ('old_source.fa', 'new_source.fa'):
        db.insert(seq, source_file=source)

    query = 'SELECT source_file FROM sequence WHERE content_id = ?'
    with db.connection() as conn:
        cursor = conn.cursor()
        cursor.execute(query, (seq.content_id,))
        source_files = [row[0] for row in cursor]

    assert len(source_files) == 1 and source_files[0] == 'old_source.fa', \
        'Sequences with observed content id should be ignored'
def test_sequence_transforms():
    """transform() with dict and list mappings, and reverse()."""
    alphabet = Alphabet(['00', '01', '11'])

    seq = alphabet.parse('0001')
    swapped = alphabet.parse('0100')
    via_dict = seq.transform(mappings={'00': '01'})
    assert via_dict == alphabet.parse('0101'), \
        'dict mappings are unidirectional'
    via_letter_pairs = seq.transform(mappings=[('00', '01')])
    assert via_letter_pairs == swapped, \
        'list mappings are bidirectional'
    via_index_pairs = seq.transform(mappings=[(0, 1)])
    assert via_index_pairs == swapped, \
        'list mappings are bidirectional'

    seq = alphabet.parse('0011')
    assert seq.transform(mappings={'00': '01'}) == alphabet.parse('0111'), \
        'unmapped letters remain untouched'
    assert seq.reverse() == alphabet.parse('1100'), 'reverse() works'
def test_read_fasta_basic():
    """read_fasta parses a single multi-line record and rejects duplicate
    names."""
    alphabet = Alphabet('ACGT')

    with NamedTemporaryFile() as handle:
        handle.write('> name1\nAAA\n\nTTT')
        handle.flush()
        handle.seek(0)
        records = list(read_fasta(handle, alphabet))
        assert len(records) == 1, 'should work when reading from file'

        first = records[0]
        assert first[0] == alphabet.parse('AAATTT', name='name1'), \
            'should properly parse what is in the file'
        assert isinstance(first[0], NamedSequence), \
            'should return NamedSequence objects'
        assert first[1] == 0, 'should report the right file positions'

    # duplicate names not allowed
    duplicated = StringIO('>name\nAAA\n> name\nTTT\n')
    with pytest.raises(AssertionError):
        list(read_fasta(duplicated, alphabet))
def test_database_find():
    """find() filters records by a raw SQL condition or a Python callable."""
    alphabet = Alphabet('ACGT')
    foo = alphabet.parse('AACT', name='foo')
    bar = alphabet.parse('GGCT', name='bar')

    db = DB(':memory:', alphabet)
    db.initialize()
    for seq in (foo, bar):
        db.insert(seq)

    # filter with SQL on the stored attrs column
    sql_condition = "attrs LIKE '%s'" % '%"name": "bar"%'
    matches = list(db.find(sql_condition=sql_condition))
    assert len(matches) == 1 and matches[0].content_id == bar.content_id, \
        'find() should work with sql_condition'

    # filter in Python on the record's attrs
    def named_foo(rec):
        return rec.attrs['name'] == 'foo'

    matches = list(db.find(condition=named_foo))
    assert len(matches) == 1 and matches[0].content_id == foo.content_id, \
        'find() should work with callable condition'
def test_database_events():
    """Listeners registered for ``db-initialized`` and ``sequence-inserted``
    are called when the database is initialized and a sequence inserted."""
    A = Alphabet('ACGT')
    S = A.parse('AACT', name='S')

    # NOTE python 2 does not support non-local, non-global variables, put it in
    # the function object.
    test_database_events.callback_called = 0

    def callback(self, *args):
        test_database_events.callback_called += 1

    db = DB(':memory:', A)
    db.add_event_listener('db-initialized', callback)
    db.add_event_listener('sequence-inserted', callback)

    # the messages name the events exactly as registered above so a failure
    # points at the right listener
    db.initialize()
    assert test_database_events.callback_called == 1, \
        'event callbacks for "db-initialized" should be executed'

    db.insert(S)
    assert test_database_events.callback_called == 2, \
        'event callbacks for "sequence-inserted" should be executed'
def test_database_populate_fasta_rc():
    """Loading a FASTA stream with ``rc=True`` also stores reverse
    complements, which record their origin and can be loaded back."""
    alphabet = Alphabet('ACGT')
    seq_s = alphabet.parse('AACT', name='S')
    seq_t = alphabet.parse('GCAT', name='T')

    db = DB(':memory:', alphabet)
    db.initialize()

    fasta = StringIO()
    write_fasta(fasta, [seq_s, seq_t])
    fasta.seek(0)

    records = db.load_fasta(fasta, rc=True)
    assert len(records) == 4

    origins = [rec.attrs['rc_of'] for rec in records if 'rc_of' in rec.attrs]
    assert origins == [seq_s.content_id, seq_t.content_id], \
        'reverse complements should know what their origin is'

    def is_rc_of_t(rec):
        return rec.attrs.get('rc_of', None) == seq_t.content_id

    rc_record = next(db.find(condition=is_rc_of_t))
    rc_name = '(rc) ' + seq_t.name
    expected_rc = seq_t.reverse().transform(['AT', 'CG'], name=rc_name)
    assert db.load_from_record(rc_record, fasta) == expected_rc, \
        'reverse complements should load properly from a record'
def test_mutation_process():
    """MutationProcess accepts substitution probabilities as a float or a
    matrix, validates gap probabilities, and produces transcripts consistent
    with the configured probabilities."""
    A = Alphabet('ACGT')
    S = A.parse('ACT' * 100)
    gap_kw = {'go_prob': 0, 'ge_prob': 0}

    def _check_subst_probs(subst_probs, msg):
        """Assert ``subst_probs`` has .7 on the diagonal and .1 elsewhere."""
        assert len(subst_probs) == len(A)
        # check the matrix that was passed in rather than whatever ``M`` is
        # bound to in the enclosing scope at call time
        for i, row in enumerate(subst_probs):
            correct = [.7 if i == j else .1 for j in range(len(A))]
            assert np.allclose(row, correct), msg

    M = MutationProcess(A, subst_probs=.3)
    _check_subst_probs(M.subst_probs,
                       'Substitution probabilities given as a single float')

    matrix = [[.7 if i == j else .1 for j in range(len(A))]
              for i in range(len(A))]
    M = MutationProcess(A, subst_probs=matrix)
    _check_subst_probs(M.subst_probs,
                       'Substitution probabilities given as a matrix')

    T, tx = MutationProcess(A, subst_probs=0, **gap_kw).mutate(S)
    assert T == S and tx == 'MMM' * 100, \
        'all mutation probabilities can be set to zero'

    T, tx = MutationProcess(A, subst_probs=0.1, **gap_kw).mutate(S)
    assert all(op in 'MS' for op in tx) and 'S' in tx, \
        'there can be mutation processes with only substitutions'

    T, tx = MutationProcess(A, subst_probs=0.01, **gap_kw).mutate(S)
    assert tx.count('S') < 0.1 * len(S), 'substitution probabilities work'

    with pytest.raises(AssertionError):
        MutationProcess(A, go_prob=0.2, ge_prob=0.1)  # go_prob <= ge_prob

    gap_kw = {'go_prob': 0.05, 'ge_prob': 0.1}
    T, tx = MutationProcess(A, subst_probs=0, **gap_kw).mutate(S)
    indels = sum(1 for op in tx if op in 'ID')
    assert 0 < indels < 0.5 * len(S), 'gap probabilities work'
def test_pw_render_longlet():
    """Rendering works for alphabets whose letters are two characters long."""
    alphabet = Alphabet(['00', '11'])
    origin = alphabet.parse('0011')
    mutant = alphabet.parse('11')

    # transcript 'DM': the deleted first letter shows up as a two-wide gap
    rendered = Alignment(origin, mutant, 'DM').render_term(colored=False)
    assert '--11' in rendered, \
        'alphabets with > 1 long letters should be rendered properly'
def test_rand_seq():
    """rand_seq builds its sequence from what ``np.random.choice`` returns.

    ``np.random.choice`` is replaced by a mock returning ``[0, 0, 0]``, so
    the result must equal ``'AAA'`` even though a length of 10 is requested.
    """
    _bak = np.random.choice
    np.random.choice = mock.Mock(return_value=[0, 0, 0])
    try:
        A = Alphabet('ACGT')
        assert rand_seq(A, 10) == A.parse('AAA')
    finally:
        # always undo the monkeypatch so a failure here cannot leak the mock
        # into tests that run afterwards
        np.random.choice = _bak