Beispiel #1
0
def test_database_populate_fasta():
    """Loading a FASTA stream into the database tracks each record's source.

    Two named sequences are written to an in-memory FASTA stream and loaded
    with ``rc=False``. The resulting records must point at the stream's
    ``name`` and must be loadable both with an explicit file handle and,
    with ``open`` patched, without one.
    """
    alphabet = Alphabet('ACGT')
    S = alphabet.parse('AACT', name='S')
    T = alphabet.parse('GCAT', name='T')

    db = DB(':memory:', alphabet)
    db.initialize()

    # in-memory stand-in for a FASTA file on disk
    fasta = StringIO()
    fasta.name = '/x.fasta'
    write_fasta(fasta, [S, T])
    fasta.seek(0)

    inserted = db.load_fasta(fasta, rc=False)
    assert len(inserted) == 2
    assert all(isinstance(rec, Record) for rec in inserted)

    source_files = [rec.source_file for rec in inserted]
    assert all(source == fasta.name for source in source_files), \
        'source file of sequence records must be set'

    reloaded = [db.load_from_record(rec, fasta) for rec in inserted]
    assert reloaded == [S, T], \
        'should be able to retrieve sequences by position in source'

    # make the database's own open() hand back our in-memory stream
    with patch('biseqt.database.open', create=True) as open_mock:
        open_mock.return_value = MagicMock(spec=file, wraps=fasta)
        first = db.load_from_record(inserted[0])
        assert first == S, \
            'load_from_record should work without an open file handle'
Beispiel #2
0
def test_write_fasta():
    """write_fasta round-trips, rejects duplicate names and honours width."""
    alphabet = Alphabet('ACGT')
    S = alphabet.parse('AAA', name='foo')
    T = alphabet.parse('TTT', name='bar')

    # round trip through a real temporary file
    with NamedTemporaryFile() as tmp:
        write_fasta(tmp, [S, T])
        tmp.seek(0)
        parsed = [seq for seq, _ in read_fasta(tmp, alphabet)]
        assert parsed == [S, T], \
            'read_fasta(write_fasta()) should be identity'

    # exact output on an in-memory stream
    buf = StringIO('')
    write_fasta(buf, [S, T])
    buf.seek(0)
    contents = buf.read()
    assert contents == '>foo\nAAA\n>bar\nTTT\n', 'should work on StringIO'

    # duplicate names not allowed
    buf = StringIO('')
    with pytest.raises(AssertionError):
        write_fasta(buf, [S, S])

    # a 6-letter sequence at width 3: one header line plus two content lines
    buf = StringIO('')
    S = alphabet.parse('AAATTT', name='foo')
    write_fasta(buf, [S], width=3)
    buf.seek(0)
    num_lines = len(buf.readlines())
    assert num_lines == 3, 'FASTA width should be modifiable'
Beispiel #3
0
def test_lossless_reads():
    """rand_read samples exact substrings with controllable count and length.

    Covers argument validation, the number of reads produced, that reads are
    noise-free substrings of the source, that read lengths vary, and sampling
    from an alphabet whose letters are longer than one character.
    """
    A = Alphabet('ACGT')
    S = rand_seq(A, 100)
    with pytest.raises(AssertionError):
        next(rand_read(S, len_mean=200, num=1))  # len_mean must be < len(S)
    with pytest.raises(AssertionError):
        # at most one of num or expected_coverage given
        next(rand_read(S, len_mean=50, num=1, expected_coverage=1))

    assert sum(1 for _ in rand_read(S, len_mean=50, num=10)) == 10, \
        'The number of sampled reads should be controllable'
    assert sum(1 for _ in rand_read(S, len_mean=50)) == 1, \
        'If neither num or expected coverage is given only one sample is read'

    # there should be no noise added
    read, pos = next(rand_read(S, len_mean=40, num=1))
    assert S[pos:pos+len(read)] == read

    S = A.parse('ACT' * 100)
    reads = list(rand_read(S, len_mean=100, len_sd=0.01, num=100))
    # Count the *distinct lengths*; comparing the set itself against an int
    # does not test anything (and is a TypeError on Python 3).
    # NOTE(review): with len_sd=0.01 this relies on rand_read's rounding to
    # yield more than one length -- confirm against rand_read.
    distinct_lengths = set(len(read) for read, _ in reads)
    assert len(distinct_lengths) > 1, \
        'Read lengths should be randomly chosen'
    len_mean = sum(len(read) for read, _ in reads) / 100.
    assert 50 < len_mean < 150, \
        'Normal distribution of read lengths works'

    # index edge cases
    A = Alphabet(['00', '01'])
    S = A.parse('01' * 10)
    _bak = np.random.normal
    np.random.normal = mock.Mock(return_value=[1])
    try:
        assert next(rand_read(S, len_mean=1, num=1))[0] == A.parse('01'), \
            'sequences in alphabets with > 1 long letters can be sampled too'
    finally:
        # always undo the monkeypatch so a failure here cannot leak the mock
        # into other tests
        np.random.normal = _bak
Beispiel #4
0
def test_database_insert():
    """DB.insert returns a populated record and stores the content id.

    Checks the fields of the record returned for a sequence inserted with
    source information and attributes, reads the stored ``content_id`` back
    with a raw query, and verifies that a second sequence gets a new id.
    """
    A = Alphabet('ACGT')
    S = A.parse('AACT', name='foo')
    db = DB(':memory:', A)
    db.initialize()
    attrs = {'key': 'value'}
    rec = db.insert(S, source_file='source.fa', source_pos=10, attrs=attrs)
    assert isinstance(rec.id, int)
    assert rec.content_id == S.content_id
    assert rec.source_pos == 10
    assert rec.source_file == 'source.fa'
    assert 'key' in rec.attrs and rec.attrs['key'] == 'value', \
        'attributes must be populated correctly'
    with db.connection() as conn:
        cursor = conn.cursor()
        cursor.execute('SELECT content_id FROM sequence WHERE id = ?',
                       (rec.id,))
        # NOTE for some reason if we just say next(cursor) ==  ...
        # the cursor remains open after the context is over (which should
        # not happen as per docs). This leads to BusyError further down.
        assert cursor.fetchall() == [(S.content_id,)], \
            'content identifier is properly populated'

    # add a second sequence
    T = A.parse('GCTG', name='bar')
    new_rec = db.insert(T)
    assert new_rec.id != rec.id, 'new ids are assigned to new sequences'
    with db.connection() as conn:
        cursor = conn.cursor()
        cursor.execute('SELECT content_id FROM sequence WHERE id = ?',
                       (new_rec.id,))
        # exhaust the cursor with fetchall() here too, for the same reason
        # as in the first query above
        assert cursor.fetchall() == [(T.content_id,)], \
            'correct id must be populated'
Beispiel #5
0
def test_alignment_std_local(err):
    """Local alignment in standard mode scores at least the known transcript.

    A random sequence is mutated and the mutant is padded on both sides; the
    aligner must report a score no worse than that of the true mutation
    transcript, its traceback must reproduce the reported score, and the
    alignment must not span the whole padded mutant.

    :param err: probability used for substitutions, gap open and gap extend
        (presumably supplied by a parametrization or fixture not visible
        here).
    """
    alphabet = Alphabet('ACGT')
    process = MutationProcess(alphabet, subst_probs=err, go_prob=err,
                              ge_prob=err)
    subst_scores, gap_scores = process.log_odds_scores()
    go_score, ge_score = gap_scores

    S = rand_seq(alphabet, 100)
    mutant, tx = process.mutate(S)
    # pad the mutant on both sides
    left_pad = alphabet.parse('A' * 100)
    right_pad = alphabet.parse('G' * 100)
    T = left_pad + mutant + right_pad

    known = Alignment(S, T, tx)
    mutation_score = known.calculate_score(subst_scores, go_score, ge_score)

    aligner = Aligner(S, T, alnmode=STD_MODE, alntype=LOCAL,
                      subst_scores=subst_scores, go_score=go_score,
                      ge_score=ge_score)
    with aligner:
        reported_score = aligner.solve()
        assert round(reported_score, 3) >= round(mutation_score, 3), \
            'optimal alignment scores better than the known transcript'

        alignment = aligner.traceback()
        aln_score = alignment.calculate_score(subst_scores, go_score,
                                              ge_score)
        assert round(aln_score, 3) == round(reported_score, 3), \
            'The alignment score should be calculated correctly'

        transcript = alignment.transcript
        ori_len = Alignment.projected_len(transcript, on='origin')
        mut_len = Alignment.projected_len(transcript, on='mutant')
        assert ori_len <= len(S) and mut_len < len(T), \
            'Local alignments do not cover the entirety of both sequences'
Beispiel #6
0
def test_sequence_parsing():
    """Alphabets with two-character letters parse strings letter by letter."""
    alphabet = Alphabet(['00', '01', '10', '11'])

    # three characters cannot be split into two-character letters
    with pytest.raises(AssertionError):
        alphabet.parse('000')

    parsed = alphabet.parse('001011')
    assert len(parsed) == 3 and parsed == Sequence(alphabet, [0, 2, 3]), \
        'alphabets with > 1 long letters should be able to parse strings'
Beispiel #7
0
def test_lossy_reads():
    """Noisy reads carry both substitutions and indels in their transcript."""
    alphabet = Alphabet('ACGT')
    S = alphabet.parse('ACT' * 100)
    process = MutationProcess(alphabet, subst_probs=0.3, go_prob=0.2,
                              ge_prob=0.3)
    read, pos, tx = next(process.noisy_read(S, len_mean=50, num=1))

    num_substs = tx.count('S')
    num_indels = tx.count('I') + tx.count('D')
    assert num_substs > 0 and num_indels > 0, \
        'Random mutations should be performed to get lossy reads'
Beispiel #8
0
def test_named_sequence_tranforms():
    """Reversing or transforming a named sequence gives a named sequence."""
    alphabet = Alphabet('ACGT')
    S = alphabet.parse('AACT', name='foo')

    reversed_seq = S.reverse(name='bar')
    assert reversed_seq == alphabet.parse('TCAA', name='bar'), \
        'reverse of named sequences should be a named sequence'

    complement = S.transform(mappings=['AT', 'CG'], name='bar')
    expected = alphabet.parse('TTGA', name='bar')
    assert complement == expected, \
        'result of transforming a named sequence is a named sequence'

    # when no name is given, the generated name mentions the transformation
    auto_named = S.transform(mappings=['AT', 'CG'])
    assert 'transformed' in auto_named.name
Beispiel #9
0
def test_named_sequence():
    """Parsing with a name yields a NamedSequence with sane identity."""
    A = Alphabet('ACGT')
    S = A.parse('AACT', name='foo')
    raw = str(S)

    assert isinstance(S, NamedSequence)

    rebuilt = eval(repr(S))
    assert rebuilt == S, 'repr() should provide eval-able string'

    assert S.name == 'foo'

    # same contents under a different name
    renamed = A.parse(raw, name='bar')
    assert S.content_id == renamed.content_id, \
        'content id should only depend on the contents of the sequence'

    # same contents under the same name
    twin = A.parse(raw, name='foo')
    assert S == twin, \
        'equality should work'
Beispiel #10
0
def test_pw_render_basic():
    """Alignment.render_term handles colors, margins, deletions, insertions.

    Exercises toggling of colored output, validation of ``margin`` and
    ``term_width``, margins of various sizes, and the placement of ``-`` for
    deletions and insertions.
    """
    A = Alphabet('ACGT')
    S = A.parse('AACT')
    size = len(S)
    escape = '\033'

    aln = Alignment(S, S, 'M' * size)
    plain = aln.render_term(colored=False)
    assert escape not in plain, \
        'colored output should allow being turned off'
    fancy = aln.render_term(colored=True)
    assert escape in fancy, \
        'colored output should allow being turned on'

    # validate input
    with pytest.raises(AssertionError):
        aln.render_term(margin=-1)
    with pytest.raises(AssertionError):
        aln.render_term(term_width=5)

    aln = Alignment(S + S, S + S, 'M' * size, origin_start=size)
    no_margin = aln.render_term(margin=0, colored=False)
    assert '[%d]' % size in no_margin, 'margin should allow being turned off'

    with_margin = aln.render_term(margin=1, colored=False)
    assert '[%d]' % (size - 1) in with_margin, \
        'margin should allow being turned on'

    # shouldn't choke on too large margins
    full_margin = aln.render_term(margin=30, colored=False)
    assert str(S) + '.' * size in full_margin, 'overhanging margins work'
    widths = set(len(row) for row in full_margin.rstrip().split('\n'))
    assert len(widths) == 1, \
        'both lines of the output should have the same length'

    # deletion:
    #   AACT
    #   AG-T
    aln = Alignment(S + S, A.parse('AGT'), 'MSDM', origin_start=size)
    with_del = aln.render_term(colored=False)
    assert 'AG-T' in with_del, 'deletions are represented by - in mutant'
    origin_row, mutant_row = with_del.rstrip().split('\n')[:2]
    assert origin_row.index('C') == mutant_row.index('-'), \
        'deleted content and - should be aligned'
    # shouldn't crash when printing deletions with color
    aln.render_term(colored=True)

    # insertion:
    #   AAC-T
    #   AACGT
    aln = Alignment(S + S, A.parse('AACGT'), 'MMMIM', origin_start=size)
    with_ins = aln.render_term(colored=False)
    assert 'AAC-T' in with_ins, 'insertions are represented by - in origin'
    origin_row, mutant_row = with_ins.rstrip().split('\n')[:2]
    assert origin_row.index('-') == mutant_row.index('G'), \
        'inserted content and - should be aligned'
    # shouldn't crash when printing with color
    aln.render_term(colored=True)
Beispiel #11
0
def test_pw_render_width():
    """render_term wraps long alignments to the requested terminal width.

    Renders an alignment of length ``N`` with an oversized margin at a
    terminal width of ``N // 2`` and checks that no line exceeds the width,
    at least one line uses it fully, and at most two line lengths occur.
    """
    A = Alphabet('ACGT')
    N = 100
    S = A.parse('A' * (2 * N))
    # floor division keeps term_width an int regardless of whether `/` is
    # true division (Python 3 or `from __future__ import division`)
    tx, term_width = 'M' * N, N // 2
    aln = Alignment(S, S, tx, origin_start=N)
    render = aln.render_term(margin=2*N, colored=False, term_width=term_width)
    line_lens = [len(line) for line in render.rstrip().split('\n')]
    assert all(length <= term_width for length in line_lens), \
        'terminal width should be adjustable'
    assert any(length == term_width for length in line_lens), \
        'terminal width should be fully used'
    assert len(set(line_lens)) <= 2, \
        'alignments longer than terminal width should work'
Beispiel #12
0
def test_database_overwrite():
    """Re-inserting a sequence with known content keeps the first record."""
    alphabet = Alphabet('ACGT')
    S = alphabet.parse('AACT', name='foo')
    db = DB(':memory:', alphabet)
    db.initialize()

    # same content twice, only the source file differs
    for source in ('old_source.fa', 'new_source.fa'):
        db.insert(S, source_file=source)

    query = 'SELECT source_file FROM sequence WHERE content_id = ?'
    with db.connection() as conn:
        cursor = conn.cursor()
        cursor.execute(query, (S.content_id,))
        sources = [row[0] for row in cursor]
        assert len(sources) == 1 and sources[0] == 'old_source.fa', \
            'Sequences with observed content id should be ignored'
Beispiel #13
0
def test_sequence_magic():
    """Sequence supports str, len, ==, truthiness, indexing and +."""
    alphabet = Alphabet('HT')
    contents = [0, 1, 0, 1]
    S = Sequence(alphabet, contents)

    assert str(S) == 'HTHT'
    assert len(S) == 4, 'len() should work'

    same = Sequence(alphabet, contents)
    assert S == same, 'equals if same contents and alphabet'
    assert S and not Sequence(alphabet, []), 'truthy iff not empty'

    assert S[0] == 0, 'indexing by int should give an int'
    head = S[0:1]
    assert isinstance(head, Sequence) and str(head) == 'H', \
        'indexing by a slice should give another sequence object'

    expected = alphabet.parse('HTHTTT')
    assert S + alphabet.parse('TT') == expected, 'add by appending'
    assert S + 'TT' == alphabet.parse('HTHTTT'), \
        'add by appending raw sequences'
Beispiel #14
0
def test_read_fasta_basic():
    """read_fasta yields (NamedSequence, position) and rejects duplicates."""
    alphabet = Alphabet('ACGT')

    with NamedTemporaryFile() as tmp:
        # a single record whose contents are split by a blank line
        tmp.write('> name1\nAAA\n\nTTT')
        tmp.flush()
        tmp.seek(0)
        recs = list(read_fasta(tmp, alphabet))
        assert len(recs) == 1, 'should work when reading from file'

        seq, pos = recs[0][0], recs[0][1]
        assert seq == alphabet.parse('AAATTT', name='name1'), \
            'should properly parse what is in the file'
        assert isinstance(seq, NamedSequence), \
            'should return NamedSequence objects'
        assert pos == 0, 'should report the right file positions'

    # duplicate names not allowed
    duplicated = StringIO('>name\nAAA\n> name\nTTT\n')
    with pytest.raises(AssertionError):
        list(read_fasta(duplicated, alphabet))
Beispiel #15
0
def test_database_find():
    """DB.find filters records by raw SQL or by a Python predicate."""
    alphabet = Alphabet('ACGT')
    S = alphabet.parse('AACT', name='foo')
    T = alphabet.parse('GGCT', name='bar')
    db = DB(':memory:', alphabet)
    db.initialize()
    for seq in (S, T):
        db.insert(seq)

    # match on the serialized attributes column
    sql_condition = "attrs LIKE '%s'" % '%"name": "bar"%'
    by_sql = list(db.find(sql_condition=sql_condition))
    assert len(by_sql) == 1 and by_sql[0].content_id == T.content_id, \
        'find() should work with sql_condition'

    def condition(rec):
        return rec.attrs['name'] == 'foo'

    by_callable = list(db.find(condition=condition))
    assert len(by_callable) == 1 and \
        by_callable[0].content_id == S.content_id, \
        'find() should work with callable condition'
Beispiel #16
0
def test_database_events():
    """Registered listeners fire on initialization and sequence insertion."""
    alphabet = Alphabet('ACGT')
    S = alphabet.parse('AACT', name='S')

    # NOTE python 2 does not support non-local, non-global variables, so the
    # call counter lives on the function object.
    test_database_events.callback_called = 0

    def callback(self, *args):
        test_database_events.callback_called += 1

    db = DB(':memory:', alphabet)
    for event in ('db-initialized', 'sequence-inserted'):
        db.add_event_listener(event, callback)

    db.initialize()
    calls = test_database_events.callback_called
    assert calls == 1, \
        'event callbacks for "initialize" should be executed'

    db.insert(S)
    calls = test_database_events.callback_called
    assert calls == 2, \
        'event callbacks for "insert-sequence" should be executed'
Beispiel #17
0
def test_sequence_transforms():
    """transform() accepts dict or list mappings; reverse() flips letters."""
    alphabet = Alphabet(['00', '01', '11'])
    S = alphabet.parse('0001')

    one_way = S.transform(mappings={'00': '01'})
    assert one_way == alphabet.parse('0101'), \
        'dict mappings are unidirectional'

    # list mappings may be given as letters or as letter indices
    swapped_by_letter = S.transform(mappings=[('00', '01')])
    assert swapped_by_letter == alphabet.parse('0100'), \
        'list mappings are bidirectional'
    swapped_by_index = S.transform(mappings=[(0, 1)])
    assert swapped_by_index == alphabet.parse('0100'), \
        'list mappings are bidirectional'

    S = alphabet.parse('0011')
    partial = S.transform(mappings={'00': '01'})
    assert partial == alphabet.parse('0111'), \
        'unmapped letters remain untouched'
    flipped = S.reverse()
    assert flipped == alphabet.parse('1100'), 'reverse() works'
Beispiel #18
0
def test_database_populate_fasta_rc():
    """Loading FASTA with rc=True also stores reverse complements.

    Each of the two sequences must yield two records; the extra records
    carry an ``rc_of`` attribute naming their origin and can be loaded back
    from the FASTA stream as the reverse complement.
    """
    alphabet = Alphabet('ACGT')
    S = alphabet.parse('AACT', name='S')
    T = alphabet.parse('GCAT', name='T')

    db = DB(':memory:', alphabet)
    db.initialize()

    fasta = StringIO()
    write_fasta(fasta, [S, T])
    fasta.seek(0)
    inserted = db.load_fasta(fasta, rc=True)
    assert len(inserted) == 4

    origins = [rec.attrs['rc_of'] for rec in inserted
               if 'rc_of' in rec.attrs]
    assert origins == [S.content_id, T.content_id], \
        'reverse complements should know what their origin is'

    def cond_T_rc(r):
        return r.attrs.get('rc_of', None) == T.content_id

    found_T_rc = next(db.find(condition=cond_T_rc))
    reversed_T = T.reverse()
    T_rc = reversed_T.transform(['AT', 'CG'], name='(rc) ' + T.name)
    loaded = db.load_from_record(found_T_rc, fasta)
    assert loaded == T_rc, \
        'reverse complements should load properly from a record'
Beispiel #19
0
def test_mutation_process():
    """MutationProcess accepts its parameters and mutates accordingly.

    Covers substitution probabilities given as a float or as a matrix,
    all-zero mutation probabilities, substitution-only processes, validation
    of gap probabilities, and gap-only processes.
    """
    A = Alphabet('ACGT')
    S = A.parse('ACT' * 100)
    gap_kw = {'go_prob': 0, 'ge_prob': 0}

    def _check_subst_probs(subst_probs, msg):
        """Assert ``subst_probs`` is the matrix with .7 on the diagonal and
        .1 elsewhere.

        Checks the matrix that was passed in rather than reaching for the
        enclosing ``M``, so the helper verifies what each call site hands it.
        """
        num_letters = len(A)
        assert len(subst_probs) == num_letters
        for i in range(num_letters):
            correct = [.7 if i == j else .1 for j in range(num_letters)]
            assert np.allclose(subst_probs[i], correct), msg

    M = MutationProcess(A, subst_probs=.3)
    _check_subst_probs(M.subst_probs,
                       'Substitution probabilities given as a single float')
    M = MutationProcess(A, subst_probs=[[.7 if i == j else .1
                                         for j in range(len(A))]
                                        for i in range(len(A))])
    _check_subst_probs(M.subst_probs,
                       'Substitution probabilities given as a matrix')

    T, tx = MutationProcess(A, subst_probs=0, **gap_kw).mutate(S)
    assert T == S and tx == 'MMM' * 100, \
        'all mutation probabilities can be set to zero'

    T, tx = MutationProcess(A, subst_probs=0.1, **gap_kw).mutate(S)
    assert all(op in 'MS' for op in tx) and 'S' in tx, \
        'there can be mutation processes with only substitutions'

    T, tx = MutationProcess(A, subst_probs=0.01, **gap_kw).mutate(S)
    assert tx.count('S') < 0.1 * len(S), 'substitution probabilities work'

    with pytest.raises(AssertionError):
        MutationProcess(A, go_prob=0.2, ge_prob=0.1)  # go_prob <= ge_prob

    gap_kw = {'go_prob': 0.05, 'ge_prob': 0.1}
    T, tx = MutationProcess(A, subst_probs=0, **gap_kw).mutate(S)
    indels = sum(1 for op in tx if op in 'ID')
    assert 0 < indels < 0.5 * len(S), 'gap probabilities work'
Beispiel #20
0
def test_pw_render_longlet():
    """Rendering pads deletions to the width of multi-character letters."""
    alphabet = Alphabet(['00', '11'])
    origin = alphabet.parse('0011')
    mutant = alphabet.parse('11')
    # one deleted letter followed by one match
    aln = Alignment(origin, mutant, 'DM')
    rendered = aln.render_term(colored=False)
    assert '--11' in rendered, \
        'alphabets with > 1 long letters should be rendered properly'
Beispiel #21
0
def test_rand_seq():
    """rand_seq builds its result from what np.random.choice returns.

    ``np.random.choice`` is temporarily replaced by a mock returning three
    zeros, so the sequence produced must be ``AAA``.
    """
    _bak = np.random.choice
    np.random.choice = mock.Mock(return_value=[0, 0, 0])
    try:
        A = Alphabet('ACGT')
        assert rand_seq(A, 10) == A.parse('AAA')
    finally:
        # always undo the monkeypatch so a failure here cannot leak the mock
        # into other tests
        np.random.choice = _bak