def test_database_insert():
    """Inserted sequences get integer ids and carry the metadata passed in."""
    alphabet = Alphabet('ACGT')
    first_seq = alphabet.parse('AACT', name='foo')
    db = DB(':memory:', alphabet)
    db.initialize()

    select_content_id = 'SELECT content_id FROM sequence WHERE id = ?'

    first_rec = db.insert(first_seq, source_file='source.fa', source_pos=10,
                          attrs={'key': 'value'})
    assert isinstance(first_rec.id, int)
    assert first_rec.content_id == first_seq.content_id
    assert first_rec.source_pos == 10
    assert first_rec.source_file == 'source.fa'
    assert 'key' in first_rec.attrs and first_rec.attrs['key'] == 'value', \
        'attributes must be populated correctly'

    with db.connection() as conn:
        cursor = conn.cursor()
        cursor.execute(select_content_id, (first_rec.id,))
        # NOTE fetchall() is used on purpose here: with next(cursor) the
        # cursor was observed to stay open after the context exits (contrary
        # to the docs), which leads to a BusyError further down.
        rows = cursor.fetchall()
        assert rows == [(first_seq.content_id,)], \
            'content identifier is properly populated'

    # a second, different sequence must get its own id
    second_seq = alphabet.parse('GCTG', name='bar')
    second_rec = db.insert(second_seq)
    assert second_rec.id != first_rec.id, \
        'new ids are assigned to new sequences'

    with db.connection() as conn:
        cursor = conn.cursor()
        cursor.execute(select_content_id, (second_rec.id,))
        assert next(cursor) == (second_seq.content_id,), \
            'correct id must be populated'
def test_database_populate_fasta():
    """Loading a FASTA stream records its sequences and allows reloading."""
    alphabet = Alphabet('ACGT')
    seq_s = alphabet.parse('AACT', name='S')
    seq_t = alphabet.parse('GCAT', name='T')

    db = DB(':memory:', alphabet)
    db.initialize()

    # An in-memory FASTA "file"; the name is what ends up as source_file.
    fasta = StringIO()
    fasta.name = '/x.fasta'
    write_fasta(fasta, [seq_s, seq_t])
    fasta.seek(0)

    records = db.load_fasta(fasta, rc=False)
    assert len(records) == 2
    assert all(isinstance(rec, Record) for rec in records)
    assert all(rec.source_file == fasta.name for rec in records), \
        'source file of sequence records must be set'

    reloaded = [db.load_from_record(rec, fasta) for rec in records]
    assert reloaded == [seq_s, seq_t], \
        'should be able to retrieve sequences by position in source'

    # Without an explicit handle, open() is patched in biseqt.database so
    # that it hands back a wrapper around our in-memory stream.
    with patch('biseqt.database.open', create=True) as open_mock:
        open_mock.return_value = MagicMock(spec=file, wraps=fasta)
        assert db.load_from_record(records[0]) == seq_s, \
            'load_from_record should work without an open file handle'
def test_database_basic():
    """A DB can be initialized repeatedly and rejects an unusable path."""
    alphabet = Alphabet('ACGT')
    db = DB(':memory:', alphabet)

    # initialize() should be callable more than once
    for _ in range(2):
        db.initialize()

    with db.connection() as conn:
        # the query only succeeds if a sequence table was created
        conn.cursor().execute('SELECT * FROM sequence LIMIT 1;')

    with pytest.raises(AssertionError):
        DB('/cannot/possibly/exist/directory/', alphabet)
def test_database_overwrite():
    """Re-inserting a sequence with known content keeps the first record."""
    alphabet = Alphabet('ACGT')
    seq = alphabet.parse('AACT', name='foo')

    db = DB(':memory:', alphabet)
    db.initialize()
    for source in ('old_source.fa', 'new_source.fa'):
        db.insert(seq, source_file=source)

    query = 'SELECT source_file FROM sequence WHERE content_id = ?'
    with db.connection() as conn:
        cursor = conn.cursor()
        cursor.execute(query, (seq.content_id,))
        source_files = [row[0] for row in cursor]

    assert len(source_files) == 1 and source_files[0] == 'old_source.fa', \
        'Sequences with observed content id should be ignored'
def test_database_find():
    """find() filters records by a raw SQL condition or a Python callable."""
    alphabet = Alphabet('ACGT')
    seq_foo = alphabet.parse('AACT', name='foo')
    seq_bar = alphabet.parse('GGCT', name='bar')

    db = DB(':memory:', alphabet)
    db.initialize()
    for seq in (seq_foo, seq_bar):
        db.insert(seq)

    # Match on the serialized attrs column.
    sql_condition = "attrs LIKE '%s'" % '%"name": "bar"%'
    by_sql = list(db.find(sql_condition=sql_condition))
    assert len(by_sql) == 1 and by_sql[0].content_id == seq_bar.content_id, \
        'find() should work with sql_condition'

    def is_foo(rec):
        return rec.attrs['name'] == 'foo'

    by_callable = list(db.find(condition=is_foo))
    assert len(by_callable) == 1 and \
        by_callable[0].content_id == seq_foo.content_id, \
        'find() should work with callable condition'
def test_database_events():
    """Registered listeners fire on initialization and on insertion."""
    alphabet = Alphabet('ACGT')
    seq = alphabet.parse('AACT', name='S')

    # NOTE python 2 has no `nonlocal`, so the call counter is stored on the
    # test function object itself.
    test_database_events.callback_called = 0

    def count_call(self, *args):
        test_database_events.callback_called += 1

    db = DB(':memory:', alphabet)
    for event in ('db-initialized', 'sequence-inserted'):
        db.add_event_listener(event, count_call)

    db.initialize()
    assert test_database_events.callback_called == 1, \
        'event callbacks for "initialize" should be executed'

    db.insert(seq)
    assert test_database_events.callback_called == 2, \
        'event callbacks for "insert-sequence" should be executed'
def test_database_populate_fasta_rc():
    """Loading with rc=True also stores reverse complements linked by rc_of."""
    alphabet = Alphabet('ACGT')
    seq_s = alphabet.parse('AACT', name='S')
    seq_t = alphabet.parse('GCAT', name='T')

    db = DB(':memory:', alphabet)
    db.initialize()

    fasta = StringIO()
    write_fasta(fasta, [seq_s, seq_t])
    fasta.seek(0)

    records = db.load_fasta(fasta, rc=True)
    # two sequences plus one reverse complement each
    assert len(records) == 4

    origins = [rec.attrs['rc_of'] for rec in records if 'rc_of' in rec.attrs]
    assert origins == [seq_s.content_id, seq_t.content_id], \
        'reverse complements should know what their origin is'

    def is_rc_of_t(rec):
        return rec.attrs.get('rc_of', None) == seq_t.content_id

    rc_record = next(db.find(condition=is_rc_of_t))
    expected_rc = seq_t.reverse().transform(['AT', 'CG'],
                                            name='(rc) ' + seq_t.name)
    assert db.load_from_record(rc_record, fasta) == expected_rc, \
        'reverse complements should load properly from a record'
class ReadMapper(object):
    """Maps reads held in a sequence database against each other or against
    reference sequences.

    Wraps a :class:`DB`, a :class:`KmerIndex` built on it and a
    :class:`SeedIndex` built on the kmer index.

    Attributes:
        db: The sequence database.
        kmer_index: Kmer index over ``db`` with the given word length.
        seed_index: Seed index built on top of ``kmer_index``.
        bands_indexed (bool): Whether :func:`index_bands` has been called;
            the ``map_all_to_*`` methods require it.
    """
    def __init__(self, alphabet, wordlen, db_path):
        self.db = DB(db_path, alphabet)
        self.kmer_index = KmerIndex(self.db, wordlen)
        self.seed_index = SeedIndex(self.kmer_index)
        self.bands_indexed = False

    def log(self, *args, **kwargs):
        """Forwards all arguments to the database's ``log``."""
        self.db.log(*args, **kwargs)

    def initialize(self, reads_fa, refs_fa=None, num_reads=-1):
        """Initializes the database and loads reads (and optionally
        references) from FASTA files.

        Args:
            reads_fa (str): Path to the reads FASTA file; loaded with
                ``rc=True``.
            refs_fa (str|None): Path to the references FASTA file; loaded
                with ``rc=False``. Skipped when ``None``.
            num_reads (int): Passed as ``num`` to ``load_fasta`` for the
                reads file.
        """
        self.db.initialize()
        with open(reads_fa) as f:
            self.db.load_fasta(f, num=num_reads, rc=True)
        if refs_fa is not None:
            with open(refs_fa) as f:
                self.db.load_fasta(f, rc=False)

    def index_bands(self, **kw):
        """Scores kmers and diagonals and marks bands as indexed.

        Keyword arguments are forwarded to ``SeedIndex.score_diagonals``.
        """
        self.kmer_index.score_kmers()
        self.seed_index.score_diagonals(**kw)
        self.bands_indexed = True

    def _records_by_content_id(self):
        """Returns a dict of all database records keyed by content id."""
        return {rec.content_id: rec for rec in self.db.find()}

    def load_reads(self):
        """Loads all reads, i.e. records that have a reverse complement.

        A record carrying an ``rc_of`` attribute is paired with the record
        whose content id it points to.

        Returns:
            list: ``Read`` objects sorted by the id of their ``record``.
        """
        recs_by_content_id = self._records_by_content_id()
        reads = []
        for record in recs_by_content_id.values():
            if 'rc_of' in record.attrs:
                pair = (recs_by_content_id[record.attrs['rc_of']], record)
                reads.append(Read(self.seed_index, *pair))
        return sorted(reads, key=lambda read: read.record.id)

    def load_refs(self):
        """Loads all records that are not part of a read/reverse-complement
        pair.

        Returns:
            list: The records that neither carry an ``rc_of`` attribute nor
            are referred to by another record's ``rc_of``.
        """
        recs_by_content_id = self._records_by_content_id()
        # Collect ids to exclude first: removing entries while iterating the
        # dict raises RuntimeError on Python 3.
        read_content_ids = set()
        for record in recs_by_content_id.values():
            if 'rc_of' in record.attrs:
                read_content_ids.add(record.attrs['rc_of'])
                read_content_ids.add(record.content_id)
        return [rec for content_id, rec in recs_by_content_id.items()
                if content_id not in read_content_ids]

    def map_all_to_all(self, min_band_score, **aligner_kw):
        """Maps every read against every read with a larger record id.

        Args:
            min_band_score: Forwarded to ``Read.map``.
            **aligner_kw: Forwarded to ``Read.map``.

        Yields:
            tuple: The 3-tuple returned by ``Read.map`` (unpacked here as
            record, target record and alignment), skipping results whose
            first element is ``None``.
        """
        assert self.bands_indexed, 'Bands must be indexed first'
        self.log('Mapping all reads against each other')
        reads = self.load_reads()  # NOTE comes in sorted order of id
        indic = ProgressIndicator(num_total=len(reads))
        indic.start()
        for read in reads:
            indic.progress()
            # NOTE only compare to reads after us
            others = (r.record for r in reads if r.record.id > read.record.id)
            for other in others:
                rec, target_rec, aln = read.map(other,
                                                min_band_score=min_band_score,
                                                **aligner_kw)
                if rec is None:
                    continue
                yield rec, target_rec, aln
        indic.finish()

    def map_all_to_refs(self, min_band_score, **aligner_kw):
        """Maps every read against the reference sequences.

        Args:
            min_band_score: Forwarded to ``Read.map``.
            **aligner_kw: Forwarded to ``Read.map``.

        Yields:
            tuple: The 3-tuple returned by ``Read.map`` (unpacked here as
            record, target record and alignment), skipping results whose
            first element is ``None``.
        """
        # FIXME it would be nice to only calculate bands for read v. ref not
        # all pairwise of reads too
        assert self.bands_indexed, 'Bands must be indexed first'
        self.log('Mapping all reads against reference sequences')
        reads, refs = self.load_reads(), self.load_refs()
        indic = ProgressIndicator(num_total=len(reads))
        indic.start()
        for read in reads:
            indic.progress()
            rec, target_rec, aln = read.map(refs,
                                            min_band_score=min_band_score,
                                            **aligner_kw)
            if rec is not None:
                yield rec, target_rec, aln
        indic.finish()

    def mappings_from_sam(self, sampath):
        """Loads mappings from a SAM mapping file and translates query names
        to the reads stored in the database.

        Mappings whose query name does not match the ``name`` attribute of
        any known read are skipped.

        Args:
            sampath (str): The path to SAM mappings file.

        Yields:
            tuple: A 3-tuple containing the read (as produced by
            :func:`load_reads`), the reference name and the
            ``pysam.calignedsegment.AlignedSegment`` mapping it to the
            reference.
        """
        self.log('Loading SAM mappings from %s.' % sampath)
        reads_by_name = {r.record.attrs['name']: r for r in self.load_reads()}
        samfile = pysam.AlignmentFile(sampath)
        try:
            for mapping in samfile.fetch():
                qname, rname = mapping.query_name, mapping.reference_name
                # NOTE this is because BLASR does a weird thing with sequence
                # names
                qname = qname.rsplit('/', 1)[0]
                if qname not in reads_by_name:
                    continue
                yield reads_by_name[qname], rname, mapping
        finally:
            # Release the file handle whether the generator is exhausted,
            # closed early or aborted by an error.
            samfile.close()

    def overlaps_from_sam_mappings(self, sampath, min_overlap=-1):
        """Finds all pairs of overlapping sequences based on their mappings
        to a reference.

        Only one mapping is kept per read (keyed by the id of its record).
        Two reads are considered overlapping if they map to the same
        reference and their reference intervals intersect.

        Args:
            sampath (str): The path to SAM mappings file.
            min_overlap (int): The minimum required length for overlaps to be
                reported; default is -1 in which case no overlap is excluded.

        Yields:
            tuple: A pair of sequence records deemed as overlapping based on
            SAM mappings; the reads are visited in increasing order of record
            id. Each overlapping pair of reads yields two record pairs: same
            strand pairs (record, record) and (rc, rc), opposite strand pairs
            (record, rc) and (rc, record).
        """
        self.log('Finding overlaps from SAM mappings.')
        mappings = {read.record.id: (read, ref, mapping)
                    for read, ref, mapping in self.mappings_from_sam(sampath)}
        seqids = sorted(mappings.keys())
        for id0, id1 in combinations(seqids, 2):
            (r0, ref0, map0), (r1, ref1, map1) = mappings[id0], mappings[id1]
            if ref0 != ref1:
                continue
            # TODO ignoring query_alignment_start and query_alignment_end
            overlap_len = min(map0.reference_end, map1.reference_end) - \
                max(map0.reference_start, map1.reference_start)
            if overlap_len <= 0 or overlap_len < min_overlap:
                continue
            # FIXME the second thing we yield is not reported by our own
            # map_all_to_all.
            if map0.is_reverse == map1.is_reverse:
                yield r0.record, r1.record
                yield r0.rc_record, r1.rc_record
            else:
                yield r0.record, r1.rc_record
                yield r0.rc_record, r1.record