def sequencing_sample(request):
    """Creates a random sequence and generates mutated reads from it.

    Reads have equal length (before mutation) and start at whole multiples
    of half the read length, so successive reads overlap starting at their
    halfway position. The mutated reads are inserted into an in-memory
    database on top of which a kmer index and a seed index are created.

    Args:
        request: An object whose ``param`` attribute unpacks to
            ``(gap_prob, subst_prob, wordlen)``; presumably the pytest
            fixture request of a parameterized fixture.

    Returns:
        tuple: A 5-tuple containing the full genome, the list of reads, the
        list of database records of the reads (in the same order), the gap
        probability, and the seed index.
    """
    A = Alphabet('ACGT')
    gap_prob, subst_prob, wordlen = request.param
    seq_len, read_len = 2000, 500
    seq = rand_seq(A, seq_len).to_named('genome')
    # the same probability is used for opening and for extending a gap
    mutation_process = MutationProcess(A, subst_probs=subst_prob,
                                       go_prob=gap_prob, ge_prob=gap_prob)
    reads = []
    # NOTE the stop value is exclusive, so the last read starts at
    # seq_len - read_len - read_len // 2 and the trailing half read length
    # of the genome is not covered by any read.
    for i in range(0, seq_len - read_len, read_len // 2):
        read, _ = mutation_process.mutate(seq[i: i + read_len])
        reads.append(read.to_named('read#%d' % i))

    db = DB(':memory:', A)
    kmer_index = KmerIndex(db, wordlen)
    seed_index = SeedIndex(kmer_index)
    seed_index.db.initialize()
    records = [db.insert(r) for r in reads]
    return seq, reads, records, gap_prob, seed_index
def test_database_basic():
    """Checks repeated initialization, table creation and bad paths."""
    alphabet = Alphabet('ACGT')
    database = DB(':memory:', alphabet)

    # initializing twice in a row must not fail
    for _ in range(2):
        database.initialize()

    with database.connection() as conn:
        # the query can only succeed if the sequence table was created
        conn.cursor().execute('SELECT * FROM sequence LIMIT 1;')

    # a path that cannot exist is rejected by the constructor
    with pytest.raises(AssertionError):
        DB('/cannot/possibly/exist/directory/', alphabet)
def test_database_insert():
    """Checks the record returned by insert() and the stored content ids."""
    alphabet = Alphabet('ACGT')
    first_seq = alphabet.parse('AACT', name='foo')
    database = DB(':memory:', alphabet)
    database.initialize()

    first_rec = database.insert(first_seq, source_file='source.fa',
                                source_pos=10, attrs={'key': 'value'})
    assert isinstance(first_rec.id, int)
    assert first_rec.content_id == first_seq.content_id
    assert first_rec.source_pos == 10
    assert first_rec.source_file == 'source.fa'
    assert 'key' in first_rec.attrs, \
        'attributes must be populated correctly'
    assert first_rec.attrs['key'] == 'value', \
        'attributes must be populated correctly'

    query = 'SELECT content_id FROM sequence WHERE id = ?'
    with database.connection() as conn:
        cursor = conn.cursor()
        cursor.execute(query, (first_rec.id,))
        # NOTE for some reason if we just say next(cursor) == ...
        # the cursor remains open after the context is over (which should
        # not happen as per docs). This leads to BusyError further down.
        rows = cursor.fetchall()
        assert rows == [(first_seq.content_id,)], \
            'content identifier is properly populated'

    # a second, different sequence must get its own identifier
    second_seq = alphabet.parse('GCTG', name='bar')
    second_rec = database.insert(second_seq)
    assert second_rec.id != first_rec.id, \
        'new ids are assigned to new sequences'

    with database.connection() as conn:
        cursor = conn.cursor()
        cursor.execute(query, (second_rec.id,))
        assert next(cursor) == (second_seq.content_id,), \
            'correct id must be populated'
def seed_index():
    """Builds an in-memory database with a seed index of word length 5.

    Three random sequences of length 100 are written as FASTA to an
    in-memory buffer and loaded into the database, after which
    ``index_seeds()`` is called on the seed index.

    Returns:
        The seed index created on top of the kmer index and the database.
    """
    alphabet = Alphabet('ACGT')
    num_seqs, seq_len, wordlen = 3, 100, 5

    database = DB(':memory:', alphabet)
    index = SeedIndex(KmerIndex(database, wordlen))
    index.db.initialize()

    # sequences are generated lazily while the FASTA buffer is written
    named_seqs = (
        rand_seq(alphabet, seq_len).to_named('#%d' % idx)
        for idx in range(num_seqs)
    )
    buf = StringIO()
    write_fasta(buf, named_seqs)
    buf.seek(0)  # rewind so that load_fasta reads from the beginning
    database.load_fasta(buf)

    index.index_seeds()
    return index
def test_database_populate_fasta():
    """Checks loading FASTA without reverse complements and reading back."""
    alphabet = Alphabet('ACGT')
    seq_s = alphabet.parse('AACT', name='S')
    seq_t = alphabet.parse('GCAT', name='T')

    database = DB(':memory:', alphabet)
    database.initialize()

    # an in-memory FASTA "file" carrying a name like a real file would
    buf = StringIO()
    buf.name = '/x.fasta'
    write_fasta(buf, [seq_s, seq_t])
    buf.seek(0)

    records = database.load_fasta(buf, rc=False)
    assert len(records) == 2
    assert all(isinstance(record, Record) for record in records)
    assert all(record.source_file == buf.name for record in records), \
        'source file of sequence records must be set'

    loaded = [database.load_from_record(record, buf) for record in records]
    assert loaded == [seq_s, seq_t], \
        'should be able to retrieve sequences by position in source'

    # without a file handle the database module opens the source itself;
    # its open() is patched to hand back the in-memory buffer.
    with patch('biseqt.database.open', create=True) as open_mock:
        open_mock.return_value = MagicMock(spec=file, wraps=buf)
        assert database.load_from_record(records[0]) == seq_s, \
            'load_from_record should work without an open file handle'
def test_database_events():
    """Checks that listeners of database events are invoked.

    The same callback is registered for the 'db-initialized' and the
    'sequence-inserted' events and counts its own invocations; the count is
    expected to go up by one after initialize() and again after insert().
    """
    A = Alphabet('ACGT')
    S = A.parse('AACT', name='S')

    # NOTE python 2 does not support non-local, non-global variables, put it in
    # the function object.
    test_database_events.callback_called = 0

    def callback(self, *args):
        # the arguments passed by the database are irrelevant; only count
        test_database_events.callback_called += 1

    db = DB(':memory:', A)
    db.add_event_listener('db-initialized', callback)
    db.add_event_listener('sequence-inserted', callback)

    db.initialize()
    assert test_database_events.callback_called == 1, \
        'event callbacks for "db-initialized" should be executed'

    db.insert(S)
    assert test_database_events.callback_called == 2, \
        'event callbacks for "sequence-inserted" should be executed'
def test_database_find():
    """Checks find() with an SQL condition and with a callable condition."""
    alphabet = Alphabet('ACGT')
    foo = alphabet.parse('AACT', name='foo')
    bar = alphabet.parse('GGCT', name='bar')

    database = DB(':memory:', alphabet)
    database.initialize()
    for seq in (foo, bar):
        database.insert(seq)

    # filter in SQL by matching the name inside the stored attributes
    sql_condition = "attrs LIKE '%s'" % '%"name": "bar"%'
    by_sql = list(database.find(sql_condition=sql_condition))
    assert len(by_sql) == 1 and by_sql[0].content_id == bar.content_id, \
        'find() should work with sql_condition'

    # filter in python by inspecting the attributes of each record
    by_callable = list(
        database.find(condition=lambda rec: rec.attrs['name'] == 'foo')
    )
    assert len(by_callable) == 1 and \
        by_callable[0].content_id == foo.content_id, \
        'find() should work with callable condition'
def test_database_overwrite():
    """Checks that re-inserting known content keeps the first record."""
    alphabet = Alphabet('ACGT')
    seq = alphabet.parse('AACT', name='foo')

    database = DB(':memory:', alphabet)
    database.initialize()
    # same content twice, only the source file differs
    for source_file in ('old_source.fa', 'new_source.fa'):
        database.insert(seq, source_file=source_file)

    query = 'SELECT source_file FROM sequence WHERE content_id = ?'
    with database.connection() as conn:
        cursor = conn.cursor()
        cursor.execute(query, (seq.content_id,))
        source_files = [row[0] for row in cursor]

    assert len(source_files) == 1 and source_files[0] == 'old_source.fa', \
        'Sequences with observed content id should be ignored'
def test_database_populate_fasta_rc():
    """Checks loading FASTA with reverse complements and reading back."""
    alphabet = Alphabet('ACGT')
    seq_s = alphabet.parse('AACT', name='S')
    seq_t = alphabet.parse('GCAT', name='T')

    database = DB(':memory:', alphabet)
    database.initialize()

    buf = StringIO()
    write_fasta(buf, [seq_s, seq_t])
    buf.seek(0)

    # each of the two sequences comes with its reverse complement
    records = database.load_fasta(buf, rc=True)
    assert len(records) == 4

    origins = [rec.attrs['rc_of'] for rec in records if 'rc_of' in rec.attrs]
    assert origins == [seq_s.content_id, seq_t.content_id], \
        'reverse complements should know what their origin is'

    # look up the record that is the reverse complement of T
    rc_record = next(database.find(
        condition=lambda rec: rec.attrs.get('rc_of', None) == seq_t.content_id
    ))
    expected = seq_t.reverse().transform(['AT', 'CG'],
                                         name='(rc) ' + seq_t.name)
    assert database.load_from_record(rc_record, buf) == expected, \
        'reverse complements should load properly from a record'
def __init__(self, alphabet, wordlen, db_path):
    """Sets up the database and the kmer and seed indices on top of it.

    Args:
        alphabet: The alphabet handed to the database.
        wordlen: The word length handed to the kmer index.
        db_path: The path handed to the database.
    """
    database = DB(db_path, alphabet)
    kmers = KmerIndex(database, wordlen)
    seeds = SeedIndex(kmers)

    self.db = database
    self.kmer_index = kmers
    self.seed_index = seeds
    # no bands have been indexed on a freshly constructed object
    self.bands_indexed = False
class ReadMapper(object):
    """Maps reads against each other or against reference sequences.

    Bundles a sequence database with a kmer index and a seed index built on
    top of it. Reads are loaded together with their reverse complements;
    reference sequences are loaded without.
    """

    def __init__(self, alphabet, wordlen, db_path):
        """Sets up the database and the kmer and seed indices on top of it.

        Args:
            alphabet: The alphabet handed to the database.
            wordlen: The word length handed to the kmer index.
            db_path: The path handed to the database.
        """
        self.db = DB(db_path, alphabet)
        self.kmer_index = KmerIndex(self.db, wordlen)
        self.seed_index = SeedIndex(self.kmer_index)
        # set by index_bands(); the map_* methods assert on it
        self.bands_indexed = False

    def log(self, *args, **kwargs):
        """Forwards all arguments to the ``log`` method of the database."""
        self.db.log(*args, **kwargs)

    def initialize(self, reads_fa, refs_fa=None, num_reads=-1):
        """Initializes the database and loads reads and references into it.

        Args:
            reads_fa (str): Path to the FASTA file of reads; loaded with
                reverse complements.
            refs_fa (str): Optional path to the FASTA file of reference
                sequences; loaded without reverse complements.
            num_reads (int): Passed as ``num`` when loading reads;
                presumably -1 means no limit (confirm against
                ``DB.load_fasta``).
        """
        self.db.initialize()
        with open(reads_fa) as f:
            self.db.load_fasta(f, num=num_reads, rc=True)
        if refs_fa is not None:
            with open(refs_fa) as f:
                self.db.load_fasta(f, rc=False)

    def index_bands(self, **kw):
        """Scores kmers, then diagonals, and marks bands as indexed.

        Args:
            **kw: Passed on to ``score_diagonals`` of the seed index.
        """
        self.kmer_index.score_kmers()
        self.seed_index.score_diagonals(**kw)
        self.bands_indexed = True

    def load_reads(self):
        """Loads all reads, i.e. sequences having a reverse complement.

        A record with an ``rc_of`` attribute is taken to be the reverse
        complement of the record whose content id the attribute holds.

        Returns:
            list: One ``Read`` per (record, reverse complement record) pair,
            sorted by the id of the record.
        """
        recs_by_content_id = {r.content_id: r for r in self.db.find()}
        reads = [
            Read(self.seed_index,
                 recs_by_content_id[record.attrs['rc_of']], record)
            for record in recs_by_content_id.values()
            if 'rc_of' in record.attrs
        ]
        return sorted(reads, key=lambda read: read.record.id)

    def load_refs(self):
        """Loads all reference records, i.e. those that are not reads.

        A record belongs to a read if it is a reverse complement (has an
        ``rc_of`` attribute) or is the origin of a reverse complement; all
        remaining records are references.

        Returns:
            list: The records that belong to no read.
        """
        recs_by_content_id = {r.content_id: r for r in self.db.find()}
        # Collect the content ids to exclude first instead of popping from
        # the dict while iterating over its values, which fails on dict
        # views.
        read_content_ids = set()
        for record in recs_by_content_id.values():
            if 'rc_of' in record.attrs:
                read_content_ids.add(record.attrs['rc_of'])
                read_content_ids.add(record.content_id)
        return [record for content_id, record in recs_by_content_id.items()
                if content_id not in read_content_ids]

    def map_all_to_all(self, min_band_score, **aligner_kw):
        """Maps every read against every read with a larger record id.

        Args:
            min_band_score: Passed on to ``Read.map``.
            **aligner_kw: Passed on to ``Read.map``.

        Yields:
            tuple: The 3-tuple returned by ``Read.map`` (unpacked here as
            record, target record, alignment) for each pair where the first
            element is not None.
        """
        assert self.bands_indexed, 'Bands must be indexed first'
        self.log('Mapping all reads against each other')
        reads = self.load_reads()  # NOTE comes in sorted order of id
        indic = ProgressIndicator(num_total=len(reads))
        indic.start()
        for read in reads:
            indic.progress()
            # NOTE only compare to reads after us
            others = (r.record for r in reads
                      if r.record.id > read.record.id)
            for other in others:
                rec, target_rec, aln = read.map(other,
                                                min_band_score=min_band_score,
                                                **aligner_kw)
                if rec is None:
                    continue
                yield rec, target_rec, aln
        indic.finish()

    def map_all_to_refs(self, min_band_score, **aligner_kw):
        """Maps every read against the reference sequences.

        Args:
            min_band_score: Passed on to ``Read.map``.
            **aligner_kw: Passed on to ``Read.map``.

        Yields:
            tuple: The 3-tuple returned by ``Read.map`` (unpacked here as
            record, target record, alignment) for each read where the first
            element is not None.
        """
        # FIXME it would be nice to only calculate bands for read v. ref not
        # all pairwise of reads too
        assert self.bands_indexed, 'Bands must be indexed first'
        self.log('Mapping all reads against reference sequences')
        reads, refs = self.load_reads(), self.load_refs()
        indic = ProgressIndicator(num_total=len(reads))
        indic.start()
        for read in reads:
            indic.progress()
            rec, target_rec, aln = read.map(refs,
                                            min_band_score=min_band_score,
                                            **aligner_kw)
            if rec is not None:
                yield rec, target_rec, aln
        indic.finish()

    def mappings_from_sam(self, sampath):
        """Loads mappings from a SAM mapping file and translates query names
        to the reads stored in the database.

        Mappings whose query name matches no known read are skipped.

        Args:
            sampath (str): The path to SAM mappings file.

        Yields:
            tuple: A 3-tuple containing the ``Read`` whose record is named
            like the query, the reference name and the
            ``pysam.calignedsegment.AlignedSegment`` mapping it to the
            reference.
        """
        self.log('Loading SAM mappings from %s.' % sampath)
        reads_by_name = {r.record.attrs['name']: r for r in self.load_reads()}
        samfile = pysam.AlignmentFile(sampath)
        try:
            for mapping in samfile.fetch():
                qname, rname = mapping.query_name, mapping.reference_name
                # NOTE this is because BLASR does a weird thing with
                # sequence names: drop everything after the last '/'.
                qname = qname.rsplit('/', 1)[0]
                if qname not in reads_by_name:
                    continue
                yield reads_by_name[qname], rname, mapping
        finally:
            # release the file handle even if the consumer stops early
            samfile.close()

    def overlaps_from_sam_mappings(self, sampath, min_overlap=-1):
        """Finds all pairs of overlapping sequences based on their mappings
        to a reference.

        Only one mapping is kept per read (the last one encountered), and
        two reads are only compared if they map to the same reference.

        Args:
            sampath (str): The path to SAM mappings file.
            min_overlap (int): The minimum required length for overlaps to
                be reported; default is -1 in which case no overlap is
                excluded.

        Yields:
            tuple: A pair of sequence records deemed as overlapping based on
            SAM mappings, the first belonging to the read with the smaller
            record id. Each overlapping pair of reads is reported twice, in
            the two consistent orientations of the records and their
            reverse complements.
        """
        self.log('Finding overlaps from SAM mappings.')
        mappings = {read.record.id: (read, ref, mapping)
                    for read, ref, mapping in self.mappings_from_sam(sampath)}
        seqids = sorted(mappings.keys())
        for id0, id1 in combinations(seqids, 2):
            (r0, ref0, map0), (r1, ref1, map1) = mappings[id0], mappings[id1]
            if ref0 != ref1:
                continue
            # TODO ignoring query_alignment_start and query_alignment_end
            overlap_len = min(map0.reference_end, map1.reference_end) - \
                max(map0.reference_start, map1.reference_start)
            if overlap_len <= 0 or overlap_len < min_overlap:
                continue

            # FIXME the second thing we yield is not reported by our own
            # map_all_to_all.
            if map0.is_reverse == map1.is_reverse:
                yield r0.record, r1.record
                yield r0.rc_record, r1.rc_record
            else:
                yield r0.record, r1.rc_record
                yield r0.rc_record, r1.record