def restart_barcode_4sets(self, AT_max, GC_max, prev_bc4sets_fpath, tmp_fpath=None):
    """
    Resume a barcode 4-set search from the output file of a previous run.

    The previous file is read as repeated groups of four barcode lines, each
    group optionally followed by one blank separator line. Every barcode read
    is registered with ``_add_barcode`` and the search is then continued via
    ``find_barcode_4sets``.

        AT_max, GC_max :passed through to find_barcode_4sets
        prev_bc4sets_fpath :str: path of the previous 4-sets file
        tmp_fpath :str: optional path; if given it is overwritten with the
            previously found sets (sorted) before the search continues

    Raises AssertionError if a barcode has the wrong length or a group is not
    followed by a blank line.
    """
    with open(prev_bc4sets_fpath) as f:
        while True:
            try:
                seq_4set = [next(f).strip() for _ in range(4)]
            except StopIteration:
                # End of file. A trailing group with fewer than four lines
                # is dropped.
                break
            assert all(len(seq) == self.bc_len for seq in seq_4set), \
                'Prev bcs not specified length'
            self.dna_barcode_4sets.append(seq_4set)
            # Read the separator exactly once. A missing separator at end of
            # file is accepted rather than leaking StopIteration, and the
            # assertion message shows the offending line itself.
            separator = next(f, '')
            assert not separator.strip(), separator
    log.info('Read previous barcodes file')

    prev_4sets = sorted(self.dna_barcode_4sets)
    for set_num, seq_4set in enumerate(prev_4sets, 1):
        for seq in seq_4set:
            self._add_barcode(dna2num(seq))
        log.info('Added prev set {}: {}'.format(set_num, seq_4set))

    if tmp_fpath:
        with open(tmp_fpath, 'w') as out:
            for seq_4set in prev_4sets:
                out.write('\n'.join(seq_4set) + '\n\n')

    # The search resumes from the lexicographically largest barcode starting
    # with 'A'. With no such barcode there is nothing to skip.
    Aseqs = [seq for seq in self.dna_barcodes if seq.startswith('A')]
    last_prev_Aidx = dna2num(max(Aseqs)) if Aseqs else None
    self.find_barcode_4sets(
        AT_max,
        GC_max,
        tmp_fpath=tmp_fpath,
        last_prev_Aidx=last_prev_Aidx,
    )
def dna_nums_given_nerr_tup(nerr_tup):
    """
    Return a list of seqtools.dna2num values, one per sequence produced by
    self._freediv_subsphere_given_counts for the given error counts.

        nerr_tup :tuple: (nsub, ndel, nins)
    """
    n_sub, n_del, n_ins = nerr_tup
    seq_nums = []
    for subsphere_seq in self._freediv_subsphere_given_counts(n_sub, n_del, n_ins):
        seq_nums.append(seqtools.dna2num(subsphere_seq))
    return seq_nums
def restart_Conway_closure(self, prev_fpath, tmp_fpath=None):
    """
    Resume a Conway-closure barcode search from a previous output file.

    Every barcode listed in ``prev_fpath`` (one per line) is re-added, the
    sequence iterator is advanced past the largest previous barcode index,
    and the search continues from there.

        prev_fpath :str: file of previously found barcodes, one per line
        tmp_fpath :str: optional checkpoint file; if given it is rewritten
            with the previous barcodes and appended to for each new barcode
    """
    log.info('Restarting {}...'.format(prev_fpath))
    # Close the input file deterministically instead of leaking the handle.
    with open(prev_fpath) as f:
        prev_bc_idxs = [dna2num(line.strip()) for line in f]

    for bc_idx in prev_bc_idxs:
        self._add_barcode(bc_idx)
        log.info('Adding previous {}: {}'.format(len(self.barcodes),
                                                 num2dna(bc_idx, self.bc_len)))

    # tmp_fpath is optional, so only write the checkpoint when it was given.
    if tmp_fpath:
        with open(tmp_fpath, 'w') as out:
            for bc_idx in prev_bc_idxs:
                out.write('{}\n'.format(num2dna(bc_idx, self.bc_len)))

    seq_iter = self.seq_idx_iter_func()
    if prev_bc_idxs:
        # Fast-forward to the first index >= the largest previous barcode.
        # That index is consumed here and is not re-tested below.
        max_prev = max(prev_bc_idxs)
        bc_idx = next(seq_iter)
        while bc_idx < max_prev:
            bc_idx = next(seq_iter)
        log.info('Reached last previous barcode: {}'.format(num2dna(max_prev, self.bc_len)))
        log.info('Restarting after {}'.format(num2dna(bc_idx, self.bc_len)))

    for seq_idx in seq_iter:
        if self._idx_is_available(seq_idx):
            self._add_barcode(seq_idx)
            log.info('Found barcode {}: {}'.format(len(self.barcodes),
                                                   num2dna(seq_idx, self.bc_len)))
            if tmp_fpath:
                # Append per barcode so the checkpoint is current after each find.
                with open(tmp_fpath, 'a') as out:
                    out.write('{}\n'.format(num2dna(seq_idx, self.bc_len)))
def decode(self, seq):
    """
    Look up seq in the codebook and return the matching codeword.

    The codebook stores 1-based positions into self._codewords. A stored
    value of 0 yields None.
    """
    stored_idx = self._codebook[seqtools.dna2num(seq)]
    if stored_idx != 0:
        return self._codewords[stored_idx - 1]
    return None
def exclude_barcodes(self, exclude_fpath):
    """
    Mark every sequence listed in exclude_fpath as reserved.

    Each line of the file is stripped and converted with dna2num; the
    corresponding entry of self.reserved_words is set to 1.

        exclude_fpath :str: file with one sequence per line

    Raises AssertionError (with the offending sequence as message) if an
    entry is already non-zero in self.reserved_words.
    """
    log.info('Excluding barcodes in {}...'.format(exclude_fpath))
    # Use a context manager so the file handle is closed deterministically.
    with open(exclude_fpath) as f:
        exclude_bc_idxs = [dna2num(line.strip()) for line in f]
    for seq_idx in exclude_bc_idxs:
        assert self.reserved_words[seq_idx] == 0, num2dna(
            seq_idx, self.bc_len)
        self.reserved_words[seq_idx] = 1
def find_barcode_4sets(
        self,
        AT_max,
        GC_max,
        seqs_so_far=[],
        prev_spheres=[],
        tmp_fpath=None,
        last_prev_Aidx=None, ):
    """
    A barcode 4-set is here defined as a set of four barcodes such that no two barcodes have
    the same base in the same position. I.e., all four bases are in each position in exactly
    one barcode.

    Recursively extends seqs_so_far one barcode at a time. When a fourth
    barcode is found, all four are added via _add_barcode, recorded in
    self.dna_barcode_4sets and (if tmp_fpath is given) appended to that file.

        AT_max, GC_max :passed to idx_seq_iterator_avoiding_prev_bases
        seqs_so_far :list: DNA strings already chosen for the current set
        prev_spheres :list: one set of decode-sphere indices per chosen seq
        tmp_fpath :str: optional file to which completed sets are appended
        last_prev_Aidx :int: optional index; candidates are skipped up to and
            including the first one >= this value (not passed on to
            recursive calls)

    NOTE(review): the list defaults are shared objects, but this function
    only builds new lists with ``+`` and never mutates them.
    """
    # The first base of this level's candidates is chosen by how many
    # sequences are already in the set.
    # NOTE(review): presumably `bases` holds the four bases in order -- confirm.
    first_base = bases[len(seqs_so_far)]
    seq_idx_iter_func = idx_seq_iterator_avoiding_prev_bases(
        self.bc_len, AT_max, GC_max, first_base, seqs_so_far)

    # Restart previous run (For 'A' seqs only)
    seq_idx_iter = seq_idx_iter_func()
    if last_prev_Aidx is not None:
        # Consume the iterator up to and including the first index that is
        # >= last_prev_Aidx; the main loop below continues after it.
        for seq_idx in seq_idx_iter:
            if seq_idx >= last_prev_Aidx:
                break

    for seq_idx in seq_idx_iter:
        if self._idx_is_available(seq_idx):
            seq_sphere = set(self.iterate_decode_sphere(seq_idx))
            # Reject the candidate if its sphere intersects the sphere of
            # any sequence already chosen for this set.
            for prev_sphere in prev_spheres:
                if prev_sphere & seq_sphere:
                    break
            else:
                new_seqs = seqs_so_far + [num2dna(seq_idx, self.bc_len)]
                new_spheres = prev_spheres + [seq_sphere]
                if len(new_seqs) == 4:
                    assert first_base == bases[-1], new_seqs
                    for seq in new_seqs:
                        self._add_barcode(dna2num(seq))
                    if tmp_fpath:
                        with open(tmp_fpath, 'a') as out:
                            out.write('\n'.join(new_seqs) + '\n\n')
                    self.dna_barcode_4sets.append(new_seqs)
                    log.info('Found barcode set {}: {}'.format(
                        len(self.dna_barcode_4sets), new_seqs))
                    return
                else:
                    self.find_barcode_4sets(AT_max, GC_max, new_seqs,
                                            new_spheres, tmp_fpath)
                    # Below the top level only the first compatible
                    # candidate is tried; the top level (one sequence in
                    # new_seqs) keeps iterating to start further sets.
                    if len(new_seqs) > 1:
                        return
        elif self.reserved_words[seq_idx] == 0:
            # Unavailable index whose reserved_words entry is still 0:
            # set the entry to 1.
            self.reserved_words[seq_idx] = 1
def build_codebook_from_codewords(self, codewords, max_err):
    """
    Populate self._codebook from a collection of codewords.

        codewords :iterable: list or set of codewords
        max_err :int: max correctable error

    The codebook is a zero-initialised array with 4**cw_len + 1 entries.
    For each codeword (in sorted order) its 1-based position is written at
    the dna2num index of every sequence yielded by
    FreeDivSphere.FreeDivSphere(codeword, max_err).

    Raises ValueError for 2**64 or more codewords and RuntimeError when the
    array would not fit in currently available memory.
    """
    self.max_err = max_err
    self._codewords = sorted(codewords)
    self._set_cw_len()

    # Pick the narrowest unsigned integer type able to hold the codeword count.
    n_codewords = len(self._codewords)
    dtype_table = (
        (8, np.uint8),
        (16, np.uint16),
        (32, np.uint32),
        (64, np.uint64),
    )
    for n_bits, candidate in dtype_table:
        if n_codewords < 2**n_bits:
            dtype = candidate
            cw_bytes = n_bits // 8
            break
    else:
        raise ValueError('More than 2^64 barcodes currently not supported')

    space_size = 4**self.cw_len + 1
    needed_bytes = space_size * cw_bytes
    available_bytes = psutil.virtual_memory().available
    if needed_bytes > available_bytes:
        msg = 'Not enough memory. {:,d} bytes needed, {:,d} bytes available'
        raise RuntimeError(msg.format(needed_bytes, available_bytes))

    self._codebook = np.zeros((space_size, ), dtype=dtype)
    # Positions are stored 1-based; untouched entries stay 0.
    for cw_idx, cw in enumerate(self._codewords, 1):
        for sphere_seq in FreeDivSphere.FreeDivSphere(cw, self.max_err):
            self._codebook[seqtools.dna2num(sphere_seq)] = cw_idx
def dnastr_codeword_is_available(self, dnastring):
    """
    Return the result of self._idx_is_available for a DNA string.

        dnastring :str: sequence, converted to an index with dna2num
    """
    seq_idx = dna2num(dnastring)
    # Return the result: without this the predicate always yielded None.
    return self._idx_is_available(seq_idx)
def add_dnastr_nonbarcode_codeword(self, dnastring):
    """
    Convert a DNA string to its index with dna2num and hand it to
    add_idx_nonbarcode_codeword.
    """
    self.add_idx_nonbarcode_codeword(dna2num(dnastring))
def iterate_idxs():
    """
    Lazily yield dna2num(seq) for each seq from the iterator built by
    seq_iterator_generator(*args, **kw_args).
    """
    make_seq_iter = seq_iterator_generator(*args, **kw_args)
    for dna_seq in make_seq_iter():
        seq_idx = dna2num(dna_seq)
        yield seq_idx
def iterate_good_barcodes():
    """Lazily yield seqtools.dna2num(seq) for each seq in bc_list."""
    for barcode_seq in bc_list:
        barcode_idx = seqtools.dna2num(barcode_seq)
        yield barcode_idx
def iterate_seqs():
    """
    Lazily yield dna2num(seq) for each seq from the iterator built by
    possible_barcode_iterator(k, AT_max, GC_max).
    """
    make_barcode_iter = possible_barcode_iterator(k, AT_max, GC_max)
    for candidate_seq in make_barcode_iter():
        candidate_idx = dna2num(candidate_seq)
        yield candidate_idx