def map_signal_to_squiggle(data, sequence, model='squiggle_r94', rate=1.0,
                           back_prob=0.0, local_pen=2.0, skip_pen=5000.0,
                           min_score=5.0):
    """Align raw signal to a base sequence via a simulated squiggle.

    The signal is wrapped in a `RawTable`, trimmed and scaled, and then
    matched with the library's Viterbi squiggle matcher against a squiggle
    simulated from `sequence`.

    :param data: `ndarray` containing raw signal data.
    :param sequence: base sequence to which to align data.
    :param model: model to use in simulating squiggle.
    :param rate: rate of translocation relative to squiggle model.
    :param back_prob: probability of backward movement.
    :param local_pen: penalty for local alignment.
    :param skip_pen: penalty for skipping position in sequence.
    :param min_score: floor on match score.

    :returns: tuple containing (alignment score, alignment path); the path
        is an `int32` array with one entry per sample of the raw table.
    """
    signal = RawTable(data)
    signal.trim().scale()
    simulated = sequence_to_squiggle(sequence, model=model)

    # Output buffer filled in place by the C routine.
    n_samples = signal._rt.n
    path = np.ascontiguousarray(np.zeros(n_samples, dtype=np.int32))
    path_ptr = ffi.cast("int32_t *", ffi.from_buffer(path))

    score = lib.squiggle_match_viterbi(
        signal.data(), rate, simulated.data(),
        back_prob, local_pen, skip_pen, min_score, path_ptr)
    return score, path
def _decode_post(post, stay_pen=0.0, skip_pen=0.0, local_pen=2.0, use_slip=False):
    """Viterbi-decode a transducer posterior into a basecall.

    :param post: a `ScrappyMatrix` containing transducer posteriors.
    :param stay_pen: penalty for staying.
    :param skip_pen: penalty for skipping a base.
    :param local_pen: penalty for local basecalling.
    :param use_slip: allow slipping (movement more than 2 bases).

    :returns: tuple containing (call, score, call positions per raw block).
    """
    n_blocks, n_states = post.shape
    path_len = n_blocks + 1

    # State path written by the decoder; it holds one more entry than
    # there are blocks.
    state_path = ffi.new("int[{}]".format(path_len))
    score = lib.decode_transducer(
        post.data(), stay_pen, skip_pen, local_pen, state_path, use_slip)

    # Positions are written by the library straight into this numpy buffer.
    pos = np.zeros(path_len, dtype=np.int32)
    pos_ptr = ffi.cast("int *", pos.ctypes.data)
    called = lib.overlapper(state_path, path_len, n_states - 1, pos_ptr)

    return ffi.string(called).decode(), score, pos
def map_signal_to_squiggle(data, sequence, back_prob=0.0, local_pen=2.0, min_score=5.0):
    """Align a squiggle to a sequence using a simulated squiggle.

    The signal is wrapped in a `RawTable`, trimmed and scaled, and then
    matched with the library's Viterbi squiggle matcher against a squiggle
    simulated from `sequence`.

    :param data: `ndarray` containing raw signal data.
    :param sequence: base sequence to which to align data.
    :param back_prob: probability of backward movement.
    :param local_pen: penalty for local alignment.
    :param min_score: floor on match score.

    :returns: tuple containing (alignment score, alignment path), or `None`
        if `sequence_to_squiggle` returned `None` for `sequence`.
    """
    raw = RawTable(data)
    raw.trim().scale()

    squiggle = sequence_to_squiggle(sequence)
    if squiggle is None:
        return None

    # The simulated squiggle is released explicitly with `free_matrix`; the
    # try/finally guarantees that happens even if allocating the path
    # buffer or the library call raises.
    try:
        # Output buffer filled in place by the C routine.
        path = np.ascontiguousarray(np.zeros(raw._rt.n, dtype=np.int32))
        p_path = ffi.cast("int32_t *", ffi.from_buffer(path))
        score = lib.squiggle_match_viterbi(
            raw.data(), squiggle, back_prob, local_pen, min_score, p_path)
    finally:
        free_matrix(squiggle)
    return score, path
def _numpy_to_scrappy_matrix(numpy_array):
    """Build a bare `scrappie_matrix` from a `ndarray`.

    The first numpy axis is passed to the library as the number of columns
    and the second as the number of rows.

    :param numpy_array: array with at least two dimensions.

    :returns: the result of `lib.mat_from_array` for the converted data.
    """
    shape = numpy_array.shape
    n_cols = shape[0]
    n_rows = shape[1]

    # Convert to the library float type without copying when already suitable.
    as_float = numpy_array.astype(ftype, order='C', copy=False)
    contiguous = np.ascontiguousarray(as_float)
    float_ptr = ffi.cast("float *", contiguous.ctypes.data)
    return lib.mat_from_array(float_ptr, n_rows, n_cols)
def _decode_post_crf(post):
    """Viterbi-decode conditional random field transitions into a basecall.

    :param post: a `ScrappyMatrix` containing CRF transitions.

    :returns: tuple containing (basecall, score, call positions per raw
        data block).
    """
    n_blocks, _ = post.shape
    path_len = n_blocks + 1

    # State path written by the decoder; it holds one more entry than
    # there are blocks.
    state_path = ffi.new("int[{}]".format(path_len))
    score = lib.decode_crf(post.data(), state_path)

    # Positions are written by the library straight into this numpy buffer.
    pos = np.ascontiguousarray(np.zeros(path_len, dtype=np.int32))
    pos_ptr = ffi.cast("int *", ffi.from_buffer(pos))
    called = lib.crfpath_to_basecall(state_path, n_blocks, pos_ptr)

    return ffi.string(called).decode(), score, pos
def __init__(self, data, start=0, end=None):
    """Representation of a scrappie `raw_table`.

    :param data: `nd.array` containing raw data.
    :param start: value stored in the `start` field of the `raw_table`.
    :param end: value stored in the `end` field of the `raw_table`;
        defaults to `len(data)` when `None`.

    ..note:: The class stores a reference to a contiguous numpy array of
        the correct type to be passed to the extension library. The class
        provides safety against the original data being garbage collected.
        To obtain an up-to-date (possibly trimmed and scaled) copy of the
        data use `raw_table.data(as_numpy=True)`.
    """
    stop = len(data) if end is None else end

    # Always take a private copy so the buffer handed to the C library is
    # owned by this object rather than by the caller.
    self._data = np.ascontiguousarray(data.astype(ftype, order='C', copy=True))

    table = ffi.new('raw_table *')
    table.n = len(self._data)
    table.start = start
    table.end = stop
    table.raw = ffi.cast("float *", ffi.from_buffer(self._data))
    self._rt = table[0]
def map_post_to_sequence(post, sequence, stay_pen=0, skip_pen=0, local_pen=4.0,
                         viterbi=False, path=False, bands=None):
    """Block-based local-global alignment of a squiggle to a sequence using
    either Forward or Viterbi algorithm. For the latter the Viterbi path can
    optionally be calculated.

    :param post: a `ScrappyMatrix` containing log-probabilities (as from
        `calc_post`).
    :param sequence: a base sequence which to map.
    :param stay_pen: penalty for zero-state movement from one block to next.
    :param skip_pen: penalty for two-state movement from one block to next.
    :param local_pen: penalty for local alignment through blocks.
    :param viterbi: use Viterbi algorithm rather than forward.
    :param path: calculate alignment path (only valid for `viterbi==True`
        and `bands==None`).
    :param bands: two sequences containing lower and upper extremal allowed
        positions for each block. Should be length corresponding to number
        of blocks of `post`. If a single real number is given (built-in or
        numpy scalar), a diagonal band of total width
        2 * `bands` * #sequence positions / #blocks is used. If `None` is
        given banding is not used (a full DP matrix is evaluated).

    :returns: (score, path).

    :raises ValueError: if `path` is requested without `viterbi`, or if
        `bands` is malformed or fails the library's sanity check.
    :raises TypeError: if `post` is not a `ScrappyMatrix`.
    :raises RuntimeError: if encoding the sequence or the alignment fails.

    ..note:: if `viterbi`==False or `path`==False, the returned path will be
        `None`. The banded routines are not handed the path buffer, so when
        `bands` is given together with `path` the returned array is left
        zero-filled.
    """
    import numbers

    if path and not viterbi:
        raise ValueError('Cannot calulate path with `viterbi==False`.')
    if not isinstance(post, ScrappyMatrix):
        raise TypeError('`post` should be a ScrappyMatrix.')

    nblock, nstate = post.shape
    _, kmer_len = guess_state_properties(nstate)
    # Number of k-mer positions in the sequence.
    seq_len = len(sequence) - kmer_len + 1
    p_seq = _none_if_null(
        lib.encode_bases_to_integers(sequence.encode(), len(sequence), kmer_len))
    if p_seq is None:
        raise RuntimeError(
            'An unknown error occurred whilst encoding sequence.')

    if viterbi and path:
        # Output buffer filled in place by the unbanded Viterbi routine.
        path_data = np.zeros(nblock, dtype=np.int32)
        p_path = ffi.cast("int *", ffi.from_buffer(path_data))
    else:
        path_data = None
        p_path = ffi.NULL

    if bands is None:
        if viterbi:
            score = lib.map_to_sequence_viterbi(
                post.data(), stay_pen, skip_pen, local_pen,
                p_seq, seq_len, p_path)
        else:
            score = lib.map_to_sequence_forward(
                post.data(), stay_pen, skip_pen, local_pen,
                p_seq, seq_len)
    else:
        # Accept any real scalar (int, float, numpy scalar), as documented,
        # rather than only the built-in `int`.
        if isinstance(bands, numbers.Real):
            # create a monotonic diagonal band
            gradient = seq_len / nblock
            width = 2 * bands * gradient
            hband = width / 2
            lower = [max(0, blk * gradient - hband) for blk in range(nblock)]
            upper = [min(seq_len, blk * gradient + hband) for blk in range(nblock)]
            bands = [
                np.ascontiguousarray(np.array(bound, dtype=np.uintp))
                for bound in (lower, upper)
            ]
        elif len(bands) == 2:
            bands = [np.ascontiguousarray(x, dtype=np.uintp) for x in bands]
        else:
            raise ValueError(
                '`bands` should be `None`, an integer, or length 2.')

        p_poslow, p_poshigh = (
            ffi.cast("size_t *", ffi.from_buffer(x)) for x in bands)
        if not lib.are_bounds_sane(p_poslow, p_poshigh, nblock, seq_len):
            raise ValueError('Supplied banding structure is not valid.')

        if viterbi:
            func = lib.map_to_sequence_viterbi_banded
        else:
            func = lib.map_to_sequence_forward_banded
        score = func(post.data(), stay_pen, skip_pen, local_pen,
                     p_seq, seq_len, p_poslow, p_poshigh)

    score = _none_if_null(score)
    if score is None:
        raise RuntimeError('An unknown error occurred during alignment.')
    return score, path_data