def test_mapping_reftosignal(self): """ Test the conversion from remapped path to reftosignal output Returns: None """ sig = signal.Signal(dacs=np.zeros(12)) # testing path with a single skip (over 3rd base; first "T") path = np.array([-1, 0, 0, 1, 1, 1, 3, 3, 3, 4, 4, 5, 6], dtype=np.int32) reference = 'ACTACGT' int_ref = signal_mapping.SignalMapping.get_integer_reference( reference, 'ACGT') sigtoref_res = signal_mapping.SignalMapping.from_remapping_path( path, int_ref, 1, sig).Ref_to_signal self.assertEqual(sigtoref_res.tolist(), [0, 2, 5, 5, 8, 10, 11, 12]) # now test with clipped bases sig = signal.Signal(dacs=np.zeros(15)) # testing path with a single skip (over 4th base; first "T") path = np.array([-1, -1, 1, 1, 2, 2, 2, 4, 4, 4, 5, 5, 6, 7, -1, -1], dtype=np.int32) reference = 'AACTACGTTT' int_ref = signal_mapping.SignalMapping.get_integer_reference( reference, 'ACGT') sigtoref_res = signal_mapping.SignalMapping.from_remapping_path( path, int_ref, 1, sig).Ref_to_signal self.assertEqual(sigtoref_res.tolist(), [-1, 1, 3, 6, 6, 9, 11, 12, 13, 16, 16]) return
def test_mapping_reftosignal(self): """Test the conversion from remapped path to reftosignal output """ sig = signal.Signal(dacs=np.zeros(12)) # testing path with a single skip (over 3rd base; first "T") path = np.array([0,0,1,1,1,3,3,3,4,4,5,6], dtype=np.int32) reference = 'ACTACGT' sigtoref_res = mapping.Mapping( sig, path, reference).get_reftosignal() self.assertEqual(sigtoref_res.tolist(), [0, 2, 5, 5, 8, 10, 11, 12]) # now test with clipped bases sig = signal.Signal(dacs=np.zeros(15)) # testing path with a single skip (over 4th base; first "T") path = np.array([-1,1,1,2,2,2,4,4,4,5,5,6,7,-1,-1], dtype=np.int32) reference = 'AACTACGTTT' sigtoref_res = mapping.Mapping( sig, path, reference).get_reftosignal() self.assertEqual(sigtoref_res.tolist(), [-1, 1, 3, 6, 6, 9, 11, 12, 13, 16, 16]) return
def get_remapping(sig_fn, dacs, scale_params, ref_seq, stride, read_id,
                  r_to_q_poss, rl_cumsum, r_ref_pos, ref_out_info):
    read = fast5_interface.get_fast5_file(sig_fn, 'r').get_read(read_id)
    channel_info = dict(fast5utils.get_channel_info(read).items())
    rd_factor = channel_info['range'] / channel_info['digitisation']
    shift_from_pA = (scale_params[0] + channel_info['offset']) * rd_factor
    scale_from_pA = scale_params[1] * rd_factor
    read_attrs = dict(fast5utils.get_read_attributes(read).items())

    # prepare taiyaki signal object
    sig = tai_signal.Signal(dacs=dacs)
    sig.channel_info = channel_info
    sig.read_attributes = read_attrs
    sig.offset = channel_info['offset']
    sig.range = channel_info['range']
    sig.digitisation = channel_info['digitisation']

    path = np.full((dacs.shape[0] // stride) + 1, -1)
    # skip last value since this is where the two seqs end
    for ref_pos, q_pos in enumerate(r_to_q_poss[:-1]):
        # if the query position maps to the end of the mapping skip it
        if rl_cumsum[q_pos + r_ref_pos.q_trim_start] >= path.shape[0]:
            continue
        path[rl_cumsum[q_pos + r_ref_pos.q_trim_start]] = ref_pos
    remapping = tai_mapping.Mapping.from_remapping_path(
        sig, path, ref_seq, stride)
    try:
        remapping.add_integer_reference(ref_out_info.alphabet)
    except Exception:
        raise mh.MegaError('Invalid reference sequence encountered')

    return (remapping.get_read_dictionary(
        shift_from_pA, scale_from_pA, read_id),
        prepare_mapping_funcs.RemapResult.SUCCESS)
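# The shift_from_pA / scale_from_pA arithmetic above converts scaling
# parameters into picoampere units via the standard ONT calibration
# current_pA = (dac + offset) * range / digitisation (assuming, as the
# formula suggests, that scale_params are expressed in DAC units).
# Standardising a read with either pair of parameters gives identical
# values; a self-contained numerical check with made-up calibration values:
import numpy as np

offset, rng, digitisation = 6.0, 1443.0, 8192.0    # hypothetical channel info
shift_dac, scale_dac = 455.0, 52.0                 # hypothetical DAC-space params
dacs = np.array([430.0, 470.0, 512.0, 505.0])

rd_factor = rng / digitisation
current_pA = (dacs + offset) * rd_factor           # DAC -> pA conversion

# same conversion that get_remapping() applies to scale_params
shift_from_pA = (shift_dac + offset) * rd_factor
scale_from_pA = scale_dac * rd_factor

# standardising in DAC space and in pA space yields the same trace
assert np.allclose((dacs - shift_dac) / scale_dac,
                   (current_pA - shift_from_pA) / scale_from_pA)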
def get_remapping(
    sig_fn,
    dacs,
    scale_params,
    ref_seq,
    stride,
    read_id,
    r_to_q_poss,
    rl_cumsum,
    r_ref_pos,
    ref_out_info,
):
    read = fast5_io.get_read(sig_fn, read_id)
    channel_info = dict(fast5utils.get_channel_info(read).items())
    read_params = {
        "trim_start": 0,
        "trim_end": 0,
        "shift": scale_params[0],
        "scale": scale_params[1],
    }
    sig = tai_signal.Signal(
        dacs=dacs,
        channel_info=channel_info,
        read_id=read_id,
        read_params=read_params,
    )

    ref_to_sig = np.empty(len(ref_seq) + 1, dtype=np.int32)
    # convert each reference position (plus the final end position) to a
    # signal coordinate via its query block index
    for ref_pos, q_pos in enumerate(r_to_q_poss):
        ref_to_sig[ref_pos] = rl_cumsum[q_pos + r_ref_pos.q_trim_start] * stride
    try:
        int_ref = tai_mapping.SignalMapping.get_integer_reference(
            ref_seq, ref_out_info.alphabet_info.alphabet
        )
    except Exception:
        raise mh.MegaError("Invalid reference sequence encountered")
    sig_mapping = tai_mapping.SignalMapping(ref_to_sig, int_ref, signalObj=sig)

    # annotate mod motifs
    if ref_out_info.ref_mods_all_motifs is not None:
        # annotate all mod base motif positions with alts
        int_ref = set_all_motif_mods(int_ref, ref_out_info.ref_mods_all_motifs)
        # set new Reference with mods annotated
        sig_mapping.Reference = int_ref

    return (
        sig_mapping.get_read_dictionary(),
        prepare_mapping_funcs.RemapResult.SUCCESS,
    )
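# The ref_to_sig loop above is plain arithmetic: rl_cumsum gives the
# cumulative number of network output blocks consumed up to each query base,
# so indexing it at the (trim-adjusted) query position and multiplying by the
# model stride yields the sample where the corresponding reference base
# starts.  A toy, standalone illustration with made-up values (no
# SignalMapping involved):
import numpy as np

stride = 2                                         # samples per network output block
q_trim_start = 1                                   # query bases trimmed before the mapping
rl_cumsum = np.array([0, 3, 5, 6, 9, 12, 14])      # cumulative blocks per query base
r_to_q_poss = np.array([0, 1, 3, 4, 5])            # query position for each ref position

ref_to_sig = np.empty(len(r_to_q_poss), dtype=np.int32)
for ref_pos, q_pos in enumerate(r_to_q_poss):
    # block index of this query base, converted to a sample coordinate
    ref_to_sig[ref_pos] = rl_cumsum[q_pos + q_trim_start] * stride

print(ref_to_sig.tolist())                         # [6, 10, 18, 24, 28]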
def get_remapping(sig_fn, dacs, scale_params, ref_seq, stride, read_id,
                  r_to_q_poss, rl_cumsum, r_ref_pos, ref_out_info):
    read = fast5_interface.get_fast5_file(sig_fn, 'r').get_read(read_id)
    channel_info = dict(fast5utils.get_channel_info(read).items())
    rd_factor = channel_info['range'] / channel_info['digitisation']
    read_params = {
        'trim_start': 0,
        'trim_end': 0,
        'shift': (scale_params[0] + channel_info['offset']) * rd_factor,
        'scale': scale_params[1] * rd_factor
    }
    sig = tai_signal.Signal(dacs=dacs, channel_info=channel_info,
                            read_id=read_id, read_params=read_params)

    path = np.full((dacs.shape[0] // stride) + 1, -1)
    # skip last value since this is where the two seqs end
    for ref_pos, q_pos in enumerate(r_to_q_poss[:-1]):
        # if the query position maps to the end of the mapping skip it
        if rl_cumsum[q_pos + r_ref_pos.q_trim_start] >= path.shape[0]:
            continue
        path[rl_cumsum[q_pos + r_ref_pos.q_trim_start]] = ref_pos
    try:
        int_ref = tai_mapping.SignalMapping.get_integer_reference(
            ref_seq, ref_out_info.alphabet)
    except Exception:
        raise mh.MegaError('Invalid reference sequence encountered')
    sig_mapping = tai_mapping.SignalMapping.from_remapping_path(
        path, int_ref, stride, sig)

    # annotate mod motifs
    if ref_out_info.ref_mods_all_motifs is not None:
        # annotate all mod base motif positions with alts
        int_ref = set_all_motif_mods(
            int_ref, ref_out_info.ref_mods_all_motifs,
            ref_out_info.collapse_alphabet)
        # set new Reference with mods annotated
        sig_mapping.Reference = int_ref

    return (sig_mapping.get_read_dictionary(),
            prepare_mapping_funcs.RemapResult.SUCCESS)
def oneread_remap(read_tuple, references, model, device, per_read_params_dict,
                  alphabet_info):
    """ Worker function for remapping reads using flip-flop model on raw signal

    :param read_tuple : read, identified by a tuple (filepath, read_id)
    :param references : dict mapping fast5 filenames to reference strings
    :param model : pytorch model (the torch data structure, not a filename)
    :param device : integer specifying which GPU to use for remapping, or
        'cpu' to use CPU
    :param per_read_params_dict : dictionary where keys are UUIDs, values are
        dicts containing keys trim_start trim_end shift scale
    :param alphabet_info : AlphabetInfo object for basecalling

    :returns: tuple of dictionary as specified in mapped_signal_files.Read
        class and a message string indicating an error if one occurred
    """
    filename, read_id = read_tuple
    try:
        with fast5_interface.get_fast5_file(filename, 'r') as f5file:
            read = f5file.get_read(read_id)
            sig = signal.Signal(read)
    except Exception:
        return None, READ_ID_INFO_NOT_FOUND_ERR_TEXT

    if read_id in references:
        read_ref = references[read_id]
    else:
        return None, NO_REF_FOUND_ERR_TEXT

    try:
        read_params_dict = per_read_params_dict[read_id]
    except KeyError:
        return None, NO_PARAMS_ERR_TEXT

    sig.set_trim_absolute(read_params_dict['trim_start'],
                          read_params_dict['trim_end'])

    try:
        # Prevents torch doing its own parallelisation on top of our imap_map
        torch.set_num_threads(1)
        # Standardise (i.e. shift/scale so that approximately mean=0, std=1)
        signalArray = ((sig.current - read_params_dict['shift'])
                       / read_params_dict['scale'])
        # Make signal into 3D tensor with shape [siglength,1,1] and move to
        # appropriate device (GPU number or CPU)
        signalTensor = torch.tensor(
            signalArray[:, np.newaxis, np.newaxis].astype(taiyaki_dtype),
            device=device)
        # The model must live on the same device
        modelOnDevice = model.to(device)
        # Apply the network to the signal, generating transition weight
        # matrix, and put it back into a numpy array
        with torch.no_grad():
            transweights = modelOnDevice(signalTensor).cpu().numpy()
    except Exception:
        return None, REMAP_ERR_TEXT

    # Extra dimensions introduced by np.newaxis above removed by np.squeeze;
    # localpen=0.0 does local alignment
    can_read_ref = alphabet_info.collapse_sequence(read_ref)
    remappingscore, path = flipflop_remap.flipflop_remap(
        np.squeeze(transweights), can_read_ref,
        alphabet=alphabet_info.can_bases, localpen=0.0)

    # flipflop_remap() establishes a mapping between the network outputs and
    # the reference. What we need is a mapping between the signal and the
    # reference. To resolve this we need to know the stride of the model
    # (how many samples for each network output).
    model_stride = helpers.guess_model_stride(model)
    remapping = mapping.Mapping.from_remapping_path(sig, path, read_ref,
                                                    model_stride)
    remapping.add_integer_reference(alphabet_info.alphabet)

    return remapping.get_read_dictionary(
        read_params_dict['shift'], read_params_dict['scale'],
        read_id), REMAP_SUCCESS_TEXT
def oneread_remap(read_tuple, references, model, device, per_read_params_dict,
                  alphabet, collapse_alphabet):
    """ Worker function for remapping reads using flip-flop model on raw signal

    :param read_tuple : read, identified by a tuple (filepath, read_id)
    :param references : dict mapping fast5 filenames to reference strings
    :param model : pytorch model (the torch data structure, not a filename)
    :param device : integer specifying which GPU to use for remapping, or
        'cpu' to use CPU
    :param per_read_params_dict : dictionary where keys are UUIDs, values are
        dicts containing keys trim_start trim_end shift scale
    :param alphabet : alphabet for basecalling (passed on to mapped-read file)
    :param collapse_alphabet : collapsed alphabet for basecalling (passed on
        to mapped-read file)

    :returns: dictionary as specified in mapped_signal_files.Read class
    """
    filename, read_id = read_tuple
    try:
        with fast5_interface.get_fast5_file(filename, 'r') as f5file:
            read = f5file.get_read(read_id)
            sig = signal.Signal(read)
    except Exception as e:
        # We want any single failure in the batch of reads to not disrupt
        # other reads being processed.
        sys.stderr.write(
            'No read information on read {} found in file {}.\n{}\n'.format(
                read_id, filename, repr(e)))
        return None

    if read_id in references:
        # read_ref comes out as a bytes object, so we need to convert to str
        read_ref = references[read_id].decode("utf-8")
    else:
        sys.stderr.write('No fasta reference found for {}.\n'.format(read_id))
        return None

    if read_id in per_read_params_dict:
        read_params_dict = per_read_params_dict[read_id]
    else:
        return None

    sig.set_trim_absolute(read_params_dict['trim_start'],
                          read_params_dict['trim_end'])

    try:
        # Prevents torch doing its own parallelisation on top of our imap_map
        torch.set_num_threads(1)
        # Standardise (i.e. shift/scale so that approximately mean=0, std=1)
        signalArray = ((sig.current - read_params_dict['shift'])
                       / read_params_dict['scale'])
        # Make signal into 3D tensor with shape [siglength,1,1] and move to
        # appropriate device (GPU number or CPU)
        signalTensor = torch.tensor(
            signalArray[:, np.newaxis, np.newaxis].astype(taiyaki_dtype),
            device=device)
        # The model must live on the same device
        modelOnDevice = model.to(device)
        # Apply the network to the signal, generating transition weight
        # matrix, and put it back into a numpy array
        with torch.no_grad():
            transweights = modelOnDevice(signalTensor).cpu().numpy()
    except Exception as e:
        sys.stderr.write(
            "Failure applying basecall network to remap read {}.\n{}\n".format(
                sig.read_id, repr(e)))
        return None

    # Extra dimensions introduced by np.newaxis above removed by np.squeeze;
    # localpen=0.0 does local alignment
    remappingscore, path = flipflop_remap.flipflop_remap(
        np.squeeze(transweights), read_ref, localpen=0.0)

    # flipflop_remap() establishes a mapping between the network outputs and
    # the reference. What we need is a mapping between the signal and the
    # reference. To resolve this we need to know the stride of the model
    # (how many samples for each network output).
    model_stride = helpers.guess_model_stride(model, device=device)
    remapping = mapping.Mapping.from_remapping_path(sig, path, read_ref,
                                                    model_stride)

    return remapping.get_read_dictionary(
        read_params_dict['shift'], read_params_dict['scale'], read_id,
        alphabet=alphabet, collapse_alphabet=collapse_alphabet)
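# The standardisation step above is a per-read affine transform: if shift and
# scale are a reasonable location/spread estimate for the read (the
# median/MAD pair below is only a hypothetical choice for illustration), the
# transformed current ends up with roughly zero mean and unit spread, which
# is what the network expects.  A quick standalone numpy sketch:
import numpy as np

gen = np.random.default_rng(1)
current = gen.normal(loc=90.0, scale=12.0, size=5000)   # fake pA trace

shift = np.median(current)                              # hypothetical per-read shift
mad = np.median(np.abs(current - shift))
scale = 1.4826 * mad                                    # MAD rescaled to approximate std

standardized = (current - shift) / scale
print(round(standardized.mean(), 2), round(standardized.std(), 2))   # ~0.0, ~1.0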
def oneread_remap(read_tuple, model, per_read_params_dict, alphabet_info,
                  max_read_length, device='cpu', localpen=0.0):
    """ Worker function for remapping reads using flip-flop model on raw signal

    Args:
        read_tuple (tuple): read, identified by a tuple
            (filepath, read_id, read reference)
        model (pytorch Module): pytorch model
        device (int or str): integer specifying which GPU to use for
            remapping, or 'cpu' to use CPU
        per_read_params_dict (dict): dictionary where keys are UUIDs, values
            are dicts containing keys trim_start trim_end shift scale
        alphabet_info (AlphabetInfo object): for basecalling
        max_read_length (int): don't attempt to remap reads with references
            longer than this
        localpen (float): penalty for local mapping

    Returns:
        tuple (dict, str) containing
            1. dictionary as specified in
               signal_mapping.SignalMapping.get_read_dictionary
            2. message string indicating an error if one occurred
    """
    filename, read_id, read_ref = read_tuple
    if read_ref is None:
        return None, RemapResult.NO_REF_FOUND
    if max_read_length is not None and len(read_ref) > max_read_length:
        return None, RemapResult.REF_TOO_LONG

    try:
        read_params_dict = per_read_params_dict[read_id]
    except KeyError:
        return None, RemapResult.NO_PARAMS

    try:
        with fast5_interface.get_fast5_file(filename, 'r') as f5file:
            read = f5file.get_read(read_id)
            sig = signal.Signal(read, read_params=read_params_dict)
    except Exception:
        return None, RemapResult.READ_ID_INFO_NOT_FOUND

    try:
        # Prevents torch doing its own parallelisation on top of our imap_map
        torch.set_num_threads(1)
        # Make signal into 3D tensor with shape [siglength,1,1] and move to
        # appropriate device (GPU number or CPU)
        signalTensor = torch.tensor(
            sig.standardized_current[:, np.newaxis, np.newaxis].astype(
                np.float32), device=device)
        # The model must live on the same device
        modelOnDevice = model.to(device)
        # Apply the network to the signal, generating transition weight
        # matrix, and put it back into a numpy array
        with torch.no_grad():
            transweights = modelOnDevice(signalTensor).cpu().numpy()
    except Exception:
        return None, RemapResult.NETWORK_ERROR

    # Extra dimensions introduced by np.newaxis above removed by np.squeeze;
    # localpen=0.0 does local alignment
    can_read_ref = alphabet_info.collapse_sequence(read_ref)
    remappingscore, path = flipflop_remap.flipflop_remap(
        np.squeeze(transweights), can_read_ref,
        alphabet=alphabet_info.can_bases, localpen=localpen)

    # flipflop_remap() establishes a mapping between the network outputs and
    # the reference. What we need is a mapping between the signal and the
    # reference. To resolve this we need to know the stride of the model
    # (how many samples for each network output).
    model_stride = helpers.guess_model_stride(model)
    int_ref = signal_mapping.SignalMapping.get_integer_reference(
        read_ref, alphabet_info.alphabet)
    sig_mapping = signal_mapping.SignalMapping.from_remapping_path(
        path, int_ref, model_stride, sig)

    try:
        sig_mapping_dict = sig_mapping.get_read_dictionary()
    except signal_mapping.TaiyakiSigMapError as e:
        return None, str(e)

    return sig_mapping_dict, RemapResult.SUCCESS
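# Workers like oneread_remap return either (read_dict, SUCCESS) or
# (None, reason), so a driver can map them over many reads and collect
# failures without aborting the whole batch.  A generic, hypothetical driver
# sketch using only the standard library (remap_all and its arguments are
# made up for illustration; taiyaki's own multiprocessing helpers differ):
from functools import partial
from multiprocessing import Pool


def remap_all(read_tuples, model, per_read_params_dict, alphabet_info,
              max_read_length=None, nproc=4):
    worker = partial(
        oneread_remap, model=model,
        per_read_params_dict=per_read_params_dict,
        alphabet_info=alphabet_info, max_read_length=max_read_length)
    read_dicts, failures = [], []
    with Pool(nproc) as pool:
        for read_dict, status in pool.imap_unordered(worker, read_tuples):
            if read_dict is None:
                failures.append(status)      # e.g. RemapResult.NO_REF_FOUND
            else:
                read_dicts.append(read_dict)
    return read_dicts, failures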