def raw_worker(fast5_file_name, trim, open_pore_fraction, kmer_len, transducer,
               bad, min_prob, alphabet=DEFAULT_ALPHABET, skip=5.0, trans=None):
    """Worker function for basecall_network.py for basecalling from raw data

    This worker uses the global variable `calc_post`, which is set by
    init_worker. `calc_post` is an unpickled compiled sloika model that is
    used to calculate a posterior matrix over states.

    :param fast5_file_name: filename for single-read fast5 file with raw data
    :param trim: (int, int) samples to remove from read beginning and end
    :param open_pore_fraction: maximum allowed fraction of signal length to
        trim due to classification as open pore signal
    :param kmer_len, min_prob, transducer, bad, trans, skip: see `decode_post`
    """
    from sloika import batch, config

    try:
        with fast5.Reader(fast5_file_name) as f5:
            signal = f5.get_read(raw=True)
            sn = f5.filename_short
    except Exception as e:
        sys.stderr.write("Error getting raw data for file {}\n{!r}\n".format(fast5_file_name, e))
        return None

    signal = batch.trim_open_pore(signal, open_pore_fraction)
    signal = util.trim_array(signal, *trim)
    if signal.size == 0:
        sys.stderr.write("Read too short in file {}\n".format(fast5_file_name))
        return None

    # Normalise by median and median absolute deviation before inference
    inMat = (signal - np.median(signal)) / mad(signal)
    inMat = inMat[:, None, None].astype(config.sloika_dtype)
    score, call = decode_post(calc_post(inMat), kmer_len, transducer, bad,
                              min_prob, skip, trans, nbase=len(alphabet))

    return sn, score, call, inMat.shape[0]

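# A minimal sketch of how raw_worker might be driven, assuming the
# init_worker/calc_post pattern the docstring describes. The helper names,
# parameter values and pool size below are illustrative, not sloika's own
# driver code.
def _example_init_worker(model):
    # Stash the compiled model in the module-level global that raw_worker
    # reads, so each process receives it once rather than per task.
    global calc_post
    calc_post = model


def _example_raw_basecall(fast5_files, model):
    import functools
    import multiprocessing

    worker = functools.partial(raw_worker, trim=(200, 50), open_pore_fraction=0.3,
                               kmer_len=5, transducer=True, bad=True, min_prob=1e-5)
    with multiprocessing.Pool(4, initializer=_example_init_worker,
                              initargs=(model,)) as pool:
        for res in pool.imap(worker, fast5_files):
            if res is not None:
                sn, score, call, nev = res
                print(sn, score, len(call), nev)
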
def test_should_fail_when_events_section_is_missing(self, relative_file_path):
    '''Segmentation in these files is located at
    /Analyses/Segmentation_000/Summary/segmentation, not at
    /Analyses/Segment_Linear_000/Summary/split_hairpin where we are looking.

    We initially attempted to look for it in the new place, but then
    discovered that not only the location had changed but also the
    structure. In particular, there no longer appears to be an event index
    for the segmentation. An event index is required for the
    get_section_events function to operate.
    '''
    filename = os.path.join(self.dataDir, relative_file_path)
    with fast5.Reader(filename) as f5:
        with self.assertRaises(ValueError) as context:
            ev = f5.get_section_events('template')

        # On precise and trusty the second part of this error message (the
        # wrapped exception) is slightly different from xenial:
        #
        #   KeyError("unable to open object (Symbol table: Can't open object)",)
        #   KeyError("Unable to open object (Object 'split_hairpin' doesn't exist)",)
        #
        # so we compare only the first part
        msg = repr(context.exception).split('\\n')[0]
        self.assertEqual(msg, 'ValueError(\'Could not retrieve template-complement split '
                              'point data from attributes of /Analyses/Segmentation_000'
                              '/Summary/split_hairpin')

def events_worker(fast5_file_name, section, segmentation, trim, kmer_len,
                  transducer, bad, min_prob, alphabet=DEFAULT_ALPHABET,
                  skip=5.0, trans=None):
    """Worker function for basecall_network.py for basecalling from events

    This worker uses the global variable `calc_post`, which is set by
    init_worker. `calc_post` is an unpickled compiled sloika model that is
    used to calculate a posterior matrix over states.

    :param fast5_file_name: filename for single-read fast5 file with event
        detection and segmentation
    :param section: part of read to basecall, 'template' or 'complement'
    :param segmentation: location of segmentation analysis for extracting
        target read section
    :param trim: (int, int) events to remove from read beginning and end
    :param kmer_len, min_prob, transducer, bad, trans, skip: see `decode_post`
    """
    from sloika import features

    try:
        with fast5.Reader(fast5_file_name) as f5:
            ev = f5.get_section_events(section, analysis=segmentation)
            sn = f5.filename_short
    except Exception as e:
        sys.stderr.write("Error getting events for section {!r} in file {}\n{!r}\n".format(
            section, fast5_file_name, e))
        return None

    ev = util.trim_array(ev, *trim)
    if ev.size == 0:
        sys.stderr.write("Read too short in file {}\n".format(fast5_file_name))
        return None

    inMat = features.from_events(ev, tag='')[:, None, :]
    score, call = decode_post(calc_post(inMat), kmer_len, transducer, bad,
                              min_prob, skip, trans, nbase=len(alphabet))

    return sn, score, call, inMat.shape[0]

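# Both the raw and event workers trim with util.trim_array(arr, *trim). A
# rough sketch of the assumed semantics (the real helper lives in
# sloika.util): drop `begin` entries from the front and `end` from the back,
# possibly leaving an empty array, which the callers above check for.
def _trim_array_sketch(arr, begin, end):
    return arr[begin:len(arr) - end]

# e.g. _trim_array_sketch(np.arange(10), 2, 3) keeps elements 2..6
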
def chunk_remap_worker(fn, trim, min_prob, kmer_len, prior, slip, chunk_len,
                       use_scaled, normalisation, min_length, section,
                       segmentation, references):
    """Worker function for `chunkify remap`: remap a read's events to its
    reference and chunkify the remapped events for training.
    """
    try:
        with fast5.Reader(fn) as f5:
            sn = f5.filename_short
            try:
                ev = f5.get_section_events(section, analysis=segmentation)
            except ValueError:
                ev = f5.get_basecall_data(section)
    except Exception as e:
        sys.stderr.write('Failure reading events from {}.\n{}\n'.format(fn, repr(e)))
        return None

    try:
        read_ref = references[sn]
    except Exception as e:
        sys.stderr.write('No reference found for {}.\n{}\n'.format(fn, repr(e)))
        return None

    ev = trim_ends_and_filter(ev, trim, min_length, chunk_len)
    if ev is None:
        sys.stderr.write('{} is too short.\n'.format(fn))
        return None

    (score, ev, path, seq) = remap(read_ref, ev, min_prob, kmer_len, prior, slip)
    (chunks, labels, bad_ev) = chunkify(ev, chunk_len, kmer_len, use_scaled, normalisation)

    return sn + '.fast5', score, len(ev), path, seq, chunks, labels, bad_ev

def raw_chunk_worker(fn, chunk_len, kmer_len, min_length, trim, normalisation,
                     downsample_factor, interpolation=False):
    """Worker for creating labelled features from raw data

    :param fn: A filename to read from.
    :param chunk_len: Length of each chunk
    :param kmer_len: Kmer length for training
    :param min_length: Minimum number of samples before read can be considered.
    :param trim: Tuple (beginning, end) of number of samples to trim from read.
    :param normalisation: Normalisation method [per-chunk | per-read | none]
    :param downsample_factor: factor by which to downsample labels
    :param interpolation: interpolate sequence positions between those in
        mapping table
    """
    try:
        with fast5.Reader(fn) as f5:
            mapping_table, att = f5.get_any_mapping_data('template')
            sig = f5.get_read(raw=True)
            sample_rate = f5.sample_rate
            start_sample = f5.get_read(raw=True, group=True).attrs['start_time']
    except Exception as e:
        sys.stderr.write('Failed to get mapping data from {}.\n{}\n'.format(fn, repr(e)))
        return None

    mapping_table = convert_mapping_times_to_samples(mapping_table, start_sample, sample_rate)
    map_start = mapping_table['start'][0] + trim[0]
    map_end = mapping_table['start'][-1] + mapping_table['length'][-1] - trim[1]
    mapped_signal, mapping_table = trim_signal_and_mapping(sig, mapping_table,
                                                           map_start, map_end)

    try:
        assert mapping_table_is_registered(mapped_signal, mapping_table)
    except Exception as e:
        sys.stderr.write('Failed to properly register raw signal and mapping table '
                         'in {}.\n{}\n'.format(fn, repr(e)))
        return None

    if len(mapped_signal) < max(chunk_len, min_length):
        sys.stderr.write('{} is too short.\n'.format(fn))
        return None

    new_inMat, sig_labels, sig_bad = raw_chunkify(mapped_signal, mapping_table,
                                                  chunk_len, kmer_len, normalisation,
                                                  downsample_factor, interpolation, att)

    return (np.ascontiguousarray(new_inMat),
            np.ascontiguousarray(sig_labels),
            np.ascontiguousarray(sig_bad))

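# raw_chunk_worker assumes the mapping table arrives with 'start' and
# 'length' in seconds and rebases them to sample indices relative to the
# read's first raw sample. A hedged sketch of what
# convert_mapping_times_to_samples is assumed to do; the real routine and
# its exact dtype handling live elsewhere in sloika.
def _convert_times_to_samples_sketch(mapping_table, start_sample, sample_rate):
    m = mapping_table.copy()
    # seconds -> absolute sample numbers -> samples relative to this read
    m['start'] = np.round(m['start'] * sample_rate).astype(int) - start_sample
    m['length'] = np.round(m['length'] * sample_rate).astype(int)
    return m
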
def reference_extraction_worker(file_name, section):
    """Worker to extract the reference sequence stored in a single fast5 file

    Returns (file_name, reference_sequence), or None if no reference is found.
    """
    with fast5.Reader(file_name) as file_handle:
        try:
            fasta = file_handle.get_reference_fasta(section=section).decode('utf-8')
        except Exception as e:
            sys.stderr.write('No reference found for {}.\n{}\n'.format(file_name, repr(e)))
            return None

        iowrapper = StringIO(fasta)
        read_ref = str(next(SeqIO.parse(iowrapper, 'fasta')).seq)
        return (file_name, read_ref)

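# The remap workers expect `references` keyed by the short filename (they do
# references[sn], where sn is the basename without extension). A plausible
# way to assemble that dict from this worker's output; the sequential loop
# stands in for whatever parallel map the pipeline actually uses.
def _build_references_sketch(fast5_files, section='template'):
    references = {}
    for res in (reference_extraction_worker(f, section) for f in fast5_files):
        if res is not None:
            file_name, read_ref = res
            sn = os.path.splitext(os.path.basename(file_name))[0]
            references[sn] = read_ref
    return references
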
def raw_chunk_remap_worker(fn, trim, min_prob, kmer_len, min_length, prior, slip,
                           chunk_len, normalisation, downsample_factor,
                           interpolation, open_pore_fraction, references):
    """Worker function for `chunkify raw_remap`: remap reads using raw signal"""
    try:
        with fast5.Reader(fn) as f5:
            signal = f5.get_read(raw=True)
            sn = f5.filename_short
    except Exception as e:
        sys.stderr.write('Failure reading raw signal from {}.\n{}\n'.format(fn, repr(e)))
        return None

    try:
        read_ref = references[sn]
    except Exception as e:
        sys.stderr.write('No reference found for {}.\n{}\n'.format(fn, repr(e)))
        return None

    signal = batch.trim_open_pore(signal, open_pore_fraction)
    signal = util.trim_array(signal, *trim)
    if len(signal) < max(chunk_len, min_length):
        sys.stderr.write('{} is too short.\n'.format(fn))
        return None

    try:
        (score, mapping_table, path, seq) = raw_remap(read_ref, signal, min_prob,
                                                      kmer_len, prior, slip)
    except Exception as e:
        sys.stderr.write("Failure remapping read {}.\n{}\n".format(sn, repr(e)))
        return None

    # mapping_attrs required if using interpolation
    mapping_attrs = {
        'reference': read_ref,
        'direction': '+',
        'ref_start': 0,
    }
    (chunks, labels, bad_ev) = raw_chunkify(signal, mapping_table, chunk_len,
                                            kmer_len, normalisation,
                                            downsample_factor, interpolation,
                                            mapping_attrs)

    return sn + '.fast5', score, len(mapping_table), path, seq, chunks, labels, bad_ev

def chunk_worker(fn, section, chunk_len, kmer_len, min_length, trim, use_scaled,
                 normalisation):
    """Chunkifies data for training

    :param fn: A filename to read from
    :param section: Section of read to process (template / complement)
    :param chunk_len: Length of each chunk
    :param kmer_len: Kmer length for training
    :param min_length: Minimum number of events before read can be considered
    :param trim: Tuple (beginning, end) of number of events to trim from read
    :param use_scaled: Use prescaled event statistics
    :param normalisation: Type of normalisation to perform

    :returns: A tuple containing a 3D :class:`ndarray` of size
        (X, chunk_len, nfeatures) containing the features for the batch,
        a 2D :class:`ndarray` of size (X, chunk_len) containing the
        associated labels, and a 2D :class:`ndarray` of size (X, chunk_len)
        indicating bad events. 1 <= X <= batch_size.
    """
    # Import within worker to avoid initialising GPU in main thread
    import sloika.features

    try:
        with fast5.Reader(fn) as f5:
            ev, _ = f5.get_any_mapping_data(section)
    except Exception as e:
        sys.stderr.write('Failed to get mapping data from {}.\n{}\n'.format(fn, repr(e)))
        return None

    ev = trim_ends_and_filter(ev, trim, min_length, chunk_len)
    if ev is None:
        sys.stderr.write('{} is too short.\n'.format(fn))
        return None

    return chunkify(ev, chunk_len, kmer_len, use_scaled, normalisation)

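# A quick sanity check of the shapes the docstring promises, against a
# hypothetical mapped read file; the parameter values and 'per-read'
# normalisation mode are illustrative only.
def _chunk_shapes_sketch(fn):
    res = chunk_worker(fn, 'template', chunk_len=500, kmer_len=5,
                       min_length=1000, trim=(50, 10), use_scaled=False,
                       normalisation='per-read')
    if res is not None:
        chunks, labels, bad_ev = res
        assert chunks.ndim == 3 and chunks.shape[1] == 500
        assert labels.shape == chunks.shape[:2] == bad_ev.shape
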
def test(self, relative_file_path, analysis, number_of_events):
    filename = os.path.join(self.dataDir, relative_file_path)
    with fast5.Reader(filename) as f5:
        ev = f5.get_section_events('template', analysis=analysis)
        self.assertEqual(len(ev), number_of_events)

def test_filename_short(self):
    filename = os.path.join(self.dataDir, 'reads', 'read03.fast5')
    with fast5.Reader(filename) as f5:
        sn = f5.filename_short
        self.assertEqual(sn, 'read03')

def test_unknown(self):
    filename = os.path.join(self.dataDir, 'reads', 'read03.fast5')
    with fast5.Reader(filename) as f5:
        ev, _ = f5.get_any_mapping_data('template')
        self.assertEqual(len(ev), 4491)

def test(self, relative_file_path, number_of_events, raw):
    filename = os.path.join(self.dataDir, relative_file_path)
    with fast5.Reader(filename) as f5:
        ev = f5.get_read(raw=raw)
        self.assertEqual(len(ev), number_of_events)