def test_060_write_read_float_data(self):
    tmp_file = os.path.join(tempfile.gettempdir(), str(uuid4()))
    with Fast5.New(tmp_file, 'a', channel_id=self.tmp_channel_id) as h:
        h.set_read(self.tmp_events_float, self.tmp_read_id)

    # Metadata duration and start_time should be integers, not floats
    print(tmp_file)
    with Fast5(tmp_file, 'r') as h:
        for key in ['duration', 'start_time']:
            self.assertIsInstance(h.attributes[key], int)

    with Fast5(tmp_file) as h:
        events = h.get_read()
        self.assertEqual(
            events['start'].dtype.descr[0][1], '<f8',
            'Writing float data did not give float data on read.')
        actual = events['start'][0]
        expected = self.tmp_events_float['start'][0]
        self.assertEqual(
            actual, expected,
            'Write float, data on read not scaled correctly, got {} not {}'.format(
                actual, expected))

    os.unlink(tmp_file)
def _write_read(self, read):
    """Write a single read (raw signal and metadata) into the currently open
    file under its own read group."""
    if read.raw.dtype != np.int16:
        raise TypeError('Raw data must be of type int16.')

    read_group = '/read_{}'.format(read.read_id['read_id'])
    Fast5._add_attrs_to_fh(
        self.current_file, {'run_id': read.tracking_id['run_id']},
        read_group, convert=str)

    # add all attributes
    for grp_name in ('tracking_id', 'context_tags'):
        # spec has all of these as str
        data = getattr(read, grp_name)
        Fast5._add_attrs_to_fh(
            self.current_file, data,
            '{}/{}'.format(read_group, grp_name), convert=str)
    Fast5._add_attrs_to_fh(
        self.current_file, read.channel_id,
        '{}/channel_id'.format(read_group))

    # add the data (and some more attrs)
    data_path = '{}/Raw'.format(read_group)
    read_id = Fast5._convert_meta_times(
        read.read_id, read.channel_id['sampling_rate'])
    read_id = Fast5.convert_raw_meta(read_id)
    Fast5._add_attrs_to_fh(self.current_file, read_id, data_path)
    signal_path = '{}/Signal'.format(data_path)
    self.current_file.create_dataset(
        signal_path, data=read.raw,
        compression='gzip', shuffle=True, dtype='i2')
def __init__(self, read_id, read_number, tracking_id, channel_id,
             context_tags, raw):
    self.read_id = read_id
    self.read_number = read_number
    self.tracking_id = tracking_id
    self.channel_id = channel_id
    self.context_tags = context_tags
    self.raw = raw

    # ensure typing and required fields
    self.channel_id = Fast5.convert_channel_id(self.channel_id)
    self.tracking_id = Fast5.convert_tracking_id(self.tracking_id)
def _process_read(self, read, read_metrics):
    """Write a completed read out to its own single-read fast5 file."""
    self.n_reads += 1
    filename = 'read_ch{}_file{}.fast5'.format(self.channel, self.n_reads)
    filename = add_prefix(filename, self.prefix)
    # add filename to read_metrics so it can be reported in summaries
    read_metrics['filename'] = filename
    filename = os.path.join(self.outpath, filename)

    channel_id = {
        'channel_number': self.channel,
        'range': read.channel_meta['range'],
        'digitisation': read.channel_meta['digitisation'],
        'offset': read.channel_meta['offset'],
        'sample_rate': read.channel_meta['sample_rate'],
        'sampling_rate': read.channel_meta['sample_rate'],
    }

    if read.events is None:
        raise RuntimeError('Read has no events data, cannot write fast5')
    events = read.events
    read_id = {
        'start_time': events['start'][0],
        'duration': events['start'][-1] + events['length'][-1] - events['start'][0],
        'read_number': self.n_reads,
        'start_mux': read_metrics['mux'],
        'read_id': read.meta['read_id'],
        'scaling_used': 1,
        'median_before': read_metrics['median_current_before'],
    }
    with Fast5.New(filename, 'a', tracking_id=read.tracking_meta,
                   context_tags=read.context_meta, channel_id=channel_id) as h:
        h.set_read(events, read_id)
        if read.raw is not None:
            h.set_raw(read.adc_raw)
def setUp(self):
    self.h = Fast5(os.path.join(
        os.path.dirname(__file__), 'data', self.test_file))

    # Use to create new temp files
    self.tmp_events_float = np.array(
        [(0.0, 1.0, 10.0, 2.0)],
        dtype=[(x, 'float') for x in ['start', 'length', 'mean', 'stdv']])
    self.tmp_events_int = np.array(
        [(0, 5000, 10.0, 2.0)],
        dtype=[('start', 'uint32'), ('length', 'uint32'),
               ('mean', 'float'), ('stdv', 'float')])
    self.tmp_channel_id = {
        'channel_number': 1,
        'range': 1.0,
        'digitisation': 1.0,
        'offset': 0.0,
        'sample_rate': 5000.0,
        'sampling_rate': 5000.0,
    }
    self.tmp_read_id = {
        'start_time': 0.0,
        'duration': 1.0,
        'read_number': 1,
        'start_mux': 1,
        'read_id': str(uuid4()),
        'scaling_used': 1,
    }
def _valid_file(base):
    """Yield open Fast5 handles for the given filenames, skipping (with a
    warning) any file that cannot be opened."""
    for fname in base:
        try:
            fh = Fast5(fname)
        except Exception as e:
            logger.warn('Could not open {}: {}.'.format(fname, e))
        else:
            yield fh
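# Illustrative usage sketch, not part of the original module: _valid_file yields
# already-open Fast5 handles and silently skips unreadable files, so the caller is
# responsible for closing each handle. '_example_count_valid' and 'fast5_paths' are
# hypothetical names introduced only for this example.
def _example_count_valid(fast5_paths):
    """Count how many of the given fast5 files can actually be opened."""
    n = 0
    for fh in _valid_file(fast5_paths):
        with fh:  # assumes the handle supports the h5py.File context manager for closing
            n += 1
    return n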
def write_read(self, read):
    """Write a read's raw data and metadata to a new single-read fast5 file."""
    if self.by_id:
        filename = '{}.fast5'.format(read.read_id['read_id'])
    else:
        filename = '{}read_ch{}_file{}.fast5'.format(
            self.prefix, read.channel_id['channel_number'], read.read_number)
    filename = os.path.join(self.out_path, filename)

    with Fast5.New(filename, 'a', tracking_id=read.tracking_id,
                   context_tags=read.context_tags,
                   channel_id=read.channel_id) as h:
        h.set_raw(read.raw, meta=read.read_id, read_number=read.read_number)
def test_065_write_int_read_float_data(self):
    tmp_file = os.path.join(tempfile.gettempdir(), str(uuid4()))
    with Fast5.New(tmp_file, 'a', channel_id=self.tmp_channel_id) as h:
        h.set_read(self.tmp_events_int, self.tmp_read_id)

    with Fast5(tmp_file) as h:
        events = h.get_read()
        self.assertEqual(
            events['start'].dtype.descr[0][1], '<f8',
            'Writing uint data did not give float data on read.')
        actual = events['start'][0]
        expected = self.tmp_events_float['start'][0]
        self.assertEqual(
            actual, expected,
            'Write uint, data on read not scaled correctly, got {} not {}'.format(
                actual, expected))

    os.unlink(tmp_file)
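# Minimal round-trip sketch of the API exercised by the two tests above; this is an
# illustration rather than repo code. channel_id and read_id are expected to look like
# the fixtures built in setUp; the temporary file path and the helper name
# '_example_event_round_trip' are assumptions of the example.
def _example_event_round_trip(events, channel_id, read_id):
    """Write an event table with Fast5.New/set_read and read it back with get_read."""
    tmp_file = os.path.join(tempfile.gettempdir(), str(uuid4()))
    with Fast5.New(tmp_file, 'a', channel_id=channel_id) as h:
        h.set_read(events, read_id)
    with Fast5(tmp_file) as h:
        recovered = h.get_read()
    os.unlink(tmp_file)
    return recovered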
def extract_channel_reads(source, output, prefix, flat, by_id, channel):
    """Extract strand reads for one channel from a bulk fast5 file and write
    each to its own single-read fast5 file."""
    if flat:
        out_path = output
    else:
        out_path = os.path.join(output, str(channel))
        os.makedirs(out_path)

    with BulkFast5(source) as src:
        raw_data = src.get_raw(channel, use_scaling=False)
        meta = src.get_metadata(channel)
        tracking_id = src.get_tracking_meta()
        context_tags = src.get_context_meta()
        channel_id = {
            'channel_number': channel,
            'range': meta['range'],
            'digitisation': meta['digitisation'],
            'offset': meta['offset'],
            'sample_rate': meta['sample_rate'],
            'sampling_rate': meta['sample_rate'],
        }

        median_before = None
        counter = 1
        for read_number, read in enumerate(src.get_reads(channel)):
            if median_before is None:
                median_before = read['median']
                continue
            if read['classification'] != 'strand':
                median_before = read['median']
            else:
                counter += 1
                start, length = read['read_start'], read['read_length']
                read_id = {
                    'start_time': read['read_start'],
                    'duration': read['read_length'],
                    'read_number': read_number,
                    'start_mux': src.get_mux(channel, raw_index=start, wells_only=True),
                    'read_id': read['read_id'],
                    'scaling_used': 1,
                    'median_before': median_before,
                }
                raw_slice = raw_data[start:start + length]
                if by_id:
                    filename = '{}.fast5'.format(read['read_id'])
                else:
                    filename = '{}_read_ch{}_file{}.fast5'.format(
                        prefix, channel, read_number)
                filename = os.path.join(out_path, filename)
                with Fast5.New(filename, 'a', tracking_id=tracking_id,
                               context_tags=context_tags,
                               channel_id=channel_id) as h:
                    h.set_raw(raw_slice, meta=read_id, read_number=read_number)

    return counter, channel
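# Hedged usage sketch, not part of the original source: extract_channel_reads handles a
# single channel, so a driver typically loops (or maps a worker pool) over the channels
# of interest. '_example_extract_all', 'bulk.fast5', 'extracted_reads' and the 1-512
# channel range below are placeholder assumptions for illustration only.
def _example_extract_all(source='bulk.fast5', output='extracted_reads'):
    """Run extract_channel_reads serially over channels 1-512 and tally the counters."""
    totals = {}
    for channel in range(1, 513):
        counter, ch = extract_channel_reads(
            source, output, prefix='extracted', flat=False, by_id=True, channel=channel)
        totals[ch] = counter
    return totals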
def basecall_file(fname=None, event_detect=True):
    """Read event data from file and print scrappie basecall.

    :param fname: filename to read data from (if not given, it is assumed to
        be given on the command line).
    :param event_detect: do event detection?

    :returns: tuple (basecall score, sequence).
    """
    is_main = False
    if fname is None:
        # called as entrypoint
        fname = sys.argv[1]
        is_main = True

    # magic numbers
    ed_params = {
        'window_lengths': [4, 8],
        'thresholds': [1.5, 9.0],
        'peak_height': 0.2,
    }

    with Fast5(fname) as fh:
        if event_detect:
            events = minknow_event_detect(
                fh.get_read(raw=True), fh.sample_rate, **ed_params)
        else:
            events = fh.get_read()

    events, _ = segment(events, section='template')
    results = basecall_events(events)
    if results is None:
        return None
    if is_main:
        print("{} score={}\n{}".format(fname, *results))
    else:
        return results
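# Example driver sketch (an illustration, not repo code): call basecall_file over every
# fast5 in a directory and collect the (score, sequence) results. '_example_basecall_dir'
# and 'read_dir' are hypothetical names; files for which basecalling fails return None
# and are skipped.
def _example_basecall_dir(read_dir):
    """Basecall each .fast5 file under read_dir, returning {filename: (score, seq)}."""
    calls = {}
    for fname in os.listdir(read_dir):
        if not fname.endswith('.fast5'):
            continue
        results = basecall_file(os.path.join(read_dir, fname), event_detect=True)
        if results is not None:
            calls[fname] = results
    return calls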
def setUpClass(self):
    """Create a read fast5 from scratch with previously simulated mapping
    and basecall 1D data."""
    print('* Fast5 Basecaller and Mapper')

    self.seq = 'CATTACGCATTTACCGAAACCTGGGCAAA'
    self.qstring = '!' * len(self.seq)
    self.model_file = 'example_template.model'
    self.events_file = 'example_template.events'
    self.bc_scale_file = 'example_template.bc_scale'
    self.bc_path_file = 'example_template.bc_path'
    self.map_scale_file = 'example_template.map_scale'
    self.map_path_file = 'example_template.map_path'
    self.map_post_file = 'example_template.map_post'
    self.ref_name = 'test_seq'

    # Open new file
    header = [
        'channel_number', 'offset', 'range', 'digitisation', 'sampling_rate'
    ]
    channel_id = {x: 0 for x in header}
    fakefile = tempfile.NamedTemporaryFile()
    self.fh = Fast5.New(fakefile.name, channel_id=channel_id, read='a')

    # load data to set within fast5 file
    self.model = np.genfromtxt(
        self.get_file_path(self.model_file), dtype=None, delimiter='\t', names=True)
    self.events = np.genfromtxt(
        self.get_file_path(self.events_file), dtype=None, delimiter='\t', names=True)

    # use namedtuple to imitate a Scale object
    Scale = namedtuple(
        'Scale', ['shift', 'scale', 'drift', 'var', 'scale_sd', 'var_sd'])

    bc_scale = Scale(*np.genfromtxt(
        self.get_file_path(self.bc_scale_file), dtype=None, delimiter='\t'))
    bc_path = np.genfromtxt(
        self.get_file_path(self.bc_path_file), dtype=np.int32, delimiter='\t')
    self.fh.set_basecall_data(
        self.events, bc_scale, bc_path, self.model, self.seq)

    map_scale = Scale(*np.genfromtxt(
        self.get_file_path(self.map_scale_file), dtype=None, delimiter='\t'))
    map_path = np.genfromtxt(
        self.get_file_path(self.map_path_file), dtype=np.int32, delimiter='\t')
    map_post = np.genfromtxt(
        self.get_file_path(self.map_post_file), delimiter='\t')

    n_states = len(self.seq) - len(self.model['kmer'][0]) + 1
    self.fh.set_mapping_data(
        self.events, map_scale, map_path, self.model, self.seq, self.ref_name)
    self.fh.set_mapping_data(
        self.events, map_scale, map_path, self.model, self.seq, self.ref_name,
        post=map_post)