def _process_read(self, read, read_metrics):
    """Write a single read, with its event data, to its own .fast5 file."""
    self.n_reads += 1
    filename = 'read_ch{}_file{}.fast5'.format(self.channel, self.n_reads)
    filename = add_prefix(filename, self.prefix)
    # add filename to read_metrics so it can be reported in summaries
    read_metrics['filename'] = filename
    filename = os.path.join(self.outpath, filename)

    channel_id = {
        'channel_number': self.channel,
        'range': read.channel_meta['range'],
        'digitisation': read.channel_meta['digitisation'],
        'offset': read.channel_meta['offset'],
        # both spellings of the sample rate key are stored
        'sample_rate': read.channel_meta['sample_rate'],
        'sampling_rate': read.channel_meta['sample_rate'],
    }

    if read.events is None:
        raise RuntimeError('Read has no events data, cannot write fast5')
    events = read.events
    read_id = {
        'start_time': events['start'][0],
        'duration': events['start'][-1] + events['length'][-1] - events['start'][0],
        'read_number': self.n_reads,
        'start_mux': read_metrics['mux'],
        'read_id': read.meta['read_id'],
        'scaling_used': 1,
        'median_before': read_metrics['median_current_before'],
    }

    with Fast5.New(filename, 'a', tracking_id=read.tracking_meta,
                   context_tags=read.context_meta, channel_id=channel_id) as h:
        h.set_read(events, read_id)
        if read.raw is not None:
            h.set_raw(read.adc_raw)
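# Usage sketch (hypothetical): _process_read expects a read object exposing
# .channel_meta, .tracking_meta, .context_meta, .meta, .events, .raw and
# .adc_raw, plus a read_metrics dict with 'mux' and 'median_current_before'
# keys. The writer instance and the values below are illustrative only:
#
#   read_metrics = {'mux': 1, 'median_current_before': 248.5}
#   writer._process_read(read, read_metrics)
#   # the chosen filename is reported back through the dict for summaries:
#   print(read_metrics['filename'])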
def test_060_write_read_float_data(self):
    tmp_file = os.path.join(tempfile.gettempdir(), str(uuid4()))
    with Fast5.New(tmp_file, 'a', channel_id=self.tmp_channel_id) as h:
        h.set_read(self.tmp_events_float, self.tmp_read_id)

    # Metadata duration and start_time should be integers, not floats
    with Fast5(tmp_file, 'r') as h:
        for key in ['duration', 'start_time']:
            self.assertIsInstance(h.attributes[key], int)

    with Fast5(tmp_file) as h:
        events = h.get_read()
        self.assertEqual(
            events['start'].dtype.descr[0][1], '<f8',
            'Writing float data did not give float data on read.')
        actual = events['start'][0]
        expected = self.tmp_events_float['start'][0]
        self.assertEqual(
            actual, expected,
            'Write float, data on read not scaled correctly, got {} not {}'
            .format(actual, expected))
    os.unlink(tmp_file)
def write_read(self, read):
    if self.by_id:
        filename = '{}.fast5'.format(read.read_id['read_id'])
    else:
        filename = '{}read_ch{}_file{}.fast5'.format(
            self.prefix, read.channel_id['channel_number'], read.read_number)
    filename = os.path.join(self.out_path, filename)

    with Fast5.New(filename, 'a', tracking_id=read.tracking_id,
                   context_tags=read.context_tags,
                   channel_id=read.channel_id) as h:
        h.set_raw(read.raw, meta=read.read_id, read_number=read.read_number)
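# A minimal usage sketch: the ReadWriter class name and the loop below are
# hypothetical stand-ins; write_read only assumes the attributes referenced
# above (.read_id, .channel_id, .tracking_id, .context_tags, .raw,
# .read_number).
#
#   writer = ReadWriter(out_path='reads', prefix='sample_', by_id=False)
#   for read in reads:
#       writer.write_read(read)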
def extract_channel_reads(source, output, prefix, flat, by_id, channel):
    if flat:
        out_path = output
    else:
        out_path = os.path.join(output, str(channel))
        os.makedirs(out_path)

    with BulkFast5(source) as src:
        raw_data = src.get_raw(channel, use_scaling=False)
        meta = src.get_metadata(channel)
        tracking_id = src.get_tracking_meta()
        context_tags = src.get_context_meta()
        channel_id = {
            'channel_number': channel,
            'range': meta['range'],
            'digitisation': meta['digitisation'],
            'offset': meta['offset'],
            'sample_rate': meta['sample_rate'],
            'sampling_rate': meta['sample_rate'],
        }

        median_before = None
        counter = 1
        for read_number, read in enumerate(src.get_reads(channel)):
            # the first read only seeds median_before
            if median_before is None:
                median_before = read['median']
                continue
            if read['classification'] != 'strand':
                # remember the median current of the preceding non-strand read
                median_before = read['median']
            else:
                counter += 1
                start, length = read['read_start'], read['read_length']
                read_id = {
                    'start_time': read['read_start'],
                    'duration': read['read_length'],
                    'read_number': read_number,
                    'start_mux': src.get_mux(channel, raw_index=start,
                                             wells_only=True),
                    'read_id': read['read_id'],
                    'scaling_used': 1,
                    'median_before': median_before,
                }
                raw_slice = raw_data[start:start + length]
                if by_id:
                    filename = '{}.fast5'.format(read['read_id'])
                else:
                    filename = '{}_read_ch{}_file{}.fast5'.format(
                        prefix, channel, read_number)
                filename = os.path.join(out_path, filename)
                with Fast5.New(filename, 'a', tracking_id=tracking_id,
                               context_tags=context_tags,
                               channel_id=channel_id) as h:
                    h.set_raw(raw_slice, meta=read_id,
                              read_number=read_number)
    return counter, channel
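# Example driver (a sketch only; 'bulk.fast5', the output paths and the
# channel numbers are placeholders, not files shipped with this code):
#
#   for ch in (1, 2, 3):
#       n_reads, ch_done = extract_channel_reads(
#           source='bulk.fast5', output='reads', prefix='sample',
#           flat=False, by_id=False, channel=ch)
#       print('channel {}: extracted {} strand reads'.format(ch_done, n_reads))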
def test_065_write_int_read_float_data(self):
    tmp_file = os.path.join(tempfile.gettempdir(), str(uuid4()))
    with Fast5.New(tmp_file, 'a', channel_id=self.tmp_channel_id) as h:
        h.set_read(self.tmp_events_int, self.tmp_read_id)

    with Fast5(tmp_file) as h:
        events = h.get_read()
        self.assertEqual(
            events['start'].dtype.descr[0][1], '<f8',
            'Writing uint data did not give float data on read.')
        actual = events['start'][0]
        expected = self.tmp_events_float['start'][0]
        self.assertEqual(
            actual, expected,
            'Write uint, data on read not scaled correctly, got {} not {}'
            .format(actual, expected))
    os.unlink(tmp_file)
@classmethod
def setUpClass(self):
    """Create a read fast5 from scratch with previously simulated mapping
    and basecall 1D data.
    """
    print('* Fast5 Basecaller and Mapper')

    self.seq = 'CATTACGCATTTACCGAAACCTGGGCAAA'
    self.qstring = '!' * len(self.seq)
    self.model_file = 'example_template.model'
    self.events_file = 'example_template.events'
    self.bc_scale_file = 'example_template.bc_scale'
    self.bc_path_file = 'example_template.bc_path'
    self.map_scale_file = 'example_template.map_scale'
    self.map_path_file = 'example_template.map_path'
    self.map_post_file = 'example_template.map_post'
    self.ref_name = 'test_seq'

    # Open new file
    header = ['channel_number', 'offset', 'range', 'digitisation',
              'sampling_rate']
    channel_id = {x: 0 for x in header}
    # keep a reference so the temporary file outlives this method
    self.fakefile = tempfile.NamedTemporaryFile()
    self.fh = Fast5.New(self.fakefile.name, channel_id=channel_id, read='a')

    # load data to set within fast5 file
    self.model = np.genfromtxt(self.get_file_path(self.model_file),
                               dtype=None, delimiter='\t', names=True)
    self.events = np.genfromtxt(self.get_file_path(self.events_file),
                                dtype=None, delimiter='\t', names=True)

    # use namedtuple to imitate a Scale object
    Scale = namedtuple(
        'Scale', ['shift', 'scale', 'drift', 'var', 'scale_sd', 'var_sd'])
    bc_scale = Scale(*np.genfromtxt(self.get_file_path(self.bc_scale_file),
                                    dtype=None, delimiter='\t'))
    bc_path = np.genfromtxt(self.get_file_path(self.bc_path_file),
                            dtype=np.int32, delimiter='\t')
    self.fh.set_basecall_data(self.events, bc_scale, bc_path, self.model,
                              self.seq)

    map_scale = Scale(*np.genfromtxt(self.get_file_path(self.map_scale_file),
                                     dtype=None, delimiter='\t'))
    map_path = np.genfromtxt(self.get_file_path(self.map_path_file),
                             dtype=np.int32, delimiter='\t')
    map_post = np.genfromtxt(self.get_file_path(self.map_post_file),
                             delimiter='\t')
    # number of mapping states implied by sequence and kmer length
    n_states = len(self.seq) - len(self.model['kmer'][0]) + 1
    self.fh.set_mapping_data(self.events, map_scale, map_path, self.model,
                             self.seq, self.ref_name)
    self.fh.set_mapping_data(self.events, map_scale, map_path, self.model,
                             self.seq, self.ref_name, post=map_post)