def test_060_construct_new_file_checks(self):
    tmp_file = os.path.join(tempfile.gettempdir(), str(uuid4()))
    with self.assertRaises(IOError):
        fh = Fast5.New(tmp_file, 'r')
        fh = Fast5.New(tmp_file, 'a', channel_id=self.tmp_channel_id)
        fh = Fast5.New(tmp_file, 'a', tracking_id=self.tmp_tracking_id)

    # This should be fine
    with Fast5.New(tmp_file, 'a', channel_id=self.tmp_channel_id, tracking_id=self.tmp_tracking_id) as h:
        h.set_read(self.tmp_events_float, self.tmp_read_id)
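# The tests in this file rely on self.tmp_* fixtures that are not shown in this
# section. Below is a minimal sketch of a setUpClass that could define them; the
# field names follow the channel_id/tracking_id/read metadata used elsewhere in
# this file, but the concrete values (event dtype, raw samples, read id) are
# illustrative assumptions, not the library's own test data.
@classmethod
def setUpClass(cls):
    cls.tmp_channel_id = {
        'channel_number': 1, 'offset': 0, 'range': 8192.0,
        'digitisation': 8192.0, 'sampling_rate': 4000,
    }
    cls.tmp_tracking_id = {
        'exp_start_time': '1970-01-01T00:00:00Z',
        'run_id': 'a' * 32,
        'flow_cell_id': 'FAH00000',
    }
    cls.tmp_read_id = {
        'start_time': 0, 'duration': 10, 'read_number': 1, 'start_mux': 1,
        'read_id': str(uuid4()), 'scaling_used': 1, 'median_before': 0,
    }
    # Event data in float (seconds) and integer (samples) forms, plus raw samples.
    cls.tmp_events_float = np.array(
        [(0.0, 0.001, 100.0, 1.0)],
        dtype=[('start', float), ('length', float), ('mean', float), ('stdv', float)])
    cls.tmp_events_int = np.array(
        [(0, 4, 100.0, 1.0)],
        dtype=[('start', np.uint32), ('length', np.uint32), ('mean', float), ('stdv', float)])
    cls.tmp_raw = np.zeros(10, dtype=np.int16)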
def test_067_write_raw_data(self):
    tmp_file = os.path.join(tempfile.gettempdir(), str(uuid4()))
    with Fast5.New(tmp_file, 'a', channel_id=self.tmp_channel_id, tracking_id=self.tmp_tracking_id) as h:
        h.set_raw(self.tmp_raw, meta=self.tmp_read_id, read_number=1)

    with self.assertRaises(TypeError):
        with Fast5.New(tmp_file, 'a', channel_id=self.tmp_channel_id, tracking_id=self.tmp_tracking_id) as h:
            h.set_raw(self.tmp_raw.astype(float), meta=self.tmp_read_id, read_number=1)
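# set_raw() rejects floating point samples (the TypeError asserted above), and
# the comments in create_fast5()/digitize_write() further down note that
# np.int16 is expected. A minimal helper sketch for coercing float samples
# before writing, assuming plain rounding is acceptable for the data in hand:
def as_int16_raw(samples):
    """Round float samples and cast to the int16 dtype expected by set_raw()."""
    return np.round(np.asarray(samples)).astype(np.int16)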
def test_061_write_read_float_data(self):
    tmp_file = os.path.join(tempfile.gettempdir(), str(uuid4()))
    with Fast5.New(tmp_file, 'a', channel_id=self.tmp_channel_id, tracking_id=self.tmp_tracking_id) as h:
        h.set_read(self.tmp_events_float, self.tmp_read_id)

    # Metadata duration and start_time should be integers, not floats
    with Fast5(tmp_file, 'r') as h:
        for key in ['duration', 'start_time']:
            self.assertIsInstance(h.attributes[key], int)

    with Fast5(tmp_file) as h:
        events = h.get_read()
        self.assertEqual(
            events['start'].dtype.descr[0][1], '<f8',
            'Writing float data did not give float data on read.')
        actual = events['start'][0]
        expected = self.tmp_events_float['start'][0]
        self.assertEqual(
            actual, expected,
            'Write float, data on read not scaled correctly, got {} not {}'.format(actual, expected))
    os.unlink(tmp_file)
@classmethod
def setUpClass(self):
    """Create a read fast5 from scratch with previously simulated mapping and basecall 1D data."""
    print('* Fast5 Basecaller and Mapper')

    self.seq = 'CATTACGCATTTACCGAAACCTGGGCAAA'
    self.qstring = '!' * len(self.seq)
    self.model_file = 'example_template.model'
    self.events_file = 'example_template.events'
    self.bc_scale_file = 'example_template.bc_scale'
    self.bc_path_file = 'example_template.bc_path'
    self.map_scale_file = 'example_template.map_scale'
    self.map_path_file = 'example_template.map_path'
    self.map_post_file = 'example_template.map_post'
    self.ref_name = 'test_seq'

    # Open new file
    header = ['channel_number', 'offset', 'range', 'digitisation', 'sampling_rate']
    channel_id = {x: 0 for x in header}
    tracking_id = {
        'exp_start_time': '1970-01-00T00:00:00Z',
        'run_id': 'a' * 32,
        'flow_cell_id': 'FAH00000',
    }
    fakefile = tempfile.NamedTemporaryFile()
    self.fh = Fast5.New(fakefile.name, channel_id=channel_id, tracking_id=tracking_id, read='a')

    # Load data to set within the fast5 file
    self.model = np.genfromtxt(self.get_file_path(self.model_file), dtype=None, delimiter='\t', names=True)
    self.model['kmer'] = self.model['kmer'].astype(str)
    self.events = np.genfromtxt(self.get_file_path(self.events_file), dtype=None, delimiter='\t', names=True)

    # Use a namedtuple to imitate a Scale object
    Scale = namedtuple('Scale', ['shift', 'scale', 'drift', 'var', 'scale_sd', 'var_sd'])
    bc_scale = Scale(*np.genfromtxt(self.get_file_path(self.bc_scale_file), dtype=None, delimiter='\t'))
    bc_path = np.genfromtxt(self.get_file_path(self.bc_path_file), dtype=np.int32, delimiter='\t')
    self.fh.set_basecall_data(self.events, bc_scale, bc_path, self.model, self.seq)

    map_scale = Scale(*np.genfromtxt(self.get_file_path(self.map_scale_file), dtype=None, delimiter='\t'))
    map_path = np.genfromtxt(self.get_file_path(self.map_path_file), dtype=np.int32, delimiter='\t')
    map_post = np.genfromtxt(self.get_file_path(self.map_post_file), delimiter='\t')
    n_states = len(self.seq) - len(self.model['kmer'][0]) + 1
    self.fh.set_mapping_data(self.events, map_scale, map_path, self.model, self.seq, self.ref_name)
    self.fh.set_mapping_data(self.events, map_scale, map_path, self.model, self.seq, self.ref_name, post=map_post)
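# For reference, n_states above is the number of k-mer positions along the
# reference sequence. A tiny worked example, assuming the example template
# model is a 5-mer model (the real kmer length comes from the model file via
# len(self.model['kmer'][0]), so 5 here is only an illustrative assumption):
seq = 'CATTACGCATTTACCGAAACCTGGGCAAA'   # 29 bases, as in setUpClass above
kmer_length = 5                          # assumed for illustration
n_states = len(seq) - kmer_length + 1    # 29 - 5 + 1 == 25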
def test_065_write_int_read_float_data(self):
    tmp_file = os.path.join(tempfile.gettempdir(), str(uuid4()))
    with Fast5.New(tmp_file, 'a', channel_id=self.tmp_channel_id) as h:
        h.set_read(self.tmp_events_int, self.tmp_read_id)

    with Fast5(tmp_file) as h:
        events = h.get_read()
        self.assertEqual(
            events['start'].dtype.descr[0][1], '<f8',
            'Writing uint data did not give float data on read.')
        actual = events['start'][0]
        expected = self.tmp_events_float['start'][0]
        self.assertEqual(
            actual, expected,
            'Write uint, data on read not scaled correctly, got {} not {}'.format(actual, expected))
    os.unlink(tmp_file)
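# The two read-back tests above check that integer event times written in
# samples come back as floats in seconds. A minimal sketch of the presumed
# conversion, assuming the file's sampling_rate is the scaling factor (the
# exact behaviour belongs to the library and is not defined here):
def samples_to_seconds(start_samples, sampling_rate):
    """Convert an event start index in samples to a time in seconds."""
    return float(start_samples) / float(sampling_rate)

# e.g. with sampling_rate = 4000 Hz, sample index 4 maps to 0.001 s.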
def create_fast5(raw_data, fast5_filename):
    raw_data = np.array(raw_data)

    # Create the fast5 (following https://nanoporetech.github.io/fast5_research/examples.html).
    # Example of how to digitise the data.
    start, stop = int(min(raw_data - 1)), int(max(raw_data + 1))
    rng = stop - start
    digitisation = 8192.0
    bins = np.arange(start, stop, rng / digitisation)
    # np.int16 is required; the library will refuse to write anything else
    raw_data_binned = np.digitize(raw_data, bins).astype(np.int16)

    # The following metadata is required
    channel_id = {
        'digitisation': digitisation,
        'offset': 0,
        'range': rng,
        'sampling_rate': 4000,
        'channel_number': 1,
    }
    read_id = {
        'start_time': 0,
        'duration': len(raw_data),
        'read_number': 1,
        'start_mux': 1,
        'read_id': str(uuid4()),
        'scaling_used': 1,
        'median_before': 0,
    }
    tracking_id = {
        'exp_start_time': '1970-01-01T00:00:00Z',
        'run_id': str(uuid4()).replace('-', ''),
        'flow_cell_id': 'FAH00000',
    }
    context_tags = {}

    with Fast5.New(fast5_filename, 'w', tracking_id=tracking_id, context_tags=context_tags, channel_id=channel_id) as h:
        h.set_raw(raw_data_binned, meta=read_id, read_number=1)
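# A minimal usage sketch for create_fast5() above. The synthetic signal and the
# output filename are illustrative assumptions, not part of the original
# example; it assumes the same imports the snippets above use (numpy as np,
# uuid4, Fast5).
t = np.linspace(0, 1, 4000)
fake_signal = 100 + 20 * np.sin(2 * np.pi * 5 * t) + np.random.normal(0, 2, t.size)
create_fast5(fake_signal, 'example_read.fast5')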
def digitize_write(raw_data, read_id, params):
    digitisation = 8192.0
    start, stop = int(min(raw_data - 1)), int(max(raw_data + 1))
    rng = stop - start
    bins = np.arange(start, stop, rng / digitisation)  # only needed for the digitize path below
    # np.int16 is required; the library will refuse to write anything else
    #raw_data = np.digitize(raw_data, bins).astype(np.int16)
    raw_data = np.round(raw_data)
    raw_data = raw_data.astype(np.int16)

    filename = params.fast5_path + read_id + '.fast5'

    # The following metadata is required; use a separate name so the read_id
    # string argument is not shadowed by the metadata dict.
    channel_id = {
        'digitisation': digitisation,
        'offset': 0,
        'range': rng,
        'sampling_rate': 4000,
        'channel_number': 1,
    }
    read_meta = {
        'start_time': 0,
        'duration': len(raw_data),
        'read_number': 1,
        'start_mux': 1,
        'read_id': str(read_id),
        'scaling_used': 1,
        'median_before': 0,
    }
    tracking_id = {
        'exp_start_time': '1970-01-01T00:00:00Z',
        'run_id': str(uuid4()).replace('-', ''),
        'flow_cell_id': 'FAH00000',
    }
    context_tags = {}

    with Fast5.New(filename, 'w', tracking_id=tracking_id, context_tags=context_tags, channel_id=channel_id) as h:
        h.set_raw(raw_data, meta=read_meta, read_number=1)
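# A minimal usage sketch for digitize_write() above. The SimpleNamespace params
# object and the output directory are hypothetical stand-ins for whatever
# params actually is in the surrounding code; only its fast5_path attribute is
# used by the function.
from types import SimpleNamespace

params = SimpleNamespace(fast5_path='/tmp/fast5_out/')
signal = np.random.normal(100, 10, 4000)  # fake raw samples
digitize_write(signal, str(uuid4()), params)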