def test_060_construct_new_file_checks(self):
        """Check that invalid ``Fast5.New`` calls raise ``IOError`` and that a valid call yields a writable file."""
        # A fresh, uniquely named path in the system temp directory.
        tmp_file = os.path.join(tempfile.gettempdir(), str(uuid4()))

        # NOTE(review): assertRaises leaves the block at the first exception,
        # so if the first call raises, the two calls below it are never
        # executed. Consider one assertRaises block per call -- TODO confirm
        # that each of them is expected to raise IOError on its own.
        with self.assertRaises(IOError):
            fh = Fast5.New(tmp_file, 'r')
            fh = Fast5.New(tmp_file, 'a', channel_id=self.tmp_channel_id)
            fh = Fast5.New(tmp_file, 'a', tracking_id=self.tmp_tracking_id)

        # This should be fine
        with Fast5.New(tmp_file,
                       tracking_id=self.tmp_tracking_id) as h:
            h.set_read(self.tmp_events_float, self.tmp_read_id)
    def test_067_write_raw_data(self):
        """Write raw data to a new file, then check that a second write raises ``TypeError``."""
        tmp_file = os.path.join(tempfile.gettempdir(), str(uuid4()))
        # First write: raw signal stored as read number 1 with the read metadata.
        with Fast5.New(tmp_file,
                       tracking_id=self.tmp_tracking_id) as h:
            h.set_raw(self.tmp_raw, meta=self.tmp_read_id, read_number=1)

        # NOTE(review): the body of the inner `with` below is missing from this
        # view, so the statement expected to raise TypeError is not shown and
        # the block is not valid Python as it stands -- restore it from the
        # original test before running.
        with self.assertRaises(TypeError):
            with Fast5.New(tmp_file,
                           tracking_id=self.tmp_tracking_id) as h:
    def test_061_write_read_float_data(self):
        """Write float event data to a new file and verify what is read back.

        Checks that the ``duration`` and ``start_time`` attributes are stored
        as ``int``, that the ``start`` column of the events read back has
        dtype ``'<f8'``, and that its first value equals the first ``start``
        value of the data that was written.
        """
        tmp_file = os.path.join(tempfile.gettempdir(), str(uuid4()))

        with Fast5.New(tmp_file,
                       tracking_id=self.tmp_tracking_id) as h:
            h.set_read(self.tmp_events_float, self.tmp_read_id)

        # Metadata duration and start_time should be integers, not floats
        with Fast5(tmp_file, 'r') as h:
            for key in ['duration', 'start_time']:
                self.assertIsInstance(h.attributes[key], int)

        with Fast5(tmp_file) as h:
            events = h.get_read()
            # Both assertions previously lacked their `self.assertEqual(`
            # opener, leaving dangling argument lists.
            self.assertEqual(
                events['start'].dtype.descr[0][1], '<f8',
                'Writing float data did not give float data on read.')
            actual = events['start'][0]
            expected = self.tmp_events_float['start'][0]
            self.assertEqual(
                actual, expected,
                'Write float, data on read not scaled correctly, got {} not {}'
                .format(actual, expected))

Example #4
    def setUpClass(self):
        """Create a read fast5 from scratch with previously simulated mapping and basecall 1D data.

        NOTE(review): several tokens appear to be missing from this method as
        shown (see the inline notes); it is not valid Python until they are
        restored. unittest also expects ``setUpClass`` to be a classmethod; no
        decorator is visible in this view -- confirm against the original file.
        """
        print('* Fast5 Basecaller and Mapper')

        # NOTE(review): `self.seq` is read here but never assigned in the
        # visible code -- confirm where it is defined.
        self.qstring = '!'*len(self.seq)
        self.model_file = 'example_template.model'
        # NOTE(review): the events file name is an empty string -- presumably a
        # file name was dropped here; verify.
        self.events_file = ''
        # NOTE(review): duplicate of the model_file assignment two lines above.
        self.model_file = 'example_template.model'
        self.bc_scale_file = 'example_template.bc_scale'
        self.bc_path_file = 'example_template.bc_path'
        self.map_scale_file = 'example_template.map_scale'
        self.map_path_file = 'example_template.map_path'
        self.map_post_file = 'example_template.map_post'
        self.ref_name = 'test_seq'

        # Open new file
        header = ['channel_number', 'offset', 'range', 'digitisation', 'sampling_rate']
        # Every channel metadata field is set to 0.
        channel_id = {x:0 for x in header}
        # NOTE(review): `tracking_id = tracking_id = {` is a redundant chained
        # assignment, and the dict literal is never closed before the next
        # statement -- a closing brace appears to be missing.
        tracking_id = tracking_id = {
            'exp_start_time': '1970-01-00T00:00:00Z',
            'run_id': 'a'*32,
            'flow_cell_id': 'FAH00000',
        fakefile = tempfile.NamedTemporaryFile()
        # NOTE(review): the first positional argument of Fast5.New is missing;
        # presumably the temporary file's name -- verify.
        self.fh = Fast5.New(, channel_id=channel_id, tracking_id=tracking_id, read='a')

        # load data to set within fast5 file
        self.model = np.genfromtxt(self.get_file_path(self.model_file), dtype=None, delimiter='\t', names=True)

        # NOTE(review): the next line looks like two statements fused together:
        # the kmer column cast to str, and a second genfromtxt load of the
        # events file whose assignment target is missing -- verify.
        self.model['kmer'] = self.model['kmer'].astype(str) = np.genfromtxt(self.get_file_path(self.events_file), dtype=None, delimiter='\t', names=True)

        # use namedtuple to imitate a Scale object
        Scale = namedtuple('Scale', ['shift', 'scale', 'drift', 'var', 'scale_sd', 'var_sd'])

        bc_scale = Scale(*np.genfromtxt(self.get_file_path(self.bc_scale_file), dtype=None, delimiter='\t'))
        bc_path = np.genfromtxt(self.get_file_path(self.bc_path_file), dtype=np.int32, delimiter='\t')

        # NOTE(review): first argument missing in the call below.
        self.fh.set_basecall_data(, bc_scale, bc_path, self.model, self.seq)

        map_scale = Scale(*np.genfromtxt(self.get_file_path(self.map_scale_file), dtype=None, delimiter='\t'))
        map_path = np.genfromtxt(self.get_file_path(self.map_path_file), dtype=np.int32, delimiter='\t')
        map_post = np.genfromtxt(self.get_file_path(self.map_post_file), delimiter='\t')

        # NOTE(review): n_states is computed but not used in the visible code.
        n_states = len(self.seq) - len(self.model['kmer'][0]) + 1
        # Mapping data is set twice: first without, then with, the posterior
        # matrix. NOTE(review): first argument missing in both calls below.
        self.fh.set_mapping_data(, map_scale, map_path, self.model, self.seq, self.ref_name)
        self.fh.set_mapping_data(, map_scale, map_path, self.model, self.seq, self.ref_name, post=map_post)
Example #5
    def test_065_write_int_read_float_data(self):
        """Write integer event data to a new file and verify it reads back as float.

        Checks that the ``start`` column of the events read back has dtype
        ``'<f8'`` and that its first value equals the first ``start`` value of
        the float reference events.
        """
        tmp_file = os.path.join(tempfile.gettempdir(), str(uuid4()))

        with Fast5.New(tmp_file, 'a', channel_id=self.tmp_channel_id) as h:
            h.set_read(self.tmp_events_int, self.tmp_read_id)

        with Fast5(tmp_file) as h:
            events = h.get_read()
            # Both assertEqual calls were previously left unclosed.
            self.assertEqual(
                events['start'].dtype.descr[0][1], '<f8',
                'Writing uint data did not give float data on read.')
            actual = events['start'][0]
            expected = self.tmp_events_float['start'][0]
            self.assertEqual(
                actual, expected,
                'Write unit, data on read not scaled correctly, got {} not {}'
                .format(actual, expected))

Example #6
def create_fast5(raw_data, fast5_filename):
    """Digitise a raw signal and write it to a new fast5 file as read 1.

    Args:
        raw_data: numeric samples; converted with ``np.array`` before use.
        fast5_filename: path handed to ``Fast5.New`` for the output file.

    Raises:
        ValueError: if ``raw_data`` is empty (``min``/``max`` of nothing).
    """
    raw_data = np.array(raw_data)

    # Example of how to digitise data: the span of the signal, padded by one
    # unit on either side, is cut into `digitisation` equal-width bins.
    start, stop = int(min(raw_data - 1)), int(max(raw_data + 1))
    rng = stop - start
    digitisation = 8192.0
    bins = np.arange(start, stop, rng / digitisation)
    # np.int16 is required, the library will refuse to write anything other
    raw_data_binned = np.digitize(raw_data, bins).astype(np.int16)

    # The following are required meta data
    channel_id = {
        'digitisation': digitisation,
        'offset': 0,
        'range': rng,
        'sampling_rate': 4000,
        'channel_number': 1,
    }
    read_id = {
        'start_time': 0,
        'duration': len(raw_data),
        'read_number': 1,
        'start_mux': 1,
        'read_id': str(uuid4()),
        'scaling_used': 1,
        'median_before': 0,
    }
    tracking_id = {
        'exp_start_time': '1970-01-01T00:00:00Z',
        'run_id': str(uuid4()).replace('-', ''),
        'flow_cell_id': 'FAH00000',
    }
    context_tags = {}

    # Pass every metadata group that was built above; previously tracking_id
    # and context_tags were constructed but never handed to the file.
    with Fast5.New(fast5_filename,
                   tracking_id=tracking_id,
                   context_tags=context_tags,
                   channel_id=channel_id) as h:
        h.set_raw(raw_data_binned, meta=read_id, read_number=1)
Example #7
def digitize_write(raw_data, read_id, params):
    """Round a raw signal to int16 and write it to its own fast5 file as read 1.

    The output path is ``params.fast5_path + read_id + '.fast5'`` (plain
    string concatenation, so ``read_id`` must be a ``str`` and
    ``fast5_path`` must already end with a path separator if one is wanted).

    Args:
        raw_data: numeric signal supporting array arithmetic (``raw_data - 1``).
        read_id: identifier used for both the file name and the read metadata.
        params: object exposing a ``fast5_path`` string attribute.

    Raises:
        ValueError: if ``raw_data`` is empty (``min``/``max`` of nothing).
    """
    digitisation = 8192.0
    # Range of the signal padded by one unit on either side.
    start, stop = int(min(raw_data - 1)), int(max(raw_data + 1))
    rng = stop - start
    # np.int16 is required, the library will refuse to write anything other.
    # The signal is rounded to the nearest integer rather than binned.
    signal = np.round(raw_data).astype(np.int16)
    filename = params.fast5_path + read_id + '.fast5'

    # The following are required meta data
    channel_id = {
        'digitisation': digitisation,
        'offset': 0,
        'range': rng,
        'sampling_rate': 4000,
        'channel_number': 1,
    }
    # Kept under a separate name so the `read_id` argument is not shadowed.
    read_meta = {
        'start_time': 0,
        'duration': len(signal),
        'read_number': 1,
        'start_mux': 1,
        'read_id': str(read_id),
        'scaling_used': 1,
        'median_before': 0,
    }
    tracking_id = {
        'exp_start_time': '1970-01-01T00:00:00Z',
        'run_id': str(uuid4()).replace('-', ''),
        'flow_cell_id': 'FAH00000',
    }
    context_tags = {}

    # Pass every metadata group that was built above; previously tracking_id
    # and context_tags were constructed but never handed to the file.
    with Fast5.New(filename,
                   tracking_id=tracking_id,
                   context_tags=context_tags,
                   channel_id=channel_id) as h:
        h.set_raw(signal, meta=read_meta, read_number=1)