Example #1
    def test_060_write_read_float_data(self):
        tmp_file = os.path.join(tempfile.gettempdir(), str(uuid4()))

        with Fast5.New(tmp_file, 'a', channel_id=self.tmp_channel_id) as h:
            h.set_read(self.tmp_events_float, self.tmp_read_id)

        print(tmp_file)
        # Metadata duration and start_time should be integers, not floats
        with Fast5(tmp_file, 'r') as h:
            for key in ['duration', 'start_time']:
                self.assertIsInstance(h.attributes[key], int)

        with Fast5(tmp_file) as h:
            events = h.get_read()
            self.assertEqual(
                events['start'].dtype.descr[0][1], '<f8',
                'Writing float data did not give float data on read.')
            actual = events['start'][0]
            expected = self.tmp_events_float['start'][0]
            self.assertEqual(
                actual, expected,
                'Write float, data on read not scaled correctly, got {} not {}'
                .format(actual, expected))

        os.unlink(tmp_file)
Example #2
    def _write_read(self, read):
        if read.raw.dtype != np.int16:
            raise TypeError('Raw data must be of type int16.')

        read_group = '/read_{}'.format(read.read_id['read_id'])
        Fast5._add_attrs_to_fh(self.current_file,
                               {'run_id': read.tracking_id['run_id']},
                               read_group,
                               convert=str)

        # add all attributes
        for grp_name in ('tracking_id', 'context_tags'):
            # spec has all of these as str
            data = getattr(read, grp_name)
            Fast5._add_attrs_to_fh(self.current_file,
                                   data,
                                   '{}/{}'.format(read_group, grp_name),
                                   convert=str)
        Fast5._add_attrs_to_fh(self.current_file, read.channel_id,
                               '{}/channel_id'.format(read_group))

        # add the data (and some more attrs)
        data_path = '{}/Raw'.format(read_group)
        read_id = Fast5._convert_meta_times(read.read_id,
                                            read.channel_id['sampling_rate'])
        read_id = Fast5.convert_raw_meta(read_id)
        Fast5._add_attrs_to_fh(self.current_file, read_id, data_path)
        signal_path = '{}/Signal'.format(data_path)
        self.current_file.create_dataset(signal_path,
                                         data=read.raw,
                                         compression='gzip',
                                         shuffle=True,
                                         dtype='i2')
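
For orientation, the HDF5 layout this method writes, reconstructed from the paths built above:

    /read_<read_id>/          'run_id' attribute (converted to str)
        tracking_id/          attributes, converted to str
        context_tags/         attributes, converted to str
        channel_id/           attributes
        Raw/                  read metadata after time and raw conversion
            Signal            int16 dataset, gzip-compressed with shuffle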
Example #3
    def __init__(self, read_id, read_number, tracking_id, channel_id, context_tags, raw):
        self.read_id = read_id
        self.read_number = read_number
        self.tracking_id = tracking_id
        self.channel_id = channel_id
        self.context_tags = context_tags
        self.raw = raw

        # ensure typing and required fields
        self.channel_id = Fast5.convert_channel_id(self.channel_id)
        self.tracking_id = Fast5.convert_tracking_id(self.tracking_id)
Example #4
    def _process_read(self, read, read_metrics):
        self.n_reads += 1

        filename = 'read_ch{}_file{}.fast5'.format(self.channel, self.n_reads)
        filename = add_prefix(filename, self.prefix)
        # add filename to read_metrics so it can be reported in summaries
        read_metrics['filename'] = filename
        filename = os.path.join(self.outpath, filename)

        channel_id = {
            'channel_number': self.channel,
            'range': read.channel_meta['range'],
            'digitisation': read.channel_meta['digitisation'],
            'offset': read.channel_meta['offset'],
            'sample_rate': read.channel_meta['sample_rate'],
            'sampling_rate': read.channel_meta['sample_rate']
        }
        if read.events is None:
            raise RuntimeError('Read has no events data, cannot write fast5')
        events = read.events
        read_id = {
            'start_time': events['start'][0],
            'duration': events['start'][-1] + events['length'][-1] - events['start'][0],
            'read_number': self.n_reads,
            'start_mux': read_metrics['mux'],
            'read_id': read.meta['read_id'],
            'scaling_used': 1,
            'median_before': read_metrics['median_current_before'],
        }

        with Fast5.New(filename, 'a', tracking_id=read.tracking_meta,
                       context_tags=read.context_meta, channel_id=channel_id) as h:
            h.set_read(events, read_id)
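            # note: presence is checked on read.raw, but the values written
            # below are read.adc_raw (presumably the unscaled ADC samples)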
            if read.raw is not None:
                h.set_raw(read.adc_raw)
Example #5
    def setUp(self):
        self.h = Fast5(
            os.path.join(os.path.dirname(__file__), 'data', self.test_file))

        # Use to create new temp files
        self.tmp_events_float = np.array(
            [(0.0, 1.0, 10.0, 2.0)],
            dtype=[(x, 'float') for x in ['start', 'length', 'mean', 'stdv']])
        self.tmp_events_int = np.array([(0, 5000, 10.0, 2.0)],
                                       dtype=[('start', 'uint32'),
                                              ('length', 'uint32'),
                                              ('mean', 'float'),
                                              ('stdv', 'float')])
        self.tmp_channel_id = {
            'channel_number': 1,
            'range': 1.0,
            'digitisation': 1.0,
            'offset': 0.0,
            'sample_rate': 5000.0,
            'sampling_rate': 5000.0
        }
        self.tmp_read_id = {
            'start_time': 0.0,
            'duration': 1.0,
            'read_number': 1,
            'start_mux': 1,
            'read_id': str(uuid4()),
            'scaling_used': 1
        }
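
Note how the two event fixtures above mirror one another: with sampling_rate = 5000.0, the float events (start 0.0 s, length 1.0 s) and the integer events (start 0, length 5000 samples) describe the same read. A minimal sketch of the implied seconds-to-samples conversion (the helper _convert_meta_times appears in Example #2; the exact rounding behaviour is an assumption):

    sampling_rate = 5000.0
    length_seconds = 1.0
    length_samples = int(round(length_seconds * sampling_rate))  # 5000, matching tmp_events_int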
Example #6
def _valid_file(base):
    """Yield open Fast5 handles for filenames that can be opened."""
    for fname in base:
        try:
            fh = Fast5(fname)
        except Exception as e:
            logger.warning('Could not open {}: {}'.format(fname, e))
        else:
            yield fh
Example #7
    def write_read(self, read):
        if self.by_id:
            filename = '{}.fast5'.format(read.read_id['read_id'])
        else:
            filename = '{}read_ch{}_file{}.fast5'.format(
                self.prefix, read.channel_id['channel_number'], read.read_number
            )
        filename = os.path.join(self.out_path, filename)
        with Fast5.New(filename, 'a', tracking_id=read.tracking_id,
                       context_tags=read.context_tags,
                       channel_id=read.channel_id) as h:
            h.set_raw(read.raw, meta=read.read_id, read_number=read.read_number)
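
For reference, raw data written this way can be recovered with the read-side calls seen elsewhere on this page; a minimal sketch (the filename is illustrative):

    with Fast5('read_ch1_file1.fast5') as h:
        raw = h.get_read(raw=True)   # raw signal, as in Example #10
        meta = h.attributes          # metadata attributes, as in Example #1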
Example #8
    def test_065_write_int_read_float_data(self):
        tmp_file = os.path.join(tempfile.gettempdir(), str(uuid4()))

        with Fast5.New(tmp_file, 'a', channel_id=self.tmp_channel_id) as h:
            h.set_read(self.tmp_events_int, self.tmp_read_id)

        with Fast5(tmp_file) as h:
            events = h.get_read()
            self.assertEqual(
                events['start'].dtype.descr[0][1], '<f8',
                'Writing uint data did not give float data on read.')
            actual = events['start'][0]
            expected = self.tmp_events_float['start'][0]
            self.assertEqual(
                actual, expected,
                'Write uint, data on read not scaled correctly, got {} not {}'
                .format(actual, expected))

        os.unlink(tmp_file)
Example #9
def extract_channel_reads(source, output, prefix, flat, by_id, channel):

    if flat:
        out_path = output
    else:
        out_path = os.path.join(output, str(channel))
        os.makedirs(out_path)

    with BulkFast5(source) as src:
        raw_data = src.get_raw(channel, use_scaling=False)
        meta = src.get_metadata(channel)
        tracking_id = src.get_tracking_meta()
        context_tags = src.get_context_meta()
        channel_id = {
            'channel_number': channel,
            'range': meta['range'],
            'digitisation': meta['digitisation'],
            'offset': meta['offset'],
            'sample_rate': meta['sample_rate'],
            'sampling_rate': meta['sample_rate']
        }
        median_before = None
        counter = 1
        for read_number, read in enumerate(src.get_reads(channel)):
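            # median_before carries the median current of the preceding
            # non-strand read; the first read only seeds this value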
            if median_before is None:
                median_before = read['median']
                continue

            if read['classification'] != 'strand':
                median_before = read['median']
            else:
                counter += 1
                start, length = read['read_start'], read['read_length']
                read_id = {
                    'start_time': read['read_start'],
                    'duration': read['read_length'],
                    'read_number': read_number,
                    'start_mux': src.get_mux(channel, raw_index=start, wells_only=True),
                    'read_id': read['read_id'],
                    'scaling_used': 1,
                    'median_before': median_before
                }

                raw_slice = raw_data[start:start+length]
                if by_id:
                    filename = '{}.fast5'.format(read['read_id'])
                else:
                    filename = '{}_read_ch{}_file{}.fast5'.format(
                        prefix, channel, read_number
                    )
                filename = os.path.join(out_path, filename)
                with Fast5.New(filename, 'a', tracking_id=tracking_id,
                               context_tags=context_tags,
                               channel_id=channel_id) as h:
                    h.set_raw(raw_slice, meta=read_id, read_number=read_number)
    return counter, channel
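
A hedged usage sketch (argument values are illustrative):

    counter, ch = extract_channel_reads(
        'bulk.fast5', 'out_dir', prefix='run1', flat=False, by_id=False, channel=1)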
Example #10
def basecall_file(fname=None, event_detect=True):
    """Read event data from file and print scrappie basecall.

    :param fname: filename to read data from (if not given, assumed
        to be given on the command line).
    :param event_detect: whether to perform event detection.

    :returns: tuple (basecall score, sequence).
    """
    is_main = False
    if fname is None:  #called as entrypoint
        fname = sys.argv[1]
        is_main = True

    # magic numbers
    ed_params = {
        'window_lengths': [4, 8],
        'thresholds': [1.5, 9.0],
        'peak_height': 0.2,
    }

    with Fast5(fname) as fh:
        if event_detect:
            events = minknow_event_detect(fh.get_read(raw=True),
                                          fh.sample_rate, **ed_params)
        else:
            events = fh.get_read()
    events, _ = segment(events, section='template')

    results = basecall_events(events)
    if results is None:
        return None
    if is_main:
        print("{} score={}\n{}".format(fname, *results))
    else:
        return results
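
A hedged usage sketch of the above (the filename is illustrative):

    result = basecall_file('read_ch1_file1.fast5', event_detect=True)
    if result is not None:
        score, seq = result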
Example #11
    def setUpClass(self):
        """Create a read fast5 from scratch with previously simulated mapping and basecall 1D data."""
        print('* Fast5 Basecaller and Mapper')

        self.seq = 'CATTACGCATTTACCGAAACCTGGGCAAA'
        self.qstring = '!' * len(self.seq)
        self.model_file = 'example_template.model'
        self.events_file = 'example_template.events'
        self.bc_scale_file = 'example_template.bc_scale'
        self.bc_path_file = 'example_template.bc_path'
        self.map_scale_file = 'example_template.map_scale'
        self.map_path_file = 'example_template.map_path'
        self.map_post_file = 'example_template.map_post'
        self.ref_name = 'test_seq'

        # Open new file
        header = [
            'channel_number', 'offset', 'range', 'digitisation',
            'sampling_rate'
        ]
        channel_id = {x: 0 for x in header}
        fakefile = tempfile.NamedTemporaryFile()
        self.fh = Fast5.New(fakefile.name, channel_id=channel_id, read='a')

        # load data to set within fast5 file
        self.model = np.genfromtxt(self.get_file_path(self.model_file),
                                   dtype=None,
                                   delimiter='\t',
                                   names=True)
        self.events = np.genfromtxt(self.get_file_path(self.events_file),
                                    dtype=None,
                                    delimiter='\t',
                                    names=True)

        # use namedtuple to imitate a Scale object
        Scale = namedtuple(
            'Scale', ['shift', 'scale', 'drift', 'var', 'scale_sd', 'var_sd'])

        bc_scale = Scale(*np.genfromtxt(self.get_file_path(self.bc_scale_file),
                                        dtype=None,
                                        delimiter='\t'))
        bc_path = np.genfromtxt(self.get_file_path(self.bc_path_file),
                                dtype=np.int32,
                                delimiter='\t')

        self.fh.set_basecall_data(self.events, bc_scale, bc_path, self.model,
                                  self.seq)

        map_scale = Scale(
            *np.genfromtxt(self.get_file_path(self.map_scale_file),
                           dtype=None,
                           delimiter='\t'))
        map_path = np.genfromtxt(self.get_file_path(self.map_path_file),
                                 dtype=np.int32,
                                 delimiter='\t')
        map_post = np.genfromtxt(self.get_file_path(self.map_post_file),
                                 delimiter='\t')

        n_states = len(self.seq) - len(self.model['kmer'][0]) + 1
        self.fh.set_mapping_data(self.events, map_scale, map_path, self.model,
                                 self.seq, self.ref_name)
        self.fh.set_mapping_data(self.events,
                                 map_scale,
                                 map_path,
                                 self.model,
                                 self.seq,
                                 self.ref_name,
                                 post=map_post)