Example #1
    def _process_read(self, read, read_metrics):
        self.n_reads += 1

        filename = 'read_ch{}_file{}.fast5'.format(self.channel, self.n_reads)
        filename = add_prefix(filename, self.prefix)
        # add filename to read_metrics so it can be reported in summaries
        read_metrics['filename'] = filename
        filename = os.path.join(self.outpath, filename)

        channel_id = {
            'channel_number': self.channel,
            'range': read.channel_meta['range'],
            'digitisation': read.channel_meta['digitisation'],
            'offset': read.channel_meta['offset'],
            'sample_rate': read.channel_meta['sample_rate'],
            # same value under the alternative spelling; fast5 channel_id conventionally uses 'sampling_rate'
            'sampling_rate': read.channel_meta['sample_rate']
        }
        if read.events is None:
            raise RuntimeError('Read has no events data, cannot write fast5')
        events = read.events
        read_id = {
            'start_time': events['start'][0],
            'duration': events['start'][-1] + events['length'][-1] - events['start'][0],
            'read_number': self.n_reads,
            'start_mux': read_metrics['mux'],
            'read_id': read.meta['read_id'],
            'scaling_used': 1,
            'median_before': read_metrics['median_current_before'],
        }

        with Fast5.New(filename, 'a', tracking_id=read.tracking_meta,
                       context_tags=read.context_meta, channel_id=channel_id) as h:
            h.set_read(events, read_id)
            if read.adc_raw is not None:  # test the attribute that is actually written
                h.set_raw(read.adc_raw)
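
The 'duration' computed above is pure structured-array arithmetic: the end of the last event minus the start of the first. A minimal worked sketch of that calculation, using a hypothetical events array with the same 'start' and 'length' fields:

    import numpy as np

    # three hypothetical events: (start, length) in seconds
    events = np.array([(0.0, 1.5), (1.5, 2.5), (4.0, 0.5)],
                      dtype=[('start', float), ('length', float)])

    start_time = events['start'][0]
    duration = events['start'][-1] + events['length'][-1] - events['start'][0]
    assert duration == 4.5  # last event ends at 4.5s, first starts at 0.0s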
Example #2
    def __init__(self,
                 outpath=None,
                 summary_file=None,
                 class_filter=None,
                 prefix='',
                 meta=None):
        """Accumulate read metrics into a file of tab-separated values.

        :param outpath: output path for file.
        :param summary_file: filename for output, if `None` output is written
            to stdout.
        :param class_filter: restrict to accumulating defined classes of read.
        :param prefix: run identifier to include in summary file.
        :param meta: dict of meta data to output for each read;
                     keys are column names, values the values to output on each row.

        .. note:: if the output file pre-exists, an attempt is made to append
            data to it (without writing a new header). This obviously cannot
            be achieved when output is to stdout.
        """
        super(MetricSummary, self).__init__(class_filter=class_filter)
        self.outpath = outpath
        self.filename = summary_file
        self.prefix = prefix
        self.columns = None
        self.meta = meta

        if self.filename is not None:
            self.filename = add_prefix(self.filename, self.prefix)
        self._open()
        # convert True/False to 1/0, anything else to str
        self.converter = lambda x: str(int(x)) if isinstance(x, bool) else str(x)
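
A minimal usage sketch of this constructor; all argument values below are hypothetical, and the exact semantics of class_filter (here assumed to be an iterable of class names) come from the base class rather than this docstring:

    summary = MetricSummary(outpath='run_output',
                            summary_file='read_summary.txt',
                            class_filter=['strand'],
                            prefix='run01',
                            meta={'flowcell': 'FC001'})
    # booleans in read metrics are serialised as 1/0 by the converter
    assert summary.converter(True) == '1'
    assert summary.converter(3.5) == '3.5'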
Example #3
    def generate_plots(self, ch_results, run_results):
        outpath = self.filtered_summary.outpath
        prefix = self.filtered_summary.prefix
        for col in self.plot_hist_cols:
            mask = np.ones(len(ch_results), dtype=bool)
            if 'to_first_{}'.format(self.block_class) in col:
                # mask out channels without at least 1 block
                mask = ch_results['n_{}'.format(self.block_class)] > 0
            data = ch_results[col][mask]
            scale, units = self._get_scale_units(col)
            x_label = string.capwords(col.replace('_', ' ') + ' ' + units)
            data *= scale
            plot_path = os.path.join(outpath, add_prefix(col + '_hist.png', prefix))
            if len(data) == 0:  # our mask has filtered out all the data
                logger.info('Skipping plot {}, no data after masking.'.format(plot_path))
            else:
                self.plot_exp_hist(data, plot_path, x_label=x_label, prefix=prefix, n_bins=self.hist_bins)

        for sum_name, cols_to_sum in self.plot_hist_sum_cols.items():
            plot_path = os.path.join(outpath, add_prefix(sum_name + '_hist.png', prefix))
            mask = np.ones(len(ch_results), dtype=bool)
            if 'to_first_{}'.format(self.block_class) in sum_name:
                # mask out channels without at least 1 block
                mask = ch_results['n_{}'.format(self.block_class)] > 0
            rows = ch_results[mask]
            if len(rows) == 0:  # our mask has filtered out all the data
                logger.info('Skipping plot {}, no data after masking.'.format(plot_path))
            else:
                data = np.zeros(len(rows), dtype=rows[cols_to_sum[0]].dtype)
                for col in cols_to_sum:
                    data += rows[col]

                scale, units = self._get_scale_units(col)  # assumes all summed columns share one scale/units
                x_label = string.capwords(sum_name.replace('_', ' ') + ' ' + units)
                data *= scale
                self.plot_exp_hist(data, plot_path, x_label=x_label, prefix=prefix, n_bins=self.hist_bins)

        for col in self.plot_count_cols:
            plot_path = os.path.join(outpath, add_prefix(col + '_counts.png', prefix))
            scale, units = self._get_scale_units(col)
            x_label = string.capwords(col.replace('_', ' ') + ' ' + units)
            self.make_bar_chart(ch_results[col], plot_path, prefix=prefix, x_label=x_label)

        plot_path = os.path.join(outpath, add_prefix('pcnt_time.png', prefix))
        self.make_stacked_duty_time(run_results, plot_path, col_prefix='pcnt_time_',
                                    colours_dict=self.static_classes,
                                    x_label=prefix)
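
A note on the masking idiom above: for any 'to_first_<block_class>' column, channel/mux rows that never saw a block carry no meaningful value, so they are dropped via a boolean mask before plotting. A self-contained sketch of the idiom with illustrative field names:

    import numpy as np

    ch_results = np.array([(1, 0, 0.0), (2, 3, 12.5), (3, 1, 7.2)],
                          dtype=[('channel', int), ('n_block', int),
                                 ('time_to_first_block', float)])

    mask = ch_results['n_block'] > 0  # keep only channels with at least 1 block
    data = ch_results['time_to_first_block'][mask]  # boolean indexing returns a copy
    assert list(ch_results['channel'][mask]) == [2, 3]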
Example #4
    def _get_levels(self, outpath, prefix):
        """Calculate distribution of event means, and infer open-pore level and  capture level.

        Assumes the pore level corresponds to the highest-probability peak in
        the distribution, and that the capture level is the second highest.

        :param outpath: directory in which to plot the distribution and levels.
        :param prefix: prefix (prefixed to output plot path)
        :returns: tuple of floats, (pore_level, capture_level)
        """
        with BulkFast5(self.fast5) as fh:
            events = fh.get_events(self.channel)

        kde = gaussian_kde(
            events['mean'], bw_method='silverman'
        )  # silverman is seemingly better for multi-modal dists
        x = np.linspace(np.min(events['mean']), np.max(events['mean']), 100)

        pde_vals = kde(x)  # evaluate density over grid
        max_inds = argrelmax(pde_vals)  # indices of all local maxima
        max_probs = pde_vals[max_inds]
        sorted_inds = np.argsort(max_probs)
        max_ind = max_inds[0][sorted_inds[-1]]  # index (into x) of the highest peak
        second_max_ind = max_inds[0][sorted_inds[-2]]  # ... of the second highest

        pore_level = x[max_ind]
        capture_level = x[second_max_ind]

        # plot kde, histogram and levels.
        fig, axis = plt.subplots()
        axis.hist(events['mean'], bins=100, color='k', label='histogram')
        axis.legend(loc='upper center', frameon=False)
        axis2 = axis.twinx()
        axis2.plot(x, pde_vals, label='kde', color='k')
        axis2.plot(x[max_inds],
                   pde_vals[max_inds],
                   'o',
                   label='local maxima',
                   color='b')
        axis2.plot(x[max_ind],
                   pde_vals[max_ind],
                   'o',
                   label='open pore current',
                   color='r')
        axis2.plot(x[second_max_ind],
                   pde_vals[second_max_ind],
                   'o',
                   label='capture current',
                   color='g')
        axis2.legend(loc='upper left', frameon=False)
        plot_path = os.path.join(outpath,
                                 add_prefix('AdaptiveThresholdLevels', prefix))
        plt.savefig(plot_path, bbox_inches='tight', dpi=200)
        plt.close(fig)  # free the figure; important when processing many channels

        return pore_level, capture_level
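
Stripped of plotting, the level inference is: fit a KDE, locate its local maxima, and rank them by density. A standalone sketch of just that logic on synthetic bimodal data, using the same scipy calls (gaussian_kde, argrelmax) as above:

    import numpy as np
    from scipy.stats import gaussian_kde
    from scipy.signal import argrelmax

    rng = np.random.RandomState(42)
    # synthetic two-level signal: 'pore' near 220 pA, 'capture' near 60 pA
    means = np.concatenate([rng.normal(220, 5, 800), rng.normal(60, 5, 400)])

    kde = gaussian_kde(means, bw_method='silverman')
    x = np.linspace(means.min(), means.max(), 100)
    pde_vals = kde(x)
    max_inds = argrelmax(pde_vals)  # tuple holding an array of peak indices
    order = np.argsort(pde_vals[max_inds])  # ascending by peak density
    pore_level = x[max_inds[0][order[-1]]]  # highest peak
    capture_level = x[max_inds[0][order[-2]]]  # second-highest peak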
Example #5
def main(args=None):
    logging.basicConfig(format='[%(asctime)s - %(name)s] %(message)s',
                        datefmt='%H:%M:%S',
                        level=logging.INFO)

    if args is None:
        args = sys.argv[1:]

    # process args and get yaml string of all options
    args, yaml_conf_out = process_args(args)

    logging.debug('args are: {}'.format(args))

    logger.info("Will stop after {} seconds of expt time.".format(
        args['max_time']))

    # Multiple components will write here
    if 'outpath' in args and args['outpath'] is not None:
        os.mkdir(args['outpath'])  # raises if the directory already exists

    # save the config
    if args['config_out'] is not None:
        path = add_prefix(args['config_out'], args['prefix'])
        if 'outpath' in args and args['outpath'] is not None:
            path = os.path.join(args['outpath'], path)
        with open(path, 'w') as fh:
            fh.write(yaml_conf_out)

    # Get channel range from bulk if it was not specified
    if args['channels'] is None:
        with BulkFast5(args['fast5'], 'r') as f5:
            args['channels'] = list(f5.channels)
    else:
        args['channels'] = list(args['channels'])

    # Test pipeline can be constructed
    read_generator, metric_calculator, classifier, second_stage = get_pipeline(
        args, args['channels'][0])
    logger.info('Splitter      : {}.'.format(read_generator))
    logger.info('Metrifier     : {}.'.format(metric_calculator))
    logger.info('Classifier    : {}.'.format(classifier))
    logger.info('SecondStage   : {}.'.format(second_stage))

    # Accumulators gather results from individual channels
    accumulators = get_accumulators(args)
    logger.info('Accumulators  : {}.'.format(accumulators))
    for read_metrics in accumulate_channels(args):
        for accumulator in accumulators:
            accumulator.process_read(read_metrics)

    # Finish up accumulators
    for accumulator in accumulators:
        accumulator.finalize()
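
Note that main() only requires each accumulator to expose process_read(read_metrics) and finalize(); the interface is inferred from the loop above rather than from a documented base class. A toy accumulator satisfying that contract:

    class ReadCounter(object):
        """Hypothetical accumulator: count reads per classification."""

        def __init__(self):
            self.counts = {}

        def process_read(self, read_metrics):
            # read_metrics is assumed to be dict-like with a 'class' key
            klass = read_metrics.get('class', 'unknown')
            self.counts[klass] = self.counts.get(klass, 0) + 1

        def finalize(self):
            for klass, n in sorted(self.counts.items()):
                logger.info('{:<12}: {} reads'.format(klass, n))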
Example #6
    def finalize(self):
        """Write the channel report.
        """
        filepath = add_prefix(self.filename, self.prefix)
        if self.outpath is not None:
            filepath = os.path.join(self.outpath, filepath)

        with open(filepath, 'w') as fh:
            row = '\t'.join(
                ['channel', 'mux', 'first_{}_time'.format(self.report_class)])
            fh.write('{}\n'.format(row))
            for ch, muxes in self.has_read_class.items():
                for mux in muxes:
                    row = '\t'.join(
                        map(str, [ch, mux, self.first_time[ch][mux]]))
                    fh.write('{}\n'.format(row))
Example #7
    def setUpClass(cls):

        cls.data_path = os.path.join(os.path.dirname(__file__), 'data')
        cls.out_path = tempfile.mktemp(dir=cls.data_path)  # name only; presumably created by read_builder.main
        args = cls.get_read_builder_args()  # a list
        read_builder.main(args=args)
        args = read_builder.get_argparser().parse_args(args)  # an argparse.Namespace
        # load in the read_builder strand summary
        cls.rb = np.genfromtxt(os.path.join(
            args.outpath, add_prefix(args.summary_file, args.prefix)),
                               names=True,
                               dtype=None)
        cls.sb = np.genfromtxt(os.path.join(cls.data_path,
                                            cls.get_ref_filename()),
                               names=True,
                               dtype=None)
        # remove reads with mux not in [1,2,3,4] as differences in mux
        # manipulation between panga and ossetra are expected to lead to
        # differences in the number of reads
        cls.sb = cls.sb[np.logical_and(cls.sb['mux'] > 0, cls.sb['mux'] < 5)]
        cls.rb = cls.rb[np.logical_and(cls.rb['mux'] > 0, cls.rb['mux'] < 5)]

        # define column name mapping for those numeric columns which are common
        cls.rb_to_sb_col_map = {
            'duration': 'strand_duration',
            'median_current_after': 'pore_after',
            'channel': 'channel',
            'drift': 'drift',
            'end_event': 'end_event',
            'median_current': 'median_current',
            'median_dwell': 'median_dwell',
            'median_sd': 'median_sd',
            'mux': 'mux',
            'num_events': 'num_events',
            'range_current': 'range_current',
            'start_event': 'start_event',
            'start_time': 'start_time',
        }

        print '\n* {}'.format(str(cls))
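
np.genfromtxt with names=True and dtype=None reads a headed TSV into a structured array whose columns are addressable by name, which is what makes the mux filtering above a one-liner. A small sketch with inline data (contents illustrative):

    import io
    import numpy as np

    tsv = io.BytesIO(b'channel\tmux\tstart_time\n'
                     b'1\t1\t0.5\n'
                     b'1\t5\t2.0\n'
                     b'2\t3\t1.1\n')
    sb = np.genfromtxt(tsv, names=True, dtype=None)
    sb = sb[np.logical_and(sb['mux'] > 0, sb['mux'] < 5)]  # keep muxes 1-4
    assert len(sb) == 2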
Example #8
    def _make_channel_report(self):
        """create summary with one row per channel/mux combination"""
        durations = self.filtered_summary.class_durations
        counts = self.filtered_summary.class_counts

        ch_mux_results = []
        for ch, keep_muxes in self.filtered_summary.keep_ch_mux.items():
            for mux in keep_muxes:
                if keep_muxes[mux]:
                    d = OrderedDict()
                    d['channel'] = ch
                    d['mux'] = mux
                    d['time_to_first_{}'.format(self.block_class)] = self.time_to_block[ch][mux]
                    d['n_{}'.format(self.block_class)] = counts[ch][mux][self.block_class]
                    d['sum_durations'] = np.sum(durations[ch][mux].values())
                    # add percentage times
                    d.update({'pcnt_time_{}'.format(klass):
                              100.0 * durations[ch][mux][klass] / d['sum_durations']
                              for klass in self.klasses})
                    # add summed klass counts to first block
                    d.update({'n_{}_to_first_{}'.format(klass, self.block_class):
                              self.n_reads_to_block[ch][mux][klass]
                              for klass in self.to_first_klasses})
                    # add summed klass durations to first block
                    d.update({'sum_duration_{}_to_first_{}'.format(klass, self.block_class):
                              self.duration_reads_to_block[ch][mux][klass]
                              for klass in self.to_first_klasses})
                    ch_mux_results.append(d)

        ch_report_fp = os.path.join(self.filtered_summary.outpath,
                                    add_prefix(self.block_ch_report, prefix=self.filtered_summary.prefix))
        with open(ch_report_fp, 'w') as fh:
            if len(ch_mux_results) > 0:
                cols = ch_mux_results[0].keys()
                convert = self.filtered_summary.converter
                meta = {}
                if self.filtered_summary.meta is not None:
                    meta = self.filtered_summary.meta
                meta_vals = [convert(x) for x in meta.values()]
                fh.write('{}\n'.format('\t'.join(cols + meta.keys())))
                for d in ch_mux_results:
                    fh.write('\t'.join([convert(d[x]) for x in cols] + meta_vals) + '\n')

        return ch_mux_results
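
The write path relies on OrderedDict insertion order: every row is rendered in the column order of the first row, with the run-level meta values appended identically to each line. A reduced sketch of that pattern with illustrative columns:

    from collections import OrderedDict

    rows = [OrderedDict([('channel', 1), ('mux', 2), ('n_block', 0)]),
            OrderedDict([('channel', 3), ('mux', 1), ('n_block', 4)])]
    meta = OrderedDict([('run_id', 'run01')])

    cols = list(rows[0].keys())
    lines = ['\t'.join(cols + list(meta.keys()))]
    for d in rows:
        lines.append('\t'.join([str(d[c]) for c in cols] +
                               [str(v) for v in meta.values()]))
    assert lines[0] == 'channel\tmux\tn_block\trun_id'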
Example #9
    def _make_run_report(self, ch_mux_results):
        """create run report with all data aggregated into a single row"""

        #  convert per ch/mux data into a structured array so we can easily feed
        #  it into numpy to get medians / sums, etc.

        # ugly: assume any strings are no longer than 20 chars
        get_type = lambda x: type(x) if not isinstance(x, basestring) else 'S20'
        dtype = [(str(key), get_type(value)) for key, value in ch_mux_results[0].items()]
        results_ar = np.empty(len(ch_mux_results), dtype=dtype)
        for i, r in enumerate(ch_mux_results):
            results_ar[i] = tuple(r[col] for col in results_ar.dtype.names)

        agg = OrderedDict()
        agg['n_good_ch_mux'] = len(results_ar)
        agg['n_good_ch_mux_with_{}'.format(self.block_class)] = \
                len(np.where(results_ar['n_{}'.format(self.block_class)] > 0)[0])
        agg['sum_good_ch_mux_run_time'] = np.sum(results_ar['sum_durations'])

        col = 'time_to_first_{}'.format(self.block_class)
        # mask out any channels without a block
        mask = results_ar['n_{}'.format(self.block_class)] > 0
        agg['median_' + col] = np.median(results_ar[mask][col])
        # calculate mean of the exponential distribution to complement the median
        _, agg['exp_mean_' + col] = expon.fit(results_ar[mask][col], floc=0)


        for klass in self.klasses:
            # do percentage time in each class
            col = 'pcnt_time_{}'.format(klass)
            # weight % from each ch/mux by ch/mux sum duration, and renormalise
            agg[col] = np.sum(np.multiply(results_ar[col], results_ar['sum_durations'])) / agg['sum_good_ch_mux_run_time']

        for klass in self.to_first_klasses:
            # do median class count and duration before first block
            for col in ('n_{}_to_first_{}'.format(klass, self.block_class),
                        'sum_duration_{}_to_first_{}'.format(klass, self.block_class),
                        ):
                agg['median_' + col] = np.median(results_ar[mask][col])
                # calculate mean of the exponential distribution to complement the median
                _, agg['exp_mean_' + col] = expon.fit(results_ar[mask][col], floc=0)

        # do counts of channels with 0, 1, 2, 3, 4, 5 or >5 blocks
        col = 'n_{}'.format(self.block_class)
        bins = range(0,7) + [ max(7, np.max(results_ar[col]) + 1) ]
        counts, bins_np = np.histogram(results_ar[col], bins=bins, density=False)
        starts_counts = zip(bins_np, counts)
        for bin_start, count in starts_counts[:-1]:
            col = 'n_ch_mux_{}_{}'.format(bin_start, self.block_class)
            agg[col] = count
        rest_start, rest_count = starts_counts[-1]
        col = 'n_ch_mux_ge_{}_{}'.format(rest_start, self.block_class)
        agg[col] = rest_count

        run_report_fp = os.path.join(self.filtered_summary.outpath,
                                     add_prefix(self.block_run_report,
                                                prefix=self.filtered_summary.prefix))
        meta = self.filtered_summary.meta
        if meta is None:  # may be None (see MetricSummary); avoid crashing on the concatenation below
            meta = {}
        convert = self.filtered_summary.converter
        with open(run_report_fp, 'w') as fh:
            fh.write('{}\n'.format('\t'.join(agg.keys() + meta.keys())))
            fh.write('{}\n'.format('\t'.join([convert(v) for v in agg.values() + meta.values()])))

        return results_ar, agg
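
A note on expon.fit(data, floc=0): fixing the location at zero makes the returned scale the maximum-likelihood estimate of the exponential mean, which for a fixed location is simply the sample mean; it is reported alongside the median as the distribution's other natural summary. A quick demonstration:

    import numpy as np
    from scipy.stats import expon

    rng = np.random.RandomState(0)
    data = rng.exponential(scale=3.0, size=10000)  # true mean 3.0
    loc, scale = expon.fit(data, floc=0)
    assert loc == 0
    assert np.isclose(scale, data.mean())  # MLE of scale is the sample mean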
Example #10
    def _get_levels(self,
                    outpath,
                    prefix,
                    times=None,
                    pore_rank=0,
                    capture_rank=1,
                    thresh_factor=0.9):
        """Calculate distribution of event means, and infer open-pore level and  capture level.

        Assumes the pore level corresoponds to the highest-probability peak in
        the distribution, and that the capture level is the second highest.

        :param outpath: directory in which to plot the distribution and levels.
        :param prefix: prefix (prefixed to output plot path)
        :param times: (start time, end time) or None
        :param pore_rank: int, ranking of pore current within the kde local maxima;
               the default corresponds to the highest-probability peak.
        :param capture_rank: int, ranking of capture current within the kde local maxima;
               the default corresponds to the second-highest-probability peak.
        :param thresh_factor: float, factor f with which to calculate the boundary
                threshold: threshold = capture_level + f * (pore_level - capture_level);
                a value of 0.5 implies the midpoint between the pore and capture levels.
        :returns: tuple of floats, (pore_level, capture_level, threshold)
        """
        with BulkFast5(self.fast5) as fh:
            logger.info('Loading events for channel {}'.format(self.channel))
            events = fh.get_events(self.channel, times=times)

        logger.info('Calculating kde for channel {}'.format(self.channel))
        kde = gaussian_kde(
            events['mean'], bw_method='silverman'
        )  # silverman is seemingly better for multi-modal dists
        logger.info('Done calculating kde for channel {}'.format(self.channel))
        x = np.linspace(np.min(events['mean']), np.max(events['mean']), 100)

        pde_vals = kde(x)  # evaluate density over grid
        max_inds = argrelmax(pde_vals)  # indices of all local maxima
        max_probs = pde_vals[max_inds]
        sorted_inds = np.argsort(max_probs)[::-1]  # so max prob is 1st elem
        pore_ind = max_inds[0][sorted_inds[pore_rank]]
        capture_ind = max_inds[0][sorted_inds[capture_rank]]

        pore_level = x[pore_ind]
        capture_level = x[capture_ind]
        threshold = capture_level + thresh_factor * (pore_level -
                                                     capture_level)

        # plot kde, histogram and levels.
        fig, axis = plt.subplots()
        axis.hist(events['mean'], bins=100, color='k', label='histogram')
        axis.legend(loc='upper center', frameon=False)
        axis.set_xlim((-100, 400))
        axis2 = axis.twinx()

        axis2.plot(x, pde_vals, label='kde', color='k')
        axis2.plot(x[max_inds],
                   pde_vals[max_inds],
                   'o',
                   label='local maxima',
                   color='b')
        axis2.plot(x[pore_ind],
                   pde_vals[pore_ind],
                   'o',
                   label='open pore current',
                   color='r')
        axis2.plot(x[capture_ind],
                   pde_vals[capture_ind],
                   'o',
                   label='capture current',
                   color='g')
        axis.axvline(threshold, label='threshold', color='magenta')
        axis2.legend(loc='upper left', frameon=False)
        plot_path = os.path.join(
            outpath,
            add_prefix('AdaptiveThresholdLevels_{}'.format(self.channel),
                       prefix))
        plt.savefig(plot_path, bbox_inches='tight', dpi=200)
        plt.close(fig)  # free the figure; important when processing many channels
        with open(plot_path + '.txt', 'w') as fh:
            fh.write('#pore rank {}\n'.format(pore_rank))
            fh.write('#capture rank {}\n'.format(capture_rank))
            fh.write('#thresh_factor {}\n'.format(thresh_factor))
            fh.write('#pore level {}\n'.format(pore_level))
            fh.write('#capture level {}\n'.format(capture_level))
            fh.write('#threshold level {}\n'.format(threshold))
            # write local maxima in kde distribution
            fh.write('# probability maxima in kde \n')
            fh.write('\t'.join(['pA', 'kde']) + '\n')
            for i in range(len(max_probs)):
                j = max_inds[0][sorted_inds[i]]
                fh.write('\t'.join(map(str, [x[j], pde_vals[j]])) + '\n')
            # write sampled kde
            fh.write('# kde points \n')
            fh.write('\t'.join(['pA', 'kde']) + '\n')
            for xi, yi in zip(x, pde_vals):
                fh.write('\t'.join(map(str, [xi, yi])) + '\n')

        return pore_level, capture_level, threshold
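
The returned threshold is a linear interpolation between the two levels, so with the default thresh_factor=0.9 the boundary sits much nearer the open-pore level than the capture level. A worked example with hypothetical currents:

    pore_level = 220.0    # pA, hypothetical open-pore current
    capture_level = 60.0  # pA, hypothetical capture current
    thresh_factor = 0.9

    threshold = capture_level + thresh_factor * (pore_level - capture_level)
    assert threshold == 204.0  # 60 + 0.9 * 160; a factor of 0.5 gives the midpoint, 140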