def _process_read(self, read, read_metrics):
    self.n_reads += 1
    filename = 'read_ch{}_file{}.fast5'.format(self.channel, self.n_reads)
    filename = add_prefix(filename, self.prefix)
    # add filename to read_metrics so it can be reported in summaries
    read_metrics['filename'] = filename
    filename = os.path.join(self.outpath, filename)
    channel_id = {
        'channel_number': self.channel,
        'range': read.channel_meta['range'],
        'digitisation': read.channel_meta['digitisation'],
        'offset': read.channel_meta['offset'],
        'sample_rate': read.channel_meta['sample_rate'],
        'sampling_rate': read.channel_meta['sample_rate'],
    }
    if read.events is None:
        raise RuntimeError('Read has no events data, cannot write fast5')
    events = read.events
    read_id = {
        'start_time': events['start'][0],
        'duration': events['start'][-1] + events['length'][-1] - events['start'][0],
        'read_number': self.n_reads,
        'start_mux': read_metrics['mux'],
        'read_id': read.meta['read_id'],
        'scaling_used': 1,
        'median_before': read_metrics['median_current_before'],
    }
    with Fast5.New(filename, 'a', tracking_id=read.tracking_meta,
                   context_tags=read.context_meta, channel_id=channel_id) as h:
        h.set_read(events, read_id)
        if read.raw is not None:
            h.set_raw(read.adc_raw)

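
# Illustrative sketch (fabricated event values, not part of the class above):
# the read duration written into read_id spans from the start of the first
# event to the end of the last event.
def _example_read_duration():
    import numpy as np
    events = np.array([(0.0, 1.5), (1.5, 2.0), (3.5, 0.5)],
                      dtype=[('start', float), ('length', float)])
    return events['start'][-1] + events['length'][-1] - events['start'][0]  # 4.0
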
def __init__(self, outpath=None, summary_file=None, class_filter=None,
             prefix='', meta=None):
    """Accumulate read metrics into a file of tab-separated values.

    :param outpath: output path for file.
    :param summary_file: filename for output; if `None`, output is written
        to stdout.
    :param class_filter: restrict accumulation to the given classes of read.
    :param prefix: run identifier to include in summary file.
    :param meta: dict of meta data to output for each read. Keys are column
        names, values are the value to output for each row.

    .. note:: if the output file pre-exists, an attempt is made to append
        data to it (without writing a new header). This obviously cannot be
        achieved when output is to stdout.
    """
    super(MetricSummary, self).__init__(class_filter=class_filter)
    self.outpath = outpath
    self.filename = summary_file
    self.prefix = prefix
    self.columns = None
    self.meta = meta
    if self.filename is not None:
        self.filename = add_prefix(self.filename, self.prefix)
        self._open()
    # convert True/False to 1/0, everything else via str()
    self.converter = lambda x: str(int(x)) if isinstance(x, bool) else str(x)

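
# Illustrative sketch (hypothetical column names and values, not part of the
# class above): shows how the converter defined in __init__ renders a row of
# metrics. Booleans become '1'/'0'; everything else is passed through str().
def _example_converter_usage():
    converter = lambda x: str(int(x)) if isinstance(x, bool) else str(x)
    metrics = [('channel', 42), ('is_saturated', False), ('median_current', 71.3)]
    header = '\t'.join(name for name, _ in metrics)
    row = '\t'.join(converter(value) for _, value in metrics)
    return header + '\n' + row  # 'channel\tis_saturated\tmedian_current\n42\t0\t71.3'
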
def generate_plots(self, ch_results, run_results):
    outpath = self.filtered_summary.outpath
    prefix = self.filtered_summary.prefix

    for col in self.plot_hist_cols:
        mask = np.ones(len(ch_results), dtype=bool)
        if 'to_first_{}'.format(self.block_class) in col:
            # mask out channels without at least 1 block
            mask = ch_results['n_{}'.format(self.block_class)] > 0
        data = ch_results[col][mask]
        scale, units = self._get_scale_units(col)
        x_label = string.capwords(col.replace('_', ' ') + ' ' + units)
        data *= scale
        plot_path = os.path.join(outpath, add_prefix(col + '_hist.png', prefix))
        if len(data) == 0:
            # our mask has filtered out all the data
            logger.info('Skipping plot {}, no data after masking.'.format(plot_path))
        else:
            self.plot_exp_hist(data, plot_path, x_label=x_label, prefix=prefix,
                               n_bins=self.hist_bins)

    for sum_name, cols_to_sum in self.plot_hist_sum_cols.items():
        plot_path = os.path.join(outpath, add_prefix(sum_name + '_hist.png', prefix))
        mask = np.ones(len(ch_results), dtype=bool)
        if 'to_first_{}'.format(self.block_class) in sum_name:
            # mask out channels without at least 1 block
            mask = ch_results['n_{}'.format(self.block_class)] > 0
        rows = ch_results[mask]
        if len(rows) == 0:
            # our mask has filtered out all the data
            logger.info('Skipping plot {}, no data after masking.'.format(plot_path))
        else:
            data = np.zeros(len(rows), dtype=rows[cols_to_sum[0]].dtype)
            for col in cols_to_sum:
                data += rows[col]
            # scale/units taken from the last summed column (assumed common to all)
            scale, units = self._get_scale_units(col)
            x_label = string.capwords(sum_name.replace('_', ' ') + ' ' + units)
            data *= scale
            self.plot_exp_hist(data, plot_path, x_label=x_label, prefix=prefix,
                               n_bins=self.hist_bins)

    for col in self.plot_count_cols:
        plot_path = os.path.join(outpath, add_prefix(col + '_counts.png', prefix))
        scale, units = self._get_scale_units(col)
        x_label = string.capwords(col.replace('_', ' ') + ' ' + units)
        self.make_bar_chart(ch_results[col], plot_path, prefix=prefix,
                            x_label=x_label)

    plot_path = os.path.join(outpath, add_prefix('pcnt_time.png', prefix))
    self.make_stacked_duty_time(run_results, plot_path, col_prefix='pcnt_time_',
                                colours_dict=self.static_classes, x_label=prefix)

def _get_levels(self, outpath, prefix):
    """Calculate distribution of event means, and infer open-pore level and
    capture level.

    Assumes the pore level corresponds to the highest-probability peak in the
    distribution, and that the capture level is the second highest.

    :param outpath: directory in which to plot the distribution and levels.
    :param prefix: prefix (prefixed to output plot path).

    :returns: tuple of floats, (pore_level, capture_level).
    """
    with BulkFast5(self.fast5) as fh:
        events = fh.get_events(self.channel)

    # silverman is seemingly better for multi-modal distributions
    kde = gaussian_kde(events['mean'], bw_method='silverman')
    x = np.linspace(np.min(events['mean']), np.max(events['mean']), 100)
    pde_vals = kde(x)  # evaluate density over grid
    max_inds = argrelmax(pde_vals)  # find all local maxima
    max_probs = pde_vals[max_inds]
    sorted_inds = np.argsort(max_probs)
    max_ind = max_inds[0][sorted_inds[-1]]  # index of maximum in x and pde_vals
    second_max_ind = max_inds[0][sorted_inds[-2]]
    pore_level = x[max_ind]
    capture_level = x[second_max_ind]

    # plot kde, histogram and levels.
    fig, axis = plt.subplots()
    axis.hist(events['mean'], bins=100, color='k', label='histogram')
    axis.legend(loc='upper center', frameon=False)
    axis2 = axis.twinx()
    axis2.plot(x, pde_vals, label='kde', color='k')
    axis2.plot(x[max_inds], pde_vals[max_inds], 'o', label='local maxima', color='b')
    axis2.plot(x[max_ind], pde_vals[max_ind], 'o', label='open pore current',
               color='r')
    axis2.plot(x[second_max_ind], pde_vals[second_max_ind], 'o',
               label='capture current', color='g')
    axis2.legend(loc='upper left', frameon=False)
    plot_path = os.path.join(outpath, add_prefix('AdaptiveThresholdLevels', prefix))
    plt.savefig(plot_path, bbox_inches='tight', dpi=200)

    return pore_level, capture_level

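
# Minimal, self-contained sketch of the peak-ranking idea used above, run on
# synthetic two-level data (all values fabricated). It is illustrative only
# and does not touch a bulk fast5 file.
def _example_level_inference():
    import numpy as np
    from scipy.stats import gaussian_kde
    from scipy.signal import argrelmax

    rng = np.random.RandomState(42)
    # mimic an open-pore current around ~220 pA and a capture current around ~60 pA
    means = np.concatenate([rng.normal(220, 5, 5000), rng.normal(60, 5, 2000)])
    kde = gaussian_kde(means, bw_method='silverman')
    x = np.linspace(means.min(), means.max(), 100)
    density = kde(x)
    peaks = argrelmax(density)[0]                      # indices of local maxima
    ranked = peaks[np.argsort(density[peaks])[::-1]]   # most probable peak first
    pore_level, capture_level = x[ranked[0]], x[ranked[1]]
    return pore_level, capture_level                   # ~220 and ~60 respectively
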
def main(args=None):
    logging.basicConfig(
        format='[%(asctime)s - %(name)s] %(message)s',
        datefmt='%H:%M:%S', level=logging.INFO)
    if args is None:
        args = sys.argv[1:]

    # process args and get yaml string of all options
    args, yaml_conf_out = process_args(args)
    logging.debug('args are: {}'.format(args))
    logger.info('Will stop after {} seconds of expt time.'.format(args['max_time']))

    # Multiple components will write here
    if 'outpath' in args and args['outpath'] is not None:
        os.mkdir(args['outpath'])

    # save the config
    if args['config_out'] is not None:
        path = add_prefix(args['config_out'], args['prefix'])
        if 'outpath' in args and args['outpath'] is not None:
            path = os.path.join(args['outpath'], path)
        with open(path, 'w') as fh:
            fh.write(yaml_conf_out)

    # Get channel range from bulk if it was not specified
    if args['channels'] is None:
        with BulkFast5(args['fast5'], 'r') as f5:
            args['channels'] = list(f5.channels)
    else:
        args['channels'] = list(args['channels'])

    # Test pipeline can be constructed
    read_generator, metric_calculator, classifier, second_stage = get_pipeline(
        args, args['channels'][0])
    logger.info('Splitter : {}.'.format(read_generator))
    logger.info('Metrifier : {}.'.format(metric_calculator))
    logger.info('Classifier : {}.'.format(classifier))
    logger.info('SecondStage : {}.'.format(second_stage))

    # Accumulators gather results from individual channels
    accumulators = get_accumulators(args)
    logger.info('Accumulators : {}.'.format(accumulators))

    for read_metrics in accumulate_channels(args):
        for accumulator in accumulators:
            accumulator.process_read(read_metrics)

    # Finish up accumulators
    for accumulator in accumulators:
        accumulator.finalize()

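
# Sketch (illustrative, not an existing class): the accumulator loop in main()
# only relies on two methods, process_read(read_metrics) and finalize(). A
# minimal accumulator showing that shape might look like this; the metric keys
# mentioned in the comment are examples seen elsewhere in this codebase.
class _ExampleAccumulator(object):
    def __init__(self):
        self.n_reads = 0

    def process_read(self, read_metrics):
        # read_metrics is a dict of per-read values (e.g. 'mux', 'filename')
        self.n_reads += 1

    def finalize(self):
        # write any summary output here once all channels have been processed
        pass
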
def finalize(self):
    """Write the channel report."""
    filepath = add_prefix(self.filename, self.prefix)
    if self.outpath is not None:
        filepath = os.path.join(self.outpath, filepath)
    with open(filepath, 'w') as fh:
        row = '\t'.join(
            ['channel', 'mux', 'first_{}_time'.format(self.report_class)])
        fh.write('{}\n'.format(row))
        for ch, muxes in self.has_read_class.items():
            for mux in muxes:
                row = '\t'.join(map(str, [ch, mux, self.first_time[ch][mux]]))
                fh.write('{}\n'.format(row))

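
# For reference, the report written above is a plain TSV; with a hypothetical
# report_class of 'block' and made-up values, it would look like:
#
#   channel   mux   first_block_time
#   1         1     123.4
#   2         3     56.7
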
def setUpClass(cls):
    cls.data_path = os.path.join(os.path.dirname(__file__), 'data')
    cls.out_path = tempfile.mktemp(dir=cls.data_path)
    args = cls.get_read_builder_args()  # a list
    read_builder.main(args=args)
    args = read_builder.get_argparser().parse_args(args)  # an argparse.Namespace

    # load in the read_builder strand summary
    cls.rb = np.genfromtxt(
        os.path.join(args.outpath, add_prefix(args.summary_file, args.prefix)),
        names=True, dtype=None)
    cls.sb = np.genfromtxt(
        os.path.join(cls.data_path, cls.get_ref_filename()),
        names=True, dtype=None)

    # remove reads with mux not in [1, 2, 3, 4] as differences in mux
    # manipulation between panga and ossetra are expected to lead to
    # differences in the number of reads
    cls.sb = cls.sb[np.logical_and(cls.sb['mux'] > 0, cls.sb['mux'] < 5)]
    cls.rb = cls.rb[np.logical_and(cls.rb['mux'] > 0, cls.rb['mux'] < 5)]

    # define column name mapping for those numeric columns which are common
    cls.rb_to_sb_col_map = {
        'duration': 'strand_duration',
        'median_current_after': 'pore_after',
        'channel': 'channel',
        'drift': 'drift',
        'end_event': 'end_event',
        'median_current': 'median_current',
        'median_dwell': 'median_dwell',
        'median_sd': 'median_sd',
        'mux': 'mux',
        'num_events': 'num_events',
        'range_current': 'range_current',
        'start_event': 'start_event',
        'start_time': 'start_time',
    }

    print '\n* {}'.format(str(cls))

def _make_channel_report(self):
    """Create summary with one row per channel/mux combination."""
    durations = self.filtered_summary.class_durations
    counts = self.filtered_summary.class_counts
    ch_mux_results = []
    for ch, keep_muxes in self.filtered_summary.keep_ch_mux.items():
        for mux in keep_muxes:
            if keep_muxes[mux]:
                d = OrderedDict()
                d['channel'] = ch
                d['mux'] = mux
                d['time_to_first_{}'.format(self.block_class)] = \
                    self.time_to_block[ch][mux]
                d['n_{}'.format(self.block_class)] = \
                    counts[ch][mux][self.block_class]
                d['sum_durations'] = np.sum(durations[ch][mux].values())
                # add percentage times
                d.update({
                    'pcnt_time_{}'.format(klass):
                        100.0 * durations[ch][mux][klass] / d['sum_durations']
                    for klass in self.klasses})
                # add summed klass counts to first block
                d.update({
                    'n_{}_to_first_{}'.format(klass, self.block_class):
                        self.n_reads_to_block[ch][mux][klass]
                    for klass in self.to_first_klasses})
                # add summed klass durations to first block
                d.update({
                    'sum_duration_{}_to_first_{}'.format(klass, self.block_class):
                        self.duration_reads_to_block[ch][mux][klass]
                    for klass in self.to_first_klasses})
                ch_mux_results.append(d)

    ch_report_fp = os.path.join(
        self.filtered_summary.outpath,
        add_prefix(self.block_ch_report, prefix=self.filtered_summary.prefix))
    with open(ch_report_fp, 'w') as fh:
        if len(ch_mux_results) > 0:
            cols = ch_mux_results[0].keys()
            convert = self.filtered_summary.converter
            meta = {}
            if self.filtered_summary.meta is not None:
                meta = self.filtered_summary.meta
            meta_vals = [convert(x) for x in meta.values()]
            fh.write('{}\n'.format('\t'.join(cols + meta.keys())))
            for d in ch_mux_results:
                fh.write('\t'.join([convert(d[x]) for x in cols] + meta_vals) + '\n')
    return ch_mux_results

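
# Illustrative sketch (hypothetical class names and durations): the per
# channel/mux percentage times computed above are simply each class duration
# normalised by the summed duration of all classes for that channel/mux.
def _example_pcnt_time():
    durations = {'strand': 1800.0, 'pore': 1500.0, 'unavailable': 300.0}  # seconds
    total = sum(durations.values())
    return {'pcnt_time_{}'.format(k): 100.0 * v / total for k, v in durations.items()}
    # -> {'pcnt_time_strand': 50.0, 'pcnt_time_pore': 41.7, 'pcnt_time_unavailable': 8.3}
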
def _make_run_report(self, ch_mux_results):
    """Create run report with all data aggregated into a single row."""
    # convert per ch/mux data into a structured array so we can easily feed
    # it into numpy to get medians / sums, etc.
    # ugly: assume any strings are no longer than 20 characters
    get_type = lambda x: type(x) if not isinstance(x, basestring) else 'S20'
    dtype = [(str(key), get_type(value)) for key, value in ch_mux_results[0].items()]
    results_ar = np.empty(len(ch_mux_results), dtype=dtype)
    for i, r in enumerate(ch_mux_results):
        results_ar[i] = tuple([r[col] for col in results_ar.dtype.names])

    agg = OrderedDict()
    agg['n_good_ch_mux'] = len(results_ar)
    agg['n_good_ch_mux_with_{}'.format(self.block_class)] = \
        len(np.where(results_ar['n_{}'.format(self.block_class)] > 0)[0])
    agg['sum_good_ch_mux_run_time'] = np.sum(results_ar['sum_durations'])

    col = 'time_to_first_{}'.format(self.block_class)
    # mask out any channels without a block
    mask = results_ar['n_{}'.format(self.block_class)] > 0
    agg['median_' + col] = np.median(results_ar[mask][col])
    # calculate mean of the exponential distribution to complement the median
    _, agg['exp_mean_' + col] = expon.fit(results_ar[mask][col], floc=0)

    for klass in self.klasses:
        # do percentage time in each class
        col = 'pcnt_time_{}'.format(klass)
        # weight % from each ch/mux by ch/mux sum duration, and renormalise
        agg[col] = np.sum(np.multiply(results_ar[col], results_ar['sum_durations'])) \
            / agg['sum_good_ch_mux_run_time']

    for klass in self.to_first_klasses:
        # do median class count and duration before first block
        for col in ('n_{}_to_first_{}'.format(klass, self.block_class),
                    'sum_duration_{}_to_first_{}'.format(klass, self.block_class),
                    ):
            agg['median_' + col] = np.median(results_ar[mask][col])
            # calculate mean of the exponential distribution to complement the median
            _, agg['exp_mean_' + col] = expon.fit(results_ar[mask][col], floc=0)

    # do counts of channels with 0, 1, 2, 3, 4, 5 or >5 blocks
    col = 'n_{}'.format(self.block_class)
    bins = range(0, 7) + [max(7, np.max(results_ar[col]) + 1)]
    counts, bins_np = np.histogram(results_ar[col], bins=bins, density=False)
    starts_counts = zip(bins_np, counts)
    for bin_start, count in starts_counts[:-1]:
        col = 'n_ch_mux_{}_{}'.format(bin_start, self.block_class)
        agg[col] = count
    rest_start, rest_count = starts_counts[-1]
    col = 'n_ch_mux_ge_{}_{}'.format(rest_start, self.block_class)
    agg[col] = rest_count

    run_report_fp = os.path.join(
        self.filtered_summary.outpath,
        add_prefix(self.block_run_report, prefix=self.filtered_summary.prefix))
    meta = self.filtered_summary.meta
    convert = self.filtered_summary.converter
    with open(run_report_fp, 'w') as fh:
        fh.write('{}\n'.format('\t'.join(agg.keys() + meta.keys())))
        fh.write('{}\n'.format('\t'.join(
            [convert(v) for v in agg.values() + meta.values()])))
    return results_ar, agg

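
# Sketch of the expon.fit usage above (fabricated values): with the location
# fixed at zero, the fitted scale is the maximum-likelihood mean of the data,
# which for the exponential distribution is just the arithmetic mean.
def _example_exp_mean():
    import numpy as np
    from scipy.stats import expon
    times_to_first_block = np.array([120.0, 300.0, 45.0, 900.0, 210.0])
    _, exp_mean = expon.fit(times_to_first_block, floc=0)
    assert np.isclose(exp_mean, times_to_first_block.mean())
    return exp_mean  # 315.0
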
def _get_levels(self, outpath, prefix, times=None, pore_rank=0, capture_rank=1,
                thresh_factor=0.9):
    """Calculate distribution of event means, and infer open-pore level and
    capture level.

    Assumes the pore level corresponds to the highest-probability peak in the
    distribution, and that the capture level is the second highest.

    :param outpath: directory in which to plot the distribution and levels.
    :param prefix: prefix (prefixed to output plot path).
    :param times: (start time, end time) or None.
    :param pore_rank: int, ranking of the pore current within the kde local
        maxima; the default corresponds to the highest-probability peak.
    :param capture_rank: int, ranking of the capture current within the kde
        local maxima; the default corresponds to the second highest-probability
        peak.
    :param thresh_factor: float, factor f with which to calculate the boundary
        threshold; threshold = capture_level + f * (pore_level - capture_level).
        A value of 0.5 implies the midpoint between pore and capture.

    :returns: tuple of floats, (pore_level, capture_level, threshold).
    """
    with BulkFast5(self.fast5) as fh:
        logger.info('Loading events for channel {}'.format(self.channel))
        events = fh.get_events(self.channel, times=times)

    logger.info('Calculating kde for channel {}'.format(self.channel))
    # silverman is seemingly better for multi-modal distributions
    kde = gaussian_kde(events['mean'], bw_method='silverman')
    logger.info('Done calculating kde for channel {}'.format(self.channel))

    x = np.linspace(np.min(events['mean']), np.max(events['mean']), 100)
    pde_vals = kde(x)  # evaluate density over grid
    max_inds = argrelmax(pde_vals)  # find all local maxima
    max_probs = pde_vals[max_inds]
    sorted_inds = np.argsort(max_probs)[::-1]  # so max prob is 1st elem
    pore_ind = max_inds[0][sorted_inds[pore_rank]]
    capture_ind = max_inds[0][sorted_inds[capture_rank]]
    pore_level = x[pore_ind]
    capture_level = x[capture_ind]
    threshold = capture_level + thresh_factor * (pore_level - capture_level)

    # plot kde, histogram and levels.
    fig, axis = plt.subplots()
    axis.hist(events['mean'], bins=100, color='k', label='histogram')
    axis.legend(loc='upper center', frameon=False)
    axis.set_xlim((-100, 400))
    axis2 = axis.twinx()
    axis2.plot(x, pde_vals, label='kde', color='k')
    axis2.plot(x[max_inds], pde_vals[max_inds], 'o', label='local maxima', color='b')
    axis2.plot(x[pore_ind], pde_vals[pore_ind], 'o', label='open pore current',
               color='r')
    axis2.plot(x[capture_ind], pde_vals[capture_ind], 'o', label='capture current',
               color='g')
    axis.axvline(threshold, label='threshold', color='magenta')
    axis2.legend(loc='upper left', frameon=False)
    plot_path = os.path.join(
        outpath,
        add_prefix('AdaptiveThresholdLevels_{}'.format(self.channel), prefix))
    plt.savefig(plot_path, bbox_inches='tight', dpi=200)

    with open(plot_path + '.txt', 'w') as fh:
        fh.write('#pore rank {}\n'.format(pore_rank))
        fh.write('#capture rank {}\n'.format(capture_rank))
        fh.write('#thresh_factor {}\n'.format(thresh_factor))
        fh.write('#pore level {}\n'.format(pore_level))
        fh.write('#capture level {}\n'.format(capture_level))
        fh.write('#threshold level {}\n'.format(threshold))
        # write local maxima in kde distribution
        fh.write('# probability maxima in kde\n')
        fh.write('\t'.join(['pA', 'kde']) + '\n')
        for i in range(len(max_probs)):
            j = max_inds[0][sorted_inds[i]]
            fh.write('\t'.join(map(str, [x[j], pde_vals[j]])) + '\n')
        # write sampled kde
        fh.write('# kde points\n')
        fh.write('\t'.join(['pA', 'kde']) + '\n')
        for xi, yi in zip(x, pde_vals):
            fh.write('\t'.join(map(str, [xi, yi])) + '\n')

    return pore_level, capture_level, threshold

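
# Quick illustration of the thresh_factor arithmetic documented above
# (numbers are hypothetical): with pore_level = 220 pA, capture_level = 60 pA
# and thresh_factor = 0.9, the boundary threshold sits close to the pore level.
def _example_threshold():
    pore_level, capture_level, thresh_factor = 220.0, 60.0, 0.9
    threshold = capture_level + thresh_factor * (pore_level - capture_level)
    return threshold  # 60 + 0.9 * 160 = 204.0
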