def to_report(filtered_csv, output_dir, dpi=72, thumb_dpi=20):
    """Run the filtered-subread report.

    Streams records from the filtered-subread CSV through two groups of
    aggregators (subreads that passed the filter vs. all subreads), builds
    the subread-length plot group, and returns the finished Report.

    :param filtered_csv: path to the filtered subread summary CSV
    :param output_dir: directory the plot images are written to
    :param dpi: resolution of the full-size plot images
    :param thumb_dpi: resolution of the thumbnail images
    :raises NoSubreadsFound: if the CSV contains no subreads at all
    :raises NoSubreadsPassedFilter: if no subreads passed the filter
    :rtype: Report
    """
    validate_file(filtered_csv)
    validate_dir(output_dir)

    # Aggregators computed over subreads that passed the filter.
    aggregators = {'nbases': SumAggregator('length'),
                   'nreads': CountAggregator('length'),
                   'mean_subreadlength': MeanSubreadLengthAggregator('length'),
                   'max_readlength': MaxAggregator('length'),
                   'n50': N50Aggregator('length'),
                   'readlength_histogram': HistogramAggregator('length', 0, 100,
                                                               nbins=10000),
                   'subread': SubreadLengthHistogram(dx=100)}

    def passed_filter(record):
        # Only records explicitly flagged as having passed the filter.
        return record.passed_filter is True

    passed_filter_func = functools.partial(
        _apply, [passed_filter], aggregators.values())

    # Aggregators computed over every subread (no filtering).
    all_subread_aggregators = {'raw_nreads': SumAggregator('length'),
                               'max_raw_readlength': MaxAggregator('length'),
                               'raw_readlength_histogram':
                               HistogramAggregator('length', 0, 100,
                                                   nbins=10000)}

    all_filter_func = functools.partial(
        _apply, [null_filter], all_subread_aggregators.values())

    funcs = [passed_filter_func, all_filter_func]

    with open(filtered_csv, 'r') as f:
        # Skip the CSV header line; the data records follow.
        _ = f.readline()
        # validate_header(header)
        applyer(to_record, f, funcs)

    for aggregator in itertools.chain(aggregators.values(),
                                      all_subread_aggregators.values()):
        log.info(aggregator)

    # Check if any reads are found
    if all_subread_aggregators['raw_nreads'].attribute == 0:
        raise NoSubreadsFound(
            "No subreads found in {f}".format(f=filtered_csv))

    # Now check whether any subreads survived filtering.
    if aggregators['nreads'].attribute == 0:
        msg = "No subreads passed the filter in {f}.".format(f=filtered_csv)
        raise NoSubreadsPassedFilter(msg)

    # this is where you change the plotting options
    plot_view = PlotViewProperties(
        Constants.P_POST_FILTER,
        Constants.PG_SUBREAD_LENGTH,
        custom_subread_length_histogram,
        Constants.I_FILTER_SUBREADS_HIST,
        xlabel=meta_rpt.get_meta_plotgroup(
            Constants.PG_SUBREAD_LENGTH).get_meta_plot(
                Constants.P_POST_FILTER).xlabel,
        ylabel=meta_rpt.get_meta_plotgroup(
            Constants.PG_SUBREAD_LENGTH).get_meta_plot(
                Constants.P_POST_FILTER).ylabel["L"],
        rlabel=meta_rpt.get_meta_plotgroup(
            Constants.PG_SUBREAD_LENGTH).get_meta_plot(
                Constants.P_POST_FILTER).ylabel["R"],
        thumb="filtered_subread_report_thmb.png",
        use_group_thumb=True,
        plot_group_title="",
        color=get_green(3),
        edgecolor=get_green(2))

    view_config_d = {'subread': plot_view}
    id_aggregators = {'subread': aggregators['subread']}

    plot_groups = to_plot_groups(view_config_d, output_dir, id_aggregators)

    def to_a(name):
        return aggregators[name].attribute

    attributes = _to_attributes(to_a('nreads'),
                                to_a('nbases'),
                                to_a('mean_subreadlength'),
                                to_a('n50'))

    report = Report(Constants.R_ID,
                    title="Subread filtering",
                    plotgroups=plot_groups,
                    attributes=attributes)
    log.debug(str(report))
    return meta_rpt.apply_view(report)
def to_report(self, output_dir, report_id=Constants.R_ID):
    """Build the mapping statistics report.

    This needs to be cleaned up. Keeping the old interface for testing
    purposes.

    :param output_dir: directory the plot images are written to
    :param report_id: id assigned to the generated Report
    :rtype: Report
    """
    import os  # local import: os.path.join is needed for the plot path below

    started_at = time.time()

    log.info("Found {n} movies.".format(n=len(self.movies)))
    log.info("Working from {n} alignment file{s}: {f}".format(
        n=len(self.alignment_file_list),
        s='s' if len(self.alignment_file_list) > 1 else '',
        f=self.alignment_file_list))

    # make this a dict {attribute_key_name:Aggreggator} so it's easy to
    # access the instances after they've been computed.
    # there's duplicated keys in the attributes?
    # number_of_aligned_reads/mapped_reads_n
    _total_aggregators = self._get_total_aggregators()

    def null_filter(r):
        return True

    total_model = StatisticsModel(
        _total_aggregators.values(), filter_func=null_filter)

    # need to create specific instances for a given movie. This is used to
    # create the mapping reports stats table
    movie_models = {}

    def _my_filter(movie_name1, movie_name2):
        return movie_name1 == movie_name2

    for movie in self.movies:
        ags = [k() for k in self.COLUMN_AGGREGATOR_CLASSES]
        # Note this WILL NOT work because of how scope works in python
        # filter_by_movie_func = lambda m_name: movie.name == m_name
        _my_filter_func = functools.partial(_my_filter, movie)
        model = StatisticsModel(ags, filter_func=_my_filter_func)
        movie_models[movie] = model

    # The statistic models that will be run.
    # list() is required: dict.values() is a non-concatenable view on
    # Python 3.
    all_models = [total_model] + list(movie_models.values())
    log.debug(all_models)

    # Run all the analysis. Now the aggregators can be accessed
    analyze_movies(self.movies, self.alignment_file_list, all_models)

    # temp structure used to create the report table. The order is
    # important

    # add total values
    def _to_a(k):
        return _total_aggregators[k].attribute

    _row = [_to_a(n) for n in self.COLUMN_ATTR]
    _row.insert(0, 'All Movies')
    movie_datum = [_row]

    # Add each individual movie stats
    # items() instead of the Python-2-only iteritems()
    for movie_name_, model_ in movie_models.items():
        _row = [movie_name_]
        for a in model_.aggregators:
            _row.append(a.attribute)
        movie_datum.append(_row)
    log.info(movie_datum)

    # create the Report table
    table = self._to_table(movie_datum)

    for movie_name, model in movie_models.items():
        log.info("Movie name {n}".format(n=movie_name))
        for a in model.aggregators:
            log.info(movie_name + " " + repr(a))

    log.info("")
    log.info("Total models")
    for a in total_model.aggregators:
        log.info(a)

    attributes = get_attributes(_total_aggregators)

    log.info("Attributes from streaming mapping Report.")
    for a in attributes:
        log.info(a)

    plot_config_views = self._get_plot_view_configs()
    plot_groups = []
    ds = openDataFile(self.alignment_file)
    ds.updateCounts()
    if len(ds) > 0:
        # keeping the ids independent requires a bit of dictionary madness
        # {report_id:HistogramAggregator}
        id_to_aggregators = {k: _total_aggregators[v]
                             for k, v in self.HISTOGRAM_IDS.items()}
        plot_groups = to_plot_groups(plot_config_views, output_dir,
                                     id_to_aggregators)
        rb_pg = PlotGroup(Constants.PG_RAINBOW)
        rb_png = "mapped_concordance_vs_read_length.png"
        # Write the image into output_dir (previously it went to the CWD,
        # while the Plot references it relative to the report directory).
        make_rainbow_plot(self.alignment_file,
                          os.path.join(output_dir, rb_png))
        rb_plt = Plot(Constants.P_RAINBOW, rb_png,
                      caption=get_plot_caption(spec, Constants.PG_RAINBOW,
                                               Constants.P_RAINBOW))
        rb_pg.add_plot(rb_plt)
        plot_groups.append(rb_pg)
    self.add_more_plots(plot_groups, output_dir)

    tables = [table]
    report = Report(report_id,
                    attributes=attributes,
                    plotgroups=plot_groups,
                    tables=tables,
                    dataset_uuids=self.dataset_uuids)

    log.debug(report)
    run_time = time.time() - started_at
    log.info("Completed running in {s:.2f} sec.".format(s=run_time))
    return report
def to_report(filter_csv, output_dir, dpi=72):
    """Main point of entry

    The filter stats report has two main modes.

    All Reads: (i.e., PreFilter) SequencingZMW > 0
    - total bases
    - total number of reads
    - n50
    - mean readlength
    - mean readscore

    HQ Region: (i.e., PostFilter) PassedFilter > 0, SequencingZMW > 0
    - total bases
    - total number of reads
    - n50
    - mean readlength
    - mean readscore

    Generates:
    - Pre and Post filter ReadLength histograms with SDF (with thumbnails)
    - Pre and Post filter ReadScore Histogram with SDF (with thumbnails)
    - Pre and Post table of total bases, # of reads, mean readlengh,
      mean readscore

    :type filter_csv: str
    :type output_dir: str
    :type dpi: int

    :rtype: Report
    """
    log.info("Analyzing CSV {f}".format(f=filter_csv))

    # A little sanity checking
    csv_header_fields = _get_header_fields_from_csv(filter_csv)
    if not _validate_header(csv_header_fields):
        raise CsvParserError("Unable to find required fields {r} in {f}.".format(
            r=_REQUIRED_HEADER_FIELDS, f=filter_csv))

    P = functools.partial
    # Reuse the header fields parsed above instead of re-reading the CSV.
    row_to_rec_func = P(_row_to_record, csv_header_fields)

    # General Filters
    # General construct to create a func with signature f(record) -> Bool
    def seq_zmw_filter_f(record):
        return record.SequencingZMW > 0

    def hq_filter_f(record):
        return record.PassedFilter > 0

    # Pre-Filter Aggregator(s)
    nbases_ag = SumAggregator('Readlength')
    nreads_ag = CountAggregator('Readlength')
    readlength_ag = MeanAggregator('Readlength')
    max_readlength_ag = MaxAggregator('Readlength')
    min_readlength_ag = MinAggregator('Readlength')
    # the histogram is adaptively computed; only the min value and dx are
    # fixed here
    readlength_hist_ag = HistogramAggregator('Readlength', 0, dx=100)
    read_score_hist_ag = HistogramAggregator('ReadScore', 0, dx=0.002)
    # n50_ag = N50Aggregator('Readlength', max_bins=200000)
    pre_n50_ag = N50Aggregator('Readlength', max_bins=200000)
    readscore_ag = SumAggregator('ReadScore', total=0)
    readscore_mean_ag = MeanAggregator('ReadScore')

    # Create/bind core Functions that can be based to the applyer method
    # Calling these 'Models'. A model is list of filters and an aggregator
    # Signature to _apply is ([filter1, filter2], aggregator, record)
    # calling functools.partial returns a function signature f(record)
    pre_filters = [seq_zmw_filter_f]
    pre_agros = [nbases_ag, nreads_ag, readscore_ag, readscore_mean_ag,
                 min_readlength_ag, max_readlength_ag, readlength_ag,
                 readlength_hist_ag, read_score_hist_ag, pre_n50_ag]
    pre_models = [P(_apply, pre_filters, pre_agros)]

    # Post-Filter Aggregator(s)
    post_nbases_ag = SumAggregator('Readlength')
    post_nreads_ag = CountAggregator('Readlength')
    post_readlength_ag = MeanAggregator('Readlength')
    post_min_readlength_ag = MinAggregator('Readlength')
    post_max_readlength_ag = MaxAggregator('Readlength')
    # the histogram is adaptively computed; only the min value and dx are
    # fixed here
    post_readlength_hist_ag = HistogramAggregator('Readlength', 0, dx=100)
    post_readscore_hist_ag = HistogramAggregator('ReadScore', 0, dx=0.002)
    post_readscore_ag = SumAggregator('ReadScore')
    post_readscore_mean_ag = MeanAggregator('ReadScore')
    post_n50_ag = N50Aggregator('Readlength', max_bins=200000)

    # Post Filter Models
    post_filters = [seq_zmw_filter_f, hq_filter_f]
    post_agros = [post_nbases_ag, post_nreads_ag, post_readlength_ag,
                  post_readscore_ag, post_min_readlength_ag,
                  post_max_readlength_ag, post_readscore_mean_ag,
                  post_n50_ag, post_readlength_hist_ag,
                  post_readscore_hist_ag]
    post_models = [P(_apply, post_filters, post_agros)]

    models = pre_models + post_models

    with open(filter_csv, 'r') as f:
        # read in header
        _ = f.readline()
        applyer(row_to_rec_func, f, models)

    # Sanity Checking of data
    # Look for csv files with only the csv header (i.e., empty with no data)
    if nreads_ag.attribute < 1:
        msg = "No filtered reads found in {f}. Unable to generate FilterStats report".format(
            f=filter_csv)
        raise NoFilteredReadsError(msg)

    # Exit if all the reads were filtered out.
    if post_nreads_ag.attribute < 1:
        msg = "No filtered reads found in {f}. Unable to generate report.".format(
            f=filter_csv)
        raise NoPassedFilteredReadsError(msg)

    # this is getting really messy
    id_to_aggro = {Constants.A_BASE_N_POST_FILTER: post_nbases_ag,
                   Constants.A_BASE_N_PRE_FILTER: nbases_ag,
                   Constants.A_MEAN_READ_LENGTH_POST_FILTER: post_readlength_ag,
                   Constants.A_MEAN_READ_LENGTH_PRE_FILTER: readlength_ag,
                   Constants.A_MEAN_READ_SCORE_POST_FILTER: post_readscore_mean_ag,
                   Constants.A_MEAN_READ_SCORE_PRE_FILTER: readscore_mean_ag,
                   Constants.A_READS_N_POST_FILTER: post_nreads_ag,
                   Constants.A_READS_N_PRE_FILTER: nreads_ag,
                   Constants.A_N50_READ_LENGTH_PRE_FILTER: pre_n50_ag,
                   Constants.A_N50_READ_LENGTH_POST_FILTER: post_n50_ag}

    _log_aggros(id_to_aggro)

    attributes = to_attributes(**id_to_aggro)

    plot_aggregators = {Constants.P_PRE_FILTER_READ_LENGTH_HIST: readlength_hist_ag,
                        Constants.P_POST_FILTER_READ_LENGHT_HIST: post_readlength_hist_ag,
                        Constants.P_PRE_FILTER_READ_SCORE_HIST: read_score_hist_ag,
                        Constants.P_POST_FILTER_READ_SCORE_HIST: post_readscore_hist_ag}

    _log_aggros(plot_aggregators)

    # this is dict of {id:PlotView instances}
    view_configs = _plot_view_configs()

    plot_groups = to_plot_groups(view_configs, output_dir, plot_aggregators)

    # Temp lists to Create Pbreports Table
    # Each list has [nbases, nreads, n50, mean readlength, mean readscore]
    _pre_filter = [nbases_ag.attribute,
                   nreads_ag.attribute,
                   pre_n50_ag.attribute,
                   readlength_ag.attribute,
                   np.round(readscore_mean_ag.attribute, decimals=3)]

    _post_filter = [post_nbases_ag.attribute,
                    post_nreads_ag.attribute,
                    post_n50_ag.attribute,
                    post_readlength_ag.attribute,
                    np.round(post_readscore_mean_ag.attribute, decimals=3)]

    table = to_table(_pre_filter, _post_filter)

    report = Report(Constants.R_ID,
                    tables=[table],
                    plotgroups=plot_groups,
                    attributes=attributes)

    log.debug(report)
    for attribute in report.attributes:
        log.debug(attribute)

    log.info(str(report.tables[0]))

    return report
def to_report(filtered_csv, output_dir, dpi=72, thumb_dpi=20):
    """Run the filtered-subread report.

    Streams records from the filtered-subread CSV through two groups of
    aggregators (subreads that passed the filter vs. all subreads), builds
    the subread-length plot group, and returns the finished Report.

    :param filtered_csv: path to the filtered subread summary CSV
    :param output_dir: directory the plot images are written to
    :param dpi: resolution of the full-size plot images
    :param thumb_dpi: resolution of the thumbnail images
    :raises NoSubreadsFound: if the CSV contains no subreads at all
    :raises NoSubreadsPassedFilter: if no subreads passed the filter
    :rtype: Report
    """
    validate_file(filtered_csv)
    validate_dir(output_dir)

    # Aggregators computed over subreads that passed the filter.
    aggregators = {'nbases': SumAggregator('length'),
                   'nreads': CountAggregator('length'),
                   'mean_subreadlength': MeanSubreadLengthAggregator('length'),
                   'max_readlength': MaxAggregator('length'),
                   'n50': N50Aggregator('length'),
                   'readlength_histogram': HistogramAggregator('length', 0, 100,
                                                               nbins=10000),
                   'subread': SubreadLengthHistogram(dx=100)}

    def passed_filter(record):
        # Only records explicitly flagged as having passed the filter.
        return record.passed_filter is True

    passed_filter_func = functools.partial(
        _apply, [passed_filter], aggregators.values())

    # Aggregators computed over every subread (no filtering).
    all_subread_aggregators = {'raw_nreads': SumAggregator('length'),
                               'max_raw_readlength': MaxAggregator('length'),
                               'raw_readlength_histogram':
                               HistogramAggregator('length', 0, 100,
                                                   nbins=10000)}

    all_filter_func = functools.partial(
        _apply, [null_filter], all_subread_aggregators.values())

    funcs = [passed_filter_func, all_filter_func]

    with open(filtered_csv, 'r') as f:
        # Skip the CSV header line; the data records follow.
        _ = f.readline()
        # validate_header(header)
        applyer(to_record, f, funcs)

    for aggregator in itertools.chain(aggregators.values(),
                                      all_subread_aggregators.values()):
        log.info(aggregator)

    # Check if any reads are found
    if all_subread_aggregators['raw_nreads'].attribute == 0:
        raise NoSubreadsFound(
            "No subreads found in {f}".format(f=filtered_csv))

    # Now check whether any subreads survived filtering.
    if aggregators['nreads'].attribute == 0:
        msg = "No subreads passed the filter in {f}.".format(f=filtered_csv)
        raise NoSubreadsPassedFilter(msg)

    # this is where you change the plotting options
    plot_view = PlotViewProperties(
        Constants.P_POST_FILTER,
        Constants.PG_SUBREAD_LENGTH,
        custom_subread_length_histogram,
        Constants.I_FILTER_SUBREADS_HIST,
        xlabel=meta_rpt.get_meta_plotgroup(
            Constants.PG_SUBREAD_LENGTH).get_meta_plot(
                Constants.P_POST_FILTER).xlabel,
        ylabel=meta_rpt.get_meta_plotgroup(
            Constants.PG_SUBREAD_LENGTH).get_meta_plot(
                Constants.P_POST_FILTER).ylabel["L"],
        rlabel=meta_rpt.get_meta_plotgroup(
            Constants.PG_SUBREAD_LENGTH).get_meta_plot(
                Constants.P_POST_FILTER).ylabel["R"],
        thumb="filtered_subread_report_thmb.png",
        use_group_thumb=True,
        plot_group_title="",
        color=get_green(3),
        edgecolor=get_green(2))

    view_config_d = {'subread': plot_view}
    id_aggregators = {'subread': aggregators['subread']}

    plot_groups = to_plot_groups(view_config_d, output_dir, id_aggregators)

    def to_a(name):
        return aggregators[name].attribute

    attributes = _to_attributes(to_a('nreads'),
                                to_a('nbases'),
                                to_a('mean_subreadlength'),
                                to_a('n50'))

    report = Report(Constants.R_ID,
                    title="Subread filtering",
                    plotgroups=plot_groups,
                    attributes=attributes)
    log.debug(str(report))
    return meta_rpt.apply_view(report)
def to_report(self, output_dir, report_id=Constants.R_ID):
    """Build the mapping statistics report.

    This needs to be cleaned up. Keeping the old interface for testing
    purposes.

    :param output_dir: directory the plot images are written to
    :param report_id: id assigned to the generated Report
    :rtype: Report
    """
    started_at = time.time()

    log.info("Found {n} movies.".format(n=len(self.movies)))
    log.info("Working from {n} alignment file{s}: {f}".format(
        n=len(self.alignment_file_list),
        s='s' if len(self.alignment_file_list) > 1 else '',
        f=self.alignment_file_list))

    # make this a dict {attribute_key_name:Aggreggator} so it's easy to
    # access the instances after they've been computed.
    # there's duplicated keys in the attributes?
    # number_of_aligned_reads/mapped_reads_n
    _total_aggregators = self._get_total_aggregators()

    def null_filter(r):
        return True

    total_model = StatisticsModel(
        _total_aggregators.values(), filter_func=null_filter)

    # need to create specific instances for a given movie. This is used to
    # create the mapping reports stats table
    movie_models = {}

    def _my_filter(movie_name1, movie_name2):
        return movie_name1 == movie_name2

    for movie in self.movies:
        ags = [k() for k in self.COLUMN_AGGREGATOR_CLASSES]
        # Note this WILL NOT work because of how scope works in python
        # filter_by_movie_func = lambda m_name: movie.name == m_name
        _my_filter_func = functools.partial(_my_filter, movie)
        model = StatisticsModel(ags, filter_func=_my_filter_func)
        movie_models[movie] = model

    # The statistic models that will be run.
    # list() is required: dict.values() is a non-concatenable view on
    # Python 3.
    all_models = [total_model] + list(movie_models.values())
    log.debug(all_models)

    # Run all the analysis. Now the aggregators can be accessed
    analyze_movies(self.movies, self.alignment_file_list, all_models)

    # temp structure used to create the report table. The order is
    # important

    # add total values
    def _to_a(k):
        return _total_aggregators[k].attribute

    _row = [_to_a(n) for n in self.COLUMN_ATTR]
    _row.insert(0, 'All Movies')
    movie_datum = [_row]

    # Add each individual movie stats
    # items() instead of the Python-2-only iteritems()
    for movie_name_, model_ in movie_models.items():
        _row = [movie_name_]
        for a in model_.aggregators:
            _row.append(a.attribute)
        movie_datum.append(_row)
    log.info(movie_datum)

    # create the Report table
    table = self._to_table(movie_datum)

    for movie_name, model in movie_models.items():
        log.info("Movie name {n}".format(n=movie_name))
        for a in model.aggregators:
            log.info(movie_name + " " + repr(a))

    log.info("")
    log.info("Total models")
    for a in total_model.aggregators:
        log.info(a)

    attributes = get_attributes(_total_aggregators)
    self.add_more_attributes(attributes)

    log.info("Attributes from streaming mapping Report.")
    for a in attributes:
        log.info(a)

    plot_config_views = self._get_plot_view_configs()
    plot_groups = []
    ds = openDataFile(self.alignment_file)
    ds.updateCounts()
    if len(ds) > 0:
        # keeping the ids independent requires a bit of dictionary madness
        # {report_id:HistogramAggregator}
        id_to_aggregators = {k: _total_aggregators[v]
                             for k, v in self.HISTOGRAM_IDS.items()}
        plot_groups = to_plot_groups(plot_config_views, output_dir,
                                     id_to_aggregators)
        rb_pg = PlotGroup(Constants.PG_RAINBOW)
        rb_png = "mapped_concordance_vs_read_length.png"
        # Image is written into output_dir; the Plot references it by
        # relative name.
        make_rainbow_plot(self.alignment_file, op.join(output_dir, rb_png))
        rb_plt = Plot(Constants.P_RAINBOW, rb_png)
        rb_pg.add_plot(rb_plt)
        plot_groups.append(rb_pg)
    self.add_more_plots(plot_groups, output_dir)

    tables = [table]
    report = Report(report_id,
                    attributes=attributes,
                    plotgroups=plot_groups,
                    tables=tables,
                    dataset_uuids=self.dataset_uuids)

    log.debug(report)
    run_time = time.time() - started_at
    log.info("Completed running in {s:.2f} sec.".format(s=run_time))
    return report