Example #1
def to_report(filtered_csv, output_dir, dpi=72, thumb_dpi=20):
    """
    Run Report
    """
    validate_file(filtered_csv)
    validate_dir(output_dir)

    aggregators = {
        'nbases': SumAggregator('length'),
        'nreads': CountAggregator('length'),
        'mean_subreadlength': MeanSubreadLengthAggregator('length'),
        'max_readlength': MaxAggregator('length'),
        'n50': N50Aggregator('length'),
        'readlength_histogram': HistogramAggregator('length',
                                                    0,
                                                    100,
                                                    nbins=10000),
        'subread': SubreadLengthHistogram(dx=100)
    }

    passed_filter = lambda record: record.passed_filter is True

    passed_filter_func = functools.partial(_apply, [passed_filter],
                                           aggregators.values())

    all_subread_aggregators = {
        'raw_nreads': SumAggregator('length'),
        'max_raw_readlength': MaxAggregator('length'),
        'raw_readlength_histogram': HistogramAggregator('length', 0, 100,
                                                        nbins=10000)
    }

    all_filter_func = functools.partial(_apply, [null_filter],
                                        all_subread_aggregators.values())

    funcs = [passed_filter_func, all_filter_func]

    with open(filtered_csv, 'r') as f:
        # read in header
        header = f.readline()
        # validate_header(header)
        applyer(to_record, f, funcs)

    for aggregator in itertools.chain(aggregators.values(),
                                      all_subread_aggregators.values()):
        log.info(aggregator)

    # Check if any reads are found
    if all_subread_aggregators['raw_nreads'].attribute == 0:
        raise NoSubreadsFound(
            "No subreads found in {f}".format(f=filtered_csv))

    # Now check that at least one subread passed the filter
    if aggregators['nreads'].attribute == 0:
        msg = "No subreads passed the filter in {f}.".format(f=filtered_csv)
        raise NoSubreadsPassedFilter(msg)

    # this is where you change the plotting options
    plot_view = PlotViewProperties(
        Constants.P_POST_FILTER,
        Constants.PG_SUBREAD_LENGTH,
        custom_subread_length_histogram,
        Constants.I_FILTER_SUBREADS_HIST,
        xlabel=meta_rpt.get_meta_plotgroup(
            Constants.PG_SUBREAD_LENGTH).get_meta_plot(
                Constants.P_POST_FILTER).xlabel,
        ylabel=meta_rpt.get_meta_plotgroup(
            Constants.PG_SUBREAD_LENGTH).get_meta_plot(
                Constants.P_POST_FILTER).ylabel["L"],
        rlabel=meta_rpt.get_meta_plotgroup(
            Constants.PG_SUBREAD_LENGTH).get_meta_plot(
                Constants.P_POST_FILTER).ylabel["R"],
        thumb="filtered_subread_report_thmb.png",
        use_group_thumb=True,
        plot_group_title="",
        color=get_green(3),
        edgecolor=get_green(2))

    view_config_d = {'subread': plot_view}
    id_aggregators = {'subread': aggregators['subread']}

    plot_groups = to_plot_groups(view_config_d, output_dir, id_aggregators)

    to_a = lambda n: aggregators[n].attribute

    attributes = _to_attributes(to_a('nreads'), to_a('nbases'),
                                to_a('mean_subreadlength'), to_a('n50'))

    report = Report(Constants.R_ID,
                    title="Subread filtering",
                    plotgroups=plot_groups,
                    attributes=attributes)

    log.debug(str(report))

    return meta_rpt.apply_view(report)
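
Example #1 (like the other CSV examples below) leans on a small streaming pattern: `applyer` walks the open file once, converts each line to a record, and hands the record to every model function, while `_apply` runs a record through a list of filters and, only if all of them pass, feeds it to each aggregator. Neither helper is shown in this section, so the sketch below is a reconstruction from the call sites (`functools.partial(_apply, [filter], aggregators)` and `applyer(to_record, f, funcs)`), not the library's actual implementation:

# Hypothetical reconstruction of the two streaming helpers used above;
# the real _apply/applyer are defined elsewhere in the library.
def _apply(filter_funcs, aggregators, record):
    """Feed a record to every aggregator iff it passes all filters."""
    if all(f(record) for f in filter_funcs):
        for aggregator in aggregators:
            aggregator.apply(record)  # assumes an Aggregator.apply(record) API

def applyer(to_record_func, line_iter, funcs):
    """Make a single pass over the file, handing each record to every model."""
    for line in line_iter:
        record = to_record_func(line.rstrip())
        for func in funcs:
            func(record)

With this shape, `functools.partial(_apply, [passed_filter], aggregators.values())` pre-binds the filters and the aggregators, leaving exactly the one-argument `f(record)` that `applyer` expects.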
Example #2
    def to_report(self, output_dir, report_id=Constants.R_ID):
        """
        This needs to be cleaned up. Keeping the old interface for testing purposes.
        """
        started_at = time.time()

        log.info("Found {n} movies.".format(n=len(self.movies)))

        log.info("Working from {n} alignment file{s}: {f}".format(
            n=len(self.alignment_file_list),
            s='s' if len(self.alignment_file_list) > 1 else '',
            f=self.alignment_file_list))

        # make this a dict {attribute_key_name: Aggregator} so it's easy to
        # access the instances after they've been computed.
        # open question: are there duplicated keys in the attributes?
        # (number_of_aligned_reads vs. mapped_reads_n)
        _total_aggregators = self._get_total_aggregators()
        null_filter = lambda r: True
        total_model = StatisticsModel(
            _total_aggregators.values(), filter_func=null_filter)

        # need to create specific instances for a given movie. This is used to
        # create the mapping reports stats table
        movie_models = {}

        def _my_filter(movie_name1, movie_name2):
            return movie_name1 == movie_name2

        for movie in self.movies:
            ags = [k() for k in self.COLUMN_AGGREGATOR_CLASSES]
            # Note: a lambda defined here WILL NOT work because of Python's
            # late-binding closures; every lambda would capture the final
            # value of `movie`, not its value at this iteration.
            # filter_by_movie_func = lambda m_name: movie.name == m_name
            _my_filter_func = functools.partial(_my_filter, movie)
            model = StatisticsModel(ags, filter_func=_my_filter_func)
            movie_models[movie] = model

        # The statistic models that will be run
        all_models = [total_model] + movie_models.values()
        log.debug(all_models)

        # Run all the analyses. After this call the aggregators hold their
        # computed values.

        analyze_movies(self.movies, self.alignment_file_list, all_models)

        # temp structure used to create the report table. The order is
        # important

        # add total values
        _to_a = lambda k: _total_aggregators[k].attribute
        _row = [_to_a(n) for n in self.COLUMN_ATTR]
        _row.insert(0, 'All Movies')
        movie_datum = [_row]

        # Add each individual movie stats
        for movie_name_, model_ in movie_models.iteritems():
            _row = [movie_name_]
            for a in model_.aggregators:
                _row.append(a.attribute)
            movie_datum.append(_row)
        log.info(movie_datum)

        # create the Report table

        table = self._to_table(movie_datum)

        for movie_name, model in movie_models.iteritems():
            log.info("Movie name {n}".format(n=movie_name))
            for a in model.aggregators:
                log.info(movie_name + " " + repr(a))

        log.info("")
        log.info("Total models")
        for a in total_model.aggregators:
            log.info(a)

        attributes = get_attributes(_total_aggregators)

        log.info("Attributes from streaming mapping Report.")
        for a in attributes:
            log.info(a)

        plot_config_views = self._get_plot_view_configs()
        plot_groups = []

        ds = openDataFile(self.alignment_file)
        ds.updateCounts()
        if len(ds) > 0:
            # keeping the ids independent requires a bit of dictionary madness
            # {report_id:HistogramAggregator}
            id_to_aggregators = {k: _total_aggregators[v]
                                 for k, v in self.HISTOGRAM_IDS.iteritems()}
            plot_groups = to_plot_groups(plot_config_views, output_dir,
                                         id_to_aggregators)
            rb_pg = PlotGroup(Constants.PG_RAINBOW)
            rb_png = "mapped_concordance_vs_read_length.png"
            make_rainbow_plot(self.alignment_file,
                              op.join(output_dir, rb_png))
            rb_plt = Plot(Constants.P_RAINBOW, rb_png,
                          caption=get_plot_caption(spec, Constants.PG_RAINBOW,
                                                   Constants.P_RAINBOW))
            rb_pg.add_plot(rb_plt)
            plot_groups.append(rb_pg)
        self.add_more_plots(plot_groups, output_dir)

        tables = [table]
        report = Report(report_id,
                        attributes=attributes,
                        plotgroups=plot_groups,
                        tables=tables,
                        dataset_uuids=self.dataset_uuids)

        log.debug(report)

        run_time = time.time() - started_at
        log.info("Completed running in {s:.2f} sec.".format(s=run_time))
        return report
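
The scoping comment inside the movie loop above refers to Python's late-binding closures: a lambda defined in the loop captures the variable `movie`, not its value at that iteration, so every filter would end up comparing against the last movie. A minimal standalone demonstration of the pitfall and the `functools.partial` fix:

import functools

items = ['a', 'b', 'c']

# Broken: each lambda closes over the loop variable itself.
broken = [lambda name: name == item for item in items]
print([f('a') for f in broken])   # [False, False, False] -- all see item == 'c'

# Fixed: partial binds the current value at creation time.
fixed = [functools.partial(lambda bound, name: name == bound, item)
         for item in items]
print([f('a') for f in fixed])    # [True, False, False]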
Example #3
def to_report(filter_csv, output_dir, dpi=72):
    """Main point of entry

    The filter stats report has two main modes.

    All Reads: (i.e., PreFilter)

    SequencingZMW > 0
    - total bases
    - total number of reads
    - n50
    - mean readlength
    - mean readscore

    HQ Region: (i.e., PostFilter)

    PassedFilter > 0, SequencingZMW > 0
    - total bases
    - total number of reads
    - n50
    - mean readlength
    - mean readscore


    Generates:
    - Pre and Post filter ReadLength histograms with SDF (with thumbnails)
    - Pre and Post filter ReadScore Histogram with SDF (with thumbnails)
    - Pre and Post table of total bases, # of reads, mean readlength, mean readscore


    :type filter_csv: str
    :type output_dir: str
    :type dpi: int

    :rtype: Report
    """
    log.info("Analyzing CSV {f}".format(f=filter_csv))

    # A little sanity checking
    csv_header_fields = _get_header_fields_from_csv(filter_csv)
    if not _validate_header(csv_header_fields):
        raise CsvParserError("Unable to find required fields {r} in {f}.".format(
            r=_REQUIRED_HEADER_FIELDS, f=filter_csv))

    P = functools.partial

    row_to_rec_func = P(_row_to_record,
                        _get_header_fields_from_csv(filter_csv))

    # General Filters
    # General construct to create a func with signature f(record) -> Bool
    seq_zmw_filter_f = lambda record: record.SequencingZMW > 0
    hq_filter_f = lambda record: record.PassedFilter > 0

    # Pre-Filter Aggregator(s)
    nbases_ag = SumAggregator('Readlength')
    nreads_ag = CountAggregator('Readlength')
    readlength_ag = MeanAggregator('Readlength')
    max_readlength_ag = MaxAggregator('Readlength')
    min_readlength_ag = MinAggregator('Readlength')
    # the histogram is adaptively computed: only the min value and the bin
    # width (dx) are fixed up front; the bins grow with the data
    readlength_hist_ag = HistogramAggregator('Readlength', 0, dx=100)
    read_score_hist_ag = HistogramAggregator('ReadScore', 0, dx=0.002)

    # n50_ag = N50Aggregator('Readlength', max_bins=200000)
    pre_n50_ag = N50Aggregator('Readlength', max_bins=200000)

    readscore_ag = SumAggregator('ReadScore', total=0)
    readscore_mean_ag = MeanAggregator('ReadScore')

    # Create/bind core functions that can be passed to the applyer method.
    # Calling these 'Models': a model is a list of filters plus aggregators.
    # The signature of _apply is ([filter1, filter2], [agg1, ...], record);
    # calling functools.partial returns a function with signature f(record)
    pre_filters = [seq_zmw_filter_f]
    pre_agros = [nbases_ag, nreads_ag,
                 readscore_ag, readscore_mean_ag,
                 min_readlength_ag, max_readlength_ag,
                 readlength_ag,
                 readlength_hist_ag,
                 read_score_hist_ag,
                 pre_n50_ag]

    pre_models = [P(_apply, pre_filters, pre_agros)]

    # Post-Filter Aggregator(s)
    post_nbases_ag = SumAggregator('Readlength')
    post_nreads_ag = CountAggregator('Readlength')
    post_readlength_ag = MeanAggregator('Readlength')
    post_min_readlength_ag = MinAggregator('Readlength')
    post_max_readlength_ag = MaxAggregator('Readlength')
    # the histogram is adaptively computed: only the min value and the bin
    # width (dx) are fixed up front; the bins grow with the data
    post_readlength_hist_ag = HistogramAggregator('Readlength', 0, dx=100)
    post_readscore_hist_ag = HistogramAggregator('ReadScore', 0, dx=0.002)

    post_readscore_ag = SumAggregator('ReadScore')
    post_readscore_mean_ag = MeanAggregator('ReadScore')

    post_n50_ag = N50Aggregator('Readlength', max_bins=200000)

    # Post Filter Models
    post_filters = [seq_zmw_filter_f, hq_filter_f]

    post_agros = [post_nbases_ag, post_nreads_ag,
                  post_readlength_ag, post_readscore_ag,
                  post_min_readlength_ag, post_max_readlength_ag,
                  post_readscore_mean_ag,
                  post_n50_ag,
                  post_readlength_hist_ag, post_readscore_hist_ag]

    post_models = [P(_apply, post_filters, post_agros)]

    models = pre_models + post_models

    with open(filter_csv, 'r') as f:
        # read in header
        _ = f.readline()
        applyer(row_to_rec_func, f, models)

    # Sanity Checking of data
    # Look for csv files with only the csv header (i.e., empty with no data)
    if nreads_ag.attribute < 1:
        msg = "No filtered reads found in {f}. Unable to generate FilterStats report".format(
            f=filter_csv)
        raise NoFilteredReadsError(msg)

    # Exit if all the reads were filtered out.
    if post_nreads_ag.attribute < 1:
        msg = "No filtered reads found in {f}. Unable to generate report.".format(
            f=filter_csv)
        raise NoPassedFilteredReadsError(msg)

    # this is getting really messy
    id_to_aggro = {Constants.A_BASE_N_POST_FILTER: post_nbases_ag,
                   Constants.A_BASE_N_PRE_FILTER: nbases_ag,
                   Constants.A_MEAN_READ_LENGTH_POST_FILTER: post_readlength_ag,
                   Constants.A_MEAN_READ_LENGTH_PRE_FILTER: readlength_ag,
                   Constants.A_MEAN_READ_SCORE_POST_FILTER: post_readscore_mean_ag,
                   Constants.A_MEAN_READ_SCORE_PRE_FILTER: readscore_mean_ag,
                   Constants.A_READS_N_POST_FILTER: post_nreads_ag,
                   Constants.A_READS_N_PRE_FILTER: nreads_ag,
                   Constants.A_N50_READ_LENGTH_PRE_FILTER: pre_n50_ag,
                   Constants.A_N50_READ_LENGTH_POST_FILTER: post_n50_ag}

    _log_aggros(id_to_aggro)

    attributes = to_attributes(**id_to_aggro)

    plot_aggregators = {Constants.P_PRE_FILTER_READ_LENGTH_HIST: readlength_hist_ag,
                        Constants.P_POST_FILTER_READ_LENGHT_HIST: post_readlength_hist_ag,
                        Constants.P_PRE_FILTER_READ_SCORE_HIST: read_score_hist_ag,
                        Constants.P_POST_FILTER_READ_SCORE_HIST: post_readscore_hist_ag}

    _log_aggros(plot_aggregators)

    # this is dict of {id:PlotView instances}
    view_configs = _plot_view_configs()
    plot_groups = to_plot_groups(view_configs, output_dir, plot_aggregators)

    # Temp lists to Create Pbreports Table
    # Each list has [nbases, nreads, n50, mean readlength, mean readscore]
    _pre_filter = [nbases_ag.attribute,
                   nreads_ag.attribute,
                   pre_n50_ag.attribute,
                   readlength_ag.attribute,
                   np.round(readscore_mean_ag.attribute, decimals=3)]

    _post_filter = [post_nbases_ag.attribute,
                    post_nreads_ag.attribute,
                    post_n50_ag.attribute,
                    post_readlength_ag.attribute,
                    np.round(post_readscore_mean_ag.attribute, decimals=3)]

    table = to_table(_pre_filter, _post_filter)

    report = Report(Constants.R_ID, tables=[table], plotgroups=plot_groups,
                    attributes=attributes)

    log.debug(report)
    for attribute in report.attributes:
        log.debug(attribute)

    log.info(str(report.tables[0]))
    return report
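
The "adaptively computed" histogram comments above deserve a concrete illustration: `HistogramAggregator('Readlength', 0, dx=100)` fixes only the minimum value and the bin width, and the bin list grows as larger values stream in, so no upper bound has to be guessed in advance. A minimal sketch of that idea, assuming the same `apply(record)` aggregator API as before (the real class is not shown here):

# Sketch of an adaptive fixed-width streaming histogram; illustrative only.
class AdaptiveHistogram(object):

    def __init__(self, field_name, min_value, dx):
        self.field_name = field_name
        self.min_value = min_value
        self.dx = dx
        self.bins = []

    def apply(self, record):
        value = getattr(record, self.field_name)
        index = int((value - self.min_value) // self.dx)
        if index >= len(self.bins):
            # grow the bin list on demand instead of fixing nbins up front
            self.bins.extend([0] * (index - len(self.bins) + 1))
        self.bins[index] += 1

With dx=100 a 1,234-base read lands in bin 12; with dx=0.002 the read scores in [0, 1] need only about 500 bins.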
Example #4
def to_report(filtered_csv, output_dir, dpi=72, thumb_dpi=20):
    """
    Run Report
    """
    validate_file(filtered_csv)
    validate_dir(output_dir)

    aggregators = {'nbases': SumAggregator('length'),
                   'nreads': CountAggregator('length'),
                   'mean_subreadlength': MeanSubreadLengthAggregator('length'),
                   'max_readlength': MaxAggregator('length'),
                   'n50': N50Aggregator('length'),
                   'readlength_histogram': HistogramAggregator('length', 0, 100, nbins=10000),
                   'subread': SubreadLengthHistogram(dx=100)}

    passed_filter = lambda record: record.passed_filter is True

    passed_filter_func = functools.partial(
        _apply, [passed_filter], aggregators.values())

    all_subread_aggregators = {'raw_nreads': SumAggregator('length'),
                               'max_raw_readlength': MaxAggregator('length'),
                               'raw_readlength_histogram': HistogramAggregator('length', 0, 100, nbins=10000)}

    all_filter_func = functools.partial(
        _apply, [null_filter], all_subread_aggregators.values())

    funcs = [passed_filter_func, all_filter_func]

    with open(filtered_csv, 'r') as f:
        # read in header
        header = f.readline()
        # validate_header(header)
        applyer(to_record, f, funcs)

    for aggregator in itertools.chain(aggregators.values(), all_subread_aggregators.values()):
        log.info(aggregator)

    # Check if any reads are found
    if all_subread_aggregators['raw_nreads'].attribute == 0:
        raise NoSubreadsFound(
            "No subreads found in {f}".format(f=filtered_csv))

    # Now check that at least one subread passed the filter
    if aggregators['nreads'].attribute == 0:
        msg = "No subreads passed the filter in {f}.".format(f=filtered_csv)
        raise NoSubreadsPassedFilter(msg)

    # this is where you change the plotting options
    meta_plot = meta_rpt.get_meta_plotgroup(
        Constants.PG_SUBREAD_LENGTH).get_meta_plot(Constants.P_POST_FILTER)
    plot_view = PlotViewProperties(Constants.P_POST_FILTER,
                                   Constants.PG_SUBREAD_LENGTH,
                                   custom_subread_length_histogram,
                                   Constants.I_FILTER_SUBREADS_HIST,
                                   xlabel=meta_plot.xlabel,
                                   ylabel=meta_plot.ylabel["L"],
                                   rlabel=meta_plot.ylabel["R"],
                                   thumb="filtered_subread_report_thmb.png",
                                   use_group_thumb=True,
                                   plot_group_title="",
                                   color=get_green(3),
                                   edgecolor=get_green(2))

    view_config_d = {'subread': plot_view}
    id_aggregators = {'subread': aggregators['subread']}

    plot_groups = to_plot_groups(view_config_d, output_dir, id_aggregators)

    to_a = lambda n: aggregators[n].attribute

    attributes = _to_attributes(to_a('nreads'),
                                to_a('nbases'),
                                to_a('mean_subreadlength'),
                                to_a('n50'))

    report = Report(Constants.R_ID, title="Subread filtering",
                    plotgroups=plot_groups,
                    attributes=attributes)

    log.debug(str(report))

    return meta_rpt.apply_view(report)
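
Examples #1 and #4 also assume two helpers that never appear in this section: `null_filter`, which passes every record through, and `to_record`, which turns one CSV line into an object with `length` and `passed_filter` fields. A hedged reconstruction follows; the real column layout of the filtered CSV is not shown, so the parsing below is purely illustrative:

from collections import namedtuple

# Field names follow the attribute access above (record.length,
# record.passed_filter); the column order is a guess.
SubreadRecord = namedtuple('SubreadRecord', ['length', 'passed_filter'])

def null_filter(record):
    """Accept every record (used for the all-subreads aggregators)."""
    return True

def to_record(line):
    """Parse one CSV row; the column positions here are hypothetical."""
    fields = line.rstrip().split(',')
    return SubreadRecord(length=int(fields[0]),
                         passed_filter=fields[1] == '1')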
Example #5
    def to_report(self, output_dir, report_id=Constants.R_ID):
        """
        This needs to be cleaned up. Keeping the old interface for testing purposes.
        """
        started_at = time.time()

        log.info("Found {n} movies.".format(n=len(self.movies)))

        log.info("Working from {n} alignment file{s}: {f}".format(
            n=len(self.alignment_file_list),
            s='s' if len(self.alignment_file_list) > 1 else '',
            f=self.alignment_file_list))

        # make this a dict {attribute_key_name: Aggregator} so it's easy to
        # access the instances after they've been computed.
        # open question: are there duplicated keys in the attributes?
        # (number_of_aligned_reads vs. mapped_reads_n)
        _total_aggregators = self._get_total_aggregators()
        null_filter = lambda r: True
        total_model = StatisticsModel(
            _total_aggregators.values(), filter_func=null_filter)

        # need to create specific instances for a given movie. This is used to
        # create the mapping reports stats table
        movie_models = {}

        def _my_filter(movie_name1, movie_name2):
            return movie_name1 == movie_name2

        for movie in self.movies:
            ags = [k() for k in self.COLUMN_AGGREGATOR_CLASSES]
            # Note: a lambda defined here WILL NOT work because of Python's
            # late-binding closures; every lambda would capture the final
            # value of `movie`, not its value at this iteration.
            # filter_by_movie_func = lambda m_name: movie.name == m_name
            _my_filter_func = functools.partial(_my_filter, movie)
            model = StatisticsModel(ags, filter_func=_my_filter_func)
            movie_models[movie] = model

        # The statistic models that will be run
        all_models = [total_model] + movie_models.values()
        log.debug(all_models)

        # Run all the analyses. After this call the aggregators hold their
        # computed values.

        analyze_movies(self.movies, self.alignment_file_list, all_models)

        # temp structure used to create the report table. The order is
        # important

        # add total values
        _to_a = lambda k: _total_aggregators[k].attribute
        _row = [_to_a(n) for n in self.COLUMN_ATTR]
        _row.insert(0, 'All Movies')
        movie_datum = [_row]

        # Add each individual movie stats
        for movie_name_, model_ in movie_models.iteritems():
            _row = [movie_name_]
            for a in model_.aggregators:
                _row.append(a.attribute)
            movie_datum.append(_row)
        log.info(movie_datum)

        # create the Report table

        table = self._to_table(movie_datum)

        for movie_name, model in movie_models.iteritems():
            log.info("Movie name {n}".format(n=movie_name))
            for a in model.aggregators:
                log.info(movie_name + " " + repr(a))

        log.info("")
        log.info("Total models")
        for a in total_model.aggregators:
            log.info(a)

        attributes = get_attributes(_total_aggregators)
        self.add_more_attributes(attributes)

        log.info("Attributes from streaming mapping Report.")
        for a in attributes:
            log.info(a)

        plot_config_views = self._get_plot_view_configs()
        plot_groups = []

        ds = openDataFile(self.alignment_file)
        ds.updateCounts()
        if len(ds) > 0:
            # keeping the ids independent requires a bit of dictionary madness
            # {report_id:HistogramAggregator}
            id_to_aggregators = {k: _total_aggregators[v]
                                 for k, v in self.HISTOGRAM_IDS.iteritems()}
            plot_groups = to_plot_groups(plot_config_views, output_dir,
                                         id_to_aggregators)
            rb_pg = PlotGroup(Constants.PG_RAINBOW)
            rb_png = "mapped_concordance_vs_read_length.png"
            make_rainbow_plot(self.alignment_file, op.join(output_dir, rb_png))
            rb_plt = Plot(Constants.P_RAINBOW, rb_png)
            rb_pg.add_plot(rb_plt)
            plot_groups.append(rb_pg)
        self.add_more_plots(plot_groups, output_dir)

        tables = [table]
        report = Report(report_id,
                        attributes=attributes,
                        plotgroups=plot_groups,
                        tables=tables,
                        dataset_uuids=self.dataset_uuids)

        log.debug(report)

        run_time = time.time() - started_at
        log.info("Completed running in {s:.2f} sec.".format(s=run_time))
        return report
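
Finally, Examples #2 and #5 hang everything off `StatisticsModel` and `analyze_movies`, neither of which is shown. From the call sites, a model is just a filter function paired with a list of aggregators, and the driver streams every alignment record through every model once. A sketch under those assumptions (the real signatures may well differ):

# Assumed shape of StatisticsModel and the analysis driver; reconstructed
# from the call sites above, not from the library itself.
class StatisticsModel(object):

    def __init__(self, aggregators, filter_func=None):
        self.aggregators = list(aggregators)
        self.filter_func = filter_func or (lambda movie_name: True)

    def process(self, movie_name, record):
        # the filter sees the record's movie name (cf. _my_filter above)
        if self.filter_func(movie_name):
            for aggregator in self.aggregators:
                aggregator.apply(record)

def analyze_movies(movies, alignment_files, models):
    """Single pass over the alignments, feeding each record to every model."""
    # iter_alignment_records is a hypothetical reader yielding
    # (movie_name, record) pairs from each alignment file
    for movie_name, record in iter_alignment_records(alignment_files):
        for model in models:
            model.process(movie_name, record)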