Esempio n. 1
0
 def test_transformed(self):
     """correctly return transform counts"""
     input = dict(counts=[[0, 1], [2, 3], [4, 5], [6, 7], [8, 9]])
     coll = RegionCollection(**input)
     c, r, stderr = coll.transformed()
     self.assertEqual(c, [4, 5])
     freqs = coll.asfreqs()
     c, r, stderr = freqs.transformed(counts_func=column_sum)
     self.assertFloatEqual(
         c, numpy.array([20. / 45., 25. / 45], dtype=c.dtype))
Esempio n. 2
0
class RegionStudy(object):
    """ Specifies the RegionCollection associated with an expression
            data set. Used to collate data for plot_counts.py.

    Members: collection (a RegionCollection), window_start, window_end,
            collection_label
    Methods: filterByGenes, filterByCutoff, normaliseByBases,
            asPlotLines
    """
    def __init__(self, collection_fn, counts_func, *args, **kwargs):
        super(RegionStudy, self).__init__(*args, **kwargs)
        rr = RunRecord('Study')
        # Keep the source file name for labelling purposes
        self.collection_path = collection_fn
        fn = collection_fn.split('/')[-1].rstrip('.gz')
        self.collection_label = ' '.join(fn.replace('_', ' ').split('.')[:-1])
        try:
            self.data_collection = RegionCollection(filename=collection_fn)
        except IOError:
            rr.dieOnCritical('Collection will not load', collection_fn)

        # Frequency normalized counts need to be converted
        if counts_func is column_sum:
            self.data_collection = self.data_collection.asfreqs()
        self.counts_func = counts_func

        # Get feature window start and end
        try:
            self.window_upstream =\
                    self.data_collection.info['args']['window_upstream']
        except KeyError:
            rr.dieOnCritical('Collection value not defined', 'window_upstream')

        try:
            self.window_downstream =\
                    self.data_collection.info['args']['window_downstream']
        except KeyError:
            rr.dieOnCritical('Collection value not defined',
                             'window_downstream')

        try:
            self.feature_type =\
                    self.data_collection.info['args']['feature_type']
        except KeyError:
            self.feature_type = 'Unknown'

    def filterByGenes(self,
                      db_path,
                      chrom=None,
                      include_samples=None,
                      exclude_samples=None):
        """ keep only results that match selected genes """

        rr = RunRecord('filterByGenes')
        if not include_samples and not exclude_samples and not chrom:
            return

        rr.addInfo('Starting no. of genes', self.data_collection.N)

        session = make_session(db_path)
        if include_samples:
            for sample in include_samples:
                rr.addInfo('Restricting plot by include sample', sample)

        if exclude_samples:
            for sample in exclude_samples:
                rr.addInfo('Restricting plot by exclude sample', sample)

        if not chrom is None:
            rr.addInfo('Restricting plot to chromosome', chrom)

        filter_gene_ids = get_gene_ids(session,
                                       chrom=chrom,
                                       include_targets=include_samples,
                                       exclude_targets=exclude_samples)

        self.data_collection =\
                self.data_collection.filteredByLabel(filter_gene_ids)
        rr.addInfo('Remaining genes', self.data_collection.N)

        if self.data_collection is None or\
           len(self.data_collection.ranks) == 0:
            rr.dieOnCritical('Genes remaining after filtering', '0')

    def filterByCutoff(self, cutoff=None):
        """ keep only results that pass Chebyshev cutoff """
        rr = RunRecord('filterByCutoff')

        rr.addInfo('Starting no. of genes', self.data_collection.N)

        # exclude outlier genes using one-sided Chebyshev
        if cutoff is not None and cutoff != 0.0:
            try:
                cutoff = float(cutoff)
                if cutoff < 0.0 or cutoff >= 1.0:
                    rr.addError('Cutoff out of range', cutoff)
                    rr.addInfo('Cutoff set to default', 0.05)
                    cutoff = 0.05
            except ValueError:
                rr.addError('Cutoff not given as float', cutoff)
                rr.addInfo('Cutoff set to default', 0.05)
                cutoff = 0.05
                # Do Chebyshev filtering

            self.data_collection =\
                    self.data_collection.filteredChebyshevUpper(p=cutoff)
            rr.addInfo('Used Chebyshev filter cutoff', cutoff)
            rr.addInfo('No. genes after normalisation filter',
                       self.data_collection.N)
        else:
            rr.addInfo('Outlier cutoff filtering', 'Off')

        if self.data_collection is None or\
                self.data_collection.ranks.max() == 0:
            rr.dieOnCritical('No data after filtering', 'Failure')

    def normaliseByRPM(self):
        """ This requires 'mapped tags', 'tag count' or 'base count' to be present
            in the collection and gives counts per mapped million tags/bases.
            Mapped tags is the total experimental mapped tags.
            Tag count and base count are region specific.
        """
        rr = RunRecord('normaliseByRPM')
        try:
            norm_RPM = self.data_collection.info['args']['mapped tags']
            rr.addInfo("'mapped tags' value", norm_RPM)
        except KeyError:
            rr.addError('Info field not found', 'mapped tags')
            return
        norm_factor = 1000000.0 / norm_RPM
        rr.addInfo('normalising by RPMs', norm_factor)
        normalised_counts = []
        for c in self.data_collection.counts:
            c2 = c * norm_factor
            normalised_counts.append(c2)
        self.data_collection.counts = numpy.array(normalised_counts)

    def _groupAllGeneCounts(self):
        """ Group counts for all genes and return as a single PlotLine.
            Called by asPlotLines or _groupNGeneCounts().
            Returns a list.
        """
        rr = RunRecord('_groupAllGeneCounts')
        counts, ranks, se = self.data_collection.transformed(\
            counts_func=self.counts_func)
        if not len(counts):
            rr.dieOnCritical('No counts data in', 'Study._groupAllGeneCounts')

        ranks = 0  # rank is irrelevant for 'all' genes

        # Always name single lines by their collection name
        label = self.collection_label
        plot_lines = [PlotLine(counts, ranks, label, study=label, stderr=se)]
        return plot_lines

    def _groupNoGeneCounts(self):
        """ Don't group counts. Simply return a PlotLine for each set of
            counts.
            Called by asPlotLines()
        """
        rr = RunRecord('_groupNoGeneCounts')
        counts = self.data_collection.counts
        ranks = self.data_collection.ranks
        labels = self.data_collection.labels
        plot_lines = []
        for c, r, l in zip(counts, ranks, labels):
            if self.counts_func == stdev:
                stdev_ = c.std()
                if stdev_ > 0:
                    c = (c - c.mean()) / stdev_
                    plot_lines.append(
                        PlotLine(c, r, l, study=self.collection_label))
            else:
                plot_lines.append(
                    PlotLine(c, r, l, study=self.collection_label))

        # If no data was returned default to groupAllCollectionCounts
        if not len(plot_lines):
            rr.dieOnCritical('No data in collection', 'Failure')

        # If a single line is created label it with the collection name
        if len(plot_lines) == 1:
            plot_lines[0].label = [self.collection_label]

        return plot_lines

    def _groupNGeneCounts(self, group_size, p=0.0):
        """ Group counts for N genes and return as PlotLines. Defaults to
            _groupAllGeneCounts() if group size is too large.
            Called by asPlotLines()
        """
        rr = RunRecord('_groupNGeneCounts')
        plot_lines = []
        for index, (c,r,l,se) in enumerate(self.data_collection.\
                iterTransformedGroups(group_size=group_size,
                counts_func=self.counts_func, p=p)):
            plot_lines.append(
                PlotLine(c,
                         rank=r,
                         label=l,
                         study=self.collection_label,
                         stderr=se))

        # If no data was returned default to groupAllCollectionCounts
        if not len(plot_lines):
            rr.addWarning('Defaulting to ALL features. Not enough '+\
                          'features for group of size', group_size)
            plotLines = self._groupAllGeneCounts()
            return plotLines

        return plot_lines

    def asPlotLines(self, group_size, group_location, p=0.0):
        """
            Returns a list of PlotLine objects from this study.
            'p' is the Chebyshev cut-off if not None
        """
        rr = RunRecord('asPlotLines')
        if p > 0.0:
            rr.addInfo('Applying per-line Chebyshev filtering', p)

        if type(group_size) is str and group_size.lower() == 'all':
            plot_lines = self._groupAllGeneCounts()
        elif type(group_size) is int:
            if group_size == 1:
                plot_lines = self._groupNoGeneCounts()
            else:
                plot_lines = self._groupNGeneCounts(group_size, p=p)
        else:
            rr.dieOnCritical('group_size, wrong type or value',
                             [type(group_size), group_size])

        if group_location.lower() != 'all':
            rr.addInfo('grouping genes from location', group_location)
            plot_lines.sort(key=lambda x: x.rank)
            if group_location.lower() == 'top':
                plot_lines = [plot_lines[0]]
            elif group_location.lower() == 'middle':
                plot_lines = [plot_lines[int(len(plot_lines) / 2)]]
            elif group_location.lower() == 'bottom':
                plot_lines = [plot_lines[-1]]

        rr.addInfo('Plottable lines from study', len(plot_lines))
        return plot_lines