def test_transformed(self):
    """correctly return transform counts"""
    # Renamed local from 'input' -- it shadowed the builtin of the same name.
    data = dict(counts=[[0, 1], [2, 3], [4, 5], [6, 7], [8, 9]])
    coll = RegionCollection(**data)
    # Default transform: the mean row of the counts matrix.
    c, r, stderr = coll.transformed()
    self.assertEqual(c, [4, 5])
    freqs = coll.asfreqs()
    c, r, stderr = freqs.transformed(counts_func=column_sum)
    # Column sums are 20 and 25 over a grand total of 45 counts.
    self.assertFloatEqual(
        c, numpy.array([20. / 45., 25. / 45.], dtype=c.dtype))
class RegionStudy(object):
    """Specifies the RegionCollection associated with an expression data set.

    Used to collate data for plot_counts.py.

    Members: collection (a RegionCollection), window_start, window_end,
            collection_label
    Methods: filterByGenes, filterByCutoff, normaliseByRPM, asPlotLines
    """
    def __init__(self, collection_fn, counts_func, *args, **kwargs):
        """Load the collection from `collection_fn` and record window info.

        counts_func: the per-gene transform used later when grouping
        (e.g. column_sum, stdev). Dies via RunRecord if the collection
        cannot be loaded or lacks window metadata.
        """
        super(RegionStudy, self).__init__(*args, **kwargs)
        rr = RunRecord('Study')
        # Keep the source file name for labelling purposes
        self.collection_path = collection_fn
        fn = collection_fn.split('/')[-1]
        # BUG FIX: the original used fn.rstrip('.gz'), which strips ANY
        # trailing '.', 'g' or 'z' characters (e.g. 'plot.png' -> 'plot.pn'),
        # not the literal '.gz' suffix. Remove the suffix explicitly.
        if fn.endswith('.gz'):
            fn = fn[:-len('.gz')]
        self.collection_label = ' '.join(fn.replace('_', ' ').split('.')[:-1])
        try:
            self.data_collection = RegionCollection(filename=collection_fn)
        except IOError:
            rr.dieOnCritical('Collection will not load', collection_fn)

        # Frequency normalized counts need to be converted
        if counts_func is column_sum:
            self.data_collection = self.data_collection.asfreqs()
        self.counts_func = counts_func

        # Get feature window start and end
        try:
            self.window_upstream = \
                    self.data_collection.info['args']['window_upstream']
        except KeyError:
            rr.dieOnCritical('Collection value not defined',
                    'window_upstream')
        try:
            self.window_downstream = \
                    self.data_collection.info['args']['window_downstream']
        except KeyError:
            rr.dieOnCritical('Collection value not defined',
                    'window_downstream')
        try:
            self.feature_type = \
                    self.data_collection.info['args']['feature_type']
        except KeyError:
            self.feature_type = 'Unknown'

    def filterByGenes(self, db_path, chrom=None, include_samples=None,
            exclude_samples=None):
        """Keep only results that match selected genes.

        db_path: path handed to make_session for the gene database.
        chrom: restrict to a single chromosome when not None.
        include_samples / exclude_samples: sample-name filters passed
        through to get_gene_ids. No-op when no filter is given.
        """
        rr = RunRecord('filterByGenes')
        if not include_samples and not exclude_samples and not chrom:
            return

        rr.addInfo('Starting no. of genes', self.data_collection.N)

        session = make_session(db_path)
        if include_samples:
            for sample in include_samples:
                rr.addInfo('Restricting plot by include sample', sample)
        if exclude_samples:
            for sample in exclude_samples:
                rr.addInfo('Restricting plot by exclude sample', sample)
        if chrom is not None:
            rr.addInfo('Restricting plot to chromosome', chrom)

        filter_gene_ids = get_gene_ids(session, chrom=chrom,
                include_targets=include_samples,
                exclude_targets=exclude_samples)

        self.data_collection = \
                self.data_collection.filteredByLabel(filter_gene_ids)
        rr.addInfo('Remaining genes', self.data_collection.N)

        # filteredByLabel may leave nothing plottable; that is fatal.
        if self.data_collection is None or \
                len(self.data_collection.ranks) == 0:
            rr.dieOnCritical('Genes remaining after filtering', '0')

    def filterByCutoff(self, cutoff=None):
        """Keep only results that pass a one-sided Chebyshev cutoff.

        cutoff: probability in (0, 1); values outside the range or
        unparseable strings fall back to the 0.05 default. None or 0.0
        disables filtering entirely.
        """
        rr = RunRecord('filterByCutoff')
        rr.addInfo('Starting no. of genes', self.data_collection.N)

        # exclude outlier genes using one-sided Chebyshev
        if cutoff is not None and cutoff != 0.0:
            try:
                cutoff = float(cutoff)
                if cutoff < 0.0 or cutoff >= 1.0:
                    rr.addError('Cutoff out of range', cutoff)
                    rr.addInfo('Cutoff set to default', 0.05)
                    cutoff = 0.05
            except ValueError:
                rr.addError('Cutoff not given as float', cutoff)
                rr.addInfo('Cutoff set to default', 0.05)
                cutoff = 0.05
            # Do Chebyshev filtering
            self.data_collection = \
                    self.data_collection.filteredChebyshevUpper(p=cutoff)
            rr.addInfo('Used Chebyshev filter cutoff', cutoff)
            rr.addInfo('No. genes after normalisation filter',
                    self.data_collection.N)
        else:
            rr.addInfo('Outlier cutoff filtering', 'Off')

        if self.data_collection is None or \
                self.data_collection.ranks.max() == 0:
            rr.dieOnCritical('No data after filtering', 'Failure')

    def normaliseByRPM(self):
        """Scale counts to reads-per-million using the collection total.

        Requires 'mapped tags' (the total experimental mapped tags) in
        the collection info; logs an error and returns unchanged if it
        is absent.
        """
        rr = RunRecord('normaliseByRPM')
        try:
            norm_RPM = self.data_collection.info['args']['mapped tags']
            rr.addInfo("'mapped tags' value", norm_RPM)
        except KeyError:
            rr.addError('Info field not found', 'mapped tags')
            return
        norm_factor = 1000000.0 / norm_RPM
        rr.addInfo('normalising by RPMs', norm_factor)
        # Scale every gene's count row by the single experiment-wide factor.
        self.data_collection.counts = numpy.array(
                [c * norm_factor for c in self.data_collection.counts])

    def _groupAllGeneCounts(self):
        """Group counts for all genes and return as a single PlotLine.

        Called by asPlotLines or _groupNGeneCounts(). Returns a list.
        """
        rr = RunRecord('_groupAllGeneCounts')
        counts, ranks, se = self.data_collection.transformed(
                counts_func=self.counts_func)
        if not len(counts):
            rr.dieOnCritical('No counts data in',
                    'Study._groupAllGeneCounts')

        ranks = 0  # rank is irrelevant for 'all' genes

        # Always name single lines by their collection name
        label = self.collection_label
        return [PlotLine(counts, ranks, label, study=label, stderr=se)]

    def _groupNoGeneCounts(self):
        """Don't group counts: return one PlotLine per set of counts.

        Called by asPlotLines().
        """
        rr = RunRecord('_groupNoGeneCounts')
        counts = self.data_collection.counts
        ranks = self.data_collection.ranks
        labels = self.data_collection.labels
        plot_lines = []
        for c, r, l in zip(counts, ranks, labels):
            # Standard-score each line when plotting stdev; guard against
            # a zero spread, which would divide by zero.
            if self.counts_func == stdev:
                stdev_ = c.std()
                if stdev_ > 0:
                    c = (c - c.mean()) / stdev_
            plot_lines.append(PlotLine(c, r, l, study=self.collection_label))

        # If no data was returned default to groupAllCollectionCounts
        if not len(plot_lines):
            rr.dieOnCritical('No data in collection', 'Failure')

        # If a single line is created label it with the collection name
        if len(plot_lines) == 1:
            plot_lines[0].label = [self.collection_label]

        return plot_lines

    def _groupNGeneCounts(self, group_size, p=0.0):
        """Group counts for N genes and return as PlotLines.

        Defaults to _groupAllGeneCounts() if group size is too large.
        Called by asPlotLines().
        """
        rr = RunRecord('_groupNGeneCounts')
        plot_lines = []
        # NOTE: the original enumerated this loop but never used the index.
        for c, r, l, se in self.data_collection.iterTransformedGroups(
                group_size=group_size, counts_func=self.counts_func, p=p):
            plot_lines.append(PlotLine(c, rank=r, label=l,
                    study=self.collection_label, stderr=se))

        # If no data was returned default to groupAllCollectionCounts
        if not len(plot_lines):
            rr.addWarning('Defaulting to ALL features. Not enough ' +
                    'features for group of size', group_size)
            # Fixed inconsistent 'plotLines' camelCase from the original.
            plot_lines = self._groupAllGeneCounts()

        return plot_lines

    def asPlotLines(self, group_size, group_location, p=0.0):
        """Return a list of PlotLine objects from this study.

        group_size: 'all' or a positive int group width.
        group_location: 'all', 'top', 'middle' or 'bottom' selects which
        grouped line(s) to keep.
        'p' is the Chebyshev cut-off if not None.
        """
        rr = RunRecord('asPlotLines')
        if p > 0.0:
            rr.addInfo('Applying per-line Chebyshev filtering', p)

        # type() rather than isinstance() is deliberate: it rejects bools,
        # matching the original behaviour.
        if type(group_size) is str and group_size.lower() == 'all':
            plot_lines = self._groupAllGeneCounts()
        elif type(group_size) is int:
            if group_size == 1:
                plot_lines = self._groupNoGeneCounts()
            else:
                plot_lines = self._groupNGeneCounts(group_size, p=p)
        else:
            rr.dieOnCritical('group_size, wrong type or value',
                    [type(group_size), group_size])

        location = group_location.lower()
        if location != 'all':
            rr.addInfo('grouping genes from location', group_location)
            plot_lines.sort(key=lambda x: x.rank)
            if location == 'top':
                plot_lines = [plot_lines[0]]
            elif location == 'middle':
                plot_lines = [plot_lines[int(len(plot_lines) / 2)]]
            elif location == 'bottom':
                plot_lines = [plot_lines[-1]]

        rr.addInfo('Plottable lines from study', len(plot_lines))
        return plot_lines