class GaussianMixture1DPluginOp(PluginOpMixin, GaussianMixtureOp):
    handler_factory = Callable(GaussianMixture1DHandler)
    channel = Str
    channel_scale = util.ScaleEnum(estimate = True)

    # add "estimate" metadata
    num_components = util.PositiveInt(1, estimate = True)
    sigma = util.PositiveFloat(0.0, allow_zero = True, estimate = True)
    by = List(Str, estimate = True)

    # bits to support the subset editor
    subset_list = List(ISubset, estimate = True)
    subset = Property(Str, depends_on = "subset_list.str")

    # MAGIC - returns the value of the "subset" Property, above
    def _get_subset(self):
        return " and ".join([subset.str for subset in self.subset_list if subset.str])

    @on_trait_change('subset_list.str', post_init = True)
    def _subset_changed(self, obj, name, old, new):
        self.changed = (Changed.ESTIMATE, ('subset_list', self.subset_list))

    _gmms = Dict(Any, Instance(mixture.GaussianMixture), transient = True)

    @on_trait_change('channel')
    def _channel_changed(self):
        self.channels = [self.channel]
        self.changed = (Changed.ESTIMATE, ('channels', self.channels))

    @on_trait_change('channel_scale')
    def _scale_changed(self):
        if self.channel:
            self.scale[self.channel] = self.channel_scale
        self.changed = (Changed.ESTIMATE, ('scale', self.scale))

    def estimate(self, experiment):
        super().estimate(experiment, subset = self.subset)
        self.changed = (Changed.ESTIMATE_RESULT, self)

    def default_view(self, **kwargs):
        return GaussianMixture1DPluginView(op = self, **kwargs)

    def should_clear_estimate(self, changed):
        if changed == Changed.ESTIMATE:
            return True
        return False

    def clear_estimate(self):
        self._gmms = {}
        self._scale = {}
        self.changed = (Changed.ESTIMATE_RESULT, self)
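# The plugin op above is GUI glue around GaussianMixtureOp: traits tagged
# estimate = True back the "Estimate" button, and each @on_trait_change
# handler publishes a (Changed.ESTIMATE, payload) tuple so the GUI workflow
# knows the fitted model is stale.  A minimal sketch of how a consumer of
# those tuples might react (illustrative only; `refresh_estimate` is a
# hypothetical helper, not part of cytoflowgui):

def refresh_estimate(op, experiment):
    # a stale-model consumer: clear the old fit, then re-estimate
    if op.should_clear_estimate(Changed.ESTIMATE):
        op.clear_estimate()
    op.estimate(experiment)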
class GaussianMixture2DOp(HasStrictTraits):
    """
    This module fits a 2D Gaussian mixture model with a specified number of
    components to a pair of channels.

    Creates a new categorical metadata variable named `name`, with possible
    values `name_1` .... `name_n` where `n` is the number of components.
    An event is assigned to the `name_i` category if it falls within `sigma`
    standard deviations of the component's mean.  If that is true for
    multiple categories (or if `sigma == 0.0`), the event is assigned to the
    category with the highest posterior probability.  If the event doesn't
    fall into any category, it is assigned to `name_None`.

    As a special case, if `num_components` is `1` and `sigma` > 0.0, then
    the new condition is boolean, `True` if the event fell in the gate and
    `False` otherwise.

    Optionally, if `posteriors` is `True`, this module will also compute the
    posterior probability of each event in its assigned component, returning
    it in a new column named `{Name}_Posterior`.

    Finally, the same mixture model (mean and standard deviation) may not be
    appropriate for every subset of the data.  If this is the case, you can
    use the `by` attribute to specify metadata by which to aggregate the data
    before estimating (and applying) a mixture model.  The number of
    components is the same across each subset, though.

    Attributes
    ----------
    name : Str
        The operation name; determines the name of the new metadata column

    xchannel : Str
        The X channel to apply the mixture model to.

    ychannel : Str
        The Y channel to apply the mixture model to.

    num_components : Int (default = 1)
        How many components to fit to the data?  Must be positive.

    sigma : Float (default = 0.0)
        How many standard deviations on either side of the mean to include
        in each category?  If an event is in multiple components, assign it
        to the component with the highest posterior probability.  If
        `sigma == 0.0`, categorize *all* the data by assigning each event to
        the component with the highest posterior probability.  Must be >= 0.0.

    by : List(Str)
        A list of metadata attributes to aggregate the data before estimating
        the model.  For example, if the experiment has two pieces of metadata,
        `Time` and `Dox`, setting `by = ["Time", "Dox"]` will fit the model
        separately to each subset of the data with a unique combination of
        `Time` and `Dox`.

    xscale : Enum("linear", "log", "logicle") (default = "linear")
        Re-scale the data on the X axis before fitting the model?

    yscale : Enum("linear", "log", "logicle") (default = "linear")
        Re-scale the data on the Y axis before fitting the model?

    posteriors : Bool (default = False)
        If `True`, add a column named `{Name}_Posterior` giving the posterior
        probability that the event is in the component to which it was
        assigned.  Useful for filtering out low-probability events.

    Examples
    --------

    >>> gauss_op = GaussianMixture2DOp(name = "Gaussian",
    ...                                xchannel = "V2-A",
    ...                                ychannel = "Y2-A",
    ...                                num_components = 2)
    >>> gauss_op.estimate(ex2)
    >>> gauss_op.default_view().plot(ex2)
    >>> ex3 = gauss_op.apply(ex2)
    """

    id = Constant('edu.mit.synbio.cytoflow.operations.gaussian_2d')
    friendly_id = Constant("2D Gaussian Mixture")

    name = CStr()
    xchannel = Str()
    ychannel = Str()
    xscale = util.ScaleEnum
    yscale = util.ScaleEnum
    num_components = util.PositiveInt
    sigma = util.PositiveFloat(0.0, allow_zero = True)
    by = List(Str)
    posteriors = Bool(False)

    # the key is either a single value or a tuple
    _gmms = Dict(Any, Instance(mixture.GMM))
    _xscale = Instance(util.IScale)
    _yscale = Instance(util.IScale)

    def estimate(self, experiment, subset = None):
        """
        Estimate the Gaussian mixture model parameters
        """

        if not experiment:
            raise util.CytoflowOpError("No experiment specified")

        if self.xchannel not in experiment.data:
            raise util.CytoflowOpError("Column {0} not found in the experiment"
                                       .format(self.xchannel))

        if self.ychannel not in experiment.data:
            raise util.CytoflowOpError("Column {0} not found in the experiment"
                                       .format(self.ychannel))

        for b in self.by:
            if b not in experiment.data:
                raise util.CytoflowOpError("Aggregation metadata {0} not found"
                                           " in the experiment"
                                           .format(b))
            if len(experiment.data[b].unique()) > 100: #WARNING - magic number
                raise util.CytoflowOpError("More than 100 unique values found for"
                                           " aggregation metadata {0}.  Did you"
                                           " accidentally specify a data channel?"
                                           .format(b))

        if self.by:
            groupby = experiment.data.groupby(self.by)
        else:
            # use a lambda expression to return a group that contains
            # all the events
            groupby = experiment.data.groupby(lambda x: True)

        # get the scale.  estimate the scale params for the ENTIRE data set,
        # not subsets we get from groupby().  And we need to save it so that
        # the data is transformed the same way when we apply()
        self._xscale = util.scale_factory(self.xscale, experiment, self.xchannel)
        self._yscale = util.scale_factory(self.yscale, experiment, self.ychannel)

        for group, data_subset in groupby:
            x = data_subset.loc[:, [self.xchannel, self.ychannel]]
            x[self.xchannel] = self._xscale(x[self.xchannel])
            x[self.ychannel] = self._yscale(x[self.ychannel])

            # drop data that isn't in the scale range
            x = x[~(np.isnan(x[self.xchannel]) | np.isnan(x[self.ychannel]))]
            x = x.values

            gmm = mixture.GMM(n_components = self.num_components,
                              covariance_type = "full",
                              random_state = 1)
            gmm.fit(x)

            if not gmm.converged_:
                raise util.CytoflowOpError("Estimator didn't converge"
                                           " for group {0}"
                                           .format(group))

            # in the 1D version, we sort the components by the means -- so
            # the first component has the lowest mean, the second component
            # has the next-lowest mean, etc.  that doesn't work in 2D,
            # obviously.

            # instead, we assume that the clusters are likely (?) to be
            # arranged along *one* of the axes, so we take the |norm| of the
            # x,y mean of each cluster and sort that way.

            norms = (gmm.means_[:, 0] ** 2 + gmm.means_[:, 1] ** 2) ** 0.5
            sort_idx = np.argsort(norms)
            gmm.means_ = gmm.means_[sort_idx]
            gmm.weights_ = gmm.weights_[sort_idx]
            gmm.covars_ = gmm.covars_[sort_idx]

            self._gmms[group] = gmm

    def apply(self, experiment):
        """
        Assigns new metadata to events using the mixture model estimated
        in `estimate`.
        """

        if not experiment:
            raise util.CytoflowOpError("No experiment specified")

        # make sure name got set!
        if not self.name:
            raise util.CytoflowOpError("You have to set the gate's name "
                                       "before applying it!")

        if self.name in experiment.data.columns:
            raise util.CytoflowOpError("Experiment already has a column named {0}"
                                       .format(self.name))

        if not self._gmms:
            raise util.CytoflowOpError("No components found.  Did you forget to "
                                       "call estimate()?")

        if not self._xscale:
            raise util.CytoflowOpError("Couldn't find _xscale.  What happened??")

        if not self._yscale:
            raise util.CytoflowOpError("Couldn't find _yscale.  What happened??")

        if self.xchannel not in experiment.data:
            raise util.CytoflowOpError("Column {0} not found in the experiment"
                                       .format(self.xchannel))

        if self.ychannel not in experiment.data:
            raise util.CytoflowOpError("Column {0} not found in the experiment"
                                       .format(self.ychannel))

        if (self.name + "_Posterior") in experiment.data:
            raise util.CytoflowOpError("Column {0} already found in the experiment"
                                       .format(self.name + "_Posterior"))

        if self.num_components == 1 and self.sigma == 0.0:
            raise util.CytoflowOpError("If num_components == 1, sigma must be > 0")

        if self.posteriors:
            col_name = "{0}_Posterior".format(self.name)
            if col_name in experiment.data:
                raise util.CytoflowOpError("Column {0} already found in the experiment"
                                           .format(col_name))

        for b in self.by:
            if b not in experiment.data:
                raise util.CytoflowOpError("Aggregation metadata {0} not found"
                                           " in the experiment"
                                           .format(b))

            if len(experiment.data[b].unique()) > 100: #WARNING - magic number
                raise util.CytoflowOpError("More than 100 unique values found for"
                                           " aggregation metadata {0}.  Did you"
                                           " accidentally specify a data channel?"
                                           .format(b))

        if self.sigma < 0.0:
            raise util.CytoflowOpError("sigma must be >= 0.0")

        event_assignments = pd.Series([None] * len(experiment), dtype = "object")

        if self.posteriors:
            event_posteriors = pd.Series([0.0] * len(experiment))

        # what we DON'T want to do is iterate through event-by-event.
        # the more of this we can push into numpy, sklearn and pandas,
        # the faster it's going to be.  for example, this is why we don't
        # use Ellipse.contains().

        if self.by:
            groupby = experiment.data.groupby(self.by)
        else:
            # use a lambda expression to return a group that
            # contains all the events
            groupby = experiment.data.groupby(lambda x: True)

        for group, data_subset in groupby:
            gmm = self._gmms[group]
            x = data_subset.loc[:, [self.xchannel, self.ychannel]]
            x[self.xchannel] = self._xscale(x[self.xchannel])
            x[self.ychannel] = self._yscale(x[self.ychannel])

            # which values are missing?
            x_na = np.isnan(x[self.xchannel]) | np.isnan(x[self.ychannel])
            x_na = x_na.values

            x = x.values
            group_idx = groupby.groups[group]

            # make a preliminary assignment
            predicted = np.full(len(x), -1, "int")
            predicted[~x_na] = gmm.predict(x[~x_na])

            # if we're doing sigma-based gating, for each component check
            # to see if the event is in the sigma gate.
            if self.sigma > 0.0:
                # make a quick dataframe with the value and the predicted
                # component
                gate_df = pd.DataFrame({"x" : x[:, 0],
                                        "y" : x[:, 1],
                                        "p" : predicted})

                # for each component, get the ellipse that follows the
                # isoline around the mixture component
                # cf. http://scikit-learn.org/stable/auto_examples/mixture/plot_gmm.html
                # and http://www.mathworks.com/matlabcentral/newsreader/view_thread/298389
                # and http://stackoverflow.com/questions/7946187/point-and-ellipse-rotated-position-test-algorithm
                # i am not proud of how many tries this took me to get right.

                for c in range(0, self.num_components):
                    mean = gmm.means_[c]
                    covar = gmm._get_covars()[c]

                    # xc is the center on the x axis
                    # yc is the center on the y axis
                    xc = mean[0]  # @UnusedVariable
                    yc = mean[1]  # @UnusedVariable

                    v, w = linalg.eigh(covar)
                    u = w[0] / linalg.norm(w[0])

                    # xl is the length along the x axis
                    # yl is the length along the y axis
                    xl = np.sqrt(v[0]) * self.sigma  # @UnusedVariable
                    yl = np.sqrt(v[1]) * self.sigma  # @UnusedVariable

                    # t is the rotation in radians (counter-clockwise)
                    t = 2 * np.pi - np.arctan(u[1] / u[0])

                    sin_t = np.sin(t)  # @UnusedVariable
                    cos_t = np.cos(t)  # @UnusedVariable

                    # and build an expression with numexpr so it evaluates fast!
                    gate_bool = gate_df.eval("p == @c and "
                                             "((x - @xc) * @cos_t - (y - @yc) * @sin_t) ** 2 / ((@xl / 2) ** 2) + "
                                             "((x - @xc) * @sin_t + (y - @yc) * @cos_t) ** 2 / ((@yl / 2) ** 2) <= 1").values
                    predicted[np.logical_and(predicted == c, gate_bool == False)] = -1

            predicted_str = pd.Series(["(none)"] * len(predicted))
            for c in range(0, self.num_components):
                predicted_str[predicted == c] = "{0}_{1}".format(self.name, c + 1)
            predicted_str[predicted == -1] = "{0}_None".format(self.name)
            predicted_str.index = group_idx

            event_assignments.iloc[group_idx] = predicted_str

            if self.posteriors:
                probability = np.full((len(x), self.num_components), 0.0, "float")
                probability[~x_na, :] = gmm.predict_proba(x[~x_na, :])
                posteriors = pd.Series([0.0] * len(predicted))
                for c in range(0, self.num_components):
                    posteriors[predicted == c] = probability[predicted == c, c]
                posteriors.index = group_idx
                event_posteriors.iloc[group_idx] = posteriors

        new_experiment = experiment.clone()

        if self.num_components == 1:
            new_experiment.add_condition(self.name, "bool",
                                         event_assignments == "{0}_1".format(self.name))
        else:
            new_experiment.add_condition(self.name, "category", event_assignments)

        if self.posteriors:
            col_name = "{0}_Posterior".format(self.name)
            new_experiment.add_condition(col_name, "float", event_posteriors)

        new_experiment.history.append(self.clone_traits())
        return new_experiment

    def default_view(self, **kwargs):
        """
        Returns a diagnostic plot of the Gaussian mixture model.

        Returns
        -------
        IView : an IView, call plot() to see the diagnostic plot.
        """
        return GaussianMixture2DView(op = self, **kwargs)
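# The sigma gate in apply() above reduces to a point-in-rotated-ellipse test.
# Below is a standalone sketch of the same math on plain numpy arrays:
# eigendecompose the covariance to get the ellipse axes and rotation, then
# test each point against the canonical ellipse equation.  This mirrors the
# numexpr expression above; `in_ellipse` and `points` are hypothetical names,
# not part of the cytoflow API.

def in_ellipse(points, mean, covar, sigma):
    """Return a boolean mask of which 2D points fall inside the sigma-scaled
    ellipse of a gaussian with the given mean and covariance."""
    import numpy as np
    from scipy import linalg

    v, w = linalg.eigh(covar)                 # eigenvalues and eigenvectors
    u = w[0] / linalg.norm(w[0])              # unit vector of the first axis
    xl, yl = np.sqrt(v) * sigma               # axis lengths, scaled by sigma
    t = 2 * np.pi - np.arctan(u[1] / u[0])    # rotation angle (ccw)

    dx = points[:, 0] - mean[0]
    dy = points[:, 1] - mean[1]
    # rotate into the ellipse's frame, then apply (x/a)^2 + (y/b)^2 <= 1
    xr = dx * np.cos(t) - dy * np.sin(t)
    yr = dx * np.sin(t) + dy * np.cos(t)
    return (xr / (xl / 2)) ** 2 + (yr / (yl / 2)) ** 2 <= 1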
class DensityGateOp(HasStrictTraits):
    """
    This module computes a gate based on a 2D density plot.  The user chooses
    what proportion of events to keep, and the module creates a gate that
    selects that proportion of events in the highest-density bins of the 2D
    density histogram.

    Attributes
    ----------
    name : Str
        The operation name; determines the name of the new metadata column

    xchannel : Str
        The X channel to apply the binning to.

    ychannel : Str
        The Y channel to apply the binning to.

    xscale : {"linear", "logicle", "log"} (default = "linear")
        Re-scale the data on the X axis before fitting the data?

    yscale : {"linear", "logicle", "log"} (default = "linear")
        Re-scale the data on the Y axis before fitting the data?

    keep : Float (default = 0.9)
        What proportion of events to keep?  Must be ``>0`` and ``<1``

    bins : Int (default = 100)
        How many bins should there be on each axis?  Must be positive.

    min_quantile : Float (default = 0.001)
        Clip values below this quantile

    max_quantile : Float (default = 1.0)
        Clip values above this quantile

    sigma : Float (default = 1.0)
        What standard deviation to use for the gaussian blur?

    by : List(Str)
        A list of metadata attributes to aggregate the data before estimating
        the gate.  For example, if the experiment has two pieces of metadata,
        ``Time`` and ``Dox``, setting ``by = ["Time", "Dox"]`` will fit a
        separate gate to each subset of the data with a unique combination of
        ``Time`` and ``Dox``.

    Notes
    -----
    This gating method was developed by John Sexton, in Jeff Tabor's lab at
    Rice University.

    From http://taborlab.github.io/FlowCal/fundamentals/density_gate.html,
    the method is as follows:

    1. Determines the number of events to keep, based on the user-specified
       gating fraction and the total number of events of the input sample.

    2. Divides the 2D channel space into a rectangular grid, and counts the
       number of events falling within each bin of the grid.  The number of
       counts per bin across all bins comprises a 2D histogram, which is a
       coarse approximation of the underlying probability density function.

    3. Smoothes the histogram generated in Step 2 by applying a Gaussian
       blur.  Theoretically, the proper amount of smoothing results in a
       better estimate of the probability density function.  Practically,
       smoothing eliminates isolated bins with high counts, most likely
       corresponding to noise, and smoothes the contour of the gated region.

    4. Selects the bins with the greatest number of events in the smoothed
       histogram, starting with the highest and proceeding downward until
       the desired number of events to keep, calculated in step 1, is
       achieved.

    Examples
    --------

    Make a little data set.

    .. plot::
        :context: close-figs

        >>> import cytoflow as flow
        >>> import_op = flow.ImportOp()
        >>> import_op.tubes = [flow.Tube(file = "Plate01/RFP_Well_A3.fcs",
        ...                              conditions = {'Dox' : 10.0}),
        ...                    flow.Tube(file = "Plate01/CFP_Well_A4.fcs",
        ...                              conditions = {'Dox' : 1.0})]
        >>> import_op.conditions = {'Dox' : 'float'}
        >>> ex = import_op.apply()

    Create and parameterize the operation.

    .. plot::
        :context: close-figs

        >>> dens_op = flow.DensityGateOp(name = 'Density',
        ...                              xchannel = 'FSC-A',
        ...                              xscale = 'log',
        ...                              ychannel = 'SSC-A',
        ...                              yscale = 'log',
        ...                              keep = 0.5)

    Find the bins to keep

    .. plot::
        :context: close-figs

        >>> dens_op.estimate(ex)

    Plot a diagnostic view

    .. plot::
        :context: close-figs

        >>> dens_op.default_view().plot(ex)

    Apply the gate

    .. plot::
        :context: close-figs

        >>> ex2 = dens_op.apply(ex)
    """

    id = Constant('edu.mit.synbio.cytoflow.operations.density')
    friendly_id = Constant("Density Gate")

    name = CStr()
    xchannel = Str()
    ychannel = Str()
    xscale = util.ScaleEnum
    yscale = util.ScaleEnum
    keep = util.PositiveFloat(0.9, allow_zero = False)
    bins = util.PositiveInt(100, allow_zero = False)
    min_quantile = util.PositiveFloat(0.001, allow_zero = True)
    max_quantile = util.PositiveFloat(1.0, allow_zero = False)
    sigma = util.PositiveFloat(1.0, allow_zero = False)
    by = List(Str)

    _xscale = Instance(util.IScale, transient = True)
    _yscale = Instance(util.IScale, transient = True)
    _xbins = Array(transient = True)
    _ybins = Array(transient = True)
    _keep_xbins = Dict(Any, Array, transient = True)
    _keep_ybins = Dict(Any, Array, transient = True)
    _histogram = Dict(Any, Array, transient = True)

    def estimate(self, experiment, subset = None):
        """
        Split the data set into bins and determine which ones to keep.

        Parameters
        ----------
        experiment : Experiment
            The :class:`.Experiment` to use to estimate the gate parameters.

        subset : Str (default = None)
            If set, determine the gate parameters on only a subset of the
            ``experiment`` parameter.
        """

        if experiment is None:
            raise util.CytoflowOpError('experiment', "No experiment specified")

        if self.xchannel not in experiment.data:
            raise util.CytoflowOpError('xchannel',
                                       "Column {0} not found in the experiment"
                                       .format(self.xchannel))

        if self.ychannel not in experiment.data:
            raise util.CytoflowOpError('ychannel',
                                       "Column {0} not found in the experiment"
                                       .format(self.ychannel))

        if self.min_quantile > 1.0:
            raise util.CytoflowOpError('min_quantile',
                                       "min_quantile must be <= 1.0")

        if self.max_quantile > 1.0:
            raise util.CytoflowOpError('max_quantile',
                                       "max_quantile must be <= 1.0")

        if not (self.max_quantile > self.min_quantile):
            raise util.CytoflowOpError('max_quantile',
                                       "max_quantile must be > min_quantile")

        if self.keep > 1.0:
            raise util.CytoflowOpError('keep', "keep must be <= 1.0")

        for b in self.by:
            if b not in experiment.conditions:
                raise util.CytoflowOpError('by',
                                           "Aggregation metadata {} not found, "
                                           "must be one of {}"
                                           .format(b, experiment.conditions))

        if subset:
            try:
                experiment = experiment.query(subset)
            except Exception as e:
                raise util.CytoflowOpError('subset',
                                           "Subset string '{0}' isn't valid"
                                           .format(subset)) from e

            if len(experiment) == 0:
                raise util.CytoflowOpError('subset',
                                           "Subset string '{0}' returned no events"
                                           .format(subset))

        if self.by:
            groupby = experiment.data.groupby(self.by)
        else:
            # use a lambda expression to return a group that contains
            # all the events
            groupby = experiment.data.groupby(lambda _: True)

        # get the scale.  estimate the scale params for the ENTIRE data set,
        # not subsets we get from groupby().  And we need to save it so that
        # the data is transformed the same way when we apply()
        self._xscale = xscale = util.scale_factory(self.xscale, experiment,
                                                   channel = self.xchannel)
        self._yscale = yscale = util.scale_factory(self.yscale, experiment,
                                                   channel = self.ychannel)

        xlim = (xscale.clip(experiment[self.xchannel].quantile(self.min_quantile)),
                xscale.clip(experiment[self.xchannel].quantile(self.max_quantile)))

        ylim = (yscale.clip(experiment[self.ychannel].quantile(self.min_quantile)),
                yscale.clip(experiment[self.ychannel].quantile(self.max_quantile)))

        self._xbins = xbins = xscale.inverse(np.linspace(xscale(xlim[0]),
                                                         xscale(xlim[1]),
                                                         self.bins))
        self._ybins = ybins = yscale.inverse(np.linspace(yscale(ylim[0]),
                                                         yscale(ylim[1]),
                                                         self.bins))

        for group, group_data in groupby:
            if len(group_data) == 0:
                raise util.CytoflowOpError('by',
                                           "Group {} had no data"
                                           .format(group))

            h, _, _ = np.histogram2d(group_data[self.xchannel],
                                     group_data[self.ychannel],
                                     bins = [xbins, ybins])

            h = scipy.ndimage.filters.gaussian_filter(h, sigma = self.sigma)

            i = scipy.stats.rankdata(h, method = "ordinal") - 1
            i = np.unravel_index(np.argsort(-i), h.shape)

            goal_count = self.keep * len(group_data)
            curr_count = 0
            num_bins = 0

            while curr_count < goal_count and num_bins < i[0].size:
                curr_count += h[i[0][num_bins], i[1][num_bins]]
                num_bins += 1

            self._keep_xbins[group] = i[0][0:num_bins]
            self._keep_ybins[group] = i[1][0:num_bins]
            self._histogram[group] = h

    def apply(self, experiment):
        """
        Creates a new condition based on membership in the gate that was
        parameterized with :meth:`estimate`.

        Parameters
        ----------
        experiment : Experiment
            the :class:`.Experiment` to apply the gate to.

        Returns
        -------
        Experiment
            a new :class:`.Experiment` with the new gate applied.
        """

        if experiment is None:
            raise util.CytoflowOpError('experiment', "No experiment specified")

        if not self.xchannel:
            raise util.CytoflowOpError('xchannel', "Must set X channel")

        if not self.ychannel:
            raise util.CytoflowOpError('ychannel', "Must set Y channel")

        # make sure name got set!
        if not self.name:
            raise util.CytoflowOpError('name',
                                       "You have to set the gate's name "
                                       "before applying it!")

        if self.name in experiment.data.columns:
            raise util.CytoflowOpError('name',
                                       "Experiment already has a column named {0}"
                                       .format(self.name))

        if not (self._xbins.size and self._ybins.size and self._keep_xbins):
            raise util.CytoflowOpError(None,
                                       "No gate estimate found.  Did you forget to "
                                       "call estimate()?")

        if not self._xscale:
            raise util.CytoflowOpError(None,
                                       "Couldn't find _xscale.  What happened??")

        if not self._yscale:
            raise util.CytoflowOpError(None,
                                       "Couldn't find _yscale.  What happened??")

        if self.xchannel not in experiment.data:
            raise util.CytoflowOpError('xchannel',
                                       "Column {0} not found in the experiment"
                                       .format(self.xchannel))

        if self.ychannel not in experiment.data:
            raise util.CytoflowOpError('ychannel',
                                       "Column {0} not found in the experiment"
                                       .format(self.ychannel))

        for b in self.by:
            if b not in experiment.conditions:
                raise util.CytoflowOpError('by',
                                           "Aggregation metadata {} not found, "
                                           "must be one of {}"
                                           .format(b, experiment.conditions))

        if self.by:
            groupby = experiment.data.groupby(self.by)
        else:
            # use a lambda expression to return a group that
            # contains all the events
            groupby = experiment.data.groupby(lambda _: True)

        event_assignments = pd.Series([False] * len(experiment), dtype = "bool")

        for group, group_data in groupby:
            if group not in self._keep_xbins:
                # there weren't any events in this group, so we didn't get
                # an estimate
                continue

            group_idx = groupby.groups[group]

            cX = pd.cut(group_data[self.xchannel], self._xbins,
                        include_lowest = True, labels = False)
            cY = pd.cut(group_data[self.ychannel], self._ybins,
                        include_lowest = True, labels = False)

            group_keep = pd.Series([False] * len(group_data))

            keep_x = self._keep_xbins[group]
            keep_y = self._keep_ybins[group]

            for (xbin, ybin) in zip(keep_x, keep_y):
                group_keep = group_keep | ((cX == xbin) & (cY == ybin))

            event_assignments.iloc[group_idx] = group_keep

        new_experiment = experiment.clone()

        new_experiment.add_condition(self.name, "bool", event_assignments)

        new_experiment.history.append(self.clone_traits(transient = lambda _: True))
        return new_experiment

    def default_view(self, **kwargs):
        """
        Returns a diagnostic plot of the density gate.

        Returns
        -------
        IView
            a diagnostic view, call :meth:`~DensityGateView.plot` to see the
            diagnostic plot.
        """
        v = DensityGateView(op = self)
        v.trait_set(**kwargs)
        return v
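# The core of DensityGateOp.estimate() is a rank-and-accumulate over smoothed
# histogram bins.  Here is a minimal standalone sketch of that logic with
# plain numpy/scipy (illustrative only; `density_keep_bins` and its arguments
# are hypothetical names, not part of the cytoflow API -- and it sorts the
# smoothed counts directly rather than going through rankdata, which is
# equivalent up to tie-breaking):

def density_keep_bins(x, y, keep = 0.9, bins = 100, sigma = 1.0):
    """Return the (xbin, ybin) index arrays of the highest-density bins
    that together cover `keep` of the events."""
    import numpy as np
    import scipy.ndimage

    h, _, _ = np.histogram2d(x, y, bins = bins)
    h = scipy.ndimage.gaussian_filter(h, sigma = sigma)

    # visit bins from densest to least dense
    order = np.unravel_index(np.argsort(-h, axis = None), h.shape)

    goal, total, n = keep * len(x), 0.0, 0
    while total < goal and n < order[0].size:
        total += h[order[0][n], order[1][n]]
        n += 1

    return order[0][:n], order[1][:n]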
class GaussianMixture1DOp(HasStrictTraits):
    """
    This module fits a Gaussian mixture model with a specified number of
    components to a channel.

    .. warning::

        :class:`GaussianMixture1DOp` is **DEPRECATED** and will be removed
        in a future release.  It doesn't correctly handle the case where an
        event is present in more than one component.  Please use
        :class:`GaussianMixtureOp` instead!

    Creates a new categorical metadata variable named :attr:`name`, with
    possible values ``name_1`` .... ``name_n`` where ``n`` is the number of
    components.  An event is assigned to the ``name_i`` category if it falls
    within :attr:`sigma` standard deviations of the component's mean.  If
    that is true for multiple categories (or if :attr:`sigma` is ``0.0``),
    the event is assigned to the category with the highest posterior
    probability.  If the event doesn't fall into any category, it is
    assigned to ``name_None``.

    As a special case, if :attr:`num_components` is ``1`` and :attr:`sigma`
    ``> 0.0``, then the new condition is boolean, ``True`` if the event fell
    in the gate and ``False`` otherwise.

    Optionally, if :attr:`posteriors` is ``True``, this module will also
    compute the posterior probability of each event in its assigned
    component, returning it in a new column named ``{Name}_Posterior``.

    Finally, the same mixture model (mean and standard deviation) may not be
    appropriate for every subset of the data.  If this is the case, you can
    use the :attr:`by` attribute to specify metadata by which to aggregate
    the data before estimating (and applying) a mixture.  The number of
    components is the same across each subset, though.

    Attributes
    ----------
    name : Str
        The operation name; determines the name of the new metadata column

    channel : Str
        Which channel to apply the mixture model to.

    num_components : Int (default = 1)
        How many components to fit to the data?  Must be positive.

    sigma : Float (default = 0.0)
        How many standard deviations on either side of the mean to include
        in each category?  If an event is in multiple components, assign it
        to the component with the highest posterior probability.  If
        ``sigma == 0.0``, categorize *all* the data by assigning each event
        to the component with the highest posterior probability.  Must be
        ``>= 0.0``.

    by : List(Str)
        A list of metadata attributes to aggregate the data before estimating
        the model.  For example, if the experiment has two pieces of metadata,
        ``Time`` and ``Dox``, setting ``by = ["Time", "Dox"]`` will fit the
        model separately to each subset of the data with a unique combination
        of ``Time`` and ``Dox``.

    scale : Enum("linear", "log", "logicle") (default = "linear")
        Re-scale the data before fitting the model?

    posteriors : Bool (default = False)
        If ``True``, add a column named ``{Name}_Posterior`` giving the
        posterior probability that the event is in the component to which it
        was assigned.  Useful for filtering out low-probability events.

    Examples
    --------

    Make a little data set.

    .. plot::
        :context: close-figs

        >>> import cytoflow as flow
        >>> import_op = flow.ImportOp()
        >>> import_op.tubes = [flow.Tube(file = "Plate01/RFP_Well_A3.fcs",
        ...                              conditions = {'Dox' : 10.0}),
        ...                    flow.Tube(file = "Plate01/CFP_Well_A4.fcs",
        ...                              conditions = {'Dox' : 1.0})]
        >>> import_op.conditions = {'Dox' : 'float'}
        >>> ex = import_op.apply()

    Create and parameterize the operation.

    .. plot::
        :context: close-figs

        >>> gm_op = flow.GaussianMixture1DOp(name = 'GM',
        ...                                  channel = 'Y2-A',
        ...                                  scale = 'log',
        ...                                  num_components = 2)

    Estimate the clusters

    .. plot::
        :context: close-figs

        >>> gm_op.estimate(ex)

    Plot a diagnostic view

    .. plot::
        :context: close-figs

        >>> gm_op.default_view().plot(ex)

    Apply the gate

    .. plot::
        :context: close-figs

        >>> ex2 = gm_op.apply(ex)

    Plot a diagnostic view with the event assignments

    .. plot::
        :context: close-figs

        >>> gm_op.default_view().plot(ex2)
    """

    id = Constant('edu.mit.synbio.cytoflow.operations.gaussian_1d')
    friendly_id = Constant("1D Gaussian Mixture")

    name = CStr()
    channel = Str()
    num_components = util.PositiveInt(1)
    sigma = util.PositiveFloat(0.0, allow_zero = True)
    by = List(Str)
    scale = util.ScaleEnum
    posteriors = Bool(False)

    # the key is a group name: either a single value or a tuple
    _gmms = Dict(Any, Instance(mixture.GaussianMixture), transient = True)
    _scale = Instance(util.IScale, transient = True)

    def estimate(self, experiment, subset = None):
        """
        Estimate the Gaussian mixture model parameters.

        Parameters
        ----------
        experiment : Experiment
            The data to use to estimate the mixture parameters

        subset : str (default = None)
            If set, a Python expression to determine the subset of the data
            to use in the estimation.
        """

        warn("GaussianMixture1DOp is DEPRECATED.  Please use GaussianMixtureOp.",
             util.CytoflowOpWarning)

        if experiment is None:
            raise util.CytoflowOpError('experiment', "No experiment specified")

        if self.channel not in experiment.data:
            raise util.CytoflowOpError('channel',
                                       "Column {0} not found in the experiment"
                                       .format(self.channel))

        for b in self.by:
            if b not in experiment.data:
                raise util.CytoflowOpError('by',
                                           "Aggregation metadata {} not found, "
                                           "must be one of {}"
                                           .format(b, experiment.conditions))

        if self.num_components == 1 and self.posteriors:
            raise util.CytoflowOpError('num_components',
                                       "If num_components == 1, all posteriors are 1.")

        if subset:
            try:
                experiment = experiment.query(subset)
            except Exception as e:
                raise util.CytoflowOpError('subset',
                                           "Subset string '{0}' isn't valid"
                                           .format(subset)) from e

            if len(experiment) == 0:
                raise util.CytoflowOpError('subset',
                                           "Subset string '{0}' returned no events"
                                           .format(subset))

        if self.by:
            by = sorted(self.by)
            groupby = experiment.data.groupby(by)
        else:
            # use a lambda expression to return a group that contains
            # all the events
            groupby = experiment.data.groupby(lambda _: True)

        # get the scale.  estimate the scale params for the ENTIRE data set,
        # not subsets we get from groupby().  And we need to save it so that
        # the data is transformed the same way when we apply()
        self._scale = util.scale_factory(self.scale, experiment, channel = self.channel)

        gmms = {}

        for group, data_subset in groupby:
            if len(data_subset) == 0:
                raise util.CytoflowOpError(None,
                                           "Group {} had no data".format(group))
            x = data_subset[self.channel].reset_index(drop = True)
            x = self._scale(x)

            # drop data that isn't in the scale range
            x = x[~np.isnan(x)]

            gmm = mixture.GaussianMixture(n_components = self.num_components,
                                          random_state = 1)
            gmm.fit(x[:, np.newaxis])

            if not gmm.converged_:
                raise util.CytoflowOpError(None,
                                           "Estimator didn't converge"
                                           " for group {0}"
                                           .format(group))

            # to make sure we have a stable ordering, sort the components
            # by the means (so the first component has the lowest mean,
            # the next component has the next-lowest, etc.)

            sort_idx = np.argsort(gmm.means_[:, 0])
            gmm.means_ = gmm.means_[sort_idx]
            gmm.weights_ = gmm.weights_[sort_idx]
            gmm.covariances_ = gmm.covariances_[sort_idx]

            gmms[group] = gmm

        self._gmms = gmms

    def apply(self, experiment):
        """
        Assigns new metadata to events using the mixture model estimated
        in :meth:`estimate`.

        Returns
        -------
        Experiment
            A new :class:`.Experiment`, with a new column named :attr:`name`,
            and possibly one named :attr:`name` _Posterior.  Also the
            following new :attr:`~.Experiment.statistics`:

            - **mean** : Float
                the mean of the fitted gaussian

            - **stdev** : Float
                the inverse-scaled standard deviation of the fitted gaussian.
                on a linear scale, this is in the same units as the mean; on
                a log scale, this is a scalar multiple; and on a logicle
                scale, this is probably meaningless!

            - **interval** : (Float, Float)
                the inverse-scaled (mean - stdev, mean + stdev) of the fitted
                gaussian.  this is likely more meaningful than ``stdev``,
                especially on the ``logicle`` scale.

            - **proportion** : Float
                the proportion of events in each component of the mixture
                model.  only set if :attr:`num_components` ``> 1``.
        """

        warn("GaussianMixture1DOp is DEPRECATED.  Please use GaussianMixtureOp.",
             util.CytoflowOpWarning)

        if experiment is None:
            raise util.CytoflowOpError('experiment', "No experiment specified")

        if not self._gmms:
            raise util.CytoflowOpError(None,
                                       "No model found.  Did you forget to "
                                       "call estimate()?")

        # make sure name got set!
        if not self.name:
            raise util.CytoflowOpError('name',
                                       "You have to set the gate's name "
                                       "before applying it!")

        if self.name != util.sanitize_identifier(self.name):
            raise util.CytoflowOpError('name',
                                       "Name {} can only contain letters, "
                                       "numbers and underscores."
                                       .format(self.name))

        if self.name in experiment.data.columns:
            raise util.CytoflowOpError('name',
                                       "Experiment already has a column named {0}"
                                       .format(self.name))

        if not self._scale:
            raise util.CytoflowOpError(None,
                                       "Couldn't find _scale.  What happened??")

        if self.channel not in experiment.data:
            raise util.CytoflowOpError('channel',
                                       "Column {0} not found in the experiment"
                                       .format(self.channel))

        if self.posteriors:
            col_name = "{0}_Posterior".format(self.name)
            if col_name in experiment.data:
                raise util.CytoflowOpError('posteriors',
                                           "Column {0} already found in the experiment"
                                           .format(col_name))

        for b in self.by:
            if b not in experiment.data:
                raise util.CytoflowOpError('by',
                                           "Aggregation metadata {} not found, "
                                           "must be one of {}"
                                           .format(b, experiment.conditions))

        if self.sigma < 0.0:
            raise util.CytoflowOpError('sigma', "sigma must be >= 0.0")

        if self.by:
            by = sorted(self.by)
            groupby = experiment.data.groupby(by)
        else:
            # use a lambda expression to return a group that
            # contains all the events
            groupby = experiment.data.groupby(lambda _: True)

        event_assignments = pd.Series([None] * len(experiment), dtype = "object")

        if self.posteriors:
            event_posteriors = pd.Series([0.0] * len(experiment))

        # what we DON'T want to do is iterate through event-by-event.
        # the more of this we can push into numpy, sklearn and pandas,
        # the faster it's going to be.

        for group, data_subset in groupby:
            # if there weren't any events in this group, there's no gmm
            if group not in self._gmms:
                warn("There wasn't a GMM for data subset {}".format(group),
                     util.CytoflowOpWarning)
                continue

            gmm = self._gmms[group]
            x = data_subset[self.channel]
            x = self._scale(x).values

            # which values are missing?
            x_na = np.isnan(x)

            group_idx = groupby.groups[group]

            # make a preliminary assignment
            predicted = np.full(len(x), -1, "int")
            predicted[~x_na] = gmm.predict(x[~x_na, np.newaxis])

            # if we're doing sigma-based gating, for each component check
            # to see if the event is in the sigma gate.
            if self.sigma > 0.0:
                # make a quick dataframe with the value and the predicted
                # component
                gate_df = pd.DataFrame({"x" : x, "p" : predicted})

                # for each component, get the low and the high threshold
                for c in range(0, self.num_components):
                    lo = (gmm.means_[c][0]    # @UnusedVariable
                          - self.sigma * np.sqrt(gmm.covariances_[c][0]))
                    hi = (gmm.means_[c][0]    # @UnusedVariable
                          + self.sigma * np.sqrt(gmm.covariances_[c][0]))

                    # and build an expression with numexpr so it evaluates fast!
                    gate_bool = gate_df.eval("p == @c and x >= @lo and x <= @hi").values
                    predicted[np.logical_and(predicted == c, gate_bool == False)] = -1

            predicted_str = pd.Series(["(none)"] * len(predicted))
            for c in range(0, self.num_components):
                predicted_str[predicted == c] = "{0}_{1}".format(self.name, c + 1)
            predicted_str[predicted == -1] = "{0}_None".format(self.name)
            predicted_str.index = group_idx

            event_assignments.iloc[group_idx] = predicted_str

            if self.posteriors:
                probability = np.full((len(x), self.num_components), 0.0, "float")
                probability[~x_na, :] = gmm.predict_proba(x[~x_na, np.newaxis])
                posteriors = pd.Series([0.0] * len(predicted))
                for i in range(0, self.num_components):
                    posteriors[predicted == i] = probability[predicted == i, i]
                posteriors.index = group_idx
                event_posteriors.iloc[group_idx] = posteriors

        new_experiment = experiment.clone()

        if self.num_components == 1 and self.sigma > 0:
            new_experiment.add_condition(self.name, "bool",
                                         event_assignments == "{0}_1".format(self.name))
        elif self.num_components > 1:
            new_experiment.add_condition(self.name, "category", event_assignments)

        if self.posteriors and self.num_components > 1:
            col_name = "{0}_Posterior".format(self.name)
            new_experiment.add_condition(col_name, "float", event_posteriors)

        # add the statistics
        levels = list(self.by)
        if self.num_components > 1:
            levels.append(self.name)

        if levels:
            idx = pd.MultiIndex.from_product([new_experiment[x].unique() for x in levels],
                                             names = levels)

            mean_stat = pd.Series(index = idx, dtype = np.dtype(object)).sort_index()
            stdev_stat = pd.Series(index = idx, dtype = np.dtype(object)).sort_index()
            interval_stat = pd.Series(index = idx, dtype = np.dtype(object)).sort_index()
            prop_stat = pd.Series(index = idx, dtype = np.dtype(object)).sort_index()

            for group, _ in groupby:
                gmm = self._gmms[group]
                for c in range(self.num_components):
                    if self.num_components > 1:
                        component_name = "{}_{}".format(self.name, c + 1)

                        if group is True:
                            g = [component_name]
                        elif isinstance(group, tuple):
                            g = list(group)
                            g.append(component_name)
                        else:
                            g = list([group])
                            g.append(component_name)

                        if len(g) > 1:
                            g = tuple(g)
                        else:
                            g = (g[0],)
                    else:
                        g = group

                    mean_stat.at[g] = self._scale.inverse(gmm.means_[c][0])
                    stdev_stat.at[g] = self._scale.inverse(np.sqrt(gmm.covariances_[c][0]))[0]
                    interval_stat.at[g] = (self._scale.inverse(gmm.means_[c][0] - np.sqrt(gmm.covariances_[c][0][0])),
                                           self._scale.inverse(gmm.means_[c][0] + np.sqrt(gmm.covariances_[c][0][0])))
                    prop_stat.at[g] = gmm.weights_[c]

            new_experiment.statistics[(self.name, "mean")] = pd.to_numeric(mean_stat)
            new_experiment.statistics[(self.name, "stdev")] = pd.to_numeric(stdev_stat)
            new_experiment.statistics[(self.name, "interval")] = interval_stat
            if self.num_components > 1:
                new_experiment.statistics[(self.name, "proportion")] = pd.to_numeric(prop_stat)

        new_experiment.history.append(self.clone_traits(transient = lambda _: True))
        return new_experiment

    def default_view(self, **kwargs):
        """
        Returns a diagnostic plot of the Gaussian mixture model.

        Returns
        -------
        IView : an IView, call plot() to see the diagnostic plot.
        """

        warn("GaussianMixture1DOp is DEPRECATED.  Please use GaussianMixtureOp.",
             util.CytoflowOpWarning)

        v = GaussianMixture1DView(op = self)
        v.trait_set(**kwargs)
        return v
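# The per-component sigma gate in apply() above reduces to an interval test:
# keep an event assigned to component c only if it lies within
# mean_c +/- sigma * stdev_c.  A minimal numpy-only sketch of that interval
# logic (illustrative; `sigma_gate_1d` is a hypothetical name, not part of
# the cytoflow API, and ties go to the lowest-mean component rather than the
# highest posterior, so this sketches only the interval step):

def sigma_gate_1d(x, means, stdevs, sigma):
    """Assign each value in x to a component whose sigma interval contains
    it; return -1 where no interval does."""
    import numpy as np

    assigned = np.full(len(x), -1, dtype = int)
    # iterate from the last component to the first so that, on overlap,
    # the earlier (lower-mean) component wins
    for c in reversed(range(len(means))):
        lo = means[c] - sigma * stdevs[c]
        hi = means[c] + sigma * stdevs[c]
        assigned[(x >= lo) & (x <= hi)] = c
    return assigned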
class FlowPeaksOp(HasStrictTraits):
    """
    This module uses the flowPeaks algorithm to assign events to clusters in
    an unsupervised manner.

    Call `estimate()` to compute the clusters.

    Calling `apply()` creates a new categorical metadata variable named
    `name`, with possible values `{name}_1` .... `name_n` where `n` is the
    number of clusters, specified with `n_clusters`.

    The same model may not be appropriate for different subsets of the data
    set.  If this is the case, you can use the `by` attribute to specify
    metadata by which to aggregate the data before estimating (and applying)
    a model.  The number of clusters is the same across each subset, though.

    Attributes
    ----------
    name : Str
        The operation name; determines the name of the new metadata column

    channels : List(Str)
        The channels to apply the clustering algorithm to.

    scale : Dict(Str : Enum("linear", "logicle", "log"))
        Re-scale the data in the specified channels before fitting.  If a
        channel is in `channels` but not in `scale`, the current
        package-wide default (set with `set_default_scale`) is used.

    by : List(Str)
        A list of metadata attributes to aggregate the data before estimating
        the model.  For example, if the experiment has two pieces of metadata,
        `Time` and `Dox`, setting `by = ["Time", "Dox"]` will fit the model
        separately to each subset of the data with a unique combination of
        `Time` and `Dox`.

    h : Float (default = 1.5)
        A scalar value by which to scale the covariance matrices of the
        underlying density function.  (See `Notes`, below, for more details.)

    h0 : Float (default = 1.0)
        A scalar value by which to smooth the covariance matrices of the
        underlying density function.  (See `Notes`, below, for more details.)

    tol : Float (default = 0.5)
        How readily should clusters be merged?  Must be between 0 and 1.
        See `Notes`, below, for more details.

    merge_dist : Float (default = 5)
        How far apart can clusters be before they are merged?  This is a
        unit-free scalar, and is approximately the maximum number of k-means
        clusters between peaks.

    find_outliers : Bool (default = False)
        Should the algorithm use an extra step to identify outliers?
        *Note: I have disabled this code until I can try to make it faster.*

    Notes
    -----
    This algorithm uses kmeans to find a large number of clusters, then
    hierarchically merges those clusters.  Thus, the user does not need to
    specify the number of clusters in advance; and it can find non-convex
    clusters.  It also operates in an arbitrary number of dimensions.

    The merging happens in two steps.  First, the cluster centroids are used
    to estimate an underlying density function.  Then, the local maxima of
    the density function are found using a numerical optimization starting
    from each centroid, and k-means clusters that converge to the same local
    maximum are merged.  Finally, these clusters-of-clusters are merged if
    their local maxima are (a) close enough, and (b) the density function
    between them is smooth enough.  Thus, the final assignment of each event
    depends on the k-means cluster it ends up in, and which
    cluster-of-clusters that k-means centroid is assigned to.

    There are a lot of parameters that affect this process.  The k-means
    clustering is pretty robust (though somewhat sensitive to the number of
    clusters, which is currently not exposed in the API.)  The most important
    parameters are exposed as traits of the `FlowPeaksOp` class.  These
    include:

    - h, h0: sometimes the density function is too "rough" to find good
      local maxima.  These parameters smooth it out by widening the
      covariance matrices.  Increasing `h` makes the density rougher;
      increasing `h0` makes it smoother.

    - tol: How smooth does the density function have to be between two
      density maxima to merge them?  Must be between 0 and 1.

    - merge_dist: How close must two maxima be to merge them?  This value
      is a unit-free scalar, and is approximately the number of k-means
      clusters between the two maxima.

    For details and a theoretical justification, see

    flowPeaks: a fast unsupervised clustering for flow cytometry data via
    K-means and density peak finding

    Yongchao Ge, Stuart C. Sealfon

    Bioinformatics (2012) 28 (15): 2052-2058.

    Examples
    --------

    >>> fp_op = FlowPeaksOp(name = "Clust",
    ...                     channels = ["V2-A", "Y2-A"],
    ...                     scale = {"V2-A" : "log"})
    >>> fp_op.estimate(ex2)
    >>> fp_op.default_view(channels = ["V2-A", "Y2-A"]).plot(ex2)
    >>> ex3 = fp_op.apply(ex2)
    """

    id = Constant('edu.mit.synbio.cytoflow.operations.flowpeaks')
    friendly_id = Constant("FlowPeaks Clustering")

    name = CStr()
    channels = List(Str)
    scale = Dict(Str, util.ScaleEnum)
    by = List(Str)
#     find_outliers = Bool(False)

    # parameters that control estimation, with sensible defaults
    h = util.PositiveFloat(1.5, allow_zero = False)
    h0 = util.PositiveFloat(1, allow_zero = False)
    tol = util.PositiveFloat(0.5, allow_zero = False)
    merge_dist = util.PositiveFloat(5, allow_zero = False)

    # parameters that control outlier selection, with sensible defaults

    _kmeans = Dict(Any, Instance(sklearn.cluster.MiniBatchKMeans), transient = True)
    _normals = Dict(Any, List(Function), transient = True)
    _density = Dict(Any, Function, transient = True)
    _peaks = Dict(Any, List(Array), transient = True)
    _cluster_peak = Dict(Any, List, transient = True)   # kmeans cluster idx --> peak idx
    _cluster_group = Dict(Any, List, transient = True)  # kmeans cluster idx --> group idx
    _scale = Dict(Str, Instance(util.IScale), transient = True)

    def estimate(self, experiment, subset = None):
        """
        Estimate the k-means clusters, then hierarchically merge them.
        """

        if experiment is None:
            raise util.CytoflowOpError("No experiment specified")

        if len(self.channels) == 0:
            raise util.CytoflowOpError("Must set at least one channel")

        for c in self.channels:
            if c not in experiment.data:
                raise util.CytoflowOpError("Channel {0} not found in the experiment"
                                           .format(c))

        for c in self.scale:
            if c not in self.channels:
                raise util.CytoflowOpError("Scale set for channel {0}, but it isn't "
                                           "in the operation's channels"
                                           .format(c))

        for b in self.by:
            if b not in experiment.data:
                raise util.CytoflowOpError("Aggregation metadata {0} not found"
                                           " in the experiment"
                                           .format(b))
            if len(experiment.data[b].unique()) > 100: #WARNING - magic number
                raise util.CytoflowOpError("More than 100 unique values found for"
                                           " aggregation metadata {0}.  Did you"
                                           " accidentally specify a data channel?"
                                           .format(b))

        if subset:
            try:
                experiment = experiment.query(subset)
            except Exception as e:
                raise util.CytoflowOpError("Subset string '{0}' isn't valid"
                                           .format(subset)) from e

            if len(experiment) == 0:
                raise util.CytoflowOpError("Subset string '{0}' returned no events"
                                           .format(subset))

        if self.by:
            groupby = experiment.data.groupby(self.by)
        else:
            # use a lambda expression to return a group that contains
            # all the events
            groupby = experiment.data.groupby(lambda _: True)

        # get the scale.  estimate the scale params for the ENTIRE data set,
        # not subsets we get from groupby().  And we need to save it so that
        # the data is transformed the same way when we apply()
        for c in self.channels:
            if c in self.scale:
                self._scale[c] = util.scale_factory(self.scale[c], experiment, channel = c)
#                 if self.scale[c] == 'log':
#                     self._scale[c].mode = 'mask'
            else:
                self._scale[c] = util.scale_factory(util.get_default_scale(), experiment, channel = c)

        for data_group, data_subset in groupby:
            if len(data_subset) == 0:
                raise util.CytoflowOpError("Group {} had no data".format(data_group))

            x = data_subset.loc[:, self.channels[:]]
            for c in self.channels:
                x[c] = self._scale[c](x[c])

            # drop data that isn't in the scale range
            for c in self.channels:
                x = x[~(np.isnan(x[c]))]
            x = x.values

            #### choose the number of clusters and fit the kmeans
            num_clusters = [util.num_hist_bins(x[:, c])
                            for c in range(len(self.channels))]
            num_clusters = np.ceil(np.median(num_clusters))
            num_clusters = int(num_clusters)

            self._kmeans[data_group] = kmeans = \
                sklearn.cluster.MiniBatchKMeans(n_clusters = num_clusters)

            kmeans.fit(x)
            x_labels = kmeans.predict(x)

            #### use the kmeans centroids to parameterize a finite gaussian
            #### mixture model which estimates the density function

            d = len(self.channels)
            s0 = np.zeros([d, d])
            for j in range(d):
                r = x[:, j].max() - x[:, j].min()
                s0[j, j] = (r / (num_clusters ** (1. / d))) ** 0.5

            means = []
            weights = []
            normals = []
            beta_max = []

            for k in range(num_clusters):
                xk = x[x_labels == k]
                num_k = np.sum(x_labels == k)
                weight_k = num_k / len(x_labels)
                mu = xk.mean(axis = 0)
                means.append(mu)
                s = np.cov(xk, rowvar = False)

                el = num_k / (num_clusters + num_k)
                s_smooth = el * self.h * s + (1.0 - el) * self.h0 * s0

                n = scipy.stats.multivariate_normal(mean = mu, cov = s_smooth)
                weights.append(weight_k)
                normals.append(lambda x, n = n: n.pdf(x))

                # get appropriate step size for peak finding
                min_b = np.inf
                for b in np.diagonal(s_smooth):
                    if np.sqrt(b) < min_b:
                        min_b = np.sqrt(b)
                beta_max.append(min_b)

            self._normals[data_group] = normals
            self._density[data_group] = density = \
                lambda x, weights = weights, normals = normals: \
                    np.sum([w * n(x) for w, n in zip(weights, normals)], axis = 0)

            ### use optimization on the finite gmm to find the local peak for
            ### each kmeans cluster
            peaks = []
            peak_clusters = []  # peak idx --> list of clusters

            min_mu = [np.inf] * len(self.channels)
            max_mu = [-1.0 * np.inf] * len(self.channels)

            for k in range(num_clusters):
                mu = means[k]
                for ci in range(len(self.channels)):
                    if mu[ci] < min_mu[ci]:
                        min_mu[ci] = mu[ci]
                    if mu[ci] > max_mu[ci]:
                        max_mu[ci] = mu[ci]

            constraints = []
            for ci, c in enumerate(self.channels):
                constraints.append({'type' : 'ineq',
                                    'fun' : lambda x, min_mu = min_mu[ci]: x - min_mu})
                constraints.append({'type' : 'ineq',
                                    'fun' : lambda x, max_mu = max_mu[ci]: max_mu - x})

            for k in range(num_clusters):
                mu = means[k]
                f = lambda x: -1.0 * density(x)

                res = scipy.optimize.minimize(f, mu,
                                              method = 'COBYLA',
                                              constraints = constraints,
                                              options = {'rhobeg' : beta_max[k],
                                                         'maxiter' : 5000})
                if not res.success:
                    raise util.CytoflowOpError("Peak finding failed for cluster {}: {}"
                                               .format(k, res.message))

#                 ### The peak-searching algorithm from the paper.  works fine,
#                 ### but slow!  we get similar results with the COBYLA
#                 ### optimization method from scipy, using an appropriate rho
#                 x0 = x = means[k]
#                 k0 = k
#                 b = beta_max[k] / 10.0
#                 Nsuc = 0
#                 n = 0
#
#                 while(n < 1000):
# #                     df = scipy.misc.derivative(density, x, 1e-6)
#                     df = statsmodels.tools.numdiff.approx_fprime(x, density)
#                     if np.linalg.norm(df) < 1e-3:
#                         break
#
#                     y = x + b * df / np.linalg.norm(df)
#                     if density(y) <= density(x):
#                         Nsuc = 0
#                         b = b / 2.0
#                         continue
#
#                     Nsuc += 1
#                     if Nsuc >= 2:
#                         b = min(2 * b, beta_max[k])
#
#                     ky = kmeans.predict(y[np.newaxis, :])[0]
#                     if ky == k:
#                         x = y
#                     else:
#                         k = ky
#                         b = beta_max[k] / 10.0
#                         mu = means[k]
#                         if density(mu) > density(y):
#                             x = mu
#                         else:
#                             x = y
#
#                     n += 1
#
#                 print("{} --> {}, {}".format(x0, x, n))

                merged = False
                for pi, p in enumerate(peaks):
                    if np.linalg.norm(p - res.x) < (1e-2):
                        peak_clusters[pi].append(k)
                        merged = True
                        break

                if not merged:
                    peak_clusters.append([k])
                    peaks.append(res.x)

            self._peaks[data_group] = peaks

            ### merge peaks that are sufficiently close
            groups = [[x] for x in range(len(peaks))]
            peak_groups = [x for x in range(len(peaks))]  # peak idx --> group idx

            def max_tol(x, y):
                f = lambda a: density(a[np.newaxis, :])
#                 lx = kmeans.predict(x[np.newaxis, :])[0]
#                 ly = kmeans.predict(y[np.newaxis, :])[0]
                n = len(x)
                n_scale = 1
#                 n_scale = np.sqrt(((nx + ny) / 2.0) / (n / num_clusters))

                def tol(t):
                    zt = x + t * (y - x)
                    fhat_zt = f(x) + t * (f(y) - f(x))
                    return -1.0 * abs((f(zt) - fhat_zt) / fhat_zt) * n_scale

                res = scipy.optimize.minimize_scalar(tol, bounds = [0, 1],
                                                     method = 'Bounded')

                if res.status != 0:
                    raise util.CytoflowOpError("tol optimization failed for {}, {}"
                                               .format(x, y))
                return -1.0 * res.fun

            def nearest_neighbor_dist(k):
                min_dist = np.inf
                for i in range(num_clusters):
                    if i == k:
                        continue
                    dist = np.linalg.norm(means[k] - means[i])
                    if dist < min_dist:
                        min_dist = dist
                return min_dist

            sk = [nearest_neighbor_dist(x) for x in range(num_clusters)]

            def s(x):
                k = kmeans.predict(x[np.newaxis, :])[0]
                return sk[k]

            def can_merge(g, h):
                for pg in g:
                    for ph in h:
                        vg = peaks[pg]
                        vh = peaks[ph]
                        dist_gh = np.linalg.norm(vg - vh)

                        if max_tol(vg, vh) < self.tol \
                           and dist_gh / (s(vg) + s(vh)) <= self.merge_dist:
                            return True
                return False

            while True:
                if len(groups) == 1:
                    break

                # find the closest mergeable groups
                min_dist = np.inf
                for gi in range(len(groups)):
                    g = groups[gi]
                    for hi in range(gi + 1, len(groups)):
                        h = groups[hi]
                        if can_merge(g, h):
                            dist_gh = np.inf
                            for pg in g:
                                vg = peaks[pg]
                                for ph in h:
                                    vh = peaks[ph]
                                    dist_gh = min(dist_gh, np.linalg.norm(vg - vh))

                            if dist_gh < min_dist:
                                min_gi = gi
                                min_hi = hi
                                min_dist = dist_gh

                if min_dist == np.inf:
                    break

                # merge the groups
                groups[min_gi].extend(groups[min_hi])
                for g in groups[min_hi]:
                    peak_groups[g] = min_gi
                del groups[min_hi]

            cluster_group = [0] * num_clusters
            cluster_peaks = [0] * num_clusters

            for gi, g in enumerate(groups):
                for p in g:
                    for cluster in peak_clusters[p]:
                        cluster_group[cluster] = gi
                        cluster_peaks[cluster] = p

            self._peaks[data_group] = peaks
            self._cluster_peak[data_group] = cluster_peaks
            self._cluster_group[data_group] = cluster_group

    def apply(self, experiment):
        """
        Assign events to the flowPeaks clusters estimated by `estimate()`.
        """

        if experiment is None:
            raise util.CytoflowOpError("No experiment specified")

        # make sure name got set!
        if not self.name:
            raise util.CytoflowOpError("You have to set the gate's name "
                                       "before applying it!")

        if self.name in experiment.data.columns:
            raise util.CytoflowOpError("Experiment already has a column named {0}"
                                       .format(self.name))

        if len(self.channels) == 0:
            raise util.CytoflowOpError("Must set at least one channel")

        for c in self.channels:
            if c not in experiment.data:
                raise util.CytoflowOpError("Channel {0} not found in the experiment"
                                           .format(c))

        for c in self.scale:
            if c not in self.channels:
                raise util.CytoflowOpError("Scale set for channel {0}, but it isn't "
                                           "in the operation's channels"
                                           .format(c))

        for b in self.by:
            if b not in experiment.data:
                raise util.CytoflowOpError("Aggregation metadata {0} not found"
                                           " in the experiment"
                                           .format(b))
            if len(experiment.data[b].unique()) > 100: #WARNING - magic number
                raise util.CytoflowOpError("More than 100 unique values found for"
                                           " aggregation metadata {0}.  Did you"
                                           " accidentally specify a data channel?"
                                           .format(b))

        if self.by:
            groupby = experiment.data.groupby(self.by)
        else:
            # use a lambda expression to return a group that contains
            # all the events
            groupby = experiment.data.groupby(lambda _: True)

        event_assignments = pd.Series(["{}_None".format(self.name)] * len(experiment),
                                      dtype = "object")

#         # make the statistics
#         clusters = [x + 1 for x in range(self.num_clusters)]
#
#         idx = pd.MultiIndex.from_product([experiment[x].unique() for x in self.by] + [clusters] + [self.channels],
#                                          names = list(self.by) + ["Cluster"] + ["Channel"])
#         centers_stat = pd.Series(index = idx, dtype = np.dtype(object)).sort_index()

        for group, data_subset in groupby:
            if len(data_subset) == 0:
                raise util.CytoflowOpError("Group {} had no data".format(group))

            x = data_subset.loc[:, self.channels[:]]
            for c in self.channels:
                x[c] = self._scale[c](x[c])

            # which values are missing?
            x_na = pd.Series([False] * len(x))
            for c in self.channels:
                x_na[np.isnan(x[c]).values] = True

            x = x.values
            x_na = x_na.values
            group_idx = groupby.groups[group]

            kmeans = self._kmeans[group]

            predicted_km = np.full(len(x), -1, "int")
            predicted_km[~x_na] = kmeans.predict(x[~x_na])

            groups = np.asarray(self._cluster_group[group])
            predicted_group = np.full(len(x), -1, "int")
            predicted_group[~x_na] = groups[predicted_km[~x_na]]

#             num_groups = len(set(groups))
#             if self.find_outliers:
#                 density = self._density[group]
#                 max_d = [-1.0 * np.inf] * num_groups
#
#                 for xi in range(len(x)):
#                     if x_na[xi]:
#                         continue
#
#                     x_c = predicted_group[xi]
#                     d_x_c = density(x[xi])
#                     if d_x_c > max_d[x_c]:
#                         max_d[x_c] = d_x_c
#
#                 group_density = [None] * num_groups
#                 group_weight = [0.0] * num_groups
#
#                 for c in range(num_groups):
#                     num_c = np.sum(predicted_group == c)
#                     clusters = np.argwhere(groups == c).flatten()
#
#                     normals = []
#                     weights = []
#                     for k in range(len(clusters)):
#                         num_k = np.sum(predicted_km == k)
#                         weight_k = num_k / num_c
#                         group_weight[c] += num_k / len(x)
#                         weights.append(weight_k)
#                         normals.append(self._normals[group][k])
#
#                     group_density[c] = lambda x, weights = weights, normals = normals: np.sum([w * n(x) for w, n in zip(weights, normals)], axis = 0)
#
#                 for xi in range(len(x)):
#                     if x_na[xi]:
#                         continue
#
#                     x_c = predicted_group[xi]
#
#                     if density(x[xi]) / max_d[x_c] < 0.01:
#                         predicted_group[xi] = -1
#                         continue
#
#                     sum_d = 0
#                     for c in set(groups):
#                         sum_d += group_weight[c] * group_density[c](x[xi])
#
#                     if group_weight[x_c] * group_density[x_c](x[xi]) / sum_d < 0.8:
#                         predicted_group[xi] = -1
#
#                 max_d = -1.0 * np.inf
#                 for x_c in x[predicted_group == c]:
#                     x_c_d = density(x_c)
#                     if x_c_d > max_d:
#                         max_d = x_c_d
#
#                 for i in range(len(x)):
#                     if predicted_group[i] == c and density(x[i]) / max_d <= 0.01:
#                         predicted_group[i] = -1

            predicted_str = pd.Series(["(none)"] * len(predicted_group))
            for c in range(len(self._cluster_group[group])):
                predicted_str[predicted_group == c] = "{0}_{1}".format(self.name, c + 1)
            predicted_str[predicted_group == -1] = "{0}_None".format(self.name)
            predicted_str.index = group_idx

            event_assignments.iloc[group_idx] = predicted_str

        new_experiment = experiment.clone()
        new_experiment.add_condition(self.name, "category", event_assignments)

#         new_experiment.statistics[(self.name, "centers")] = pd.to_numeric(centers_stat)

        new_experiment.history.append(self.clone_traits(transient = lambda _: True))
        return new_experiment

    def default_view(self, **kwargs):
        """
        Returns a diagnostic plot of the flowPeaks clustering.

        Returns
        -------
        IView : an IView, call plot() to see the diagnostic plot.
        """
        channels = kwargs.pop('channels', self.channels)
        scale = kwargs.pop('scale', self.scale)
        density = kwargs.pop('density', False)

        for c in channels:
            if c not in self.channels:
                raise util.CytoflowViewError("Channel {} isn't in the operation's channels"
                                             .format(c))

        for s in scale:
            if s not in self.channels:
                raise util.CytoflowViewError("Channel {} isn't in the operation's channels"
                                             .format(s))

        for c in channels:
            if c not in scale:
                scale[c] = util.get_default_scale()

        if len(channels) == 0:
            raise util.CytoflowViewError("Must specify at least one channel for a default view")
        elif len(channels) == 1:
            return FlowPeaks1DView(op = self,
                                   channel = channels[0],
                                   scale = scale[channels[0]],
                                   **kwargs)
        elif len(channels) == 2:
            if density:
                return FlowPeaks2DDensityView(op = self,
                                              xchannel = channels[0],
                                              ychannel = channels[1],
                                              xscale = scale[channels[0]],
                                              yscale = scale[channels[1]],
                                              **kwargs)
            else:
                return FlowPeaks2DView(op = self,
                                       xchannel = channels[0],
                                       ychannel = channels[1],
                                       xscale = scale[channels[0]],
                                       yscale = scale[channels[1]],
                                       **kwargs)
        else:
            raise util.CytoflowViewError("Can't specify more than two channels for a default view")
class BinningOp(HasStrictTraits): """ Bin data along an axis. This operation creates equally spaced bins (in linear or log space) along an axis and adds a condition assigning each event to a bin. The value of the event's condition is the left end of the bin's interval in which the event is located. Attributes ---------- name : Str The operation name. Used to name the new metadata field in the experiment that's created by apply() channel : Str The name of the channel along which to bin. scale : {"linear", "log", "logicle"} Make the bins equidistant along what scale? num_bins : Int The number of bins to make. Must set either :attr:`num_bins` or :attr:`bin_width`. If both are defined, :attr:`num_bins` takes precedence. bin_width : Float The width of the bins. Must set either :attr:`num_bins` or :attr:`bin_width`. If :attr:`scale` is ``log``, :attr:`bin_width` is in log-10 units; if :attr:`scale` is ``logicle``, an error is thrown because the units are ill-defined. If both :attr:`num_bins` and :attr:`bin_width` are defined, :attr:`num_bins` takes precedence. bin_count_name : Str If :attr:`bin_count_name` is set, :meth:`apply` adds another column to the resulting :class:`Experiment` that contains the number of events in the bin that this event falls in. Useful for filtering bins by number of events. Examples -------- Create a small experiment: .. plot:: :context: close-figs >>> import cytoflow as flow >>> import_op = flow.ImportOp() >>> import_op.tubes = [flow.Tube(file = "tasbe/rby.fcs")] >>> ex = import_op.apply() Create and parameterize the operation .. plot:: :context: close-figs >>> bin_op = flow.BinningOp() >>> bin_op.name = "Bin" >>> bin_op.channel = "FITC-A" >>> bin_op.scale = "log" >>> bin_op.bin_width = 0.2 Apply the operation to the experiment .. plot:: :context: close-figs >>> ex2 = bin_op.apply(ex) Plot the result .. plot:: :context: close-figs >>> bin_op.default_view().plot(ex2) """ # traits id = Constant('edu.mit.synbio.cytoflow.operations.binning') friendly_id = Constant("Binning") name = CStr() bin_count_name = CStr() channel = Str() num_bins = util.PositiveInt(0, allow_zero=True) bin_width = util.PositiveFloat(0, allow_zero=True) scale = util.ScaleEnum _max_num_bins = Int(100) def apply(self, experiment): """ Applies the binning to an experiment. Parameters ---------- experiment : Experiment the old_experiment to which this op is applied Returns ------- Experiment A new experiment with a condition column named :attr:`name`, which contains the location of the left-most edge of the bin that the event is in. If :attr:`bin_count_name` is set, another column is added with that name as well, containing the number of events in the same bin as the event.
""" if experiment is None: raise util.CytoflowOpError('experiment', "no experiment specified") if not self.name: raise util.CytoflowOpError('name', "Name is not set") if self.name in experiment.data.columns: raise util.CytoflowOpError( 'name', "Name {} is in the experiment already".format(self.name)) if self.bin_count_name and self.bin_count_name in experiment.data.columns: raise util.CytoflowOpError( 'bin_count_name', "bin_count_name {} is in the experiment already".format( self.bin_count_name)) if not self.channel: raise util.CytoflowOpError('channel', "channel is not set") if self.channel not in experiment.data.columns: raise util.CytoflowOpError( 'channel', "channel {} isn't in the experiment".format(self.channel)) if not self.num_bins and not self.bin_width: raise util.CytoflowOpError('num_bins', "must set either bin number or width") if self.bin_width \ and not (self.scale == "linear" or self.scale == "log"): raise util.CytoflowOpError( 'scale', "Can only use bin_width with linear or log scale") scale = util.scale_factory(self.scale, experiment, channel=self.channel) scaled_data = scale(experiment.data[self.channel]) scaled_min = bn.nanmin(scaled_data) scaled_max = bn.nanmax(scaled_data) num_bins = self.num_bins if self.num_bins else \ (scaled_max - scaled_min) / self.bin_width if num_bins > self._max_num_bins: raise util.CytoflowOpError( None, "Too many bins! To increase this limit, " "change _max_num_bins (currently {})".format( self._max_num_bins)) scaled_bins = np.linspace(start=scaled_min, stop=scaled_max, num=num_bins) if len(scaled_bins) < 2: raise util.CytoflowOpError('num_bins', "Must have more than one bin") # put the data in bins bin_idx = np.digitize(scaled_data, scaled_bins[1:-1]) # now, back into data space bins = scale.inverse(scaled_bins) new_experiment = experiment.clone() new_experiment.add_condition(self.name, "float", bins[bin_idx]) # keep track of the bins we used, for prettier plotting later. new_experiment.metadata[self.name]["bin_scale"] = self.scale new_experiment.metadata[self.name]["bins"] = bins if self.bin_count_name: # TODO - this is a HUGE memory hog?! # TODO - fix this, then turn it on by default agg_count = new_experiment.data.groupby(self.name).count() agg_count = agg_count[agg_count.columns[0]] # have to make the condition a float64, because if we're in log # space there may be events that have NaN as the bin number. new_experiment.add_condition( self.bin_count_name, "float64", new_experiment[self.name].map(agg_count)) new_experiment.history.append( self.clone_traits(transient=lambda _: True)) return new_experiment def default_view(self, **kwargs): """ Returns a diagnostic plot to check the binning. Returns ------- IView An view instance, call :meth:`plot()` to plot the bins. """ return BinningView(op=self, **kwargs)
class GaussianMixtureOp(HasStrictTraits): """ This module fits a Gaussian mixture model with a specified number of components to one or more channels. If :attr:`num_components` ``> 1``, :meth:`apply` creates a new categorical metadata variable named ``name``, with possible values ``{name}_1`` .... ``{name}_n`` where ``n`` is the number of components. An event is assigned to the ``{name}_i`` category if it has the highest posterior probability of having been produced by component ``i``. If an event has a value that is outside the range of one of the channels' scales, then it is assigned to ``{name}_None``. Optionally, if :attr:`sigma` is greater than 0, :meth:`apply` creates new ``boolean`` metadata variables named ``{name}_1`` ... ``{name}_n`` where ``n`` is the number of components. The column ``{name}_i`` is ``True`` if the event is less than :attr:`sigma` standard deviations from the mean of component ``i``. If :attr:`num_components` is ``1``, :attr:`sigma` must be greater than 0. .. note:: The :attr:`sigma` attribute does NOT affect how events are assigned to components in the new ``name`` variable. That is to say, if an event is more than :attr:`sigma` standard deviations from ALL of the components, you might expect it would be labeled as ``{name}_None``. It is *not*. An event is only labeled ``{name}_None`` if it has a value that is outside of the channels' scales. Optionally, if :attr:`posteriors` is ``True``, :meth:`apply` creates new ``double`` metadata variables named ``{name}_1_posterior`` ... ``{name}_n_posterior`` where ``n`` is the number of components. The column ``{name}_i_posterior`` contains the posterior probability that this event is a member of component ``i``. Finally, the same mixture model (mean and standard deviation) may not be appropriate for every subset of the data. If this is the case, you can use the :attr:`by` attribute to specify metadata by which to aggregate the data before estimating (and applying) a mixture model. The number of components must be the same across each subset, though. Attributes ---------- name : Str The operation name; determines the name of the new metadata column channels : List(Str) The channels to apply the mixture model to. scale : Dict(Str : {"linear", "logicle", "log"}) Re-scale the data in the specified channels before fitting. If a channel is in :attr:`channels` but not in :attr:`scale`, the current package-wide default (set with :func:`~.set_default_scale`) is used. num_components : Int (default = 1) How many components to fit to the data? Must be a positive integer. sigma : Float If not None, use this operation as a "gate": for each component, create a new boolean variable ``{name}_i`` and if the event is within :attr:`sigma` standard deviations, set that variable to ``True``. If :attr:`num_components` is ``1``, must be ``> 0``. by : List(Str) A list of metadata attributes to aggregate the data before estimating the model. For example, if the experiment has two pieces of metadata, ``Time`` and ``Dox``, setting :attr:`by` to ``["Time", "Dox"]`` will fit the model separately to each subset of the data with a unique combination of ``Time`` and ``Dox``. posteriors : Bool (default = False) If ``True``, add columns named ``{name}_{i}_posterior`` giving the posterior probability that the event is in component ``i``. Useful for filtering out low-probability events. Notes ----- We use the Mahalanobis distance as a multivariate generalization of the number of standard deviations an event is from the mean of the multivariate gaussian.
If :math:`\\vec{x}` is an observation from a distribution with mean :math:`\\vec{\\mu}` and :math:`S` is the covariance matrix, then the Mahalanobis distance is :math:`\\sqrt{(x - \\mu)^T \\cdot S^{-1} \\cdot (x - \\mu)}`. Examples -------- .. plot:: :context: close-figs Make a little data set. >>> import cytoflow as flow >>> import_op = flow.ImportOp() >>> import_op.tubes = [flow.Tube(file = "Plate01/RFP_Well_A3.fcs", ... conditions = {'Dox' : 10.0}), ... flow.Tube(file = "Plate01/CFP_Well_A4.fcs", ... conditions = {'Dox' : 1.0})] >>> import_op.conditions = {'Dox' : 'float'} >>> ex = import_op.apply() Create and parameterize the operation. .. plot:: :context: close-figs >>> gm_op = flow.GaussianMixtureOp(name = 'Gauss', ... channels = ['Y2-A'], ... scale = {'Y2-A' : 'log'}, ... num_components = 2) Estimate the clusters .. plot:: :context: close-figs >>> gm_op.estimate(ex) Plot a diagnostic view .. plot:: :context: close-figs >>> gm_op.default_view().plot(ex) Apply the gate .. plot:: :context: close-figs >>> ex2 = gm_op.apply(ex) Plot a diagnostic view with the event assignments .. plot:: :context: close-figs >>> gm_op.default_view().plot(ex2) And with two channels: .. plot:: :context: close-figs >>> gm_op = flow.GaussianMixtureOp(name = 'Gauss', ... channels = ['V2-A', 'Y2-A'], ... scale = {'V2-A' : 'log', ... 'Y2-A' : 'log'}, ... num_components = 2) >>> gm_op.estimate(ex) >>> ex2 = gm_op.apply(ex) >>> gm_op.default_view().plot(ex2) """ id = Constant('edu.mit.synbio.cytoflow.operations.gaussian') friendly_id = Constant("Gaussian Mixture Model") name = Str channels = List(Str) scale = Dict(Str, util.ScaleEnum) num_components = util.PositiveInt(1, allow_zero = False) sigma = util.PositiveFloat(None, allow_zero = False, allow_none = True) by = List(Str) posteriors = Bool(False) # the key is either a single value or a tuple _gmms = Dict(Any, Instance(sklearn.mixture.GaussianMixture), transient = True) _scale = Dict(Str, Instance(util.IScale), transient = True) def estimate(self, experiment, subset = None): """ Estimate the Gaussian mixture model parameters Parameters ---------- experiment : Experiment The data to use to estimate the mixture parameters subset : str (default = None) If set, a Python expression to determine the subset of the data to use in the estimation.
""" if experiment is None: raise util.CytoflowOpError('experiment', "No experiment specified") if len(self.channels) == 0: raise util.CytoflowOpError('channels', "Must set at least one channel") if len(self.channels) != len(set(self.channels)): raise util.CytoflowOpError('channels', "Must not duplicate channels") for c in self.channels: if c not in experiment.data: raise util.CytoflowOpError('channels', "Channel {0} not found in the experiment" .format(c)) for c in self.scale: if c not in self.channels: raise util.CytoflowOpError('channels', "Scale set for channel {0}, but it isn't " "in the experiment" .format(c)) for b in self.by: if b not in experiment.data: raise util.CytoflowOpError('by', "Aggregation metadata {} not found, " "must be one of {}" .format(b, experiment.conditions)) if subset: try: experiment = experiment.query(subset) except: raise util.CytoflowViewError('subset', "Subset string '{0}' isn't valid" .format(subset)) if len(experiment) == 0: raise util.CytoflowViewError('subset', "Subset string '{0}' returned no events" .format(subset)) if self.by: groupby = experiment.data.groupby(self.by) else: # use a lambda expression to return a group that contains # all the events groupby = experiment.data.groupby(lambda _: True) # get the scale. estimate the scale params for the ENTIRE data set, # not subsets we get from groupby(). And we need to save it so that # the data is transformed the same way when we apply() for c in self.channels: if c in self.scale: self._scale[c] = util.scale_factory(self.scale[c], experiment, channel = c) else: self._scale[c] = util.scale_factory(util.get_default_scale(), experiment, channel = c) gmms = {} for group, data_subset in groupby: if len(data_subset) == 0: raise util.CytoflowOpError(None, "Group {} had no data" .format(group)) x = data_subset.loc[:, self.channels[:]] for c in self.channels: x[c] = self._scale[c](x[c]) # drop data that isn't in the scale range for c in self.channels: x = x[~(np.isnan(x[c]))] x = x.values gmm = sklearn.mixture.GaussianMixture(n_components = self.num_components, covariance_type = "full", random_state = 1) gmm.fit(x) if not gmm.converged_: raise util.CytoflowOpError(None, "Estimator didn't converge" " for group {0}" .format(group)) # in the 1D version, we sorted the components by the means -- so # the first component has the lowest mean, the second component # has the next-lowest mean, etc. # that doesn't work in the general case. instead, we assume that # the clusters are likely (?) to be arranged along *one* of the # axes, so we take the |norm| of the mean of each cluster and # sort that way. norms = np.sum(gmm.means_ ** 2, axis = 1) ** 0.5 sort_idx = np.argsort(norms) gmm.means_ = gmm.means_[sort_idx] gmm.weights_ = gmm.weights_[sort_idx] gmm.covariances_ = gmm.covariances_[sort_idx] gmm.precisions_ = gmm.precisions_[sort_idx] gmm.precisions_cholesky_ = gmm.precisions_cholesky_[sort_idx] gmms[group] = gmm self._gmms = gmms def apply(self, experiment): """ Assigns new metadata to events using the mixture model estimated in :meth:`estimate`. Returns ------- Experiment A new :class:`.Experiment` with the new condition variables as described in the class documentation. Also adds the following new statistics: - **mean** : Float the mean of the fitted gaussian in each channel for each component. - **sigma** : (Float, Float) the locations the mean +/- one standard deviation in each channel for each component. - **correlation** : Float the correlation coefficient between each pair of channels for each component. 
- **proportion** : Float the proportion of events in each component of the mixture model. only added if :attr:`num_components` ``> 1``. """ if experiment is None: raise util.CytoflowOpError('experiment', "No experiment specified") if len(self.channels) == 0: raise util.CytoflowOpError('channels', "Must set at least one channel") # make sure name got set! if not self.name: raise util.CytoflowOpError('name', "You have to set the gate's name " "before applying it!") if self.name != util.sanitize_identifier(self.name): raise util.CytoflowOpError('name', "Name can only contain letters, numbers and underscores." .format(self.name)) if self.num_components > 1 and self.name in experiment.data.columns: raise util.CytoflowOpError('name', "Experiment already has a column named {0}" .format(self.name)) if self.sigma is not None: for i in range(1, self.num_components + 1): cname = "{}_{}".format(self.name, i) if cname in experiment.data.columns: raise util.CytoflowOpError('name', "Experiment already has a column named {}" .format(cname)) if self.posteriors: for i in range(1, self.num_components + 1): cname = "{}_{}_posterior".format(self.name, i) if cname in experiment.data.columns: raise util.CytoflowOpError('name', "Experiment already has a column named {}" .format(cname)) if not self._gmms: raise util.CytoflowOpError(None, "No components found. Did you forget to " "call estimate()?") for c in self.channels: if c not in self._scale: raise util.CytoflowOpError(None, "Model scale not set. Did you forget " "to call estimate()?") for c in self.channels: if c not in experiment.channels: raise util.CytoflowOpError('channels', "Channel {0} not found in the experiment" .format(c)) for b in self.by: if b not in experiment.conditions: raise util.CytoflowOpError('by', "Aggregation metadata {} not found, " "must be one of {}" .format(b, experiment.conditions)) # # if self.num_components == 1 and self.sigma == 0.0: # raise util.CytoflowOpError('sigma', # "if num_components is 1, sigma must be > 0.0") if self.num_components == 1 and self.posteriors: warn("If num_components == 1, all posteriors will be 1", util.CytoflowOpWarning) # raise util.CytoflowOpError('posteriors', # "If num_components == 1, all posteriors will be 1.") if self.num_components > 1: event_assignments = pd.Series(["{}_None".format(self.name)] * len(experiment), dtype = "object") if self.sigma is not None: event_gate = {i : pd.Series([False] * len(experiment), dtype = "double") for i in range(self.num_components)} if self.posteriors: event_posteriors = {i : pd.Series([0.0] * len(experiment), dtype = "double") for i in range(self.num_components)} if self.by: groupby = experiment.data.groupby(self.by) else: # use a lambda expression to return a group that # contains all the events groupby = experiment.data.groupby(lambda _: True) # make the statistics components = [x + 1 for x in range(self.num_components)] prop_idx = pd.MultiIndex.from_product([experiment[x].unique() for x in self.by] + [components], names = list(self.by) + ["Component"]) prop_stat = pd.Series(name = "{} : {}".format(self.name, "proportion"), index = prop_idx, dtype = np.dtype(object)).sort_index() mean_idx = pd.MultiIndex.from_product([experiment[x].unique() for x in self.by] + [components] + [self.channels], names = list(self.by) + ["Component"] + ["Channel"]) mean_stat = pd.Series(name = "{} : {}".format(self.name, "mean"), index = mean_idx, dtype = np.dtype(object)).sort_index() sigma_stat = pd.Series(name = "{} : {}".format(self.name, "sigma"), index = mean_idx, dtype = 
np.dtype(object)).sort_index() interval_stat = pd.Series(name = "{} : {}".format(self.name, "interval"), index = mean_idx, dtype = np.dtype(object)).sort_index() corr_idx = pd.MultiIndex.from_product([experiment[x].unique() for x in self.by] + [components] + [self.channels] + [self.channels], names = list(self.by) + ["Component"] + ["Channel_1"] + ["Channel_2"]) corr_stat = pd.Series(name = "{} : {}".format(self.name, "correlation"), index = corr_idx, dtype = np.dtype(object)).sort_index() for group, data_subset in groupby: if group not in self._gmms: # there weren't any events in this group, so we didn't get # a gmm. continue gmm = self._gmms[group] x = data_subset.loc[:, self.channels[:]] for c in self.channels: x[c] = self._scale[c](x[c]) # which values are missing? x_na = pd.Series([False] * len(x)) for c in self.channels: x_na[np.isnan(x[c]).values] = True x = x.values x_na = x_na.values group_idx = groupby.groups[group] if self.num_components > 1: predicted = np.full(len(x), -1, "int") predicted[~x_na] = gmm.predict(x[~x_na]) predicted_str = pd.Series(["(none)"] * len(predicted)) for c in range(0, self.num_components): predicted_str[predicted == c] = "{0}_{1}".format(self.name, c + 1) predicted_str[predicted == -1] = "{0}_None".format(self.name) predicted_str.index = group_idx event_assignments.iloc[group_idx] = predicted_str # if we're doing sigma-based gating, for each component check # to see if the event is in the sigma gate. if self.sigma is not None: for c in range(self.num_components): s = np.linalg.pinv(gmm.covariances_[c]) mu = gmm.means_[c] # compute the Mahalanobis distance f = lambda x, mu, s: np.dot(np.dot((x - mu).T, s), (x - mu)) dist = np.apply_along_axis(f, 1, x, mu, s) # come up with a threshold based on sigma. you'll note we # didn't sqrt dist: that's because for a multivariate # Gaussian, the square of the Mahalanobis distance is # chi-square distributed p = (scipy.stats.norm.cdf(self.sigma) - 0.5) * 2 thresh = scipy.stats.chi2.ppf(p, 1) event_gate[c].iloc[group_idx] = np.less_equal(dist, thresh) if self.posteriors: p = np.full((len(x), self.num_components), 0.0) p[~x_na] = gmm.predict_proba(x[~x_na]) for c in range(self.num_components): event_posteriors[c].iloc[group_idx] = p[:, c] for c in range(self.num_components): if len(self.by) == 0: g = tuple([c + 1]) elif hasattr(group, '__iter__') and not isinstance(group, (str, bytes)): g = tuple(list(group) + [c + 1]) else: g = tuple([group] + [c + 1]) prop_stat.at[g] = gmm.weights_[c] for cidx1, channel1 in enumerate(self.channels): g2 = tuple(list(g) + [channel1]) mean_stat.at[g2] = self._scale[channel1].inverse(gmm.means_[c, cidx1]) s, corr = util.cov2corr(gmm.covariances_[c]) sigma_stat[g2] = (self._scale[channel1].inverse(s[cidx1])) interval_stat.at[g2] = (self._scale[channel1].inverse(gmm.means_[c, cidx1] - s[cidx1]), self._scale[channel1].inverse(gmm.means_[c, cidx1] + s[cidx1])) for cidx2, channel2 in enumerate(self.channels): g3 = tuple(list(g2) + [channel2]) corr_stat[g3] = corr[cidx1, cidx2] corr_stat.drop(tuple(list(g2) + [channel1]), inplace = True) new_experiment = experiment.clone() if self.num_components > 1: new_experiment.add_condition(self.name, "category", event_assignments) if self.sigma is not None: for c in range(self.num_components): gate_name = "{}_{}".format(self.name, c + 1) new_experiment.add_condition(gate_name, "bool", event_gate[c]) if self.posteriors: for c in range(self.num_components): post_name = "{}_{}_posterior".format(self.name, c + 1) new_experiment.add_condition(post_name, 
"double", event_posteriors[c]) new_experiment.statistics[(self.name, "mean")] = pd.to_numeric(mean_stat) new_experiment.statistics[(self.name, "sigma")] = sigma_stat new_experiment.statistics[(self.name, "interval")] = interval_stat if len(corr_stat) > 0: new_experiment.statistics[(self.name, "correlation")] = pd.to_numeric(corr_stat) if self.num_components > 1: new_experiment.statistics[(self.name, "proportion")] = pd.to_numeric(prop_stat) new_experiment.history.append(self.clone_traits(transient = lambda _: True)) return new_experiment def default_view(self, **kwargs): """ Returns a diagnostic plot of the Gaussian mixture model. Returns ------- IView : an IView, call plot() to see the diagnostic plot. """ channels = kwargs.pop('channels', self.channels) scale = kwargs.pop('scale', self.scale) for c in channels: if c not in self.channels: raise util.CytoflowViewError('channels', "Channel {} isn't in the operation's channels" .format(c)) for s in scale: if s not in self.channels: raise util.CytoflowViewError('scale', "Channel {} isn't in the operation's channels" .format(s)) for c in channels: if c not in scale: scale[c] = util.get_default_scale() if len(channels) == 0: raise util.CytoflowViewError('channels', "Must specify at least one channel for a default view") elif len(channels) == 1: v = GaussianMixture1DView(op = self) v.trait_set(channel = channels[0], scale = scale[channels[0]], **kwargs) return v elif len(channels) == 2: v = GaussianMixture2DView(op = self) v.trait_set(xchannel = channels[0], ychannel = channels[1], xscale = scale[channels[0]], yscale = scale[channels[1]], **kwargs) return v else: raise util.CytoflowViewError('channels', "Can't specify more than two channels for a default view")
class BinningOp(HasStrictTraits): """ Bin data along an axis. This operation creates equally spaced bins (in linear or log space) along an axis and adds a metadata column assigning each event to a bin. Attributes ---------- name : Str The operation name. Used to name the new metadata field in the experiment that's created by apply() channel : Str The name of the channel along which to bin. scale : Enum("linear", "log", "logicle") Make the bins equidistant along what scale? num_bins : Int The number of bins to make. Must set either `num_bins` or `bin_width`. If both are defined, `num_bins` takes precedence. bin_width : Float The width of the bins. Must set either `num_bins` or `bin_width`. If `scale` is `log`, `bin_width` is in log-10 units; if `scale` is `logicle`, an error is thrown because the units are ill-defined. If both `num_bins` and `bin_width` are defined, `num_bins` takes precedence. bin_count_name : Str If `bin_count_name` is set, add another piece of metadata when calling `apply()` that contains the number of events in the bin that this event falls in. Useful for filtering bins by # of events. Examples -------- >>> bin_op = flow.BinningOp(name = "CFP_Bin", ... channel = "PE-Tx-Red-YG-A", ... scale = "linear", ... num_bins = 40) >>> ex5_binned = bin_op.apply(ex5) >>> h.huefacet = "CFP_Bin" >>> h.plot(ex5_binned) """ # traits id = Constant('edu.mit.synbio.cytoflow.operations.binning') friendly_id = Constant("Binning") name = CStr() bin_count_name = CStr() channel = Str() num_bins = util.PositiveInt(Undefined) bin_width = util.PositiveFloat(Undefined) scale = util.ScaleEnum def apply(self, experiment): """Applies the binning to an experiment. Parameters ---------- experiment : Experiment the old_experiment to which this op is applied Returns ------- a new experiment, the same as old_experiment but with a new column with the same name as the operation, containing the index of the bin each event falls in.
""" if not experiment: raise util.CytoflowOpError("no experiment specified") if not self.name: raise util.CytoflowOpError("name is not set") if self.name in experiment.data.columns: raise util.CytoflowOpError("name {0} is in the experiment already" .format(self.name)) if self.bin_count_name and self.bin_count_name in experiment.data.columns: raise util.CytoflowOpError("bin_count_name {0} is in the experiment already" .format(self.bin_count_name)) if not self.channel: raise util.CytoflowOpError("channel is not set") if self.channel not in experiment.data.columns: raise util.CytoflowOpError("channel {0} isn't in the experiment" .format(self.channel)) if self.num_bins is Undefined and self.bin_width is Undefined: raise util.CytoflowOpError("must set either bin number or width") if self.num_bins is Undefined \ and not (self.scale == "linear" or self.scale == "log"): raise util.CytoflowOpError("Can only use bin_width with linear or log scale") scale = util.scale_factory(self.scale, experiment, self.channel) scaled_data = scale(experiment.data[self.channel]) channel_min = bn.nanmin(scaled_data) channel_max = bn.nanmax(scaled_data) num_bins = self.num_bins if self.num_bins is not Undefined else \ (channel_max - channel_min) / self.bin_width bins = np.linspace(start = channel_min, stop = channel_max, num = num_bins) # bins need to be internal; drop the first and last one bins = bins[1:-1] new_experiment = experiment.clone() new_experiment.add_condition(self.name, "int", np.digitize(scaled_data, bins)) # if we're log-scaled (for example), don't label data that isn't # showable on a log scale! new_experiment.data.ix[np.isnan(scaled_data), self.name] = np.NaN # keep track of the bins we used, for pretty plotting later. new_experiment.metadata[self.name]["bin_scale"] = self.scale new_experiment.metadata[self.name]["bins"] = bins if self.bin_count_name: # TODO - this is a HUGE memory hog?! agg_count = new_experiment.data.groupby(self.name).count() agg_count = agg_count[agg_count.columns[0]] # have to make the condition a float64, because if we're in log # space there may be events that have NaN as the bin number. new_experiment.add_condition( self.bin_count_name, "float64", new_experiment[self.name].map(agg_count)) new_experiment.history.append(self.clone_traits()) return new_experiment def default_view(self, **kwargs): return BinningView(op = self, **kwargs)
class GaussianMixture2DOp(HasStrictTraits): """ This module fits a 2D Gaussian mixture model with a specified number of components to a pair of channels. .. warning:: :class:`GaussianMixture2DOp` is **DEPRECATED** and will be removed in a future release. It doesn't correctly handle the case where an event is present in more than one component. Please use :class:`GaussianMixtureOp` instead! Creates a new categorical metadata variable named :attr:`name`, with possible values ``name_1`` .... ``name_n`` where ``n`` is the number of components. An event is assigned to ``name_i`` category if it falls within :attr:`sigma` standard deviations of the component's mean. If that is true for multiple categories (or if :attr:`sigma` is ``0.0``), the event is assigned to the category with the highest posterior probability. If the event doesn't fall into any category, it is assigned to ``name_None``. As a special case, if :attr:`num_components` is ``1`` and :attr:`sigma` ``> 0.0``, then the new condition is boolean, ``True`` if the event fell in the gate and ``False`` otherwise. Optionally, if :attr:`posteriors` is ``True``, this module will also compute the posterior probability of each event in its assigned component, returning it in a new column named ``{Name}_Posterior``. Finally, the same mixture model (mean and standard deviation) may not be appropriate for every subset of the data. If this is the case, you can use the :attr:`by` attribute to specify metadata by which to aggregate the data before estimating (and applying) a mixture model. The number of components is the same across each subset, though. Attributes ---------- name : Str The operation name; determines the name of the new metadata column xchannel : Str The X channel to apply the mixture model to. ychannel : Str The Y channel to apply the mixture model to. xscale : {"linear", "logicle", "log"} (default = "linear") Re-scale the data on the X axis before fitting the data? yscale : {"linear", "logicle", "log"} (default = "linear") Re-scale the data on the Y axis before fitting the data? num_components : Int (default = 1) How many components to fit to the data? Must be positive. sigma : Float (default = 0.0) How many standard deviations on either side of the mean to include in each category? If an event is in multiple components, assign it to the component with the highest posterior probability. If :attr:`sigma` is ``0.0``, categorize *all* the data by assigning each event to the component with the highest posterior probability. Must be ``>= 0.0``. by : List(Str) A list of metadata attributes to aggregate the data before estimating the model. For example, if the experiment has two pieces of metadata, ``Time`` and ``Dox``, setting :attr:`by` to ``["Time", "Dox"]`` will fit the model separately to each subset of the data with a unique combination of ``Time`` and ``Dox``. posteriors : Bool (default = False) If ``True``, add a column named ``{Name}_Posterior`` giving the posterior probability that the event is in the component to which it was assigned. Useful for filtering out low-probability events. Examples -------- .. plot:: :context: close-figs Make a little data set. >>> import cytoflow as flow >>> import_op = flow.ImportOp() >>> import_op.tubes = [flow.Tube(file = "Plate01/RFP_Well_A3.fcs", ... conditions = {'Dox' : 10.0}), ... flow.Tube(file = "Plate01/CFP_Well_A4.fcs", ... conditions = {'Dox' : 1.0})] >>> import_op.conditions = {'Dox' : 'float'} >>> ex = import_op.apply() Create and parameterize the operation. ..
plot:: :context: close-figs >>> gm_op = flow.GaussianMixture2DOp(name = 'Flow', ... xchannel = 'V2-A', ... xscale = 'log', ... ychannel = 'Y2-A', ... yscale = 'log', ... num_components = 2) Estimate the clusters .. plot:: :context: close-figs >>> gm_op.estimate(ex) Plot a diagnostic view with the distributions .. plot:: :context: close-figs >>> gm_op.default_view().plot(ex) Apply the gate .. plot:: :context: close-figs >>> ex2 = gm_op.apply(ex) Plot a diagnostic view with the event assignments .. plot:: :context: close-figs >>> gm_op.default_view().plot(ex2) """ id = Constant('edu.mit.synbio.cytoflow.operations.gaussian_2d') friendly_id = Constant("2D Gaussian Mixture") name = CStr() xchannel = Str() ychannel = Str() xscale = util.ScaleEnum yscale = util.ScaleEnum num_components = util.PositiveInt sigma = util.PositiveFloat(0.0, allow_zero=True) by = List(Str) posteriors = Bool(False) # the key is either a single value or a tuple _gmms = Dict(Any, Instance(mixture.GaussianMixture), transient=True) _xscale = Instance(util.IScale, transient=True) _yscale = Instance(util.IScale, transient=True) def estimate(self, experiment, subset=None): """ Estimate the Gaussian mixture model parameters. Parameters ---------- experiment : Experiment The data to use to estimate the mixture parameters subset : str (default = None) If set, a Python expression to determine the subset of the data to use to in the estimation. """ warn( "GaussianMixture2DOp is DEPRECATED. Please use GaussianMixtureOp.", util.CytoflowOpWarning) if experiment is None: raise util.CytoflowOpError('experiment', "No experiment specified") if self.xchannel not in experiment.data: raise util.CytoflowOpError( 'xchannel', "Column {0} not found in the experiment".format(self.xchannel)) if self.ychannel not in experiment.data: raise util.CytoflowOpError( 'ychannel', "Column {0} not found in the experiment".format(self.ychannel)) for b in self.by: if b not in experiment.data: raise util.CytoflowOpError( 'by', "Aggregation metadata {} not found, " "must be one of {}".format(b, experiment.conditions)) if self.num_components == 1 and self.posteriors: raise util.CytoflowOpError( 'posteriors', "If num_components == 1, all posteriors are 1.") if subset: try: experiment = experiment.query(subset) except Exception as e: raise util.CytoflowOpError( 'subset', "Subset string '{0}' isn't valid".format(subset)) from e if len(experiment) == 0: raise util.CytoflowOpError( 'subset', "Subset string '{0}' returned no events".format(subset)) if self.by: groupby = experiment.data.groupby(self.by) else: # use a lambda expression to return a group that contains # all the events groupby = experiment.data.groupby(lambda _: True) # get the scale. estimate the scale params for the ENTIRE data set, # not subsets we get from groupby(). 
And we need to save it so that # the data is transformed the same way when we apply() self._xscale = util.scale_factory(self.xscale, experiment, channel=self.xchannel) self._yscale = util.scale_factory(self.yscale, experiment, channel=self.ychannel) gmms = {} for group, data_subset in groupby: if len(data_subset) == 0: raise util.CytoflowOpError( None, "Group {} had no data".format(group)) x = data_subset.loc[:, [self.xchannel, self.ychannel]] x[self.xchannel] = self._xscale(x[self.xchannel]) x[self.ychannel] = self._yscale(x[self.ychannel]) # drop data that isn't in the scale range x = x[~(np.isnan(x[self.xchannel]) | np.isnan(x[self.ychannel]))] x = x.values gmm = mixture.GaussianMixture(n_components=self.num_components, covariance_type="full", random_state=1) gmm.fit(x) if not gmm.converged_: raise util.CytoflowOpError( None, "Estimator didn't converge" " for group {0}".format(group)) # in the 1D version, we sort the components by the means -- so # the first component has the lowest mean, the second component # has the next-lowest mean, etc. that doesn't work in a 2D area, # obviously. # instead, we assume that the clusters are likely (?) to be # arranged along *one* of the axes, so we take the |norm| of the # x,y mean of each cluster and sort that way. norms = (gmm.means_[:, 0]**2 + gmm.means_[:, 1]**2)**0.5 sort_idx = np.argsort(norms) gmm.means_ = gmm.means_[sort_idx] gmm.weights_ = gmm.weights_[sort_idx] gmm.covariances_ = gmm.covariances_[sort_idx] gmms[group] = gmm self._gmms = gmms def apply(self, experiment): """ Assigns new metadata to events using the mixture model estimated in :meth:`estimate`. Returns ------- Experiment A new :class:`.Experiment` with a column named :attr:`name` and optionally one named :attr:`name` ``_Posterior``. Also includes the following new statistics: - **xmean** : Float the mean of the fitted gaussian in the x dimension. - **ymean** : Float the mean of the fitted gaussian in the y dimension. - **proportion** : Float the proportion of events in each component of the mixture model. only set if :attr:`num_components` ``> 1``. PS -- if someone has good ideas for summarizing spread in a 2D (non-isotropic) Gaussian, or other useful statistics, let me know! """ warn( "GaussianMixture2DOp is DEPRECATED. Please use GaussianMixtureOp.", util.CytoflowOpWarning) if experiment is None: raise util.CytoflowOpError('experiment', "No experiment specified") if not self.xchannel: raise util.CytoflowOpError('xchannel', "Must set X channel") if not self.ychannel: raise util.CytoflowOpError('ychannel', "Must set Y channel") # make sure name got set! if not self.name: raise util.CytoflowOpError( 'name', "You have to set the gate's name " "before applying it!") if self.name in experiment.data.columns: raise util.CytoflowOpError( 'name', "Experiment already has a column named {0}".format(self.name)) if not self._gmms: raise util.CytoflowOpError( None, "No components found. Did you forget to " "call estimate()?") if not self._xscale: raise util.CytoflowOpError( None, "Couldn't find _xscale. What happened??") if not self._yscale: raise util.CytoflowOpError( None, "Couldn't find _yscale. 
What happened??") if self.xchannel not in experiment.data: raise util.CytoflowOpError( 'xchannel', "Column {0} not found in the experiment".format(self.xchannel)) if self.ychannel not in experiment.data: raise util.CytoflowOpError( 'ychannel', "Column {0} not found in the experiment".format(self.ychannel)) if self.posteriors: col_name = "{0}_Posterior".format(self.name) if col_name in experiment.data: raise util.CytoflowOpError( 'channels', "Column {0} already found in the experiment".format( col_name)) for b in self.by: if b not in experiment.data: raise util.CytoflowOpError( 'by', "Aggregation metadata {} not found, " "must be one of {}".format(b, experiment.conditions)) if self.sigma < 0.0: raise util.CytoflowOpError('sigma', "sigma must be >= 0.0") event_assignments = pd.Series([None] * len(experiment), dtype="object") if self.posteriors: event_posteriors = pd.Series([0.0] * len(experiment)) # what we DON'T want to do is iterate through event-by-event. # the more of this we can push into numpy, sklearn and pandas, # the faster it's going to be. for example, this is why # we don't use Ellipse.contains(). if self.by: groupby = experiment.data.groupby(self.by) else: # use a lambda expression to return a group that # contains all the events groupby = experiment.data.groupby(lambda _: True) for group, data_subset in groupby: if group not in self._gmms: # there weren't any events in this group, so we didn't get # a gmm. continue gmm = self._gmms[group] x = data_subset.loc[:, [self.xchannel, self.ychannel]] x[self.xchannel] = self._xscale(x[self.xchannel]) x[self.ychannel] = self._yscale(x[self.ychannel]) # which values are missing? x_na = np.isnan(x[self.xchannel]) | np.isnan(x[self.ychannel]) x_na = x_na.values x = x.values group_idx = groupby.groups[group] # make a preliminary assignment predicted = np.full(len(x), -1, "int") predicted[~x_na] = gmm.predict(x[~x_na]) # if we're doing sigma-based gating, for each component check # to see if the event is in the sigma gate. if self.sigma > 0.0: # make a quick dataframe with the value and the predicted # component gate_df = pd.DataFrame({ "x": x[:, 0], "y": x[:, 1], "p": predicted }) # for each component, get the ellipse that follows the isoline # around the mixture component # cf. http://scikit-learn.org/stable/auto_examples/mixture/plot_gmm.html # and http://www.mathworks.com/matlabcentral/newsreader/view_thread/298389 # and http://stackoverflow.com/questions/7946187/point-and-ellipse-rotated-position-test-algorithm # i am not proud of how many tries this took me to get right. for c in range(0, self.num_components): mean = gmm.means_[c] covar = gmm.covariances_[c] # xc is the center on the x axis # yc is the center on the y axis xc = mean[0] # @UnusedVariable yc = mean[1] # @UnusedVariable v, w = linalg.eigh(covar) u = w[0] / linalg.norm(w[0]) # xl is the length along the x axis # yl is the length along the y axis xl = np.sqrt(v[0]) * self.sigma # @UnusedVariable yl = np.sqrt(v[1]) * self.sigma # @UnusedVariable # t is the rotation in radians (counter-clockwise) t = 2 * np.pi - np.arctan(u[1] / u[0]) sin_t = np.sin(t) # @UnusedVariable cos_t = np.cos(t) # @UnusedVariable # and build an expression with numexpr so it evaluates fast! 
gate_bool = gate_df.eval( "p == @c and " "((x - @xc) * @cos_t - (y - @yc) * @sin_t) ** 2 / ((@xl / 2) ** 2) + " "((x - @xc) * @sin_t + (y - @yc) * @cos_t) ** 2 / ((@yl / 2) ** 2) <= 1" ).values predicted[np.logical_and(predicted == c, gate_bool == False)] = -1 predicted_str = pd.Series(["(none)"] * len(predicted)) for c in range(0, self.num_components): predicted_str[predicted == c] = "{0}_{1}".format( self.name, c + 1) predicted_str[predicted == -1] = "{0}_None".format(self.name) predicted_str.index = group_idx event_assignments.iloc[group_idx] = predicted_str if self.posteriors: probability = np.full((len(x), self.num_components), 0.0, "float") probability[~x_na, :] = gmm.predict_proba(x[~x_na, :]) posteriors = pd.Series([0.0] * len(predicted)) for c in range(0, self.num_components): posteriors[predicted == c] = probability[predicted == c, c] posteriors.index = group_idx event_posteriors.iloc[group_idx] = posteriors new_experiment = experiment.clone() if self.num_components == 1 and self.sigma > 0: new_experiment.add_condition( self.name, "bool", event_assignments == "{0}_1".format(self.name)) elif self.num_components > 1: new_experiment.add_condition(self.name, "category", event_assignments) if self.posteriors and self.num_components > 1: col_name = "{0}_Posterior".format(self.name) new_experiment.add_condition(col_name, "float", event_posteriors) # add the statistics levels = list(self.by) if self.num_components > 1: levels.append(self.name) if levels: idx = pd.MultiIndex.from_product( [new_experiment[x].unique() for x in levels], names=levels) xmean_stat = pd.Series(index=idx, dtype=np.dtype(object)).sort_index() ymean_stat = pd.Series(index=idx, dtype=np.dtype(object)).sort_index() prop_stat = pd.Series(index=idx, dtype=np.dtype(object)).sort_index() for group, _ in groupby: gmm = self._gmms[group] for c in range(self.num_components): if self.num_components > 1: component_name = "{}_{}".format(self.name, c + 1) if group is True: g = [component_name] elif isinstance(group, tuple): g = list(group) g.append(component_name) else: g = list([group]) g.append(component_name) if len(g) > 1: g = tuple(g) else: g = g[0] else: g = group xmean_stat.loc[g] = self._xscale.inverse(gmm.means_[c][0]) ymean_stat.loc[g] = self._yscale.inverse(gmm.means_[c][0]) prop_stat.loc[g] = gmm.weights_[c] new_experiment.statistics[(self.name, "xmean")] = pd.to_numeric(xmean_stat) new_experiment.statistics[(self.name, "ymean")] = pd.to_numeric(ymean_stat) if self.num_components > 1: new_experiment.statistics[( self.name, "proportion")] = pd.to_numeric(prop_stat) new_experiment.history.append( self.clone_traits(transient=lambda _: True)) return new_experiment def default_view(self, **kwargs): """ Returns a diagnostic plot of the Gaussian mixture model. Returns ------- IView : an IView, call :meth:`~GaussianMixture2DView.plot` to see the diagnostic plot. """ warn( "GaussianMixture1DOp is DEPRECATED. Please use GaussianMixtureOp.", util.CytoflowOpWarning) return GaussianMixture2DView(op=self, **kwargs)
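# A sketch of the ellipse construction in apply() above, with a toy covariance
# matrix: the eigenvectors of the component's covariance give the ellipse
# axes, the square roots of the eigenvalues (scaled by sigma) give the axis
# lengths, and the arctangent of the first eigenvector gives the
# counter-clockwise rotation used in the gate expression.
import numpy as np
from scipy import linalg

covar = np.array([[2.0, 0.8],
                  [0.8, 1.0]])
sigma = 2.0

v, w = linalg.eigh(covar)                  # eigenvalues, eigenvectors
u = w[0] / linalg.norm(w[0])               # first eigenvector, normalized
xl = np.sqrt(v[0]) * sigma                 # axis length along x
yl = np.sqrt(v[1]) * sigma                 # axis length along y
t = 2 * np.pi - np.arctan(u[1] / u[0])     # rotation in radians
print(xl, yl, np.degrees(t))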
class BinningOp(HasStrictTraits): """ Bin data along an axis. This operation creates equally spaced bins (in linear or log space) along an axis and adds a condition assigning each event to a bin. The value of the event's condition is the left end of the bin's interval in which the event is located. Attributes ---------- name : Str The operation name. Used to name the new metadata field in the experiment that's created by apply() channel : Str The name of the channel along which to bin. scale : {"linear", "log", "logicle"} Make the bins equidistant along what scale? bin_width : Float The width of the bins. If :attr:`scale` is ``log``, :attr:`bin_width` is in log-10 units; if :attr:`scale` is ``logicle``, an error is thrown because the units are ill-defined. Examples -------- Create a small experiment: .. plot:: :context: close-figs >>> import cytoflow as flow >>> import_op = flow.ImportOp() >>> import_op.tubes = [flow.Tube(file = "tasbe/rby.fcs")] >>> ex = import_op.apply() Create and parameterize the operation .. plot:: :context: close-figs >>> bin_op = flow.BinningOp() >>> bin_op.name = "Bin" >>> bin_op.channel = "FITC-A" >>> bin_op.scale = "log" >>> bin_op.bin_width = 0.2 Apply the operation to the experiment .. plot:: :context: close-figs >>> ex2 = bin_op.apply(ex) Plot the result .. plot:: :context: close-figs >>> bin_op.default_view().plot(ex2) """ # traits id = Constant('edu.mit.synbio.cytoflow.operations.binning') friendly_id = Constant("Binning") name = Str bin_count_name = Str channel = Str num_bins = util.Removed(err_string="'num_bins' was removed in 0.9") bin_width = util.PositiveFloat(None, allow_zero=False, allow_none=True) scale = util.ScaleEnum _max_num_bins = Int(100) def apply(self, experiment): """ Applies the binning to an experiment. Parameters ---------- experiment : Experiment the old_experiment to which this op is applied Returns ------- Experiment A new experiment with a condition column named :attr:`name`, which contains the location of the left-most edge of the bin that the event is in. If :attr:`bin_count_name` is set, another column is added with that name as well, containing the number of events in the same bin as the event. """ if experiment is None: raise util.CytoflowOpError('experiment', "no experiment specified") if not self.name: raise util.CytoflowOpError('name', "Name is not set") if self.name != util.sanitize_identifier(self.name): raise util.CytoflowOpError( 'name', "Name can only contain letters, numbers and underscores.". 
format(self.name)) if self.name in experiment.data.columns: raise util.CytoflowOpError( 'name', "Name {} is in the experiment already".format(self.name)) if self.bin_count_name and self.bin_count_name in experiment.data.columns: raise util.CytoflowOpError( 'bin_count_name', "bin_count_name {} is in the experiment already".format( self.bin_count_name)) if not self.channel: raise util.CytoflowOpError('channel', "channel is not set") if self.channel not in experiment.data.columns: raise util.CytoflowOpError( 'channel', "channel {} isn't in the experiment".format(self.channel)) if self.bin_width is None: raise util.CytoflowOpError('bin_width', "must set bin width") if not (self.scale == "linear" or self.scale == "log"): raise util.CytoflowOpError( 'scale', "Can only use binning op with linear or log scale") scale = util.scale_factory(self.scale, experiment, channel=self.channel) scaled_min = scale(scale.clip(experiment.data[self.channel]).min()) scaled_max = scale(scale.clip(experiment.data[self.channel]).max()) if self.scale == 'linear': start = 0 else: start = 1 scaled_bins_left = np.arange(start=-1.0 * start, stop=(-1.0 * scaled_min) + self.bin_width, step=self.bin_width) * -1.0 scaled_bins_left = scaled_bins_left[::-1][:-1] scaled_bins_right = np.arange(start=start, stop=scaled_max + self.bin_width, step=self.bin_width) scaled_bins = np.append(scaled_bins_left, scaled_bins_right) if len(scaled_bins) > self._max_num_bins: raise util.CytoflowOpError( None, "Too many bins! To increase this limit, " "change _max_num_bins (currently {})".format( self._max_num_bins)) if len(scaled_bins) < 2: raise util.CytoflowOpError('bin_width', "Must have more than one bin") # now, back into data space bins = scale.inverse(scaled_bins) # reduce to 4 sig figs bins = ['%.4g' % x for x in bins] bins = [float(x) for x in bins] bins = np.array(bins) # put the data in bins bin_idx = np.digitize(experiment.data[self.channel], bins[1:-1]) new_experiment = experiment.clone() new_experiment.add_condition(self.name, "float64", bins[bin_idx]) # keep track of the bins we used, for prettier plotting later. new_experiment.metadata[self.name]["bin_scale"] = self.scale new_experiment.metadata[self.name]["bins"] = bins if self.bin_count_name: # TODO - this is a HUGE memory hog?! # TODO - fix this, then turn it on by default agg_count = new_experiment.data.groupby(self.name).count() agg_count = agg_count[agg_count.columns[0]] # have to make the condition a float64, because if we're in log # space there may be events that have NaN as the bin number. new_experiment.add_condition( self.bin_count_name, "float64", new_experiment[self.name].map(agg_count)) new_experiment.history.append( self.clone_traits(transient=lambda _: True)) return new_experiment def default_view(self, **kwargs): """ Returns a diagnostic plot to check the binning. Returns ------- IView An view instance, call :meth:`plot()` to plot the bins. """ v = BinningView(op=self) v.trait_set(**kwargs) return v
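# A sketch of the fixed-width edge construction in apply() above, with toy
# values: edges are grown outward from the origin with np.arange so that
# boundaries land on exact multiples of bin_width, built separately for the
# negative and positive sides and then stitched together.
import numpy as np

bin_width, scaled_min, scaled_max, start = 0.5, -1.2, 1.8, 0
left = np.arange(start=-1.0 * start,
                 stop=(-1.0 * scaled_min) + bin_width,
                 step=bin_width) * -1.0
left = left[::-1][:-1]                     # ascending order, drop the duplicate zero
right = np.arange(start=start, stop=scaled_max + bin_width, step=bin_width)
print(np.append(left, right))              # [-1.5 -1. -0.5 0. 0.5 1. 1.5 2.]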
class GaussianMixture1DOp(HasStrictTraits): """ This module fits a Gaussian mixture model with a specified number of components to a channel. Creates a new categorical metadata variable named `name`, with possible values `name_1` .... `name_n` where `n` is the number of components. An event is assigned to `name_i` category if it falls within `sigma` standard deviations of the component's mean. If that is true for multiple categories (or if `sigma == 0.0`), the event is assigned to the category with the highest posterior probability. If the event doesn't fall into any category, it is assigned to `name_None`. As a special case, if `num_components` is `1` and `sigma` > 0.0, then the new condition is boolean, `True` if the event fell in the gate and `False` otherwise. Optionally, if `posteriors` is `True`, this module will also compute the posterior probability of each event in its assigned component, returning it in a new column named `{Name}_Posterior`. Finally, the same mixture model (mean and standard deviation) may not be appropriate for every subset of the data. If this is the case, you can use the `by` attribute to specify metadata by which to aggregate the data before estimating (and applying) a mixture model. The number of components is the same across each subset, though. Attributes ---------- name : Str The operation name; determines the name of the new metadata column channel : Str Which channel to apply the mixture model to. num_components : Int (default = 1) How many components to fit to the data? Must be positive. sigma : Float (default = 0.0) How many standard deviations on either side of the mean to include in each category? If an event is in multiple components, assign it to the component with the highest posterior probability. If `sigma == 0.0`, categorize *all* the data by assigning each event to the component with the highest posterior probability. Must be >= 0.0. by : List(Str) A list of metadata attributes to aggregate the data before estimating the model. For example, if the experiment has two pieces of metadata, `Time` and `Dox`, setting `by = ["Time", "Dox"]` will fit the model separately to each subset of the data with a unique combination of `Time` and `Dox`. scale : Enum("linear", "log", "logicle") (default = "linear") Re-scale the data before fitting the data? posteriors : Bool (default = False) If `True`, add a column named `{Name}_Posterior` giving the posterior probability that the event is in the component to which it was assigned. Useful for filtering out low-probability events. Examples -------- >>> gauss_op = GaussianMixture1DOp(name = "Gaussian", ... channel = "Y2-A", ...
num_components = 2) >>> gauss_op.estimate(ex2) >>> gauss_op.default_view().plot(ex2) >>> ex3 = gauss_op.apply(ex2) """ id = Constant('edu.mit.synbio.cytoflow.operations.gaussian_1d') friendly_id = Constant("1D Gaussian Mixture") name = CStr() channel = Str() num_components = util.PositiveInt(1) sigma = util.PositiveFloat(0.0, allow_zero = True) by = List(Str) scale = util.ScaleEnum posteriors = Bool(False) # the key is either a single value or a tuple _gmms = Dict(Any, Instance(mixture.GMM), transient = True) _scale = Instance(util.IScale, transient = True) def estimate(self, experiment, subset = None): """ Estimate the Gaussian mixture model parameters """ if not experiment: raise util.CytoflowOpError("No experiment specified") if self.channel not in experiment.data: raise util.CytoflowOpError("Column {0} not found in the experiment" .format(self.channel)) for b in self.by: if b not in experiment.data: raise util.CytoflowOpError("Aggregation metadata {0} not found" " in the experiment" .format(b)) if len(experiment.data[b].unique()) > 100: #WARNING - magic number raise util.CytoflowOpError("More than 100 unique values found for" " aggregation metadata {0}. Did you" " accidentally specify a data channel?" .format(b)) if self.num_components == 1 and self.sigma == 0.0: raise util.CytoflowOpError("If num_components == 1, sigma must be > 0") if subset: try: experiment = experiment.query(subset) except: raise util.CytoflowViewError("Subset string '{0}' isn't valid" .format(subset)) if len(experiment) == 0: raise util.CytoflowViewError("Subset string '{0}' returned no events" .format(subset)) if self.by: groupby = experiment.data.groupby(self.by) else: # use a lambda expression to return a group that contains # all the events groupby = experiment.data.groupby(lambda x: True) # get the scale. estimate the scale params for the ENTIRE data set, # not subsets we get from groupby(). And we need to save it so that # the data is transformed the same way when we apply() self._scale = util.scale_factory(self.scale, experiment, self.channel) gmms = {} for group, data_subset in groupby: if len(data_subset) == 0: raise util.CytoflowOpError("Group {} had no data" .format(group)) x = data_subset[self.channel].reset_index(drop = True) x = self._scale(x) # drop data that isn't in the scale range #x = pd.Series(self._scale(x)).dropna() x = x[~np.isnan(x)] gmm = mixture.GMM(n_components = self.num_components, random_state = 1) gmm.fit(x[:, np.newaxis]) if not gmm.converged_: raise util.CytoflowOpError("Estimator didn't converge" " for group {0}" .format(group)) # to make sure we have a stable ordering, sort the components # by the means (so the first component has the lowest mean, # the next component has the next-lowest, etc.) sort_idx = np.argsort(gmm.means_[:, 0]) gmm.means_ = gmm.means_[sort_idx] gmm.weights_ = gmm.weights_[sort_idx] gmm.covars_ = gmm.covars_[sort_idx] gmms[group] = gmm self._gmms = gmms def apply(self, experiment): """ Assigns new metadata to events using the mixture model estimated in `estimate`. """ if not experiment: raise util.CytoflowOpError("No experiment specified") if not self._gmms: raise util.CytoflowOpError("No model found. Did you forget to " "call estimate()?") # make sure name got set! 
if not self.name: raise util.CytoflowOpError("You have to set the gate's name " "before applying it!") if self.name in experiment.data.columns: raise util.CytoflowOpError("Experiment already has a column named {0}" .format(self.name)) if not self._scale: raise util.CytoflowOpError("Couldn't find _scale. What happened??") if self.channel not in experiment.data: raise util.CytoflowOpError("Column {0} not found in the experiment" .format(self.channel)) if (self.name + "_Posterior") in experiment.data: raise util.CytoflowOpError("Column {0} already found in the experiment" .format(self.name + "_Posterior")) if self.posteriors: col_name = "{0}_Posterior".format(self.name) if col_name in experiment.data: raise util.CytoflowOpError("Column {0} already found in the experiment" .format(col_name)) for b in self.by: if b not in experiment.data: raise util.CytoflowOpError("Aggregation metadata {0} not found" " in the experiment" .format(b)) if len(experiment.data[b].unique()) > 100: #WARNING - magic number raise util.CytoflowOpError("More than 100 unique values found for" " aggregation metadata {0}. Did you" " accidentally specify a data channel?" .format(b)) if self.sigma < 0.0: raise util.CytoflowOpError("sigma must be >= 0.0") if self.by: groupby = experiment.data.groupby(self.by) else: # use a lambda expression to return a group that # contains all the events groupby = experiment.data.groupby(lambda x: True) for group, data_subset in groupby: if group not in self._gmms: raise util.CytoflowOpError("Can't find group in model. " "Did you call estimate()?") event_assignments = pd.Series([None] * len(experiment), dtype = "object") if self.posteriors: event_posteriors = pd.Series([0.0] * len(experiment)) # what we DON'T want to do is iterate through event-by-event. # the more of this we can push into numpy, sklearn and pandas, # the faster it's going to be. for group, data_subset in groupby: if group not in self._gmms: # there weren't any events in this group, so we didn't get # a gmm. continue gmm = self._gmms[group] x = data_subset[self.channel] x = self._scale(x).values # which values are missing? x_na = np.isnan(x) group_idx = groupby.groups[group] # make a preliminary assignment predicted = np.full(len(x), -1, "int") predicted[~x_na] = gmm.predict(x[~x_na, np.newaxis]) # if we're doing sigma-based gating, for each component check # to see if the event is in the sigma gate. if self.sigma > 0.0: # make a quick dataframe with the value and the predicted # component gate_df = pd.DataFrame({"x" : x, "p" : predicted}) # for each component, get the low and the high threshold for c in range(0, self.num_components): lo = (gmm.means_[c][0] # @UnusedVariable - self.sigma * np.sqrt(gmm.covars_[c][0])) hi = (gmm.means_[c][0] # @UnusedVariable + self.sigma * np.sqrt(gmm.covars_[c][0])) # and build an expression with numexpr so it evaluates fast! 
                    gate_bool = gate_df.eval("p == @c and x >= @lo and x <= @hi").values
                    predicted[np.logical_and(predicted == c, ~gate_bool)] = -1

            predicted_str = pd.Series(["(none)"] * len(predicted))
            for c in range(0, self.num_components):
                predicted_str[predicted == c] = "{0}_{1}".format(self.name, c + 1)
            predicted_str[predicted == -1] = "{0}_None".format(self.name)
            predicted_str.index = group_idx

            event_assignments.iloc[group_idx] = predicted_str

            if self.posteriors:
                probability = np.full((len(x), self.num_components), 0.0, "float")
                probability[~x_na, :] = gmm.predict_proba(x[~x_na, np.newaxis])
                posteriors = pd.Series([0.0] * len(predicted))
                for i in range(0, self.num_components):
                    posteriors[predicted == i] = probability[predicted == i, i]
                posteriors.index = group_idx
                event_posteriors.iloc[group_idx] = posteriors

        new_experiment = experiment.clone()

        if self.num_components == 1:
            new_experiment.add_condition(self.name, "bool",
                                         event_assignments == "{0}_1".format(self.name))
        else:
            new_experiment.add_condition(self.name, "category", event_assignments)

        if self.posteriors:
            col_name = "{0}_Posterior".format(self.name)
            new_experiment.add_condition(col_name, "float", event_posteriors)

        new_experiment.history.append(self.clone_traits(transient = lambda t: True))
        return new_experiment

    def default_view(self, **kwargs):
        """
        Returns a diagnostic plot of the Gaussian mixture model.

        Returns
        -------
        IView : an IView, call plot() to see the diagnostic plot.
        """
        return GaussianMixture1DView(op = self, **kwargs)
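
# ---------------------------------------------------------------------------
# A minimal, standalone sketch (not part of cytoflow) of the per-component
# sigma gate used in apply() above, rewritten against the modern scikit-learn
# GaussianMixture API instead of the deprecated mixture.GMM.  The data and
# names here (`x`, `gate_df`) are hypothetical.

import numpy as np
import pandas as pd
from sklearn.mixture import GaussianMixture

rng = np.random.default_rng(0)
x = np.concatenate([rng.normal(0, 1, 500), rng.normal(5, 1, 500)])

gmm = GaussianMixture(n_components = 2, random_state = 1)
predicted = gmm.fit_predict(x[:, np.newaxis])

sigma = 2.0
gate_df = pd.DataFrame({"x" : x, "p" : predicted})
for c in range(gmm.n_components):
    mean = gmm.means_[c, 0]
    sd = np.sqrt(gmm.covariances_[c, 0, 0])
    lo, hi = mean - sigma * sd, mean + sigma * sd
    in_gate = gate_df.eval("p == @c and x >= @lo and x <= @hi").values
    # events assigned to component c but outside +/- sigma get no assignment
    predicted[(predicted == c) & ~in_gate] = -1
# ---------------------------------------------------------------------------
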
class GaussianMixtureOp(HasStrictTraits):
    """
    This module fits a Gaussian mixture model with a specified number of
    components to one or more channels.

    If `num_components > 1`, `apply()` creates a new categorical metadata
    variable named `name`, with possible values `{name}_1` ... `{name}_n`
    where `n` is the number of components.  An event is assigned to the
    `{name}_i` category if it has the highest posterior probability of having
    been produced by component `i`.  If an event has a value that is outside
    the range of one of the channels' scales, it is assigned to `{name}_None`.

    Optionally, if `sigma` is greater than 0, `apply()` creates new `boolean`
    metadata variables named `{name}_1` ... `{name}_n` where `n` is the
    number of components.  The column `{name}_i` is `True` if the event is
    less than `sigma` standard deviations from the mean of component `i`.
    If `num_components == 1`, `sigma` must be greater than 0.

    Optionally, if `posteriors` is `True`, `apply()` creates new `double`
    metadata variables named `{name}_1_posterior` ... `{name}_n_posterior`
    where `n` is the number of components.  The column `{name}_i_posterior`
    contains the posterior probability that this event is a member of
    component `i`.

    Finally, the same mixture model (mean and standard deviation) may not
    be appropriate for every subset of the data.  If this is the case, you
    can use the `by` attribute to specify metadata by which to aggregate
    the data before estimating (and applying) a mixture model.  The number
    of components must be the same across each subset, though.

    Attributes
    ----------
    name : Str
        The operation name; determines the name of the new metadata column

    channels : List(Str)
        The channels to apply the mixture model to.

    scale : Dict(Str : Enum("linear", "logicle", "log"))
        Re-scale the data in the specified channels before fitting.  If a
        channel is in `channels` but not in `scale`, the current
        package-wide default (set with `set_default_scale`) is used.

    num_components : Int (default = 1)
        How many components to fit to the data?  Must be a positive integer.

    sigma : Float (default = 0.0)
        How many standard deviations on either side of the mean to include
        in the boolean variable `{name}_i`?  Must be >= 0.0.  If
        `num_components == 1`, must be > 0.

    by : List(Str)
        A list of metadata attributes to aggregate the data before
        estimating the model.  For example, if the experiment has two pieces
        of metadata, `Time` and `Dox`, setting `by = ["Time", "Dox"]` will
        fit the model separately to each subset of the data with a unique
        combination of `Time` and `Dox`.

    posteriors : Bool (default = False)
        If `True`, add columns named `{name}_{i}_posterior` giving the
        posterior probability that the event is in component `i`.  Useful
        for filtering out low-probability events.

    Statistics
    ----------
    mean : Float
        the mean of the fitted gaussian in each channel for each component.

    sigma : (Float, Float)
        the locations of the mean +/- one standard deviation in each channel
        for each component.

    correlation : Float
        the correlation coefficient between each pair of channels for each
        component.

    proportion : Float
        the proportion of events in each component of the mixture model.
        only added if `num_components` > 1.

    Notes
    -----
    We use the Mahalanobis distance as a multivariate generalization of the
    number of standard deviations an event is from the mean of the
    multivariate gaussian.  If \vec{x} is an observation from a distribution
    with mean \vec{\mu} and covariance matrix S, then the Mahalanobis
    distance is sqrt((x - mu)^T * S^-1 * (x - mu)).
    Examples
    --------
    >>> gauss_op = GaussianMixtureOp(name = "Gaussian",
    ...                              channels = ["V2-A", "Y2-A"],
    ...                              scale = {"V2-A" : "log"},
    ...                              num_components = 2)
    >>> gauss_op.estimate(ex2)
    >>> gauss_op.default_view(channels = ["V2-A", "Y2-A"]).plot(ex2)
    >>> ex3 = gauss_op.apply(ex2)
    """

    id = Constant('edu.mit.synbio.cytoflow.operations.gaussian')
    friendly_id = Constant("Gaussian Mixture")

    name = CStr()
    channels = List(Str)
    scale = Dict(Str, util.ScaleEnum)
    num_components = util.PositiveInt(1, allow_zero = False)
    sigma = util.PositiveFloat(0.0, allow_zero = True)
    by = List(Str)
    posteriors = Bool(False)

    # the key is either a single value or a tuple
    _gmms = Dict(Any, Instance(sklearn.mixture.GaussianMixture), transient = True)
    _scale = Dict(Str, Instance(util.IScale), transient = True)

    def estimate(self, experiment, subset = None):
        """
        Estimate the Gaussian mixture model parameters
        """

        if experiment is None:
            raise util.CytoflowOpError("No experiment specified")

        if len(self.channels) == 0:
            raise util.CytoflowOpError("Must set at least one channel")

        for c in self.channels:
            if c not in experiment.data:
                raise util.CytoflowOpError("Channel {0} not found in the experiment"
                                           .format(c))

        for c in self.scale:
            if c not in self.channels:
                raise util.CytoflowOpError("Scale set for channel {0}, but it isn't "
                                           "in the channel list"
                                           .format(c))

        for b in self.by:
            if b not in experiment.data:
                raise util.CytoflowOpError("Aggregation metadata {0} not found"
                                           " in the experiment"
                                           .format(b))
            if len(experiment.data[b].unique()) > 100: #WARNING - magic number
                raise util.CytoflowOpError("More than 100 unique values found for"
                                           " aggregation metadata {0}.  Did you"
                                           " accidentally specify a data channel?"
                                           .format(b))

        if subset:
            try:
                experiment = experiment.query(subset)
            except Exception:
                raise util.CytoflowViewError("Subset string '{0}' isn't valid"
                                             .format(subset))

            if len(experiment) == 0:
                raise util.CytoflowViewError("Subset string '{0}' returned no events"
                                             .format(subset))

        if self.by:
            groupby = experiment.data.groupby(self.by)
        else:
            # use a lambda expression to return a group that contains
            # all the events
            groupby = experiment.data.groupby(lambda _: True)

        # get the scale. estimate the scale params for the ENTIRE data set,
        # not subsets we get from groupby().  And we need to save it so that
        # the data is transformed the same way when we apply()
        for c in self.channels:
            if c in self.scale:
                self._scale[c] = util.scale_factory(self.scale[c], experiment, channel = c)
            else:
                self._scale[c] = util.scale_factory(util.get_default_scale(), experiment, channel = c)

        gmms = {}

        for group, data_subset in groupby:
            if len(data_subset) == 0:
                raise util.CytoflowOpError("Group {} had no data"
                                           .format(group))

            x = data_subset.loc[:, self.channels[:]]
            for c in self.channels:
                x[c] = self._scale[c](x[c])

            # drop data that isn't in the scale range
            for c in self.channels:
                x = x[~(np.isnan(x[c]))]
            x = x.values

            gmm = sklearn.mixture.GaussianMixture(n_components = self.num_components,
                                                  covariance_type = "full",
                                                  random_state = 1)
            gmm.fit(x)

            if not gmm.converged_:
                raise util.CytoflowOpError("Estimator didn't converge"
                                           " for group {0}"
                                           .format(group))

            # in the 1D version, we sorted the components by the means -- so
            # the first component has the lowest mean, the second component
            # has the next-lowest mean, etc.

            # that doesn't work in the general case.  instead, we assume that
            # the clusters are likely (?) to be arranged along *one* of the
            # axes, so we take the |norm| of the mean of each cluster and
            # sort that way.
            norms = np.sum(gmm.means_ ** 2, axis = 1) ** 0.5
            sort_idx = np.argsort(norms)
            gmm.means_ = gmm.means_[sort_idx]
            gmm.weights_ = gmm.weights_[sort_idx]
            gmm.covariances_ = gmm.covariances_[sort_idx]

            gmms[group] = gmm

        self._gmms = gmms

    def apply(self, experiment):
        """
        Assigns new metadata to events using the mixture model estimated
        in `estimate`.
        """

        if experiment is None:
            raise util.CytoflowOpError("No experiment specified")

        if len(self.channels) == 0:
            raise util.CytoflowOpError("Must set at least one channel")

        # make sure name got set!
        if not self.name:
            raise util.CytoflowOpError("You have to set the gate's name "
                                       "before applying it!")

        if self.num_components > 1 and self.name in experiment.data.columns:
            raise util.CytoflowOpError("Experiment already has a column named {0}"
                                       .format(self.name))

        if self.sigma > 0:
            for i in range(1, self.num_components + 1):
                cname = "{}_{}".format(self.name, i)
                if cname in experiment.data.columns:
                    raise util.CytoflowOpError("Experiment already has a column named {}"
                                               .format(cname))

        if self.posteriors:
            for i in range(1, self.num_components + 1):
                cname = "{}_{}_posterior".format(self.name, i)
                if cname in experiment.data.columns:
                    raise util.CytoflowOpError("Experiment already has a column named {}"
                                               .format(cname))

        if not self._gmms:
            raise util.CytoflowOpError("No components found.  Did you forget to "
                                       "call estimate()?")

        for c in self.channels:
            if c not in experiment.channels:
                raise util.CytoflowOpError("Channel {0} not found in the experiment"
                                           .format(c))

        for b in self.by:
            if b not in experiment.data:
                raise util.CytoflowOpError("Aggregation metadata {0} not found"
                                           " in the experiment"
                                           .format(b))

            if len(experiment.data[b].unique()) > 100: #WARNING - magic number
                raise util.CytoflowOpError("More than 100 unique values found for"
                                           " aggregation metadata {0}.  Did you"
                                           " accidentally specify a data channel?"
                                           .format(b))
Did you" " accidentally specify a data channel?".format(b)) if self.num_components == 1 and self.sigma == 0.0: raise util.CytoflowOpError( "if num_components is 1, sigma must be > 0.0") if self.num_components == 1 and self.posteriors: raise util.CytoflowOpError( "If num_components == 1, all posteriors will be 1.") if self.num_components > 1: event_assignments = pd.Series(["{}_None".format(self.name)] * len(experiment), dtype="object") if self.sigma > 0: event_gate = { i: pd.Series([False] * len(experiment), dtype="double") for i in range(self.num_components) } if self.posteriors: event_posteriors = { i: pd.Series([0.0] * len(experiment), dtype="double") for i in range(self.num_components) } if self.by: groupby = experiment.data.groupby(self.by) else: # use a lambda expression to return a group that # contains all the events groupby = experiment.data.groupby(lambda _: True) # make the statistics components = [x + 1 for x in range(self.num_components)] prop_idx = pd.MultiIndex.from_product( [experiment[x].unique() for x in self.by] + [components], names=list(self.by) + ["Component"]) prop_stat = pd.Series(index=prop_idx, dtype=np.dtype(object)).sort_index() mean_idx = pd.MultiIndex.from_product( [experiment[x].unique() for x in self.by] + [components] + [self.channels], names=list(self.by) + ["Component"] + ["Channel"]) mean_stat = pd.Series(index=mean_idx, dtype=np.dtype(object)).sort_index() sigma_stat = pd.Series(index=mean_idx, dtype=np.dtype(object)).sort_index() corr_idx = pd.MultiIndex.from_product( [experiment[x].unique() for x in self.by] + [components] + [self.channels] + [self.channels], names=list(self.by) + ["Component"] + ["Channel_1"] + ["Channel_2"]) corr_stat = pd.Series(index=corr_idx, dtype=np.dtype(object)).sort_index() for group, data_subset in groupby: if group not in self._gmms: # there weren't any events in this group, so we didn't get # a gmm. continue gmm = self._gmms[group] x = data_subset.loc[:, self.channels[:]] for c in self.channels: x[c] = self._scale[c](x[c]) # which values are missing? x_na = pd.Series([False] * len(x)) for c in self.channels: x_na[np.isnan(x[c]).values] = True x = x.values x_na = x_na.values group_idx = groupby.groups[group] if self.num_components > 1: predicted = np.full(len(x), -1, "int") predicted[~x_na] = gmm.predict(x[~x_na]) predicted_str = pd.Series(["(none)"] * len(predicted)) for c in range(0, self.num_components): predicted_str[predicted == c] = "{0}_{1}".format( self.name, c + 1) predicted_str[predicted == -1] = "{0}_None".format(self.name) predicted_str.index = group_idx event_assignments.iloc[group_idx] = predicted_str # if we're doing sigma-based gating, for each component check # to see if the event is in the sigma gate. if self.sigma > 0.0: for c in range(self.num_components): s = np.linalg.pinv(gmm.covariances_[c]) mu = gmm.means_[c] # compute the Mahalanobis distance f = lambda x, mu, s: np.dot(np.dot((x - mu).T, s), (x - mu)) dist = np.apply_along_axis(f, 1, x, mu, s) # come up with a threshold based on sigma. 
                    p = (scipy.stats.norm.cdf(self.sigma) - 0.5) * 2
                    thresh = scipy.stats.chi2.ppf(p, 1)

                    event_gate[c].iloc[group_idx] = np.less_equal(dist, thresh)

            if self.posteriors:
                probability = np.full((len(x), self.num_components), 0.0, "float")
                probability[~x_na, :] = gmm.predict_proba(x[~x_na])
                for c in range(self.num_components):
                    event_posteriors[c].iloc[group_idx] = probability[:, c]

            for c in range(self.num_components):
                if len(self.by) == 0:
                    g = [c + 1]
                elif hasattr(group, '__iter__'):
                    g = tuple(list(group) + [c + 1])
                else:
                    g = tuple([group] + [c + 1])

                prop_stat.loc[g] = gmm.weights_[c]

                for cidx1, channel1 in enumerate(self.channels):
                    g2 = tuple(list(g) + [channel1])
                    mean_stat.loc[g2] = self._scale[channel1].inverse(gmm.means_[c, cidx1])

                    s, corr = util.cov2corr(gmm.covariances_[c])
                    sigma_stat.loc[g2] = (self._scale[channel1].inverse(gmm.means_[c, cidx1] - s[cidx1]),
                                          self._scale[channel1].inverse(gmm.means_[c, cidx1] + s[cidx1]))

                    for cidx2, channel2 in enumerate(self.channels):
                        g3 = tuple(list(g2) + [channel2])
                        corr_stat[g3] = corr[cidx1, cidx2]

                    corr_stat.drop(tuple(list(g2) + [channel1]), inplace = True)

        new_experiment = experiment.clone()

        if self.num_components > 1:
            new_experiment.add_condition(self.name, "category", event_assignments)

        if self.sigma > 0:
            for c in range(self.num_components):
                gate_name = "{}_{}".format(self.name, c + 1)
                new_experiment.add_condition(gate_name, "bool", event_gate[c])

        if self.posteriors:
            for c in range(self.num_components):
                post_name = "{}_{}_posterior".format(self.name, c + 1)
                new_experiment.add_condition(post_name, "double", event_posteriors[c])

        new_experiment.statistics[(self.name, "mean")] = pd.to_numeric(mean_stat)
        new_experiment.statistics[(self.name, "sigma")] = sigma_stat
        if len(corr_stat) > 0:
            new_experiment.statistics[(self.name, "correlation")] = pd.to_numeric(corr_stat)
        if self.num_components > 1:
            new_experiment.statistics[(self.name, "proportion")] = pd.to_numeric(prop_stat)

        new_experiment.history.append(self.clone_traits(transient = lambda _: True))
        return new_experiment

    def default_view(self, **kwargs):
        """
        Returns a diagnostic plot of the Gaussian mixture model.

        Returns
        -------
        IView : an IView, call plot() to see the diagnostic plot.
        """
        channels = kwargs.pop('channels', self.channels)

        # take a copy so we don't mutate self.scale below
        scale = dict(kwargs.pop('scale', self.scale))

        for c in channels:
            if c not in self.channels:
                raise util.CytoflowViewError("Channel {} isn't in the operation's channels"
                                             .format(c))

        for s in scale:
            if s not in self.channels:
                raise util.CytoflowViewError("Channel {} isn't in the operation's channels"
                                             .format(s))

        for c in channels:
            if c not in scale:
                scale[c] = util.get_default_scale()

        if len(channels) == 0:
            raise util.CytoflowViewError("Must specify at least one channel for a default view")
        elif len(channels) == 1:
            return GaussianMixture1DView(op = self,
                                         channel = channels[0],
                                         scale = scale[channels[0]],
                                         **kwargs)
        elif len(channels) == 2:
            return GaussianMixture2DView(op = self,
                                         xchannel = channels[0],
                                         ychannel = channels[1],
                                         xscale = scale[channels[0]],
                                         yscale = scale[channels[1]],
                                         **kwargs)
        else:
            raise util.CytoflowViewError("Can't specify more than two channels for a default view")
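
# ---------------------------------------------------------------------------
# A standalone sketch (not part of cytoflow) of the Mahalanobis / chi-squared
# gate used in GaussianMixtureOp.apply() above.  The data and names here are
# hypothetical; the threshold convention (chi2.ppf with 1 degree of freedom)
# mirrors the code above.

import numpy as np
import scipy.stats
from sklearn.mixture import GaussianMixture

rng = np.random.default_rng(0)
x = rng.multivariate_normal([0.0, 0.0], [[1.0, 0.5], [0.5, 2.0]], size = 1000)

gmm = GaussianMixture(n_components = 1, random_state = 1).fit(x)

# squared Mahalanobis distance of every event from the component mean,
# computed as a batched quadratic form instead of apply_along_axis
s = np.linalg.pinv(gmm.covariances_[0])
mu = gmm.means_[0]
dist = np.einsum('ij,jk,ik->i', x - mu, s, x - mu)

# an event is "in the gate" if it's within `sigma` standard deviations;
# the squared distance is compared against a chi-squared quantile
sigma = 2.0
p = (scipy.stats.norm.cdf(sigma) - 0.5) * 2    # 1D mass within +/- sigma
thresh = scipy.stats.chi2.ppf(p, 1)
in_gate = dist <= thresh
# ---------------------------------------------------------------------------
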
class DensityGateOp(HasStrictTraits):
    """
    This module computes a gate based on a 2D density plot.  The user
    chooses what proportion of cells to keep, and the module creates a gate
    that selects that proportion of cells in the highest-density bins of
    the 2D density histogram.

    Attributes
    ----------
    name : Str
        The operation name; determines the name of the new metadata column

    xchannel : Str
        The X channel to apply the gate to.

    ychannel : Str
        The Y channel to apply the gate to.

    xscale : Enum("linear", "logicle", "log") (default = "linear")
        Re-scale the data on the X axis before fitting the data?

    yscale : Enum("linear", "logicle", "log") (default = "linear")
        Re-scale the data on the Y axis before fitting the data?

    keep : Float (default = 0.9)
        What proportion of events to keep?  Must be positive and <= 1.

    bins : Int (default = 100)
        How many bins should there be on each axis?  Must be positive.

    min_quantile : Float (default = 0.001)
        Clip values below this quantile

    max_quantile : Float (default = 1.0)
        Clip values above this quantile

    sigma : Float (default = 1.0)
        What standard deviation to use for the gaussian blur?

    by : List(Str)
        A list of metadata attributes to aggregate the data before
        estimating the gate.  For example, if the experiment has two pieces
        of metadata, `Time` and `Dox`, setting `by = ["Time", "Dox"]` will
        estimate the gate separately for each subset of the data with a
        unique combination of `Time` and `Dox`.

    Notes
    -----
    This gating method was developed by John Sexton, in Jeff Tabor's lab at
    Rice University.  From
    http://taborlab.github.io/FlowCal/fundamentals/density_gate.html,
    the method is as follows:

    1. Determines the number of events to keep, based on the user-specified
       gating fraction and the total number of events of the input sample.

    2. Divides the 2D channel space into a rectangular grid, and counts the
       number of events falling within each bin of the grid.  The number of
       counts per bin across all bins comprises a 2D histogram, which is a
       coarse approximation of the underlying probability density function.

    3. Smooths the histogram generated in Step 2 by applying a Gaussian
       blur.  Theoretically, the proper amount of smoothing results in a
       better estimate of the underlying probability density function.
       Practically, smoothing eliminates isolated bins with high counts,
       most likely corresponding to noise, and smooths the contour of the
       gated region.

    4. Selects the bins with the greatest number of events in the smoothed
       histogram, starting with the highest and proceeding downward until
       the desired number of events to keep, calculated in Step 1, is
       achieved.

    Examples
    --------
    >>> density_op = DensityGateOp(name = "Density",
    ...                            xchannel = "V2-A",
    ...                            ychannel = "Y2-A",
    ...                            keep = 0.7)
    >>> density_op.estimate(ex2)
    >>> density_op.default_view().plot(ex2)
    >>> ex3 = density_op.apply(ex2)
    """

    id = Constant('edu.mit.synbio.cytoflow.operations.density')
    friendly_id = Constant("Density Gate")

    name = CStr()
    xchannel = Str()
    ychannel = Str()
    xscale = util.ScaleEnum
    yscale = util.ScaleEnum
    keep = util.PositiveFloat(0.9, allow_zero = False)
    bins = util.PositiveInt(100, allow_zero = False)
    min_quantile = util.PositiveFloat(0.001, allow_zero = True)
    max_quantile = util.PositiveFloat(1.0, allow_zero = False)
    sigma = util.PositiveFloat(1.0, allow_zero = False)
    by = List(Str)

    _xscale = Instance(util.IScale, transient = True)
    _yscale = Instance(util.IScale, transient = True)

    _xbins = Array(transient = True)
    _ybins = Array(transient = True)

    _keep_xbins = Dict(Any, Array, transient = True)
    _keep_ybins = Dict(Any, Array, transient = True)
    _histogram = Dict(Any, Array, transient = True)

    def estimate(self, experiment, subset = None):
        """
        Estimate the density gate parameters
        """

        if experiment is None:
            raise util.CytoflowOpError("No experiment specified")

        if self.xchannel not in experiment.data:
            raise util.CytoflowOpError("Column {0} not found in the experiment"
                                       .format(self.xchannel))

        if self.ychannel not in experiment.data:
            raise util.CytoflowOpError("Column {0} not found in the experiment"
                                       .format(self.ychannel))

        if self.max_quantile > 1.0 or self.min_quantile > 1.0:
            raise util.CytoflowOpError("min_quantile and max_quantile must be <= 1.0")

        if not (self.max_quantile > self.min_quantile):
            raise util.CytoflowOpError("max_quantile must be > min_quantile")

        if self.sigma < 0.0:
            raise util.CytoflowOpError("sigma must be >= 0.0")

        if self.keep > 1.0:
            raise util.CytoflowOpError("keep must be <= 1.0")

        for b in self.by:
            if b not in experiment.data:
                raise util.CytoflowOpError("Aggregation metadata {0} not found"
                                           " in the experiment"
                                           .format(b))
            if len(experiment.data[b].unique()) > 100: #WARNING - magic number
                raise util.CytoflowOpError("More than 100 unique values found for"
                                           " aggregation metadata {0}.  Did you"
                                           " accidentally specify a data channel?"
                                           .format(b))

        if subset:
            try:
                experiment = experiment.query(subset)
            except Exception:
                raise util.CytoflowViewError("Subset string '{0}' isn't valid"
                                             .format(subset))

            if len(experiment) == 0:
                raise util.CytoflowViewError("Subset string '{0}' returned no events"
                                             .format(subset))

        if self.by:
            groupby = experiment.data.groupby(self.by)
        else:
            # use a lambda expression to return a group that contains
            # all the events
            groupby = experiment.data.groupby(lambda _: True)

        # get the scale. estimate the scale params for the ENTIRE data set,
        # not subsets we get from groupby().  And we need to save it so that
        # the data is transformed the same way when we apply()
        self._xscale = xscale = util.scale_factory(self.xscale, experiment, channel = self.xchannel)
        self._yscale = yscale = util.scale_factory(self.yscale, experiment, channel = self.ychannel)

        xlim = (xscale.clip(experiment[self.xchannel].quantile(self.min_quantile)),
                xscale.clip(experiment[self.xchannel].quantile(self.max_quantile)))
        ylim = (yscale.clip(experiment[self.ychannel].quantile(self.min_quantile)),
                yscale.clip(experiment[self.ychannel].quantile(self.max_quantile)))

        self._xbins = xbins = xscale.inverse(np.linspace(xscale(xlim[0]),
                                                         xscale(xlim[1]),
                                                         self.bins))
        self._ybins = ybins = yscale.inverse(np.linspace(yscale(ylim[0]),
                                                         yscale(ylim[1]),
                                                         self.bins))

        for group, group_data in groupby:
            if len(group_data) == 0:
                raise util.CytoflowOpError("Group {} had no data"
                                           .format(group))

            h, _, _ = np.histogram2d(group_data[self.xchannel],
                                     group_data[self.ychannel],
                                     bins = [xbins, ybins])

            h = scipy.ndimage.filters.gaussian_filter(h, sigma = self.sigma)

            i = scipy.stats.rankdata(h, method = "ordinal") - 1
            i = np.unravel_index(np.argsort(-i), h.shape)

            goal_count = self.keep * len(group_data)
            curr_count = 0
            num_bins = 0

            while curr_count < goal_count and num_bins < i[0].size:
                curr_count += h[i[0][num_bins], i[1][num_bins]]
                num_bins += 1

            self._keep_xbins[group] = i[0][0:num_bins]
            self._keep_ybins[group] = i[1][0:num_bins]
            self._histogram[group] = h

    def apply(self, experiment):
        """
        Assigns new metadata to events using the gate estimated in
        `estimate`.
        """

        if experiment is None:
            raise util.CytoflowOpError("No experiment specified")

        if not self.xchannel:
            raise util.CytoflowOpError("Must set X channel")

        if not self.ychannel:
            raise util.CytoflowOpError("Must set Y channel")

        # make sure name got set!
        if not self.name:
            raise util.CytoflowOpError("You have to set the gate's name "
                                       "before applying it!")

        if self.name in experiment.data.columns:
            raise util.CytoflowOpError("Experiment already has a column named {0}"
                                       .format(self.name))

        if not (self._xbins.size and self._ybins.size and self._keep_xbins):
            raise util.CytoflowOpError("No gate estimate found.  Did you forget to "
                                       "call estimate()?")

        if not self._xscale:
            raise util.CytoflowOpError("Couldn't find _xscale.  What happened??")

        if not self._yscale:
            raise util.CytoflowOpError("Couldn't find _yscale.  What happened??")

        if self.xchannel not in experiment.data:
            raise util.CytoflowOpError("Column {0} not found in the experiment"
                                       .format(self.xchannel))

        if self.ychannel not in experiment.data:
            raise util.CytoflowOpError("Column {0} not found in the experiment"
                                       .format(self.ychannel))

        for b in self.by:
            if b not in experiment.data:
                raise util.CytoflowOpError("Aggregation metadata {0} not found"
                                           " in the experiment"
                                           .format(b))

            if len(experiment.data[b].unique()) > 100: #WARNING - magic number
                raise util.CytoflowOpError("More than 100 unique values found for"
                                           " aggregation metadata {0}.  Did you"
                                           " accidentally specify a data channel?"
                                           .format(b))
Did you" " accidentally specify a data channel?".format(b)) if self.by: groupby = experiment.data.groupby(self.by) else: # use a lambda expression to return a group that # contains all the events groupby = experiment.data.groupby(lambda _: True) event_assignments = pd.Series([False] * len(experiment), dtype="bool") for group, group_data in groupby: if group not in self._keep_xbins: # there weren't any events in this group, so we didn't get # an estimate continue group_idx = groupby.groups[group] cX = pd.cut(group_data[self.xchannel], self._xbins, include_lowest=True, labels=False) cY = pd.cut(group_data[self.ychannel], self._ybins, include_lowest=True, labels=False) group_keep = pd.Series([False] * len(group_data)) keep_x = self._keep_xbins[group] keep_y = self._keep_ybins[group] for (xbin, ybin) in zip(keep_x, keep_y): group_keep[(cX == xbin) & (cY == ybin)] = True event_assignments.iloc[group_idx] = group_keep new_experiment = experiment.clone() new_experiment.add_condition(self.name, "bool", event_assignments) new_experiment.history.append( self.clone_traits(transient=lambda _: True)) return new_experiment def default_view(self, **kwargs): """ Returns a diagnostic plot of the Gaussian mixture model. Returns ------- IView : an IView, call plot() to see the diagnostic plot. """ return DensityGateView(op=self, **kwargs)