Beispiel #1
0
class ImportPluginOp(PluginOpMixin, ImportOp):
    """
    ``ImportOp`` subclass combined with ``PluginOpMixin``.

    :meth:`apply` only delegates to the parent class when :attr:`do_import`
    is set (which :meth:`estimate` does) or when ``force`` is passed;
    otherwise it raises a ``util.CytoflowOpError``.
    """
    handler_factory = Callable(ImportHandler, transient=True)

    original_channels = List(Str, estimate=True)
    channels_list = List(Channel, estimate=True)
    events = util.CIntOrNone(None, estimate=True)
    tubes = List(Tube, estimate=True)
    # rebuilt from channels_list every time apply() actually imports
    channels = Dict(Str, Str, transient=True)
    name_metadata = Enum(None, "$PnN", "$PnS", estimate=True)

    # len() of the data of the experiment most recently returned by apply()
    ret_events = util.PositiveInt(0, allow_zero=True, status=True)
    do_import = Bool(False)

    def reset_channels(self):
        """Rebuild :attr:`channels_list` from :attr:`original_channels`,
        naming each channel with its sanitized identifier."""
        fresh = []
        for raw_name in self.original_channels:
            fresh.append(
                Channel(channel=raw_name,
                        name=util.sanitize_identifier(raw_name)))
        self.channels_list = fresh

    @on_trait_change('channels_list_items, channels_list.+')
    def _channels_changed(self, obj, name, old, new):
        """Report a change of :attr:`channels_list` through ``changed``."""
        payload = ('channels_list', self.channels_list)
        self.changed = (Changed.ESTIMATE, payload)

    @on_trait_change('tubes_items, tubes:+')
    def _tubes_changed(self, obj, name, old, new):
        """Report a change of :attr:`tubes` through ``changed``."""
        payload = ('tubes', self.tubes)
        self.changed = (Changed.ESTIMATE, payload)

    def estimate(self, _):
        """Arm the import: afterwards :attr:`do_import` is ``True``."""
        # NOTE(review): the flag is cleared before being set, presumably so
        # that a change notification fires even when it was already True --
        # confirm.
        self.do_import = False
        self.do_import = True

    def apply(self, experiment=None, metadata_only=False, force=False):
        """
        Run the parent class's ``apply`` if the import is armed or forced.

        Parameters
        ----------
        experiment
            Passed through to the parent class's ``apply``.
        metadata_only
            Passed through to the parent class's ``apply``.
        force : bool
            If true, import even when :attr:`do_import` is not set.

        Returns
        -------
        Whatever the parent class's ``apply`` returns.

        Raises
        ------
        util.CytoflowOpError
            If the import is neither armed nor forced.
        """
        if not (self.do_import or force):
            # Pick the hint depending on whether any tubes are set up yet.
            if self.tubes:
                raise util.CytoflowOpError(None, "Press 'Import!'")
            raise util.CytoflowOpError(
                None, 'Click "Set up experiment", then "Import!"')

        self.channels = dict(
            (chan.channel, chan.name) for chan in self.channels_list)
        result = super().apply(experiment=experiment,
                               metadata_only=metadata_only)

        self.ret_events = len(result.data)
        return result

    def clear_estimate(self):
        """Disarm the import again."""
        self.do_import = False

    def get_notebook_code(self, idx):
        """
        Return source code that recreates this import as a plain ``ImportOp``.

        Parameters
        ----------
        idx
            Value substituted into the generated ``op_{idx}`` / ``ex_{idx}``
            variable names.
        """
        plain_op = ImportOp()
        plain_op.copy_traits(self, plain_op.copyable_trait_names())
        plain_op.channels = dict(
            (chan.channel, chan.name) for chan in self.channels_list)

        template = """
            op_{idx} = {repr}
            
            ex_{idx} = op_{idx}.apply()"""
        return dedent(template.format(repr=repr(plain_op), idx=idx))
Beispiel #2
0
class ImportHandler(OpHandlerMixin, Controller):
    """
    Controller for the import operation's editor view.

    ``self.model`` is the import operation being edited (an instance of
    ``ImportPluginOp``).
    """

    import_event = Button(label="Edit samples...")
    samples = Property(depends_on = 'model.tubes', status = True)

    coarse = Bool
    # remembers the model's ``events`` value while subsampling is toggled off
    coarse_events = util.PositiveInt(0, allow_zero = True)

    def default_traits_view(self):
        """Build the view shown for the import operation."""
        return View(Item('handler.import_event',
                         show_label=False),
                    Item('handler.samples',
                         label='Samples',
                         style='readonly'),
                    Item('ret_events',
                         label='Events',
                         style='readonly'),
                    Item('handler.coarse',
                         label="Random subsample?",
                         show_label = False,
                         editor = ToggleButtonEditor()),
                    Item('object.events',
                         editor = TextEditor(auto_set = False),
                         label="Events per\nsample",
                         visible_when='handler.coarse == True'),
                    shared_op_traits)

    def _import_event_fired(self):
        """
        Open the experiment dialog and, if it was accepted, copy the
        dialog's settings back into the import operation.
        """

        dialog = ExperimentDialog()

        # self.model is an instance of ImportPluginOp
        dialog.model.init_model(self.model)

        dialog.size = (550, 500)
        dialog.open()

        # Compare the return code by value: an identity test (``is not``)
        # is not a reliable comparison if the constant is a plain value
        # such as an int.
        if dialog.return_code != PyfaceOK:
            return

        dialog.model.update_import_op(self.model)

    @cached_property
    def _get_samples(self):
        """Getter for :attr:`samples`: the number of tubes in the model."""
        return len(self.model.tubes)

    @on_trait_change('coarse')
    def _on_coarse_changed(self):
        """Swap the model's ``events`` setting in or out when the
        subsample toggle changes."""
        if self.coarse:
            # restore the value saved when subsampling was switched off
            self.model.events = self.coarse_events
        else:
            # save the current value, then reset the model's setting to 0
            self.coarse_events = self.model.events
            self.model.events = 0
Beispiel #3
0
class ImportPluginOp(ImportOp, PluginOpMixin):
    """``ImportOp`` that also records how many events the last import
    produced, in :attr:`ret_events`."""
    handler_factory = Callable(ImportHandler, transient=True)
    ret_events = util.PositiveInt(0, allow_zero=True, status=True)

    def apply(self, experiment=None):
        """
        Run the parent class's ``apply`` and record the size of its result.

        Parameters
        ----------
        experiment
            Passed through to the parent class's ``apply``.

        Returns
        -------
        Whatever the parent class's ``apply`` returns.
        """
        # Zero-argument super() binds to the class this method was defined
        # in.  Looking the class up by name at call time breaks if the name
        # ``ImportPluginOp`` is later rebound to a different class.
        ret = super().apply(experiment=experiment)
        self.ret_events = len(ret.data)

        return ret
Beispiel #4
0
class GaussianMixture1DPluginOp(PluginOpMixin, GaussianMixtureOp):
    """
    Single-channel front end for ``GaussianMixtureOp``.

    The scalar :attr:`channel` / :attr:`channel_scale` traits are mirrored
    into the parent's ``channels`` list and ``scale`` dict by the change
    handlers below.
    """
    handler_factory = Callable(GaussianMixture1DHandler)

    channel = Str
    channel_scale = util.ScaleEnum(estimate=True)

    # re-declared here to attach "estimate" metadata
    num_components = util.PositiveInt(1, estimate=True)
    sigma = util.PositiveFloat(0.0, allow_zero=True, estimate=True)
    by = List(Str, estimate=True)

    # traits backing the subset editor

    subset_list = List(ISubset, estimate=True)
    subset = Property(Str, depends_on="subset_list.str")

    # getter for the "subset" Property declared above
    def _get_subset(self):
        """Join the non-empty ``str`` of every subset with ``" and "``."""
        clauses = [item.str for item in self.subset_list if item.str]
        return " and ".join(clauses)

    @on_trait_change('subset_list.str', post_init=True)
    def _subset_changed(self, obj, name, old, new):
        """Report a change of :attr:`subset_list` through ``changed``."""
        payload = ('subset_list', self.subset_list)
        self.changed = (Changed.ESTIMATE, payload)

    _gmms = Dict(Any, Instance(mixture.GaussianMixture), transient=True)

    @on_trait_change('channel')
    def _channel_changed(self):
        """Mirror :attr:`channel` into ``channels`` and report the change."""
        self.channels = [self.channel]
        payload = ('channels', self.channels)
        self.changed = (Changed.ESTIMATE, payload)

    @on_trait_change('channel_scale')
    def _scale_changed(self):
        """Store :attr:`channel_scale` under the current channel in
        ``scale`` (when a channel is set) and report the change."""
        # NOTE(review): keys for previously selected channels are never
        # removed from ``scale`` here -- confirm the parent class tolerates
        # scale entries for channels that are not in ``channels``.
        if self.channel:
            self.scale[self.channel] = self.channel_scale
        payload = ('scale', self.scale)
        self.changed = (Changed.ESTIMATE, payload)

    def estimate(self, experiment):
        """Run the parent's ``estimate`` restricted to :attr:`subset`, then
        announce the new estimate result."""
        active_subset = self.subset
        super().estimate(experiment, subset=active_subset)
        self.changed = (Changed.ESTIMATE_RESULT, self)

    def default_view(self, **kwargs):
        """Return a ``GaussianMixture1DPluginView`` bound to this op."""
        return GaussianMixture1DPluginView(op=self, **kwargs)

    def should_clear_estimate(self, changed):
        """Return ``True`` exactly when ``changed`` equals
        ``Changed.ESTIMATE``."""
        return bool(changed == Changed.ESTIMATE)

    def clear_estimate(self):
        """Drop the fitted models and scales and announce the change."""
        self._gmms = {}
        self._scale = {}
        self.changed = (Changed.ESTIMATE_RESULT, self)
Beispiel #5
0
class ImportWorkflowOp(WorkflowOperation, ImportOp):
    """
    Workflow wrapper around ``ImportOp``.

    The import itself happens in :meth:`estimate`, which caches the resulting
    experiment; :meth:`apply` only hands back that cached experiment.
    """
    original_channels = List(Str)
    channels_list = List(Channel, estimate=True)
    events = util.CIntOrNone(None, estimate=True)
    tubes = List(Tube, estimate=True)
    conditions = Dict(Str, Str, estimate=True)
    channels = Dict(Str, Str, transient=True)
    name_metadata = Enum(None, "$PnN", "$PnS", estimate=True)

    # how many events did we load?
    ret_events = util.PositiveInt(0,
                                  allow_zero=True,
                                  status=True,
                                  estimate_result=True,
                                  transient=True)

    # since we're actually calling super().apply() from self.estimate(), we need
    # to keep around the actual experiment that's returned
    ret_experiment = Instance('cytoflow.experiment.Experiment', transient=True)

    def reset_channels(self):
        """Rebuild :attr:`channels_list` from :attr:`original_channels`,
        naming each channel with its sanitized identifier."""
        self.channels_list = [
            Channel(channel=x, name=util.sanitize_identifier(x))
            for x in self.original_channels
        ]

    def estimate(self, _):
        """Perform the import and cache the experiment and its length."""
        self.channels = {c.channel: c.name for c in self.channels_list}
        self.ret_experiment = super().apply()
        self.ret_events = len(self.ret_experiment)

    def apply(self, _):
        """
        Return the experiment cached by :meth:`estimate`.

        Raises
        ------
        util.CytoflowOpError
            If no experiment has been cached yet.
        """
        # Test against None explicitly: the cached experiment supports
        # len(), so a plain truth test could treat an experiment of length
        # zero as "nothing imported yet".
        if self.ret_experiment is not None:
            return self.ret_experiment
        elif not self.tubes:
            raise util.CytoflowOpError(
                None, 'Click "Set up experiment", then "Import!"')
        else:
            raise util.CytoflowOpError(None, 'Click "Import!"')

    def clear_estimate(self):
        """Forget the cached experiment and reset the event count."""
        self.ret_experiment = None
        self.ret_events = 0

    def get_notebook_code(self, idx):
        """
        Return source code that recreates this import as a plain ``ImportOp``.

        Parameters
        ----------
        idx
            Value substituted into the generated ``op_{idx}`` / ``ex_{idx}``
            variable names.
        """
        op = ImportOp()
        op.copy_traits(self, op.copyable_trait_names())
        op.channels = {c.channel: c.name for c in self.channels_list}

        return dedent("""
            op_{idx} = {repr}
            
            ex_{idx} = op_{idx}.apply()""".format(repr=repr(op), idx=idx))
Beispiel #6
0
class GaussianMixture1DPluginOp(PluginOpMixin, GaussianMixture1DOp):
    """``GaussianMixture1DOp`` combined with ``PluginOpMixin``; estimation
    is restricted to ``self.subset``."""
    handler_factory = Callable(GaussianMixture1DHandler)

    # re-declared here to attach "estimate" metadata
    num_components = util.PositiveInt(1, estimate=True)
    sigma = util.PositiveFloat(0.0, allow_zero=True, estimate=True)
    by = List(Str, estimate=True)

    _gmms = Dict(Any,
                 Instance(mixture.GaussianMixture),
                 transient=True,
                 estimate_result=True)

    def estimate(self, experiment):
        """Fit the model on ``experiment``, limited to ``self.subset``."""
        # The base-class implementation is invoked by name rather than
        # through super().
        active_subset = self.subset
        GaussianMixture1DOp.estimate(self, experiment, subset=active_subset)

    def default_view(self, **kwargs):
        """Return a ``GaussianMixture1DPluginView`` bound to this op."""
        return GaussianMixture1DPluginView(op=self, **kwargs)

    def clear_estimate(self):
        """Drop the fitted models."""
        self._gmms = {}
Beispiel #7
0
class ImportPluginOp(PluginOpMixin, ImportOp):
    """``ImportOp`` combined with ``PluginOpMixin``; records the size of the
    last import in :attr:`ret_events`."""
    handler_factory = Callable(ImportHandler, transient=True)
    ret_events = util.PositiveInt(0, allow_zero=True, status=True)
    events = util.PositiveCInt(None, allow_zero=True, allow_none=True)

    def apply(self, experiment=None):
        """
        Run the parent class's ``apply`` and record the size of its result.

        Parameters
        ----------
        experiment
            Passed through to the parent class's ``apply``.

        Returns
        -------
        Whatever the parent class's ``apply`` returns.
        """
        imported = super().apply(experiment=experiment)
        self.ret_events = len(imported.data)
        return imported

    def get_notebook_code(self, idx):
        """
        Return source code that recreates this import as a plain ``ImportOp``.

        Parameters
        ----------
        idx
            Value substituted into the generated ``op_{idx}`` / ``ex_{idx}``
            variable names.
        """
        plain_op = ImportOp()
        plain_op.copy_traits(self, plain_op.copyable_trait_names())

        template = """
            op_{idx} = {repr}
            
            ex_{idx} = op_{idx}.apply()"""
        return dedent(template.format(repr=repr(plain_op), idx=idx))
Beispiel #8
0
class GaussianMixtureOp(HasStrictTraits):
    """
    This module fits a Gaussian mixture model with a specified number of
    components to one or more channels.
    
    If :attr:`num_components` ``> 1``, :meth:`apply` creates a new categorical 
    metadata variable named ``name``, with possible values ``{name}_1`` ... 
    ``{name}_n`` where ``n`` is the number of components.  An event is assigned 
    to the ``{name}_i`` category if it has the highest posterior probability of 
    having been produced by component ``i``.  If an event has a value that is 
    outside the range of one of the channels' scales, then it is assigned to 
    ``{name}_None``.
    
    Optionally, if :attr:`sigma` is greater than 0, :meth:`apply` creates new  
    ``boolean`` metadata variables named ``{name}_1`` ... ``{name}_n`` where 
    ``n`` is the number of components.  The column ``{name}_i`` is ``True`` if 
    the event is less than :attr:`sigma` standard deviations from the mean of 
    component ``i``.  If :attr:`num_components` is ``1``, :attr:`sigma` must be 
    greater than 0.
    
    .. note::
       The :attr:`sigma` attribute does NOT affect how events are assigned to 
       components in the new ``name`` variable. That is to say, if an event
       is more than :attr:`sigma` standard deviations from ALL of the 
       components, you might expect it would be labeled as ``{name}_None``. 
       It is *not*. An event is only labeled ``{name}_None`` if it has a 
       value that is outside of the channels' scales.
    
    Optionally, if :attr:`posteriors` is ``True``, :meth:`apply` creates new 
    ``double`` metadata variables named ``{name}_1_posterior`` ... 
    ``{name}_n_posterior`` where ``n`` is the number of components.  The column 
    ``{name}_i_posterior`` contains the posterior probability that this event is 
    a member of component ``i``.
    
    Finally, the same mixture model (mean and standard deviation) may not
    be appropriate for every subset of the data.  If this is the case, you
    can use the :attr:`by` attribute to specify metadata by which to aggregate
    the data before estimating (and applying) a mixture model.  The number of 
    components must be the same across each subset, though.
    
    
    Attributes
    ----------
    name : Str
        The operation name; determines the name of the new metadata column
        
    channels : List(Str)
        The channels to apply the mixture model to.

    scale : Dict(Str : {"linear", "logicle", "log"})
        Re-scale the data in the specified channels before fitting.  If a 
        channel is in :attr:`channels` but not in :attr:`scale`, the current 
        package-wide default (set with :func:`~.set_default_scale`) is used.

    num_components : Int (default = 1)
        How many components to fit to the data?  Must be a positive integer.

    sigma : Float
        If not None, use this operation as a "gate": for each component, create 
        a new boolean variable ``{name}_i`` and if the event is within
        :attr:`sigma` standard deviations, set that variable to ``True``.
        If :attr:`num_components` is ``1``, must be ``> 0``.
    
    by : List(Str)
        A list of metadata attributes to aggregate the data before estimating
        the model.  For example, if the experiment has two pieces of metadata,
        ``Time`` and ``Dox``, setting :attr:`by` to ``["Time", "Dox"]`` will fit 
        the model separately to each subset of the data with a unique combination of
        ``Time`` and ``Dox``.

    posteriors : Bool (default = False)
        If ``True``, add columns named ``{name}_{i}_posterior`` giving the 
        posterior probability that the event is in component ``i``.  Useful for 
        filtering out low-probability events.
        
    Notes
    -----
    
    We use the Mahalanobis distance as a multivariate generalization of the 
    number of standard deviations an event is from the mean of the multivariate
    gaussian.  If :math:`\\vec{x}` is an observation from a distribution with 
    mean :math:`\\vec{\\mu}` and :math:`S` is the covariance matrix, then the 
    Mahalanobis distance is :math:`\\sqrt{(x - \\mu)^T \\cdot S^{-1} \\cdot (x - \\mu)}`.
    
    Examples
    --------
    
    .. plot::
        :context: close-figs
        
        Make a little data set.
    
        >>> import cytoflow as flow
        >>> import_op = flow.ImportOp()
        >>> import_op.tubes = [flow.Tube(file = "Plate01/RFP_Well_A3.fcs",
        ...                              conditions = {'Dox' : 10.0}),
        ...                    flow.Tube(file = "Plate01/CFP_Well_A4.fcs",
        ...                              conditions = {'Dox' : 1.0})]
        >>> import_op.conditions = {'Dox' : 'float'}
        >>> ex = import_op.apply()
    
    Create and parameterize the operation.
    
    .. plot::
        :context: close-figs
        
        >>> gm_op = flow.GaussianMixtureOp(name = 'Gauss',
        ...                                channels = ['Y2-A'],
        ...                                scale = {'Y2-A' : 'log'},
        ...                                num_components = 2)
        
    Estimate the clusters
    
    .. plot::
        :context: close-figs
        
        >>> gm_op.estimate(ex)
        
    Plot a diagnostic view
    
    .. plot::
        :context: close-figs
        
        >>> gm_op.default_view().plot(ex)

    Apply the gate
    
    .. plot::
        :context: close-figs
        
        >>> ex2 = gm_op.apply(ex)

    Plot a diagnostic view with the event assignments
    
    .. plot::
        :context: close-figs
        
        >>> gm_op.default_view().plot(ex2)
        
    And with two channels:
    
    .. plot::
        :context: close-figs
        
        >>> gm_op = flow.GaussianMixtureOp(name = 'Gauss',
        ...                                channels = ['V2-A', 'Y2-A'],
        ...                                scale = {'V2-A' : 'log',
        ...                                         'Y2-A' : 'log'},
        ...                                num_components = 2)
        >>> gm_op.estimate(ex)   
        >>> ex2 = gm_op.apply(ex)
        >>> gm_op.default_view().plot(ex2)
        
    """
    
    id = Constant('edu.mit.synbio.cytoflow.operations.gaussian')
    friendly_id = Constant("Gaussian Mixture Model")
    
    name = Str
    channels = List(Str)
    scale = Dict(Str, util.ScaleEnum)
    num_components = util.PositiveInt(1, allow_zero = False)
    sigma = util.PositiveFloat(None, allow_zero = False, allow_none = True)
    by = List(Str)
    
    posteriors = Bool(False)
    
    # the key is either a single value or a tuple
    _gmms = Dict(Any, Instance(sklearn.mixture.GaussianMixture), transient = True)
    _scale = Dict(Str, Instance(util.IScale), transient = True)
    
    def estimate(self, experiment, subset = None):
        """
        Estimate the Gaussian mixture model parameters.

        One model is fitted per group defined by :attr:`by` (or a single
        model for all events if :attr:`by` is empty).  The fitted models are
        stored in ``_gmms`` and the per-channel scales in ``_scale``.

        Parameters
        ----------
        experiment : Experiment
            The data to use to estimate the mixture parameters

        subset : str (default = None)
            If set, a Python expression to determine the subset of the data
            to use in the estimation.

        Raises
        ------
        util.CytoflowOpError
            If ``experiment`` is ``None``; if :attr:`channels` is empty,
            contains duplicates or names a channel that is not in the
            experiment's data; if :attr:`scale` has a key that is not in
            :attr:`channels`; if an entry of :attr:`by` is not one of the
            experiment's conditions; if a group has no data; or if the
            estimator does not converge for a group.

        util.CytoflowViewError
            If ``subset`` cannot be evaluated or selects no events.
        """

        if experiment is None:
            raise util.CytoflowOpError('experiment',
                                       "No experiment specified")

        if not self.channels:
            raise util.CytoflowOpError('channels',
                                       "Must set at least one channel")

        if len(self.channels) != len(set(self.channels)):
            raise util.CytoflowOpError('channels',
                                       "Must not duplicate channels")

        for c in self.channels:
            if c not in experiment.data:
                raise util.CytoflowOpError('channels',
                                           "Channel {0} not found in the experiment"
                                           .format(c))

        for c in self.scale:
            if c not in self.channels:
                raise util.CytoflowOpError('channels',
                                           "Scale set for channel {0}, but it isn't "
                                           "in the experiment"
                                           .format(c))

        # validate against the experiment's conditions -- the same set that
        # the error message lists and that apply() checks against
        for b in self.by:
            if b not in experiment.conditions:
                raise util.CytoflowOpError('by',
                                           "Aggregation metadata {} not found, "
                                           "must be one of {}"
                                           .format(b, experiment.conditions))

        if subset:
            try:
                experiment = experiment.query(subset)
            except Exception as e:
                # catch Exception rather than using a bare except, so that
                # KeyboardInterrupt / SystemExit are not swallowed; chain the
                # original error so the real cause stays visible
                raise util.CytoflowViewError('subset',
                                             "Subset string '{0}' isn't valid"
                                             .format(subset)) from e

            if len(experiment) == 0:
                raise util.CytoflowViewError('subset',
                                             "Subset string '{0}' returned no events"
                                             .format(subset))

        if self.by:
            groupby = experiment.data.groupby(self.by)
        else:
            # use a lambda expression to return a group that contains
            # all the events
            groupby = experiment.data.groupby(lambda _: True)

        # get the scale. estimate the scale params for the ENTIRE data set,
        # not subsets we get from groupby().  And we need to save it so that
        # the data is transformed the same way when we apply()
        for c in self.channels:
            if c in self.scale:
                self._scale[c] = util.scale_factory(self.scale[c], experiment, channel = c)
            else:
                self._scale[c] = util.scale_factory(util.get_default_scale(), experiment, channel = c)

        # collect the fitted models locally; self._gmms is only replaced once
        # every group has been fitted successfully
        gmms = {}

        for group, data_subset in groupby:
            if len(data_subset) == 0:
                raise util.CytoflowOpError(None,
                                           "Group {} had no data"
                                           .format(group))
            x = data_subset.loc[:, self.channels[:]]
            for c in self.channels:
                x[c] = self._scale[c](x[c])

            # drop data that isn't in the scale range
            for c in self.channels:
                x = x[~(np.isnan(x[c]))]
            x = x.values

            gmm = sklearn.mixture.GaussianMixture(n_components = self.num_components,
                                                  covariance_type = "full",
                                                  random_state = 1)
            gmm.fit(x)

            if not gmm.converged_:
                raise util.CytoflowOpError(None,
                                           "Estimator didn't converge"
                                           " for group {0}"
                                           .format(group))

            # in the 1D version, we sorted the components by the means -- so
            # the first component has the lowest mean, the second component
            # has the next-lowest mean, etc.

            # that doesn't work in the general case.  instead, we assume that
            # the clusters are likely (?) to be arranged along *one* of the
            # axes, so we take the |norm| of the mean of each cluster and
            # sort that way.

            norms = np.sum(gmm.means_ ** 2, axis = 1) ** 0.5
            sort_idx = np.argsort(norms)

            # re-order every per-component array with the same permutation
            gmm.means_ = gmm.means_[sort_idx]
            gmm.weights_ = gmm.weights_[sort_idx]
            gmm.covariances_ = gmm.covariances_[sort_idx]
            gmm.precisions_ = gmm.precisions_[sort_idx]
            gmm.precisions_cholesky_ = gmm.precisions_cholesky_[sort_idx]

            gmms[group] = gmm

        self._gmms = gmms
     
    def apply(self, experiment):
        """
        Assigns new metadata to events using the mixture model estimated
        in :meth:`estimate`.
        
        Returns
        -------
        Experiment
            A new :class:`.Experiment` with the new condition variables as
            described in the class documentation.  Also adds the following
            new statistics:
            
            - **mean** : Float
                the mean of the fitted gaussian in each channel for each component.
                
            - **sigma** : (Float, Float)
                the locations the mean +/- one standard deviation in each channel
                for each component.
                
            - **correlation** : Float
                the correlation coefficient between each pair of channels for each
                component.
                
            - **proportion** : Float
                the proportion of events in each component of the mixture model.  only
                added if :attr:`num_components` ``> 1``.
        """
             
        if experiment is None:
            raise util.CytoflowOpError('experiment',
                                       "No experiment specified")
         
        if len(self.channels) == 0:
            raise util.CytoflowOpError('channels',
                                       "Must set at least one channel")
         
        # make sure name got set!
        if not self.name:
            raise util.CytoflowOpError('name',
                                       "You have to set the gate's name "
                                       "before applying it!")
            
        if self.name != util.sanitize_identifier(self.name):
            raise util.CytoflowOpError('name',
                                       "Name can only contain letters, numbers and underscores."
                                       .format(self.name)) 
        
        if self.num_components > 1 and self.name in experiment.data.columns:
            raise util.CytoflowOpError('name',
                                       "Experiment already has a column named {0}"
                                       .format(self.name))
            
        if self.sigma is not None:
            for i in range(1, self.num_components + 1):
                cname = "{}_{}".format(self.name, i)
                if cname in experiment.data.columns:
                    raise util.CytoflowOpError('name',
                                               "Experiment already has a column named {}"
                                               .format(cname))
 
        if self.posteriors:
            for i in range(1, self.num_components + 1):
                cname = "{}_{}_posterior".format(self.name, i)
                if cname in experiment.data.columns:
                    raise util.CytoflowOpError('name',
                                               "Experiment already has a column named {}"
                                               .format(cname))               
         
        if not self._gmms:
            raise util.CytoflowOpError(None, 
                                       "No components found.  Did you forget to "
                                       "call estimate()?")
            
        for c in self.channels:
            if c not in self._scale:
                raise util.CytoflowOpError(None,
                                           "Model scale not set.  Did you forget "
                                           "to call estimate()?")
 
        for c in self.channels:
            if c not in experiment.channels:
                raise util.CytoflowOpError('channels',
                                           "Channel {0} not found in the experiment"
                                           .format(c))
        
        for b in self.by:
            if b not in experiment.conditions:
                raise util.CytoflowOpError('by',
                                           "Aggregation metadata {} not found, "
                                           "must be one of {}"
                                           .format(b, experiment.conditions))
#                             
#         if self.num_components == 1 and self.sigma == 0.0:
#             raise util.CytoflowOpError('sigma',
#                                        "if num_components is 1, sigma must be > 0.0")
        
                
        if self.num_components == 1 and self.posteriors:
            warn("If num_components == 1, all posteriors will be 1",
                 util.CytoflowOpWarning)
#             raise util.CytoflowOpError('posteriors',
#                                        "If num_components == 1, all posteriors will be 1.")
         
        if self.num_components > 1:
            event_assignments = pd.Series(["{}_None".format(self.name)] * len(experiment), dtype = "object")
 
        if self.sigma is not None:
            event_gate = {i : pd.Series([False] * len(experiment), dtype = "double")
                           for i in range(self.num_components)}
 
        if self.posteriors:
            event_posteriors = {i : pd.Series([0.0] * len(experiment), dtype = "double")
                                for i in range(self.num_components)}

        if self.by:
            groupby = experiment.data.groupby(self.by)
        else:
            # use a lambda expression to return a group that
            # contains all the events
            groupby = experiment.data.groupby(lambda _: True)   

        # make the statistics       
        components = [x + 1 for x in range(self.num_components)]
         
        prop_idx = pd.MultiIndex.from_product([experiment[x].unique() for x in self.by] + [components], 
                                         names = list(self.by) + ["Component"])
        prop_stat = pd.Series(name = "{} : {}".format(self.name, "proportion"),
                              index = prop_idx, 
                              dtype = np.dtype(object)).sort_index()
                  
        mean_idx = pd.MultiIndex.from_product([experiment[x].unique() for x in self.by] + [components] + [self.channels], 
                                              names = list(self.by) + ["Component"] + ["Channel"])
        mean_stat = pd.Series(name = "{} : {}".format(self.name, "mean"),
                              index = mean_idx, 
                              dtype = np.dtype(object)).sort_index()
        sigma_stat = pd.Series(name = "{} : {}".format(self.name, "sigma"),
                               index = mean_idx,
                               dtype = np.dtype(object)).sort_index()
        interval_stat = pd.Series(name = "{} : {}".format(self.name, "interval"),
                                  index = mean_idx, 
                                  dtype = np.dtype(object)).sort_index()

        corr_idx = pd.MultiIndex.from_product([experiment[x].unique() for x in self.by] + [components] + [self.channels] + [self.channels], 
                                              names = list(self.by) + ["Component"] + ["Channel_1"] + ["Channel_2"])
        corr_stat = pd.Series(name = "{} : {}".format(self.name, "correlation"),
                              index = corr_idx, 
                              dtype = np.dtype(object)).sort_index()  
                 
        for group, data_subset in groupby:
            if group not in self._gmms:
                # there weren't any events in this group, so we didn't get
                # a gmm.
                continue
             
            gmm = self._gmms[group]
            x = data_subset.loc[:, self.channels[:]]
            for c in self.channels:
                x[c] = self._scale[c](x[c])
                
            # which values are missing?

            x_na = pd.Series([False] * len(x))
            for c in self.channels:
                x_na[np.isnan(x[c]).values] = True
                        
            x = x.values
            x_na = x_na.values
            group_idx = groupby.groups[group]
 
            if self.num_components > 1:
                predicted = np.full(len(x), -1, "int")
                predicted[~x_na] = gmm.predict(x[~x_na])
                
                predicted_str = pd.Series(["(none)"] * len(predicted))
                for c in range(0, self.num_components):
                    predicted_str[predicted == c] = "{0}_{1}".format(self.name, c + 1)
                predicted_str[predicted == -1] = "{0}_None".format(self.name)
                predicted_str.index = group_idx
     
                event_assignments.iloc[group_idx] = predicted_str
                
            # if we're doing sigma-based gating, for each component check
            # to see if the event is in the sigma gate.
            if self.sigma is not None:
                for c in range(self.num_components):
                    s = np.linalg.pinv(gmm.covariances_[c])
                    mu = gmm.means_[c]
                    
                    # compute the Mahalanobis distance

                    f = lambda x, mu, s: np.dot(np.dot((x - mu).T, s), (x - mu))
                    dist = np.apply_along_axis(f, 1, x, mu, s)

                    # come up with a threshold based on sigma.  you'll note we
                    # didn't sqrt dist: that's because for a multivariate 
                    # Gaussian, the square of the Mahalanobis distance is
                    # chi-square distributed
                    
                    p = (scipy.stats.norm.cdf(self.sigma) - 0.5) * 2
                    thresh = scipy.stats.chi2.ppf(p, 1)
                    
                    event_gate[c].iloc[group_idx] = np.less_equal(dist, thresh)
                    
            if self.posteriors:  
                p = np.full((len(x), self.num_components), 0.0)
                p[~x_na] = gmm.predict_proba(x[~x_na])
                for c in range(self.num_components):
                    event_posteriors[c].iloc[group_idx] = p[:, c]
                    
            for c in range(self.num_components):
                if len(self.by) == 0:
                    g = tuple([c + 1])
                elif hasattr(group, '__iter__') and not isinstance(group, (str, bytes)):
                    g = tuple(list(group) + [c + 1])
                else:
                    g = tuple([group] + [c + 1])

                prop_stat.at[g] = gmm.weights_[c]
                
                for cidx1, channel1 in enumerate(self.channels):
                    g2 = tuple(list(g) + [channel1])
                    mean_stat.at[g2] = self._scale[channel1].inverse(gmm.means_[c, cidx1])
                    
                    s, corr = util.cov2corr(gmm.covariances_[c])
                    sigma_stat[g2] = (self._scale[channel1].inverse(s[cidx1]))
                    interval_stat.at[g2] = (self._scale[channel1].inverse(gmm.means_[c, cidx1] - s[cidx1]),
                                             self._scale[channel1].inverse(gmm.means_[c, cidx1] + s[cidx1]))
            
                    for cidx2, channel2 in enumerate(self.channels):
                        g3 = tuple(list(g2) + [channel2])
                        corr_stat[g3] = corr[cidx1, cidx2]
                        
                    corr_stat.drop(tuple(list(g2) + [channel1]), inplace = True)

        new_experiment = experiment.clone()
          
        if self.num_components > 1:
            new_experiment.add_condition(self.name, "category", event_assignments)
            
        if self.sigma is not None:
            for c in range(self.num_components):
                gate_name = "{}_{}".format(self.name, c + 1)
                new_experiment.add_condition(gate_name, "bool", event_gate[c])              
                
        if self.posteriors:
            for c in range(self.num_components):
                post_name = "{}_{}_posterior".format(self.name, c + 1)
                new_experiment.add_condition(post_name, "double", event_posteriors[c])
                
        new_experiment.statistics[(self.name, "mean")] = pd.to_numeric(mean_stat)
        new_experiment.statistics[(self.name, "sigma")] = sigma_stat
        new_experiment.statistics[(self.name, "interval")] = interval_stat
        if len(corr_stat) > 0:
            new_experiment.statistics[(self.name, "correlation")] = pd.to_numeric(corr_stat)
        if self.num_components > 1:
            new_experiment.statistics[(self.name, "proportion")] = pd.to_numeric(prop_stat)

        new_experiment.history.append(self.clone_traits(transient = lambda _: True))
        return new_experiment

     
    def default_view(self, **kwargs):
        """
        Returns a diagnostic plot of the Gaussian mixture model.

        Parameters
        ----------
        channels : sequence of channel names, optional
            Which of the operation's channels to plot.  Defaults to the
            operation's ``channels``.  Must contain one or two entries.

        scale : dict, optional
            Maps a channel name to the scale to plot it with.  Defaults to
            the operation's ``scale``.  Channels without an entry are plotted
            with ``util.get_default_scale()``.

        **kwargs
            Any remaining keyword arguments are passed to the view's
            ``trait_set``.
         
        Returns
        -------
            IView : an IView, call plot() to see the diagnostic plot.

        Raises
        ------
        util.CytoflowViewError
            If a requested channel or a key of ``scale`` isn't one of the
            operation's channels, or if zero or more than two channels are
            requested.
        """
        channels = kwargs.pop('channels', self.channels)

        # Copy the mapping: the defaults filled in below must not leak into
        # the operation's own ``scale`` trait or into a dict the caller owns.
        scale = dict(kwargs.pop('scale', self.scale))
        
        for c in channels:
            if c not in self.channels:
                raise util.CytoflowViewError('channels',
                                             "Channel {} isn't in the operation's channels"
                                             .format(c))
                
        for s in scale:
            if s not in self.channels:
                raise util.CytoflowViewError('scale',
                                             "Channel {} isn't in the operation's channels"
                                             .format(s))
            
        # every plotted channel needs a scale; fall back to the default
        for c in channels:
            if c not in scale:
                scale[c] = util.get_default_scale()
            
        if len(channels) == 0:
            raise util.CytoflowViewError('channels',
                                         "Must specify at least one channel for a default view")
        elif len(channels) == 1:
            v = GaussianMixture1DView(op = self)
            v.trait_set(channel = channels[0], 
                        scale = scale[channels[0]], 
                        **kwargs)
            return v
        
        elif len(channels) == 2:
            v = GaussianMixture2DView(op = self)
            v.trait_set(xchannel = channels[0], 
                        ychannel = channels[1],
                        xscale = scale[channels[0]],
                        yscale = scale[channels[1]], 
                        **kwargs)
            return v
        
        else:
            raise util.CytoflowViewError('channels',
                                         "Can't specify more than two channels for a default view")
Beispiel #9
0
class BinningOp(HasStrictTraits):
    """
    Bin data along an axis.
    
    This operation creates equally spaced bins (in linear or log space)
    along an axis and adds a metadata column assigning each event to a bin.
    
    Attributes
    ----------
    name : Str
        The operation name.  Used to name the new metadata field in the
        experiment that's created by apply()
        
    channel : Str
        The name of the channel along which to bin.

    scale : Enum("linear", "log", "logicle")
        Make the bins equidistant along what scale?
        
    num_bins : Int
        The number of bins to make.  Must set either `num_bins` or `bin_width`.
        If both are defined, `num_bins` takes precedence.
        
    bin_width : Float
        The width of the bins.  Must set either `num_bins` or `bin_width`.  If
        `scale` is `log`, `bin_width` is in log-10 units; if `scale` is
        `logicle`, an error is thrown because the units are ill-defined.
        If both `num_bins` and `bin_width` are defined, `num_bins` takes 
        precedence. 
        
    bin_count_name : Str
        If `bin_count_name` is set, add another piece of metadata when calling
        `apply()` that contains the number of events in the bin that this event
        falls in.  Useful for filtering bins by # of events.
        
    Examples
    --------
    >>> bin_op = flow.BinningOp(name = "CFP_Bin",
    ...                         channel = "PE-Tx-Red-YG-A",
    ...                         scale = "linear",
    ...                         num_bins = 40)
    >>> ex5_binned = bin_op.apply(ex5)

    >>> h.huefacet = "CFP_Bin"
    >>> h.plot(ex5_binned)
    """
    
    # traits
    id = Constant('edu.mit.synbio.cytoflow.operations.binning')
    friendly_id = Constant("Binning")
    
    name = CStr()
    bin_count_name = CStr()
    channel = Str()
    num_bins = util.PositiveInt(Undefined)
    bin_width = util.PositiveFloat(Undefined)
    scale = util.ScaleEnum

    def apply(self, experiment):
        """Applies the binning to an experiment.
        
        Parameters
        ----------
        experiment : Experiment
            the old_experiment to which this op is applied
            
        Returns
        -------
            a new experiment, the same as old_experiment but with a new
            condition named after the operation that holds the index of the
            bin each event falls in (NaN for events whose scaled value is
            NaN).  The internal bin edges and the scale are stored in the new
            condition's metadata under ``"bins"`` and ``"bin_scale"``.  If
            ``bin_count_name`` is set, a second condition with that name holds
            the number of events in each event's bin.

        Raises
        ------
        util.CytoflowOpError
            If the experiment, name or channel is missing; if ``name`` or
            ``bin_count_name`` is already a column of the experiment; if the
            channel isn't in the experiment; if neither ``num_bins`` nor
            ``bin_width`` is set; or if ``bin_width`` is used with a scale
            other than linear or log.
        """
        if not experiment:
            raise util.CytoflowOpError("no experiment specified")
        
        if not self.name:
            raise util.CytoflowOpError("name is not set")
        
        if self.name in experiment.data.columns:
            raise util.CytoflowOpError("name {0} is in the experiment already"
                                  .format(self.name))
            
        if self.bin_count_name and self.bin_count_name in experiment.data.columns:
            raise util.CytoflowOpError("bin_count_name {0} is in the experiment already"
                                  .format(self.bin_count_name))
        
        if not self.channel:
            raise util.CytoflowOpError("channel is not set")
        
        if self.channel not in experiment.data.columns:
            raise util.CytoflowOpError("channel {0} isn't in the experiment"
                                  .format(self.channel))
              
        if self.num_bins is Undefined and self.bin_width is Undefined:
            raise util.CytoflowOpError("must set either bin number or width")
        
        if self.num_bins is Undefined \
           and not (self.scale == "linear" or self.scale == "log"):
            raise util.CytoflowOpError("Can only use bin_width with linear or log scale") 
        
        scale = util.scale_factory(self.scale, experiment, self.channel)
        scaled_data = scale(experiment.data[self.channel])
            
        channel_min = bn.nanmin(scaled_data)
        channel_max = bn.nanmax(scaled_data)
        
        # np.digitize wants only the *internal* edges: everything below the
        # first edge lands in bin 0, everything above the last in the top bin.
        if self.num_bins is not Undefined:
            # N bins are delimited by N + 1 edges; drop the two outer ones.
            edges = np.linspace(start = channel_min, stop = channel_max,
                                num = self.num_bins + 1)
            bins = edges[1:-1]
        else:
            # edges every bin_width, starting one width above the minimum.
            # the top bin may be narrower if the range isn't a multiple of
            # bin_width.
            bins = np.arange(channel_min + self.bin_width,
                             channel_max,
                             self.bin_width)
            
        new_experiment = experiment.clone()
        new_experiment.add_condition(self.name,
                                     "int",
                                     np.digitize(scaled_data, bins))
        
        # if we're log-scaled (for example), don't label data that isn't
        # showable on a log scale!
        new_experiment.data.loc[np.isnan(scaled_data), self.name] = np.nan
        
        # keep track of the bins we used, for pretty plotting later.
        new_experiment.metadata[self.name]["bin_scale"] = self.scale
        new_experiment.metadata[self.name]["bins"] = bins
        
        if self.bin_count_name:
            # size() counts the rows per bin without aggregating every other
            # column of the data frame.
            bin_sizes = new_experiment.data.groupby(self.name).size()
            
            # have to make the condition a float64, because if we're in log
            # space there may be events that have NaN as the bin number.
            
            new_experiment.add_condition(
                self.bin_count_name,
                "float64",
                new_experiment[self.name].map(bin_sizes))
        
        new_experiment.history.append(self.clone_traits())
        return new_experiment
    
    def default_view(self, **kwargs):
        """
        Returns a BinningView for this operation; ``kwargs`` are passed to
        the view's constructor.
        """
        return BinningView(op = self, **kwargs)
Beispiel #10
0
class PCAOp(HasStrictTraits):
    """
    Use principal components analysis (PCA) to decompose a multivariate data
    set into orthogonal components that explain a maximum amount of variance.
    
    Call :meth:`estimate` to compute the optimal decomposition.
      
    Calling :meth:`apply` creates new "channels" named ``{name}_1 ... {name}_n``,
    where ``name`` is the :attr:`name` attribute and ``n`` is :attr:`num_components`.

    The same decomposition may not be appropriate for different subsets of the data set.
    If this is the case, you can use the :attr:`by` attribute to specify 
    metadata by which to aggregate the data before estimating (and applying) a 
    model.  The PCA parameters such as the number of components and the kernel
    are the same across each subset, though.

    Attributes
    ----------
    name : Str
        The operation name; determines the name of the new columns.
        
    channels : List(Str)
        The channels to apply the decomposition to.

    scale : Dict(Str : {"linear", "logicle", "log"})
        Re-scale the data in the specified channels before fitting.  If a 
        channel is in :attr:`channels` but not in :attr:`scale`, the current 
        package-wide default (set with :func:`.set_default_scale`) is used.

    num_components : Int (default = 2)
        How many components to fit to the data?  Must be a positive integer.
    
    by : List(Str)
        A list of metadata attributes to aggregate the data before estimating
        the model.  For example, if the experiment has two pieces of metadata,
        ``Time`` and ``Dox``, setting :attr:`by` to ``["Time", "Dox"]`` will 
        fit the model separately to each subset of the data with a unique 
        combination of ``Time`` and ``Dox``.
        
    whiten : Bool (default = False)
        Scale each component to unit variance?  May be useful if you will
        be using unsupervised clustering (such as K-means).

    Examples
    --------
    
    .. plot::
        :context: close-figs
        
        Make a little data set.
    
        >>> import cytoflow as flow
        >>> import_op = flow.ImportOp()
        >>> import_op.tubes = [flow.Tube(file = "Plate01/RFP_Well_A3.fcs",
        ...                              conditions = {'Dox' : 10.0}),
        ...                    flow.Tube(file = "Plate01/CFP_Well_A4.fcs",
        ...                              conditions = {'Dox' : 1.0})]
        >>> import_op.conditions = {'Dox' : 'float'}
        >>> ex = import_op.apply()
    
    Create and parameterize the operation.
    
    .. plot::
        :context: close-figs
        
        >>> pca = flow.PCAOp(name = 'PCA',
        ...                  channels = ['V2-A', 'V2-H', 'Y2-A', 'Y2-H'],
        ...                  scale = {'V2-A' : 'log',
        ...                           'V2-H' : 'log',
        ...                           'Y2-A' : 'log',
        ...                           'Y2-H' : 'log'},
        ...                  num_components = 2,
        ...                  by = ["Dox"])
        
    Estimate the decomposition
    
    .. plot::
        :context: close-figs
        
        >>> pca.estimate(ex)
        
    Apply the operation
    
    .. plot::
        :context: close-figs
        
        >>> ex2 = pca.apply(ex)

    Plot a scatterplot of the PCA.  Compare to a scatterplot of the underlying
    channels.
    
    .. plot::
        :context: close-figs
        
        >>> flow.ScatterplotView(xchannel = "V2-A",
        ...                      xscale = "log",
        ...                      ychannel = "Y2-A",
        ...                      yscale = "log",
        ...                      subset = "Dox == 1.0").plot(ex2)

        >>> flow.ScatterplotView(xchannel = "PCA_1",
        ...                      ychannel = "PCA_2",
        ...                      subset = "Dox == 1.0").plot(ex2)
       
    .. plot::
        :context: close-figs
        
        >>> flow.ScatterplotView(xchannel = "V2-A",
        ...                      xscale = "log",
        ...                      ychannel = "Y2-A",
        ...                      yscale = "log",
        ...                      subset = "Dox == 10.0").plot(ex2) 

        >>> flow.ScatterplotView(xchannel = "PCA_1",
        ...                      ychannel = "PCA_2",
        ...                      subset = "Dox == 10.0").plot(ex2)
    """

    id = Constant('edu.mit.synbio.cytoflow.operations.pca')
    friendly_id = Constant("Principal Component Analysis")

    name = CStr()
    channels = List(Str)
    scale = Dict(Str, util.ScaleEnum)
    num_components = util.PositiveInt(2, allow_zero=False)
    whiten = Bool(False)
    by = List(Str)

    # fitted state, filled in by estimate(): one PCA model per group key and
    # one scale instance per channel
    _pca = Dict(Any, Any, transient=True)
    _scale = Dict(Str, Instance(util.IScale), transient=True)

    def estimate(self, experiment, subset=None):
        """
        Estimate the decomposition
        
        Parameters
        ----------
        experiment : Experiment
            The :class:`.Experiment` to use to estimate the decomposition
            
        subset : str (default = None)
            A Python expression that specifies a subset of the data in 
            ``experiment`` to use to parameterize the operation.

        Raises
        ------
        util.CytoflowOpError
            If the experiment is missing, the operation's parameters are
            inconsistent with the experiment, ``subset`` is invalid or
            selects no events, or a group has no data.

        """

        if experiment is None:
            raise util.CytoflowOpError('experiment', "No experiment specified")

        if len(self.channels) == 0:
            raise util.CytoflowOpError('channels',
                                       "Must set at least one channel")

        for c in self.channels:
            if c not in experiment.data:
                raise util.CytoflowOpError(
                    'channels',
                    "Channel {0} not found in the experiment".format(c))

        if self.num_components > len(self.channels):
            raise util.CytoflowOpError(
                'num_components', "Number of components must be less than "
                "or equal to number of channels.")

        for c in self.scale:
            if c not in self.channels:
                raise util.CytoflowOpError(
                    'scale', "Scale set for channel {0}, but it isn't "
                    "in `channels`".format(c))

        for b in self.by:
            if b not in experiment.data:
                raise util.CytoflowOpError(
                    'by', "Aggregation metadata {} not found, "
                    "must be one of {}".format(b, experiment.conditions))

        if subset:
            try:
                experiment = experiment.query(subset)
            except Exception as e:
                raise util.CytoflowOpError(
                    'subset',
                    "Subset string '{0}' isn't valid".format(subset)) from e

            if len(experiment) == 0:
                raise util.CytoflowOpError(
                    'subset',
                    "Subset string '{0}' returned no events".format(subset))

        if self.by:
            groupby = experiment.data.groupby(self.by)
        else:
            # use a lambda expression to return a group that contains
            # all the events
            groupby = experiment.data.groupby(lambda _: True)

        # get the scale. estimate the scale params for the ENTIRE data set,
        # not subsets we get from groupby().  And we need to save it so that
        # the data is transformed the same way when we apply()
        #
        # Scales and models are staged in local dicts and only committed at
        # the very end, so a failed or repeated estimate never leaves stale or
        # partially-fitted state on the operation.
        scales = {}
        for c in self.channels:
            if c in self.scale:
                scale_name = self.scale[c]
            else:
                scale_name = util.get_default_scale()
            scales[c] = util.scale_factory(scale_name, experiment, channel=c)

        models = {}
        for group, data_subset in groupby:
            if len(data_subset) == 0:
                raise util.CytoflowOpError(
                    'by', "Group {} had no data".format(group))
            x = data_subset.loc[:, self.channels[:]]
            for c in self.channels:
                x[c] = scales[c](x[c])

            # drop data that isn't in the scale range
            for c in self.channels:
                x = x[~(np.isnan(x[c]))]
            x = x.values

            pca = sklearn.decomposition.PCA(n_components=self.num_components,
                                            whiten=self.whiten,
                                            random_state=0)
            pca.fit(x)
            models[group] = pca

        self._scale = scales
        self._pca = models

    def apply(self, experiment):
        """
        Apply the PCA decomposition to the data.

        Parameters
        ----------
        experiment : Experiment
            The :class:`.Experiment` to apply the estimated decomposition to.
        
        Returns
        -------
        Experiment
            a new Experiment with additional :attr:`~Experiment.channels` 
            named ``name_1 ... name_n``.  Rows containing NaN after the
            transformation are dropped.

        Raises
        ------
        util.CytoflowOpError
            If the experiment is missing, :meth:`estimate` hasn't been run,
            the operation's parameters are inconsistent with the experiment,
            an output channel already exists, or a group has no data or no
            estimated model.

        """

        if experiment is None:
            raise util.CytoflowOpError('experiment', "No experiment specified")

        if not self._pca:
            raise util.CytoflowOpError(
                None, "No PCA found.  Did you forget to call estimate()?")

        # make sure name got set!
        if not self.name:
            raise util.CytoflowOpError(
                'name', "You have to set the operation's name "
                "before applying it!")

        if len(self.channels) == 0:
            raise util.CytoflowOpError('channels',
                                       "Must set at least one channel")

        for c in self.channels:
            if c not in experiment.data:
                raise util.CytoflowOpError(
                    'channels',
                    "Channel {0} not found in the experiment".format(c))

        for c in self.scale:
            if c not in self.channels:
                raise util.CytoflowOpError(
                    'scale', "Scale set for channel {0}, but it isn't "
                    "in `channels`".format(c))

        for b in self.by:
            if b not in experiment.data:
                raise util.CytoflowOpError(
                    'by', "Aggregation metadata {} not found, "
                    "must be one of {}".format(b, experiment.conditions))

        if self.by:
            groupby = experiment.data.groupby(self.by)
        else:
            # use a lambda expression to return a group that contains
            # all the events
            groupby = experiment.data.groupby(lambda _: True)

        new_experiment = experiment.clone()
        new_channels = []
        for i in range(self.num_components):
            cname = "{}_{}".format(self.name, i + 1)
            if cname in experiment.data:
                raise util.CytoflowOpError(
                    'name',
                    "Channel {} is already in the experiment".format(cname))

            # explicit float dtype: an empty Series' default dtype differs
            # between pandas versions
            new_experiment.add_channel(
                cname,
                pd.Series(index=experiment.data.index, dtype="float64"))
            new_channels.append(cname)

        for group, data_subset in groupby:
            if len(data_subset) == 0:
                raise util.CytoflowOpError(
                    'by', "Group {} had no data".format(group))

            # a group that wasn't present when estimate() ran has no model
            if group not in self._pca:
                raise util.CytoflowOpError(
                    'by', "Group {} not found in the estimated model.  "
                    "Do you need to re-run estimate()?".format(group))

            x = data_subset.loc[:, self.channels[:]]
            for c in self.channels:
                x[c] = self._scale[c](x[c])

            # which values are missing?  transform() can't take NaN, so
            # zero those rows for now and blank their output afterwards.
            x_na = np.isnan(x.values).any(axis=1)
            x[x_na] = 0

            group_idx = groupby.groups[group]

            pca = self._pca[group]
            # pass a bare array, matching what the model was fitted on
            x_tf = pca.transform(x.values)
            x_tf[x_na] = np.nan

            for ci, c in enumerate(new_channels):
                new_experiment.data.loc[group_idx, c] = x_tf[:, ci]

        new_experiment.data.dropna(inplace=True)
        new_experiment.history.append(
            self.clone_traits(transient=lambda _: True))
        return new_experiment
Beispiel #11
0
class GaussianMixture1DOp(HasStrictTraits):
    """
    This module fits a Gaussian mixture model with a specified number of
    components to a channel.
    
    .. warning:: 
    
        :class:`GaussianMixture1DOp` is **DEPRECATED** and will be removed
        in a future release.  It doesn't correctly handle the case where an 
        event is present in more than one component.  Please use
        :class:`GaussianMixtureOp` instead!
    
    Creates a new categorical metadata variable named :attr:`name`, with possible
    values ``name_1`` .... ``name_n`` where ``n`` is the number of components.
    An event is assigned to ``name_i`` category if it falls within :attr:`sigma`
    standard deviations of the component's mean.  If that is true for multiple
    categories (or if :attr:`sigma` is ``0.0``), the event is assigned to the category 
    with the highest posterior probability.  If the event doesn't fall into
    any category, it is assigned to ``name_None``.
    
    As a special case, if :attr:`num_components` is `1` and :attr:`sigma` 
    ``> 0.0``, then the new condition is boolean, ``True`` if the event fell in 
    the gate and ``False`` otherwise.
    
    Optionally, if :attr:`posteriors` is ``True``, this module will also 
    compute the posterior probability of each event in its assigned component, 
    returning it in a new colunm named ``{Name}_Posterior``.
    
    Finally, the same mixture model (mean and standard deviation) may not
    be appropriate for every subset of the data.  If this is the case, you
    can use the :attr:`by` attribute to specify metadata by which to aggregate
    the data before estimating (and applying) a mixture.  The number of 
    components is the same across each subset, though.
    
    Attributes
    ----------
    name : Str
        The operation name; determines the name of the new metadata column
        
    channel : Str
        Which channel to apply the mixture model to.
        
    num_components : Int (default = 1)
        How many components to fit to the data?  Must be positive.

    sigma : Float (default = 0.0)
        How many standard deviations on either side of the mean to include
        in each category?  If an event is in multiple components, assign it
        to the component with the highest posterior probability.  If 
        `sigma == 0.0`, categorize *all* the data by assigning each event to
        the component with the highest posterior probability.  Must be >= 0.0.
    
    by : List(Str)
        A list of metadata attributes to aggregate the data before estimating
        the model.  For example, if the experiment has two pieces of metadata,
        `Time` and `Dox`, setting `by = ["Time", "Dox"]` will fit the model 
        separately to each subset of the data with a unique combination of
        `Time` and `Dox`.
        
    scale : Enum("linear", "log", "logicle") (default = "linear")
        Re-scale the data before fitting the model?  
        
    posteriors : Bool (default = False)
        If `True`, add a column named `{Name}_Posterior` giving the posterior
        probability that the event is in the component to which it was
        assigned.  Useful for filtering out low-probability events.
        
        
    Examples
    --------
    
    Make a little data set.
    
    .. plot::
        :context: close-figs
            
        >>> import cytoflow as flow
        >>> import_op = flow.ImportOp()
        >>> import_op.tubes = [flow.Tube(file = "Plate01/RFP_Well_A3.fcs",
        ...                              conditions = {'Dox' : 10.0}),
        ...                    flow.Tube(file = "Plate01/CFP_Well_A4.fcs",
        ...                              conditions = {'Dox' : 1.0})]
        >>> import_op.conditions = {'Dox' : 'float'}
        >>> ex = import_op.apply()
    
    Create and parameterize the operation.
    
    .. plot::
        :context: close-figs
        
        >>> gm_op = flow.GaussianMixture1DOp(name = 'GM',
        ...                                  channel = 'Y2-A',
        ...                                  scale = 'log',
        ...                                  num_components = 2)
        
    Estimate the clusters
    
    .. plot::
        :context: close-figs
        
        >>> gm_op.estimate(ex)
        
    Plot a diagnostic view
    
    .. plot::
        :context: close-figs
        
        >>> gm_op.default_view().plot(ex)

    Apply the gate
    
    .. plot::
        :context: close-figs
        
        >>> ex2 = gm_op.apply(ex)

    Plot a diagnostic view with the event assignments
    
    .. plot::
        :context: close-figs
        
        >>> gm_op.default_view().plot(ex2)

    """
    
    id = Constant('edu.mit.synbio.cytoflow.operations.gaussian_1d')
    friendly_id = Constant("1D Gaussian Mixture")
    
    name = CStr()
    channel = Str()
    num_components = util.PositiveInt(1)
    sigma = util.PositiveFloat(0.0, allow_zero = True)
    by = List(Str)
    scale = util.ScaleEnum
    posteriors = Bool(False)
    
    # the key is a set
    _gmms = Dict(Any, Instance(mixture.GaussianMixture), transient = True)
    _scale = Instance(util.IScale, transient = True)
    
    def estimate(self, experiment, subset = None):
        """
        Estimate the Gaussian mixture model parameters.
        
        Parameters
        ----------
        experiment : Experiment
            The data to use to estimate the mixture parameters
            
        subset : str (default = None)
            If set, a Python expression to determine the subset of the data
            to use to in the estimation.

        Raises
        ------
        util.CytoflowOpError
            If the experiment is missing, the channel or a ``by`` entry isn't
            in the experiment, posteriors are requested for a single
            component, ``subset`` is invalid or selects no events, a group
            has no data, or the estimator doesn't converge.
        """
        
        warn("GaussianMixture1DOp is DEPRECATED.  Please use GaussianMixtureOp.",
             util.CytoflowOpWarning)
        
        if experiment is None:
            raise util.CytoflowOpError('experiment', "No experiment specified")

        if self.channel not in experiment.data:
            raise util.CytoflowOpError('channel',
                                       "Column {0} not found in the experiment"
                                       .format(self.channel))
       
        for b in self.by:
            if b not in experiment.data:
                raise util.CytoflowOpError('by',
                                           "Aggregation metadata {} not found, "
                                           "must be one of {}"
                                           .format(b, experiment.conditions))
            
        if self.num_components == 1 and self.posteriors:
            raise util.CytoflowOpError('num_components',
                                       "If num_components == 1, all posteriors are 1.")
        
        if subset:
            try:
                experiment = experiment.query(subset)
            except Exception as e:
                raise util.CytoflowOpError('subset',
                                           "Subset string '{0}' isn't valid"
                                           .format(subset)) from e
                
            if len(experiment) == 0:
                raise util.CytoflowOpError('subset',
                                           "Subset string '{0}' returned no events"
                                           .format(subset))
                
        if self.by:
            by = sorted(self.by)
            groupby = experiment.data.groupby(by)
        else:
            # use a lambda expression to return a group that contains
            # all the events
            groupby = experiment.data.groupby(lambda _: True)
            
        # get the scale. estimate the scale params for the ENTIRE data set,
        # not subsets we get from groupby().  And we need to save it so that
        # the data is transformed the same way when we apply()
        self._scale = util.scale_factory(self.scale, experiment, channel = self.channel)
        
        gmms = {}
            
        for group, data_subset in groupby:
            if len(data_subset) == 0:
                raise util.CytoflowOpError(None, 
                                           "Group {} had no data".format(group))

            # work on a bare ndarray: 2-D indexing such as x[:, np.newaxis]
            # is not supported on a pandas Series in current pandas
            x = np.asarray(self._scale(data_subset[self.channel]))
            
            # drop data that isn't in the scale range
            x = x[~np.isnan(x)]
            
            gmm = mixture.GaussianMixture(n_components = self.num_components,
                                          random_state = 1)
            gmm.fit(x[:, np.newaxis])
            
            if not gmm.converged_:
                raise util.CytoflowOpError(None,
                                           "Estimator didn't converge"
                                           " for group {0}"
                                           .format(group))
                
            # to make sure we have a stable ordering, sort the components
            # by the means (so the first component has the lowest mean, 
            # the next component has the next-lowest, etc.)
            #
            # every per-component array must be permuted together: the
            # model's predict()/predict_proba() read precisions_cholesky_,
            # so leaving it unsorted would pair each mean with another
            # component's precision.
            
            sort_idx = np.argsort(gmm.means_[:, 0])
            gmm.means_ = gmm.means_[sort_idx]
            gmm.weights_ = gmm.weights_[sort_idx]
            gmm.covariances_ = gmm.covariances_[sort_idx]
            gmm.precisions_ = gmm.precisions_[sort_idx]
            gmm.precisions_cholesky_ = gmm.precisions_cholesky_[sort_idx]
           
            gmms[group] = gmm
            
        # commit only once every group has been fitted successfully
        self._gmms = gmms
    
    def apply(self, experiment):
        """
        Assigns new metadata to events using the mixture model estimated
        in :meth:`estimate`.
        
        Parameters
        ----------
        experiment : Experiment
            The experiment whose events are assigned to mixture components.
            It is not modified; a clone is returned.
        
        Returns
        -------
        Experiment
            A new :class:`.Experiment`, with a new column named :attr:`name`,
            and possibly one named :attr:`name` _Posterior.  Also the following
            new :attr:`~.Experiment.statistics`:
            
            - **mean** : Float
                the mean of the fitted gaussian
            
            - **stdev** : Float
                the inverse-scaled standard deviation of the fitted gaussian.  on a 
                linear scale, this is in the same units as the mean; on a log scale,
                this is a scalar multiple; and on a logicle scale, this is probably
                meaningless!
            
            - **interval** : (Float, Float)
                the inverse-scaled (mean - stdev, mean + stdev) of the fitted gaussian.
                this is likely more meaningful than ``stdev``, especially on the
                ``logicle`` scale.
            
            - **proportion** : Float
                the proportion of events in each component of the mixture model.  only
                set if :attr:`num_components` ``> 1``.
        
        Raises
        ------
        util.CytoflowOpError
            If ``experiment`` is ``None``, no model has been estimated, 
            :attr:`name` is empty / not a sanitized identifier / already a 
            column, the scale is missing, :attr:`channel` or a :attr:`by` 
            entry is not in the experiment, the posterior column already 
            exists, or :attr:`sigma` is negative.
        """
        
        warn("GaussianMixture1DOp is DEPRECATED.  Please use GaussianMixtureOp.",
             util.CytoflowOpWarning)
            
        if experiment is None:
            raise util.CytoflowOpError('experiment',
                                       "No experiment specified")

        if not self._gmms:
            raise util.CytoflowOpError(None,
                                       "No model found.  Did you forget to "
                                       "call estimate()?")
        
        # make sure name got set!
        if not self.name:
            raise util.CytoflowOpError('name',
                                       "You have to set the gate's name "
                                       "before applying it!")
            
        if self.name != util.sanitize_identifier(self.name):
            raise util.CytoflowOpError('name',
                                       "Name can only contain letters, numbers and underscores.")

        if self.name in experiment.data.columns:
            raise util.CytoflowOpError('name',
                                       "Experiment already has a column named {0}"
                                       .format(self.name))

        if not self._scale:
            raise util.CytoflowOpError(None,
                                       "Couldn't find _scale.  What happened??")

        if self.channel not in experiment.data:
            raise util.CytoflowOpError('channel',
                                       "Column {0} not found in the experiment"
                                       .format(self.channel))

        if self.posteriors:
            col_name = "{0}_Posterior".format(self.name)
            if col_name in experiment.data:
                raise util.CytoflowOpError('posteriors',
                                           "Column {0} already found in the experiment"
                                           .format(col_name))
       
        for b in self.by:
            if b not in experiment.data:
                raise util.CytoflowOpError('by',
                                           "Aggregation metadata {} not found, "
                                           "must be one of {}"
                                           .format(b, experiment.conditions))
                           
        if self.sigma < 0.0:
            raise util.CytoflowOpError('sigma',
                                       "sigma must be >= 0.0")

        if self.by:
            by = sorted(self.by)
            groupby = experiment.data.groupby(by)
        else:
            # use a lambda expression to return a group that
            # contains all the events
            groupby = experiment.data.groupby(lambda _: True)

        event_assignments = pd.Series([None] * len(experiment), dtype = "object")
                                      
        if self.posteriors:
            event_posteriors = pd.Series([0.0] * len(experiment))
            
        # what we DON'T want to do is iterate through event-by-event.
        # the more of this we can push into numpy, sklearn and pandas,
        # the faster it's going to be.
        
        for group, data_subset in groupby:
            
            # if there weren't any events in this group, there's no gmm
            if group not in self._gmms:
                warn("There wasn't a GMM for data subset {}".format(group),
                     util.CytoflowOpWarning)
                continue
            
            gmm = self._gmms[group]
            x = data_subset[self.channel]
            x = self._scale(x).values
                        
            # which values are missing?
            x_na = np.isnan(x)
            
            group_idx = groupby.groups[group]
            
            # make a preliminary assignment; -1 marks "no component", which
            # is what events with a missing (NaN) scaled value keep
            predicted = np.full(len(x), -1, "int")
            predicted[~x_na] = gmm.predict(x[~x_na, np.newaxis])
            
            # if we're doing sigma-based gating, for each component check
            # to see if the event is in the sigma gate.
            if self.sigma > 0.0:
                
                # make a quick dataframe with the value and the predicted
                # component
                gate_df = pd.DataFrame({"x" : x, "p" : predicted})

                # for each component, get the low and the high threshold
                for c in range(self.num_components):
                    lo = (gmm.means_[c][0]    # @UnusedVariable
                          - self.sigma * np.sqrt(gmm.covariances_[c][0]))
                    hi = (gmm.means_[c][0]    # @UnusedVariable
                          + self.sigma * np.sqrt(gmm.covariances_[c][0]))
                    
                    # and build an expression with numexpr so it evaluates fast!
                    gate_bool = gate_df.eval("p == @c and x >= @lo and x <= @hi").values
                    
                    # events predicted for this component but outside its
                    # gate are un-assigned
                    predicted[np.logical_and(predicted == c, ~gate_bool)] = -1
        
            predicted_str = pd.Series(["(none)"] * len(predicted))
            for c in range(self.num_components):
                predicted_str[predicted == c] = "{0}_{1}".format(self.name, c + 1)
            predicted_str[predicted == -1] = "{0}_None".format(self.name)
            predicted_str.index = group_idx

            event_assignments.iloc[group_idx] = predicted_str
                                
            if self.posteriors:
                probability = np.full((len(x), self.num_components), 0.0, "float")
                probability[~x_na, :] = gmm.predict_proba(x[~x_na, np.newaxis])
                posteriors = pd.Series([0.0] * len(predicted))
                for i in range(self.num_components):
                    posteriors[predicted == i] = probability[predicted == i, i]
                posteriors.index = group_idx
                event_posteriors.iloc[group_idx] = posteriors
                    
        new_experiment = experiment.clone()
        
        if self.num_components == 1 and self.sigma > 0:
            new_experiment.add_condition(self.name, "bool", event_assignments == "{0}_1".format(self.name))
        elif self.num_components > 1:
            new_experiment.add_condition(self.name, "category", event_assignments)
            
        if self.posteriors and self.num_components > 1:
            col_name = "{0}_Posterior".format(self.name)
            new_experiment.add_condition(col_name, "float", event_posteriors)

        # add the statistics
        levels = list(self.by)
        if self.num_components > 1:
            levels.append(self.name)
        
        if levels:     
            idx = pd.MultiIndex.from_product([new_experiment[x].unique() for x in levels], 
                                             names = levels)
    
            mean_stat = pd.Series(index = idx, dtype = np.dtype(object)).sort_index()
            stdev_stat = pd.Series(index = idx, dtype = np.dtype(object)).sort_index()
            interval_stat = pd.Series(index = idx, dtype = np.dtype(object)).sort_index()
            prop_stat = pd.Series(index = idx, dtype = np.dtype(object)).sort_index()     
                                   
            for group, _ in groupby:
                # groups without a model were skipped (with a warning) in the
                # assignment loop above; skip them here too instead of 
                # failing with a KeyError
                if group not in self._gmms:
                    continue
                
                gmm = self._gmms[group]
                for c in range(self.num_components):
                    if self.num_components > 1:
                        component_name = "{}_{}".format(self.name, c + 1)

                        # the statistic's key is the group key (if any) 
                        # followed by the component name
                        if group is True:
                            key_parts = []
                        elif isinstance(group, tuple):
                            key_parts = list(group)
                        else:
                            key_parts = [group]
                        
                        key_parts.append(component_name)
                        g = tuple(key_parts)
                    else:
                        g = group

                    mean_stat.at[g] = self._scale.inverse(gmm.means_[c][0])
                    stdev_stat.at[g] = self._scale.inverse(np.sqrt(gmm.covariances_[c][0]))[0]
                    interval_stat.at[g] = (self._scale.inverse(gmm.means_[c][0] - np.sqrt(gmm.covariances_[c][0][0])),
                                            self._scale.inverse(gmm.means_[c][0] + np.sqrt(gmm.covariances_[c][0][0])))
                    prop_stat.at[g] = gmm.weights_[c]
                     
            new_experiment.statistics[(self.name, "mean")] = pd.to_numeric(mean_stat)
            new_experiment.statistics[(self.name, "stdev")] = pd.to_numeric(stdev_stat)
            new_experiment.statistics[(self.name, "interval")] = interval_stat
            if self.num_components > 1:
                new_experiment.statistics[(self.name, "proportion")] = pd.to_numeric(prop_stat)
            
        new_experiment.history.append(self.clone_traits(transient = lambda _: True))
        return new_experiment
    
    def default_view(self, **kwargs):
        """
        Build the diagnostic view for this (deprecated) operation.
        
        Emits a :class:`util.CytoflowOpWarning` deprecation notice on every 
        call.
        
        Parameters
        ----------
        **kwargs
            Trait values applied to the new view through ``trait_set``.
        
        Returns
        -------
            IView : a :class:`GaussianMixture1DView` whose ``op`` is this 
            operation; call plot() to see the diagnostic plot.
        """
        deprecation_notice = "GaussianMixture1DOp is DEPRECATED.  Please use GaussianMixtureOp."
        warn(deprecation_notice, util.CytoflowOpWarning)
        
        # create the view bound to this op first, then apply the caller's traits
        view = GaussianMixture1DView(op = self)
        view.trait_set(**kwargs)
        return view
Beispiel #12
0
class ImportOp(HasStrictTraits):
    """
    An operation for importing data and making an `Experiment`.

    To use, set the `conditions` dict to a mapping between condition name and
    NumPy `dtype`.  Useful dtypes include `category`, `float`, `int`, `bool`.

    Next, set `tubes` to a list of `Tube` containing FCS filenames and the
    corresponding conditions.

    If you would rather not analyze every single event in every FCS file,
    set `coarse_events` to the number of events from each FCS file you want
    to load.

    Call `apply()` to load the data.

    Attributes
    ----------

    conditions : Dict(Str, Str)
        A dictionary mapping condition names (keys) to NumPy `dtype`s (values).
        Useful `dtype`s include "category", "float", "int", and "bool".

    tubes : List(Tube)
        A list of `Tube` instances, which map FCS files to their corresponding
        experimental conditions.  Each `Tube` must have a `conditions` dict
        whose keys match `self.conditions.keys()`.

    channels : List(Str)
        If you only need a subset of the channels available in the data set,
        specify them here.  If `channels` is empty, load all the channels in
        the FCS files.

    coarse_events : Int (default = 0)
        If > 0, import only a random subset of events of size `coarse_events`
        from each tube.  Presumably the analysis will go faster but less
        precisely; good for interactive data exploration.  Then, set
        `coarse_events = 0` and re-run the analysis non-interactively.

    name_metadata : Enum(None, "$PnN", "$PnS") (default = None)
        Which FCS metadata is the channel name?  If `None`, attempt to
        autodetect.

    ignore_v : List(Str)
        **Cytoflow** is designed to operate on an `Experiment` containing
        tubes that were all collected under the same instrument settings.
        In particular, the same PMT voltages ensure that data can be
        compared across samples.

        *Very rarely*, you may need to set up an Experiment with different
        voltage settings.  This is likely only to be the case when you are
        trying to figure out which voltages should be used in future
        experiments.  If so, set `ignore_v` to a List of channel names
        to ignore particular channels.  **BE WARNED - THIS WILL BREAK REAL
        EXPERIMENTS.**

    Examples
    --------
    >>> tube1 = flow.Tube(file = 'RFP_Well_A3.fcs', conditions = {"Dox" : 10.0})
    >>> tube2 = flow.Tube(file='CFP_Well_A4.fcs', conditions = {"Dox" : 1.0})
    >>> import_op = flow.ImportOp(conditions = {"Dox" : "float"},
    ...                           tubes = [tube1, tube2])
    >>> ex = import_op.apply()
    """

    id = Constant("edu.mit.synbio.cytoflow.operations.import")
    friendly_id = Constant("Import")
    name = Constant("Import Data")

    # experimental conditions: name --> dtype.
    conditions = Dict(Str, Str)

    # the tubes
    tubes = List(Tube)

    # which channels do we import?
    channels = List(Str)

    # which FCS metadata has the channel names in it?
    name_metadata = Enum(None, "$PnN", "$PnS")

    # are we subsetting?
    coarse_events = util.PositiveInt(0, allow_zero=True)

    # DON'T DO THIS
    ignore_v = List(Str)

    def apply(self, experiment=None):
        """
        Load the FCS files named in `tubes` into a new `Experiment`.

        Parameters
        ----------
        experiment : Experiment, optional
            Ignored; a fresh `Experiment` is always created.  The parameter
            exists only so the signature matches other operations.

        Returns
        -------
        Experiment
            A new experiment holding the selected channels of every tube,
            with each tube's events tagged by that tube's conditions.

        Raises
        ------
        util.CytoflowOpError
            If no tubes are specified, tubes have differing condition names,
            two tubes have equal conditions, the first tube's metadata
            can't be read, or a requested channel isn't in the first tube.
        """

        if not self.tubes:
            raise util.CytoflowOpError("Must specify some tubes!")

        # make sure each tube has the same conditions
        tube0_conditions = set(self.tubes[0].conditions)
        for tube in self.tubes:
            tube_conditions = set(tube.conditions)
            if tube0_conditions ^ tube_conditions:
                raise util.CytoflowOpError("Tube {0} didn't have the same "
                                           "conditions as tube {1}".format(
                                               tube.file, self.tubes[0].file))

        # make sure experimental conditions are unique
        for idx, i in enumerate(self.tubes[0:-1]):
            for j in self.tubes[idx + 1:]:
                if i.conditions_equal(j):
                    raise util.CytoflowOpError(
                        "The same conditions specified for "
                        "tube {0} and tube {1}".format(i.file, j.file))

        experiment = Experiment()

        experiment.metadata["ignore_v"] = self.ignore_v

        for condition, dtype in self.conditions.items():
            experiment.add_condition(condition, dtype)

        try:
            # silence warnings about duplicate channels;
            # we'll figure that out below
            with warnings.catch_warnings():
                warnings.simplefilter("ignore")
                tube0_meta = fcsparser.parse(self.tubes[0].file,
                                             meta_data_only=True,
                                             reformat_meta=True)
        except Exception as e:
            # chain the original exception so the parser's traceback survives
            raise util.CytoflowOpError(
                "FCS reader threw an error reading metadata "
                " for tube {0}: {1}".format(self.tubes[0].file, str(e))) from e

        meta_channels = tube0_meta["_channels_"]

        if self.name_metadata:
            experiment.metadata["name_metadata"] = self.name_metadata
        else:
            # try to autodetect the metadata
            if "$PnN" in meta_channels and "$PnS" not in meta_channels:
                experiment.metadata["name_metadata"] = "$PnN"
            elif "$PnN" not in meta_channels and "$PnS" in meta_channels:
                experiment.metadata["name_metadata"] = "$PnS"
            else:
                PnN = meta_channels["$PnN"]
                PnS = meta_channels["$PnS"]

                # sometimes one is unique and the other isn't
                if (len(set(PnN)) == len(PnN) and len(set(PnS)) != len(PnS)):
                    experiment.metadata["name_metadata"] = "$PnN"
                elif (len(set(PnN)) != len(PnN) and len(set(PnS)) == len(PnS)):
                    experiment.metadata["name_metadata"] = "$PnS"
                else:
                    # as per fcsparser.api, $PnN is the "short name" (like FL-1)
                    # and $PnS is the "actual name" (like "FSC-H").  so let's
                    # use $PnS.
                    experiment.metadata["name_metadata"] = "$PnS"

        # index the per-channel metadata by channel name so it can be
        # looked up by label below
        meta_channels.set_index(experiment.metadata["name_metadata"],
                                inplace=True)

        channels = self.channels or list(tube0_meta["_channel_names_"])

        # make sure everything in self.channels is in the tube channels

        for channel in channels:
            if channel not in meta_channels.index:
                raise util.CytoflowOpError(
                    "Channel {0} not in tube {1}".format(
                        channel, self.tubes[0].file))

        # now that we have the metadata, load it into experiment

        for channel in channels:
            experiment.add_channel(channel)

            # label-based row lookup; `.loc` replaces the `.ix` indexer,
            # which no longer exists in current pandas
            channel_meta = meta_channels.loc[channel]

            # keep track of the channel's PMT voltage
            if "$PnV" in channel_meta:
                v = channel_meta['$PnV']
                if v:
                    experiment.metadata[channel]["voltage"] = v

            # add the maximum possible value for this channel.
            data_range = float(channel_meta['$PnR'])
            experiment.metadata[channel]['range'] = data_range

        for tube in self.tubes:
            tube_data = parse_tube(tube.file, experiment)

            if self.coarse_events:
                if self.coarse_events <= len(tube_data):
                    # sample without replacement so no event is duplicated
                    tube_data = tube_data.loc[np.random.choice(
                        tube_data.index, self.coarse_events, replace=False)]
                else:
                    # fewer events than requested: keep them all, but say so
                    warnings.warn(
                        "Only {0} events in tube {1}".format(
                            len(tube_data), tube.file), util.CytoflowWarning)

            experiment.add_events(tube_data[channels], tube.conditions)

        return experiment
Beispiel #13
0
class GaussianMixture1DOp(HasStrictTraits):
    """
    This module fits a Gaussian mixture model with a specified number of
    components to a channel.
    
    Creates a new categorical metadata variable named `name`, with possible
    values `name_1` .... `name_n` where `n` is the number of components.
    An event is assigned to `name_i` category if it falls within `sigma`
    standard deviations of the component's mean.  If that is true for multiple
    categories (or if `sigma == 0.0`), the event is assigned to the category 
    with the highest posterior probability.  If the event doesn't fall into
    any category, it is assigned to `name_None`.
    
    As a special case, if `num_components` is `1` and `sigma` > 0.0, then
    the new condition is boolean, `True` if the event fell in the gate and
    `False` otherwise.
    
    Optionally, if `posteriors` is `True`, this module will also compute the 
    posterior probability of each event in its assigned component, returning
    it in a new column named `{Name}_Posterior`.
    
    Finally, the same mixture model (mean and standard deviation) may not
    be appropriate for every subset of the data.  If this is the case, you
    can use the `by` attribute to specify metadata by which to aggregate
    the data before estimating (and applying) a mixture.  The number of 
    components is the same across each subset, though.
    
    Attributes
    ----------
    name : Str
        The operation name; determines the name of the new metadata column
        
    channel : Str
        Which channel to apply the mixture model to.
        
    num_components : Int (default = 1)
        How many components to fit to the data?  Must be positive.

    sigma : Float (default = 0.0)
        How many standard deviations on either side of the mean to include
        in each category?  If an event is in multiple components, assign it
        to the component with the highest posterior probability.  If 
        `sigma == 0.0`, categorize *all* the data by assigning each event to
        the component with the highest posterior probability.  Must be >= 0.0.
    
    by : List(Str)
        A list of metadata attributes to aggregate the data before estimating
        the model.  For example, if the experiment has two pieces of metadata,
        `Time` and `Dox`, setting `by = ["Time", "Dox"]` will fit the model 
        separately to each subset of the data with a unique combination of
        `Time` and `Dox`.
        
    scale : Enum("linear", "log", "logicle") (default = "linear")
        Re-scale the data before fitting the data?  
        
    posteriors : Bool (default = False)
        If `True`, add a column named `{Name}_Posterior` giving the posterior
        probability that the event is in the component to which it was
        assigned.  Useful for filtering out low-probability events.
        
    Examples
    --------
    
    >>> gauss_op = GaussianMixture1DOp(name = "Gaussian",
    ...                                channel = "Y2-A",
    ...                                num_components = 2)
    >>> gauss_op.estimate(ex2)
    >>> gauss_op.default_view().plot(ex2)
    >>> ex3 = gauss_op.apply(ex2)
    """
    
    id = Constant('edu.mit.synbio.cytoflow.operations.gaussian_1d')
    friendly_id = Constant("1D Gaussian Mixture")
    
    name = CStr()
    channel = Str()
    num_components = util.PositiveInt(1)
    sigma = util.PositiveFloat(0.0, allow_zero = True)
    by = List(Str)
    scale = util.ScaleEnum
    posteriors = Bool(False)
    
    # the key is either a single value or a tuple
    _gmms = Dict(Any, Instance(mixture.GMM), transient = True)
    _scale = Instance(util.IScale, transient = True)
    
    def estimate(self, experiment, subset = None):
        """
        Estimate the Gaussian mixture model parameters
        
        Fits one model per group of `by` (or a single model if `by` is
        empty) and stores the models in `_gmms` and the scale in `_scale`.
        
        Parameters
        ----------
        experiment : Experiment
            The experiment to fit the model(s) to.
            
        subset : Str, optional
            If set, a query string passed to `experiment.query()`; only the
            matching events are used for the fit.
            
        Raises
        ------
        util.CytoflowOpError
            If the experiment is missing, `channel` or a `by` entry isn't
            in the experiment, a `by` entry has more than 100 unique values,
            `num_components == 1` with `sigma == 0.0`, a group has no data,
            or the estimator doesn't converge.
            
        util.CytoflowViewError
            If `subset` isn't a valid query or selects no events.
        """
        
        if not experiment:
            raise util.CytoflowOpError("No experiment specified")

        if self.channel not in experiment.data:
            raise util.CytoflowOpError("Column {0} not found in the experiment"
                                  .format(self.channel))
       
        for b in self.by:
            if b not in experiment.data:
                raise util.CytoflowOpError("Aggregation metadata {0} not found"
                                      " in the experiment"
                                      .format(b))
            if len(experiment.data[b].unique()) > 100: #WARNING - magic number
                raise util.CytoflowOpError("More than 100 unique values found for"
                                      " aggregation metadata {0}.  Did you"
                                      " accidentally specify a data channel?"
                                      .format(b))
                
            
        if self.num_components == 1 and self.sigma == 0.0:
            raise util.CytoflowOpError("If num_components == 1, sigma must be > 0")
        
        if subset:
            try:
                experiment = experiment.query(subset)
            except Exception as e:
                # not a bare except: KeyboardInterrupt / SystemExit must
                # propagate instead of being reported as a bad subset
                raise util.CytoflowViewError("Subset string '{0}' isn't valid"
                                        .format(subset)) from e
                
            if len(experiment) == 0:
                raise util.CytoflowViewError("Subset string '{0}' returned no events"
                                        .format(subset))
                
        if self.by:
            groupby = experiment.data.groupby(self.by)
        else:
            # use a lambda expression to return a group that contains
            # all the events
            groupby = experiment.data.groupby(lambda x: True)
            
        # get the scale. estimate the scale params for the ENTIRE data set,
        # not subsets we get from groupby().  And we need to save it so that
        # the data is transformed the same way when we apply()
        self._scale = util.scale_factory(self.scale, experiment, self.channel)
        
        gmms = {}
            
        for group, data_subset in groupby:
            if len(data_subset) == 0:
                raise util.CytoflowOpError("Group {} had no data"
                                           .format(group))
            x = data_subset[self.channel].reset_index(drop = True)
            x = self._scale(x)
            
            # drop data that isn't in the scale range
            x = x[~np.isnan(x)]
            
            # fixed random_state so repeated estimates give the same model
            gmm = mixture.GMM(n_components = self.num_components,
                              random_state = 1)
            gmm.fit(x[:, np.newaxis])
            
            if not gmm.converged_:
                raise util.CytoflowOpError("Estimator didn't converge"
                                      " for group {0}"
                                      .format(group))
                
            # to make sure we have a stable ordering, sort the components
            # by the means (so the first component has the lowest mean, 
            # the next component has the next-lowest, etc.)
            
            sort_idx = np.argsort(gmm.means_[:, 0])
            gmm.means_ = gmm.means_[sort_idx]
            gmm.weights_ = gmm.weights_[sort_idx]
            gmm.covars_ = gmm.covars_[sort_idx]
           
            gmms[group] = gmm
            
        self._gmms = gmms
    
    def apply(self, experiment):
        """
        Assigns new metadata to events using the mixture model estimated
        in `estimate`.
        
        Parameters
        ----------
        experiment : Experiment
            The experiment to gate.  It is not modified; a clone is returned.
            
        Returns
        -------
        Experiment
            A new experiment with a new condition named `name` (boolean if
            `num_components == 1`, categorical otherwise) and, if 
            `posteriors` is set, a float condition named `{name}_Posterior`.
            
        Raises
        ------
        util.CytoflowOpError
            If the experiment is missing, no model was estimated, `name` is
            unset or already a column, the scale is missing, `channel` or a
            `by` entry isn't in the experiment, `{name}_Posterior` already
            exists, `sigma` is negative, or a group has no estimated model.
        """
            
        if not experiment:
            raise util.CytoflowOpError("No experiment specified")

        if not self._gmms:
            raise util.CytoflowOpError("No model found.  Did you forget to "
                                       "call estimate()?")
        
        # make sure name got set!
        if not self.name:
            raise util.CytoflowOpError("You have to set the gate's name "
                                  "before applying it!")

        if self.name in experiment.data.columns:
            raise util.CytoflowOpError("Experiment already has a column named {0}"
                                  .format(self.name))

        if not self._scale:
            raise util.CytoflowOpError("Couldn't find _scale.  What happened??")

        if self.channel not in experiment.data:
            raise util.CytoflowOpError("Column {0} not found in the experiment"
                                  .format(self.channel))

        # this is checked whether or not `posteriors` is set, so a second
        # check under `if self.posteriors` would never fire
        posterior_name = "{0}_Posterior".format(self.name)
        if posterior_name in experiment.data:
            raise util.CytoflowOpError("Column {0} already found in the experiment"
                                  .format(posterior_name))
       
        for b in self.by:
            if b not in experiment.data:
                raise util.CytoflowOpError("Aggregation metadata {0} not found"
                                      " in the experiment"
                                      .format(b))

            if len(experiment.data[b].unique()) > 100: #WARNING - magic number
                raise util.CytoflowOpError("More than 100 unique values found for"
                                      " aggregation metadata {0}.  Did you"
                                      " accidentally specify a data channel?"
                                      .format(b))
                           
        if self.sigma < 0.0:
            raise util.CytoflowOpError("sigma must be >= 0.0")

        if self.by:
            groupby = experiment.data.groupby(self.by)
        else:
            # use a lambda expression to return a group that
            # contains all the events
            groupby = experiment.data.groupby(lambda x: True)

        # every group must have a model; after this loop, the lookups
        # below can't miss
        for group, _ in groupby:
            if group not in self._gmms:
                raise util.CytoflowOpError("Can't find group in model. "
                                           "Did you call estimate()?")

        event_assignments = pd.Series([None] * len(experiment), dtype = "object")
                                      
        if self.posteriors:
            event_posteriors = pd.Series([0.0] * len(experiment))
            
        # what we DON'T want to do is iterate through event-by-event.
        # the more of this we can push into numpy, sklearn and pandas,
        # the faster it's going to be.
        
        for group, data_subset in groupby:
            gmm = self._gmms[group]
            x = data_subset[self.channel]
            x = self._scale(x).values
            
            # which values are missing?
            x_na = np.isnan(x)
            
            group_idx = groupby.groups[group]
            
            # make a preliminary assignment; -1 marks "no component", which
            # is what events with a missing (NaN) scaled value keep
            predicted = np.full(len(x), -1, "int")
            predicted[~x_na] = gmm.predict(x[~x_na, np.newaxis])
            
            # if we're doing sigma-based gating, for each component check
            # to see if the event is in the sigma gate.
            if self.sigma > 0.0:
                
                # make a quick dataframe with the value and the predicted
                # component
                gate_df = pd.DataFrame({"x" : x, "p" : predicted})

                # for each component, get the low and the high threshold
                for c in range(self.num_components):
                    lo = (gmm.means_[c][0]    # @UnusedVariable
                          - self.sigma * np.sqrt(gmm.covars_[c][0]))
                    hi = (gmm.means_[c][0]    # @UnusedVariable
                          + self.sigma * np.sqrt(gmm.covars_[c][0]))
                    
                    # and build an expression with numexpr so it evaluates fast!
                    gate_bool = gate_df.eval("p == @c and x >= @lo and x <= @hi").values
                    
                    # events predicted for this component but outside its
                    # gate are un-assigned
                    predicted[np.logical_and(predicted == c, ~gate_bool)] = -1
        
            predicted_str = pd.Series(["(none)"] * len(predicted))
            for c in range(self.num_components):
                predicted_str[predicted == c] = "{0}_{1}".format(self.name, c + 1)
            predicted_str[predicted == -1] = "{0}_None".format(self.name)
            predicted_str.index = group_idx

            event_assignments.iloc[group_idx] = predicted_str
                                
            if self.posteriors:
                probability = np.full((len(x), self.num_components), 0.0, "float")
                probability[~x_na, :] = gmm.predict_proba(x[~x_na, np.newaxis])
                posteriors = pd.Series([0.0] * len(predicted))
                for i in range(self.num_components):
                    posteriors[predicted == i] = probability[predicted == i, i]
                posteriors.index = group_idx
                event_posteriors.iloc[group_idx] = posteriors
                    
        new_experiment = experiment.clone()
        
        if self.num_components == 1:
            new_experiment.add_condition(self.name, "bool", event_assignments == "{0}_1".format(self.name))
        else:
            new_experiment.add_condition(self.name, "category", event_assignments)
            
        if self.posteriors:
            col_name = "{0}_Posterior".format(self.name)
            new_experiment.add_condition(col_name, "float", event_posteriors)
            
        new_experiment.history.append(self.clone_traits(transient = lambda t: True))
        return new_experiment
    
    def default_view(self, **kwargs):
        """
        Returns a diagnostic plot of the Gaussian mixture model.
        
        Parameters
        ----------
        **kwargs
            Passed to the `GaussianMixture1DView` constructor.
        
        Returns
        -------
            IView : an IView, call plot() to see the diagnostic plot.
        """
        return GaussianMixture1DView(op = self, **kwargs)
Beispiel #14
0
class KMeansOp(HasStrictTraits):
    """
    This module uses a K-means clustering algorithm to cluster events.  
    
    Call `estimate()` to compute the cluster centroids.
      
    Calling `apply()` creates a new categorical metadata variable 
    named `name`, with possible values `{name}_1` .... `name_n` where `n` is 
    the number of clusters, specified with `num_clusters`.  Events whose
    scaled value is NaN in any of the channels are assigned to `{name}_None`.
    
    The same model may not be appropriate for different subsets of the data set.
    If this is the case, you can use the `by` attribute to specify metadata by 
    which to aggregate the data before estimating (and applying) a model.  The 
    number of clusters is the same across each subset, though.

    Attributes
    ----------
    name : Str
        The operation name; determines the name of the new metadata column
        
    channels : List(Str)
        The channels to apply the clustering algorithm to.

    scale : Dict(Str : Enum("linear", "logicle", "log"))
        Re-scale the data in the specified channels before fitting.  If a 
        channel is in `channels` but not in `scale`, the current package-wide
        default (set with `set_default_scale`) is used.

    num_clusters : Int (default = 2)
        How many clusters to fit to the data?  `estimate()` requires at
        least 2.
    
    by : List(Str)
        A list of metadata attributes to aggregate the data before estimating
        the model.  For example, if the experiment has two pieces of metadata,
        `Time` and `Dox`, setting `by = ["Time", "Dox"]` will fit the model 
        separately to each subset of the data with a unique combination of
        `Time` and `Dox`.
        
    Statistics
    ----------       
    centers : Float
        the location of each cluster's centroid in each channel
  
    Examples
    --------
    
    >>> clust_op = KMeansOp(name = "Clust",
    ...                         channels = ["V2-A", "Y2-A"],
    ...                         scale = {"V2-A" : "log"},
    ...                         num_clusters = 2)
    >>> clust_op.estimate(ex2)
    >>> clust_op.default_view(channels = ["V2-A", "Y2-A"]).plot(ex2)
    >>> ex3 = clust_op.apply(ex2)
    """

    id = Constant('edu.mit.synbio.cytoflow.operations.kmeans')
    friendly_id = Constant("KMeans Clustering")

    name = CStr()
    channels = List(Str)
    scale = Dict(Str, util.ScaleEnum)
    num_clusters = util.PositiveInt(allow_zero=False)
    by = List(Str)

    # fitted state, filled in by estimate(): one model per group key, and one
    # scale instance per channel
    _kmeans = Dict(Any,
                   Instance(sklearn.cluster.MiniBatchKMeans),
                   transient=True)
    _scale = Dict(Str, Instance(util.IScale), transient=True)

    def _stat_index_key(self, group, cluster):
        """
        Build the leading part of a statistics index key for one cluster.

        Parameters
        ----------
        group
            The group key produced by iterating the pandas groupby.
        cluster : int
            The 1-based cluster number.

        Returns
        -------
        list or tuple
            `[cluster]` when `by` is empty, otherwise the group key values
            followed by `cluster`.
        """
        if len(self.by) == 0:
            return [cluster]

        # a str (or bytes) group key is iterable too, but it is ONE value;
        # expanding it would split it into characters
        if hasattr(group, '__iter__') and not isinstance(group, (str, bytes)):
            return tuple(list(group) + [cluster])

        return tuple([group] + [cluster])

    def estimate(self, experiment, subset=None):
        """
        Estimate the K-means cluster centroids.

        One `MiniBatchKMeans` model is fit per group defined by `by` (or a
        single model for all events if `by` is empty), after re-scaling each
        channel.  Rows whose scaled value is NaN in any channel are dropped
        before fitting.

        Parameters
        ----------
        experiment
            The experiment to fit; its `data` attribute is the events table.

        subset : Str (default = None)
            If set, passed to `experiment.query()` and only the resulting
            events are used for fitting.

        Raises
        ------
        util.CytoflowOpError
            If the experiment is missing, `num_clusters < 2`, no channels are
            set, a channel or `by` entry is not in the experiment, a scale is
            set for an unknown channel, a `by` column has more than 100
            unique values, or a group has no data.

        util.CytoflowViewError
            If `subset` isn't valid or selects no events.
        """

        if experiment is None:
            raise util.CytoflowOpError("No experiment specified")

        if self.num_clusters < 2:
            raise util.CytoflowOpError("num_clusters must be >= 2")

        if len(self.channels) == 0:
            raise util.CytoflowOpError("Must set at least one channel")

        for c in self.channels:
            if c not in experiment.data:
                raise util.CytoflowOpError(
                    "Channel {0} not found in the experiment".format(c))

        for c in self.scale:
            if c not in self.channels:
                raise util.CytoflowOpError(
                    "Scale set for channel {0}, but it isn't "
                    "in the experiment".format(c))

        for b in self.by:
            if b not in experiment.data:
                raise util.CytoflowOpError("Aggregation metadata {0} not found"
                                           " in the experiment".format(b))
            if len(experiment.data[b].unique()) > 100:  #WARNING - magic number
                raise util.CytoflowOpError(
                    "More than 100 unique values found for"
                    " aggregation metadata {0}.  Did you"
                    " accidentally specify a data channel?".format(b))

        if subset:
            try:
                experiment = experiment.query(subset)
            except Exception as e:
                # the exception type is kept as-is for callers that catch it;
                # `Exception` (not a bare except) lets KeyboardInterrupt and
                # SystemExit propagate
                raise util.CytoflowViewError(
                    "Subset string '{0}' isn't valid".format(subset)) from e

            if len(experiment) == 0:
                raise util.CytoflowViewError(
                    "Subset string '{0}' returned no events".format(subset))

        if self.by:
            groupby = experiment.data.groupby(self.by)
        else:
            # use a lambda expression to return a group that contains
            # all the events
            groupby = experiment.data.groupby(lambda _: True)

        # get the scale. estimate the scale params for the ENTIRE data set,
        # not subsets we get from groupby().  And we need to save it so that
        # the data is transformed the same way when we apply()
        for c in self.channels:
            if c in self.scale:
                self._scale[c] = util.scale_factory(self.scale[c],
                                                    experiment,
                                                    channel=c)
            else:
                self._scale[c] = util.scale_factory(util.get_default_scale(),
                                                    experiment,
                                                    channel=c)

        # collect the fitted models locally and publish them only once every
        # group has been fit, so a failure part-way through (or a re-estimate
        # with different groups) never leaves stale or partial models behind
        fitted = {}

        for group, data_subset in groupby:
            if len(data_subset) == 0:
                raise util.CytoflowOpError(
                    "Group {} had no data".format(group))
            x = data_subset.loc[:, self.channels[:]]
            for c in self.channels:
                x[c] = self._scale[c](x[c])

            # drop data that isn't in the scale range
            for c in self.channels:
                x = x[~(np.isnan(x[c]))]
            x = x.values

            kmeans = sklearn.cluster.MiniBatchKMeans(
                n_clusters=self.num_clusters)
            kmeans.fit(x)

            fitted[group] = kmeans

        self._kmeans = fitted

    def apply(self, experiment):
        """
        Apply the KMeans clustering to the data.

        Parameters
        ----------
        experiment
            The experiment to which the clustering is applied.

        Returns
        -------
        A clone of `experiment` with a new "category" condition named `name`,
        a `(name, "centers")` statistic holding the centroids (transformed
        back with each channel's inverse scale), and a copy of this operation
        appended to its history.

        Raises
        ------
        util.CytoflowOpError
            If the experiment or `name` is missing, the experiment already
            has a column named `name`, no channels are set, a channel or `by`
            entry is not in the experiment, a scale is set for an unknown
            channel, a `by` column has more than 100 unique values,
            `estimate()` has not been called, a group has no data, or a group
            has no fitted model.
        """

        if experiment is None:
            raise util.CytoflowOpError("No experiment specified")

        # make sure name got set!
        if not self.name:
            raise util.CytoflowOpError("You have to set the gate's name "
                                       "before applying it!")

        if self.name in experiment.data.columns:
            raise util.CytoflowOpError(
                "Experiment already has a column named {0}".format(self.name))

        if len(self.channels) == 0:
            raise util.CytoflowOpError("Must set at least one channel")

        # without a fitted model the lookups below would fail with a bare
        # KeyError; say what actually went wrong instead
        if not self._kmeans:
            raise util.CytoflowOpError(
                "No clusters found.  Did you forget to "
                "call estimate()?")

        for c in self.channels:
            if c not in experiment.data:
                raise util.CytoflowOpError(
                    "Channel {0} not found in the experiment".format(c))

        for c in self.scale:
            if c not in self.channels:
                raise util.CytoflowOpError(
                    "Scale set for channel {0}, but it isn't "
                    "in the experiment".format(c))

        for b in self.by:
            if b not in experiment.data:
                raise util.CytoflowOpError("Aggregation metadata {0} not found"
                                           " in the experiment".format(b))
            if len(experiment.data[b].unique()) > 100:  #WARNING - magic number
                raise util.CytoflowOpError(
                    "More than 100 unique values found for"
                    " aggregation metadata {0}.  Did you"
                    " accidentally specify a data channel?".format(b))

        if self.by:
            groupby = experiment.data.groupby(self.by)
        else:
            # use a lambda expression to return a group that contains
            # all the events
            groupby = experiment.data.groupby(lambda _: True)

        event_assignments = pd.Series(["{}_None".format(self.name)] *
                                      len(experiment),
                                      dtype="object")

        # make the statistics
        clusters = [x + 1 for x in range(self.num_clusters)]

        idx = pd.MultiIndex.from_product(
            [experiment[x].unique()
             for x in self.by] + [clusters] + [self.channels],
            names=list(self.by) + ["Cluster"] + ["Channel"])
        centers_stat = pd.Series(index=idx,
                                 dtype=np.dtype(object)).sort_index()

        for group, data_subset in groupby:
            if len(data_subset) == 0:
                raise util.CytoflowOpError(
                    "Group {} had no data".format(group))

            if group not in self._kmeans:
                raise util.CytoflowOpError(
                    "Group {} not found in the estimated model.  Do you "
                    "need to re-run estimate()?".format(group))

            x = data_subset.loc[:, self.channels[:]]
            for c in self.channels:
                x[c] = self._scale[c](x[c])

            # which values are missing?

            x_na = pd.Series([False] * len(x))
            for c in self.channels:
                x_na[np.isnan(x[c]).values] = True

            x = x.values
            x_na = x_na.values
            group_idx = groupby.groups[group]

            kmeans = self._kmeans[group]

            # -1 marks events that could not be scaled; they are never
            # passed to the model
            predicted = np.full(len(x), -1, "int")
            predicted[~x_na] = kmeans.predict(x[~x_na])

            predicted_str = pd.Series(["(none)"] * len(predicted))
            for c in range(0, self.num_clusters):
                predicted_str[predicted == c] = "{0}_{1}".format(
                    self.name, c + 1)
            predicted_str[predicted == -1] = "{0}_None".format(self.name)
            predicted_str.index = group_idx

            event_assignments.iloc[group_idx] = predicted_str

            for c in range(self.num_clusters):
                g = self._stat_index_key(group, c + 1)

                for cidx1, channel1 in enumerate(self.channels):
                    g2 = tuple(list(g) + [channel1])
                    # report centroids in the channel's original units
                    centers_stat.loc[g2] = self._scale[channel1].inverse(
                        kmeans.cluster_centers_[c, cidx1])

        new_experiment = experiment.clone()
        new_experiment.add_condition(self.name, "category", event_assignments)

        new_experiment.statistics[(self.name,
                                   "centers")] = pd.to_numeric(centers_stat)

        new_experiment.history.append(
            self.clone_traits(transient=lambda _: True))
        return new_experiment

    def default_view(self, **kwargs):
        """
        Returns a diagnostic plot of the K-means clustering.

        Parameters
        ----------
        channels : List(Str), optional
            One or two of the operation's channels to plot.  Defaults to the
            operation's `channels`.

        scale : Dict(Str : scale name), optional
            Per-channel plot scales.  Defaults to the operation's `scale`;
            channels without an entry use the package-wide default scale.

        **kwargs
            Remaining keyword arguments are passed to the view constructor.
         
        Returns
        -------
            IView : an IView, call plot() to see the diagnostic plot.

        Raises
        ------
        util.CytoflowViewError
            If a requested channel or scale key isn't one of the operation's
            channels, or if zero or more than two channels are requested.
        """
        channels = kwargs.pop('channels', self.channels)

        # work on a copy: filling in defaults below must not modify the
        # operation's own `scale` trait (or the caller's dict)
        scale = dict(kwargs.pop('scale', self.scale))

        for c in channels:
            if c not in self.channels:
                raise util.CytoflowViewError(
                    "Channel {} isn't in the operation's channels".format(c))

        for s in scale:
            if s not in self.channels:
                raise util.CytoflowViewError(
                    "Channel {} isn't in the operation's channels".format(s))

        for c in channels:
            if c not in scale:
                scale[c] = util.get_default_scale()

        if len(channels) == 0:
            raise util.CytoflowViewError(
                "Must specify at least one channel for a default view")
        elif len(channels) == 1:
            return KMeans1DView(op=self,
                                channel=channels[0],
                                scale=scale[channels[0]],
                                **kwargs)
        elif len(channels) == 2:
            return KMeans2DView(op=self,
                                xchannel=channels[0],
                                ychannel=channels[1],
                                xscale=scale[channels[0]],
                                yscale=scale[channels[1]],
                                **kwargs)
        else:
            raise util.CytoflowViewError(
                "Can't specify more than two channels for a default view")
Beispiel #15
0
class GaussianMixtureOp(HasStrictTraits):
    r"""
    This module fits a Gaussian mixture model with a specified number of
    components to one or more channels.
    
    If `num_components > 1`, `apply()` creates a new categorical metadata 
    variable named  `name`, with possible values `{name}_1` .... `name_n` 
    where `n` is the number of components.  An event is assigned to `name_i` 
    category if it has the highest posterior probability of having been 
    produced by component `i`.  If an event has a value that is outside the
    range of one of the channels' scales, then it is assigned to `{name}_None`.
    
    Optionally, if `sigma` is greater than 0, `apply()` creates new  `boolean` 
    metadata variables named `{name}_1` ... `{name}_n` where `n` is the number 
    of components.  The column `{name}_i` is `True` if the event is less than 
    `sigma` standard deviations from the mean of component `i`.  If 
    `num_components == 1`, `sigma` must be greater than 0.
    
    Optionally, if `posteriors` is `True`, `apply()` creates a new `double`
    metadata variables named `{name}_1_posterior` ... `{name}_n_posterior` 
    where `n` is the number of components.  The column `{name}_i_posterior`
    contains the posterior probability that this event is a member of 
    component `i`.
    
    Finally, the same mixture model (mean and standard deviation) may not
    be appropriate for every subset of the data.  If this is the case, you
    can use the `by` attribute to specify metadata by which to aggregate
    the data before estimating (and applying) a mixture model.  The number of 
    components must be the same across each subset, though.
    
    Attributes
    ----------
    name : Str
        The operation name; determines the name of the new metadata column
        
    channels : List(Str)
        The channels to apply the mixture model to.

    scale : Dict(Str : Enum("linear", "logicle", "log"))
        Re-scale the data in the specified channels before fitting.  If a 
        channel is in `channels` but not in `scale`, the current package-wide
        default (set with `set_default_scale`) is used.

    num_components : Int (default = 1)
        How many components to fit to the data?  Must be a positive integer.

    sigma : Float (default = 0.0)
        How many standard deviations on either side of the mean to include
        in the boolean variable `{name}_i`?  Must be >= 0.0.  If 
        `num_components == 1`, must be > 0.
    
    by : List(Str)
        A list of metadata attributes to aggregate the data before estimating
        the model.  For example, if the experiment has two pieces of metadata,
        `Time` and `Dox`, setting `by = ["Time", "Dox"]` will fit the model 
        separately to each subset of the data with a unique combination of
        `Time` and `Dox`.

    posteriors : Bool (default = False)
        If `True`, add columns named `{name}_{i}_posterior` giving the posterior
        probability that the event is in component `i`.  Useful for filtering 
        out low-probability events.
        
    Statistics
    ----------
    mean : Float
        the mean of the fitted gaussian in each channel for each component.
        
    sigma : (Float, Float)
        the locations the mean +/- one standard deviation in each channel
        for each component.
        
    correlation : Float
        the correlation coefficient between each pair of channels for each
        component.
        
    proportion : Float
        the proportion of events in each component of the mixture model.  only
        added if `num_components` > 1.
        
    Notes
    -----
    
    We use the Mahalanobis distance as a multivariate generalization of 
    the number of standard deviations an event is from the mean of the
    multivariate gaussian.  If \vec{x} is an observation from a distribution
    with mean \vec{mu} and S is the covariance matrix, then the Mahalanobis
    distance is sqrt((x - mu)^T * S^-1 *(x - mu)).
    
    Examples
    --------
    
    >>> gauss_op = GaussianMixtureOp(name = "Gaussian",
    ...                              channels = ["V2-A", "Y2-A"],
    ...                              scale = {"V2-A" : "log"},
    ...                              num_components = 2)
    >>> gauss_op.estimate(ex2)
    >>> gauss_op.default_view(channels = ["V2-A", "Y2-A"]).plot(ex2)
    >>> ex3 = gauss_op.apply(ex2)
    """

    id = Constant('edu.mit.synbio.cytoflow.operations.gaussian')
    friendly_id = Constant("Gaussian Mixture")

    name = CStr()
    channels = List(Str)
    scale = Dict(Str, util.ScaleEnum)
    num_components = util.PositiveInt(allow_zero=False)
    sigma = util.PositiveFloat(allow_zero=True)
    by = List(Str)

    posteriors = Bool(False)

    # the key is either a single value or a tuple
    _gmms = Dict(Any,
                 Instance(sklearn.mixture.GaussianMixture),
                 transient=True)
    _scale = Dict(Str, Instance(util.IScale), transient=True)

    def _stat_index_key(self, group, component):
        """
        Build the leading part of a statistics index key for one component.

        Parameters
        ----------
        group
            The group key produced by iterating the pandas groupby.
        component : int
            The 1-based component number.

        Returns
        -------
        list or tuple
            `[component]` when `by` is empty, otherwise the group key values
            followed by `component`.
        """
        if len(self.by) == 0:
            return [component]

        # a str (or bytes) group key is iterable too, but it is ONE value;
        # expanding it would split it into characters
        if hasattr(group, '__iter__') and not isinstance(group, (str, bytes)):
            return tuple(list(group) + [component])

        return tuple([group] + [component])

    def estimate(self, experiment, subset=None):
        """
        Estimate the Gaussian mixture model parameters.

        One full-covariance `GaussianMixture` is fit per group defined by
        `by` (or a single model for all events if `by` is empty), after
        re-scaling each channel.  Rows whose scaled value is NaN in any
        channel are dropped before fitting.  Components are then re-ordered
        by the norm of their means.

        Parameters
        ----------
        experiment
            The experiment to fit; its `data` attribute is the events table.

        subset : Str (default = None)
            If set, passed to `experiment.query()` and only the resulting
            events are used for fitting.

        Raises
        ------
        util.CytoflowOpError
            If the experiment is missing, no channels are set, a channel or
            `by` entry is not in the experiment, a scale is set for an
            unknown channel, a `by` column has more than 100 unique values,
            a group has no data, or a fit does not converge.

        util.CytoflowViewError
            If `subset` isn't valid or selects no events.
        """

        if experiment is None:
            raise util.CytoflowOpError("No experiment specified")

        if len(self.channels) == 0:
            raise util.CytoflowOpError("Must set at least one channel")

        for c in self.channels:
            if c not in experiment.data:
                raise util.CytoflowOpError(
                    "Channel {0} not found in the experiment".format(c))

        for c in self.scale:
            if c not in self.channels:
                raise util.CytoflowOpError(
                    "Scale set for channel {0}, but it isn't "
                    "in the experiment".format(c))

        for b in self.by:
            if b not in experiment.data:
                raise util.CytoflowOpError("Aggregation metadata {0} not found"
                                           " in the experiment".format(b))
            if len(experiment.data[b].unique()) > 100:  #WARNING - magic number
                raise util.CytoflowOpError(
                    "More than 100 unique values found for"
                    " aggregation metadata {0}.  Did you"
                    " accidentally specify a data channel?".format(b))

        if subset:
            try:
                experiment = experiment.query(subset)
            except Exception as e:
                # the exception type is kept as-is for callers that catch it;
                # `Exception` (not a bare except) lets KeyboardInterrupt and
                # SystemExit propagate
                raise util.CytoflowViewError(
                    "Subset string '{0}' isn't valid".format(subset)) from e

            if len(experiment) == 0:
                raise util.CytoflowViewError(
                    "Subset string '{0}' returned no events".format(subset))

        if self.by:
            groupby = experiment.data.groupby(self.by)
        else:
            # use a lambda expression to return a group that contains
            # all the events
            groupby = experiment.data.groupby(lambda _: True)

        # get the scale. estimate the scale params for the ENTIRE data set,
        # not subsets we get from groupby().  And we need to save it so that
        # the data is transformed the same way when we apply()
        for c in self.channels:
            if c in self.scale:
                self._scale[c] = util.scale_factory(self.scale[c],
                                                    experiment,
                                                    channel=c)
            else:
                self._scale[c] = util.scale_factory(util.get_default_scale(),
                                                    experiment,
                                                    channel=c)

        gmms = {}

        for group, data_subset in groupby:
            if len(data_subset) == 0:
                raise util.CytoflowOpError(
                    "Group {} had no data".format(group))
            x = data_subset.loc[:, self.channels[:]]
            for c in self.channels:
                x[c] = self._scale[c](x[c])

            # drop data that isn't in the scale range
            for c in self.channels:
                x = x[~(np.isnan(x[c]))]
            x = x.values

            gmm = sklearn.mixture.GaussianMixture(
                n_components=self.num_components,
                covariance_type="full",
                random_state=1)
            gmm.fit(x)

            if not gmm.converged_:
                raise util.CytoflowOpError("Estimator didn't converge"
                                           " for group {0}".format(group))

            # in the 1D version, we sorted the components by the means -- so
            # the first component has the lowest mean, the second component
            # has the next-lowest mean, etc.

            # that doesn't work in the general case.  instead, we assume that
            # the clusters are likely (?) to be arranged along *one* of the
            # axes, so we take the |norm| of the mean of each cluster and
            # sort that way.

            norms = np.sum(gmm.means_**2, axis=1)**0.5
            sort_idx = np.argsort(norms)
            gmm.means_ = gmm.means_[sort_idx]
            gmm.weights_ = gmm.weights_[sort_idx]
            gmm.covariances_ = gmm.covariances_[sort_idx]

            # predict() and predict_proba() score events with the precision
            # matrices, not covariances_, so those must be re-ordered in
            # lockstep or each component is evaluated with another
            # component's shape
            gmm.precisions_ = gmm.precisions_[sort_idx]
            gmm.precisions_cholesky_ = gmm.precisions_cholesky_[sort_idx]

            gmms[group] = gmm

        self._gmms = gmms

    def apply(self, experiment):
        """
        Assigns new metadata to events using the mixture model estimated
        in `estimate`.

        Parameters
        ----------
        experiment
            The experiment to which the mixture model is applied.

        Returns
        -------
        A clone of `experiment` with the new conditions described in the
        class documentation, the `mean`, `sigma`, `correlation` and
        `proportion` statistics, and a copy of this operation appended to
        its history.

        Raises
        ------
        util.CytoflowOpError
            If the experiment or `name` is missing, no channels are set, a
            column this operation would create already exists, `estimate()`
            has not been called, a channel or `by` entry is not in the
            experiment, a `by` column has more than 100 unique values, or
            `num_components == 1` is combined with `sigma == 0` or with
            `posteriors`.
        """

        if experiment is None:
            raise util.CytoflowOpError("No experiment specified")

        if len(self.channels) == 0:
            raise util.CytoflowOpError("Must set at least one channel")

        # make sure name got set!
        if not self.name:
            raise util.CytoflowOpError("You have to set the gate's name "
                                       "before applying it!")

        if self.num_components > 1 and self.name in experiment.data.columns:
            raise util.CytoflowOpError(
                "Experiment already has a column named {0}".format(self.name))

        if self.sigma > 0:
            for i in range(1, self.num_components + 1):
                cname = "{}_{}".format(self.name, i)
                if cname in experiment.data.columns:
                    raise util.CytoflowOpError(
                        "Experiment already has a column named {}".format(
                            cname))

        if self.posteriors:
            for i in range(1, self.num_components + 1):
                cname = "{}_{}_posterior".format(self.name, i)
                if cname in experiment.data.columns:
                    raise util.CytoflowOpError(
                        "Experiment already has a column named {}".format(
                            cname))

        if not self._gmms:
            raise util.CytoflowOpError(
                "No components found.  Did you forget to "
                "call estimate()?")

        for c in self.channels:
            if c not in experiment.channels:
                raise util.CytoflowOpError(
                    "Channel {0} not found in the experiment".format(c))

        if self.posteriors:
            col_name = "{0}_Posterior".format(self.name)
            if col_name in experiment.data:
                raise util.CytoflowOpError(
                    "Column {0} already found in the experiment".format(
                        col_name))

        for b in self.by:
            if b not in experiment.data:
                raise util.CytoflowOpError("Aggregation metadata {0} not found"
                                           " in the experiment".format(b))

            if len(experiment.data[b].unique()) > 100:  #WARNING - magic number
                raise util.CytoflowOpError(
                    "More than 100 unique values found for"
                    " aggregation metadata {0}.  Did you"
                    " accidentally specify a data channel?".format(b))

        if self.num_components == 1 and self.sigma == 0.0:
            raise util.CytoflowOpError(
                "if num_components is 1, sigma must be > 0.0")

        if self.num_components == 1 and self.posteriors:
            raise util.CytoflowOpError(
                "If num_components == 1, all posteriors will be 1.")

        if self.num_components > 1:
            event_assignments = pd.Series(["{}_None".format(self.name)] *
                                          len(experiment),
                                          dtype="object")

        if self.sigma > 0:
            event_gate = {
                i: pd.Series([False] * len(experiment), dtype="double")
                for i in range(self.num_components)
            }

            # come up with a threshold based on sigma.  you'll note we
            # never sqrt the distance below: that's because for a
            # multivariate Gaussian, the square of the Mahalanobis distance
            # is chi-square distributed.  (with one degree of freedom this
            # threshold works out to sigma ** 2.)  it depends only on sigma,
            # so compute it once instead of per group and per component.
            p_inside = (scipy.stats.norm.cdf(self.sigma) - 0.5) * 2
            thresh = scipy.stats.chi2.ppf(p_inside, 1)

        if self.posteriors:
            event_posteriors = {
                i: pd.Series([0.0] * len(experiment), dtype="double")
                for i in range(self.num_components)
            }

        if self.by:
            groupby = experiment.data.groupby(self.by)
        else:
            # use a lambda expression to return a group that
            # contains all the events
            groupby = experiment.data.groupby(lambda _: True)

        # make the statistics
        components = [x + 1 for x in range(self.num_components)]

        prop_idx = pd.MultiIndex.from_product(
            [experiment[x].unique() for x in self.by] + [components],
            names=list(self.by) + ["Component"])
        prop_stat = pd.Series(index=prop_idx,
                              dtype=np.dtype(object)).sort_index()

        mean_idx = pd.MultiIndex.from_product(
            [experiment[x].unique()
             for x in self.by] + [components] + [self.channels],
            names=list(self.by) + ["Component"] + ["Channel"])
        mean_stat = pd.Series(index=mean_idx,
                              dtype=np.dtype(object)).sort_index()
        sigma_stat = pd.Series(index=mean_idx,
                               dtype=np.dtype(object)).sort_index()

        corr_idx = pd.MultiIndex.from_product(
            [experiment[x].unique() for x in self.by] + [components] +
            [self.channels] + [self.channels],
            names=list(self.by) + ["Component"] + ["Channel_1"] +
            ["Channel_2"])
        corr_stat = pd.Series(index=corr_idx,
                              dtype=np.dtype(object)).sort_index()

        for group, data_subset in groupby:
            if group not in self._gmms:
                # there weren't any events in this group, so we didn't get
                # a gmm.
                continue

            gmm = self._gmms[group]
            x = data_subset.loc[:, self.channels[:]]
            for c in self.channels:
                x[c] = self._scale[c](x[c])

            # which values are missing?

            x_na = pd.Series([False] * len(x))
            for c in self.channels:
                x_na[np.isnan(x[c]).values] = True

            x = x.values
            x_na = x_na.values
            group_idx = groupby.groups[group]

            if self.num_components > 1:
                # -1 marks events that could not be scaled; they are never
                # passed to the model
                predicted = np.full(len(x), -1, "int")
                predicted[~x_na] = gmm.predict(x[~x_na])

                predicted_str = pd.Series(["(none)"] * len(predicted))
                for c in range(0, self.num_components):
                    predicted_str[predicted == c] = "{0}_{1}".format(
                        self.name, c + 1)
                predicted_str[predicted == -1] = "{0}_None".format(self.name)
                predicted_str.index = group_idx

                event_assignments.iloc[group_idx] = predicted_str

            # if we're doing sigma-based gating, for each component check
            # to see if the event is in the sigma gate.
            if self.sigma > 0.0:
                for c in range(self.num_components):
                    s = np.linalg.pinv(gmm.covariances_[c])
                    mu = gmm.means_[c]

                    # compute the squared Mahalanobis distance
                    # (x - mu)^T * S^-1 * (x - mu) for every event at once.
                    # rows containing NaN yield NaN, which compares False
                    # below, so unscalable events fall outside every gate.
                    delta = x - mu
                    dist = np.einsum("ij,jk,ik->i", delta, s, delta)

                    event_gate[c].iloc[group_idx] = np.less_equal(dist, thresh)

            if self.posteriors:
                # predict_proba() returns one row per event and one column
                # per component.  only score the events that could be
                # scaled; the rest keep a posterior of 0.0
                post = np.full((len(x), self.num_components), 0.0)
                post[~x_na] = gmm.predict_proba(x[~x_na])
                for c in range(self.num_components):
                    event_posteriors[c].iloc[group_idx] = post[:, c]

            for c in range(self.num_components):
                g = self._stat_index_key(group, c + 1)

                prop_stat.loc[g] = gmm.weights_[c]

                # the same for every channel of this component, so compute
                # it once per component
                std, corr = util.cov2corr(gmm.covariances_[c])

                for cidx1, channel1 in enumerate(self.channels):
                    g2 = tuple(list(g) + [channel1])

                    # report statistics in the channel's original units
                    mean_stat.loc[g2] = self._scale[channel1].inverse(
                        gmm.means_[c, cidx1])

                    sigma_stat.loc[g2] = (
                        self._scale[channel1].inverse(gmm.means_[c, cidx1] -
                                                      std[cidx1]),
                        self._scale[channel1].inverse(gmm.means_[c, cidx1] +
                                                      std[cidx1]))

                    for cidx2, channel2 in enumerate(self.channels):
                        g3 = tuple(list(g2) + [channel2])
                        corr_stat[g3] = corr[cidx1, cidx2]

                    # a channel's correlation with itself isn't reported
                    corr_stat.drop(tuple(list(g2) + [channel1]), inplace=True)

        new_experiment = experiment.clone()

        if self.num_components > 1:
            new_experiment.add_condition(self.name, "category",
                                         event_assignments)

        if self.sigma > 0:
            for c in range(self.num_components):
                gate_name = "{}_{}".format(self.name, c + 1)
                new_experiment.add_condition(gate_name, "bool", event_gate[c])

        if self.posteriors:
            for c in range(self.num_components):
                post_name = "{}_{}_posterior".format(self.name, c + 1)
                new_experiment.add_condition(post_name, "double",
                                             event_posteriors[c])

        new_experiment.statistics[(self.name,
                                   "mean")] = pd.to_numeric(mean_stat)
        new_experiment.statistics[(self.name, "sigma")] = sigma_stat
        if len(corr_stat) > 0:
            new_experiment.statistics[(
                self.name, "correlation")] = pd.to_numeric(corr_stat)
        if self.num_components > 1:
            new_experiment.statistics[(
                self.name, "proportion")] = pd.to_numeric(prop_stat)

        new_experiment.history.append(
            self.clone_traits(transient=lambda _: True))
        return new_experiment

    def default_view(self, **kwargs):
        """
        Returns a diagnostic plot of the Gaussian mixture model.

        Parameters
        ----------
        channels : List(Str), optional
            One or two of the operation's channels to plot.  Defaults to the
            operation's `channels`.

        scale : Dict(Str : scale name), optional
            Per-channel plot scales.  Defaults to the operation's `scale`;
            channels without an entry use the package-wide default scale.

        **kwargs
            Remaining keyword arguments are passed to the view constructor.
         
        Returns
        -------
            IView : an IView, call plot() to see the diagnostic plot.

        Raises
        ------
        util.CytoflowViewError
            If a requested channel or scale key isn't one of the operation's
            channels, or if zero or more than two channels are requested.
        """
        channels = kwargs.pop('channels', self.channels)

        # work on a copy: filling in defaults below must not modify the
        # operation's own `scale` trait (or the caller's dict)
        scale = dict(kwargs.pop('scale', self.scale))

        for c in channels:
            if c not in self.channels:
                raise util.CytoflowViewError(
                    "Channel {} isn't in the operation's channels".format(c))

        for s in scale:
            if s not in self.channels:
                raise util.CytoflowViewError(
                    "Channel {} isn't in the operation's channels".format(s))

        for c in channels:
            if c not in scale:
                scale[c] = util.get_default_scale()

        if len(channels) == 0:
            raise util.CytoflowViewError(
                "Must specify at least one channel for a default view")
        elif len(channels) == 1:
            return GaussianMixture1DView(op=self,
                                         channel=channels[0],
                                         scale=scale[channels[0]],
                                         **kwargs)
        elif len(channels) == 2:
            return GaussianMixture2DView(op=self,
                                         xchannel=channels[0],
                                         ychannel=channels[1],
                                         xscale=scale[channels[0]],
                                         yscale=scale[channels[1]],
                                         **kwargs)
        else:
            raise util.CytoflowViewError(
                "Can't specify more than two channels for a default view")
Beispiel #16
0
class BinningOp(HasStrictTraits):
    """
    Bin data along an axis.
    
    This operation creates equally spaced bins (in linear or log space)
    along an axis and adds a condition assigning each event to a bin.  The
    value of the event's condition is the left end of the bin's interval in
    which the event is located.
    
    Attributes
    ----------
    name : Str
        The operation name.  Used to name the new metadata field in the
        experiment that's created by apply()
        
    channel : Str
        The name of the channel along which to bin.

    scale : {"linear", "log", "logicle"}
        Make the bins equidistant along what scale?
        
    num_bins : Int
        The number of bins to make.  Must set either :attr:`num_bins` or 
        :attr:`bin_width`. If both are defined, :attr:`num_bins` takes precedence.
        
    bin_width : Float
        The width of the bins.  Must set either :attr:`num_bins` or :attr:`bin_width`.  If
        :attr:`scale` is ``log``, :attr:`bin_width` is in log-10 units; if :attr:`scale` is
        ``logicle``, an error is thrown because the units are ill-defined.
        If both :attr:`num_bins` and :attr:`bin_width` are defined, :attr:`num_bins` takes 
        precedence. 
        
    bin_count_name : Str
        If :attr:`bin_count_name` is set, :meth:`apply` adds another column to 
        the resulting :class:`Experiment` that contains the number of events in 
        the bin that this event falls in.  Useful for filtering bins by number of events.
        
    Examples
    --------
    Create a small experiment:
    
    .. plot::
        :context: close-figs
    
        >>> import cytoflow as flow
        >>> import_op = flow.ImportOp()
        >>> import_op.tubes = [flow.Tube(file = "tasbe/rby.fcs")]
        >>> ex = import_op.apply()
    
    Create and parameterize the operation
    
    .. plot::
        :context: close-figs

        >>> bin_op = flow.BinningOp()
        >>> bin_op.name = "Bin"
        >>> bin_op.channel = "FITC-A"
        >>> bin_op.scale = "log"
        >>> bin_op.bin_width = 0.2
    
    Apply the operation to the experiment
    
    .. plot::
        :context: close-figs 
    
        >>> ex2 = bin_op.apply(ex)
    
    Plot the result
    
    .. plot::
        :context: close-figs

        >>> bin_op.default_view().plot(ex2)  

    """

    # traits
    id = Constant('edu.mit.synbio.cytoflow.operations.binning')
    friendly_id = Constant("Binning")

    name = CStr()
    bin_count_name = CStr()
    channel = Str()
    num_bins = util.PositiveInt(0, allow_zero=True)
    bin_width = util.PositiveFloat(0, allow_zero=True)
    scale = util.ScaleEnum

    # apply() refuses to create more bins than this
    _max_num_bins = Int(100)

    def apply(self, experiment):
        """
        Applies the binning to an experiment.
        
        Parameters
        ----------
        experiment : Experiment
            the old_experiment to which this op is applied
            
        Returns
        -------
        Experiment
            A new experiment with a condition column named :attr:`name`, which
            contains the location of the left-most edge of the bin that the
            event is in.  If :attr:`bin_count_name` is set, another column
            is added with that name as well, containing the number of events
            in the same bin as the event.

        Raises
        ------
        util.CytoflowOpError
            If the experiment is missing; if :attr:`name` or :attr:`channel`
            is unset; if :attr:`name` or :attr:`bin_count_name` already
            names a column; if :attr:`channel` isn't in the experiment; if
            neither :attr:`num_bins` nor :attr:`bin_width` is set; if
            :attr:`bin_width` is used with a scale other than ``linear`` or
            ``log``; or if the bin count exceeds ``_max_num_bins`` or yields
            fewer than two bin edges.

        """
        if experiment is None:
            raise util.CytoflowOpError('experiment', "no experiment specified")

        if not self.name:
            raise util.CytoflowOpError('name', "Name is not set")

        if self.name in experiment.data.columns:
            raise util.CytoflowOpError(
                'name',
                "Name {} is in the experiment already".format(self.name))

        if self.bin_count_name and self.bin_count_name in experiment.data.columns:
            raise util.CytoflowOpError(
                'bin_count_name',
                "bin_count_name {} is in the experiment already".format(
                    self.bin_count_name))

        if not self.channel:
            raise util.CytoflowOpError('channel', "channel is not set")

        if self.channel not in experiment.data.columns:
            raise util.CytoflowOpError(
                'channel',
                "channel {} isn't in the experiment".format(self.channel))

        if not self.num_bins and not self.bin_width:
            raise util.CytoflowOpError('num_bins',
                                       "must set either bin number or width")

        if self.bin_width \
           and not (self.scale == "linear" or self.scale == "log"):
            raise util.CytoflowOpError(
                'scale', "Can only use bin_width with linear or log scale")

        scale = util.scale_factory(self.scale,
                                   experiment,
                                   channel=self.channel)
        scaled_data = scale(experiment.data[self.channel])

        # NaN-aware extrema of the scaled data
        scaled_min = bn.nanmin(scaled_data)
        scaled_max = bn.nanmax(scaled_data)

        # np.linspace() requires an integer sample count.  A count derived
        # from bin_width is a float, so truncate it explicitly.
        num_bins = self.num_bins if self.num_bins else \
                   int((scaled_max - scaled_min) / self.bin_width)

        if num_bins > self._max_num_bins:
            raise util.CytoflowOpError(
                None, "Too many bins! To increase this limit, "
                "change _max_num_bins (currently {})".format(
                    self._max_num_bins))

        # evenly spaced bin edges, in scaled space
        scaled_bins = np.linspace(start=scaled_min,
                                  stop=scaled_max,
                                  num=num_bins)

        if len(scaled_bins) < 2:
            raise util.CytoflowOpError('num_bins',
                                       "Must have more than one bin")

        # put the data in bins.  only the interior edges are used, so every
        # index is a valid position in the edge array (the left edge).
        bin_idx = np.digitize(scaled_data, scaled_bins[1:-1])

        # now, back into data space
        bins = scale.inverse(scaled_bins)

        new_experiment = experiment.clone()
        new_experiment.add_condition(self.name, "float", bins[bin_idx])

        # keep track of the bins we used, for prettier plotting later.
        new_experiment.metadata[self.name]["bin_scale"] = self.scale
        new_experiment.metadata[self.name]["bins"] = bins

        if self.bin_count_name:
            # TODO - this is a HUGE memory hog?!
            # TODO - fix this, then turn it on by default
            agg_count = new_experiment.data.groupby(self.name).count()
            agg_count = agg_count[agg_count.columns[0]]

            # have to make the condition a float64, because if we're in log
            # space there may be events that have NaN as the bin number.

            new_experiment.add_condition(
                self.bin_count_name, "float64",
                new_experiment[self.name].map(agg_count))

        new_experiment.history.append(
            self.clone_traits(transient=lambda _: True))
        return new_experiment

    def default_view(self, **kwargs):
        """
        Returns a diagnostic plot to check the binning.

        Parameters
        ----------
        **kwargs
            Forwarded to the :class:`BinningView` constructor.
        
        Returns
        -------
        IView
            An view instance, call :meth:`plot()` to plot the bins.
        """
        return BinningView(op=self, **kwargs)
Beispiel #17
0
class ImportOp(HasStrictTraits):
    """
    Build an :class:`.Experiment` from a collection of FCS files.
    
    Usage, in order:
    
    1. Set :attr:`conditions` to a dict mapping each condition name to a 
       NumPy ``dtype``.  Useful dtypes include ``category``, ``float``, 
       ``int`` and ``bool``.
    
    2. Set :attr:`tubes` to a list of :class:`Tube`, each naming an FCS file
       and the condition values that go with it.
    
    3. Optionally set :attr:`events` to the number of events to load from
       each FCS file, if you would rather not analyze every single event.
    
    4. Call :meth:`apply` to load the data.  The usual ``experiment`` 
       parameter can be ``None``.
    
    Attributes
    ----------
    conditions : Dict(Str, Str)
        Maps condition names (keys) to NumPy ``dtype`` names (values).
        Useful ``dtype`` names include ``category``, ``float``, ``int``, and 
        ``bool``.
        
    tubes : List(Tube)
        The :class:`Tube` instances that map FCS files to their experimental
        conditions.  Each :class:`Tube` must have a :attr:`~Tube.conditions`
        dict whose keys match those of :attr:`conditions`.
        
    channels : Dict(Str, Str)
        The subset of channels to import, if you don't need all of them.
        Each ``(key, value)`` pair names one channel to include: the key is
        the channel name in the FCS file and the value is its name in the 
        Experiment.  This lets you rename channels on import (flow channel 
        names are frequently not terribly informative.)  New channel names 
        must be valid Python identifiers: start with a letter or ``_``, and 
        contain only letters, numbers or ``_``.  If :attr:`channels` is 
        empty, all channels in the FCS files are loaded.
        
    events : Int (default = 0)
        If ``> 0``, import only a random subset of :attr:`events` events
        from each tube.  Presumably the analysis will go faster but less 
        precisely; good for interactive data exploration.  Then, unset 
        :attr:`events` and re-run the analysis non-interactively.
        
    name_metadata : {None, "$PnN", "$PnS"} (default = None)
        Which FCS metadata field holds the channel name?  If ``None``, 
        attempt to autodetect.
        
    ignore_v : List(Str)
        :class:`cytoflow` is designed to operate on an :class:`.Experiment` 
        whose tubes were all collected under the same instrument settings.
        In particular, identical PMT voltages ensure that data can be
        compared across samples.
        
        *Very rarely*, you may need an :class:`.Experiment` with different 
        voltage settings on different :class:`Tube` instances.  This is 
        likely only the case when you are trying to work out which voltages 
        to use in future experiments.  If so, set :attr:`ignore_v` to a 
        :class:`List` of channel names to ignore particular channels.  
        
        .. warning::
        
            THIS WILL BREAK REAL EXPERIMENTS
        
    Examples
    --------
    >>> tube1 = flow.Tube(file = 'RFP_Well_A3.fcs', conditions = {"Dox" : 10.0})
    >>> tube2 = flow.Tube(file='CFP_Well_A4.fcs', conditions = {"Dox" : 1.0})
    >>> import_op = flow.ImportOp(conditions = {"Dox" : "float"},
    ...                           tubes = [tube1, tube2])
    >>> ex = import_op.apply()
    """

    id = Constant("edu.mit.synbio.cytoflow.operations.import")
    friendly_id = Constant("Import")
    name = Constant("Import Data")

    # condition name --> dtype name
    conditions = Dict(Str, Str)

    # one Tube per FCS file
    tubes = List(Tube)

    # FCS channel name --> experiment channel name; empty means "all of them"
    channels = Dict(Str, Str)

    # FCS field that holds the channel names; None means autodetect
    name_metadata = Enum(None, "$PnN", "$PnS")

    # number of events to sample from each tube; 0 means load everything
    events = util.PositiveInt(0, allow_zero=True)
    coarse_events = util.Deprecated(new='events')

    # channels to ignore voltage differences on -- see the warning above
    ignore_v = List(Str)

    def apply(self, experiment=None):
        """
        Load a new :class:`.Experiment`.  
        
        Parameters
        ----------
        experiment : Experiment, optional
            Ignored; a fresh :class:`.Experiment` is always created.
        
        Returns
        -------
        Experiment
            The new :class:`.Experiment`.  New channels have the following
            metadata:
            
            - **voltage** - int
                The voltage that this channel was collected at.  Determined
                by the ``$PnV`` field from the first FCS file.
                
            - **range** - int
                The maximum range of this channel.  Determined by the ``$PnR``
                field from the first FCS file.
                
            New experimental conditions carry neither **voltage** nor 
            **range**; instead they have **experiment** set to ``True``, to 
            tell the experimental variables apart from conditions added 
            later by gates, etc.
            
            :attr:`ignore_v` is stored as a key in the 
            :class:`.Experiment`-wide metadata.
            
        Raises
        ------
        util.CytoflowOpError
            If no tubes are given, a renamed channel isn't a valid Python
            identifier, tubes disagree on their condition names, two tubes
            share the same conditions, the first tube's metadata can't be
            read, or a requested channel isn't in the first tube.
        """

        if not self.tubes:
            raise util.CytoflowOpError('tubes', "Must specify some tubes!")

        # a channel that is actually renamed must get a valid Python
        # identifier as its new name
        for fcs_name, exp_name in self.channels.items():
            if fcs_name == exp_name:
                continue
            if exp_name != util.sanitize_identifier(exp_name):
                raise util.CytoflowOpError(
                    'channels',
                    "Channel name {} must be a valid Python identifier."
                    .format(exp_name))

        # every tube must use the same condition names as the first tube
        first_tube = self.tubes[0]
        reference_conditions = set(first_tube.conditions)
        for tube in self.tubes:
            if set(tube.conditions) != reference_conditions:
                raise util.CytoflowOpError(
                    'tubes',
                    "Tube {0} didn't have the same conditions as tube {1}"
                    .format(tube.file, first_tube.file))

        # no two tubes may have identical conditions
        for pos, tube_a in enumerate(self.tubes):
            for tube_b in self.tubes[pos + 1:]:
                if tube_a.conditions_equal(tube_b):
                    raise util.CytoflowOpError(
                        'tubes',
                        "The same conditions specified for tube {0} and tube {1}"
                        .format(tube_a.file, tube_b.file))

        experiment = Experiment()
        experiment.metadata["ignore_v"] = self.ignore_v

        for cond_name, cond_dtype in self.conditions.items():
            experiment.add_condition(cond_name, cond_dtype)
            experiment.metadata[cond_name]['experiment'] = True

        try:
            # duplicate-channel warnings are suppressed here; channel names
            # are sorted out further down
            with warnings.catch_warnings():
                warnings.simplefilter("ignore")
                tube0_meta = fcsparser.parse(first_tube.file,
                                             meta_data_only=True,
                                             reformat_meta=True)
        except Exception as e:
            raise util.CytoflowOpError(
                'tubes',
                "FCS reader threw an error reading metadata for tube {}"
                .format(first_tube.file)) from e

        meta_channels = tube0_meta["_channels_"]

        if self.name_metadata:
            name_field = self.name_metadata
        else:
            # autodetect which field holds the channel names
            has_pnn = "$PnN" in meta_channels
            has_pns = "$PnS" in meta_channels

            if has_pnn and not has_pns:
                name_field = "$PnN"
            elif has_pns and not has_pnn:
                name_field = "$PnS"
            else:
                pnn = meta_channels["$PnN"]
                pns = meta_channels["$PnS"]
                pnn_unique = len(set(pnn)) == len(pnn)
                pns_unique = len(set(pns)) == len(pns)

                # prefer $PnN only when it is unique and $PnS is not.
                # otherwise use $PnS: as per fcsparser.api, $PnN is the
                # "short name" (like FL-1) and $PnS is the "actual name"
                # (like "FSC-H").
                if pnn_unique and not pns_unique:
                    name_field = "$PnN"
                else:
                    name_field = "$PnS"

        experiment.metadata["name_metadata"] = name_field

        meta_channels.set_index(experiment.metadata["name_metadata"],
                                inplace=True)

        if self.channels:
            channels = list(self.channels)
        else:
            channels = list(tube0_meta["_channel_names_"])

        # every channel we're about to load has to exist in the first tube
        for channel in channels:
            if channel not in meta_channels.index:
                raise util.CytoflowOpError(
                    'channels',
                    "Channel {0} not in tube {1}"
                    .format(channel, first_tube.file))

        # register the channels and their per-channel metadata
        for channel in channels:
            experiment.add_channel(channel)
            channel_info = meta_channels.loc[channel]
            channel_meta = experiment.metadata[channel]

            channel_meta["fcs_name"] = channel

            # remember the PMT voltage, when the file reports one
            if "$PnV" in channel_info:
                voltage = channel_info['$PnV']
                if voltage:
                    channel_meta["voltage"] = voltage

            # the maximum possible value for this channel
            channel_meta['range'] = float(channel_info['$PnR'])

        experiment.metadata['fcs_metadata'] = {}
        for tube in self.tubes:
            tube_meta, tube_data = parse_tube(tube.file, experiment)

            if self.events and self.events <= len(tube_data):
                # random subsample, without replacement
                chosen = np.random.choice(tube_data.index,
                                          self.events,
                                          replace=False)
                tube_data = tube_data.loc[chosen]
            elif self.events:
                warnings.warn(
                    "Only {0} events in tube {1}".format(len(tube_data),
                                                         tube.file),
                    util.CytoflowWarning)

            experiment.add_events(tube_data[channels], tube.conditions)
            experiment.metadata['fcs_metadata'][tube.file] = tube_meta

        # finally, apply the requested channel renames
        for channel, new_name in self.channels.items():
            if channel == new_name:
                continue

            experiment.data.rename(columns={channel: new_name},
                                   inplace=True)
            experiment.metadata[new_name] = experiment.metadata[channel]
            experiment.metadata[new_name]["fcs_name"] = channel
            del experiment.metadata[channel]

        return experiment
Beispiel #18
0
class KMeansOp(HasStrictTraits):
    """
    Use a K-means clustering algorithm to cluster events.  
    
    Call :meth:`estimate` to compute the cluster centroids.
      
    Calling :meth:`apply` creates a new categorical metadata variable 
    named :attr:`name`, with possible values ``{name}_1`` ... ``{name}_n`` 
    where ``n`` is the number of clusters, specified with 
    :attr:`num_clusters`.  Events whose scaled value is NaN in any of the
    channels are assigned ``{name}_None``.
    
    The same model may not be appropriate for different subsets of the data set.
    If this is the case, you can use the :attr:`by` attribute to specify 
    metadata by which to aggregate the data before estimating (and applying) a 
    model.  The  number of clusters is the same across each subset, though.

    Attributes
    ----------
    name : Str
        The operation name; determines the name of the new metadata column
        
    channels : List(Str)
        The channels to apply the clustering algorithm to.

    scale : Dict(Str : {"linear", "logicle", "log"})
        Re-scale the data in the specified channels before fitting.  If a 
        channel is in :attr:`channels` but not in :attr:`scale`, the current 
        package-wide default (set with :func:`.set_default_scale`) is used.

    num_clusters : Int (default = 2)
        How many components to fit to the data?  Must be a positive integer.
    
    by : List(Str)
        A list of metadata attributes to aggregate the data before estimating
        the model.  For example, if the experiment has two pieces of metadata,
        ``Time`` and ``Dox``, setting :attr:`by` to ``["Time", "Dox"]`` will 
        fit the model separately to each subset of the data with a unique 
        combination of ``Time`` and ``Dox``.
  
    
    Examples
    --------
    
    .. plot::
        :context: close-figs
        
        Make a little data set.
    
        >>> import cytoflow as flow
        >>> import_op = flow.ImportOp()
        >>> import_op.tubes = [flow.Tube(file = "Plate01/RFP_Well_A3.fcs",
        ...                              conditions = {'Dox' : 10.0}),
        ...                    flow.Tube(file = "Plate01/CFP_Well_A4.fcs",
        ...                              conditions = {'Dox' : 1.0})]
        >>> import_op.conditions = {'Dox' : 'float'}
        >>> ex = import_op.apply()
    
    Create and parameterize the operation.
    
    .. plot::
        :context: close-figs
        
        >>> km_op = flow.KMeansOp(name = 'KMeans',
        ...                       channels = ['V2-A', 'Y2-A'],
        ...                       scale = {'V2-A' : 'log',
        ...                                'Y2-A' : 'log'},
        ...                       num_clusters = 2)
        
    Estimate the clusters
    
    .. plot::
        :context: close-figs
        
        >>> km_op.estimate(ex)
        
    Plot a diagnostic view
    
    .. plot::
        :context: close-figs
        
        >>> km_op.default_view().plot(ex)

    Apply the gate
    
    .. plot::
        :context: close-figs
        
        >>> ex2 = km_op.apply(ex)

    Plot a diagnostic view with the event assignments
    
    .. plot::
        :context: close-figs
        
        >>> km_op.default_view().plot(ex2)
    """
    
    id = Constant('edu.mit.synbio.cytoflow.operations.kmeans')
    friendly_id = Constant("KMeans Clustering")
    
    name = CStr()
    channels = List(Str)
    scale = Dict(Str, util.ScaleEnum)
    num_clusters = util.PositiveInt(allow_zero = False)
    by = List(Str)
    
    # fitted models, keyed by group; filled in by estimate()
    _kmeans = Dict(Any, Instance(sklearn.cluster.MiniBatchKMeans), transient = True)
    # per-channel scale instances chosen by estimate(), reused by apply()
    _scale = Dict(Str, Instance(util.IScale), transient = True)
    
    def estimate(self, experiment, subset = None):
        """
        Estimate the k-means clusters
        
        Parameters
        ----------
        experiment : Experiment
            The :class:`.Experiment` to use to estimate the k-means clusters
            
        subset : str (default = None)
            A Python expression that specifies a subset of the data in 
            ``experiment`` to use to parameterize the operation.

        Raises
        ------
        util.CytoflowOpError
            If the experiment is missing, :attr:`num_clusters` is less than 
            2, no channels are set, a channel or :attr:`by` entry isn't in 
            the experiment, a :attr:`scale` key isn't in :attr:`channels`, 
            ``subset`` is invalid or selects no events, or a group has no 
            data.
        """

        if experiment is None:
            raise util.CytoflowOpError('experiment',
                                       "No experiment specified")
        
        if self.num_clusters < 2:
            raise util.CytoflowOpError('num_clusters',
                                       "num_clusters must be >= 2")
        
        if len(self.channels) == 0:
            raise util.CytoflowOpError('channels',
                                       "Must set at least one channel")

        for c in self.channels:
            if c not in experiment.data:
                raise util.CytoflowOpError('channels',
                                           "Channel {0} not found in the experiment"
                                      .format(c))
                
        for c in self.scale:
            if c not in self.channels:
                raise util.CytoflowOpError('scale',
                                           "Scale set for channel {0}, but it isn't "
                                           "in the experiment"
                                           .format(c))
       
        for b in self.by:
            if b not in experiment.data:
                raise util.CytoflowOpError('by',
                                           "Aggregation metadata {} not found, "
                                           "must be one of {}"
                                           .format(b, experiment.conditions))

        if subset:
            try:
                experiment = experiment.query(subset)
            except Exception as e:
                # catch Exception, not everything: a bare except would also
                # swallow KeyboardInterrupt and SystemExit
                raise util.CytoflowOpError('subset',
                                           "Subset string '{0}' isn't valid"
                                           .format(subset)) from e
                
            if len(experiment) == 0:
                raise util.CytoflowOpError('subset',
                                           "Subset string '{0}' returned no events"
                                           .format(subset))
                
        if self.by:
            groupby = experiment.data.groupby(self.by)
        else:
            # use a lambda expression to return a group that contains
            # all the events
            groupby = experiment.data.groupby(lambda _: True)
            
        # get the scale. estimate the scale params for the ENTIRE data set,
        # not subsets we get from groupby().  And we need to save it so that
        # the data is transformed the same way when we apply()
        for c in self.channels:
            if c in self.scale:
                self._scale[c] = util.scale_factory(self.scale[c], experiment, channel = c)
            else:
                self._scale[c] = util.scale_factory(util.get_default_scale(), experiment, channel = c)
        
        # fit into a fresh dict, so that models for groups left over from an
        # earlier estimate() can't survive and be silently reused by apply()
        kmeans = {}
        for group, data_subset in groupby:
            if len(data_subset) == 0:
                raise util.CytoflowOpError('by',
                                           "Group {} had no data"
                                           .format(group))
            x = data_subset.loc[:, self.channels[:]]
            for c in self.channels:
                x[c] = self._scale[c](x[c])
            
            # drop data that isn't in the scale range
            for c in self.channels:
                x = x[~(np.isnan(x[c]))]
            x = x.values
            
            kmeans[group] = model = \
                sklearn.cluster.MiniBatchKMeans(n_clusters = self.num_clusters,
                                                random_state = 0)
            
            model.fit(x)
            
        # only replace the old model once every group has been fit
        self._kmeans = kmeans
         
    def apply(self, experiment):
        """
        Apply the KMeans clustering to the data.
        
        Parameters
        ----------
        experiment : Experiment
            The :class:`.Experiment` to apply the clustering to.
        
        Returns
        -------
        Experiment
            a new Experiment with one additional :attr:`~Experiment.condition` 
            named :attr:`name`, of type ``category``.  The new category has 
            values  ``name_1, name_2, etc`` to indicate which k-means cluster 
            an event is a member of.
            
            The new :class:`.Experiment` also has one new statistic called
            ``centers``, which is a list of tuples encoding the centroids of each
            k-means cluster.

        Raises
        ------
        util.CytoflowOpError
            If the experiment is missing, :attr:`name` is unset, invalid or 
            already a column, :meth:`estimate` hasn't produced a model, no 
            channels are set, a channel or :attr:`by` entry isn't in the 
            experiment, a :attr:`scale` key isn't in :attr:`channels`, or a 
            group is empty or has no estimated model.
        """
 
        if experiment is None:
            raise util.CytoflowOpError('experiment',
                                       "No experiment specified")
         
        # make sure name got set!
        if not self.name:
            raise util.CytoflowOpError('name',
                                       "You have to set the gate's name "
                                       "before applying it!")
            
        if self.name != util.sanitize_identifier(self.name):
            raise util.CytoflowOpError('name',
                                       "Name can only contain letters, numbers and underscores.") 
         
        if self.name in experiment.data.columns:
            raise util.CytoflowOpError('name',
                                       "Experiment already has a column named {0}"
                                       .format(self.name))
            
        if not self._kmeans:
            raise util.CytoflowOpError(None, 
                                       "No components found.  Did you forget to "
                                       "call estimate()?")
         
        if len(self.channels) == 0:
            raise util.CytoflowOpError('channels',
                                       "Must set at least one channel")
 
        for c in self.channels:
            if c not in experiment.data:
                raise util.CytoflowOpError('channels',
                                           "Channel {0} not found in the experiment"
                                      .format(c))
                 
        for c in self.scale:
            if c not in self.channels:
                raise util.CytoflowOpError('scale',
                                           "Scale set for channel {0}, but it isn't "
                                           "in the experiment"
                                           .format(c))
        
        for b in self.by:
            if b not in experiment.data:
                raise util.CytoflowOpError('by',
                                           "Aggregation metadata {} not found, "
                                           "must be one of {}"
                                           .format(b, experiment.conditions))
        
                 
        if self.by:
            groupby = experiment.data.groupby(self.by)
        else:
            # use a lambda expression to return a group that contains
            # all the events
            groupby = experiment.data.groupby(lambda _: True)
                 
        # every event starts out unassigned
        event_assignments = pd.Series(["{}_None".format(self.name)] * len(experiment), dtype = "object")
         
        # make the statistics       
        clusters = [x + 1 for x in range(self.num_clusters)]
          
        idx = pd.MultiIndex.from_product([experiment[x].unique() for x in self.by] + [clusters] + [self.channels], 
                                         names = list(self.by) + ["Cluster"] + ["Channel"])
        centers_stat = pd.Series(index = idx, dtype = np.dtype(object)).sort_index()
                     
        for group, data_subset in groupby:
            if len(data_subset) == 0:
                raise util.CytoflowOpError('by',
                                           "Group {} had no data"
                                           .format(group))
            
            if group not in self._kmeans:
                raise util.CytoflowOpError('by',
                                           "Group {} not found in the estimated model. "
                                           "Do you need to re-run estimate()?"
                                           .format(group))    
            
            x = data_subset.loc[:, self.channels[:]]
            for c in self.channels:
                x[c] = self._scale[c](x[c])
                 
            # which values are missing?
 
            x_na = pd.Series([False] * len(x))
            for c in self.channels:
                x_na[np.isnan(x[c]).values] = True
                         
            x = x.values
            x_na = x_na.values
            group_idx = groupby.groups[group]
            
            kmeans = self._kmeans[group]
  
            # -1 marks events that can't be assigned to a cluster
            predicted = np.full(len(x), -1, "int")
            predicted[~x_na] = kmeans.predict(x[~x_na])
                 
            predicted_str = pd.Series(["(none)"] * len(predicted))
            for c in range(0, self.num_clusters):
                predicted_str[predicted == c] = "{0}_{1}".format(self.name, c + 1)
            predicted_str[predicted == -1] = "{0}_None".format(self.name)
            predicted_str.index = group_idx
      
            event_assignments.iloc[group_idx] = predicted_str
            
            for c in range(self.num_clusters):
                # build the statistic's index key: (by values..., cluster)
                if len(self.by) == 0:
                    g = [c + 1]
                elif hasattr(group, '__iter__') and not isinstance(group, (str, bytes)):
                    g = tuple(list(group) + [c + 1])
                else:
                    g = tuple([group] + [c + 1])
                
                for cidx1, channel1 in enumerate(self.channels):
                    g2 = tuple(list(g) + [channel1])
                    # centers are fit in scaled space; report them in data space
                    centers_stat.loc[g2] = self._scale[channel1].inverse(kmeans.cluster_centers_[c, cidx1])
         
        new_experiment = experiment.clone()          
        new_experiment.add_condition(self.name, "category", event_assignments)
        
        new_experiment.statistics[(self.name, "centers")] = pd.to_numeric(centers_stat)
 
        new_experiment.history.append(self.clone_traits(transient = lambda _: True))
        return new_experiment
    
    def default_view(self, **kwargs):
        """
        Returns a diagnostic plot of the k-means clustering.
        
        Parameters
        ----------
        channels : list of str, optional
            Which of the operation's channels to plot.  Defaults to 
            :attr:`channels`; exactly one or two are supported.
            
        scale : dict, optional
            Maps channel names to scale names.  Defaults to :attr:`scale`.
            Any plotted channel without an entry is given 
            ``util.get_default_scale()``.
            
        **kwargs
            Everything else is set as traits on the returned view.
         
        Returns
        -------
            IView : an IView, call :meth:`KMeans1DView.plot` to see the diagnostic plot.
            
        Raises
        ------
        util.CytoflowViewError
            If a requested channel or a key of ``scale`` isn't one of the
            operation's channels, or if the number of channels is not one
            or two.
        """
        channels = kwargs.pop('channels', self.channels)
        
        # copy the mapping: the defaults filled in below must not leak back
        # into the operation's own ``scale`` trait (or the caller's dict)
        scale = dict(kwargs.pop('scale', self.scale))
        
        for c in channels:
            if c not in self.channels:
                raise util.CytoflowViewError('channels',
                                             "Channel {} isn't in the operation's channels"
                                             .format(c))
                
        for s in scale:
            if s not in self.channels:
                raise util.CytoflowViewError('scale',
                                             "Channel {} isn't in the operation's channels"
                                             .format(s))

        for c in channels:
            if c not in scale:
                scale[c] = util.get_default_scale()
            
        if len(channels) == 0:
            raise util.CytoflowViewError('channels',
                                         "Must specify at least one channel for a default view")
        elif len(channels) == 1:
            v = KMeans1DView(op = self)
            v.trait_set(channel = channels[0], 
                        scale = scale[channels[0]], 
                        **kwargs)
            return v
        
        elif len(channels) == 2:
            v = KMeans2DView(op = self)
            v.trait_set(xchannel = channels[0], 
                        ychannel = channels[1],
                        xscale = scale[channels[0]],
                        yscale = scale[channels[1]], 
                        **kwargs)
            return v
        
        else:
            raise util.CytoflowViewError('channels',
                                         "Can't specify more than two channels for a default view")
Beispiel #19
0
class DensityGateOp(HasStrictTraits):
    """
    This module computes a gate based on a 2D density plot.  The user chooses
    what proportion of events to keep, and the module creates a gate that selects
    that proportion of events in the highest-density bins of the 2D density
    histogram.
    
    Attributes
    ----------
    name : Str
        The operation name; determines the name of the new metadata column
        
    xchannel : Str
        The X channel to apply the binning to.
        
    ychannel : Str
        The Y channel to apply the binning to.

    xscale : {"linear", "logicle", "log"} (default = "linear")
        Re-scale the data on the X axis before fitting the data?

    yscale : {"linear", "logicle", "log"} (default = "linear")
        Re-scale the data on the Y axis before fitting the data?  
        
    keep : Float (default = 0.9)
        What proportion of events to keep?  Must be ``>0`` and ``<1`` 
        
    bins : Int (default = 100)
        How many bins should there be on each axis?  Must be positive.
        
    min_quantile : Float (default = 0.001)
        Clip values below this quantile
        
    max_quantile : Float (default = 1.0)
        Clip values above this quantile

    sigma : Float (default = 1.0)
        What standard deviation to use for the gaussian blur?
    
    by : List(Str)
        A list of metadata attributes to aggregate the data before estimating
        the gate.  For example, if the experiment has two pieces of metadata,
        ``Time`` and ``Dox``, setting ``by = ["Time", "Dox"]`` will fit a 
        separate gate to each subset of the data with a unique combination of
        ``Time`` and ``Dox``.
        
    Notes
    -----
    This gating method was developed by John Sexton, in Jeff Tabor's lab at
    Rice University.  
    
    From http://taborlab.github.io/FlowCal/fundamentals/density_gate.html,
    the method is as follows:
    
    1. Determines the number of events to keep, based on the user specified 
       gating fraction and the total number of events of the input sample.
       
    2. Divides the 2D channel space into a rectangular grid, and counts the 
       number of events falling within each bin of the grid. The number of 
       counts per bin across all bins comprises a 2D histogram, which is a 
       coarse approximation of the underlying probability density function.
       
    3. Smoothes the histogram generated in Step 2 by applying a Gaussian Blur. 
       Theoretically, the proper amount of smoothing results in a better 
       estimate of the probability density function. Practically, smoothing 
       eliminates isolated bins with high counts, most likely corresponding to 
       noise, and smoothes the contour of the gated region.
       
    4. Selects the bins with the greatest number of events in the smoothed 
       histogram, starting with the highest and proceeding downward until the 
       desired number of events to keep, calculated in step 1, is achieved.
    
    Examples
    --------
    
    .. plot::
        :context: close-figs
        
        Make a little data set.
    
        >>> import cytoflow as flow
        >>> import_op = flow.ImportOp()
        >>> import_op.tubes = [flow.Tube(file = "Plate01/RFP_Well_A3.fcs",
        ...                              conditions = {'Dox' : 10.0}),
        ...                    flow.Tube(file = "Plate01/CFP_Well_A4.fcs",
        ...                              conditions = {'Dox' : 1.0})]
        >>> import_op.conditions = {'Dox' : 'float'}
        >>> ex = import_op.apply()
    
    Create and parameterize the operation.
    
    .. plot::
        :context: close-figs
        
        >>> dens_op = flow.DensityGateOp(name = 'Density',
        ...                              xchannel = 'FSC-A',
        ...                              xscale = 'log',
        ...                              ychannel = 'SSC-A',
        ...                              yscale = 'log',
        ...                              keep = 0.5)
        
    Find the bins to keep
    
    .. plot::
        :context: close-figs
        
        >>> dens_op.estimate(ex)
        
    Plot a diagnostic view
    
    .. plot::
        :context: close-figs
        
        >>> dens_op.default_view().plot(ex)
        
    Apply the gate
    
    .. plot::
        :context: close-figs
        
        >>> ex2 = dens_op.apply(ex)
        
    """
    
    id = Constant('edu.mit.synbio.cytoflow.operations.density')
    friendly_id = Constant("Density Gate")
    
    name = CStr()
    xchannel = Str()
    ychannel = Str()
    xscale = util.ScaleEnum
    yscale = util.ScaleEnum
    keep = util.PositiveFloat(0.9, allow_zero = False)
    bins = util.PositiveInt(100, allow_zero = False)
    min_quantile = util.PositiveFloat(0.001, allow_zero = True)
    max_quantile = util.PositiveFloat(1.0, allow_zero = False)
    sigma = util.PositiveFloat(1.0, allow_zero = False)
    by = List(Str)
        
    _xscale = Instance(util.IScale, transient = True)
    _yscale = Instance(util.IScale, transient = True)
    
    _xbins = Array(transient = True)
    _ybins = Array(transient = True)

    _keep_xbins = Dict(Any, Array, transient = True)
    _keep_ybins = Dict(Any, Array, transient = True)
    _histogram = Dict(Any, Array, transient = True)
    
    def estimate(self, experiment, subset = None):
        """
        Split the data set into bins and determine which ones to keep.

        The bin edges and scales are computed from the whole (optionally
        subsetted) experiment; the bins to keep are then chosen separately for
        each group defined by ``by``.  All results of a previous call are
        replaced.

        Parameters
        ----------
        experiment : Experiment
            The :class:`.Experiment` to use to estimate the gate parameters.

        subset : Str (default = None)
            If set, determine the gate parameters on only a subset of the
            ``experiment`` parameter.

        Raises
        ------
        util.CytoflowOpError
            If ``experiment`` is ``None``, a channel or ``by`` entry is not
            in the experiment, ``min_quantile``, ``max_quantile`` or ``keep``
            are out of range, ``subset`` is invalid or selects no events, or
            a group has no data.
        """

        if experiment is None:
            raise util.CytoflowOpError('experiment',
                                       "No experiment specified")

        if self.xchannel not in experiment.data:
            raise util.CytoflowOpError('xchannel',
                                       "Column {0} not found in the experiment"
                                       .format(self.xchannel))

        if self.ychannel not in experiment.data:
            raise util.CytoflowOpError('ychannel',
                                       "Column {0} not found in the experiment"
                                       .format(self.ychannel))

        if self.min_quantile > 1.0:
            raise util.CytoflowOpError('min_quantile',
                                       "min_quantile must be <= 1.0")

        if self.max_quantile > 1.0:
            raise util.CytoflowOpError('max_quantile',
                                       "max_quantile must be <= 1.0")

        if not (self.max_quantile > self.min_quantile):
            raise util.CytoflowOpError('max_quantile',
                                       "max_quantile must be > min_quantile")

        if self.keep > 1.0:
            raise util.CytoflowOpError('keep',
                                       "keep must be <= 1.0")

        for b in self.by:
            if b not in experiment.conditions:
                raise util.CytoflowOpError('by',
                                           "Aggregation metadata {} not found, "
                                           "must be one of {}"
                                           .format(b, experiment.conditions))

        if subset:
            try:
                experiment = experiment.query(subset)
            except Exception as e:
                # don't swallow KeyboardInterrupt / SystemExit, and keep the
                # underlying cause attached for debugging
                raise util.CytoflowOpError('subset',
                                           "Subset string '{0}' isn't valid"
                                           .format(subset)) from e

            if len(experiment) == 0:
                raise util.CytoflowOpError('subset',
                                           "Subset string '{0}' returned no events"
                                           .format(subset))

        if self.by:
            groupby = experiment.data.groupby(self.by)
        else:
            # use a lambda expression to return a group that contains
            # all the events
            groupby = experiment.data.groupby(lambda _: True)

        # get the scale. estimate the scale params for the ENTIRE data set,
        # not subsets we get from groupby().  And we need to save it so that
        # the data is transformed the same way when we apply()
        self._xscale = xscale = util.scale_factory(self.xscale, experiment, channel = self.xchannel)
        self._yscale = yscale = util.scale_factory(self.yscale, experiment, channel = self.ychannel)

        xlim = (xscale.clip(experiment[self.xchannel].quantile(self.min_quantile)),
                xscale.clip(experiment[self.xchannel].quantile(self.max_quantile)))

        ylim = (yscale.clip(experiment[self.ychannel].quantile(self.min_quantile)),
                yscale.clip(experiment[self.ychannel].quantile(self.max_quantile)))

        # bin edges are evenly spaced in the *scaled* space, then mapped back
        # to data units.  ``self.bins`` edges make ``self.bins - 1`` bins per
        # axis.
        self._xbins = xbins = xscale.inverse(np.linspace(xscale(xlim[0]),
                                                         xscale(xlim[1]),
                                                         self.bins))
        self._ybins = ybins = yscale.inverse(np.linspace(yscale(ylim[0]),
                                                         yscale(ylim[1]),
                                                         self.bins))

        # collect the per-group results in fresh dicts and assign them at the
        # end, so groups left over from a previous estimate() (for example
        # with a different ``by``) can't linger and be used by apply()
        keep_xbins = {}
        keep_ybins = {}
        histogram = {}

        for group, group_data in groupby:
            if len(group_data) == 0:
                raise util.CytoflowOpError('by',
                                           "Group {} had no data"
                                           .format(group))

            h, _, _ = np.histogram2d(group_data[self.xchannel],
                                     group_data[self.ychannel],
                                     bins=[xbins, ybins])

            # scipy.ndimage.filters is a deprecated namespace; the function
            # itself lives in scipy.ndimage
            h = scipy.ndimage.gaussian_filter(h, sigma = self.sigma)

            # (row, column) indices of the bins, ordered from the highest
            # smoothed count to the lowest
            i = scipy.stats.rankdata(h, method = "ordinal") - 1
            i = np.unravel_index(np.argsort(-i), h.shape)

            goal_count = self.keep * len(group_data)
            curr_count = 0
            num_bins = 0

            # take bins in order of decreasing density until they hold the
            # requested proportion of this group's events
            while(curr_count < goal_count and num_bins < i[0].size):
                curr_count += h[i[0][num_bins], i[1][num_bins]]
                num_bins += 1

            keep_xbins[group] = i[0][0:num_bins]
            keep_ybins[group] = i[1][0:num_bins]
            histogram[group] = h

        self._keep_xbins = keep_xbins
        self._keep_ybins = keep_ybins
        self._histogram = histogram

            
    def apply(self, experiment):
        """
        Creates a new condition based on membership in the gate that was
        parameterized with :meth:`estimate`.

        An event is in the gate if the 2D bin it falls into is one of the
        bins that :meth:`estimate` kept for the event's group.  Events that
        fall outside the binned range, or that belong to a group without an
        estimate, are not in the gate.

        Parameters
        ----------
        experiment : Experiment
            the :class:`.Experiment` to apply the gate to.

        Returns
        -------
        Experiment
            a new :class:`.Experiment` with the new gate applied.

        Raises
        ------
        util.CytoflowOpError
            If ``experiment`` is ``None``, a channel or the name is unset,
            the name already exists as a column, no estimate is available,
            or a channel or ``by`` entry is not in the experiment.
        """

        if experiment is None:
            raise util.CytoflowOpError('experiment',
                                       "No experiment specified")

        if not self.xchannel:
            raise util.CytoflowOpError('xchannel',
                                       "Must set X channel")

        if not self.ychannel:
            raise util.CytoflowOpError('ychannel',
                                       "Must set Y channel")

        # make sure name got set!
        if not self.name:
            raise util.CytoflowOpError('name',
                                       "You have to set the gate's name "
                                       "before applying it!")

        if self.name in experiment.data.columns:
            raise util.CytoflowOpError('name',
                                       "Experiment already has a column named {0}"
                                       .format(self.name))

        if not (self._xbins.size and self._ybins.size and self._keep_xbins):
            raise util.CytoflowOpError(None,
                                       "No gate estimate found.  Did you forget to "
                                       "call estimate()?")

        if not self._xscale:
            raise util.CytoflowOpError(None,
                                       "Couldn't find _xscale.  What happened??")

        if not self._yscale:
            raise util.CytoflowOpError(None,
                                       "Couldn't find _yscale.  What happened??")

        if self.xchannel not in experiment.data:
            raise util.CytoflowOpError('xchannel',
                                       "Column {0} not found in the experiment"
                                       .format(self.xchannel))

        if self.ychannel not in experiment.data:
            raise util.CytoflowOpError('ychannel',
                                       "Column {0} not found in the experiment"
                                       .format(self.ychannel))

        for b in self.by:
            if b not in experiment.conditions:
                raise util.CytoflowOpError('by',
                                           "Aggregation metadata {} not found, "
                                           "must be one of {}"
                                           .format(b, experiment.conditions))

        if self.by:
            groupby = experiment.data.groupby(self.by)
        else:
            # use a lambda expression to return a group that
            # contains all the events
            groupby = experiment.data.groupby(lambda _: True)

        event_assignments = pd.Series([False] * len(experiment), dtype = "bool")

        # one cell per 2D bin: N bin edges on an axis make N - 1 bins
        mask_shape = (len(self._xbins) - 1, len(self._ybins) - 1)

        for group, group_data in groupby:
            if group not in self._keep_xbins:
                # there weren't any events in this group, so we didn't get
                # an estimate
                continue

            # integer *positions* of this group's rows.  (groupby.groups holds
            # index labels, which only coincide with positions for a default
            # 0..n-1 index.)
            group_pos = groupby.indices[group]

            # bin number of every event on each axis; NaN if the event is
            # outside the binned range.  Plain arrays are used so that nothing
            # below is aligned on the data frame's index.
            x_bin = np.asarray(pd.cut(group_data[self.xchannel],
                                      self._xbins,
                                      include_lowest = True,
                                      labels = False),
                               dtype = float)
            y_bin = np.asarray(pd.cut(group_data[self.ychannel],
                                      self._ybins,
                                      include_lowest = True,
                                      labels = False),
                               dtype = float)

            # lookup table: True for the (x bin, y bin) pairs estimate() kept
            keep_mask = np.zeros(mask_shape, dtype = bool)
            keep_mask[np.asarray(self._keep_xbins[group], dtype = int),
                      np.asarray(self._keep_ybins[group], dtype = int)] = True

            in_range = ~(np.isnan(x_bin) | np.isnan(y_bin))
            group_keep = np.zeros(len(group_data), dtype = bool)
            group_keep[in_range] = keep_mask[x_bin[in_range].astype(int),
                                             y_bin[in_range].astype(int)]

            event_assignments.iloc[group_pos] = group_keep

        new_experiment = experiment.clone()

        new_experiment.add_condition(self.name, "bool", event_assignments)

        # NOTE(review): the ``transient`` metadata filter presumably makes the
        # clone include the transient (estimated) traits too -- confirm.
        new_experiment.history.append(self.clone_traits(transient = lambda _: True))
        return new_experiment
     
    def default_view(self, **kwargs):
        """
        Returns a diagnostic view of the density gate.

        Parameters
        ----------
        **kwargs
            Trait values to set on the new view.

        Returns
        -------
        IView
            a :class:`DensityGateView` whose ``op`` is this operation; call
            :meth:`~DensityGateView.plot` to see the diagnostic plot.
        """
        view = DensityGateView(op = self)
        view.trait_set(**kwargs)
        return view
Beispiel #20
0
class DensityGateOp(HasStrictTraits):
    """
    This module computes a gate based on a 2D density plot.  The user chooses
    what proportion of cells to keep, and the module creates a gate that selects
    that proportion of cells in the highest-density bins of the 2D density
    histogram.
    
    Attributes
    ----------
    name : Str
        The operation name; determines the name of the new metadata column
        
    xchannel : Str
        The X channel to apply the binning to.
        
    ychannel : Str
        The Y channel to apply the binning to.

    xscale : Enum("linear", "logicle", "log") (default = "linear")
        Re-scale the data on the X axis before fitting the data?

    yscale : Enum("linear", "logicle", "log") (default = "linear")
        Re-scale the data on the Y axis before fitting the data?  
        
    keep : Float (default = 0.9)
        What proportion of events to keep?  Must be positive.
        
    bins : Int (default = 100)
        How many bins should there be on each axis?  Must be positive.
        
    min_quantile : Float (default = 0.001)
        Clip values below this quantile
        
    max_quantile : Float (default = 1.0)
        Clip values above this quantile

    sigma : Float (default = 1.0)
        What standard deviation to use for the gaussian blur?
    
    by : List(Str)
        A list of metadata attributes to aggregate the data before estimating
        the model.  For example, if the experiment has two pieces of metadata,
        `Time` and `Dox`, setting `by = ["Time", "Dox"]` will fit the model 
        separately to each subset of the data with a unique combination of
        `Time` and `Dox`.
        
    Notes
    -----
    This gating method was developed by John Sexton, in Jeff Tabor's lab at
    Rice University.  
    
    From http://taborlab.github.io/FlowCal/fundamentals/density_gate.html,
    the method is as follows:
    
    1. Determines the number of events to keep, based on the user specified 
       gating fraction and the total number of events of the input sample.
       
    2. Divides the 2D channel space into a rectangular grid, and counts the 
       number of events falling within each bin of the grid. The number of 
       counts per bin across all bins comprises a 2D histogram, which is a 
       coarse approximation of the underlying probability density function.
       
    3. Smoothes the histogram generated in Step 2 by applying a Gaussian Blur. 
       Theoretically, the proper amount of smoothing results in a better 
       estimate of the probability density function. Practically, smoothing 
       eliminates isolated bins with high counts, most likely corresponding to 
       noise, and smoothes the contour of the gated region.
       
    4. Selects the bins with the greatest number of events in the smoothed 
       histogram, starting with the highest and proceeding downward until the 
       desired number of events to keep, calculated in step 1, is achieved.
    
    Examples
    --------
    
    >>> density_op = DensityGateOp(name = "Density",
    ...                            xchannel = "V2-A",
    ...                            ychannel = "Y2-A",
    ...                            keep = 0.7)
    >>> density_op.estimate(ex2)
    >>> density_op.default_view().plot(ex2)
    >>> ex3 = density_op.apply(ex2)
    """

    id = Constant('edu.mit.synbio.cytoflow.operations.density')
    friendly_id = Constant("Density Gate")

    name = CStr()
    xchannel = Str()
    ychannel = Str()
    xscale = util.ScaleEnum
    yscale = util.ScaleEnum
    keep = util.PositiveFloat(0.9, allow_zero=False)
    bins = util.PositiveInt(100, allow_zero=False)
    min_quantile = util.PositiveFloat(0.001, allow_zero=True)
    max_quantile = util.PositiveFloat(1.0, allow_zero=False)
    sigma = util.PositiveFloat(1.0, allow_zero=False)
    by = List(Str)

    _xscale = Instance(util.IScale, transient=True)
    _yscale = Instance(util.IScale, transient=True)

    _xbins = Array(transient=True)
    _ybins = Array(transient=True)

    _keep_xbins = Dict(Any, Array, transient=True)
    _keep_ybins = Dict(Any, Array, transient=True)
    _histogram = Dict(Any, Array, transient=True)

    def estimate(self, experiment, subset=None):
        """
        Split the data set into bins and determine which ones to keep.

        The bin edges and scales are computed from the whole (optionally
        subsetted) experiment; the bins to keep are then chosen separately for
        each group defined by `by`.  All results of a previous call are
        replaced.

        Parameters
        ----------
        experiment : Experiment
            The experiment to use to estimate the gate parameters.

        subset : Str (default = None)
            If set, estimate the gate on only this subset of `experiment`.

        Raises
        ------
        util.CytoflowOpError
            If `experiment` is `None`, a channel or `by` entry is not in the
            experiment, a `by` entry has more than 100 unique values, a
            parameter is out of range, or a group has no data.

        util.CytoflowViewError
            If `subset` is invalid or selects no events.
        """

        if experiment is None:
            raise util.CytoflowOpError("No experiment specified")

        if self.xchannel not in experiment.data:
            raise util.CytoflowOpError(
                "Column {0} not found in the experiment".format(self.xchannel))

        if self.ychannel not in experiment.data:
            raise util.CytoflowOpError(
                "Column {0} not found in the experiment".format(self.ychannel))

        if self.max_quantile > 1.0 or self.min_quantile > 1.0:
            raise util.CytoflowOpError(
                "min_quantile and max_quantile must be <= 1.0")

        if not (self.max_quantile > self.min_quantile):
            raise util.CytoflowOpError("max_quantile must be > min_quantile")

        if self.sigma < 0.0:
            raise util.CytoflowOpError("sigma must be >= 0.0")

        if self.keep > 1.0:
            raise util.CytoflowOpError("keep must be <= 1.0")

        for b in self.by:
            if b not in experiment.data:
                raise util.CytoflowOpError("Aggregation metadata {0} not found"
                                           " in the experiment".format(b))
            if len(experiment.data[b].unique()) > 100:  #WARNING - magic number
                raise util.CytoflowOpError(
                    "More than 100 unique values found for"
                    " aggregation metadata {0}.  Did you"
                    " accidentally specify a data channel?".format(b))

        # NOTE(review): the subset errors below are raised as
        # CytoflowViewError although this is an operation; the type is left
        # unchanged in case callers catch it -- consider CytoflowOpError.
        if subset:
            try:
                experiment = experiment.query(subset)
            except Exception as e:
                # don't swallow KeyboardInterrupt / SystemExit, and keep the
                # underlying cause attached for debugging
                raise util.CytoflowViewError(
                    "Subset string '{0}' isn't valid".format(subset)) from e

            if len(experiment) == 0:
                raise util.CytoflowViewError(
                    "Subset string '{0}' returned no events".format(subset))

        if self.by:
            groupby = experiment.data.groupby(self.by)
        else:
            # use a lambda expression to return a group that contains
            # all the events
            groupby = experiment.data.groupby(lambda _: True)

        # get the scale. estimate the scale params for the ENTIRE data set,
        # not subsets we get from groupby().  And we need to save it so that
        # the data is transformed the same way when we apply()
        self._xscale = xscale = util.scale_factory(self.xscale,
                                                   experiment,
                                                   channel=self.xchannel)
        self._yscale = yscale = util.scale_factory(self.yscale,
                                                   experiment,
                                                   channel=self.ychannel)

        xlim = (xscale.clip(experiment[self.xchannel].quantile(
            self.min_quantile)),
                xscale.clip(experiment[self.xchannel].quantile(
                    self.max_quantile)))

        ylim = (yscale.clip(experiment[self.ychannel].quantile(
            self.min_quantile)),
                yscale.clip(experiment[self.ychannel].quantile(
                    self.max_quantile)))

        # bin edges are evenly spaced in the *scaled* space, then mapped back
        # to data units.  `self.bins` edges make `self.bins - 1` bins per axis.
        self._xbins = xbins = xscale.inverse(
            np.linspace(xscale(xlim[0]), xscale(xlim[1]), self.bins))
        self._ybins = ybins = yscale.inverse(
            np.linspace(yscale(ylim[0]), yscale(ylim[1]), self.bins))

        # collect the per-group results in fresh dicts and assign them at the
        # end, so groups left over from a previous estimate() (for example
        # with a different `by`) can't linger and be used by apply()
        keep_xbins = {}
        keep_ybins = {}
        histogram = {}

        for group, group_data in groupby:
            if len(group_data) == 0:
                raise util.CytoflowOpError(
                    "Group {} had no data".format(group))

            h, _, _ = np.histogram2d(group_data[self.xchannel],
                                     group_data[self.ychannel],
                                     bins=[xbins, ybins])

            # scipy.ndimage.filters is a deprecated namespace; the function
            # itself lives in scipy.ndimage
            h = scipy.ndimage.gaussian_filter(h, sigma=self.sigma)

            # (row, column) indices of the bins, ordered from the highest
            # smoothed count to the lowest
            i = scipy.stats.rankdata(h, method="ordinal") - 1
            i = np.unravel_index(np.argsort(-i), h.shape)

            goal_count = self.keep * len(group_data)
            curr_count = 0
            num_bins = 0

            # take bins in order of decreasing density until they hold the
            # requested proportion of this group's events
            while (curr_count < goal_count and num_bins < i[0].size):
                curr_count += h[i[0][num_bins], i[1][num_bins]]
                num_bins += 1

            keep_xbins[group] = i[0][0:num_bins]
            keep_ybins[group] = i[1][0:num_bins]
            histogram[group] = h

        self._keep_xbins = keep_xbins
        self._keep_ybins = keep_ybins
        self._histogram = histogram

    def apply(self, experiment):
        """
        Creates a new condition based on membership in the gate that was
        parameterized with `estimate`.

        An event is in the gate if the 2D bin it falls into is one of the
        bins that `estimate` kept for the event's group.  Events that fall
        outside the binned range, or that belong to a group without an
        estimate, are not in the gate.

        Parameters
        ----------
        experiment : Experiment
            The experiment to apply the gate to.

        Returns
        -------
        Experiment
            A new experiment with the gate's condition added.

        Raises
        ------
        util.CytoflowOpError
            If `experiment` is `None`, a channel or the name is unset, the
            name already exists as a column, no estimate is available, a
            channel or `by` entry is not in the experiment, or a `by` entry
            has more than 100 unique values.
        """

        if experiment is None:
            raise util.CytoflowOpError("No experiment specified")

        if not self.xchannel:
            raise util.CytoflowOpError("Must set X channel")

        if not self.ychannel:
            raise util.CytoflowOpError("Must set Y channel")

        # make sure name got set!
        if not self.name:
            raise util.CytoflowOpError("You have to set the gate's name "
                                       "before applying it!")

        if self.name in experiment.data.columns:
            raise util.CytoflowOpError(
                "Experiment already has a column named {0}".format(self.name))

        if not (self._xbins.size and self._ybins.size and self._keep_xbins):
            raise util.CytoflowOpError(
                "No gate estimate found.  Did you forget to "
                "call estimate()?")

        if not self._xscale:
            raise util.CytoflowOpError(
                "Couldn't find _xscale.  What happened??")

        if not self._yscale:
            raise util.CytoflowOpError(
                "Couldn't find _yscale.  What happened??")

        if self.xchannel not in experiment.data:
            raise util.CytoflowOpError(
                "Column {0} not found in the experiment".format(self.xchannel))

        if self.ychannel not in experiment.data:
            raise util.CytoflowOpError(
                "Column {0} not found in the experiment".format(self.ychannel))

        for b in self.by:
            if b not in experiment.data:
                raise util.CytoflowOpError("Aggregation metadata {0} not found"
                                           " in the experiment".format(b))

            if len(experiment.data[b].unique()) > 100:  #WARNING - magic number
                raise util.CytoflowOpError(
                    "More than 100 unique values found for"
                    " aggregation metadata {0}.  Did you"
                    " accidentally specify a data channel?".format(b))

        if self.by:
            groupby = experiment.data.groupby(self.by)
        else:
            # use a lambda expression to return a group that
            # contains all the events
            groupby = experiment.data.groupby(lambda _: True)

        event_assignments = pd.Series([False] * len(experiment), dtype="bool")

        # one cell per 2D bin: N bin edges on an axis make N - 1 bins
        mask_shape = (len(self._xbins) - 1, len(self._ybins) - 1)

        for group, group_data in groupby:
            if group not in self._keep_xbins:
                # there weren't any events in this group, so we didn't get
                # an estimate
                continue

            # integer *positions* of this group's rows.  (groupby.groups holds
            # index labels, which only coincide with positions for a default
            # 0..n-1 index.)
            group_pos = groupby.indices[group]

            # bin number of every event on each axis; NaN if the event is
            # outside the binned range.  Plain arrays are used so that nothing
            # below is aligned on the data frame's index.
            x_bin = np.asarray(pd.cut(group_data[self.xchannel],
                                      self._xbins,
                                      include_lowest=True,
                                      labels=False),
                               dtype=float)
            y_bin = np.asarray(pd.cut(group_data[self.ychannel],
                                      self._ybins,
                                      include_lowest=True,
                                      labels=False),
                               dtype=float)

            # lookup table: True for the (x bin, y bin) pairs estimate() kept
            keep_mask = np.zeros(mask_shape, dtype=bool)
            keep_mask[np.asarray(self._keep_xbins[group], dtype=int),
                      np.asarray(self._keep_ybins[group], dtype=int)] = True

            in_range = ~(np.isnan(x_bin) | np.isnan(y_bin))
            group_keep = np.zeros(len(group_data), dtype=bool)
            group_keep[in_range] = keep_mask[x_bin[in_range].astype(int),
                                             y_bin[in_range].astype(int)]

            event_assignments.iloc[group_pos] = group_keep

        new_experiment = experiment.clone()

        new_experiment.add_condition(self.name, "bool", event_assignments)

        # NOTE(review): the `transient` metadata filter presumably makes the
        # clone include the transient (estimated) traits too -- confirm.
        new_experiment.history.append(
            self.clone_traits(transient=lambda _: True))
        return new_experiment

    def default_view(self, **kwargs):
        """
        Returns a diagnostic view of the density gate.

        Parameters
        ----------
        **kwargs
            Passed on to the `DensityGateView` constructor.

        Returns
        -------
            IView : a `DensityGateView` whose `op` is this operation; call
            plot() to see the diagnostic plot.
        """
        view = DensityGateView(op=self, **kwargs)
        return view