Example #1
class SectionHeading(SettingBase):
    value = Constant('')

    def __init__(self, name):
        self.name = name


class BleedthroughPiecewiseOp(HasStrictTraits):
    """
    *THIS OPERATION IS DEPRECATED.*
    
    Apply bleedthrough correction to a set of fluorescence channels.
    
    This is not a traditional bleedthrough matrix-based compensation; it uses
    a similar set of single-color controls, but instead of computing a compensation
    matrix, it fits a piecewise-linear spline to the untransformed data and
    uses those splines to compute the correction factor at each point in
    a mesh across the color space.  The experimental data is corrected using
    a linear interpolation along that mesh: this is much faster than computing
    the correction factor for each cell individually (an operation that takes
    ~5 msec per cell.)
    
    To use, set up the `controls` dict with the single color controls;
    call `estimate()` to parameterize the operation; check that the bleedthrough 
    plots look good with `default_view().plot()`; and then `apply()` to an 
    Experiment.
    
    *THIS OPERATION IS DEPRECATED AND WILL BE REMOVED IN A FUTURE RELEASE. TO
    USE IT, SET `ignore_deprecated` TO `True`.  IF YOU HAVE A USE CASE WHERE
    THIS WORKS BETTER THAN THE LINEAR BLEEDTHROUGH CORRECTION, PLEASE EMAIL
    ME OR FILE A BUG.*
    
    Attributes
    ----------
    name : Str
        The operation name (for UI representation; optional for interactive use)
    
    controls : Dict(Str, File)
        The channel names to correct, and corresponding single-color control
        FCS files to estimate the correction splines with.  Must be set to
        use `estimate()`.
        
    num_knots : Int (default = 12)
        The number of internal control points to estimate, spaced log-evenly
        from 0 to the range of the channel.  Must be set to use `estimate()`.
        
    mesh_size : Int (default = 32)
        The size of each axis in the mesh used to interpolate corrected values.
        
    ignore_deprecated : Bool (default = False)
        Set to `True` to run this deprecated operation anyway.
        
    Metadata
    --------
    bleedthrough_channels : List(Str)
        The channels that were used to correct this one.
        
    bleedthrough_fn : Callable (Tuple(Float) --> Float)
        The function that will correct one event in this channel.  Pass it
        the values specified in `bleedthrough_channels` and it will return
        the corrected value for this channel. 
            
    Notes
    -----
    We use an interpolation-based scheme to estimate corrected bleedthrough.
    The algorithm is as follows:
    
     - Fit a piecewise-linear spline to each single-color control's bleedthrough
       into other channels.  Because we want to fit the spline to untransformed
       data, but capture both the negative, positive-linear and positive-log 
       portions of a traditional flow data set, we distribute the spline knots 
       evenly on a logicle-transformed axis for each color we're correcting.   

     - At each point on a regular mesh spanning the entire range of the
       instrument, estimate the mapping from (raw colors) --> (actual colors).
       The mesh points are also distributed evenly along the logicle-transformed
       color axes; this captures negative data as well as positive.
       This is quite slow: ~30 seconds for a mesh size of 32 in 3-space.
       Remember that additional channels expand the number of mesh points
       exponentially!

     - Use these estimates to parameterize a linear interpolator (in linear
       space, this time).  There's one interpolator per output channel (so
       for a 3-channel correction, each interpolator is R^3 --> R).  For 
       each measured cell, run each interpolator to give the corrected output.

    Examples
    --------
    >>> bl_op = flow.BleedthroughPiecewiseOp()
    >>> bl_op.controls = {'Pacific Blue-A' : 'ebfp.fcs',
    ...                   'FITC-A' : 'eyfp.fcs',
    ...                   'PE-Tx-Red-YG-A' : 'mkate.fcs'}
    >>>
    >>> bl_op.estimate(ex2)
    >>> bl_op.default_view().plot(ex2)    
    >>>
    >>> %time ex3 = bl_op.apply(ex2) # 410,000 cells
    CPU times: user 577 ms, sys: 27.7 ms, total: 605 ms
    Wall time: 607 ms
    """

    # traits
    id = Constant('edu.mit.synbio.cytoflow.operations.bleedthrough_piecewise')
    friendly_id = Constant("Piecewise Bleedthrough Correction")

    name = Constant("Bleedthrough")

    controls = Dict(Str, File)
    num_knots = Int(12)
    mesh_size = Int(32)

    ignore_deprecated = Bool(False)

    _splines = Dict(Str, Dict(Str, Python), transient=True)
    _interpolators = Dict(Str, Python, transient=True)

    # because the order of the channels is important, we can't just call
    # _interpolators.keys()
    # TODO - this is ugly and unpythonic.  :-/
    _channels = List(Str, transient=True)

    def estimate(self, experiment, subset=None):
        """
        Estimate the bleedthrough from the single-channel controls in `controls`
        """

        if not self.ignore_deprecated:
            raise util.CytoflowOpError(
                "BleedthroughPiecewiseOp is DEPRECATED. "
                "To use it anyway, set ignore_deprected "
                "to True.")

        if experiment is None:
            raise util.CytoflowOpError("No experiment specified")

        if self.num_knots < 3:
            raise util.CytoflowOpError(
                "Need to allow at least 3 knots in the spline")

        self._channels = list(self.controls.keys())

        if len(self._channels) < 2:
            raise util.CytoflowOpError(
                "Need at least two channels to correct bleedthrough.")

        for channel in list(self.controls.keys()):
            if 'range' not in experiment.metadata[channel]:
                raise util.CytoflowOpError(
                    "Can't find range for channel {}".format(channel))

        self._splines = {}
        mesh_axes = []

        for channel in self._channels:
            self._splines[channel] = {}

            # make a little Experiment
            check_tube(self.controls[channel], experiment)
            tube_exp = ImportOp(
                tubes=[Tube(file=self.controls[channel])],
                channels={
                    experiment.metadata[c]["fcs_name"]: c
                    for c in experiment.channels
                },
                name_metadata=experiment.metadata['name_metadata']).apply()

            # apply previous operations
            for op in experiment.history:
                tube_exp = op.apply(tube_exp)

            # subset it
            if subset:
                try:
                    tube_exp = tube_exp.query(subset)
                except Exception as e:
                    raise util.CytoflowOpError(
                        "Subset string '{0}' isn't valid".format(
                            subset)) from e

                if len(tube_exp.data) == 0:
                    raise util.CytoflowOpError(
                        "Subset string '{0}' returned no events".format(
                            subset))

            tube_data = tube_exp.data

            # polyfit requires sorted data
            tube_data.sort_values(by=channel, inplace=True)

            channel_min = tube_data[channel].min()
            channel_max = tube_data[channel].max()

            # we're going to space the knots and mesh evenly across the
            # logicle-transformed data, so as to capture both the "linear"
            # aspect of the near-0 and negative values, and the "log"
            # aspect of large values.

            scale = util.scale_factory("logicle", experiment, channel=channel)

            # the splines' knots
            knot_min = channel_min
            knot_max = channel_max

            lg_knot_min = scale(knot_min)
            lg_knot_max = scale(knot_max)
            lg_knots = np.linspace(lg_knot_min, lg_knot_max, self.num_knots)
            knots = scale.inverse(lg_knots)

            # only keep the interior knots
            knots = knots[1:-1]

            # the interpolators' mesh
            if 'af_median' in experiment.metadata[channel] and \
               'af_stdev' in experiment.metadata[channel]:
                mesh_min = experiment.metadata[channel]['af_median'] - \
                           3 * experiment.metadata[channel]['af_stdev']
            elif 'range' in experiment.metadata[channel]:
                mesh_min = -0.01 * experiment.metadata[channel][
                    'range']  # TODO - does this even work?
                warn(
                    "This works best if you apply AutofluorescenceOp before "
                    "computing bleedthrough", util.CytoflowOpWarning)

            mesh_max = experiment.metadata[channel]['range']

            lg_mesh_min = scale(mesh_min)
            lg_mesh_max = scale(mesh_max)
            lg_mesh_axis = \
                np.linspace(lg_mesh_min, lg_mesh_max, self.mesh_size)

            mesh_axis = scale.inverse(lg_mesh_axis)
            mesh_axes.append(mesh_axis)

            for to_channel in self._channels:
                from_channel = channel
                if from_channel == to_channel:
                    continue

                self._splines[from_channel][to_channel] = \
                    scipy.interpolate.LSQUnivariateSpline(tube_data[from_channel].values,
                                                          tube_data[to_channel].values,
                                                          t = knots,
                                                          k = 1)

        mesh = pd.DataFrame(util.cartesian(mesh_axes),
                            columns=[x for x in self._channels])

        mesh_corrected = mesh.apply(_correct_bleedthrough,
                                    axis=1,
                                    args=([[x for x in self._channels],
                                           self._splines]))

        for channel in self._channels:
            chan_values = mesh_corrected[channel].values.reshape(
                [len(x) for x in mesh_axes])
            self._interpolators[channel] = \
                scipy.interpolate.RegularGridInterpolator(points = mesh_axes,
                                                          values = chan_values,
                                                          bounds_error = False,
                                                          fill_value = 0.0)

        # TODO - some sort of validity checking.

    def apply(self, experiment):
        """Applies the bleedthrough correction to an experiment.
        
        Parameters
        ----------
        experiment : Experiment
            the experiment to which this op is applied
            
        Returns
        -------
        Experiment
            a new experiment with the bleedthrough subtracted out.
        """

        if not self.ignore_deprecated:
            raise util.CytoflowOpError(
                "BleedthroughPiecewiseOp is DEPRECATED. "
                "To use it anyway, set ignore_deprected "
                "to True.")

        if experiment is None:
            raise util.CytoflowOpError("No experiment specified")

        if not self._interpolators:
            raise util.CytoflowOpError("Module interpolators aren't set. "
                                       "Did you run estimate()?")

        if not set(self._interpolators.keys()) <= set(experiment.channels):
            raise util.CytoflowOpError(
                "Module parameters don't match experiment channels")

        new_experiment = experiment.clone()

        # get rid of data outside of the interpolators' mesh
        # (-3 * autofluorescence sigma)
        for channel in self._channels:

            # if you update the mesh calculation above, update it here too!
            if 'af_median' in experiment.metadata[channel] and \
               'af_stdev' in experiment.metadata[channel]:
                mesh_min = experiment.metadata[channel]['af_median'] - \
                           3 * experiment.metadata[channel]['af_stdev']
            else:
                mesh_min = -0.01 * experiment.metadata[channel][
                    'range']  # TODO - does this even work?

            new_experiment.data = \
                new_experiment.data[new_experiment.data[channel] > mesh_min]

        new_experiment.data.reset_index(drop=True, inplace=True)

        old_data = new_experiment.data[self._channels]

        for channel in self._channels:
            new_experiment[channel] = self._interpolators[channel](old_data)

            new_experiment.metadata[channel][
                'bleedthrough_channels'] = self._channels
            new_experiment.metadata[channel][
                'bleedthrough_fn'] = self._interpolators[channel]

        new_experiment.history.append(
            self.clone_traits(transient=lambda _: True))
        return new_experiment

    def default_view(self, **kwargs):
        """
        Returns a diagnostic plot to see if the bleedthrough spline estimation
        is working.
        
        Returns
        -------
        IView
            An IView; call plot() to see the diagnostic plots
        """

        if not self.ignore_deprecated:
            raise util.CytoflowOpError(
                "BleedthroughPiecewiseOp is DEPRECATED. "
                "To use it anyway, set ignore_deprected "
                "to True.")

        if set(self.controls.keys()) != set(self._splines.keys()):
            raise util.CytoflowOpError(
                "Must have both the controls and bleedthrough to plot")

        return BleedthroughPiecewiseDiagnostic(op=self, **kwargs)
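
# A minimal sketch (not cytoflow API) of the spline-plus-mesh scheme the Notes
# section above describes: fit a piecewise-linear LSQUnivariateSpline to
# single-color control data, tabulate it on a regular mesh, and hand the mesh
# to a RegularGridInterpolator for fast, vectorized per-event lookups.  All
# names and numbers here are illustrative.
def _demo_spline_mesh():
    import numpy as np
    import scipy.interpolate

    # synthetic single-color control: the 'to' channel sees ~10% bleedthrough
    rng = np.random.default_rng(0)
    from_data = np.sort(rng.uniform(0, 1000, 500))
    to_data = 0.1 * from_data + rng.normal(0, 5, 500)

    # piecewise-linear spline (k = 1) with interior knots, mirroring the
    # LSQUnivariateSpline call in estimate() above
    knots = np.linspace(100, 900, 10)
    spline = scipy.interpolate.LSQUnivariateSpline(
        from_data, to_data, t=knots, k=1)

    # tabulate the spline on a coarse mesh; estimating an event's bleedthrough
    # is then a cheap linear-interpolation lookup, not a spline evaluation
    mesh_axis = np.linspace(0, 1000, 32)
    interp = scipy.interpolate.RegularGridInterpolator(
        points=(mesh_axis,), values=spline(mesh_axis),
        bounds_error=False, fill_value=0.0)

    # the returned estimates would be subtracted from the measured 'to' channel
    events = rng.uniform(0, 1000, 10)
    return interp(events.reshape(-1, 1))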
Example #3
class AutofluorescenceOp(HasStrictTraits):
    """
    Apply autofluorescence correction to a set of fluorescence channels.
    
    The :meth:`estimate` function loads a separate FCS file (not part of the input
    :class:`.Experiment`) and computes the untransformed median and standard deviation 
    of the blank cells.  Then, :meth:`apply` subtracts the median from the 
    experiment data.
    
    To use, set the :attr:`blank_file` property to point to an FCS file with
    unstained or nonfluorescing cells in it; set the :attr:`channels` 
    property to a list of channels to correct.
    
    :meth:`apply` also adds the ``af_median`` and ``af_stdev`` metadata to the 
    corrected channels, representing the median and standard deviation of the 
    measured blank distributions.
    
    Attributes
    ----------       
    channels : List(Str)
        The channels to correct.
        
    blank_file : File
        The filename of a file with "blank" cells (not fluorescent).  Used
        to :meth:`estimate` the autofluorescence.
        
    Examples
    --------
    Create a small experiment:
    
    .. plot::
        :context: close-figs
    
        >>> import cytoflow as flow
        >>> import_op = flow.ImportOp()
        >>> import_op.tubes = [flow.Tube(file = "tasbe/rby.fcs")]
        >>> ex = import_op.apply()
    
    Create and parameterize the operation
    
    .. plot::
        :context: close-figs

        >>> af_op = flow.AutofluorescenceOp()
        >>> af_op.channels = ["Pacific Blue-A", "FITC-A", "PE-Tx-Red-YG-A"]
        >>> af_op.blank_file = "tasbe/blank.fcs"
    
    Estimate the model parameters
    
    .. plot::
        :context: close-figs 
    
        >>> af_op.estimate(ex)
    
    Plot the diagnostic plot
    
    .. plot::
        :context: close-figs

        >>> af_op.default_view().plot(ex)  

    Apply the operation to the experiment
    
    .. plot::
        :context: close-figs
    
        >>> ex2 = af_op.apply(ex)  
        
    """

    # traits
    id = Constant('edu.mit.synbio.cytoflow.operations.autofluorescence')
    friendly_id = Constant("Autofluorescence correction")

    name = Constant("Autofluorescence")
    channels = List(Str)
    blank_file = File(exists=True)

    _af_median = Dict(Str, CFloat, transient=True)
    _af_stdev = Dict(Str, CFloat, transient=True)

    def estimate(self, experiment, subset=None):
        """
        Estimate the autofluorescence from :attr:`blank_file` in channels
        specified in :attr:`channels`.  
        
        Parameters
        ----------
        experiment : Experiment
            The experiment to which this operation is applied
            
        subset : str (default = "")
            An expression that specifies the events used to compute the 
            autofluorescence

        """
        if experiment is None:
            raise util.CytoflowOpError('experiment', "No experiment specified")

        if not self.channels:
            raise util.CytoflowOpError('channels', "No channels specified")

        if not set(self.channels) <= set(experiment.channels):
            raise util.CytoflowOpError(
                'channels', "Specified channels that weren't found "
                "in the experiment.")

        # don't have to validate that blank_file exists; should crap out on
        # trying to set a bad value

        # make a little Experiment
        check_tube(self.blank_file, experiment)
        blank_exp = ImportOp(
            tubes=[Tube(file=self.blank_file)],
            channels={
                experiment.metadata[c]["fcs_name"]: c
                for c in experiment.channels
            },
            name_metadata=experiment.metadata['name_metadata']).apply()

        # apply previous operations
        for op in experiment.history:
            blank_exp = op.apply(blank_exp)

        # subset it
        if subset:
            try:
                blank_exp = blank_exp.query(subset)
            except Exception as exc:
                raise util.CytoflowOpError(
                    'subset',
                    "Subset string '{0}' isn't valid".format(subset)) from exc

            if len(blank_exp.data) == 0:
                raise util.CytoflowOpError(
                    'subset',
                    "Subset string '{0}' returned no events".format(subset))

        for channel in self.channels:
            channel_min = blank_exp[channel].quantile(0.025)
            channel_max = blank_exp[channel].quantile(0.975)

            blank_exp[channel] = blank_exp[channel].clip(
                channel_min, channel_max)

            self._af_median[channel] = np.median(blank_exp[channel])
            self._af_stdev[channel] = np.std(blank_exp[channel])

    def apply(self, experiment):
        """
        Applies the autofluorescence correction to channels in an experiment.
        
        Parameters
        ----------
        experiment : Experiment
            the experiment to which this op is applied
            
        Returns
        -------
        Experiment
            a new experiment with the autofluorescence median subtracted.  The
            corrected channels have the following metadata added to them:
            
            - **af_median** : Float
              The median of the non-fluorescent distribution
        
            - **af_stdev** : Float
              The standard deviation of the non-fluorescent distribution
        """
        if experiment is None:
            raise util.CytoflowOpError('experiment', "No experiment specified")

        if not self.channels:
            raise util.CytoflowOpError('channels', "No channels specified")

        if not self._af_median:
            raise util.CytoflowOpError(
                None, "Autofluorescence values aren't set. Did "
                "you forget to run estimate()?")

        if not set(self._af_median.keys()) <= set(experiment.channels) or \
           not set(self._af_stdev.keys()) <= set(experiment.channels):
            raise util.CytoflowOpError(
                None, "Autofluorescence estimates aren't set, or are "
                "different than those in the experiment "
                "parameter. Did you forget to run estimate()?")

        if not set(self._af_median.keys()) == set(self._af_stdev.keys()):
            raise util.CytoflowOpError(
                None, "Median and stdev keys are different! "
                "What the hell happened?!")

        if not set(self.channels) == set(self._af_median.keys()):
            raise util.CytoflowOpError(
                'channels', "Estimated channels differ from the channels "
                "parameter.  Did you forget to (re)run estimate()?")

        new_experiment = experiment.clone()

        for channel in self.channels:
            new_experiment[channel] = \
                experiment[channel] - self._af_median[channel]

            new_experiment.metadata[channel]['af_median'] = self._af_median[
                channel]
            new_experiment.metadata[channel]['af_stdev'] = self._af_stdev[
                channel]

        new_experiment.history.append(
            self.clone_traits(transient=lambda t: True))

        return new_experiment

    def default_view(self, **kwargs):
        """
        Returns a diagnostic plot to see if the autofluorescence estimation
        is working.
        
        Returns
        -------
        IView
            An diagnostic view, call :meth:`~AutofluorescenceDiagnosticView.plot` 
            to see the diagnostic plots
        """
        return AutofluorescenceDiagnosticView(op=self, **kwargs)
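
# A short sketch (illustrative, not cytoflow API) of what estimate() and
# apply() above boil down to numerically: clip the blank data to its central
# 95%, record the median and standard deviation, then subtract the median
# from the experimental data.
def _demo_af_correction():
    import numpy as np
    import pandas as pd

    rng = np.random.default_rng(0)
    blank = pd.Series(rng.normal(50, 10, 10000))    # unstained cells
    data = pd.Series(rng.normal(500, 100, 10000))   # stained cells

    clipped = blank.clip(blank.quantile(0.025), blank.quantile(0.975))
    af_median, af_stdev = np.median(clipped), np.std(clipped)

    # the correction itself is a per-channel median subtraction
    return data - af_median, af_median, af_stdev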
Example #4
class LogicleScale(HasStrictTraits):
    """
    A scale that transforms the data using the `logicle` function.
    
    This scaling method implements a "linear-like" region around 0, and a
    "log-like" region for large values, with a very smooth transition between
    them.  It's particularly good for compensated data, and data where you have
    "negative" events (events with a fluorescence of ~0.)
    
    If you don't have any data around 0, you might be better off with a more
    traditional log scale.
    
    The transformation has one parameter, `W`, which specifies the width of
    the "linear" range in log10 decades.  By default, the optimal value is
    estimated from the data; but if you assign a value to `W` it will be used.
    `0.5` is usually a good start.
    
    Attributes
    ----------
    experiment : Instance(cytoflow.Experiment)
        the `cytoflow.Experiment` used to estimate the scale parameters.
        
    channel : Str
        If set, choose scale parameters from this channel in `experiment`.
        One of `channel`, `condition` or `statistic` must be set.
        
    condition : Str
        If set, choose scale parameters from this condition in `experiment`.
        One of `channel`, `condition` or `statistic` must be set.
        
    statistic : Tuple(Str, Str)
        If set, choose scale parameters from this statistic in `experiment`.
        One of `channel`, `condition` or `statistic` must be set.
        
    quantiles : Tuple(Float, Float) (default = (0.001, 0.999))
        If there are a few very large or very small values, this can throw off
        matplotlib's choice of default axis ranges.  Set `quantiles` to choose
        what part of the data to consider when choosing axis ranges.
        
    W : Float (default = estimated from data)
        The width of the linear range, in log10 decades.  By default it is
        estimated from the data; you can also assign a fixed value like 0.5.
        
    M : Float (default = 4.5)
        The width of the log portion of the display, in log10 decades.  
        
    A : Float (default = 0.0)
        Additional decades of negative data to include.  The default display 
        usually captures all the data, so 0 is fine to start.
    
    r : Float (default = 0.05)
        Quantile used to estimate `W`.
    
    References
    ----------
    [1] A new "Logicle" display method avoids deceptive effects of logarithmic 
        scaling for low signals and compensated data.
        Parks DR, Roederer M, Moore WA.
        Cytometry A. 2006 Jun;69(6):541-51.
        PMID: 16604519
        http://onlinelibrary.wiley.com/doi/10.1002/cyto.a.20258/full
        
    [2] Update for the logicle data scale including operational code 
        implementations.
        Moore WA, Parks DR.
        Cytometry A. 2012 Apr;81(4):273-7. 
        doi: 10.1002/cyto.a.22030 
        PMID: 22411901
        http://onlinelibrary.wiley.com/doi/10.1002/cyto.a.22030/full
    """    

    id = Constant("edu.mit.synbio.cytoflow.utility.logicle_scale")        
    name = "logicle"
    
    experiment = Instance("cytoflow.Experiment")
    
    # what data do we use to compute scale parameters?  set one.
    channel = Str
    condition = Str
    statistic = Tuple(Str, Str)
    error_statistic = Tuple(Str, Str)
    data = Array

    W = Property(Float, depends_on = "[experiment, channel, M, _T, r]")
    M = Float(4.5, desc = "the width of the display in log10 decades")
    A = Float(0.0, desc = "additional decades of negative data to include.")
    r = Float(0.05, desc = "quantile to use for estimating the W parameter.")

    _W = Float(Undefined)
    _T = Property(Float, depends_on = "[experiment, condition, channel]")
    _logicle = Property(Instance(FastLogicle), depends_on = "[_T, W, M, A]")

    mpl_params = Property(Dict, depends_on = "_logicle")
    
    def __call__(self, data):
        """
        Transforms `data` using this scale.
        
        Careful!  May return `NaN` if the scale domain doesn't match the data 
        (i.e., applying a log10 scale to negative numbers.)
        """
        
        try:
            logicle_min = self._logicle.inverse(0.0)
            logicle_max = self._logicle.inverse(1.0 - sys.float_info.epsilon)
            if isinstance(data, pd.Series):            
                data = data.clip(logicle_min, logicle_max)
                return data.apply(self._logicle.scale)
            elif isinstance(data, np.ndarray):
                data = np.clip(data, logicle_min, logicle_max)
                scale = np.vectorize(self._logicle.scale)
                return scale(data)
            elif isinstance(data, float):
                data = max(min(data, logicle_max), logicle_min)
                return self._logicle.scale(data)
            else:
                try:
                    return list(map(self._logicle.scale, data))
                except TypeError as e:
                    raise CytoflowError("Unknown data type") from e
        except ValueError as e:
            raise CytoflowError(str(e))

        
    def inverse(self, data):
        """
        Transforms `data` using the inverse of this scale.
        """
        try:
            if isinstance(data, pd.Series):            
                data = data.clip(0, 1.0 - sys.float_info.epsilon)
                return data.apply(self._logicle.inverse)
            elif isinstance(data, np.ndarray):
                data = np.clip(data, 0, 1.0 - sys.float_info.epsilon)
                inverse = np.vectorize(self._logicle.inverse)
                return inverse(data)
            elif isinstance(data, float):
                data = max(min(data, 1.0 - sys.float_info.epsilon), 0.0)
                return self._logicle.inverse(data)
            else:
                try:
                    return list(map(self._logicle.inverse, data))
                except TypeError as e:
                    raise CytoflowError("Unknown data type") from e
        except ValueError as e:
            raise CytoflowError(str(e))
        
    def clip(self, data):
        try:
            logicle_min = self._logicle.inverse(0.0)
            logicle_max = self._logicle.inverse(1.0 - sys.float_info.epsilon)
            if isinstance(data, pd.Series):            
                return data.clip(logicle_min, logicle_max)
            elif isinstance(data, np.ndarray):
                return np.clip(data, logicle_min, logicle_max)
            elif isinstance(data, float):
                return max(min(data, logicle_max), logicle_min)
            else:
                try:
                    return [max(min(x, logicle_max), logicle_min) for x in data]
                except TypeError as e:
                    raise CytoflowError("Unknown data type") from e
        except ValueError as e:
            raise CytoflowError(str(e))
        
    def color_norm(self):
        # it turns out that Logicle is already defined as a normalization to 
        # [0, 1].
        class LogicleNormalize(matplotlib.colors.Normalize):
            def __init__(self, scale = None):
                self._scale = scale
                self.vmin = scale.inverse(0.0)
                self.vmax = scale.inverse(1.0 - sys.float_info.epsilon)
                
            def __call__(self, data, clip = None):
                ret = self._scale(data)
                return np.ma.masked_array(ret)
            
        return LogicleNormalize(scale = self)
        
    
    @cached_property
    def _get__T(self):
        "The range of possible data values"
        if self.experiment:
            if self.channel and self.channel in self.experiment.channels:
                if "range" in self.experiment.metadata[self.channel]:
                    return self.experiment.metadata[self.channel]["range"]
                else:
                    return self.experiment.data[self.channel].max()
            elif self.condition and self.condition in self.experiment.conditions:
                return self.experiment.data[self.condition].max()
            elif self.statistic in self.experiment.statistics \
                 and self.error_statistic not in self.experiment.statistics:
                stat = self.experiment.statistics[self.statistic]
                assert is_numeric(stat)
                return stat.max()
            elif self.statistic in self.experiment.statistics and \
                 self.error_statistic in self.experiment.statistics:
                stat = self.experiment.statistics[self.statistic]
                err_stat = self.experiment.statistics[self.error_statistic]
                
                try:
                    err_max = max([max(x) for x in err_stat])
                    return err_max
                except (TypeError, IndexError):
                    err_max = err_stat.max()
                    stat_max = stat.max()

                    return stat_max + err_max 
            elif self.data.size > 0:
                return self.data.max()
            else:
                return Undefined
        else:
            return Undefined
        
    @cached_property
    def _get_W(self):
        if not self.experiment:
            return Undefined
        
        if self._W is not Undefined:
            return self._W
        
        if self.channel and self.channel in self.experiment.channels:
            data = self.experiment[self.channel]
            
            if self.r <= 0 or self.r >= 1:
                raise CytoflowError("r must be between 0 and 1")
            
            # get the range by finding the rth quantile of the negative values
            neg_values = data[data < 0]
            if not neg_values.empty:
                r_value = neg_values.quantile(self.r)
                W = (self.M - math.log10(self._T/math.fabs(r_value)))/2
                if W <= 0:
                    warn("Channel {0} doesn't have enough negative data. " 
                         "Try a log transform instead."
                         .format(self.channel),
                         CytoflowWarning)
                    return 0.5
                else:
                    return W
            else:
                # ... unless there aren't any negative values, in which case
                # you probably shouldn't use this transform
                warn("Channel {0} doesn't have any negative data. " 
                     "Try a log transform instead."
                     .format(self.channel),
                     CytoflowWarning)
                return 0.5
        else:
            return 0.5  # a reasonable default for non-channel scales
        
    def _set_W(self, value):
        self._W = value
        
    @cached_property
    def _get__logicle(self):
        if self.W is Undefined or self._T is Undefined:
            return Undefined
        
        if self._T <= 0:
            raise CytoflowError("Logicle range must be > 0")
        
        if self.W < 0:
            raise CytoflowError("Logicle param W must be >= 0")
        
        if self.M <= 0:
            raise CytoflowError("Logicle param M must be > 0")
        
        if (2 * self.W > self.M):
            raise CytoflowError("Logicle param W is too large; it must be "
                                "less than half of param M.")
        
        if (-self.A > self.W or self.A + self.W > self.M - self.W):
            raise CytoflowError("Logicle param A is too large.")
         
        return FastLogicle(self._T, self.W, self.M, self.A)
    
    @cached_property
    def _get_mpl_params(self):
        return {"logicle" : self._logicle} 
Example #5
class BeadCalibrationDiagnostic(HasStrictTraits):
    """
    A diagnostic view for `BeadCalibrationOp`.
        
    Plots the smoothed histogram of the bead data; the peak locations;
    a scatter plot of the raw bead fluorescence values vs the calibrated unit 
    values; and a line plot of the model that was computed.  Make sure that the
    relationship is linear; if it's not, it likely isn't a good calibration!
    
    Attributes
    ----------
    op : Instance(BeadCalibrationOp)
        The operation instance whose parameters we're plotting.  Set 
        automatically if you created the instance using 
        :meth:`BeadCalibrationOp.default_view`.

    """
    
    # traits   
    id = Constant("edu.mit.synbio.cytoflow.view.beadcalibrationdiagnosticview")
    friendly_id = Constant("Bead Calibration Diagnostic")
        
    op = Instance(BeadCalibrationOp)
    
    def plot(self, experiment):
        """
        Plots the diagnostic view.
        
        Parameters
        ----------
        experiment : Experiment
            The experiment used to create the diagnostic plot.
        
        """

        if experiment is None:
            raise util.CytoflowViewError('experiment', "No experiment specified")

        channels = list(self.op.units.keys())

        if not channels:
            raise util.CytoflowViewError(None, "No channels to plot")

        if set(channels) != set(self.op._histograms.keys()):
            raise util.CytoflowViewError(None, "You must estimate the parameters "
                                               "before plotting")

        plt.figure()
        
        for idx, channel in enumerate(channels):            
            _, hist_bins, hist_smooth = self.op._histograms[channel]
                
            plt.subplot(len(channels), 2, 2 * idx + 1)
            plt.xscale('log')
            plt.xlabel(channel)
            plt.plot(hist_bins[1:], hist_smooth)
            
            plt.axvline(self.op.bead_brightness_threshold, color = 'blue', linestyle = '--' )
            if self.op.bead_brightness_cutoff:
                plt.axvline(self.op.bead_brightness_cutoff, color = 'blue', linestyle = '--' )
            else:
                plt.axvline(experiment.metadata[channel]['range'] * 0.7, color = 'blue', linestyle = '--')                

            if channel in self.op._peaks:
                for peak in self.op._peaks[channel]:
                    plt.axvline(peak, color = 'r')
                    
            if channel in self.op._peaks and channel in self.op._mefs:
                plt.subplot(len(channels), 2, 2 * idx + 2)
                plt.xscale('log')
                plt.yscale('log')
                plt.xlabel(channel)
                plt.ylabel(self.op.units[channel])
                plt.plot(self.op._peaks[channel], 
                         self.op._mefs[channel], 
                         marker = 'o')
                
                xmin, xmax = plt.xlim()
                x = np.logspace(np.log10(xmin), np.log10(xmax))
                plt.plot(x, 
                         self.op._calibration_functions[channel](x), 
                         color = 'r', linestyle = ':')
            
        plt.tight_layout(pad = 0.8)
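
# Sketch of the kind of calibration model the right-hand panels visualize: a
# straight-line fit between log10(peak fluorescence) and log10(MEF), evaluated
# back in linear space.  Illustrative only; BeadCalibrationOp's actual fitting
# lives in its estimate() method, and the numbers in the usage comment below
# are made up.
def _demo_bead_fit(peaks, mefs):
    import numpy as np

    slope, intercept = np.polyfit(np.log10(peaks), np.log10(mefs), deg=1)

    def calibration_fn(x):
        return 10 ** (slope * np.log10(x) + intercept)

    return calibration_fn

# e.g.: _demo_bead_fit([94, 1020, 10010], [792, 8234, 79500])(500.0)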
            
class TasbeCalibrationView(PluginViewMixin):
    handler_factory = Callable(TasbeViewHandler)
    op = Instance(TasbeCalibrationOp)

    id = "edu.mit.synbio.cytoflowgui.op_plugins.tasbe"
    friendly_id = "TASBE Calibration"

    name = Constant("TASBE Calibration")

    fsc_channel = DelegatesTo('op')
    ssc_channel = DelegatesTo('op')

    _polygon_view = Instance(PolygonSelection, transient=True)
    interactive = Property(Bool)

    def _get_interactive(self):
        if self._polygon_view:
            return self._polygon_view.interactive
        else:
            return False

    def _set_interactive(self, val):
        if self._polygon_view:
            self._polygon_view.interactive = val

    def plot_wi(self, wi):
        self.plot(None, plot_name=self.current_plot)

    def enum_plots(self, experiment):
        return iter([
            "Morphology", "Autofluorescence", "Bleedthrough",
            "Bead Calibration", "Color Translation"
        ])

    def enum_plots_wi(self, wi):
        return iter([
            "Morphology", "Autofluorescence", "Bleedthrough",
            "Bead Calibration", "Color Translation"
        ])

    def should_plot(self, changed, payload):
        """
        Should the owning WorkflowItem refresh the plot when certain things
        change?  `changed` can be:
        - Changed.VIEW -- the view's parameters changed
        - Changed.RESULT -- this WorkflowItem's result changed
        - Changed.PREV_RESULT -- the previous WorkflowItem's result changed
        - Changed.ESTIMATE_RESULT -- the results of calling "estimate" changed

        """
        if changed == Changed.VIEW:
            _, name, _ = payload
            if self.current_plot == 'Morphology' and (name == 'fsc_channel' or
                                                      name == 'ssc_channel'):
                return True
            elif name == 'current_plot':
                return True
        elif changed == Changed.PREV_RESULT:
            if self.current_plot == payload:
                return True

        return False

    def plot(self, experiment, plot_name=None, **kwargs):

        if plot_name not in [
                "Morphology", "Autofluorescence", "Bleedthrough",
                "Bead Calibration", "Color Translation"
        ]:
            raise util.CytoflowViewError(
                "Which plot do you want?  Must be one "
                "of \"Morphology\", \"Autofluorescence\", "
                "\"Bleedthrough\", \"Bead Calibration\", "
                "or \"Color Translation\"")

        if not self.op._blank_exp:
            raise util.CytoflowViewError(
                "Must set at least the blank control file!")

        new_ex = self.op._blank_exp.clone()

        if plot_name == "Morphology":
            if not self._polygon_view:
                self._polygon_view = self.op._polygon_op.default_view()

            self._polygon_view.plot(new_ex, **kwargs)

            return
        else:
            new_ex = self.op._polygon_op.apply(new_ex)

        if plot_name == "Autofluorescence":
            self.op._af_op.default_view().plot(new_ex, **kwargs)
            return
        else:
            new_ex = self.op._af_op.apply(new_ex)

        if plot_name == "Bleedthrough":
            self.op._bleedthrough_op.default_view().plot(new_ex, **kwargs)
            return
        else:
            new_ex = self.op._bleedthrough_op.apply(new_ex)

        if plot_name == "Bead Calibration":
            self.op._bead_calibration_op.default_view().plot(new_ex, **kwargs)
            return
        else:
            new_ex = self.op._bead_calibration_op.apply(new_ex)

        if plot_name == "Color Translation":
            self.op._color_translation_op.default_view().plot(new_ex, **kwargs)
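
# The plot() method above follows a staged-pipeline pattern: walk the
# calibration stages in order, and at the requested stage draw its diagnostic
# view instead of applying it.  A generic sketch of that control flow; the
# stage objects are hypothetical, assumed to expose cytoflow-style
# default_view() and apply() methods:
def plot_pipeline_stage(stages, experiment, plot_name, **kwargs):
    for name, op in stages:
        if name == plot_name:
            op.default_view().plot(experiment, **kwargs)
            return
        experiment = op.apply(experiment)
    raise ValueError("Unknown plot: {}".format(plot_name))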
Example #7
class PolygonSelection(Op2DView, ScatterplotView):
    """
    Plots, and lets the user interact with, a 2D polygon selection.
    
    Attributes
    ----------
    interactive : bool
        is this view interactive?  I.e., can the user set the polygon vertices
        with mouse clicks?
        
    Examples
    --------

    In a Jupyter notebook with `%matplotlib notebook`
    
    >>> s = flow.PolygonOp(xchannel = "V2-A",
    ...                    ychannel = "Y2-A")
    >>> poly = s.default_view()
    >>> poly.plot(ex2)
    >>> poly.interactive = True
    """

    id = Constant('edu.mit.synbio.cytoflow.views.polygon')
    friendly_id = Constant("Polygon Selection")

    xfacet = Constant(None)
    yfacet = Constant(None)

    interactive = Bool(False, transient=True)

    # internal state.
    _ax = Any(transient=True)
    _widget = Instance(util.PolygonSelector, transient=True)
    _patch = Instance(mpl.patches.PathPatch, transient=True)

    def plot(self, experiment, **kwargs):
        """
        Plot the scatter plot, and then plot the selection on top of it.
        
        Parameters
        ----------
        
        """

        super(PolygonSelection, self).plot(experiment, **kwargs)
        self._ax = plt.gca()
        self._draw_poly()
        self._interactive()

    @on_trait_change('op.vertices', post_init=True)
    def _draw_poly(self):
        if not self._ax:
            return

        if self._patch and self._patch in self._ax.patches:
            self._patch.remove()

        if not self.op.vertices or len(self.op.vertices) < 3:
            return

        patch_vert = np.concatenate(
            (np.array(self.op.vertices), np.array((0, 0), ndmin=2)))

        self._patch = \
            mpl.patches.PathPatch(mpl.path.Path(patch_vert, closed = True),
                                  edgecolor="black",
                                  linewidth = 2,
                                  fill = False)

        self._ax.add_patch(self._patch)
        plt.draw()

    @on_trait_change('interactive', post_init=True)
    def _interactive(self):
        if self._ax and self.interactive:
            self._widget = util.PolygonSelector(self._ax,
                                                self._onselect,
                                                useblit=True)
        elif self._widget:
            self._widget = None

    def _onselect(self, vertices):
        self.op.vertices = vertices
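
# Once the user has drawn the vertices, applying the polygon gate reduces to
# a point-in-polygon test.  A sketch using matplotlib's Path (the same class
# the view uses for drawing); the DataFrame and channel names are
# illustrative:
def _demo_polygon_gate(df, vertices, xchannel="V2-A", ychannel="Y2-A"):
    import numpy as np
    import matplotlib.path

    path = matplotlib.path.Path(vertices, closed=True)
    points = np.column_stack((df[xchannel].values, df[ychannel].values))
    return path.contains_points(points)   # boolean mask, one entry per event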
Example #8
class FlowPeaks2DDensityView(By2DView, AnnotatingView, NullView):
    """
    A two-dimensional diagnostic view for :class:`FlowPeaksOp`.  Plots the
    estimated density function of the two channels, then overlays the k-means 
    centroids in blue and the clusters-of-k-means in pink.

    Attributes
    ----------    
        
    """

    id = Constant('edu.mit.synbio.cytoflow.view.flowpeaks2ddensityview')
    friendly_id = Constant("FlowPeaks 2D Diagnostic Plot (Density)")

    xchannel = Str
    ychannel = Str
    xscale = util.ScaleEnum
    yscale = util.ScaleEnum
    huefacet = Constant(None)

    def plot(self, experiment, **kwargs):
        """
        Plot the plots.
        
        Parameters
        ----------
        """

        if experiment is None:
            raise util.CytoflowViewError('experiment',
                                         "No experiment specified")

        if not self.op._kmeans:
            raise util.CytoflowViewError(
                None, "Must estimate a model before plotting "
                "the density plot.")

        annotations = {}

        if self.xchannel in self.op._scale:
            xscale = self.op._scale[self.xchannel]
        else:
            xscale = util.scale_factory(self.xscale,
                                        experiment,
                                        channel=self.xchannel)

        if self.ychannel in self.op._scale:
            yscale = self.op._scale[self.ychannel]
        else:
            yscale = util.scale_factory(self.yscale,
                                        experiment,
                                        channel=self.ychannel)

        for k in self.op._kmeans:
            annotations[k] = (self.op._kmeans[k], self.op._peaks[k],
                              self.op._cluster_peak[k], self.op._density[k])

        super().plot(experiment,
                     annotations=annotations,
                     xscale=xscale,
                     yscale=yscale,
                     **kwargs)

    def _grid_plot(self, experiment, grid, **kwargs):
        # all the real plotting happens in _annotation_plot.  this just sets some
        # defaults and then stores them for later.

        kwargs.setdefault('antialiased', False)
        kwargs.setdefault('linewidth', 0)
        kwargs.setdefault('edgecolors', 'face')
        kwargs.setdefault('cmap', plt.get_cmap('viridis'))

        xscale = kwargs['scale'][self.xchannel]
        xlim = kwargs['lim'][self.xchannel]
        yscale = kwargs['scale'][self.ychannel]
        ylim = kwargs['lim'][self.ychannel]

        under_color = kwargs.pop('under_color', None)
        if under_color is not None:
            kwargs['cmap'].set_under(color=under_color)
        else:
            kwargs['cmap'].set_under(color=kwargs['cmap'](0.0))

        bad_color = kwargs.pop('bad_color', None)
        if bad_color is not None:
            kwargs['cmap'].set_bad(color=bad_color)

        gridsize = kwargs.pop('gridsize', 50)
        xbins = xscale.inverse(
            np.linspace(xscale(xlim[0]), xscale(xlim[1]), gridsize))
        ybins = yscale.inverse(
            np.linspace(yscale(ylim[0]), yscale(ylim[1]), gridsize))

        for (i, j, _), _ in grid.facet_data():
            ax = grid.facet_axis(i, j)
            ax.fp_xbins = xbins
            ax.fp_ybins = ybins
            ax.fp_keywords = kwargs

        super()._grid_plot(experiment, grid, **kwargs)

        return dict(xscale=xscale,
                    xlim=xlim,
                    yscale=yscale,
                    ylim=ylim,
                    cmap=kwargs['cmap'])

    def _annotation_plot(self, axes, annotation, annotation_facet,
                         annotation_value, annotation_color, **kwargs):

        km = annotation[0]
        peaks = annotation[1]
        cluster_peak = annotation[2]
        density = annotation[3]

        xbins = axes.fp_xbins
        ybins = axes.fp_ybins
        kwargs = axes.fp_keywords

        # get rid of some kwargs that confuse pcolormesh
        kwargs.pop('annotations', None)
        kwargs.pop('annotation_facet', None)
        kwargs.pop('plot_name', None)

        xscale = kwargs['scale'][self.xchannel]
        yscale = kwargs['scale'][self.ychannel]

        kwargs.pop('scale')
        kwargs.pop('lim')

        h = density(util.cartesian([xscale(xbins), yscale(ybins)]))
        h = np.reshape(h, (len(xbins), len(ybins)))
        axes.pcolormesh(xbins, ybins, h.T, **kwargs)

        ix = self.op.channels.index(self.xchannel)
        iy = self.op.channels.index(self.ychannel)

        for k in range(len(km.cluster_centers_)):

            x = self.op._scale[self.xchannel].inverse(
                km.cluster_centers_[k][ix])
            y = self.op._scale[self.ychannel].inverse(
                km.cluster_centers_[k][iy])

            plt.plot(x, y, '*', color='blue')

            peak_idx = cluster_peak[k]
            peak = peaks[peak_idx]
            peak_x = xscale.inverse(peak[0])
            peak_y = yscale.inverse(peak[1])

            plt.plot([x, peak_x], [y, peak_y])

        for peak in peaks:
            x = xscale.inverse(peak[0])
            y = yscale.inverse(peak[1])
            plt.plot(x, y, 'o', color="magenta")
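
# The density panel above evaluates a fitted density on the cartesian product
# of the two bin axes, reshapes the result to a grid, and hands it to
# pcolormesh.  A self-contained sketch of that pattern; the gaussian here is
# a stand-in for op._density[k]:
def _demo_density_mesh():
    import numpy as np
    import matplotlib.pyplot as plt

    xbins = np.linspace(-3, 3, 50)
    ybins = np.linspace(-3, 3, 50)
    xx, yy = np.meshgrid(xbins, ybins, indexing='ij')

    # evaluate the density at every mesh point; transpose so rows run along
    # y, which is the orientation pcolormesh expects
    h = np.exp(-(xx ** 2 + yy ** 2) / 2)
    plt.pcolormesh(xbins, ybins, h.T, shading='auto')
    plt.show()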
Example #9
class BruteForceOptimizerStep(ExperimentOptimizerStep):
    """ Optimize a set of simulation parameters to model the provided
    experiment using the grid search (brute force) approach.

    If sim_group_max_size is 0, the step creates 1 simulation grid around a
    simulation built to model each target experiment. If sim_group_max_size is
    a positive integer, all simulations for a target experiment are split into
    groups of size less than or equal to sim_group_max_size.

    When a simulation grid is fully run, the cost of each simulation to the
    corresponding target experiment is computed using the cost function
    attribute. The cost data from each simulation grid is stored in the
    group_cost_data dict and combined into the step's cost_data once the
    simulation names are stripped.
    """
    # General step traits -----------------------------------------------------

    #: Type of the optimizer step
    optimizer_step_type = Constant(OPTIMIZER_STEP_TYPE)

    #: List of parameter objects to scan
    parameter_list = List(ParameterScanDescription)

    #: List of parameter names to scan
    scanned_param_names = Property(List(Str), depends_on="parameter_list[]")

    # SimulationGroup related traits ------------------------------------------

    #: List of simulation groups, scanning desired parameters, 1 per target exp
    # Built from start_point_simulation and scanned_params if not provided.
    simulation_groups = List(Instance(SimulationGroup))

    #: Cost function to minimize, one per simulation group
    group_cost_functions = Dict(Str, Callable)

    #: Maximum size for each of the simulation groups in the step
    # if the step needs a larger grid, it will be split into SimGroups of size
    # less or equal to this
    sim_group_max_size = Int

    #: Number of the next simulation group to run
    _next_group_to_run = Int(0)

    #: Local storage of the job_manager to run subsequent groups
    _job_manager = Instance(JobManager)

    #: Make the run call blocking?
    _wait_on_run = Bool

    # Run related traits ------------------------------------------------------

    # Total number of simulations involved in the optimization step
    size = Property(Int, depends_on="simulation_groups[]")

    #: Number of simulations already run
    size_run = Property(Int, depends_on="simulation_groups.size_run")

    #: Percentage of the optimizer that has already run
    percent_run = Property(Str, depends_on="size_run")

    # Output related traits ---------------------------------------------------

    #: Aggregation method to combine costs for all components & all experiments
    cost_agg_func = Enum("sum", "mean")

    #: Dict mapping each simulation group to its cost data.
    _group_cost_data = Dict

    #: Dict mapping each component to a list of the best simulations
    optimal_simulation_for_comp = Dict

    # Run related methods -----------------------------------------------------

    def run(self, job_manager, wait=False):
        """ Run optimization step by running all simulation groups it contains.
        """
        # Initialize run parameters
        super(BruteForceOptimizerStep, self).run(job_manager, wait=wait)
        if not self.simulation_groups:
            self.initialize_sim_group()

        first_group = self.simulation_groups[0]
        runner = first_group.run(job_manager, wait=wait)

        self._job_manager = job_manager
        self._next_group_to_run = 1
        self._wait_on_run = wait

        return runner

    def wait(self):
        """ Wait for currently known simulation groups to finish running.
        """
        for group in self.simulation_groups:
            msg = "Waiting for {} to finish...".format(group.name)
            logger.debug(msg)
            group.wait()

    def initialize_sim_group(self):
        """ Initialize simulation groups with one based on self attribute.

        Depending on the group_max_size, there may be multiple simulation
        groups to target a given experiment.
        """
        for exp, start_point_sim in zip(self.target_experiments,
                                        self.starting_point_simulations):
            name = "Grid {}_{}".format(exp.name, self.name)
            groups = param_scans_to_sim_group(
                name, self.parameter_list, start_point_sim,
                max_size=self.sim_group_max_size
            )
            self.simulation_groups.extend(groups)

    # Cost related methods ----------------------------------------------------

    def recompute_costs_for_weights(self, new_weights):
        """ Assume new weights for all cost functions.

        Also recompute costs for all groups if they have already been computed.
        """
        if not self.has_run:
            self.cost_func_kw["weights"] = new_weights
            return

        # Otherwise, recompute all cost data (using cached metrics stored in
        # the cost functions):
        self.invalidate_group_cost_data()
        for group in self.simulation_groups:
            # Rebuild the simulations so that we can recover parameter values
            # for the cost data dataframe:
            if not group.simulations:
                group.initialize_simulations(use_output_cache=True)

            group_name = group.name
            cost_func = self.group_cost_functions[group_name]
            cost_func.weights = new_weights
            cost_data = cost_func.compute_costs()
            # Don't aggregate yet, to avoid triggering listeners until all
            # cost_data recomputed:
            self.update_cost_data_dict(group, cost_data, skip_aggregate=True)

        # Now we are ready to compute the step's cost_data:
        self.aggregate_cost_data()

    def compute_costs(self, sim_group, cost_function=None):
        """ Compute the costs of one of the SimulationGroups of the step.

        Also cache the cost_function for each sim_group, so that costs can be
        recomputed if weights are changed.

        Parameters
        ----------
        sim_group : SimulationGroup
            Group for which to compute costs.

        cost_function : Callable [OPTIONAL]
            Target cost function to use to compute costs. Optional: if a
            cost_function_type has been provided at step creation, and this is
            None, a cost_function will be created.
        """
        if cost_function is None:
            klass = ALL_COST_FUNCTIONS[self.cost_function_type]
            cost_function = klass(**self.cost_func_kw)

        target_exp = sim_group.center_point_simulation.source_experiment
        cost_data = cost_function(sim_group.simulations,
                                  target_exps=target_exp)
        self.group_cost_functions[sim_group.name] = cost_function
        self.update_cost_data_dict(sim_group, cost_data)

    def update_cost_data_dict(self, group, cost_data, skip_aggregate=False):
        """ Collect all cost_function cost data for all sim groups.

        Also aggregates all into the step's cost_data if the step has finished
        running. The step's cost data will aggregate data from all simulation
        groups, sum/average it over all components, and display the scanned
        parameter values alongside the aggregate cost.
        """
        if cost_data is None:
            return

        # Copy to avoid modifying the cost function object which has a hold on
        # the cost_data
        cost_data = cost_data.copy()
        simulations = group.simulations

        # Aggregate the cost function data
        df_agg_method = getattr(cost_data, self.cost_agg_func)
        cost_data[ALL_COST_COL_NAME] = df_agg_method(axis=1)

        # Add the values of the scanned parameters
        self.append_param_values(cost_data, simulations)

        # Collect the group's cost data with the rest of the data targeting the
        # same experiment if any:
        exp_name = group.center_point_simulation.source_experiment.name
        if exp_name in self._group_cost_data:
            existing = self._group_cost_data[exp_name]
            self._group_cost_data[exp_name] = pd.concat([existing, cost_data])
        else:
            self._group_cost_data[exp_name] = cost_data

        if self.has_run and not skip_aggregate:
            self.aggregate_cost_data()

    def invalidate_group_cost_data(self):
        """ Past cost_data are invalid. Delete them.
        """
        self._group_cost_data = {}

    def aggregate_cost_data(self):
        """ Aggregate cost data over all target experiment.

        The step's cost data will aggregate data from all simulation groups,
        sum/average it over all components, and display the scanned parameter
        values alongside the aggregate cost.
        """
        # Remove the simulation-name column from the final cost_data, since
        # there may be more than 1 simulation for a given parameter setup (one
        # per target experiment):
        cost_data_list = [data.drop(SIM_COL_NAME, axis=1)
                          for data in self._group_cost_data.values()]
        average_cost_data = sum(cost_data_list)
        if self.cost_agg_func == "mean":
            average_cost_data /= len(self.target_experiments)

        self.cost_data = average_cost_data

    def append_param_values(self, costs_df, simulations):
        """ Evaluate parameters for provided sims and reset as cost DF index.
        """
        for param_name in self.scanned_param_names:
            expr = "sim.{}".format(param_name)
            costs_df[param_name] = [eval(expr, {"sim": sim})
                                    for sim in simulations]
            first_val = costs_df[param_name].iloc[0]
            if isinstance(first_val, UnitScalar):
                costs_df[param_name] = costs_df[param_name].apply(float)
            elif is_squeezable(first_val):
                # FIXME: WHEN DOES THIS HAPPEN?
                costs_df[param_name] = costs_df[param_name].apply(float)
            elif is_repeating_array(first_val):
                # This can happen when a parameter is a slice of an array:
                # replace with the first value if all the same because we can't
                # index with an array (unhashable).
                costs_df[param_name] = costs_df[param_name].apply(
                    lambda x: x[0]
                )

        costs_df.reset_index(inplace=True)
        costs_df.set_index(self.scanned_param_names, inplace=True)

    # Optimal simulation methods ----------------------------------------------

    def update_optimal_simulation_for_comp(self):
        """ Extract the best simulation for each product component.
        """
        best_simulations = defaultdict(list)
        for comp in self.target_components:
            for group_cost_data in self._group_cost_data.values():
                data = group_cost_data[comp]
                try:
                    # idxmin() returns the index *label* of the minimal cost,
                    # which is what .loc[] expects below:
                    idx = data.idxmin(axis=0)
                    sim_name = group_cost_data.loc[idx, SIM_COL_NAME]
                    sim = self._get_sim_from_name(sim_name)
                    best_simulations[comp].append(sim)
                except Exception as e:
                    msg = "Failed to find the simulation with minimal cost " \
                          "for component {}. Data was {}. (Exception was {})"
                    logger.error(msg.format(comp, data, e))

        self.optimal_simulation_for_comp = best_simulations

    def get_optimal_sims(self, exp_name, num_sims):
        """ Collect optimal num_sims simulations matching specific experiment.
        """
        if len(self.cost_data) == 0:
            return []

        # We can't extract more optimal simulations than the total number of
        # available simulations (for a given experiment): the slice below
        # caps at num_sims.

        sorted_data = self.cost_data.sort_values(by=ALL_COST_COL_NAME)
        optim_sim_idx = sorted_data.index[:num_sims]
        # This assumes that self.cost_data and elements of
        # self._group_cost_data are indexed on the same columns:
        group_data = self._group_cost_data[exp_name]
        sim_names = group_data.loc[optim_sim_idx, SIM_COL_NAME].tolist()
        return [self._get_sim_from_name(name) for name in sim_names]

    # Private interface -------------------------------------------------------

    def _get_sim_from_name(self, sim_name):
        """ Find a simulation ran in the step in the simulation sim groups.

        Raises
        ------
        ValueError
            If the simulation isn't found.
        """
        pattern = "Sim (\d+)_(.+)"
        match = re.match(pattern, sim_name)
        target_sim_num, target_group_name = match.groups()
        group = self._get_group_from_name(target_group_name)
        try:
            sim = group.get_simulation(int(target_sim_num))
            if sim.name != sim_name:
                msg = "Logical error: the simulation's name isn't what was " \
                      "expected!"
                logger.error(msg)
                raise ValueError(msg)

            return sim
        except (IndexError, AssertionError) as e:
            msg = "Simulation with name {} not found in step's simulation " \
                  "groups. Error was {}."
            msg = msg.format(sim_name, e)
            logger.error(msg)
            raise ValueError(msg)

    def _get_group_from_name(self, group_name):
        """ Return the simulation group with provided name.
        """
        for group in self.simulation_groups:
            if group.name.startswith(group_name):
                return group

        msg = "SimulationGroup with name {} not found in step's groups. " \
              "Known names are {}"
        known_group_names = [group.name for group in self.simulation_groups]
        msg = msg.format(group_name, known_group_names)
        logger.error(msg)
        raise ValueError(msg)

    def _get_step_has_run(self):
        if not self.simulation_groups:
            return False
        return all([group.has_run for group in self.simulation_groups])

    # Traits listeners --------------------------------------------------------

    @on_trait_change("simulation_groups:has_run")
    def optimize_costs(self, sim_group, attr_name, group_has_run):
        self.has_run = self._get_step_has_run()
        if group_has_run:
            msg = "Group {} has finished running: updating costs."
            msg = msg.format(sim_group.name)
            logger.info(msg)

            self.compute_costs(sim_group)
            if self.has_run:
                self.update_optimal_simulation_for_comp()
            else:
                self._run_next_sim_group()

            # Save memory by throwing away simulations: they can be rebuilt
            # from the simulation diffs.
            sim_group.release_simulation_list()
            self.data_updated = True

    def _run_next_sim_group(self):
        """ A simGroup has finished running: run the next one.
        """
        next_group = self.simulation_groups[self._next_group_to_run]
        msg = "Now submitting {} to run...".format(next_group.name)
        logger.debug(msg)
        next_group.run(self._job_manager, wait=self._wait_on_run)
        self._next_group_to_run += 1

    # Traits property getters -------------------------------------------------

    def _get_size(self):
        return sum([group.size for group in self.simulation_groups])

    def _get_size_run(self):
        return sum([group.size_run for group in self.simulation_groups])

    def _get_percent_run(self):
        if self.size:
            percent_run = self.size_run / self.size * 100.
        else:
            percent_run = np.nan

        return "{:.2f} %".format(percent_run)

    def _get_scanned_param_names(self):
        step_params = []
        for param in self.parameter_list:
            p_name = param.name
            parallel_params = hasattr(param, "parallel_parameters") and \
                len(param.parallel_parameters) > 0
            if parallel_params:
                step_params.extend([p.name for p in param.parallel_parameters])

            step_params.append(p_name)

        return step_params

    # Traits initialization methods -------------------------------------------

    def _cost_data_default(self):
        cols = self.target_components + [ALL_COST_COL_NAME]
        data = {name: [] for name in cols}
        return pd.DataFrame(data, index=[])

    def _sim_group_max_size_default(self):
        preferences = get_preferences()
        return preferences.optimizer_preferences.optimizer_step_chunk_size
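
A note on the aggregation above: `aggregate_cost_data` element-wise sums the
per-experiment cost tables and, when `cost_agg_func` is "mean", divides by the
number of target experiments. A minimal standalone sketch in plain pandas (the
component and parameter names are made up for illustration):

import pandas as pd

# Hypothetical per-experiment cost tables, indexed on a scanned parameter;
# in the step, these live in self._group_cost_data (minus the sim-name column).
costs_exp1 = pd.DataFrame({"comp_a": [1.0, 3.0], "All": [2.0, 6.0]},
                          index=pd.Index([0.1, 0.2], name="param"))
costs_exp2 = pd.DataFrame({"comp_a": [2.0, 1.0], "All": [4.0, 2.0]},
                          index=pd.Index([0.1, 0.2], name="param"))

# Element-wise sum over experiments, then divide by the experiment count
# when the aggregation function is "mean":
aggregate = sum([costs_exp1, costs_exp2]) / 2
print(aggregate)
#        comp_a  All
# param
# 0.1       1.5  3.0
# 0.2       2.0  4.0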
Example #10
class FlowPeaks1DView(By1DView, AnnotatingView, HistogramView):
    """
    A one-dimensional diagnostic view for :class:`FlowPeaksOp`.  Plots a histogram
    of the channel, then overlays the k-means centroids in blue.

    Attributes
    ----------    

    """

    id = Constant('edu.mit.synbio.cytoflow.view.flowpeaks1dview')
    friendly_id = Constant("1D FlowPeaks Diagnostic Plot")

    channel = Str
    scale = util.ScaleEnum

    def plot(self, experiment, **kwargs):
        """
        Plot the plots.
        
        Parameters
        ----------
        
        """

        if experiment is None:
            raise util.CytoflowViewError('experiment',
                                         "No experiment specified")

        view, trait_name = self._strip_trait(self.op.name)

        if self.channel in self.op._scale:
            scale = self.op._scale[self.channel]
        else:
            scale = util.scale_factory(self.scale,
                                       experiment,
                                       channel=self.channel)

        super(FlowPeaks1DView, view).plot(experiment,
                                          annotation_facet=self.op.name,
                                          annotation_trait=trait_name,
                                          annotations=self.op._kmeans,
                                          scale=scale,
                                          **kwargs)

    def _annotation_plot(self, axes, annotation, annotation_facet,
                         annotation_value, annotation_color, **kwargs):

        kwargs.setdefault('orientation', 'vertical')

        # `annotation` is the fitted k-means estimator for this data group.
        # draw a line at each centroid, inverting it from scaled space back
        # to data space first.
        cidx = self.op.channels.index(self.channel)
        axline = plt.axhline if kwargs['orientation'] == 'horizontal' \
            else plt.axvline
        for k in range(len(annotation.cluster_centers_)):
            c = self.op._scale[self.channel].inverse(
                annotation.cluster_centers_[k][cidx])
            axline(c, linewidth=3, color='blue')
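
A short usage sketch, assuming a parameterized FlowPeaksOp named ``fp_op`` and
a gated experiment ``ex2`` (both hypothetical here; see the FlowPeaksOp example
below). Passing a single channel to ``default_view`` dispatches to this view:

>>> v = fp_op.default_view(channels = ['V2-A'], scale = {'V2-A' : 'log'})
>>> v.plot(ex2)   # histogram of V2-A with the k-means centroids in blue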
Example #11
class FlowPeaks2DView(By2DView, AnnotatingView, ScatterplotView):
    """
    A two-dimensional diagnostic view for :class:`FlowPeaksOp`.  Plots a 
    scatter-plot of the two channels, then overlays the k-means centroids in 
    blue and the clusters-of-k-means in pink.

    Attributes
    ----------

    """

    id = Constant('edu.mit.synbio.cytoflow.view.flowpeaks2dview')
    friendly_id = Constant("FlowPeaks 2D Diagnostic Plot")

    xchannel = Str
    ychannel = Str
    xscale = util.ScaleEnum
    yscale = util.ScaleEnum

    def plot(self, experiment, **kwargs):
        """
        Plot the plots.
        
        Parameters
        ----------
        
        """

        if experiment is None:
            raise util.CytoflowViewError('experiment',
                                         "No experiment specified")

        annotations = {}
        for k in self.op._kmeans:
            annotations[k] = (self.op._kmeans[k], self.op._peaks[k],
                              self.op._cluster_peak[k])

        view, trait_name = self._strip_trait(self.op.name)

        if self.xchannel in self.op._scale:
            xscale = self.op._scale[self.xchannel]
        else:
            xscale = util.scale_factory(self.xscale,
                                        experiment,
                                        channel=self.xchannel)

        if self.ychannel in self.op._scale:
            yscale = self.op._scale[self.ychannel]
        else:
            yscale = util.scale_factory(self.yscale,
                                        experiment,
                                        channel=self.ychannel)

        super(FlowPeaks2DView, view).plot(experiment,
                                          annotation_facet=self.op.name,
                                          annotation_trait=trait_name,
                                          annotations=annotations,
                                          xscale=xscale,
                                          yscale=yscale,
                                          **kwargs)

    def _annotation_plot(self, axes, annotation, annotation_facet,
                         annotation_value, annotation_color, **kwargs):

        ix = self.op.channels.index(self.xchannel)
        iy = self.op.channels.index(self.ychannel)

        xscale = kwargs['xscale']
        yscale = kwargs['yscale']

        km = annotation[0]
        peaks = annotation[1]
        cluster_peak = annotation[2]

        for k in range(len(km.cluster_centers_)):
            x = self.op._scale[self.xchannel].inverse(
                km.cluster_centers_[k][ix])
            y = self.op._scale[self.ychannel].inverse(
                km.cluster_centers_[k][iy])

            plt.plot(x, y, '*', color='blue')

            peak_idx = cluster_peak[k]
            peak = peaks[peak_idx]
            peak_x = xscale.inverse(peak[0])
            peak_y = yscale.inverse(peak[1])

            plt.plot([x, peak_x], [y, peak_y])

        for peak in peaks:
            # peaks live in scaled space too; invert them the same way as the
            # centroids above (peak[0] is on the x axis, peak[1] on the y):
            x = xscale.inverse(peak[0])
            y = yscale.inverse(peak[1])
            plt.plot(x, y, 'o', color="magenta")
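
Both diagnostic views draw their annotations in data space: the k-means
centers and peaks are stored in scaled space, so each coordinate is passed
through ``scale.inverse`` before plotting. A minimal sketch of that round-trip
(``experiment`` is a hypothetical, already-imported Experiment):

>>> import cytoflow.utility as util
>>> scale = util.scale_factory('log', experiment, channel = 'V2-A')
>>> scaled = scale(1000.0)       # data space --> scaled space
>>> scale.inverse(scaled)        # back to data space: ~1000.0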
Example #12
class FlowPeaksOp(HasStrictTraits):
    """
    This module uses the **flowPeaks** algorithm to assign events to clusters in
    an unsupervised manner.
    
    Call :meth:`estimate` to compute the clusters.
      
    Calling :meth:`apply` creates a new categorical metadata variable 
    named ``name``, with possible values ``name_1`` .... ``name_n`` where 
    ``n`` is the number of clusters estimated.
    
    The same model may not be appropriate for different subsets of the data set.
    If this is the case, you can use the :attr:`by` attribute to specify 
    metadata by which to aggregate the data before estimating (and applying) 
    a model.  The number of clusters is a model parameter and it may vary in 
    each subset. 

    Attributes
    ----------
    name : Str
        The operation name; determines the name of the new metadata column
        
    channels : List(Str)
        The channels to apply the clustering algorithm to.

    scale : Dict(Str : Enum("linear", "logicle", "log"))
        Re-scale the data in the specified channels before fitting.  If a 
        channel is in :attr:`channels` but not in :attr:`scale`, the current 
        package-wide default (set with :func:`set_default_scale`) is used.
    
    by : List(Str)
        A list of metadata attributes to aggregate the data before estimating
        the model.  For example, if the experiment has two pieces of metadata,
        ``Time`` and ``Dox``, setting ``by = ["Time", "Dox"]`` will fit the model 
        separately to each subset of the data with a unique combination of
        ``Time`` and ``Dox``.
        
    h : Float (default = 1.5)
        A scalar value by which to scale the covariance matrices of the 
        underlying density function.  (See ``Notes``, below, for more details.)
        
    h0 : Float (default = 1.0)
        A scalar value by which to smooth the covariance matrices of the
        underlying density function.  (See ``Notes``, below, for more details.)
        
    tol : Float (default = 0.5)
        How readily should clusters be merged?  Must be between 0 and 1.
        See ``Notes``, below, for more details.
        
    merge_dist : Float (default = 5)
        How far apart can clusters be before they are merged?  This is
        a unit-free scalar, and is approximately the maximum number of
        k-means clusters between peaks. 
        
    find_outliers : Bool (default = False)
        Should the algorithm use an extra step to identify outliers?
        
        .. note::
            I have disabled this code until I can try to make it faster.
        
    Notes
    -----
    
    This algorithm uses kmeans to find a large number of clusters, then 
    hierarchically merges those clusters.  Thus, the user does not need to
    specify the number of clusters in advance, and it can find non-convex
    clusters.  It also operates in an arbitrary number of dimensions.
    
    The merging happens in two steps.  First, the cluster centroids are used
    to estimate an underlying density function.  Then, the local maxima of
    the density function are found using a numerical optimization starting from
    each centroid, and k-means clusters that converge to the same local maximum
    are merged.  Finally, these clusters-of-clusters are merged if their local 
    maxima are (a) close enough, and (b) the density function between them is 
    smooth enough.  Thus, the final assignment of each event depends on the 
    k-means cluster it ends up in, and which cluster-of-clusters that k-means 
    centroid is assigned to.
    
    There are a lot of parameters that affect this process.  The k-means
    clustering is pretty robust (though somewhat sensitive to the number of 
    clusters, which is currently not exposed in the API.) The most important
    are exposed as attributes of the :class:`FlowPeaksOp` class.  These include:
    
    - :attr:`h`, :attr:`h0`: sometimes the density function is too "rough" to 
      find good local maxima.  These parameters smooth it out by widening the
      covariance matrices.  Increasing :attr:`h` makes the density rougher; 
      increasing :attr:`h0` makes it smoother.

    - :attr:`tol`: How smooth does the density function have to be between two 
      density maxima to merge them?  Must be between 0 and 1.

    - :attr:`merge_dist`: How close must two maxima be to merge them?  This 
      value is a unit-free scalar, and is approximately the number of
      k-means clusters between the two maxima.
        
    For details and a theoretical justification, see [1]_
    
    References
    ----------
    
    .. [1] Ge, Yongchao and Sealfon, Stuart C.  "flowPeaks: a fast unsupervised
       clustering for flow cytometry data via K-means and density peak finding" 
       Bioinformatics (2012) 28 (15): 2052-2058.         
  
    Examples
    --------
    
    .. plot::
        :context: close-figs
        
        Make a little data set.
    
        >>> import cytoflow as flow
        >>> import_op = flow.ImportOp()
        >>> import_op.tubes = [flow.Tube(file = "Plate01/RFP_Well_A3.fcs",
        ...                              conditions = {'Dox' : 10.0}),
        ...                    flow.Tube(file = "Plate01/CFP_Well_A4.fcs",
        ...                              conditions = {'Dox' : 1.0})]
        >>> import_op.conditions = {'Dox' : 'float'}
        >>> ex = import_op.apply()
    
    Create and parameterize the operation.
    
    .. plot::
        :context: close-figs
        
        >>> fp_op = flow.FlowPeaksOp(name = 'Flow',
        ...                          channels = ['V2-A', 'Y2-A'],
        ...                          scale = {'V2-A' : 'log',
        ...                                   'Y2-A' : 'log'},
        ...                          h0 = 3)
        
    Estimate the clusters
    
    .. plot::
        :context: close-figs
        
        >>> fp_op.estimate(ex)
        
    Plot a diagnostic view of the underlying density
    
    .. plot::
        :context: close-figs
        
        >>> fp_op.default_view(density = True).plot(ex)

    Apply the gate
    
    .. plot::
        :context: close-figs
        
        >>> ex2 = fp_op.apply(ex)

    Plot a diagnostic view with the event assignments
    
    .. plot::
        :context: close-figs
        
        >>> fp_op.default_view().plot(ex2)
        

    """

    id = Constant('edu.mit.synbio.cytoflow.operations.flowpeaks')
    friendly_id = Constant("FlowPeaks Clustering")

    name = CStr()
    channels = List(Str)
    scale = Dict(Str, util.ScaleEnum)
    by = List(Str)
    #     find_outliers = Bool(False)

    # parameters that control estimation, with sensible defaults
    h = util.PositiveFloat(1.5, allow_zero=False)
    h0 = util.PositiveFloat(1, allow_zero=False)
    tol = util.PositiveFloat(0.5, allow_zero=False)
    merge_dist = util.PositiveFloat(5, allow_zero=False)

    # parameters that control outlier selection, with sensible defaults

    _kmeans = Dict(Any,
                   Instance(sklearn.cluster.MiniBatchKMeans),
                   transient=True)
    _normals = Dict(Any, List(Function), transient=True)
    _density = Dict(Any, Function, transient=True)
    _peaks = Dict(Any, List(Array), transient=True)
    _cluster_peak = Dict(Any, List,
                         transient=True)  # kmeans cluster idx --> peak idx
    _cluster_group = Dict(Any, List,
                          transient=True)  # kmeans cluster idx --> group idx
    _scale = Dict(Str, Instance(util.IScale), transient=True)

    def estimate(self, experiment, subset=None):
        """
        Estimate the k-means clusters, then hierarchically merge them.
        
        Parameters
        ----------
        experiment : Experiment
            The :class:`.Experiment` to use to estimate the k-means clusters
            
        subset : str (default = None)
            A Python expression that specifies a subset of the data in 
            ``experiment`` to use to parameterize the operation.
        """

        if experiment is None:
            raise util.CytoflowOpError('experiment', "No experiment specified")

        if len(self.channels) == 0:
            raise util.CytoflowOpError('channels',
                                       "Must set at least one channel")

        for c in self.channels:
            if c not in experiment.data:
                raise util.CytoflowOpError(
                    'channels',
                    "Channel {0} not found in the experiment".format(c))

        for c in self.scale:
            if c not in self.channels:
                raise util.CytoflowOpError(
                    'scale', "Scale set for channel {0}, but it isn't "
                    "in the experiment".format(c))

        for b in self.by:
            if b not in experiment.conditions:
                raise util.CytoflowOpError(
                    'by', "Aggregation metadata {} not found, "
                    "must be one of {}".format(b, experiment.conditions))

        if subset:
            try:
                experiment = experiment.query(subset)
            except Exception:
                raise util.CytoflowOpError(
                    'subset', "Subset string '{0}' isn't valid".format(subset))

            if len(experiment) == 0:
                raise util.CytoflowOpError(
                    'subset',
                    "Subset string '{0}' returned no events".format(subset))

        if self.by:
            groupby = experiment.data.groupby(self.by)
        else:
            # use a lambda expression to return a group that contains
            # all the events
            groupby = experiment.data.groupby(lambda _: True)

        # get the scale. estimate the scale params for the ENTIRE data set,
        # not subsets we get from groupby().  And we need to save it so that
        # the data is transformed the same way when we apply()
        for c in self.channels:
            if c in self.scale:
                self._scale[c] = util.scale_factory(self.scale[c],
                                                    experiment,
                                                    channel=c)
#                 if self.scale[c] == 'log':
#                     self._scale[c].mode = 'mask'
            else:
                self._scale[c] = util.scale_factory(util.get_default_scale(),
                                                    experiment,
                                                    channel=c)

        for data_group, data_subset in groupby:
            if len(data_subset) == 0:
                raise util.CytoflowOpError(
                    'by', "Group {} had no data".format(data_group))
            x = data_subset.loc[:, self.channels[:]]
            for c in self.channels:
                x[c] = self._scale[c](x[c])

            # drop data that isn't in the scale range
            for c in self.channels:
                x = x[~(np.isnan(x[c]))]
            x = x.values

            #### choose the number of clusters and fit the kmeans
            num_clusters = [
                util.num_hist_bins(x[:, c]) for c in range(len(self.channels))
            ]
            num_clusters = np.ceil(np.median(num_clusters))
            num_clusters = int(num_clusters)

            self._kmeans[data_group] = kmeans = \
                sklearn.cluster.MiniBatchKMeans(n_clusters = num_clusters,
                                                random_state = 0)

            kmeans.fit(x)
            x_labels = kmeans.predict(x)

            #### use the kmeans centroids to parameterize a finite gaussian
            #### mixture model which estimates the density function

            d = len(self.channels)
            s0 = np.zeros([d, d])
            for j in range(d):
                # range of channel j (x is an events x channels array)
                r = x[:, j].max() - x[:, j].min()
                s0[j, j] = (r / (num_clusters**(1. / d)))**0.5

            means = []
            weights = []
            normals = []

            for k in range(num_clusters):
                xk = x[x_labels == k]
                num_k = np.sum(x_labels == k)
                weight_k = num_k / len(x_labels)
                mu = xk.mean(axis=0)
                means.append(mu)
                s = np.cov(xk, rowvar=False)

                el = num_k / (num_clusters + num_k)
                s_smooth = el * self.h * s + (1.0 - el) * self.h0 * s0

                n = scipy.stats.multivariate_normal(mean=mu, cov=s_smooth)
                weights.append(weight_k)
                normals.append(lambda x, n=n: n.pdf(x))

            self._normals[data_group] = normals
            density = lambda x, weights=weights, normals=normals: \
                np.sum([w * n(x) for w, n in zip(weights, normals)], axis=0)
            self._density[data_group] = density

            ### use optimization on the finite gmm to find the local peak for
            ### each kmeans cluster
            peaks = []
            peak_clusters = []  # peak idx --> list of clusters

            min_mu = [np.inf] * len(self.channels)
            max_mu = [-1.0 * np.inf] * len(self.channels)

            for k in range(num_clusters):
                mu = means[k]
                for ci in range(len(self.channels)):
                    if mu[ci] < min_mu[ci]:
                        min_mu[ci] = mu[ci]
                    if mu[ci] > max_mu[ci]:
                        max_mu[ci] = mu[ci]

            for k in range(num_clusters):
                mu = means[k]
                f = lambda x: -1.0 * density(x)

                res = scipy.optimize.minimize(f,
                                              mu,
                                              method="CG",
                                              options={'gtol': 1e-3})

                if not res.success:
                    warn(
                        "Peak finding failed for cluster {}: {}".format(
                            k, res.message), util.CytoflowWarning)


#                 ### The peak-searching algorithm from the paper.  works fine,
#                 ### but slow!  we get similar results with the conjugate gradient
#                 ### optimization method from scipy

#                 x0 = x = means[k]
#                 k0 = k
#                 b = beta_max[k] / 10.0
#                 Nsuc = 0
#                 n = 0
#
#                 while(n < 1000):
# #                     df = scipy.misc.derivative(density, x, 1e-6)
#                     df = statsmodels.tools.numdiff.approx_fprime(x, density)
#                     if np.linalg.norm(df) < 1e-3:
#                         break
#
#                     y = x + b * df / np.linalg.norm(df)
#                     if density(y) <= density(x):
#                         Nsuc = 0
#                         b = b / 2.0
#                         continue
#
#                     Nsuc += 1
#                     if Nsuc >= 2:
#                         b = min(2*b, beta_max[k])
#
#                     ky = kmeans.predict(y[np.newaxis, :])[0]
#                     if ky == k:
#                         x = y
#                     else:
#                         k = ky
#                         b = beta_max[k] / 10.0
#                         mu = means[k]
#                         if density(mu) > density(y):
#                             x = mu
#                         else:
#                             x = y
#
#                     n += 1

                merged = False
                for pi, p in enumerate(peaks):
                    # TODO - this probably only works for scaled measurements
                    if np.linalg.norm(p - res.x) < (1e-2):
                        peak_clusters[pi].append(k)
                        merged = True
                        break

                if not merged:
                    peak_clusters.append([k])
                    peaks.append(res.x)

            self._peaks[data_group] = peaks

            ### merge peaks that are sufficiently close

            groups = [[x] for x in range(len(peaks))]
            peak_groups = list(range(len(peaks)))  # peak idx --> group idx

            def max_tol(x, y):
                f = lambda a: density(a[np.newaxis, :])
                #                 lx = kmeans.predict(x[np.newaxis, :])[0]
                #                 ly = kmeans.predict(y[np.newaxis, :])[0]
                n = len(x)
                n_scale = 1

                #                 n_scale = np.sqrt(((nx + ny) / 2.0) / (n / num_clusters))

                def tol(t):
                    zt = x + t * (y - x)
                    fhat_zt = f(x) + t * (f(y) - f(x))
                    return -1.0 * abs((f(zt) - fhat_zt) / fhat_zt) * n_scale

                res = scipy.optimize.minimize_scalar(tol,
                                                     bounds=[0, 1],
                                                     method='Bounded')

                if res.status != 0:
                    raise util.CytoflowOpError(
                        None,
                        "tol optimization failed for {}, {}".format(x, y))
                return -1.0 * res.fun

            def nearest_neighbor_dist(k):
                min_dist = np.inf

                for i in range(num_clusters):
                    if i == k:
                        continue
                    dist = np.linalg.norm(means[k] - means[i])
                    if dist < min_dist:
                        min_dist = dist

                return min_dist

            sk = [nearest_neighbor_dist(x) for x in range(num_clusters)]

            def s(x):
                k = kmeans.predict(x[np.newaxis, :])[0]
                return sk[k]

            def can_merge(g, h):
                for pg in g:
                    for ph in h:
                        vg = peaks[pg]
                        vh = peaks[ph]
                        dist_gh = np.linalg.norm(vg - vh)

                        if max_tol(vg, vh) < self.tol and dist_gh / (
                                s(vg) + s(vh)) <= self.merge_dist:
                            return True

                return False

            while True:
                if len(groups) == 1:
                    break

                # find closest mergable groups
                min_dist = np.inf
                for gi in range(len(groups)):
                    g = groups[gi]

                    for hi in range(gi + 1, len(groups)):
                        h = groups[hi]

                        if can_merge(g, h):
                            dist_gh = np.inf
                            for pg in g:
                                vg = peaks[pg]
                                for ph in h:
                                    vh = peaks[ph]
                                    #                                     print("vg {} vh {}".format(vg, vh))
                                    dist_gh = min(dist_gh,
                                                  np.linalg.norm(vg - vh))

                            if dist_gh < min_dist:
                                min_gi = gi
                                min_hi = hi
                                min_dist = dist_gh

                if min_dist == np.inf:
                    break

                # merge the groups
                groups[min_gi].extend(groups[min_hi])
                for g in groups[min_hi]:
                    peak_groups[g] = min_gi
                del groups[min_hi]

            cluster_group = [0] * num_clusters
            cluster_peaks = [0] * num_clusters

            for gi, g in enumerate(groups):
                for p in g:
                    for cluster in peak_clusters[p]:
                        cluster_group[cluster] = gi
                        cluster_peaks[cluster] = p

            self._cluster_peak[data_group] = cluster_peaks
            self._cluster_group[data_group] = cluster_group

    def apply(self, experiment):
        """
        Assign events to a cluster.
        
        Assigns each event to one of the k-means centroids from :meth:`estimate`,
        then groups together events in the same cluster hierarchy.
        
        Parameters
        ----------
        experiment : Experiment
            the :class:`.Experiment` to apply the gate to.
            
        Returns
        -------
        Experiment
            A new :class:`.Experiment` with the gate applied to it.  
            TODO - document the extra statistics
        """

        if experiment is None:
            raise util.CytoflowOpError('experiment', "No experiment specified")

        # make sure name got set!
        if not self.name:
            raise util.CytoflowOpError(
                'name', "You have to set the gate's name "
                "before applying it!")

        if self.name in experiment.data.columns:
            raise util.CytoflowOpError(
                'name',
                "Experiment already has a column named {0}".format(self.name))

        if len(self.channels) == 0:
            raise util.CytoflowOpError('channels',
                                       "Must set at least one channel")

        if not self._peaks:
            raise util.CytoflowOpError(
                None, "No model found.  Did you forget to "
                "call estimate()?")

        for c in self.channels:
            if c not in experiment.data:
                raise util.CytoflowOpError(
                    'channels',
                    "Channel {0} not found in the experiment".format(c))

        for c in self.scale:
            if c not in self.channels:
                raise util.CytoflowOpError(
                    'scale', "Scale set for channel {0}, but it isn't "
                    "in the experiment".format(c))

        for b in self.by:
            if b not in experiment.conditions:
                raise util.CytoflowOpError(
                    'by', "Aggregation metadata {} not found, "
                    "must be one of {}".format(b, experiment.conditions))

        if self.by:
            groupby = experiment.data.groupby(self.by)
        else:
            # use a lambda expression to return a group that contains
            # all the events
            groupby = experiment.data.groupby(lambda _: True)

        event_assignments = pd.Series(["{}_None".format(self.name)] *
                                      len(experiment),
                                      dtype="object")

        # make the statistics
        #         clusters = [x + 1 for x in range(self.num_clusters)]
        #
        #         idx = pd.MultiIndex.from_product([experiment[x].unique() for x in self.by] + [clusters] + [self.channels],
        #                                          names = list(self.by) + ["Cluster"] + ["Channel"])
        #         centers_stat = pd.Series(index = idx, dtype = np.dtype(object)).sort_index()

        for group, data_subset in groupby:
            if len(data_subset) == 0:
                raise util.CytoflowOpError(
                    'by', "Group {} had no data".format(group))

            if group not in self._kmeans:
                raise util.CytoflowOpError(
                    'by', "Group {} not found in the estimated "
                    "model.  Do you need to re-run estimate()?".format(group))

            x = data_subset.loc[:, self.channels[:]]

            for c in self.channels:
                x[c] = self._scale[c](x[c])

            # which values are missing?

            x_na = pd.Series([False] * len(x))
            for c in self.channels:
                x_na[np.isnan(x[c]).values] = True

            x = x.values
            x_na = x_na.values
            group_idx = groupby.groups[group]

            kmeans = self._kmeans[group]

            predicted_km = np.full(len(x), -1, "int")
            predicted_km[~x_na] = kmeans.predict(x[~x_na])

            groups = np.asarray(self._cluster_group[group])
            predicted_group = np.full(len(x), -1, "int")
            predicted_group[~x_na] = groups[predicted_km[~x_na]]

            # outlier detection code.  this is disabled for the moment
            # because it is really slow.

            #             num_groups = len(set(groups))
            #             if self.find_outliers:
            #                 density = self._density[group]
            #                 max_d = [-1.0 * np.inf] * num_groups
            #
            #                 for xi in range(len(x)):
            #                     if x_na[xi]:
            #                         continue
            #
            #                     x_c = predicted_group[xi]
            #                     d_x_c = density(x[xi])
            #                     if d_x_c > max_d[x_c]:
            #                         max_d[x_c] = d_x_c
            #
            #                 group_density = [None] * num_groups
            #                 group_weight = [0.0] * num_groups
            #
            #                 for c in range(num_groups):
            #                     num_c = np.sum(predicted_group == c)
            #                     clusters = np.argwhere(groups == c).flatten()
            #
            #                     normals = []
            #                     weights = []
            #                     for k in range(len(clusters)):
            #                         num_k = np.sum(predicted_km == k)
            #                         weight_k = num_k / num_c
            #                         group_weight[c] += num_k / len(x)
            #                         weights.append(weight_k)
            #                         normals.append(self._normals[group][k])
            #
            #                     group_density[c] = lambda x, weights = weights, normals = normals: np.sum([w * n(x) for w, n in zip(weights, normals)], axis = 0)
            #
            #                 for xi in range(len(x)):
            #                     if x_na[xi]:
            #                         continue
            #
            #                     x_c = predicted_group[xi]
            #
            #                     if density(x[xi]) / max_d[x_c] < 0.01:
            #                         predicted_group[xi] = -1
            #                         continue
            #
            #                     sum_d = 0
            #                     for c in set(groups):
            #                         sum_d += group_weight[c] * group_density[c](x[xi])
            #
            #                     if group_weight[x_c] * group_density[x_c](x[xi]) / sum_d < 0.8:
            #                         predicted_group[xi] = -1

            #
            #                     max_d = -1.0 * np.inf
            #                     for x_c in x[predicted_group == c]:
            #                         x_c_d = density(x_c)
            #                         if x_c_d > max_d:
            #                             max_d = x_c_d
            #
            #                     for i in range(len(x)):
            #                         if predicted_group[i] == c and density(x[i]) / max_d <= 0.01:
            #                             predicted_group[i] = -1
            #
            #

            predicted_str = pd.Series(["(none)"] * len(predicted_group))
            for c in range(len(self._cluster_group[group])):
                predicted_str[predicted_group == c] = "{0}_{1}".format(
                    self.name, c + 1)
            predicted_str[predicted_group == -1] = "{0}_None".format(self.name)
            predicted_str.index = group_idx

            event_assignments.iloc[group_idx] = predicted_str

        new_experiment = experiment.clone()
        new_experiment.add_condition(self.name, "category", event_assignments)

        #         new_experiment.statistics[(self.name, "centers")] = pd.to_numeric(centers_stat)

        new_experiment.history.append(
            self.clone_traits(transient=lambda _: True))
        return new_experiment

    def default_view(self, **kwargs):
        """
        Returns a diagnostic plot of the Gaussian mixture model.
        
        Parameters
        ----------
        channels : List(Str)
            Which channels to plot?  Must contain either one or two channels.
            
        scale : List({'linear', 'log', 'logicle'})
            How to scale the channels before plotting them
            
        density : bool
            Should we plot a scatterplot or the estimated density function?
         
        Returns
        -------
        IView
            an IView, call :meth:`plot` to see the diagnostic plot.
        """
        channels = kwargs.pop('channels', self.channels)
        scale = kwargs.pop('scale', self.scale)
        density = kwargs.pop('density', False)

        for c in channels:
            if c not in self.channels:
                raise util.CytoflowViewError(
                    'channels',
                    "Channel {} isn't in the operation's channels".format(c))

        for s in scale:
            if s not in self.channels:
                raise util.CytoflowViewError(
                    'channels',
                    "Channel {} isn't in the operation's channels".format(s))

        for c in channels:
            if c not in scale:
                scale[c] = util.get_default_scale()

        if len(channels) == 0:
            raise util.CytoflowViewError(
                'channels',
                "Must specify at least one channel for a default view")
        elif len(channels) == 1:
            v = FlowPeaks1DView(op=self)
            v.trait_set(channel=channels[0],
                        scale=scale[channels[0]],
                        **kwargs)
            return v

        elif len(channels) == 2:
            if density:
                v = FlowPeaks2DDensityView(op=self)
                v.trait_set(xchannel=channels[0],
                            ychannel=channels[1],
                            xscale=scale[channels[0]],
                            yscale=scale[channels[1]],
                            **kwargs)
                return v

            else:
                v = FlowPeaks2DView(op=self)
                v.trait_set(xchannel=channels[0],
                            ychannel=channels[1],
                            xscale=scale[channels[0]],
                            yscale=scale[channels[1]],
                            **kwargs)
                return v
        else:
            raise util.CytoflowViewError(
                None,
                "Can't specify more than two channels for a default view")
Example #13
class BeadCalibrationOp(HasStrictTraits):
    """
    Calibrate arbitrary channels to molecules-of-fluorophore using fluorescent
    beads (eg, the Spherotech RCP-30-5A rainbow beads.)
    
    To use, set the `beads_file` property to an FCS file containing the beads'
    events; specify which beads you ran by setting the `beads_type` property
    to match one of the values of BeadCalibrationOp.BEADS; and set the
    `units` dict to which channels you want calibrated and in which units.
    Then, call `estimate()` and check the peak-finding with 
    `default_view().plot()`.  If the peak-finding is wacky, try adjusting
    `bead_peak_quantile` and `bead_brightness_threshold`.  When the peaks are
    successfully identified, call apply() on your experimental data set. 
    
    If you can't make the peak finding work, please submit a bug report!
    
    This procedure works best when the beads file is very clean data.  It does
    not do its own gating (maybe a future addition?)  In the meantime, 
    I recommend gating the *acquisition* on the FSC/SSC channels in order
    to get rid of debris, cells, and other noise.
    
    Finally, because you can't have a negative number of fluorescent molecules
    (MEFLs, etc) (as well as for math reasons), this module filters out
    negative values.
    
    Attributes
    ----------
    name : Str
        The operation name (for UI representation.)

    units : Dict(Str, Str)
        A dictionary specifying the channels you want calibrated (keys) and
        the units you want them calibrated in (values).  The units must be
        keys of the `beads` attribute.       
        
    beads_file : File
        A file containing the FCS events from the beads.  Must be set to use
        `estimate()`.  This isn't persisted by `pickle()`.

    beads : Dict(Str, List(Float))
        The beads' characteristics.  Keys are calibrated units (ie, MEFL or
        MEAP) and values are ordered lists of known fluorophore levels.  Common
        values for this dict are included in BeadCalibrationOp.BEADS.
        Must be set to use `estimate()`.
        
    bead_peak_quantile : Int
        The quantile threshold used to choose bead peaks.  Default == 80.
        Must be set to use `estimate()`.
        
    bead_brightness_threshold : Float
        How bright must a bead peak be to be considered?  Default == 100.
        Must be set to use `estimate()`.
        
    bead_brightness_cutoff : Float
        If a bead peak is above this, then don't consider it.  Takes care of
        clipping caused by detector saturation.  Defaults to 70% of the
        detector range.
        
    Notes
    -----
    The peak finding is rather sophisticated.  
    
    For each channel, a 256-bin histogram is computed on the log-transformed
    bead data, and then the histogram is smoothed with a Savitzky-Golay 
    filter (with a window length of 5 and a polynomial order of 1).  
    
    Next, a wavelet-based peak-finding algorithm is used: it convolves the
    smoothed histogram with a series of wavelets and looks for relative 
    maxima at various length-scales.  The parameters of the smoothing 
    algorithm were arrived at empirically, using beads collected at a wide 
    range of PMT voltages.
    
    Finally, the peaks are filtered by height (the histogram bin has a quantile
    greater than `bead_peak_quantile`) and intensity (brighter than 
    `bead_brightness_threshold`).
    
    How to convert from a series of peaks to mean equivalent fluorochrome?
    If there's one peak, we assume that it's the brightest peak.  If there
    are two peaks, we assume they're the brightest two.  If there are n >=3
    peaks, we check all the contiguous n-subsets of the bead intensities
    and find the one whose linear regression (in log space!) has the smallest
    norm (square-root sum-of-squared-residuals.)
    
    There's a slight subtlety in the fact that we're performing the linear
    regression in log-space: if the relationship in log10-space is Y=aX + b,
    then the same relationship in linear space is x = 10**X, y = 10**Y, and
    y = (10**b) * (x ** a).
    
    One more thing.  Because the beads are (log) evenly spaced across all
    the channels, we can directly compute the fluorophore equivalent in channels
    where we wouldn't usually measure that fluorophore: for example, you can
    compute MEFL (mean equivalent fluorosceine) in the PE-Texas Red channel,
    because the bead peak pattern is the same in the PE-Texas Red channel
    as it would be in the FITC channel.
    
    Examples
    --------
    >>> bead_op = flow.BeadCalibrationOp()
    >>> bead_op.beads = flow.BeadCalibrationOp.BEADS["Spherotech RCP-30-5A Lot AA01-AA04, AB01, AB02, AC01, GAA01-R"]
    >>> bead_op.units = {"Pacific Blue-A" : "MEFL",
                         "FITC-A" : "MEFL",
                         "PE-Tx-Red-YG-A" : "MEFL"}
    >>>
    >>> bead_op.beads_file = "beads.fcs"
    >>> bead_op.estimate(ex3)
    >>>
    >>> bead_op.default_view().plot(ex3)  
    >>> # check the plot!
    >>>
    >>> ex4 = bead_op.apply(ex3)  
    """
    
    # traits
    id = Constant('edu.mit.synbio.cytoflow.operations.beads_calibrate')
    friendly_id = Constant("Bead Calibration")
    
    name = Constant("Bead Calibration")
    units = Dict(Str, Str)
    
    beads_file = File(exists = True)
    bead_peak_quantile = Int(80)

    bead_brightness_threshold = Float(100)
    bead_brightness_cutoff = Float(Undefined)
    # TODO - bead_brightness_threshold should probably be different depending
    # on the data range of the input.
    
    beads = Dict(Str, List(Float))

    _calibration_functions = Dict(Str, Python, transient = True)
    _peaks = Dict(Str, Python, transient = True)
    _mefs = Dict(Str, Python, transient = True)

    def estimate(self, experiment, subset = None): 
        """
        Estimate the calibration coefficients from the beads file.
        """
        if experiment is None:
            raise util.CytoflowOpError("No experiment specified")
        
        if not self.beads_file:
            raise util.CytoflowOpError("No beads file specified")

        if not set(self.units.keys()) <= set(experiment.channels):
            raise util.CytoflowOpError("Specified channels that weren't found in "
                                  "the experiment.")
            
        if not set(self.units.values()) <= set(self.beads.keys()):
            raise util.CytoflowOpError("Units don't match beads.")
                        
        # make a little Experiment
        check_tube(self.beads_file, experiment)
        beads_exp = ImportOp(tubes = [Tube(file = self.beads_file)],
                             name_metadata = experiment.metadata['name_metadata']).apply()
        
        channels = self.units.keys()

        for channel in channels:
            data = beads_exp.data[channel]
            
            # TODO - this assumes the data is on a linear scale.  check it!
            data_range = experiment.metadata[channel]['range']

            if self.bead_brightness_cutoff is Undefined:
                cutoff = 0.7 * data_range
            else:
                cutoff = self.bead_brightness_cutoff
                                            
            # bin the data on a log scale

            hist_bins = np.logspace(1, math.log(data_range, 2), num = 256, base = 2)
            hist = np.histogram(data, bins = hist_bins)
            
            # mask off-scale values
            hist[0][0] = 0
            hist[0][-1] = 0
            
            # smooth it with a Savitzky-Golay filter
            hist_smooth = scipy.signal.savgol_filter(hist[0], 5, 1)
            
            # find peaks
            peak_bins = scipy.signal.find_peaks_cwt(hist_smooth, 
                                                    widths = np.arange(3, 20),
                                                    max_distances = np.arange(3, 20) / 2)
            
            # filter by height and intensity
            peak_threshold = np.percentile(hist_smooth, self.bead_peak_quantile)
            peak_bins_filtered = \
                [x for x in peak_bins if hist_smooth[x] > peak_threshold 
                 and hist[1][x] > self.bead_brightness_threshold
                 and hist[1][x] < cutoff]
            
            peaks = [hist_bins[x] for x in peak_bins_filtered]
            
            mef_unit = self.units[channel]
            
            if mef_unit not in self.beads:
                raise util.CytoflowOpError("Invalid unit {0} specified for channel {1}".format(mef_unit, channel))
            
            # "mean equivalent fluorochrome"
            mef = self.beads[mef_unit]
            
            if len(peaks) == 0:
                raise util.CytoflowOpError("Didn't find any peaks; check the diagnostic plot")
            elif len(peaks) > len(mef):
                raise util.CytoflowOpError("Found too many peaks; check the diagnostic plot")
            elif len(peaks) == 1:
                # if we only have one peak, assume it's the brightest peak
                a = mef[-1] / peaks[0]
                self._peaks[channel] = peaks
                self._mefs[channel] = [mef[-1]]
                self._calibration_functions[channel] = lambda x, a=a: a * x
            elif len(peaks) == 2:
                # if we have only two peaks, assume they're the brightest two.
                # keep the MEF values in the same ascending order as the peaks
                # they're paired with:
                self._peaks[channel] = peaks
                self._mefs[channel] = [mef[-2], mef[-1]]
                a = (mef[-1] - mef[-2]) / (peaks[1] - peaks[0])
                self._calibration_functions[channel] = lambda x, a=a: a * x
            else:
                # if there are n > 2 peaks, check all the contiguous n-subsets
                # of mef for the one whose linear regression with the peaks
                # has the smallest (norm) sum-of-residuals.
                
                # do it in log10 space because otherwise the brightest peaks
                # have an outsized influence.
                
                best_resid = np.inf
                for start, end in [(x, x+len(peaks)) for x in range(len(mef) - len(peaks) + 1)]:
                    mef_subset = mef[start:end]
                    
                    # linear regression of the peak locations against mef subset
                    lr = np.polyfit(np.log10(peaks), 
                                    np.log10(mef_subset), 
                                    deg = 1, 
                                    full = True)
                    
                    resid = lr[1][0]
                    if resid < best_resid:
                        best_lr = lr[0]
                        best_resid = resid
                        self._peaks[channel] = peaks
                        self._mefs[channel] = mef_subset
                        
                
                # remember, these (linear) coefficients came from logspace, so 
                # if the relationship in log10 space is Y = aX + b, then in
                # linear space the relationship is x = 10**X, y = 10**Y,
                # and y = (10**b) * x ^ a
                
                # also remember that the result of np.polyfit is a list of
                # coefficients with the highest power first!  so if we
                # solve y=ax + b, coeff #0 is a and coeff #1 is b
                
                a = best_lr[0]
                b = 10 ** best_lr[1]
                self._calibration_functions[channel] = \
                    lambda x, a=a, b=b: b * np.power(x, a)

    def apply(self, experiment):
        """Applies the bleedthrough correction to an experiment.
        
        Parameters
        ----------
        old_experiment : Experiment
            the experiment to which this op is applied
            
        Returns
        -------
            a new experiment calibrated in physical units.
        """
        if experiment is None:
            raise util.CytoflowOpError("No experiment specified")
        
        channels = self.units.keys()

        if not self.units:
            raise util.CytoflowOpError("No channels to calibrate.")
        
        if not self._calibration_functions:
            raise util.CytoflowOpError("Calibration not found. "
                                  "Did you forget to call estimate()?")
        
        if not set(channels) <= set(experiment.channels):
            raise util.CytoflowOpError("Module units don't match experiment channels")
                
        if set(channels) != set(self._calibration_functions.keys()):
            raise util.CytoflowOpError("Calibration doesn't match units. "
                                  "Did you forget to call estimate()?")

        # two things.  first, you can't raise a negative value to a non-integer
        # power.  second, negative physical units don't make sense -- how can
        # you have the equivalent of -5 molecules of fluorescein?  so,
        # we filter out negative values here.

        new_experiment = experiment.clone()
        
        for channel in channels:
            new_experiment.data = \
                new_experiment.data[new_experiment.data[channel] > 0]
                
        new_experiment.data.reset_index(drop = True, inplace = True)
        
        for channel in channels:
            calibration_fn = self._calibration_functions[channel]
            
            new_experiment[channel] = calibration_fn(new_experiment[channel])
            new_experiment.metadata[channel]['bead_calibration_fn'] = calibration_fn
            new_experiment.metadata[channel]['units'] = self.units[channel]
            if 'range' in experiment.metadata[channel]:
                new_experiment.metadata[channel]['range'] = calibration_fn(experiment.metadata[channel]['range'])
            
        new_experiment.history.append(self.clone_traits(transient = lambda t: True)) 
        return new_experiment
    
    def default_view(self, **kwargs):
        """
        Returns a diagnostic plot to see if the bleedthrough spline estimation
        is working.
        
        Returns
        -------
            IView : An IView, call plot() to see the diagnostic plots
        """

        return BeadCalibrationDiagnostic(op = self, **kwargs)
    
    BEADS = {
             # from http://www.spherotech.com/RCP-30-5a%20%20rev%20H%20ML%20071712.xls
             "Spherotech RCP-30-5A Lot AG01, AF02, AD04 and AAE01" :
                { "MECSB" : [216, 464, 1232, 2940, 7669, 19812, 35474],
                  "MEBFP" : [861, 1997, 5776, 15233, 45389, 152562, 396759],
                  "MEFL" :  [792, 2079, 6588, 16471, 47497, 137049, 271647],
                  "MEPE" :  [531, 1504, 4819, 12506, 36159, 109588, 250892],
                  "MEPTR" : [233, 669, 2179, 5929, 18219, 63944, 188785],
                  "MECY" : [1614, 4035, 12025, 31896, 95682, 353225, 1077421],
                  "MEPCY7" : [14916, 42336, 153840, 494263],
                  "MEAP" :  [373, 1079, 3633, 9896, 28189, 79831, 151008],
                  "MEAPCY7" : [2864, 7644, 19081, 37258]},
             # from http://www.spherotech.com/RCP-30-5a%20%20rev%20G.2.xls
             "Spherotech RCP-30-5A Lot AA01-AA04, AB01, AB02, AC01, GAA01-R":
                { "MECSB" : [179, 400, 993, 3203, 6083, 17777, 36331],
                  "MEBFP" : [700, 1705, 4262, 17546, 35669, 133387, 412089],
                  "MEFL" :  [692, 2192, 6028, 17493, 35674, 126907, 290983],
                  "MEPE" :  [505, 1777, 4974, 13118, 26757, 94930, 250470],
                  "MEPTR" : [207, 750, 2198, 6063, 12887, 51686, 170219],
                  "MECY" :  [1437, 4693, 12901, 36837, 76621, 261671, 1069858],
                  "MEPCY7" : [32907, 107787, 503797],
                  "MEAP" :  [587, 2433, 6720, 17962, 30866, 51704, 146080],
                  "MEAPCY7" : [718, 1920, 5133, 9324, 14210, 26735]}}
Example #14
class LogScale(ScaleMixin):
    id = Constant("edu.mit.synbio.cytoflow.utility.log_scale")
    name = "log"

    experiment = Instance("cytoflow.Experiment")

    # must set one of these.  they're considered in order.
    channel = Str
    condition = Str
    statistic = Tuple(Str, Str)
    error_statistic = Tuple(Str, Str)
    data = Array

    mode = Enum("mask", "clip")
    threshold = Property(
        Float,
        depends_on=
        "[experiment, condition, channel, statistic, error_statistic]")
    _channel_threshold = Float(0.1)

    mpl_params = Property(Dict)

    def _get_mpl_params(self):
        return {"nonposx": self.mode, "nonposy": self.mode}

    def _set_threshold(self, threshold):
        self._channel_threshold = threshold

    def _get_threshold(self):
        if self.channel:
            return self._channel_threshold
        elif self.condition:
            cond = self.experiment[self.condition][
                self.experiment[self.condition] > 0]
            return cond.min()
        elif self.statistic in self.experiment.statistics \
            and self.error_statistic not in self.experiment.statistics:
            stat = self.experiment.statistics[self.statistic]
            assert is_numeric(stat)
            return stat[stat > 0].min()
        elif self.statistic in self.experiment.statistics \
            and self.error_statistic in self.experiment.statistics:
            stat = self.experiment.statistics[self.statistic]
            err_stat = self.experiment.statistics[self.error_statistic]
            stat_min = stat[stat > 0].min()

            try:
                err_min = min([x for x in [min(x) for x in err_stat] if x > 0])
                return err_min

            except (TypeError, IndexError):
                err_min = min([x for x in err_stat if stat_min - x > 0])
                return stat_min - err_min

        elif self.data.size > 0:
            return self.data[self.data > 0].min()

    def __call__(self, data):
        # this function should work with: int, float, tuple, list, pd.Series,
        # np.ndarray.  it should return the same data type as it was passed.

        if isinstance(data, (int, float)):
            if self.mode == "mask":
                if data < self.threshold:
                    raise CytoflowError(
                        "data <= scale.threshold (currently: {})".format(
                            self.threshold))
                else:
                    return np.log10(data)
            else:
                if data < self.threshold:
                    return np.log10(self.threshold)
                else:
                    return np.log10(data)
        elif isinstance(data, (list, tuple)):
            ret = [self.__call__(x) for x in data]
            if isinstance(data, tuple):
                return tuple(ret)
            else:
                return ret
        elif isinstance(data, (np.ndarray, pd.Series)):
            mask_value = np.nan if self.mode == "mask" else self.threshold
            x = pd.Series(data)
            x = x.mask(lambda x: x < self.threshold, other=mask_value)
            ret = np.log10(x)

            if isinstance(data, pd.Series):
                return ret
            else:
                return ret.values
        else:
            raise CytoflowError(
                "Unknown type {} passed to log_scale.__call__".format(
                    type(data)))

    def inverse(self, data):
        # this function should work with: int, float, tuple, list, pd.Series,
        # np.ndarray
        if isinstance(data, (int, float)):
            return np.power(10, data)
        elif isinstance(data, (list, tuple)):
            ret = [np.power(10, x) for x in data]
            if isinstance(data, tuple):
                return tuple(ret)
            else:
                return ret
        elif isinstance(data, (np.ndarray, pd.Series)):
            return np.power(10, data)
        else:
            raise CytoflowError(
                "Unknown type {} passed to log_scale.inverse".format(
                    type(data)))

    def clip(self, data):
        if isinstance(data, pd.Series):
            return data.clip(lower=self.threshold)
        elif isinstance(data, np.ndarray):
            return data.clip(min=self.threshold)
        elif isinstance(data, float):
            return max(data, self.threshold)
        else:
            try:
                return [max(x, self.threshold) for x in data]
            except TypeError as e:
                raise CytoflowError(
                    "Unknown data type in LogScale.clip") from e

    def color_norm(self):
        if self.channel:
            vmin = self.experiment[self.channel].min()
            vmax = self.experiment[self.channel].max()

        elif self.condition:
            vmin = self.experiment[self.condition].min()
            vmax = self.experiment[self.condition].max()

        elif self.statistic in self.experiment.statistics:
            stat = self.experiment.statistics[self.statistic]
            try:
                vmin = min([min(x) for x in stat])
                vmax = max([max(x) for x in stat])
            except (TypeError, IndexError):
                vmin = stat.min()
                vmax = stat.max()

            if self.error_statistic in self.experiment.statistics:
                err_stat = self.experiment.statistics[self.error_statistic]
                try:
                    vmin = min([min(x) for x in err_stat])
                    vmax = max([max(x) for x in err_stat])
                except (TypeError, IndexError):
                    vmin = vmin - err_stat.min()
                    vmax = vmax + err_stat.max()
        elif self.data.size > 0:
            vmin = self.data.min()
            vmax = self.data.max()
        else:
            raise CytoflowError("Must set one of 'channel', 'condition' "
                                "or 'statistic'.")

        return matplotlib.colors.LogNorm(vmin=self.clip(vmin),
                                         vmax=self.clip(vmax))
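
A quick sketch (not from the original source) of how the two `mode` settings treat sub-threshold values, mirroring the pd.Series branch of __call__ above; the data and threshold are made up.

import numpy as np
import pandas as pd

threshold = 0.1
data = pd.Series([-1.0, 0.0, 1.0, 1000.0])

# "mask" mode: sub-threshold values become NaN before the log10
masked = np.log10(data.mask(lambda x: x < threshold, other=np.nan))
# -> NaN, NaN, 0.0, 3.0

# "clip" mode: sub-threshold values are pinned to the threshold
clipped = np.log10(data.mask(lambda x: x < threshold, other=threshold))
# -> -1.0, -1.0, 0.0, 3.0
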
Example #15
class GaussianMixture2DOp(HasStrictTraits):
    """
    This module fits a 2D Gaussian mixture model with a specified number of
    components to a pair of channels.
    
    Creates a new categorical metadata variable named `name`, with possible
    values `name_1` .... `name_n` where `n` is the number of components.
    An event is assigned to `name_i` category if it falls within `sigma`
    standard deviations of the component's mean.  If that is true for multiple
    categories (or if `sigma == 0.0`), the event is assigned to the category 
    with the highest posterior probability.  If the event doesn't fall into
    any category, it is assigned to `name_None`.
    
    As a special case, if `num_components` is `1` and `sigma` > 0.0, then
    the new condition is boolean, `True` if the event fell in the gate and
    `False` otherwise.
    
    Optionally, if `posteriors` is `True`, this module will also compute the 
    posterior probability of each event in its assigned component, returning
    it in a new column named `{Name}_Posterior`.
    
    Finally, the same mixture model (mean and standard deviation) may not
    be appropriate for every subset of the data.  If this is the case, you
    can use the `by` attribute to specify metadata by which to aggregate
    the data before estimating (and applying) a mixture model.  The number of 
    components is the same across each subset, though.
    
    Attributes
    ----------
    name : Str
        The operation name; determines the name of the new metadata column
        
    xchannel : Str
        The X channel to apply the mixture model to.
        
    ychannel : Str
        The Y channel to apply the mixture model to.

    xscale : Enum("linear", "logicle", "log") (default = "linear")
        Re-scale the data on the X axis before fitting the data?  

    yscale : Enum("linear", "logicle", "log") (default = "linear")
        Re-scale the data on the Y axis before fitting the data?  
        
    num_components : Int (default = 1)
        How many components to fit to the data?  Must be positive.

    sigma : Float (default = 0.0)
        How many standard deviations on either side of the mean to include
        in each category?  If an event is in multiple components, assign it
        to the component with the highest posterior probability.  If 
        `sigma == 0.0`, categorize *all* the data by assigning each event to
        the component with the highest posterior probability.  Must be >= 0.0.
    
    by : List(Str)
        A list of metadata attributes to aggregate the data before estimating
        the model.  For example, if the experiment has two pieces of metadata,
        `Time` and `Dox`, setting `by = ["Time", "Dox"]` will fit the model 
        separately to each subset of the data with a unique combination of
        `Time` and `Dox`.

    posteriors : Bool (default = False)
        If `True`, add a column named `{Name}_Posterior` giving the posterior
        probability that the event is in the component to which it was
        assigned.  Useful for filtering out low-probability events.
        
    Statistics
    ----------
    xmean : Float
        the mean of the fitted gaussian in the x dimension.
        
    ymean : Float
        the mean of the fitted gaussian in the y dimension.
        
    proportion : Float
        the proportion of events in each component of the mixture model.  only
        set if `num_components` > 1.
        
    PS -- if someone has good ideas for summarizing spread in a 2D (non-isotropic)
    Gaussian, or other useful statistics, let me know!
    
    Examples
    --------
    
    >>> gauss_op = GaussianMixture2DOp(name = "Gaussian",
    ...                                xchannel = "V2-A",
    ...                                ychannel = "Y2-A",
    ...                                num_components = 2)
    >>> gauss_op.estimate(ex2)
    >>> gauss_op.default_view().plot(ex2)
    >>> ex3 = gauss_op.apply(ex2)
    """

    id = Constant('edu.mit.synbio.cytoflow.operations.gaussian_2d')
    friendly_id = Constant("2D Gaussian Mixture")

    name = CStr()
    xchannel = Str()
    ychannel = Str()
    xscale = util.ScaleEnum
    yscale = util.ScaleEnum
    num_components = util.PositiveInt
    sigma = util.PositiveFloat(0.0, allow_zero=True)
    by = List(Str)

    posteriors = Bool(False)

    # the key is either a single value or a tuple
    _gmms = Dict(Any, Instance(mixture.GaussianMixture), transient=True)
    _xscale = Instance(util.IScale, transient=True)
    _yscale = Instance(util.IScale, transient=True)

    def estimate(self, experiment, subset=None):
        """
        Estimate the Gaussian mixture model parameters
        """

        if not experiment:
            raise util.CytoflowOpError("No experiment specified")

        if self.xchannel not in experiment.data:
            raise util.CytoflowOpError(
                "Column {0} not found in the experiment".format(self.xchannel))

        if self.ychannel not in experiment.data:
            raise util.CytoflowOpError(
                "Column {0} not found in the experiment".format(self.ychannel))

        for b in self.by:
            if b not in experiment.data:
                raise util.CytoflowOpError("Aggregation metadata {0} not found"
                                           " in the experiment".format(b))
            if len(experiment.data[b].unique()) > 100:  #WARNING - magic number
                raise util.CytoflowOpError(
                    "More than 100 unique values found for"
                    " aggregation metadata {0}.  Did you"
                    " accidentally specify a data channel?".format(b))

        if self.num_components == 1 and self.posteriors:
            raise util.CytoflowOpError(
                "If num_components == 1, all posteriors are 1.")

        if subset:
            try:
                experiment = experiment.query(subset)
            except Exception:
                raise util.CytoflowViewError(
                    "Subset string '{0}' isn't valid".format(subset))

            if len(experiment) == 0:
                raise util.CytoflowViewError(
                    "Subset string '{0}' returned no events".format(subset))

        if self.by:
            groupby = experiment.data.groupby(self.by)
        else:
            # use a lambda expression to return a group that contains
            # all the events
            groupby = experiment.data.groupby(lambda x: True)

        # get the scale. estimate the scale params for the ENTIRE data set,
        # not subsets we get from groupby().  And we need to save it so that
        # the data is transformed the same way when we apply()
        self._xscale = util.scale_factory(self.xscale,
                                          experiment,
                                          channel=self.xchannel)
        self._yscale = util.scale_factory(self.yscale,
                                          experiment,
                                          channel=self.ychannel)

        gmms = {}

        for group, data_subset in groupby:
            if len(data_subset) == 0:
                raise util.CytoflowOpError(
                    "Group {} had no data".format(group))
            x = data_subset.loc[:, [self.xchannel, self.ychannel]]
            x[self.xchannel] = self._xscale(x[self.xchannel])
            x[self.ychannel] = self._yscale(x[self.ychannel])

            # drop data that isn't in the scale range
            x = x[~(np.isnan(x[self.xchannel]) | np.isnan(x[self.ychannel]))]
            x = x.values

            gmm = mixture.GaussianMixture(n_components=self.num_components,
                                          covariance_type="full",
                                          random_state=1)
            gmm.fit(x)

            if not gmm.converged_:
                raise util.CytoflowOpError("Estimator didn't converge"
                                           " for group {0}".format(group))

            # in the 1D version, we sort the components by the means -- so
            # the first component has the lowest mean, the second component
            # has the next-lowest mean, etc.  that doesn't work in a 2D area,
            # obviously.

            # instead, we assume that the clusters are likely (?) to be
            # arranged along *one* of the axes, so we take the |norm| of the
            # x,y mean of each cluster and sort that way.

            norms = (gmm.means_[:, 0]**2 + gmm.means_[:, 1]**2)**0.5
            sort_idx = np.argsort(norms)
            gmm.means_ = gmm.means_[sort_idx]
            gmm.weights_ = gmm.weights_[sort_idx]
            gmm.covariances_ = gmm.covariances_[sort_idx]

            gmms[group] = gmm

        self._gmms = gmms

    def apply(self, experiment):
        """
        Assigns new metadata to events using the mixture model estimated
        in `estimate`.
        """

        if not experiment:
            raise util.CytoflowOpError("No experiment specified")

        if not self.xchannel:
            raise util.CytoflowOpError("Must set X channel")

        if not self.ychannel:
            raise util.CytoflowOpError("Must set Y channel")

        # make sure name got set!
        if not self.name:
            raise util.CytoflowOpError("You have to set the gate's name "
                                       "before applying it!")

        if self.name in experiment.data.columns:
            raise util.CytoflowOpError(
                "Experiment already has a column named {0}".format(self.name))

        if not self._gmms:
            raise util.CytoflowOpError(
                "No components found.  Did you forget to "
                "call estimate()?")

        if not self._xscale:
            raise util.CytoflowOpError(
                "Couldn't find _xscale.  What happened??")

        if not self._yscale:
            raise util.CytoflowOpError(
                "Couldn't find _yscale.  What happened??")

        if self.xchannel not in experiment.data:
            raise util.CytoflowOpError(
                "Column {0} not found in the experiment".format(self.xchannel))

        if self.ychannel not in experiment.data:
            raise util.CytoflowOpError(
                "Column {0} not found in the experiment".format(self.ychannel))

        if self.posteriors:
            col_name = "{0}_Posterior".format(self.name)
            if col_name in experiment.data:
                raise util.CytoflowOpError(
                    "Column {0} already found in the experiment".format(
                        col_name))

        for b in self.by:
            if b not in experiment.data:
                raise util.CytoflowOpError("Aggregation metadata {0} not found"
                                           " in the experiment".format(b))

            if len(experiment.data[b].unique()) > 100:  #WARNING - magic number
                raise util.CytoflowOpError(
                    "More than 100 unique values found for"
                    " aggregation metadata {0}.  Did you"
                    " accidentally specify a data channel?".format(b))

        if self.sigma < 0.0:
            raise util.CytoflowOpError("sigma must be >= 0.0")

        event_assignments = pd.Series([None] * len(experiment), dtype="object")

        if self.posteriors:
            event_posteriors = pd.Series([0.0] * len(experiment))

        # what we DON'T want to do is iterate through event-by-event.
        # the more of this we can push into numpy, sklearn and pandas,
        # the faster it's going to be.  for example, this is why
        # we don't use Ellipse.contains().

        if self.by:
            groupby = experiment.data.groupby(self.by)
        else:
            # use a lambda expression to return a group that
            # contains all the events
            groupby = experiment.data.groupby(lambda x: True)

        for group, data_subset in groupby:
            if group not in self._gmms:
                # there weren't any events in this group, so we didn't get
                # a gmm.
                continue

            gmm = self._gmms[group]
            x = data_subset.loc[:, [self.xchannel, self.ychannel]]
            x[self.xchannel] = self._xscale(x[self.xchannel])
            x[self.ychannel] = self._yscale(x[self.ychannel])

            # which values are missing?
            x_na = np.isnan(x[self.xchannel]) | np.isnan(x[self.ychannel])
            x_na = x_na.values

            x = x.values
            group_idx = groupby.groups[group]

            # make a preliminary assignment
            predicted = np.full(len(x), -1, "int")
            predicted[~x_na] = gmm.predict(x[~x_na])

            # if we're doing sigma-based gating, for each component check
            # to see if the event is in the sigma gate.
            if self.sigma > 0.0:

                # make a quick dataframe with the value and the predicted
                # component
                gate_df = pd.DataFrame({
                    "x": x[:, 0],
                    "y": x[:, 1],
                    "p": predicted
                })

                # for each component, get the ellipse that follows the isoline
                # around the mixture component
                # cf. http://scikit-learn.org/stable/auto_examples/mixture/plot_gmm.html
                # and http://www.mathworks.com/matlabcentral/newsreader/view_thread/298389
                # and http://stackoverflow.com/questions/7946187/point-and-ellipse-rotated-position-test-algorithm
                # i am not proud of how many tries this took me to get right.

                for c in range(0, self.num_components):
                    mean = gmm.means_[c]
                    covar = gmm.covariances_[c]

                    # xc is the center on the x axis
                    # yc is the center on the y axis
                    xc = mean[0]  # @UnusedVariable
                    yc = mean[1]  # @UnusedVariable

                    v, w = linalg.eigh(covar)
                    u = w[0] / linalg.norm(w[0])

                    # xl is the length along the x axis
                    # yl is the length along the y axis
                    xl = np.sqrt(v[0]) * self.sigma  # @UnusedVariable
                    yl = np.sqrt(v[1]) * self.sigma  # @UnusedVariable

                    # t is the rotation in radians (counter-clockwise)
                    t = 2 * np.pi - np.arctan(u[1] / u[0])

                    sin_t = np.sin(t)  # @UnusedVariable
                    cos_t = np.cos(t)  # @UnusedVariable

                    # and build an expression with numexpr so it evaluates fast!

                    gate_bool = gate_df.eval(
                        "p == @c and "
                        "((x - @xc) * @cos_t - (y - @yc) * @sin_t) ** 2 / ((@xl / 2) ** 2) + "
                        "((x - @xc) * @sin_t + (y - @yc) * @cos_t) ** 2 / ((@yl / 2) ** 2) <= 1"
                    ).values

                    predicted[np.logical_and(predicted == c,
                                             ~gate_bool)] = -1

            predicted_str = pd.Series(["(none)"] * len(predicted))
            for c in range(0, self.num_components):
                predicted_str[predicted == c] = "{0}_{1}".format(
                    self.name, c + 1)
            predicted_str[predicted == -1] = "{0}_None".format(self.name)
            predicted_str.index = group_idx

            event_assignments.iloc[group_idx] = predicted_str

            if self.posteriors:
                probability = np.full((len(x), self.num_components), 0.0,
                                      "float")
                probability[~x_na, :] = gmm.predict_proba(x[~x_na, :])
                posteriors = pd.Series([0.0] * len(predicted))
                for c in range(0, self.num_components):
                    posteriors[predicted == c] = probability[predicted == c, c]
                posteriors.index = group_idx
                event_posteriors.iloc[group_idx] = posteriors

        new_experiment = experiment.clone()

        if self.num_components == 1 and self.sigma > 0:
            new_experiment.add_condition(
                self.name, "bool",
                event_assignments == "{0}_1".format(self.name))
        elif self.num_components > 1:
            new_experiment.add_condition(self.name, "category",
                                         event_assignments)

        if self.posteriors and self.num_components > 1:
            col_name = "{0}_Posterior".format(self.name)
            new_experiment.add_condition(col_name, "float", event_posteriors)

        # add the statistics
        levels = list(self.by)
        if self.num_components > 1:
            levels.append(self.name)

        if levels:
            idx = pd.MultiIndex.from_product(
                [new_experiment[x].unique() for x in levels], names=levels)

            xmean_stat = pd.Series(index=idx,
                                   dtype=np.dtype(object)).sort_index()
            ymean_stat = pd.Series(index=idx,
                                   dtype=np.dtype(object)).sort_index()
            prop_stat = pd.Series(index=idx,
                                  dtype=np.dtype(object)).sort_index()

            for group, _ in groupby:
                gmm = self._gmms[group]
                for c in range(self.num_components):
                    if self.num_components > 1:
                        component_name = "{}_{}".format(self.name, c + 1)

                        if group is True:
                            g = [component_name]
                        elif isinstance(group, tuple):
                            g = list(group)
                            g.append(component_name)
                        else:
                            g = list([group])
                            g.append(component_name)

                        if len(g) > 1:
                            g = tuple(g)
                        else:
                            g = g[0]
                    else:
                        g = group

                    xmean_stat.loc[g] = self._xscale.inverse(gmm.means_[c][0])
                    ymean_stat.loc[g] = self._yscale.inverse(gmm.means_[c][1])
                    prop_stat.loc[g] = gmm.weights_[c]

            new_experiment.statistics[(self.name, "xmean")] = xmean_stat
            new_experiment.statistics[(self.name, "ymean")] = ymean_stat
            if self.num_components > 1:
                new_experiment.statistics[(self.name,
                                           "proportion")] = prop_stat

        new_experiment.history.append(
            self.clone_traits(transient=lambda t: True))
        return new_experiment

    def default_view(self, **kwargs):
        """
        Returns a diagnostic plot of the Gaussian mixture model.
        
        Returns
        -------
            IView : an IView, call plot() to see the diagnostic plot.
        """
        return GaussianMixture2DView(op=self, **kwargs)
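
The sigma gate in apply() is evaluated with numexpr for speed; a standalone sketch (illustrative, not part of the original module) of the same rotated-ellipse membership test reads as follows.

import numpy as np
from scipy import linalg

def in_sigma_ellipse(points, mean, covar, sigma):
    # True for 2D points inside the sigma iso-ellipse of a Gaussian
    v, w = linalg.eigh(covar)               # eigenvalues / eigenvectors
    u = w[0] / linalg.norm(w[0])
    t = 2 * np.pi - np.arctan(u[1] / u[0])  # rotation (counter-clockwise)
    xl, yl = np.sqrt(v) * sigma             # gate widths along the axes
    dx = points[:, 0] - mean[0]
    dy = points[:, 1] - mean[1]
    x_r = dx * np.cos(t) - dy * np.sin(t)   # rotate into the ellipse frame
    y_r = dx * np.sin(t) + dy * np.cos(t)
    return x_r ** 2 / (xl / 2) ** 2 + y_r ** 2 / (yl / 2) ** 2 <= 1
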
Example #16
class MATS3DMplDamageEEQ(MATS3DEval):
    # To use the model directly in the simulator specify the
    # time stepping classes

    epsilon_0 = Float(59.0e-6,
                      label="epsilon_0",
                      desc="strain at the onset of damage",
                      enter_set=True,
                      auto_set=False)

    epsilon_f = Float(250.0e-6,
                      label="epsilon_f",
                      desc="strain controlling the rate of damage evolution",
                      enter_set=True,
                      auto_set=False)

    c_T = Float(0.01,
                label="c_T",
                desc="ratio of tangential to normal strain contribution",
                enter_set=True,
                auto_set=False)

    #=========================================================================
    # Configurational parameters
    #=========================================================================
    state_var_shapes = tr.Property(tr.Dict(), depends_on='n_mp')
    r'''
    Shapes of the state variables
    to be stored in the global array at the level 
    of the domain.
    '''
    @cached_property
    def _get_state_var_shapes(self):
        return {'kappa_n': (self.n_mp, ), 'omega_n': (self.n_mp, )}

    U_var_shape = (6, )
    '''Shape of the primary variable required by the TStepState.
    '''

    node_name = 'Desmorat model'

    tree_node_list = tr.List([])

    #=========================================================================
    # Evaluation - get the corrector and predictor
    #=========================================================================

    def get_corr_pred(self, eps_ab, tn1, kappa_n, omega_n):

        self._update_state_variables(eps_ab, kappa_n, omega_n)
        #----------------------------------------------------------------------
        # if the regularization using the crack-band concept is on calculate the
        # effective element length in the direction of principle strains
        #----------------------------------------------------------------------
        # if self.regularization:
        #    h = self.get_regularizing_length(sctx, eps_app_eng)
        #    self.phi_fn.h = h

        #------------------------------------------------------------------
        # Damage tensor (2nd order):
        #------------------------------------------------------------------
        phi_ab = self._get_phi_ab(kappa_n)

        #------------------------------------------------------------------
        # Damage tensor (4th order) using product- or sum-type symmetrization:
        #------------------------------------------------------------------
        beta_abcd = self._get_beta_abcd(phi_ab)

        #------------------------------------------------------------------
        # Damaged stiffness tensor calculated based on the damage tensor beta4:
        #------------------------------------------------------------------
        D_ijab = einsum('...ijab, abef, ...cdef -> ...ijcd', beta_abcd,
                        self.D_abef, beta_abcd)

        sig_ab = einsum('...abef,...ef -> ...ab', D_ijab, eps_ab)

        return sig_ab, D_ijab

    #=========================================================================
    # MICROPLANE-Kinematic constraints
    #=========================================================================

    _MPNN = Property(depends_on='n_mp')
    r'''Get the dyadic product of the microplane normals
    '''

    @cached_property
    def _get__MPNN(self):
        # dyadic product of the microplane normals

        MPNN_nij = einsum('ni,nj->nij', self._MPN, self._MPN)
        return MPNN_nij

    _MPTT = Property(depends_on='n_mp')
    r'''Get the third order tangential tensor (operator) for each microplane
    '''

    @cached_property
    def _get__MPTT(self):
        # Third order tangential tensor for each microplane
        delta = identity(3)
        MPTT_nijr = 0.5 * (
            einsum('ni,jr -> nijr', self._MPN, delta) +
            einsum('nj,ir -> njir', self._MPN, delta) -
            2.0 * einsum('ni,nj,nr -> nijr', self._MPN, self._MPN, self._MPN))
        return MPTT_nijr

    def _get_e_na(self, eps_ab):
        r'''
        Projection of apparent strain onto the individual microplanes
        '''
        e_ni = einsum('nb,...ba->...na', self._MPN, eps_ab)
        return e_ni

    def _get_e_N_n(self, e_na):
        r'''
        Get the normal strain array for each microplane
        '''
        e_N_n = einsum('...na, na->...n', e_na, self._MPN)
        return e_N_n

    def _get_e_equiv_n(self, e_na):
        r'''
        Returns a list of the microplane equivalent strains
        based on the list of microplane strain vectors
        '''
        # magnitude of the normal strain vector for each microplane
        e_N_n = self._get_e_N_n(e_na)
        # positive part of the normal strain magnitude for each microplane
        e_N_pos_n = (np.abs(e_N_n) + e_N_n) / 2.0
        # normal strain vector for each microplane
        e_N_na = einsum('...n,ni -> ...ni', e_N_n, self._MPN)
        # tangent strain ratio
        c_T = self.c_T
        # tangential strain vector for each microplane
        e_T_na = e_na - e_N_na
        # squared tangential strain vector for each microplane
        e_TT_n = einsum('...ni,...ni -> ...n', e_T_na, e_T_na)
        # equivalent strain for each microplane
        e_equiv_n = sqrt(e_N_pos_n * e_N_pos_n + c_T * e_TT_n)
        return e_equiv_n

    def _update_state_variables(self, eps_ab, kappa_n, omega_n):
        e_na = self._get_e_na(eps_ab)
        eps_eq_n = self._get_e_equiv_n(e_na)
        f_trial_n = eps_eq_n - self.epsilon_0
        I = np.where(f_trial_n > 0)
        k_n = np.max(np.array([kappa_n[I], eps_eq_n[I]]), axis=0)
        kappa_n[I] = k_n
        omega_n[I] = self._get_omega(k_n)

    def _get_omega(self, kappa_n):
        '''
        Return new value of damage parameter
        @param kappa:
        '''
        omega_n = np.zeros_like(kappa_n)
        epsilon_0 = self.epsilon_0
        epsilon_f = self.epsilon_f
        I = np.where(kappa_n >= epsilon_0)
        omega_n[I] = (
            1.0 - (epsilon_0 / kappa_n[I] * np.exp(-1.0 *
                                                   (kappa_n[I] - epsilon_0) /
                                                   (epsilon_f - epsilon_0))))
        return omega_n

    def _get_phi_ab(self, kappa_n):
        # Returns the 2nd order damage tensor 'phi_mtx'
        # scalar integrity factor for each microplane
        phi_n = np.sqrt(1.0 - self._get_omega(kappa_n))
        # integration terms for each microplane
        phi_ab = einsum('...n,n,nab->...ab', phi_n, self._MPW, self._MPNN)
        return phi_ab

    def _get_beta_abcd(self, phi_ab):
        '''
        Returns the 4th order damage tensor 'beta4' using sum-type symmetrization
        (cf. [Jir99], Eq.(21))
        '''
        delta = identity(3)
        beta_ijkl = 0.25 * (einsum('...ik,jl->...ijkl', phi_ab, delta) +
                            einsum('...il,jk->...ijkl', phi_ab, delta) +
                            einsum('...jk,il->...ijkl', phi_ab, delta) +
                            einsum('...jl,ik->...ijkl', phi_ab, delta))
        return beta_ijkl

    #-----------------------------------------------
    # number of microplanes - currently fixed for 3D
    #-----------------------------------------------
    n_mp = Constant(28)

    #-----------------------------------------------
    # get the normal vectors of the microplanes
    #-----------------------------------------------
    _MPN = Property(depends_on='n_mp')

    @cached_property
    def _get__MPN(self):
        return array([[.577350259, .577350259, .577350259],
                      [.577350259, .577350259, -.577350259],
                      [.577350259, -.577350259, .577350259],
                      [.577350259, -.577350259, -.577350259],
                      [.935113132, .250562787, .250562787],
                      [.935113132, .250562787, -.250562787],
                      [.935113132, -.250562787, .250562787],
                      [.935113132, -.250562787, -.250562787],
                      [.250562787, .935113132, .250562787],
                      [.250562787, .935113132, -.250562787],
                      [.250562787, -.935113132, .250562787],
                      [.250562787, -.935113132, -.250562787],
                      [.250562787, .250562787, .935113132],
                      [.250562787, .250562787, -.935113132],
                      [.250562787, -.250562787, .935113132],
                      [.250562787, -.250562787, -.935113132],
                      [.186156720, .694746614, .694746614],
                      [.186156720, .694746614, -.694746614],
                      [.186156720, -.694746614, .694746614],
                      [.186156720, -.694746614, -.694746614],
                      [.694746614, .186156720, .694746614],
                      [.694746614, .186156720, -.694746614],
                      [.694746614, -.186156720, .694746614],
                      [.694746614, -.186156720, -.694746614],
                      [.694746614, .694746614, .186156720],
                      [.694746614, .694746614, -.186156720],
                      [.694746614, -.694746614, .186156720],
                      [.694746614, -.694746614, -.186156720]])

    #-------------------------------------
    # get the weights of the microplanes
    #-------------------------------------
    _MPW = Property(depends_on='n_mp')

    @cached_property
    def _get__MPW(self):
        return array([
            .0160714276, .0160714276, .0160714276, .0160714276, .0204744730,
            .0204744730, .0204744730, .0204744730, .0204744730, .0204744730,
            .0204744730, .0204744730, .0204744730, .0204744730, .0204744730,
            .0204744730, .0158350505, .0158350505, .0158350505, .0158350505,
            .0158350505, .0158350505, .0158350505, .0158350505, .0158350505,
            .0158350505, .0158350505, .0158350505
        ]) * 6.0

    def _get_lame_params(self):
        la = self.E * self.nu / ((1. + self.nu) * (1. - 2. * self.nu))
        # second Lame parameter (shear modulus)
        mu = self.E / (2. + 2. * self.nu)
        return la, mu

    D_abef = tr.Property(tr.Array, depends_on='+input')

    @tr.cached_property
    def _get_D_abef(self):
        la = self._get_lame_params()[0]
        mu = self._get_lame_params()[1]
        delta = identity(3)
        D_abef = (einsum(',ij,kl->ijkl', la, delta, delta) +
                  einsum(',ik,jl->ijkl', mu, delta, delta) +
                  einsum(',il,jk->ijkl', mu, delta, delta))

        return D_abef

    def _get_var_dict(self):
        var_dict = super(MATS3DMplDamageEEQ, self)._get_var_dict()
        var_dict.update(phi_ab=self.get_phi_ab)
        return var_dict

    def get_phi_ab(self, eps_ab, tn1, kappa_n, omega_n):
        return self._get_phi_ab(kappa_n)
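
As a quick numeric illustration (not part of the original source) of the exponential damage law in _get_omega, using the default parameter values from above:

import numpy as np

epsilon_0, epsilon_f = 59.0e-6, 250.0e-6
kappa = np.array([59.0e-6, 100.0e-6, 250.0e-6, 1.0e-3])
omega = 1.0 - (epsilon_0 / kappa) * np.exp(
    -(kappa - epsilon_0) / (epsilon_f - epsilon_0))
# omega is 0.0 at the onset strain epsilon_0 and tends toward 1.0 as
# kappa grows well past epsilon_f
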
class TasbeCalibrationOp(PluginOpMixin):
    handler_factory = Callable(TasbeHandler)

    id = Constant(
        'edu.mit.synbio.cytoflowgui.op_plugins.bleedthrough_piecewise')
    friendly_id = Constant("Quantitative Pipeline")
    name = Constant("TASBE")

    fsc_channel = DelegatesTo('_polygon_op', 'xchannel', estimate=True)
    ssc_channel = DelegatesTo('_polygon_op', 'ychannel', estimate=True)
    vertices = DelegatesTo('_polygon_op', 'vertices', estimate=True)
    channels = List(Str, estimate=True)

    blank_file = File(filter=["*.fcs"], estimate=True)

    bleedthrough_list = List(_BleedthroughControl, estimate=True)

    beads_name = Str(estimate=True)
    beads_file = File(filter=["*.fcs"], estimate=True)
    units_list = List(_Unit, estimate=True)

    bead_peak_quantile = Int(80, estimate=True)
    bead_brightness_threshold = Float(100, estimate=True)
    bead_brightness_cutoff = util.FloatOrNone("", estimate=True)

    do_color_translation = Bool(estimate=True)
    to_channel = Str(estimate=True)
    translation_list = List(_TranslationControl, estimate=True)
    mixture_model = Bool(False, estimate=True)

    do_estimate = Event
    valid_model = Bool(False, status=True)
    do_exit = Event
    input_files = List(File)
    output_directory = Directory

    _blank_exp_file = File(transient=True)
    _blank_exp = Instance(Experiment, transient=True)
    _blank_exp_channels = List(Str, status=True)
    _polygon_op = Instance(PolygonOp,
                           kw={
                               'name': 'polygon',
                               'xscale': 'log',
                               'yscale': 'log'
                           },
                           transient=True)
    _af_op = Instance(AutofluorescenceOp, (), transient=True)
    _bleedthrough_op = Instance(BleedthroughLinearOp, (), transient=True)
    _bead_calibration_op = Instance(BeadCalibrationOp, (), transient=True)
    _color_translation_op = Instance(ColorTranslationOp, (), transient=True)

    status = Str(status=True)

    @on_trait_change('channels[], to_channel, do_color_translation',
                     post_init=True)
    def _channels_changed(self, obj, name, old, new):
        for channel in self.channels:
            if channel not in [
                    control.channel for control in self.bleedthrough_list
            ]:
                self.bleedthrough_list.append(
                    _BleedthroughControl(channel=channel))

            if channel not in [unit.channel for unit in self.units_list]:
                self.units_list.append(_Unit(channel=channel))

        to_remove = []
        for control in self.bleedthrough_list:
            if control.channel not in self.channels:
                to_remove.append(control)

        for control in to_remove:
            self.bleedthrough_list.remove(control)

        to_remove = []
        for unit in self.units_list:
            if unit.channel not in self.channels:
                to_remove.append(unit)

        for unit in to_remove:
            self.units_list.remove(unit)

        if self.do_color_translation:
            to_remove = []
            for unit in self.units_list:
                if unit.channel != self.to_channel:
                    to_remove.append(unit)

            for unit in to_remove:
                self.units_list.remove(unit)

            self.translation_list = []
            for c in self.channels:
                if c == self.to_channel:
                    continue
                self.translation_list.append(
                    _TranslationControl(from_channel=c,
                                        to_channel=self.to_channel))

            self.changed = (Changed.ESTIMATE, ('translation_list',
                                               self.translation_list))

        self.changed = (Changed.ESTIMATE, ('bleedthrough_list',
                                           self.bleedthrough_list))
        self.changed = (Changed.ESTIMATE, ('units_list', self.units_list))

    @on_trait_change('_polygon_op:vertices', post_init=True)
    def _polygon_changed(self, obj, name, old, new):
        self.changed = (Changed.ESTIMATE, (None, None))

    @on_trait_change("bleedthrough_list_items, bleedthrough_list.+",
                     post_init=True)
    def _bleedthrough_controls_changed(self, obj, name, old, new):
        self.changed = (Changed.ESTIMATE, ('bleedthrough_list',
                                           self.bleedthrough_list))

    @on_trait_change("translation_list_items, translation_list.+",
                     post_init=True)
    def _translation_controls_changed(self, obj, name, old, new):
        self.changed = (Changed.ESTIMATE, ('translation_list',
                                           self.translation_list))

    @on_trait_change('units_list_items,units_list.+', post_init=True)
    def _units_changed(self, obj, name, old, new):
        self.changed = (Changed.ESTIMATE, ('units_list', self.units_list))


    def estimate(self, experiment, subset=None):
        #         if not self.subset:
        #             warnings.warn("Are you sure you don't want to specify a subset "
        #                           "used to estimate the model?",
        #                           util.CytoflowOpWarning)

        #         if experiment is None:
        #             raise util.CytoflowOpError("No valid result to estimate with")

        #         experiment = experiment.clone()

        if not self.fsc_channel:
            raise util.CytoflowOpError('fsc_channel', "Must set FSC channel")

        if not self.ssc_channel:
            raise util.CytoflowOpError('ssc_channel', "Must set SSC channel")

        if not self._polygon_op.vertices:
            raise util.CytoflowOpError(
                None, "Please draw a polygon around the "
                "single-cell population in the "
                "Morphology tab")

        experiment = self._blank_exp.clone()
        experiment = self._polygon_op.apply(experiment)

        self._af_op.channels = self.channels
        self._af_op.blank_file = self.blank_file

        self._af_op.estimate(experiment, subset="polygon == True")
        self.changed = (Changed.ESTIMATE_RESULT, "Autofluorescence")
        experiment = self._af_op.apply(experiment)

        self.status = "Estimating bleedthrough"

        self._bleedthrough_op.controls.clear()
        for control in self.bleedthrough_list:
            self._bleedthrough_op.controls[control.channel] = control.file

        self._bleedthrough_op.estimate(experiment, subset="polygon == True")
        self.changed = (Changed.ESTIMATE_RESULT, "Bleedthrough")
        experiment = self._bleedthrough_op.apply(experiment)

        self.status = "Estimating bead calibration"

        self._bead_calibration_op.beads = BeadCalibrationOp.BEADS[
            self.beads_name]
        self._bead_calibration_op.beads_file = self.beads_file
        self._bead_calibration_op.bead_peak_quantile = self.bead_peak_quantile
        self._bead_calibration_op.bead_brightness_threshold = self.bead_brightness_threshold
        self._bead_calibration_op.bead_brightness_cutoff = self.bead_brightness_cutoff

        self._bead_calibration_op.units.clear()

        for unit in self.units_list:
            self._bead_calibration_op.units[unit.channel] = unit.unit

        self._bead_calibration_op.estimate(experiment)
        self.changed = (Changed.ESTIMATE_RESULT, "Bead Calibration")

        if self.do_color_translation:
            self.status = "Estimating color translation"

            experiment = self._bead_calibration_op.apply(experiment)

            self._color_translation_op.mixture_model = self.mixture_model

            self._color_translation_op.controls.clear()
            for control in self.translation_list:
                self._color_translation_op.controls[(
                    control.from_channel, control.to_channel)] = control.file

            self._color_translation_op.estimate(experiment,
                                                subset='polygon == True')

            self.changed = (Changed.ESTIMATE_RESULT, "Color Translation")

        self.status = "Done estimating"
        self.valid_model = True

    def should_clear_estimate(self, changed, payload):
        """
        Should the owning WorkflowItem clear the estimated model by calling
        op.clear_estimate()?  `changed` can be:
        - Changed.ESTIMATE -- the parameters required to call 'estimate()' (ie
          traits with estimate = True metadata) have changed
        - Changed.PREV_RESULT -- the previous WorkflowItem's result changed

         """
        if changed == Changed.ESTIMATE:
            name, val = payload
            if name == 'fsc_channel' or name == 'ssc_channel':
                return False

        return True

    def clear_estimate(self):
        self._af_op = AutofluorescenceOp()
        self._bleedthrough_op = BleedthroughLinearOp()
        self._bead_calibration_op = BeadCalibrationOp()
        self._color_translation_op = ColorTranslationOp()
        self.valid_model = False

        self.changed = (Changed.ESTIMATE_RESULT, self)

    def should_apply(self, changed, payload):
        """
        Should the owning WorkflowItem apply this operation when certain things
        change?  `changed` can be:
        - Changed.OPERATION -- the operation's parameters changed
        - Changed.PREV_RESULT -- the previous WorkflowItem's result changed
        - Changed.ESTIMATE_RESULT -- the results of calling "estimate" changed

        """
        if changed == Changed.ESTIMATE_RESULT and \
            self.blank_file != self._blank_exp_file:
            return True

        elif changed == Changed.OPERATION:
            name, _ = payload
            if name == "output_directory":
                return False

            return True

        return False

    def apply(self, experiment):

        if self.blank_file != self._blank_exp_file:
            self._blank_exp = ImportOp(tubes=[Tube(
                file=self.blank_file)]).apply()
            self._blank_exp_file = self.blank_file
            self._blank_exp_channels = self._blank_exp.channels
            self.changed = (Changed.PREV_RESULT, None)
            return

        out_dir = Path(self.output_directory)
        for path in self.input_files:
            in_file_path = Path(path)
            out_file_path = out_dir / in_file_path.name
            if out_file_path.exists():
                raise util.CytoflowOpError(
                    None, "File {} already exists".format(out_file_path))

        tubes = [
            Tube(file=path, conditions={'filename': Path(path).stem})
            for path in self.input_files
        ]

        for tube in tubes:
            self.status = "Converting " + Path(tube.file).stem
            experiment = ImportOp(tubes=[tube],
                                  conditions={
                                      'filename': 'category'
                                  }).apply()

            experiment = self._af_op.apply(experiment)
            experiment = self._bleedthrough_op.apply(experiment)
            experiment = self._bead_calibration_op.apply(experiment)

            if self.do_color_translation:
                experiment = self._color_translation_op.apply(experiment)

            ExportFCS(path=self.output_directory,
                      by=['filename'],
                      _include_by=False).export(experiment)

        self.input_files = []
        self.status = "Done converting!"

    def default_view(self, **kwargs):
        return TasbeCalibrationView(op=self, **kwargs)

    def get_help(self):
        current_dir = os.path.abspath(__file__)
        help_dir = os.path.split(current_dir)[0]
        help_dir = os.path.join(help_dir, "help")

        help_file = None
        for klass in self.__class__.__mro__:
            mod = klass.__module__
            mod_html = mod + ".html"

            h = os.path.join(help_dir, mod_html)
            if os.path.exists(h):
                help_file = h
                break

        with open(help_file, encoding='utf-8') as f:
            help_html = f.read()

        return help_html
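
The GUI operation above orchestrates the underlying cytoflow operations; a minimal sketch of the first two stages run directly (file names and channels are hypothetical placeholders):

import cytoflow as flow

ex = flow.ImportOp(tubes=[flow.Tube(file="sample.fcs")]).apply()

af_op = flow.AutofluorescenceOp(channels=["FITC-A", "PE-A"],
                                blank_file="blank.fcs")
af_op.estimate(ex)                  # estimate autofluorescence offsets
ex = af_op.apply(ex)

bt_op = flow.BleedthroughLinearOp(controls={"FITC-A": "fitc_only.fcs",
                                            "PE-A": "pe_only.fcs"})
bt_op.estimate(ex)                  # fit the spillover matrix
ex = bt_op.apply(ex)
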
Example #18
class HlogScale(ScaleMixin):
    """
    A scale that transforms the data using the `hyperlog` function.
    
    This scaling method implements a "linear-like" region around 0, and a
    "log-like" region for large values, with a smooth transition between
    them.
    
    The transformation has one parameter, `b`, which specifies the location of
    the transition from linear to log-like.  The default, `200`, is appropriate
    for 18-bit scales but not for other scales.
    
    Attributes
    ----------
    b : Float (default = 200)
        the location of the transition from linear to log-like.
    
    References
    ----------
    [1] Hyperlog-a flexible log-like transform for negative, zero, and positive 
        valued data.
        Bagwell CB.
        Cytometry A. 2005 Mar;64(1):34-42. 
        PMID: 15700280
        http://onlinelibrary.wiley.com/doi/10.1002/cyto.a.20114/abstract
    """

    id = Constant("edu.mit.synbio.cytoflow.utility.hlog")
    name = "hlog"

    experiment = Instance("cytoflow.Experiment")

    # what data do we use to compute scale parameters?  set one.
    channel = Str
    condition = Str
    statistic = Tuple(Str, Str)

    range = Property(Float)
    b = Float(200, desc="location of the log transition")

    mpl_params = Property(Dict, depends_on="[b, range, scale_min, scale_max]")

    def __call__(self, data):
        """
        Transforms `data` using this scale.
        
        Careful!  May return `NaN` if the scale domain doesn't match the data 
        (ie, applying a log10 scale to negative numbers.)
        """

        f = _make_hlog_numeric(self.b, 1.0, np.log10(self.range))

        if isinstance(data, pd.Series):
            return data.apply(f)
        elif isinstance(data, np.ndarray):
            return f(data)
        elif isinstance(data, (int, float)):
            # numpy returns a 0-dim array.  wtf.
            return float(f(data))
        else:
            try:
                return list(map(f, data))
            except TypeError:
                raise CytoflowError("Unknown data type in HlogScale.__call__")

    def inverse(self, data):
        """
        Transforms 'data' using the inverse of this scale.
        """

        f_inv = lambda y, b=self.b, d=np.log10(self.range): hlog_inv(
            y, b, 1.0, d)

        if isinstance(data, pd.Series):
            return data.apply(f_inv)
        elif isinstance(data, np.ndarray):
            inverse = np.vectorize(f_inv)
            return inverse(data)
        elif isinstance(data, float):
            return f_inv(data)
        else:
            try:
                return list(map(f_inv, data))
            except TypeError:
                raise CytoflowError("Unknown data type in HlogScale.inverse")

    def clip(self, data):
        return data

    def _get_range(self):
        if self.experiment:
            if self.channel and self.channel in self.experiment.channels:
                if "range" in self.experiment.metadata[self.channel]:
                    return self.experiment.metadata[self.channel]["range"]
                else:
                    return self.experiment.data[self.channel].max()
            elif self.condition and self.condition in self.experiment.conditions:
                return self.experiment.data[self.condition].max()
            elif self.statistic and self.statistic in self.experiment.statistics:
                return self.experiment.statistics[self.statistic].max()
            else:
                return Undefined
        else:
            return Undefined

    @cached_property
    def _get_mpl_params(self):
        return {"b": self.b, "range": self.range}
class Heading(Label):
    """ An item that is a fancy label.
    """

    # Override the 'style' trait to default to the fancy 'custom' style:
    style = Constant('custom')
Example #20
class MATS2DMicroplaneDamageJir(MATSXDMicroplaneDamageFatigueJir):

    # implements(IMATSEval)

    #-----------------------------------------------
    # number of microplanes
    #-----------------------------------------------
    n_mp = Constant(360)

    #-----------------------------------------------
    # get the normal vectors of the microplanes
    #-----------------------------------------------
    _MPN = Property(depends_on='n_mp')

    @cached_property
    def _get__MPN(self):
        # microplane normals:

        alpha_list = linspace(0, 2 * pi, self.n_mp)

        MPN = array([[cos(alpha), sin(alpha)] for alpha in alpha_list])

        return MPN

    #-------------------------------------
    # get the weights of the microplanes
    #-------------------------------------
    _MPW = Property(depends_on='n_mp')

    @cached_property
    def _get__MPW(self):
        # Note that the values in the array must be multiplied by 6 (cf. [Baz05])!
        # The sum of the array equals 0.5 (cf. [BazLuz04]).
        # The values are given for a Gaussian integration over the unit
        # hemisphere.
        MPW = ones(self.n_mp) / self.n_mp * 2

        return MPW

    #-------------------------------------------------------------------------
    # Cached elasticity tensors
    #-------------------------------------------------------------------------

    elasticity_tensors = Property(
        depends_on='E, nu, dimensionality, stress_state')

    @cached_property
    def _get_elasticity_tensors(self):
        '''
        Initialize the fourth-order elasticity tensor for the 2D case (plane strain or plane stress)
        '''
        # ----------------------------------------------------------------------------
        # Lame constants calculated from E and nu
        # ----------------------------------------------------------------------------

        # first Lame parameter
        la = self.E * self.nu / ((1 + self.nu) * (1 - 2 * self.nu))
        # second Lame parameter (shear modulus)
        mu = self.E / (2 + 2 * self.nu)

        # -----------------------------------------------------------------------------------------------------
        # Get the fourth order elasticity and compliance tensors for the 2D-case
        # -----------------------------------------------------------------------------------------------------

        # construct the elasticity tensor (using Numpy - einsum function)
        delta = identity(2)
        D_ijkl = (einsum(',ij,kl->ijkl', la, delta, delta) +
                  einsum(',ik,jl->ijkl', mu, delta, delta) +
                  einsum(',il,jk->ijkl', mu, delta, delta))

        return D_ijkl
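A quick sanity check of the microplane geometry above (a sketch, independent of the class): every normal should have unit length, and the uniform weights should sum to 2.

import numpy as np

n_mp = 360
alpha = np.linspace(0, 2 * np.pi, n_mp)
MPN = np.array([[np.cos(a), np.sin(a)] for a in alpha])
MPW = np.ones(n_mp) / n_mp * 2

print(np.allclose(np.linalg.norm(MPN, axis=1), 1.0))   # True: unit normals
print(np.isclose(MPW.sum(), 2.0))                      # True: weights sum to 2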
Example #21
class PolygonOp(HasStrictTraits):
    """
    Apply a polygon gate to a cytometry experiment.
    
    Attributes
    ----------
    name : Str
        The operation name.  Used to name the new metadata field in the
        experiment that's created by :meth:`apply`
        
    xchannel, ychannel : Str
        The names of the x and y channels to apply the gate.
        
    xscale, yscale : {'linear', 'log', 'logicle'} (default = 'linear')
        The scales applied to the data before drawing the polygon.
        
    vertices : List((Float, Float))
        The polygon vertices.  An ordered list of 2-tuples, representing
        the x and y coordinates of the vertices.
        
    Notes
    -----
    This module uses :class:`matplotlib.path.Path` to represent the polygon, because
    membership testing is very fast.
    
    You can set the vertices by hand, I suppose, but it's much easier to use
    the interactive view you get from :meth:`default_view` to do so.

    
    Examples
    --------
    
    .. plot::
        :context: close-figs
        
        Make a little data set.
    
        >>> import cytoflow as flow
        >>> import_op = flow.ImportOp()
        >>> import_op.tubes = [flow.Tube(file = "Plate01/RFP_Well_A3.fcs",
        ...                              conditions = {'Dox' : 10.0}),
        ...                    flow.Tube(file = "Plate01/CFP_Well_A4.fcs",
        ...                              conditions = {'Dox' : 1.0})]
        >>> import_op.conditions = {'Dox' : 'float'}
        >>> ex = import_op.apply()
    
    Create and parameterize the operation.
    
    .. plot::
        :context: close-figs
        
        >>> p = flow.PolygonOp(name = "Polygon",
        ...                    xchannel = "V2-A",
        ...                    ychannel = "Y2-A")
        >>> p.vertices = [(23.411982294776319, 5158.7027015021222), 
        ...               (102.22182270573683, 23124.058843387455), 
        ...               (510.94519955277201, 23124.058843387455), 
        ...               (1089.5215641232173, 3800.3424832180476), 
        ...               (340.56382570202402, 801.98947404942271), 
        ...               (65.42597937575897, 1119.3133482602157)]

        
    Show the default view.  

    .. plot::
        :context: close-figs
            
        >>> df = p.default_view(huefacet = "Dox",
        ...                     xscale = 'log',
        ...                     yscale = 'log')
        
        >>> df.plot(ex)
        
    
    .. note::
       If you want to use the interactive default view in a Jupyter notebook,
       make sure you say ``%matplotlib notebook`` in the first cell 
       (instead of ``%matplotlib inline`` or similar).  Then call 
       ``default_view()`` with ``interactive = True``::
       
           df = p.default_view(huefacet = "Dox",
                               xscale = 'log',
                               yscale = 'log',
                               interactive = True)
           df.plot(ex)
        
    Apply the gate, and show the result
    
    .. plot::
        :context: close-figs
        
        >>> ex2 = p.apply(ex)
        >>> ex2.data.groupby('Polygon').size()
        Polygon
        False    15875
        True      4125
        dtype: int64
            
    """

    # traits
    id = Constant('edu.mit.synbio.cytoflow.operations.polygon')
    friendly_id = Constant("Polygon")

    name = CStr()
    xchannel = Str()
    ychannel = Str()
    vertices = List((Float, Float))

    xscale = util.ScaleEnum()
    yscale = util.ScaleEnum()

    _selection_view = Instance('PolygonSelection', transient=True)

    def apply(self, experiment):
        """Applies the threshold to an experiment.
        
        Parameters
        ----------
        experiment : Experiment
            the old :class:`Experiment` to which this op is applied
            
        Returns
        -------
        Experiment
            a new :class:`Experiment`, the same as the old experiment but with 
            a new column of type ``bool`` with the same name as the operation.  
            The bool is ``True`` if the event's measurement is within the 
            polygon, and ``False`` otherwise.
            
        Raises
        ------
        util.CytoflowOpError
            if for some reason the operation can't be applied to this
            experiment. The reason is in :attr:`.CytoflowOpError.args`
        """

        if experiment is None:
            raise util.CytoflowOpError('experiment', "No experiment specified")

        if self.name in experiment.data.columns:
            raise util.CytoflowOpError(
                'name', "{} is in the experiment already!".format(self.name))

        if not self.xchannel:
            raise util.CytoflowOpError('xchannel', "Must specify an x channel")

        if not self.ychannel:
            raise util.CytoflowOpError('ychannel', "Must specify a y channel")

        if self.xchannel not in experiment.channels:
            raise util.CytoflowOpError(
                'xchannel',
                "xchannel {0} is not in the experiment".format(self.xchannel))

        if self.ychannel not in experiment.channels:
            raise util.CytoflowOpError(
                'ychannel',
                "ychannel {0} is not in the experiment".format(self.ychannel))

        if len(self.vertices) < 3:
            raise util.CytoflowOpError('vertices',
                                       "Must have at least 3 vertices")

        if any([len(x) != 2 for x in self.vertices]):
            raise util.CytoflowOpError(
                'vertices', "All vertices must be lists or tuples "
                "of length = 2")

        # make sure name got set!
        if not self.name:
            raise util.CytoflowOpError(
                'name', "You have to set the Polygon gate's name "
                "before applying it!")

        # there's a bit of a subtlety here: if the vertices were
        # selected with an interactive plot, and that plot had scaled
        # axes, we need to apply that scale function to both the
        # vertices and the data before looking for path membership
        xscale = util.scale_factory(self.xscale,
                                    experiment,
                                    channel=self.xchannel)
        yscale = util.scale_factory(self.yscale,
                                    experiment,
                                    channel=self.ychannel)

        vertices = [(xscale(x), yscale(y)) for (x, y) in self.vertices]
        data = experiment.data[[self.xchannel, self.ychannel]].copy()
        data[self.xchannel] = xscale(data[self.xchannel])
        data[self.ychannel] = yscale(data[self.ychannel])

        # use a matplotlib Path because testing for membership is a fast C fn.
        path = mpl.path.Path(np.array(vertices))
        xy_data = data[[self.xchannel, self.ychannel]].values

        new_experiment = experiment.clone()
        new_experiment.add_condition(self.name, "bool",
                                     path.contains_points(xy_data))
        new_experiment.history.append(
            self.clone_traits(transient=lambda _: True))

        return new_experiment

    def default_view(self, **kwargs):
        self._selection_view = PolygonSelection(op=self)
        self._selection_view.trait_set(**kwargs)
        return self._selection_view
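The membership test in :meth:`apply` reduces to :meth:`matplotlib.path.Path.contains_points`; a self-contained sketch with made-up vertices and points:

import numpy as np
import matplotlib.path as mpl_path

# hypothetical gate: the unit square
vertices = np.array([(0.0, 0.0), (1.0, 0.0), (1.0, 1.0), (0.0, 1.0)])
path = mpl_path.Path(vertices)

points = np.array([[0.5, 0.5],     # inside the polygon
                   [2.0, 2.0]])    # outside the polygon
print(path.contains_points(points))    # [ True False]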
Example #22
class MATS3DMicroplaneDamage(MATSXDMicroplaneDamage, MATS3DEval):

    implements(IMATSEval)
    # number of spatial dimensions
    #
    n_dim = Constant(3)

    # number of components of engineering tensor representation
    #
    n_eng = Constant(6)

    #-------------------------------------------------------------------------
    # PolarDiscr related data
    #-------------------------------------------------------------------------
    #
    # number of microplanes - currently fixed for 3D
    #
    n_mp = Constant(28)

    # get the normal vectors of the microplanes
    #
    _MPN = Property(depends_on='n_mp')

    @cached_property
    def _get__MPN(self):
        # microplane normals:
        return array([[.577350259, .577350259, .577350259],
                      [.577350259, .577350259, -.577350259],
                      [.577350259, -.577350259, .577350259],
                      [.577350259, -.577350259, -.577350259],
                      [.935113132, .250562787, .250562787],
                      [.935113132, .250562787, -.250562787],
                      [.935113132, -.250562787, .250562787],
                      [.935113132, -.250562787, -.250562787],
                      [.250562787, .935113132, .250562787],
                      [.250562787, .935113132, -.250562787],
                      [.250562787, -.935113132, .250562787],
                      [.250562787, -.935113132, -.250562787],
                      [.250562787, .250562787, .935113132],
                      [.250562787, .250562787, -.935113132],
                      [.250562787, -.250562787, .935113132],
                      [.250562787, -.250562787, -.935113132],
                      [.186156720, .694746614, .694746614],
                      [.186156720, .694746614, -.694746614],
                      [.186156720, -.694746614, .694746614],
                      [.186156720, -.694746614, -.694746614],
                      [.694746614, .186156720, .694746614],
                      [.694746614, .186156720, -.694746614],
                      [.694746614, -.186156720, .694746614],
                      [.694746614, -.186156720, -.694746614],
                      [.694746614, .694746614, .186156720],
                      [.694746614, .694746614, -.186156720],
                      [.694746614, -.694746614, .186156720],
                      [.694746614, -.694746614, -.186156720]])

    # get the weights of the microplanes
    #
    _MPW = Property(depends_on='n_mp')

    @cached_property
    def _get__MPW(self):
        # Note that the values in the array must be multiplied by 6 (cf. [Baz05])!
        # The sum of the array equals 0.5 (cf. [BazLuz04]).
        # The values are given for a Gaussian integration over the unit
        # hemisphere.
        return array([
            .0160714276, .0160714276, .0160714276, .0160714276, .0204744730,
            .0204744730, .0204744730, .0204744730, .0204744730, .0204744730,
            .0204744730, .0204744730, .0204744730, .0204744730, .0204744730,
            .0204744730, .0158350505, .0158350505, .0158350505, .0158350505,
            .0158350505, .0158350505, .0158350505, .0158350505, .0158350505,
            .0158350505, .0158350505, .0158350505
        ]) * 6.0

    #-------------------------------------------------------------------------
    # Cached elasticity tensors
    #-------------------------------------------------------------------------

    elasticity_tensors = Property(
        depends_on='E, nu, dimensionality, stress_state')

    @cached_property
    def _get_elasticity_tensors(self):
        '''
        Initialize the fourth-order elasticity tensor for the 3D case
        '''
        # ----------------------------------------------------------------------------
        # Lame constants calculated from E and nu
        # ----------------------------------------------------------------------------
        E = self.E
        nu = self.nu

        # first Lame parameter
        la = E * nu / ((1 + nu) * (1 - 2 * nu))
        # second Lame parameter (shear modulus)
        mu = E / (2 + 2 * nu)

        # -----------------------------------------------------------------------------------------------------
        # Get the fourth order elasticity and compliance tensors for the 3D-case
        # -----------------------------------------------------------------------------------------------------

        # The following lines correspond to the tensorial expression
        # (using numpy functionality in order to avoid the explicit loops):
        #
        # D4_e_3D = zeros((3,3,3,3),dtype=float)
        # C4_e_3D = zeros((3,3,3,3),dtype=float)
        # delta = identity(3)
        # for i in range(0,3):
        #     for j in range(0,3):
        #         for k in range(0,3):
        #             for l in range(0,3):
        # elasticity tensor (cf. Jir/Baz Inelastic analysis of structures Eq.D25):
        #                 D4_e_3D[i,j,k,l] = la * delta[i,j] * delta[k,l] + \
        #                                    mu * ( delta[i,k] * delta[j,l] + delta[i,l] * delta[j,k] )
        # elastic compliance tensor (cf. Simo, Computational Inelasticity, Eq.(2.7.16) AND (2.1.16)):
        #                 C4_e_3D[i,j,k,l] = (1+nu)/(E) * \
        #                                    ( delta[i,k] * delta[j,l] + delta[i,l]* delta[j,k] ) - \
        #                                    nu / E * delta[i,j] * delta[k,l]
        # NOTE: swapaxes returns a reference not a copy!
        # (the index notation always refers to the initial indexing (i=0,j=1,k=2,l=3))
        delta = identity(3)
        delta_ijkl = outer(delta, delta).reshape(3, 3, 3, 3)
        delta_ikjl = delta_ijkl.swapaxes(1, 2)
        delta_iljk = delta_ikjl.swapaxes(2, 3)
        D4_e_3D = la * delta_ijkl + mu * (delta_ikjl + delta_iljk)
        C4_e_3D = -nu / E * delta_ijkl + \
            (1 + nu) / (2 * E) * (delta_ikjl + delta_iljk)

        # -----------------------------------------------------------------------------------------------------
        # Map the fourth-order elasticity tensor onto its second-order (matrix) representation
        # -----------------------------------------------------------------------------------------------------
        D2_e_3D = self.map_tns4_to_tns2(D4_e_3D)

        return D4_e_3D, C4_e_3D, D2_e_3D

    #-------------------------------------------------------------------------
    # Dock-based view with its own id
    #-------------------------------------------------------------------------
    traits_view = View(Include('polar_fn_group'),
                       dock='tab',
                       id='ibvpy.mats.mats3D.mats_3D_cmdm.MATS3D_cmdm',
                       kind='modal',
                       resizable=True,
                       scrollable=True,
                       width=0.6,
                       height=0.8,
                       buttons=['OK', 'Cancel'])
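The outer/swapaxes construction of the isotropic elasticity tensor above can be checked against its closed-form components; a short sketch with assumed Lame constants:

import numpy as np

la, mu = 1.0, 0.5    # assumed example Lame constants

delta = np.identity(3)
delta_ijkl = np.outer(delta, delta).reshape(3, 3, 3, 3)
delta_ikjl = delta_ijkl.swapaxes(1, 2)
delta_iljk = delta_ikjl.swapaxes(2, 3)
D4 = la * delta_ijkl + mu * (delta_ikjl + delta_iljk)

print(np.isclose(D4[0, 0, 0, 0], la + 2 * mu))   # True: D_1111 = lambda + 2*mu
print(np.isclose(D4[0, 1, 0, 1], mu))            # True: D_1212 = mu
print(np.isclose(D4[0, 0, 1, 1], la))            # True: D_1122 = lambda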
Example #23
class BeadCalibrationOp(HasStrictTraits):
    """
    Calibrate arbitrary channels to molecules-of-fluorophore using fluorescent
    beads (e.g., the Spherotech RCP-30-5A rainbow beads).
    
    Computes a log-linear calibration function that maps arbitrary fluorescence
    units to physical units (i.e., molecules of equivalent fluorophore, or *MEF*).
    
    To use, set :attr:`beads_file` to an FCS file containing events collected *using
    the same cytometer settings as the data you're calibrating*.  Specify which 
    beads you ran by setting :attr:`beads` to match one of the  values of 
    :data:`BeadCalibrationOp.BEADS`; and set :attr:`units` to which channels you 
    want calibrated and in which units.  Then, call :meth:`estimate()` and check the 
    peak-finding with :meth:`default_view().plot()`.  If the peak-finding is wacky, 
    try adjusting :attr:`bead_peak_quantile` and :attr:`bead_brightness_threshold`.  When 
    the peaks are successfully identified, call :meth:`apply` to scale your 
    experimental data set. 
    
    If you can't make the peak finding work, please submit a bug report!
    
    This procedure works best when the beads file is very clean data.  It does
    not do its own gating (maybe a future addition?)  In the meantime, 
    I recommend gating the *acquisition* on the FSC/SSC channels in order
    to get rid of debris, cells, and other noise.
    
    Finally, because you can't have a negative number of fluorescent molecules
    (MEFLs, etc) (as well as for math reasons), this module filters out
    negative values.    
    
    Attributes
    ----------
    units : Dict(Str, Str)
        A dictionary specifying the channels you want calibrated (keys) and
        the units you want them calibrated in (values).  The units must be
        keys of the :attr:`beads` attribute.       
        
    beads_file : File
        A file containing the FCS events from the beads.

    beads : Dict(Str, List(Float))
        The beads' characteristics.  Keys are calibrated units (i.e., MEFL or
        MEAP) and values are ordered lists of known fluorophore levels.  Common
        values for this dict are included in :data:`BeadCalibrationOp.BEADS`.
        
    bead_peak_quantile : Int (default = 80)
        The quantile threshold used to choose bead peaks. 
        
    bead_brightness_threshold : Float (default = 100)
        How bright must a bead peak be to be considered?  
        
    bead_brightness_cutoff : Float
        If a bead peak is above this, then don't consider it.  Takes care of
        clipping saturated detection.  Defaults to 70% of the detector range.
        
    bead_histogram_bins : Int (default = 512)
        The number of bins to use in computing the bead histogram.  Tweak
        this if the peak finding is having difficulty, or if you have a small 
        number of events.
        
    force_linear : Bool (default = False)
        A linear fit in log space doesn't always go through the origin, which 
        means that the calibration function isn't strictly a multiplicative
        scaling operation.  Set :attr:`force_linear` to ``True`` to force such
        behavior.  Keep an eye on the diagnostic plot, though, to see how much
        error you're introducing!
   
           
    Notes
    -----
    The peak finding is rather sophisticated.  
    
    For each channel, a histogram with :attr:`bead_histogram_bins` bins (512 by
    default) is computed on the log-transformed
    bead data, and then the histogram is smoothed with a Savitzky-Golay 
    filter (with a window length of 5 and a polynomial order of 1).  
    
    Next, a wavelet-based peak-finding algorithm is used: it convolves the
    smoothed histogram with a series of wavelets and looks for relative 
    maxima at various length-scales.  The parameters of the smoothing 
    algorithm were arrived at empirically, using beads collected at a wide 
    range of PMT voltages.
    
    Finally, the peaks are filtered by height (the histogram bin has a quantile
    greater than :attr:`bead_peak_quantile`) and intensity (brighter than 
    :attr:`bead_brightness_threshold`).
    
    How to convert from a series of peaks to mean equivalent fluorochrome?
    If there's one peak, we assume that it's the brightest peak.  If there
    are two peaks, we assume they're the brightest two.  If there are ``n >= 3``
    peaks, we check all the contiguous ``n``-subsets of the bead intensities
    and find the one whose linear regression (in log space!) has the smallest
    norm (square-root sum-of-squared-residuals.)
    
    There's a slight subtlety in the fact that we're performing the linear
    regression in log-space: if the relationship in log10-space is ``Y=aX + b``,
    then the same relationship in linear space is ``x = 10**X``, ``y = 10**Y``, and
    ``y = (10**b) * (x ** a)``.

    
    Examples
    --------
    Create a small experiment:
    
    .. plot::
        :context: close-figs
    
        >>> import cytoflow as flow
        >>> import_op = flow.ImportOp()
        >>> import_op.tubes = [flow.Tube(file = "tasbe/rby.fcs")]
        >>> ex = import_op.apply()
    
    Create and parameterize the operation
    
    .. plot::
        :context: close-figs

        >>> bead_op = flow.BeadCalibrationOp()
        >>> beads = "Spherotech RCP-30-5A Lot AA01-AA04, AB01, AB02, AC01, GAA01-R"
        >>> bead_op.beads = flow.BeadCalibrationOp.BEADS[beads]
        >>> bead_op.units = {"Pacific Blue-A" : "MEBFP",
        ...                  "FITC-A" : "MEFL",
        ...                  "PE-Tx-Red-YG-A" : "MEPTR"}
        >>>
        >>> bead_op.beads_file = "tasbe/beads.fcs"
    
    Estimate the model parameters
    
    .. plot::
        :context: close-figs 
    
        >>> bead_op.estimate(ex)
    
    Plot the diagnostic plot
    
    .. plot::
        :context: close-figs

        >>> bead_op.default_view().plot(ex)  

    Apply the operation to the experiment
    
    .. plot::
        :context: close-figs
    
        >>> ex = bead_op.apply(ex)  
        
    """
    
    # traits
    id = Constant('edu.mit.synbio.cytoflow.operations.beads_calibrate')
    friendly_id = Constant("Bead Calibration")
    
    name = Constant("Beads")
    units = Dict(Str, Str)
    
    beads_file = File(exists = True)
    bead_peak_quantile = Int(80)

    bead_brightness_threshold = Float(100.0)
    bead_brightness_cutoff = util.FloatOrNone(None)
    bead_histogram_bins = Int(512)
    
    # TODO - bead_brightness_threshold should probably be different depending
    # on the data range of the input.
    
    force_linear = Bool(False)
    
    beads = Dict(Str, List(Float))

    _histograms = Dict(Str, Any, transient = True)
    _calibration_functions = Dict(Str, Callable, transient = True)
    _peaks = Dict(Str, Any, transient = True)
    _mefs = Dict(Str, Any, transient = True)

    def estimate(self, experiment): 
        """
        Estimate the calibration coefficients from the beads file.
        
        Parameters
        ----------
        experiment : Experiment
            The experiment used to compute the calibration.
            
        """
        if experiment is None:
            raise util.CytoflowOpError('experiment', "No experiment specified")
        
        if not self.beads_file:
            raise util.CytoflowOpError('beads_file', "No beads file specified")

        if not set(self.units.keys()) <= set(experiment.channels):
            raise util.CytoflowOpError('units',
                                       "Specified channels that weren't found in "
                                       "the experiment.")
            
        if not set(self.units.values()) <= set(self.beads.keys()):
            raise util.CytoflowOpError('units',
                                       "Units don't match beads.")
            
        self._histograms.clear()
        self._calibration_functions.clear()
        self._peaks.clear()
        self._mefs.clear()
                        
        # make a little Experiment
        check_tube(self.beads_file, experiment)
        beads_exp = ImportOp(tubes = [Tube(file = self.beads_file)],
                             channels = {experiment.metadata[c]["fcs_name"] : c for c in experiment.channels},
                             name_metadata = experiment.metadata['name_metadata']).apply()
        
        channels = list(self.units.keys())

        # make the histogram
        cutoffs = {}
        for channel in channels:
            data = beads_exp.data[channel]
            
            # TODO - this assumes the data is on a linear scale.  check it!
            data_range = experiment.metadata[channel]['range']

            # save the brightness cutoff per channel; it's needed again in
            # the peak-filtering loop below
            if self.bead_brightness_cutoff is None:
                cutoffs[channel] = 0.7 * data_range
            else:
                cutoffs[channel] = self.bead_brightness_cutoff
                                            
            # bin the data on a log scale

            hist_bins = np.logspace(1, math.log(data_range, 2), num = self.bead_histogram_bins, base = 2)
            hist = np.histogram(data, bins = hist_bins)
            
            # mask off-scale values
            hist[0][0] = 0
            hist[0][-1] = 0
            
            # smooth it with a Savitzky-Golay filter
            hist_smooth = scipy.signal.savgol_filter(hist[0], 5, 1)
            
            self._histograms[channel] = (hist, hist_bins, hist_smooth)

            
        # find peaks
        for channel in channels:
            hist = self._histograms[channel][0]
            hist_bins = self._histograms[channel][1]
            hist_smooth = self._histograms[channel][2]

            peak_bins = scipy.signal.find_peaks_cwt(hist_smooth, 
                                                    widths = np.arange(3, 20),
                                                    max_distances = np.arange(3, 20) / 2)
                                    
            # filter by height and intensity
            peak_threshold = np.percentile(hist_smooth, self.bead_peak_quantile)
            peak_bins_filtered = \
                [x for x in peak_bins if hist_smooth[x] > peak_threshold 
                 and hist[1][x] > self.bead_brightness_threshold
                 and hist[1][x] < cutoffs[channel]]
            
            self._peaks[channel] = [hist_bins[x] for x in peak_bins_filtered]    


        # compute the conversion        
        for channel in channels:
            peaks = self._peaks[channel]
            mef_unit = self.units[channel]
            
            if mef_unit not in self.beads:
                raise util.CytoflowOpError('units',
                                           "Invalid unit {0} specified for channel {1}".format(mef_unit, channel))
            
            # "mean equivalent fluorochrome"
            mef = self.beads[mef_unit]
                                                    
            if len(peaks) == 0:
                raise util.CytoflowOpError(None,
                                           "Didn't find any peaks for channel {}; "
                                           "check the diagnostic plot"
                                           .format(channel))
            elif len(peaks) > len(mef):
                raise util.CytoflowOpError(None,
                                           "Found too many peaks for channel {}; "
                                           "check the diagnostic plot"
                                           .format(channel))
            elif len(peaks) == 1:
                # if we only have one peak, assume it's the brightest peak
                a = mef[-1] / peaks[0]
                self._mefs[channel] = [mef[-1]]
                self._calibration_functions[channel] = lambda x, a=a: a * x
            elif len(peaks) == 2:
                # if we have only two peaks, assume they're the brightest two
                self._mefs[channel] = [mef[-2], mef[-1]]
                a = (mef[-1] - mef[-2]) / (peaks[1] - peaks[0])
                self._calibration_functions[channel] = lambda x, a=a: a * x
            else:
                # if there are n > 2 peaks, check all the contiguous n-subsets
                # of mef for the one whose linear regression with the peaks
                # has the smallest (norm) sum-of-residuals.
                
                # do it in log10 space because otherwise the brightest peaks
                # have an outsized influence.
                                
                best_resid = np.inf
                for start, end in [(x, x+len(peaks)) for x in range(len(mef) - len(peaks) + 1)]:
                    mef_subset = mef[start:end]
                    
                    # linear regression of the peak locations against mef subset
                    lr = np.polyfit(np.log10(peaks), 
                                    np.log10(mef_subset), 
                                    deg = 1, 
                                    full = True)
                                        
                    resid = lr[1][0]
                    if resid < best_resid:
                        best_lr = lr[0]
                        best_resid = resid
                        self._mefs[channel] = mef_subset
   
                if self.force_linear:
                    # if we're forcing a linear scale for the calibration
                    # function, find that scale with an optimization.  (we can't
                    # use this above, to find the MEFs from the peaks, because
                    # when i tried it mis-identified the proper subset.)
                    
                    # even though this keeps things a linear scale, it can
                    # actually introduce *more* errors because "blank" beads
                    # still fluoresce.
                    
                    def s(x):
                        p = np.multiply(self._peaks[channel], x)
                        return np.sum(np.abs(np.subtract(p, self._mefs[channel])))
                    
                    res = scipy.optimize.minimize(s, [1])
                    
                    a = res.x[0]
                    self._calibration_functions[channel] = \
                        lambda x, a=a: a * x
                              
                else:              
                    # remember, these (linear) coefficients came from logspace, so 
                    # if the relationship in log10 space is Y = aX + b, then in
                    # linear space the relationship is x = 10**X, y = 10**Y,
                    # and y = (10**b) * x ^ a
                    
                    # also remember that the result of np.polyfit is a list of
                    # coefficients with the highest power first!  so if we
                    # solve y=ax + b, coeff #0 is a and coeff #1 is b
                    
                    a = best_lr[0]
                    b = 10 ** best_lr[1]
                    self._calibration_functions[channel] = \
                        lambda x, a=a, b=b: b * np.power(x, a)


    def apply(self, experiment):
        """
        Applies the bead calibration to an experiment.
        
        Parameters
        ----------
        experiment : Experiment
            the experiment to which this operation is applied
            
        Returns
        -------
        Experiment 
            A new experiment with the specified channels calibrated in
            physical units.  The calibrated channels also have new metadata:
            
            - **bead_calibration_fn** : Callable (pandas.Series --> pandas.Series)
                The function to calibrate raw data to bead units
        
            - **bead_units** : String
                The units this channel was calibrated to
        """
        
        if experiment is None:
            raise util.CytoflowOpError('experiment', "No experiment specified")
        
        channels = list(self.units.keys())

        if not self.units:
            raise util.CytoflowOpError('units', "No channels to calibrate.")
        
        if not self._calibration_functions:
            raise util.CytoflowOpError(None,
                                       "Calibration not found. "
                                       "Did you forget to call estimate()?")
        
        if not set(channels) <= set(experiment.channels):
            raise util.CytoflowOpError('units',
                                       "Module units don't match experiment channels")
                
        if set(channels) != set(self._calibration_functions.keys()):
            raise util.CytoflowOpError('units',
                                       "Calibration doesn't match units. "
                                       "Did you forget to call estimate()?")

        # two things.  first, you can't raise a negative value to a non-integer
        # power.  second, negative physical units don't make sense -- how can
        # you have the equivalent of -5 molecules of fluorescein?  so,
        # we filter out negative values here.

        new_experiment = experiment.clone()
        
        for channel in channels:
            new_experiment.data = \
                new_experiment.data[new_experiment.data[channel] > 0]
                                
        new_experiment.data.reset_index(drop = True, inplace = True)
        
        for channel in channels:
            calibration_fn = self._calibration_functions[channel]
            
            new_experiment[channel] = calibration_fn(new_experiment[channel])
            new_experiment.metadata[channel]['bead_calibration_fn'] = calibration_fn
            new_experiment.metadata[channel]['bead_units'] = self.units[channel]
            if 'range' in experiment.metadata[channel]:
                new_experiment.metadata[channel]['range'] = calibration_fn(experiment.metadata[channel]['range'])
            if 'voltage' in experiment.metadata[channel]:
                del new_experiment.metadata[channel]['voltage']
            
        new_experiment.history.append(self.clone_traits(transient = lambda t: True)) 
        return new_experiment
    
    def default_view(self, **kwargs):
        """
        Returns a diagnostic plot to see if the peak finding is working.
        
        Returns
        -------
        IView
            A diagnostic view; call :meth:`~BeadCalibrationDiagnostic.plot` to 
            see the diagnostic plots
        """

        v = BeadCalibrationDiagnostic(op = self)
        v.trait_set(**kwargs)
        return v
    
    BEADS = \
    {
     # from http://www.spherotech.com/RCP-30-5A%20%20Rev%20K%20ML%23%20073112%20Rev.%20B.xls
     "Spherotech RCP-30-5A Lot AK02, AK03, AK04" :
        { "MECSB" :   [205,   470,   1211,   2740,  7516,  20122,  35573],
          "MEBFP" :   [844,   1958,  5422,   13522, 42717, 153501, 420359],
          "MEFL"  :   [771,   2106,  6262,   15183, 45292, 136258, 291042],
          "MEPE"  :   [487,   1474,  4516,   11260, 34341, 107608, 260461],
          "MEPTR" :   [205,   643,   2021,   5278,  17018, 62451,  198933],
          "MECY"  :   [1414,  3809,  10852,  27904, 85866, 324106, 1040895],
          "MECY7" :   [12752, 39057, 142958, 448890],
          "MEAP"  :   [341,   1027,  3156,   7750,  23446, 68702,  116813],
          "MEAPCY7" : [173,   427,   1097,   2399,  6359,  17475,  30725]},
        
     # from http://www.spherotech.com/RCP-30-5a%20%20rev%20H%20ML%20071712.xls
     "Spherotech RCP-30-5A Lot AG01, AF02, AD04 and AAE01" :
        { "MECSB" :   [216,   464,   1232,   2940,  7669,  19812,  35474],
          "MEBFP" :   [861,   1997,  5776,   15233, 45389, 152562, 396759],
          "MEFL" :    [792,   2079,  6588,   16471, 47497, 137049, 271647],
          "MEPE" :    [531,   1504,  4819,   12506, 36159, 109588, 250892],
          "MEPTR" :   [233,   669,   2179,   5929,  18219, 63944,  188785],
          "MECY" :    [1614,  4035,  12025,  31896, 95682, 353225, 1077421],
          "MEPCY7" :  [14916, 42336, 153840, 494263],
          "MEAP" :    [373,   1079,  3633,   9896,  28189, 79831,  151008],
          "MEAPCY7" : [2864,  7644,  19081,  37258]},
     # from http://www.spherotech.com/RCP-30-5a%20%20rev%20G.2.xls
     "Spherotech RCP-30-5A Lot AA01-AA04, AB01, AB02, AC01, GAA01-R":
        { "MECSB" :   [179,   400,    993,   3203,  6083,  17777,  36331],
          "MEBFP" :   [700,   1705,   4262,  17546, 35669, 133387, 412089],
          "MEFL" :    [692,   2192,   6028,  17493, 35674, 126907, 290983],
          "MEPE" :    [505,   1777,   4974,  13118, 26757, 94930,  250470],
          "MEPTR" :   [207,   750,    2198,  6063,  12887, 51686,  170219],
          "MECY" :    [1437,  4693,   12901, 36837, 76621, 261671, 1069858],
          "MEPCY7" :  [32907, 107787, 503797],
          "MEAP" :    [587,   2433,   6720,  17962, 30866, 51704,  146080],
          "MEAPCY7" : [718,   1920,   5133,  9324,  14210, 26735]},
    "Spherotech URCP-100-2H (9 peaks)":
        {
          "MEFL" :    [3531, 11373, 34643, 107265, 324936, 835306,  2517654, 6069240],
          "MEPE" :    [2785, 9525,  28421, 90313,  275589, 713181,  2209251, 5738784],
          "MEPTR" :   [1158, 4161,  12528, 41140,  130347, 344149,  1091393, 2938710],
          "MEPCY" :   [6501, 20302, 59517, 183870, 550645, 1569470, 5109318, 17854584],
          "MEPCY7" :  [4490, 10967, 30210, 87027,  283621, 975312,  4409101, 24259524],
          "MEAP" :    [369,  749,   3426,  10413,  50013,  177490,  500257,  1252120],
          "MEAPCY7" : [1363, 2656,  9791,  25120,  96513,  328967,  864905,  2268931],
          "MECSB" :   [989,  2959,  8277,  25524,  71603,  173069,  491388,  1171641],
          "MEBFP" :   [1957, 5579,  16005, 53621,  168302, 459809,  1581762, 4999251]}}
    """
Example #24
class GaussianMixtureOp(HasStrictTraits):
    """
    This module fits a Gaussian mixture model with a specified number of
    components to one or more channels.
    
    If :attr:`num_components` ``> 1``, :meth:`apply` creates a new categorical 
    metadata variable named ``name``, with possible values ``{name}_1`` ... 
    ``{name}_n`` where ``n`` is the number of components.  An event is assigned to 
    ``name_i`` category if it has the highest posterior probability of having been 
    produced by component ``i``.  If an event has a value that is outside the
    range of one of the channels' scales, then it is assigned to ``{name}_None``.
    
    Optionally, if :attr:`sigma` is greater than 0, :meth:`apply` creates new  
    ``boolean`` metadata variables named ``{name}_1`` ... ``{name}_n`` where 
    ``n`` is the number of components.  The column ``{name}_i`` is ``True`` if 
    the event is less than :attr:`sigma` standard deviations from the mean of 
    component ``i``.  If :attr:`num_components` is ``1``, :attr:`sigma` must be 
    greater than 0.
    
    Optionally, if :attr:`posteriors` is ``True``, :meth:`apply` creates a new 
    ``double`` metadata variables named ``{name}_1_posterior`` ... 
    ``{name}_n_posterior`` where ``n`` is the number of components.  The column 
    ``{name}_i_posterior`` contains the posterior probability that this event is 
    a member of component ``i``.
    
    Finally, the same mixture model (mean and standard deviation) may not
    be appropriate for every subset of the data.  If this is the case, you
    can use the :attr:`by` attribute to specify metadata by which to aggregate
    the data before estimating (and applying) a mixture model.  The number of 
    components must be the same across each subset, though.
    
    
    Attributes
    ----------
    name : Str
        The operation name; determines the name of the new metadata column
        
    channels : List(Str)
        The channels to apply the mixture model to.

    scale : Dict(Str : {"linear", "logicle", "log"})
        Re-scale the data in the specified channels before fitting.  If a 
        channel is in :attr:`channels` but not in :attr:`scale`, the current 
        package-wide default (set with :func:`~.set_default_scale`) is used.

    num_components : Int (default = 1)
        How many components to fit to the data?  Must be a positive integer.

    sigma : Float (default = 0.0)
        How many standard deviations on either side of the mean to include
        in the boolean variable ``{name}_i``?  Must be ``>= 0.0``.  If 
        :attr:`num_components` is ``1``, must be ``> 0``.
    
    by : List(Str)
        A list of metadata attributes to aggregate the data before estimating
        the model.  For example, if the experiment has two pieces of metadata,
        ``Time`` and ``Dox``, setting :attr:`by` to ``["Time", "Dox"]`` will fit 
        the model separately to each subset of the data with a unique combination of
        ``Time`` and ``Dox``.

    posteriors : Bool (default = False)
        If ``True``, add columns named ``{name}_{i}_posterior`` giving the 
        posterior probability that the event is in component ``i``.  Useful for 
        filtering out low-probability events.
        
    Notes
    -----
    
    We use the Mahalanobis distance as a multivariate generalization of the 
    number of standard deviations an event is from the mean of the multivariate
    Gaussian.  If :math:`\\vec{x}` is an observation from a distribution with 
    mean :math:`\\vec{\\mu}` and :math:`S` is the covariance matrix, then the 
    Mahalanobis distance is :math:`\\sqrt{(x - \\mu)^T \\cdot S^{-1} \\cdot (x - \\mu)}`.
    
    Examples
    --------
    
    .. plot::
        :context: close-figs
        
        Make a little data set.
    
        >>> import cytoflow as flow
        >>> import_op = flow.ImportOp()
        >>> import_op.tubes = [flow.Tube(file = "Plate01/RFP_Well_A3.fcs",
        ...                              conditions = {'Dox' : 10.0}),
        ...                    flow.Tube(file = "Plate01/CFP_Well_A4.fcs",
        ...                              conditions = {'Dox' : 1.0})]
        >>> import_op.conditions = {'Dox' : 'float'}
        >>> ex = import_op.apply()
    
    Create and parameterize the operation.
    
    .. plot::
        :context: close-figs
        
        >>> gm_op = flow.GaussianMixtureOp(name = 'Gauss',
        ...                                channels = ['Y2-A'],
        ...                                scale = {'Y2-A' : 'log'},
        ...                                num_components = 2)
        
    Estimate the clusters
    
    .. plot::
        :context: close-figs
        
        >>> gm_op.estimate(ex)
        
    Plot a diagnostic view
    
    .. plot::
        :context: close-figs
        
        >>> gm_op.default_view().plot(ex)

    Apply the gate
    
    .. plot::
        :context: close-figs
        
        >>> ex2 = gm_op.apply(ex)

    Plot a diagnostic view with the event assignments
    
    .. plot::
        :context: close-figs
        
        >>> gm_op.default_view().plot(ex2)
        
    And with two channels:
    
    .. plot::
        :context: close-figs
        
        >>> gm_op = flow.GaussianMixtureOp(name = 'Gauss',
        ...                                channels = ['V2-A', 'Y2-A'],
        ...                                scale = {'V2-A' : 'log',
        ...                                         'Y2-A' : 'log'},
        ...                                num_components = 2)
        >>> gm_op.estimate(ex)   
        >>> ex2 = gm_op.apply(ex)
        >>> gm_op.default_view().plot(ex2)
        
    """

    id = Constant('edu.mit.synbio.cytoflow.operations.gaussian')
    friendly_id = Constant("Gaussian Mixture Model")

    name = CStr()
    channels = List(Str)
    scale = Dict(Str, util.ScaleEnum)
    num_components = util.PositiveInt(1, allow_zero=False)
    sigma = util.PositiveFloat(allow_zero=True)
    by = List(Str)

    posteriors = Bool(False)

    # the key is either a single value or a tuple
    _gmms = Dict(Any,
                 Instance(sklearn.mixture.GaussianMixture),
                 transient=True)
    _scale = Dict(Str, Instance(util.IScale), transient=True)

    def estimate(self, experiment, subset=None):
        """
        Estimate the Gaussian mixture model parameters
        
        Parameters
        ----------
        experiment : Experiment
            The data to use to estimate the mixture parameters
            
        subset : str (default = None)
            If set, a Python expression to determine the subset of the data
            to use in the estimation.
        """

        if experiment is None:
            raise util.CytoflowOpError('experiment', "No experiment specified")

        if len(self.channels) == 0:
            raise util.CytoflowOpError('channels',
                                       "Must set at least one channel")

        for c in self.channels:
            if c not in experiment.data:
                raise util.CytoflowOpError(
                    'channels',
                    "Channel {0} not found in the experiment".format(c))

        for c in self.scale:
            if c not in self.channels:
                raise util.CytoflowOpError(
                    'channels', "Scale set for channel {0}, but it isn't "
                    "in the experiment".format(c))

        for b in self.by:
            if b not in experiment.data:
                raise util.CytoflowOpError(
                    'by', "Aggregation metadata {} not found, "
                    "must be one of {}".format(b, experiment.conditions))

        if subset:
            try:
                experiment = experiment.query(subset)
            except Exception:
                raise util.CytoflowViewError(
                    'subset', "Subset string '{0}' isn't valid".format(subset))

            if len(experiment) == 0:
                raise util.CytoflowViewError(
                    'subset',
                    "Subset string '{0}' returned no events".format(subset))

        if self.by:
            groupby = experiment.data.groupby(self.by)
        else:
            # use a lambda expression to return a group that contains
            # all the events
            groupby = experiment.data.groupby(lambda _: True)

        # get the scale. estimate the scale params for the ENTIRE data set,
        # not subsets we get from groupby().  And we need to save it so that
        # the data is transformed the same way when we apply()
        for c in self.channels:
            if c in self.scale:
                self._scale[c] = util.scale_factory(self.scale[c],
                                                    experiment,
                                                    channel=c)
            else:
                self._scale[c] = util.scale_factory(util.get_default_scale(),
                                                    experiment,
                                                    channel=c)

        gmms = {}

        for group, data_subset in groupby:
            if len(data_subset) == 0:
                raise util.CytoflowOpError(
                    None, "Group {} had no data".format(group))
            x = data_subset.loc[:, self.channels[:]]
            for c in self.channels:
                x[c] = self._scale[c](x[c])

            # drop data that isn't in the scale range
            for c in self.channels:
                x = x[~(np.isnan(x[c]))]
            x = x.values

            gmm = sklearn.mixture.GaussianMixture(
                n_components=self.num_components,
                covariance_type="full",
                random_state=1)
            gmm.fit(x)

            if not gmm.converged_:
                raise util.CytoflowOpError(
                    None, "Estimator didn't converge"
                    " for group {0}".format(group))

            # in the 1D version, we sorted the components by the means -- so
            # the first component has the lowest mean, the second component
            # has the next-lowest mean, etc.

            # that doesn't work in the general case.  instead, we assume that
            # the clusters are likely (?) to be arranged along *one* of the
            # axes, so we take the |norm| of the mean of each cluster and
            # sort that way.

            norms = np.sum(gmm.means_**2, axis=1)**0.5
            sort_idx = np.argsort(norms)
            gmm.means_ = gmm.means_[sort_idx]
            gmm.weights_ = gmm.weights_[sort_idx]
            gmm.covariances_ = gmm.covariances_[sort_idx]
            gmm.precisions_ = gmm.precisions_[sort_idx]
            gmm.precisions_cholesky_ = gmm.precisions_cholesky_[sort_idx]

            gmms[group] = gmm

        self._gmms = gmms

    def apply(self, experiment):
        """
        Assigns new metadata to events using the mixture model estimated
        in :meth:`estimate`.
        
        Returns
        -------
        Experiment
            A new :class:`.Experiment` with the new condition variables as
            described in the class documentation.  Also adds the following
            new statistics:
            
            - **mean** : Float
                the mean of the fitted gaussian in each channel for each component.
                
            - **sigma** : (Float, Float)
                the locations the mean +/- one standard deviation in each channel
                for each component.
                
            - **correlation** : Float
                the correlation coefficient between each pair of channels for each
                component.
                
            - **proportion** : Float
                the proportion of events in each component of the mixture model.  only
                added if :attr:`num_components` ``> 1``.
        """

        if experiment is None:
            raise util.CytoflowOpError('experiment', "No experiment specified")

        if len(self.channels) == 0:
            raise util.CytoflowOpError('channels',
                                       "Must set at least one channel")

        # make sure name got set!
        if not self.name:
            raise util.CytoflowOpError(
                'name', "You have to set the gate's name "
                "before applying it!")

        if self.name != util.sanitize_identifier(self.name):
            raise util.CytoflowOpError(
                'name',
                "Name can only contain letters, numbers and underscores.")

        if self.num_components > 1 and self.name in experiment.data.columns:
            raise util.CytoflowOpError(
                'name',
                "Experiment already has a column named {0}".format(self.name))

        if self.sigma > 0:
            for i in range(1, self.num_components + 1):
                cname = "{}_{}".format(self.name, i)
                if cname in experiment.data.columns:
                    raise util.CytoflowOpError(
                        'name',
                        "Experiment already has a column named {}".format(
                            cname))

        if self.posteriors:
            for i in range(1, self.num_components + 1):
                cname = "{}_{}_posterior".format(self.name, i)
                if cname in experiment.data.columns:
                    raise util.CytoflowOpError(
                        'name',
                        "Experiment already has a column named {}".format(
                            cname))

        if not self._gmms:
            raise util.CytoflowOpError(
                None, "No components found.  Did you forget to "
                "call estimate()?")

        for c in self.channels:
            if c not in self._scale:
                raise util.CytoflowOpError(
                    None, "Model scale not set.  Did you forget "
                    "to call estimate()?")

        for c in self.channels:
            if c not in experiment.channels:
                raise util.CytoflowOpError(
                    'channels',
                    "Channel {0} not found in the experiment".format(c))

        for b in self.by:
            if b not in experiment.conditions:
                raise util.CytoflowOpError(
                    'by', "Aggregation metadata {} not found, "
                    "must be one of {}".format(b, experiment.conditions))

        if self.num_components == 1 and self.posteriors:
            warn("If num_components == 1, all posteriors will be 1",
                 util.CytoflowOpWarning)

        if self.num_components > 1:
            event_assignments = pd.Series(["{}_None".format(self.name)] *
                                          len(experiment),
                                          dtype="object")

        if self.sigma > 0:
            event_gate = {
                i: pd.Series([False] * len(experiment), dtype="bool")
                for i in range(self.num_components)
            }

        if self.posteriors:
            event_posteriors = {
                i: pd.Series([0.0] * len(experiment), dtype="double")
                for i in range(self.num_components)
            }

        if self.by:
            groupby = experiment.data.groupby(self.by)
        else:
            # use a lambda expression to return a group that
            # contains all the events
            groupby = experiment.data.groupby(lambda _: True)

        # make the statistics
        components = [x + 1 for x in range(self.num_components)]

        prop_idx = pd.MultiIndex.from_product(
            [experiment[x].unique() for x in self.by] + [components],
            names=list(self.by) + ["Component"])
        prop_stat = pd.Series(name="{} : {}".format(self.name, "proportion"),
                              index=prop_idx,
                              dtype=np.dtype(object)).sort_index()

        mean_idx = pd.MultiIndex.from_product(
            [experiment[x].unique()
             for x in self.by] + [components] + [self.channels],
            names=list(self.by) + ["Component"] + ["Channel"])
        mean_stat = pd.Series(name="{} : {}".format(self.name, "mean"),
                              index=mean_idx,
                              dtype=np.dtype(object)).sort_index()
        sigma_stat = pd.Series(name="{} : {}".format(self.name, "sigma"),
                               index=mean_idx,
                               dtype=np.dtype(object)).sort_index()
        interval_stat = pd.Series(name="{} : {}".format(self.name, "interval"),
                                  index=mean_idx,
                                  dtype=np.dtype(object)).sort_index()

        corr_idx = pd.MultiIndex.from_product(
            [experiment[x].unique() for x in self.by] + [components] +
            [self.channels] + [self.channels],
            names=list(self.by) + ["Component"] + ["Channel_1"] +
            ["Channel_2"])
        corr_stat = pd.Series(name="{} : {}".format(self.name, "correlation"),
                              index=corr_idx,
                              dtype=np.dtype(object)).sort_index()

        for group, data_subset in groupby:
            if group not in self._gmms:
                # there weren't any events in this group, so we didn't get
                # a gmm.
                continue

            gmm = self._gmms[group]
            x = data_subset.loc[:, self.channels[:]]
            for c in self.channels:
                x[c] = self._scale[c](x[c])

            # which values are missing?

            x_na = pd.Series([False] * len(x))
            for c in self.channels:
                x_na[np.isnan(x[c]).values] = True

            x = x.values
            x_na = x_na.values
            group_idx = groupby.groups[group]

            if self.num_components > 1:
                predicted = np.full(len(x), -1, "int")
                predicted[~x_na] = gmm.predict(x[~x_na])

                predicted_str = pd.Series(["(none)"] * len(predicted))
                for c in range(0, self.num_components):
                    predicted_str[predicted == c] = "{0}_{1}".format(
                        self.name, c + 1)
                predicted_str[predicted == -1] = "{0}_None".format(self.name)
                predicted_str.index = group_idx

                event_assignments.iloc[group_idx] = predicted_str

            # if we're doing sigma-based gating, for each component check
            # to see if the event is in the sigma gate.
            if self.sigma > 0.0:
                for c in range(self.num_components):
                    s = np.linalg.pinv(gmm.covariances_[c])
                    mu = gmm.means_[c]

                    # compute the Mahalanobis distance

                    f = lambda x, mu, s: np.dot(np.dot((x - mu).T, s),
                                                (x - mu))
                    dist = np.apply_along_axis(f, 1, x, mu, s)

                    # come up with a threshold based on sigma.  you'll note we
                    # didn't sqrt dist: that's because for a multivariate
                    # Gaussian, the square of the Mahalanobis distance is
                    # chi-square distributed

                    p = (scipy.stats.norm.cdf(self.sigma) - 0.5) * 2
                    thresh = scipy.stats.chi2.ppf(p, 1)
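                    # (with df = 1, chi2.ppf(p, 1) works out to sigma ** 2, so
                    # the gate keeps events whose Mahalanobis distance from the
                    # component mean is at most `sigma`)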

                    event_gate[c].iloc[group_idx] = np.less_equal(dist, thresh)

            if self.posteriors:
                p = np.full((len(x), self.num_components), 0.0)
                p[~x_na] = gmm.predict_proba(x[~x_na])
                for c in range(self.num_components):
                    event_posteriors[c].iloc[group_idx] = p[:, c]

            for c in range(self.num_components):
                if len(self.by) == 0:
                    g = tuple([c + 1])
                elif hasattr(group, '__iter__') and not isinstance(
                        group, (str, bytes)):
                    g = tuple(list(group) + [c + 1])
                else:
                    g = tuple([group] + [c + 1])

                prop_stat.at[g] = gmm.weights_[c]

                for cidx1, channel1 in enumerate(self.channels):
                    g2 = tuple(list(g) + [channel1])
                    mean_stat.at[g2] = self._scale[channel1].inverse(
                        gmm.means_[c, cidx1])

                    s, corr = util.cov2corr(gmm.covariances_[c])
                    sigma_stat.at[g2] = self._scale[channel1].inverse(s[cidx1])
                    interval_stat.at[g2] = (
                        self._scale[channel1].inverse(gmm.means_[c, cidx1] -
                                                      s[cidx1]),
                        self._scale[channel1].inverse(gmm.means_[c, cidx1] +
                                                      s[cidx1]))

                    for cidx2, channel2 in enumerate(self.channels):
                        g3 = tuple(list(g2) + [channel2])
                        corr_stat.at[g3] = corr[cidx1, cidx2]

                    # drop the self-correlation entry; it's 1.0 by definition
                    corr_stat.drop(tuple(list(g2) + [channel1]), inplace=True)

        new_experiment = experiment.clone()

        if self.num_components > 1:
            new_experiment.add_condition(self.name, "category",
                                         event_assignments)

        if self.sigma > 0:
            for c in range(self.num_components):
                gate_name = "{}_{}".format(self.name, c + 1)
                new_experiment.add_condition(gate_name, "bool", event_gate[c])

        if self.posteriors:
            for c in range(self.num_components):
                post_name = "{}_{}_posterior".format(self.name, c + 1)
                new_experiment.add_condition(post_name, "double",
                                             event_posteriors[c])

        new_experiment.statistics[(self.name,
                                   "mean")] = pd.to_numeric(mean_stat)
        new_experiment.statistics[(self.name, "sigma")] = sigma_stat
        new_experiment.statistics[(self.name, "interval")] = interval_stat
        if len(corr_stat) > 0:
            new_experiment.statistics[(
                self.name, "correlation")] = pd.to_numeric(corr_stat)
        if self.num_components > 1:
            new_experiment.statistics[(
                self.name, "proportion")] = pd.to_numeric(prop_stat)

        new_experiment.history.append(
            self.clone_traits(transient=lambda _: True))
        return new_experiment

    def default_view(self, **kwargs):
        """
        Returns a diagnostic plot of the Gaussian mixture model.

         
        Returns
        -------
            IView : an IView, call plot() to see the diagnostic plot.
        """
        channels = kwargs.pop('channels', self.channels)
        scale = kwargs.pop('scale', self.scale)

        for c in channels:
            if c not in self.channels:
                raise util.CytoflowViewError(
                    'channels',
                    "Channel {} isn't in the operation's channels".format(c))

        for s in scale:
            if s not in self.channels:
                raise util.CytoflowViewError(
                    'scale',
                    "Channel {} isn't in the operation's channels".format(s))

        for c in channels:
            if c not in scale:
                scale[c] = util.get_default_scale()

        if len(channels) == 0:
            raise util.CytoflowViewError(
                'channels',
                "Must specify at least one channel for a default view")
        elif len(channels) == 1:
            v = GaussianMixture1DView(op=self)
            v.trait_set(channel=channels[0],
                        scale=scale[channels[0]],
                        **kwargs)
            return v

        elif len(channels) == 2:
            v = GaussianMixture2DView(op=self)
            v.trait_set(xchannel=channels[0],
                        ychannel=channels[1],
                        xscale=scale[channels[0]],
                        yscale=scale[channels[1]],
                        **kwargs)
            return v

        else:
            raise util.CytoflowViewError(
                'channels',
                "Can't specify more than two channels for a default view")
class BleedthroughPiecewiseDiagnostic(HasStrictTraits):
    """
    Plots a scatterplot of each channel vs every other channel and the 
    bleedthrough spline
    
    Attributes
    ----------
    name : Str
        The instance name (for serialization, UI etc.)
    
    op : Instance(BleedthroughPiecewiseOp)
        The op whose parameters we're viewing
        
    """

    # traits
    id = Constant(
        "edu.mit.synbio.cytoflow.view.bleedthroughpiecewisediagnostic")
    friendly_id = Constant("Bleedthrough Piecewise Diagnostic")

    name = Str
    subset = Str

    # TODO - why can't I use BleedthroughPiecewiseOp here?
    op = Instance(IOperation)

    def plot(self, experiment=None, **kwargs):
        """Plot a faceted histogram view of a channel"""

        if experiment is None:
            raise util.CytoflowViewError("No experiment specified")

        if not self.op.controls:
            raise util.CytoflowViewError("No controls specified")

        if not self.op._splines:
            raise util.CytoflowViewError(
                "No splines. Did you forget to call estimate()?")

        kwargs.setdefault('histtype', 'stepfilled')
        kwargs.setdefault('alpha', 0.5)
        kwargs.setdefault('antialiased', True)

        plt.figure()

        channels = list(self.op._splines.keys())
        num_channels = len(channels)

        for from_idx, from_channel in enumerate(channels):
            for to_idx, to_channel in enumerate(channels):
                if from_idx == to_idx:
                    continue

                # make a little Experiment
                check_tube(self.op.controls[from_channel], experiment)
                tube_exp = ImportOp(
                    tubes=[Tube(file=self.op.controls[from_channel])],
                    channels={
                        experiment.metadata[c]["fcs_name"]: c
                        for c in experiment.channels
                    },
                    name_metadata=experiment.metadata['name_metadata'],
                    events=10000).apply()

                # apply previous operations
                for op in experiment.history:
                    tube_exp = op.apply(tube_exp)

                # subset it
                if self.subset:
                    try:
                        tube_exp = tube_exp.query(self.subset)
                    except Exception as e:
                        raise util.CytoflowOpError(
                            "Subset string '{0}' isn't valid".format(
                                self.subset)) from e

                    if len(tube_exp.data) == 0:
                        raise util.CytoflowOpError(
                            "Subset string '{0}' returned no events".format(
                                self.subset))

                # get scales
                xscale = util.scale_factory("logicle",
                                            tube_exp,
                                            channel=from_channel)
                yscale = util.scale_factory("logicle",
                                            tube_exp,
                                            channel=to_channel)

                tube_data = tube_exp.data

                plt.subplot(num_channels, num_channels,
                            from_idx + (to_idx * num_channels) + 1)
                plt.xscale('logicle', **xscale.mpl_params)
                plt.yscale('logicle', **yscale.mpl_params)
                plt.xlabel(from_channel)
                plt.ylabel(to_channel)
                plt.scatter(tube_data[from_channel],
                            tube_data[to_channel],
                            alpha=0.5,
                            s=1,
                            marker='o')

                spline = self.op._splines[from_channel][to_channel]
                xs = np.logspace(-1, math.log(tube_data[from_channel].max(),
                                              10))

                plt.plot(xs, spline(xs), 'g-', lw=3)

        plt.tight_layout(pad=0.8)
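
# A minimal usage sketch (hypothetical names; `op` must already have had its
# splines estimated, or plot() raises a CytoflowViewError):
#
#   view = BleedthroughPiecewiseDiagnostic(op=op)
#   view.plot(ex)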
Example #26
class GaussianMixture1DView(By1DView, AnnotatingView, HistogramView):
    """
    A default view for :class:`GaussianMixtureOp` that plots the histogram
    of a single channel, then the estimated Gaussian distributions on top of it.
    
    Attributes
    ----------    

    """

    id = Constant('edu.mit.synbio.cytoflow.view.gaussianmixture1dview')
    friendly_id = Constant("1D Gaussian Mixture Diagnostic Plot")

    channel = Str
    scale = util.ScaleEnum

    def plot(self, experiment, **kwargs):
        """
        Plot the histogram of the channel, with each component's estimated
        Gaussian overlaid.
        
        Parameters
        ----------
        """

        if experiment is None:
            raise util.CytoflowViewError('experiment',
                                         "No experiment specified")

        if self.op.num_components == 1:
            annotation_facet = self.op.name + "_1"
        else:
            annotation_facet = self.op.name

        view, trait_name = self._strip_trait(annotation_facet)

        if self.channel in self.op._scale:
            scale = self.op._scale[self.channel]
        else:
            scale = util.scale_factory(self.scale,
                                       experiment,
                                       channel=self.channel)

        super(GaussianMixture1DView,
              view).plot(experiment,
                         annotation_facet=annotation_facet,
                         annotation_trait=trait_name,
                         annotations=self.op._gmms,
                         scale=scale,
                         **kwargs)

    def _annotation_plot(self, axes, annotation, annotation_facet,
                         annotation_value, annotation_color, **kwargs):

        # annotation is an instance of mixture.GaussianMixture
        gmm = annotation

        if annotation_value is None:
            for i in range(len(gmm.means_)):
                self._annotation_plot(axes, annotation, annotation_facet, i,
                                      annotation_color, **kwargs)
            return
        elif isinstance(annotation_value, str):
            try:
                idx_re = re.compile(annotation_facet + r'_(\d+)')
                idx = idx_re.match(annotation_value).group(1)
                idx = int(idx) - 1
            except Exception:
                return
        elif isinstance(annotation_value, np.bool_):
            if annotation_value:
                idx = 0
            else:
                return
        else:
            idx = annotation_value

        kwargs.setdefault('orientation', 'vertical')

        if kwargs['orientation'] == 'horizontal':
            scale = kwargs['yscale']
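            # total up the (scaled) area of the histogram's patches; the
            # component's pdf is drawn scaled by this area times the
            # component's mixture weight, so it overlays the histogram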
            patch_area = 0.0

            for k in range(0, len(axes.patches)):
                patch = axes.patches[k]
                if isinstance(patch, Polygon):
                    xy = patch.get_xy()
                    patch_area += poly_area([scale(p[1]) for p in xy],
                                            [p[0] for p in xy])
                elif isinstance(patch, Rectangle):
                    for xy in patch.get_path().to_polygons():
                        patch_area += poly_area([p[1] for p in xy],
                                                [p[0] for p in xy])

            plt_min, plt_max = plt.gca().get_ylim()
            y = scale.inverse(
                np.linspace(scale(scale.clip(plt_min)),
                            scale(scale.clip(plt_max)), 500))
            pdf_scale = patch_area * gmm.weights_[idx]
            mean = gmm.means_[idx][0]
            stdev = np.sqrt(gmm.covariances_[idx][0])
            x = scipy.stats.norm.pdf(scale(y), mean, stdev) * pdf_scale
            axes.plot(x, y, color=annotation_color)
        else:
            scale = kwargs['xscale']
            patch_area = 0.0

            for k in range(0, len(axes.patches)):
                patch = axes.patches[k]
                if isinstance(patch, Polygon):
                    xy = patch.get_xy()
                    patch_area += poly_area([scale(p[0]) for p in xy],
                                            [p[1] for p in xy])
                elif isinstance(patch, Rectangle):
                    for xy in patch.get_path().to_polygons():
                        patch_area += poly_area([p[0] for p in xy],
                                                [p[1] for p in xy])

            plt_min, plt_max = plt.gca().get_xlim()
            x = scale.inverse(
                np.linspace(scale(scale.clip(plt_min)),
                            scale(scale.clip(plt_max)), 500))
            pdf_scale = patch_area * gmm.weights_[idx]
            mean = gmm.means_[idx][0]
            stdev = np.sqrt(gmm.covariances_[idx][0])
            y = scipy.stats.norm.pdf(scale(x), mean, stdev) * pdf_scale
            axes.plot(x, y, color=annotation_color)
Example #27
class AutofluorescenceDiagnosticView(HasStrictTraits):
    """
    Plots a histogram of each channel, and its median in red.  Serves as a
    diagnostic for the autofluorescence correction.
    
    Attributes
    ----------
    op : Instance(AutofluorescenceOp)
        The :class:`AutofluorescenceOp` whose parameters we're viewing. Set 
        automatically if you created the instance using 
        :meth:`AutofluorescenceOp.default_view`.
    
    subset : str (default = "")
        An expression that specifies the events that are plotted in the histograms
    """

    # traits
    id = Constant(
        'edu.mit.synbio.cytoflow.view.autofluorescencediagnosticview')
    friendly_id = Constant("Autofluorescence Diagnostic")

    op = Instance(AutofluorescenceOp)
    subset = Str

    def plot(self, experiment, **kwargs):
        """
        Plot a faceted histogram view of a channel
        """

        if experiment is None:
            raise util.CytoflowViewError('experiment',
                                         "No experiment specified")

        if not self.op.channels:
            raise util.CytoflowViewError('op', "No channels specified")

        if not self.op._af_median:
            raise util.CytoflowViewError(
                'op', "Autofluorescence values aren't set. Did "
                "you forget to run estimate()?")

        if not set(self.op._af_median.keys()) <= set(experiment.channels) or \
           not set(self.op._af_stdev.keys()) <= set(experiment.channels):
            raise util.CytoflowViewError(
                'op', "Autofluorescence estimates aren't set, or are "
                "different than those in the experiment "
                "parameter. Did you forget to run estimate()?")

        if not set(self.op._af_median.keys()) == set(self.op._af_stdev.keys()):
            raise util.CytoflowOpError(
                'op', "Median and stdev keys are different! "
                "What the hell happened?!")

        if not set(self.op.channels) == set(self.op._af_median.keys()):
            raise util.CytoflowOpError(
                'op', "Estimated channels differ from the channels "
                "parameter.  Did you forget to (re)run estimate()?")

        import matplotlib.pyplot as plt
        import seaborn as sns  # @UnusedImport

        kwargs.setdefault('histtype', 'stepfilled')
        kwargs.setdefault('alpha', 0.5)
        kwargs.setdefault('antialiased', True)

        # make a little Experiment
        try:
            check_tube(self.op.blank_file, experiment)
            blank_exp = ImportOp(
                tubes=[Tube(file=self.op.blank_file)],
                channels={
                    experiment.metadata[c]["fcs_name"]: c
                    for c in experiment.channels
                },
                name_metadata=experiment.metadata['name_metadata']).apply()
        except util.CytoflowOpError as e:
            raise util.CytoflowViewError('op', e.__str__()) from e

        # apply previous operations
        for op in experiment.history:
            blank_exp = op.apply(blank_exp)

        # subset it
        if self.subset:
            try:
                blank_exp = blank_exp.query(self.subset)
            except Exception as exc:
                raise util.CytoflowOpError(
                    'subset', "Subset string '{0}' isn't valid".format(
                        self.subset)) from exc

            if len(blank_exp.data) == 0:
                raise util.CytoflowOpError(
                    'subset', "Subset string '{0}' returned no events".format(
                        self.subset))

        plt.figure()

        for idx, channel in enumerate(self.op.channels):
            d = blank_exp.data[channel]
            plt.subplot(len(self.op.channels), 1, idx + 1)
            plt.title(channel)
            plt.hist(d, bins=200, **kwargs)

            plt.axvline(self.op._af_median[channel], color='r')

        plt.tight_layout(pad=0.8)
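
# A minimal usage sketch (hypothetical names; per the docstring, the view is
# normally created with AutofluorescenceOp.default_view()):
#
#   af_op.estimate(ex)
#   af_op.default_view().plot(ex)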
Example #28
class GaussianMixture2DView(By2DView, AnnotatingView, ScatterplotView):
    """
    A default view for :class:`GaussianMixtureOp` that plots the scatter plot
    of two channels, then the estimated 2D Gaussian distributions on top of it.
    
    Attributes
    ----------
   
    """

    id = Constant('edu.mit.synbio.cytoflow.view.gaussianmixture2dview')
    friendly_id = Constant("2D Gaussian Mixture Diagnostic Plot")

    xchannel = Str
    xscale = util.ScaleEnum
    ychannel = Str
    yscale = util.ScaleEnum

    def plot(self, experiment, **kwargs):
        """
        Plot the scatterplot of the two channels, with each component's
        estimated Gaussian overlaid as ellipses.
        
        Parameters
        ----------
        """

        if experiment is None:
            raise util.CytoflowViewError('experiment',
                                         "No experiment specified")

        if self.op.num_components == 1:
            annotation_facet = self.op.name + "_1"
        else:
            annotation_facet = self.op.name

        view, trait_name = self._strip_trait(annotation_facet)

        if self.xchannel in self.op._scale:
            xscale = self.op._scale[self.xchannel]
        else:
            xscale = util.scale_factory(self.xscale,
                                        experiment,
                                        channel=self.xchannel)

        if self.ychannel in self.op._scale:
            yscale = self.op._scale[self.ychannel]
        else:
            yscale = util.scale_factory(self.yscale,
                                        experiment,
                                        channel=self.ychannel)

        super(GaussianMixture2DView,
              view).plot(experiment,
                         annotation_facet=annotation_facet,
                         annotation_trait=trait_name,
                         annotations=self.op._gmms,
                         xscale=xscale,
                         yscale=yscale,
                         **kwargs)

    def _annotation_plot(self, axes, annotation, annotation_facet,
                         annotation_value, annotation_color, **kwargs):

        # annotation is an instance of mixture.GaussianMixture
        gmm = annotation

        if annotation_value is None:
            for i in range(len(gmm.means_)):
                self._annotation_plot(axes, annotation, annotation_facet, i,
                                      annotation_color, **kwargs)
            return
        elif isinstance(annotation_value, str):
            try:
                idx_re = re.compile(annotation_facet + r'_(\d+)')
                idx = idx_re.match(annotation_value).group(1)
                idx = int(idx) - 1
            except Exception:
                return
        elif isinstance(annotation_value, np.bool_):
            if annotation_value:
                idx = 0
            else:
                return
        else:
            idx = annotation_value

        xscale = kwargs['xscale']
        yscale = kwargs['yscale']

        mean = gmm.means_[idx]
        covar = gmm.covariances_[idx]

        v, w = scipy.linalg.eigh(covar)
        u = w[0] / scipy.linalg.norm(w[0])

        # rotation angle (in degrees)
        t = np.arctan(u[1] / u[0])
        t = 180 * t / np.pi

        # in order to scale the ellipses correctly, we have to make them
        # ourselves out of an affine-scaled unit circle.  The interface
        # is the same as matplotlib.patches.Ellipse
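        # we draw the 1-, 2-, and 3-sigma contours of this component, with
        # decreasing opacity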

        _plot_ellipse(axes,
                      xscale,
                      yscale,
                      mean,
                      np.sqrt(v[0]),
                      np.sqrt(v[1]),
                      180 + t,
                      color=annotation_color,
                      fill=False,
                      linewidth=2)

        _plot_ellipse(axes,
                      xscale,
                      yscale,
                      mean,
                      np.sqrt(v[0]) * 2,
                      np.sqrt(v[1]) * 2,
                      180 + t,
                      color=annotation_color,
                      fill=False,
                      linewidth=2,
                      alpha=0.66)

        _plot_ellipse(axes,
                      xscale,
                      yscale,
                      mean,
                      np.sqrt(v[0]) * 3,
                      np.sqrt(v[1]) * 3,
                      180 + t,
                      color=annotation_color,
                      fill=False,
                      linewidth=2,
                      alpha=0.33)
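
# A minimal usage sketch (hypothetical Experiment `ex` and channel names; this
# view is normally obtained from GaussianMixtureOp.default_view() when two
# channels are specified):
#
#   gm.default_view(channels=["FITC-A", "PE-A"]).plot(ex)
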
class BleedthroughLinearOp(HasStrictTraits):
    """
    Apply matrix-based bleedthrough correction to a set of fluorescence channels.
    
    This is a traditional matrix-based compensation for bleedthrough.  For each
    pair of channels, the user specifies the proportion of the first channel
    that bleeds through into the second; then, the module performs a matrix
    multiplication to compensate the raw data.
    
    The module can also estimate the bleedthrough matrix using one
    single-color control per channel.
    
    This works best on data that has had autofluorescence removed first;
    if that is the case, then the autofluorescence will be subtracted from
    the single-color controls too.
    
    To use, set up the `controls` dict with the single color controls;
    call `estimate()` to parameterize the operation; check that the bleedthrough 
    plots look good with `default_view().plot()`; and then `apply()` to an 
    Experiment.
    
    Attributes
    ----------
    name : Str
        The operation name (for UI representation; optional for interactive use)
    
    controls : Dict(Str, File)
        The channel names to correct, and corresponding single-color control
        FCS files to estimate the correction splines with.  Must be set to
        use `estimate()`.
        
    spillover : Dict(Tuple(Str, Str), Float)
        The spillover "matrix" to use to correct the data.  The keys are pairs
        of channels, and the values are proportions of spectral overlap.  If 
        `("channel1", "channel2")` is present as a key, 
        `("channel2", "channel1")` must also be present.  The module does not
        assume that the matrix is symmetric.
        
    Notes
    -----
    To correct the data, the module builds a square matrix `a` with 1.0 on
    the diagonal and the spillover proportion `spillover[(y, x)]` in row `y`,
    column `x`, then multiplies each raw event by the pseudo-inverse of `a`
    (see `apply()`).

    Examples
    --------
    >>> bl_op = flow.BleedthroughLinearOp()
    >>> bl_op.controls = {'Pacific Blue-A' : 'merged/ebfp.fcs',
    ...                   'FITC-A' : 'merged/eyfp.fcs',
    ...                   'PE-Tx-Red-YG-A' : 'merged/mkate.fcs'}
    >>>
    >>> bl_op.estimate(ex2)
    >>> bl_op.default_view().plot(ex2)    
    >>>
    >>> ex3 = bl_op.apply(ex2)
    """

    # traits
    id = Constant('edu.mit.synbio.cytoflow.operations.bleedthrough_linear')
    friendly_id = Constant("Linear Bleedthrough Correction")

    name = CStr()

    controls = Dict(Str, File)
    spillover = Dict(Tuple(Str, Str), Float)

    def estimate(self, experiment, subset=None):
        """
        Estimate the bleedthrough from single-channel controls in `controls`.
        """
        if not experiment:
            raise util.CytoflowOpError("No experiment specified")

        channels = self.controls.keys()

        if len(channels) < 2:
            raise util.CytoflowOpError(
                "Need at least two channels to correct bleedthrough.")

        # make sure the control files exist
        for channel in channels:
            if not os.path.isfile(self.controls[channel]):
                raise util.CytoflowOpError(
                    "Can't find file {0} for channel {1}.".format(
                        self.controls[channel], channel))

        for channel in channels:

            # make a little Experiment
            check_tube(self.controls[channel], experiment)
            tube_exp = ImportOp(tubes=[Tube(
                file=self.controls[channel])]).apply()

            # apply previous operations
            for op in experiment.history:
                tube_exp = op.apply(tube_exp)

            # subset it
            if subset:
                try:
                    tube_data = tube_exp.query(subset).data
                except Exception as e:
                    raise util.CytoflowOpError(
                        "Subset string '{0}' isn't valid".format(subset)) from e

                if len(tube_data.index) == 0:
                    raise util.CytoflowOpError(
                        "Subset string '{0}' returned no events".format(
                            subset))
            else:
                tube_data = tube_exp.data

            # polyfit requires sorted data
            tube_data.sort_values(by=channel, inplace=True)

            for to_channel in channels:
                from_channel = channel

                if from_channel == to_channel:
                    continue

                # sometimes some of the data is off the edge of the
                # plot, and this screws up a linear regression

                from_min = np.min(tube_data[from_channel]) * 1.05
                from_max = np.max(tube_data[from_channel]) * 0.95
                tube_data = tube_data[tube_data[from_channel] > from_min]
                tube_data = tube_data[tube_data[from_channel] < from_max]

                to_min = np.min(tube_data[to_channel]) * 1.05
                to_max = np.max(tube_data[to_channel]) * 0.95
                tube_data = tube_data[tube_data[to_channel] > to_min]
                tube_data = tube_data[tube_data[to_channel] < to_max]

                tube_data.reset_index(drop=True, inplace=True)

                lr = np.polyfit(tube_data[from_channel],
                                tube_data[to_channel],
                                deg=1)

                self.spillover[(from_channel, to_channel)] = lr[0]

    def apply(self, experiment):
        """Applies the bleedthrough correction to an experiment.
        
        Parameters
        ----------
        experiment : Experiment
            the experiment to which this operation is applied
            
        Returns
        -------
            a new experiment with the bleedthrough subtracted out.
        """
        if not experiment:
            raise util.CytoflowOpError("No experiment specified")

        if not self.spillover:
            raise util.CytoflowOpError("Spillover matrix isn't set. "
                                       "Did you forget to run estimate()?")

        for (from_channel, to_channel) in self.spillover:
            if from_channel not in experiment.data:
                raise util.CytoflowOpError(
                    "Can't find channel {0} in experiment".format(
                        from_channel))
            if to_channel not in experiment.data:
                raise util.CytoflowOpError(
                    "Can't find channel {0} in experiment".format(to_channel))

            if (to_channel, from_channel) not in self.spillover:
                raise util.CytoflowOpError("Must have both (from, to) and "
                                           "(to, from) keys in self.spillover")

        new_experiment = experiment.clone()

        # the completely arbitrary ordering of the channels
        channels = list(set([x for (x, _) in self.spillover.keys()]))

        # build the spillover matrix from the spillover dictionary
        a = [[self.spillover[(y, x)] if x != y else 1.0 for x in channels]
             for y in channels]
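        # e.g. with channels == ["A", "B"] (hypothetical) and
        # spillover == {("A", "B"): 0.1, ("B", "A"): 0.05}, this builds
        #   a == [[1.0, 0.1], [0.05, 1.0]]
        # and each event's corrected values are its raw values times pinv(a)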

        # invert it.  use the pseudoinverse in case a is singular
        a_inv = np.linalg.pinv(a)

        new_experiment.data[channels] = np.dot(experiment.data[channels],
                                               a_inv)

        for channel in channels:
            # add the spillover values to the channel's metadata
            new_experiment.metadata[channel]['linear_bleedthrough'] = \
                {x : self.spillover[(x, channel)]
                     for x in channels if x != channel}

        new_experiment.history.append(self.clone_traits())
        return new_experiment

    def default_view(self, **kwargs):
        """
        Returns a diagnostic plot to make sure spillover estimation is working.
        
        Returns
        -------
        IView : An IView, call plot() to see the diagnostic plots
        """

        # the completely arbitrary ordering of the channels
        channels = list(set([x for (x, _) in self.spillover.keys()]))

        if set(self.controls.keys()) != set(channels):
            raise util.CytoflowOpError(
                "Must have both the controls and bleedthrough to plot")

        return BleedthroughLinearDiagnostic(op=self, **kwargs)
Example #30
class GaussianMixture1DWorkflowOp(WorkflowOperation, GaussianMixtureOp):
    # override id so we can differentiate the 1D and 2D ops
    id = Constant('edu.mit.synbio.cytoflowgui.operations.gaussian_1d')

    # add 'estimate' and 'apply' metadata
    name = Str(apply=True)
    channel = Str(estimate=True)
    channel_scale = util.ScaleEnum(estimate=True)
    num_components = util.PositiveCInt(1, allow_zero=False, estimate=True)
    sigma = util.PositiveCFloat(None,
                                allow_zero=True,
                                allow_none=True,
                                estimate=True)
    by = List(Str, estimate=True)

    # add the 'estimate_result' metadata
    _gmms = Dict(Any,
                 Instance(mixture.GaussianMixture),
                 transient=True,
                 estimate_result=True)

    # override the base class's "subset" with one that is dynamically generated /
    # updated from subset_list
    subset = Property(Str, observe="subset_list.items.str")
    subset_list = List(ISubset, estimate=True)

    # bits to support the subset editor
    @observe('subset_list:items.str')
    def _on_subset_changed(self, _):
        self.changed = 'subset_list'

    # MAGIC - returns the value of the "subset" Property, above
    def _get_subset(self):
        return " and ".join(
            [subset.str for subset in self.subset_list if subset.str])

    def estimate(self, experiment):
        self.channels = [self.channel]
        self.scale = {self.channel: self.channel_scale}
        super().estimate(experiment, subset=self.subset)

    def apply(self, experiment):
        if not self._gmms:
            raise util.CytoflowOpError(None, 'Click "Estimate"!')
        return GaussianMixtureOp.apply(self, experiment)

    def default_view(self, **kwargs):
        return GaussianMixture1DWorkflowView(op=self, **kwargs)

    def clear_estimate(self):
        self._gmms = {}
        self._scale = {}

    def get_notebook_code(self, idx):
        op = GaussianMixtureOp()
        op.copy_traits(self, op.copyable_trait_names())

        op.channels = [self.channel]
        op.scale = {self.channel: self.channel_scale}

        return dedent("""
        op_{idx} = {repr}
        
        op_{idx}.estimate(ex_{prev_idx}{subset})
        ex_{idx} = op_{idx}.apply(ex_{prev_idx})
        """.format(repr=repr(op),
                   idx=idx,
                   prev_idx=idx - 1,
                   subset=", subset = " +
                   repr(self.subset) if self.subset else ""))
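
# For idx == 2 (and a non-empty subset), get_notebook_code(2) renders
# something like the following (the channel name is hypothetical; repr()
# fills in the operation's actual trait values):
#
#   op_2 = GaussianMixtureOp(channels=['FITC-A'], scale={'FITC-A': 'logicle'}, ...)
#
#   op_2.estimate(ex_1, subset = '...')
#   ex_2 = op_2.apply(ex_1)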