Example 1
def _get_trait(self):
    if self.type in ('metadata', 'category'):
        return CStr()
    elif self.type == 'float':
        return CFloat()
    elif self.type == 'bool':
        return ConvertingBool()
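# A minimal sketch (not from the original snippet) of why the factory above
# returns C-prefixed traits: CStr and CFloat coerce assigned values instead
# of rejecting them.
from traits.api import HasTraits, CFloat, CStr

class CastingDemo(HasTraits):
    value = CFloat()   # casts "3.5" -> 3.5
    label = CStr()     # casts 42 -> "42"

demo = CastingDemo()
demo.value = "3.5"
demo.label = 42
assert demo.value == 3.5 and demo.label == "42"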
Example 2
class _Snake_Settings(HasTraits):
    length_weight = Float(0)  # alpha
    smoothness = Float(0.1)  # beta
    line_weight = Float(-1)  # w_line - negative values seek dark pixels
    edge_weight = Float(0)
    boundaries = CStr('fixed')
    prefilter_sigma = Float(2)
Example 3
class FitSettings(HasTraits):
    coalescedProcessing = Enum(['useClumpIndexOnly', 'useTminTmaxIfAvailable'])
    cumulativeDistribution = Enum(['binned', 'empirical'])
    fitMode = Enum(['SingleMode', 'TwoModes'])
    Tau2Constant = Bool(False)
    Tau2FixedValue = Float(2.0)
    IDcolumn = CStr('objectID')
Example 4
class TimedSpecies(HasTraits):
    Species1 = CStr()
    Species1FromTime = Float()
    Species1ToTime = Float()
    
    Species2 = CStr()
    Species2FromTime = Float()
    Species2ToTime = Float()

    Species3 = CStr()
    Species3FromTime = Float()
    Species3ToTime = Float()


    traits_view = View(Group(Item(name = 'Species1'),
                             Item(name = 'Species1FromTime'),
                             Item(name = 'Species1ToTime'),
                             Item('_'),
                             Item(name = 'Species2'),
                             Item(name = 'Species2FromTime'),
                             Item(name = 'Species2ToTime'),
                             Item('_'),
                             Item(name = 'Species3'),
                             Item(name = 'Species3FromTime'),
                             Item(name = 'Species3ToTime'),
                             label = 'Specify Timed Species',
                             show_border = True),
                       buttons = OKCancelButtons)

    def getSpeciesDescriptor(self):
        speclist = {}
        if self.Species1: # empty strings will be ignored
            speclist[self.Species1] = (self.Species1FromTime,
                                       self.Species1ToTime)
        if self.Species2: # empty strings will be ignored
            speclist[self.Species2] = (self.Species2FromTime,
                                       self.Species2ToTime)
        if self.Species3: # empty strings will be ignored
            speclist[self.Species3] = (self.Species3FromTime,
                                       self.Species3ToTime)

        logger.info('speclist is %r', speclist)
        return speclist
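# A minimal usage sketch (hypothetical species names, not from the original
# source): empty species fields are skipped, so only Species1 is returned.
ts = TimedSpecies(Species1='eGFP',
                  Species1FromTime=0.0,
                  Species1ToTime=10.0)
assert ts.getSpeciesDescriptor() == {'eGFP': (0.0, 10.0)}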
Example 5
class EllipseOp(HasStrictTraits):
    id = Constant('edu.mit.synbio.cytoflow.operations.ellipse')
    friendly_id = Constant("Ellipse")

    name = CStr()
    xchannel = Str()
    ychannel = Str()
    vertices = List((Float, Float))

    _xscale = Str("linear")
    _yscale = Str("linear")

    # NOTE: the trait types below were missing from the original snippet;
    # these are assumptions inferred from how _plot_ellipse uses the values.
    center = Tuple(Float, Float)
    width = Float()
    height = Float()
    angle = Float()


    def _plot_ellipse(self, center, width, height, angle, **kwargs):
        tf = transforms.Affine2D() \
             .scale(width * 0.5, height * 0.5) \
             .rotate_deg(angle) \
             .translate(*center)

        tf_path = tf.transform_path(path.Path.unit_circle())
        v = tf_path.vertices
        # the original snippet referenced self.op._xscale, but this class has
        # no 'op' trait; assuming the scale objects live on self and provide
        # an inverse() method
        v = np.vstack((self._xscale.inverse(v[:, 0]),
                       self._yscale.inverse(v[:, 1]))).T

        scaled_path = path.Path(v, tf_path.codes)
        scaled_patch = patches.PathPatch(scaled_path, **kwargs)
        plt.gca().add_patch(scaled_patch)


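# A standalone sketch (hypothetical numbers, not from the original source) of
# the Affine2D trick used in _plot_ellipse: matplotlib's unit circle is
# scaled, rotated, and translated into an arbitrary ellipse path.
import matplotlib.pyplot as plt
from matplotlib import transforms, path, patches

tf = (transforms.Affine2D()
      .scale(4 * 0.5, 2 * 0.5)   # width = 4, height = 2
      .rotate_deg(30)
      .translate(10.0, 20.0))    # center = (10, 20)
ellipse_path = tf.transform_path(path.Path.unit_circle())

ax = plt.gca()
ax.add_patch(patches.PathPatch(ellipse_path, fill=False))
ax.set_xlim(5, 15)
ax.set_ylim(15, 25)
plt.show()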
Example 6
class ArduinoLCDActuator(AbstractArduinoActuator):
    """
        Actuator that sends target device digital output pin status change requests

        Needs `AutomateFirmata <https://github.com/tuomas2/AutomateFirmata>`_
    """

    _status = CStr(transient=True)

    #: Target device number
    device = CInt

    def _status_changed(self):
        self._arduino.lcd_print(self._status)
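# A minimal sketch (not from the original source) of the Traits naming
# convention that _status_changed relies on: a method named _<trait>_changed
# is called automatically whenever that trait is assigned a new value.
from traits.api import HasTraits, CStr

class NotificationDemo(HasTraits):
    status = CStr()

    def _status_changed(self):
        print('status is now %r' % self.status)

demo = NotificationDemo()
demo.status = 'hello'   # prints: status is now 'hello'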
Example 7
class BleedthroughPiecewiseOp(HasStrictTraits):
    """
    Apply bleedthrough correction to a set of fluorescence channels.
    
    This is not a traditional bleedthrough matrix-based compensation; it uses
    a similar set of single-color controls, but instead of computing a compensation
    matrix, it fits a piecewise-linear spline to the untransformed data and
    uses those splines to compute the correction factor at each point in
    a mesh across the color space.  The experimental data is corrected using
    a linear interpolation along that mesh: this is much faster than computing
    the correction factor for each cell individually (an operation that takes
    about 5 msec per cell).
    
    To use, set up the `controls` dict with the single color controls;
    call `estimate()` to parameterize the operation; check that the bleedthrough 
    plots look good with `default_view().plot()`; and then `apply()` to an 
    Experiment.
    
    Attributes
    ----------
    name : Str
        The operation name (for UI representation; optional for interactive use)
    
    controls : Dict(Str, File)
        The channel names to correct, and corresponding single-color control
        FCS files to estimate the correction splines with.  Must be set to
        use `estimate()`.
        
    num_knots : Int (default = 7)
        The number of internal control points to estimate, spaced log-evenly
        from 0 to the range of the channel.  Must be set to use `estimate()`.
        
    mesh_size : Int (default = 32)
        The size of each axis in the mesh used to interpolate corrected values.
        
    Notes
    -----
    We use an interpolation-based scheme to estimate corrected bleedthrough.
    The algorithm is as follows:
    
     - Fit a piecewise-linear spline to each single-color control's bleedthrough
       into other channels.  Because we want to fit the spline to untransformed
       data, but capture both the negative, positive-linear, and positive-log
       portions of a traditional flow data set, we distribute the spline knots
       evenly on an hlog-transformed axis for each color we're correcting.

     - At each point on a regular mesh spanning the entire range of the
       instrument, estimate the mapping from (raw colors) --> (actual colors).
       The mesh points are also distributed evenly along the hlog-transformed
       color axes; this captures negative data as well as positive.
       This is quite slow: ~30 seconds for a mesh size of 32 in 3-space.
       Remember that additional channels expand the number of mesh points
       exponentially!

     - Use these estimates to parameterize a linear interpolator (in linear
       space, this time).  There's one interpolator per output channel (so
       for a 3-channel correction, each interpolator is R^3 --> R).  For 
       each measured cell, run each interpolator to give the corrected output.

    Examples
    --------
    >>> bl_op = flow.BleedthroughPiecewiseOp()
    >>> bl_op.num_knots = 10
    >>> bl_op.controls = {'Pacific Blue-A' : 'merged/ebfp.fcs',
    ...                   'FITC-A' : 'merged/eyfp.fcs',
    ...                   'PE-Tx-Red-YG-A' : 'merged/mkate.fcs'}
    >>>
    >>> bl_op.estimate(ex2)
    >>> bl_op.default_view().plot(ex2)    
    >>>
    >>> %time ex3 = bl_op.apply(ex2) # 410,000 cells
    CPU times: user 577 ms, sys: 27.7 ms, total: 605 ms
    Wall time: 607 ms
    """

    # traits
    id = Constant('edu.mit.synbio.cytoflow.operations.bleedthrough_piecewise')
    friendly_id = Constant("Piecewise Bleedthrough Correction")

    name = CStr()

    controls = Dict(Str, File)
    num_knots = Int(7)
    mesh_size = Int(32)

    _splines = Dict(Str, Dict(Str, Python))
    _interpolators = Dict(Str, Python)

    # because the order of the channels is important, we can't just call
    # _interpolators.keys()
    # TODO - this is ugly and unpythonic.  :-/
    _channels = List(Str)

    def estimate(self, experiment, subset=None):
        """
        Estimate the bleedthrough from the single-channel controls in `controls`
        """
        if not experiment:
            raise util.CytoflowOpError("No experiment specified")

        if self.num_knots < 3:
            raise util.CytoflowOpError(
                "Need to allow at least 3 knots in the spline")

        self._channels = list(self.controls.keys())

        if len(self._channels) < 2:
            raise util.CytoflowOpError(
                "Need at least two channels to correct bleedthrough.")

        self._splines = {}
        mesh_axes = []

        for channel in self._channels:
            self._splines[channel] = {}

            # make a little Experiment
            check_tube(self.controls[channel], experiment)
            tube_exp = ImportOp(tubes=[Tube(
                file=self.controls[channel])]).apply()

            # apply previous operations
            for op in experiment.history:
                tube_exp = op.apply(tube_exp)

            # subset it
            if subset:
                try:
                    tube_data = tube_exp.query(subset).copy()
                except Exception:
                    raise util.CytoflowOpError(
                        "Subset string '{0}' isn't valid".format(subset))

                if len(tube_data.index) == 0:
                    raise util.CytoflowOpError(
                        "Subset string '{0}' returned no events".format(
                            subset))
            else:
                tube_data = tube_exp.data.copy()

            # polyfit requires sorted data
            tube_data.sort_values(by=channel, inplace=True)

            channel_min = tube_data[channel].min()
            channel_max = tube_data[channel].max()

            # we're going to set the knots and splines evenly across the hlog-
            # transformed data, so as to capture both the "linear" aspect
            # of near-0 and negative values, and the "log" aspect of large
            # values

            # parameterize the hlog transform
            r = experiment.metadata[channel]['range']  # instrument range
            d = np.log10(r)  # maximum display scale, in decades

            # the transition point from linear --> log scale
            # use half of the log-transformed scale as "linear".
            b = 2**(np.log2(r) / 2)

            # the splines' knots
            knot_min = channel_min
            knot_max = channel_max

            hlog_knot_min, hlog_knot_max = \
                hlog((knot_min, knot_max), b = b, r = r, d = d)
            hlog_knots = np.linspace(hlog_knot_min, hlog_knot_max,
                                     self.num_knots)
            knots = hlog_inv(hlog_knots, b=b, r=r, d=d)

            # only keep the interior knots
            knots = knots[1:-1]

            # the interpolators' mesh
            if 'af_median' in experiment.metadata[channel] and \
               'af_stdev' in experiment.metadata[channel]:
                mesh_min = experiment.metadata[channel]['af_median'] - \
                           3 * experiment.metadata[channel]['af_stdev']
            else:
                mesh_min = -0.01 * r  # TODO - does this even work?

            mesh_max = r

            hlog_mesh_min, hlog_mesh_max = \
                hlog((mesh_min, mesh_max), b = b, r = r, d = d)
            hlog_mesh_axis = \
                np.linspace(hlog_mesh_min, hlog_mesh_max, self.mesh_size)

            mesh_axis = hlog_inv(hlog_mesh_axis, b=b, r=r, d=d)
            mesh_axes.append(mesh_axis)

            for to_channel in self._channels:
                from_channel = channel
                if from_channel == to_channel:
                    continue

                self._splines[from_channel][to_channel] = \
                    scipy.interpolate.LSQUnivariateSpline(tube_data[from_channel].values,
                                                          tube_data[to_channel].values,
                                                          t = knots,
                                                          k = 1)

        mesh = pandas.DataFrame(util.cartesian(mesh_axes),
                                columns=[x for x in self._channels])

        mesh_corrected = mesh.apply(_correct_bleedthrough,
                                    axis=1,
                                    args=([[x for x in self._channels],
                                           self._splines]))

        for channel in self._channels:
            chan_values = np.reshape(mesh_corrected[channel],
                                     [len(x) for x in mesh_axes])
            self._interpolators[channel] = \
                scipy.interpolate.RegularGridInterpolator(points = mesh_axes,
                                                          values = chan_values,
                                                          bounds_error = False,
                                                          fill_value = 0.0)

        # TODO - some sort of validity checking.

    def apply(self, experiment):
        """Applies the bleedthrough correction to an experiment.
        
        Parameters
        ----------
        experiment : Experiment
            the experiment to which this operation is applied
            
        Returns
        -------
            a new experiment with the bleedthrough subtracted out.
        """
        if not experiment:
            raise util.CytoflowOpError("No experiment specified")

        if not self._interpolators:
            raise util.CytoflowOpError("Module interpolators aren't set. "
                                       "Did you run estimate()?")

        if not set(self._interpolators.keys()) <= set(experiment.channels):
            raise util.CytoflowOpError(
                "Module parameters don't match experiment channels")

        new_experiment = experiment.clone()

        # get rid of data outside of the interpolators' mesh
        # (-3 * autofluorescence sigma)
        for channel in self._channels:

            # if you update the mesh calculation above, update it here too!
            if 'af_median' in experiment.metadata[channel] and \
               'af_stdev' in experiment.metadata[channel]:
                mesh_min = experiment.metadata[channel]['af_median'] - \
                           3 * experiment.metadata[channel]['af_stdev']
            else:
                mesh_min = -0.01 * experiment.metadata[channel][
                    'range']  # TODO - does this even work?

            new_experiment.data = \
                new_experiment.data[new_experiment.data[channel] > mesh_min]

        new_experiment.data.reset_index(drop=True, inplace=True)

        old_data = new_experiment.data[self._channels]

        for channel in self._channels:
            new_experiment[channel] = self._interpolators[channel](old_data)

            # add the correction splines to the experiment metadata so we can
            # correct other controls later on
            new_experiment.metadata[channel]['piecewise_bleedthrough'] = \
                (self._channels, self._interpolators[channel])

        new_experiment.history.append(self.clone_traits())
        return new_experiment

    def default_view(self, **kwargs):
        """
        Returns a diagnostic plot to see if the bleedthrough spline estimation
        is working.
        
        Returns
        -------
            IView : An IView, call plot() to see the diagnostic plots
        """

        if set(self.controls.keys()) != set(self._splines.keys()):
            raise util.CytoflowOpError(
                "Must have both the controls and bleedthrough to plot")

        return BleedthroughPiecewiseDiagnostic(op=self, **kwargs)
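# A self-contained sketch (toy data, not from the original source) of the
# interpolation scheme described in the class's Notes: corrected values are
# precomputed on a regular mesh, then looked up per cell with a fast linear
# interpolator.
import numpy as np
import scipy.interpolate

mesh_axes = [np.linspace(0, 1000, 32), np.linspace(0, 1000, 32)]
corrected = np.random.rand(32, 32)   # toy corrected values, one per mesh point

interp = scipy.interpolate.RegularGridInterpolator(points=mesh_axes,
                                                   values=corrected,
                                                   bounds_error=False,
                                                   fill_value=0.0)

cells = np.array([[10.0, 200.0],     # one row per cell: (channel 1, channel 2)
                  [500.0, 750.0]])
print(interp(cells))                 # corrected channel value for each cell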
Example 8
class TornadoService(AbstractUserService):
    """
    Abstract service that provides HTTP server for WSGI applications.
    """

    #: Which IP address to listen on. Use ``0.0.0.0`` (default) to listen on all local network interfaces.
    http_ipaddr = CStr("0.0.0.0")

    #: HTTP (or HTTPS, if using SSL) port to listen on
    http_port = Int(3000)

    #: Path to ssl certificate file. If set, SSL will be used.
    #:
    #: .. tip::
    #:
    #:   You may use the script scripts/generate_selfsigned_certificate.sh to
    #:   generate a self-signed OpenSSL certificate.
    ssl_certificate = CStr

    #: Path to ssl private key file
    ssl_private_key = CStr

    #: Number of listener threads to spawn
    num_threads = Int(5)

    #: Extra static dirs you want to serve. Example::
    #:
    #:    static_dirs = {'/my_static/(.*)': '/path/to/my_static'}
    static_dirs = Dict(key_trait=Str, value_trait=Str)

    _http_server = Instance(tornado.httpserver.TCPServer)

    @property
    def is_alive(self):
        return bool(self._http_server)

    def get_wsgi_application(self):
        """
            Get WSGI function. Implement this in subclasses.
        """
        raise NotImplementedError

    def get_websocket(self):
        return None

    def get_filehandler_class(self):
        return tornado.web.StaticFileHandler

    def get_tornado_handlers(self):
        tornado_handlers = []
        websocket = self.get_websocket()
        if websocket:
            tornado_handlers.append(('/socket', websocket))

        for entrypoint, path in self.static_dirs.items():
            tornado_handlers.append(
                (entrypoint, self.get_filehandler_class(), {
                    'path': path
                }))

        wsgi_app = self.get_wsgi_application()

        if wsgi_app:
            wsgi_container = tornado.wsgi.WSGIContainer(wsgi_app)
            tornado_handlers.append(('.*', tornado.web.FallbackHandler,
                                     dict(fallback=wsgi_container)))
        return tornado_handlers

    def setup(self):
        if self.is_alive:
            self.logger.debug(
                'Server is already running, no need to start new')
            return

        tornado_app = tornado.web.Application(self.get_tornado_handlers())

        if self.ssl_certificate and self.ssl_private_key:
            ssl_options = {
                "certfile": self.ssl_certificate,
                "keyfile": self.ssl_private_key,
            }
        else:
            ssl_options = None

        self._http_server = tornado.httpserver.HTTPServer(
            tornado_app, ssl_options=ssl_options)

        try:
            self._http_server.listen(self.http_port, self.http_ipaddr)
        except socket.error as e:
            self.logger.exception('Could not start server: %s', e)
            self._http_server = None
            return

        self.start_ioloop()

    def start_ioloop(self):
        global web_thread
        ioloop = tornado.ioloop.IOLoop.instance()
        if not ioloop._running:
            web_thread = threading.Thread(
                target=threaded(self.system, ioloop.start),
                name="%s::%s" % (self.system.name, self.__class__.__name__))
            web_thread.start()

    def cleanup(self):
        if self.is_alive:
            tornado.ioloop.IOLoop.instance().stop()
            self._http_server.stop()
            self._http_server = None
            web_thread.join()
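# A minimal concrete service sketch (hypothetical subclass, not part of the
# original library): a subclass only needs to provide the WSGI callable.
class HelloWorldService(TornadoService):
    http_port = Int(8080)

    def get_wsgi_application(self):
        def app(environ, start_response):
            start_response('200 OK', [('Content-Type', 'text/plain')])
            return [b'Hello, world!\n']
        return app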
Example 9
class SocketSensor(AbstractSensor):
    """
        Sensor that reads a TCP socket.

        Over TCP port, it reads data per lines and tries to set the status of the sensor
        to the value specified by the line. If content of the line is 'close', then connection
        is dropped.
    """

    #: Hostname/IP to listen on. Use ``'0.0.0.0'`` to listen on all interfaces.
    host = CStr('0.0.0.0')

    #: Port to listen
    port = CInt

    #: set to ``True`` to tell SocketSensor to stop listening to port
    stop = CBool(transient=True)

    _socket = Instance(socket.socket, transient=True)
    _status = CInt

    def listen_loop(self):
        while not self.stop:
            try:
                self.logger.info('%s listening to connections in port %s',
                                 self.name, self.port)
                self._socket.listen(1)
                self._socket.settimeout(1)
                while not self.stop:
                    try:
                        conn, addr = self._socket.accept()
                    except socket.timeout:
                        continue
                    break
                self.logger.info('%s connected from %s', self.name, addr)
                conn.settimeout(1)
                while not self.stop:
                    try:
                        data = conn.recv(1024)
                        if not data:
                            break
                        self.status = int(data.strip())
                        conn.sendall(b'OK\n')
                    except socket.timeout:
                        data = b''
                    except ValueError:
                        if data.strip() == b'close':
                            break
                        conn.sendall(b'NOK\n')
            except socket.error as e:
                self.logger.info("%s: Error %s caught.", self, e)
            except:
                if self.stop:
                    return
                else:
                    raise
            conn.close()
            self.logger.info('%s: connection %s closed', self.name, addr)
        self._socket.close()

    def setup(self):
        self._socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        self._socket.bind((self.host, self.port))
        t = threading.Thread(target=self.listen_loop,
                             name='SocketSensor %s' % self.name)
        t.start()

    def cleanup(self):
        self.stop = True
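# A minimal client sketch (hypothetical host/port) exercising the line
# protocol implemented in listen_loop above.
import socket

with socket.create_connection(('127.0.0.1', 5000)) as conn:
    conn.sendall(b'42\n')        # sets the sensor status to 42
    print(conn.recv(1024))       # b'OK\n'
    conn.sendall(b'close\n')     # server drops the connection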
Example 10
class DensityGateOp(HasStrictTraits):
    """
    This module computes a gate based on a 2D density plot.  The user chooses
    what proportion of events to keep, and the module creates a gate that selects
    that proportion of events in the highest-density bins of the 2D density
    histogram.
    
    Attributes
    ----------
    name : Str
        The operation name; determines the name of the new metadata column
        
    xchannel : Str
        The X channel to apply the binning to.
        
    ychannel : Str
        The Y channel to apply the binning to.

    xscale : {"linear", "logicle", "log"} (default = "linear")
        Re-scale the data on the X axis before fitting the data?

    yscale : {"linear", "logicle", "log"} (default = "linear")
        Re-scale the data on the Y axis before fitting the data?  
        
    keep : Float (default = 0.9)
        What proportion of events to keep?  Must be ``>0`` and ``<1`` 
        
    bins : Int (default = 100)
        How many bins should there be on each axis?  Must be positive.
        
    min_quantile : Float (default = 0.001)
        Clip values below this quantile
        
    max_quantile : Float (default = 1.0)
        Clip values above this quantile

    sigma : Float (default = 1.0)
        What standard deviation to use for the gaussian blur?
    
    by : List(Str)
        A list of metadata attributes to aggregate the data before estimating
        the gate.  For example, if the experiment has two pieces of metadata,
        ``Time`` and ``Dox``, setting ``by = ["Time", "Dox"]`` will fit a 
        separate gate to each subset of the data with a unique combination of
        ``Time`` and ``Dox``.
        
    Notes
    -----
    This gating method was developed by John Sexton, in Jeff Tabor's lab at
    Rice University.  
    
    From http://taborlab.github.io/FlowCal/fundamentals/density_gate.html,
    the method is as follows:
    
    1. Determines the number of events to keep, based on the user specified 
       gating fraction and the total number of events of the input sample.
       
    2. Divides the 2D channel space into a rectangular grid, and counts the 
       number of events falling within each bin of the grid. The number of 
       counts per bin across all bins comprises a 2D histogram, which is a 
       coarse approximation of the underlying probability density function.
       
    3. Smoothes the histogram generated in Step 2 by applying a Gaussian Blur. 
       Theoretically, the proper amount of smoothing results in a better 
       estimate of the probability density function. Practically, smoothing 
       eliminates isolated bins with high counts, most likely corresponding to 
       noise, and smoothes the contour of the gated region.
       
    4. Selects the bins with the greatest number of events in the smoothed 
       histogram, starting with the highest and proceeding downward until the 
       desired number of events to keep, calculated in step 1, is achieved.
    
    Examples
    --------
    
    .. plot::
        :context: close-figs
        
        Make a little data set.
    
        >>> import cytoflow as flow
        >>> import_op = flow.ImportOp()
        >>> import_op.tubes = [flow.Tube(file = "Plate01/RFP_Well_A3.fcs",
        ...                              conditions = {'Dox' : 10.0}),
        ...                    flow.Tube(file = "Plate01/CFP_Well_A4.fcs",
        ...                              conditions = {'Dox' : 1.0})]
        >>> import_op.conditions = {'Dox' : 'float'}
        >>> ex = import_op.apply()
    
    Create and parameterize the operation.
    
    .. plot::
        :context: close-figs
        
        >>> dens_op = flow.DensityGateOp(name = 'Density',
        ...                              xchannel = 'FSC-A',
        ...                              xscale = 'log',
        ...                              ychannel = 'SSC-A',
        ...                              yscale = 'log',
        ...                              keep = 0.5)
        
    Find the bins to keep
    
    .. plot::
        :context: close-figs
        
        >>> dens_op.estimate(ex)
        
    Plot a diagnostic view
    
    .. plot::
        :context: close-figs
        
        >>> dens_op.default_view().plot(ex)
        
    Apply the gate
    
    .. plot::
        :context: close-figs
        
        >>> ex2 = dens_op.apply(ex)
        
    """
    
    id = Constant('edu.mit.synbio.cytoflow.operations.density')
    friendly_id = Constant("Density Gate")
    
    name = CStr()
    xchannel = Str()
    ychannel = Str()
    xscale = util.ScaleEnum
    yscale = util.ScaleEnum
    keep = util.PositiveFloat(0.9, allow_zero = False)
    bins = util.PositiveInt(100, allow_zero = False)
    min_quantile = util.PositiveFloat(0.001, allow_zero = True)
    max_quantile = util.PositiveFloat(1.0, allow_zero = False)
    sigma = util.PositiveFloat(1.0, allow_zero = False)
    by = List(Str)
        
    _xscale = Instance(util.IScale, transient = True)
    _yscale = Instance(util.IScale, transient = True)
    
    _xbins = Array(transient = True)
    _ybins = Array(transient = True)

    _keep_xbins = Dict(Any, Array, transient = True)
    _keep_ybins = Dict(Any, Array, transient = True)
    _histogram = Dict(Any, Array, transient = True)
    
    def estimate(self, experiment, subset = None):
        """
        Split the data set into bins and determine which ones to keep.
        
        Parameters
        ----------
        experiment : Experiment
            The :class:`.Experiment` to use to estimate the gate parameters.
            
        subset : Str (default = None)
            If set, determine the gate parameters on only a subset of the
            ``experiment`` parameter.
        """
        
        if experiment is None:
            raise util.CytoflowOpError('experiment',
                                       "No experiment specified")

        if self.xchannel not in experiment.data:
            raise util.CytoflowOpError('xchannel',
                                       "Column {0} not found in the experiment"
                                       .format(self.xchannel))
            
        if self.ychannel not in experiment.data:
            raise util.CytoflowOpError('ychannel',
                                       "Column {0} not found in the experiment"
                                       .format(self.ychannel))

        if self.min_quantile > 1.0:
            raise util.CytoflowOpError('min_quantile',
                                       "min_quantile must be <= 1.0")
            
        if self.max_quantile > 1.0:
            raise util.CytoflowOpError('max_quantile',
                                       "max_quantile must be <= 1.0")
               
        if not (self.max_quantile > self.min_quantile):
            raise util.CytoflowOpError('max_quantile',
                                       "max_quantile must be > min_quantile")
        
        if self.keep > 1.0:
            raise util.CytoflowOpError('keep',
                                       "keep must be <= 1.0")

        for b in self.by:
            if b not in experiment.conditions:
                raise util.CytoflowOpError('by',
                                           "Aggregation metadata {} not found, "
                                           "must be one of {}"
                                           .format(b, experiment.conditions))
                
        if subset:
            try:
                experiment = experiment.query(subset)
            except Exception:
                raise util.CytoflowOpError('subset',
                                            "Subset string '{0}' isn't valid"
                                            .format(subset))
                
            if len(experiment) == 0:
                raise util.CytoflowOpError('subset',
                                           "Subset string '{0}' returned no events"
                                           .format(subset))
                
        if self.by:
            groupby = experiment.data.groupby(self.by)
        else:
            # use a lambda expression to return a group that contains
            # all the events
            groupby = experiment.data.groupby(lambda _: True)
            
        # get the scale. estimate the scale params for the ENTIRE data set,
        # not subsets we get from groupby().  And we need to save it so that
        # the data is transformed the same way when we apply()
        self._xscale = xscale = util.scale_factory(self.xscale, experiment, channel = self.xchannel)
        self._yscale = yscale = util.scale_factory(self.yscale, experiment, channel = self.ychannel)
        

        xlim = (xscale.clip(experiment[self.xchannel].quantile(self.min_quantile)),
                xscale.clip(experiment[self.xchannel].quantile(self.max_quantile)))
                  
        ylim = (yscale.clip(experiment[self.ychannel].quantile(self.min_quantile)),
                yscale.clip(experiment[self.ychannel].quantile(self.max_quantile)))
        
        self._xbins = xbins = xscale.inverse(np.linspace(xscale(xlim[0]), 
                                                         xscale(xlim[1]), 
                                                         self.bins))
        self._ybins = ybins = yscale.inverse(np.linspace(yscale(ylim[0]), 
                                                         yscale(ylim[1]), 
                                                         self.bins))
                    
        for group, group_data in groupby:
            if len(group_data) == 0:
                raise util.CytoflowOpError('by',
                                           "Group {} had no data"
                                           .format(group))

            h, _, _ = np.histogram2d(group_data[self.xchannel], 
                                     group_data[self.ychannel], 
                                     bins=[xbins, ybins])
            
            h = scipy.ndimage.gaussian_filter(h, sigma = self.sigma)

            i = scipy.stats.rankdata(h, method = "ordinal") - 1
            i = np.unravel_index(np.argsort(-i), h.shape)

            goal_count = self.keep * len(group_data)
            curr_count = 0
            num_bins = 0

            while curr_count < goal_count and num_bins < i[0].size:
                curr_count += h[i[0][num_bins], i[1][num_bins]]
                num_bins += 1
                
            self._keep_xbins[group] = i[0][0:num_bins]
            self._keep_ybins[group] = i[1][0:num_bins]
            self._histogram[group] = h

            
    def apply(self, experiment):
        """
        Creates a new condition based on membership in the gate that was
        parameterized with :meth:`estimate`.
        
        Parameters
        ----------
        experiment : Experiment
            the :class:`.Experiment` to apply the gate to.
            
        Returns
        -------
        Experiment
            a new :class:`.Experiment` with the new gate applied.
        """
            
        if experiment is None:
            raise util.CytoflowOpError('experiment',
                                       "No experiment specified")
        
        if not self.xchannel:
            raise util.CytoflowOpError('xchannel',
                                       "Must set X channel")

        if not self.ychannel:
            raise util.CytoflowOpError('ychannel',
                                       "Must set Y channel")
        
        # make sure name got set!
        if not self.name:
            raise util.CytoflowOpError('name',
                                       "You have to set the gate's name "
                                       "before applying it!")

        if self.name in experiment.data.columns:
            raise util.CytoflowOpError('name',
                                       "Experiment already has a column named {0}"
                                       .format(self.name))
        
        if not (self._xbins.size and self._ybins.size and self._keep_xbins):
            raise util.CytoflowOpError(None,
                                       "No gate estimate found.  Did you forget to "
                                       "call estimate()?")

        if not self._xscale:
            raise util.CytoflowOpError(None,
                                       "Couldn't find _xscale.  What happened??")
        
        if not self._yscale:
            raise util.CytoflowOpError(None,
                                       "Couldn't find _yscale.  What happened??")

        if self.xchannel not in experiment.data:
            raise util.CytoflowOpError('xchannel',
                                       "Column {0} not found in the experiment"
                                       .format(self.xchannel))

        if self.ychannel not in experiment.data:
            raise util.CytoflowOpError('ychannel',
                                       "Column {0} not found in the experiment"
                                       .format(self.ychannel))
       
        for b in self.by:
            if b not in experiment.conditions:
                raise util.CytoflowOpError('by',
                                           "Aggregation metadata {} not found, "
                                           "must be one of {}"
                                           .format(b, experiment.conditions))
        
        if self.by:
            groupby = experiment.data.groupby(self.by)
        else:
            # use a lambda expression to return a group that
            # contains all the events
            groupby = experiment.data.groupby(lambda _: True)
            
        event_assignments = pd.Series([False] * len(experiment), dtype = "bool")
        
        for group, group_data in groupby:
            if group not in self._keep_xbins:
                # there weren't any events in this group, so we didn't get
                # an estimate
                continue
            
            group_idx = groupby.groups[group]
            
            cX = pd.cut(group_data[self.xchannel], self._xbins, include_lowest = True, labels = False)
            cY = pd.cut(group_data[self.ychannel], self._ybins, include_lowest = True, labels = False)

            group_keep = pd.Series([False] * len(group_data))
            
            keep_x = self._keep_xbins[group]
            keep_y = self._keep_ybins[group]
            
            for (xbin, ybin) in zip(keep_x, keep_y):
                group_keep = group_keep | ((cX == xbin) & (cY == ybin))
                            
            event_assignments.iloc[group_idx] = group_keep
                    
        new_experiment = experiment.clone()
        
        new_experiment.add_condition(self.name, "bool", event_assignments)

        new_experiment.history.append(self.clone_traits(transient = lambda _: True))
        return new_experiment
     
    def default_view(self, **kwargs):
        """
        Returns a diagnostic plot of the density gate.
         
        Returns
        -------
        IView
            a diagnostic view, call :meth:`~DensityGateView.plot` to see the 
            diagnostic plot.
        """
        v = DensityGateView(op = self)
        v.trait_set(**kwargs)
        return v
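# A condensed sketch (toy helper, not part of cytoflow) of steps 2-4 from the
# Notes above: bin the data, blur the histogram, then keep the densest bins
# until the requested fraction of events is covered.
import numpy as np
import scipy.ndimage

def densest_bins(x, y, bins=100, sigma=1.0, keep=0.9):
    h, _, _ = np.histogram2d(x, y, bins=bins)
    h = scipy.ndimage.gaussian_filter(h, sigma=sigma)
    # bin indices ordered from most dense to least dense
    order = np.unravel_index(np.argsort(h, axis=None)[::-1], h.shape)
    goal, count, n = keep * len(x), 0.0, 0
    while count < goal and n < order[0].size:
        count += h[order[0][n], order[1][n]]
        n += 1
    return order[0][:n], order[1][:n]   # (x-bin, y-bin) indices to keep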
Example 11
class PhysioData(HasTraits):
    """
    Contains the parameters needed to run a MEAP session
    """

    available_widgets = Instance(list)
    def _available_widgets_default(self):
        available_panels = ["Annotation"]
        if "dzdt" in self.contents and "z0" in self.contents:
            available_panels.append("ICG B Point")
        if "doppler" in self.contents:
            available_panels.append("Doppler")
        if self.dzdt_warping_functions.size > 0:
            available_panels.append("Registration")
        
        return available_panels


    contents = Property(Set)
    def _get_contents(self):
        """
        Assuming this object is already initialized, this trait
        will check for which data are available. For each signal
        type if the raw timeseries is available,
        """
        contents = set()
        for signal in ENSEMBLE_SIGNALS | set(('respiration',)):
            attr = signal+"_data"
            if not hasattr(self, attr): continue
            if getattr(self,attr).size > 0:
                contents.update((signal,))

        # Check for respiration-corrected versions of z0 and dzdt
        for signal in ["resp_corrected_z0", "resp_corrected_dzdt"]:
            if not hasattr(self, signal): continue
            if getattr(self,signal).size > 0:
                contents.update((signal,))

        return contents

    calculable_indexes = Property(Set)
    @cached_property
    def _get_calculable_indexes(self):
        """
        Determines, based on content, which indexes are possible
        to calculate.
        """
        # Signals
        has_ecg = "ecg" in self.contents
        has_z0 = "z0" in self.contents
        has_dzdt = "dzdt" in self.contents
        has_resp = "respiration" in self.contents
        has_systolic = "systolic" in self.contents
        has_diastolic = "diastolic" in self.contents
        has_bp = "bp" in self.contents
        has_resp_corrected_z0 = self.resp_corrected_z0.size > 0
        has_l = self.subject_l > 1


        # Indexes
        has_hr = False
        has_lvet = False
        has_sv = False
        has_map = False
        has_co = False

        ix = set()
        if has_ecg:
            has_hr = True
            ix.update(("hr","hrv"))
        if has_ecg and has_dzdt:
            has_lvet = True
            ix.update(("pep", "lvet", "eef"))
        if has_lvet and has_l and has_z0:
            has_sv = True
            ix.update(("sv",))
            if has_resp_corrected_z0:
                ix.update(("resp_corrected_sv",))
        if has_bp or (has_systolic and has_diastolic):
            has_map = True
            ix.update(("map",))
        if has_hr and has_sv:
            has_co = True
            ix.update(("co",))
            if has_resp_corrected_z0:
                ix.update(("resp_corrected_co",))
        if has_co and has_map:
            ix.update(("tpr",))
            if has_resp_corrected_z0:
                ix.update(("resp_corrected_tpr",))
        if has_resp:
            ix.update(("nbreaths"))
        return ix

    meap_version = CStr(__version__)
    original_file = File
    file_location = File

    # -- Censored Epochs --
    censored_intervals = Array
    censoring_sources = List

    censored_regions = Property(List)
    @cached_property
    def _get_censored_regions(self):
        censor_regions = []
        for signal in self.contents:
            censor_regions += getattr(self, signal+"_ts").censored_regions
        return censor_regions

    # MEA Weighting function
    mea_window_type = PrototypedFrom("config")
    mea_n_neighbors = PrototypedFrom("config")
    mea_window_secs = PrototypedFrom("config")
    mea_exp_power = PrototypedFrom("config")
    mea_func_name = PrototypedFrom("config")
    mea_weight_direction = PrototypedFrom("config")
    use_trimmed_co = PrototypedFrom("config")
    mea_smooth_hr = PrototypedFrom("config")
    mea_weights = Array

    use_secondary_heartbeat = PrototypedFrom("config")
    secondary_heartbeat = PrototypedFrom("config")
    secondary_heartbeat_pre_msec = PrototypedFrom("config")
    secondary_heartbeat_abs = PrototypedFrom("config")
    secondary_heartbeat_window = PrototypedFrom("config")
    secondary_heartbeat_window_len = PrototypedFrom("config")
    secondary_heartbeat_n_likelihood_bins = PrototypedFrom("config")

    use_ECG2 = PrototypedFrom("config")
    ecg2_weight = PrototypedFrom("config")
    qrs_signal_source = PrototypedFrom("config")

    # Bpoint classifier options
    bpoint_classifier_pre_point_msec = PrototypedFrom("config")
    bpoint_classifier_post_point_msec = PrototypedFrom("config")
    bpoint_classifier_sample_every_n_msec = PrototypedFrom("config")
    bpoint_classifier_false_distance_min = PrototypedFrom("config")
    bpoint_classifier_use_bpoint_prior = PrototypedFrom("config")
    bpoint_classifier_include_derivative = PrototypedFrom("config")
    # Contains errors in msec from bpoint cross validation
    bpoint_classifier_cv_error = Array

    # Points on doppler signal
    dx_point_type = PrototypedFrom("config")
    dx_point_window_len = PrototypedFrom("config")
    db_point_type = PrototypedFrom("config")
    db_point_window_len = PrototypedFrom("config")

    # Impedance Data
    z0_winsor_min = CFloat(0.005)
    z0_winsor_max = CFloat(0.005)
    z0_winsorize = CBool(False)
    z0_included = CBool(False)
    z0_decimated = CBool(False)
    z0_channel_name = CStr("")
    z0_sampling_rate = CFloat(1000)
    z0_sampling_rate_unit = CStr("Hz")
    z0_unit = CStr("Ohms")
    z0_start_time = CFloat(0.)
    z0_data = Array
    mea_z0_matrix = Array
    z0_matrix = Property(Array,depends_on="peak_indices")
    def _get_z0_matrix(self):
        if self.peak_indices.size == 0: return np.array([])
        return peak_stack(self.peak_indices,self.z0_data,
                          pre_msec=self.dzdt_pre_peak,post_msec=self.dzdt_post_peak,
                          sampling_rate=self.z0_sampling_rate)

    mea_resp_corrected_z0_matrix = Array
    resp_corrected_z0_matrix = Property(Array,depends_on="peak_indices")
    def _get_resp_corrected_z0_matrix(self):
        if self.peak_indices.size == 0 or self.resp_corrected_z0.size == 0:
            return np.array([])
        return peak_stack(self.peak_indices,self.resp_corrected_z0,
                          pre_msec=self.dzdt_pre_peak,post_msec=self.dzdt_post_peak,
                          sampling_rate=self.z0_sampling_rate)

    dzdt_winsor_min = CFloat(0.005)
    dzdt_winsor_max = CFloat(0.005)
    dzdt_winsorize = CBool(False)
    dzdt_included = CBool(False)
    dzdt_decimated = CBool(False)
    dzdt_channel_name = CStr("")
    dzdt_sampling_rate = CFloat(1000)
    dzdt_sampling_rate_unit = CStr("Hz")
    dzdt_unit = CStr("Ohms/Sec")
    dzdt_start_time = CFloat(0.)
    dzdt_data = Array
    dzdt_matrix = Property(Array,depends_on="peak_indices")
    mea_dzdt_matrix = Array
    @cached_property
    def _get_dzdt_matrix(self):
        logger.info("constructing dZ/dt matrix")
        if self.peak_indices.size == 0: return np.array([])
        return peak_stack(self.peak_indices,self.dzdt_data,
                          pre_msec=self.dzdt_pre_peak,post_msec=self.dzdt_post_peak,
                          sampling_rate=self.dzdt_sampling_rate)

    # Doppler radar
    doppler_winsor_min = CFloat(0.005)
    doppler_winsor_max = CFloat(0.005)
    doppler_winsorize = CBool(False)
    doppler_included = CBool(False)
    doppler_decimated = CBool(False)
    doppler_channel_name = CStr("")
    doppler_sampling_rate = CFloat(1000)
    doppler_sampling_rate_unit = CStr("Hz")
    doppler_unit = CStr("Ohms/Sec")
    doppler_start_time = CFloat(0.)
    doppler_data = Array
    doppler_matrix = Property(Array,depends_on="peak_indices")
    mea_doppler_matrix = Array
    @cached_property
    def _get_doppler_matrix(self):
        if self.peak_indices.size == 0: return np.array([])
        return peak_stack(self.peak_indices,self.doppler_data,
                          pre_msec=self.doppler_pre_peak,post_msec=self.doppler_post_peak,
                          sampling_rate=self.doppler_sampling_rate)

    # Respiration
    resp_corrected_dzdt_matrix = Property(Array,depends_on="peak_indices")
    mea_resp_corrected_dzdt_matrix = Array
    @cached_property
    def _get_resp_corrected_dzdt_matrix(self):
        if self.peak_indices.size == 0 or self.resp_corrected_dzdt.size == 0:
            return np.array([])
        return peak_stack(self.peak_indices,self.resp_corrected_dzdt,
                          pre_msec=self.dzdt_pre_peak,post_msec=self.dzdt_post_peak,
                          sampling_rate=self.dzdt_sampling_rate)


    # ECG
    ecg_included = CBool(False)
    ecg_winsor_min = CFloat(0.005)
    ecg_winsor_max = CFloat(0.005)
    ecg_winsorize = CBool(False)
    ecg_decimated = CBool(False)
    ecg_channel_name = CStr("")
    ecg_sampling_rate = CFloat(1000)
    ecg_sampling_rate_unit = CStr("Hz")
    ecg_unit = CStr("V")
    ecg_start_time = CFloat(0.)
    ecg_data = Array
    ecg_matrix = Property(Array,depends_on="peak_indices")
    mea_ecg_matrix = Array
    @cached_property
    def _get_ecg_matrix(self):
        if self.peak_indices.size == 0: return np.array([])
        return peak_stack(self.peak_indices,self.ecg_data,
                          pre_msec=self.ecg_pre_peak,post_msec=self.ecg_post_peak,
                          sampling_rate=self.ecg_sampling_rate)

    # ECG Secondary (eg from EEG)
    ecg2_included = CBool(False)
    ecg2_winsor_min = CFloat(0.005)
    ecg2_winsor_max = CFloat(0.005)
    ecg2_winsorize = CBool(False)
    ecg2_decimated = CBool(False)
    ecg2_channel_name = CStr("")
    ecg2_sampling_rate = CFloat(1000)
    ecg2_sampling_rate_unit = CStr("Hz")
    ecg2_unit = CStr("V")
    ecg2_start_time = CFloat(0.)
    ecg2_data = Array
    ecg2_matrix = Property(Array,depends_on="peak_indices")
    mea_ecg2_matrix = Array
    @cached_property
    def _get_ecg2_matrix(self):
        if self.peak_indices.size == 0: return np.array([])
        return peak_stack(self.peak_indices,self.ecg2_data,
                          pre_msec=self.ecg_pre_peak,post_msec=self.ecg_post_peak,
                          sampling_rate=self.ecg_sampling_rate)

    # Blood pressure might come from a CNAP
    using_continuous_bp = CBool(False)
    bp_included = CBool(False)
    bp_winsor_min = CFloat(0.005)
    bp_winsor_max = CFloat(0.005)
    bp_winsorize = CBool(False)
    bp_decimated = CBool(False)
    bp_channel_name = CStr("")
    bp_sampling_rate = CFloat(1000)
    bp_sampling_rate_unit = CStr("Hz")
    bp_unit = CStr("mmHg")
    bp_start_time = CFloat(0.)
    bp_data = Array
    bp_matrix = Property(Array,depends_on="peak_indices")
    mea_bp_matrix = Array
    @cached_property
    def _get_bp_matrix(self):
        return peak_stack(self.peak_indices,self.bp_data,
                          pre_msec=self.bp_pre_peak,post_msec=self.bp_post_peak,
                          sampling_rate=self.bp_sampling_rate)

    # Or two separate channels
    systolic_included = CBool(False)
    systolic_winsor_min = CFloat(0.005)
    systolic_winsor_max = CFloat(0.005)
    systolic_winsorize = CBool(False)
    systolic_decimated = CBool(False)
    systolic_channel_name = CStr("")
    systolic_sampling_rate = CFloat(1000)
    systolic_sampling_rate_unit = CStr("Hz")
    systolic_unit = CStr("mmHg")
    systolic_start_time = CFloat(0.)
    systolic_data = Array
    systolic_matrix = Property(Array,
                               depends_on="peak_indices,bp_pre_peak,bp_post_peak")
    mea_systolic_matrix = Array
    @cached_property
    def _get_systolic_matrix(self):
        if self.peak_indices.size == 0 or not ("systolic" in self.contents):
            return np.array([])
        return peak_stack(self.peak_indices,self.systolic_data,
                          pre_msec=self.bp_pre_peak,post_msec=self.bp_post_peak,
                          sampling_rate=self.bp_sampling_rate)

    diastolic_included = CBool(False)
    diastolic_winsor_min = CFloat(0.005)
    diastolic_winsor_max = CFloat(0.005)
    diastolic_winsorize = CBool(False)
    diastolic_decimated = CBool(False)
    diastolic_channel_name = CStr("")
    diastolic_sampling_rate = CFloat(1000)
    diastolic_sampling_rate_unit = CStr("Hz")
    diastolic_unit = CStr("Ohms")
    diastolic_start_time = CFloat(0.)
    diastolic_data = Array
    diastolic_matrix = Property(Array,
                                depends_on="peak_indices,bp_pre_peak,bp_post_peak")
    mea_diastolic_matrix = Array
    @cached_property
    def _get_diastolic_matrix(self):
        if self.peak_indices.size == 0 or not ("diastolic" in self.contents):
            return np.array([])
        return peak_stack(self.peak_indices,self.diastolic_data,
                          pre_msec=self.bp_pre_peak,post_msec=self.bp_post_peak,
                          sampling_rate=self.bp_sampling_rate)

    respiration_included = CBool(False)
    respiration_winsor_min = CFloat(0.005)
    respiration_winsor_max = CFloat(0.005)
    respiration_winsorize = CBool(False)
    respiration_decimated = CBool(False)
    respiration_channel_name = CStr("")
    respiration_sampling_rate = CFloat(1000)
    respiration_sampling_rate_unit = CStr("Hz")
    respiration_unit = CStr("Ohms")
    respiration_start_time = CFloat(0.)
    respiration_data = Array
    respiration_cycle = Array
    respiration_amount = Array
    resp_corrected_z0 = Array
    resp_corrected_dzdt = Array
    processed_respiration_data = Array
    processed_respiration_time = Array

    # -- Event marking signals (experiment and mri-related)
    mri_trigger_times = Array
    mri_trigger_included = CBool(False)
    mri_trigger_decimated = CBool(False)
    mri_trigger_channel_name = CStr("")
    mri_trigger_sampling_rate = CFloat(1000)
    mri_trigger_sampling_rate_unit = CStr("Hz")
    mri_trigger_unit = CStr("V")
    mri_trigger_start_time = CFloat(0.)
    event_names = List
    event_sampling_rate = CFloat(1000)
    event_included = CBool(True)
    event_decimated = CBool(False)
    event_start_time = CFloat(0.)
    event_sampling_rate_unit = CStr("Hz")
    event_unit = CStr("Hz")

    # -- results of peak detection
    peak_times = Array
    peak_indices = CArray(dtype=int)
    # Non-markable heartbeats
    dne_peak_times = Array
    dne_peak_indices = CArray(dtype=int)
    # Any custom labels for heartbeats go here
    hand_labeled = Instance(np.ndarray) # One entry per detected beat; nonzero marks custom-labeled beats
    def _hand_labeled_default(self):
        return np.zeros_like(self.peak_indices)
    # Is the beat usable for analysis?
    usable = Instance(np.ndarray)
    def _usable_default(self):
        return np.ones(len(self.peak_indices), dtype=int)

    p_indices = Instance(np.ndarray)
    def _p_indices_default(self):
        return np.zeros_like(self.peak_indices)
    q_indices = Instance(np.ndarray)
    def _q_indices_default(self):
        return np.zeros_like(self.peak_indices)
    r_indices = Instance(np.ndarray)
    def _r_indices_default(self):
        return np.zeros_like(self.peak_indices)
    s_indices = Instance(np.ndarray)
    def _s_indices_default(self):
        return np.zeros_like(self.peak_indices)
    t_indices = Instance(np.ndarray)
    def _t_indices_default(self):
        return np.zeros_like(self.peak_indices)
    b_indices = Instance(np.ndarray)
    def _b_indices_default(self):
        return np.zeros_like(self.peak_indices)
    c_indices = Instance(np.ndarray)
    def _c_indices_default(self):
        return np.zeros_like(self.peak_indices)
    x_indices = Instance(np.ndarray)
    def _x_indices_default(self):
        return np.zeros_like(self.peak_indices)
    o_indices = Instance(np.ndarray)
    def _o_indices_default(self):
        return np.zeros_like(self.peak_indices)
    systole_indices = Instance(np.ndarray)
    def _systole_indices_default(self):
        return np.zeros_like(self.peak_indices)
    diastole_indices = Instance(np.ndarray)
    def _diastole_indices_default(self):
        return np.zeros_like(self.peak_indices)

    # Indices for doppler
    db_indices = Instance(np.ndarray)
    def _db_indices_default(self):
        return np.zeros_like(self.peak_indices)
    dx_indices = Instance(np.ndarray)
    def _dx_indices_default(self):
        return np.zeros_like(self.peak_indices)

    # Holds B points in the Karcher modes
    karcher_b_indices = Instance(np.ndarray)
    def _karcher_b_indices_default(self):
        return np.zeros(self.n_modes)
    
    # --- Subject information
    subject_age = CFloat(0.)
    subject_gender = Enum("M","F")
    subject_weight = CFloat(0.,label="Weight (lbs)")
    subject_height_ft = Int(0,label="Height (ft)",
                            desc="Subject's height in feet")
    subject_height_in = Int(0,label = "Height (in)",
                            desc="Subject's height in inches")
    subject_electrode_distance_front = CFloat(0.,
                                              label="Impedance electrode distance (front)")
    subject_electrode_distance_back = CFloat(0.,
                                             label="Impedance electrode distance (back)")
    subject_electrode_distance_right = CFloat(0.,
                                              label="Impedance electrode distance (right)")
    subject_electrode_distance_left = CFloat(0.,
                                             label="Impedance electrode distance (left)")
    subject_resp_max = CFloat(0.,label="Respiration circumference max (cm)")
    subject_resp_min = CFloat(0.,label="Respiration circumference min (cm)")
    subject_in_mri = CBool(False,label="Subject was in MRI scanner")
    subject_control_base_impedance = CFloat(0.,label="Control Impedance",
                                            desc="If in MRI, store the z0 value from outside the MRI")

    subject_l = Property(CFloat,depends_on=
                         "subject_electrode_distance_front," + \
                         "subject_electrode_distance_back," + \
                         "subject_electrode_distance_right," + \
                         "subject_electrode_distance_left," + \
                         "subject_height_ft"
                         )
    @cached_property
    def _get_subject_l(self):
        """
        Uses information from the subject measurements to define the
        l variable for calculating stroke volume.

        if left and right electrode distances are provided, use the average
        if front and back electrode distances are provided, use the average
        if subject height in feet and inches is provided, use the estimate of
             l = 0.17 * height
        Otherwise return the first measurement found in front,back,left,right
        If nothing is found, returns 1

        """
        front = self.subject_electrode_distance_front
        back = self.subject_electrode_distance_back
        left = self.subject_electrode_distance_left
        right = self.subject_electrode_distance_right
        if left > 0 and right > 0:
            return (left + right) / 2.
        if front > 0 and back > 0:
            return (front + back) / 2.
        if self.subject_height_ft > 0:
            return (12*self.subject_height_ft + \
                    self.subject_height_in) * 2.54 * 0.17
        for measure in (front, back, left, right):
            if measure > 0.: return measure
        return 1
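    # Worked example (illustrative): with no electrode distances recorded and
    # a height of 5 ft 10 in, l = (12*5 + 10) * 2.54 * 0.17 ≈ 30.2 cm.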

    # --- From the global configuration
    config = Instance(MEAPConfig)
    apply_ecg_smoothing = PrototypedFrom("config")
    ecg_smoothing_window_len = PrototypedFrom("config")
    apply_imp_smoothing = PrototypedFrom("config")
    imp_smoothing_window_len = PrototypedFrom("config")
    apply_bp_smoothing = PrototypedFrom("config")
    bp_smoothing_window_len = PrototypedFrom("config")
    regress_out_resp = PrototypedFrom("config")

    # parameters for processing the raw data before Pan-Tompkins detection
    peak_detection_algorithm = PrototypedFrom("config")
    # Pan-Tompkins parameters
    qrs_source_signal = Enum("ecg", "ecg2")
    bandpass_min = PrototypedFrom("config")
    bandpass_max = PrototypedFrom("config")
    smoothing_window_len = PrototypedFrom("config")
    smoothing_window = PrototypedFrom("config")
    pt_adjust = PrototypedFrom("config")
    peak_threshold = PrototypedFrom("config")
    apply_filter = PrototypedFrom("config")
    apply_diff_sq = PrototypedFrom("config")
    apply_smooth_ma = PrototypedFrom("config")
    peak_window = PrototypedFrom("config")

    # Parameters for waveform extraction
    ecg_pre_peak = PrototypedFrom("config")
    ecg_post_peak = PrototypedFrom("config")
    dzdt_pre_peak = PrototypedFrom("config")
    dzdt_post_peak = PrototypedFrom("config")
    bp_pre_peak = PrototypedFrom("config")
    bp_post_peak = PrototypedFrom("config")
    systolic_pre_peak = PrototypedFrom("config")
    systolic_post_peak = PrototypedFrom("config")
    diastolic_pre_peak = PrototypedFrom("config")
    diastolic_post_peak = PrototypedFrom("config")
    doppler_pre_peak = PrototypedFrom("config")
    doppler_post_peak = PrototypedFrom("config")
    stroke_volume_equation = PrototypedFrom("config")

    # parameters for respiration analysis
    process_respiration = PrototypedFrom("config")
    resp_polort = PrototypedFrom("config")
    resp_high_freq_cutoff = PrototypedFrom("config")
    resp_inhale_begin_times = Array
    resp_exhale_begin_times = Array

    # Time points of the global ensemble average
    ens_avg_ecg_signal = Array
    ens_avg_dzdt_signal = Array
    ens_avg_bp_signal = Array
    ens_avg_systolic_signal = Array
    ens_avg_diastolic_signal = Array
    ens_avg_doppler_signal = Array
    ens_avg_p_time = CFloat
    ens_avg_q_time = CFloat
    ens_avg_r_time = CFloat
    ens_avg_s_time = CFloat
    ens_avg_t_time = CFloat
    ens_avg_b_time = CFloat
    ens_avg_db_time = CFloat
    ens_avg_dx_time = CFloat
    ens_avg_c_time = CFloat
    ens_avg_x_time = CFloat
    ens_avg_y_time = CFloat
    ens_avg_o_time = CFloat
    ens_avg_systole_time = CFloat
    ens_avg_diastole_time = CFloat
    using_hand_marked_point_priors = CBool(False)

    censored_secs_before = Array
    # MEA Physio timeseries
    lvet = Array
    co = Array
    resp_corrected_co = Array
    pep = Array
    sv = Array
    resp_corrected_sv = Array
    map = Array
    systolic = Array
    diastolic = Array
    hr = Array
    mea_hr = Array
    tpr = Array
    resp_corrected_tpr = Array

    def _config_default(self):
        return MEAPConfig()


    # SRVF-warping parameters
    srvf_lambda = PrototypedFrom("config")
    srvf_max_karcher_iterations = PrototypedFrom("config")
    srvf_update_min = PrototypedFrom("config")
    srvf_karcher_mean_subset_size = PrototypedFrom("config")
    srvf_multi_mode_variance_cutoff = PrototypedFrom("config")
    srvf_use_moving_ensembled = PrototypedFrom("config")
    dzdt_num_inputs_to_group_warping = PrototypedFrom("config")
    srvf_t_min = PrototypedFrom("config")
    srvf_t_max = PrototypedFrom("config")
    bspline_before_warping = PrototypedFrom("config")
    dzdt_srvf_karcher_mean = Array
    dzdt_karcher_mean = Array
    dzdt_karcher_mean_time = Array
    dzdt_warping_functions = Array
    dzdt_functions_to_warp = Array
    
    # Holds data related to initial karcher mean
    dzdt_karcher_mean_inputs = Array
    dzdt_karcher_mean_over_iterations = Array
    srvf_iteration_distances = Array
    srvf_iteration_energy = Array
    
    # Data related to the multiple modes
    n_modes = PrototypedFrom("config")
    max_kmeans_iterations = PrototypedFrom("config")
    mode_dzdt_karcher_means = Array
    mode_cluster_assignment = Array 
    mode_dzdt_srvf_karcher_means = Array

    # Storing and accessing the bpoint classifier
    bpoint_classifier_file = File

    def save(self, outfile):
        # Collect matfile-friendly attributes, skipping timeseries traits,
        # UI-only traits and empty containers
        tmp = tempfile.NamedTemporaryFile()
        save_attrs = []
        for k in self.editable_traits():
            if k.endswith("ts"):
                continue
            if k in ("available_widgets", "bpoint_classifier",
                     "bpoint_classifier_file", "censored_regions",
                     "event_names"):
                continue
            v = getattr(self, k)
            if isinstance(v, np.ndarray) and v.size == 0:
                continue
            if isinstance(v, set):
                continue
            save_attrs.append(k)
        savedict = dict((k, getattr(self, k)) for k in save_attrs
                        if getattr(self, k) is not None)
        savedict["censoring_sources"] = np.array(self.censoring_sources)
        for evt in self.event_names:
            savedict[evt] = getattr(self, evt)
        savedict["event_names"] = np.array(self.event_names)
        # Trial-save each key to a temp file so one un-serializable value
        # can be reported without aborting the whole save
        for k, v in savedict.items():
            try:
                savemat(tmp, {k: v}, long_field_names=True)
            except Exception as e:
                logger.warning("unable to save %s because of %s", k, e)
        tmp.close()
        try:
            savemat(outfile, savedict, long_field_names=True)
        except Exception as e:
            messagebox("Failed to save %s:\n\n%s" % (outfile, e))
Esempio n. 12
0
class BinningOp(HasStrictTraits):
    """
    Bin data along an axis.
    
    This operation creates equally spaced bins (in linear or log space)
    along an axis and adds a condition assigning each event to a bin.  The
    value of the event's condition is the left end of the bin's interval in
    which the event is located.
    
    Attributes
    ----------
    name : Str
        The operation name.  Used to name the new metadata field in the
        experiment that's created by apply()
        
    channel : Str
        The name of the channel along which to bin.

    scale : {"linear", "log", "logicle"}
        Make the bins equidistant along what scale?
        
    num_bins : Int
        The number of bins to make.  Must set either :attr:`num_bins` or 
        :attr:`bin_width`. If both are defined, :attr:`num_bins` takes precedence.
        
    bin_width : Float
        The width of the bins.  Must set either :attr:`num_bins` or :attr:`bin_width`.  If
        :attr:`scale` is ``log``, :attr:`bin_width` is in log-10 units; if :attr:`scale` is
        ``logicle``, an error is thrown because the units are ill-defined.
        If both :attr:`num_bins` and :attr:`bin_width` are defined, :attr:`num_bins` takes 
        precedence. 
        
    bin_count_name : Str
        If :attr:`bin_count_name` is set, :meth:`apply` adds another column to 
        the resulting :class:`Experiment` that contains the number of events in 
        the bin that this event falls in.  Useful for filtering bins by number of events.
        
    Examples
    --------
    Create a small experiment:
    
    .. plot::
        :context: close-figs
    
        >>> import cytoflow as flow
        >>> import_op = flow.ImportOp()
        >>> import_op.tubes = [flow.Tube(file = "tasbe/rby.fcs")]
        >>> ex = import_op.apply()
    
    Create and parameterize the operation
    
    .. plot::
        :context: close-figs

        >>> bin_op = flow.BinningOp()
        >>> bin_op.name = "Bin"
        >>> bin_op.channel = "FITC-A"
        >>> bin_op.scale = "log"
        >>> bin_op.bin_width = 0.2
    
    Apply the operation to the experiment
    
    .. plot::
        :context: close-figs 
    
        >>> ex2 = bin_op.apply(ex)
    
    Plot the result
    
    .. plot::
        :context: close-figs

        >>> bin_op.default_view().plot(ex2)  

    """

    # traits
    id = Constant('edu.mit.synbio.cytoflow.operations.binning')
    friendly_id = Constant("Binning")

    name = CStr()
    bin_count_name = CStr()
    channel = Str()
    num_bins = util.PositiveInt(0, allow_zero=True)
    bin_width = util.PositiveFloat(0, allow_zero=True)
    scale = util.ScaleEnum

    _max_num_bins = Int(100)

    def apply(self, experiment):
        """
        Applies the binning to an experiment.
        
        Parameters
        ----------
        experiment : Experiment
            the experiment to which this op is applied
            
        Returns
        -------
        Experiment
            A new experiment with a condition column named :attr:`name`, which
            contains the location of the left-most edge of the bin that the
            event is in.  If :attr:`bin_count_name` is set, another column
            is added with that name as well, containing the number of events
            in the same bin as the event.

        """
        if experiment is None:
            raise util.CytoflowOpError('experiment', "no experiment specified")

        if not self.name:
            raise util.CytoflowOpError('name', "Name is not set")

        if self.name in experiment.data.columns:
            raise util.CytoflowOpError(
                'name',
                "Name {} is in the experiment already".format(self.name))

        if self.bin_count_name and self.bin_count_name in experiment.data.columns:
            raise util.CytoflowOpError(
                'bin_count_name',
                "bin_count_name {} is in the experiment already".format(
                    self.bin_count_name))

        if not self.channel:
            raise util.CytoflowOpError('channel', "channel is not set")

        if self.channel not in experiment.data.columns:
            raise util.CytoflowOpError(
                'channel',
                "channel {} isn't in the experiment".format(self.channel))

        if not self.num_bins and not self.bin_width:
            raise util.CytoflowOpError('num_bins',
                                       "must set either bin number or width")

        if self.bin_width \
           and not (self.scale == "linear" or self.scale == "log"):
            raise util.CytoflowOpError(
                'scale', "Can only use bin_width with linear or log scale")

        scale = util.scale_factory(self.scale,
                                   experiment,
                                   channel=self.channel)
        scaled_data = scale(experiment.data[self.channel])

        scaled_min = bn.nanmin(scaled_data)
        scaled_max = bn.nanmax(scaled_data)

        num_bins = self.num_bins if self.num_bins else \
                   int((scaled_max - scaled_min) / self.bin_width)

        if num_bins > self._max_num_bins:
            raise util.CytoflowOpError(
                None, "Too many bins! To increase this limit, "
                "change _max_num_bins (currently {})".format(
                    self._max_num_bins))

        scaled_bins = np.linspace(start=scaled_min,
                                  stop=scaled_max,
                                  num=num_bins)

        if len(scaled_bins) < 2:
            raise util.CytoflowOpError('num_bins',
                                       "Must have more than one bin")

        # put the data in bins
        bin_idx = np.digitize(scaled_data, scaled_bins[1:-1])

        # now, back into data space
        bins = scale.inverse(scaled_bins)

        new_experiment = experiment.clone()
        new_experiment.add_condition(self.name, "float", bins[bin_idx])

        # keep track of the bins we used, for prettier plotting later.
        new_experiment.metadata[self.name]["bin_scale"] = self.scale
        new_experiment.metadata[self.name]["bins"] = bins

        if self.bin_count_name:
            # TODO - this is a HUGE memory hog?!
            # TODO - fix this, then turn it on by default
            agg_count = new_experiment.data.groupby(self.name).count()
            agg_count = agg_count[agg_count.columns[0]]

            # have to make the condition a float64, because if we're in log
            # space there may be events that have NaN as the bin number.

            new_experiment.add_condition(
                self.bin_count_name, "float64",
                new_experiment[self.name].map(agg_count))

        new_experiment.history.append(
            self.clone_traits(transient=lambda _: True))
        return new_experiment

    def default_view(self, **kwargs):
        """
        Returns a diagnostic plot to check the binning.
        
        Returns
        -------
        IView
            A view instance; call :meth:`plot()` to plot the bins.
        """
        return BinningView(op=self, **kwargs)
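
To make the bin_width path of apply() concrete, here is a minimal sketch in
plain numpy (no cytoflow required; the data values are made up) of how
equally spaced edges are laid out on a log scale and each event is mapped to
the left edge of its bin:

import numpy as np

data = np.array([10., 100., 1000., 10000.])
scaled = np.log10(data)                      # the "log" scale
bin_width = 0.5                              # in log10 units
num_bins = int((scaled.max() - scaled.min()) / bin_width)
edges = np.linspace(scaled.min(), scaled.max(), num=num_bins)

# digitize against the interior edges, as apply() does
bin_idx = np.digitize(scaled, edges[1:-1])
left_edges = 10 ** edges                     # back to data space
print(left_edges[bin_idx])                   # each event's bin label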
Esempio n. 13
0
class PolygonOp(HasStrictTraits):
    """
    Apply a polygon gate to a cytometry experiment.
    
    Attributes
    ----------
    name : Str
        The operation name.  Used to name the new metadata field in the
        experiment that's created by :meth:`apply`
        
    xchannel, ychannel : Str
        The names of the x and y channels to apply the gate.
        
    xscale, yscale : {'linear', 'log', 'logicle'} (default = 'linear')
        The scales applied to the data before drawing the polygon.
        
    vertices : List((Float, Float))
        The polygon vertices.  An ordered list of 2-tuples, representing
        the x and y coordinates of the vertices.
        
    Notes
    -----
    This module uses :class:`matplotlib.path.Path` to represent the polygon, because
    membership testing is very fast.
    
    You can set the vertices by hand, I suppose, but it's much easier to use
    the interactive view you get from :meth:`default_view` to do so.

    
    Examples
    --------
    
    .. plot::
        :context: close-figs
        
        Make a little data set.
    
        >>> import cytoflow as flow
        >>> import_op = flow.ImportOp()
        >>> import_op.tubes = [flow.Tube(file = "Plate01/RFP_Well_A3.fcs",
        ...                              conditions = {'Dox' : 10.0}),
        ...                    flow.Tube(file = "Plate01/CFP_Well_A4.fcs",
        ...                              conditions = {'Dox' : 1.0})]
        >>> import_op.conditions = {'Dox' : 'float'}
        >>> ex = import_op.apply()
    
    Create and parameterize the operation.
    
    .. plot::
        :context: close-figs
        
        >>> p = flow.PolygonOp(name = "Polygon",
        ...                    xchannel = "V2-A",
        ...                    ychannel = "Y2-A")
        >>> p.vertices = [(23.411982294776319, 5158.7027015021222), 
        ...               (102.22182270573683, 23124.058843387455), 
        ...               (510.94519955277201, 23124.058843387455), 
        ...               (1089.5215641232173, 3800.3424832180476), 
        ...               (340.56382570202402, 801.98947404942271), 
        ...               (65.42597937575897, 1119.3133482602157)]

        
    Show the default view.  

    .. plot::
        :context: close-figs
            
        >>> df = p.default_view(huefacet = "Dox",
        ...                    xscale = 'log',
        ...                    yscale = 'log')
        
        >>> df.plot(ex)
        
    
    .. note::
       If you want to use the interactive default view in a Jupyter notebook,
       make sure you say ``%matplotlib notebook`` in the first cell 
       (instead of ``%matplotlib inline`` or similar).  Then call 
       ``default_view()`` with ``interactive = True``::
       
           df = p.default_view(huefacet = "Dox",
                               xscale = 'log',
                               yscale = 'log',
                               interactive = True)
           df.plot(ex)
        
    Apply the gate, and show the result
    
    .. plot::
        :context: close-figs
        
        >>> ex2 = p.apply(ex)
        >>> ex2.data.groupby('Polygon').size()
        Polygon
        False    15875
        True      4125
        dtype: int64
            
    """

    # traits
    id = Constant('edu.mit.synbio.cytoflow.operations.polygon')
    friendly_id = Constant("Polygon")

    name = CStr()
    xchannel = Str()
    ychannel = Str()
    vertices = List((Float, Float))

    xscale = util.ScaleEnum()
    yscale = util.ScaleEnum()

    _selection_view = Instance('PolygonSelection', transient=True)

    def apply(self, experiment):
        """Applies the threshold to an experiment.
        
        Parameters
        ----------
        experiment : Experiment
            the old :class:`Experiment` to which this op is applied
            
        Returns
        -------
        Experiment
            a new :class:`Experiment`, the same as ``old_experiment`` but with 
            a new column of type ``bool`` with the same name as the operation.  
            The bool is ``True`` if the event's measurement is within the 
            polygon, and ``False`` otherwise.
            
        Raises
        ------
        util.CytoflowOpError
            if for some reason the operation can't be applied to this
            experiment. The reason is in :attr:`.CytoflowOpError.args`
        """

        if experiment is None:
            raise util.CytoflowOpError('experiment', "No experiment specified")

        if self.name in experiment.data.columns:
            raise util.CytoflowOpError(
                'name', "{} is in the experiment already!".format(self.name))

        if not self.xchannel:
            raise util.CytoflowOpError('xchannel', "Must specify an x channel")

        if not self.ychannel:
            raise util.CytoflowOpError('ychannel', "Must specify a y channel")

        if self.xchannel not in experiment.channels:
            raise util.CytoflowOpError(
                'xchannel',
                "xchannel {0} is not in the experiment".format(self.xchannel))

        if self.ychannel not in experiment.channels:
            raise util.CytoflowOpError(
                'ychannel',
                "ychannel {0} is not in the experiment".format(self.ychannel))

        if len(self.vertices) < 3:
            raise util.CytoflowOpError('vertices',
                                       "Must have at least 3 vertices")

        if any([len(x) != 2 for x in self.vertices]):
            raise util.CytoflowOpError(
                'vertices', "All vertices must be lists or tuples "
                "of length = 2")

        # make sure name got set!
        if not self.name:
            raise util.CytoflowOpError(
                'name', "You have to set the Polygon gate's name "
                "before applying it!")

        # there's a bit of a subtlety here: if the vertices were
        # selected with an interactive plot, and that plot had scaled
        # axes, we need to apply that scale function to both the
        # vertices and the data before looking for path membership
        xscale = util.scale_factory(self.xscale,
                                    experiment,
                                    channel=self.xchannel)
        yscale = util.scale_factory(self.yscale,
                                    experiment,
                                    channel=self.ychannel)

        vertices = [(xscale(x), yscale(y)) for (x, y) in self.vertices]
        data = experiment.data[[self.xchannel, self.ychannel]].copy()
        data[self.xchannel] = xscale(data[self.xchannel])
        data[self.ychannel] = yscale(data[self.ychannel])

        # use a matplotlib Path because testing for membership is a fast C fn.
        path = mpl.path.Path(np.array(vertices))
        xy_data = data[[self.xchannel, self.ychannel]].values

        new_experiment = experiment.clone()
        new_experiment.add_condition(self.name, "bool",
                                     path.contains_points(xy_data))
        new_experiment.history.append(
            self.clone_traits(transient=lambda _: True))

        return new_experiment

    def default_view(self, **kwargs):
        self._selection_view = PolygonSelection(op=self)
        self._selection_view.trait_set(**kwargs)
        return self._selection_view
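
The Notes above say that membership testing with matplotlib's Path is fast;
here is a minimal sketch (hypothetical triangle and points) of the same
contains_points call that apply() relies on:

import numpy as np
import matplotlib.path as mpath

# an open vertex list is implicitly closed for the membership test
triangle = mpath.Path(np.array([(0., 0.), (1., 0.), (0.5, 1.)]))
points = np.array([(0.5, 0.25),   # inside
                   (0.9, 0.9)])   # outside
print(triangle.contains_points(points))  # [ True False]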
Esempio n. 14
0
class GaussianMixture2DOp(HasStrictTraits):
    """
    This module fits a 2D Gaussian mixture model with a specified number of
    components to a pair of channels.
    
    .. warning:: 
    
        :class:`GaussianMixture2DOp` is **DEPRECATED** and will be removed
        in a future release.  It doesn't correctly handle the case where an 
        event is present in more than one component.  Please use
        :class:`GaussianMixtureOp` instead!
    
    Creates a new categorical metadata variable named :attr:`name`, with possible
    values ``name_1`` .... ``name_n`` where ``n`` is the number of components.
    An event is assigned to the ``name_i`` category if it falls within :attr:`sigma`
    standard deviations of the component's mean.  If that is true for multiple
    categories (or if :attr:`sigma` is ``0.0``), the event is assigned to the category 
    with the highest posterior probability.  If the event doesn't fall into
    any category, it is assigned to ``name_None``.
    
    As a special case, if :attr:`num_components` is ``1`` and :attr:`sigma` 
    ``> 0.0``, then the new condition is boolean, ``True`` if the event fell 
    in the gate and ``False`` otherwise.
    
    Optionally, if :attr:`posteriors` is ``True``, this module will also compute the 
    posterior probability of each event in its assigned component, returning
    it in a new column named ``{Name}_Posterior``.
    
    Finally, the same mixture model (mean and standard deviation) may not
    be appropriate for every subset of the data.  If this is the case, you
    can use the :attr:`by` attribute to specify metadata by which to aggregate
    the data before estimating (and applying) a mixture model.  The number of 
    components is the same across each subset, though.
    
    Attributes
    ----------
    name : Str
        The operation name; determines the name of the new metadata column
        
    xchannel : Str
        The X channel to apply the mixture model to.
        
    ychannel : Str
        The Y channel to apply the mixture model to.

    xscale : {"linear", "logicle", "log"} (default = "linear")
        Re-scale the data on the X axis before fitting the data?  

    yscale : {"linear", "logicle", "log"} (default = "linear")
        Re-scale the data on the Y axis before fitting the data?  
        
    num_components : Int (default = 1)
        How many components to fit to the data?  Must be positive.

    sigma : Float (default = 0.0)
        How many standard deviations on either side of the mean to include
        in each category?  If an event is in multiple components, assign it
        to the component with the highest posterior probability.  If 
        :attr:`sigma` is ``0.0``, categorize *all* the data by assigning each event to
        the component with the highest posterior probability.  Must be ``>= 0.0``.
    
    by : List(Str)
        A list of metadata attributes to aggregate the data before estimating
        the model.  For example, if the experiment has two pieces of metadata,
        ``Time`` and ``Dox``, setting :attr:`by` to ``["Time", "Dox"]`` will fit
        the model separately to each subset of the data with a unique combination of
        ``Time`` and ``Dox``.

    posteriors : Bool (default = False)
        If ``True``, add a column named ``{Name}_Posterior`` giving the posterior
        probability that the event is in the component to which it was
        assigned.  Useful for filtering out low-probability events.

    
    Examples
    --------
    
    .. plot::
        :context: close-figs
        
        Make a little data set.
    
        >>> import cytoflow as flow
        >>> import_op = flow.ImportOp()
        >>> import_op.tubes = [flow.Tube(file = "Plate01/RFP_Well_A3.fcs",
        ...                              conditions = {'Dox' : 10.0}),
        ...                    flow.Tube(file = "Plate01/CFP_Well_A4.fcs",
        ...                              conditions = {'Dox' : 1.0})]
        >>> import_op.conditions = {'Dox' : 'float'}
        >>> ex = import_op.apply()
    
    Create and parameterize the operation.
    
    .. plot::
        :context: close-figs
        
        >>> gm_op = flow.GaussianMixture2DOp(name = 'Flow',
        ...                          xchannel = 'V2-A',
        ...                          xscale = 'log',
        ...                          ychannel = 'Y2-A',
        ...                          yscale = 'log',
        ...                          num_components = 2)
        
    Estimate the clusters
    
    .. plot::
        :context: close-figs
        
        >>> gm_op.estimate(ex)
        
    Plot a diagnostic view with the distributions
    
    .. plot::
        :context: close-figs
        
        >>> gm_op.default_view().plot(ex)

    Apply the gate
    
    .. plot::
        :context: close-figs
        
        >>> ex2 = gm_op.apply(ex)

    Plot a diagnostic view with the event assignments
    
    .. plot::
        :context: close-figs
        
        >>> gm_op.default_view().plot(ex2)

    """

    id = Constant('edu.mit.synbio.cytoflow.operations.gaussian_2d')
    friendly_id = Constant("2D Gaussian Mixture")

    name = CStr()
    xchannel = Str()
    ychannel = Str()
    xscale = util.ScaleEnum
    yscale = util.ScaleEnum
    num_components = util.PositiveInt
    sigma = util.PositiveFloat(0.0, allow_zero=True)
    by = List(Str)

    posteriors = Bool(False)

    # the key is either a single value or a tuple
    _gmms = Dict(Any, Instance(mixture.GaussianMixture), transient=True)
    _xscale = Instance(util.IScale, transient=True)
    _yscale = Instance(util.IScale, transient=True)

    def estimate(self, experiment, subset=None):
        """
        Estimate the Gaussian mixture model parameters.
        
        Parameters
        ----------
        experiment : Experiment
            The data to use to estimate the mixture parameters
            
        subset : str (default = None)
            If set, a Python expression to determine the subset of the data
            to use in the estimation.
        """

        warn(
            "GaussianMixture2DOp is DEPRECATED.  Please use GaussianMixtureOp.",
            util.CytoflowOpWarning)

        if experiment is None:
            raise util.CytoflowOpError('experiment', "No experiment specified")

        if self.xchannel not in experiment.data:
            raise util.CytoflowOpError(
                'xchannel',
                "Column {0} not found in the experiment".format(self.xchannel))

        if self.ychannel not in experiment.data:
            raise util.CytoflowOpError(
                'ychannel',
                "Column {0} not found in the experiment".format(self.ychannel))

        for b in self.by:
            if b not in experiment.data:
                raise util.CytoflowOpError(
                    'by', "Aggregation metadata {} not found, "
                    "must be one of {}".format(b, experiment.conditions))

        if self.num_components == 1 and self.posteriors:
            raise util.CytoflowOpError(
                'posteriors', "If num_components == 1, all posteriors are 1.")

        if subset:
            try:
                experiment = experiment.query(subset)
            except Exception as e:
                raise util.CytoflowOpError(
                    'subset',
                    "Subset string '{0}' isn't valid".format(subset)) from e

            if len(experiment) == 0:
                raise util.CytoflowOpError(
                    'subset',
                    "Subset string '{0}' returned no events".format(subset))

        if self.by:
            groupby = experiment.data.groupby(self.by)
        else:
            # use a lambda expression to return a group that contains
            # all the events
            groupby = experiment.data.groupby(lambda _: True)

        # get the scale. estimate the scale params for the ENTIRE data set,
        # not subsets we get from groupby().  And we need to save it so that
        # the data is transformed the same way when we apply()
        self._xscale = util.scale_factory(self.xscale,
                                          experiment,
                                          channel=self.xchannel)
        self._yscale = util.scale_factory(self.yscale,
                                          experiment,
                                          channel=self.ychannel)

        gmms = {}

        for group, data_subset in groupby:
            if len(data_subset) == 0:
                raise util.CytoflowOpError(
                    None, "Group {} had no data".format(group))
            x = data_subset.loc[:, [self.xchannel, self.ychannel]]
            x[self.xchannel] = self._xscale(x[self.xchannel])
            x[self.ychannel] = self._yscale(x[self.ychannel])

            # drop data that isn't in the scale range
            x = x[~(np.isnan(x[self.xchannel]) | np.isnan(x[self.ychannel]))]
            x = x.values

            gmm = mixture.GaussianMixture(n_components=self.num_components,
                                          covariance_type="full",
                                          random_state=1)
            gmm.fit(x)

            if not gmm.converged_:
                raise util.CytoflowOpError(
                    None, "Estimator didn't converge"
                    " for group {0}".format(group))

            # in the 1D version, we sort the components by the means -- so
            # the first component has the lowest mean, the second component
            # has the next-lowest mean, etc.  that doesn't work in a 2D area,
            # obviously.

            # instead, we assume that the clusters are likely (?) to be
            # arranged along *one* of the axes, so we take the |norm| of the
            # x,y mean of each cluster and sort that way.

            norms = (gmm.means_[:, 0]**2 + gmm.means_[:, 1]**2)**0.5
            sort_idx = np.argsort(norms)
            gmm.means_ = gmm.means_[sort_idx]
            gmm.weights_ = gmm.weights_[sort_idx]
            gmm.covariances_ = gmm.covariances_[sort_idx]

            gmms[group] = gmm

        self._gmms = gmms

    def apply(self, experiment):
        """
        Assigns new metadata to events using the mixture model estimated
        in :meth:`estimate`.
        
        Returns
        -------
        Experiment
            A new :class:`.Experiment` with a column named :attr:`name` and
            optionally one named :attr:`name` ``_Posterior``.  Also includes
            the following new statistics:

            - **xmean** : Float
                the mean of the fitted gaussian in the x dimension.
                
            - **ymean** : Float
                the mean of the fitted gaussian in the y dimension.
                
            - **proportion** : Float
                the proportion of events in each component of the mixture model.  only
                set if :attr:`num_components` ``> 1``.
        
            PS -- if someone has good ideas for summarizing spread in a 2D (non-isotropic)
            Gaussian, or other useful statistics, let me know!

        """

        warn(
            "GaussianMixture2DOp is DEPRECATED.  Please use GaussianMixtureOp.",
            util.CytoflowOpWarning)

        if experiment is None:
            raise util.CytoflowOpError('experiment', "No experiment specified")

        if not self.xchannel:
            raise util.CytoflowOpError('xchannel', "Must set X channel")

        if not self.ychannel:
            raise util.CytoflowOpError('ychannel', "Must set Y channel")

        # make sure name got set!
        if not self.name:
            raise util.CytoflowOpError(
                'name', "You have to set the gate's name "
                "before applying it!")

        if self.name in experiment.data.columns:
            raise util.CytoflowOpError(
                'name',
                "Experiment already has a column named {0}".format(self.name))

        if not self._gmms:
            raise util.CytoflowOpError(
                None, "No components found.  Did you forget to "
                "call estimate()?")

        if not self._xscale:
            raise util.CytoflowOpError(
                None, "Couldn't find _xscale.  What happened??")

        if not self._yscale:
            raise util.CytoflowOpError(
                None, "Couldn't find _yscale.  What happened??")

        if self.xchannel not in experiment.data:
            raise util.CytoflowOpError(
                'xchannel',
                "Column {0} not found in the experiment".format(self.xchannel))

        if self.ychannel not in experiment.data:
            raise util.CytoflowOpError(
                'ychannel',
                "Column {0} not found in the experiment".format(self.ychannel))

        if self.posteriors:
            col_name = "{0}_Posterior".format(self.name)
            if col_name in experiment.data:
                raise util.CytoflowOpError(
                    'channels',
                    "Column {0} already found in the experiment".format(
                        col_name))

        for b in self.by:
            if b not in experiment.data:
                raise util.CytoflowOpError(
                    'by', "Aggregation metadata {} not found, "
                    "must be one of {}".format(b, experiment.conditions))

        if self.sigma < 0.0:
            raise util.CytoflowOpError('sigma', "sigma must be >= 0.0")

        event_assignments = pd.Series([None] * len(experiment), dtype="object")

        if self.posteriors:
            event_posteriors = pd.Series([0.0] * len(experiment))

        # what we DON'T want to do is iterate through event-by-event.
        # the more of this we can push into numpy, sklearn and pandas,
        # the faster it's going to be.  for example, this is why
        # we don't use Ellipse.contains().

        if self.by:
            groupby = experiment.data.groupby(self.by)
        else:
            # use a lambda expression to return a group that
            # contains all the events
            groupby = experiment.data.groupby(lambda _: True)

        for group, data_subset in groupby:
            if group not in self._gmms:
                # there weren't any events in this group, so we didn't get
                # a gmm.
                continue

            gmm = self._gmms[group]
            x = data_subset.loc[:, [self.xchannel, self.ychannel]]
            x[self.xchannel] = self._xscale(x[self.xchannel])
            x[self.ychannel] = self._yscale(x[self.ychannel])

            # which values are missing?
            x_na = np.isnan(x[self.xchannel]) | np.isnan(x[self.ychannel])
            x_na = x_na.values

            x = x.values
            group_idx = groupby.groups[group]

            # make a preliminary assignment
            predicted = np.full(len(x), -1, "int")
            predicted[~x_na] = gmm.predict(x[~x_na])

            # if we're doing sigma-based gating, for each component check
            # to see if the event is in the sigma gate.
            if self.sigma > 0.0:

                # make a quick dataframe with the value and the predicted
                # component
                gate_df = pd.DataFrame({
                    "x": x[:, 0],
                    "y": x[:, 1],
                    "p": predicted
                })

                # for each component, get the ellipse that follows the isoline
                # around the mixture component
                # cf. http://scikit-learn.org/stable/auto_examples/mixture/plot_gmm.html
                # and http://www.mathworks.com/matlabcentral/newsreader/view_thread/298389
                # and http://stackoverflow.com/questions/7946187/point-and-ellipse-rotated-position-test-algorithm
                # i am not proud of how many tries this took me to get right.

                for c in range(0, self.num_components):
                    mean = gmm.means_[c]
                    covar = gmm.covariances_[c]

                    # xc is the center on the x axis
                    # yc is the center on the y axis
                    xc = mean[0]  # @UnusedVariable
                    yc = mean[1]  # @UnusedVariable

                    v, w = linalg.eigh(covar)
                    u = w[0] / linalg.norm(w[0])

                    # xl is the length along the x axis
                    # yl is the length along the y axis
                    xl = np.sqrt(v[0]) * self.sigma  # @UnusedVariable
                    yl = np.sqrt(v[1]) * self.sigma  # @UnusedVariable

                    # t is the rotation in radians (counter-clockwise)
                    t = 2 * np.pi - np.arctan(u[1] / u[0])

                    sin_t = np.sin(t)  # @UnusedVariable
                    cos_t = np.cos(t)  # @UnusedVariable

                    # and build an expression with numexpr so it evaluates fast!

                    gate_bool = gate_df.eval(
                        "p == @c and "
                        "((x - @xc) * @cos_t - (y - @yc) * @sin_t) ** 2 / ((@xl / 2) ** 2) + "
                        "((x - @xc) * @sin_t + (y - @yc) * @cos_t) ** 2 / ((@yl / 2) ** 2) <= 1"
                    ).values

                    predicted[np.logical_and(predicted == c, ~gate_bool)] = -1

            predicted_str = pd.Series(["(none)"] * len(predicted))
            for c in range(0, self.num_components):
                predicted_str[predicted == c] = "{0}_{1}".format(
                    self.name, c + 1)
            predicted_str[predicted == -1] = "{0}_None".format(self.name)
            predicted_str.index = group_idx

            event_assignments.iloc[group_idx] = predicted_str

            if self.posteriors:
                probability = np.full((len(x), self.num_components), 0.0,
                                      "float")
                probability[~x_na, :] = gmm.predict_proba(x[~x_na, :])
                posteriors = pd.Series([0.0] * len(predicted))
                for c in range(0, self.num_components):
                    posteriors[predicted == c] = probability[predicted == c, c]
                posteriors.index = group_idx
                event_posteriors.iloc[group_idx] = posteriors

        new_experiment = experiment.clone()

        if self.num_components == 1 and self.sigma > 0:
            new_experiment.add_condition(
                self.name, "bool",
                event_assignments == "{0}_1".format(self.name))
        elif self.num_components > 1:
            new_experiment.add_condition(self.name, "category",
                                         event_assignments)

        if self.posteriors and self.num_components > 1:
            col_name = "{0}_Posterior".format(self.name)
            new_experiment.add_condition(col_name, "float", event_posteriors)

        # add the statistics
        levels = list(self.by)
        if self.num_components > 1:
            levels.append(self.name)

        if levels:
            idx = pd.MultiIndex.from_product(
                [new_experiment[x].unique() for x in levels], names=levels)

            xmean_stat = pd.Series(index=idx,
                                   dtype=np.dtype(object)).sort_index()
            ymean_stat = pd.Series(index=idx,
                                   dtype=np.dtype(object)).sort_index()
            prop_stat = pd.Series(index=idx,
                                  dtype=np.dtype(object)).sort_index()

            for group, _ in groupby:
                gmm = self._gmms[group]
                for c in range(self.num_components):
                    if self.num_components > 1:
                        component_name = "{}_{}".format(self.name, c + 1)

                        if group is True:
                            g = [component_name]
                        elif isinstance(group, tuple):
                            g = list(group)
                            g.append(component_name)
                        else:
                            g = list([group])
                            g.append(component_name)

                        if len(g) > 1:
                            g = tuple(g)
                        else:
                            g = g[0]
                    else:
                        g = group

                    xmean_stat.loc[g] = self._xscale.inverse(gmm.means_[c][0])
                    ymean_stat.loc[g] = self._yscale.inverse(gmm.means_[c][1])
                    prop_stat.loc[g] = gmm.weights_[c]

            new_experiment.statistics[(self.name,
                                       "xmean")] = pd.to_numeric(xmean_stat)
            new_experiment.statistics[(self.name,
                                       "ymean")] = pd.to_numeric(ymean_stat)
            if self.num_components > 1:
                new_experiment.statistics[(
                    self.name, "proportion")] = pd.to_numeric(prop_stat)

        new_experiment.history.append(
            self.clone_traits(transient=lambda _: True))
        return new_experiment

    def default_view(self, **kwargs):
        """
        Returns a diagnostic plot of the Gaussian mixture model.
        
        Returns
        -------
            IView : an IView, call :meth:`~GaussianMixture2DView.plot` to see 
            the diagnostic plot.
        
        """

        warn(
            "GaussianMixture1DOp is DEPRECATED.  Please use GaussianMixtureOp.",
            util.CytoflowOpWarning)

        return GaussianMixture2DView(op=self, **kwargs)
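
The sigma-gating block in apply() builds a rotated-ellipse test with
gate_df.eval().  A compact way to express a sigma-level Gaussian gate like
this is a Mahalanobis-distance threshold; below is a minimal sketch of that
formulation (plain numpy/scipy with made-up values, not the module's own
code):

import numpy as np
from scipy import linalg

def in_sigma_gate(points, mean, covar, sigma):
    # True for rows of `points` whose Mahalanobis distance from the
    # component mean is at most sigma.
    delta = points - mean                   # (n, 2)
    sol = linalg.solve(covar, delta.T)      # Sigma^-1 (p - mu), shape (2, n)
    d2 = np.einsum('ij,ji->i', delta, sol)  # squared Mahalanobis distances
    return d2 <= sigma ** 2

mean = np.array([0., 0.])
covar = np.array([[2., 0.5], [0.5, 1.]])
pts = np.array([[0.1, 0.1], [5., 5.]])
print(in_sigma_gate(pts, mean, covar, sigma=2.))  # [ True False]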
Esempio n. 15
0
class BeadCalibrationOp(HasStrictTraits):
    """
    Calibrate arbitrary channels to molecules-of-fluorophore using fluorescent
    beads (eg, the Spherotech RCP-30-5A rainbow beads.)
    
    To use, set the `beads_file` property to an FCS file containing the beads'
    events; specify which beads you ran by setting the `beads_type` property
    to match one of the values of BeadCalibrationOp.BEADS; and set the
    `units` dict to which channels you want calibrated and in which units.
    Then, call `estimate()` and check the peak-finding with 
    `default_view().plot()`.  If the peak-finding is wacky, try adjusting
    `bead_peak_quantile` and `bead_brightness_threshold`.  When the peaks are
    successfully identified, call apply() on your experimental data set. 
    
    If you can't make the peak finding work, please submit a bug report!
    
    This procedure works best when the beads file is very clean data.  It does
    not do its own gating (maybe a future addition?)  In the meantime, 
    I recommend gating the *acquisition* on the FSC/SSC channels in order
    to get rid of debris, cells, and other noise.
    
    Finally, because you can't have a negative number of fluorescent molecules
    (MEFLs, etc) (as well as for math reasons), this module filters out
    negative values.
    
    Attributes
    ----------
    name : Str
        The operation name (for UI representation.)

    units : Dict(Str, Str)
        A dictionary specifying the channels you want calibrated (keys) and
        the units you want them calibrated in (values).  The units must be
        keys of the `beads` attribute.       
        
    beads_file : File
        A file containing the FCS events from the beads.  Must be set to use
        `estimate()`.  This isn't persisted by `pickle()`.

    beads : Dict(Str, List(Float))
        The beads' characteristics.  Keys are calibrated units (ie, MEFL or
        MEAP) and values are ordered lists of known fluorophore levels.  Common
        values for this dict are included in BeadCalibrationOp.BEADS.
        Must be set to use `estimate()`.
        
    bead_peak_quantile : Int
        The quantile threshold used to choose bead peaks.  Default == 80.
        Must be set to use `estimate()`.
        
    bead_brightness_threshold : Float
        How bright must a bead peak be to be considered?  Default == 100.
        Must be set to use `estimate()`.
        
    Notes
    -----
    The peak finding is rather sophisticated.  
    
    For each channel, a 256-bin histogram is computed on the log-transformed
    bead data, and then the histogram is smoothed with a Savitzky-Golay 
    filter (with a window length of 5 and a polynomial order of 1).  
    
    Next, a wavelet-based peak-finding algorithm is used: it convolves the
    smoothed histogram with a series of wavelets and looks for relative 
    maxima at various length-scales.  The parameters of the smoothing 
    algorithm were arrived at empirically, using beads collected at a wide 
    range of PMT voltages.
    
    Finally, the peaks are filtered by height (the histogram bin has a quantile
    greater than `bead_peak_quantile`) and intensity (brighter than 
    `bead_brightness_threshold`).
    
    How to convert from a series of peaks to mean equivalent fluorochrome?
    If there's one peak, we assume that it's the brightest peak.  If there
    are two peaks, we assume they're the brightest two.  If there are n >= 3
    peaks, we check all the contiguous n-subsets of the bead intensities
    and find the one whose linear regression (in log space!) has the smallest
    norm (square-root sum-of-squared-residuals.)
    
    There's a slight subtlety in the fact that we're performing the linear
    regression in log-space: if the relationship in log10-space is Y=aX + b,
    then the same relationship in linear space is x = 10**X, y = 10**Y, and
    y = (10**b) * (x ** a).
    
    One more thing.  Because the beads are (log) evenly spaced across all
    the channels, we can directly compute the fluorophore equivalent in channels
    where we wouldn't usually measure that fluorophore: for example, you can
    compute MEFL (mean equivalent fluorescein) in the PE-Texas Red channel,
    because the bead peak pattern is the same in the PE-Texas Red channel
    as it would be in the FITC channel.
    
    Examples
    --------
    >>> bead_op = flow.BeadCalibrationOp()
    >>> bead_op.beads = flow.BeadCalibrationOp.BEADS["Spherotech RCP-30-5A Lot AA01-AA04, AB01, AB02, AC01, GAA01-R"]
    >>> bead_op.units = {"Pacific Blue-A" : "MEFL",
                         "FITC-A" : "MEFL",
                         "PE-Tx-Red-YG-A" : "MEFL"}
    >>>
    >>> bead_op.beads_file = "beads.fcs"
    >>> bead_op.estimate(ex3)
    >>>
    >>> bead_op.default_view().plot(ex3)  
    >>> # check the plot!
    >>>
    >>> ex4 = bead_op.apply(ex3)  
    """

    # traits
    id = Constant('edu.mit.synbio.cytoflow.operations.beads_calibrate')
    friendly_id = Constant("Bead Calibration")

    name = CStr()
    units = Dict(Str, Str)

    beads_file = File(transient=True)
    bead_peak_quantile = Int(80)

    bead_brightness_threshold = Float(100)
    # TODO - bead_brightness_threshold should probably be different depending
    # on the data range of the input.

    beads = Dict(Str, List(Float), transient=True)

    #_coefficients = Dict(Str, Python)
    _calibration_functions = Dict(Str, Python)

    def estimate(self, experiment, subset=None):
        """
        Estimate the calibration coefficients from the beads file.
        """
        if not experiment:
            raise util.CytoflowOpError("No experiment specified")

        if not set(self.units.keys()) <= set(experiment.channels):
            raise util.CytoflowOpError(
                "Specified channels that weren't found in "
                "the experiment.")

        if not set(self.units.values()) <= set(self.beads.keys()):
            raise util.CytoflowOpError("Units don't match beads.")

        beads_data = parse_tube(self.beads_file, experiment)
        channels = self.units.keys()

        for channel in channels:
            data = beads_data[channel]

            # TODO - this assumes the data is on a linear scale.  check it!

            # bin the data on a log scale
            data_range = experiment.metadata[channel]['range']
            hist_bins = np.logspace(1,
                                    math.log(data_range, 2),
                                    num=256,
                                    base=2)
            hist = np.histogram(data, bins=hist_bins)

            # mask off-scale values
            hist[0][0] = 0
            hist[0][-1] = 0

            # smooth it with a Savitzky-Golay filter
            hist_smooth = scipy.signal.savgol_filter(hist[0], 5, 1)

            # find peaks
            peak_bins = scipy.signal.find_peaks_cwt(
                hist_smooth,
                widths=np.arange(3, 20),
                max_distances=np.arange(3, 20) / 2)

            # filter by height and intensity
            peak_threshold = np.percentile(hist_smooth,
                                           self.bead_peak_quantile)
            peak_bins_filtered = \
                [x for x in peak_bins if hist_smooth[x] > peak_threshold
                 and hist[1][x] > self.bead_brightness_threshold]

            peaks = [hist_bins[x] for x in peak_bins_filtered]

            mef_unit = self.units[channel]

            if mef_unit not in self.beads:
                raise util.CytoflowOpError(
                    "Invalid unit {0} specified for channel {1}".format(
                        mef_unit, channel))

            # "mean equivalent fluorochrome"
            mef = self.beads[mef_unit]

            if len(peaks) == 0:
                raise util.CytoflowOpError(
                    "Didn't find any peaks; check the diagnostic plot")
            elif len(peaks) > len(self.beads):
                raise util.CytoflowOpError(
                    "Found too many peaks; check the diagnostic plot")
            elif len(peaks) == 1:
                # if we only have one peak, assume it's the brightest peak
                a = mef[-1] / peaks[0]
                self._calibration_functions[channel] = lambda x, a=a: a * x
            elif len(peaks) == 2:
                # if we have only two peaks, assume they're the brightest two
                a = (mef[-1] - mef[-2]) / (peaks[1] - peaks[0])
                self._calibration_functions[channel] = lambda x, a=a: a * x
            else:
                # if there are n > 2 peaks, check all the contiguous n-subsets
                # of mef for the one whose linear regression with the peaks
                # has the smallest (norm) sum-of-residuals.

                # do it in log10 space because otherwise the brightest peaks
                # have an outsized influence.

                best_resid = np.inf
                for start, end in [(x, x + len(peaks))
                                   for x in range(len(mef) - len(peaks) + 1)]:
                    mef_subset = mef[start:end]

                    # linear regression of the peak locations against mef subset
                    lr = np.polyfit(np.log10(peaks),
                                    np.log10(mef_subset),
                                    deg=1,
                                    full=True)

                    resid = lr[1][0]
                    if resid < best_resid:
                        best_lr = lr[0]
                        best_resid = resid

                # remember, these (linear) coefficients came from logspace, so
                # if the relationship in log10 space is Y = aX + b, then in
                # linear space the relationship is x = 10**X, y = 10**Y,
                # and y = (10**b) * x ^ a

                # also remember that the result of np.polyfit is a list of
                # coefficients with the highest power first!  so if we
                # solve y=ax + b, coeff #0 is a and coeff #1 is b

                a = best_lr[0]
                b = 10**best_lr[1]
                self._calibration_functions[channel] = \
                    lambda x, a=a, b=b: b * np.power(x, a)

    def apply(self, experiment):
        """Applies the bleedthrough correction to an experiment.
        
        Parameters
        ----------
        old_experiment : Experiment
            the experiment to which this op is applied
            
        Returns
        -------
            a new experiment calibrated in physical units.
        """
        if not experiment:
            raise util.CytoflowOpError("No experiment specified")

        channels = self.units.keys()

        if not self.units:
            raise util.CytoflowOpError("Units not specified.")

        if not self._calibration_functions:
            raise util.CytoflowOpError("Calibration not found. "
                                       "Did you forget to call estimate()?")

        if not set(channels) <= set(experiment.channels):
            raise util.CytoflowOpError(
                "Module units don't match experiment channels")

        if set(channels) != set(self._calibration_functions.keys()):
            raise util.CytoflowOpError("Calibration doesn't match units. "
                                       "Did you forget to call estimate()?")

        # two things.  first, you can't raise a negative value to a non-integer
        # power.  second, negative physical units don't make sense -- how can
        # you have the equivalent of -5 molecules of fluorescein?  so,
        # we filter out negative values here.

        new_experiment = experiment.clone()

        for channel in channels:
            new_experiment.data = \
                new_experiment.data[new_experiment.data[channel] > 0]

        new_experiment.data.reset_index(drop=True, inplace=True)

        for channel in channels:
            calibration_fn = self._calibration_functions[channel]

            new_experiment[channel] = calibration_fn(new_experiment[channel])
            new_experiment.metadata[channel][
                'bead_calibration_fn'] = calibration_fn
            new_experiment.metadata[channel]['units'] = self.units[channel]
            if 'range' in experiment.metadata[channel]:
                new_experiment.metadata[channel]['range'] = calibration_fn(
                    experiment.metadata[channel]['range'])

        new_experiment.history.append(self.clone_traits())
        return new_experiment

    def default_view(self, **kwargs):
        """
        Returns a diagnostic plot to see if the bead peak estimation
        is working.
        
        Returns
        -------
            IView : An IView, call plot() to see the diagnostic plots
        """

        return BeadCalibrationDiagnostic(op=self, **kwargs)

    BEADS = {
        # from http://www.spherotech.com/RCP-30-5a%20%20rev%20H%20ML%20071712.xls
        "Spherotech RCP-30-5A Lot AG01, AF02, AD04 and AAE01": {
            "MECSB": [216, 464, 1232, 2940, 7669, 19812, 35474],
            "MEBFP": [861, 1997, 5776, 15233, 45389, 152562, 396759],
            "MEFL": [792, 2079, 6588, 16471, 47497, 137049, 271647],
            "MEPE": [531, 1504, 4819, 12506, 36159, 109588, 250892],
            "MEPTR": [233, 669, 2179, 5929, 18219, 63944, 188785],
            "MECY": [1614, 4035, 12025, 31896, 95682, 353225, 1077421],
            "MEPCY7": [14916, 42336, 153840, 494263],
            "MEAP": [373, 1079, 3633, 9896, 28189, 79831, 151008],
            "MEAPCY7": [2864, 7644, 19081, 37258]
        },
        # from http://www.spherotech.com/RCP-30-5a%20%20rev%20G.2.xls
        "Spherotech RCP-30-5A Lot AA01-AA04, AB01, AB02, AC01, GAA01-R": {
            "MECSB": [179, 400, 993, 3203, 6083, 17777, 36331],
            "MEBFP": [700, 1705, 4262, 17546, 35669, 133387, 412089],
            "MEFL": [692, 2192, 6028, 17493, 35674, 126907, 290983],
            "MEPE": [505, 1777, 4974, 13118, 26757, 94930, 250470],
            "MEPTR": [207, 750, 2198, 6063, 12887, 51686, 170219],
            "MECY": [1437, 4693, 12901, 36837, 76621, 261671, 1069858],
            "MEPCY7": [32907, 107787, 503797],
            "MEAP": [587, 2433, 6720, 17962, 30866, 51704, 146080],
            "MEAPCY7": [718, 1920, 5133, 9324, 14210, 26735]
        }
    }
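
The Notes above describe how n >= 3 peaks are matched to bead levels: slide a
length-n window over the known values, regress in log10 space, keep the
window with the smallest residual, and convert the fit back to linear space.
Here is a minimal sketch in plain numpy (the peak locations are made up; the
mef list is the MEFL row from the BEADS table above):

import numpy as np

mef = [692, 2192, 6028, 17493, 35674, 126907, 290983]  # known bead values
peaks = [110., 310., 850.]                 # hypothetical detected peaks

best_resid, best_lr = np.inf, None
for start in range(len(mef) - len(peaks) + 1):
    subset = mef[start:start + len(peaks)]
    # linear regression in log10 space, keeping the residual
    lr = np.polyfit(np.log10(peaks), np.log10(subset), deg=1, full=True)
    if lr[1][0] < best_resid:
        best_resid, best_lr = lr[1][0], lr[0]

# log10(y) = a*log10(x) + b  <=>  y = (10**b) * x**a
a, b = best_lr[0], 10 ** best_lr[1]
calibrate = lambda x: b * np.power(x, a)
print(a, b, calibrate(310.))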
Esempio n. 16
0
class Range2DOp(HasStrictTraits):
    """Apply a 2D range gate to a cytometry experiment.
    
    Attributes
    ----------
    name : Str
        The operation name.  Used to name the new metadata field in the
        experiment that's created by apply()
        
    xchannel : Str
        The name of the first channel to apply the range gate.
        
    xlow : Float
        The lowest value in xchannel to include in this gate.
        
    xhigh : Float
        The highest value in xchannel to include in this gate.
        
    ychannel : Str
        The name of the second channel to apply the range gate.
        
    ylow : Float
        The lowest value in ychannel to include in this gate.
        
    yhigh : Float
        The highest value in ychannel to include in this gate.
        
    Examples
    --------
    
    >>> range_2d = flow.Range2DOp(xchannel = "V2-A",
    ...                           xlow = 0.0,
    ...                           xhigh = 0.5,
    ...                           ychannel = "Y2-A",
    ...                           ylow = 0.4,
    ...                           yhigh = 0.8)
    >>> ex3 = range_2d.apply(ex2)

    Alternatively, in an IPython notebook with `%matplotlib notebook`
    
    >>> rv = range_2d.default_view()
    >>> rv.plot(ex2)
    >>> ### draw a box on the plot in the notebook ### 
    """

    # traits
    id = Constant('edu.mit.synbio.cytoflow.operations.range2d')
    friendly_id = Constant("2D Range")

    name = CStr()

    xchannel = Str()
    xlow = CFloat()
    xhigh = CFloat()

    ychannel = Str()
    ylow = CFloat()
    yhigh = CFloat()

    def apply(self, experiment):
        """Applies the threshold to an experiment.
        
        Parameters
        ----------
        experiment : Experiment
            the old_experiment to which this op is applied
            
        Returns
        -------
            a new experiment, the same as old_experiment but with a new
            boolean column named the same as the operation name.  The bool
            is True if the event's measurement in self.xchannel is between
            self.xlow and self.xhigh AND its measurement in self.ychannel
            is between self.ylow and self.yhigh; it is False otherwise.
        """

        # make sure name got set!
        if not self.name:
            raise util.CytoflowOpError("You have to set the gate's name "
                                       "before applying it!")

        # make sure old_experiment doesn't already have a column named self.name
        if (self.name in experiment.data.columns):
            raise util.CytoflowOpError(
                "Experiment already contains a column {0}".format(self.name))

        if not self.xchannel or not self.ychannel:
            raise util.CytoflowOpError("Must specify xchannel and ychannel")

        if not self.xchannel in experiment.channels:
            raise util.CytoflowOpError("xchannel isn't in the experiment")

        if not self.ychannel in experiment.channels:
            raise util.CytoflowOpError("ychannel isn't in the experiment")

        if self.xhigh <= experiment[self.xchannel].min():
            raise util.CytoflowOpError(
                "x channel range high must be > {0}".format(
                    experiment[self.xchannel].min()))
        if self.xlow >= experiment[self.xchannel].max():
            raise util.CytoflowOpError(
                "x channel range low must be < {0}".format(
                    experiment[self.xchannel].max()))

        if self.yhigh <= experiment[self.ychannel].min():
            raise util.CytoflowOpError(
                "y channel range high must be > {0}".format(
                    experiment[self.ychannel].min()))
        if self.ylow >= experiment[self.ychannel].max():
            raise util.CytoflowOpError(
                "y channel range low must be < {0}".format(
                    experiment[self.ychannel].max()))

        x = experiment[self.xchannel].between(self.xlow, self.xhigh)
        y = experiment[self.ychannel].between(self.ylow, self.yhigh)
        gate = pd.Series(x & y)

        new_experiment = experiment.clone()
        new_experiment.add_condition(self.name, "bool", gate)
        new_experiment.history.append(self.clone_traits())
        return new_experiment

    def default_view(self, **kwargs):
        """Returns a diagnostic view of the gate; call plot() to draw it."""
        return RangeSelection2D(op=self, **kwargs)
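
The gating logic in apply() is just two pandas between() masks combined with a logical AND. A self-contained sketch of it on synthetic data (column names and ranges are arbitrary):

import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
data = pd.DataFrame({"V2-A": rng.uniform(0, 1, 1000),
                     "Y2-A": rng.uniform(0, 1, 1000)})

x = data["V2-A"].between(0.0, 0.5)   # xlow <= value <= xhigh
y = data["Y2-A"].between(0.4, 0.8)   # ylow <= value <= yhigh
gate = x & y                         # True only if both ranges are satisfied

print(gate.sum(), "of", len(gate), "events pass the gate")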
Example 17
class PCAOp(HasStrictTraits):
    """
    Use principal components analysis (PCA) to decompose a multivariate data
    set into orthogonal components that explain a maximum amount of variance.
    
    Call :meth:`estimate` to compute the optimal decomposition.
      
    Calling :meth:`apply` creates new "channels" named ``{name}_1 ... {name}_n``,
    where ``name`` is the :attr:`name` attribute and ``n`` is :attr:`num_components`.

    The same decomposition may not be appropriate for different subsets of the data set.
    If this is the case, you can use the :attr:`by` attribute to specify 
    metadata by which to aggregate the data before estimating (and applying) a 
    model.  The PCA parameters, such as the number of components, are the
    same across each subset, though.

    Attributes
    ----------
    name : Str
        The operation name; determines the name of the new columns.
        
    channels : List(Str)
        The channels to apply the decomposition to.

    scale : Dict(Str : {"linear", "logicle", "log"})
        Re-scale the data in the specified channels before fitting.  If a 
        channel is in :attr:`channels` but not in :attr:`scale`, the current 
        package-wide default (set with :func:`.set_default_scale`) is used.

    num_components : Int (default = 2)
        How many components to fit to the data?  Must be a positive integer.
    
    by : List(Str)
        A list of metadata attributes to aggregate the data before estimating
        the model.  For example, if the experiment has two pieces of metadata,
        ``Time`` and ``Dox``, setting :attr:`by` to ``["Time", "Dox"]`` will 
        fit the model separately to each subset of the data with a unique 
        combination of ``Time`` and ``Dox``.
        
    whiten : Bool (default = False)
        Scale each component to unit variance?  May be useful if you will
        be using unsupervised clustering (such as K-means).

    Examples
    --------
    
    .. plot::
        :context: close-figs
        
        Make a little data set.
    
        >>> import cytoflow as flow
        >>> import_op = flow.ImportOp()
        >>> import_op.tubes = [flow.Tube(file = "Plate01/RFP_Well_A3.fcs",
        ...                              conditions = {'Dox' : 10.0}),
        ...                    flow.Tube(file = "Plate01/CFP_Well_A4.fcs",
        ...                              conditions = {'Dox' : 1.0})]
        >>> import_op.conditions = {'Dox' : 'float'}
        >>> ex = import_op.apply()
    
    Create and parameterize the operation.
    
    .. plot::
        :context: close-figs
        
        >>> pca = flow.PCAOp(name = 'PCA',
        ...                  channels = ['V2-A', 'V2-H', 'Y2-A', 'Y2-H'],
        ...                  scale = {'V2-A' : 'log',
        ...                           'V2-H' : 'log',
        ...                           'Y2-A' : 'log',
        ...                           'Y2-H' : 'log'},
        ...                  num_components = 2,
        ...                  by = ["Dox"])
        
    Estimate the decomposition
    
    .. plot::
        :context: close-figs
        
        >>> pca.estimate(ex)
        
    Apply the operation
    
    .. plot::
        :context: close-figs
        
        >>> ex2 = pca.apply(ex)

    Plot a scatterplot of the PCA.  Compare to a scatterplot of the underlying
    channels.
    
    .. plot::
        :context: close-figs
        
        >>> flow.ScatterplotView(xchannel = "V2-A",
        ...                      xscale = "log",
        ...                      ychannel = "Y2-A",
        ...                      yscale = "log",
        ...                      subset = "Dox == 1.0").plot(ex2)

        >>> flow.ScatterplotView(xchannel = "PCA_1",
        ...                      ychannel = "PCA_2",
        ...                      subset = "Dox == 1.0").plot(ex2)
       
    .. plot::
        :context: close-figs
        
        >>> flow.ScatterplotView(xchannel = "V2-A",
        ...                      xscale = "log",
        ...                      ychannel = "Y2-A",
        ...                      yscale = "log",
        ...                      subset = "Dox == 10.0").plot(ex2) 

        >>> flow.ScatterplotView(xchannel = "PCA_1",
        ...                      ychannel = "PCA_2",
        ...                      subset = "Dox == 10.0").plot(ex2)
    """

    id = Constant('edu.mit.synbio.cytoflow.operations.pca')
    friendly_id = Constant("Principal Component Analysis")

    name = CStr()
    channels = List(Str)
    scale = Dict(Str, util.ScaleEnum)
    num_components = util.PositiveInt(2, allow_zero=False)
    whiten = Bool(False)
    by = List(Str)

    _pca = Dict(Any, Any, transient=True)
    _scale = Dict(Str, Instance(util.IScale), transient=True)

    def estimate(self, experiment, subset=None):
        """
        Estimate the decomposition
        
        Parameters
        ----------
        experiment : Experiment
            The :class:`.Experiment` to use to estimate the decomposition
            
        subset : str (default = None)
            A Python expression that specifies a subset of the data in 
            ``experiment`` to use to parameterize the operation.

        """

        if experiment is None:
            raise util.CytoflowOpError('experiment', "No experiment specified")

        if len(self.channels) == 0:
            raise util.CytoflowOpError('channels',
                                       "Must set at least one channel")

        for c in self.channels:
            if c not in experiment.data:
                raise util.CytoflowOpError(
                    'channels',
                    "Channel {0} not found in the experiment".format(c))

        if self.num_components > len(self.channels):
            raise util.CytoflowOpError(
                'num_components', "Number of components must be less than "
                "or equal to number of channels.")

        for c in self.scale:
            if c not in self.channels:
                raise util.CytoflowOpError(
                    'scale', "Scale set for channel {0}, but it isn't "
                    "in `channels`".format(c))

        for b in self.by:
            if b not in experiment.data:
                raise util.CytoflowOpError(
                    'by', "Aggregation metadata {} not found, "
                    "must be one of {}".format(b, experiment.conditions))

        if subset:
            try:
                experiment = experiment.query(subset)
            except Exception:
                raise util.CytoflowOpError(
                    'subset', "Subset string '{0}' isn't valid".format(subset))

            if len(experiment) == 0:
                raise util.CytoflowOpError(
                    'subset',
                    "Subset string '{0}' returned no events".format(subset))

        if self.by:
            groupby = experiment.data.groupby(self.by)
        else:
            # use a lambda expression to return a group that contains
            # all the events
            groupby = experiment.data.groupby(lambda _: True)

        # get the scale. estimate the scale params for the ENTIRE data set,
        # not subsets we get from groupby().  And we need to save it so that
        # the data is transformed the same way when we apply()
        for c in self.channels:
            if c in self.scale:
                self._scale[c] = util.scale_factory(self.scale[c],
                                                    experiment,
                                                    channel=c)
            else:
                self._scale[c] = util.scale_factory(util.get_default_scale(),
                                                    experiment,
                                                    channel=c)

        for group, data_subset in groupby:
            if len(data_subset) == 0:
                raise util.CytoflowOpError(
                    'by', "Group {} had no data".format(group))
            x = data_subset.loc[:, self.channels[:]]
            for c in self.channels:
                x[c] = self._scale[c](x[c])

            # drop data that isn't in the scale range
            for c in self.channels:
                x = x[~(np.isnan(x[c]))]
            x = x.values

            self._pca[group] = pca = \
                sklearn.decomposition.PCA(n_components = self.num_components,
                                          whiten = self.whiten,
                                          random_state = 0)

            pca.fit(x)

    def apply(self, experiment):
        """
        Apply the PCA decomposition to the data.
        
        Returns
        -------
        Experiment
            a new Experiment with additional :attr:`~Experiment.channels` 
            named ``name_1 ... name_n``

        """

        if experiment is None:
            raise util.CytoflowOpError('experiment', "No experiment specified")

        if not self._pca:
            raise util.CytoflowOpError(
                None, "No PCA found.  Did you forget to call estimate()?")

        # make sure name got set!
        if not self.name:
            raise util.CytoflowOpError(
                'name', "You have to set the operation's name "
                "before applying it!")

        if len(self.channels) == 0:
            raise util.CytoflowOpError('channels',
                                       "Must set at least one channel")

        for c in self.channels:
            if c not in experiment.data:
                raise util.CytoflowOpError(
                    'channels',
                    "Channel {0} not found in the experiment".format(c))

        for c in self.scale:
            if c not in self.channels:
                raise util.CytoflowOpError(
                    'scale', "Scale set for channel {0}, but it isn't "
                    "in the experiment".format(c))

        for b in self.by:
            if b not in experiment.data:
                raise util.CytoflowOpError(
                    'by', "Aggregation metadata {} not found, "
                    "must be one of {}".format(b, experiment.conditions))

        if self.by:
            groupby = experiment.data.groupby(self.by)
        else:
            # use a lambda expression to return a group that contains
            # all the events
            groupby = experiment.data.groupby(lambda _: True)

        new_experiment = experiment.clone()
        new_channels = []
        for i in range(self.num_components):
            cname = "{}_{}".format(self.name, i + 1)
            if cname in experiment.data:
                raise util.CytoflowOpError(
                    'name',
                    "Channel {} is already in the experiment".format(cname))

            new_experiment.add_channel(cname,
                                       pd.Series(index=experiment.data.index))
            new_channels.append(cname)

        for group, data_subset in groupby:
            if len(data_subset) == 0:
                raise util.CytoflowOpError(
                    'by', "Group {} had no data".format(group))
            x = data_subset.loc[:, self.channels[:]]
            for c in self.channels:
                x[c] = self._scale[c](x[c])

            # which values are missing?

            x_na = pd.Series([False] * len(x))
            for c in self.channels:
                x_na[np.isnan(x[c]).values] = True
            x_na = x_na.values
            x[x_na] = 0

            group_idx = groupby.groups[group]

            pca = self._pca[group]
            x_tf = pca.transform(x)
            x_tf[x_na] = np.nan

            for ci, c in enumerate(new_channels):
                new_experiment.data.loc[group_idx, c] = x_tf[:, ci]

        new_experiment.data.dropna(inplace=True)
        new_experiment.history.append(
            self.clone_traits(transient=lambda _: True))
        return new_experiment
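
Stripped of the Experiment bookkeeping, estimate() and apply() amount to scaling the channels and delegating to sklearn. A minimal sketch under those assumptions, using a log transform as the scale and synthetic log-normal data (names and parameters are illustrative):

import numpy as np
import pandas as pd
import sklearn.decomposition

rng = np.random.default_rng(0)
data = pd.DataFrame({"V2-A": rng.lognormal(3, 1, 1000),
                     "Y2-A": rng.lognormal(4, 1, 1000)})

x = np.log10(data.values)            # the "log" scale, applied channel-wise

pca = sklearn.decomposition.PCA(n_components=2, whiten=False, random_state=0)
pca.fit(x)                           # what estimate() does per group
components = pca.transform(x)        # what apply() does per group

print(pca.explained_variance_ratio_)
print(components[:3])                # these become the PCA_1 / PCA_2 channels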
Example 18
class KMeansOp(HasStrictTraits):
    """
    Use a K-means clustering algorithm to cluster events.  
    
    Call :meth:`estimate` to compute the cluster centroids.
      
    Calling :meth:`apply` creates a new categorical metadata variable 
    named :attr:`name`, with possible values ``{name}_1`` ... ``{name}_n`` where
    ``n`` is the number of clusters, specified with :attr:`num_clusters`.
    
    The same model may not be appropriate for different subsets of the data set.
    If this is the case, you can use the :attr:`by` attribute to specify 
    metadata by which to aggregate the data before estimating (and applying) a 
    model.  The  number of clusters is the same across each subset, though.

    Attributes
    ----------
    name : Str
        The operation name; determines the name of the new metadata column
        
    channels : List(Str)
        The channels to apply the clustering algorithm to.

    scale : Dict(Str : {"linear", "logicle", "log"})
        Re-scale the data in the specified channels before fitting.  If a 
        channel is in :attr:`channels` but not in :attr:`scale`, the current 
        package-wide default (set with :func:`.set_default_scale`) is used.

    num_clusters : Int (default = 2)
        How many clusters to fit to the data?  Must be a positive integer.
    
    by : List(Str)
        A list of metadata attributes to aggregate the data before estimating
        the model.  For example, if the experiment has two pieces of metadata,
        ``Time`` and ``Dox``, setting :attr:`by` to ``["Time", "Dox"]`` will 
        fit the model separately to each subset of the data with a unique 
        combination of ``Time`` and ``Dox``.
  
    
    Examples
    --------
    
    .. plot::
        :context: close-figs
        
        Make a little data set.
    
        >>> import cytoflow as flow
        >>> import_op = flow.ImportOp()
        >>> import_op.tubes = [flow.Tube(file = "Plate01/RFP_Well_A3.fcs",
        ...                              conditions = {'Dox' : 10.0}),
        ...                    flow.Tube(file = "Plate01/CFP_Well_A4.fcs",
        ...                              conditions = {'Dox' : 1.0})]
        >>> import_op.conditions = {'Dox' : 'float'}
        >>> ex = import_op.apply()
    
    Create and parameterize the operation.
    
    .. plot::
        :context: close-figs
        
        >>> km_op = flow.KMeansOp(name = 'KMeans',
        ...                       channels = ['V2-A', 'Y2-A'],
        ...                       scale = {'V2-A' : 'log',
        ...                                'Y2-A' : 'log'},
        ...                       num_clusters = 2)
        
    Estimate the clusters
    
    .. plot::
        :context: close-figs
        
        >>> km_op.estimate(ex)
        
    Plot a diagnostic view
    
    .. plot::
        :context: close-figs
        
        >>> km_op.default_view().plot(ex)

    Apply the gate
    
    .. plot::
        :context: close-figs
        
        >>> ex2 = km_op.apply(ex)

    Plot a diagnostic view with the event assignments
    
    .. plot::
        :context: close-figs
        
        >>> km_op.default_view().plot(ex2)
    """
    
    id = Constant('edu.mit.synbio.cytoflow.operations.kmeans')
    friendly_id = Constant("KMeans Clustering")
    
    name = CStr()
    channels = List(Str)
    scale = Dict(Str, util.ScaleEnum)
    num_clusters = util.PositiveInt(allow_zero = False)
    by = List(Str)
    
    _kmeans = Dict(Any, Instance(sklearn.cluster.MiniBatchKMeans), transient = True)
    _scale = Dict(Str, Instance(util.IScale), transient = True)
    
    def estimate(self, experiment, subset = None):
        """
        Estimate the k-means clusters
        
        Parameters
        ----------
        experiment : Experiment
            The :class:`.Experiment` to use to estimate the k-means clusters
            
        subset : str (default = None)
            A Python expression that specifies a subset of the data in 
            ``experiment`` to use to parameterize the operation.
        """

        if experiment is None:
            raise util.CytoflowOpError('experiment',
                                       "No experiment specified")
        
        if self.num_clusters < 2:
            raise util.CytoflowOpError('num_clusters',
                                       "num_clusters must be >= 2")
        
        if len(self.channels) == 0:
            raise util.CytoflowOpError('channels',
                                       "Must set at least one channel")

        for c in self.channels:
            if c not in experiment.data:
                raise util.CytoflowOpError('channels',
                                           "Channel {0} not found in the experiment"
                                      .format(c))
                
        for c in self.scale:
            if c not in self.channels:
                raise util.CytoflowOpError('scale',
                                           "Scale set for channel {0}, but it isn't "
                                           "in the experiment"
                                           .format(c))
       
        for b in self.by:
            if b not in experiment.data:
                raise util.CytoflowOpError('by',
                                           "Aggregation metadata {} not found, "
                                           "must be one of {}"
                                           .format(b, experiment.conditions))

        if subset:
            try:
                experiment = experiment.query(subset)
            except Exception:
                raise util.CytoflowOpError('subset',
                                            "Subset string '{0}' isn't valid"
                                            .format(subset))
                
            if len(experiment) == 0:
                raise util.CytoflowOpError('subset',
                                           "Subset string '{0}' returned no events"
                                           .format(subset))
                
        if self.by:
            groupby = experiment.data.groupby(self.by)
        else:
            # use a lambda expression to return a group that contains
            # all the events
            groupby = experiment.data.groupby(lambda _: True)
            
        # get the scale. estimate the scale params for the ENTIRE data set,
        # not subsets we get from groupby().  And we need to save it so that
        # the data is transformed the same way when we apply()
        for c in self.channels:
            if c in self.scale:
                self._scale[c] = util.scale_factory(self.scale[c], experiment, channel = c)
            else:
                self._scale[c] = util.scale_factory(util.get_default_scale(), experiment, channel = c)
                    
        for group, data_subset in groupby:
            if len(data_subset) == 0:
                raise util.CytoflowOpError('by',
                                           "Group {} had no data"
                                           .format(group))
            x = data_subset.loc[:, self.channels[:]]
            for c in self.channels:
                x[c] = self._scale[c](x[c])
            
            # drop data that isn't in the scale range
            for c in self.channels:
                x = x[~(np.isnan(x[c]))]
            x = x.values
            
            self._kmeans[group] = kmeans = \
                sklearn.cluster.MiniBatchKMeans(n_clusters = self.num_clusters,
                                                random_state = 0)
            
            kmeans.fit(x)
                                                 
         
    def apply(self, experiment):
        """
        Apply the KMeans clustering to the data.
        
        Returns
        -------
        Experiment
            a new Experiment with one additional :attr:`~Experiment.condition` 
            named :attr:`name`, of type ``category``.  The new category has 
            values  ``name_1, name_2, etc`` to indicate which k-means cluster 
            an event is a member of.
            
            The new :class:`.Experiment` also has one new statistic called
            ``centers``, which is a list of tuples encoding the centroids of each
            k-means cluster.
        """
 
        if experiment is None:
            raise util.CytoflowOpError('experiment',
                                       "No experiment specified")
         
        # make sure name got set!
        if not self.name:
            raise util.CytoflowOpError('name',
                                       "You have to set the gate's name "
                                       "before applying it!")
            
        if self.name != util.sanitize_identifier(self.name):
            raise util.CytoflowOpError('name',
                                       "Name can only contain letters, "
                                       "numbers and underscores.")
         
        if self.name in experiment.data.columns:
            raise util.CytoflowOpError('name',
                                       "Experiment already has a column named {0}"
                                       .format(self.name))
            
        if not self._kmeans:
            raise util.CytoflowOpError(None, 
                                       "No components found.  Did you forget to "
                                       "call estimate()?")
         
        if len(self.channels) == 0:
            raise util.CytoflowOpError('channels',
                                       "Must set at least one channel")
 
        for c in self.channels:
            if c not in experiment.data:
                raise util.CytoflowOpError('channels',
                                           "Channel {0} not found in the experiment"
                                      .format(c))
                 
        for c in self.scale:
            if c not in self.channels:
                raise util.CytoflowOpError('scale',
                                           "Scale set for channel {0}, but it isn't "
                                           "in the experiment"
                                           .format(c))
        
        for b in self.by:
            if b not in experiment.data:
                raise util.CytoflowOpError('by',
                                           "Aggregation metadata {} not found, "
                                           "must be one of {}"
                                           .format(b, experiment.conditions))
        
                 
        if self.by:
            groupby = experiment.data.groupby(self.by)
        else:
            # use a lambda expression to return a group that contains
            # all the events
            groupby = experiment.data.groupby(lambda _: True)
                 
        event_assignments = pd.Series(["{}_None".format(self.name)] * len(experiment), dtype = "object")
         
        # make the statistics       
        clusters = [x + 1 for x in range(self.num_clusters)]
          
        idx = pd.MultiIndex.from_product([experiment[x].unique() for x in self.by] + [clusters] + [self.channels], 
                                         names = list(self.by) + ["Cluster"] + ["Channel"])
        centers_stat = pd.Series(index = idx, dtype = np.dtype(object)).sort_index()
                     
        for group, data_subset in groupby:
            if len(data_subset) == 0:
                raise util.CytoflowOpError('by',
                                           "Group {} had no data"
                                           .format(group))
            
            if group not in self._kmeans:
                raise util.CytoflowOpError('by',
                                           "Group {} not found in the estimated model. "
                                           "Do you need to re-run estimate()?"
                                           .format(group))    
            
            x = data_subset.loc[:, self.channels[:]]
            for c in self.channels:
                x[c] = self._scale[c](x[c])
                 
            # which values are missing?
 
            x_na = pd.Series([False] * len(x))
            for c in self.channels:
                x_na[np.isnan(x[c]).values] = True
                         
            x = x.values
            x_na = x_na.values
            group_idx = groupby.groups[group]
            
            kmeans = self._kmeans[group]
  
            predicted = np.full(len(x), -1, "int")
            predicted[~x_na] = kmeans.predict(x[~x_na])
                 
            predicted_str = pd.Series(["(none)"] * len(predicted))
            for c in range(0, self.num_clusters):
                predicted_str[predicted == c] = "{0}_{1}".format(self.name, c + 1)
            predicted_str[predicted == -1] = "{0}_None".format(self.name)
            predicted_str.index = group_idx
      
            event_assignments.iloc[group_idx] = predicted_str
            
            for c in range(self.num_clusters):
                if len(self.by) == 0:
                    g = [c + 1]
                elif hasattr(group, '__iter__') and not isinstance(group, (str, bytes)):
                    g = tuple(list(group) + [c + 1])
                else:
                    g = tuple([group] + [c + 1])
                
                for cidx1, channel1 in enumerate(self.channels):
                    g2 = tuple(list(g) + [channel1])
                    centers_stat.loc[g2] = self._scale[channel1].inverse(kmeans.cluster_centers_[c, cidx1])
         
        new_experiment = experiment.clone()          
        new_experiment.add_condition(self.name, "category", event_assignments)
        
        new_experiment.statistics[(self.name, "centers")] = pd.to_numeric(centers_stat)
 
        new_experiment.history.append(self.clone_traits(transient = lambda _: True))
        return new_experiment
    
    def default_view(self, **kwargs):
        """
        Returns a diagnostic plot of the k-means clustering.
         
        Returns
        -------
            IView : an IView, call :meth:`KMeans1DView.plot` to see the diagnostic plot.
        """
        channels = kwargs.pop('channels', self.channels)
        scale = kwargs.pop('scale', self.scale)
        
        for c in channels:
            if c not in self.channels:
                raise util.CytoflowViewError('channels',
                                             "Channel {} isn't in the operation's channels"
                                             .format(c))
                
        for s in scale:
            if s not in self.channels:
                raise util.CytoflowViewError('scale',
                                             "Channel {} isn't in the operation's channels"
                                             .format(s))

        for c in channels:
            if c not in scale:
                scale[c] = util.get_default_scale()
            
        if len(channels) == 0:
            raise util.CytoflowViewError('channels',
                                         "Must specify at least one channel for a default view")
        elif len(channels) == 1:
            v = KMeans1DView(op = self)
            v.trait_set(channel = channels[0], 
                        scale = scale[channels[0]], 
                        **kwargs)
            return v
        
        elif len(channels) == 2:
            v = KMeans2DView(op = self)
            v.trait_set(xchannel = channels[0], 
                        ychannel = channels[1],
                        xscale = scale[channels[0]],
                        yscale = scale[channels[1]], 
                        **kwargs)
            return v
        
        else:
            raise util.CytoflowViewError('channels',
                                         "Can't specify more than two channels for a default view")
Example 19
class FlowView(HasTraits):
    showFlow = Bool(True)
    flowVectWidth = Int(3)
    flowVectSpacing = Int(3)
    flowVectScale = Float(10)
    flowVectArrowSize = Float(1)
    flowImageName = CStr('outFlow')
    flowMaskName = CStr('outFlowMask')
    flowVectThresh = Float(0)
    flowVectType = Enum('Arrows', 'Bicolour')

    def default_traits_view(self):
        from traitsui.api import View, Item, Group
        traits_view = View(
            Item('showFlow'),
            Item('flowImageName'),
            Item('flowVectWidth'),
            Item('flowVectSpacing'),
            Item('flowVectScale'),
            Item('flowVectArrowSize'),
            Item('flowVectThresh'),
            Item('flowVectType'),
        )

        return traits_view

    #property proxies for do and view
    @property
    def _do(self):
        return self._dsviewer.do

    @property
    def _view(self):
        return self._dsviewer.view

    def __init__(self, dsviewer):
        HasTraits.__init__(self)
        self._dsviewer = weakref.proxy(dsviewer)

        #self.image = dsviewer.image

        #self._penCols = [wx.Colour(*pylab.cm.hsv(v, bytes=True)) for v in np.linspace(0, 1, 16)]
        #self._penColsA = [wx.Colour(*pylab.cm.hsv(v, alpha=0.5, bytes=True)) for v in np.linspace(0, 1, 16)]

        self._penColsA = [wx.Colour(255, 0, 0, 255), wx.Colour(0, 0, 255, 255)]
        self.CreatePens()

        dsviewer.do.overlays.append(self.DrawOverlays)

        dsviewer.paneHooks.append(self.GenFlowPanel)

    def Unplug(self):
        self._dsviewer.do.overlays.remove(self.DrawOverlays)
        self._dsviewer.paneHooks.remove(self.GenFlowPanel)

    @on_trait_change('flowVectWidth')
    def CreatePens(self):
        #self.candPens = [wx.Pen(c, self.candLineWidth, wx.DOT) for c in self.penCols]
        #self.chosenPens = [wx.Pen(c, self.chosenLineWidth) for c in self.penCols]
        self._vecPens = [wx.Pen(c, self.flowVectWidth) for c in self._penColsA]
        #self.selectedPens = [wx.Pen(c, self.selectedLineWidth) for c in self.penCols]

    def GenFlowPanel(self, _pnl):
        item = afp.foldingPane(_pnl,
                               -1,
                               caption="Flow Visualization",
                               pinned=True)

        pan = self.edit_traits(parent=item, kind='panel')
        item.AddNewElement(pan.control)

        _pnl.AddPane(item)

    @property
    def flowImage(self):
        try:
            return self._dsviewer.recipes.activeRecipe.namespace[
                self.flowImageName]
        except KeyError:
            return None

    def DrawOverlays_(self, view, dc):
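        # loop-based variant kept for reference; __init__ registers the
        # vectorized DrawOverlays below as the actual overlay callback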
        flow = self.flowImage

        if (not self.showFlow) or flow is None:
            return

        xb, yb, zb = view._calcVisibleBounds()
        x0, x1 = xb
        y0, y1 = yb

        z = self._do.zp

        flow_x = flow.data[:, :, z, 0].squeeze()
        flow_y = flow.data[:, :, z, 1].squeeze()

        dc.SetBrush(wx.TRANSPARENT_BRUSH)
        dc.SetPen(self._vecPens[0])

        step = int(self.flowVectSpacing)
        scale = float(self.flowVectScale)
        arrowSize = float(self.flowVectArrowSize)

        if self.flowVectType == 'Arrows':
            for x in np.arange(x0, min(x1, flow_x.shape[0]), step, dtype='i'):
                for y in np.arange(y0,
                                   min(y1, flow_y.shape[1]),
                                   step,
                                   dtype='i'):
                    fx = flow_x[x, y]
                    fy = flow_y[x, y]

                    x_1, y_1 = x + scale * fx, y + scale * fy

                    xs, ys = view._PixelToScreenCoordinates(x, y)
                    xs1, ys1 = view._PixelToScreenCoordinates(x_1, y_1)

                    dc.DrawLine(xs, ys, xs1, ys1)

                    #now for the arrowhead - normal vectors in each direction
                    l = np.sqrt(fx * fx + fy * fy)

                    h = np.array([x_1, y_1])

                    fh = np.array([fx / l, fy / l])
                    fhh = np.array([-fy / l, fx / l])

                    t1 = h + arrowSize * (.5 * fhh - fh)
                    t2 = h + arrowSize * (-.5 * fhh - fh)

                    xt1, yt1 = view._PixelToScreenCoordinates(*t1)
                    xt2, yt2 = view._PixelToScreenCoordinates(*t2)

                    dc.DrawLine(xs1, ys1, xt1, yt1)
                    dc.DrawLine(xs1, ys1, xt2, yt2)
        elif self.flowVectType == 'Bicolour':
            for x in np.arange(x0, min(x1, flow_x.shape[0]), step, dtype='i'):
                for y in np.arange(y0,
                                   min(y1, flow_y.shape[1]),
                                   step,
                                   dtype='i'):
                    fx = flow_x[x, y]
                    fy = flow_y[x, y]

                    x_1, y_1 = x + 0.5 * scale * fx, y + 0.5 * scale * fy
                    x_2, y_2 = x + scale * fx, y + scale * fy

                    xs, ys = view._PixelToScreenCoordinates(x, y)
                    xs1, ys1 = view._PixelToScreenCoordinates(x_1, y_1)
                    xs2, ys2 = view._PixelToScreenCoordinates(x_2, y_2)

                    dc.SetPen(self._vecPens[1])
                    dc.DrawLine(xs, ys, xs1, ys1)
                    dc.SetPen(self._vecPens[0])
                    dc.DrawLine(xs1, ys1, xs2, ys2)

    def DrawOverlays(self, view, dc):
        flow = self.flowImage

        if (not self.showFlow) or flow is None:
            return

        xb, yb, zb = view._calcVisibleBounds()
        x0, x1 = xb
        y0, y1 = yb

        z = self._do.zp

        flow_x = flow.data[:, :, z, 0].squeeze()
        flow_y = flow.data[:, :, z, 1].squeeze()

        dc.SetBrush(wx.TRANSPARENT_BRUSH)
        dc.SetPen(self._vecPens[0])

        step = int(self.flowVectSpacing)
        scale = float(self.flowVectScale)
        arrowSize = float(self.flowVectArrowSize)

        #for x in np.arange(x0,min(x1, flow_x.shape[0]), step, dtype='i'):
        #    for y in np.arange(y0, min(y1, flow_y.shape[1]), step, dtype='i'):

        fx = flow_x[x0:x1:step, y0:y1:step].ravel()
        fy = flow_y[x0:x1:step, y0:y1:step].ravel()

        #flow magnitude
        l = np.sqrt(fx * fx + fy * fy)

        x, y = np.mgrid[x0:min(x1, flow_x.shape[0]):step,
                        y0:min(y1, flow_y.shape[1]):step]
        x = x.ravel()
        y = y.ravel()

        #don't draw any vectors which are below the cutoff length
        f_t_mask = l > self.flowVectThresh

        fx = fx[f_t_mask]
        fy = fy[f_t_mask]

        x = x[f_t_mask]
        y = y[f_t_mask]
        l = l[f_t_mask]

        x_1, y_1 = x + scale * fx, y + scale * fy
        x_0, y_0 = x + 0.5 * scale * fx, y + 0.5 * scale * fy

        xs, ys = view._PixelToScreenCoordinates(x, y)
        xs0, ys0 = view._PixelToScreenCoordinates(x_0, y_0)
        xs1, ys1 = view._PixelToScreenCoordinates(x_1, y_1)

        if self.flowVectType == 'Arrows':
            dc.DrawLineList(np.array([xs, ys, xs1, ys1]).T)

            #now for the arrowhead - normal vectors in each direction

            h = np.array([x_1, y_1])

            fh = np.array([fx / l, fy / l])
            fhh = np.array([-fy / l, fx / l])

            t1 = h + arrowSize * (.5 * fhh - fh)
            t2 = h + arrowSize * (-.5 * fhh - fh)

            xt1, yt1 = view._PixelToScreenCoordinates(*t1)
            xt2, yt2 = view._PixelToScreenCoordinates(*t2)

            dc.DrawLineList(np.array([xs1, ys1, xt1, yt1]).T)
            dc.DrawLineList(np.array([xs1, ys1, xt2, yt2]).T)

        else:
            dc.SetPen(self._vecPens[1])
            dc.DrawLineList(np.array([xs, ys, xs0, ys0]).T)
            dc.SetPen(self._vecPens[0])
            dc.DrawLineList(np.array([xs0, ys0, xs1, ys1]).T)
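
The arrowhead construction above is plain vector geometry: normalize the flow vector, take its perpendicular, and sweep two tines back from the tip. The same computation for a single vector, as a standalone sketch:

import numpy as np

x, y = 10.0, 20.0                  # vector origin, in pixel coordinates
fx, fy = 3.0, 4.0                  # flow components at that pixel
scale, arrow_size = 10.0, 1.0

x1, y1 = x + scale * fx, y + scale * fy      # vector tip

l = np.sqrt(fx * fx + fy * fy)               # flow magnitude
fh = np.array([fx / l, fy / l])              # unit vector along the flow
fhh = np.array([-fy / l, fx / l])            # unit vector perpendicular to it

tip = np.array([x1, y1])
t1 = tip + arrow_size * (0.5 * fhh - fh)     # one tine, swept back from the tip
t2 = tip + arrow_size * (-0.5 * fhh - fh)    # the other tine

print(tip, t1, t2)                 # the tip and the two arrowhead endpoints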
Example 20
class GaussianMixtureOp(HasStrictTraits):
    """
    This module fits a Gaussian mixture model with a specified number of
    components to one or more channels.
    
    If :attr:`num_components` ``> 1``, :meth:`apply` creates a new categorical 
    metadata variable named ``name``, with possible values ``{name}_1`` ...
    ``{name}_n`` where ``n`` is the number of components.  An event is assigned to
    ``name_i`` category if it has the highest posterior probability of having been 
    produced by component ``i``.  If an event has a value that is outside the
    range of one of the channels' scales, then it is assigned to ``{name}_None``.
    
    Optionally, if :attr:`sigma` is greater than 0, :meth:`apply` creates new  
    ``boolean`` metadata variables named ``{name}_1`` ... ``{name}_n`` where 
    ``n`` is the number of components.  The column ``{name}_i`` is ``True`` if 
    the event is less than :attr:`sigma` standard deviations from the mean of 
    component ``i``.  If :attr:`num_components` is ``1``, :attr:`sigma` must be 
    greater than 0.
    
    Optionally, if :attr:`posteriors` is ``True``, :meth:`apply` creates new
    ``double`` metadata variables named ``{name}_1_posterior`` ...
    ``{name}_n_posterior`` where ``n`` is the number of components.  The column 
    ``{name}_i_posterior`` contains the posterior probability that this event is 
    a member of component ``i``.
    
    Finally, the same mixture model (mean and standard deviation) may not
    be appropriate for every subset of the data.  If this is the case, you
    can use the :attr:`by` attribute to specify metadata by which to aggregate
    the data before estimating (and applying) a mixture model.  The number of 
    components must be the same across each subset, though.
    
    
    Attributes
    ----------
    name : Str
        The operation name; determines the name of the new metadata column
        
    channels : List(Str)
        The channels to apply the mixture model to.

    scale : Dict(Str : {"linear", "logicle", "log"})
        Re-scale the data in the specified channels before fitting.  If a 
        channel is in :attr:`channels` but not in :attr:`scale`, the current 
        package-wide default (set with :func:`~.set_default_scale`) is used.

    num_components : Int (default = 1)
        How many components to fit to the data?  Must be a positive integer.

    sigma : Float (default = 0.0)
        How many standard deviations on either side of the mean to include
        in the boolean variable ``{name}_i``?  Must be ``>= 0.0``.  If 
        :attr:`num_components` is ``1``, must be ``> 0``.
    
    by : List(Str)
        A list of metadata attributes to aggregate the data before estimating
        the model.  For example, if the experiment has two pieces of metadata,
        ``Time`` and ``Dox``, setting :attr:`by` to ``["Time", "Dox"]`` will fit 
        the model separately to each subset of the data with a unique combination of
        ``Time`` and ``Dox``.

    posteriors : Bool (default = False)
        If ``True``, add columns named ``{name}_{i}_posterior`` giving the 
        posterior probability that the event is in component ``i``.  Useful for 
        filtering out low-probability events.
        
    Notes
    -----
    
    We use the Mahalanobis distance as a multivariate generalization of the
    number of standard deviations an event is from the mean of the multivariate
    gaussian.  If :math:`\\vec{x}` is an observation from a distribution with 
    mean :math:`\\vec{\\mu}` and :math:`S` is the covariance matrix, then the 
    Mahalanobis distance is :math:`\\sqrt{(x - \\mu)^T \\cdot S^{-1} \\cdot (x - \\mu)}`.
    
    Examples
    --------
    
    .. plot::
        :context: close-figs
        
        Make a little data set.
    
        >>> import cytoflow as flow
        >>> import_op = flow.ImportOp()
        >>> import_op.tubes = [flow.Tube(file = "Plate01/RFP_Well_A3.fcs",
        ...                              conditions = {'Dox' : 10.0}),
        ...                    flow.Tube(file = "Plate01/CFP_Well_A4.fcs",
        ...                              conditions = {'Dox' : 1.0})]
        >>> import_op.conditions = {'Dox' : 'float'}
        >>> ex = import_op.apply()
    
    Create and parameterize the operation.
    
    .. plot::
        :context: close-figs
        
        >>> gm_op = flow.GaussianMixtureOp(name = 'Gauss',
        ...                                channels = ['Y2-A'],
        ...                                scale = {'Y2-A' : 'log'},
        ...                                num_components = 2)
        
    Estimate the clusters
    
    .. plot::
        :context: close-figs
        
        >>> gm_op.estimate(ex)
        
    Plot a diagnostic view
    
    .. plot::
        :context: close-figs
        
        >>> gm_op.default_view().plot(ex)

    Apply the gate
    
    .. plot::
        :context: close-figs
        
        >>> ex2 = gm_op.apply(ex)

    Plot a diagnostic view with the event assignments
    
    .. plot::
        :context: close-figs
        
        >>> gm_op.default_view().plot(ex2)
        
    And with two channels:
    
    .. plot::
        :context: close-figs
        
        >>> gm_op = flow.GaussianMixtureOp(name = 'Gauss',
        ...                                channels = ['V2-A', 'Y2-A'],
        ...                                scale = {'V2-A' : 'log',
        ...                                         'Y2-A' : 'log'},
        ...                                num_components = 2)
        >>> gm_op.estimate(ex)   
        >>> ex2 = gm_op.apply(ex)
        >>> gm_op.default_view().plot(ex2)
        
    """

    id = Constant('edu.mit.synbio.cytoflow.operations.gaussian')
    friendly_id = Constant("Gaussian Mixture Model")

    name = CStr()
    channels = List(Str)
    scale = Dict(Str, util.ScaleEnum)
    num_components = util.PositiveInt(1, allow_zero=False)
    sigma = util.PositiveFloat(allow_zero=True)
    by = List(Str)

    posteriors = Bool(False)

    # the key is either a single value or a tuple
    _gmms = Dict(Any,
                 Instance(sklearn.mixture.GaussianMixture),
                 transient=True)
    _scale = Dict(Str, Instance(util.IScale), transient=True)

    def estimate(self, experiment, subset=None):
        """
        Estimate the Gaussian mixture model parameters
        
        Parameters
        ----------
        experiment : Experiment
            The data to use to estimate the mixture parameters
            
        subset : str (default = None)
            If set, a Python expression to determine the subset of the data
            to use to in the estimation.
        """

        if experiment is None:
            raise util.CytoflowOpError('experiment', "No experiment specified")

        if len(self.channels) == 0:
            raise util.CytoflowOpError('channels',
                                       "Must set at least one channel")

        for c in self.channels:
            if c not in experiment.data:
                raise util.CytoflowOpError(
                    'channels',
                    "Channel {0} not found in the experiment".format(c))

        for c in self.scale:
            if c not in self.channels:
                raise util.CytoflowOpError(
                    'channels', "Scale set for channel {0}, but it isn't "
                    "in the experiment".format(c))

        for b in self.by:
            if b not in experiment.data:
                raise util.CytoflowOpError(
                    'by', "Aggregation metadata {} not found, "
                    "must be one of {}".format(b, experiment.conditions))

        if subset:
            try:
                experiment = experiment.query(subset)
            except Exception:
                raise util.CytoflowOpError(
                    'subset', "Subset string '{0}' isn't valid".format(subset))

            if len(experiment) == 0:
                raise util.CytoflowOpError(
                    'subset',
                    "Subset string '{0}' returned no events".format(subset))

        if self.by:
            groupby = experiment.data.groupby(self.by)
        else:
            # use a lambda expression to return a group that contains
            # all the events
            groupby = experiment.data.groupby(lambda _: True)

        # get the scale. estimate the scale params for the ENTIRE data set,
        # not subsets we get from groupby().  And we need to save it so that
        # the data is transformed the same way when we apply()
        for c in self.channels:
            if c in self.scale:
                self._scale[c] = util.scale_factory(self.scale[c],
                                                    experiment,
                                                    channel=c)
            else:
                self._scale[c] = util.scale_factory(util.get_default_scale(),
                                                    experiment,
                                                    channel=c)

        gmms = {}

        for group, data_subset in groupby:
            if len(data_subset) == 0:
                raise util.CytoflowOpError(
                    None, "Group {} had no data".format(group))
            x = data_subset.loc[:, self.channels[:]]
            for c in self.channels:
                x[c] = self._scale[c](x[c])

            # drop data that isn't in the scale range
            for c in self.channels:
                x = x[~(np.isnan(x[c]))]
            x = x.values

            gmm = sklearn.mixture.GaussianMixture(
                n_components=self.num_components,
                covariance_type="full",
                random_state=1)
            gmm.fit(x)

            if not gmm.converged_:
                raise util.CytoflowOpError(
                    None, "Estimator didn't converge"
                    " for group {0}".format(group))

            # in the 1D version, we sorted the components by the means -- so
            # the first component has the lowest mean, the second component
            # has the next-lowest mean, etc.

            # that doesn't work in the general case.  instead, we assume that
            # the clusters are likely (?) to be arranged along *one* of the
            # axes, so we take the |norm| of the mean of each cluster and
            # sort that way.

            norms = np.sum(gmm.means_**2, axis=1)**0.5
            sort_idx = np.argsort(norms)
            gmm.means_ = gmm.means_[sort_idx]
            gmm.weights_ = gmm.weights_[sort_idx]
            gmm.covariances_ = gmm.covariances_[sort_idx]
            gmm.precisions_ = gmm.precisions_[sort_idx]
            gmm.precisions_cholesky_ = gmm.precisions_cholesky_[sort_idx]

            gmms[group] = gmm

        self._gmms = gmms

    def apply(self, experiment):
        """
        Assigns new metadata to events using the mixture model estimated
        in :meth:`estimate`.
        
        Returns
        -------
        Experiment
            A new :class:`.Experiment` with the new condition variables as
            described in the class documentation.  Also adds the following
            new statistics:
            
            - **mean** : Float
                the mean of the fitted gaussian in each channel for each component.
                
            - **sigma** : (Float, Float)
                the locations the mean +/- one standard deviation in each channel
                for each component.
                
            - **correlation** : Float
                the correlation coefficient between each pair of channels for each
                component.
                
            - **proportion** : Float
                the proportion of events in each component of the mixture model.  only
                added if :attr:`num_components` ``> 1``.
        """

        if experiment is None:
            raise util.CytoflowOpError('experiment', "No experiment specified")

        if len(self.channels) == 0:
            raise util.CytoflowOpError('channels',
                                       "Must set at least one channel")

        # make sure name got set!
        if not self.name:
            raise util.CytoflowOpError(
                'name', "You have to set the gate's name "
                "before applying it!")

        if self.name != util.sanitize_identifier(self.name):
            raise util.CytoflowOpError(
                'name',
                "Name can only contain letters, numbers and underscores.")

        if self.num_components > 1 and self.name in experiment.data.columns:
            raise util.CytoflowOpError(
                'name',
                "Experiment already has a column named {0}".format(self.name))

        if self.sigma > 0:
            for i in range(1, self.num_components + 1):
                cname = "{}_{}".format(self.name, i)
                if cname in experiment.data.columns:
                    raise util.CytoflowOpError(
                        'name',
                        "Experiment already has a column named {}".format(
                            cname))

        if self.posteriors:
            for i in range(1, self.num_components + 1):
                cname = "{}_{}_posterior".format(self.name, i)
                if cname in experiment.data.columns:
                    raise util.CytoflowOpError(
                        'name',
                        "Experiment already has a column named {}".format(
                            cname))

        if not self._gmms:
            raise util.CytoflowOpError(
                None, "No components found.  Did you forget to "
                "call estimate()?")

        for c in self.channels:
            if c not in self._scale:
                raise util.CytoflowOpError(
                    None, "Model scale not set.  Did you forget "
                    "to call estimate()?")

        for c in self.channels:
            if c not in experiment.channels:
                raise util.CytoflowOpError(
                    'channels',
                    "Channel {0} not found in the experiment".format(c))

        for b in self.by:
            if b not in experiment.conditions:
                raise util.CytoflowOpError(
                    'by', "Aggregation metadata {} not found, "
                    "must be one of {}".format(b, experiment.conditions))

        if self.num_components == 1 and self.posteriors:
            warn("If num_components == 1, all posteriors will be 1",
                 util.CytoflowOpWarning)

        if self.num_components > 1:
            event_assignments = pd.Series(["{}_None".format(self.name)] *
                                          len(experiment),
                                          dtype="object")

        if self.sigma > 0:
            event_gate = {
                i: pd.Series([False] * len(experiment), dtype="bool")
                for i in range(self.num_components)
            }

        if self.posteriors:
            event_posteriors = {
                i: pd.Series([0.0] * len(experiment), dtype="double")
                for i in range(self.num_components)
            }

        if self.by:
            groupby = experiment.data.groupby(self.by)
        else:
            # use a lambda expression to return a group that
            # contains all the events
            groupby = experiment.data.groupby(lambda _: True)

        # make the statistics
        components = [x + 1 for x in range(self.num_components)]

        prop_idx = pd.MultiIndex.from_product(
            [experiment[x].unique() for x in self.by] + [components],
            names=list(self.by) + ["Component"])
        prop_stat = pd.Series(name="{} : {}".format(self.name, "proportion"),
                              index=prop_idx,
                              dtype=np.dtype(object)).sort_index()

        mean_idx = pd.MultiIndex.from_product(
            [experiment[x].unique()
             for x in self.by] + [components] + [self.channels],
            names=list(self.by) + ["Component"] + ["Channel"])
        mean_stat = pd.Series(name="{} : {}".format(self.name, "mean"),
                              index=mean_idx,
                              dtype=np.dtype(object)).sort_index()
        sigma_stat = pd.Series(name="{} : {}".format(self.name, "sigma"),
                               index=mean_idx,
                               dtype=np.dtype(object)).sort_index()
        interval_stat = pd.Series(name="{} : {}".format(self.name, "interval"),
                                  index=mean_idx,
                                  dtype=np.dtype(object)).sort_index()

        corr_idx = pd.MultiIndex.from_product(
            [experiment[x].unique() for x in self.by] + [components] +
            [self.channels] + [self.channels],
            names=list(self.by) + ["Component"] + ["Channel_1"] +
            ["Channel_2"])
        corr_stat = pd.Series(name="{} : {}".format(self.name, "correlation"),
                              index=corr_idx,
                              dtype=np.dtype(object)).sort_index()

        for group, data_subset in groupby:
            if group not in self._gmms:
                # there weren't any events in this group, so we didn't get
                # a gmm.
                continue

            gmm = self._gmms[group]
            x = data_subset.loc[:, self.channels[:]]
            for c in self.channels:
                x[c] = self._scale[c](x[c])

            # which values are missing?

            x_na = pd.Series([False] * len(x))
            for c in self.channels:
                x_na[np.isnan(x[c]).values] = True

            x = x.values
            x_na = x_na.values
            group_idx = groupby.groups[group]

            if self.num_components > 1:
                predicted = np.full(len(x), -1, "int")
                predicted[~x_na] = gmm.predict(x[~x_na])

                predicted_str = pd.Series(["(none)"] * len(predicted))
                for c in range(0, self.num_components):
                    predicted_str[predicted == c] = "{0}_{1}".format(
                        self.name, c + 1)
                predicted_str[predicted == -1] = "{0}_None".format(self.name)
                predicted_str.index = group_idx

                event_assignments.iloc[group_idx] = predicted_str

            # if we're doing sigma-based gating, for each component check
            # to see if the event is in the sigma gate.
            if self.sigma > 0.0:
                for c in range(self.num_components):
                    s = np.linalg.pinv(gmm.covariances_[c])
                    mu = gmm.means_[c]

                    # compute the Mahalanobis distance

                    f = lambda x, mu, s: np.dot(np.dot((x - mu).T, s),
                                                (x - mu))
                    dist = np.apply_along_axis(f, 1, x, mu, s)

                    # come up with a threshold based on sigma.  you'll note we
                    # didn't sqrt dist: that's because for a multivariate
                    # Gaussian, the square of the Mahalanobis distance is
                    # chi-square distributed

                    p = (scipy.stats.norm.cdf(self.sigma) - 0.5) * 2
                    thresh = scipy.stats.chi2.ppf(p, 1)

                    event_gate[c].iloc[group_idx] = np.less_equal(dist, thresh)

            if self.posteriors:
                p = np.full((len(x), self.num_components), 0.0)
                p[~x_na] = gmm.predict_proba(x[~x_na])
                for c in range(self.num_components):
                    event_posteriors[c].iloc[group_idx] = p[:, c]

            for c in range(self.num_components):
                if len(self.by) == 0:
                    g = [c + 1]
                elif hasattr(group, '__iter__') and not isinstance(
                        group, (str, bytes)):
                    g = tuple(list(group) + [c + 1])
                else:
                    g = tuple([group] + [c + 1])

                prop_stat.loc[g] = gmm.weights_[c]

                for cidx1, channel1 in enumerate(self.channels):
                    g2 = tuple(list(g) + [channel1])
                    mean_stat.loc[g2] = self._scale[channel1].inverse(
                        gmm.means_[c, cidx1])

                    s, corr = util.cov2corr(gmm.covariances_[c])
                    sigma_stat.loc[g2] = self._scale[channel1].inverse(s[cidx1])
                    interval_stat.loc[g2] = (
                        self._scale[channel1].inverse(gmm.means_[c, cidx1] -
                                                      s[cidx1]),
                        self._scale[channel1].inverse(gmm.means_[c, cidx1] +
                                                      s[cidx1]))

                    for cidx2, channel2 in enumerate(self.channels):
                        g3 = tuple(list(g2) + [channel2])
                        corr_stat.loc[g3] = corr[cidx1, cidx2]

                    corr_stat.drop(tuple(list(g2) + [channel1]), inplace=True)

        new_experiment = experiment.clone()

        if self.num_components > 1:
            new_experiment.add_condition(self.name, "category",
                                         event_assignments)

        if self.sigma > 0:
            for c in range(self.num_components):
                gate_name = "{}_{}".format(self.name, c + 1)
                new_experiment.add_condition(gate_name, "bool", event_gate[c])

        if self.posteriors:
            for c in range(self.num_components):
                post_name = "{}_{}_posterior".format(self.name, c + 1)
                new_experiment.add_condition(post_name, "double",
                                             event_posteriors[c])

        new_experiment.statistics[(self.name,
                                   "mean")] = pd.to_numeric(mean_stat)
        new_experiment.statistics[(self.name, "sigma")] = sigma_stat
        new_experiment.statistics[(self.name, "interval")] = interval_stat
        if len(corr_stat) > 0:
            new_experiment.statistics[(
                self.name, "correlation")] = pd.to_numeric(corr_stat)
        if self.num_components > 1:
            new_experiment.statistics[(
                self.name, "proportion")] = pd.to_numeric(prop_stat)

        new_experiment.history.append(
            self.clone_traits(transient=lambda _: True))
        return new_experiment

    def default_view(self, **kwargs):
        """
        Returns a diagnostic plot of the Gaussian mixture model.

        Returns
        -------
            IView : an IView; call plot() to see the diagnostic plot.
        """
        channels = kwargs.pop('channels', self.channels)
        scale = kwargs.pop('scale', self.scale)

        for c in channels:
            if c not in self.channels:
                raise util.CytoflowViewError(
                    'channels',
                    "Channel {} isn't in the operation's channels".format(c))

        for s in scale:
            if s not in self.channels:
                raise util.CytoflowViewError(
                    'scale',
                    "Channel {} isn't in the operation's channels".format(s))

        for c in channels:
            if c not in scale:
                scale[c] = util.get_default_scale()

        if len(channels) == 0:
            raise util.CytoflowViewError(
                'channels',
                "Must specify at least one channel for a default view")
        elif len(channels) == 1:
            v = GaussianMixture1DView(op=self)
            v.trait_set(channel=channels[0],
                        scale=scale[channels[0]],
                        **kwargs)
            return v

        elif len(channels) == 2:
            v = GaussianMixture2DView(op=self)
            v.trait_set(xchannel=channels[0],
                        ychannel=channels[1],
                        xscale=scale[channels[0]],
                        yscale=scale[channels[1]],
                        **kwargs)
            return v

        else:
            raise util.CytoflowViewError(
                'channels',
                "Can't specify more than two channels for a default view")
Example no. 21
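
Before this example: a minimal, standalone sketch of the sigma-gating math used in GaussianMixtureOp.apply() above.  The names (x, mu, cov, sigma) are illustrative, not part of the cytoflow API:

import numpy as np
import scipy.stats

def in_sigma_gate(x, mu, cov, sigma):
    # boolean mask: which rows of x lie within `sigma` std devs of mu
    s = np.linalg.pinv(cov)                  # (pseudo-)inverse covariance
    diff = x - mu
    dist = np.einsum('ij,jk,ik->i', diff, s, diff)   # squared Mahalanobis
    # the squared Mahalanobis distance is chi-square distributed, so the
    # 1-D sigma is converted to a chi-square quantile instead of sqrt-ing dist
    p = (scipy.stats.norm.cdf(sigma) - 0.5) * 2
    thresh = scipy.stats.chi2.ppf(p, 1)
    return dist <= thresh

events = np.random.multivariate_normal([0., 0.], np.eye(2), size=1000)
mask = in_sigma_gate(events, np.zeros(2), np.eye(2), sigma=2.0)
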
class RangeOp(HasStrictTraits):
    """Apply a range gate to a cytometry experiment.
    
    Attributes
    ----------
    name : Str
        The operation name.  Used to name the new metadata field in the
        experiment that's created by apply()
        
    channel : Str
        The name of the channel to apply the range gate.
        
    low : Float
        The lowest value to include in this gate.
        
    high : Float
        The highest value to include in this gate.
        
    Examples
    --------
    >>> range = flow.RangeOp()
    >>> range.name = "Y2-A+"
    >>> range.channel = 'Y2-A'
    >>> range.low = 0.3
    >>> range.high = 0.8
    >>> 
    >>> ex3 = range.apply(ex2)
    
    Alternatively (in an IPython notebook with `%matplotlib notebook`):
    
    >>> r = RangeOp(name = 'Y2-A+',
    ...             channel = 'Y2-A')
    >>> rv = r.default_view()
    >>> rv.interactive = True
    >>> rv.plot(ex2)
    >>> ### draw a range on the plot ###
    >>> ex3 = r.apply(ex2)
    """

    # traits
    id = Constant('edu.mit.synbio.cytoflow.operations.range')
    friendly_id = Constant('Range')

    name = CStr()
    channel = Str()
    low = CFloat()
    high = CFloat()

    def apply(self, experiment):
        """Applies the range gate to an experiment.
        
        Parameters
        ----------
        experiment : Experiment
            the old_experiment to which this op is applied
            
        Returns
        -------
            a new experiment, the same as old_experiment but with a new
            column the same as the operation name.  The bool is True if the
            event's measurement in self.channel is greater than self.low and
            less than self.high; it is False otherwise.
        """

        if not experiment:
            raise util.CytoflowOpError("No experiment specified")

        # make sure name got set!
        if not self.name:
            raise util.CytoflowOpError("You have to set the gate's name "
                                       "before applying it!")

        if self.name in experiment.data.columns:
            raise util.CytoflowOpError(
                "Experiment already has a column named {0}".format(self.name))

        if not self.channel:
            raise util.CytoflowOpError("Channel not specified")

        if self.channel not in experiment.channels:
            raise util.CytoflowOpError(
                "Channel {0} not in the experiment".format(self.channel))

        if self.high <= self.low:
            raise util.CytoflowOpError("range high must be > range low")

        if self.high <= experiment[self.channel].min():
            raise util.CytoflowOpError("range high must be > {0}".format(
                experiment[self.channel].min()))
        if self.low >= experiment[self.channel].max():
            raise util.CytoflowOpError("range low must be < {0}".format(
                experiment[self.channel].max()))

        gate = experiment[self.channel].between(self.low, self.high)
        new_experiment = experiment.clone()
        new_experiment.add_condition(self.name, "bool", gate)
        new_experiment.history.append(self.clone_traits())

        return new_experiment

    def default_view(self, **kwargs):
        return RangeSelection(op=self, **kwargs)
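
The heart of RangeOp.apply(), reduced to plain pandas (the column name is illustrative).  Note that Series.between() is inclusive on both ends by default, slightly wider than the strict inequalities the docstring describes:

import pandas as pd

data = pd.DataFrame({'Y2-A': [0.1, 0.5, 0.9]})
gate = data['Y2-A'].between(0.3, 0.8)
# gate is the boolean Series [False, True, False]
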
Example no. 22
class Range2DOp(HasStrictTraits):
    """
    Apply a 2D range gate to a cytometry experiment.
    
    Attributes
    ----------
    name : Str
        The operation name.  Used to name the new metadata field in the
        experiment that's created by :meth:`apply`
        
    xchannel : Str
        The name of the first channel to apply the range gate.
        
    xlow : Float
        The lowest value in xchannel to include in this gate.
        
    xhigh : Float
        The highest value in xchannel to include in this gate.
        
    ychannel : Str
        The name of the second channel to apply the range gate.
        
    ylow : Float
        The lowest value in ychannel to include in this gate.
        
    yhigh : Float
        The highest value in ychannel to include in this gate.
        
   
    Examples
    --------
    
    .. plot::
        :context: close-figs
        
        Make a little data set.
    
        >>> import cytoflow as flow
        >>> import_op = flow.ImportOp()
        >>> import_op.tubes = [flow.Tube(file = "Plate01/RFP_Well_A3.fcs",
        ...                              conditions = {'Dox' : 10.0}),
        ...                    flow.Tube(file = "Plate01/CFP_Well_A4.fcs",
        ...                              conditions = {'Dox' : 1.0})]
        >>> import_op.conditions = {'Dox' : 'float'}
        >>> ex = import_op.apply()
    
    Create and parameterize the operation.
    
    .. plot::
        :context: close-figs
        
        >>> r = flow.Range2DOp(name = "Range2D",
        ...                    xchannel = "V2-A",
        ...                    xlow = 10,
        ...                    xhigh = 1000,
        ...                    ychannel = "Y2-A",
        ...                    ylow = 1000,
        ...                    yhigh = 20000)
  
        
    Show the default view.  

    .. plot::
        :context: close-figs
            
        >>> r.default_view(huefacet = "Dox",
        ...                xscale = 'log',
        ...                yscale = 'log').plot(ex)
        
    Apply the gate, and show the result
    
    .. plot::
        :context: close-figs
        
        >>> ex2 = r.apply(ex)
        >>> ex2.data.groupby('Range2D').size()
        Range2D
        False    16405
        True      3595
        dtype: int64
        
    """
    
    # traits
    id = Constant('edu.mit.synbio.cytoflow.operations.range2d')
    friendly_id = Constant("2D Range")
    
    name = CStr()
    
    xchannel = Str()
    xlow = CFloat()
    xhigh = CFloat()
    
    ychannel = Str()
    ylow = CFloat()
    yhigh = CFloat()

    def apply(self, experiment):
        """Applies the 2D range gate to an experiment.
        
        Parameters
        ----------
        experiment : Experiment
            the old_experiment to which this op is applied
            
        Returns
        -------
        Experiment
            a new :class:`~Experiment`, the same as the old experiment but with 
            a new column with a data type of ``bool`` and the same as the 
            operation :attr:`name`.  The bool is ``True`` if the event's 
            measurement in :attr:`xchannel` is greater than :attr:`xlow` and
            less than :attr:`xhigh`, and the event's measurement in 
            :attr:`ychannel` is greater than :attr:`ylow` and less than 
            :attr:`yhigh`; it is ``False`` otherwise.
        """
        
        if experiment is None:
            raise util.CytoflowOpError('experiment',
                                       "No experiment specified")
        
        # make sure name got set!
        if not self.name:
            raise util.CytoflowOpError('name',
                                       "You have to set the gate's name "
                                       "before applying it!")
        
        # make sure old_experiment doesn't already have a column named self.name
        if self.name in experiment.data.columns:
            raise util.CytoflowOpError('name',
                                       "Experiment already contains a column {0}"
                                       .format(self.name))
        
        if not self.xchannel:
            raise util.CytoflowOpError('xchannel',
                                       "Must specify xchannel")

        if self.xchannel not in experiment.channels:
            raise util.CytoflowOpError('xchannel',
                                       "xchannel isn't in the experiment")

        if not self.ychannel:
            raise util.CytoflowOpError('ychannel',
                                       "Must specify ychannel")
        
        if self.ychannel not in experiment.channels:
            raise util.CytoflowOpError('ychannel',
                                       "ychannel isn't in the experiment")
        
        if self.xhigh <= experiment[self.xchannel].min():
            raise util.CytoflowOpError('xhigh',
                                       "x channel range high must be > {0}"
                                       .format(experiment[self.xchannel].min()))
        if self.xlow >= experiment[self.xchannel].max():
            raise util.CytoflowOpError('xlow',
                                       "x channel range low must be < {0}"
                                       .format(experiment[self.xchannel].max()))
            
        if self.yhigh <= experiment[self.ychannel].min():
            raise util.CytoflowOpError('yhigh',
                                       "y channel range high must be > {0}"
                                       .format(experiment[self.ychannel].min()))
        if self.ylow >= experiment[self.ychannel].max():
            raise util.CytoflowOpError('ylow',
                                       "y channel range low must be < {0}"
                                       .format(experiment[self.ychannel].max()))
        
        x = experiment[self.xchannel].between(self.xlow, self.xhigh)
        y = experiment[self.ychannel].between(self.ylow, self.yhigh)
        gate = pd.Series(x & y)
        
        new_experiment = experiment.clone() 
        new_experiment.add_condition(self.name, "bool", gate)   
        new_experiment.history.append(self.clone_traits(transient = lambda t: True))    
        return new_experiment
    
    def default_view(self, **kwargs):
        return RangeSelection2D(op = self, **kwargs)
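
The heart of Range2DOp.apply() is just the conjunction of two between() masks; a minimal pandas sketch with made-up data and column names:

import pandas as pd

df = pd.DataFrame({'V2-A': [5, 100, 2000],
                   'Y2-A': [500, 5000, 30000]})
gate = df['V2-A'].between(10, 1000) & df['Y2-A'].between(1000, 20000)
# only the middle event is inside both ranges
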
Example no. 23
class ParticleTracker(HasTraits):
    features = CStr('x, y')
    pNew = Float(0.2)
    r0 = Float(500)
    pLinkCutoff = Float(0.2)

    showTracks = Bool(True)
    showCandidates = Bool(True)

    candLineWidth = Int(4)
    chosenLineWidth = Int(5)
    trackLineWidth = Int(2)

    traits_view = View(
        Group(Item(name='features'), Item(name='pNew'), Item(name='r0'),
              Item(name='pLinkCutoff')),
        Group(Item(name='showTracks'), Item(name='showCandidates')))

    def __init__(self, dsviewer):
        HasTraits.__init__(self)
        self.dsviewer = dsviewer
        self.view = dsviewer.view
        self.do = dsviewer.do
        self.image = dsviewer.image

        self.tracker = None

        self.penCols = [
            wx.Colour(*pylab.cm.hsv(v, bytes=True))
            for v in np.linspace(0, 1, 16)
        ]
        self.penColsA = [
            wx.Colour(*pylab.cm.hsv(v, alpha=0.5, bytes=True))
            for v in np.linspace(0, 1, 16)
        ]
        self.CreatePens()

        dsviewer.do.overlays.append(self.DrawOverlays)

        dsviewer.paneHooks.append(self.GenTrackingPanel)

    @on_trait_change('candLineWidth, chosenLineWidth, trackLineWidth')
    def CreatePens(self):
        self.candPens = [
            wx.Pen(c, self.candLineWidth, wx.DOT) for c in self.penCols
        ]
        self.chosenPens = [
            wx.Pen(c, self.chosenLineWidth) for c in self.penCols
        ]
        self.trackPens = [
            wx.Pen(c, self.trackLineWidth) for c in self.penColsA
        ]

    def GenTrackingPanel(self, _pnl):
        item = afp.foldingPane(_pnl,
                               -1,
                               caption="Particle Tracking",
                               pinned=True)

        pan = self.edit_traits(parent=item, kind='panel')
        item.AddNewElement(pan.control)

        bTrack = wx.Button(item, -1, 'Track')
        bTrack.Bind(wx.EVT_BUTTON, self.OnTrack)
        item.AddNewElement(bTrack)

        _pnl.AddPane(item)

    @on_trait_change('pNew, r0, pLinkCutoff')
    def OnParamChange(self):
        if self.tracker is not None:
            self.tracker.pNew = self.pNew
            self.tracker.r0 = self.r0
            self.tracker.linkageCuttoffProb = self.pLinkCutoff

    @on_trait_change('features')
    def OnFeaturesChanged(self):
        self.tracker = None

    def OnTrack(self, event):
        pipeline = self.dsviewer.pipeline

        if self.tracker is None:
            featNames = [s.strip() for s in self.features.split(',')]

            def _calcWeights(s):
                fw = s.split('*')
                if len(fw) == 2:
                    return float(fw[0]), fw[1]
                else:
                    return 1.0, s

            weightedFeats = [_calcWeights(s) for s in featNames]

            feats = np.vstack([w * pipeline[fn] for w, fn in weightedFeats])

            self.tracker = tracking.Tracker(pipeline['t'], feats)

            self.tracker.pNew = self.pNew
            self.tracker.r0 = self.r0
            self.tracker.linkageCuttoffProb = self.pLinkCutoff

        for i in range(1, self.dsviewer.image.data.shape[2]):
            L = self.tracker.calcLinkages(i, i - 1)
            self.tracker.updateTrack(i, L)

        pipeline.selectedDataSource.clumps = self.tracker.clumpIndex
        pipeline.selectedDataSource.setMapping('clumpIndex', 'clumps')

        clumpSizes = np.zeros_like(self.tracker.clumpIndex)

        for i in set(self.tracker.clumpIndex):
            ind = (self.tracker.clumpIndex == i)

            clumpSizes[ind] = ind.sum()

        pipeline.selectedDataSource.clumpSizes = clumpSizes
        pipeline.selectedDataSource.setMapping('clumpSize', 'clumpSizes')

    def DrawOverlays(self, view, dc):
        if self.showTracks and self.tracker is not None:
            t = self.dsviewer.pipeline['t']
            x = self.dsviewer.pipeline['x'] / self.image.voxelsize[0]
            y = self.dsviewer.pipeline['y'] / self.image.voxelsize[1]

            xb, yb, zb = view._calcVisibleBounds()

            IFoc = (x >= xb[0]) * (y >= yb[0]) * (t >= (zb[0] - 5)) * (
                x < xb[1]) * (y < yb[1]) * (t < (zb[1] + 5))

            tFoc = list(set(self.tracker.clumpIndex[IFoc]))

            dc.SetBrush(wx.TRANSPARENT_BRUSH)

            for tN in tFoc:
                IFoc = (self.tracker.clumpIndex == tN)
                if IFoc.sum() >= 2:
                    pFoc = np.vstack(
                        view._PixelToScreenCoordinates3D(
                            x[IFoc], y[IFoc], t[IFoc])).T

                    dc.SetPen(self.trackPens[tN % 16])
                    dc.DrawSpline(pFoc)

        if self.showCandidates and self.tracker is not None:
            if view.do.zp >= 1:
                iCurr = view.do.zp
                iPrev = view.do.zp - 1
                links = self.tracker.calcLinkages(iCurr, iPrev)

                pRedDash = wx.Pen(wx.TheColourDatabase.FindColour('RED'), 2,
                                  wx.SHORT_DASH)
                dc.SetPen(pRedDash)

                dc.SetFont(
                    wx.Font(12, wx.FONTFAMILY_DEFAULT, wx.FONTSTYLE_NORMAL,
                            wx.FONTWEIGHT_BOLD))
                dc.SetTextForeground(wx.TheColourDatabase.FindColour('YELLOW'))

                for curFrameIndex, linkInfo in links.items():
                    inds = self.tracker.indicesByT[iCurr]
                    i = inds[curFrameIndex]

                    x1 = self.dsviewer.pipeline['x'][i] / self.image.voxelsize[0]
                    y1 = self.dsviewer.pipeline['y'][i] / self.image.voxelsize[1]

                    x1s, y1s = view._PixelToScreenCoordinates(x1, y1)

                    linkSrcs, linkPs = linkInfo
                    n = 0
                    for ls, lp in zip(linkSrcs, linkPs):
                        if n == 0:
                            dc.SetPen(
                                self.chosenPens[self.tracker.clumpIndex[ls] %
                                                16])
                        else:
                            dc.SetPen(
                                self.candPens[self.tracker.clumpIndex[ls] %
                                              16])

                        if ls == -1:
                            #new object
                            x0 = x1
                            y0 = y1 - 10

                        else:
                            x0 = self.dsviewer.pipeline['x'][ls] / self.image.voxelsize[0]
                            y0 = self.dsviewer.pipeline['y'][ls] / self.image.voxelsize[1]

                        x0s, y0s = view._PixelToScreenCoordinates(x0, y0)
                        dc.DrawLine(x0s, y0s, x1s, y1s)

                        if ls == -1:
                            dc.DrawText('N', x0s, y0s + 1)
                        dc.DrawText('%1.1f' % lp, (x0s + x1s) / 2 + 2,
                                    (y0s + y1s) / 2 + 2)
                        n += 1
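
As an aside, the per-clump size computation in OnTrack() can be done without the Python loop; a vectorized sketch, assuming clumpIndex is any 1-D integer label array:

import numpy as np

clumpIndex = np.array([0, 1, 1, 2, 2, 2])
_, inverse, counts = np.unique(clumpIndex,
                               return_inverse=True,
                               return_counts=True)
clumpSizes = counts[inverse]   # size of the clump each event belongs to
# array([1, 2, 2, 3, 3, 3])
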
Example no. 24
class BinningOp(HasStrictTraits):
    """
    Bin data along an axis.
    
    This operation creates equally spaced bins (in linear or log space)
    along an axis and adds a metadata column assigning each event to a bin.
    
    Attributes
    ----------
    name : Str
        The operation name.  Used to name the new metadata field in the
        experiment that's created by apply()
        
    channel : Str
        The name of the channel along which to bin.

    scale : Enum("linear", "log", "logicle")
        Make the bins equidistant along what scale?
        
    num_bins : Int
        The number of bins to make.  Must set either `num_bins` or `bin_width`.
        If both are defined, `num_bins` takes precedence.
        
    bin_width : Float
        The width of the bins.  Must set either `num_bins` or `bin_width`.  If
        `scale` is `log`, `bin_width` is in log-10 units; if `scale` is
        `logicle`, an error is thrown because the units are ill-defined.
        If both `num_bins` and `bin_width` are defined, `num_bins` takes 
        precedence. 
        
    bin_count_name : Str
        If `bin_count_name` is set, add another piece of metadata when calling
        `apply()` that contains the number of events in the bin that this event
        falls in.  Useful for filtering bins by # of events.
        
    Examples
    --------
    >>> bin_op = flow.BinningOp(name = "CFP_Bin",
    ...                         channel = "PE-Tx-Red-YG-A",
    ...                         scale = "linear",
    ...                         num_bins = 40)
    >>> ex5_binned = bin_op.apply(ex5)

    >>> h.huefacet = "CFP_Bin"
    >>> h.plot(ex5_binned)
    """
    
    # traits
    id = Constant('edu.mit.synbio.cytoflow.operations.binning')
    friendly_id = Constant("Binning")
    
    name = CStr()
    bin_count_name = CStr()
    channel = Str()
    num_bins = util.PositiveInt(Undefined)
    bin_width = util.PositiveFloat(Undefined)
    scale = util.ScaleEnum

    def apply(self, experiment):
        """Applies the binning to an experiment.
        
        Parameters
        ----------
        experiment : Experiment
            the old_experiment to which this op is applied
            
        Returns
        -------
            a new experiment, the same as old_experiment but with a new
            column the same as the operation name, containing the (integer)
            bin number assigned to each event's measurement in self.channel.
        """
        if not experiment:
            raise util.CytoflowOpError("no experiment specified")
        
        if not self.name:
            raise util.CytoflowOpError("name is not set")
        
        if self.name in experiment.data.columns:
            raise util.CytoflowOpError("name {0} is in the experiment already"
                                  .format(self.name))
            
        if self.bin_count_name and self.bin_count_name in experiment.data.columns:
            raise util.CytoflowOpError("bin_count_name {0} is in the experiment already"
                                  .format(self.bin_count_name))
        
        if not self.channel:
            raise util.CytoflowOpError("channel is not set")
        
        if self.channel not in experiment.data.columns:
            raise util.CytoflowOpError("channel {0} isn't in the experiment"
                                  .format(self.channel))
              
        if self.num_bins is Undefined and self.bin_width is Undefined:
            raise util.CytoflowOpError("must set either bin number or width")
        
        if self.num_bins is Undefined \
           and not (self.scale == "linear" or self.scale == "log"):
            raise util.CytoflowOpError("Can only use bin_width with linear or log scale") 
        
        scale = util.scale_factory(self.scale, experiment, self.channel)
        scaled_data = scale(experiment.data[self.channel])
            
        channel_min = bn.nanmin(scaled_data)
        channel_max = bn.nanmax(scaled_data)
        
        num_bins = self.num_bins if self.num_bins is not Undefined else \
                   int((channel_max - channel_min) / self.bin_width)

        bins = np.linspace(start = channel_min, stop = channel_max,
                           num = num_bins)
            
        # bins need to be internal; drop the first and last one
        bins = bins[1:-1]
            
        new_experiment = experiment.clone()
        new_experiment.add_condition(self.name,
                                     "int",
                                     np.digitize(scaled_data, bins))
        
        # if we're log-scaled (for example), don't label data that isn't
        # showable on a log scale!
        new_experiment.data.loc[np.isnan(scaled_data), self.name] = np.NaN
        
        # keep track of the bins we used, for pretty plotting later.
        new_experiment.metadata[self.name]["bin_scale"] = self.scale
        new_experiment.metadata[self.name]["bins"] = bins
        
        if self.bin_count_name:
            # TODO - this is a HUGE memory hog?!
            agg_count = new_experiment.data.groupby(self.name).count()
            agg_count = agg_count[agg_count.columns[0]]
            
            # have to make the condition a float64, because if we're in log
            # space there may be events that have NaN as the bin number.
            
            new_experiment.add_condition(
                self.bin_count_name,
                "float64",
                new_experiment[self.name].map(agg_count))
        
        new_experiment.history.append(self.clone_traits())
        return new_experiment
    
    def default_view(self, **kwargs):
        return BinningView(op = self, **kwargs)
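
In isolation, the relationship between the scaled data, the interior bin edges, and np.digitize looks like this (a sketch with made-up numbers on a log10 scale):

import numpy as np

data = np.array([12., 150., 8000.])
scaled = np.log10(data)
edges = np.linspace(scaled.min(), scaled.max(), num=5)[1:-1]  # interior edges only
bins = np.digitize(scaled, edges)   # bin index 0..len(edges) per event
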
Example no. 25
class FlowPeaksOp(HasStrictTraits):
    """
    This module uses the flowPeaks algorithm to assign events to clusters in
    an unsupervised manner.
    
    Call `estimate()` to compute the clusters.
      
    Calling `apply()` creates a new categorical metadata variable 
    named `name`, with possible values `{name}_1` ... `{name}_n`, where `n` is 
    the number of clusters the algorithm finds.
    
    The same model may not be appropriate for different subsets of the data set.
    If this is the case, you can use the `by` attribute to specify metadata by 
    which to aggregate the data before estimating (and applying) a model.  The 
    number of clusters is the same across each subset, though.

    Attributes
    ----------
    name : Str
        The operation name; determines the name of the new metadata column
        
    channels : List(Str)
        The channels to apply the clustering algorithm to.

    scale : Dict(Str : Enum("linear", "logicle", "log"))
        Re-scale the data in the specified channels before fitting.  If a 
        channel is in `channels` but not in `scale`, the current package-wide
        default (set with `set_default_scale`) is used.
    
    by : List(Str)
        A list of metadata attributes to aggregate the data before estimating
        the model.  For example, if the experiment has two pieces of metadata,
        `Time` and `Dox`, setting `by = ["Time", "Dox"]` will fit the model 
        separately to each subset of the data with a unique combination of
        `Time` and `Dox`.
        
    h : Float (default = 1.5)
        A scalar value by which to scale the covariance matrices of the 
        underlying density function.  (See `Notes`, below, for more details.)
        
    h0 : Float (default = 1.0)
        A scalar value by which to smooth the covariance matrices of the
        underlying density function.  (See `Notes`, below, for more details.)
        
    tol : Float (default = 0.5)
        How readily should clusters be merged?  Must be between 0 and 1.
        See `Notes`, below, for more details.
        
    merge_dist : Float (default = 5)
        How far apart can clusters be before they are merged?  This is
        a unit-free scalar, and is approximately the maximum number of
        k-means clusters between peaks. 
        
    find_outliers : Bool (default = False)
        Should the algorithm use an extra step to identify outliers?
        *Note: I have disabled this code until I can try to make it faster.*
        
    Notes
    -----
    
    This algorithm uses kmeans to find a large number of clusters, then 
    hierarchically merges those clusters.  Thus, the user does not need to
    specify the number of clusters in advance; and it can find non-convex
    clusters.  It also operates in an arbitrary number of dimensions.
    
    The merging happens in two steps.  First, the cluster centroids are used
    to estimate an underlying density function.  Then, the local maxima of
    the density function are found using a numerical optimization starting from
    each centroid, and k-means clusters that converge to the same local maximum
    are merged.  Finally, these clusters-of-clusters are merged if their local 
    maxima are (a) close enough, and (b) the density function between them is 
    smooth enough.  Thus, the final assignment of each event depends on the 
    k-means cluster it ends up in, and which  cluster-of-clusters that k-means 
    centroid is assigned to.
    
    There are a lot of parameters that affect this process.  The k-means
    clustering is pretty robust (though somewhat sensitive to the number of 
    clusters, which is currently not exposed in the API.) The most important
    are exposed as traits of the `FlowPeaksOp` class.  These include:
    - h, h0: sometimes the density function is too "rough" to find good
      local maxima.  These parameters smooth it out by widening the
      covariance matrices.  Increasing `h` makes the density rougher;
      increasing `h0` makes it smoother.

    - tol: how smooth does the density function have to be between two
      density maxima to merge them?  Must be between 0 and 1.

    - merge_dist: how close must two maxima be to merge them?  This value is
      a unit-free scalar, and is approximately the number of k-means
      clusters between the two maxima.
    
    
    For details and a theoretical justification, see
    
    flowPeaks: a fast unsupervised clustering for flow cytometry data via
    K-means and density peak finding 
    
    Yongchao Ge  Stuart C. Sealfon
    Bioinformatics (2012) 28 (15): 2052-2058.         
  
    Examples
    --------
    
    >>> fp_op = FlowPeaksOp(name = "Clust",
    ...                     channels = ["V2-A", "Y2-A"],
    ...                     scale = {"V2-A" : "log"})
    >>> fp_op.estimate(ex2)
    >>> fp_op.default_view(channels = ["V2-A", "Y2-A"]).plot(ex2)
    >>> ex3 = fp_op.apply(ex2)
    """

    id = Constant('edu.mit.synbio.cytoflow.operations.flowpeaks')
    friendly_id = Constant("FlowPeaks Clustering")

    name = CStr()
    channels = List(Str)
    scale = Dict(Str, util.ScaleEnum)
    by = List(Str)
    #     find_outliers = Bool(False)

    # parameters that control estimation, with sensible defaults
    h = util.PositiveFloat(1.5, allow_zero=False)
    h0 = util.PositiveFloat(1, allow_zero=False)
    tol = util.PositiveFloat(0.5, allow_zero=False)
    merge_dist = util.PositiveFloat(5, allow_zero=False)

    # parameters that control outlier selection, with sensible defaults

    _kmeans = Dict(Any,
                   Instance(sklearn.cluster.MiniBatchKMeans),
                   transient=True)
    _normals = Dict(Any, List(Function), transient=True)
    _density = Dict(Any, Function, transient=True)
    _peaks = Dict(Any, List(Array), transient=True)
    _cluster_peak = Dict(Any, List,
                         transient=True)  # kmeans cluster idx --> peak idx
    _cluster_group = Dict(Any, List,
                          transient=True)  # kmeans cluster idx --> group idx
    _scale = Dict(Str, Instance(util.IScale), transient=True)

    def estimate(self, experiment, subset=None):
        """
        Estimate the k-means clustering and the flowPeaks density model
        """

        if experiment is None:
            raise util.CytoflowOpError("No experiment specified")

        if len(self.channels) == 0:
            raise util.CytoflowOpError("Must set at least one channel")

        for c in self.channels:
            if c not in experiment.data:
                raise util.CytoflowOpError(
                    "Channel {0} not found in the experiment".format(c))

        for c in self.scale:
            if c not in self.channels:
                raise util.CytoflowOpError(
                    "Scale set for channel {0}, but it isn't "
                    "in the experiment".format(c))

        for b in self.by:
            if b not in experiment.data:
                raise util.CytoflowOpError("Aggregation metadata {0} not found"
                                           " in the experiment".format(b))
            if len(experiment.data[b].unique()) > 100:  #WARNING - magic number
                raise util.CytoflowOpError(
                    "More than 100 unique values found for"
                    " aggregation metadata {0}.  Did you"
                    " accidentally specify a data channel?".format(b))

        if subset:
            try:
                experiment = experiment.query(subset)
            except Exception:
                raise util.CytoflowViewError(
                    "Subset string '{0}' isn't valid".format(subset))

            if len(experiment) == 0:
                raise util.CytoflowViewError(
                    "Subset string '{0}' returned no events".format(subset))

        if self.by:
            groupby = experiment.data.groupby(self.by)
        else:
            # use a lambda expression to return a group that contains
            # all the events
            groupby = experiment.data.groupby(lambda _: True)

        # get the scale. estimate the scale params for the ENTIRE data set,
        # not subsets we get from groupby().  And we need to save it so that
        # the data is transformed the same way when we apply()
        for c in self.channels:
            if c in self.scale:
                self._scale[c] = util.scale_factory(self.scale[c],
                                                    experiment,
                                                    channel=c)
            else:
                self._scale[c] = util.scale_factory(util.get_default_scale(),
                                                    experiment,
                                                    channel=c)

        for data_group, data_subset in groupby:
            if len(data_subset) == 0:
                raise util.CytoflowOpError(
                    "Group {} had no data".format(data_group))
            x = data_subset.loc[:, self.channels[:]]
            for c in self.channels:
                x[c] = self._scale[c](x[c])

            # drop data that isn't in the scale range
            for c in self.channels:
                x = x[~(np.isnan(x[c]))]
            x = x.values

            #### choose the number of clusters and fit the kmeans
            num_clusters = [
                util.num_hist_bins(x[:, c]) for c in range(len(self.channels))
            ]
            num_clusters = np.ceil(np.median(num_clusters))
            num_clusters = int(num_clusters)

            self._kmeans[data_group] = kmeans = \
                sklearn.cluster.MiniBatchKMeans(n_clusters = num_clusters)

            kmeans.fit(x)
            x_labels = kmeans.predict(x)
            d = len(self.channels)

            #### use the kmeans centroids to parameterize a finite gaussian
            #### mixture model which estimates the density function

            s0 = np.zeros([d, d])
            for j in range(d):
                r = x[:, j].max() - x[:, j].min()
                s0[j, j] = (r / (num_clusters**(1. / d)))**0.5
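            # (s0 is a diagonal smoothing matrix: its scale shrinks as the
            # number of k-means clusters grows, and it keeps s_smooth, below,
            # well-behaved when a cluster has too few events for a good
            # covariance estimate.)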

            means = []
            weights = []
            normals = []
            beta_max = []

            for k in range(num_clusters):
                xk = x[x_labels == k]
                num_k = np.sum(x_labels == k)
                weight_k = num_k / len(x_labels)
                mu = xk.mean(axis=0)
                means.append(mu)
                s = np.cov(xk, rowvar=False)

                el = num_k / (num_clusters + num_k)
                s_smooth = el * self.h * s + (1.0 - el) * self.h0 * s0

                n = scipy.stats.multivariate_normal(mean=mu, cov=s_smooth)
                weights.append(weight_k)
                normals.append(lambda x, n=n: n.pdf(x))

                # get an appropriate step size for peak finding: the smallest
                # standard deviation along any axis of the smoothed covariance
                min_b = np.inf
                for b in np.diagonal(s_smooth):
                    if np.sqrt(b) < min_b:
                        min_b = np.sqrt(b)
                beta_max.append(min_b)

            self._normals[data_group] = normals
            density = lambda x, weights=weights, normals=normals: \
                np.sum([w * n(x) for w, n in zip(weights, normals)], axis=0)
            self._density[data_group] = density

            ### use optimization on the finite gmm to find the local peak for
            ### each kmeans cluster
            peaks = []
            peak_clusters = []  # peak idx --> list of clusters

            min_mu = [np.inf] * len(self.channels)
            max_mu = [-1.0 * np.inf] * len(self.channels)

            for k in range(num_clusters):
                mu = means[k]
                for ci in range(len(self.channels)):
                    if mu[ci] < min_mu[ci]:
                        min_mu[ci] = mu[ci]
                    if mu[ci] > max_mu[ci]:
                        max_mu[ci] = mu[ci]

            constraints = []
            for ci, c in enumerate(self.channels):
                constraints.append({
                    'type':
                    'ineq',
                    'fun':
                    lambda x, min_mu=min_mu[ci]: x - min_mu
                })
                constraints.append({
                    'type':
                    'ineq',
                    'fun':
                    lambda x, max_mu=max_mu[ci]: max_mu - x
                })

            for k in range(num_clusters):
                mu = means[k]
                f = lambda x: -1.0 * density(x)

                res = scipy.optimize.minimize(f,
                                              mu,
                                              method='COBYLA',
                                              constraints=constraints,
                                              options={
                                                  'rhobeg': beta_max[k],
                                                  'maxiter': 5000
                                              })
                if not res.success:
                    raise util.CytoflowOpError(
                        "Peak finding failed for cluster {}: {}".format(
                            k, res.message))


#                 ### The peak-searching algorithm from the paper.  works fine,
#                 ### but slow!  we get similar results with the COBYLA
#                 ### optimization method from scipy, using an appropriate rho
#                 x0 = x = means[k]
#                 k0 = k
#                 b = beta_max[k] / 10.0
#                 Nsuc = 0
#                 n = 0
#
#                 while(n < 1000):
# #                     df = scipy.misc.derivative(density, x, 1e-6)
#                     df = statsmodels.tools.numdiff.approx_fprime(x, density)
#                     if np.linalg.norm(df) < 1e-3:
#                         break
#
#                     y = x + b * df / np.linalg.norm(df)
#                     if density(y) <= density(x):
#                         Nsuc = 0
#                         b = b / 2.0
#                         continue
#
#                     Nsuc += 1
#                     if Nsuc >= 2:
#                         b = min(2*b, beta_max[k])
#
#                     ky = kmeans.predict(y[np.newaxis, :])[0]
#                     if ky == k:
#                         x = y
#                     else:
#                         k = ky
#                         b = beta_max[k] / 10.0
#                         mu = means[k]
#                         if density(mu) > density(y):
#                             x = mu
#                         else:
#                             x = y
#
#                     n += 1
#
#
#
#                 print("{} --> {}, {}".format(x0, x, n))

                merged = False
                for pi, p in enumerate(peaks):
                    if np.linalg.norm(p - res.x) < (1e-2):
                        peak_clusters[pi].append(k)
                        merged = True
                        break

                if not merged:
                    peak_clusters.append([k])
                    peaks.append(res.x)

            self._peaks[data_group] = peaks

            ### merge peaks that are sufficiently close

            groups = [[x] for x in range(len(peaks))]
            peak_groups = list(range(len(peaks)))  # peak idx --> group idx

            def max_tol(x, y):
                f = lambda a: density(a[np.newaxis, :])
                n = len(x)
                n_scale = 1

                def tol(t):
                    zt = x + t * (y - x)
                    fhat_zt = f(x) + t * (f(y) - f(x))
                    return -1.0 * abs((f(zt) - fhat_zt) / fhat_zt) * n_scale

                res = scipy.optimize.minimize_scalar(tol,
                                                     bounds=[0, 1],
                                                     method='Bounded')

                if res.status != 0:
                    raise util.CytoflowOpError(
                        "tol optimization failed for {}, {}".format(x, y))
                return -1.0 * res.fun

            def nearest_neighbor_dist(k):
                min_dist = np.inf

                for i in range(num_clusters):
                    if i == k:
                        continue
                    dist = np.linalg.norm(means[k] - means[i])
                    if dist < min_dist:
                        min_dist = dist

                return min_dist

            sk = [nearest_neighbor_dist(x) for x in range(num_clusters)]

            def s(x):
                k = kmeans.predict(x[np.newaxis, :])[0]
                return sk[k]

            def can_merge(g, h):
                for pg in g:
                    for ph in h:
                        vg = peaks[pg]
                        vh = peaks[ph]
                        dist_gh = np.linalg.norm(vg - vh)

                        if max_tol(vg, vh) < self.tol and dist_gh / (
                                s(vg) + s(vh)) <= self.merge_dist:
                            return True

                return False

            while True:
                if len(groups) == 1:
                    break

                # find closest mergable groups
                min_dist = np.inf
                for gi in range(len(groups)):
                    g = groups[gi]

                    for hi in range(gi + 1, len(groups)):
                        h = groups[hi]

                        if can_merge(g, h):
                            dist_gh = np.inf
                            for pg in g:
                                vg = peaks[pg]
                                for ph in h:
                                    vh = peaks[ph]
                                    dist_gh = min(dist_gh,
                                                  np.linalg.norm(vg - vh))

                            if dist_gh < min_dist:
                                min_gi = gi
                                min_hi = hi
                                min_dist = dist_gh

                if min_dist == np.inf:
                    break

                # merge the groups
                groups[min_gi].extend(groups[min_hi])
                for g in groups[min_hi]:
                    peak_groups[g] = min_gi
                del groups[min_hi]

            # record, for this group, which merged group and which peak
            # each k-means cluster maps to
            cluster_group = [0] * num_clusters
            cluster_peaks = [0] * num_clusters

            for gi, g in enumerate(groups):
                for p in g:
                    for cluster in peak_clusters[p]:
                        cluster_group[cluster] = gi
                        cluster_peaks[cluster] = p

            self._cluster_peak[data_group] = cluster_peaks
            self._cluster_group[data_group] = cluster_group

    def apply(self, experiment):
        """
        Apply the flowPeaks clustering to the data
        """

        if experiment is None:
            raise util.CytoflowOpError("No experiment specified")

        # make sure name got set!
        if not self.name:
            raise util.CytoflowOpError("You have to set the gate's name "
                                       "before applying it!")

        if self.name in experiment.data.columns:
            raise util.CytoflowOpError(
                "Experiment already has a column named {0}".format(self.name))

        if len(self.channels) == 0:
            raise util.CytoflowOpError("Must set at least one channel")

        for c in self.channels:
            if c not in experiment.data:
                raise util.CytoflowOpError(
                    "Channel {0} not found in the experiment".format(c))

        for c in self.scale:
            if c not in self.channels:
                raise util.CytoflowOpError(
                    "Scale set for channel {0}, but it isn't "
                    "in the experiment".format(c))

        for b in self.by:
            if b not in experiment.data:
                raise util.CytoflowOpError("Aggregation metadata {0} not found"
                                           " in the experiment".format(b))
            if len(experiment.data[b].unique()) > 100:  #WARNING - magic number
                raise util.CytoflowOpError(
                    "More than 100 unique values found for"
                    " aggregation metadata {0}.  Did you"
                    " accidentally specify a data channel?".format(b))

        if self.by:
            groupby = experiment.data.groupby(self.by)
        else:
            # use a lambda expression to return a group that contains
            # all the events
            groupby = experiment.data.groupby(lambda _: True)

        event_assignments = pd.Series(["{}_None".format(self.name)] *
                                      len(experiment),
                                      dtype="object")

        for group, data_subset in groupby:
            if len(data_subset) == 0:
                raise util.CytoflowOpError(
                    "Group {} had no data".format(group))
            x = data_subset.loc[:, self.channels[:]]
            for c in self.channels:
                x[c] = self._scale[c](x[c])

            # which values are missing?

            x_na = pd.Series([False] * len(x))
            for c in self.channels:
                x_na[np.isnan(x[c]).values] = True

            x = x.values
            x_na = x_na.values
            group_idx = groupby.groups[group]

            kmeans = self._kmeans[group]

            predicted_km = np.full(len(x), -1, "int")
            predicted_km[~x_na] = kmeans.predict(x[~x_na])

            groups = np.asarray(self._cluster_group[group])
            predicted_group = np.full(len(x), -1, "int")
            predicted_group[~x_na] = groups[predicted_km[~x_na]]

            #             num_groups = len(set(groups))
            #             if self.find_outliers:
            #                 density = self._density[group]
            #                 max_d = [-1.0 * np.inf] * num_groups
            #
            #                 for xi in range(len(x)):
            #                     if x_na[xi]:
            #                         continue
            #
            #                     x_c = predicted_group[xi]
            #                     d_x_c = density(x[xi])
            #                     if d_x_c > max_d[x_c]:
            #                         max_d[x_c] = d_x_c
            #
            #                 group_density = [None] * num_groups
            #                 group_weight = [0.0] * num_groups
            #
            #                 for c in range(num_groups):
            #                     num_c = np.sum(predicted_group == c)
            #                     clusters = np.argwhere(groups == c).flatten()
            #
            #                     normals = []
            #                     weights = []
            #                     for k in range(len(clusters)):
            #                         num_k = np.sum(predicted_km == k)
            #                         weight_k = num_k / num_c
            #                         group_weight[c] += num_k / len(x)
            #                         weights.append(weight_k)
            #                         normals.append(self._normals[group][k])
            #
            #                     group_density[c] = lambda x, weights = weights, normals = normals: np.sum([w * n(x) for w, n in zip(weights, normals)], axis = 0)
            #
            #                 for xi in range(len(x)):
            #                     if x_na[xi]:
            #                         continue
            #
            #                     x_c = predicted_group[xi]
            #
            #                     if density(x[xi]) / max_d[x_c] < 0.01:
            #                         predicted_group[xi] = -1
            #                         continue
            #
            #                     sum_d = 0
            #                     for c in set(groups):
            #                         sum_d += group_weight[c] * group_density[c](x[xi])
            #
            #                     if group_weight[x_c] * group_density[x_c](x[xi]) / sum_d < 0.8:
            #                         predicted_group[xi] = -1

            #
            #                     max_d = -1.0 * np.inf
            #                     for x_c in x[predicted_group == c]:
            #                         x_c_d = density(x_c)
            #                         if x_c_d > max_d:
            #                             max_d = x_c_d
            #
            #                     for i in range(len(x)):
            #                         if predicted_group[i] == c and density(x[i]) / max_d <= 0.01:
            #                             predicted_group[i] = -1
            #
            #

            predicted_str = pd.Series(["(none)"] * len(predicted_group))
            for c in range(len(self._cluster_group[group])):
                predicted_str[predicted_group == c] = "{0}_{1}".format(
                    self.name, c + 1)
            predicted_str[predicted_group == -1] = "{0}_None".format(self.name)
            predicted_str.index = group_idx

            event_assignments.iloc[group_idx] = predicted_str

        new_experiment = experiment.clone()
        new_experiment.add_condition(self.name, "category", event_assignments)

        #         new_experiment.statistics[(self.name, "centers")] = pd.to_numeric(centers_stat)

        new_experiment.history.append(
            self.clone_traits(transient=lambda _: True))
        return new_experiment

    def default_view(self, **kwargs):
        """
        Returns a diagnostic plot of the flowPeaks clustering.
         
        Returns
        -------
            IView : an IView, call plot() to see the diagnostic plot.
        """
        channels = kwargs.pop('channels', self.channels)
        scale = kwargs.pop('scale', self.scale)
        density = kwargs.pop('density', False)

        for c in channels:
            if c not in self.channels:
                raise util.CytoflowViewError(
                    "Channel {} isn't in the operation's channels".format(c))

        for s in scale:
            if s not in self.channels:
                raise util.CytoflowViewError(
                    "Channel {} isn't in the operation's channels".format(s))

        for c in channels:
            if c not in scale:
                scale[c] = util.get_default_scale()

        if len(channels) == 0:
            raise util.CytoflowViewError(
                "Must specify at least one channel for a default view")
        elif len(channels) == 1:
            return FlowPeaks1DView(op=self,
                                   channel=channels[0],
                                   scale=scale[channels[0]],
                                   **kwargs)
        elif len(channels) == 2:
            if density:
                return FlowPeaks2DDensityView(op=self,
                                              xchannel=channels[0],
                                              ychannel=channels[1],
                                              xscale=scale[channels[0]],
                                              yscale=scale[channels[1]],
                                              **kwargs)
            else:
                return FlowPeaks2DView(op=self,
                                       xchannel=channels[0],
                                       ychannel=channels[1],
                                       xscale=scale[channels[0]],
                                       yscale=scale[channels[1]],
                                       **kwargs)
        else:
            raise util.CytoflowViewError(
                "Can't specify more than two channels for a default view")
Example n. 26
class GaussianMixture1DOp(HasStrictTraits):
    """
    This module fits a Gaussian mixture model with a specified number of
    components to a channel.
    
    Creates a new categorical metadata variable named `name`, with possible
    values `name_1` ... `name_n` where `n` is the number of components.
    An event is assigned to the `name_i` category if it falls within `sigma`
    standard deviations of the component's mean.  If that is true for multiple
    categories (or if `sigma == 0.0`), the event is assigned to the category 
    with the highest posterior probability.  If the event doesn't fall into
    any category, it is assigned to `name_None`.
    
    As a special case, if `num_components` is `1` and `sigma` > 0.0, then
    the new condition is boolean, `True` if the event fell in the gate and
    `False` otherwise.
    
    Optionally, if `posteriors` is `True`, this module will also compute the 
    posterior probability of each event in its assigned component, returning
    it in a new column named `{Name}_Posterior`.
    
    Finally, the same mixture model (mean and standard deviation) may not
    be appropriate for every subset of the data.  If this is the case, you
    can use the `by` attribute to specify metadata by which to aggregate
    the data before estimating (and applying) a mixture.  The number of 
    components is the same across each subset, though.
    
    Attributes
    ----------
    name : Str
        The operation name; determines the name of the new metadata column
        
    channel : Str
        Which channel to apply the mixture model to.
        
    num_components : Int (default = 1)
        How many components to fit to the data?  Must be positive.

    sigma : Float (default = 0.0)
        How many standard deviations on either side of the mean to include
        in each category?  If an event is in multiple components, assign it
        to the component with the highest posterior probability.  If 
        `sigma == 0.0`, categorize *all* the data by assigning each event to
        the component with the highest posterior probability.  Must be >= 0.0.
    
    by : List(Str)
        A list of metadata attributes to aggregate the data before estimating
        the model.  For example, if the experiment has two pieces of metadata,
        `Time` and `Dox`, setting `by = ["Time", "Dox"]` will fit the model 
        separately to each subset of the data with a unique combination of
        `Time` and `Dox`.
        
    scale : Enum("linear", "log", "logicle") (default = "linear")
        Re-scale the data before fitting the model?
        
    posteriors : Bool (default = False)
        If `True`, add a column named `{Name}_Posterior` giving the posterior
        probability that the event is in the component to which it was
        assigned.  Useful for filtering out low-probability events.
        
    Examples
    --------
    
    >>> gauss_op = GaussianMixture1DOp(name = "Gaussian",
    ...                                channel = "Y2-A",
    ...                                num_components = 2)
    >>> gauss_op.estimate(ex2)
    >>> gauss_op.default_view().plot(ex2)
    >>> ex3 = gauss_op.apply(ex2)
    """
    
    id = Constant('edu.mit.synbio.cytoflow.operations.gaussian_1d')
    friendly_id = Constant("1D Gaussian Mixture")
    
    name = CStr()
    channel = Str()
    num_components = util.PositiveInt(1)
    sigma = util.PositiveFloat(0.0, allow_zero = True)
    by = List(Str)
    scale = util.ScaleEnum
    posteriors = Bool(False)
    
    # the key is either a single value or a tuple
    _gmms = Dict(Any, Instance(mixture.GMM), transient = True)
    _scale = Instance(util.IScale, transient = True)
    
    def estimate(self, experiment, subset = None):
        """
        Estimate the Gaussian mixture model parameters
        """
        
        if not experiment:
            raise util.CytoflowOpError("No experiment specified")

        if self.channel not in experiment.data:
            raise util.CytoflowOpError("Column {0} not found in the experiment"
                                  .format(self.channel))
       
        for b in self.by:
            if b not in experiment.data:
                raise util.CytoflowOpError("Aggregation metadata {0} not found"
                                      " in the experiment"
                                      .format(b))
            if len(experiment.data[b].unique()) > 100: #WARNING - magic number
                raise util.CytoflowOpError("More than 100 unique values found for"
                                      " aggregation metadata {0}.  Did you"
                                      " accidentally specify a data channel?"
                                      .format(b))
                
            
        if self.num_components == 1 and self.sigma == 0.0:
            raise util.CytoflowOpError("If num_components == 1, sigma must be > 0")
        
        if subset:
            try:
                experiment = experiment.query(subset)
            except:
                raise util.CytoflowViewError("Subset string '{0}' isn't valid"
                                        .format(subset))
                
            if len(experiment) == 0:
                raise util.CytoflowViewError("Subset string '{0}' returned no events"
                                        .format(subset))
                
        if self.by:
            groupby = experiment.data.groupby(self.by)
        else:
            # use a lambda expression to return a group that contains
            # all the events
            groupby = experiment.data.groupby(lambda x: True)
            
        # get the scale. estimate the scale params for the ENTIRE data set,
        # not subsets we get from groupby().  And we need to save it so that
        # the data is transformed the same way when we apply()
        self._scale = util.scale_factory(self.scale, experiment, self.channel)
        
        gmms = {}
            
        for group, data_subset in groupby:
            if len(data_subset) == 0:
                raise util.CytoflowOpError("Group {} had no data"
                                           .format(group))
            x = data_subset[self.channel].reset_index(drop = True)
            x = self._scale(x)
            
            # drop data that isn't in the scale range
            #x = pd.Series(self._scale(x)).dropna()
            x = x[~np.isnan(x)]
            
            gmm = mixture.GMM(n_components = self.num_components,
                              random_state = 1)
            gmm.fit(x[:, np.newaxis])
            
            if not gmm.converged_:
                raise util.CytoflowOpError("Estimator didn't converge"
                                      " for group {0}"
                                      .format(group))
                
            # to make sure we have a stable ordering, sort the components
            # by the means (so the first component has the lowest mean, 
            # the next component has the next-lowest, etc.)
            
            sort_idx = np.argsort(gmm.means_[:, 0])
            gmm.means_ = gmm.means_[sort_idx]
            gmm.weights_ = gmm.weights_[sort_idx]
            gmm.covars_ = gmm.covars_[sort_idx]
           
            gmms[group] = gmm
            
        self._gmms = gmms
    
    def apply(self, experiment):
        """
        Assigns new metadata to events using the mixture model estimated
        in `estimate`.
        """
            
        if not experiment:
            raise util.CytoflowOpError("No experiment specified")

        if not self._gmms:
            raise util.CytoflowOpError("No model found.  Did you forget to "
                                       "call estimate()?")
        
        # make sure name got set!
        if not self.name:
            raise util.CytoflowOpError("You have to set the gate's name "
                                  "before applying it!")

        if self.name in experiment.data.columns:
            raise util.CytoflowOpError("Experiment already has a column named {0}"
                                  .format(self.name))

        if not self._scale:
            raise util.CytoflowOpError("Couldn't find _scale.  What happened??")

        if self.channel not in experiment.data:
            raise util.CytoflowOpError("Column {0} not found in the experiment"
                                  .format(self.channel))

            
        if (self.name + "_Posterior") in experiment.data:
            raise util.CytoflowOpError("Column {0} already found in the experiment"
                                  .format(self.name + "_Posterior"))

        if self.posteriors:
            col_name = "{0}_Posterior".format(self.name)
            if col_name in experiment.data:
                raise util.CytoflowOpError("Column {0} already found in the experiment"
                              .format(col_name))
       
        for b in self.by:
            if b not in experiment.data:
                raise util.CytoflowOpError("Aggregation metadata {0} not found"
                                      " in the experiment"
                                      .format(b))

            if len(experiment.data[b].unique()) > 100: #WARNING - magic number
                raise util.CytoflowOpError("More than 100 unique values found for"
                                      " aggregation metadata {0}.  Did you"
                                      " accidentally specify a data channel?"
                                      .format(b))
                           
        if self.sigma < 0.0:
            raise util.CytoflowOpError("sigma must be >= 0.0")

        if self.by:
            groupby = experiment.data.groupby(self.by)
        else:
            # use a lambda expression to return a group that
            # contains all the events
            groupby = experiment.data.groupby(lambda x: True)

        for group, data_subset in groupby:
            if group not in self._gmms:
                raise util.CytoflowOpError("Can't find group in model. "
                                           "Did you call estimate()?")

        event_assignments = pd.Series([None] * len(experiment), dtype = "object")
                                      
        if self.posteriors:
            event_posteriors = pd.Series([0.0] * len(experiment))
            
        # what we DON'T want to do is iterate through event-by-event.
        # the more of this we can push into numpy, sklearn and pandas,
        # the faster it's going to be.
        
        for group, data_subset in groupby:
            if group not in self._gmms:
                # there weren't any events in this group, so we didn't get
                # a gmm.
                continue
            
            gmm = self._gmms[group]
            x = data_subset[self.channel]
            x = self._scale(x).values
            
            # which values are missing?
            x_na = np.isnan(x)
            
            group_idx = groupby.groups[group]
            
            # make a preliminary assignment
            predicted = np.full(len(x), -1, "int")
            predicted[~x_na] = gmm.predict(x[~x_na, np.newaxis])
            
            # if we're doing sigma-based gating, for each component check
            # to see if the event is in the sigma gate.
            if self.sigma > 0.0:
                
                # make a quick dataframe with the value and the predicted
                # component
                gate_df = pd.DataFrame({"x" : x, "p" : predicted})

                # for each component, get the low and the high threshold
                for c in range(0, self.num_components):
                    lo = (gmm.means_[c][0]    # @UnusedVariable
                          - self.sigma * np.sqrt(gmm.covars_[c][0]))
                    hi = (gmm.means_[c][0]    # @UnusedVariable
                          + self.sigma * np.sqrt(gmm.covars_[c][0]))
                    
                    # and build an expression with numexpr so it evaluates fast!
                    gate_bool = gate_df.eval("p == @c and x >= @lo and x <= @hi").values
                    predicted[np.logical_and(predicted == c, gate_bool == False)] = -1
        
            predicted_str = pd.Series(["(none)"] * len(predicted))
            for c in range(0, self.num_components):
                predicted_str[predicted == c] = "{0}_{1}".format(self.name, c + 1)
            predicted_str[predicted == -1] = "{0}_None".format(self.name)
            predicted_str.index = group_idx

            event_assignments.iloc[group_idx] = predicted_str
                                
            if self.posteriors:
                probability = np.full((len(x), self.num_components), 0.0, "float")
                probability[~x_na, :] = gmm.predict_proba(x[~x_na, np.newaxis])
                posteriors = pd.Series([0.0] * len(predicted))
                for i in range(0, self.num_components):
                    posteriors[predicted == i] = probability[predicted == i, i]
                posteriors.index = group_idx
                event_posteriors.iloc[group_idx] = posteriors
                    
        new_experiment = experiment.clone()
        
        if self.num_components == 1:
            new_experiment.add_condition(self.name, "bool", event_assignments == "{0}_1".format(self.name))
        else:
            new_experiment.add_condition(self.name, "category", event_assignments)
            
        if self.posteriors:
            col_name = "{0}_Posterior".format(self.name)
            new_experiment.add_condition(col_name, "float", event_posteriors)
            
        new_experiment.history.append(self.clone_traits(transient = lambda t: True))
        return new_experiment
    
    def default_view(self, **kwargs):
        """
        Returns a diagnostic plot of the Gaussian mixture model.
        
        Returns
        -------
            IView : an IView, call plot() to see the diagnostic plot.
        """
        return GaussianMixture1DView(op = self, **kwargs)
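
The sigma gate in apply() above reduces to two per-component thresholds on the scaled data: mean ± sigma * sqrt(variance). A self-contained sketch of just that step on synthetic data (all names here are illustrative, not part of the operation's API):

import numpy as np
import pandas as pd

# synthetic 1D data; pretend every event was assigned to component 0
x = np.random.default_rng(1).normal(loc = 2.0, scale = 0.5, size = 1000)
predicted = np.zeros(len(x), dtype = int)
mean, var, sigma = 2.0, 0.25, 2.0

lo = mean - sigma * np.sqrt(var)
hi = mean + sigma * np.sqrt(var)

# events predicted in component 0 but outside [lo, hi] get un-assigned (-1)
gate_df = pd.DataFrame({"x" : x, "p" : predicted})
c = 0
in_gate = gate_df.eval("p == @c and x >= @lo and x <= @hi").values
predicted[np.logical_and(predicted == c, ~in_gate)] = -1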
Example n. 27
class GaussianMixture1DOp(HasStrictTraits):
    """
    This module fits a Gaussian mixture model with a specified number of
    components to a channel.
    
    .. warning:: 
    
        :class:`GaussianMixture1DOp` is **DEPRECATED** and will be removed
        in a future release.  It doesn't correctly handle the case where an 
        event is present in more than one component.  Please use
        :class:`GaussianMixtureOp` instead!
    
    Creates a new categorical metadata variable named :attr:`name`, with possible
    values ``name_1`` ... ``name_n`` where ``n`` is the number of components.
    An event is assigned to the ``name_i`` category if it falls within :attr:`sigma`
    standard deviations of the component's mean.  If that is true for multiple
    categories (or if :attr:`sigma` is ``0.0``), the event is assigned to the category 
    with the highest posterior probability.  If the event doesn't fall into
    any category, it is assigned to ``name_None``.
    
    As a special case, if :attr:`num_components` is `1` and :attr:`sigma` 
    ``> 0.0``, then the new condition is boolean, ``True`` if the event fell in 
    the gate and ``False`` otherwise.
    
    Optionally, if :attr:`posteriors` is ``True``, this module will also 
    compute the posterior probability of each event in its assigned component, 
    returning it in a new column named ``{Name}_Posterior``.
    
    Finally, the same mixture model (mean and standard deviation) may not
    be appropriate for every subset of the data.  If this is the case, you
    can use the :attr:`by` attribute to specify metadata by which to aggregate
    the data before estimating (and applying) a mixture.  The number of 
    components is the same across each subset, though.
    
    Attributes
    ----------
    name : Str
        The operation name; determines the name of the new metadata column
        
    channel : Str
        Which channel to apply the mixture model to.
        
    num_components : Int (default = 1)
        How many components to fit to the data?  Must be positive.

    sigma : Float (default = 0.0)
        How many standard deviations on either side of the mean to include
        in each category?  If an event is in multiple components, assign it
        to the component with the highest posterior probability.  If 
        `sigma == 0.0`, categorize *all* the data by assigning each event to
        the component with the highest posterior probability.  Must be >= 0.0.
    
    by : List(Str)
        A list of metadata attributes to aggregate the data before estimating
        the model.  For example, if the experiment has two pieces of metadata,
        `Time` and `Dox`, setting `by = ["Time", "Dox"]` will fit the model 
        separately to each subset of the data with a unique combination of
        `Time` and `Dox`.
        
    scale : Enum("linear", "log", "logicle") (default = "linear")
        Re-scale the data before fitting the model?  
        
    posteriors : Bool (default = False)
        If `True`, add a column named `{Name}_Posterior` giving the posterior
        probability that the event is in the component to which it was
        assigned.  Useful for filtering out low-probability events.
        
        
    Examples
    --------
    
    Make a little data set.
    
    .. plot::
        :context: close-figs
            
        >>> import cytoflow as flow
        >>> import_op = flow.ImportOp()
        >>> import_op.tubes = [flow.Tube(file = "Plate01/RFP_Well_A3.fcs",
        ...                              conditions = {'Dox' : 10.0}),
        ...                    flow.Tube(file = "Plate01/CFP_Well_A4.fcs",
        ...                              conditions = {'Dox' : 1.0})]
        >>> import_op.conditions = {'Dox' : 'float'}
        >>> ex = import_op.apply()
    
    Create and parameterize the operation.
    
    .. plot::
        :context: close-figs
        
        >>> gm_op = flow.GaussianMixture1DOp(name = 'GM',
        ...                                  channel = 'Y2-A',
        ...                                  scale = 'log',
        ...                                  num_components = 2)
        
    Estimate the clusters
    
    .. plot::
        :context: close-figs
        
        >>> gm_op.estimate(ex)
        
    Plot a diagnostic view
    
    .. plot::
        :context: close-figs
        
        >>> gm_op.default_view().plot(ex)

    Apply the gate
    
    .. plot::
        :context: close-figs
        
        >>> ex2 = gm_op.apply(ex)

    Plot a diagnostic view with the event assignments
    
    .. plot::
        :context: close-figs
        
        >>> gm_op.default_view().plot(ex2)

    """
    
    id = Constant('edu.mit.synbio.cytoflow.operations.gaussian_1d')
    friendly_id = Constant("1D Gaussian Mixture")
    
    name = CStr()
    channel = Str()
    num_components = util.PositiveInt(1)
    sigma = util.PositiveFloat(0.0, allow_zero = True)
    by = List(Str)
    scale = util.ScaleEnum
    posteriors = Bool(False)
    
    # the key is a set
    _gmms = Dict(Any, Instance(mixture.GaussianMixture), transient = True)
    _scale = Instance(util.IScale, transient = True)
    
    def estimate(self, experiment, subset = None):
        """
        Estimate the Gaussian mixture model parameters.
        
        Parameters
        ----------
        experiment : Experiment
            The data to use to estimate the mixture parameters
            
        subset : str (default = None)
            If set, a Python expression to determine the subset of the data
            to use to in the estimation.
        """
        
        warn("GaussianMixture1DOp is DEPRECATED.  Please use GaussianMixtureOp.",
             util.CytoflowOpWarning)
        
        if experiment is None:
            raise util.CytoflowOpError('experiment', "No experiment specified")

        if self.channel not in experiment.data:
            raise util.CytoflowOpError('channel',
                                       "Column {0} not found in the experiment"
                                       .format(self.channel))
       
        for b in self.by:
            if b not in experiment.data:
                raise util.CytoflowOpError('by',
                                           "Aggregation metadata {} not found, "
                                           "must be one of {}"
                                           .format(b, experiment.conditions))
            
        if self.num_components == 1 and self.posteriors:
            raise util.CytoflowOpError('num_components',
                                       "If num_components == 1, all posteriors are 1.")
        
        if subset:
            try:
                experiment = experiment.query(subset)
            except Exception as e:
                raise util.CytoflowOpError('subset',
                                           "Subset string '{0}' isn't valid"
                                           .format(subset)) from e
                
            if len(experiment) == 0:
                raise util.CytoflowOpError('subset',
                                           "Subset string '{0}' returned no events"
                                           .format(subset))
                
        if self.by:
            by = sorted(self.by)
            groupby = experiment.data.groupby(by)
        else:
            # use a lambda expression to return a group that contains
            # all the events
            groupby = experiment.data.groupby(lambda _: True)
            
        # get the scale. estimate the scale params for the ENTIRE data set,
        # not subsets we get from groupby().  And we need to save it so that
        # the data is transformed the same way when we apply()
        self._scale = util.scale_factory(self.scale, experiment, channel = self.channel)
        
        gmms = {}
            
        for group, data_subset in groupby:
            if len(data_subset) == 0:
                raise util.CytoflowOpError(None, 
                                           "Group {} had no data".format(group))
            x = data_subset[self.channel].reset_index(drop = True)
            x = self._scale(x)
            
            # drop data that isn't in the scale range
            #x = pd.Series(self._scale(x)).dropna()
            x = x[~np.isnan(x)]
            
            gmm = mixture.GaussianMixture(n_components = self.num_components,
                                          random_state = 1)
            gmm.fit(x[:, np.newaxis])
            
            if not gmm.converged_:
                raise util.CytoflowOpError(None,
                                           "Estimator didn't converge"
                                           " for group {0}"
                                           .format(group))
                
            # to make sure we have a stable ordering, sort the components
            # by the means (so the first component has the lowest mean, 
            # the next component has the next-lowest, etc.)
            
            sort_idx = np.argsort(gmm.means_[:, 0])
            gmm.means_ = gmm.means_[sort_idx]
            gmm.weights_ = gmm.weights_[sort_idx]
            gmm.covariances_ = gmm.covariances_[sort_idx]
           
            gmms[group] = gmm
            
        self._gmms = gmms
    
    def apply(self, experiment):
        """
        Assigns new metadata to events using the mixture model estimated
        in :meth:`estimate`.
        
        Returns
        -------
        Experiment
            A new :class:`.Experiment`, with a new column named :attr:`name`,
            and possibly one named :attr:`name` _Posterior.  Also the following
            new :attr:`~.Experiment.statistics`:
            
            - **mean** : Float
                the mean of the fitted gaussian
            
            - **stdev** : Float
                the inverse-scaled standard deviation of the fitted gaussian.  on a 
                linear scale, this is in the same units as the mean; on a log scale,
                this is a scalar multiple; and on a logicle scale, this is probably
                meaningless!
            
            - **interval** : (Float, Float)
                the inverse-scaled (mean - stdev, mean + stdev) of the fitted gaussian.
                this is likely more meaningful than ``stdev``, especially on the
                ``logicle`` scale.
            
            - **proportion** : Float
                the proportion of events in each component of the mixture model.  only
                set if :attr:`num_components` ``> 1``.
             
        """
        
        warn("GaussianMixture1DOp is DEPRECATED.  Please use GaussianMixtureOp.",
             util.CytoflowOpWarning)
            
        if experiment is None:
            raise util.CytoflowOpError('experiment',
                                       "No experiment specified")

        if not self._gmms:
            raise util.CytoflowOpError(None,
                                       "No model found.  Did you forget to "
                                       "call estimate()?")
        
        # make sure name got set!
        if not self.name:
            raise util.CytoflowOpError('name',
                                       "You have to set the gate's name "
                                       "before applying it!")
            
        if self.name != util.sanitize_identifier(self.name):
            raise util.CytoflowOpError('name',
                                       "Name {0} can only contain letters, "
                                       "numbers and underscores."
                                       .format(self.name))

        if self.name in experiment.data.columns:
            raise util.CytoflowOpError('name',
                                       "Experiment already has a column named {0}"
                                       .format(self.name))
            
        if not self._gmms:
            raise util.CytoflowOpError(None,
                                       "No components found.  Did you forget to "
                                       "call estimate()?")

        if not self._scale:
            raise util.CytoflowOpError(None,
                                       "Couldn't find _scale.  What happened??")

        if self.channel not in experiment.data:
            raise util.CytoflowOpError('channel',
                                       "Column {0} not found in the experiment"
                                       .format(self.channel))

        if self.posteriors:
            col_name = "{0}_Posterior".format(self.name)
            if col_name in experiment.data:
                raise util.CytoflowOpError('posteriors',
                                           "Column {0} already found in the experiment"
                              .format(col_name))
       
        for b in self.by:
            if b not in experiment.data:
                raise util.CytoflowOpError('by',
                                           "Aggregation metadata {} not found, "
                                           "must be one of {}"
                                           .format(b, experiment.conditions))
                           
        if self.sigma < 0.0:
            raise util.CytoflowOpError('sigma',
                                       "sigma must be >= 0.0")

        if self.by:
            by = sorted(self.by)
            groupby = experiment.data.groupby(by)
        else:
            # use a lambda expression to return a group that
            # contains all the events
            groupby = experiment.data.groupby(lambda _: True)

        event_assignments = pd.Series([None] * len(experiment), dtype = "object")
                                      
        if self.posteriors:
            event_posteriors = pd.Series([0.0] * len(experiment))
            
        # what we DON'T want to do is iterate through event-by-event.
        # the more of this we can push into numpy, sklearn and pandas,
        # the faster it's going to be.
        
        for group, data_subset in groupby:
            
            # if there weren't any events in this group, there's no gmm
            if group not in self._gmms:
                warn("There wasn't a GMM for data subset {}".format(group),
                     util.CytoflowOpWarning)
                continue
            
            gmm = self._gmms[group]
            x = data_subset[self.channel]
            x = self._scale(x).values
                        
            # which values are missing?
            x_na = np.isnan(x)
            
            group_idx = groupby.groups[group]
            
            # make a preliminary assignment
            predicted = np.full(len(x), -1, "int")
            predicted[~x_na] = gmm.predict(x[~x_na, np.newaxis])
            
            # if we're doing sigma-based gating, for each component check
            # to see if the event is in the sigma gate.
            if self.sigma > 0.0:
                
                # make a quick dataframe with the value and the predicted
                # component
                gate_df = pd.DataFrame({"x" : x, "p" : predicted})

                # for each component, get the low and the high threshold
                for c in range(0, self.num_components):
                    lo = (gmm.means_[c][0]    # @UnusedVariable
                          - self.sigma * np.sqrt(gmm.covariances_[c][0]))
                    hi = (gmm.means_[c][0]    # @UnusedVariable
                          + self.sigma * np.sqrt(gmm.covariances_[c][0]))
                    
                    # and build an expression with numexpr so it evaluates fast!
                    gate_bool = gate_df.eval("p == @c and x >= @lo and x <= @hi").values
                    predicted[np.logical_and(predicted == c, gate_bool == False)] = -1
        
            predicted_str = pd.Series(["(none)"] * len(predicted))
            for c in range(0, self.num_components):
                predicted_str[predicted == c] = "{0}_{1}".format(self.name, c + 1)
            predicted_str[predicted == -1] = "{0}_None".format(self.name)
            predicted_str.index = group_idx

            event_assignments.iloc[group_idx] = predicted_str
                                
            if self.posteriors:
                probability = np.full((len(x), self.num_components), 0.0, "float")
                probability[~x_na, :] = gmm.predict_proba(x[~x_na, np.newaxis])
                posteriors = pd.Series([0.0] * len(predicted))
                for i in range(0, self.num_components):
                    posteriors[predicted == i] = probability[predicted == i, i]
                posteriors.index = group_idx
                event_posteriors.iloc[group_idx] = posteriors
                    
        new_experiment = experiment.clone()
        
        if self.num_components == 1 and self.sigma > 0:
            new_experiment.add_condition(self.name, "bool", event_assignments == "{0}_1".format(self.name))
        elif self.num_components > 1:
            new_experiment.add_condition(self.name, "category", event_assignments)
            
        if self.posteriors and self.num_components > 1:
            col_name = "{0}_Posterior".format(self.name)
            new_experiment.add_condition(col_name, "float", event_posteriors)

        # add the statistics
        levels = list(self.by)
        if self.num_components > 1:
            levels.append(self.name)
        
        if levels:     
            idx = pd.MultiIndex.from_product([new_experiment[x].unique() for x in levels], 
                                             names = levels)
    
            mean_stat = pd.Series(index = idx, dtype = np.dtype(object)).sort_index()
            stdev_stat = pd.Series(index = idx, dtype = np.dtype(object)).sort_index()
            interval_stat = pd.Series(index = idx, dtype = np.dtype(object)).sort_index()
            prop_stat = pd.Series(index = idx, dtype = np.dtype(object)).sort_index()     
                                   
            for group, _ in groupby:
                gmm = self._gmms[group]
                for c in range(self.num_components):
                    if self.num_components > 1:
                        component_name = "{}_{}".format(self.name, c + 1)

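                        # normalize the groupby key: the dummy key (True), a
                        # scalar, or a tuple all become a tuple ending in the
                        # component name, so it can index the stats MultiIndex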
                        if group is True:
                            g = [component_name]
                        elif isinstance(group, tuple):
                            g = list(group)
                            g.append(component_name)
                        else:
                            g = list([group])
                            g.append(component_name)
                        
                        if len(g) > 1:
                            g = tuple(g)
                        else:
                            g = (g[0],)
                    else:
                        g = group

                    mean_stat.at[g] = self._scale.inverse(gmm.means_[c][0])
                    stdev_stat.at[g] = self._scale.inverse(np.sqrt(gmm.covariances_[c][0]))[0]
                    interval_stat.at[g] = (self._scale.inverse(gmm.means_[c][0] - np.sqrt(gmm.covariances_[c][0][0])),
                                            self._scale.inverse(gmm.means_[c][0] + np.sqrt(gmm.covariances_[c][0][0])))
                    prop_stat.at[g] = gmm.weights_[c]
                     
            new_experiment.statistics[(self.name, "mean")] = pd.to_numeric(mean_stat)
            new_experiment.statistics[(self.name, "stdev")] = pd.to_numeric(stdev_stat)
            new_experiment.statistics[(self.name, "interval")] = interval_stat
            if self.num_components > 1:
                new_experiment.statistics[(self.name, "proportion")] = pd.to_numeric(prop_stat)
            
        new_experiment.history.append(self.clone_traits(transient = lambda _: True))
        return new_experiment
    
    def default_view(self, **kwargs):
        """
        Returns a diagnostic plot of the Gaussian mixture model.
        
        Returns
        -------
            IView : an IView, call plot() to see the diagnostic plot.
        """
        warn("GaussianMixture1DOp is DEPRECATED.  Please use GaussianMixtureOp.",
             util.CytoflowOpWarning)
        
        v = GaussianMixture1DView(op = self)
        v.trait_set(**kwargs)
        return v
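
estimate() above sorts the fitted components by their means so that repeated runs produce a stable labeling (component 1 always has the lowest mean). A minimal sketch of that reordering on its own, assuming scikit-learn >= 0.18 (where GMM became GaussianMixture):

import numpy as np
from sklearn import mixture

rng = np.random.default_rng(0)
x = np.concatenate([rng.normal(5.0, 1.0, 500),
                    rng.normal(0.0, 1.0, 500)])

gmm = mixture.GaussianMixture(n_components = 2, random_state = 1)
gmm.fit(x[:, np.newaxis])

# reorder in place: component 0 gets the lowest mean, component 1 the next
sort_idx = np.argsort(gmm.means_[:, 0])
gmm.means_ = gmm.means_[sort_idx]
gmm.weights_ = gmm.weights_[sort_idx]
gmm.covariances_ = gmm.covariances_[sort_idx]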
Example n. 28
class RangeOp(HasStrictTraits):
    """
    Apply a range gate to a cytometry experiment.
    
    Attributes
    ----------
    name : Str
        The operation name.  Used to name the new metadata field in the
        experiment that's created by :meth:`apply`
        
    channel : Str
        The name of the channel to apply the range gate.
        
    low : Float
        The lowest value to include in this gate.
        
    high : Float
        The highest value to include in this gate.

    Examples
    --------
    
    .. plot::
        :context: close-figs
        
        Make a little data set.
    
        >>> import cytoflow as flow
        >>> import_op = flow.ImportOp()
        >>> import_op.tubes = [flow.Tube(file = "Plate01/RFP_Well_A3.fcs",
        ...                              conditions = {'Dox' : 10.0}),
        ...                    flow.Tube(file = "Plate01/CFP_Well_A4.fcs",
        ...                              conditions = {'Dox' : 1.0})]
        >>> import_op.conditions = {'Dox' : 'float'}
        >>> ex = import_op.apply()
    
    Create and parameterize the operation.
    
    .. plot::
        :context: close-figs
        
        >>> range_op = flow.RangeOp(name = 'Range',
        ...                         channel = 'Y2-A',
        ...                         low = 2000,
        ...                         high = 10000)
        

    Plot a diagnostic view
    
    .. plot::
        :context: close-figs
        
        >>> range_op.default_view(scale = 'log').plot(ex)
        
    Apply the gate, and show the result
    
    .. plot::
        :context: close-figs
        
        >>> ex2 = range_op.apply(ex)
        >>> ex2.data.groupby('Range').size()
        Range
        False    16042
        True      3958
        dtype: int64
    """

    # traits
    id = Constant('edu.mit.synbio.cytoflow.operations.range')
    friendly_id = Constant('Range')

    name = CStr()
    channel = Str()
    low = CFloat()
    high = CFloat()

    def apply(self, experiment):
        """Applies the range gate to an experiment.
        
        Parameters
        ----------
        experiment : Experiment
            the old_experiment to which this op is applied
            
        Returns
        -------
        Experiment
            a new experiment, the same as the old :class:`~Experiment` but with
            a new column of type ``bool`` with the same name as the operation.
            The bool is ``True`` if the event's measurement in :attr:`channel`
            is between :attr:`low` and :attr:`high` (inclusive); it is ``False``
            otherwise.
        """

        if experiment is None:
            raise util.CytoflowOpError('experiment', "No experiment specified")

        # make sure name got set!
        if not self.name:
            raise util.CytoflowOpError(
                'name', "You have to set the gate's name "
                "before applying it!")

        if self.name in experiment.data.columns:
            raise util.CytoflowOpError(
                'name',
                "Experiment already has a column named {0}".format(self.name))

        if not self.channel:
            raise util.CytoflowOpError('channel', "Channel not specified")

        if not self.channel in experiment.channels:
            raise util.CytoflowOpError(
                'channel',
                "Channel {0} not in the experiment".format(self.channel))

        if self.high <= self.low:
            raise util.CytoflowOpError('high',
                                       "range high must be > range low")

        if self.high <= experiment[self.channel].min():
            raise util.CytoflowOpError(
                'high', "range high must be > {0}".format(
                    experiment[self.channel].min()))
        if self.low >= experiment[self.channel].max():
            raise util.CytoflowOpError(
                'low', "range low must be < {0}".format(
                    experiment[self.channel].max()))

        gate = experiment[self.channel].between(self.low, self.high)
        new_experiment = experiment.clone()
        new_experiment.add_condition(self.name, "bool", gate)
        new_experiment.history.append(
            self.clone_traits(transient=lambda _: True))

        return new_experiment

    def default_view(self, **kwargs):
        return RangeSelection(op=self, **kwargs)
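
The range gate itself is one vectorized pandas call. A minimal sketch, assuming a DataFrame `data` with a "Y2-A" column (both names illustrative):

import numpy as np
import pandas as pd

data = pd.DataFrame({"Y2-A" : np.random.default_rng(2).lognormal(7, 1, 1000)})

low, high = 2000, 10000
gate = data["Y2-A"].between(low, high)  # boolean Series; inclusive by default
print(gate.sum(), "of", len(gate), "events fall in the gate")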
Example n. 29
class GaussianMixture2DOp(HasStrictTraits):
    """
    This module fits a 2D Gaussian mixture model with a specified number of
    components to a pair of channels.
    
    Creates a new categorical metadata variable named `name`, with possible
    values `name_1` ... `name_n` where `n` is the number of components.
    An event is assigned to the `name_i` category if it falls within `sigma`
    standard deviations of the component's mean.  If that is true for multiple
    categories (or if `sigma == 0.0`), the event is assigned to the category 
    with the highest posterior probability.  If the event doesn't fall into
    any category, it is assigned to `name_None`.
    
    As a special case, if `num_components` is `1` and `sigma` > 0.0, then
    the new condition is boolean, `True` if the event fell in the gate and
    `False` otherwise.
    
    Optionally, if `posteriors` is `True`, this module will also compute the 
    posterior probability of each event in its assigned component, returning
    it in a new column named `{Name}_Posterior`.
    
    Finally, the same mixture model (mean and standard deviation) may not
    be appropriate for every subset of the data.  If this is the case, you
    can use the `by` attribute to specify metadata by which to aggregate
    the data before estimating (and applying) a mixture model.  The number of 
    components is the same across each subset, though.
    
    Attributes
    ----------
    name : Str
        The operation name; determines the name of the new metadata column
        
    xchannel : Str
        The X channel to apply the mixture model to.
        
    ychannel : Str
        The Y channel to apply the mixture model to.
        
    num_components : Int (default = 1)
        How many components to fit to the data?  Must be positive.

    sigma : Float (default = 0.0)
        How many standard deviations on either side of the mean to include
        in each category?  If an event is in multiple components, assign it
        to the component with the highest posterior probability.  If 
        `sigma == 0.0`, categorize *all* the data by assigning each event to
        the component with the highest posterior probability.  Must be >= 0.0.
    
    by : List(Str)
        A list of metadata attributes to aggregate the data before estimating
        the model.  For example, if the experiment has two pieces of metadata,
        `Time` and `Dox`, setting `by = ["Time", "Dox"]` will fit the model 
        separately to each subset of the data with a unique combination of
        `Time` and `Dox`.
        
    xscale, yscale : Enum("linear", "log") (default = "linear")
        Re-scale the data on each axis before fitting the model?
        
    posteriors : Bool (default = False)
        If `True`, add a column named `{Name}_Posterior` giving the posterior
        probability that the event is in the component to which it was
        assigned.  Useful for filtering out low-probability events.
    
    Examples
    --------
    
    >>> gauss_op = GaussianMixture2DOp(name = "Gaussian",
    ...                                xchannel = "V2-A",
    ...                                ychannel = "Y2-A",
    ...                                num_components = 2)
    >>> gauss_op.estimate(ex2)
    >>> gauss_op.default_view().plot(ex2)
    >>> ex3 = gauss_op.apply(ex2)
    """
    
    id = Constant('edu.mit.synbio.cytoflow.operations.gaussian_2d')
    friendly_id = Constant("2D Gaussian Mixture")
    
    name = CStr()
    xchannel = Str()
    ychannel = Str()
    xscale = util.ScaleEnum
    yscale = util.ScaleEnum
    num_components = util.PositiveInt
    sigma = util.PositiveFloat(0.0, allow_zero = True)
    by = List(Str)
    
    posteriors = Bool(False)
    
    # the key is either a single value or a tuple
    _gmms = Dict(Any, Instance(mixture.GMM))
    _xscale = Instance(util.IScale)
    _yscale = Instance(util.IScale)
    
    def estimate(self, experiment, subset = None):
        """
        Estimate the Gaussian mixture model parameters
        """
        
        if not experiment:
            raise util.CytoflowOpError("No experiment specified")

        if self.xchannel not in experiment.data:
            raise util.CytoflowOpError("Column {0} not found in the experiment"
                                  .format(self.xchannel))
            
        if self.ychannel not in experiment.data:
            raise util.CytoflowOpError("Column {0} not found in the experiment"
                                  .format(self.ychannel))
       
        for b in self.by:
            if b not in experiment.data:
                raise util.CytoflowOpError("Aggregation metadata {0} not found"
                                      " in the experiment"
                                      .format(b))
            if len(experiment.data[b].unique()) > 100: #WARNING - magic number
                raise util.CytoflowOpError("More than 100 unique values found for"
                                      " aggregation metadata {0}.  Did you"
                                      " accidentally specify a data channel?"
                                      .format(b))
                
        if self.by:
            groupby = experiment.data.groupby(self.by)
        else:
            # use a lambda expression to return a group that contains
            # all the events
            groupby = experiment.data.groupby(lambda x: True)
            
        # get the scale. estimate the scale params for the ENTIRE data set,
        # not subsets we get from groupby().  And we need to save it so that
        # the data is transformed the same way when we apply()
        self._xscale = util.scale_factory(self.xscale, experiment, self.xchannel)
        self._yscale = util.scale_factory(self.yscale, experiment, self.ychannel)
            
        for group, data_subset in groupby:
            x = data_subset.loc[:, [self.xchannel, self.ychannel]]
            x[self.xchannel] = self._xscale(x[self.xchannel])
            x[self.ychannel] = self._yscale(x[self.ychannel])
            
            # drop data that isn't in the scale range
            x = x[~(np.isnan(x[self.xchannel]) | np.isnan(x[self.ychannel]))]
            x = x.values
            
            gmm = mixture.GMM(n_components = self.num_components,
                              covariance_type = "full",
                              random_state = 1)
            gmm.fit(x)
            
            if not gmm.converged_:
                raise util.CytoflowOpError("Estimator didn't converge"
                                      " for group {0}"
                                      .format(group))
                
            # in the 1D version, we sort the components by the means -- so
            # the first component has the lowest mean, the second component
            # has the next-lowest mean, etc.  that doesn't work in a 2D area,
            # obviously.
            
            # instead, we assume that the clusters are likely (?) to be
            # arranged along *one* of the axes, so we take the |norm| of the
            # x,y mean of each cluster and sort that way.
            
            norms = (gmm.means_[:, 0] ** 2 + gmm.means_[:, 1] ** 2) ** 0.5
            sort_idx = np.argsort(norms)
            gmm.means_ = gmm.means_[sort_idx]
            gmm.weights_ = gmm.weights_[sort_idx]
            gmm.covars_ = gmm.covars_[sort_idx]
            
            self._gmms[group] = gmm
    
    def apply(self, experiment):
        """
        Assigns new metadata to events using the mixture model estimated
        in `estimate`.
        """
            
        if not experiment:
            raise util.CytoflowOpError("No experiment specified")
        
        # make sure name got set!
        if not self.name:
            raise util.CytoflowOpError("You have to set the gate's name "
                                  "before applying it!")

        if self.name in experiment.data.columns:
            raise util.CytoflowOpError("Experiment already has a column named {0}"
                                  .format(self.name))
        
        if not self._gmms:
            raise util.CytoflowOpError("No components found.  Did you forget to "
                                  "call estimate()?")
            
        if not self._xscale:
            raise util.CytoflowOpError("Couldn't find _xscale.  What happened??")
        
        if not self._yscale:
            raise util.CytoflowOpError("Couldn't find _yscale.  What happened??")

        if self.xchannel not in experiment.data:
            raise util.CytoflowOpError("Column {0} not found in the experiment"
                                  .format(self.xchannel))

        if self.ychannel not in experiment.data:
            raise util.CytoflowOpError("Column {0} not found in the experiment"
                                  .format(self.ychannel))
            
        if (self.name + "_Posterior") in experiment.data:
            raise util.CytoflowOpError("Column {0} already found in the experiment"
                                  .format(self.name + "_Posterior"))
            
        if self.num_components == 1 and self.sigma == 0.0:
            raise util.CytoflowError("If num_components == 1, sigma must be > 0")

        if self.posteriors:
            col_name = "{0}_Posterior".format(self.name)
            if col_name in experiment.data:
                raise util.CytoflowOpError("Column {0} already found in the experiment"
                              .format(col_name))
       
        for b in self.by:
            if b not in experiment.data:
                raise util.CytoflowOpError("Aggregation metadata {0} not found"
                                      " in the experiment"
                                      .format(b))

            if len(experiment.data[b].unique()) > 100: #WARNING - magic number
                raise util.CytoflowOpError("More than 100 unique values found for"
                                      " aggregation metadata {0}.  Did you"
                                      " accidentally specify a data channel?"
                                      .format(b))
                           
        if self.sigma < 0.0:
            raise util.CytoflowOpError("sigma must be >= 0.0")
        
        event_assignments = pd.Series([None] * len(experiment), dtype = "object")

        if self.posteriors:
            event_posteriors = pd.Series([0.0] * len(experiment))
            
        # what we DON'T want to do is iterate through event-by-event.
        # the more of this we can push into numpy, sklearn and pandas,
        # the faster it's going to be.  for example, this is why
        # we don't use Ellipse.contains().  
        
        if self.by:
            groupby = experiment.data.groupby(self.by)
        else:
            # use a lambda expression to return a group that
            # contains all the events
            groupby = experiment.data.groupby(lambda x: True)
        
        for group, data_subset in groupby:
            gmm = self._gmms[group]
            x = data_subset.loc[:, [self.xchannel, self.ychannel]]
            x[self.xchannel] = self._xscale(x[self.xchannel])
            x[self.ychannel] = self._yscale(x[self.ychannel])
            
            # which values are missing?
            x_na = np.isnan(x[self.xchannel]) | np.isnan(x[self.ychannel])
            x_na = x_na.values
            
            x = x.values
            group_idx = groupby.groups[group]

            # make a preliminary assignment
            predicted = np.full(len(x), -1, "int")
            predicted[~x_na] = gmm.predict(x[~x_na])
            
            # if we're doing sigma-based gating, for each component check
            # to see if the event is in the sigma gate.
            if self.sigma > 0.0:
                
                # make a quick dataframe with the value and the predicted
                # component
                gate_df = pd.DataFrame({"x" : x[:, 0], 
                                        "y" : x[:, 1],
                                        "p" : predicted})

                # for each component, get the ellipse that follows the isoline
                # around the mixture component
                # cf. http://scikit-learn.org/stable/auto_examples/mixture/plot_gmm.html
                # and http://www.mathworks.com/matlabcentral/newsreader/view_thread/298389
                # and http://stackoverflow.com/questions/7946187/point-and-ellipse-rotated-position-test-algorithm
                # i am not proud of how many tries this took me to get right.

                for c in range(0, self.num_components):
                    mean = gmm.means_[c]
                    covar = gmm._get_covars()[c]
                    
                    # xc is the center on the x axis
                    # yc is the center on the y axis
                    xc = mean[0]  # @UnusedVariable
                    yc = mean[1]  # @UnusedVariable
                    
                    v, w = linalg.eigh(covar)
                    u = w[0] / linalg.norm(w[0])
                    
                    # xl is the length along the x axis
                    # yl is the length along the y axis
                    xl = np.sqrt(v[0]) * self.sigma  # @UnusedVariable
                    yl = np.sqrt(v[1]) * self.sigma  # @UnusedVariable
                    
                    # t is the rotation in radians (counter-clockwise)
                    t = 2 * np.pi - np.arctan(u[1] / u[0])
                    
                    sin_t = np.sin(t)  # @UnusedVariable
                    cos_t = np.cos(t)  # @UnusedVariable
                                        
                    # and build an expression that pandas' eval() hands off to
                    # numexpr, so it evaluates fast!  (a standalone sketch of
                    # this gate test follows the class below.)

                    gate_bool = gate_df.eval("p == @c and "
                                             "((x - @xc) * @cos_t - (y - @yc) * @sin_t) ** 2 / ((@xl / 2) ** 2) + "
                                             "((x - @xc) * @sin_t + (y - @yc) * @cos_t) ** 2 / ((@yl / 2) ** 2) <= 1").values

                    predicted[np.logical_and(predicted == c, ~gate_bool)] = -1
            
            predicted_str = pd.Series(["(none)"] * len(predicted))
            for c in range(0, self.num_components):
                predicted_str[predicted == c] = "{0}_{1}".format(self.name, c + 1)
            predicted_str[predicted == -1] = "{0}_None".format(self.name)
            predicted_str.index = group_idx

            event_assignments.iloc[group_idx] = predicted_str
                    
            if self.posteriors:
                probability = np.full((len(x), self.num_components), 0.0, "float")
                probability[~x_na, :] = gmm.predict_proba(x[~x_na, :])
                posteriors = pd.Series([0.0] * len(predicted))
                for c in range(0, self.num_components):
                    posteriors[predicted == c] = probability[predicted == c, c]
                posteriors.index = group_idx
                event_posteriors.iloc[group_idx] = posteriors
                    
        new_experiment = experiment.clone()
        
        if self.num_components == 1:
            new_experiment.add_condition(self.name, "bool", event_assignments == "{0}_1".format(self.name))
        else:
            new_experiment.add_condition(self.name, "category", event_assignments)
            
        if self.posteriors:
            col_name = "{0}_Posterior".format(self.name)
            new_experiment.add_condition(col_name, "float", event_posteriors)
                    
        new_experiment.history.append(self.clone_traits())
        return new_experiment
    
    def default_view(self, **kwargs):
        """
        Returns a diagnostic plot of the Gaussian mixture model.
        
        Returns
        -------
            IView : an IView, call plot() to see the diagnostic plot.
        """
        return GaussianMixture2DView(op = self, **kwargs)
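
# ---------------------------------------------------------------------------
# A standalone sketch (not part of the operation) of the sigma-gating math
# above: fit a GaussianMixture to toy 2D data, pull one component's ellipse
# parameters out of an eigendecomposition of its covariance, and test every
# event at once with pandas' eval() instead of looping event-by-event.  The
# toy data and the sigma value are made up for illustration.

import numpy as np
import pandas as pd
from scipy import linalg
from sklearn import mixture

rng = np.random.default_rng(0)
points = np.vstack((rng.normal([0, 0], 1.0, (500, 2)),
                    rng.normal([5, 5], 1.5, (500, 2))))

gmm = mixture.GaussianMixture(n_components = 2, covariance_type = "full",
                              random_state = 0).fit(points)

sigma = 2.0
c = 0
mean = gmm.means_[c]
covar = gmm.covariances_[c]

# eigendecomposition -> ellipse axis lengths and rotation, as in the loop above
v, w = linalg.eigh(covar)
u = w[0] / linalg.norm(w[0])
xl, yl = np.sqrt(v) * sigma
t = 2 * np.pi - np.arctan(u[1] / u[0])
sin_t, cos_t = np.sin(t), np.cos(t)
xc, yc = mean

# one vectorized membership test for the whole array of events
df = pd.DataFrame({"x" : points[:, 0], "y" : points[:, 1],
                   "p" : gmm.predict(points)})
in_gate = df.eval("p == @c and "
                  "((x - @xc) * @cos_t - (y - @yc) * @sin_t) ** 2 / ((@xl / 2) ** 2) + "
                  "((x - @xc) * @sin_t + (y - @yc) * @cos_t) ** 2 / ((@yl / 2) ** 2) <= 1").values

print(in_gate.sum(), "of", len(df), "events assigned to component", c, "fall in the gate")
# ---------------------------------------------------------------------------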
Esempio n. 30
0
class BleedthroughLinearOp(HasStrictTraits):
    """
    Apply matrix-based bleedthrough correction to a set of fluorescence channels.
    
    This is a traditional matrix-based compensation for bleedthrough.  For each
    pair of channels, the user specifies the proportion of the first channel
    that bleeds through into the second; then, the module performs a matrix
    multiplication to compensate the raw data.
    
    The module can also estimate the bleedthrough matrix using one
    single-color control per channel.
    
    This works best on data that has had autofluorescence removed first;
    if an autofluorescence correction is in the experiment's history, it
    is replayed on the single-color controls as well.
    
    To use, set up the `controls` dict with the single color controls;
    call `estimate()` to parameterize the operation; check that the bleedthrough 
    plots look good with `default_view().plot()`; and then `apply()` to an 
    Experiment.
    
    Attributes
    ----------
    name : Str
        The operation name (for UI representation; optional for interactive use)
    
    controls : Dict(Str, File)
        The channel names to correct, and corresponding single-color control
        FCS files to estimate the correction coefficients with.  Must be set
        to use `estimate()`.
        
    spillover : Dict(Tuple(Str, Str), Float)
        The spillover "matrix" to use to correct the data.  The keys are pairs
        of channels, and the values are proportions of spectral overlap.  If 
        `("channel1", "channel2")` is present as a key, 
        `("channel2", "channel1")` must also be present.  The module does not
        assume that the matrix is symmetric.
        
    Examples
    --------
    >>> bl_op = flow.BleedthroughLinearOp()
    >>> bl_op.controls = {'Pacific Blue-A' : 'merged/ebfp.fcs',
    ...                   'FITC-A' : 'merged/eyfp.fcs',
    ...                   'PE-Tx-Red-YG-A' : 'merged/mkate.fcs'}
    >>>
    >>> bl_op.estimate(ex2)
    >>> bl_op.default_view().plot(ex2)    
    >>>
    >>> ex3 = bl_op.apply(ex2)
    """

    # traits
    id = Constant('edu.mit.synbio.cytoflow.operations.bleedthrough_linear')
    friendly_id = Constant("Linear Bleedthrough Correction")

    name = CStr()

    controls = Dict(Str, File)
    spillover = Dict(Tuple(Str, Str), Float)

    def estimate(self, experiment, subset=None):
        """
        Estimate the bleedthrough from the single-color controls in `controls`.
        """
        if not experiment:
            raise util.CytoflowOpError("No experiment specified")

        channels = list(self.controls.keys())

        if len(channels) < 2:
            raise util.CytoflowOpError(
                "Need at least two channels to correct bleedthrough.")

        # make sure the control files exist
        for channel in channels:
            if not os.path.isfile(self.controls[channel]):
                raise util.CytoflowOpError(
                    "Can't find file {0} for channel {1}.".format(
                        self.controls[channel], channel))

        for channel in channels:

            # make a little Experiment
            check_tube(self.controls[channel], experiment)
            tube_exp = ImportOp(tubes=[Tube(
                file=self.controls[channel])]).apply()

            # apply previous operations
            for op in experiment.history:
                tube_exp = op.apply(tube_exp)

            # subset it
            if subset:
                try:
                    tube_data = tube_exp.query(subset)
                except Exception:
                    raise util.CytoflowOpError(
                        "Subset string '{0}' isn't valid".format(subset))

                if len(tube_data.index) == 0:
                    raise util.CytoflowOpError(
                        "Subset string '{0}' returned no events".format(
                            subset))
            else:
                tube_data = tube_exp.data

            # sort the data by this channel.  (np.polyfit doesn't strictly
            # require sorted input; DataFrame.sort was removed from pandas,
            # so use sort_values.)
            tube_data.sort_values(by=channel, inplace=True)

            for to_channel in channels:
                from_channel = channel

                if from_channel == to_channel:
                    continue

                # events that pile up at the edges of the detector's range
                # screw up a linear regression, so trim values within 5% of
                # the observed min and max (this assumes positive data)

                from_min = np.min(tube_data[from_channel]) * 1.05
                from_max = np.max(tube_data[from_channel]) * 0.95
                tube_data = tube_data[tube_data[from_channel] > from_min]
                tube_data = tube_data[tube_data[from_channel] < from_max]

                to_min = np.min(tube_data[to_channel]) * 1.05
                to_max = np.max(tube_data[to_channel]) * 0.95
                tube_data = tube_data[tube_data[to_channel] > to_min]
                tube_data = tube_data[tube_data[to_channel] < to_max]

                tube_data.reset_index(drop=True, inplace=True)

                lr = np.polyfit(tube_data[from_channel],
                                tube_data[to_channel],
                                deg=1)

                self.spillover[(from_channel, to_channel)] = lr[0]

    def apply(self, experiment):
        """Applies the bleedthrough correction to an experiment.
        
        Parameters
        ----------
        experiment : Experiment
            the experiment to which this operation is applied
            
        Returns
        -------
            a new experiment with the bleedthrough subtracted out.
        """
        if not experiment:
            raise util.CytoflowOpError("No experiment specified")

        if not self.spillover:
            raise util.CytoflowOpError("Spillover matrix isn't set. "
                                       "Did you forget to run estimate()?")

        for (from_channel, to_channel) in self.spillover:
            if from_channel not in experiment.data:
                raise util.CytoflowOpError(
                    "Can't find channel {0} in experiment".format(
                        from_channel))
            if to_channel not in experiment.data:
                raise util.CytoflowOpError(
                    "Can't find channel {0} in experiment".format(to_channel))

            if (to_channel, from_channel) not in self.spillover:
                raise util.CytoflowOpError("Must have both (from, to) and "
                                           "(to, from) keys in self.spillover")

        new_experiment = experiment.clone()

        # the completely arbitrary ordering of the channels
        channels = list(set([x for (x, _) in self.spillover.keys()]))

        # build the spillover matrix from the spillover dictionary
        a = [[self.spillover[(y, x)] if x != y else 1.0 for x in channels]
             for y in channels]

        # invert it.  use the pseudoinverse in case a is singular
        a_inv = np.linalg.pinv(a)

        new_experiment.data[channels] = np.dot(experiment.data[channels],
                                               a_inv)

        for channel in channels:
            # add the spillover values to the channel's metadata
            new_experiment.metadata[channel]['linear_bleedthrough'] = \
                {x : self.spillover[(x, channel)]
                     for x in channels if x != channel}

        new_experiment.history.append(self.clone_traits())
        return new_experiment

    def default_view(self, **kwargs):
        """
        Returns a diagnostic plot to make sure spillover estimation is working.
        
        Returns
        -------
        IView : An IView, call plot() to see the diagnostic plots
        """

        # the completely arbitrary ordering of the channels
        channels = list(set([x for (x, _) in self.spillover.keys()]))

        if set(self.controls.keys()) != set(channels):
            raise util.CytoflowOpError(
                "Must have both the controls and the spillover matrix to plot")

        return BleedthroughLinearDiagnostic(op=self, **kwargs)
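
# ---------------------------------------------------------------------------
# A standalone numeric sketch (not part of the module) of what estimate()
# and apply() do: synthesize two channels with known bleedthrough, recover
# the spillover slopes with a degree-1 polyfit on single-color controls,
# assemble the spillover matrix, and undo the mixing with a pseudoinverse.
# All numbers here are made up for illustration.

import numpy as np

rng = np.random.default_rng(1)
true_a = np.array([[1.0, 0.15],    # 15% of channel 0 bleeds into channel 1
                   [0.05, 1.0]])   # 5% of channel 1 bleeds into channel 0
signal = rng.exponential(1000.0, size = (5000, 2))
observed = signal @ true_a

# estimate(): a single-color control has true signal in only one channel,
# so the spillover is just the slope of the "to" channel vs. the "from"
control0 = np.column_stack((signal[:, 0], np.zeros(5000))) @ true_a
slope_0_to_1 = np.polyfit(control0[:, 0], control0[:, 1], deg = 1)[0]

control1 = np.column_stack((np.zeros(5000), signal[:, 1])) @ true_a
slope_1_to_0 = np.polyfit(control1[:, 1], control1[:, 0], deg = 1)[0]

# apply(): build the matrix (1.0 on the diagonal), invert it with pinv in
# case it's singular, and multiply through -- the np.dot() call above
a = np.array([[1.0, slope_0_to_1],
              [slope_1_to_0, 1.0]])
corrected = observed @ np.linalg.pinv(a)

print(np.allclose(corrected, signal))   # True: the bleedthrough is removed
# ---------------------------------------------------------------------------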