def _get_trait(self): if self.type == 'metadata' or self.type == 'category': return CStr() elif self.type == 'float': return CFloat() elif self.type == 'bool': return ConvertingBool()
class _Snake_Settings(HasTraits):
    """Trait-backed bundle of the numeric/string settings used for a snake.

    The short names in the comments below (alpha, beta, w_line) are the
    labels the original author attached to each setting.
    """

    # alpha
    length_weight = Float(0)

    # beta
    smoothness = Float(0.1)

    # w_line -- negative values seek dark pixels
    line_weight = Float(-1)

    # NOTE(review): presumably the edge-attraction counterpart of
    # ``line_weight`` -- confirm against the code that consumes it.
    edge_weight = Float(0)

    # Boundary handling mode; stored as a casting string, default 'fixed'.
    boundaries = CStr("fixed")

    # NOTE(review): looks like the sigma of a smoothing pre-filter -- confirm.
    prefilter_sigma = Float(2)
class FitSettings(HasTraits):
    """Trait-backed collection of fit options.

    Every ``Enum`` below defaults to the first entry of its list.
    """

    # How coalesced data is processed: by clump index only, or using
    # tmin/tmax when available (meaning taken from the option names).
    coalescedProcessing = Enum(["useClumpIndexOnly", "useTminTmaxIfAvailable"])

    # Which cumulative distribution is used: 'binned' or 'empirical'.
    cumulativeDistribution = Enum(["binned", "empirical"])

    # Fit with a single mode or with two modes.
    fitMode = Enum(["SingleMode", "TwoModes"])

    # NOTE(review): presumably pins the second time constant to
    # ``Tau2FixedValue`` when True -- confirm against the fitting code.
    Tau2Constant = Bool(False)
    Tau2FixedValue = Float(2.0)

    # Name of the column holding object identifiers.
    IDcolumn = CStr("objectID")
class TimedSpecies(HasTraits):
    """Up to three named species, each with a from/to time pair.

    The three slots are edited through ``traits_view`` and turned into a
    dictionary by :meth:`getSpeciesDescriptor`.
    """

    Species1 = CStr()
    Species1FromTime = Float()
    Species1ToTime = Float()

    Species2 = CStr()
    Species2FromTime = Float()
    Species2ToTime = Float()

    Species3 = CStr()
    Species3FromTime = Float()
    Species3ToTime = Float()

    # One name/from/to triple per species, separated by '_' items.
    traits_view = View(
        Group(
            Item(name='Species1'),
            Item(name='Species1FromTime'),
            Item(name='Species1ToTime'),
            Item('_'),
            Item(name='Species2'),
            Item(name='Species2FromTime'),
            Item(name='Species2ToTime'),
            Item('_'),
            Item(name='Species3'),
            Item(name='Species3FromTime'),
            Item(name='Species3ToTime'),
            label='Specify Timed Species',
            show_border=True,
        ),
        buttons=OKCancelButtons,
    )

    def getSpeciesDescriptor(self):
        """Return ``{species name: (from_time, to_time)}`` for the filled slots.

        Slots whose name is an empty string are skipped.  Slots are visited
        in order 1, 2, 3, so when two slots share a name the later slot's
        time pair wins.  The resulting dictionary is also logged at INFO
        level.
        """
        descriptor = {}
        for slot in ('Species1', 'Species2', 'Species3'):
            species = getattr(self, slot)
            if not species:
                # empty strings will be ignored
                continue
            time_span = (getattr(self, slot + 'FromTime'),
                         getattr(self, slot + 'ToTime'))
            descriptor[species] = time_span

        logger.info('speclist is ' + repr(descriptor))
        return descriptor
class EllipseOp(HasStrictTraits):
    """Ellipse operation traits plus a helper that draws an ellipse patch.

    NOTE(review): this block appears to be a damaged excerpt -- see the
    review notes below before relying on it.
    """

    # Fixed identifiers for this operation.
    id = Constant('edu.mit.synbio.cytoflow.operations.ellipse')
    friendly_id = Constant("Ellipse")

    name = CStr()
    xchannel = Str()
    ychannel = Str()
    vertices = List((Float, Float))

    # Scale names for each axis, stored here as plain strings.
    _xscale = Str("linear")
    _yscale = Str("linear")

    # NOTE(review): the right-hand side of this chained assignment is missing
    # in this view of the file, so the statement is a syntax error as written.
    # Restore the intended values from the original source.
    center = width = height = angle =
    def _plot_ellipse(self, center, width, height, angle, **kwargs):
        """Draw an ellipse on the current matplotlib axes.

        The unit circle is scaled by half of ``width`` / ``height``, rotated
        by ``angle`` degrees and translated to ``center``; the resulting
        vertices are passed through ``self.op._xscale.inverse`` and
        ``self.op._yscale.inverse`` before the patch is built.  ``kwargs``
        are forwarded to ``patches.PathPatch``.

        NOTE(review): this reads ``self.op``, which this class does not
        declare, and calls ``.inverse`` on scales that are declared above as
        ``Str`` -- it looks like this method belongs to a companion view
        class; confirm.
        """
        tf = transforms.Affine2D() \
             .scale(width * 0.5, height * 0.5) \
             .rotate_deg(angle) \
             .translate(*center)

        tf_path = tf.transform_path(path.Path.unit_circle())
        v = tf_path.vertices
        # map the transformed vertices back through each axis' inverse scale
        v = np.vstack((self.op._xscale.inverse(v[:, 0]),
                       self.op._yscale.inverse(v[:, 1]))).T

        scaled_path = path.Path(v, tf_path.codes)
        scaled_patch = patches.PathPatch(scaled_path, **kwargs)
        plt.gca().add_patch(scaled_patch)

    # NOTE(review): these four declarations repeat the ones at the top of the
    # class; placement at class level is an assumption -- confirm whether they
    # are a paste artifact.
    name = CStr()
    xchannel = Str()
    ychannel = Str()
    vertices = List((Float, Float))
class ArduinoLCDActuator(AbstractArduinoActuator):
    """
    Actuator that forwards its status text to the Arduino's ``lcd_print``.

    Needs `AutomateFirmata <https://github.com/tuomas2/AutomateFirmata>`_
    """

    # Status is text (cast to str) and is marked transient.
    _status = CStr(transient=True)

    #: Target device number
    device = CInt

    def _status_changed(self):
        """Send the current ``_status`` text to ``self._arduino.lcd_print``."""
        text = self._status
        self._arduino.lcd_print(text)
class BleedthroughPiecewiseOp(HasStrictTraits):
    """
    Apply bleedthrough correction to a set of fluorescence channels.

    This is not a traditional bleedthrough matrix-based compensation; it uses
    a similar set of single-color controls, but instead of computing a
    compensation matrix, it fits a piecewise-linear spline to the
    untransformed data and uses those splines to compute the correction
    factor at each point in a mesh across the color space.  The experimental
    data is corrected using a linear interpolation along that mesh: this is
    much faster than computing the correction factor for each cell
    individually (an operation that takes 5 msec each.)

    To use, set up the `controls` dict with the single color controls;
    call `estimate()` to parameterize the operation; check that the
    bleedthrough plots look good with `default_view().plot()`; and then
    `apply()` to an Experiment.

    Attributes
    ----------
    name : Str
        The operation name (for UI representation; optional for interactive
        use)

    controls : Dict(Str, File)
        The channel names to correct, and corresponding single-color control
        FCS files to estimate the correction splines with.  Must be set to
        use `estimate()`.

    num_knots : Int (default = 7)
        The number of internal control points to estimate, spaced log-evenly
        from 0 to the range of the channel.  Must be set to use `estimate()`.

    mesh_size : Int (default = 32)
        The size of each axis in the mesh used to interpolate corrected
        values.

    Notes
    -----
    We use an interpolation-based scheme to estimate corrected bleedthrough.
    The algorithm is as follows:

     - Fit a piecewise-linear spline to each single-color control's
       bleedthrough into other channels.  Because we want to fit the spline
       to untransformed data, but capture both the negative, positive-linear
       and positive-log portions of a traditional flow data set, we
       distribute the spline knots evenly on an hlog-transformed axis for
       each color we're correcting.

     - At each point on a regular mesh spanning the entire range of the
       instrument, estimate the mapping from (raw colors) --> (actual
       colors).  The mesh points are also distributed evenly along the
       hlog-transformed color axes; this captures negative data as well as
       positive.  This is quite slow: ~30 seconds for a mesh size of 32 in
       3-space.  Remember that additional channels expand the number of mesh
       points exponentially!

     - Use these estimates to parameterize a linear interpolator (in linear
       space, this time).  There's one interpolator per output channel (so
       for a 3-channel correction, each interpolator is R^3 --> R).  For
       each measured cell, run each interpolator to give the corrected
       output.

    Examples
    --------
    >>> bl_op = flow.BleedthroughPiecewiseOp()
    >>> bl_op.num_knots = 10
    >>> bl_op.controls = {'Pacific Blue-A' : 'merged/ebfp.fcs',
    ...                   'FITC-A' : 'merged/eyfp.fcs',
    ...                   'PE-Tx-Red-YG-A' : 'merged/mkate.fcs'}
    >>>
    >>> bl_op.estimate(ex2)
    >>> bl_op.default_view().plot(ex2)
    >>>
    >>> %time ex3 = bl_op.apply(ex2) # 410,000 cells
    CPU times: user 577 ms, sys: 27.7 ms, total: 605 ms
    Wall time: 607 ms
    """

    # traits
    id = Constant('edu.mit.synbio.cytoflow.operations.bleedthrough_piecewise')
    friendly_id = Constant("Piecewise Bleedthrough Correction")

    name = CStr()

    controls = Dict(Str, File)
    num_knots = Int(7)
    mesh_size = Int(32)

    _splines = Dict(Str, Dict(Str, Python))
    _interpolators = Dict(Str, Python)

    # because the order of the channels is important, we can't just call
    # _interpolators.keys()
    # TODO - this is ugly and unpythonic.  :-/
    _channels = List(Str)

    def estimate(self, experiment, subset=None):
        """
        Estimate the bleedthrough from the single-channel controls in
        `controls`.

        Parameters
        ----------
        experiment : Experiment
            Supplies the per-channel metadata (``range`` and, when present,
            ``af_median`` / ``af_stdev``) and the operation history that is
            replayed on every control tube.

        subset : str (default = None)
            If set, a query string used to select a subset of each control
            tube's events before fitting.

        Raises
        ------
        util.CytoflowOpError
            If no experiment is given, fewer than 3 knots or fewer than 2
            channels are configured, or the subset string is invalid or
            selects no events.
        """
        if not experiment:
            raise util.CytoflowOpError("No experiment specified")

        if self.num_knots < 3:
            raise util.CytoflowOpError(
                "Need to allow at least 3 knots in the spline")

        # materialize the keys: _channels is a List trait and pins the
        # channel order used by every later step
        self._channels = list(self.controls.keys())

        if len(self._channels) < 2:
            raise util.CytoflowOpError(
                "Need at least two channels to correct bleedthrough.")

        # start from a clean slate so nothing from an earlier estimate()
        # (possibly made with different controls) survives
        self._splines = {}
        self._interpolators = {}
        mesh_axes = []

        for channel in self._channels:
            self._splines[channel] = {}

            # make a little Experiment
            check_tube(self.controls[channel], experiment)
            tube_exp = ImportOp(
                tubes=[Tube(file=self.controls[channel])]).apply()

            # apply previous operations
            for op in experiment.history:
                tube_exp = op.apply(tube_exp)

            # subset it
            if subset:
                try:
                    tube_data = tube_exp.query(subset).copy()
                except Exception:
                    # report the *argument*; this class has no `subset` trait
                    raise util.CytoflowOpError(
                        "Subset string '{0}' isn't valid".format(subset))

                if len(tube_data.index) == 0:
                    raise util.CytoflowOpError(
                        "Subset string '{0}' returned no events".format(
                            subset))
            else:
                tube_data = tube_exp.data.copy()

            # polyfit requires sorted data
            tube_data.sort_values(by=channel, inplace=True)

            channel_min = tube_data[channel].min()
            channel_max = tube_data[channel].max()

            # we're going to set the knots and splines evenly across the hlog-
            # transformed data, so as to capture both the "linear" aspect
            # of near-0 and negative values, and the "log" aspect of large
            # values

            # parameterize the hlog transform
            r = experiment.metadata[channel]['range']  # instrument range
            d = np.log10(r)  # maximum display scale, in decades

            # the transition point from linear --> log scale
            # use half of the log-transformed scale as "linear".
            b = 2**(np.log2(r) / 2)

            # the splines' knots
            knot_min = channel_min
            knot_max = channel_max

            hlog_knot_min, hlog_knot_max = \
                hlog((knot_min, knot_max), b=b, r=r, d=d)
            hlog_knots = np.linspace(hlog_knot_min, hlog_knot_max,
                                     self.num_knots)
            knots = hlog_inv(hlog_knots, b=b, r=r, d=d)

            # only keep the interior knots
            knots = knots[1:-1]

            # the interpolators' mesh
            if 'af_median' in experiment.metadata[channel] and \
               'af_stdev' in experiment.metadata[channel]:
                mesh_min = experiment.metadata[channel]['af_median'] - \
                           3 * experiment.metadata[channel]['af_stdev']
            else:
                mesh_min = -0.01 * r  # TODO - does this even work?

            mesh_max = r

            hlog_mesh_min, hlog_mesh_max = \
                hlog((mesh_min, mesh_max), b=b, r=r, d=d)
            hlog_mesh_axis = \
                np.linspace(hlog_mesh_min, hlog_mesh_max, self.mesh_size)

            mesh_axis = hlog_inv(hlog_mesh_axis, b=b, r=r, d=d)
            mesh_axes.append(mesh_axis)

            # one degree-1 least-squares spline per (from, to) channel pair
            from_channel = channel
            for to_channel in self._channels:
                if from_channel == to_channel:
                    continue

                self._splines[from_channel][to_channel] = \
                    scipy.interpolate.LSQUnivariateSpline(
                        tube_data[from_channel].values,
                        tube_data[to_channel].values,
                        t=knots,
                        k=1)

        channels = list(self._channels)
        mesh = pandas.DataFrame(util.cartesian(mesh_axes), columns=channels)

        mesh_corrected = mesh.apply(_correct_bleedthrough,
                                    axis=1,
                                    args=(channels, self._splines))

        for channel in self._channels:
            chan_values = np.reshape(mesh_corrected[channel],
                                     [len(x) for x in mesh_axes])
            self._interpolators[channel] = \
                scipy.interpolate.RegularGridInterpolator(
                    points=mesh_axes,
                    values=chan_values,
                    bounds_error=False,
                    fill_value=0.0)

        # TODO - some sort of validity checking.

    def apply(self, experiment):
        """Applies the bleedthrough correction to an experiment.

        Events whose value in any corrected channel is not above the lower
        edge of the interpolation mesh are dropped before correcting.

        Parameters
        ----------
        experiment : Experiment
            the old_experiment to which this op is applied

        Returns
        -------
            a new experiment with the bleedthrough subtracted out.

        Raises
        ------
        util.CytoflowOpError
            If no experiment is given, `estimate()` has not produced any
            interpolators, or the estimated channels are not all present in
            the experiment.
        """
        if not experiment:
            raise util.CytoflowOpError("No experiment specified")

        if not self._interpolators:
            raise util.CytoflowOpError("Module interpolators aren't set. "
                                       "Did you run estimate()?")

        if not set(self._interpolators.keys()) <= set(experiment.channels):
            raise util.CytoflowOpError(
                "Module parameters don't match experiment channels")

        new_experiment = experiment.clone()

        # get rid of data outside of the interpolators' mesh
        # (-3 * autofluorescence sigma)
        for channel in self._channels:

            # if you update the mesh calculation above, update it here too!
            if 'af_median' in experiment.metadata[channel] and \
               'af_stdev' in experiment.metadata[channel]:
                mesh_min = experiment.metadata[channel]['af_median'] - \
                           3 * experiment.metadata[channel]['af_stdev']
            else:
                # TODO - does this even work?
                mesh_min = -0.01 * experiment.metadata[channel]['range']

            new_experiment.data = \
                new_experiment.data[new_experiment.data[channel] > mesh_min]

        new_experiment.data.reset_index(drop=True, inplace=True)

        # snapshot the raw values first: every interpolator needs the
        # uncorrected data for all channels
        old_data = new_experiment.data[self._channels]

        for channel in self._channels:
            new_experiment[channel] = self._interpolators[channel](old_data)

            # add the correction splines to the experiment metadata so we can
            # correct other controls later on
            new_experiment.metadata[channel]['piecewise_bleedthrough'] = \
                (self._channels, self._interpolators[channel])

        new_experiment.history.append(self.clone_traits())
        return new_experiment

    def default_view(self, **kwargs):
        """
        Returns a diagnostic plot to see if the bleedthrough spline estimation
        is working.

        Returns
        -------
            IView : An IView, call plot() to see the diagnostic plots

        Raises
        ------
        util.CytoflowOpError
            If the estimated splines do not cover exactly the channels in
            `controls`.
        """
        if set(self.controls.keys()) != set(self._splines.keys()):
            raise util.CytoflowOpError(
                "Must have both the controls and bleedthrough to plot")

        return BleedthroughPiecewiseDiagnostic(op=self, **kwargs)
class TornadoService(AbstractUserService):
    """
    Abstract service that provides HTTP server for WSGI applications.
    """

    #: Which ip address to listen. Use ``0.0.0.0`` (default) to listen to all local networking interfaces.
    http_ipaddr = CStr("0.0.0.0")

    #: HTTP (or HTTPS if using SSL) port to listen
    http_port = Int(3000)

    #: Path to ssl certificate file. If set, SSL will be used.
    #:
    #: .. tip::
    #:
    #:   You may use script scripts/generate_selfsigned_certificate.sh to generate a
    #:   self-signed openssl certificate.
    ssl_certificate = CStr

    #: Path to ssl private key file
    ssl_private_key = CStr

    #: Number of listener threads to spawn
    num_threads = Int(5)

    #: Extra static dirs you want to serve. Example::
    #:
    #:    static_dirs = {'/my_static/(.*)': '/path/to/my_static'}
    static_dirs = Dict(key_trait=Str, value_trait=Str)

    _http_server = Instance(tornado.httpserver.TCPServer)

    @property
    def is_alive(self):
        """``True`` while a server object is held in ``_http_server``."""
        return bool(self._http_server)

    def get_wsgi_application(self):
        """
        Get WSGI function. Implement this in subclasses.
        """
        raise NotImplementedError

    def get_websocket(self):
        """Return the websocket handler to mount at ``/socket`` (none here)."""
        return None

    def get_filehandler_class(self):
        """Return the handler class used to serve ``static_dirs``."""
        return tornado.web.StaticFileHandler

    def get_tornado_handlers(self):
        """Build the tornado handler list.

        Order matters: the optional websocket first, then one static handler
        per ``static_dirs`` entry, and finally the WSGI application as a
        catch-all fallback.
        """
        tornado_handlers = []

        websocket = self.get_websocket()
        if websocket:
            tornado_handlers.append(('/socket', websocket))

        for entrypoint, path in self.static_dirs.items():
            tornado_handlers.append(
                (entrypoint, self.get_filehandler_class(), {'path': path}))

        wsgi_app = self.get_wsgi_application()
        if wsgi_app:
            wsgi_container = tornado.wsgi.WSGIContainer(wsgi_app)
            tornado_handlers.append(('.*', tornado.web.FallbackHandler,
                                     dict(fallback=wsgi_container)))
        return tornado_handlers

    def setup(self):
        """Create the HTTP(S) server, bind it and make sure the ioloop runs.

        Does nothing when a server is already held.  If binding the port
        fails, the error is logged and ``_http_server`` is reset to ``None``.
        """
        if self.is_alive:
            self.logger.debug(
                'Server is already running, no need to start new')
            # Without this return a second server would be created and bound
            # to the same port; on failure that would also drop the handle
            # of the server that is actually running.
            return

        tornado_app = tornado.web.Application(self.get_tornado_handlers())

        # SSL is enabled only when both certificate and key are configured.
        if self.ssl_certificate and self.ssl_private_key:
            ssl_options = {
                "certfile": self.ssl_certificate,
                "keyfile": self.ssl_private_key,
            }
        else:
            ssl_options = None

        self._http_server = tornado.httpserver.HTTPServer(
            tornado_app, ssl_options=ssl_options)

        try:
            self._http_server.listen(self.http_port, self.http_ipaddr)
        except socket.error as e:
            self.logger.exception('Could not start server: %s', e)
            self._http_server = None
            return

        self.start_ioloop()

    def start_ioloop(self):
        """Start the shared tornado ioloop in a thread unless it is running.

        The thread is stored in the module-level ``web_thread`` so that
        :meth:`cleanup` can join it.
        """
        global web_thread
        ioloop = tornado.ioloop.IOLoop.instance()
        if not ioloop._running:
            web_thread = threading.Thread(
                target=threaded(self.system, ioloop.start),
                name="%s::%s" % (self.system.name, self.__class__.__name__))
            web_thread.start()

    def cleanup(self):
        """Stop the ioloop and the server, then wait for the web thread."""
        if self.is_alive:
            tornado.ioloop.IOLoop.instance().stop()
            self._http_server.stop()
            self._http_server = None
            web_thread.join()
class SocketSensor(AbstractSensor):
    """
    Sensor that reads a TCP socket.

    Over TCP port, it reads data per lines and tries to set the status of the sensor
    to the value specified by the line. If content of the line is 'close', then connection
    is dropped.
    """

    #: Hostname/IP to listen. Use ``'0.0.0.0'`` to listen all interfaces.
    host = CStr('0.0.0.0')

    #: Port to listen
    port = CInt

    #: set to ``True`` to tell SocketSensor to stop listening to port
    stop = CBool(transient=True)

    _socket = Instance(socket.socket, transient=True)
    _status = CInt

    def _accept_connection(self):
        """Wait for a client, polling ``stop`` between accept timeouts.

        Returns the ``(conn, addr)`` pair from ``accept()``, or ``None`` when
        ``stop`` was set before anybody connected.
        """
        while not self.stop:
            try:
                return self._socket.accept()
            except socket.timeout:
                continue
        return None

    def _serve_connection(self, conn):
        """Read status updates from ``conn`` until it closes or ``stop`` is set.

        Each chunk that parses as an integer becomes the new status and is
        answered with ``OK``; ``close`` ends the connection; anything else is
        answered with ``NOK``.
        """
        while not self.stop:
            try:
                data = conn.recv(1024)
                if not data:
                    # peer closed the connection
                    break
                self.status = int(data.strip())
                # bytes literals: sockets carry bytes, and b'' is a plain
                # str on Python 2, so this works on both
                conn.sendall(b'OK\n')
            except socket.timeout:
                # nothing received within the timeout; re-check `stop`
                continue
            except ValueError:
                if data.strip() == b'close':
                    break
                conn.sendall(b'NOK\n')

    def listen_loop(self):
        """Thread body: accept one client at a time and feed its data to status.

        Runs until ``stop`` is set.  Socket errors are logged and the loop
        goes back to listening.  Any other exception is re-raised unless
        ``stop`` is set.  The client connection and the listening socket are
        always closed on the way out.
        """
        try:
            while not self.stop:
                # reset per iteration so the finally clause never touches a
                # connection from an earlier round (or an unbound name)
                conn = addr = None
                try:
                    self.logger.info('%s listening to connections in port %s',
                                     self.name, self.port)
                    self._socket.listen(1)
                    self._socket.settimeout(1)

                    accepted = self._accept_connection()
                    if accepted is None:
                        # stop requested while waiting for a client
                        break
                    conn, addr = accepted

                    self.logger.info('%s connected from %s', self.name, addr)
                    conn.settimeout(1)
                    self._serve_connection(conn)
                except socket.error as e:
                    self.logger.info("%s: Error %s caught.", self, e)
                except Exception:
                    # errors raised while shutting down are not interesting
                    if self.stop:
                        return
                    raise
                finally:
                    if conn is not None:
                        conn.close()
                        self.logger.info('%s: connection %s closed',
                                         self.name, addr)
        finally:
            self._socket.close()

    def setup(self):
        """Bind the listening socket and start :meth:`listen_loop` in a thread."""
        self._socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        self._socket.bind((self.host, self.port))
        t = threading.Thread(target=self.listen_loop,
                             name='SocketSensor %s' % self.name)
        t.start()

    def cleanup(self):
        """Ask the listener thread to finish; it polls ``stop`` about once a second."""
        self.stop = True
class DensityGateOp(HasStrictTraits):
    """
    This module computes a gate based on a 2D density plot.  The user chooses
    what proportion of events to keep, and the module creates a gate that
    selects that proportion of events in the highest-density bins of the
    2D density histogram.

    Attributes
    ----------
    name : Str
        The operation name; determines the name of the new metadata column

    xchannel : Str
        The X channel to apply the binning to.

    ychannel : Str
        The Y channel to apply the binning to.

    xscale : {"linear", "logicle", "log"} (default = "linear")
        Re-scale the data on the X axis before fitting the data?

    yscale : {"linear", "logicle", "log"} (default = "linear")
        Re-scale the data on the Y axis before fitting the data?

    keep : Float (default = 0.9)
        What proportion of events to keep?  Must be ``>0`` and ``<1``

    bins : Int (default = 100)
        How many bins should there be on each axis?  Must be positive.

    min_quantile : Float (default = 0.001)
        Clip values below this quantile

    max_quantile : Float (default = 1.0)
        Clip values above this quantile

    sigma : Float (default = 1.0)
        What standard deviation to use for the gaussian blur?

    by : List(Str)
        A list of metadata attributes to aggregate the data before estimating
        the gate.  For example, if the experiment has two pieces of metadata,
        ``Time`` and ``Dox``, setting ``by = ["Time", "Dox"]`` will fit a
        separate gate to each subset of the data with a unique combination of
        ``Time`` and ``Dox``.

    Notes
    -----
    This gating method was developed by John Sexton, in Jeff Tabor's lab at
    Rice University.

    From http://taborlab.github.io/FlowCal/fundamentals/density_gate.html,
    the method is as follows:

    1. Determines the number of events to keep, based on the user specified
       gating fraction and the total number of events of the input sample.

    2. Divides the 2D channel space into a rectangular grid, and counts the
       number of events falling within each bin of the grid.  The number of
       counts per bin across all bins comprises a 2D histogram, which is a
       coarse approximation of the underlying probability density function.

    3. Smoothes the histogram generated in Step 2 by applying a Gaussian Blur.
       Theoretically, the proper amount of smoothing results in a better
       estimate of the probability density function.  Practically, smoothing
       eliminates isolated bins with high counts, most likely corresponding
       to noise, and smoothes the contour of the gated region.

    4. Selects the bins with the greatest number of events in the smoothed
       histogram, starting with the highest and proceeding downward until the
       desired number of events to keep, calculated in step 1, is achieved.

    Examples
    --------

    .. plot::
        :context: close-figs

        Make a little data set.

        >>> import cytoflow as flow
        >>> import_op = flow.ImportOp()
        >>> import_op.tubes = [flow.Tube(file = "Plate01/RFP_Well_A3.fcs",
        ...                              conditions = {'Dox' : 10.0}),
        ...                    flow.Tube(file = "Plate01/CFP_Well_A4.fcs",
        ...                              conditions = {'Dox' : 1.0})]
        >>> import_op.conditions = {'Dox' : 'float'}
        >>> ex = import_op.apply()

    Create and parameterize the operation.

    .. plot::
        :context: close-figs

        >>> dens_op = flow.DensityGateOp(name = 'Density',
        ...                              xchannel = 'FSC-A',
        ...                              xscale = 'log',
        ...                              ychannel = 'SSC-A',
        ...                              yscale = 'log',
        ...                              keep = 0.5)

    Find the bins to keep

    .. plot::
        :context: close-figs

        >>> dens_op.estimate(ex)

    Plot a diagnostic view

    .. plot::
        :context: close-figs

        >>> dens_op.default_view().plot(ex)

    Apply the gate

    .. plot::
        :context: close-figs

        >>> ex2 = dens_op.apply(ex)

    """

    id = Constant('edu.mit.synbio.cytoflow.operations.density')
    friendly_id = Constant("Density Gate")

    name = CStr()
    xchannel = Str()
    ychannel = Str()
    xscale = util.ScaleEnum
    yscale = util.ScaleEnum
    keep = util.PositiveFloat(0.9, allow_zero = False)
    bins = util.PositiveInt(100, allow_zero = False)
    min_quantile = util.PositiveFloat(0.001, allow_zero = True)
    max_quantile = util.PositiveFloat(1.0, allow_zero = False)
    sigma = util.PositiveFloat(1.0, allow_zero = False)
    by = List(Str)

    # estimate() results; transient, so they are not part of the saved state
    _xscale = Instance(util.IScale, transient = True)
    _yscale = Instance(util.IScale, transient = True)

    _xbins = Array(transient = True)
    _ybins = Array(transient = True)

    # per-group results, keyed by the group key yielded by groupby
    _keep_xbins = Dict(Any, Array, transient = True)
    _keep_ybins = Dict(Any, Array, transient = True)
    _histogram = Dict(Any, Array, transient = True)

    def estimate(self, experiment, subset = None):
        """
        Split the data set into bins and determine which ones to keep.

        Parameters
        ----------
        experiment : Experiment
            The :class:`.Experiment` to use to estimate the gate parameters.

        subset : Str (default = None)
            If set, determine the gate parameters on only a subset of the
            ``experiment`` parameter.

        Raises
        ------
        util.CytoflowOpError
            If the experiment is missing, a channel or ``by`` condition is
            not found, a quantile or ``keep`` is out of range, the subset is
            invalid or empty, or a group has no data.
        """

        if experiment is None:
            raise util.CytoflowOpError('experiment',
                                       "No experiment specified")

        if self.xchannel not in experiment.data:
            raise util.CytoflowOpError('xchannel',
                                       "Column {0} not found in the experiment"
                                       .format(self.xchannel))

        if self.ychannel not in experiment.data:
            raise util.CytoflowOpError('ychannel',
                                       "Column {0} not found in the experiment"
                                       .format(self.ychannel))

        if self.min_quantile > 1.0:
            raise util.CytoflowOpError('min_quantile',
                                       "min_quantile must be <= 1.0")

        if self.max_quantile > 1.0:
            raise util.CytoflowOpError('max_quantile',
                                       "max_quantile must be <= 1.0")

        if not (self.max_quantile > self.min_quantile):
            raise util.CytoflowOpError('max_quantile',
                                       "max_quantile must be > min_quantile")

        if self.keep > 1.0:
            raise util.CytoflowOpError('keep',
                                       "keep must be <= 1.0")

        for b in self.by:
            if b not in experiment.conditions:
                raise util.CytoflowOpError('by',
                                           "Aggregation metadata {} not found, "
                                           "must be one of {}"
                                           .format(b, experiment.conditions))

        if subset:
            try:
                experiment = experiment.query(subset)
            except Exception:
                raise util.CytoflowOpError('subset',
                                           "Subset string '{0}' isn't valid"
                                           .format(subset))

            if len(experiment) == 0:
                raise util.CytoflowOpError('subset',
                                           "Subset string '{0}' returned no events"
                                           .format(subset))

        if self.by:
            groupby = experiment.data.groupby(self.by)
        else:
            # use a lambda expression to return a group that contains
            # all the events
            groupby = experiment.data.groupby(lambda _: True)

        # get the scale. estimate the scale params for the ENTIRE data set,
        # not subsets we get from groupby().  And we need to save it so that
        # the data is transformed the same way when we apply()
        self._xscale = xscale = util.scale_factory(self.xscale, experiment, channel = self.xchannel)
        self._yscale = yscale = util.scale_factory(self.yscale, experiment, channel = self.ychannel)

        xlim = (xscale.clip(experiment[self.xchannel].quantile(self.min_quantile)),
                xscale.clip(experiment[self.xchannel].quantile(self.max_quantile)))

        ylim = (yscale.clip(experiment[self.ychannel].quantile(self.min_quantile)),
                yscale.clip(experiment[self.ychannel].quantile(self.max_quantile)))

        # bin edges are evenly spaced on the *scaled* axis
        self._xbins = xbins = xscale.inverse(np.linspace(xscale(xlim[0]),
                                                         xscale(xlim[1]),
                                                         self.bins))
        self._ybins = ybins = yscale.inverse(np.linspace(yscale(ylim[0]),
                                                         yscale(ylim[1]),
                                                         self.bins))

        # collect into fresh dicts so groups from an earlier estimate()
        # (e.g. made with a different `by`) do not linger
        keep_xbins = {}
        keep_ybins = {}
        histogram = {}

        for group, group_data in groupby:
            if len(group_data) == 0:
                raise util.CytoflowOpError('by',
                                           "Group {} had no data"
                                           .format(group))

            h, _, _ = np.histogram2d(group_data[self.xchannel],
                                     group_data[self.ychannel],
                                     bins=[xbins, ybins])

            # `scipy.ndimage.filters` is a deprecated alias namespace
            h = scipy.ndimage.gaussian_filter(h, sigma = self.sigma)

            # bin coordinates ordered from the densest bin downward
            i = scipy.stats.rankdata(h, method = "ordinal") - 1
            i = np.unravel_index(np.argsort(-i), h.shape)

            goal_count = self.keep * len(group_data)
            curr_count = 0
            num_bins = 0

            # take bins, densest first, until enough events are covered
            while(curr_count < goal_count and num_bins < i[0].size):
                curr_count += h[i[0][num_bins], i[1][num_bins]]
                num_bins += 1

            keep_xbins[group] = i[0][0:num_bins]
            keep_ybins[group] = i[1][0:num_bins]
            histogram[group] = h

        self._keep_xbins = keep_xbins
        self._keep_ybins = keep_ybins
        self._histogram = histogram

    def apply(self, experiment):
        """
        Creates a new condition based on membership in the gate that was
        parameterized with :meth:`estimate`.

        Parameters
        ----------
        experiment : Experiment
            the :class:`.Experiment` to apply the gate to.

        Returns
        -------
        Experiment
            a new :class:`.Experiment` with the new gate applied.

        Raises
        ------
        util.CytoflowOpError
            If the experiment, channels or name are missing, the name is
            already a column, no estimate is available, or a channel or
            ``by`` condition is not found in the experiment.
        """

        if experiment is None:
            raise util.CytoflowOpError('experiment',
                                       "No experiment specified")

        if not self.xchannel:
            raise util.CytoflowOpError('xchannel',
                                       "Must set X channel")

        if not self.ychannel:
            raise util.CytoflowOpError('ychannel',
                                       "Must set Y channel")

        # make sure name got set!
        if not self.name:
            raise util.CytoflowOpError('name',
                                       "You have to set the gate's name "
                                       "before applying it!")

        if self.name in experiment.data.columns:
            raise util.CytoflowOpError('name',
                                       "Experiment already has a column named {0}"
                                       .format(self.name))

        if not (self._xbins.size and self._ybins.size and self._keep_xbins):
            raise util.CytoflowOpError(None,
                                       "No gate estimate found.  Did you forget to "
                                       "call estimate()?")

        if not self._xscale:
            raise util.CytoflowOpError(None,
                                       "Couldn't find _xscale.  What happened??")

        if not self._yscale:
            raise util.CytoflowOpError(None,
                                       "Couldn't find _yscale.  What happened??")

        if self.xchannel not in experiment.data:
            raise util.CytoflowOpError('xchannel',
                                       "Column {0} not found in the experiment"
                                       .format(self.xchannel))

        if self.ychannel not in experiment.data:
            raise util.CytoflowOpError('ychannel',
                                       "Column {0} not found in the experiment"
                                       .format(self.ychannel))

        for b in self.by:
            if b not in experiment.conditions:
                raise util.CytoflowOpError('by',
                                           "Aggregation metadata {} not found, "
                                           "must be one of {}"
                                           .format(b, experiment.conditions))

        data = experiment.data

        if self.by:
            groupby = data.groupby(self.by)
        else:
            # use a lambda expression to return a group that
            # contains all the events
            groupby = data.groupby(lambda _: True)

        # one cell per histogram bin (N edges --> N - 1 bins per axis)
        grid_shape = (len(self._xbins) - 1, len(self._ybins) - 1)

        # Work on plain arrays addressed by row *position*.  OR-ing pandas
        # Series here would align on index labels and misplace events as
        # soon as a group's rows are not labelled 0..n-1.
        assignments = np.zeros(len(data), dtype = bool)

        for group, group_data in groupby:
            if group not in self._keep_xbins:
                # there weren't any events in this group, so we didn't get
                # an estimate
                continue

            # positions of this group's rows within experiment.data, in the
            # same order as the rows of group_data
            group_pos = groupby.indices[group]

            # bin index of every event on each axis; NaN if outside the bins
            x_bin = np.asarray(pd.cut(group_data[self.xchannel],
                                      self._xbins,
                                      include_lowest = True,
                                      labels = False),
                               dtype = float)
            y_bin = np.asarray(pd.cut(group_data[self.ychannel],
                                      self._ybins,
                                      include_lowest = True,
                                      labels = False),
                               dtype = float)

            # lookup table: True for every (x bin, y bin) pair that was kept
            kept = np.zeros(grid_shape, dtype = bool)
            kept[self._keep_xbins[group], self._keep_ybins[group]] = True

            in_grid = ~(np.isnan(x_bin) | np.isnan(y_bin))
            group_keep = np.zeros(len(group_data), dtype = bool)
            group_keep[in_grid] = kept[x_bin[in_grid].astype(int),
                                       y_bin[in_grid].astype(int)]

            assignments[group_pos] = group_keep

        event_assignments = pd.Series(assignments,
                                      index = data.index,
                                      dtype = "bool")

        new_experiment = experiment.clone()

        new_experiment.add_condition(self.name, "bool", event_assignments)

        new_experiment.history.append(self.clone_traits(transient = lambda _: True))
        return new_experiment

    def default_view(self, **kwargs):
        """
        Returns a diagnostic plot of the density gate.

        Returns
        -------
        IView
            a diagnostic view, call :meth:`~DensityGateView.plot` to see the
            diagnostic plot.
        """
        v = DensityGateView(op = self)
        v.trait_set(**kwargs)
        return v
class PhysioData(HasTraits): """ Contains the parameters needed to run a MEAP session """ available_widgets = Instance(list) def _available_widgets_default(self): available_panels = ["Annotation"] if "dzdt" in self.contents and "z0" in self.contents: available_panels.append("ICG B Point") if "doppler" in self.contents: available_panels.append("Doppler") if self.dzdt_warping_functions.size > 0: available_panels.append("Registration") return available_panels contents = Property(Set) def _get_contents(self): """ Assuming this object is already initialized, this trait will check for which data are available. For each signal type if the raw timeseries is available, """ contents = set() for signal in ENSEMBLE_SIGNALS | set(('respiration',)): attr = signal+"_data" if not hasattr(self, attr): continue if getattr(self,attr).size > 0: contents.update((signal,)) # Check for respiration-corrected versions of z0 and dzdt for signal in ["resp_corrected_z0", "resp_corrected_dzdt"]: if not hasattr(self, signal): continue if getattr(self,signal).size > 0: contents.update((signal,)) return contents calculable_indexes = Property(Set) @cached_property def _get_calculable_indexes(self): """ Determines, based on content, which indexes are possible to calculate. 
""" # Signals has_ecg = "ecg" in self.contents has_z0 = "z0" in self.contents has_dzdt = "dzdt" in self.contents has_resp = "respiration" in self.contents has_systolic = "systolic" in self.contents has_diastolic = "diastolic" in self.contents has_bp = "bp" in self.contents has_resp_corrected_z0 = self.resp_corrected_z0.size > 0 has_l = self.subject_l > 1 # Indexes has_hr = False has_lvet = False has_sv = False has_map = False has_co = False ix = set() if has_ecg: has_hr = True ix.update(("hr","hrv")) if has_ecg and has_dzdt: has_lvet = True ix.update(("pep", "lvet", "eef")) if has_lvet and has_l and has_z0: has_sv = True ix.update(("sv",)) if has_resp_corrected_z0: ix.update(("resp_corrected_sv",)) if has_bp or has_systolic and has_diastolic: has_map = True ix.update(("map",)) if has_hr and has_sv: has_co = True ix.update(("co",)) if has_resp_corrected_z0: ix.update(("resp_corrected_co",)) if has_co and has_map: ix.update(("tpr",)) if has_resp_corrected_z0: ix.update(("resp_corrected_tpr",)) if has_resp: ix.update(("nbreaths")) return ix meap_version = CStr(__version__) original_file = File file_location = File # -- Censored Epochs -- censored_intervals = Array censoring_sources = List @cached_property def _get_censored_regions(self): censor_regions = [] for signal in self.contents: censor_regions += getattr(self, signal+"_ts").censored_regions # MEA Weighting function mea_window_type = PrototypedFrom("config") mea_n_neighbors = PrototypedFrom("config") mea_window_secs = PrototypedFrom("config") mea_exp_power = PrototypedFrom("config") mea_func_name = PrototypedFrom("config") mea_weight_direction = PrototypedFrom("config") use_trimmed_co = PrototypedFrom("config") mea_smooth_hr = PrototypedFrom("config") mea_weights = Array use_secondary_heartbeat = PrototypedFrom("config") secondary_heartbeat = PrototypedFrom("config") secondary_heartbeat_pre_msec = PrototypedFrom("config") secondary_heartbeat_abs = PrototypedFrom("config") secondary_heartbeat_window = 
PrototypedFrom("config") secondary_heartbeat_window_len = PrototypedFrom("config") secondary_heartbeat_n_likelihood_bins = PrototypedFrom("config") use_ECG2 = PrototypedFrom("config") ecg2_weight = PrototypedFrom("config") qrs_signal_source = PrototypedFrom("config") # Bpoint classifier options bpoint_classifier_pre_point_msec = PrototypedFrom("config") bpoint_classifier_post_point_msec = PrototypedFrom("config") bpoint_classifier_sample_every_n_msec =PrototypedFrom("config") bpoint_classifier_false_distance_min =PrototypedFrom("config") bpoint_classifier_use_bpoint_prior =PrototypedFrom("config") bpoint_classifier_include_derivative =PrototypedFrom("config") # Contains errors in msec from bpoint cross validation bpoint_classifier_cv_error = Array # Points on doppler signal dx_point_type = PrototypedFrom("config") dx_point_window_len = PrototypedFrom("config") db_point_type = PrototypedFrom("config") db_point_window_len = PrototypedFrom("config") # Impedance Data z0_winsor_min = CFloat(0.005) z0_winsor_max = CFloat(0.005) z0_winsorize = CBool(False) z0_included = CBool(False) z0_decimated = CBool(False) z0_channel_name = CStr("") z0_sampling_rate = CFloat(1000) z0_sampling_rate_unit = CStr("Hz") z0_unit = CStr("Ohms") z0_start_time = CFloat(0.) 
z0_data = Array mea_z0_matrix = Array z0_matrix = Property(Array,depends_on="peak_indices") def _get_z0_matrix(self): if self.peak_indices.size == 0: return np.array([]) return peak_stack(self.peak_indices,self.z0_data, pre_msec=self.dzdt_pre_peak,post_msec=self.dzdt_post_peak, sampling_rate=self.z0_sampling_rate) mea_resp_corrected_z0_matrix = Array resp_corrected_z0_matrix = Property(Array,depends_on="peak_indices") def _get_resp_corrected_z0_matrix(self): if self.peak_indices.size == 0 or self.resp_corrected_z0.size == 0: return np.array([]) return peak_stack(self.peak_indices,self.resp_corrected_z0, pre_msec=self.dzdt_pre_peak,post_msec=self.dzdt_post_peak, sampling_rate=self.z0_sampling_rate) dzdt_winsor_min = CFloat(0.005) dzdt_winsor_max = CFloat(0.005) dzdt_winsorize = CBool(False) dzdt_included = CBool(False) dzdt_decimated = CBool(False) dzdt_channel_name = CStr("") dzdt_sampling_rate = CFloat(1000) dzdt_sampling_rate_unit = CStr("Hz") dzdt_unit = CStr("Ohms/Sec") dzdt_start_time = CFloat(0.) dzdt_data = Array dzdt_matrix = Property(Array,depends_on="peak_indices") mea_dzdt_matrix = Array @cached_property def _get_dzdt_matrix(self): logger.info("constructing dZ/dt matrix") if self.peak_indices.size == 0: return np.array([]) return peak_stack(self.peak_indices,self.dzdt_data, pre_msec=self.dzdt_pre_peak,post_msec=self.dzdt_post_peak, sampling_rate=self.dzdt_sampling_rate) # Doppler radar doppler_winsor_min = CFloat(0.005) doppler_winsor_max = CFloat(0.005) doppler_winsorize = CBool(False) doppler_included = CBool(False) doppler_decimated = CBool(False) doppler_channel_name = CStr("") doppler_sampling_rate = CFloat(1000) doppler_sampling_rate_unit = CStr("Hz") doppler_unit = CStr("Ohms/Sec") doppler_start_time = CFloat(0.) 
doppler_data = Array
doppler_matrix = Property(Array, depends_on="peak_indices")
mea_doppler_matrix = Array
@cached_property
def _get_doppler_matrix(self):
    """Return ``peak_stack`` applied to ``doppler_data`` at ``peak_indices``.

    An empty array is returned when there are no peaks.
    """
    beats = self.peak_indices
    if not beats.size:
        return np.array([])
    return peak_stack(
        beats, self.doppler_data,
        pre_msec=self.doppler_pre_peak,
        post_msec=self.doppler_post_peak,
        sampling_rate=self.doppler_sampling_rate)

# Respiration
resp_corrected_dzdt_matrix = Property(Array, depends_on="peak_indices")
mea_resp_corrected_dzdt_matrix = Array
@cached_property
def _get_resp_corrected_dzdt_matrix(self):
    """Return ``peak_stack`` applied to ``resp_corrected_dzdt``.

    An empty array is returned when either ``peak_indices`` or
    ``resp_corrected_dzdt`` is empty.
    """
    beats = self.peak_indices
    if not beats.size or not self.resp_corrected_dzdt.size:
        return np.array([])
    return peak_stack(
        beats, self.resp_corrected_dzdt,
        pre_msec=self.dzdt_pre_peak,
        post_msec=self.dzdt_post_peak,
        sampling_rate=self.dzdt_sampling_rate)

# ECG
ecg_included = CBool(False)
ecg_winsor_min = CFloat(0.005)
ecg_winsor_max = CFloat(0.005)
ecg_winsorize = CBool(False)
ecg_decimated = CBool(False)
ecg_channel_name = CStr("")
ecg_sampling_rate = CFloat(1000)
ecg_sampling_rate_unit = CStr("Hz")
ecg_unit = CStr("V")
ecg_start_time = CFloat(0.)
ecg_data = Array
ecg_matrix = Property(Array, depends_on="peak_indices")
mea_ecg_matrix = Array
@cached_property
def _get_ecg_matrix(self):
    """Return ``peak_stack`` applied to ``ecg_data`` at ``peak_indices``.

    An empty array is returned when there are no peaks.
    """
    beats = self.peak_indices
    if not beats.size:
        return np.array([])
    return peak_stack(
        beats, self.ecg_data,
        pre_msec=self.ecg_pre_peak,
        post_msec=self.ecg_post_peak,
        sampling_rate=self.ecg_sampling_rate)

# ECG Secondary (eg from EEG)
ecg2_included = CBool(False)
ecg2_winsor_min = CFloat(0.005)
ecg2_winsor_max = CFloat(0.005)
ecg2_winsorize = CBool(False)
ecg2_decimated = CBool(False)
ecg2_channel_name = CStr("")
ecg2_sampling_rate = CFloat(1000)
ecg2_sampling_rate_unit = CStr("Hz")
ecg2_unit = CStr("V")
ecg2_start_time = CFloat(0.)
ecg2_data = Array
ecg2_matrix = Property(Array, depends_on="peak_indices")
mea_ecg2_matrix = Array
@cached_property
def _get_ecg2_matrix(self):
    """Return ``peak_stack`` applied to ``ecg2_data`` at ``peak_indices``.

    An empty array is returned when there are no peaks.
    """
    if self.peak_indices.size == 0:
        return np.array([])
    # NOTE(review): the window and sampling rate come from the primary ECG
    # settings (ecg_*), not ecg2_sampling_rate -- confirm this is intended.
    return peak_stack(self.peak_indices, self.ecg2_data,
                      pre_msec=self.ecg_pre_peak,
                      post_msec=self.ecg_post_peak,
                      sampling_rate=self.ecg_sampling_rate)

# Blood pressure might come from a CNAP
using_continuous_bp = CBool(False)
bp_included = CBool(False)
bp_winsor_min = CFloat(0.005)
bp_winsor_max = CFloat(0.005)
bp_winsorize = CBool(False)
bp_decimated = CBool(False)
bp_channel_name = CStr("")
bp_sampling_rate = CFloat(1000)
bp_sampling_rate_unit = CStr("Hz")
bp_unit = CStr("mmHg")
bp_start_time = CFloat(0.)
bp_data = Array
bp_matrix = Property(Array, depends_on="peak_indices")
mea_bp_matrix = Array
@cached_property
def _get_bp_matrix(self):
    """Return ``peak_stack`` applied to ``bp_data`` at ``peak_indices``.

    An empty array is returned when there are no peaks, matching the
    guard used by the other ``*_matrix`` getters.
    """
    if self.peak_indices.size == 0:
        return np.array([])
    return peak_stack(self.peak_indices, self.bp_data,
                      pre_msec=self.bp_pre_peak,
                      post_msec=self.bp_post_peak,
                      sampling_rate=self.bp_sampling_rate)

# Or two separate channels
systolic_included = CBool(False)
systolic_winsor_min = CFloat(0.005)
systolic_winsor_max = CFloat(0.005)
systolic_winsorize = CBool(False)
systolic_decimated = CBool(False)
systolic_channel_name = CStr("")
systolic_sampling_rate = CFloat(1000)
systolic_sampling_rate_unit = CStr("Hz")
systolic_unit = CStr("mmHg")
systolic_start_time = CFloat(0.)
systolic_data = Array
systolic_matrix = Property(
    Array, depends_on="peak_indices,bp_pre_peak,bp_post_peak")
mea_systolic_matrix = Array
@cached_property
def _get_systolic_matrix(self):
    """Return ``peak_stack`` applied to ``systolic_data``.

    An empty array is returned when there are no peaks or when
    ``"systolic"`` is not listed in ``self.contents``.
    """
    if self.peak_indices.size == 0 or "systolic" not in self.contents:
        return np.array([])
    # NOTE(review): uses the bp_* window and sampling rate rather than the
    # systolic_* ones -- confirm this is intended.
    return peak_stack(self.peak_indices, self.systolic_data,
                      pre_msec=self.bp_pre_peak,
                      post_msec=self.bp_post_peak,
                      sampling_rate=self.bp_sampling_rate)

diastolic_included = CBool(False)
diastolic_winsor_min = CFloat(0.005)
diastolic_winsor_max = CFloat(0.005)
diastolic_winsorize = CBool(False)
diastolic_decimated = CBool(False)
diastolic_channel_name = CStr("")
diastolic_sampling_rate = CFloat(1000)
diastolic_sampling_rate_unit = CStr("Hz")
# Blood pressure channel: default unit now matches systolic_unit (was "Ohms").
diastolic_unit = CStr("mmHg")
diastolic_start_time = CFloat(0.)
diastolic_data = Array
diastolic_matrix = Property(
    Array, depends_on="peak_indices,bp_pre_peak,bp_post_peak")
mea_diastolic_matrix = Array
@cached_property
def _get_diastolic_matrix(self):
    """Return ``peak_stack`` applied to ``diastolic_data``.

    An empty array is returned when there are no peaks or when
    ``"diastolic"`` is not listed in ``self.contents``.
    """
    if self.peak_indices.size == 0 or "diastolic" not in self.contents:
        return np.array([])
    return peak_stack(self.peak_indices, self.diastolic_data,
                      pre_msec=self.bp_pre_peak,
                      post_msec=self.bp_post_peak,
                      sampling_rate=self.bp_sampling_rate)

respiration_included = CBool(False)
respiration_winsor_min = CFloat(0.005)
respiration_winsor_max = CFloat(0.005)
respiration_winsorize = CBool(False)
respiration_decimated = CBool(False)
respiration_channel_name = CStr("")
respiration_sampling_rate = CFloat(1000)
respiration_sampling_rate_unit = CStr("Hz")
respiration_unit = CStr("Ohms")
respiration_start_time = CFloat(0.)
respiration_data = Array
respiration_cycle = Array
respiration_amount = Array
resp_corrected_z0 = Array
resp_corrected_dzdt = Array
processed_respiration_data = Array
processed_respiration_time = Array

# -- Event marking signals (experiment and mri-related)
mri_trigger_times = Array
mri_trigger_included = CBool(False)
mri_trigger_decimated = CBool(False)
mri_trigger_channel_name = CStr("")
mri_trigger_sampling_rate = CFloat(1000)
mri_trigger_sampling_rate_unit = CStr("Hz")
mri_trigger_unit = CStr("V")
mri_trigger_start_time = CFloat(0.)
event_names = List
event_sampling_rate = CFloat(1000)
event_included = CBool(True)
event_decimated = CBool(False)
event_start_time = CFloat(0.)
# Declared as a CStr trait like every other *_sampling_rate_unit (it was a
# bare string, i.e. a plain class attribute rather than a trait).
event_sampling_rate_unit = CStr("Hz")
event_unit = CStr("Hz")

# -- results of peak detection
peak_times = Array
# ``np.int`` was only an alias for the builtin ``int`` and has been removed
# from NumPy; the builtin gives the same dtype.
peak_indices = CArray(dtype=int)

# Non-markable heartbeats
dne_peak_times = Array
dne_peak_indices = CArray(dtype=int)

# Any custom labels for heartbeats go here
hand_labeled = Instance(np.ndarray)  # An array of beat indices, each corresponding
def _hand_labeled_default(self):
    """Default ``hand_labeled``: zeros with the shape/dtype of ``peak_indices``."""
    return np.zeros_like(self.peak_indices)

# Is the beat usable for analysis?
usable = Instance(np.ndarray)
def _usable_default(self):
    """Default ``usable``: an integer array of ones, one per peak index."""
    # builtin ``int`` replaces the removed ``np.int`` alias (same dtype)
    return np.ones(len(self.peak_indices), dtype=int)

# Point-index arrays.  Each one defaults to zeros with the same shape and
# dtype as ``peak_indices``.
p_indices = Instance(np.ndarray)
def _p_indices_default(self):
    return np.zeros_like(self.peak_indices)

q_indices = Instance(np.ndarray)
def _q_indices_default(self):
    return np.zeros_like(self.peak_indices)

r_indices = Instance(np.ndarray)
def _r_indices_default(self):
    return np.zeros_like(self.peak_indices)

s_indices = Instance(np.ndarray)
def _s_indices_default(self):
    return np.zeros_like(self.peak_indices)

t_indices = Instance(np.ndarray)
def _t_indices_default(self):
    return np.zeros_like(self.peak_indices)

b_indices = Instance(np.ndarray)
def _b_indices_default(self):
    return np.zeros_like(self.peak_indices)

c_indices = Instance(np.ndarray)
def _c_indices_default(self):
    return np.zeros_like(self.peak_indices)

x_indices = Instance(np.ndarray)
def _x_indices_default(self):
    return np.zeros_like(self.peak_indices)

o_indices = Instance(np.ndarray)
def _o_indices_default(self):
    return np.zeros_like(self.peak_indices)

systole_indices = Instance(np.ndarray)
def _systole_indices_default(self):
    return np.zeros_like(self.peak_indices)

diastole_indices = Instance(np.ndarray)
def _diastole_indices_default(self):
    return np.zeros_like(self.peak_indices)

# Indices for doppler
db_indices = Instance(np.ndarray)
def _db_indices_default(self):
    return np.zeros_like(self.peak_indices)

dx_indices = Instance(np.ndarray)
def _dx_indices_default(self):
    return np.zeros_like(self.peak_indices)

# Holds B points in the Karcher modes
karcher_b_indices = Instance(np.ndarray)
def _karcher_b_indices_default(self):
    """Default ``karcher_b_indices``: ``n_modes`` zeros (float dtype)."""
    return np.zeros(self.n_modes)

# --- Subject information
subject_age = CFloat(0.)
subject_gender = Enum("M", "F")
subject_weight = CFloat(0., label="Weight (lbs)")
subject_height_ft = Int(0, label="Height (ft)",
                        desc="Subject's height in feet")
subject_height_in = Int(0, label="Height (in)",
                        desc="Subject's height in inches")
subject_electrode_distance_front = CFloat(
    0., label="Impedance electrode distance (front)")
subject_electrode_distance_back = CFloat(
    0., label="Impedance electrode distance (back)")
# The right/left labels previously both read "(back)" (copy-paste slip).
subject_electrode_distance_right = CFloat(
    0., label="Impedance electrode distance (right)")
subject_electrode_distance_left = CFloat(
    0., label="Impedance electrode distance (left)")
subject_resp_max = CFloat(0., label="Respiration circumference max (cm)")
subject_resp_min = CFloat(0., label="Respiration circumference min (cm)")
# NOTE(review): ``subject_in_mri`` is declared again further down as
# PrototypedFrom("config"); the later class-body assignment is the one that
# takes effect.  Confirm which declaration is wanted.
subject_in_mri = CBool(False, label="Subject was in MRI scanner")
subject_control_base_impedance = CFloat(
    0., label="Control Impedance",
    desc="If in MRI, store the z0 value from outside the MRI")

# ``subject_height_in`` is part of the dependency list because the getter
# reads it; without it the cached value went stale when only inches changed.
subject_l = Property(CFloat, depends_on=(
    "subject_electrode_distance_front,"
    "subject_electrode_distance_back,"
    "subject_electrode_distance_right,"
    "subject_electrode_distance_left,"
    "subject_height_ft,"
    "subject_height_in"))
@cached_property
def _get_subject_l(self):
    """
    Uses information from the subject measurements to define the
    l variable for calculating stroke volume.

    Checked in this order:

    1. if left and right electrode distances are both provided, use
       their average
    2. if front and back electrode distances are both provided, use
       their average
    3. if subject height in feet is provided, use the estimate
       l = 0.17 * height, with height converted from feet and inches
       to centimetres
    4. otherwise return the first positive measurement found in
       front, back, left, right

    If nothing is found, returns 1.
    """
    front = self.subject_electrode_distance_front
    back = self.subject_electrode_distance_back
    left = self.subject_electrode_distance_left
    right = self.subject_electrode_distance_right
    if left > 0 and right > 0:
        return (left + right) / 2.
    if front > 0 and back > 0:
        return (front + back) / 2.
    if self.subject_height_ft > 0:
        # total inches -> cm (2.54), then the 0.17 * height estimate
        total_inches = 12 * self.subject_height_ft + self.subject_height_in
        return total_inches * 2.54 * 0.17
    for measure in (front, back, left, right):
        if measure > 0.:
            return measure
    return 1

# --- From the global configuration
config = Instance(MEAPConfig)
apply_ecg_smoothing = PrototypedFrom("config")
ecg_smoothing_window_len = PrototypedFrom("config")
apply_imp_smoothing = PrototypedFrom("config")
imp_smoothing_window_len = PrototypedFrom("config")
apply_bp_smoothing = PrototypedFrom("config")
bp_smoothing_window_len = PrototypedFrom("config")
regress_out_resp = PrototypedFrom("config")

# parameters for processing the raw data before PT detecting
subject_in_mri = PrototypedFrom("config")
peak_detection_algorithm = PrototypedFrom("config")

# PanTomkins parameters
qrs_source_signal = Enum("ecg", "ecg2")
bandpass_min = PrototypedFrom("config")
bandpass_max = PrototypedFrom("config")
smoothing_window_len = PrototypedFrom("config")
smoothing_window = PrototypedFrom("config")
pt_adjust = PrototypedFrom("config")
peak_threshold = PrototypedFrom("config")
apply_filter = PrototypedFrom("config")
apply_diff_sq = PrototypedFrom("config")
apply_smooth_ma = PrototypedFrom("config")
peak_window = PrototypedFrom("config")

# Parameters for waveform extraction
ecg_pre_peak = PrototypedFrom("config")
ecg_post_peak = PrototypedFrom("config")
dzdt_pre_peak = PrototypedFrom("config")
dzdt_post_peak = PrototypedFrom("config")
bp_pre_peak = PrototypedFrom("config")
bp_post_peak = PrototypedFrom("config")
systolic_pre_peak = PrototypedFrom("config")
systolic_post_peak = PrototypedFrom("config")
diastolic_pre_peak = PrototypedFrom("config")
diastolic_post_peak = PrototypedFrom("config")
doppler_pre_peak = PrototypedFrom("config")
doppler_post_peak = PrototypedFrom("config")
stroke_volume_equation = PrototypedFrom("config")

# parameters for respiration analysis
process_respiration = PrototypedFrom("config")
resp_polort = PrototypedFrom("config")
resp_high_freq_cutoff = PrototypedFrom("config")
resp_inhale_begin_times = Array
resp_exhale_begin_times = Array

# Time points of the global ensemble average
ens_avg_ecg_signal = Array
ens_avg_dzdt_signal = Array
ens_avg_bp_signal = Array
ens_avg_systolic_signal = Array
ens_avg_diastolic_signal = Array
ens_avg_doppler_signal = Array
ens_avg_p_time = CFloat
ens_avg_q_time = CFloat
ens_avg_r_time = CFloat
ens_avg_s_time = CFloat
ens_avg_t_time = CFloat
ens_avg_b_time = CFloat
ens_avg_db_time = CFloat
ens_avg_dx_time = CFloat
ens_avg_c_time = CFloat
ens_avg_x_time = CFloat
ens_avg_y_time = CFloat
ens_avg_o_time = CFloat
ens_avg_systole_time = CFloat
ens_avg_diastole_time = CFloat
using_hand_marked_point_priors = CBool(False)

censored_secs_before = Array

# MEA Physio timeseries
lvet = Array
co = Array
resp_corrected_co = Array
pep = Array
sv = Array
resp_corrected_sv = Array
# NOTE: ``map`` shadows the builtin inside the class body; the name is part
# of the public trait interface, so it is kept.
map = Array
systolic = Array
diastolic = Array
hr = Array
mea_hr = Array
tpr = Array
resp_corrected_tpr = Array

def _config_default(self):
    """Default ``config``: a freshly constructed ``MEAPConfig``."""
    return MEAPConfig()

# SRVF-warping parameters
srvf_lambda = PrototypedFrom("config")
srvf_max_karcher_iterations = PrototypedFrom("config")
srvf_update_min = PrototypedFrom("config")
srvf_karcher_mean_subset_size = PrototypedFrom("config")
srvf_multi_mode_variance_cutoff = PrototypedFrom("config")
srvf_use_moving_ensembled = PrototypedFrom("config")
dzdt_num_inputs_to_group_warping = PrototypedFrom("config")
srvf_t_min = PrototypedFrom("config")
srvf_t_max = PrototypedFrom("config")
bspline_before_warping = PrototypedFrom("config")
dzdt_srvf_karcher_mean = Array
dzdt_karcher_mean = Array
dzdt_karcher_mean_time = Array
dzdt_warping_functions = Array
dzdt_functions_to_warp = Array

# Holds data related to initial karcher mean
dzdt_karcher_mean_inputs = Array
dzdt_karcher_mean_over_iterations = Array
srvf_iteration_distances = Array
srvf_iteration_energy = Array

# Data related to the multiple modes
n_modes = PrototypedFrom("config")
max_kmeans_iterations = PrototypedFrom("config")
mode_dzdt_karcher_means = Array
mode_cluster_assignment = Array
mode_dzdt_srvf_karcher_means = Array

# Storing and accessing the bpoint classifier
bpoint_classifier_file = File

def save(self, outfile):
    """Write the saveable traits of this object to ``outfile`` with ``savemat``.

    Every editable trait is collected except: names ending in ``"ts"``,
    a fixed list of excluded names, ``None`` values, empty arrays and
    plain ``set`` values.  ``censoring_sources``, each attribute named in
    ``event_names`` and ``event_names`` itself are then added explicitly.

    Before the real write, each entry is written on its own to a
    temporary file so that an entry ``savemat`` rejects is logged by
    name.  A failure of the real write is reported through
    ``messagebox`` instead of being raised.

    Parameters
    ----------
    outfile : passed straight to ``savemat`` as the output target
    """
    excluded = ("available_widgets", "bpoint_classifier",
                "bpoint_classifier_file", "censored_regions", "event_names")

    # Populate matfile-friendly data structures for censoring regions
    savedict = {}
    for k in self.editable_traits():
        # NOTE(review): the "ts" suffix filter is kept from the original;
        # its intent is not visible here -- confirm.
        if k.endswith("ts") or k in excluded:
            continue
        v = getattr(self, k)
        if v is None:
            continue
        if isinstance(v, np.ndarray) and v.size == 0:
            continue
        if type(v) is set:
            continue
        savedict[k] = v

    savedict["censoring_sources"] = np.array(self.censoring_sources)
    for evt in self.event_names:
        savedict[evt] = getattr(self, evt)
    savedict["event_names"] = np.array(self.event_names)

    # Dry run, one key at a time; ``with`` closes the temp file even if
    # something unexpected escapes the loop.
    with tempfile.NamedTemporaryFile() as tmp:
        for k, v in savedict.items():
            try:
                savemat(tmp, {k: v}, long_field_names=True)
            except Exception as e:
                logger.warning("unable to save %s because of %s", k, e)

    try:
        savemat(outfile, savedict, long_field_names=True)
    except Exception as e:
        messagebox("Failed to save %s:\n\n%s" % (outfile, e))
class BinningOp(HasStrictTraits):
    """
    Bin data along an axis.

    This operation creates equally spaced bins (in linear or log space)
    along an axis and adds a condition assigning each event to a bin.  The
    value of the event's condition is the left end of the bin's interval in
    which the event is located.

    Attributes
    ----------
    name : Str
        The operation name.  Used to name the new metadata field in the
        experiment that's created by apply()

    channel : Str
        The name of the channel along which to bin.

    scale : {"linear", "log", "logicle"}
        Make the bins equidistant along what scale?

    num_bins : Int
        The number of bins to make.  Must set either :attr:`num_bins` or
        :attr:`bin_width`. If both are defined, :attr:`num_bins` takes
        precedence.

    bin_width : Float
        The width of the bins.  Must set either :attr:`num_bins` or
        :attr:`bin_width`.  If :attr:`scale` is ``log``, :attr:`bin_width`
        is in log-10 units; if :attr:`scale` is ``logicle``, an error is
        thrown because the units are ill-defined.  If both :attr:`num_bins`
        and :attr:`bin_width` are defined, :attr:`num_bins` takes
        precedence.

    bin_count_name : Str
        If :attr:`bin_count_name` is set, :meth:`apply` adds another column
        to the resulting :class:`Experiment` that contains the number of
        events in the bin that this event falls in.  Useful for filtering
        bins by number of events.

    Examples
    --------
    Create a small experiment:

    .. plot::
        :context: close-figs

        >>> import cytoflow as flow
        >>> import_op = flow.ImportOp()
        >>> import_op.tubes = [flow.Tube(file = "tasbe/rby.fcs")]
        >>> ex = import_op.apply()

    Create and parameterize the operation

    .. plot::
        :context: close-figs

        >>> bin_op = flow.BinningOp()
        >>> bin_op.name = "Bin"
        >>> bin_op.channel = "FITC-A"
        >>> bin_op.scale = "log"
        >>> bin_op.bin_width = 0.2

    Apply the operation to the experiment

    .. plot::
        :context: close-figs

        >>> ex2 = bin_op.apply(ex)

    Plot the result

    .. plot::
        :context: close-figs

        >>> bin_op.default_view().plot(ex2)

    """

    # traits
    id = Constant('edu.mit.synbio.cytoflow.operations.binning')
    friendly_id = Constant("Binning")

    name = CStr()
    bin_count_name = CStr()
    channel = Str()
    num_bins = util.PositiveInt(0, allow_zero=True)
    bin_width = util.PositiveFloat(0, allow_zero=True)
    scale = util.ScaleEnum

    _max_num_bins = Int(100)

    def apply(self, experiment):
        """
        Applies the binning to an experiment.

        Parameters
        ----------
        experiment : Experiment
            the old_experiment to which this op is applied

        Returns
        -------
        Experiment
            A new experiment with a condition column named :attr:`name`,
            which contains the location of the left-most edge of the bin
            that the event is in.  If :attr:`bin_count_name` is set,
            another column is added with that name as well, containing the
            number of events in the same bin as the event.

        Raises
        ------
        util.CytoflowOpError
            If the experiment is missing, a name or channel is unset or
            clashes with an existing column, neither :attr:`num_bins` nor
            :attr:`bin_width` is set, :attr:`bin_width` is used with a
            scale other than linear or log, or too many / too few bins
            would be created.
        """
        if experiment is None:
            raise util.CytoflowOpError('experiment',
                                       "no experiment specified")

        if not self.name:
            raise util.CytoflowOpError('name', "Name is not set")

        if self.name in experiment.data.columns:
            raise util.CytoflowOpError(
                'name',
                "Name {} is in the experiment already".format(self.name))

        if self.bin_count_name and \
                self.bin_count_name in experiment.data.columns:
            raise util.CytoflowOpError(
                'bin_count_name',
                "bin_count_name {} is in the experiment already".format(
                    self.bin_count_name))

        if not self.channel:
            raise util.CytoflowOpError('channel', "channel is not set")

        if self.channel not in experiment.data.columns:
            raise util.CytoflowOpError(
                'channel',
                "channel {} isn't in the experiment".format(self.channel))

        if not self.num_bins and not self.bin_width:
            raise util.CytoflowOpError('num_bins',
                                       "must set either bin number or width")

        if self.bin_width \
                and not (self.scale == "linear" or self.scale == "log"):
            raise util.CytoflowOpError(
                'scale',
                "Can only use bin_width with linear or log scale")

        scale = util.scale_factory(self.scale, experiment,
                                   channel=self.channel)
        scaled_data = scale(experiment.data[self.channel])

        scaled_min = bn.nanmin(scaled_data)
        scaled_max = bn.nanmax(scaled_data)

        # num_bins takes precedence; otherwise derive a count from bin_width
        if self.num_bins:
            num_bins = self.num_bins
        else:
            num_bins = (scaled_max - scaled_min) / self.bin_width

        if num_bins > self._max_num_bins:
            raise util.CytoflowOpError(
                None,
                "Too many bins! To increase this limit, "
                "change _max_num_bins (currently {})".format(
                    self._max_num_bins))

        # np.linspace() requires an integer ``num``; the width-derived count
        # is a float, so truncate it only after the limit check above.
        num_bins = int(num_bins)

        scaled_bins = np.linspace(start=scaled_min, stop=scaled_max,
                                  num=num_bins)

        if len(scaled_bins) < 2:
            raise util.CytoflowOpError('num_bins',
                                       "Must have more than one bin")

        # put the data in bins
        bin_idx = np.digitize(scaled_data, scaled_bins[1:-1])

        # now, back into data space
        bins = scale.inverse(scaled_bins)

        new_experiment = experiment.clone()
        new_experiment.add_condition(self.name, "float", bins[bin_idx])

        # keep track of the bins we used, for prettier plotting later.
        new_experiment.metadata[self.name]["bin_scale"] = self.scale
        new_experiment.metadata[self.name]["bins"] = bins

        if self.bin_count_name:
            # TODO - this is a HUGE memory hog?!
            # TODO - fix this, then turn it on by default
            agg_count = new_experiment.data.groupby(self.name).count()
            agg_count = agg_count[agg_count.columns[0]]

            # have to make the condition a float64, because if we're in log
            # space there may be events that have NaN as the bin number.
            new_experiment.add_condition(
                self.bin_count_name,
                "float64",
                new_experiment[self.name].map(agg_count))

        new_experiment.history.append(
            self.clone_traits(transient=lambda _: True))
        return new_experiment

    def default_view(self, **kwargs):
        """
        Returns a diagnostic plot to check the binning.

        Returns
        -------
        IView
            A view instance, call :meth:`plot()` to plot the bins.
        """
        return BinningView(op=self, **kwargs)
class PolygonOp(HasStrictTraits):
    """
    Apply a polygon gate to a cytometry experiment.

    Attributes
    ----------
    name : Str
        The operation name.  Used to name the new metadata field in the
        experiment that's created by :meth:`apply`

    xchannel, ychannel : Str
        The names of the x and y channels to apply the gate.

    xscale, yscale : {'linear', 'log', 'logicle'} (default = 'linear')
        The scales applied to the data before drawing the polygon.

    vertices : List((Float, Float))
        The polygon vertices.  An ordered list of 2-tuples, representing
        the x and y coordinates of the vertices.

    Notes
    -----
    This module uses :meth:`matplotlib.path.Path` to represent the polygon,
    because membership testing is very fast.

    You can set the vertices by hand, I suppose, but it's much easier to use
    the interactive view you get from :meth:`default_view` to do so.

    Examples
    --------

    .. plot::
        :context: close-figs

        Make a little data set.

        >>> import cytoflow as flow
        >>> import_op = flow.ImportOp()
        >>> import_op.tubes = [flow.Tube(file = "Plate01/RFP_Well_A3.fcs",
        ...                              conditions = {'Dox' : 10.0}),
        ...                    flow.Tube(file = "Plate01/CFP_Well_A4.fcs",
        ...                              conditions = {'Dox' : 1.0})]
        >>> import_op.conditions = {'Dox' : 'float'}
        >>> ex = import_op.apply()

    Create and parameterize the operation.

    .. plot::
        :context: close-figs

        >>> p = flow.PolygonOp(name = "Polygon",
        ...                    xchannel = "V2-A",
        ...                    ychannel = "Y2-A")
        >>> p.vertices = [(23.411982294776319, 5158.7027015021222),
        ...               (102.22182270573683, 23124.058843387455),
        ...               (510.94519955277201, 23124.058843387455),
        ...               (1089.5215641232173, 3800.3424832180476),
        ...               (340.56382570202402, 801.98947404942271),
        ...               (65.42597937575897, 1119.3133482602157)]

    Show the default view.

    .. plot::
        :context: close-figs

        >>> df = p.default_view(huefacet = "Dox",
        ...                     xscale = 'log',
        ...                     yscale = 'log')

        >>> df.plot(ex)

    .. note::
       If you want to use the interactive default view in a Jupyter
       notebook, make sure you say ``%matplotlib notebook`` in the first
       cell (instead of ``%matplotlib inline`` or similar).  Then call
       ``default_view()`` with ``interactive = True``::

           df = p.default_view(huefacet = "Dox",
                               xscale = 'log',
                               yscale = 'log',
                               interactive = True)
           df.plot(ex)

    Apply the gate, and show the result

    .. plot::
        :context: close-figs

        >>> ex2 = p.apply(ex)
        >>> ex2.data.groupby('Polygon').size()
        Polygon
        False    15875
        True      4125
        dtype: int64

    """

    # traits
    id = Constant('edu.mit.synbio.cytoflow.operations.polygon')
    friendly_id = Constant("Polygon")

    name = CStr()
    xchannel = Str()
    ychannel = Str()
    vertices = List((Float, Float))

    xscale = util.ScaleEnum()
    yscale = util.ScaleEnum()

    _selection_view = Instance('PolygonSelection', transient=True)

    def apply(self, experiment):
        """Applies the polygon gate to an experiment.

        Parameters
        ----------
        experiment : Experiment
            the old :class:`Experiment` to which this op is applied

        Returns
        -------
        Experiment
            a new :class:`Experiment`, the same as ``old_experiment`` but
            with a new column of type `bool` with the same name as the
            operation name.  The bool is ``True`` if the event's
            measurement is within the polygon, and ``False`` otherwise.

        Raises
        ------
        util.CytoflowOpError
            if for some reason the operation can't be applied to this
            experiment. The reason is in :attr:`.CytoflowOpError.args`
        """
        if experiment is None:
            raise util.CytoflowOpError('experiment',
                                       "No experiment specified")

        # make sure the experiment doesn't already have a column named
        # self.name
        if self.name in experiment.data.columns:
            raise util.CytoflowOpError(
                'name',
                "{} is in the experiment already!".format(self.name))

        if not self.xchannel:
            raise util.CytoflowOpError('xchannel',
                                       "Must specify an x channel")

        if not self.ychannel:
            raise util.CytoflowOpError('ychannel',
                                       "Must specify a y channel")

        if self.xchannel not in experiment.channels:
            raise util.CytoflowOpError(
                'xchannel',
                "xchannel {0} is not in the experiment".format(
                    self.xchannel))

        if self.ychannel not in experiment.channels:
            raise util.CytoflowOpError(
                'ychannel',
                "ychannel {0} is not in the experiment".format(
                    self.ychannel))

        if len(self.vertices) < 3:
            raise util.CytoflowOpError('vertices',
                                       "Must have at least 3 vertices")

        # This error used to be *returned* instead of raised, which handed
        # callers an exception object in place of an Experiment.
        if any(len(v) != 2 for v in self.vertices):
            raise util.CytoflowOpError(
                'vertices',
                "All vertices must be lists or tuples "
                "of length = 2")

        # make sure name got set!
        if not self.name:
            raise util.CytoflowOpError(
                'name',
                "You have to set the Polygon gate's name "
                "before applying it!")

        # there's a bit of a subtlety here: if the vertices were
        # selected with an interactive plot, and that plot had scaled
        # axes, we need to apply that scale function to both the
        # vertices and the data before looking for path membership
        xscale = util.scale_factory(self.xscale, experiment,
                                    channel=self.xchannel)
        yscale = util.scale_factory(self.yscale, experiment,
                                    channel=self.ychannel)

        vertices = [(xscale(x), yscale(y)) for (x, y) in self.vertices]
        data = experiment.data[[self.xchannel, self.ychannel]].copy()
        data[self.xchannel] = xscale(data[self.xchannel])
        data[self.ychannel] = yscale(data[self.ychannel])

        # use a matplotlib Path because testing for membership is a fast
        # C fn.
        path = mpl.path.Path(np.array(vertices))
        xy_data = data[[self.xchannel, self.ychannel]].values

        new_experiment = experiment.clone()
        new_experiment.add_condition(self.name,
                                     "bool",
                                     path.contains_points(xy_data))
        new_experiment.history.append(
            self.clone_traits(transient=lambda _: True))

        return new_experiment

    def default_view(self, **kwargs):
        """
        Create, remember and return the selection view for this gate.

        A new ``PolygonSelection`` bound to this operation is stored in
        ``_selection_view``; any keyword arguments are applied to it with
        ``trait_set`` before it is returned.
        """
        self._selection_view = PolygonSelection(op=self)
        self._selection_view.trait_set(**kwargs)
        return self._selection_view
class GaussianMixture2DOp(HasStrictTraits): """ This module fits a 2D Gaussian mixture model with a specified number of components to a pair of channels. .. warning:: :class:`GaussianMixture2DOp` is **DEPRECATED** and will be removed in a future release. It doesn't correctly handle the case where an event is present in more than one component. Please use :class:`GaussianMixtureOp` instead! Creates a new categorical metadata variable named :attr:`name`, with possible values ``name_1`` .... ``name_n`` where ``n`` is the number of components. An event is assigned to ``name_i`` category if it falls within :attr:`sigma` standard deviations of the component's mean. If that is true for multiple categories (or if :attr:`sigma` is ``0.0``), the event is assigned to the category with the highest posterior probability. If the event doesn't fall into any category, it is assigned to ``name_None``. As a special case, if :attr:`num_components` is ``1`` and :attr:`sigma` ``> 0.0``, then the new condition is boolean, ``True`` if the event fell in the gate and ``False`` otherwise. Optionally, if :attr:`posteriors` is ``True``, this module will also compute the posterior probability of each event in its assigned component, returning it in a new colunm named ``{Name}_Posterior``. Finally, the same mixture model (mean and standard deviation) may not be appropriate for every subset of the data. If this is the case, you can use the :attr:`by` attribute to specify metadata by which to aggregate the data before estimating (and applying) a mixture model. The number of components is the same across each subset, though. Attributes ---------- name : Str The operation name; determines the name of the new metadata column xchannel : Str The X channel to apply the mixture model to. ychannel : Str The Y channel to apply the mixture model to. xscale : {"linear", "logicle", "log"} (default = "linear") Re-scale the data on the X acis before fitting the data? 
yscale : {"linear", "logicle", "log"} (default = "linear") Re-scale the data on the Y axis before fitting the data? num_components : Int (default = 1) How many components to fit to the data? Must be positive. sigma : Float (default = 0.0) How many standard deviations on either side of the mean to include in each category? If an event is in multiple components, assign it to the component with the highest posterior probability. If :attr:`sigma` is ``0.0``, categorize *all* the data by assigning each event to the component with the highest posterior probability. Must be ``>= 0.0``. by : List(Str) A list of metadata attributes to aggregate the data before estimating the model. For example, if the experiment has two pieces of metadata, ``Time`` and ``Dox``, setting :attr:`by` to ``["Time", "Dox"]`` will fit the model separately to each subset of the data with a unique combination of ``Time`` and ``Dox``. posteriors : Bool (default = False) If ``True``, add a column named ``{Name}_Posterior`` giving the posterior probability that the event is in the component to which it was assigned. Useful for filtering out low-probability events. Examples -------- .. plot:: :context: close-figs Make a little data set. >>> import cytoflow as flow >>> import_op = flow.ImportOp() >>> import_op.tubes = [flow.Tube(file = "Plate01/RFP_Well_A3.fcs", ... conditions = {'Dox' : 10.0}), ... flow.Tube(file = "Plate01/CFP_Well_A4.fcs", ... conditions = {'Dox' : 1.0})] >>> import_op.conditions = {'Dox' : 'float'} >>> ex = import_op.apply() Create and parameterize the operation. .. plot:: :context: close-figs >>> gm_op = flow.GaussianMixture2DOp(name = 'Flow', ... xchannel = 'V2-A', ... xscale = 'log', ... ychannel = 'Y2-A', ... yscale = 'log', ... num_components = 2) Estimate the clusters .. plot:: :context: close-figs >>> gm_op.estimate(ex) Plot a diagnostic view with the distributions .. plot:: :context: close-figs >>> gm_op.default_view().plot(ex) Apply the gate .. 
plot:: :context: close-figs >>> ex2 = gm_op.apply(ex) Plot a diagnostic view with the event assignments .. plot:: :context: close-figs >>> gm_op.default_view().plot(ex2) """ id = Constant('edu.mit.synbio.cytoflow.operations.gaussian_2d') friendly_id = Constant("2D Gaussian Mixture") name = CStr() xchannel = Str() ychannel = Str() xscale = util.ScaleEnum yscale = util.ScaleEnum num_components = util.PositiveInt sigma = util.PositiveFloat(0.0, allow_zero=True) by = List(Str) posteriors = Bool(False) # the key is either a single value or a tuple _gmms = Dict(Any, Instance(mixture.GaussianMixture), transient=True) _xscale = Instance(util.IScale, transient=True) _yscale = Instance(util.IScale, transient=True) def estimate(self, experiment, subset=None): """ Estimate the Gaussian mixture model parameters. Parameters ---------- experiment : Experiment The data to use to estimate the mixture parameters subset : str (default = None) If set, a Python expression to determine the subset of the data to use to in the estimation. """ warn( "GaussianMixture2DOp is DEPRECATED. 
Please use GaussianMixtureOp.", util.CytoflowOpWarning) if experiment is None: raise util.CytoflowOpError('experiment', "No experiment specified") if self.xchannel not in experiment.data: raise util.CytoflowOpError( 'xchannel', "Column {0} not found in the experiment".format(self.xchannel)) if self.ychannel not in experiment.data: raise util.CytoflowOpError( 'ychannel', "Column {0} not found in the experiment".format(self.ychannel)) for b in self.by: if b not in experiment.data: raise util.CytoflowOpError( 'by', "Aggregation metadata {} not found, " "must be one of {}".format(b, experiment.conditions)) if self.num_components == 1 and self.posteriors: raise util.CytoflowOpError( 'posteriors', "If num_components == 1, all posteriors are 1.") if subset: try: experiment = experiment.query(subset) except Exception as e: raise util.CytoflowOpError( 'subset', "Subset string '{0}' isn't valid".format(subset)) from e if len(experiment) == 0: raise util.CytoflowOpError( 'subset', "Subset string '{0}' returned no events".format(subset)) if self.by: groupby = experiment.data.groupby(self.by) else: # use a lambda expression to return a group that contains # all the events groupby = experiment.data.groupby(lambda _: True) # get the scale. estimate the scale params for the ENTIRE data set, # not subsets we get from groupby(). 
And we need to save it so that # the data is transformed the same way when we apply() self._xscale = util.scale_factory(self.xscale, experiment, channel=self.xchannel) self._yscale = util.scale_factory(self.yscale, experiment, channel=self.ychannel) gmms = {} for group, data_subset in groupby: if len(data_subset) == 0: raise util.CytoflowOpError( None, "Group {} had no data".format(group)) x = data_subset.loc[:, [self.xchannel, self.ychannel]] x[self.xchannel] = self._xscale(x[self.xchannel]) x[self.ychannel] = self._yscale(x[self.ychannel]) # drop data that isn't in the scale range x = x[~(np.isnan(x[self.xchannel]) | np.isnan(x[self.ychannel]))] x = x.values gmm = mixture.GaussianMixture(n_components=self.num_components, covariance_type="full", random_state=1) gmm.fit(x) if not gmm.converged_: raise util.CytoflowOpError( None, "Estimator didn't converge" " for group {0}".format(group)) # in the 1D version, we sort the components by the means -- so # the first component has the lowest mean, the second component # has the next-lowest mean, etc. that doesn't work in a 2D area, # obviously. # instead, we assume that the clusters are likely (?) to be # arranged along *one* of the axes, so we take the |norm| of the # x,y mean of each cluster and sort that way. norms = (gmm.means_[:, 0]**2 + gmm.means_[:, 1]**2)**0.5 sort_idx = np.argsort(norms) gmm.means_ = gmm.means_[sort_idx] gmm.weights_ = gmm.weights_[sort_idx] gmm.covariances_ = gmm.covariances_[sort_idx] gmms[group] = gmm self._gmms = gmms def apply(self, experiment): """ Assigns new metadata to events using the mixture model estimated in :meth:`estimate`. Returns ------- Experiment A new :class:`.Experiment` with a column named :attr:`name` and optionally one named :attr:`name` ``_Posterior``. Also includes the following new statistics: - **xmean** : Float the mean of the fitted gaussian in the x dimension. - **ymean** : Float the mean of the fitted gaussian in the y dimension. 
- **proportion** : Float the proportion of events in each component of the mixture model. only set if :attr:`num_components` ``> 1``. PS -- if someone has good ideas for summarizing spread in a 2D (non-isotropic) Gaussian, or other useful statistics, let me know! """ warn( "GaussianMixture2DOp is DEPRECATED. Please use GaussianMixtureOp.", util.CytoflowOpWarning) if experiment is None: raise util.CytoflowOpError('experiment', "No experiment specified") if not self.xchannel: raise util.CytoflowOpError('xchannel', "Must set X channel") if not self.ychannel: raise util.CytoflowOpError('ychannel', "Must set Y channel") # make sure name got set! if not self.name: raise util.CytoflowOpError( 'name', "You have to set the gate's name " "before applying it!") if self.name in experiment.data.columns: raise util.CytoflowOpError( 'name', "Experiment already has a column named {0}".format(self.name)) if not self._gmms: raise util.CytoflowOpError( None, "No components found. Did you forget to " "call estimate()?") if not self._xscale: raise util.CytoflowOpError( None, "Couldn't find _xscale. What happened??") if not self._yscale: raise util.CytoflowOpError( None, "Couldn't find _yscale. 
What happened??") if self.xchannel not in experiment.data: raise util.CytoflowOpError( 'xchannel', "Column {0} not found in the experiment".format(self.xchannel)) if self.ychannel not in experiment.data: raise util.CytoflowOpError( 'ychannel', "Column {0} not found in the experiment".format(self.ychannel)) if self.posteriors: col_name = "{0}_Posterior".format(self.name) if col_name in experiment.data: raise util.CytoflowOpError( 'channels', "Column {0} already found in the experiment".format( col_name)) for b in self.by: if b not in experiment.data: raise util.CytoflowOpError( 'by', "Aggregation metadata {} not found, " "must be one of {}".format(b, experiment.conditions)) if self.sigma < 0.0: raise util.CytoflowOpError('sigma', "sigma must be >= 0.0") event_assignments = pd.Series([None] * len(experiment), dtype="object") if self.posteriors: event_posteriors = pd.Series([0.0] * len(experiment)) # what we DON'T want to do is iterate through event-by-event. # the more of this we can push into numpy, sklearn and pandas, # the faster it's going to be. for example, this is why # we don't use Ellipse.contains(). if self.by: groupby = experiment.data.groupby(self.by) else: # use a lambda expression to return a group that # contains all the events groupby = experiment.data.groupby(lambda _: True) for group, data_subset in groupby: if group not in self._gmms: # there weren't any events in this group, so we didn't get # a gmm. continue gmm = self._gmms[group] x = data_subset.loc[:, [self.xchannel, self.ychannel]] x[self.xchannel] = self._xscale(x[self.xchannel]) x[self.ychannel] = self._yscale(x[self.ychannel]) # which values are missing? 
x_na = np.isnan(x[self.xchannel]) | np.isnan(x[self.ychannel]) x_na = x_na.values x = x.values group_idx = groupby.groups[group] # make a preliminary assignment predicted = np.full(len(x), -1, "int") predicted[~x_na] = gmm.predict(x[~x_na]) # if we're doing sigma-based gating, for each component check # to see if the event is in the sigma gate. if self.sigma > 0.0: # make a quick dataframe with the value and the predicted # component gate_df = pd.DataFrame({ "x": x[:, 0], "y": x[:, 1], "p": predicted }) # for each component, get the ellipse that follows the isoline # around the mixture component # cf. http://scikit-learn.org/stable/auto_examples/mixture/plot_gmm.html # and http://www.mathworks.com/matlabcentral/newsreader/view_thread/298389 # and http://stackoverflow.com/questions/7946187/point-and-ellipse-rotated-position-test-algorithm # i am not proud of how many tries this took me to get right. for c in range(0, self.num_components): mean = gmm.means_[c] covar = gmm.covariances_[c] # xc is the center on the x axis # yc is the center on the y axis xc = mean[0] # @UnusedVariable yc = mean[1] # @UnusedVariable v, w = linalg.eigh(covar) u = w[0] / linalg.norm(w[0]) # xl is the length along the x axis # yl is the length along the y axis xl = np.sqrt(v[0]) * self.sigma # @UnusedVariable yl = np.sqrt(v[1]) * self.sigma # @UnusedVariable # t is the rotation in radians (counter-clockwise) t = 2 * np.pi - np.arctan(u[1] / u[0]) sin_t = np.sin(t) # @UnusedVariable cos_t = np.cos(t) # @UnusedVariable # and build an expression with numexpr so it evaluates fast! 
gate_bool = gate_df.eval( "p == @c and " "((x - @xc) * @cos_t - (y - @yc) * @sin_t) ** 2 / ((@xl / 2) ** 2) + " "((x - @xc) * @sin_t + (y - @yc) * @cos_t) ** 2 / ((@yl / 2) ** 2) <= 1" ).values predicted[np.logical_and(predicted == c, gate_bool == False)] = -1 predicted_str = pd.Series(["(none)"] * len(predicted)) for c in range(0, self.num_components): predicted_str[predicted == c] = "{0}_{1}".format( self.name, c + 1) predicted_str[predicted == -1] = "{0}_None".format(self.name) predicted_str.index = group_idx event_assignments.iloc[group_idx] = predicted_str if self.posteriors: probability = np.full((len(x), self.num_components), 0.0, "float") probability[~x_na, :] = gmm.predict_proba(x[~x_na, :]) posteriors = pd.Series([0.0] * len(predicted)) for c in range(0, self.num_components): posteriors[predicted == c] = probability[predicted == c, c] posteriors.index = group_idx event_posteriors.iloc[group_idx] = posteriors new_experiment = experiment.clone() if self.num_components == 1 and self.sigma > 0: new_experiment.add_condition( self.name, "bool", event_assignments == "{0}_1".format(self.name)) elif self.num_components > 1: new_experiment.add_condition(self.name, "category", event_assignments) if self.posteriors and self.num_components > 1: col_name = "{0}_Posterior".format(self.name) new_experiment.add_condition(col_name, "float", event_posteriors) # add the statistics levels = list(self.by) if self.num_components > 1: levels.append(self.name) if levels: idx = pd.MultiIndex.from_product( [new_experiment[x].unique() for x in levels], names=levels) xmean_stat = pd.Series(index=idx, dtype=np.dtype(object)).sort_index() ymean_stat = pd.Series(index=idx, dtype=np.dtype(object)).sort_index() prop_stat = pd.Series(index=idx, dtype=np.dtype(object)).sort_index() for group, _ in groupby: gmm = self._gmms[group] for c in range(self.num_components): if self.num_components > 1: component_name = "{}_{}".format(self.name, c + 1) if group is True: g = [component_name] elif 
isinstance(group, tuple): g = list(group) g.append(component_name) else: g = list([group]) g.append(component_name) if len(g) > 1: g = tuple(g) else: g = g[0] else: g = group xmean_stat.loc[g] = self._xscale.inverse(gmm.means_[c][0]) ymean_stat.loc[g] = self._yscale.inverse(gmm.means_[c][0]) prop_stat.loc[g] = gmm.weights_[c] new_experiment.statistics[(self.name, "xmean")] = pd.to_numeric(xmean_stat) new_experiment.statistics[(self.name, "ymean")] = pd.to_numeric(ymean_stat) if self.num_components > 1: new_experiment.statistics[( self.name, "proportion")] = pd.to_numeric(prop_stat) new_experiment.history.append( self.clone_traits(transient=lambda _: True)) return new_experiment def default_view(self, **kwargs): """ Returns a diagnostic plot of the Gaussian mixture model. Returns ------- IView : an IView, call :meth:`~GaussianMixture2DView.plot` to see the diagnostic plot. """ warn( "GaussianMixture1DOp is DEPRECATED. Please use GaussianMixtureOp.", util.CytoflowOpWarning) return GaussianMixture2DView(op=self, **kwargs)
class BeadCalibrationOp(HasStrictTraits):
    """
    Calibrate arbitrary channels to molecules-of-fluorophore using fluorescent
    beads (eg, the Spherotech RCP-30-5A rainbow beads.)

    To use, set the `beads_file` property to an FCS file containing the beads'
    events; specify which beads you ran by setting the `beads_type` property
    to match one of the values of BeadCalibrationOp.BEADS; and set the
    `units` dict to which channels you want calibrated and in which units.
    Then, call `estimate()` and check the peak-finding with
    `default_view().plot()`.

    If the peak-finding is wacky, try adjusting `bead_peak_quantile` and
    `bead_brightness_threshold`.  When the peaks are successfully identified,
    call apply() on your experimental data set.

    If you can't make the peak finding work, please submit a bug report!

    This procedure works best when the beads file is very clean data.  It does
    not do its own gating (maybe a future addition?)  In the meantime,
    I recommend gating the *acquisition* on the FSC/SSC channels in order
    to get rid of debris, cells, and other noise.

    Finally, because you can't have a negative number of fluorescent
    molecules (MEFLs, etc) (as well as for math reasons), this module filters
    out negative values.

    Attributes
    ----------
    name : Str
        The operation name (for UI representation.)

    units : Dict(Str, Str)
        A dictionary specifying the channels you want calibrated (keys) and
        the units you want them calibrated in (values).  The units must be
        keys of the `beads` attribute.

    beads_file : File
        A file containing the FCS events from the beads.  Must be set to use
        `estimate()`.  This isn't persisted by `pickle()`.

    beads : Dict(Str, List(Float))
        The beads' characteristics.  Keys are calibrated units (ie, MEFL or
        MEAP) and values are ordered lists of known fluorophore levels.  Common
        values for this dict are included in BeadCalibrationOp.BEADS.
        Must be set to use `estimate()`.

    bead_peak_quantile : Int
        The quantile threshold used to choose bead peaks.  Default == 80.
        Must be set to use `estimate()`.

    bead_brightness_threshold : Float
        How bright must a bead peak be to be considered?  Default == 100.
        Must be set to use `estimate()`.

    Notes
    -----
    The peak finding is rather sophisticated.

    For each channel, a 256-bin histogram is computed on the log-transformed
    bead data, and then the histogram is smoothed with a Savitzky-Golay
    filter (with a window length of 5 and a polynomial order of 1).

    Next, a wavelet-based peak-finding algorithm is used: it convolves the
    smoothed histogram with a series of wavelets and looks for relative
    maxima at various length-scales.  The parameters of the smoothing
    algorithm were arrived at empirically, using beads collected at a wide
    range of PMT voltages.

    Finally, the peaks are filtered by height (the histogram bin has a
    quantile greater than `bead_peak_quantile`) and intensity (brighter than
    `bead_brightness_threshold`).

    How to convert from a series of peaks to mean equivalent fluorochrome?
    If there's one peak, we assume that it's the brightest peak.  If there
    are two peaks, we assume they're the brightest two.  If there are n >= 3
    peaks, we check all the contiguous n-subsets of the bead intensities
    and find the one whose linear regression (in log space!) has the smallest
    norm (square-root sum-of-squared-residuals.)

    There's a slight subtlety in the fact that we're performing the linear
    regression in log-space: if the relationship in log10-space is Y=aX + b,
    then the same relationship in linear space is x = 10**X, y = 10**Y,
    and y = (10**b) * (x ** a).

    One more thing.  Because the beads are (log) evenly spaced across all
    the channels, we can directly compute the fluorophore equivalent in
    channels where we wouldn't usually measure that fluorophore: for example,
    you can compute MEFL (mean equivalent fluorescein) in the PE-Texas Red
    channel, because the bead peak pattern is the same in the PE-Texas Red
    channel as it would be in the FITC channel.

    Examples
    --------
    >>> bead_op = flow.BeadCalibrationOp()
    >>> bead_op.beads = flow.BeadCalibrationOp.BEADS["Spherotech RCP-30-5A Lot AA01-AA04, AB01, AB02, AC01, GAA01-R"]
    >>> bead_op.units = {"Pacific Blue-A" : "MEFL",
                         "FITC-A" : "MEFL",
                         "PE-Tx-Red-YG-A" : "MEFL"}
    >>>
    >>> bead_op.beads_file = "beads.fcs"
    >>> bead_op.estimate(ex3)
    >>>
    >>> bead_op.default_view().plot(ex3)
    >>> # check the plot!
    >>>
    >>> ex4 = bead_op.apply(ex3)
    """

    # traits
    id = Constant('edu.mit.synbio.cytoflow.operations.beads_calibrate')
    friendly_id = Constant("Bead Calibration")

    name = CStr()
    units = Dict(Str, Str)

    beads_file = File(transient=True)
    bead_peak_quantile = Int(80)

    bead_brightness_threshold = Float(100)
    # TODO - bead_brightness_threshold should probably be different depending
    # on the data range of the input.

    beads = Dict(Str, List(Float), transient=True)

    # channel name -> callable mapping raw values to calibrated units;
    # filled in by estimate() and consumed by apply()
    _calibration_functions = Dict(Str, Python)

    def estimate(self, experiment, subset=None):
        """
        Estimate the calibration functions from the beads file.

        For every channel named in `units`, locate the bead peaks in the
        data loaded from `beads_file` and relate them to the known bead
        intensities in `beads`.  The per-channel calibration functions are
        stored on the operation only after every channel has been processed,
        so a failed call leaves any previous calibration untouched and a
        successful one never keeps entries for channels that are no longer
        in `units`.

        Parameters
        ----------
        experiment : Experiment
            Supplies the channel names and each channel's ``range``
            metadata; it is also handed to ``parse_tube`` together with
            `beads_file`.

        subset : str (default = None)
            Not used by this method.

        Raises
        ------
        util.CytoflowOpError
            If no experiment or beads file is given, if `units` names
            channels missing from the experiment or units missing from
            `beads`, or if a channel yields no peaks or more peaks than
            there are known bead intensities for its unit.
        """
        if not experiment:
            raise util.CytoflowOpError("No experiment specified")

        if not set(self.units.keys()) <= set(experiment.channels):
            raise util.CytoflowOpError(
                "Specified channels that weren't found in "
                "the experiment.")

        # this also guarantees that every unit looked up below is a key
        # of self.beads
        if not set(self.units.values()) <= set(self.beads.keys()):
            raise util.CytoflowOpError("Units don't match beads.")

        if not self.beads_file:
            raise util.CytoflowOpError("No beads file specified")

        # presumably loads the bead events from the FCS file -- the helper
        # is defined elsewhere; verify against its definition
        beads_data = parse_tube(self.beads_file, experiment)

        calibration_functions = {}

        for channel, mef_unit in self.units.items():
            data = beads_data[channel]

            # TODO - this assumes the data is on a linear scale.  check it!

            # bin the data on a log scale
            data_range = experiment.metadata[channel]['range']
            hist_bins = np.logspace(1,
                                    math.log(data_range, 2),
                                    num=256,
                                    base=2)
            counts, edges = np.histogram(data, bins=hist_bins)

            # mask off-scale values
            counts[0] = 0
            counts[-1] = 0

            # smooth it with a Savitzky-Golay filter
            hist_smooth = scipy.signal.savgol_filter(counts, 5, 1)

            # find peaks
            peak_bins = scipy.signal.find_peaks_cwt(
                hist_smooth,
                widths=np.arange(3, 20),
                max_distances=np.arange(3, 20) / 2)

            # filter by height and intensity
            peak_threshold = np.percentile(hist_smooth,
                                           self.bead_peak_quantile)
            peaks = [hist_bins[i] for i in peak_bins
                     if hist_smooth[i] > peak_threshold
                     and edges[i] > self.bead_brightness_threshold]

            # "mean equivalent fluorochrome"
            mef = self.beads[mef_unit]

            if len(peaks) == 0:
                raise util.CytoflowOpError(
                    "Didn't find any peaks; check the diagnostic plot")
            elif len(peaks) > len(mef):
                # compare against the number of known intensities for THIS
                # unit, not the number of units in `beads`; otherwise the
                # regression below has no candidate subsets to try.
                raise util.CytoflowOpError(
                    "Found too many peaks; check the diagnostic plot")
            elif len(peaks) == 1:
                # if we only have one peak, assume it's the brightest peak
                a = mef[-1] / peaks[0]
                calibration_functions[channel] = lambda x, a=a: a * x
            elif len(peaks) == 2:
                # if we have only two peaks, assume they're the brightest two
                # NOTE(review): only the slope is kept here; there is no
                # intercept term -- confirm this is intended.
                a = (mef[-1] - mef[-2]) / (peaks[1] - peaks[0])
                calibration_functions[channel] = lambda x, a=a: a * x
            else:
                # if there are n > 2 peaks, check all the contiguous n-subsets
                # of mef for the one whose linear regression with the peaks
                # has the smallest (norm) sum-of-residuals.

                # do it in log10 space because otherwise the brightest peaks
                # have an outsized influence.
                n_peaks = len(peaks)
                log_peaks = np.log10(peaks)  # loop-invariant

                best_resid = np.inf
                for start in range(len(mef) - n_peaks + 1):
                    mef_subset = mef[start:start + n_peaks]

                    # linear regression of the peak locations against
                    # the mef subset
                    lr = np.polyfit(log_peaks,
                                    np.log10(mef_subset),
                                    deg=1,
                                    full=True)

                    resid = lr[1][0]
                    if resid < best_resid:
                        best_lr = lr[0]
                        best_resid = resid

                # remember, these (linear) coefficients came from logspace, so
                # if the relationship in log10 space is Y = aX + b, then in
                # linear space the relationship is x = 10**X, y = 10**Y,
                # and y = (10**b) * x ^ a

                # also remember that the result of np.polyfit is a list of
                # coefficients with the highest power first!  so if we
                # solve y=ax + b, coeff #0 is a and coeff #1 is b
                a = best_lr[0]
                b = 10 ** best_lr[1]
                calibration_functions[channel] = \
                    lambda x, a=a, b=b: b * np.power(x, a)

        self._calibration_functions = calibration_functions

    def apply(self, experiment):
        """
        Apply the bead calibration to an experiment.

        Events with a non-positive value in any calibrated channel are
        removed, then each channel in `units` is passed through the
        calibration function found by `estimate()`.

        Parameters
        ----------
        experiment : Experiment
            the experiment to which this op is applied

        Returns
        -------
        Experiment
            a new experiment calibrated in physical units.  For every
            calibrated channel the metadata gains ``bead_calibration_fn``
            and ``units`` entries, and ``range`` (when present) is
            calibrated as well.

        Raises
        ------
        util.CytoflowOpError
            If no experiment is given, `units` is empty, `estimate()` has
            not produced a calibration matching `units`, or `units` names
            channels that are not in the experiment.
        """
        if not experiment:
            raise util.CytoflowOpError("No experiment specified")

        channels = self.units.keys()

        if not self.units:
            raise util.CytoflowOpError("Units not specified.")

        if not self._calibration_functions:
            raise util.CytoflowOpError("Calibration not found. "
                                       "Did you forget to call estimate()?")

        if not set(channels) <= set(experiment.channels):
            raise util.CytoflowOpError(
                "Module units don't match experiment channels")

        if set(channels) != set(self._calibration_functions.keys()):
            raise util.CytoflowOpError("Calibration doesn't match units. "
                                       "Did you forget to call estimate()?")

        # two things.  first, you can't raise a negative value to a non-integer
        # power.  second, negative physical units don't make sense -- how can
        # you have the equivalent of -5 molecules of fluorescein?  so,
        # we filter out negative values here.
        new_experiment = experiment.clone()

        for channel in channels:
            new_experiment.data = \
                new_experiment.data[new_experiment.data[channel] > 0]

        new_experiment.data.reset_index(drop=True, inplace=True)

        for channel in channels:
            calibration_fn = self._calibration_functions[channel]

            new_experiment[channel] = calibration_fn(new_experiment[channel])
            new_experiment.metadata[channel]['bead_calibration_fn'] = \
                calibration_fn
            new_experiment.metadata[channel]['units'] = self.units[channel]
            if 'range' in experiment.metadata[channel]:
                new_experiment.metadata[channel]['range'] = calibration_fn(
                    experiment.metadata[channel]['range'])

        new_experiment.history.append(self.clone_traits())
        return new_experiment

    def default_view(self, **kwargs):
        """
        Returns a diagnostic plot to see if the bead peak finding is working.

        Returns
        -------
        IView : An IView, call plot() to see the diagnostic plots
        """
        return BeadCalibrationDiagnostic(op=self, **kwargs)

    BEADS = {
        # from http://www.spherotech.com/RCP-30-5a%20%20rev%20H%20ML%20071712.xls
        "Spherotech RCP-30-5A Lot AG01, AF02, AD04 and AAE01": {
            "MECSB": [216, 464, 1232, 2940, 7669, 19812, 35474],
            "MEBFP": [861, 1997, 5776, 15233, 45389, 152562, 396759],
            "MEFL": [792, 2079, 6588, 16471, 47497, 137049, 271647],
            "MEPE": [531, 1504, 4819, 12506, 36159, 109588, 250892],
            "MEPTR": [233, 669, 2179, 5929, 18219, 63944, 188785],
            "MECY": [1614, 4035, 12025, 31896, 95682, 353225, 1077421],
            "MEPCY7": [14916, 42336, 153840, 494263],
            "MEAP": [373, 1079, 3633, 9896, 28189, 79831, 151008],
            "MEAPCY7": [2864, 7644, 19081, 37258],
        },
        # from http://www.spherotech.com/RCP-30-5a%20%20rev%20G.2.xls
        "Spherotech RCP-30-5A Lot AA01-AA04, AB01, AB02, AC01, GAA01-R": {
            "MECSB": [179, 400, 993, 3203, 6083, 17777, 36331],
            "MEBFP": [700, 1705, 4262, 17546, 35669, 133387, 412089],
            "MEFL": [692, 2192, 6028, 17493, 35674, 126907, 290983],
            "MEPE": [505, 1777, 4974, 13118, 26757, 94930, 250470],
            "MEPTR": [207, 750, 2198, 6063, 12887, 51686, 170219],
            "MECY": [1437, 4693, 12901, 36837, 76621, 261671, 1069858],
            "MEPCY7": [32907, 107787, 503797],
            "MEAP": [587, 2433, 6720, 17962, 30866, 51704, 146080],
            "MEAPCY7": [718, 1920, 5133, 9324, 14210, 26735],
        },
    }
class Range2DOp(HasStrictTraits):
    """Apply a 2D range gate to a cytometry experiment.

    Attributes
    ----------
    name : Str
        The operation name.  Used to name the new metadata field in the
        experiment that's created by apply()

    xchannel : Str
        The name of the first channel to apply the range gate.

    xlow : Float
        The lowest value in xchannel to include in this gate.

    xhigh : Float
        The highest value in xchannel to include in this gate.

    ychannel : Str
        The name of the second channel to apply the range gate.

    ylow : Float
        The lowest value in ychannel to include in this gate.

    yhigh : Float
        The highest value in ychannel to include in this gate.

    Examples
    --------
    >>> range_2d = flow.Range2DOp(xchannel = "V2-A",
    ...                           xlow = 0.0,
    ...                           xhigh = 0.5,
    ...                           ychannel = "Y2-A",
    ...                           ylow = 0.4,
    ...                           yhigh = 0.8)
    >>> ex3 = range_2d.apply(ex2)

    Alternately, in an IPython notebook with `%matplotlib notebook`

    >>> rv = range_2d.default_view()
    >>> rv.plot(ex2)
    >>> ### draw a box on the plot in the notebook ###
    """

    # traits
    id = Constant('edu.mit.synbio.cytoflow.operations.range2d')
    friendly_id = Constant("2D Range")

    name = CStr()

    xchannel = Str()
    xlow = CFloat()
    xhigh = CFloat()

    ychannel = Str()
    ylow = CFloat()
    yhigh = CFloat()

    def apply(self, experiment):
        """Applies the 2D range gate to an experiment.

        Parameters
        ----------
        experiment : Experiment
            the old_experiment to which this op is applied

        Returns
        -------
        Experiment
            a new experiment, the same as old_experiment but with a new
            boolean column named after the operation.  The value is True
            if the event's measurement in `xchannel` lies between `xlow`
            and `xhigh` AND its measurement in `ychannel` lies between
            `ylow` and `yhigh` (bounds included); it is False otherwise.

        Raises
        ------
        util.CytoflowOpError
            If no experiment is given, the name or channels are unset or
            invalid, the experiment already has a column with this name,
            or either range lies entirely outside the channel's data.
        """
        if experiment is None:
            raise util.CytoflowOpError("No experiment specified")

        # make sure name got set!
        if not self.name:
            raise util.CytoflowOpError("You have to set the gate's name "
                                       "before applying it!")

        # make sure old_experiment doesn't already have a column named
        # self.name
        if self.name in experiment.data.columns:
            raise util.CytoflowOpError(
                "Experiment already contains a column {0}".format(self.name))

        if not self.xchannel or not self.ychannel:
            raise util.CytoflowOpError("Must specify xchannel and ychannel")

        if self.xchannel not in experiment.channels:
            raise util.CytoflowOpError("xchannel isn't in the experiment")

        if self.ychannel not in experiment.channels:
            raise util.CytoflowOpError("ychannel isn't in the experiment")

        x_data = experiment[self.xchannel]
        y_data = experiment[self.ychannel]

        if self.xhigh <= x_data.min():
            raise util.CytoflowOpError(
                "x channel range high must be > {0}".format(x_data.min()))

        # max() must be *called*: comparing against the bound method itself
        # raises a TypeError instead of checking the range.
        if self.xlow >= x_data.max():
            raise util.CytoflowOpError(
                "x channel range low must be < {0}".format(x_data.max()))

        if self.yhigh <= y_data.min():
            raise util.CytoflowOpError(
                "y channel range high must be > {0}".format(y_data.min()))

        if self.ylow >= y_data.max():
            raise util.CytoflowOpError(
                "y channel range low must be < {0}".format(y_data.max()))

        x = x_data.between(self.xlow, self.xhigh)
        y = y_data.between(self.ylow, self.yhigh)
        gate = pd.Series(x & y)

        new_experiment = experiment.clone()
        new_experiment.add_condition(self.name, "bool", gate)
        new_experiment.history.append(self.clone_traits())

        return new_experiment

    def default_view(self, **kwargs):
        """Return a `RangeSelection2D` view bound to this operation.

        Any keyword arguments are forwarded to the view's constructor.
        """
        return RangeSelection2D(op=self, **kwargs)
class PCAOp(HasStrictTraits):
    """
    Use principal components analysis (PCA) to decompose a multivariate data
    set into orthogonal components that explain a maximum amount of variance.

    Call :meth:`estimate` to compute the optimal decomposition.

    Calling :meth:`apply` creates new "channels" named ``{name}_1 ... {name}_n``,
    where ``name`` is the :attr:`name` attribute and ``n`` is
    :attr:`num_components`.

    The same decomposition may not be appropriate for different subsets of the
    data set.  If this is the case, you can use the :attr:`by` attribute to
    specify metadata by which to aggregate the data before estimating (and
    applying) a model.  The PCA parameters such as the number of components
    and the kernel are the same across each subset, though.

    Attributes
    ----------
    name : Str
        The operation name; determines the name of the new columns.

    channels : List(Str)
        The channels to apply the decomposition to.

    scale : Dict(Str : {"linear", "logicle", "log"})
        Re-scale the data in the specified channels before fitting.  If a
        channel is in :attr:`channels` but not in :attr:`scale`, the current
        package-wide default (set with :func:`.set_default_scale`) is used.

    num_components : Int (default = 2)
        How many components to fit to the data?  Must be a positive integer.

    by : List(Str)
        A list of metadata attributes to aggregate the data before estimating
        the model.  For example, if the experiment has two pieces of metadata,
        ``Time`` and ``Dox``, setting :attr:`by` to ``["Time", "Dox"]`` will
        fit the model separately to each subset of the data with a unique
        combination of ``Time`` and ``Dox``.

    whiten : Bool (default = False)
        Scale each component to unit variance?  May be useful if you will
        be using unsupervised clustering (such as K-means).

    Examples
    --------

    .. plot::
        :context: close-figs

        Make a little data set.

        >>> import cytoflow as flow
        >>> import_op = flow.ImportOp()
        >>> import_op.tubes = [flow.Tube(file = "Plate01/RFP_Well_A3.fcs",
        ...                              conditions = {'Dox' : 10.0}),
        ...                    flow.Tube(file = "Plate01/CFP_Well_A4.fcs",
        ...                              conditions = {'Dox' : 1.0})]
        >>> import_op.conditions = {'Dox' : 'float'}
        >>> ex = import_op.apply()

    Create and parameterize the operation.

    .. plot::
        :context: close-figs

        >>> pca = flow.PCAOp(name = 'PCA',
        ...                  channels = ['V2-A', 'V2-H', 'Y2-A', 'Y2-H'],
        ...                  scale = {'V2-A' : 'log',
        ...                           'V2-H' : 'log',
        ...                           'Y2-A' : 'log',
        ...                           'Y2-H' : 'log'},
        ...                  num_components = 2,
        ...                  by = ["Dox"])

    Estimate the decomposition

    .. plot::
        :context: close-figs

        >>> pca.estimate(ex)

    Apply the operation

    .. plot::
        :context: close-figs

        >>> ex2 = pca.apply(ex)

    Plot a scatterplot of the PCA.  Compare to a scatterplot of the underlying
    channels.

    .. plot::
        :context: close-figs

        >>> flow.ScatterplotView(xchannel = "V2-A",
        ...                      xscale = "log",
        ...                      ychannel = "Y2-A",
        ...                      yscale = "log",
        ...                      subset = "Dox == 1.0").plot(ex2)

        >>> flow.ScatterplotView(xchannel = "PCA_1",
        ...                      ychannel = "PCA_2",
        ...                      subset = "Dox == 1.0").plot(ex2)

    .. plot::
        :context: close-figs

        >>> flow.ScatterplotView(xchannel = "V2-A",
        ...                      xscale = "log",
        ...                      ychannel = "Y2-A",
        ...                      yscale = "log",
        ...                      subset = "Dox == 10.0").plot(ex2)

        >>> flow.ScatterplotView(xchannel = "PCA_1",
        ...                      ychannel = "PCA_2",
        ...                      subset = "Dox == 10.0").plot(ex2)
    """

    id = Constant('edu.mit.synbio.cytoflow.operations.pca')
    friendly_id = Constant("Principal Component Analysis")

    name = CStr()
    channels = List(Str)
    scale = Dict(Str, util.ScaleEnum)
    num_components = util.PositiveInt(2, allow_zero=False)
    whiten = Bool(False)
    by = List(Str)

    # group key (from the `by` groupby) -> fitted decomposition
    _pca = Dict(Any, Any, transient=True)
    # channel name -> scale instance fitted in estimate()
    _scale = Dict(Str, Instance(util.IScale), transient=True)

    def estimate(self, experiment, subset=None):
        """
        Estimate the decomposition

        Parameters
        ----------
        experiment : Experiment
            The :class:`.Experiment` to use to estimate the decomposition

        subset : str (default = None)
            A Python expression that specifies a subset of the data in
            ``experiment`` to use to parameterize the operation.

        Raises
        ------
        util.CytoflowOpError
            If the experiment is missing, the channel / scale / `by`
            settings don't match the experiment, there are more components
            than channels, the subset expression is invalid or selects no
            events, or a group has no data.
        """
        if experiment is None:
            raise util.CytoflowOpError('experiment',
                                       "No experiment specified")

        if len(self.channels) == 0:
            raise util.CytoflowOpError('channels',
                                       "Must set at least one channel")

        for c in self.channels:
            if c not in experiment.data:
                raise util.CytoflowOpError(
                    'channels',
                    "Channel {0} not found in the experiment".format(c))

        if self.num_components > len(self.channels):
            raise util.CytoflowOpError(
                'num_components',
                "Number of components must be less than "
                "or equal to number of channels.")

        for c in self.scale:
            if c not in self.channels:
                raise util.CytoflowOpError(
                    'scale',
                    "Scale set for channel {0}, but it isn't "
                    "in `channels`".format(c))

        for b in self.by:
            if b not in experiment.data:
                raise util.CytoflowOpError(
                    'by',
                    "Aggregation metadata {} not found, "
                    "must be one of {}".format(b, experiment.conditions))

        if subset:
            try:
                experiment = experiment.query(subset)
            except Exception as e:
                # narrow from a bare ``except:`` so KeyboardInterrupt and
                # SystemExit propagate, and keep the cause for debugging
                raise util.CytoflowOpError(
                    'subset',
                    "Subset string '{0}' isn't valid".format(subset)) from e

            if len(experiment) == 0:
                raise util.CytoflowOpError(
                    'subset',
                    "Subset string '{0}' returned no events".format(subset))

        if self.by:
            groupby = experiment.data.groupby(self.by)
        else:
            # use a lambda expression to return a group that contains
            # all the events
            groupby = experiment.data.groupby(lambda _: True)

        # get the scale. estimate the scale params for the ENTIRE data set,
        # not subsets we get from groupby().  And we need to save it so that
        # the data is transformed the same way when we apply()
        for c in self.channels:
            if c in self.scale:
                self._scale[c] = util.scale_factory(self.scale[c],
                                                    experiment,
                                                    channel=c)
            else:
                self._scale[c] = util.scale_factory(util.get_default_scale(),
                                                    experiment,
                                                    channel=c)

        for group, data_subset in groupby:
            if len(data_subset) == 0:
                raise util.CytoflowOpError(
                    'by',
                    "Group {} had no data".format(group))

            x = data_subset.loc[:, self.channels[:]]
            for c in self.channels:
                x[c] = self._scale[c](x[c])

            # drop data that isn't in the scale range
            for c in self.channels:
                x = x[~(np.isnan(x[c]))]
            x = x.values

            self._pca[group] = pca = \
                sklearn.decomposition.PCA(n_components=self.num_components,
                                          whiten=self.whiten,
                                          random_state=0)

            pca.fit(x)

    def apply(self, experiment):
        """
        Apply the PCA decomposition to the data.

        Parameters
        ----------
        experiment : Experiment
            The experiment to transform.  :meth:`estimate` must have been
            called first.

        Returns
        -------
        Experiment
            a new Experiment with additional :attr:`~Experiment.channels`
            named ``name_1 ... name_n``.  Rows of the new experiment's data
            that contain a NaN in any column are dropped.

        Raises
        ------
        util.CytoflowOpError
            If the experiment is missing, :meth:`estimate` hasn't been
            called, the name is unset, the channel / scale / `by` settings
            don't match the experiment, a new channel name already exists,
            or a group has no data.
        """
        if experiment is None:
            raise util.CytoflowOpError('experiment',
                                       "No experiment specified")

        if not self._pca:
            raise util.CytoflowOpError(
                None,
                "No PCA found.  Did you forget to call estimate()?")

        # make sure name got set!
        if not self.name:
            raise util.CytoflowOpError(
                'name',
                "You have to set the operation's name "
                "before applying it!")

        if len(self.channels) == 0:
            raise util.CytoflowOpError('channels',
                                       "Must set at least one channel")

        for c in self.channels:
            if c not in experiment.data:
                raise util.CytoflowOpError(
                    'channels',
                    "Channel {0} not found in the experiment".format(c))

        for c in self.scale:
            if c not in self.channels:
                # the check is against `channels`, so say so (same wording
                # as estimate())
                raise util.CytoflowOpError(
                    'scale',
                    "Scale set for channel {0}, but it isn't "
                    "in `channels`".format(c))

        for b in self.by:
            if b not in experiment.data:
                raise util.CytoflowOpError(
                    'by',
                    "Aggregation metadata {} not found, "
                    "must be one of {}".format(b, experiment.conditions))

        if self.by:
            groupby = experiment.data.groupby(self.by)
        else:
            # use a lambda expression to return a group that contains
            # all the events
            groupby = experiment.data.groupby(lambda _: True)

        new_experiment = experiment.clone()
        new_channels = []
        for i in range(self.num_components):
            cname = "{}_{}".format(self.name, i + 1)
            if cname in experiment.data:
                raise util.CytoflowOpError(
                    'name',
                    "Channel {} is already in the experiment".format(cname))

            # explicit float dtype: a data-less Series defaults to ``object``
            # on recent pandas, which would make the new channel non-numeric
            new_experiment.add_channel(
                cname,
                pd.Series(index=experiment.data.index, dtype="float64"))
            new_channels.append(cname)

        for group, data_subset in groupby:
            if len(data_subset) == 0:
                raise util.CytoflowOpError(
                    'by',
                    "Group {} had no data".format(group))

            x = data_subset.loc[:, self.channels[:]]
            for c in self.channels:
                x[c] = self._scale[c](x[c])

            # which values are missing?  an event is flagged if ANY of its
            # channels couldn't be scaled
            x_na = pd.Series([False] * len(x))
            for c in self.channels:
                x_na[np.isnan(x[c]).values] = True
            x_na = x_na.values

            # the transform can't take NaNs, so feed it zeros for the
            # flagged events and blank their results afterwards
            x[x_na] = 0

            group_idx = groupby.groups[group]

            pca = self._pca[group]
            x_tf = pca.transform(x)
            x_tf[x_na] = np.nan

            for ci, c in enumerate(new_channels):
                new_experiment.data.loc[group_idx, c] = x_tf[:, ci]

        # NOTE(review): this drops rows with a NaN in ANY column, not only
        # the new channels -- confirm that is intended.
        new_experiment.data.dropna(inplace=True)

        new_experiment.history.append(
            self.clone_traits(transient=lambda _: True))

        return new_experiment
class KMeansOp(HasStrictTraits):
    """
    Use a K-means clustering algorithm to cluster events.

    Call :meth:`estimate` to compute the cluster centroids.

    Calling :meth:`apply` creates a new categorical metadata variable
    named :attr:`name`, with possible values ``{name}_1`` .... ``name_n`` where
    ``n`` is the number of clusters, specified with :attr:`num_clusters`.
    Events that fall outside the range of one of the channels' scales are
    assigned to ``{name}_None``.

    The same model may not be appropriate for different subsets of the data set.
    If this is the case, you can use the :attr:`by` attribute to specify
    metadata by which to aggregate the data before estimating (and applying) a
    model.  The number of clusters is the same across each subset, though.

    Attributes
    ----------
    name : Str
        The operation name; determines the name of the new metadata column

    channels : List(Str)
        The channels to apply the clustering algorithm to.

    scale : Dict(Str : {"linear", "logicle", "log"})
        Re-scale the data in the specified channels before fitting.  If a
        channel is in :attr:`channels` but not in :attr:`scale`, the current
        package-wide default (set with :func:`.set_default_scale`) is used.

    num_clusters : Int (default = 2)
        How many clusters to fit to the data?  :meth:`estimate` requires
        a value of at least 2.

    by : List(Str)
        A list of metadata attributes to aggregate the data before estimating
        the model.  For example, if the experiment has two pieces of metadata,
        ``Time`` and ``Dox``, setting :attr:`by` to ``["Time", "Dox"]`` will
        fit the model separately to each subset of the data with a unique
        combination of ``Time`` and ``Dox``.

    Examples
    --------

    .. plot::
        :context: close-figs

        Make a little data set.

        >>> import cytoflow as flow
        >>> import_op = flow.ImportOp()
        >>> import_op.tubes = [flow.Tube(file = "Plate01/RFP_Well_A3.fcs",
        ...                              conditions = {'Dox' : 10.0}),
        ...                    flow.Tube(file = "Plate01/CFP_Well_A4.fcs",
        ...                              conditions = {'Dox' : 1.0})]
        >>> import_op.conditions = {'Dox' : 'float'}
        >>> ex = import_op.apply()

    Create and parameterize the operation.

    .. plot::
        :context: close-figs

        >>> km_op = flow.KMeansOp(name = 'KMeans',
        ...                       channels = ['V2-A', 'Y2-A'],
        ...                       scale = {'V2-A' : 'log',
        ...                                'Y2-A' : 'log'},
        ...                       num_clusters = 2)

    Estimate the clusters

    .. plot::
        :context: close-figs

        >>> km_op.estimate(ex)

    Plot a diagnostic view

    .. plot::
        :context: close-figs

        >>> km_op.default_view().plot(ex)

    Apply the gate

    .. plot::
        :context: close-figs

        >>> ex2 = km_op.apply(ex)

    Plot a diagnostic view with the event assignments

    .. plot::
        :context: close-figs

        >>> km_op.default_view().plot(ex2)
    """

    id = Constant('edu.mit.synbio.cytoflow.operations.kmeans')
    friendly_id = Constant("KMeans Clustering")

    name = CStr()
    channels = List(Str)
    scale = Dict(Str, util.ScaleEnum)
    # the class documentation promises a default of 2; declare it explicitly
    num_clusters = util.PositiveInt(2, allow_zero = False)
    by = List(Str)

    # fitted models, keyed by groupby key (a single value or a tuple)
    _kmeans = Dict(Any, Instance(sklearn.cluster.MiniBatchKMeans), transient = True)
    # the scale instances used in estimate(), re-used unchanged by apply()
    _scale = Dict(Str, Instance(util.IScale), transient = True)

    def estimate(self, experiment, subset = None):
        """
        Estimate the k-means clusters

        Parameters
        ----------
        experiment : Experiment
            The :class:`.Experiment` to use to estimate the k-means clusters

        subset : str (default = None)
            A Python expression that specifies a subset of the data in
            ``experiment`` to use to parameterize the operation.

        Raises
        ------
        CytoflowOpError
            If the experiment is missing, the operation's parameters are
            inconsistent with the experiment, the subset is invalid or empty,
            or one of the groups defined by :attr:`by` has no events.
        """

        if experiment is None:
            raise util.CytoflowOpError('experiment',
                                       "No experiment specified")

        if self.num_clusters < 2:
            raise util.CytoflowOpError('num_clusters',
                                       "num_clusters must be >= 2")

        if len(self.channels) == 0:
            raise util.CytoflowOpError('channels',
                                       "Must set at least one channel")

        for c in self.channels:
            if c not in experiment.data:
                raise util.CytoflowOpError('channels',
                                           "Channel {0} not found in the experiment"
                                           .format(c))

        for c in self.scale:
            if c not in self.channels:
                raise util.CytoflowOpError('scale',
                                           "Scale set for channel {0}, but it isn't "
                                           "in the experiment"
                                           .format(c))

        for b in self.by:
            if b not in experiment.data:
                raise util.CytoflowOpError('by',
                                           "Aggregation metadata {} not found, "
                                           "must be one of {}"
                                           .format(b, experiment.conditions))

        if subset:
            try:
                experiment = experiment.query(subset)
            except Exception:
                # a bare ``except:`` would also trap KeyboardInterrupt/SystemExit
                raise util.CytoflowOpError('subset',
                                           "Subset string '{0}' isn't valid"
                                           .format(subset))

            if len(experiment) == 0:
                raise util.CytoflowOpError('subset',
                                           "Subset string '{0}' returned no events"
                                           .format(subset))

        if self.by:
            groupby = experiment.data.groupby(self.by)
        else:
            # use a lambda expression to return a group that contains
            # all the events
            groupby = experiment.data.groupby(lambda _: True)

        # get the scale. estimate the scale params for the ENTIRE data set,
        # not subsets we get from groupby().  And we need to save it so that
        # the data is transformed the same way when we apply()
        for c in self.channels:
            if c in self.scale:
                self._scale[c] = util.scale_factory(self.scale[c], experiment, channel = c)
            else:
                self._scale[c] = util.scale_factory(util.get_default_scale(), experiment, channel = c)

        # build the models in a local dict and publish them in one assignment,
        # so a failed or repeated estimate() never leaves stale or partial
        # entries behind in self._kmeans
        kmeans = {}
        for group, data_subset in groupby:
            if len(data_subset) == 0:
                raise util.CytoflowOpError('by',
                                           "Group {} had no data"
                                           .format(group))
            x = data_subset.loc[:, self.channels[:]]
            for c in self.channels:
                x[c] = self._scale[c](x[c])

            # drop data that isn't in the scale range
            for c in self.channels:
                x = x[~(np.isnan(x[c]))]

            x = x.values

            kmeans[group] = k = \
                sklearn.cluster.MiniBatchKMeans(n_clusters = self.num_clusters,
                                                random_state = 0)

            k.fit(x)

        self._kmeans = kmeans

    def apply(self, experiment):
        """
        Apply the KMeans clustering to the data.

        Parameters
        ----------
        experiment : Experiment
            The experiment whose events are assigned to clusters.

        Returns
        -------
        Experiment
            a new Experiment with one additional :attr:`~Experiment.condition`
            named :attr:`name`, of type ``category``.  The new category has
            values  ``name_1, name_2, etc`` to indicate which k-means cluster
            an event is a member of (``name_None`` for events outside the
            range of a channel's scale).

            The new :class:`.Experiment` also has one new statistic called
            ``centers``: a :class:`pandas.Series` indexed by the :attr:`by`
            conditions, ``Cluster`` and ``Channel``, holding each centroid
            coordinate mapped back through the channel's inverse scale.

        Raises
        ------
        CytoflowOpError
            If the operation is not fully parameterized, :meth:`estimate`
            has not been called, or the experiment does not match the model.
        """

        if experiment is None:
            raise util.CytoflowOpError('experiment',
                                       "No experiment specified")

        # make sure name got set!
        if not self.name:
            raise util.CytoflowOpError('name',
                                       "You have to set the gate's name "
                                       "before applying it!")

        if self.name != util.sanitize_identifier(self.name):
            raise util.CytoflowOpError('name',
                                       "Name can only contain letters, numbers and underscores.")

        if self.name in experiment.data.columns:
            raise util.CytoflowOpError('name',
                                       "Experiment already has a column named {0}"
                                       .format(self.name))

        if not self._kmeans:
            raise util.CytoflowOpError(None,
                                       "No components found.  Did you forget to "
                                       "call estimate()?")

        if len(self.channels) == 0:
            raise util.CytoflowOpError('channels',
                                       "Must set at least one channel")

        for c in self.channels:
            if c not in experiment.data:
                raise util.CytoflowOpError('channels',
                                           "Channel {0} not found in the experiment"
                                           .format(c))

        # a channel added after estimate() has no stored scale; report that
        # clearly instead of failing with a KeyError further down
        for c in self.channels:
            if c not in self._scale:
                raise util.CytoflowOpError(None,
                                           "Model scale not set.  Did you forget "
                                           "to call estimate()?")

        for c in self.scale:
            if c not in self.channels:
                raise util.CytoflowOpError('scale',
                                           "Scale set for channel {0}, but it isn't "
                                           "in the experiment"
                                           .format(c))

        for b in self.by:
            if b not in experiment.data:
                raise util.CytoflowOpError('by',
                                           "Aggregation metadata {} not found, "
                                           "must be one of {}"
                                           .format(b, experiment.conditions))

        if self.by:
            groupby = experiment.data.groupby(self.by)
        else:
            # use a lambda expression to return a group that contains
            # all the events
            groupby = experiment.data.groupby(lambda _: True)

        event_assignments = pd.Series(["{}_None".format(self.name)] * len(experiment), dtype = "object")

        # make the statistics
        clusters = [x + 1 for x in range(self.num_clusters)]

        idx = pd.MultiIndex.from_product([experiment[x].unique() for x in self.by] + [clusters] + [self.channels],
                                         names = list(self.by) + ["Cluster"] + ["Channel"])
        centers_stat = pd.Series(index = idx, dtype = np.dtype(object)).sort_index()

        for group, data_subset in groupby:
            if len(data_subset) == 0:
                raise util.CytoflowOpError('by',
                                           "Group {} had no data"
                                           .format(group))

            if group not in self._kmeans:
                raise util.CytoflowOpError('by',
                                           "Group {} not found in the estimated model. "
                                           "Do you need to re-run estimate()?"
                                           .format(group))

            x = data_subset.loc[:, self.channels[:]]
            for c in self.channels:
                x[c] = self._scale[c](x[c])

            # which values are missing?
            x_na = pd.Series([False] * len(x))
            for c in self.channels:
                x_na[np.isnan(x[c]).values] = True

            x = x.values
            x_na = x_na.values
            group_idx = groupby.groups[group]

            kmeans = self._kmeans[group]

            # -1 marks events that could not be scaled and so get no cluster
            predicted = np.full(len(x), -1, "int")
            predicted[~x_na] = kmeans.predict(x[~x_na])

            predicted_str = pd.Series(["(none)"] * len(predicted))
            for c in range(0, self.num_clusters):
                predicted_str[predicted == c] = "{0}_{1}".format(self.name, c + 1)
            predicted_str[predicted == -1] = "{0}_None".format(self.name)
            predicted_str.index = group_idx

            event_assignments.iloc[group_idx] = predicted_str

            for c in range(self.num_clusters):
                # build the statistic's index key: (by values..., cluster)
                if len(self.by) == 0:
                    g = [c + 1]
                elif hasattr(group, '__iter__') and not isinstance(group, (str, bytes)):
                    g = tuple(list(group) + [c + 1])
                else:
                    g = tuple([group] + [c + 1])

                for cidx1, channel1 in enumerate(self.channels):
                    g2 = tuple(list(g) + [channel1])
                    centers_stat.loc[g2] = self._scale[channel1].inverse(kmeans.cluster_centers_[c, cidx1])

        new_experiment = experiment.clone()

        new_experiment.add_condition(self.name, "category", event_assignments)

        new_experiment.statistics[(self.name, "centers")] = pd.to_numeric(centers_stat)

        new_experiment.history.append(self.clone_traits(transient = lambda _: True))
        return new_experiment

    def default_view(self, **kwargs):
        """
        Returns a diagnostic plot of the k-means clustering.

        Parameters
        ----------
        channels : List(Str), optional
            The channels to plot (one or two); defaults to :attr:`channels`.

        scale : Dict(Str : Str), optional
            The plot scale for each channel; defaults to :attr:`scale`.
            Channels without an entry use the package-wide default scale.

        Returns
        -------
        IView : an IView, call :meth:`KMeans1DView.plot` to see the diagnostic plot.

        Raises
        ------
        CytoflowViewError
            If a requested channel or scale isn't part of the operation, or
            if zero or more than two channels are requested.
        """
        channels = kwargs.pop('channels', self.channels)

        # work on a copy: filling in defaults below must not modify the
        # operation's own ``scale`` trait (or a dict owned by the caller)
        scale = dict(kwargs.pop('scale', self.scale))

        for c in channels:
            if c not in self.channels:
                raise util.CytoflowViewError('channels',
                                             "Channel {} isn't in the operation's channels"
                                             .format(c))

        for s in scale:
            if s not in self.channels:
                raise util.CytoflowViewError('scale',
                                             "Channel {} isn't in the operation's channels"
                                             .format(s))

        for c in channels:
            if c not in scale:
                scale[c] = util.get_default_scale()

        if len(channels) == 0:
            raise util.CytoflowViewError('channels',
                                         "Must specify at least one channel for a default view")
        elif len(channels) == 1:
            v = KMeans1DView(op = self)
            v.trait_set(channel = channels[0],
                        scale = scale[channels[0]],
                        **kwargs)
            return v

        elif len(channels) == 2:
            v = KMeans2DView(op = self)
            v.trait_set(xchannel = channels[0],
                        ychannel = channels[1],
                        xscale = scale[channels[0]],
                        yscale = scale[channels[1]],
                        **kwargs)
            return v

        else:
            raise util.CytoflowViewError('channels',
                                         "Can't specify more than two channels for a default view")
class FlowView(HasTraits):
    """
    Overlay which draws a 2D vector (flow) field on top of a dsviewer display.

    The flow image is looked up by name (:attr:`flowImageName`) in the
    namespace of the viewer's active recipe. For the current z position its
    ``data[:, :, z, 0]`` and ``data[:, :, z, 1]`` planes are used as the x and
    y components of the vector drawn at each sampled pixel.

    On construction the instance registers :meth:`DrawOverlays` with the
    viewer's display options and :meth:`GenFlowPanel` with its pane hooks;
    :meth:`Unplug` removes both again.
    """
    showFlow = Bool(True)
    flowVectWidth = Int(3)
    flowVectSpacing = Int(3)
    flowVectScale = Float(10)
    flowVectArrowSize = Float(1)
    flowImageName = CStr('outFlow')
    flowMaskName = CStr('outFlowMask')
    flowVectThresh = Float(0)
    flowVectType = Enum('Arrows', 'Bicolour')

    def default_traits_view(self):
        """Build the settings view shown in the side panel."""
        from traitsui.api import View, Item, Group

        shown_traits = (
            'showFlow',
            'flowImageName',
            'flowVectWidth',
            'flowVectSpacing',
            'flowVectScale',
            'flowVectArrowSize',
            'flowVectThresh',
            'flowVectType',
        )
        return View(*[Item(trait_name) for trait_name in shown_traits])

    # convenience proxies for the viewer's display options and view
    @property
    def _do(self):
        return self._dsviewer.do

    @property
    def _view(self):
        return self._dsviewer.view

    def __init__(self, dsviewer):
        HasTraits.__init__(self)

        # hold the viewer through a weak proxy only
        self._dsviewer = weakref.proxy(dsviewer)

        # pen colours: index 0 is opaque red, index 1 is opaque blue
        self._penColsA = [wx.Colour(255, 0, 0, 255), wx.Colour(0, 0, 255, 255)]
        self.CreatePens()

        dsviewer.do.overlays.append(self.DrawOverlays)
        dsviewer.paneHooks.append(self.GenFlowPanel)

    def Unplug(self):
        """Remove the overlay and panel hooks installed by ``__init__``."""
        self._dsviewer.do.overlays.remove(self.DrawOverlays)
        self._dsviewer.paneHooks.remove(self.GenFlowPanel)

    @on_trait_change('flowVectWidth')
    def CreatePens(self):
        """(Re)create one pen per colour using the current line width."""
        self._vecPens = [wx.Pen(colour, self.flowVectWidth) for colour in self._penColsA]

    def GenFlowPanel(self, _pnl):
        """Add a pinned "Flow Visualization" pane holding this object's traits editor."""
        pane = afp.foldingPane(_pnl, -1, caption="Flow Visualization", pinned=True)

        editor = self.edit_traits(parent=pane, kind='panel')
        pane.AddNewElement(editor.control)

        _pnl.AddPane(pane)

    @property
    def flowImage(self):
        """The flow image from the active recipe namespace, or ``None`` if the name is absent."""
        try:
            namespace = self._dsviewer.recipes.activeRecipe.namespace
            return namespace[self.flowImageName]
        except KeyError:
            return None

    def DrawOverlays_(self, view, dc):
        """
        Draw the flow field one vector at a time (per-pixel loop version).

        Unlike :meth:`DrawOverlays` this does not apply
        :attr:`flowVectThresh`, and it is not the method registered as an
        overlay in ``__init__``.
        """
        flow = self.flowImage
        if flow is None or not self.showFlow:
            return

        (x0, x1), (y0, y1), _ = view._calcVisibleBounds()
        z = self._do.zp

        u_plane = flow.data[:, :, z, 0].squeeze()
        v_plane = flow.data[:, :, z, 1].squeeze()

        dc.SetBrush(wx.TRANSPARENT_BRUSH)
        dc.SetPen(self._vecPens[0])

        step = int(self.flowVectSpacing)
        scale = float(self.flowVectScale)
        head_size = float(self.flowVectArrowSize)

        mode = self.flowVectType
        if mode not in ('Arrows', 'Bicolour'):
            return

        to_screen = view._PixelToScreenCoordinates
        x_positions = np.arange(x0, min(x1, u_plane.shape[0]), step, dtype='i')
        y_positions = np.arange(y0, min(y1, v_plane.shape[1]), step, dtype='i')

        for px in x_positions:
            for py in y_positions:
                u = u_plane[px, py]
                v = v_plane[px, py]

                if mode == 'Arrows':
                    tip_x, tip_y = px + scale * u, py + scale * v

                    sx, sy = to_screen(px, py)
                    stx, sty = to_screen(tip_x, tip_y)
                    dc.DrawLine(sx, sy, stx, sty)

                    # arrowhead: step back along the unit flow direction and
                    # half a unit to either side along the unit normal
                    mag = np.sqrt(u * u + v * v)
                    tip = np.array([tip_x, tip_y])
                    along = np.array([u / mag, v / mag])
                    across = np.array([-v / mag, u / mag])

                    barb_a = tip + head_size * (.5 * across - along)
                    barb_b = tip + head_size * (-.5 * across - along)

                    sax, say = to_screen(*barb_a)
                    sbx, sby = to_screen(*barb_b)
                    dc.DrawLine(stx, sty, sax, say)
                    dc.DrawLine(stx, sty, sbx, sby)
                else:
                    # 'Bicolour': first half in pen 1, second half in pen 0
                    mid_x, mid_y = px + 0.5 * scale * u, py + 0.5 * scale * v
                    end_x, end_y = px + scale * u, py + scale * v

                    sx, sy = to_screen(px, py)
                    smx, smy = to_screen(mid_x, mid_y)
                    sex, sey = to_screen(end_x, end_y)

                    dc.SetPen(self._vecPens[1])
                    dc.DrawLine(sx, sy, smx, smy)
                    dc.SetPen(self._vecPens[0])
                    dc.DrawLine(smx, smy, sex, sey)

    def DrawOverlays(self, view, dc):
        """
        Draw the flow field for the visible region (vectorised version).

        Samples the flow every :attr:`flowVectSpacing` pixels, discards
        vectors whose magnitude is not greater than :attr:`flowVectThresh`,
        and draws the rest either as arrows or as two-colour line segments
        depending on :attr:`flowVectType`.

        Parameters
        ----------
        view : the view being painted; supplies ``_calcVisibleBounds`` and
            ``_PixelToScreenCoordinates``.
        dc : the wx device context to draw into.
        """
        flow = self.flowImage
        if flow is None or not self.showFlow:
            return

        (x0, x1), (y0, y1), _ = view._calcVisibleBounds()
        z = self._do.zp

        u_plane = flow.data[:, :, z, 0].squeeze()
        v_plane = flow.data[:, :, z, 1].squeeze()

        dc.SetBrush(wx.TRANSPARENT_BRUSH)
        dc.SetPen(self._vecPens[0])

        step = int(self.flowVectSpacing)
        scale = float(self.flowVectScale)
        head_size = float(self.flowVectArrowSize)

        u = u_plane[x0:x1:step, y0:y1:step].ravel()
        v = v_plane[x0:x1:step, y0:y1:step].ravel()

        # flow magnitude
        mag = np.sqrt(u * u + v * v)

        grid_x, grid_y = np.mgrid[x0:min(x1, u_plane.shape[0]):step,
                                  y0:min(y1, v_plane.shape[1]):step]
        px = grid_x.ravel()
        py = grid_y.ravel()

        # keep only the vectors longer than the cutoff
        keep = mag > self.flowVectThresh
        u = u[keep]
        v = v[keep]
        px = px[keep]
        py = py[keep]
        mag = mag[keep]

        end_x, end_y = px + scale * u, py + scale * v
        mid_x, mid_y = px + 0.5 * scale * u, py + 0.5 * scale * v

        to_screen = view._PixelToScreenCoordinates
        sx, sy = to_screen(px, py)
        smx, smy = to_screen(mid_x, mid_y)
        sex, sey = to_screen(end_x, end_y)

        if self.flowVectType == 'Arrows':
            dc.DrawLineList(np.array([sx, sy, sex, sey]).T)

            # arrowheads: step back along the unit flow direction and half a
            # unit to either side along the unit normal
            tip = np.array([end_x, end_y])
            along = np.array([u / mag, v / mag])
            across = np.array([-v / mag, u / mag])

            barb_a = tip + head_size * (.5 * across - along)
            barb_b = tip + head_size * (-.5 * across - along)

            sax, say = to_screen(*barb_a)
            sbx, sby = to_screen(*barb_b)

            dc.DrawLineList(np.array([sex, sey, sax, say]).T)
            dc.DrawLineList(np.array([sex, sey, sbx, sby]).T)
        else:
            # two-colour segments: first half in pen 1, second half in pen 0
            dc.SetPen(self._vecPens[1])
            dc.DrawLineList(np.array([sx, sy, smx, smy]).T)
            dc.SetPen(self._vecPens[0])
            dc.DrawLineList(np.array([smx, smy, sex, sey]).T)
class GaussianMixtureOp(HasStrictTraits):
    """
    This module fits a Gaussian mixture model with a specified number of
    components to one or more channels.

    If :attr:`num_components` ``> 1``, :meth:`apply` creates a new categorical
    metadata variable named ``name``, with possible values ``{name}_1`` ....
    ``name_n`` where ``n`` is the number of components.  An event is assigned to
    ``name_i`` category if it has the highest posterior probability of having been
    produced by component ``i``.  If an event has a value that is outside the
    range of one of the channels' scales, then it is assigned to ``{name}_None``.

    Optionally, if :attr:`sigma` is greater than 0, :meth:`apply` creates new
    ``boolean`` metadata variables named ``{name}_1`` ... ``{name}_n`` where
    ``n`` is the number of components.  The column ``{name}_i`` is ``True`` if
    the event is less than :attr:`sigma` standard deviations from the mean of
    component ``i``.  If :attr:`num_components` is ``1``, :attr:`sigma` must be
    greater than 0.

    Optionally, if :attr:`posteriors` is ``True``, :meth:`apply` creates new
    ``double`` metadata variables named ``{name}_1_posterior`` ...
    ``{name}_n_posterior`` where ``n`` is the number of components.  The column
    ``{name}_i_posterior`` contains the posterior probability that this event is
    a member of component ``i``.

    Finally, the same mixture model (mean and standard deviation) may not
    be appropriate for every subset of the data.  If this is the case, you
    can use the :attr:`by` attribute to specify metadata by which to aggregate
    the data before estimating (and applying) a mixture model.  The number of
    components must be the same across each subset, though.

    Attributes
    ----------
    name : Str
        The operation name; determines the name of the new metadata column

    channels : List(Str)
        The channels to apply the mixture model to.

    scale : Dict(Str : {"linear", "logicle", "log"})
        Re-scale the data in the specified channels before fitting.  If a
        channel is in :attr:`channels` but not in :attr:`scale`, the current
        package-wide default (set with :func:`~.set_default_scale`) is used.

    num_components : Int (default = 1)
        How many components to fit to the data?  Must be a positive integer.

    sigma : Float (default = 0.0)
        How many standard deviations on either side of the mean to include
        in the boolean variable ``{name}_i``?  Must be ``>= 0.0``.  If
        :attr:`num_components` is ``1``, must be ``> 0``.

    by : List(Str)
        A list of metadata attributes to aggregate the data before estimating
        the model.  For example, if the experiment has two pieces of metadata,
        ``Time`` and ``Dox``, setting :attr:`by` to ``["Time", "Dox"]`` will
        fit the model separately to each subset of the data with a unique
        combination of ``Time`` and ``Dox``.

    posteriors : Bool (default = False)
        If ``True``, add columns named ``{name}_{i}_posterior`` giving the
        posterior probability that the event is in component ``i``.  Useful for
        filtering out low-probability events.

    Notes
    -----

    We use the Mahalanobis distance as a multivariate generalization of the
    number of standard deviations an event is from the mean of the multivariate
    gaussian.  If :math:`\\vec{x}` is an observation from a distribution with
    mean :math:`\\vec{\\mu}` and :math:`S` is the covariance matrix, then the
    Mahalanobis distance is :math:`\\sqrt{(x - \\mu)^T \\cdot S^{-1} \\cdot (x - \\mu)}`.

    Examples
    --------

    .. plot::
        :context: close-figs

        Make a little data set.

        >>> import cytoflow as flow
        >>> import_op = flow.ImportOp()
        >>> import_op.tubes = [flow.Tube(file = "Plate01/RFP_Well_A3.fcs",
        ...                              conditions = {'Dox' : 10.0}),
        ...                    flow.Tube(file = "Plate01/CFP_Well_A4.fcs",
        ...                              conditions = {'Dox' : 1.0})]
        >>> import_op.conditions = {'Dox' : 'float'}
        >>> ex = import_op.apply()

    Create and parameterize the operation.

    .. plot::
        :context: close-figs

        >>> gm_op = flow.GaussianMixtureOp(name = 'Gauss',
        ...                                channels = ['Y2-A'],
        ...                                scale = {'Y2-A' : 'log'},
        ...                                num_components = 2)

    Estimate the clusters

    .. plot::
        :context: close-figs

        >>> gm_op.estimate(ex)

    Plot a diagnostic view

    .. plot::
        :context: close-figs

        >>> gm_op.default_view().plot(ex)

    Apply the gate

    .. plot::
        :context: close-figs

        >>> ex2 = gm_op.apply(ex)

    Plot a diagnostic view with the event assignments

    .. plot::
        :context: close-figs

        >>> gm_op.default_view().plot(ex2)

    And with two channels:

    .. plot::
        :context: close-figs

        >>> gm_op = flow.GaussianMixtureOp(name = 'Gauss',
        ...                                channels = ['V2-A', 'Y2-A'],
        ...                                scale = {'V2-A' : 'log',
        ...                                         'Y2-A' : 'log'},
        ...                                num_components = 2)
        >>> gm_op.estimate(ex)
        >>> ex2 = gm_op.apply(ex)
        >>> gm_op.default_view().plot(ex2)
    """

    id = Constant('edu.mit.synbio.cytoflow.operations.gaussian')
    friendly_id = Constant("Gaussian Mixture Model")

    name = CStr()
    channels = List(Str)
    scale = Dict(Str, util.ScaleEnum)
    num_components = util.PositiveInt(1, allow_zero=False)
    sigma = util.PositiveFloat(allow_zero=True)
    by = List(Str)

    posteriors = Bool(False)

    # the key is either a single value or a tuple
    _gmms = Dict(Any, Instance(sklearn.mixture.GaussianMixture), transient=True)
    # the scale instances used in estimate(), re-used unchanged by apply()
    _scale = Dict(Str, Instance(util.IScale), transient=True)

    def estimate(self, experiment, subset=None):
        """
        Estimate the Gaussian mixture model parameters

        Parameters
        ----------
        experiment : Experiment
            The data to use to estimate the mixture parameters

        subset : str (default = None)
            If set, a Python expression to determine the subset of the data
            to use to in the estimation.

        Raises
        ------
        CytoflowOpError
            If the experiment is missing, the operation's parameters are
            inconsistent with the experiment, a group has no data, or the
            estimator fails to converge for a group.
        CytoflowViewError
            If ``subset`` is not a valid expression or selects no events.
        """

        if experiment is None:
            raise util.CytoflowOpError('experiment', "No experiment specified")

        if len(self.channels) == 0:
            raise util.CytoflowOpError('channels',
                                       "Must set at least one channel")

        for c in self.channels:
            if c not in experiment.data:
                raise util.CytoflowOpError(
                    'channels',
                    "Channel {0} not found in the experiment".format(c))

        for c in self.scale:
            if c not in self.channels:
                raise util.CytoflowOpError(
                    'channels', "Scale set for channel {0}, but it isn't "
                    "in the experiment".format(c))

        for b in self.by:
            if b not in experiment.data:
                raise util.CytoflowOpError(
                    'by', "Aggregation metadata {} not found, "
                    "must be one of {}".format(b, experiment.conditions))

        if subset:
            try:
                experiment = experiment.query(subset)
            except Exception:
                # a bare ``except:`` would also trap KeyboardInterrupt/SystemExit
                raise util.CytoflowViewError(
                    'subset',
                    "Subset string '{0}' isn't valid".format(subset))

            if len(experiment) == 0:
                raise util.CytoflowViewError(
                    'subset',
                    "Subset string '{0}' returned no events".format(subset))

        if self.by:
            groupby = experiment.data.groupby(self.by)
        else:
            # use a lambda expression to return a group that contains
            # all the events
            groupby = experiment.data.groupby(lambda _: True)

        # get the scale. estimate the scale params for the ENTIRE data set,
        # not subsets we get from groupby().  And we need to save it so that
        # the data is transformed the same way when we apply()
        for c in self.channels:
            if c in self.scale:
                self._scale[c] = util.scale_factory(self.scale[c],
                                                    experiment,
                                                    channel=c)
            else:
                self._scale[c] = util.scale_factory(util.get_default_scale(),
                                                    experiment,
                                                    channel=c)

        gmms = {}

        for group, data_subset in groupby:
            if len(data_subset) == 0:
                raise util.CytoflowOpError(
                    None, "Group {} had no data".format(group))
            x = data_subset.loc[:, self.channels[:]]
            for c in self.channels:
                x[c] = self._scale[c](x[c])

            # drop data that isn't in the scale range
            for c in self.channels:
                x = x[~(np.isnan(x[c]))]
            x = x.values

            gmm = sklearn.mixture.GaussianMixture(
                n_components=self.num_components,
                covariance_type="full",
                random_state=1)
            gmm.fit(x)

            if not gmm.converged_:
                raise util.CytoflowOpError(
                    None, "Estimator didn't converge"
                    " for group {0}".format(group))

            # in the 1D version, we sorted the components by the means -- so
            # the first component has the lowest mean, the second component
            # has the next-lowest mean, etc.

            # that doesn't work in the general case.  instead, we assume that
            # the clusters are likely (?) to be arranged along *one* of the
            # axes, so we take the |norm| of the mean of each cluster and
            # sort that way.

            norms = np.sum(gmm.means_**2, axis=1)**0.5
            sort_idx = np.argsort(norms)
            gmm.means_ = gmm.means_[sort_idx]
            gmm.weights_ = gmm.weights_[sort_idx]
            gmm.covariances_ = gmm.covariances_[sort_idx]
            gmm.precisions_ = gmm.precisions_[sort_idx]
            gmm.precisions_cholesky_ = gmm.precisions_cholesky_[sort_idx]

            gmms[group] = gmm

        # publish all the models in a single assignment
        self._gmms = gmms

    def apply(self, experiment):
        """
        Assigns new metadata to events using the mixture model estimated
        in :meth:`estimate`.

        Parameters
        ----------
        experiment : Experiment
            The experiment whose events are to be classified.

        Returns
        -------
        Experiment
            A new :class:`.Experiment` with the new condition variables as
            described in the class documentation.  Also adds the following
            new statistics:

            - **mean** : Float
                the mean of the fitted gaussian in each channel for each component.

            - **sigma** : Float
                the standard deviation of the fitted gaussian in each channel
                for each component, passed through the channel's inverse scale.

            - **interval** : (Float, Float)
                the locations of the mean +/- one standard deviation in each
                channel for each component.

            - **correlation** : Float
                the correlation coefficient between each pair of channels for each
                component.  only added if there is more than one channel.

            - **proportion** : Float
                the proportion of events in each component of the mixture model.  only
                added if :attr:`num_components` ``> 1``.

        Raises
        ------
        CytoflowOpError
            If the operation is not fully parameterized, :meth:`estimate`
            has not been called, or the experiment does not match the model.
        """

        if experiment is None:
            raise util.CytoflowOpError('experiment', "No experiment specified")

        if len(self.channels) == 0:
            raise util.CytoflowOpError('channels',
                                       "Must set at least one channel")

        # make sure name got set!
        if not self.name:
            raise util.CytoflowOpError(
                'name', "You have to set the gate's name "
                "before applying it!")

        if self.name != util.sanitize_identifier(self.name):
            raise util.CytoflowOpError(
                'name',
                "Name can only contain letters, numbers and underscores.")

        if self.num_components > 1 and self.name in experiment.data.columns:
            raise util.CytoflowOpError(
                'name',
                "Experiment already has a column named {0}".format(self.name))

        if self.sigma > 0:
            for i in range(1, self.num_components + 1):
                cname = "{}_{}".format(self.name, i)
                if cname in experiment.data.columns:
                    raise util.CytoflowOpError(
                        'name',
                        "Experiment already has a column named {}".format(
                            cname))

        if self.posteriors:
            for i in range(1, self.num_components + 1):
                cname = "{}_{}_posterior".format(self.name, i)
                if cname in experiment.data.columns:
                    raise util.CytoflowOpError(
                        'name',
                        "Experiment already has a column named {}".format(
                            cname))

        if not self._gmms:
            raise util.CytoflowOpError(
                None, "No components found.  Did you forget to "
                "call estimate()?")

        for c in self.channels:
            if c not in self._scale:
                raise util.CytoflowOpError(
                    None, "Model scale not set.  Did you forget "
                    "to call estimate()?")

        for c in self.channels:
            if c not in experiment.channels:
                raise util.CytoflowOpError(
                    'channels',
                    "Channel {0} not found in the experiment".format(c))

        for b in self.by:
            if b not in experiment.conditions:
                raise util.CytoflowOpError(
                    'by', "Aggregation metadata {} not found, "
                    "must be one of {}".format(b, experiment.conditions))
#
#         if self.num_components == 1 and self.sigma == 0.0:
#             raise util.CytoflowOpError('sigma',
#                                        "if num_components is 1, sigma must be > 0.0")

        if self.num_components == 1 and self.posteriors:
            warn("If num_components == 1, all posteriors will be 1",
                 util.CytoflowOpWarning)
#             raise util.CytoflowOpError('posteriors',
#                                        "If num_components == 1, all posteriors will be 1.")

        if self.num_components > 1:
            event_assignments = pd.Series(["{}_None".format(self.name)] *
                                          len(experiment),
                                          dtype="object")

        if self.sigma > 0:
            event_gate = {
                i: pd.Series([False] * len(experiment), dtype="double")
                for i in range(self.num_components)
            }

        if self.posteriors:
            event_posteriors = {
                i: pd.Series([0.0] * len(experiment), dtype="double")
                for i in range(self.num_components)
            }

        if self.by:
            groupby = experiment.data.groupby(self.by)
        else:
            # use a lambda expression to return a group that
            # contains all the events
            groupby = experiment.data.groupby(lambda _: True)

        # make the statistics
        components = [x + 1 for x in range(self.num_components)]

        prop_idx = pd.MultiIndex.from_product(
            [experiment[x].unique() for x in self.by] + [components],
            names=list(self.by) + ["Component"])
        prop_stat = pd.Series(name="{} : {}".format(self.name, "proportion"),
                              index=prop_idx,
                              dtype=np.dtype(object)).sort_index()

        mean_idx = pd.MultiIndex.from_product(
            [experiment[x].unique()
             for x in self.by] + [components] + [self.channels],
            names=list(self.by) + ["Component"] + ["Channel"])
        mean_stat = pd.Series(name="{} : {}".format(self.name, "mean"),
                              index=mean_idx,
                              dtype=np.dtype(object)).sort_index()
        sigma_stat = pd.Series(name="{} : {}".format(self.name, "sigma"),
                               index=mean_idx,
                               dtype=np.dtype(object)).sort_index()
        interval_stat = pd.Series(name="{} : {}".format(self.name, "interval"),
                                  index=mean_idx,
                                  dtype=np.dtype(object)).sort_index()

        corr_idx = pd.MultiIndex.from_product(
            [experiment[x].unique() for x in self.by] + [components] +
            [self.channels] + [self.channels],
            names=list(self.by) + ["Component"] + ["Channel_1"] +
            ["Channel_2"])
        corr_stat = pd.Series(name="{} : {}".format(self.name, "correlation"),
                              index=corr_idx,
                              dtype=np.dtype(object)).sort_index()

        def mahalanobis_sq(row, mu, inv_cov):
            # squared Mahalanobis distance of one event from a component mean
            return np.dot(np.dot((row - mu).T, inv_cov), (row - mu))

        for group, data_subset in groupby:
            if group not in self._gmms:
                # there weren't any events in this group, so we didn't get
                # a gmm.
                continue

            gmm = self._gmms[group]
            x = data_subset.loc[:, self.channels[:]]
            for c in self.channels:
                x[c] = self._scale[c](x[c])

            # which values are missing?
            x_na = pd.Series([False] * len(x))
            for c in self.channels:
                x_na[np.isnan(x[c]).values] = True

            x = x.values
            x_na = x_na.values
            group_idx = groupby.groups[group]

            if self.num_components > 1:
                # -1 marks events that could not be scaled
                predicted = np.full(len(x), -1, "int")
                predicted[~x_na] = gmm.predict(x[~x_na])

                predicted_str = pd.Series(["(none)"] * len(predicted))
                for c in range(0, self.num_components):
                    predicted_str[predicted == c] = "{0}_{1}".format(
                        self.name, c + 1)
                predicted_str[predicted == -1] = "{0}_None".format(self.name)
                predicted_str.index = group_idx

                event_assignments.iloc[group_idx] = predicted_str

            # if we're doing sigma-based gating, for each component check
            # to see if the event is in the sigma gate.
            if self.sigma > 0.0:
                # come up with a threshold based on sigma.  you'll note we
                # don't sqrt the distance: that's because for a multivariate
                # Gaussian, the square of the Mahalanobis distance is
                # chi-square distributed.  the threshold depends only on
                # sigma, so compute it once for all components.
                p = (scipy.stats.norm.cdf(self.sigma) - 0.5) * 2
                thresh = scipy.stats.chi2.ppf(p, 1)

                for c in range(self.num_components):
                    inv_cov = np.linalg.pinv(gmm.covariances_[c])
                    mu = gmm.means_[c]

                    # compute the (squared) Mahalanobis distance per event
                    dist = np.apply_along_axis(mahalanobis_sq, 1, x, mu, inv_cov)

                    event_gate[c].iloc[group_idx] = np.less_equal(dist, thresh)

            if self.posteriors:
                post = np.full((len(x), self.num_components), 0.0)
                post[~x_na] = gmm.predict_proba(x[~x_na])
                for c in range(self.num_components):
                    event_posteriors[c].iloc[group_idx] = post[:, c]

            for c in range(self.num_components):
                # build the statistic's index key: (by values..., component)
                if len(self.by) == 0:
                    g = [c + 1]
                elif hasattr(group, '__iter__') and not isinstance(
                        group, (str, bytes)):
                    g = tuple(list(group) + [c + 1])
                else:
                    g = tuple([group] + [c + 1])

                prop_stat.loc[g] = gmm.weights_[c]

                # the decomposition depends only on the component, so do it
                # once here rather than once per channel
                s, corr = util.cov2corr(gmm.covariances_[c])

                for cidx1, channel1 in enumerate(self.channels):
                    g2 = tuple(list(g) + [channel1])
                    mean_stat.loc[g2] = self._scale[channel1].inverse(
                        gmm.means_[c, cidx1])

                    sigma_stat[g2] = (self._scale[channel1].inverse(s[cidx1]))
                    interval_stat.loc[g2] = (
                        self._scale[channel1].inverse(gmm.means_[c, cidx1] -
                                                      s[cidx1]),
                        self._scale[channel1].inverse(gmm.means_[c, cidx1] +
                                                      s[cidx1]))

                    for cidx2, channel2 in enumerate(self.channels):
                        g3 = tuple(list(g2) + [channel2])
                        corr_stat[g3] = corr[cidx1, cidx2]

                    # a channel's correlation with itself isn't informative
                    corr_stat.drop(tuple(list(g2) + [channel1]), inplace=True)

        new_experiment = experiment.clone()

        if self.num_components > 1:
            new_experiment.add_condition(self.name, "category",
                                         event_assignments)

        if self.sigma > 0:
            for c in range(self.num_components):
                gate_name = "{}_{}".format(self.name, c + 1)
                new_experiment.add_condition(gate_name, "bool", event_gate[c])

        if self.posteriors:
            for c in range(self.num_components):
                post_name = "{}_{}_posterior".format(self.name, c + 1)
                new_experiment.add_condition(post_name, "double",
                                             event_posteriors[c])

        new_experiment.statistics[(self.name,
                                   "mean")] = pd.to_numeric(mean_stat)
        new_experiment.statistics[(self.name, "sigma")] = sigma_stat
        new_experiment.statistics[(self.name, "interval")] = interval_stat
        if len(corr_stat) > 0:
            new_experiment.statistics[(
                self.name, "correlation")] = pd.to_numeric(corr_stat)
        if self.num_components > 1:
            new_experiment.statistics[(
                self.name, "proportion")] = pd.to_numeric(prop_stat)

        new_experiment.history.append(
            self.clone_traits(transient=lambda _: True))
        return new_experiment

    def default_view(self, **kwargs):
        """
        Returns a diagnostic plot of the Gaussian mixture model.

        Parameters
        ----------
        channels : List(Str), optional
            The channels to plot (one or two); defaults to :attr:`channels`.

        scale : Dict(Str : Str), optional
            The plot scale for each channel; defaults to :attr:`scale`.
            Channels without an entry use the package-wide default scale.

        Returns
        -------
        IView : an IView, call plot() to see the diagnostic plot.

        Raises
        ------
        CytoflowViewError
            If a requested channel or scale isn't part of the operation, or
            if zero or more than two channels are requested.
        """
        channels = kwargs.pop('channels', self.channels)

        # work on a copy: filling in defaults below must not modify the
        # operation's own ``scale`` trait (or a dict owned by the caller)
        scale = dict(kwargs.pop('scale', self.scale))

        for c in channels:
            if c not in self.channels:
                raise util.CytoflowViewError(
                    'channels',
                    "Channel {} isn't in the operation's channels".format(c))

        for s in scale:
            if s not in self.channels:
                raise util.CytoflowViewError(
                    'scale',
                    "Channel {} isn't in the operation's channels".format(s))

        for c in channels:
            if c not in scale:
                scale[c] = util.get_default_scale()

        if len(channels) == 0:
            raise util.CytoflowViewError(
                'channels',
                "Must specify at least one channel for a default view")
        elif len(channels) == 1:
            v = GaussianMixture1DView(op=self)
            v.trait_set(channel=channels[0],
                        scale=scale[channels[0]],
                        **kwargs)
            return v

        elif len(channels) == 2:
            v = GaussianMixture2DView(op=self)
            v.trait_set(xchannel=channels[0],
                        ychannel=channels[1],
                        xscale=scale[channels[0]],
                        yscale=scale[channels[1]],
                        **kwargs)
            return v

        else:
            raise util.CytoflowViewError(
                'channels',
                "Can't specify more than two channels for a default view")
class RangeOp(HasStrictTraits):
    """Apply a range gate to a cytometry experiment.

    Attributes
    ----------
    name : Str
        The operation name.  Used to name the new metadata field in the
        experiment that's created by apply()

    channel : Str
        The name of the channel to apply the range gate.

    low : Float
        The lowest value to include in this gate.

    high : Float
        The highest value to include in this gate.

    Examples
    --------
    >>> range = flow.RangeOp()
    >>> range.name = "Y2-A+"
    >>> range.channel = 'Y2-A'
    >>> range.low = 0.3
    >>> range.high = 0.8
    >>>
    >>> ex3 = range.apply(ex2)

    Alternately (in an IPython notebook with `%matplotlib notebook`)

    >>> r = RangeOp(name = 'Y2-A+',
    ...             channel = 'Y2-A')
    >>> rv = r.default_view()
    >>> rv.interactive = True
    >>> rv.plot(ex2)
    >>> ### draw a range on the plot ###
    >>> ex3 = r.apply(ex2)
    """

    # traits
    id = Constant('edu.mit.synbio.cytoflow.operations.range')
    friendly_id = Constant('Range')

    name = CStr()
    channel = Str()
    low = CFloat()
    high = CFloat()

    def apply(self, experiment):
        """Applies the range gate to an experiment.

        Parameters
        ----------
        experiment : Experiment
            the old_experiment to which this op is applied

        Returns
        -------
        a new experiment, the same as old_experiment but with a new
        ``bool`` column named after the operation.  The value is the result
        of ``between(self.low, self.high)`` on the channel's data, so events
        exactly at ``low`` or ``high`` are included (assuming the channel
        data is a pandas Series with default ``between`` semantics).

        Raises
        ------
        util.CytoflowOpError
            if the experiment, name or channel is missing or invalid, if
            ``high <= low``, or if the range cannot overlap the channel's
            data at all.
        """
        if not experiment:
            raise util.CytoflowOpError("No experiment specified")

        # make sure name got set!
        if not self.name:
            raise util.CytoflowOpError("You have to set the gate's name "
                                       "before applying it!")

        if self.name in experiment.data.columns:
            raise util.CytoflowOpError(
                "Experiment already has a column named {0}".format(self.name))

        if not self.channel:
            raise util.CytoflowOpError("Channel not specified")

        if self.channel not in experiment.channels:
            raise util.CytoflowOpError(
                "Channel {0} not in the experiment".format(self.channel))

        if self.high <= self.low:
            raise util.CytoflowOpError("range high must be > range low")

        channel_data = experiment[self.channel]
        channel_min = channel_data.min()
        # NOTE: max() must be *called*; comparing against the bound method
        # either raises TypeError or compares nonsense.
        channel_max = channel_data.max()

        if self.high <= channel_min:
            raise util.CytoflowOpError(
                "range high must be > {0}".format(channel_min))

        if self.low >= channel_max:
            raise util.CytoflowOpError(
                "range low must be < {0}".format(channel_max))

        gate = channel_data.between(self.low, self.high)

        new_experiment = experiment.clone()
        new_experiment.add_condition(self.name, "bool", gate)
        new_experiment.history.append(self.clone_traits())

        return new_experiment

    def default_view(self, **kwargs):
        """Return a ``RangeSelection`` view bound to this operation.

        Any keyword arguments are forwarded to the view's constructor.
        """
        return RangeSelection(op=self, **kwargs)
class Range2DOp(HasStrictTraits):
    """
    Apply a 2D range gate to a cytometry experiment.

    Attributes
    ----------
    name : Str
        The operation name.  Used to name the new metadata field in the
        experiment that's created by :meth:`apply`

    xchannel : Str
        The name of the first channel to apply the range gate.

    xlow : Float
        The lowest value in xchannel to include in this gate.

    xhigh : Float
        The highest value in xchannel to include in this gate.

    ychannel : Str
        The name of the second channel to apply the range gate.

    ylow : Float
        The lowest value in ychannel to include in this gate.

    yhigh : Float
        The highest value in ychannel to include in this gate.

    Examples
    --------

    .. plot::
        :context: close-figs

        Make a little data set.

        >>> import cytoflow as flow
        >>> import_op = flow.ImportOp()
        >>> import_op.tubes = [flow.Tube(file = "Plate01/RFP_Well_A3.fcs",
        ...                              conditions = {'Dox' : 10.0}),
        ...                    flow.Tube(file = "Plate01/CFP_Well_A4.fcs",
        ...                              conditions = {'Dox' : 1.0})]
        >>> import_op.conditions = {'Dox' : 'float'}
        >>> ex = import_op.apply()

    Create and parameterize the operation.

    .. plot::
        :context: close-figs

        >>> r = flow.Range2DOp(name = "Range2D",
        ...                    xchannel = "V2-A",
        ...                    xlow = 10,
        ...                    xhigh = 1000,
        ...                    ychannel = "Y2-A",
        ...                    ylow = 1000,
        ...                    yhigh = 20000)

    Show the default view.

    .. plot::
        :context: close-figs

        >>> r.default_view(huefacet = "Dox",
        ...                xscale = 'log',
        ...                yscale = 'log').plot(ex)

    Apply the gate, and show the result

    .. plot::
        :context: close-figs

        >>> ex2 = r.apply(ex)
        >>> ex2.data.groupby('Range2D').size()
        Range2D
        False    16405
        True      3595
        dtype: int64
    """

    # traits
    id = Constant('edu.mit.synbio.cytoflow.operations.range2d')
    friendly_id = Constant("2D Range")

    name = CStr()

    xchannel = Str()
    xlow = CFloat()
    xhigh = CFloat()

    ychannel = Str()
    ylow = CFloat()
    yhigh = CFloat()

    def apply(self, experiment):
        """Applies the 2D range gate to an experiment.

        Parameters
        ----------
        experiment : Experiment
            the old_experiment to which this op is applied

        Returns
        -------
        Experiment
            a new :class:`~Experiment`, the same as the old experiment but
            with a new column with a data type of ``bool`` and the same name
            as the operation :attr:`name`.  The bool is ``True`` if the
            event's measurement in :attr:`xchannel` is between :attr:`xlow`
            and :attr:`xhigh`, and the event's measurement in
            :attr:`ychannel` is between :attr:`ylow` and :attr:`yhigh`
            (both computed with ``between()``); it is ``False`` otherwise.

        Raises
        ------
        util.CytoflowOpError
            if the experiment, name or either channel is missing or invalid,
            or if a range cannot overlap its channel's data at all.
        """
        if experiment is None:
            raise util.CytoflowOpError('experiment', "No experiment specified")

        # make sure name got set!
        if not self.name:
            raise util.CytoflowOpError('name',
                                       "You have to set the gate's name "
                                       "before applying it!")

        # make sure old_experiment doesn't already have a column named
        # self.name
        if self.name in experiment.data.columns:
            raise util.CytoflowOpError('name',
                                       "Experiment already contains a column {0}"
                                       .format(self.name))

        # Validate each channel on its own so that the error is reported
        # against the trait that is actually wrong.
        if not self.xchannel:
            raise util.CytoflowOpError('xchannel', "Must specify xchannel")

        if self.xchannel not in experiment.channels:
            raise util.CytoflowOpError('xchannel',
                                       "xchannel isn't in the experiment")

        if not self.ychannel:
            raise util.CytoflowOpError('ychannel', "Must specify ychannel")

        if self.ychannel not in experiment.channels:
            raise util.CytoflowOpError('ychannel',
                                       "ychannel isn't in the experiment")

        if self.xhigh <= experiment[self.xchannel].min():
            raise util.CytoflowOpError('xhigh',
                                       "x channel range high must be > {0}"
                                       .format(experiment[self.xchannel].min()))

        if self.xlow >= experiment[self.xchannel].max():
            raise util.CytoflowOpError('xlow',
                                       "x channel range low must be < {0}"
                                       .format(experiment[self.xchannel].max()))

        if self.yhigh <= experiment[self.ychannel].min():
            raise util.CytoflowOpError('yhigh',
                                       "y channel range high must be > {0}"
                                       .format(experiment[self.ychannel].min()))

        if self.ylow >= experiment[self.ychannel].max():
            raise util.CytoflowOpError('ylow',
                                       "y channel range low must be < {0}"
                                       .format(experiment[self.ychannel].max()))

        x = experiment[self.xchannel].between(self.xlow, self.xhigh)
        y = experiment[self.ychannel].between(self.ylow, self.yhigh)
        gate = pd.Series(x & y)

        new_experiment = experiment.clone()
        new_experiment.add_condition(self.name, "bool", gate)
        new_experiment.history.append(
            self.clone_traits(transient=lambda t: True))

        return new_experiment

    def default_view(self, **kwargs):
        """Return a ``RangeSelection2D`` view bound to this operation.

        Any keyword arguments are forwarded to the view's constructor.
        """
        return RangeSelection2D(op=self, **kwargs)
class ParticleTracker(HasTraits):
    """GUI plugin that links localisations into tracks and draws them.

    The object registers itself with a ``dsviewer``: it appends
    :meth:`DrawOverlays` to the viewer's overlay list and
    :meth:`GenTrackingPanel` to its pane hooks.  Tracking itself is delegated
    to ``tracking.Tracker``, which is created lazily the first time
    :meth:`OnTrack` runs and is discarded whenever ``features`` changes.
    """

    # Comma separated feature names; each may be written 'weight*name'.
    features = CStr('x, y')
    # The next three values are copied verbatim onto the Tracker
    # (pNew, r0, linkageCuttoffProb respectively).
    pNew = Float(0.2)
    r0 = Float(500)
    pLinkCutoff = Float(0.2)

    # Overlay toggles.
    showTracks = Bool(True)
    showCandidates = Bool(True)

    # Pen widths used for candidate links, the chosen link and tracks.
    candLineWidth = Int(4)
    chosenLineWidth = Int(5)
    trackLineWidth = Int(2)

    traits_view = View(
        Group(Item(name='features'),
              Item(name='pNew'),
              Item(name='r0'),
              Item(name='pLinkCutoff')),
        Group(Item(name='showTracks'),
              Item(name='showCandidates')))

    def __init__(self, dsviewer):
        """Attach the tracker to ``dsviewer`` and register its GUI hooks."""
        HasTraits.__init__(self)
        self.dsviewer = dsviewer
        self.view = dsviewer.view
        self.do = dsviewer.do
        self.image = dsviewer.image

        # Created on demand in OnTrack().
        self.tracker = None

        # 16 hues around the hsv colour wheel; the second set is
        # semi-transparent and is used for the track pens.
        self.penCols = [
            wx.Colour(*pylab.cm.hsv(v, bytes=True))
            for v in np.linspace(0, 1, 16)
        ]
        self.penColsA = [
            wx.Colour(*pylab.cm.hsv(v, alpha=0.5, bytes=True))
            for v in np.linspace(0, 1, 16)
        ]
        self.CreatePens()

        dsviewer.do.overlays.append(self.DrawOverlays)
        dsviewer.paneHooks.append(self.GenTrackingPanel)

    @on_trait_change('candLineWidth, chosenLineWidth, trackLineWidth')
    def CreatePens(self):
        """(Re)build the pen lists from the current line-width traits."""
        self.candPens = [
            wx.Pen(c, self.candLineWidth, wx.DOT) for c in self.penCols
        ]
        self.chosenPens = [
            wx.Pen(c, self.chosenLineWidth) for c in self.penCols
        ]
        self.trackPens = [
            wx.Pen(c, self.trackLineWidth) for c in self.penColsA
        ]

    def GenTrackingPanel(self, _pnl):
        """Add a 'Particle Tracking' folding pane to ``_pnl``.

        The pane holds the traits editor for this object and a 'Track'
        button bound to :meth:`OnTrack`.
        """
        item = afp.foldingPane(_pnl, -1, caption="Particle Tracking",
                               pinned=True)

        pan = self.edit_traits(parent=item, kind='panel')
        item.AddNewElement(pan.control)

        bTrack = wx.Button(item, -1, 'Track')
        bTrack.Bind(wx.EVT_BUTTON, self.OnTrack)
        item.AddNewElement(bTrack)

        _pnl.AddPane(item)

    @on_trait_change('pNew, r0, pLinkCutoff')
    def OnParamChange(self):
        """Push changed linking parameters onto an existing tracker."""
        if self.tracker is not None:
            self.tracker.pNew = self.pNew
            self.tracker.r0 = self.r0
            self.tracker.linkageCuttoffProb = self.pLinkCutoff

    @on_trait_change('features')
    def OnFeaturesChanged(self):
        """Drop the tracker so it is rebuilt with the new feature set."""
        self.tracker = None

    def OnTrack(self, event):
        """Button handler: run the tracker and publish the clump columns.

        Builds the tracker if needed, links every frame to its predecessor,
        then stores ``clumpIndex`` and a per-event clump size on the
        pipeline's selected data source.
        """
        pipeline = self.dsviewer.pipeline

        if self.tracker is None:
            featNames = [s.strip() for s in self.features.split(',')]

            def _calcWeights(s):
                # 'w*name' -> (float(w), name); plain 'name' -> (1.0, name)
                fw = s.split('*')
                if len(fw) == 2:
                    return float(fw[0]), fw[1]
                else:
                    return 1.0, s

            weightedFeats = [_calcWeights(s) for s in featNames]
            feats = np.vstack([w * pipeline[fn] for w, fn in weightedFeats])

            self.tracker = tracking.Tracker(pipeline['t'], feats)
            self.tracker.pNew = self.pNew
            self.tracker.r0 = self.r0
            self.tracker.linkageCuttoffProb = self.pLinkCutoff

        for i in range(1, self.dsviewer.image.data.shape[2]):
            L = self.tracker.calcLinkages(i, i - 1)
            self.tracker.updateTrack(i, L)

        pipeline.selectedDataSource.clumps = self.tracker.clumpIndex
        pipeline.selectedDataSource.setMapping('clumpIndex', 'clumps')

        # For every event, the number of events sharing its clump index.
        clumpSizes = np.zeros_like(self.tracker.clumpIndex)
        for i in set(self.tracker.clumpIndex):
            ind = (self.tracker.clumpIndex == i)
            clumpSizes[ind] = ind.sum()

        pipeline.selectedDataSource.clumpSizes = clumpSizes
        pipeline.selectedDataSource.setMapping('clumpSize', 'clumpSizes')

    def DrawOverlays(self, view, dc):
        """Overlay callback: draw tracks and/or candidate links onto ``dc``.

        Does nothing until a tracker exists.
        """
        if self.showTracks and self.tracker is not None:
            t = self.dsviewer.pipeline['t']
            x = self.dsviewer.pipeline['x'] / self.image.voxelsize[0]
            y = self.dsviewer.pipeline['y'] / self.image.voxelsize[1]

            xb, yb, zb = view._calcVisibleBounds()

            # Events inside the visible x/y bounds and within 5 of the
            # visible z (time) bounds.
            IFoc = ((x >= xb[0]) * (y >= yb[0]) * (t >= (zb[0] - 5)) *
                    (x < xb[1]) * (y < yb[1]) * (t < (zb[1] + 5)))

            tFoc = list(set(self.tracker.clumpIndex[IFoc]))

            dc.SetBrush(wx.TRANSPARENT_BRUSH)

            for tN in tFoc:
                IFoc = (self.tracker.clumpIndex == tN)
                # A spline needs at least two points.
                if IFoc.sum() >= 2:
                    pFoc = np.vstack(
                        view._PixelToScreenCoordinates3D(
                            x[IFoc], y[IFoc], t[IFoc])).T

                    dc.SetPen(self.trackPens[tN % 16])
                    dc.DrawSpline(pFoc)

        if self.showCandidates and self.tracker is not None:
            if view.do.zp >= 1:
                iCurr = view.do.zp
                iPrev = view.do.zp - 1
                links = self.tracker.calcLinkages(iCurr, iPrev)

                pRedDash = wx.Pen(wx.TheColourDatabase.FindColour('RED'), 2,
                                  wx.SHORT_DASH)
                dc.SetPen(pRedDash)
                dc.SetFont(
                    wx.Font(12, wx.FONTFAMILY_DEFAULT, wx.FONTSTYLE_NORMAL,
                            wx.FONTWEIGHT_BOLD))
                dc.SetTextForeground(
                    wx.TheColourDatabase.FindColour('YELLOW'))

                for curFrameIndex, linkInfo in links.items():
                    inds = self.tracker.indicesByT[iCurr]
                    i = inds[curFrameIndex]

                    x1 = self.dsviewer.pipeline['x'][i] / self.image.voxelsize[0]
                    y1 = self.dsviewer.pipeline['y'][i] / self.image.voxelsize[1]

                    x1s, y1s = view._PixelToScreenCoordinates(x1, y1)

                    linkSrcs, linkPs = linkInfo
                    n = 0
                    for ls, lp in zip(linkSrcs, linkPs):
                        # The first link is drawn with the 'chosen' pen, the
                        # remainder with the dotted candidate pens.
                        # NOTE(review): for ls == -1 this indexes
                        # clumpIndex[-1] (the last event) to pick a colour -
                        # confirm that is intended.
                        if n == 0:
                            dc.SetPen(
                                self.chosenPens[self.tracker.clumpIndex[ls] % 16])
                        else:
                            dc.SetPen(
                                self.candPens[self.tracker.clumpIndex[ls] % 16])

                        if ls == -1:
                            # new object
                            x0 = x1
                            y0 = y1 - 10
                        else:
                            x0 = (self.dsviewer.pipeline['x'][ls] /
                                  self.image.voxelsize[0])
                            y0 = (self.dsviewer.pipeline['y'][ls] /
                                  self.image.voxelsize[1])

                        x0s, y0s = view._PixelToScreenCoordinates(x0, y0)
                        dc.DrawLine(x0s, y0s, x1s, y1s)

                        if ls == -1:
                            dc.DrawText('N', x0s, y0s + 1)

                        dc.DrawText('%1.1f' % lp,
                                    (x0s + x1s) / 2 + 2,
                                    (y0s + y1s) / 2 + 2)
                        n += 1
class BinningOp(HasStrictTraits):
    """
    Bin data along an axis.

    This operation creates equally spaced bins (in linear or log space)
    along an axis and adds a metadata column assigning each event to a bin.

    Attributes
    ----------
    name : Str
        The operation name.  Used to name the new metadata field in the
        experiment that's created by apply()

    channel : Str
        The name of the channel along which to bin.

    scale : Enum("linear", "log", "logicle")
        Make the bins equidistant along what scale?

    num_bins : Int
        The number of bins to make.  Must set either `num_bins` or
        `bin_width`.  If both are defined, `num_bins` takes precedence.

    bin_width : Float
        The width of the bins.  Must set either `num_bins` or `bin_width`.
        If `scale` is `log`, `bin_width` is in log-10 units; if `scale` is
        `logicle`, an error is raised because the units are ill-defined.
        If both `num_bins` and `bin_width` are defined, `num_bins` takes
        precedence.

    bin_count_name : Str
        If `bin_count_name` is set, add another piece of metadata when
        calling `apply()` that contains the number of events in the bin
        that this event falls in.  Useful for filtering bins by # of events.

    Examples
    --------
    >>> bin_op = flow.BinningOp(name = "CFP_Bin",
    ...                         channel = "PE-Tx-Red-YG-A",
    ...                         scale = "linear",
    ...                         num_bins = 40)
    >>> ex5_binned = bin_op.apply(ex5)

    >>> h.huefacet = "CFP_Bin"
    >>> h.plot(ex5_binned)
    """

    # traits
    id = Constant('edu.mit.synbio.cytoflow.operations.binning')
    friendly_id = Constant("Binning")

    name = CStr()
    bin_count_name = CStr()
    channel = Str()
    num_bins = util.PositiveInt(Undefined)
    bin_width = util.PositiveFloat(Undefined)
    scale = util.ScaleEnum

    def apply(self, experiment):
        """Applies the binning to an experiment.

        Parameters
        ----------
        experiment : Experiment
            the old_experiment to which this op is applied

        Returns
        -------
        a new experiment, the same as old_experiment but with a new column
        named after the operation holding each event's bin number (from
        ``np.digitize`` on the scaled channel data).  Events whose scaled
        value is NaN get NaN instead of a bin number.  If `bin_count_name`
        is set, a second column with that name holds the number of events
        in each event's bin.

        Raises
        ------
        util.CytoflowOpError
            if the experiment, name or channel is missing or invalid, if a
            new column name already exists, if neither `num_bins` nor
            `bin_width` is set, or if `bin_width` is used with a scale other
            than linear or log.
        """
        if not experiment:
            raise util.CytoflowOpError("no experiment specified")

        if not self.name:
            raise util.CytoflowOpError("name is not set")

        if self.name in experiment.data.columns:
            raise util.CytoflowOpError("name {0} is in the experiment already"
                                       .format(self.name))

        if self.bin_count_name and \
                self.bin_count_name in experiment.data.columns:
            raise util.CytoflowOpError(
                "bin_count_name {0} is in the experiment already"
                .format(self.bin_count_name))

        if not self.channel:
            raise util.CytoflowOpError("channel is not set")

        if self.channel not in experiment.data.columns:
            raise util.CytoflowOpError("channel {0} isn't in the experiment"
                                       .format(self.channel))

        if self.num_bins is Undefined and self.bin_width is Undefined:
            raise util.CytoflowOpError("must set either bin number or width")

        if self.num_bins is Undefined \
                and not (self.scale == "linear" or self.scale == "log"):
            raise util.CytoflowOpError(
                "Can only use bin_width with linear or log scale")

        scale = util.scale_factory(self.scale, experiment, self.channel)
        scaled_data = scale(experiment.data[self.channel])

        channel_min = bn.nanmin(scaled_data)
        channel_max = bn.nanmax(scaled_data)

        if self.num_bins is not Undefined:
            num_bins = self.num_bins
        else:
            # np.linspace requires an integer count; truncate the same way
            # older NumPy releases did implicitly.
            num_bins = int((channel_max - channel_min) / self.bin_width)

        bins = np.linspace(start=channel_min, stop=channel_max, num=num_bins)

        # bins need to be internal; drop the first and last one
        bins = bins[1:-1]

        new_experiment = experiment.clone()
        new_experiment.add_condition(self.name,
                                     "int",
                                     np.digitize(scaled_data, bins))

        # if we're log-scaled (for example), don't label data that isn't
        # showable on a log scale!
        new_experiment.data.loc[np.isnan(scaled_data), self.name] = np.nan

        # keep track of the bins we used, for pretty plotting later.
        new_experiment.metadata[self.name]["bin_scale"] = self.scale
        new_experiment.metadata[self.name]["bins"] = bins

        if self.bin_count_name:
            # TODO - this is a HUGE memory hog?!
            agg_count = new_experiment.data.groupby(self.name).count()
            agg_count = agg_count[agg_count.columns[0]]

            # have to make the condition a float64, because if we're in log
            # space there may be events that have NaN as the bin number.
            new_experiment.add_condition(
                self.bin_count_name,
                "float64",
                new_experiment[self.name].map(agg_count))

        new_experiment.history.append(self.clone_traits())
        return new_experiment

    def default_view(self, **kwargs):
        """Return a ``BinningView`` bound to this operation.

        Any keyword arguments are forwarded to the view's constructor.
        """
        return BinningView(op=self, **kwargs)
class FlowPeaksOp(HasStrictTraits):
    """
    This module uses the flowPeaks algorithm to assign events to clusters in
    an unsupervised manner.

    Call `estimate()` to compute the clusters.

    Calling `apply()` creates a new categorical metadata variable named
    `name`, with possible values `{name}_1` .... `{name}_n` where `n` is the
    number of clusters found by `estimate()`.  Events that can't be shown on
    the requested scale are labelled `{name}_None`.

    The same model may not be appropriate for different subsets of the data
    set.  If this is the case, you can use the `by` attribute to specify
    metadata by which to aggregate the data before estimating (and applying)
    a model.  Each subset gets its own model.

    Attributes
    ----------
    name : Str
        The operation name; determines the name of the new metadata column

    channels : List(Str)
        The channels to apply the clustering algorithm to.

    scale : Dict(Str : Enum("linear", "logicle", "log"))
        Re-scale the data in the specified channels before fitting.  If a
        channel is in `channels` but not in `scale`, the current package-wide
        default (set with `set_default_scale`) is used.

    by : List(Str)
        A list of metadata attributes to aggregate the data before
        estimating the model.  For example, if the experiment has two pieces
        of metadata, `Time` and `Dox`, setting `by = ["Time", "Dox"]` will
        fit the model separately to each subset of the data with a unique
        combination of `Time` and `Dox`.

    h : Float (default = 1.5)
        A scalar value by which to scale the covariance matrices of the
        underlying density function.  (See `Notes`, below, for more details.)

    h0 : Float (default = 1.0)
        A scalar value by which to smooth the covariance matrices of the
        underlying density function.  (See `Notes`, below, for more details.)

    tol : Float (default = 0.5)
        How readily should clusters be merged?  Must be between 0 and 1.
        See `Notes`, below, for more details.

    merge_dist : Float (default = 5)
        How far apart can clusters be before they are merged?  This is
        a unit-free scalar, and is approximately the maximum number of
        k-means clusters between peaks.

    find_outliers : Bool (default = False)
        Should the algorithm use an extra step to identify outliers?
        *Note: I have disabled this code until I can try to make it faster.*

    Notes
    -----

    This algorithm uses kmeans to find a large number of clusters, then
    hierarchically merges those clusters.  Thus, the user does not need to
    specify the number of clusters in advance; and it can find non-convex
    clusters.  It also operates in an arbitrary number of dimensions.

    The merging happens in two steps.  First, the cluster centroids are used
    to estimate an underlying density function.  Then, the local maxima of
    the density function are found using a numerical optimization starting
    from each centroid, and k-means clusters that converge to the same local
    maximum are merged.  Finally, these clusters-of-clusters are merged if
    their local maxima are (a) close enough, and (b) the density function
    between them is smooth enough.  Thus, the final assignment of each event
    depends on the k-means cluster it ends up in, and which
    cluster-of-clusters that k-means centroid is assigned to.

    There are a lot of parameters that affect this process.  The k-means
    clustering is pretty robust (though somewhat sensitive to the number of
    clusters, which is currently not exposed in the API.)  The most important
    are exposed as traits of the `FlowPeaksOp` class.  These include:

     - h, h0: sometimes the density function is too "rough" to find good
         local maxima.  These parameters smooth it out by widening the
         covariance matrices.  Increasing `h` makes the density rougher;
         increasing `h0` makes it smoother.

     - tol: How smooth does the density function have to be between two
         density maxima to merge them?  Must be between 0 and 1.

     - merge_dist: How close must two maxima be to merge them?  This value
         is a unit-free scalar, and is approximately the number of k-means
         clusters between the two maxima.

    For details and a theoretical justification, see

    flowPeaks: a fast unsupervised clustering for flow cytometry data via
    K-means and density peak finding

    Yongchao Ge  Stuart C. Sealfon
    Bioinformatics (2012) 28 (15): 2052-2058.

    Examples
    --------

    >>> fp_op = FlowPeaksOp(name = "Clust",
    ...                     channels = ["V2-A", "Y2-A"],
    ...                     scale = {"V2-A" : "log"})
    >>> fp_op.estimate(ex2)
    >>> fp_op.default_view(channels = ["V2-A", "Y2-A"]).plot(ex2)
    >>> ex3 = fp_op.apply(ex2)
    """

    id = Constant('edu.mit.synbio.cytoflow.operations.flowpeaks')
    friendly_id = Constant("FlowPeaks Clustering")

    name = CStr()
    channels = List(Str)
    scale = Dict(Str, util.ScaleEnum)
    by = List(Str)

    # parameters that control estimation, with sensible defaults
    h = util.PositiveFloat(1.5, allow_zero=False)
    h0 = util.PositiveFloat(1, allow_zero=False)
    tol = util.PositiveFloat(0.5, allow_zero=False)
    merge_dist = util.PositiveFloat(5, allow_zero=False)

    # model state filled in by estimate(); every dict except _scale is keyed
    # by the groupby group
    _kmeans = Dict(Any, Instance(sklearn.cluster.MiniBatchKMeans),
                   transient=True)
    _normals = Dict(Any, List(Function), transient=True)
    _density = Dict(Any, Function, transient=True)
    _peaks = Dict(Any, List(Array), transient=True)
    # kmeans cluster idx --> peak idx
    _cluster_peak = Dict(Any, List, transient=True)
    # kmeans cluster idx --> group idx
    _cluster_group = Dict(Any, List, transient=True)
    # channel name --> scale instance
    _scale = Dict(Str, Instance(util.IScale), transient=True)

    def estimate(self, experiment, subset=None):
        """
        Estimate the k-means clusters, density peaks and merged cluster
        groups for each subset of the data selected by `by`.

        Parameters
        ----------
        experiment : Experiment
            the experiment whose data is used to fit the model

        subset : Str (default = None)
            if set, a query string passed to ``experiment.query`` to select
            the events used for estimation

        Raises
        ------
        util.CytoflowOpError
            if the operation's parameters are invalid, a group is empty, or
            a numerical optimization fails
        util.CytoflowViewError
            if `subset` is invalid or selects no events
        """
        if experiment is None:
            raise util.CytoflowOpError("No experiment specified")

        if len(self.channels) == 0:
            raise util.CytoflowOpError("Must set at least one channel")

        for c in self.channels:
            if c not in experiment.data:
                raise util.CytoflowOpError(
                    "Channel {0} not found in the experiment".format(c))

        for c in self.scale:
            if c not in self.channels:
                raise util.CytoflowOpError(
                    "Scale set for channel {0}, but it isn't "
                    "in the experiment".format(c))

        for b in self.by:
            if b not in experiment.data:
                raise util.CytoflowOpError("Aggregation metadata {0} not found"
                                           " in the experiment".format(b))
            if len(experiment.data[b].unique()) > 100:  # WARNING - magic number
                raise util.CytoflowOpError(
                    "More than 100 unique values found for"
                    " aggregation metadata {0}.  Did you"
                    " accidentally specify a data channel?".format(b))

        if subset:
            try:
                experiment = experiment.query(subset)
            except Exception:
                # don't swallow KeyboardInterrupt / SystemExit
                raise util.CytoflowViewError(
                    "Subset string '{0}' isn't valid".format(subset))

            if len(experiment) == 0:
                raise util.CytoflowViewError(
                    "Subset string '{0}' returned no events".format(subset))

        if self.by:
            groupby = experiment.data.groupby(self.by)
        else:
            # use a lambda expression to return a group that contains
            # all the events
            groupby = experiment.data.groupby(lambda _: True)

        # get the scale. estimate the scale params for the ENTIRE data set,
        # not subsets we get from groupby().  And we need to save it so that
        # the data is transformed the same way when we apply()
        for c in self.channels:
            if c in self.scale:
                self._scale[c] = util.scale_factory(self.scale[c],
                                                    experiment,
                                                    channel=c)
            else:
                self._scale[c] = util.scale_factory(util.get_default_scale(),
                                                    experiment,
                                                    channel=c)

        d = len(self.channels)

        for data_group, data_subset in groupby:
            if len(data_subset) == 0:
                raise util.CytoflowOpError(
                    "Group {} had no data".format(data_group))

            x = data_subset.loc[:, self.channels[:]]
            for c in self.channels:
                x[c] = self._scale[c](x[c])

            # drop data that isn't in the scale range
            for c in self.channels:
                x = x[~(np.isnan(x[c]))]
            x = x.values

            #### choose the number of clusters and fit the kmeans
            num_clusters = [util.num_hist_bins(x[:, c]) for c in range(d)]
            num_clusters = np.ceil(np.median(num_clusters))
            num_clusters = int(num_clusters)

            self._kmeans[data_group] = kmeans = \
                sklearn.cluster.MiniBatchKMeans(n_clusters=num_clusters)

            kmeans.fit(x)
            x_labels = kmeans.predict(x)

            #### use the kmeans centroids to parameterize a finite gaussian
            #### mixture model which estimates the density function
            s0 = np.zeros([d, d])
            for j in range(d):
                # range of the data along channel j (a column of x, not a
                # row)
                r = x[:, j].max() - x[:, j].min()
                s0[j, j] = (r / (num_clusters ** (1. / d))) ** 0.5

            means = []
            weights = []
            normals = []
            beta_max = []

            for k in range(num_clusters):
                xk = x[x_labels == k]
                num_k = np.sum(x_labels == k)
                weight_k = num_k / len(x_labels)
                mu = xk.mean(axis=0)
                means.append(mu)
                s = np.cov(xk, rowvar=False)

                el = num_k / (num_clusters + num_k)
                s_smooth = el * self.h * s + (1.0 - el) * self.h0 * s0

                n = scipy.stats.multivariate_normal(mean=mu, cov=s_smooth)
                weights.append(weight_k)
                # bind n as a default so each lambda keeps its own normal
                normals.append(lambda x, n=n: n.pdf(x))

                # get appropriate step size for peak finding: the smallest
                # standard deviation on the diagonal of s_smooth
                min_b = np.inf
                for b in np.diagonal(s_smooth):
                    if np.sqrt(b) < min_b:
                        min_b = np.sqrt(b)
                beta_max.append(min_b)

            self._normals[data_group] = normals
            self._density[data_group] = density = \
                lambda x, weights=weights, normals=normals: np.sum(
                    [w * n(x) for w, n in zip(weights, normals)], axis=0)

            ### use optimization on the finite gmm to find the local peak for
            ### each kmeans cluster
            peaks = []
            peak_clusters = []  # peak idx --> list of clusters

            min_mu = [np.inf] * d
            max_mu = [-1.0 * np.inf] * d

            for k in range(num_clusters):
                mu = means[k]
                for ci in range(d):
                    if mu[ci] < min_mu[ci]:
                        min_mu[ci] = mu[ci]
                    if mu[ci] > max_mu[ci]:
                        max_mu[ci] = mu[ci]

            # keep the search inside the bounding box of the centroids; each
            # bound applies to its own channel's coordinate only
            constraints = []
            for ci in range(d):
                constraints.append({
                    'type': 'ineq',
                    'fun': lambda x, ci=ci, lo=min_mu[ci]: x[ci] - lo
                })
                constraints.append({
                    'type': 'ineq',
                    'fun': lambda x, ci=ci, hi=max_mu[ci]: hi - x[ci]
                })

            for k in range(num_clusters):
                mu = means[k]
                f = lambda x: -1.0 * density(x)

                # The peak-searching algorithm from the paper works fine but
                # is slow; COBYLA with an appropriate rho gives similar
                # results.
                res = scipy.optimize.minimize(f,
                                              mu,
                                              method='COBYLA',
                                              constraints=constraints,
                                              options={
                                                  'rhobeg': beta_max[k],
                                                  'maxiter': 5000
                                              })
                if not res.success:
                    raise util.CytoflowOpError(
                        "Peak finding failed for cluster {}: {}".format(
                            k, res.message))

                # clusters whose searches end at (nearly) the same point
                # share a peak
                merged = False
                for pi, p in enumerate(peaks):
                    if np.linalg.norm(p - res.x) < (1e-2):
                        peak_clusters[pi].append(k)
                        merged = True
                        break

                if not merged:
                    peak_clusters.append([k])
                    peaks.append(res.x)

            ### merge peaks that are sufficiently close
            groups = [[pi] for pi in range(len(peaks))]

            def max_tol(x, y):
                # largest relative deviation of the density from the straight
                # line between the densities at x and y
                f = lambda a: density(a[np.newaxis, :])
                n_scale = 1

                def tol(t):
                    zt = x + t * (y - x)
                    fhat_zt = f(x) + t * (f(y) - f(x))
                    return -1.0 * abs((f(zt) - fhat_zt) / fhat_zt) * n_scale

                res = scipy.optimize.minimize_scalar(tol,
                                                     bounds=[0, 1],
                                                     method='Bounded')

                if res.status != 0:
                    raise util.CytoflowOpError(
                        "tol optimization failed for {}, {}".format(x, y))
                return -1.0 * res.fun

            def nearest_neighbor_dist(k):
                min_dist = np.inf
                for i in range(num_clusters):
                    if i == k:
                        continue
                    dist = np.linalg.norm(means[k] - means[i])
                    if dist < min_dist:
                        min_dist = dist
                return min_dist

            sk = [nearest_neighbor_dist(k) for k in range(num_clusters)]

            def s(x):
                # nearest-neighbour distance of the kmeans cluster holding x
                k = kmeans.predict(x[np.newaxis, :])[0]
                return sk[k]

            def can_merge(g, h):
                for pg in g:
                    for ph in h:
                        vg = peaks[pg]
                        vh = peaks[ph]
                        dist_gh = np.linalg.norm(vg - vh)

                        if max_tol(vg, vh) < self.tol and dist_gh / (
                                s(vg) + s(vh)) <= self.merge_dist:
                            return True

                return False

            while True:
                if len(groups) == 1:
                    break

                # find closest mergable groups
                min_dist = np.inf
                for gi in range(len(groups)):
                    g = groups[gi]

                    for hi in range(gi + 1, len(groups)):
                        h = groups[hi]

                        if can_merge(g, h):
                            dist_gh = np.inf
                            for pg in g:
                                vg = peaks[pg]
                                for ph in h:
                                    vh = peaks[ph]
                                    dist_gh = min(dist_gh,
                                                  np.linalg.norm(vg - vh))

                            if dist_gh < min_dist:
                                min_gi = gi
                                min_hi = hi
                                min_dist = dist_gh

                if min_dist == np.inf:
                    break

                # merge the groups
                groups[min_gi].extend(groups[min_hi])
                del groups[min_hi]

            cluster_group = [0] * num_clusters
            cluster_peaks = [0] * num_clusters

            for gi, g in enumerate(groups):
                for p in g:
                    for cluster in peak_clusters[p]:
                        cluster_group[cluster] = gi
                        cluster_peaks[cluster] = p

            self._peaks[data_group] = peaks
            self._cluster_peak[data_group] = cluster_peaks
            self._cluster_group[data_group] = cluster_group

    def apply(self, experiment):
        """
        Assign events to the clusters found by `estimate()`.

        Parameters
        ----------
        experiment : Experiment
            the experiment to which this op is applied

        Returns
        -------
        a new experiment with an extra categorical column named `name`.
        Values are `{name}_i` for cluster group `i` (1-based) and
        `{name}_None` for events with a NaN scaled value in any channel.

        Raises
        ------
        util.CytoflowOpError
            if the operation's parameters are invalid, a group is empty, or
            no model was estimated for a group
        """
        if experiment is None:
            raise util.CytoflowOpError("No experiment specified")

        # make sure name got set!
        if not self.name:
            raise util.CytoflowOpError("You have to set the gate's name "
                                       "before applying it!")

        if self.name in experiment.data.columns:
            raise util.CytoflowOpError(
                "Experiment already has a column named {0}".format(self.name))

        if len(self.channels) == 0:
            raise util.CytoflowOpError("Must set at least one channel")

        for c in self.channels:
            if c not in experiment.data:
                raise util.CytoflowOpError(
                    "Channel {0} not found in the experiment".format(c))

        for c in self.scale:
            if c not in self.channels:
                raise util.CytoflowOpError(
                    "Scale set for channel {0}, but it isn't "
                    "in the experiment".format(c))

        for b in self.by:
            if b not in experiment.data:
                raise util.CytoflowOpError("Aggregation metadata {0} not found"
                                           " in the experiment".format(b))

            if len(experiment.data[b].unique()) > 100:  # WARNING - magic number
                raise util.CytoflowOpError(
                    "More than 100 unique values found for"
                    " aggregation metadata {0}.  Did you"
                    " accidentally specify a data channel?".format(b))

        if self.by:
            groupby = experiment.data.groupby(self.by)
        else:
            # use a lambda expression to return a group that contains
            # all the events
            groupby = experiment.data.groupby(lambda _: True)

        event_assignments = pd.Series(
            ["{}_None".format(self.name)] * len(experiment), dtype="object")

        for group, data_subset in groupby:
            if len(data_subset) == 0:
                raise util.CytoflowOpError(
                    "Group {} had no data".format(group))

            # fail with a useful message instead of a bare KeyError when
            # estimate() hasn't been run for this group
            if group not in self._kmeans:
                raise util.CytoflowOpError(
                    "Group {} not found in the estimated model.  Do you need "
                    "to run estimate()?".format(group))

            x = data_subset.loc[:, self.channels[:]]
            for c in self.channels:
                x[c] = self._scale[c](x[c])

            # which values are missing?
            x_na = pd.Series([False] * len(x))
            for c in self.channels:
                x_na[np.isnan(x[c]).values] = True

            x = x.values
            x_na = x_na.values
            group_idx = groupby.groups[group]

            kmeans = self._kmeans[group]

            # -1 marks events that could not be assigned
            predicted_km = np.full(len(x), -1, "int")
            predicted_km[~x_na] = kmeans.predict(x[~x_na])

            groups = np.asarray(self._cluster_group[group])
            predicted_group = np.full(len(x), -1, "int")
            predicted_group[~x_na] = groups[predicted_km[~x_na]]

            predicted_str = pd.Series(["(none)"] * len(predicted_group))
            for c in range(len(self._cluster_group[group])):
                predicted_str[predicted_group == c] = "{0}_{1}".format(
                    self.name, c + 1)
            predicted_str[predicted_group == -1] = "{0}_None".format(self.name)
            predicted_str.index = group_idx

            event_assignments.iloc[group_idx] = predicted_str

        new_experiment = experiment.clone()
        new_experiment.add_condition(self.name, "category", event_assignments)

        new_experiment.history.append(
            self.clone_traits(transient=lambda _: True))
        return new_experiment

    def default_view(self, **kwargs):
        """
        Returns a diagnostic plot of the flowPeaks clustering.

        Parameters
        ----------
        channels : List(Str)
            one or two of the operation's channels (default: all of them)

        scale : Dict(Str : scale name)
            per-channel scales (default: the operation's `scale`); channels
            without an entry use the package-wide default scale

        density : Bool (default = False)
            for two channels, return the density view instead of the
            scatter view

        Returns
        -------
        IView : an IView, call plot() to see the diagnostic plot.
        """
        channels = kwargs.pop('channels', self.channels)
        # copy, so that filling in defaults below doesn't modify the
        # operation's (or the caller's) scale dict
        scale = dict(kwargs.pop('scale', self.scale))
        density = kwargs.pop('density', False)

        for c in channels:
            if c not in self.channels:
                raise util.CytoflowViewError(
                    "Channel {} isn't in the operation's channels".format(c))

        for s in scale:
            if s not in self.channels:
                raise util.CytoflowViewError(
                    "Channel {} isn't in the operation's channels".format(s))

        for c in channels:
            if c not in scale:
                scale[c] = util.get_default_scale()

        if len(channels) == 0:
            raise util.CytoflowViewError(
                "Must specify at least one channel for a default view")
        elif len(channels) == 1:
            return FlowPeaks1DView(op=self,
                                   channel=channels[0],
                                   scale=scale[channels[0]],
                                   **kwargs)
        elif len(channels) == 2:
            if density:
                return FlowPeaks2DDensityView(op=self,
                                              xchannel=channels[0],
                                              ychannel=channels[1],
                                              xscale=scale[channels[0]],
                                              yscale=scale[channels[1]],
                                              **kwargs)
            else:
                return FlowPeaks2DView(op=self,
                                       xchannel=channels[0],
                                       ychannel=channels[1],
                                       xscale=scale[channels[0]],
                                       yscale=scale[channels[1]],
                                       **kwargs)
        else:
            raise util.CytoflowViewError(
                "Can't specify more than two channels for a default view")
class GaussianMixture1DOp(HasStrictTraits):
    """
    This module fits a Gaussian mixture model with a specified number of
    components to a channel.

    Creates a new categorical metadata variable named `name`, with possible
    values `name_1` .... `name_n` where `n` is the number of components.
    An event is assigned to `name_i` category if it falls within `sigma`
    standard deviations of the component's mean.  If that is true for multiple
    categories (or if `sigma == 0.0`), the event is assigned to the category
    with the highest posterior probability.  If the event doesn't fall into
    any category, it is assigned to `name_None`.

    As a special case, if `num_components` is `1` and `sigma` > 0.0, then
    the new condition is boolean, `True` if the event fell in the gate and
    `False` otherwise.

    Optionally, if `posteriors` is `True`, this module will also compute the
    posterior probability of each event in its assigned component, returning
    it in a new column named `{Name}_Posterior`.

    Finally, the same mixture model (mean and standard deviation) may not
    be appropriate for every subset of the data.  If this is the case, you
    can use the `by` attribute to specify metadata by which to aggregate
    the data before estimating (and applying) a mixture.  The number of
    components is the same across each subset, though.

    Attributes
    ----------
    name : Str
        The operation name; determines the name of the new metadata column

    channel : Str
        Which channel to apply the mixture model to.

    num_components : Int (default = 1)
        How many components to fit to the data?  Must be positive.

    sigma : Float (default = 0.0)
        How many standard deviations on either side of the mean to include
        in each category?  If an event is in multiple components, assign it
        to the component with the highest posterior probability.  If
        `sigma == 0.0`, categorize *all* the data by assigning each event to
        the component with the highest posterior probability.  Must be >= 0.0.

    by : List(Str)
        A list of metadata attributes to aggregate the data before estimating
        the model.  For example, if the experiment has two pieces of metadata,
        `Time` and `Dox`, setting `by = ["Time", "Dox"]` will fit the model
        separately to each subset of the data with a unique combination of
        `Time` and `Dox`.

    scale : Enum("linear", "log", "logicle") (default = "linear")
        Re-scale the data before fitting the data?

    posteriors : Bool (default = False)
        If `True`, add a column named `{Name}_Posterior` giving the posterior
        probability that the event is in the component to which it was
        assigned.  Useful for filtering out low-probability events.

    Examples
    --------

    >>> gauss_op = GaussianMixture1DOp(name = "Gaussian",
    ...                                channel = "Y2-A",
    ...                                num_components = 2)
    >>> gauss_op.estimate(ex2)
    >>> gauss_op.default_view().plot(ex2)
    >>> ex3 = gauss_op.apply(ex2)
    """

    id = Constant('edu.mit.synbio.cytoflow.operations.gaussian_1d')
    friendly_id = Constant("1D Gaussian Mixture")

    name = CStr()
    channel = Str()
    num_components = util.PositiveInt(1)
    sigma = util.PositiveFloat(0.0, allow_zero = True)
    by = List(Str)
    scale = util.ScaleEnum
    posteriors = Bool(False)

    # the key is either a single value or a tuple
    _gmms = Dict(Any, Instance(mixture.GMM), transient = True)
    _scale = Instance(util.IScale, transient = True)

    def estimate(self, experiment, subset = None):
        """
        Estimate the Gaussian mixture model parameters.

        Fits one mixture model per group produced by grouping the experiment's
        data on `by` (or a single model for all events if `by` is empty) and
        stores the fitted models in `_gmms`, keyed by group.  The scale is
        estimated from the whole (possibly subsetted) experiment and saved in
        `_scale` so that `apply` transforms the data the same way.

        Parameters
        ----------
        experiment : Experiment
            The data to use to estimate the mixture parameters.

        subset : str (default = None)
            If set, a query string passed to `experiment.query` to select the
            events used for estimation.

        Raises
        ------
        util.CytoflowOpError
            If the experiment, channel or aggregation metadata are missing or
            invalid, if `num_components == 1` and `sigma == 0.0`, if a group
            has no data, or if the estimator does not converge.

        util.CytoflowViewError
            If `subset` is not a valid query or selects no events.  (The
            exception type is kept as-is for backward compatibility.)
        """

        if not experiment:
            raise util.CytoflowOpError("No experiment specified")

        if self.channel not in experiment.data:
            raise util.CytoflowOpError("Column {0} not found in the experiment"
                                       .format(self.channel))

        for b in self.by:
            if b not in experiment.data:
                raise util.CytoflowOpError("Aggregation metadata {0} not found"
                                           " in the experiment"
                                           .format(b))
            if len(experiment.data[b].unique()) > 100: #WARNING - magic number
                raise util.CytoflowOpError("More than 100 unique values found for"
                                           " aggregation metadata {0}. Did you"
                                           " accidentally specify a data channel?"
                                           .format(b))

        if self.num_components == 1 and self.sigma == 0.0:
            raise util.CytoflowOpError("If num_components == 1, sigma must be > 0")

        if subset:
            try:
                experiment = experiment.query(subset)
            except Exception:
                # catch Exception rather than everything, so that
                # KeyboardInterrupt / SystemExit still propagate
                raise util.CytoflowViewError("Subset string '{0}' isn't valid"
                                             .format(subset))

            if len(experiment) == 0:
                raise util.CytoflowViewError("Subset string '{0}' returned no events"
                                             .format(subset))

        if self.by:
            groupby = experiment.data.groupby(self.by)
        else:
            # use a lambda expression to return a group that contains
            # all the events
            groupby = experiment.data.groupby(lambda x: True)

        # get the scale. estimate the scale params for the ENTIRE data set,
        # not subsets we get from groupby().  And we need to save it so that
        # the data is transformed the same way when we apply()
        self._scale = util.scale_factory(self.scale, experiment, self.channel)

        gmms = {}

        for group, data_subset in groupby:
            if len(data_subset) == 0:
                raise util.CytoflowOpError("Group {} had no data"
                                           .format(group))

            x = data_subset[self.channel].reset_index(drop = True)
            x = self._scale(x)

            # drop data that isn't in the scale range
            x = x[~np.isnan(x)]

            gmm = mixture.GMM(n_components = self.num_components,
                              random_state = 1)
            gmm.fit(x[:, np.newaxis])

            if not gmm.converged_:
                raise util.CytoflowOpError("Estimator didn't converge"
                                           " for group {0}"
                                           .format(group))

            # to make sure we have a stable ordering, sort the components
            # by the means (so the first component has the lowest mean,
            # the next component has the next-lowest, etc.)
            sort_idx = np.argsort(gmm.means_[:, 0])
            gmm.means_ = gmm.means_[sort_idx]
            gmm.weights_ = gmm.weights_[sort_idx]
            gmm.covars_ = gmm.covars_[sort_idx]

            gmms[group] = gmm

        # only publish the models once every group has been fit
        self._gmms = gmms

    def apply(self, experiment):
        """
        Assigns new metadata to events using the mixture model estimated
        in `estimate`.

        Parameters
        ----------
        experiment : Experiment
            The experiment to which the fitted model is applied.

        Returns
        -------
        Experiment
            A clone of `experiment` with a new condition named `name`
            ("bool" if `num_components == 1`, otherwise "category") and, if
            `posteriors` is `True`, a "float" condition named
            `{name}_Posterior`.  A copy of this operation is appended to the
            new experiment's history.

        Raises
        ------
        util.CytoflowOpError
            If no experiment or model is available, if `name` is unset or
            collides with an existing column, if `{name}_Posterior` already
            exists, if the channel or aggregation metadata are missing, if
            `sigma < 0.0`, or if a data group has no fitted model.
        """

        if not experiment:
            raise util.CytoflowOpError("No experiment specified")

        if not self._gmms:
            raise util.CytoflowOpError("No model found. Did you forget to "
                                       "call estimate()?")

        # make sure name got set!
        if not self.name:
            raise util.CytoflowOpError("You have to set the gate's name "
                                       "before applying it!")

        if self.name in experiment.data.columns:
            raise util.CytoflowOpError("Experiment already has a column named {0}"
                                       .format(self.name))

        if not self._scale:
            raise util.CytoflowOpError("Couldn't find _scale. What happened??")

        if self.channel not in experiment.data:
            raise util.CytoflowOpError("Column {0} not found in the experiment"
                                       .format(self.channel))

        # the posterior column name is reserved whether or not posteriors
        # are requested, so one unconditional check covers both cases
        posterior_col = "{0}_Posterior".format(self.name)
        if posterior_col in experiment.data:
            raise util.CytoflowOpError("Column {0} already found in the experiment"
                                       .format(posterior_col))

        for b in self.by:
            if b not in experiment.data:
                raise util.CytoflowOpError("Aggregation metadata {0} not found"
                                           " in the experiment"
                                           .format(b))

            if len(experiment.data[b].unique()) > 100: #WARNING - magic number
                raise util.CytoflowOpError("More than 100 unique values found for"
                                           " aggregation metadata {0}. Did you"
                                           " accidentally specify a data channel?"
                                           .format(b))

        if self.sigma < 0.0:
            raise util.CytoflowOpError("sigma must be >= 0.0")

        if self.by:
            groupby = experiment.data.groupby(self.by)
        else:
            # use a lambda expression to return a group that
            # contains all the events
            groupby = experiment.data.groupby(lambda x: True)

        # validate up front that every data group has a fitted model, so
        # the assignment loop below can index _gmms unconditionally
        for group, data_subset in groupby:
            if group not in self._gmms:
                raise util.CytoflowOpError("Can't find group in model. "
                                           "Did you call estimate()?")

        event_assignments = pd.Series([None] * len(experiment), dtype = "object")

        if self.posteriors:
            event_posteriors = pd.Series([0.0] * len(experiment))

        # what we DON'T want to do is iterate through event-by-event.
        # the more of this we can push into numpy, sklearn and pandas,
        # the faster it's going to be.

        for group, data_subset in groupby:
            gmm = self._gmms[group]
            x = data_subset[self.channel]
            x = self._scale(x).values

            # which values are missing?
            x_na = np.isnan(x)

            group_idx = groupby.groups[group]

            # make a preliminary assignment; -1 marks "no component"
            predicted = np.full(len(x), -1, "int")
            predicted[~x_na] = gmm.predict(x[~x_na, np.newaxis])

            # if we're doing sigma-based gating, for each component check
            # to see if the event is in the sigma gate.
            if self.sigma > 0.0:

                # make a quick dataframe with the value and the predicted
                # component
                gate_df = pd.DataFrame({"x" : x, "p" : predicted})

                # for each component, get the low and the high threshold
                for c in range(0, self.num_components):
                    lo = (gmm.means_[c][0]    # @UnusedVariable
                          - self.sigma * np.sqrt(gmm.covars_[c][0]))
                    hi = (gmm.means_[c][0]    # @UnusedVariable
                          + self.sigma * np.sqrt(gmm.covars_[c][0]))

                    # and build an expression with numexpr so it evaluates fast!
                    gate_bool = gate_df.eval("p == @c and x >= @lo and x <= @hi").values
                    predicted[np.logical_and(predicted == c,
                                             np.logical_not(gate_bool))] = -1

            predicted_str = pd.Series(["(none)"] * len(predicted))
            for c in range(0, self.num_components):
                predicted_str[predicted == c] = "{0}_{1}".format(self.name, c + 1)
            predicted_str[predicted == -1] = "{0}_None".format(self.name)
            predicted_str.index = group_idx

            event_assignments.iloc[group_idx] = predicted_str

            if self.posteriors:
                probability = np.full((len(x), self.num_components), 0.0, "float")
                probability[~x_na, :] = gmm.predict_proba(x[~x_na, np.newaxis])
                posteriors = pd.Series([0.0] * len(predicted))
                for i in range(0, self.num_components):
                    posteriors[predicted == i] = probability[predicted == i, i]
                posteriors.index = group_idx
                event_posteriors.iloc[group_idx] = posteriors

        new_experiment = experiment.clone()

        if self.num_components == 1:
            new_experiment.add_condition(self.name, "bool",
                                         event_assignments == "{0}_1".format(self.name))
        else:
            new_experiment.add_condition(self.name, "category", event_assignments)

        if self.posteriors:
            col_name = "{0}_Posterior".format(self.name)
            new_experiment.add_condition(col_name, "float", event_posteriors)

        new_experiment.history.append(self.clone_traits(transient = lambda t: True))
        return new_experiment

    def default_view(self, **kwargs):
        """
        Returns a diagnostic plot of the Gaussian mixture model.

        Parameters
        ----------
        **kwargs
            Passed through to the `GaussianMixture1DView` constructor.

        Returns
        -------
        IView : an IView, call plot() to see the diagnostic plot.
        """
        return GaussianMixture1DView(op = self, **kwargs)
class GaussianMixture1DOp(HasStrictTraits):
    """
    This module fits a Gaussian mixture model with a specified number of
    components to a channel.

    .. warning::

        :class:`GaussianMixture1DOp` is **DEPRECATED** and will be removed
        in a future release.  It doesn't correctly handle the case where an
        event is present in more than one component.  Please use
        :class:`GaussianMixtureOp` instead!

    Creates a new categorical metadata variable named :attr:`name`, with
    possible values ``name_1`` .... ``name_n`` where ``n`` is the number of
    components.  An event is assigned to ``name_i`` category if it falls
    within :attr:`sigma` standard deviations of the component's mean.  If
    that is true for multiple categories (or if :attr:`sigma` is ``0.0``),
    the event is assigned to the category with the highest posterior
    probability.  If the event doesn't fall into any category, it is assigned
    to ``name_None``.

    As a special case, if :attr:`num_components` is `1` and :attr:`sigma`
    ``> 0.0``, then the new condition is boolean, ``True`` if the event fell
    in the gate and ``False`` otherwise.

    Optionally, if :attr:`posteriors` is ``True``, this module will also
    compute the posterior probability of each event in its assigned
    component, returning it in a new column named ``{Name}_Posterior``.

    Finally, the same mixture model (mean and standard deviation) may not
    be appropriate for every subset of the data.  If this is the case, you
    can use the :attr:`by` attribute to specify metadata by which to
    aggregate the data before estimating (and applying) a mixture.  The
    number of components is the same across each subset, though.

    Attributes
    ----------
    name : Str
        The operation name; determines the name of the new metadata column

    channel : Str
        Which channel to apply the mixture model to.

    num_components : Int (default = 1)
        How many components to fit to the data?  Must be positive.

    sigma : Float (default = 0.0)
        How many standard deviations on either side of the mean to include
        in each category?  If an event is in multiple components, assign it
        to the component with the highest posterior probability.  If
        `sigma == 0.0`, categorize *all* the data by assigning each event to
        the component with the highest posterior probability.  Must be >= 0.0.

    by : List(Str)
        A list of metadata attributes to aggregate the data before estimating
        the model.  For example, if the experiment has two pieces of metadata,
        `Time` and `Dox`, setting `by = ["Time", "Dox"]` will fit the model
        separately to each subset of the data with a unique combination of
        `Time` and `Dox`.

    scale : Enum("linear", "log", "logicle") (default = "linear")
        Re-scale the data before fitting the model?

    posteriors : Bool (default = False)
        If `True`, add a column named `{Name}_Posterior` giving the posterior
        probability that the event is in the component to which it was
        assigned.  Useful for filtering out low-probability events.

    Examples
    --------

    Make a little data set.

    .. plot::
        :context: close-figs

        >>> import cytoflow as flow
        >>> import_op = flow.ImportOp()
        >>> import_op.tubes = [flow.Tube(file = "Plate01/RFP_Well_A3.fcs",
        ...                              conditions = {'Dox' : 10.0}),
        ...                    flow.Tube(file = "Plate01/CFP_Well_A4.fcs",
        ...                              conditions = {'Dox' : 1.0})]
        >>> import_op.conditions = {'Dox' : 'float'}
        >>> ex = import_op.apply()

    Create and parameterize the operation.

    .. plot::
        :context: close-figs

        >>> gm_op = flow.GaussianMixture1DOp(name = 'GM',
        ...                                  channel = 'Y2-A',
        ...                                  scale = 'log',
        ...                                  num_components = 2)

    Estimate the clusters

    .. plot::
        :context: close-figs

        >>> gm_op.estimate(ex)

    Plot a diagnostic view

    .. plot::
        :context: close-figs

        >>> gm_op.default_view().plot(ex)

    Apply the gate

    .. plot::
        :context: close-figs

        >>> ex2 = gm_op.apply(ex)

    Plot a diagnostic view with the event assignments

    .. plot::
        :context: close-figs

        >>> gm_op.default_view().plot(ex2)
    """

    id = Constant('edu.mit.synbio.cytoflow.operations.gaussian_1d')
    friendly_id = Constant("1D Gaussian Mixture")

    name = CStr()
    channel = Str()
    num_components = util.PositiveInt(1)
    sigma = util.PositiveFloat(0.0, allow_zero = True)
    by = List(Str)
    scale = util.ScaleEnum
    posteriors = Bool(False)

    # the key is a set
    _gmms = Dict(Any, Instance(mixture.GaussianMixture), transient = True)
    _scale = Instance(util.IScale, transient = True)

    def estimate(self, experiment, subset = None):
        """
        Estimate the Gaussian mixture model parameters.

        Fits one model per group obtained by grouping the data on the sorted
        :attr:`by` list (or one model for all events if :attr:`by` is empty)
        and stores them, keyed by group, for :meth:`apply`.

        Parameters
        ----------
        experiment : Experiment
            The data to use to estimate the mixture parameters

        subset : str (default = None)
            If set, a Python expression to determine the subset of the data
            to use to in the estimation.

        Raises
        ------
        util.CytoflowOpError
            If the experiment is ``None``; if the channel or an aggregation
            condition is not in the experiment; if ``num_components == 1``
            and :attr:`posteriors` is set; if ``subset`` is invalid or selects
            no events; if a group has no data; or if the estimator does not
            converge.
        """

        warn("GaussianMixture1DOp is DEPRECATED. Please use GaussianMixtureOp.",
             util.CytoflowOpWarning)

        if experiment is None:
            raise util.CytoflowOpError('experiment',
                                       "No experiment specified")

        if self.channel not in experiment.data:
            raise util.CytoflowOpError('channel',
                                       "Column {0} not found in the experiment"
                                       .format(self.channel))

        for b in self.by:
            if b not in experiment.data:
                raise util.CytoflowOpError('by',
                                           "Aggregation metadata {} not found, "
                                           "must be one of {}"
                                           .format(b, experiment.conditions))

        if self.num_components == 1 and self.posteriors:
            raise util.CytoflowOpError('num_components',
                                       "If num_components == 1, all posteriors are 1.")

        if subset:
            try:
                experiment = experiment.query(subset)
            except Exception as e:
                raise util.CytoflowOpError('subset',
                                           "Subset string '{0}' isn't valid"
                                           .format(subset)) from e

            if len(experiment) == 0:
                raise util.CytoflowOpError('subset',
                                           "Subset string '{0}' returned no events"
                                           .format(subset))

        if self.by:
            by = sorted(self.by)
            groupby = experiment.data.groupby(by)
        else:
            # use a lambda expression to return a group that contains
            # all the events
            groupby = experiment.data.groupby(lambda _: True)

        # get the scale. estimate the scale params for the ENTIRE data set,
        # not subsets we get from groupby().  And we need to save it so that
        # the data is transformed the same way when we apply()
        self._scale = util.scale_factory(self.scale, experiment, channel = self.channel)

        gmms = {}

        for group, data_subset in groupby:
            if len(data_subset) == 0:
                raise util.CytoflowOpError(None,
                                           "Group {} had no data".format(group))

            x = data_subset[self.channel].reset_index(drop = True)
            x = self._scale(x)

            # drop data that isn't in the scale range
            x = x[~np.isnan(x)]

            gmm = mixture.GaussianMixture(n_components = self.num_components,
                                          random_state = 1)
            gmm.fit(x[:, np.newaxis])

            if not gmm.converged_:
                raise util.CytoflowOpError(None,
                                           "Estimator didn't converge"
                                           " for group {0}"
                                           .format(group))

            # to make sure we have a stable ordering, sort the components
            # by the means (so the first component has the lowest mean,
            # the next component has the next-lowest, etc.)
            sort_idx = np.argsort(gmm.means_[:, 0])
            gmm.means_ = gmm.means_[sort_idx]
            gmm.weights_ = gmm.weights_[sort_idx]
            gmm.covariances_ = gmm.covariances_[sort_idx]

            gmms[group] = gmm

        # only publish the models once every group has been fit
        self._gmms = gmms

    def apply(self, experiment):
        """
        Assigns new metadata to events using the mixture model estimated
        in :meth:`estimate`.

        Parameters
        ----------
        experiment : Experiment
            The experiment to which the fitted model is applied.

        Returns
        -------
        Experiment
            A new :class:`.Experiment`, with a new column named :attr:`name`,
            and possibly one named :attr:`name` _Posterior.  (No column is
            added when :attr:`num_components` is ``1`` and :attr:`sigma` is
            ``0``.)  When there is at least one index level (a :attr:`by`
            condition, or more than one component), also the following new
            :attr:`~.Experiment.statistics`:

            - **mean** : Float
                the mean of the fitted gaussian

            - **stdev** : Float
                the inverse-scaled standard deviation of the fitted gaussian.
                on a linear scale, this is in the same units as the mean; on
                a log scale, this is a scalar multiple; and on a logicle
                scale, this is probably meaningless!

            - **interval** : (Float, Float)
                the inverse-scaled (mean - stdev, mean + stdev) of the fitted
                gaussian.  this is likely more meaningful than ``stdev``,
                especially on the ``logicle`` scale.

            - **proportion** : Float
                the proportion of events in each component of the mixture
                model.  only set if :attr:`num_components` ``> 1``.

        Raises
        ------
        util.CytoflowOpError
            If the experiment is ``None``; if no model has been estimated; if
            :attr:`name` is unset, is not a sanitized identifier, or collides
            with an existing column; if the posterior column already exists;
            if the channel or an aggregation condition is missing; or if
            :attr:`sigma` is negative.
        """

        warn("GaussianMixture1DOp is DEPRECATED. Please use GaussianMixtureOp.",
             util.CytoflowOpWarning)

        if experiment is None:
            raise util.CytoflowOpError('experiment',
                                       "No experiment specified")

        if not self._gmms:
            raise util.CytoflowOpError(None,
                                       "No model found. Did you forget to "
                                       "call estimate()?")

        # make sure name got set!
        if not self.name:
            raise util.CytoflowOpError('name',
                                       "You have to set the gate's name "
                                       "before applying it!")

        if self.name != util.sanitize_identifier(self.name):
            raise util.CytoflowOpError('name',
                                       "Name can only contain letters, numbers and underscores.")

        if self.name in experiment.data.columns:
            raise util.CytoflowOpError('name',
                                       "Experiment already has a column named {0}"
                                       .format(self.name))

        if not self._scale:
            raise util.CytoflowOpError(None,
                                       "Couldn't find _scale. What happened??")

        if self.channel not in experiment.data:
            raise util.CytoflowOpError('channel',
                                       "Column {0} not found in the experiment"
                                       .format(self.channel))

        if self.posteriors:
            col_name = "{0}_Posterior".format(self.name)
            if col_name in experiment.data:
                raise util.CytoflowOpError('posteriors',
                                           "Column {0} already found in the experiment"
                                           .format(col_name))

        for b in self.by:
            if b not in experiment.data:
                raise util.CytoflowOpError('by',
                                           "Aggregation metadata {} not found, "
                                           "must be one of {}"
                                           .format(b, experiment.conditions))

        if self.sigma < 0.0:
            raise util.CytoflowOpError('sigma',
                                       "sigma must be >= 0.0")

        if self.by:
            by = sorted(self.by)
            groupby = experiment.data.groupby(by)
        else:
            # use a lambda expression to return a group that
            # contains all the events
            groupby = experiment.data.groupby(lambda _: True)

        event_assignments = pd.Series([None] * len(experiment), dtype = "object")

        if self.posteriors:
            event_posteriors = pd.Series([0.0] * len(experiment))

        # what we DON'T want to do is iterate through event-by-event.
        # the more of this we can push into numpy, sklearn and pandas,
        # the faster it's going to be.

        for group, data_subset in groupby:

            # if there weren't any events in this group, there's no gmm
            if group not in self._gmms:
                warn("There wasn't a GMM for data subset {}".format(group),
                     util.CytoflowOpWarning)
                continue

            gmm = self._gmms[group]
            x = data_subset[self.channel]
            x = self._scale(x).values

            # which values are missing?
            x_na = np.isnan(x)

            group_idx = groupby.groups[group]

            # make a preliminary assignment; -1 marks "no component"
            predicted = np.full(len(x), -1, "int")
            predicted[~x_na] = gmm.predict(x[~x_na, np.newaxis])

            # if we're doing sigma-based gating, for each component check
            # to see if the event is in the sigma gate.
            if self.sigma > 0.0:

                # make a quick dataframe with the value and the predicted
                # component
                gate_df = pd.DataFrame({"x" : x, "p" : predicted})

                # for each component, get the low and the high threshold
                for c in range(0, self.num_components):
                    lo = (gmm.means_[c][0]    # @UnusedVariable
                          - self.sigma * np.sqrt(gmm.covariances_[c][0]))
                    hi = (gmm.means_[c][0]    # @UnusedVariable
                          + self.sigma * np.sqrt(gmm.covariances_[c][0]))

                    # and build an expression with numexpr so it evaluates fast!
                    gate_bool = gate_df.eval("p == @c and x >= @lo and x <= @hi").values
                    predicted[np.logical_and(predicted == c,
                                             np.logical_not(gate_bool))] = -1

            predicted_str = pd.Series(["(none)"] * len(predicted))
            for c in range(0, self.num_components):
                predicted_str[predicted == c] = "{0}_{1}".format(self.name, c + 1)
            predicted_str[predicted == -1] = "{0}_None".format(self.name)
            predicted_str.index = group_idx

            event_assignments.iloc[group_idx] = predicted_str

            if self.posteriors:
                probability = np.full((len(x), self.num_components), 0.0, "float")
                probability[~x_na, :] = gmm.predict_proba(x[~x_na, np.newaxis])
                posteriors = pd.Series([0.0] * len(predicted))
                for i in range(0, self.num_components):
                    posteriors[predicted == i] = probability[predicted == i, i]
                posteriors.index = group_idx
                event_posteriors.iloc[group_idx] = posteriors

        new_experiment = experiment.clone()

        if self.num_components == 1 and self.sigma > 0:
            new_experiment.add_condition(self.name, "bool",
                                         event_assignments == "{0}_1".format(self.name))
        elif self.num_components > 1:
            new_experiment.add_condition(self.name, "category", event_assignments)

        if self.posteriors and self.num_components > 1:
            col_name = "{0}_Posterior".format(self.name)
            new_experiment.add_condition(col_name, "float", event_posteriors)

        # add the statistics.  the group keys yielded by ``groupby`` follow
        # the SORTED ``by`` list used above, so the index levels must use the
        # same order or values land under the wrong levels.
        levels = sorted(self.by)
        if self.num_components > 1:
            levels.append(self.name)

        if levels:
            idx = pd.MultiIndex.from_product([new_experiment[x].unique() for x in levels],
                                             names = levels)

            mean_stat = pd.Series(index = idx, dtype = np.dtype(object)).sort_index()
            stdev_stat = pd.Series(index = idx, dtype = np.dtype(object)).sort_index()
            interval_stat = pd.Series(index = idx, dtype = np.dtype(object)).sort_index()
            prop_stat = pd.Series(index = idx, dtype = np.dtype(object)).sort_index()

            for group, _ in groupby:
                # groups without a model were already warned about in the
                # assignment loop; skip them here instead of raising KeyError
                if group not in self._gmms:
                    continue

                gmm = self._gmms[group]
                for c in range(self.num_components):
                    if self.num_components > 1:
                        component_name = "{}_{}".format(self.name, c + 1)

                        # build the index key: the group key(s) followed by
                        # the component name, always as a tuple
                        if group is True:
                            g = (component_name,)
                        elif isinstance(group, tuple):
                            g = group + (component_name,)
                        else:
                            g = (group, component_name)
                    else:
                        g = group

                    mean_stat.at[g] = self._scale.inverse(gmm.means_[c][0])
                    stdev_stat.at[g] = self._scale.inverse(np.sqrt(gmm.covariances_[c][0]))[0]
                    interval_stat.at[g] = (self._scale.inverse(gmm.means_[c][0] - np.sqrt(gmm.covariances_[c][0][0])),
                                           self._scale.inverse(gmm.means_[c][0] + np.sqrt(gmm.covariances_[c][0][0])))
                    prop_stat.at[g] = gmm.weights_[c]

            new_experiment.statistics[(self.name, "mean")] = pd.to_numeric(mean_stat)
            new_experiment.statistics[(self.name, "stdev")] = pd.to_numeric(stdev_stat)
            new_experiment.statistics[(self.name, "interval")] = interval_stat
            if self.num_components > 1:
                new_experiment.statistics[(self.name, "proportion")] = pd.to_numeric(prop_stat)

        new_experiment.history.append(self.clone_traits(transient = lambda _: True))
        return new_experiment

    def default_view(self, **kwargs):
        """
        Returns a diagnostic plot of the Gaussian mixture model.

        Parameters
        ----------
        **kwargs
            Trait values set on the new view via ``trait_set``.

        Returns
        -------
        IView : an IView, call plot() to see the diagnostic plot.
        """
        warn("GaussianMixture1DOp is DEPRECATED. Please use GaussianMixtureOp.",
             util.CytoflowOpWarning)

        v = GaussianMixture1DView(op = self)
        v.trait_set(**kwargs)
        return v
class RangeOp(HasStrictTraits):
    """
    Apply a range gate to a cytometry experiment.

    Attributes
    ----------
    name : Str
        The operation name.  Used to name the new metadata field in the
        experiment that's created by :meth:`apply`

    channel : Str
        The name of the channel to apply the range gate.

    low : Float
        The lowest value to include in this gate.

    high : Float
        The highest value to include in this gate.

    Examples
    --------

    .. plot::
        :context: close-figs

        Make a little data set.

        >>> import cytoflow as flow
        >>> import_op = flow.ImportOp()
        >>> import_op.tubes = [flow.Tube(file = "Plate01/RFP_Well_A3.fcs",
        ...                              conditions = {'Dox' : 10.0}),
        ...                    flow.Tube(file = "Plate01/CFP_Well_A4.fcs",
        ...                              conditions = {'Dox' : 1.0})]
        >>> import_op.conditions = {'Dox' : 'float'}
        >>> ex = import_op.apply()

    Create and parameterize the operation.

    .. plot::
        :context: close-figs

        >>> range_op = flow.RangeOp(name = 'Range',
        ...                         channel = 'Y2-A',
        ...                         low = 2000,
        ...                         high = 10000)

    Plot a diagnostic view

    .. plot::
        :context: close-figs

        >>> range_op.default_view(scale = 'log').plot(ex)

    Apply the gate, and show the result

    .. plot::
        :context: close-figs

        >>> ex2 = range_op.apply(ex)
        >>> ex2.data.groupby('Range').size()
        Range
        False    16042
        True      3958
        dtype: int64
    """

    # traits
    id = Constant('edu.mit.synbio.cytoflow.operations.range')
    friendly_id = Constant('Range')

    name = CStr()
    channel = Str()
    low = CFloat()
    high = CFloat()

    def apply(self, experiment):
        """Applies the range gate to an experiment.

        Parameters
        ----------
        experiment : Experiment
            the old_experiment to which this op is applied

        Returns
        -------
        Experiment
            a new experiment, the same as old :class:`~Experiment` but with
            a new column of type ``bool`` with the same as the operation
            name.  The bool is ``True`` if the event's measurement in
            :attr:`channel` is between :attr:`low` and :attr:`high`, with
            both endpoints included; it is ``False`` otherwise.

        Raises
        ------
        util.CytoflowOpError
            If the experiment is ``None``; if :attr:`name` is unset or
            already a column in the experiment; if :attr:`channel` is unset
            or not one of the experiment's channels; if :attr:`high` is not
            greater than :attr:`low`; or if the gate lies entirely at or
            below the channel's minimum, or at or above its maximum.
        """

        if experiment is None:
            raise util.CytoflowOpError('experiment', "No experiment specified")

        # make sure name got set!
        if not self.name:
            raise util.CytoflowOpError(
                'name', "You have to set the gate's name "
                "before applying it!")

        if self.name in experiment.data.columns:
            raise util.CytoflowOpError(
                'name',
                "Experiment already has a column named {0}".format(self.name))

        if not self.channel:
            raise util.CytoflowOpError('channel', "Channel not specified")

        if self.channel not in experiment.channels:
            raise util.CytoflowOpError(
                'channel',
                "Channel {0} not in the experiment".format(self.channel))

        if self.high <= self.low:
            raise util.CytoflowOpError('high',
                                       "range high must be > range low")

        # look the channel up once; its extremes are needed both for the
        # sanity checks and for their error messages
        channel_data = experiment[self.channel]

        channel_min = channel_data.min()
        if self.high <= channel_min:
            raise util.CytoflowOpError(
                'high', "range high must be > {0}".format(channel_min))

        channel_max = channel_data.max()
        if self.low >= channel_max:
            raise util.CytoflowOpError(
                'low', "range low must be < {0}".format(channel_max))

        # Series.between() includes both endpoints by default
        gate = channel_data.between(self.low, self.high)

        new_experiment = experiment.clone()
        new_experiment.add_condition(self.name, "bool", gate)
        new_experiment.history.append(
            self.clone_traits(transient=lambda _: True))

        return new_experiment

    def default_view(self, **kwargs):
        """
        Returns a :class:`RangeSelection` view bound to this operation.

        Parameters
        ----------
        **kwargs
            Passed through to the :class:`RangeSelection` constructor.
        """
        return RangeSelection(op=self, **kwargs)
class GaussianMixture2DOp(HasStrictTraits):
    """
    This module fits a 2D Gaussian mixture model with a specified number of
    components to a pair of channels.

    Creates a new categorical metadata variable named `name`, with possible
    values `name_1` .... `name_n` where `n` is the number of components.
    Each event is first assigned to the component chosen by the fitted
    model's `predict()`.  If `sigma > 0.0`, the event keeps that assignment
    only if it also falls inside that component's `sigma` gate (an ellipse
    around the component's mean); otherwise it is assigned to `name_None`.
    Events whose scaled X or Y value is NaN are also assigned to `name_None`.
    If `sigma == 0.0`, *all* the (non-NaN) events keep the component chosen
    by `predict()`.

    As a special case, if `num_components` is `1` (which requires
    `sigma > 0.0`), then the new condition is boolean, `True` if the event
    fell in the gate and `False` otherwise.

    Optionally, if `posteriors` is `True`, this module will also compute the
    posterior probability of each event in its assigned component, returning
    it in a new column named `{Name}_Posterior`.

    Finally, the same mixture model (mean and standard deviation) may not
    be appropriate for every subset of the data.  If this is the case, you
    can use the `by` attribute to specify metadata by which to aggregate
    the data before estimating (and applying) a mixture model.  The number of
    components is the same across each subset, though.

    Attributes
    ----------
    name : Str
        The operation name; determines the name of the new metadata column

    xchannel : Str
        The X channel to apply the mixture model to.

    ychannel : Str
        The Y channel to apply the mixture model to.

    xscale : util.ScaleEnum
        The scale applied to the X channel before fitting.  The scale object
        built from it in `estimate()` is saved and re-used by `apply()`.

    yscale : util.ScaleEnum
        The scale applied to the Y channel before fitting.  The scale object
        built from it in `estimate()` is saved and re-used by `apply()`.

    num_components : Int (default = 1)
        How many components to fit to the data?  Must be positive.

    sigma : Float (default = 0.0)
        How many standard deviations on either side of the mean to include
        in each category?  If `sigma == 0.0`, categorize *all* the data by
        assigning each event to the component with the highest posterior
        probability.  Must be >= 0.0, and must be > 0.0 if
        `num_components == 1`.

    by : List(Str)
        A list of metadata attributes to aggregate the data before estimating
        the model.  For example, if the experiment has two pieces of metadata,
        `Time` and `Dox`, setting `by = ["Time", "Dox"]` will fit the model
        separately to each subset of the data with a unique combination of
        `Time` and `Dox`.

    posteriors : Bool (default = False)
        If `True`, add a column named `{Name}_Posterior` giving the posterior
        probability that the event is in the component to which it was
        assigned.  Useful for filtering out low-probability events.

    Examples
    --------

    >>> gauss_op = GaussianMixture2DOp(name = "Gaussian",
    ...                                xchannel = "V2-A",
    ...                                ychannel = "Y2-A",
    ...                                num_components = 2)
    >>> gauss_op.estimate(ex2)
    >>> gauss_op.default_view().plot(ex2)
    >>> ex3 = gauss_op.apply(ex2)
    """

    id = Constant('edu.mit.synbio.cytoflow.operations.gaussian_2d')
    friendly_id = Constant("2D Gaussian Mixture")

    name = CStr()
    xchannel = Str()
    ychannel = Str()
    xscale = util.ScaleEnum
    yscale = util.ScaleEnum
    num_components = util.PositiveInt
    sigma = util.PositiveFloat(0.0, allow_zero = True)
    by = List(Str)

    posteriors = Bool(False)

    # the key is either a single value or a tuple
    _gmms = Dict(Any, Instance(mixture.GMM))
    _xscale = Instance(util.IScale)
    _yscale = Instance(util.IScale)

    def estimate(self, experiment, subset = None):
        """
        Estimate the Gaussian mixture model parameters

        One model is fit per group of `by` (or a single model for all the
        events if `by` is empty) and stored in `_gmms`, keyed by group.
        The scale objects used to transform the data are stored in
        `_xscale` and `_yscale`.

        Parameters
        ----------
        experiment : Experiment
            the experiment whose data the models are fit to

        subset : optional
            accepted for interface compatibility; this implementation does
            not use it.

        Raises
        ------
        util.CytoflowOpError
            if no experiment is given, if `xchannel`, `ychannel` or an
            entry of `by` is not in the experiment's data, if an entry of
            `by` has more than 100 unique values, or if the estimator does
            not converge for some group.
        """

        if not experiment:
            raise util.CytoflowOpError("No experiment specified")

        if self.xchannel not in experiment.data:
            raise util.CytoflowOpError("Column {0} not found in the experiment"
                                       .format(self.xchannel))

        if self.ychannel not in experiment.data:
            raise util.CytoflowOpError("Column {0} not found in the experiment"
                                       .format(self.ychannel))

        for b in self.by:
            if b not in experiment.data:
                raise util.CytoflowOpError("Aggregation metadata {0} not found"
                                           " in the experiment"
                                           .format(b))
            if len(experiment.data[b].unique()) > 100: #WARNING - magic number
                raise util.CytoflowOpError("More than 100 unique values found for"
                                           " aggregation metadata {0}. Did you"
                                           " accidentally specify a data channel?"
                                           .format(b))

        if self.by:
            groupby = experiment.data.groupby(self.by)
        else:
            # use a lambda expression to return a group that contains
            # all the events
            groupby = experiment.data.groupby(lambda x: True)

        # get the scale. estimate the scale params for the ENTIRE data set,
        # not subsets we get from groupby().  And we need to save it so that
        # the data is transformed the same way when we apply()
        self._xscale = util.scale_factory(self.xscale, experiment, self.xchannel)
        self._yscale = util.scale_factory(self.yscale, experiment, self.ychannel)

        for group, data_subset in groupby:
            x = data_subset.loc[:, [self.xchannel, self.ychannel]]
            x[self.xchannel] = self._xscale(x[self.xchannel])
            x[self.ychannel] = self._yscale(x[self.ychannel])

            # drop data that isn't in the scale range
            x = x[~(np.isnan(x[self.xchannel]) | np.isnan(x[self.ychannel]))]
            x = x.values

            gmm = mixture.GMM(n_components = self.num_components,
                              covariance_type = "full",
                              random_state = 1)
            gmm.fit(x)

            if not gmm.converged_:
                raise util.CytoflowOpError("Estimator didn't converge"
                                           " for group {0}"
                                           .format(group))

            # in the 1D version, we sort the components by the means -- so
            # the first component has the lowest mean, the second component
            # has the next-lowest mean, etc.  that doesn't work in a 2D area,
            # obviously.

            # instead, we assume that the clusters are likely (?) to be
            # arranged along *one* of the axes, so we take the |norm| of the
            # x,y mean of each cluster and sort that way.

            norms = (gmm.means_[:, 0] ** 2 + gmm.means_[:, 1] ** 2) ** 0.5
            sort_idx = np.argsort(norms)
            gmm.means_ = gmm.means_[sort_idx]
            gmm.weights_ = gmm.weights_[sort_idx]
            gmm.covars_ = gmm.covars_[sort_idx]

            self._gmms[group] = gmm

    def apply(self, experiment):
        """
        Assigns new metadata to events using the mixture model estimated
        in `estimate`.

        Parameters
        ----------
        experiment : Experiment
            the experiment to which this op is applied

        Returns
        -------
        Experiment
            a clone of `experiment` with a new condition named `name`
            (type ``bool`` if `num_components == 1`, otherwise
            ``category``), plus a ``float`` condition named
            `{name}_Posterior` if `posteriors` is `True`.  A copy of this
            operation is appended to the new experiment's history.

        Raises
        ------
        util.CytoflowOpError
            if no experiment is given; if `name` is empty or already a
            column; if `estimate()` has not been run; if the channels or
            `by` metadata are missing from the experiment; if a
            `{name}_Posterior` column already exists; if
            `num_components == 1` and `sigma == 0.0`; if `sigma < 0.0`; or
            if a group of `by` has no model in `_gmms`.
        """

        if not experiment:
            raise util.CytoflowOpError("No experiment specified")

        # make sure name got set!
        if not self.name:
            raise util.CytoflowOpError("You have to set the gate's name "
                                       "before applying it!")

        if self.name in experiment.data.columns:
            raise util.CytoflowOpError("Experiment already has a column named {0}"
                                       .format(self.name))

        if not self._gmms:
            raise util.CytoflowOpError("No components found. Did you forget to "
                                       "call estimate()?")

        if not self._xscale:
            raise util.CytoflowOpError("Couldn't find _xscale. What happened??")

        if not self._yscale:
            raise util.CytoflowOpError("Couldn't find _yscale. What happened??")

        if self.xchannel not in experiment.data:
            raise util.CytoflowOpError("Column {0} not found in the experiment"
                                       .format(self.xchannel))

        if self.ychannel not in experiment.data:
            raise util.CytoflowOpError("Column {0} not found in the experiment"
                                       .format(self.ychannel))

        # this check is unconditional, so it also covers the case where
        # `posteriors` is True and the column would be created below
        if (self.name + "_Posterior") in experiment.data:
            raise util.CytoflowOpError("Column {0} already found in the experiment"
                                       .format(self.name + "_Posterior"))

        if self.num_components == 1 and self.sigma == 0.0:
            raise util.CytoflowOpError("If num_components == 1, sigma must be > 0")

        for b in self.by:
            if b not in experiment.data:
                raise util.CytoflowOpError("Aggregation metadata {0} not found"
                                           " in the experiment"
                                           .format(b))

            if len(experiment.data[b].unique()) > 100: #WARNING - magic number
                raise util.CytoflowOpError("More than 100 unique values found for"
                                           " aggregation metadata {0}. Did you"
                                           " accidentally specify a data channel?"
                                           .format(b))

        if self.sigma < 0.0:
            raise util.CytoflowOpError("sigma must be >= 0.0")

        event_assignments = pd.Series([None] * len(experiment), dtype = "object")

        if self.posteriors:
            event_posteriors = pd.Series([0.0] * len(experiment))

        # what we DON'T want to do is iterate through event-by-event.
        # the more of this we can push into numpy, sklearn and pandas,
        # the faster it's going to be.  for example, this is why
        # we don't use Ellipse.contains().

        if self.by:
            groupby = experiment.data.groupby(self.by)
        else:
            # use a lambda expression to return a group that
            # contains all the events
            groupby = experiment.data.groupby(lambda x: True)

        for group, data_subset in groupby:
            # fail with a useful message (rather than a bare KeyError) if
            # this group was not seen by estimate()
            if group not in self._gmms:
                raise util.CytoflowOpError("Can't find group {0} in the "
                                           "estimated model. Did the "
                                           "aggregation change since "
                                           "estimate()?"
                                           .format(group))

            gmm = self._gmms[group]
            x = data_subset.loc[:, [self.xchannel, self.ychannel]]
            x[self.xchannel] = self._xscale(x[self.xchannel])
            x[self.ychannel] = self._yscale(x[self.ychannel])

            # which values are missing?
            x_na = np.isnan(x[self.xchannel]) | np.isnan(x[self.ychannel])
            x_na = x_na.values

            x = x.values
            group_idx = groupby.groups[group]

            # make a preliminary assignment; -1 means "no component"
            predicted = np.full(len(x), -1, "int")
            predicted[~x_na] = gmm.predict(x[~x_na])

            # if we're doing sigma-based gating, for each component check
            # to see if the event is in the sigma gate.
            if self.sigma > 0.0:

                # make a quick dataframe with the value and the predicted
                # component
                gate_df = pd.DataFrame({"x" : x[:, 0],
                                        "y" : x[:, 1],
                                        "p" : predicted})

                # for each component, get the ellipse that follows the isoline
                # around the mixture component
                # cf. http://scikit-learn.org/stable/auto_examples/mixture/plot_gmm.html
                # and http://www.mathworks.com/matlabcentral/newsreader/view_thread/298389
                # and http://stackoverflow.com/questions/7946187/point-and-ellipse-rotated-position-test-algorithm
                # i am not proud of how many tries this took me to get right.

                # NOTE: the local names below (c, xc, yc, xl, yl, sin_t,
                # cos_t) are referenced by name from the eval() expression
                # via "@"; don't rename them.
                for c in range(0, self.num_components):
                    mean = gmm.means_[c]
                    covar = gmm._get_covars()[c]

                    # xc is the center on the x axis
                    # yc is the center on the y axis
                    xc = mean[0]  # @UnusedVariable
                    yc = mean[1]  # @UnusedVariable

                    v, w = linalg.eigh(covar)
                    u = w[0] / linalg.norm(w[0])

                    # xl is the length along the x axis
                    # yl is the length along the y axis
                    xl = np.sqrt(v[0]) * self.sigma  # @UnusedVariable
                    yl = np.sqrt(v[1]) * self.sigma  # @UnusedVariable

                    # t is the rotation in radians (counter-clockwise)
                    t = 2 * np.pi - np.arctan(u[1] / u[0])

                    sin_t = np.sin(t)  # @UnusedVariable
                    cos_t = np.cos(t)  # @UnusedVariable

                    # and build an expression with numexpr so it evaluates fast!

                    gate_bool = gate_df.eval("p == @c and "
                                             "((x - @xc) * @cos_t - (y - @yc) * @sin_t) ** 2 / ((@xl / 2) ** 2) + "
                                             "((x - @xc) * @sin_t + (y - @yc) * @cos_t) ** 2 / ((@yl / 2) ** 2) <= 1").values

                    # events predicted to be in this component but outside
                    # its gate are un-assigned
                    predicted[np.logical_and(predicted == c,
                                             np.logical_not(gate_bool))] = -1

            predicted_str = pd.Series(["(none)"] * len(predicted))
            for c in range(0, self.num_components):
                predicted_str[predicted == c] = "{0}_{1}".format(self.name, c + 1)
            predicted_str[predicted == -1] = "{0}_None".format(self.name)
            predicted_str.index = group_idx

            event_assignments.iloc[group_idx] = predicted_str

            if self.posteriors:
                probability = np.full((len(x), self.num_components), 0.0, "float")
                probability[~x_na, :] = gmm.predict_proba(x[~x_na, :])
                posteriors = pd.Series([0.0] * len(predicted))
                for c in range(0, self.num_components):
                    posteriors[predicted == c] = probability[predicted == c, c]
                posteriors.index = group_idx
                event_posteriors.iloc[group_idx] = posteriors

        new_experiment = experiment.clone()

        if self.num_components == 1:
            new_experiment.add_condition(self.name, "bool", event_assignments == "{0}_1".format(self.name))
        else:
            new_experiment.add_condition(self.name, "category", event_assignments)

        if self.posteriors:
            col_name = "{0}_Posterior".format(self.name)
            new_experiment.add_condition(col_name, "float", event_posteriors)

        new_experiment.history.append(self.clone_traits())
        return new_experiment

    def default_view(self, **kwargs):
        """
        Returns a diagnostic plot of the Gaussian mixture model.

        Returns
        -------
        IView : an IView, call plot() to see the diagnostic plot.
        """
        return GaussianMixture2DView(op = self, **kwargs)
class BleedthroughLinearOp(HasStrictTraits):
    """
    Apply matrix-based bleedthrough correction to a set of fluorescence channels.

    This is a traditional matrix-based compensation for bleedthrough.  For each
    pair of channels, the user specifies the proportion of the first channel
    that bleeds through into the second; then, the module performs a matrix
    multiplication to compensate the raw data.

    The module can also estimate the bleedthrough matrix using one
    single-color control per channel.

    This works best on data that has had autofluorescence removed first;
    if that is the case, then the autofluorescence will be subtracted from
    the single-color controls too.

    To use, set up the `controls` dict with the single color controls;
    call `estimate()` to parameterize the operation; check that the bleedthrough
    plots look good with `default_view().plot()`; and then `apply()` to an
    Experiment.

    Attributes
    ----------
    name : Str
        The operation name (for UI representation; optional for interactive use)

    controls : Dict(Str, File)
        The channel names to correct, and corresponding single-color control
        FCS files to estimate the spillover with.  Must be set to
        use `estimate()`.

    spillover : Dict(Tuple(Str, Str), Float)
        The spillover "matrix" to use to correct the data.  The keys are pairs
        of channels, and the values are proportions of spectral overlap.  If
        `("channel1", "channel2")` is present as a key,
        `("channel2", "channel1")` must also be present.  The module does not
        assume that the matrix is symmetric.

    Notes
    -----
    `estimate()` fits a first-degree polynomial to each (from, to) pair of
    channels in the corresponding single-color control and stores the slope
    in `spillover`.  `apply()` builds a matrix from `spillover` (with ones on
    the diagonal) and multiplies the channel data by its pseudoinverse.

    Examples
    --------
    >>> bl_op = flow.BleedthroughLinearOp()
    >>> bl_op.controls = {'Pacific Blue-A' : 'merged/ebfp.fcs',
    ...                   'FITC-A' : 'merged/eyfp.fcs',
    ...                   'PE-Tx-Red-YG-A' : 'merged/mkate.fcs'}
    >>>
    >>> bl_op.estimate(ex2)
    >>> bl_op.default_view().plot(ex2)
    >>>
    >>> ex3 = bl_op.apply(ex2)
    """

    # traits
    id = Constant('edu.mit.synbio.cytoflow.operations.bleedthrough_linear')
    friendly_id = Constant("Linear Bleedthrough Correction")

    name = CStr()

    controls = Dict(Str, File)
    spillover = Dict(Tuple(Str, Str), Float)

    def estimate(self, experiment, subset=None):
        """
        Estimate the bleedthrough from single-channel controls in `controls`

        Parameters
        ----------
        experiment : Experiment
            the experiment whose history is replayed on each control tube
            before the spillover is estimated

        subset : Str, optional
            if given, a query string used to select the events of each
            control tube that take part in the estimate

        Raises
        ------
        util.CytoflowOpError
            if no experiment is given, if fewer than two channels are in
            `controls`, if a control file does not exist, or if `subset`
            is not a valid query or selects no events.
        """

        if not experiment:
            raise util.CytoflowOpError("No experiment specified")

        channels = list(self.controls.keys())

        if len(channels) < 2:
            raise util.CytoflowOpError(
                "Need at least two channels to correct bleedthrough.")

        # make sure the control files exist
        for channel in channels:
            if not os.path.isfile(self.controls[channel]):
                raise util.CytoflowOpError(
                    "Can't find file {0} for channel {1}.".format(
                        self.controls[channel], channel))

        for channel in channels:

            # make a little Experiment
            check_tube(self.controls[channel], experiment)
            tube_exp = ImportOp(tubes=[Tube(
                file=self.controls[channel])]).apply()

            # apply previous operations
            for op in experiment.history:
                tube_exp = op.apply(tube_exp)

            # subset it
            if subset:
                try:
                    tube_data = tube_exp.query(subset)
                except Exception:
                    raise util.CytoflowOpError(
                        "Subset string '{0}' isn't valid".format(subset))

                if len(tube_data.index) == 0:
                    raise util.CytoflowOpError(
                        "Subset string '{0}' returned no events".format(
                            subset))
            else:
                tube_data = tube_exp.data

            # polyfit requires sorted data
            # NOTE(review): DataFrame.sort() is the legacy pandas spelling
            # of sort_values(); confirm against the pandas version in use.
            tube_data.sort(channel, inplace=True)

            for to_channel in channels:
                from_channel = channel
                if from_channel == to_channel:
                    continue

                # sometimes some of the data is off the edge of the
                # plot, and this screws up a linear regression.  trim a
                # per-pair copy, so the trimming done for one pair of
                # channels doesn't leak into the regression for the next.
                pair_data = tube_data

                from_min = np.min(pair_data[from_channel]) * 1.05
                from_max = np.max(pair_data[from_channel]) * 0.95
                pair_data = pair_data[pair_data[from_channel] > from_min]
                pair_data = pair_data[pair_data[from_channel] < from_max]

                to_min = np.min(pair_data[to_channel]) * 1.05
                to_max = np.max(pair_data[to_channel]) * 0.95
                pair_data = pair_data[pair_data[to_channel] > to_min]
                pair_data = pair_data[pair_data[to_channel] < to_max]

                pair_data = pair_data.reset_index(drop=True)

                # the slope of the fitted line is the spillover proportion
                lr = np.polyfit(pair_data[from_channel],
                                pair_data[to_channel],
                                deg=1)

                self.spillover[(from_channel, to_channel)] = lr[0]

    def apply(self, experiment):
        """Applies the bleedthrough correction to an experiment.

        Parameters
        ----------
        experiment : Experiment
            the old_experiment to which this op is applied

        Returns
        -------
            a new experiment with the bleedthrough subtracted out.  Each
            corrected channel's metadata gains a ``linear_bleedthrough``
            entry holding the spillover values into that channel, and a
            copy of this operation is appended to the new experiment's
            history.

        Raises
        ------
        util.CytoflowOpError
            if no experiment is given, if `spillover` is empty, if a
            channel named in `spillover` is missing from the experiment's
            data, or if `spillover` has a (from, to) key without the
            matching (to, from) key.
        """

        if not experiment:
            raise util.CytoflowOpError("No experiment specified")

        if not self.spillover:
            raise util.CytoflowOpError("Spillover matrix isn't set. "
                                       "Did you forget to run estimate()?")

        for (from_channel, to_channel) in self.spillover:
            if from_channel not in experiment.data:
                raise util.CytoflowOpError(
                    "Can't find channel {0} in experiment".format(
                        from_channel))
            if to_channel not in experiment.data:
                raise util.CytoflowOpError(
                    "Can't find channel {0} in experiment".format(to_channel))

            if (to_channel, from_channel) not in self.spillover:
                raise util.CytoflowOpError("Must have both (from, to) and "
                                           "(to, from) keys in self.spillover")

        new_experiment = experiment.clone()

        # the completely arbitrary ordering of the channels
        channels = list(set([x for (x, _) in self.spillover.keys()]))

        # build the spillover matrix from the spillover dictionary
        a = [[self.spillover[(y, x)] if x != y else 1.0
              for x in channels]
             for y in channels]

        # invert it.  use the pseudoinverse in case a is singular
        a_inv = np.linalg.pinv(a)

        new_experiment.data[channels] = np.dot(experiment.data[channels], a_inv)

        for channel in channels:
            # add the spillover values to the channel's metadata
            new_experiment.metadata[channel]['linear_bleedthrough'] = \
                {x : self.spillover[(x, channel)]
                 for x in channels if x != channel}

        new_experiment.history.append(self.clone_traits())
        return new_experiment

    def default_view(self, **kwargs):
        """
        Returns a diagnostic plot to make sure spillover estimation is working.

        Returns
        -------
        IView : An IView, call plot() to see the diagnostic plots

        Raises
        ------
        util.CytoflowOpError
            if the channels in `controls` are not the same set as the
            "from" channels in `spillover`.
        """

        # the completely arbitrary ordering of the channels
        channels = list(set([x for (x, _) in self.spillover.keys()]))

        if set(self.controls.keys()) != set(channels):
            raise util.CytoflowOpError(
                "Must have both the controls and bleedthrough to plot")

        return BleedthroughLinearDiagnostic(op=self, **kwargs)